1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
|
;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000
; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
; used in ff_ac3_extract_exponents()
cextern pd_1
pd_151: times 4 dd 151
SECTION .text
;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;-----------------------------------------------------------------------------
%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
shl reuse_blksd, 8
jz .end
LOOP_ALIGN
.nextexp:
mov offsetq, reuse_blksq
mova m0, [expq+offsetq]
sub offsetq, 256
LOOP_ALIGN
.nextblk:
PMINUB m0, [expq+offsetq], m1
sub offsetq, 256
jae .nextblk
mova [expq], m0
add expq, mmsize
sub expnd, mmsize
jg .nextexp
.end:
RET
%endmacro
%define LOOP_ALIGN ALIGN 16
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN
;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, size_t len)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
movaps m0, [pf_1_24]
shl lenq, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
movaps m1, [srcq+lenq ]
movaps m2, [srcq+lenq+16 ]
movaps m3, [srcq+lenq+32 ]
movaps m4, [srcq+lenq+48 ]
%ifdef m8
movaps m5, [srcq+lenq+64 ]
movaps m6, [srcq+lenq+80 ]
movaps m7, [srcq+lenq+96 ]
movaps m8, [srcq+lenq+112]
%endif
mulps m1, m0
mulps m2, m0
mulps m3, m0
mulps m4, m0
%ifdef m8
mulps m5, m0
mulps m6, m0
mulps m7, m0
mulps m8, m0
%endif
cvtps2dq m1, m1
cvtps2dq m2, m2
cvtps2dq m3, m3
cvtps2dq m4, m4
%ifdef m8
cvtps2dq m5, m5
cvtps2dq m6, m6
cvtps2dq m7, m7
cvtps2dq m8, m8
%endif
movdqa [dstq+lenq ], m1
movdqa [dstq+lenq+16 ], m2
movdqa [dstq+lenq+32 ], m3
movdqa [dstq+lenq+48 ], m4
%ifdef m8
movdqa [dstq+lenq+64 ], m5
movdqa [dstq+lenq+80 ], m6
movdqa [dstq+lenq+96 ], m7
movdqa [dstq+lenq+112], m8
add lenq, 128
%else
add lenq, 64
%endif
jl .loop
RET
INIT_YMM avx
cglobal float_to_fixed24, 3, 3, 5, dst, src, len
vbroadcastf128 m0, [pf_1_24]
shl lenq, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
mulps m1, m0, [srcq+lenq+mmsize*0]
mulps m2, m0, [srcq+lenq+mmsize*1]
mulps m3, m0, [srcq+lenq+mmsize*2]
mulps m4, m0, [srcq+lenq+mmsize*3]
cvtps2dq m1, m1
cvtps2dq m2, m2
cvtps2dq m3, m3
cvtps2dq m4, m4
mova [dstq+lenq+mmsize*0], m1
mova [dstq+lenq+mmsize*1], m2
mova [dstq+lenq+mmsize*2], m3
mova [dstq+lenq+mmsize*3], m4
add lenq, mmsize*4
jl .loop
RET
;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------
%macro PHADDD4 2 ; xmm src, xmm tmp
movhlps %2, %1
paddd %1, %2
pshufd %2, %1, 0x1
paddd %1, %2
%endmacro
INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
movdqa m0, [mant_cntq ]
movdqa m1, [mant_cntq+ 1*16]
paddw m0, [mant_cntq+ 2*16]
paddw m1, [mant_cntq+ 3*16]
paddw m0, [mant_cntq+ 4*16]
paddw m1, [mant_cntq+ 5*16]
paddw m0, [mant_cntq+ 6*16]
paddw m1, [mant_cntq+ 7*16]
paddw m0, [mant_cntq+ 8*16]
paddw m1, [mant_cntq+ 9*16]
paddw m0, [mant_cntq+10*16]
paddw m1, [mant_cntq+11*16]
pmaddwd m0, [ac3_bap_bits ]
pmaddwd m1, [ac3_bap_bits+16]
paddd m0, m1
PHADDD4 m0, m1
movd sumd, m0
movdqa m3, [pw_bap_mul1]
movhpd m0, [mant_cntq +2]
movlpd m0, [mant_cntq+1*32+2]
movhpd m1, [mant_cntq+2*32+2]
movlpd m1, [mant_cntq+3*32+2]
movhpd m2, [mant_cntq+4*32+2]
movlpd m2, [mant_cntq+5*32+2]
pmulhuw m0, m3
pmulhuw m1, m3
pmulhuw m2, m3
paddusw m0, m1
paddusw m0, m2
pmaddwd m0, [pw_bap_mul2]
PHADDD4 m0, m1
movd eax, m0
add eax, sumd
RET
;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
pabsd %1, %1
%else ; src/dst, tmp
pxor %2, %2
pcmpgtd %2, %1
pxor %1, %2
psubd %1, %2
%endif
%endmacro
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
movsxdifnidn lenq, lend
add expq, lenq
lea coefq, [coefq+4*lenq]
neg lenq
mova m2, [pd_1]
mova m3, [pd_151]
.loop:
; move 4 32-bit coefs to xmm0
mova m0, [coefq+4*lenq]
; absolute value
PABSD m0, m1
; convert to float and extract exponents
pslld m0, 1
por m0, m2
cvtdq2ps m1, m0
psrld m1, 23
mova m0, m3
psubd m0, m1
; move the lowest byte in each of 4 dwords to the low dword
; NOTE: We cannot just extract the low bytes with pshufb because the dword
; result for 16777215 is -1 due to float inaccuracy. Using packuswb
; clips this to 0, which is the correct exponent.
packssdw m0, m0
packuswb m0, m0
movd [expq+lenq], m0
add lenq, 4
jl .loop
RET
%endmacro
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
|