1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
|
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
SECTION_TEXT
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
mov r2, r1
xorps m0, m0
xorps m1, m1
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
dec r1
addps m0, m2
jnz .endloop
.end:
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
RET
%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
mov r5, r3
and r3, 0xFC
lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
jz .loop1
.loop4:
movlps m0, [r2 + 4*r3 + 0]
movlps m1, [r2 + 4*r3 + 8]
movlps m2, [r1 + 0*STEP]
movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
unpcklps m0, m0
unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
movu [r0 + 8*r3 + 16], m1
add r1, 4*STEP
add r3, 4
jnz .loop4
and r5, 3 ; number of single element loops
jz .end
.loop1: ; element 0 and 1 can be computed at the same time
movss m0, [r2]
movlps m2, [r1]
unpcklps m0, m0
mulps m2, m0
movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP
dec r5
jnz .loop1
.end:
RET
; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
; const float alpha0[2], const float alpha1[2],
; float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
movss bw, BWm
%endif
movlps m2, [alpha1q]
movlps m1, [alpha0q]
shufps bw, bw, 0
mulps m2, bw ; (a1[0] a1[1])*bw
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
mova m7, [ps_mask]
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
; start and end 6th and 7th args on stack
mov r2d, Sm
mov r3d, Em
%define start r2q
%define end r3q
%else
; BW does not actually occupy a register, so shift by 1
%define start BWq
%define end Sq
%endif
sub start, end ; neg num of loops
lea X_highq, [X_highq + end*2*4]
lea X_lowq, [X_lowq + end*2*4 - 2*2*4]
shl start, 3 ; offset from num loops
mova m0, [X_lowq + start]
movlhps m1, m1 ; (a2 a3 a2 a3)
movlhps m2, m2 ; (a0 a1 a0 a1)
shufps m3, m3, q0101 ; (a3 a2 a3 a2)
shufps m4, m4, q0101 ; (a1 a0 a1 a0)
xorps m3, m7 ; (-a3 a2 -a3 a2)
xorps m4, m7 ; (-a1 a0 -a1 a0)
.loop2:
mova m5, m0
mova m6, m0
shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"}
shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"}
mulps m0, m2
mulps m5, m4
mova m7, m6
addps m5, m0
mova m0, [X_lowq + start + 2*2*4]
shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"}
shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"}
mulps m6, m1
mulps m7, m3
addps m5, m6
addps m7, m0
addps m5, m7
mova [X_highq + start], m5
add start, 16
jnz .loop2
RET
|