; libavcodec/x86/mlpdsp.asm

;******************************************************************************
;* SIMD-optimized MLP DSP functions
;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%if ARCH_X86_64

; SHLX reg, count
;   %1 = register shifted left in place (full register name, e.g. noiseq)
;   %2 = register holding the shift count, given WITHOUT a size suffix
; With BMI2 the count is read from the 64-bit form of %2; otherwise the legacy
; shl is emitted with the byte form of %2, which the instruction set requires
; to be cl.
%macro SHLX 2
%if notcpuflag(bmi2)
    shl  %1, %2b
%else
    shlx %1, %1, %2q
%endif
%endmacro

; REMATRIX
; Multiplies the int32 values at [samplesq] with the int32 values at the same
; positions of [coeffsq] (signed 32x32 -> 64 bit) and leaves the products summed
; into the two qwords of xm0; the caller still has to add those two qwords.
;   - pmuldq only multiplies the even dwords, so a q2301 shuffle (swap of
;     neighbouring dwords) is used to bring the odd dwords into place.
;   - Without AVX2, m0 is 16 bytes wide and a second 16-byte group is handled
;     explicitly; with AVX2 one 32-byte load covers both groups and the upper
;     128-bit lane is folded into the lower one.
; Uses movdqa, so both pointers must be aligned to the vector width.
; Clobbers m0-m3 (and m4 without AVX2).
%macro REMATRIX 0
    movdqa        m1, [coeffsq ]
    movdqa        m0, [samplesq]
    pshufd        m3, m1, q2301        ; odd coeffs  -> even slots
    pshufd        m2, m0, q2301        ; odd samples -> even slots
    pmuldq        m3, m2               ; products of the odd elements
    pmuldq        m0, m1               ; products of the even elements
    paddq         m0, m3
%if cpuflag(avx2)
    vextracti128 xm1, m0, 1            ; upper lane holds the sums of elements 4..7
    paddq        xm0, xm1
%else
    movdqa        m2, [coeffsq  + 16]
    movdqa        m1, [samplesq + 16]
    pshufd        m4, m2, q2301
    pshufd        m3, m1, q2301
    pmuldq        m4, m3               ; products of the odd elements 5, 7
    pmuldq        m1, m2               ; products of the even elements 4, 6
    paddq         m0, m4
    paddq         m0, m1
%endif
%endmacro

; LOOP_END
; Tail of one row when no noise is added:
;   accum              = low qword of xm0 + high qword of xm0
;   samples[dest_ch]   = ((accum >> 14) & mask) + *bypassed_lsbs
;   samples           += 32 bytes, bypassed_lsbs += 8 bytes
; Ends with a compare of blsbs_ptr against cnt; the caller branches on it.
; Clobbers xm0, xm1, accum, blsbs and the flags.
%macro LOOP_END 0
    movzx     blsbsd, byte [blsbs_ptrq]             ; blsbs = *bypassed_lsbs
    pshufd       xm1, xm0, q0032                    ; bring the high qword down
    paddq        xm0, xm1                           ; low qword = full 64-bit sum
    movq      accumq, xm0
    sar       accumq, 14                            ; arithmetic shift of the 64-bit sum
    and       accumd, maskd
    add       accumd, blsbsd
    mov   [samplesq + dest_chq], accumd             ; dest_chq is a byte offset here
    add     samplesq, 32                            ; next row of samples (MAX_CHANNELS int32)
    add   blsbs_ptrq, 8                             ; next row of bypassed_lsbs (MAX_CHANNELS bytes)
    cmp   blsbs_ptrq, cntq                          ; flags for the caller's jne
%endmacro

; LOOP_SHIFT_END
; Tail of one row when noise is added:
;   index             &= ausp
;   accum              = low qword of xm0 + high qword of xm0
;                        + ((int64) noise_buffer[index] << mns)
;   index             += index2
;   samples[dest_ch]   = ((accum >> 14) & mask) + *bypassed_lsbs
;   samples           += 32 bytes, bypassed_lsbs += 8 bytes
; Ends with a compare of blsbs_ptr against cnt; the caller branches on it.
; The noise register is used twice: first for the noise value, then for the
; bypassed LSB byte. Clobbers xm0, xm1, accum, noise, index and the flags.
%macro LOOP_SHIFT_END 0
    pshufd       xm1, xm0, q0032                    ; bring the high qword down
    paddq        xm0, xm1                           ; low qword = full 64-bit sum
    and       indexd, auspd                         ; wrap index into the noise buffer
    movsx     noiseq, byte [noise_bufferq + indexq] ; sign-extended noise_buffer[index]
    add       indexd, index2d                       ; advance index for the next row
    SHLX      noiseq, mns                           ; scale the noise by the count in mns
    movq      accumq, xm0
    add       accumq, noiseq
    sar       accumq, 14                            ; arithmetic shift of the 64-bit sum
    movzx     noised, byte [blsbs_ptrq]             ; noise is free again: holds *bypassed_lsbs now
    and       accumd, maskd
    add       accumd, noised
    mov   [samplesq + dest_chq], accumd             ; dest_chq is a byte offset here
    add     samplesq, 32                            ; next row of samples (MAX_CHANNELS int32)
    add   blsbs_ptrq, 8                             ; next row of bypassed_lsbs (MAX_CHANNELS bytes)
    cmp   blsbs_ptrq, cntq                          ; flags for the caller's jne
%endmacro

;------------------------------------------------------------------------------
;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                             int index, unsigned int dest_ch, uint16_t blockpos,
;                             unsigned int maxchan, int matrix_noise_shift,
;                             int access_unit_size_pow2, int32_t mask)
;
; Processes blockpos rows; one row is 32 bytes of samples and 8 bytes of
; bypassed_lsbs. Per row:
;     accum = sum of the 64-bit products samples[i] * coeffs[i]
;             (4 elements when maxchan < 4, 8 elements otherwise)
;     if (matrix_noise_shift != 0) {
;         index &= access_unit_size_pow2 - 1;
;         accum += noise_buffer[index] << (matrix_noise_shift + 7);
;         index += 2 * <index at entry> + 1;
;     }
;     samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
;
; samples and coeffs are read with movdqa and therefore have to be aligned to
; the vector width of the instantiation.
; NOTE(review): all 4 (or 8) elements are always multiplied, whatever maxchan
; is; presumably the caller zero-pads coeffs beyond maxchan -- confirm.
;
; cglobal is asked for 0 auto-loaded arguments, 13 GPRs and 5 vector registers.
; Arguments that the ABI passes on the stack are loaded by hand below; on
; UNIX64 the first six already sit in their registers, on WIN64 only the first
; four do.
;
; blockpos == 0 returns immediately: every loop below tests its end condition
; after the body, so without the guard the end pointer would be stepped over
; on the first pass and the loop would run off the buffers.
;------------------------------------------------------------------------------
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    mov         mnsd, mnsm                          ; load matrix_noise_shift
    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
    test   blockposd, blockposd                     ; is blockpos == 0?
    jz .end                                         ; nothing to do, and the loops cannot handle 0 rows
    mov     maxchand, maxchanm                      ; load maxchan
    mov        maskd, maskm                         ; load mask
%if WIN64
    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
%endif
    shl     dest_chd, 2                             ; dest_ch *= 4: byte offset of an int32 inside a row
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = end pointer: bypassed_lsbs + blockpos rows of 8 bytes
    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
    jne .shift                                      ; jump if true
    cmp     maxchand, 4                             ; is maxchan < 4?
    jb .loop4                                       ; jump if true (unsigned: maxchan is unsigned int)

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8                                      ; until blsbs_ptr == cnt
    RET

align 16
.loop4:
    ; Process up to 4 channels
    ; Same arithmetic as the first half of REMATRIX, on 128-bit registers only.
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_END
    jne .loop4                                      ; until blsbs_ptr == cnt
    RET

.shift:
    ; Noise path. blockpos (r6) is dead from here on because cnt already holds
    ; the end pointer, so the registers get renamed with DEFINE_ARGS. Raw
    ; register numbers are used wherever a value is addressed by its position
    ; from before the renaming.
%if WIN64
    mov       indexd, indexm         ; load index (not needed on UNIX64)
%endif
    mov          r9d, r9m            ; load access_unit_size_pow2
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7              ; matrix_noise_shift += 7
%else ; sse4
    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
    sub        auspd, 1              ; access_unit_size_pow2 -= 1
    cmp          r7d, 4              ; is maxchan < 4? (r7 still holds maxchan)
    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; overwrites r7, lea leaves the flags alone
    jb .loop4_shift                  ; jump if maxchan < 4 (unsigned: maxchan is unsigned int)

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift                 ; until blsbs_ptr == cnt
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels
    ; Same arithmetic as the first half of REMATRIX, on 128-bit registers only.
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_SHIFT_END
    jne .loop4_shift                 ; until blsbs_ptr == cnt
.end:
    ; Shared by the end of .loop4_shift and the blockpos == 0 early exit.
    RET
%endmacro

; Emit the function once per instruction set.
; SSE4 variant: m# are 128-bit registers, SHLX expands to the legacy shl.
INIT_XMM sse4
MLP_REMATRIX_CHANNEL
; AVX2 + BMI2 variant: m# are 256-bit registers, SHLX expands to shlx.
; Only assembled when the build defines HAVE_AVX2_EXTERNAL as non-zero.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif

%endif ; ARCH_X86_64