diff options
author | Lynne <dev@lynne.ee> | 2022-09-19 23:48:53 +0200 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-09-21 07:12:39 +0200 |
commit | 3ade6a8644ab519fcd7caa7ef457dd406162bc14 (patch) | |
tree | 80135c60b8bdc5d4975f986f68ae44ef80449335 /libavcodec | |
parent | cc367a9b8aa1e473bf60c5dc5e03431a7bbcd125 (diff) | |
download | ffmpeg-3ade6a8644ab519fcd7caa7ef457dd406162bc14.tar.gz |
x86/lpc: implement a new Welch windowing function
Old one was written with the assumption only even inputs would be given.
This very messy replacement supports even and odd inputs, and supports
AVX2 for extra speed. The buffers given are usually quite big (4k samples),
so the speedup is worth it.
The new SSE version is still faster than the old inline asm version by 33%.
Also checkasm is provided to make sure this monstrosity works.
This fixes some FATE tests.
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/Makefile | 3 | ||||
-rw-r--r-- | libavcodec/x86/lpc.asm | 241 | ||||
-rw-r--r-- | libavcodec/x86/lpc_init.c (renamed from libavcodec/x86/lpc.c) | 72 |
3 files changed, 258 insertions, 58 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 4e448623af..e1120b7e15 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -23,7 +23,7 @@ OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o -OBJS-$(CONFIG_LPC) += x86/lpc.o +OBJS-$(CONFIG_LPC) += x86/lpc_init.o OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o @@ -127,6 +127,7 @@ X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o +X86ASM-OBJS-$(CONFIG_LPC) += x86/lpc.o X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm new file mode 100644 index 0000000000..26101b4e25 --- /dev/null +++ b/libavcodec/x86/lpc.asm @@ -0,0 +1,241 @@ +;****************************************************************************** +;* Copyright (c) Lynne +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +one_tab: times 4 dq 1.0 +seq_tab_avx2: dq 3.0, 2.0, 1.0, 0.0 +sub_tab: dq -1.0, -2.0, -3.0, -4.0 +add_tab_avx2: times 4 dq 4.0 +dec_tab_avx2: times 4 dq -4.0 +add_tab_sse2: times 2 dq 2.0 +dec_tab_sse2: times 2 dq -2.0 +dec_tab_scalar: times 2 dq -1.0 +seq_tab_sse2: dq 1.0, 0.0 + +SECTION .text + +%macro APPLY_WELCH_FN 0 +cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2 + cmp lenq, 0 + je .end + cmp lenq, 1 + je .one + + movapd m6, [one_tab] + + movd xm1, lend + cvtdq2pd xm1, xm1 ; len +%if cpuflag(avx2) + vbroadcastsd m1, xm1 +%else + shufpd m1, m1, 00b +%endif + + addpd m0, m6, m6 ; 2.0 + subpd m1, m6 ; len - 1 + divpd m0, m1 ; 2.0 / (len - 1) + + mov off1q, lenq + and off1q, 1 + je .even + + movapd m5, m0 + addpd m0, [sub_tab] + + lea off2q, [lenq*4 - mmsize/2] + sub lenq, mmsize/4 ; avoid overwriting + xor off1q, off1q + + cmp lenq, mmsize/4 + jl .scalar_o + +%if cpuflag(avx2) + movapd m7, [dec_tab_avx2] +%else + movapd m7, [dec_tab_sse2] +%endif + +.loop_o: + movapd m1, m6 + mulpd m2, m0, m0 + subpd m1, m2 +%if cpuflag(avx2) + vpermpd m2, m1, q0123 +%else + shufpd m2, m1, m1, 01b +%endif + + cvtdq2pd m3, [dataq + off1q] + cvtdq2pd m4, [dataq + off2q] + + mulpd m1, m3 + mulpd m2, m4 + + movupd [outq + off1q*2], m1 + movupd [outq + off2q*2], m2 + + addpd m0, m7 + add off1q, mmsize/2 + sub off2q, mmsize/2 + sub lenq, mmsize/4 + jg .loop_o + + add lend, (mmsize/4 - 1) + cmp lend, 0 + je .end + sub lenq, (mmsize/4 - 1) + +.scalar_o: + movapd xm7, [dec_tab_scalar] + subpd xm0, xm7 + + ; Set offsets + add off2q, (mmsize/4) + 4*cpuflag(avx2) + add lenq, mmsize/4 - 2 + +.loop_o_scalar: + movapd xm1, xm6 + mulpd xm2, xm0, xm0 + subpd xm1, xm2 + + cvtdq2pd xm3, [dataq + off1q - (mmsize/4) + 4*cpuflag(avx2)] + cvtdq2pd xm4, [dataq + off2q - (mmsize/4) + 4*cpuflag(avx2)] + + mulpd xm3, xm1 + mulpd xm4, xm1 + + movhpd [outq + off1q*2], xm3 + movhpd [outq + off2q*2], xm4 + + addpd xm0, xm7 + + add off1q, 4 + sub off2q, 4 + + sub lenq, 2 + jg .loop_o_scalar + RET + +.even: +%if cpuflag(avx2) + addpd m0, [seq_tab_avx2] +%else + addpd m0, [seq_tab_sse2] +%endif + + mov off1d, lend + shr off1d, 1 + movd xm1, off1d + cvtdq2pd xm1, xm1 ; len/2 +%if cpuflag(avx2) + vbroadcastsd m1, xm1 +%else + shufpd m1, m1, 00b +%endif + subpd m0, m1 + +%if cpuflag(avx2) + movapd m7, [add_tab_avx2] +%else + movapd m7, [add_tab_sse2] +%endif + + lea off2q, [lenq*2] + lea off1q, [lenq*2 - mmsize/2] + sub lenq, mmsize/4 + + cmp lenq, mmsize/4 + jl .scalar_e + +.loop_e: + movapd m1, m6 + mulpd m2, m0, m0 + subpd m1, m2 +%if cpuflag(avx2) + vpermpd m2, m1, q0123 +%else + shufpd m2, m1, m1, 01b +%endif + + cvtdq2pd m3, [dataq + off1q] + cvtdq2pd m4, [dataq + off2q] + + mulpd m1, m3 + mulpd m2, m4 + + movupd [outq + off1q*2], m1 + movupd [outq + off2q*2], m2 + + addpd m0, m7 + add off2q, mmsize/2 + sub off1q, mmsize/2 + sub lenq, mmsize/4 + jge .loop_e + +.scalar_e: + subpd m0, m7 + movapd m7, [dec_tab_scalar] + subpd m0, m7 + subpd m0, m7 + subpd m0, m7 + + add off1q, (mmsize/2) + sub off2q, (mmsize/2) - 4 - 8*cpuflag(avx2) + + addpd xm0, [sub_tab] + +.loop_e_scalar: + movapd xm1, xm6 + mulpd xm2, xm0, xm0 + subpd xm1, xm2 + + cvtdq2pd m3, [dataq + off1q - 4] + cvtdq2pd m4, [dataq + off2q - 4] + + mulpd m3, m1 + mulpd m4, m1 + + movhpd [outq + off1q*2], xm3 + movhpd [outq + off2q*2], xm4 + + subpd xm0, xm7 + + add off2q, 4 + sub off1q, 4 + jge .loop_e_scalar + RET + +.one: + xorpd xm0, xm0 + movhpd [outq], xm0 +.end: + RET +%endmacro + +INIT_XMM sse2 +APPLY_WELCH_FN + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +APPLY_WELCH_FN +%endif diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc_init.c index 40fc29fc0f..df77c966c6 100644 --- a/libavcodec/x86/lpc.c +++ b/libavcodec/x86/lpc_init.c @@ -20,65 +20,19 @@ */ #include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/mem_internal.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/lpc.h" +void ff_lpc_apply_welch_window_sse2(const int32_t *data, int len, + double *w_data); +void ff_lpc_apply_welch_window_avx2(const int32_t *data, int len, + double *w_data); + DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; -DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 }; #if HAVE_SSE2_INLINE -static void lpc_apply_welch_window_sse2(const int32_t *data, int len, - double *w_data) -{ - double c = 2.0 / (len-1.0); - int n2 = len>>1; - x86_reg i = -n2*sizeof(int32_t); - x86_reg j = n2*sizeof(int32_t); - __asm__ volatile( - "movsd %4, %%xmm7 \n\t" - "movapd "MANGLE(pd_1)", %%xmm6 \n\t" - "movapd "MANGLE(pd_2)", %%xmm5 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "subpd %%xmm5, %%xmm7 \n\t" - "addsd %%xmm6, %%xmm7 \n\t" - "test $1, %5 \n\t" - "jz 2f \n\t" -#define WELCH(MOVPD, offset)\ - "1: \n\t"\ - "movapd %%xmm7, %%xmm1 \n\t"\ - "mulpd %%xmm1, %%xmm1 \n\t"\ - "movapd %%xmm6, %%xmm0 \n\t"\ - "subpd %%xmm1, %%xmm0 \n\t"\ - "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ - "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ - "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ - "mulpd %%xmm0, %%xmm2 \n\t"\ - "mulpd %%xmm1, %%xmm3 \n\t"\ - "movapd %%xmm2, (%2,%0,2) \n\t"\ - MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ - "subpd %%xmm5, %%xmm7 \n\t"\ - "sub $8, %1 \n\t"\ - "add $8, %0 \n\t"\ - "jl 1b \n\t"\ - - WELCH("movupd", -1) - "jmp 3f \n\t" - "2: \n\t" - WELCH("movapd", -2) - "3: \n\t" - :"+&r"(i), "+&r"(j) - :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len) - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm5", "%xmm6", "%xmm7") - ); -#undef WELCH -} - static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, double *autoc) { @@ -151,12 +105,16 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, av_cold void ff_lpc_init_x86(LPCContext *c) { -#if HAVE_SSE2_INLINE int cpu_flags = av_get_cpu_flags(); - if (INLINE_SSE2_SLOW(cpu_flags)) { - c->lpc_apply_welch_window = lpc_apply_welch_window_sse2; - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; - } -#endif /* HAVE_SSE2_INLINE */ +#if HAVE_SSE2_INLINE + if (INLINE_SSE2_SLOW(cpu_flags)) + c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; +#endif + + if (EXTERNAL_SSE2(cpu_flags)) + c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2; + + if (EXTERNAL_AVX2(cpu_flags)) + c->lpc_apply_welch_window = ff_lpc_apply_welch_window_avx2; } |