author    | Muhammad Faiz <mfcc64@gmail.com> | 2016-06-04 14:33:05 +0700
committer | Muhammad Faiz <mfcc64@gmail.com> | 2016-06-08 16:09:43 +0700
commit    | 1e69ac9246be8c9a1bf595e7fe949df5bc541c55 (patch)
tree      | 1a27b5398516a8ccbaeb71048a17200f621f6b30 /libavfilter/x86/avf_showcqt.asm
parent    | 49b024663501320310ec729f3bd15d0420bd9e59 (diff)
download  | ffmpeg-1e69ac9246be8c9a1bf595e7fe949df5bc541c55.tar.gz
avfilter/avf_showcqt: cqt_calc optimization on x86
on x86_64:
        time    PSNR
plain   3.303   inf
SSE     1.649   107.087535
SSE3    1.632   107.087535
AVX     1.409   106.986771
FMA3    1.265   107.108437
on x86_32 (PSNR compared to x86_64 plain):
        time    PSNR
plain   7.225   103.951979
SSE     1.827   105.859282
SSE3    1.819   105.859282
AVX     1.533   105.997661
FMA3    1.384   105.885377
FMA4 was not tested (no FMA4-capable processor was available).
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
Diffstat (limited to 'libavfilter/x86/avf_showcqt.asm')
-rw-r--r-- | libavfilter/x86/avf_showcqt.asm | 206
1 file changed, 206 insertions, 0 deletions
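For context: the new file provides SIMD versions of the `cqt_calc` callback in `libavfilter/avf_showcqt.c`. For every output bin, the scalar reference accumulates a coefficient-weighted sum over a run of FFT bins plus their mirrored counterparts at `fft_len - i`, then separates the stereo channels and stores squared magnitudes. Below is a close C paraphrase of that reference loop (types from `libavcodec/avfft.h`; this is an illustration from the filter source's structure, not the verbatim upstream code):

```c
#include "libavcodec/avfft.h"   /* FFTSample, FFTComplex */

typedef struct Coeffs {
    FFTSample *val;             /* weights; mirrors the asm "Coeffs" struc */
    int start, len;
} Coeffs;

static void cqt_calc(FFTComplex *dst, const FFTComplex *src,
                     const Coeffs *coeffs, int len, int fft_len)
{
    for (int k = 0; k < len; k++) {
        FFTComplex a = { 0, 0 }, b = { 0, 0 };

        for (int x = 0; x < coeffs[k].len; x++) {
            FFTSample u = coeffs[k].val[x];
            int i = coeffs[k].start + x;   /* forward bin  */
            int j = fft_len - i;           /* mirrored bin */
            a.re += u * src[i].re;
            a.im += u * src[i].im;
            b.re += u * src[j].re;
            b.im += u * src[j].im;
        }

        /* separate left/right channels and store squared magnitudes */
        {
            FFTComplex l = { a.re + b.re, a.im - b.im };
            FFTComplex r = { b.im + a.im, b.re - a.re };
            dst[k].re = l.re * l.re + l.im * l.im;
            dst[k].im = r.re * r.re + r.im * r.im;
        }
    }
}
```

The assembly below keeps the same structure: `CQT_CALC` performs the inner accumulation mmsize/4 coefficients at a time (4 for XMM, 8 for YMM), `CQT_SEPARATE` does the channel separation and horizontal sums, and the x86_64 variant additionally interleaves two output bins per iteration.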
diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm
new file mode 100644
index 0000000000..6dac0a7959
--- /dev/null
+++ b/libavfilter/x86/avf_showcqt.asm
@@ -0,0 +1,206 @@
+;*****************************************************************************
+;* x86-optimized functions for showcqt filter
+;*
+;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+%define pointer resq
+%else
+%define pointer resd
+%endif
+
+struc Coeffs
+    .val:    pointer 1
+    .start:  resd 1
+    .len:    resd 1
+    .sizeof:
+endstruc
+
+%macro EMULATE_HADDPS 3 ; dst, src, tmp
+%if cpuflag(sse3)
+    haddps  %1, %2
+%else
+    movaps  %3, %1
+    shufps  %1, %2, q2020
+    shufps  %3, %2, q3131
+    addps   %1, %3
+%endif
+%endmacro ; EMULATE_HADDPS
+
+%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
+%if cpuflag(fma3) || cpuflag(fma4)
+    fmaddps %1, %2, %3, %4
+%else
+    mulps   %5, %2, %3
+    addps   %1, %4, %5
+%endif
+%endmacro ; EMULATE_FMADDPS
+
+%macro CQT_CALC 9
+; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
+; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
+    mov     id, xd
+    add     id, [coeffsq + Coeffs.start + %9]
+    movaps  m%5, [srcq + 8 * iq]
+    movaps  m%7, [srcq + 8 * iq + mmsize]
+    shufps  m%6, m%5, m%7, q3131
+    shufps  m%5, m%5, m%7, q2020
+    sub     id, fft_lend
+    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
+    neg     id
+    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
+    movups  m%5, [srcq + 8 * iq - mmsize + 8]
+    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
+    %if mmsize == 32
+    vperm2f128 m%5, m%5, m%5, 1
+    vperm2f128 m%7, m%7, m%7, 1
+    %endif
+    shufps  m%6, m%5, m%7, q1313
+    shufps  m%5, m%5, m%7, q0202
+    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
+    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
+%endmacro ; CQT_CALC
+
+%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
+    addps   m%5, m%4, m%2
+    subps   m%6, m%3, m%1
+    addps   m%1, m%3
+    subps   m%2, m%4
+    EMULATE_HADDPS m%5, m%6, m%3
+    EMULATE_HADDPS m%1, m%2, m%3
+    EMULATE_HADDPS m%1, m%5, m%2
+    %if mmsize == 32
+    vextractf128 xmm%2, m%1, 1
+    addps   xmm%1, xmm%2
+    %endif
+%endmacro ; CQT_SEPARATE
+
+%macro DECLARE_CQT_CALC 0
+; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
+%if ARCH_X86_64
+cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
+align 16
+.loop_k:
+    mov     xd, [coeffsq + Coeffs.len]
+    xorps   m0, m0
+    movaps  m1, m0
+    movaps  m2, m0
+    mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
+    movaps  m3, m0
+    movaps  m8, m0
+    cmp     coeffs_lend, xd
+    movaps  m9, m0
+    movaps  m10, m0
+    movaps  m11, m0
+    cmova   coeffs_lend, xd
+    xor     xd, xd
+    test    coeffs_lend, coeffs_lend
+    jz      .check_loop_b
+    mov     coeffs_valq, [coeffsq + Coeffs.val]
+    mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
+align 16
+.loop_ab:
+    movaps  m7, [coeffs_valq + 4 * xq]
+    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+    movaps  m7, [coeffs_val2q + 4 * xq]
+    CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+    add     xd, mmsize/4
+    cmp     xd, coeffs_lend
+    jb      .loop_ab
+.check_loop_b:
+    cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+    jae     .check_loop_a
+align 16
+.loop_b:
+    movaps  m7, [coeffs_val2q + 4 * xq]
+    CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+    add     xd, mmsize/4
+    cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+    jb      .loop_b
+.loop_end:
+    CQT_SEPARATE 0, 1, 2, 3, 4, 5
+    CQT_SEPARATE 8, 9, 10, 11, 4, 5
+    mulps   xmm0, xmm0
+    mulps   xmm8, xmm8
+    EMULATE_HADDPS xmm0, xmm8, xmm1
+    movaps  [dstq], xmm0
+    sub     lend, 2
+    lea     dstq, [dstq + 16]
+    lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
+    jnz     .loop_k
+    REP_RET
+align 16
+.check_loop_a:
+    cmp     xd, [coeffsq + Coeffs.len]
+    jae     .loop_end
+align 16
+.loop_a:
+    movaps  m7, [coeffs_valq + 4 * xq]
+    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+    add     xd, mmsize/4
+    cmp     xd, [coeffsq + Coeffs.len]
+    jb      .loop_a
+    jmp     .loop_end
+%else
+cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
+%define fft_lend r4m
+align 16
+.loop_k:
+    mov     xd, [coeffsq + Coeffs.len]
+    xorps   m0, m0
+    movaps  m1, m0
+    movaps  m2, m0
+    movaps  m3, m0
+    test    xd, xd
+    jz      .store
+    mov     coeffs_valq, [coeffsq + Coeffs.val]
+    xor     xd, xd
+align 16
+.loop_x:
+    movaps  m7, [coeffs_valq + 4 * xq]
+    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+    add     xd, mmsize/4
+    cmp     xd, [coeffsq + Coeffs.len]
+    jb      .loop_x
+    CQT_SEPARATE 0, 1, 2, 3, 4, 5
+    mulps   xmm0, xmm0
+    EMULATE_HADDPS xmm0, xmm0, xmm1
+.store:
+    movlps  [dstq], xmm0
+    sub     lend, 1
+    lea     dstq, [dstq + 8]
+    lea     coeffsq, [coeffsq + Coeffs.sizeof]
+    jnz     .loop_k
+    REP_RET
+%endif ; ARCH_X86_64
+%endmacro ; DECLARE_CQT_CALC
+
+INIT_XMM sse
+DECLARE_CQT_CALC
+INIT_XMM sse3
+DECLARE_CQT_CALC
+INIT_YMM avx
+DECLARE_CQT_CALC
+INIT_YMM fma3
+DECLARE_CQT_CALC
+INIT_XMM fma4
+DECLARE_CQT_CALC
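A note on the compatibility macros: `EMULATE_HADDPS` and `EMULATE_FMADDPS` let a single code path serve every ISA level. When SSE3 (or FMA3/FMA4) is available they emit the native `haddps`/`fmaddps` instruction; otherwise they synthesize the same result from plain SSE. As an illustration, here is a minimal intrinsics sketch of the `haddps` fallback (hypothetical helper name, not part of the patch):

```c
#include <xmmintrin.h>   /* SSE:  _mm_shuffle_ps, _mm_add_ps */
#include <pmmintrin.h>   /* SSE3: _mm_hadd_ps, for comparison */

/* Plain-SSE equivalent of "haddps a, b": gather the even and odd lanes of
 * the a:b pair with two shuffles, then add them pairwise. This mirrors what
 * EMULATE_HADDPS expands to when cpuflag(sse3) is false. */
static __m128 haddps_fallback(__m128 a, __m128 b)
{
    __m128 even = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); /* a0 a2 b0 b2 */
    __m128 odd  = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); /* a1 a3 b1 b3 */
    return _mm_add_ps(even, odd);  /* same lanes as _mm_hadd_ps(a, b) */
}
```

The FMA fallback likewise splits `fmaddps` into a separate `mulps`/`addps` pair. Because true FMA rounds only once per multiply-add, the FMA3 build can produce slightly different results than the SSE builds, consistent with the small PSNR differences in the benchmark tables above.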