diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-02-23 19:48:58 +0100 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-02-23 15:50:06 -0800 |
commit | 34454c761f01275d4adaf40df6d70a59011c4a6c (patch) | |
tree | a25a23c028ddee97c1195567f855ce064bdbe916 | |
parent | 2e74a5abc2fda6cfbc86589852d6194d502332cb (diff) | |
download | ffmpeg-34454c761f01275d4adaf40df6d70a59011c4a6c.tar.gz |
SBR DSP x86: implement SSE sbr_sum_square_sse
The 32-bit targets were compiled with -mfpmath=sse so that the C reference is a fair comparison.
sbr_sum_square timings (c = cycles):
C/32bits: 82c (unrolled)/102c
C/64bits: 69c (unrolled)/82c
SSE/32bits: 42c
SSE/64bits: 31c
Use of SSE4.1 dpps to perform the final sum is slower.
Not unrolling the loop to perform 8 operations per iteration costs 10 more cycles.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
-rw-r--r-- | libavcodec/sbrdsp.c | 2 | ||||
-rw-r--r-- | libavcodec/sbrdsp.h | 1 | ||||
-rw-r--r-- | libavcodec/x86/Makefile | 2 | ||||
-rw-r--r-- | libavcodec/x86/sbrdsp.asm | 74 | ||||
-rw-r--r-- | libavcodec/x86/sbrdsp_init.c | 37 |
5 files changed, 116 insertions, 0 deletions
diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c index aef894a51d..f942759aa7 100644 --- a/libavcodec/sbrdsp.c +++ b/libavcodec/sbrdsp.c @@ -238,4 +238,6 @@ av_cold void ff_sbrdsp_init(SBRDSPContext *s) if (ARCH_ARM) ff_sbrdsp_init_arm(s); + if (HAVE_MMX) + ff_sbrdsp_init_x86(s); } diff --git a/libavcodec/sbrdsp.h b/libavcodec/sbrdsp.h index ee5d5a0972..fe91957ce4 100644 --- a/libavcodec/sbrdsp.h +++ b/libavcodec/sbrdsp.h @@ -46,5 +46,6 @@ extern const float ff_sbr_noise_table[][2]; void ff_sbrdsp_init(SBRDSPContext *s); void ff_sbrdsp_init_arm(SBRDSPContext *s); +void ff_sbrdsp_init_x86(SBRDSPContext *s); #endif diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index fc88433783..e64697aa2b 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -47,6 +47,8 @@ YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o +MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o +YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm new file mode 100644 index 0000000000..71471bd5ab --- /dev/null +++ b/libavcodec/x86/sbrdsp.asm @@ -0,0 +1,74 @@ +;****************************************************************************** +;* AAC Spectral Band Replication decoding functions +;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +;SECTION_RODATA +SECTION .text + +INIT_XMM sse +cglobal sbr_sum_square, 2, 3, 6 + mov r2, r1 + xorps m0, m0 + xorps m1, m1 + sar r2, 3 + jz .prepare +.loop: + movu m2, [r0 + 0] + movu m3, [r0 + 16] + movu m4, [r0 + 32] + movu m5, [r0 + 48] + mulps m2, m2 + mulps m3, m3 + mulps m4, m4 + mulps m5, m5 + addps m0, m2 + addps m1, m3 + addps m0, m4 + addps m1, m5 + add r0, 64 + dec r2 + jnz .loop +.prepare: + and r1, 7 + sar r1, 1 + jz .end +; len is a multiple of 2, thus there are at least 4 elements to process +.endloop: + movu m2, [r0] + add r0, 16 + mulps m2, m2 + dec r1 + addps m0, m2 + jnz .endloop +.end: + addps m0, m1 + movhlps m2, m0 + addps m0, m2 + movss m1, m0 + shufps m0, m0, 1 + addss m0, m1 +%if ARCH_X86_64 == 0 + movd r0m, m0 + fld dword r0m +%endif + RET diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c new file mode 100644 index 0000000000..313f492054 --- /dev/null +++ b/libavcodec/x86/sbrdsp_init.c @@ -0,0 +1,37 @@ +/* + * AAC Spectral Band Replication decoding functions + * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavcodec/sbrdsp.h" + +float ff_sbr_sum_square_sse(float (*x)[2], int n); + +void ff_sbrdsp_init_x86(SBRDSPContext *s) +{ + if (HAVE_YASM) { + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & AV_CPU_FLAG_SSE) { + s->sum_square = ff_sbr_sum_square_sse; + } + } +} |