author     Diego Biurrun <diego@biurrun.de>  2013-12-29 02:32:16 +0100
committer  Diego Biurrun <diego@biurrun.de>  2014-05-29 06:41:15 -0700
commit     054013a0fc6f2b52c60cee3e051be8cc7f82cef3 (patch)
tree       87098f4b0443359b7109066486c15fdaad09dddb /libavcodec/x86
parent     256da0770e495176d1b2699ec6e9c7993c2a6d7b (diff)
dsputil: Move APE-specific bits into apedsp
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/Makefile       |   2
-rw-r--r--  libavcodec/x86/apedsp.asm     | 167
-rw-r--r--  libavcodec/x86/apedsp_init.c  |  47
-rw-r--r--  libavcodec/x86/dsputil.asm    | 137
-rw-r--r--  libavcodec/x86/dsputil_init.c |  13
5 files changed, 216 insertions(+), 150 deletions(-)
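
For reference, the function moved by this commit computes a combined dot product and in-place multiply-add over int16 vectors. The sketch below is a plain-C equivalent of that operation; only the signature is taken from the prototypes in the diff, the body is an illustrative sketch rather than the code being moved:

#include <stdint.h>

/* Plain-C sketch of scalarproduct_and_madd_int16(): accumulate the dot
 * product of v1 and v2 while updating v1 in place with mul * v3. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int32_t res = 0;

    while (order--) {
        res   += *v1 * *v2++;   /* dot-product term              */
        *v1++ += mul * *v3++;   /* in-place multiply-add update  */
    }
    return res;
}

The SIMD versions below implement exactly this loop, two registers' worth of elements per iteration.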
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 8830a22a8f..10242269c2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm
new file mode 100644
index 0000000000..d721ebda6b
--- /dev/null
+++ b/libavcodec/x86/apedsp.asm
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+    shl     orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add     v1q, orderq
+    add     v2q, orderq
+    add     v3q, orderq
+    neg     orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw  m0, m6, 0x4e
+%endif
+    paddd   m6, m0
+    movd    eax, m6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    %define t0 [v1q + orderq]
+    %define t1 [v1q + orderq + mmsize]
+%if ARCH_X86_64
+    mova    m8, t0
+    mova    m9, t1
+    %define t0 m8
+    %define t1 m9
+%endif
+    pmaddwd m0, t0
+    pmaddwd m1, t1
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, t0
+    paddw   m3, t1
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+    shl     orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov     r4d, v2d
+    and     r4d, 15
+    and     v2q, ~15
+    and     v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp     r4d, 0
+    je .loop0
+    cmp     r4d, 2
+    je .loop2
+    cmp     r4d, 4
+    je .loop4
+    cmp     r4d, 6
+    je .loop6
+    cmp     r4d, 8
+    je .loop8
+    cmp     r4d, 10
+    je .loop10
+    cmp     r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd   m6, m0
+    movd    eax, m6
+    RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c
new file mode 100644
index 0000000000..f692c2b9b6
--- /dev/null
+++ b/libavcodec/x86/apedsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+
+    if (EXTERNAL_SSSE3(cpu_flags) &&
+        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 684f09b7fc..b5d6d3cc65 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m0
     movd    eax, m2
     RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl     orderq, 1
-    movd    m7, mulm
-%if mmsize == 16
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-%else
-    pshufw  m7, m7, 0
-%endif
-    pxor    m6, m6
-    add     v1q, orderq
-    add     v2q, orderq
-    add     v3q, orderq
-    neg     orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    mova    m4, [v1q + orderq]
-    mova    m5, [v1q + orderq + mmsize]
-    movu    m2, [v3q + orderq]
-    movu    m3, [v3q + orderq + mmsize]
-    pmaddwd m0, m4
-    pmaddwd m1, m5
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddd   m6, m0
-    paddd   m6, m1
-    paddw   m2, m4
-    paddw   m3, m5
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    add     orderq, mmsize*2
-    jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
-    movd    eax, m6
-    RET
 %endmacro
 
 INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
-    sub     orderq, mmsize*2
-%if %1
-    mova    m1, m4
-    mova    m4, [v2q + orderq]
-    mova    m0, [v2q + orderq + mmsize]
-    palignr m1, m0, %1
-    palignr m0, m4, %1
-    mova    m3, m5
-    mova    m5, [v3q + orderq]
-    mova    m2, [v3q + orderq + mmsize]
-    palignr m3, m2, %1
-    palignr m2, m5, %1
-%else
-    mova    m0, [v2q + orderq]
-    mova    m1, [v2q + orderq + mmsize]
-    mova    m2, [v3q + orderq]
-    mova    m3, [v3q + orderq + mmsize]
-%endif
-    %define t0 [v1q + orderq]
-    %define t1 [v1q + orderq + mmsize]
-%if ARCH_X86_64
-    mova    m8, t0
-    mova    m9, t1
-    %define t0 m8
-    %define t1 m9
-%endif
-    pmaddwd m0, t0
-    pmaddwd m1, t1
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddw   m2, t0
-    paddw   m3, t1
-    paddd   m6, m0
-    paddd   m6, m1
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    jg .loop%1
-%if %1
-    jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
-    shl     orderq, 1
-    movd    m7, mulm
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-    pxor    m6, m6
-    mov     r4d, v2d
-    and     r4d, 15
-    and     v2q, ~15
-    and     v3q, ~15
-    mova    m4, [v2q + orderq]
-    mova    m5, [v3q + orderq]
-    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
-    cmp     r4d, 0
-    je .loop0
-    cmp     r4d, 2
-    je .loop2
-    cmp     r4d, 4
-    je .loop4
-    cmp     r4d, 6
-    je .loop6
-    cmp     r4d, 8
-    je .loop8
-    cmp     r4d, 10
-    je .loop10
-    cmp     r4d, 12
-    je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
-    movd    eax, m6
-    RET
-
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 10fa166db4..9b0788ff73 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                     int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
-                                               const int16_t *v3,
-                                               int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3,
-                                             int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul);
 
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
 
     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
 
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 #if HAVE_SSE2_EXTERNAL
     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 
     if (cpu_flags & AV_CPU_FLAG_ATOM) {
         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
     } else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                        int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif /* HAVE_SSSE3_EXTERNAL */
 }
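
The new ff_apedsp_init_x86() only fills in APEDSPContext function pointers; the generic apedsp code (outside this diff) is expected to install the C fallback first and then call into the per-architecture init. The sketch below shows that hand-off and a caller going through the context; apedsp_init_sketch() and apply_filter_step() are hypothetical names, and scalarproduct_and_madd_int16_c refers to the C sketch above — only APEDSPContext, the scalarproduct_and_madd_int16 member and ff_apedsp_init_x86() come from this commit.

#include "config.h"
#include "libavutil/attributes.h"
#include "libavcodec/apedsp.h"

/* Hypothetical wiring sketch: install the C fallback, then let the x86 init
 * override it when suitable CPU flags are present. */
static av_cold void apedsp_init_sketch(APEDSPContext *c)
{
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; /* C fallback from the sketch above */

    if (ARCH_X86)
        ff_apedsp_init_x86(c);  /* MMXEXT/SSE2/SSSE3 overrides added by this commit */
}

/* One adaptive-filter step as a caller would issue it through the context. */
static int32_t apply_filter_step(APEDSPContext *c, int16_t *delay,
                                 const int16_t *coeffs, const int16_t *adapt,
                                 int order, int sign)
{
    return c->scalarproduct_and_madd_int16(delay, coeffs, adapt, order, sign);
}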