diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2014-05-30 00:59:15 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-05-30 00:59:15 +0200 |
commit | 40f3a87c10d5773eb66e09f4ed3d8197b1840863 (patch) | |
tree | bee50efb5beaff032de1fc4dcd797e3000b30c4c /libavcodec/x86 | |
parent | 5c6e94c42bcdc026071855a6b1749406b2456c8b (diff) | |
parent | 054013a0fc6f2b52c60cee3e051be8cc7f82cef3 (diff) | |
download | ffmpeg-40f3a87c10d5773eb66e09f4ed3d8197b1840863.tar.gz |
Merge commit '054013a0fc6f2b52c60cee3e051be8cc7f82cef3'
* commit '054013a0fc6f2b52c60cee3e051be8cc7f82cef3':
dsputil: Move APE-specific bits into apedsp
Conflicts:
libavcodec/arm/int_neon.S
libavcodec/x86/dsputil.asm
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/Makefile | 2 | ||||
-rw-r--r-- | libavcodec/x86/apedsp.asm | 157 | ||||
-rw-r--r-- | libavcodec/x86/apedsp_init.c | 47 | ||||
-rw-r--r-- | libavcodec/x86/dsputil.asm | 127 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_init.c | 13 |
5 files changed, 206 insertions, 140 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 3da0ae91be..6e40230120 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -29,6 +29,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o +OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o @@ -103,6 +104,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o +YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm new file mode 100644 index 0000000000..64b769f7d4 --- /dev/null +++ b/libavcodec/x86/apedsp.asm @@ -0,0 +1,157 @@ +;****************************************************************************** +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +%macro SCALARPRODUCT 0 +; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, +; int order, int mul) +cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul + shl orderq, 1 + movd m7, mulm +%if mmsize == 16 + pshuflw m7, m7, 0 + punpcklqdq m7, m7 +%else + pshufw m7, m7, 0 +%endif + pxor m6, m6 + add v1q, orderq + add v2q, orderq + add v3q, orderq + neg orderq +.loop: + movu m0, [v2q + orderq] + movu m1, [v2q + orderq + mmsize] + mova m4, [v1q + orderq] + mova m5, [v1q + orderq + mmsize] + movu m2, [v3q + orderq] + movu m3, [v3q + orderq + mmsize] + pmaddwd m0, m4 + pmaddwd m1, m5 + pmullw m2, m7 + pmullw m3, m7 + paddd m6, m0 + paddd m6, m1 + paddw m2, m4 + paddw m3, m5 + mova [v1q + orderq], m2 + mova [v1q + orderq + mmsize], m3 + add orderq, mmsize*2 + jl .loop + HADDD m6, m0 + movd eax, m6 + RET +%endmacro + +INIT_MMX mmxext +SCALARPRODUCT +INIT_XMM sse2 +SCALARPRODUCT + +%macro SCALARPRODUCT_LOOP 1 +align 16 +.loop%1: + sub orderq, mmsize*2 +%if %1 + mova m1, m4 + mova m4, [v2q + orderq] + mova m0, [v2q + orderq + mmsize] + palignr m1, m0, %1 + palignr m0, m4, %1 + mova m3, m5 + mova m5, [v3q + orderq] + mova m2, [v3q + orderq + mmsize] + palignr m3, m2, %1 + palignr m2, m5, %1 +%else + mova m0, [v2q + orderq] + mova m1, [v2q + orderq + mmsize] + mova m2, [v3q + orderq] + mova m3, [v3q + orderq + mmsize] +%endif + %define t0 [v1q + orderq] + %define t1 [v1q + orderq + mmsize] +%if ARCH_X86_64 + mova m8, t0 + mova m9, t1 + %define t0 m8 + %define t1 m9 +%endif + pmaddwd m0, t0 + pmaddwd m1, t1 + pmullw m2, m7 + pmullw m3, m7 + paddw m2, t0 + paddw m3, t1 + paddd m6, m0 + paddd m6, m1 + mova [v1q + orderq], m2 + mova [v1q + orderq + mmsize], m3 + jg .loop%1 +%if %1 + jmp .end +%endif +%endmacro + +; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, +; int order, int mul) +INIT_XMM ssse3 +cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul + shl orderq, 1 + movd m7, mulm + pshuflw m7, m7, 0 + punpcklqdq m7, m7 + pxor m6, m6 + mov r4d, v2d + and r4d, 15 + and v2q, ~15 + and v3q, ~15 + mova m4, [v2q + orderq] + mova m5, [v3q + orderq] + ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable) + cmp r4d, 0 + je .loop0 + cmp r4d, 2 + je .loop2 + cmp r4d, 4 + je .loop4 + cmp r4d, 6 + je .loop6 + cmp r4d, 8 + je .loop8 + cmp r4d, 10 + je .loop10 + cmp r4d, 12 + je .loop12 +SCALARPRODUCT_LOOP 14 +SCALARPRODUCT_LOOP 12 +SCALARPRODUCT_LOOP 10 +SCALARPRODUCT_LOOP 8 +SCALARPRODUCT_LOOP 6 +SCALARPRODUCT_LOOP 4 +SCALARPRODUCT_LOOP 2 +SCALARPRODUCT_LOOP 0 +.end: + HADDD m6, m0 + movd eax, m6 + RET diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c new file mode 100644 index 0000000000..2e90b2d04c --- /dev/null +++ b/libavcodec/x86/apedsp_init.c @@ -0,0 +1,47 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/apedsp.h" + +int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, + const int16_t *v3, + int order, int mul); +int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, + const int16_t *v3, + int order, int mul); +int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, + const int16_t *v3, + int order, int mul); + +av_cold void ff_apedsp_init_x86(APEDSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMXEXT(cpu_flags)) + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; + + if (EXTERNAL_SSE2(cpu_flags)) + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; + + if (EXTERNAL_SSSE3(cpu_flags) && + !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; +} diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index eae71ab28c..2209c52541 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -53,45 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order emms %endif RET - -; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, -; int order, int mul) -cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm -%if mmsize == 16 - pshuflw m7, m7, 0 - punpcklqdq m7, m7 -%else - pshufw m7, m7, 0 -%endif - pxor m6, m6 - add v1q, orderq - add v2q, orderq - add v3q, orderq - neg orderq -.loop: - movu m0, [v2q + orderq] - movu m1, [v2q + orderq + mmsize] - mova m4, [v1q + orderq] - mova m5, [v1q + orderq + mmsize] - movu m2, [v3q + orderq] - movu m3, [v3q + orderq + mmsize] - pmaddwd m0, m4 - pmaddwd m1, m5 - pmullw m2, m7 - pmullw m3, m7 - paddd m6, m0 - paddd m6, m1 - paddw m2, m4 - paddw m3, m5 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - add orderq, mmsize*2 - jl .loop - HADDD m6, m0 - movd eax, m6 - RET %endmacro INIT_MMX mmxext @@ -99,94 +60,6 @@ SCALARPRODUCT INIT_XMM sse2 SCALARPRODUCT -%macro SCALARPRODUCT_LOOP 1 -align 16 -.loop%1: - sub orderq, mmsize*2 -%if %1 - mova m1, m4 - mova m4, [v2q + orderq] - mova m0, [v2q + orderq + mmsize] - palignr m1, m0, %1 - palignr m0, m4, %1 - mova m3, m5 - mova m5, [v3q + orderq] - mova m2, [v3q + orderq + mmsize] - palignr m3, m2, %1 - palignr m2, m5, %1 -%else - mova m0, [v2q + orderq] - mova m1, [v2q + orderq + mmsize] - mova m2, [v3q + orderq] - mova m3, [v3q + orderq + mmsize] -%endif - %define t0 [v1q + orderq] - %define t1 [v1q + orderq + mmsize] -%if ARCH_X86_64 - mova m8, t0 - mova m9, t1 - %define t0 m8 - %define t1 m9 -%endif - pmaddwd m0, t0 - pmaddwd m1, t1 - pmullw m2, m7 - pmullw m3, m7 - paddw m2, t0 - paddw m3, t1 - paddd m6, m0 - paddd m6, m1 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - jg .loop%1 -%if %1 - jmp .end -%endif -%endmacro - -; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, -; int order, int mul) -INIT_XMM ssse3 -cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm - pshuflw m7, m7, 0 - punpcklqdq m7, m7 - pxor m6, m6 - mov r4d, v2d - and r4d, 15 - and v2q, ~15 - and v3q, ~15 - mova m4, [v2q + orderq] - mova m5, [v3q + orderq] - ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable) - cmp r4d, 0 - je .loop0 - cmp r4d, 2 - je .loop2 - cmp r4d, 4 - je .loop4 - cmp r4d, 6 - je .loop6 - cmp r4d, 8 - je .loop8 - cmp r4d, 10 - je .loop10 - cmp r4d, 12 - je .loop12 -SCALARPRODUCT_LOOP 14 -SCALARPRODUCT_LOOP 12 -SCALARPRODUCT_LOOP 10 -SCALARPRODUCT_LOOP 8 -SCALARPRODUCT_LOOP 6 -SCALARPRODUCT_LOOP 4 -SCALARPRODUCT_LOOP 2 -SCALARPRODUCT_LOOP 0 -.end: - HADDD m6, m0 - movd eax, m6 - RET - ;----------------------------------------------------------------------------- ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c index 9fc92fe799..ebbf97f637 100644 --- a/libavcodec/x86/dsputil_init.c +++ b/libavcodec/x86/dsputil_init.c @@ -79,15 +79,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2, int order); int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order); -int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); @@ -560,7 +551,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; #endif /* HAVE_MMXEXT_EXTERNAL */ } @@ -600,7 +590,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, #if HAVE_SSE2_EXTERNAL c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; if (cpu_flags & AV_CPU_FLAG_ATOM) { c->vector_clip_int32 = ff_vector_clip_int32_int_sse2; } else { @@ -615,8 +604,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, int cpu_flags, unsigned high_bit_depth) { #if HAVE_SSSE3_EXTERNAL - if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; c->bswap_buf = ff_bswap32_buf_ssse3; #endif /* HAVE_SSSE3_EXTERNAL */ } |