diff options
author | Diego Biurrun <diego@biurrun.de> | 2013-12-29 02:32:16 +0100 |
---|---|---|
committer | Diego Biurrun <diego@biurrun.de> | 2014-05-29 06:41:15 -0700 |
commit | 054013a0fc6f2b52c60cee3e051be8cc7f82cef3 (patch) | |
tree | 87098f4b0443359b7109066486c15fdaad09dddb /libavcodec/x86/dsputil.asm | |
parent | 256da0770e495176d1b2699ec6e9c7993c2a6d7b (diff) | |
download | ffmpeg-054013a0fc6f2b52c60cee3e051be8cc7f82cef3.tar.gz |
dsputil: Move APE-specific bits into apedsp
Diffstat (limited to 'libavcodec/x86/dsputil.asm')
-rw-r--r-- | libavcodec/x86/dsputil.asm | 137 |
1 file changed, 0 insertions, 137 deletions
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 684f09b7fc..b5d6d3cc65 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order paddd m2, m0 movd eax, m2 RET - -; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, -; int order, int mul) -cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm -%if mmsize == 16 - pshuflw m7, m7, 0 - punpcklqdq m7, m7 -%else - pshufw m7, m7, 0 -%endif - pxor m6, m6 - add v1q, orderq - add v2q, orderq - add v3q, orderq - neg orderq -.loop: - movu m0, [v2q + orderq] - movu m1, [v2q + orderq + mmsize] - mova m4, [v1q + orderq] - mova m5, [v1q + orderq + mmsize] - movu m2, [v3q + orderq] - movu m3, [v3q + orderq + mmsize] - pmaddwd m0, m4 - pmaddwd m1, m5 - pmullw m2, m7 - pmullw m3, m7 - paddd m6, m0 - paddd m6, m1 - paddw m2, m4 - paddw m3, m5 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - add orderq, mmsize*2 - jl .loop -%if mmsize == 16 - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e -%else - pshufw m0, m6, 0x4e -%endif - paddd m6, m0 - movd eax, m6 - RET %endmacro INIT_MMX mmxext @@ -106,97 +60,6 @@ SCALARPRODUCT INIT_XMM sse2 SCALARPRODUCT -%macro SCALARPRODUCT_LOOP 1 -align 16 -.loop%1: - sub orderq, mmsize*2 -%if %1 - mova m1, m4 - mova m4, [v2q + orderq] - mova m0, [v2q + orderq + mmsize] - palignr m1, m0, %1 - palignr m0, m4, %1 - mova m3, m5 - mova m5, [v3q + orderq] - mova m2, [v3q + orderq + mmsize] - palignr m3, m2, %1 - palignr m2, m5, %1 -%else - mova m0, [v2q + orderq] - mova m1, [v2q + orderq + mmsize] - mova m2, [v3q + orderq] - mova m3, [v3q + orderq + mmsize] -%endif - %define t0 [v1q + orderq] - %define t1 [v1q + orderq + mmsize] -%if ARCH_X86_64 - mova m8, t0 - mova m9, t1 - %define t0 m8 - %define t1 m9 -%endif - pmaddwd m0, t0 - pmaddwd m1, t1 - pmullw m2, m7 - pmullw m3, m7 - paddw m2, t0 - paddw m3, t1 - paddd 
m6, m0 - paddd m6, m1 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - jg .loop%1 -%if %1 - jmp .end -%endif -%endmacro - -; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, -; int order, int mul) -INIT_XMM ssse3 -cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm - pshuflw m7, m7, 0 - punpcklqdq m7, m7 - pxor m6, m6 - mov r4d, v2d - and r4d, 15 - and v2q, ~15 - and v3q, ~15 - mova m4, [v2q + orderq] - mova m5, [v3q + orderq] - ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable) - cmp r4d, 0 - je .loop0 - cmp r4d, 2 - je .loop2 - cmp r4d, 4 - je .loop4 - cmp r4d, 6 - je .loop6 - cmp r4d, 8 - je .loop8 - cmp r4d, 10 - je .loop10 - cmp r4d, 12 - je .loop12 -SCALARPRODUCT_LOOP 14 -SCALARPRODUCT_LOOP 12 -SCALARPRODUCT_LOOP 10 -SCALARPRODUCT_LOOP 8 -SCALARPRODUCT_LOOP 6 -SCALARPRODUCT_LOOP 4 -SCALARPRODUCT_LOOP 2 -SCALARPRODUCT_LOOP 0 -.end: - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e - paddd m6, m0 - movd eax, m6 - RET - ;----------------------------------------------------------------------------- ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, |