author     Loren Merritt <lorenm@u.washington.edu>   2009-12-05 15:09:10 +0000
committer  Loren Merritt <lorenm@u.washington.edu>   2009-12-05 15:09:10 +0000
commit     b1159ad92818cd8f0885d252b0800f5960fe7241 (patch)
tree       a9d4177c61a9a89b4ac78a4a5b8a95f962a858a0 /libavcodec/x86/dsputil_yasm.asm
parent     e470691aa8798004bf5589871865a765cb791014 (diff)
download   ffmpeg-b1159ad92818cd8f0885d252b0800f5960fe7241.tar.gz
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to
ssse3 cachesplit avoidance and I haven't written the full gamut of other
cachesplit modes.)
9-123% faster ape decoding on G4.

Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/dsputil_yasm.asm')
-rw-r--r--   libavcodec/x86/dsputil_yasm.asm   164
1 file changed, 127 insertions, 37 deletions
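For context, here is a plain-C sketch of what the two routines touched by this diff compute, written against the prototypes given in the assembly comments below. The "_sketch" function names and the exact placement of the shift are illustrative assumptions, not code from this commit.

#include <stdint.h>

/* Dot product of v1 and v2; the asm takes a 'shift' argument, assumed here
 * to right-shift each 32-bit product. */
static int32_t scalarproduct_int16_sketch(const int16_t *v1, const int16_t *v2,
                                          int order, int shift)
{
    int32_t res = 0;
    while (order--)
        res += (*v1++ * *v2++) >> shift;
    return res;
}

/* Dot product of v1 and v2 while simultaneously updating v1 in place:
 * v1[i] += mul * v3[i].  The product uses the old value of v1[i], and the
 * update wraps in 16 bits, matching pmaddwd/pmullw/paddw in the asm. */
static int32_t scalarproduct_and_madd_int16_sketch(int16_t *v1, const int16_t *v2,
                                                   const int16_t *v3,
                                                   int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

In the asm, the SCALARPRODUCT macro accumulates the 32-bit partial sums with pmaddwd and reduces them at the end with movhlps/pshuflw (or pshufw for MMX) before moving the result into eax.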
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index c8a4230374..96080be349 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -100,43 +100,7 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
%macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
- shl orderq, 1
- add v1q, orderq
- add v2q, orderq
- neg orderq
-.loop:
- movu m0, [v2q + orderq]
- movu m1, [v2q + orderq + mmsize]
- paddw m0, [v1q + orderq]
- paddw m1, [v1q + orderq + mmsize]
- mova [v1q + orderq], m0
- mova [v1q + orderq + mmsize], m1
- add orderq, mmsize*2
- jl .loop
- REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
- shl orderq, 1
- add v1q, orderq
- add v2q, orderq
- neg orderq
-.loop:
- movu m2, [v2q + orderq]
- movu m3, [v2q + orderq + mmsize]
- mova m0, [v1q + orderq]
- mova m1, [v1q + orderq + mmsize]
- psubw m0, m2
- psubw m1, m3
- mova [v1q + orderq], m0
- mova [v1q + orderq + mmsize], m1
- add orderq, mmsize*2
- jl .loop
- REP_RET
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
shl orderq, 1
add v1q, orderq
@@ -165,6 +129,51 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
paddd m2, m0
movd eax, m2
RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+%if mmsize == 16
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+%else
+ pshufw m7, m7, 0
+%endif
+ pxor m6, m6
+ add v1q, orderq
+ add v2q, orderq
+ add v3q, orderq
+ neg orderq
+.loop:
+ movu m0, [v2q + orderq]
+ movu m1, [v2q + orderq + mmsize]
+ mova m4, [v1q + orderq]
+ mova m5, [v1q + orderq + mmsize]
+ movu m2, [v3q + orderq]
+ movu m3, [v3q + orderq + mmsize]
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmullw m2, m7
+ pmullw m3, m7
+ paddd m6, m0
+ paddd m6, m1
+ paddw m2, m4
+ paddw m3, m5
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ add orderq, mmsize*2
+ jl .loop
+%if mmsize == 16
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+%else
+ pshufw m0, m6, 0x4e
+%endif
+ paddd m6, m0
+ movd eax, m6
+ RET
%endmacro
INIT_MMX
@@ -172,6 +181,87 @@ SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+ sub orderq, mmsize*2
+%if %1
+ mova m1, m4
+ mova m4, [v2q + orderq]
+ mova m0, [v2q + orderq + mmsize]
+ palignr m1, m0, %1
+ palignr m0, m4, %1
+ mova m3, m5
+ mova m5, [v3q + orderq]
+ mova m2, [v3q + orderq + mmsize]
+ palignr m3, m2, %1
+ palignr m2, m5, %1
+%else
+ mova m0, [v2q + orderq]
+ mova m1, [v2q + orderq + mmsize]
+ mova m2, [v3q + orderq]
+ mova m3, [v3q + orderq + mmsize]
+%endif
+ pmaddwd m0, [v1q + orderq]
+ pmaddwd m1, [v1q + orderq + mmsize]
+ pmullw m2, m7
+ pmullw m3, m7
+ paddw m2, [v1q + orderq]
+ paddw m3, [v1q + orderq + mmsize]
+ paddd m6, m0
+ paddd m6, m1
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ jg .loop%1
+%if %1
+ jmp .end
+%endif
+%endmacro
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+ pxor m6, m6
+ mov r4d, v2d
+ and r4d, 15
+ and v2q, ~15
+ and v3q, ~15
+ mova m4, [v2q + orderq]
+ mova m5, [v3q + orderq]
+ ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+ cmp r4d, 0
+ je .loop0
+ cmp r4d, 2
+ je .loop2
+ cmp r4d, 4
+ je .loop4
+ cmp r4d, 6
+ je .loop6
+ cmp r4d, 8
+ je .loop8
+ cmp r4d, 10
+ je .loop10
+ cmp r4d, 12
+ je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+ paddd m6, m0
+ movd eax, m6
+ RET
+
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
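A note on the SSSE3 path above: the commit message's "cachesplit avoidance" refers to replacing unaligned loads of v2/v3 with aligned loads that SCALARPRODUCT_LOOP splices together using palignr at the recorded misalignment offset, dispatching once into one of eight specialized loops (one per even byte offset). Below is a rough scalar illustration of that splice, with a hypothetical helper name, assuming (as the asm does) that the aligned 16-byte blocks overlapping the buffers are safe to read.

#include <stdint.h>
#include <string.h>

/* Illustrative only: rebuild an unaligned 16-byte chunk from two aligned
 * loads, the scalar analogue of palignr in SCALARPRODUCT_LOOP.  The real
 * loop keeps the previously loaded aligned block in a register instead of
 * reloading it, and the offset is fixed for the whole call. */
static void load16_via_aligned(uint8_t dst[16], const uint8_t *p)
{
    const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
    unsigned off = (unsigned)((uintptr_t)p & 15);
    uint8_t blk[32];

    memcpy(blk,      base,      16);   /* aligned load, lower block */
    memcpy(blk + 16, base + 16, 16);   /* aligned load, upper block */
    memcpy(dst, blk + off, 16);        /* splice at the misalignment */
}

Because the pointers advance in lockstep, the offset is constant across iterations of a call, which is what the inline comment means by the branches being cyclic and therefore predictable, making the linear chain of compares cheaper than a branch tree or jump table.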