aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/vp8dsp.asm
diff options
context:
space:
mode:
author Ronald S. Bultje <rsbultje@gmail.com> 2010-06-29 14:43:11 +0000
committer Ronald S. Bultje <rsbultje@gmail.com> 2010-06-29 14:43:11 +0000
commit 2dd2f71692996e66efd1f8055ac47d19799e589a (patch)
tree 863455573fb776a860d8feeda7ec400fa3c4e518 /libavcodec/x86/vp8dsp.asm
parent 29e719377f0643a38d6f66b029ddfbc2441dffda (diff)
download ffmpeg-2dd2f71692996e66efd1f8055ac47d19799e589a.tar.gz
MMX idct_add for VP8.
Originally committed as revision 23886 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/vp8dsp.asm')
-rw-r--r-- libavcodec/x86/vp8dsp.asm 89
1 files changed, 89 insertions, 0 deletions
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 118d07196e..22947763f9 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -142,6 +142,9 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+pw_20091: times 4 dw 20091 ; 4 words of 20091: pmulhw factor, VP8_MULTIPLY_SUMSUB adds the operand back after the multiply
+pw_17734: times 4 dw 17734 ; 4 words of 17734 = 35468/2: VP8_MULTIPLY_SUMSUB doubles the operand (psllw 1) before pmulhw
+
cextern pw_3
cextern pw_4
cextern pw_64
@@ -924,6 +927,92 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
RET
;-----------------------------------------------------------------------------
+; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
+;-----------------------------------------------------------------------------
+
+; butterfly on words: %1 = %1 + %2; %2 = %2 - (old %1); %3 = temp register, left holding the old %1
+%macro SUMSUB 3
+ mova %3, %1 ; keep the original %1, the add below overwrites it
+ paddw %1, %2 ; %1 = sum
+ psubw %2, %3 ; %2 = old %2 - old %1
+%endmacro
+
+; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) (right-hand sides use the input values)
+; this macro assumes that m6/m7 have words for 20091/17734 loaded; %3/%4 are overwritten as temporaries
+%macro VP8_MULTIPLY_SUMSUB 4
+ mova %3, %1 ; work on copies so the inputs stay available for the second product
+ mova %4, %2
+ pmulhw %3, m6 ;20091(1): high word of in1*20091
+ pmulhw %4, m6 ;20091(2): high word of in2*20091
+ paddw %3, %1 ; add the input back: %3 = mul_20091(in1)
+ paddw %4, %2 ; %4 = mul_20091(in2)
+ psllw %1, 1 ; double the input because m7 holds 17734 = 35468/2
+ psllw %2, 1
+ pmulhw %1, m7 ;35468(1): %1 = mul_35468(in1)
+ pmulhw %2, m7 ;35468(2): %2 = mul_35468(in2)
+ psubw %1, %4 ; %1 = mul_35468(in1) - mul_20091(in2)
+ paddw %2, %3 ; %2 = mul_35468(in2) + mul_20091(in1)
+%endmacro
+
+; calculate x0=%1+%3; x1=%1-%3
+; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
+; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
+; %1-%6 are register numbers (used as m%N); %5/%6 are temporary registers
+; we assume m6/m7 have constant words 20091/17734 loaded in them
+%macro VP8_IDCT_TRANSFORM4x4_1D 6
+ SUMSUB m%3, m%1, m%5 ;t0, t1: third reg=x0, first reg=x1 (3-operand SUMSUB defined in this file)
+ VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3: second reg=x2, fourth reg=x3
+ SUMSUB m%4, m%3, m%5 ;tmp0, tmp3: fourth reg=x3+x0, third reg=x0-x3
+ SUMSUB m%2, m%1, m%5 ;tmp1, tmp2: second reg=x2+x1, first reg=x1-x2
+ SWAP %4, %1 ;renaming only: first=tmp0, fourth=tmp2
+ SWAP %4, %3 ;renaming only: third=tmp2, fourth=tmp3, giving tmp0..tmp3 in argument order
+%endmacro
+
+; transpose a 4x4 table of words, one row in each of the first four registers; the fifth is a temporary
+%macro TRANSPOSE4x4 5 ; output in %1/%4/%5/%3 before the SWAPs below, in %1/%2/%3/%4 after them
+ mova m%5, m%1
+ punpcklwd m%1, m%2 ; interleave low words of rows 0 and 1
+ punpckhwd m%5, m%2 ; interleave high words of rows 0 and 1
+ mova m%2, m%3
+ punpcklwd m%3, m%4 ; interleave low words of rows 2 and 3
+ punpckhwd m%2, m%4 ; interleave high words of rows 2 and 3
+ mova m%4, m%1
+ punpckldq m%1, m%3 ;col0
+ punpckhdq m%4, m%3 ;col1
+ mova m%3, m%5
+ punpckldq m%5, m%2 ;col2
+ punpckhdq m%3, m%2 ;col3
+ SWAP %4, %2 ; col1 becomes register 2
+ SWAP %4, %5 ; col2 moves to register 4 for now
+ SWAP %4, %3 ; col3 becomes register 4, col2 becomes register 3
+%endmacro
+
+INIT_MMX
+cglobal vp8_idct_add_mmx, 3, 3 ; args per the prototype comment above: r0=dst, r1=block, r2=stride
+ ; load block data: 32 bytes of coefficients as four 8-byte groups, then the multiplier words
+ movq m0, [r1]
+ movq m1, [r1+8]
+ movq m2, [r1+16]
+ movq m3, [r1+24]
+ movq m6, [pw_20091] ; m6/m7 must hold 20091/17734 for VP8_IDCT_TRANSFORM4x4_1D
+ movq m7, [pw_17734]
+
+ ; actual IDCT: two 1-D passes, each followed by a transpose (TRANSPOSE4x4 is the macro defined in this file)
+ VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+ TRANSPOSE4x4 0, 1, 2, 3, 4
+ paddw m0, [pw_4] ; presumably the rounding term for the shift by 3 passed to STORE_DIFFx2 - confirm
+ VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+ TRANSPOSE4x4 0, 1, 2, 3, 4
+
+ ; store: m0/m1 go to r0 and m2/m3 to r1 = r0 + 2*stride; m6/m7/m4 are passed as extra registers
+ pxor m4, m4 ; m4 = 0
+ lea r1, [r0+2*r2] ; block pointer is no longer needed, reuse r1 for the second pair
+ STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
+ STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
+
+ RET
+
+;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------