| author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-03-12 07:28:12 -0700 |
|---|---|---|
| committer | Martin Storsjö <martin@martin.st> | 2013-04-15 12:32:05 +0300 |
| commit | 015821229f96bf7e677f2a711a58dbea3009f574 (patch) | |
| tree | 2247f2d16c077a1f887656b8859b164eca6b84df /libavcodec/x86/vp3dsp.asm | |
| parent | 5941978e71d2c3a8e2a7e87951e081e0b2e77da9 (diff) | |
| download | ffmpeg-015821229f96bf7e677f2a711a58dbea3009f574.tar.gz | |
vp3: Use full transpose for all IDCTs
This way, the special IDCT permutations are no longer needed. This
is similar to how H264 does it, and removes the dsputil dependency
imposed by the scantable code.
Also remove the unused type == 0 cases from the plain C version
of the idct.
Signed-off-by: Martin Storsjö <martin@martin.st>
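For context, here is a minimal C sketch of the idea behind the commit message: once each 1-D pass is followed by a full 8x8 transpose, the same row code can serve as the column pass, and the coefficients can be consumed in natural raster order, so no per-implementation scan permutation (and hence no dsputil scantable) is needed. All names below (`transpose8x8`, `idct8x8`, `idct_1d_rows`) are invented for illustration; this is not the actual FFmpeg code.

```c
#include <stdint.h>

/* Illustration only: a two-pass 8x8 IDCT built around a full transpose. */
static void transpose8x8(int16_t blk[64])
{
    for (int i = 0; i < 8; i++)
        for (int j = i + 1; j < 8; j++) {
            int16_t t      = blk[i * 8 + j];
            blk[i * 8 + j] = blk[j * 8 + i];
            blk[j * 8 + i] = t;
        }
}

static void idct8x8(int16_t blk[64], void (*idct_1d_rows)(int16_t *))
{
    idct_1d_rows(blk);  /* 1-D IDCT over the eight rows               */
    transpose8x8(blk);  /* full transpose: columns become rows        */
    idct_1d_rows(blk);  /* the same row code now handles the columns  */
    transpose8x8(blk);  /* back to the original orientation           */
}
```

SIMD implementations typically fold one of the transposes into the addressing of the column pass rather than performing it explicitly, which is roughly what the `I(x)`/`J(x)` define shuffle between `RowIDCT` and `ColumnIDCT` in the diff below is doing.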
Diffstat (limited to 'libavcodec/x86/vp3dsp.asm')
| -rw-r--r-- | libavcodec/x86/vp3dsp.asm | 123 |
1 file changed, 82 insertions, 41 deletions
```diff
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 078e9db99a..fc8a047224 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -501,22 +501,22 @@ cglobal vp3_h_loop_filter, 3, 4
 
     ; at this point, function has completed dequantization + dezigzag +
     ; partial transposition; now do the idct itself
-%define I(x) [%1+16* x ]
-%define J(x) [%1+16*(x-4)+8]
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
     RowIDCT
     Transpose
 
-%define I(x) [%1+16* x +64]
-%define J(x) [%1+16*(x-4)+72]
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
     RowIDCT
     Transpose
 
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
+%define I(x) [%1+16* x]
+%define J(x) [%1+16*(x-4)+8]
     ColumnIDCT
 
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
+%define I(x) [%1+16* x +64]
+%define J(x) [%1+16*(x-4)+72]
     ColumnIDCT
 %endif ; mmsize == 16/8
 %endmacro
@@ -534,10 +534,17 @@ cglobal vp3_idct_put, 3, 4, 9
     mova m1, [r2+mmsize*2+%%i]
     mova m2, [r2+mmsize*4+%%i]
     mova m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+    packsswb m0, [r2+mmsize*8+%%i]
+    packsswb m1, [r2+mmsize*10+%%i]
+    packsswb m2, [r2+mmsize*12+%%i]
+    packsswb m3, [r2+mmsize*14+%%i]
+%else
     packsswb m0, [r2+mmsize*1+%%i]
     packsswb m1, [r2+mmsize*3+%%i]
     packsswb m2, [r2+mmsize*5+%%i]
     packsswb m3, [r2+mmsize*7+%%i]
+%endif
     paddb m0, m4
     paddb m1, m4
     paddb m2, m4
@@ -561,7 +568,7 @@ cglobal vp3_idct_put, 3, 4, 9
     movq   [r0+r1*2], m3
     movhps [r0+r3  ], m3
 %endif
-%assign %%i %%i+64
+%assign %%i %%i+8
 %endrep
 
     pxor m0, m0
@@ -575,47 +582,81 @@ cglobal vp3_idct_put, 3, 4, 9
 cglobal vp3_idct_add, 3, 4, 9
     VP3_IDCT r2
 
-    mov    r3, 4
-    pxor   m4, m4
     movsxdifnidn r1, r1d
-.loop:
+    lea    r3, [r1*3]
+    pxor   m4, m4
+%if mmsize == 16
+%assign %%i 0
+%rep 2
     movq   m0, [r0]
     movq   m1, [r0+r1]
-%if mmsize == 8
-    mova   m2, m0
-    mova   m3, m1
-%endif
+    movq   m2, [r0+r1*2]
+    movq   m3, [r0+r3]
     punpcklbw m0, m4
     punpcklbw m1, m4
-%if mmsize == 8
-    punpckhbw m2, m4
-    punpckhbw m3, m4
-%endif
-    paddsw m0, [r2+ 0]
-    paddsw m1, [r2+16]
-%if mmsize == 8
-    paddsw m2, [r2+ 8]
-    paddsw m3, [r2+24]
-    packuswb m0, m2
-    packuswb m1, m3
-%else ; mmsize == 16
+    punpcklbw m2, m4
+    punpcklbw m3, m4
+    paddsw m0, [r2+ 0+%%i]
+    paddsw m1, [r2+16+%%i]
+    paddsw m2, [r2+32+%%i]
+    paddsw m3, [r2+48+%%i]
     packuswb m0, m1
+    packuswb m2, m3
+    movq   [r0     ], m0
+    movhps [r0+r1  ], m0
+    movq   [r0+r1*2], m2
+    movhps [r0+r3  ], m2
+%if %%i == 0
+    lea    r0, [r0+r1*4]
 %endif
-    movq   [r0    ], m0
-%if mmsize == 8
-    movq   [r0+r1], m1
-%else ; mmsize == 16
-    movhps [r0+r1], m0
+%assign %%i %%i+64
+%endrep
+%else
+%assign %%i 0
+%rep 2
+    movq   m0, [r0]
+    movq   m1, [r0+r1]
+    movq   m2, [r0+r1*2]
+    movq   m3, [r0+r3]
+    movq   m5, m0
+    movq   m6, m1
+    movq   m7, m2
+    punpcklbw m0, m4
+    punpcklbw m1, m4
+    punpcklbw m2, m4
+    punpckhbw m5, m4
+    punpckhbw m6, m4
+    punpckhbw m7, m4
+    paddsw m0, [r2+ 0+%%i]
+    paddsw m1, [r2+16+%%i]
+    paddsw m2, [r2+32+%%i]
+    paddsw m5, [r2+64+%%i]
+    paddsw m6, [r2+80+%%i]
+    paddsw m7, [r2+96+%%i]
+    packuswb m0, m5
+    movq   m5, m3
+    punpcklbw m3, m4
+    punpckhbw m5, m4
+    packuswb m1, m6
+    paddsw m3, [r2+48+%%i]
+    paddsw m5, [r2+112+%%i]
+    packuswb m2, m7
+    packuswb m3, m5
+    movq   [r0     ], m0
+    movq   [r0+r1  ], m1
+    movq   [r0+r1*2], m2
+    movq   [r0+r3  ], m3
+%if %%i == 0
+    lea    r0, [r0+r1*4]
 %endif
-    lea    r0, [r0+r1*2]
-%assign %%offset 0
-%rep 32/mmsize
-    mova  [r2+%%offset], m4
-%assign %%offset %%offset+mmsize
+%assign %%i %%i+8
+%endrep
+%endif
+%assign %%i 0
+%rep 128/mmsize
+    mova  [r2+%%i], m4
+%assign %%i %%i+mmsize
 %endrep
-    add    r2, 32
-    dec    r3
-    jg .loop
 
     RET
 %endmacro
```
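Functionally, the rewritten `vp3_idct_add` tail behaves like the hedged C model below (helper names `clip_uint8` and `idct_add_model` are invented for the sketch; the real code is the SIMD above): it adds the IDCT residual to the 8x8 prediction block with saturation to the unsigned 8-bit range, and then zeroes the 128-byte coefficient block so the next call starts from a clean slate, which is what the final `%rep 128/mmsize` store loop of the zeroed `m4` register does.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? (uint8_t)255 : (uint8_t)v;
}

static void idct_add_model(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
{
    /* The asm handles four rows per unrolled step, addressing
     * dst, dst+stride, dst+2*stride and dst+3*stride (r3 = stride*3). */
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            dst[y * stride + x] =
                clip_uint8(dst[y * stride + x] + block[y * 8 + x]);

    /* Clear all 64 int16_t coefficients (128 bytes) for the next block. */
    memset(block, 0, 64 * sizeof(*block));
}
```

Precomputing `stride*3` with `lea` is what lets the unrolled version drop the old `add r2, 32` / `dec r3` / `jg .loop` bookkeeping: each `%rep` iteration touches four rows directly, and a single `lea r0, [r0+r1*4]` between the two iterations advances to the lower half of the block.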