Revert "x86/videodsp: add emulated_edge_mc_mmxext"

The commit causes minor out of array reads and was mainly intended for future optimizations which turned out not to be meassurably faster. Itself it was just 1 cpu cycle faster Approved-by: jamrial This reverts commit 057d2704e78b52fec357b4fc646c9de37a085413.
author: Michael Niedermayer <michaelni@gmx.at> 2014-06-28 05:33:52 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2014-06-28 05:39:07 +0200
commit: 5bca5f87d1a32669e0357790e0d0ad8a5c9c998b (patch)
tree: 9ac36e13704260d32a0646ec86dd06ad28d96dd6 /libavcodec/x86/videodsp.asm
parent: 4d1fa38d984cba322284cfce416c4c969da58f51 (diff)
download: ffmpeg-5bca5f87d1a32669e0357790e0d0ad8a5c9c998b.tar.gz
1 files changed, 24 insertions, 35 deletions
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index ad15af9a27..1ac02574d6 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -92,13 +92,21 @@ INIT_XMM sse
 vvar_fn
 
 %macro hvar_fn 0
-cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w
+cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
     lea            dstq, [dstq+n_wordsq*2]
     neg        n_wordsq
     lea        start_xq, [start_xq+n_wordsq*2]
 .y_loop:                                        ; do {
+    ; FIXME also write a ssse3 version using pshufb
+    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
+    imul             wd, 0x01010101             ;   w *= 0x01010101
+    movd             m0, wd
     mov              wq, n_wordsq               ;   initialize w
-    SPLATB_LOAD      m0, dstq+start_xq, m1      ;   read(1); splat
+%if cpuflag(sse2)
+    pshufd           m0, m0, q0000              ;   splat
+%else ; mmx
+    punpckldq        m0, m0                     ;   splat
+%endif ; mmx/sse
 .x_loop:                                        ;   do {
     movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
     add              wq, mmsize/2               ;     w -= $mmsize/2
@@ -114,8 +122,6 @@ cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w
 %if ARCH_X86_32
 INIT_MMX mmx
 hvar_fn
-INIT_MMX mmxext
-hvar_fn
 %endif
 
 INIT_XMM sse2
@@ -338,12 +344,16 @@ VERTICAL_EXTEND 16, 22
 ; obviously not the same on both sides.
 
 %macro READ_V_PIXEL 2
-%if notcpuflag(mmxext) && %1 < 8
-    movzx          vald, byte [%2]
+    movzx          vald, byte %2
     imul           vald, 0x01010101
+%if %1 >= 8
+    movd             m0, vald
+%if mmsize == 16
+    pshufd           m0, m0, q0000
 %else
-    SPLATB_LOAD      m0, %2, m1
-%endif ; %1 < 8
+    punpckldq        m0, m0
+%endif ; mmsize == 16
+%endif ; %1 > 16
 %endmacro ; READ_V_PIXEL
 
 %macro WRITE_V_PIXEL 2
@@ -378,42 +388,26 @@ VERTICAL_EXTEND 16, 22
 %endif
 %endif ; %1-%%off >= 4
 
-%if %1-%%off == 2
-    movd     [%2+%%off-2], m0
-%endif ; (%1-%%off)/2
-
 %else ; %1 < 8
 
-%if cpuflag(mmxext)
-    movd     [%2+%%off], m0
-%if %1 == 6
-    movd     [%2+%%off+2], m0
-%endif ; (%1-%%off)/2
-
-%else ; notcpuflag(mmxext)
-
 %rep %1/4
     mov      [%2+%%off], vald
 %assign %%off %%off+4
 %endrep ; %1/4
 
+%endif ; %1 >=/< 8
+
 %if %1-%%off == 2
     mov      [%2+%%off], valw
 %endif ; (%1-%%off)/2
-%endif ; cpuflag
-%endif ; %1 >=/< 8
 %endmacro ; WRITE_V_PIXEL
 
 %macro H_EXTEND 2
 %assign %%n %1
 %rep 1+(%2-%1)/2
-%if %%n < 8 && notcpuflag(mmxext)
-cglobal emu_edge_hfix %+ %%n, 4, 5, 2, dst, dst_stride, start_x, bh, val
-%else
-cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh
-%endif
+cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
 .loop_y:                                        ; do {
-    READ_V_PIXEL    %%n, dstq+start_xq          ;   $variable_regs = read($n)
+    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
     WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
     add            dstq, dst_strideq            ;   dst += dst_stride
     dec             bhq                         ; } while (--bh)
@@ -424,16 +418,11 @@ cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh
 %endmacro ; H_EXTEND
 
 INIT_MMX mmx
-H_EXTEND 2, 2
-%if ARCH_X86_32
-H_EXTEND 4, 22
-%endif
-
-INIT_MMX mmxext
-H_EXTEND 4, 14
+H_EXTEND 2, 14
 %if ARCH_X86_32
 H_EXTEND 16, 22
 %endif
+
 INIT_XMM sse2
 H_EXTEND 16, 22
author	Michael Niedermayer <michaelni@gmx.at>	2014-06-28 05:33:52 +0200
committer	Michael Niedermayer <michaelni@gmx.at>	2014-06-28 05:39:07 +0200
commit	5bca5f87d1a32669e0357790e0d0ad8a5c9c998b (patch)
tree	9ac36e13704260d32a0646ec86dd06ad28d96dd6 /libavcodec/x86/videodsp.asm
parent	4d1fa38d984cba322284cfce416c4c969da58f51 (diff)
download	ffmpeg-5bca5f87d1a32669e0357790e0d0ad8a5c9c998b.tar.gz