author | Timothy Gu <timothygu99@gmail.com> | 2015-10-17 16:39:50 -0700
---|---|---
committer | Timothy Gu <timothygu99@gmail.com> | 2015-10-21 20:01:52 -0700
commit | ab5f43e6342c4c07faf5c9ae87628d7d7c83abb6 | (patch)
tree | e1d8d77e0c7441b5289aae695062ce56d1028817 |
parent | 12628e3369de1dcd6a2de21e089ee007e2cf951a | (diff)
download | ffmpeg-ab5f43e6342c4c07faf5c9ae87628d7d7c83abb6.tar.gz |
vc1dsp: Port ff_vc1_put_ver_16b_shift2_mmx to yasm
This function is only used within other inline asm functions, hence the
HAVE_MMX_INLINE guard. Per recent discussions, we should not worry about
the performance of inline asm-only builds.
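
The C side of the port boils down to a guard change plus a prototype for the new yasm symbol. A condensed sketch of the resulting linkage (surrounding code elided; the includes are added here only to make the sketch self-contained):

```c
#include <stdint.h>
#include "config.h"               /* HAVE_* feature macros */
#include "libavutil/x86/asm.h"    /* x86_reg */

/* The body now lives in vc1dsp.asm; the C file keeps only this
 * declaration.  HAVE_MMX_EXTERNAL joins the guard because the symbol
 * is assembled separately, while its callers remain inline asm. */
#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
                                   x86_reg stride, int rnd, int64_t shift);
#endif
```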
-rw-r--r-- | libavcodec/x86/vc1dsp.asm | 89
-rw-r--r-- | libavcodec/x86/vc1dsp_mmx.c | 59

2 files changed, 95 insertions(+), 53 deletions(-)
```diff
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 546688cf9d..cbf1336f3a 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -1,5 +1,6 @@
 ;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 DSP optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 ;* Copyright (c) 2009 David Conrad
 ;*
 ;* This file is part of FFmpeg.
@@ -23,6 +24,7 @@
 
 cextern pw_4
 cextern pw_5
+cextern pw_9
 
 section .text
 
@@ -315,3 +317,88 @@ cglobal vc1_h_loop_filter8, 3,5,8
     START_H_FILTER 8
     VC1_H_LOOP_FILTER 8
     RET
+
+%if HAVE_MMX_INLINE
+%macro NORMALIZE_MMX 1 ; shift
+    paddw      m3, m7 ; +bias-r
+    paddw      m4, m7 ; +bias-r
+    psraw      m3, %1
+    psraw      m4, %1
+%endmacro
+
+; Compute the rounder 32-r or 8-r and unpacks it to m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+    movd       m7, %1
+    punpcklwd  m7, m7
+    punpckldq  m7, m7
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
+    paddw      m%3, m%4
+    movh       m%2, [srcq + stride_neg2]
+    pmullw     m%3, m6
+    punpcklbw  m%2, m0
+    movh       m%5, [srcq + strideq]
+    psubw      m%3, m%2
+    punpcklbw  m%5, m0
+    paddw      m%3, m7
+    psubw      m%3, m%5
+    psraw      m%3, shift
+    movu       [dstq + %1], m%3
+    add        srcq, strideq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+;                                    x86_reg stride, int rnd, int64_t shift)
+; Sacrificing m6 makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+    DECLARE_REG_TMP 3, 4, 5
+    %define rnd r3mp
+    %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+    DECLARE_REG_TMP 4, 5, 6
+    %define rnd r3d
+    ; We need shift either in memory or in a mm reg as it's used in psraw
+    ; On WIN64, the arg is already on the stack
+    ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+    %define shift r4mp
+%else ; UNIX64
+    %define shift m5
+    mova       shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+    mov        stride_neg2, strideq
+    neg        stride_neg2
+    add        stride_neg2, stride_neg2
+    lea        stride_9minus4, [strideq * 9 - 4]
+    mov        i, 3
+    LOAD_ROUNDER_MMX rnd
+    mova       m6, [pw_9]
+    pxor       m0, m0
+.loop:
+    movh       m2, [srcq]
+    add        srcq, strideq
+    movh       m3, [srcq]
+    punpcklbw  m2, m0
+    punpcklbw  m3, m0
+    SHIFT2_LINE   0, 1, 2, 3, 4
+    SHIFT2_LINE  24, 2, 3, 4, 1
+    SHIFT2_LINE  48, 3, 4, 1, 2
+    SHIFT2_LINE  72, 4, 1, 2, 3
+    SHIFT2_LINE  96, 1, 2, 3, 4
+    SHIFT2_LINE 120, 2, 3, 4, 1
+    SHIFT2_LINE 144, 3, 4, 1, 2
+    SHIFT2_LINE 168, 4, 1, 2, 3
+    sub        srcq, stride_9minus4
+    add        dstq, 8
+    dec        i
+    jnz        .loop
+    REP_RET
+%endif ; HAVE_MMX_INLINE
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index e42099b46c..c268cc6a8e 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -33,7 +33,11 @@
 #include "fpel.h"
 #include "vc1dsp.h"
 
-#if HAVE_6REGS && HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+                                   const uint8_t *src, x86_reg stride,
+                                   int rnd, int64_t shift);
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D)  "pavgb " #S ", " #D " \n\t"
@@ -66,55 +70,6 @@
     "punpcklwd %%mm7, %%mm7    \n\t"   \
     "punpckldq %%mm7, %%mm7    \n\t"
 
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3)               \
-    "paddw     %%mm"#R2", %%mm"#R1"         \n\t"   \
-    "movd      (%0,%3), %%mm"#R0"           \n\t"   \
-    "pmullw    %%mm6, %%mm"#R1"             \n\t"   \
-    "punpcklbw %%mm0, %%mm"#R0"             \n\t"   \
-    "movd      (%0,%2), %%mm"#R3"           \n\t"   \
-    "psubw     %%mm"#R0", %%mm"#R1"         \n\t"   \
-    "punpcklbw %%mm0, %%mm"#R3"             \n\t"   \
-    "paddw     %%mm7, %%mm"#R1"             \n\t"   \
-    "psubw     %%mm"#R3", %%mm"#R1"         \n\t"   \
-    "psraw     %4, %%mm"#R1"                \n\t"   \
-    "movq      %%mm"#R1", "#OFF"(%1)        \n\t"   \
-    "add       %2, %0                       \n\t"
-
-/** Sacrificing mm6 makes it possible to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
-                                       const uint8_t *src, x86_reg stride,
-                                       int rnd, int64_t shift)
-{
-    __asm__ volatile(
-        "mov       $3, %%"REG_c"            \n\t"
-        LOAD_ROUNDER_MMX("%5")
-        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
-        "1:                                 \n\t"
-        "movd      (%0), %%mm2              \n\t"
-        "add       %2, %0                   \n\t"
-        "movd      (%0), %%mm3              \n\t"
-        "punpcklbw %%mm0, %%mm2             \n\t"
-        "punpcklbw %%mm0, %%mm3             \n\t"
-        SHIFT2_LINE(  0, 1, 2, 3, 4)
-        SHIFT2_LINE( 24, 2, 3, 4, 1)
-        SHIFT2_LINE( 48, 3, 4, 1, 2)
-        SHIFT2_LINE( 72, 4, 1, 2, 3)
-        SHIFT2_LINE( 96, 1, 2, 3, 4)
-        SHIFT2_LINE(120, 2, 3, 4, 1)
-        SHIFT2_LINE(144, 3, 4, 1, 2)
-        SHIFT2_LINE(168, 4, 1, 2, 3)
-        "sub       %6, %0                   \n\t"
-        "add       $8, %1                   \n\t"
-        "dec       %%"REG_c"                \n\t"
-        "jnz       1b                       \n\t"
-        : "+r"(src), "+r"(dst)
-        : "r"(stride), "r"(-2*stride),
-          "m"(shift), "m"(rnd), "r"(9*stride-4)
-          NAMED_CONSTRAINTS_ADD(ff_pw_9)
-        : "%"REG_c, "memory"
-    );
-}
-
 /**
  * Data is already unpacked, so some operations can directly be made from
  * memory.
@@ -430,7 +385,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                                int hmode, int vmode, int rnd)\
 {\
     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
-        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+        { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
@@ -780,4 +735,4 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
     dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
 }
-#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
```
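
For readers who don't want to decode the MMX, the routine implements the VC-1 quarter-pel vertical pass with (-1, 9, 9, -1) bicubic taps. Below is a plain-C model of what it computes; the `_ref` name is hypothetical, and the 12-sample row pitch is inferred from the 24-byte dst offsets (0, 24, ..., 168) and the three 4-column passes of the loop, so treat it as a sketch rather than FFmpeg code:

```c
#include <stdint.h>
#include <stddef.h>

/* Hypothetical plain-C reference (not part of FFmpeg): vertical
 * (-1, 9, 9, -1) filtering of a 12x8 block into 16-bit intermediates
 * for the later horizontal pass.  'rnd' is the caller-prepared
 * rounding bias and 'shift' the caller-prepared normalization; the
 * asm likewise receives both ready-made. */
static void vc1_put_ver_16b_shift2_ref(int16_t *dst, const uint8_t *src,
                                       ptrdiff_t stride, int rnd, int shift)
{
    for (int y = 0; y < 8; y++) {        /* 8 output rows */
        for (int x = 0; x < 12; x++) {   /* 8 columns + 3 extra, padded */
            const uint8_t *s = src + x + (ptrdiff_t)y * stride;
            dst[y * 12 + x] = (-s[-stride] + 9 * s[0] +
                               9 * s[stride] - s[2 * stride] + rnd) >> shift;
        }
    }
}
```

The asm walks the same block column-group by column-group instead (three passes of four columns), keeping the constant 9 parked in m6 so each SHIFT2_LINE can start the next row's load while the current row's pmullw is still in flight; that is the pipelining the "Sacrificing m6" comment refers to.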