aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTimothy Gu <timothygu99@gmail.com>2015-10-17 16:39:50 -0700
committerTimothy Gu <timothygu99@gmail.com>2015-10-21 20:01:52 -0700
commitab5f43e6342c4c07faf5c9ae87628d7d7c83abb6 (patch)
treee1d8d77e0c7441b5289aae695062ce56d1028817
parent12628e3369de1dcd6a2de21e089ee007e2cf951a (diff)
downloadffmpeg-ab5f43e6342c4c07faf5c9ae87628d7d7c83abb6.tar.gz
vc1dsp: Port ff_vc1_put_ver_16b_shift2_mmx to yasm
This function is only used within other inline asm functions, hence the HAVE_MMX_INLINE guard. Per recent discussions, we should not worry about the performance of inline asm-only builds.
-rw-r--r--libavcodec/x86/vc1dsp.asm89
-rw-r--r--libavcodec/x86/vc1dsp_mmx.c59
2 files changed, 95 insertions, 53 deletions
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 546688cf9d..cbf1336f3a 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -1,5 +1,6 @@
;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 DSP optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
@@ -23,6 +24,7 @@
cextern pw_4
cextern pw_5
+cextern pw_9
section .text
@@ -315,3 +317,88 @@ cglobal vc1_h_loop_filter8, 3,5,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8
RET
+
+%if HAVE_MMX_INLINE
+%macro NORMALIZE_MMX 1 ; shift
+ paddw m3, m7 ; +bias-r
+ paddw m4, m7 ; +bias-r
+ psraw m3, %1
+ psraw m4, %1
+%endmacro
+
+; Compute the rounder 32-r or 8-r and unpacks it to m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+ movd m7, %1
+ punpcklwd m7, m7
+ punpckldq m7, m7
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
+ paddw m%3, m%4
+ movh m%2, [srcq + stride_neg2]
+ pmullw m%3, m6
+ punpcklbw m%2, m0
+ movh m%5, [srcq + strideq]
+ psubw m%3, m%2
+ punpcklbw m%5, m0
+ paddw m%3, m7
+ psubw m%3, m%5
+ psraw m%3, shift
+ movu [dstq + %1], m%3
+ add srcq, strideq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+; x86_reg stride, int rnd, int64_t shift)
+; Sacrificing m6 makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+ DECLARE_REG_TMP 3, 4, 5
+ %define rnd r3mp
+ %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+ DECLARE_REG_TMP 4, 5, 6
+ %define rnd r3d
+ ; We need shift either in memory or in a mm reg as it's used in psraw
+ ; On WIN64, the arg is already on the stack
+ ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+ %define shift r4mp
+%else ; UNIX64
+ %define shift m5
+ mova shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+ mov stride_neg2, strideq
+ neg stride_neg2
+ add stride_neg2, stride_neg2
+ lea stride_9minus4, [strideq * 9 - 4]
+ mov i, 3
+ LOAD_ROUNDER_MMX rnd
+ mova m6, [pw_9]
+ pxor m0, m0
+.loop:
+ movh m2, [srcq]
+ add srcq, strideq
+ movh m3, [srcq]
+ punpcklbw m2, m0
+ punpcklbw m3, m0
+ SHIFT2_LINE 0, 1, 2, 3, 4
+ SHIFT2_LINE 24, 2, 3, 4, 1
+ SHIFT2_LINE 48, 3, 4, 1, 2
+ SHIFT2_LINE 72, 4, 1, 2, 3
+ SHIFT2_LINE 96, 1, 2, 3, 4
+ SHIFT2_LINE 120, 2, 3, 4, 1
+ SHIFT2_LINE 144, 3, 4, 1, 2
+ SHIFT2_LINE 168, 4, 1, 2, 3
+ sub srcq, stride_9minus4
+ add dstq, 8
+ dec i
+ jnz .loop
+ REP_RET
+%endif ; HAVE_MMX_INLINE
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index e42099b46c..c268cc6a8e 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -33,7 +33,11 @@
#include "fpel.h"
#include "vc1dsp.h"
-#if HAVE_6REGS && HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+ const uint8_t *src, x86_reg stride,
+ int rnd, int64_t shift);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -66,55 +70,6 @@
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
- "paddw %%mm"#R2", %%mm"#R1" \n\t" \
- "movd (%0,%3), %%mm"#R0" \n\t" \
- "pmullw %%mm6, %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R0" \n\t" \
- "movd (%0,%2), %%mm"#R3" \n\t" \
- "psubw %%mm"#R0", %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R3" \n\t" \
- "paddw %%mm7, %%mm"#R1" \n\t" \
- "psubw %%mm"#R3", %%mm"#R1" \n\t" \
- "psraw %4, %%mm"#R1" \n\t" \
- "movq %%mm"#R1", "#OFF"(%1) \n\t" \
- "add %2, %0 \n\t"
-
-/** Sacrificing mm6 makes it possible to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
- const uint8_t *src, x86_reg stride,
- int rnd, int64_t shift)
-{
- __asm__ volatile(
- "mov $3, %%"REG_c" \n\t"
- LOAD_ROUNDER_MMX("%5")
- "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
- "1: \n\t"
- "movd (%0), %%mm2 \n\t"
- "add %2, %0 \n\t"
- "movd (%0), %%mm3 \n\t"
- "punpcklbw %%mm0, %%mm2 \n\t"
- "punpcklbw %%mm0, %%mm3 \n\t"
- SHIFT2_LINE( 0, 1, 2, 3, 4)
- SHIFT2_LINE( 24, 2, 3, 4, 1)
- SHIFT2_LINE( 48, 3, 4, 1, 2)
- SHIFT2_LINE( 72, 4, 1, 2, 3)
- SHIFT2_LINE( 96, 1, 2, 3, 4)
- SHIFT2_LINE(120, 2, 3, 4, 1)
- SHIFT2_LINE(144, 3, 4, 1, 2)
- SHIFT2_LINE(168, 4, 1, 2, 3)
- "sub %6, %0 \n\t"
- "add $8, %1 \n\t"
- "dec %%"REG_c" \n\t"
- "jnz 1b \n\t"
- : "+r"(src), "+r"(dst)
- : "r"(stride), "r"(-2*stride),
- "m"(shift), "m"(rnd), "r"(9*stride-4)
- NAMED_CONSTRAINTS_ADD(ff_pw_9)
- : "%"REG_c, "memory"
- );
-}
-
/**
* Data is already unpacked, so some operations can directly be made from
* memory.
@@ -430,7 +385,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
- { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+ { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
{ NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
@@ -780,4 +735,4 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
-#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */