author | Timothy Gu <timothygu99@gmail.com> | 2015-10-17 16:39:50 -0700
---|---|---
committer | Timothy Gu <timothygu99@gmail.com> | 2015-10-21 20:01:52 -0700
commit | ab5f43e6342c4c07faf5c9ae87628d7d7c83abb6 | (patch)
tree | e1d8d77e0c7441b5289aae695062ce56d1028817 |
parent | 12628e3369de1dcd6a2de21e089ee007e2cf951a | (diff)
download | ffmpeg-ab5f43e6342c4c07faf5c9ae87628d7d7c83abb6.tar.gz |
vc1dsp: Port ff_vc1_put_ver_16b_shift2_mmx to yasm
This function is only used within other inline asm functions, hence the
HAVE_MMX_INLINE guard. Per recent discussions, we should not worry about
the performance of inline asm-only builds.
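
The C side of the port boils down to a guard change plus a prototype for the new yasm symbol. A condensed sketch of the resulting linkage (surrounding code elided; the includes are added here only to make the sketch self-contained):

```c
#include <stdint.h>
#include "config.h"               /* HAVE_* feature macros */
#include "libavutil/x86/asm.h"    /* x86_reg */

/* The body now lives in vc1dsp.asm; the C file keeps only this
 * declaration.  HAVE_MMX_EXTERNAL joins the guard because the symbol
 * is assembled separately, while its callers remain inline asm. */
#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
                                   x86_reg stride, int rnd, int64_t shift);
#endif
```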
-rw-r--r-- | libavcodec/x86/vc1dsp.asm | 89
-rw-r--r-- | libavcodec/x86/vc1dsp_mmx.c | 59

2 files changed, 95 insertions(+), 53 deletions(-)
```diff
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 546688cf9d..cbf1336f3a 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -1,5 +1,6 @@
 ;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 DSP optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 ;* Copyright (c) 2009 David Conrad
 ;*
 ;* This file is part of FFmpeg.
@@ -23,6 +24,7 @@
 
 cextern pw_4
 cextern pw_5
+cextern pw_9
 
 section .text
 
@@ -315,3 +317,88 @@ cglobal vc1_h_loop_filter8, 3,5,8
     START_H_FILTER 8
     VC1_H_LOOP_FILTER 8
     RET
+
+%if HAVE_MMX_INLINE
+%macro NORMALIZE_MMX 1 ; shift
+    paddw      m3, m7 ; +bias-r
+    paddw      m4, m7 ; +bias-r
+    psraw      m3, %1
+    psraw      m4, %1
+%endmacro
+
+; Compute the rounder 32-r or 8-r and unpacks it to m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+    movd       m7, %1
+    punpcklwd  m7, m7
+    punpckldq  m7, m7
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
+    paddw      m%3, m%4
+    movh       m%2, [srcq + stride_neg2]
+    pmullw     m%3, m6
+    punpcklbw  m%2, m0
+    movh       m%5, [srcq + strideq]
+    psubw      m%3, m%2
+    punpcklbw  m%5, m0
+    paddw      m%3, m7
+    psubw      m%3, m%5
+    psraw      m%3, shift
+    movu       [dstq + %1], m%3
+    add        srcq, strideq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+;                                    x86_reg stride, int rnd, int64_t shift)
+; Sacrificing m6 makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+    DECLARE_REG_TMP 3, 4, 5
+    %define rnd r3mp
+    %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+    DECLARE_REG_TMP 4, 5, 6
+    %define rnd r3d
+    ; We need shift either in memory or in a mm reg as it's used in psraw
+    ; On WIN64, the arg is already on the stack
+    ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+    %define shift r4mp
+%else ; UNIX64
+    %define shift m5
+    mova       shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+    mov        stride_neg2, strideq
+    neg        stride_neg2
+    add        stride_neg2, stride_neg2
+    lea        stride_9minus4, [strideq * 9 - 4]
+    mov        i, 3
+    LOAD_ROUNDER_MMX rnd
+    mova       m6, [pw_9]
+    pxor       m0, m0
+.loop:
+    movh       m2, [srcq]
+    add        srcq, strideq
+    movh       m3, [srcq]
+    punpcklbw  m2, m0
+    punpcklbw  m3, m0
+    SHIFT2_LINE   0, 1, 2, 3, 4
+    SHIFT2_LINE  24, 2, 3, 4, 1
+    SHIFT2_LINE  48, 3, 4, 1, 2
+    SHIFT2_LINE  72, 4, 1, 2, 3
+    SHIFT2_LINE  96, 1, 2, 3, 4
+    SHIFT2_LINE 120, 2, 3, 4, 1
+    SHIFT2_LINE 144, 3, 4, 1, 2
+    SHIFT2_LINE 168, 4, 1, 2, 3
+    sub        srcq, stride_9minus4
+    add        dstq, 8
+    dec        i
+    jnz        .loop
+    REP_RET
+%endif ; HAVE_MMX_INLINE
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index e42099b46c..c268cc6a8e 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -33,7 +33,11 @@
 #include "fpel.h"
 #include "vc1dsp.h"
 
-#if HAVE_6REGS && HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+                                   const uint8_t *src, x86_reg stride,
+                                   int rnd, int64_t shift);
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D)  "pavgb " #S ", " #D " \n\t"
@@ -66,55 +70,6 @@
     "punpcklwd %%mm7, %%mm7    \n\t"   \
     "punpckldq %%mm7, %%mm7    \n\t"
 
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3)               \
-    "paddw     %%mm"#R2", %%mm"#R1"         \n\t"   \
-    "movd      (%0,%3), %%mm"#R0"           \n\t"   \
-    "pmullw    %%mm6, %%mm"#R1"             \n\t"   \
-    "punpcklbw %%mm0, %%mm"#R0"             \n\t"   \
-    "movd      (%0,%2), %%mm"#R3"           \n\t"   \
-    "psubw     %%mm"#R0", %%mm"#R1"         \n\t"   \
-    "punpcklbw %%mm0, %%mm"#R3"             \n\t"   \
-    "paddw     %%mm7, %%mm"#R1"             \n\t"   \
-    "psubw     %%mm"#R3", %%mm"#R1"         \n\t"   \
-    "psraw     %4, %%mm"#R1"                \n\t"   \
-    "movq      %%mm"#R1", "#OFF"(%1)        \n\t"   \
-    "add       %2, %0                       \n\t"
-
-/** Sacrificing mm6 makes it possible to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
-                                       const uint8_t *src, x86_reg stride,
-                                       int rnd, int64_t shift)
-{
-    __asm__ volatile(
-        "mov       $3, %%"REG_c"            \n\t"
-        LOAD_ROUNDER_MMX("%5")
-        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
-        "1:                                 \n\t"
-        "movd      (%0), %%mm2              \n\t"
-        "add       %2, %0                   \n\t"
-        "movd      (%0), %%mm3              \n\t"
-        "punpcklbw %%mm0, %%mm2             \n\t"
-        "punpcklbw %%mm0, %%mm3             \n\t"
-        SHIFT2_LINE(  0, 1, 2, 3, 4)
-        SHIFT2_LINE( 24, 2, 3, 4, 1)
-        SHIFT2_LINE( 48, 3, 4, 1, 2)
-        SHIFT2_LINE( 72, 4, 1, 2, 3)
-        SHIFT2_LINE( 96, 1, 2, 3, 4)
-        SHIFT2_LINE(120, 2, 3, 4, 1)
-        SHIFT2_LINE(144, 3, 4, 1, 2)
-        SHIFT2_LINE(168, 4, 1, 2, 3)
-        "sub       %6, %0                   \n\t"
-        "add       $8, %1                   \n\t"
-        "dec       %%"REG_c"                \n\t"
-        "jnz       1b                       \n\t"
-        : "+r"(src), "+r"(dst)
-        : "r"(stride), "r"(-2*stride),
-          "m"(shift), "m"(rnd), "r"(9*stride-4)
-          NAMED_CONSTRAINTS_ADD(ff_pw_9)
-        : "%"REG_c, "memory"
-    );
-}
-
 /**
  * Data is already unpacked, so some operations can directly be made from
  * memory.
@@ -430,7 +385,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                                int hmode, int vmode, int rnd)\
 {\
     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
-        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+        { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
@@ -780,4 +735,4 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
     dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
 }
-#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
```
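
For readers who don't want to decode the MMX, the routine implements the VC-1 quarter-pel vertical pass with (-1, 9, 9, -1) bicubic taps. Below is a plain-C model of what it computes; the `_ref` name is hypothetical, and the 12-sample row pitch is inferred from the 24-byte dst offsets (0, 24, ..., 168) and the three 4-column passes of the loop, so treat it as a sketch rather than FFmpeg code:

```c
#include <stdint.h>
#include <stddef.h>

/* Hypothetical plain-C reference (not part of FFmpeg): vertical
 * (-1, 9, 9, -1) filtering of a 12x8 block into 16-bit intermediates
 * for the later horizontal pass.  'rnd' is the caller-prepared
 * rounding bias and 'shift' the caller-prepared normalization; the
 * asm likewise receives both ready-made. */
static void vc1_put_ver_16b_shift2_ref(int16_t *dst, const uint8_t *src,
                                       ptrdiff_t stride, int rnd, int shift)
{
    for (int y = 0; y < 8; y++) {        /* 8 output rows */
        for (int x = 0; x < 12; x++) {   /* 8 columns + 3 extra, padded */
            const uint8_t *s = src + x + (ptrdiff_t)y * stride;
            dst[y * 12 + x] = (-s[-stride] + 9 * s[0] +
                               9 * s[stride] - s[2 * stride] + rnd) >> shift;
        }
    }
}
```

The asm walks the same block column-group by column-group instead (three passes of four columns), keeping the constant 9 parked in m6 so each SHIFT2_LINE can start the next row's load while the current row's pmullw is still in flight; that is the pipelining the "Sacrificing m6" comment refers to.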