author     Timothy Gu <timothygu99@gmail.com>        2015-10-19 02:25:11 +0100
committer  Timothy Gu <timothygu99@gmail.com>        2015-10-21 12:25:32 -0700
commit     5e586e1befaab7b77883d73c0ebcb19f1d786ed8 (patch)
tree       4cb281e532f1c55180e021daa03dcf31c811f79d /libavcodec/x86
parent     da43e9e157a3c64337348b44193ab390bd481911 (diff)
download   ffmpeg-5e586e1befaab7b77883d73c0ebcb19f1d786ed8.tar.gz
huffyuvencdsp: Add ff_diff_bytes_{sse2,avx2}
The SSE2 version is 4%-35% faster than MMX, depending on the width.
The AVX2 version is 1%-13% faster than SSE2, depending on the width.
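
For context, diff_bytes is a straight byte-wise subtraction of two buffers. A minimal scalar sketch of the operation the new SIMD versions accelerate (names here are illustrative; the C fallback lives elsewhere in libavcodec):

#include <stdint.h>

/* Illustrative scalar reference of the diff_bytes operation: every output
 * byte is the wrapping difference of the corresponding input bytes, which
 * is what psubb computes 16 (SSE2) or 32 (AVX2) bytes at a time. */
static void diff_bytes_scalar(uint8_t *dst, const uint8_t *src1,
                              const uint8_t *src2, intptr_t w)
{
    for (intptr_t i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}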
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/huffyuvencdsp.asm    110
-rw-r--r--  libavcodec/x86/huffyuvencdsp_mmx.c   14
2 files changed, 105 insertions, 19 deletions
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index e001906742..699fd38495 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -27,9 +27,9 @@
 
 section .text
 
-INIT_MMX mmx
 ; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                        intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
 %if ARCH_X86_32
 cglobal diff_bytes, 3,5,2, dst, src1, src2
 %define wq r4q
@@ -40,34 +40,108 @@ cglobal diff_bytes, 4,5,2, dst, src1, src2, w
 DECLARE_REG_TMP 4
 %endif ; ARCH_X86_32
 %define i t0q
+%endmacro
+
+; label to jump to if w < regsize
+%macro DIFF_BYTES_LOOP_PREP 1
     mov          i, wq
-    and          i, -2 * mmsize
-    jz  .setup_loop2
+    and          i, -2 * regsize
+    jz  %1
     add       dstq, i
     add      src1q, i
     add      src2q, i
     neg          i
-.loop:
-    mova        m0, [src1q + i]
-    mova        m1, [src1q + i + mmsize]
-    psubb       m0, [src2q + i]
-    psubb       m1, [src2q + i + mmsize]
-    mova        [dstq + i], m0
-    mova        [mmsize + dstq + i], m1
-    add          i, 2 * mmsize
-    jl  .loop
-.setup_loop2:
-    and         wq, 2 * mmsize - 1
-    jz  .end
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if regsize != 16
+    mov%1       %3, [src1q + i]
+    mov%1       %4, [src1q + i + regsize]
+    psubb       %3, [src2q + i]
+    psubb       %4, [src2q + i + regsize]
+    mov%2       [dstq + i], %3
+    mov%2       [regsize + dstq + i], %4
+%else
+    ; SSE enforces alignment of psubb operand
+    mov%1       %3, [src1q + i]
+    movu        %4, [src2q + i]
+    psubb       %3, %4
+    mov%2       [dstq + i], %3
+    mov%1       %3, [src1q + i + regsize]
+    movu        %4, [src2q + i + regsize]
+    psubb       %3, %4
+    mov%2       [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+    %define regsize mmsize
+.loop_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+    add          i, 2 * regsize
+    jl  .loop_%1%2
+.skip_main_%1%2:
+    and         wq, 2 * regsize - 1
+    jz  .end_%1%2
+%if mmsize > 16
+    ; fall back to narrower xmm
+    %define regsize mmsize / 2
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
+.loop2_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+    add          i, 2 * regsize
+    jl  .loop2_%1%2
+.setup_loop_gpr_%1%2:
+    and         wq, 2 * regsize - 1
+    jz  .end_%1%2
+%endif
     add       dstq, wq
     add      src1q, wq
     add      src2q, wq
     neg         wq
-.loop2:
+.loop_gpr_%1%2:
     mov        t0b, [src1q + wq]
     sub        t0b, [src2q + wq]
     mov        [dstq + wq], t0b
     inc         wq
-    jl  .loop2
-.end:
+    jl  .loop_gpr_%1%2
+.end_%1%2:
     REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    DIFF_BYTES_BODY      a, a
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    test      dstq, regsize - 1
+    jnz .loop_uu
+    test     src1q, regsize - 1
+    jnz .loop_ua
+    DIFF_BYTES_BODY      a, a
+    DIFF_BYTES_BODY      u, a
+    DIFF_BYTES_BODY      u, u
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    ; Directly using unaligned SSE2 version is marginally faster than
+    ; branching based on arguments.
+    DIFF_BYTES_LOOP_PREP .skip_main_uu
+    test      dstq, regsize - 1
+    jnz .loop_uu
+    test     src1q, regsize - 1
+    jnz .loop_ua
+    DIFF_BYTES_BODY      a, a
+    DIFF_BYTES_BODY      u, a
+    DIFF_BYTES_BODY      u, u
+%endif
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index f28df05dba..0ba4358165 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -31,6 +31,10 @@
 
 void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
 
 #if HAVE_INLINE_ASM
 
@@ -80,7 +84,7 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
 {
     av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
         c->diff_bytes = ff_diff_bytes_mmx;
     }
 
@@ -89,4 +93,12 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
         c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
     }
 #endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_sse2;
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_avx2;
+    }
 }
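
Roughly, the DIFF_BYTES_LOOP_CORE used by the aligned SSE2 path corresponds to the C intrinsics below. This is a sketch for illustration only; it assumes dst/src1 are 16-byte aligned while src2 may not be, which is why the asm loads src2 with movu before psubb.

#include <stdint.h>
#include <emmintrin.h>   /* SSE2 intrinsics */

/* Hypothetical C equivalent of one "a, a" DIFF_BYTES_LOOP_CORE step:
 * two 16-byte blocks per iteration, aligned loads/stores for src1/dst,
 * unaligned loads for src2 since psubb needs an aligned memory operand. */
static void diff_bytes_core_sse2(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, intptr_t i)
{
    __m128i a0 = _mm_load_si128 ((const __m128i *)(src1 + i));
    __m128i b0 = _mm_loadu_si128((const __m128i *)(src2 + i));
    __m128i a1 = _mm_load_si128 ((const __m128i *)(src1 + i + 16));
    __m128i b1 = _mm_loadu_si128((const __m128i *)(src2 + i + 16));
    _mm_store_si128((__m128i *)(dst + i),      _mm_sub_epi8(a0, b0));
    _mm_store_si128((__m128i *)(dst + i + 16), _mm_sub_epi8(a1, b1));
}

The ua and uu bodies simply swap in unaligned accesses as dictated by the runtime alignment checks on dstq and src1q in the prologue.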