about | summary | refs | log | tree | commit | diff | stats
path: root/libavcodec/x86/huffyuvencdsp.asm
diff options
context:
space:
mode:
author    James Almer <jamrial@gmail.com>  2017-01-07 23:13:48 -0300
committer James Almer <jamrial@gmail.com>  2017-01-12 22:53:04 -0300
commit    cf9ef839606dd50f779c395d8a277de143f7e5b2 (patch)
tree      615bcdf1fc268c6ef0b3cc75273ca08aff8254bd /libavcodec/x86/huffyuvencdsp.asm
parent    30c1f27299d3fc2b0c0858c003066cc5e36a28af (diff)
download  ffmpeg-cf9ef839606dd50f779c395d8a277de143f7e5b2.tar.gz
huffyuvencdsp: move shared functions to a new lossless_videoencdsp context
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/huffyuvencdsp.asm')
-rw-r--r--  libavcodec/x86/huffyuvencdsp.asm  124
1 file changed, 2 insertions, 122 deletions
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 78ad202249..1228aa8355 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -27,128 +27,8 @@
section .text
-; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
-; intptr_t w);
-%macro DIFF_BYTES_PROLOGUE 0
-%if ARCH_X86_32
-cglobal diff_bytes, 3,5,2, dst, src1, src2
-%define wq r4q
- DECLARE_REG_TMP 3
- mov wq, r3mp
-%else
-cglobal diff_bytes, 4,5,2, dst, src1, src2, w
- DECLARE_REG_TMP 4
-%endif ; ARCH_X86_32
-%define i t0q
-%endmacro
-
-; label to jump to if w < regsize
-%macro DIFF_BYTES_LOOP_PREP 1
- mov i, wq
- and i, -2 * regsize
- jz %1
- add dstq, i
- add src1q, i
- add src2q, i
- neg i
-%endmacro
-
-; mov type used for src1q, dstq, first reg, second reg
-%macro DIFF_BYTES_LOOP_CORE 4
-%if mmsize != 16
- mov%1 %3, [src1q + i]
- mov%1 %4, [src1q + i + regsize]
- psubb %3, [src2q + i]
- psubb %4, [src2q + i + regsize]
- mov%2 [dstq + i], %3
- mov%2 [regsize + dstq + i], %4
-%else
- ; SSE enforces alignment of psubb operand
- mov%1 %3, [src1q + i]
- movu %4, [src2q + i]
- psubb %3, %4
- mov%2 [dstq + i], %3
- mov%1 %3, [src1q + i + regsize]
- movu %4, [src2q + i + regsize]
- psubb %3, %4
- mov%2 [regsize + dstq + i], %3
-%endif
-%endmacro
-
-%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
- %define regsize mmsize
-.loop_%1%2:
- DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
- add i, 2 * regsize
- jl .loop_%1%2
-.skip_main_%1%2:
- and wq, 2 * regsize - 1
- jz .end_%1%2
-%if mmsize > 16
- ; fall back to narrower xmm
- %define regsize mmsize / 2
- DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
-.loop2_%1%2:
- DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
- add i, 2 * regsize
- jl .loop2_%1%2
-.setup_loop_gpr_%1%2:
- and wq, 2 * regsize - 1
- jz .end_%1%2
-%endif
- add dstq, wq
- add src1q, wq
- add src2q, wq
- neg wq
-.loop_gpr_%1%2:
- mov t0b, [src1q + wq]
- sub t0b, [src2q + wq]
- mov [dstq + wq], t0b
- inc wq
- jl .loop_gpr_%1%2
-.end_%1%2:
- REP_RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-DIFF_BYTES_PROLOGUE
- %define regsize mmsize
- DIFF_BYTES_LOOP_PREP .skip_main_aa
- DIFF_BYTES_BODY a, a
-%undef i
-%endif
-
-INIT_XMM sse2
-DIFF_BYTES_PROLOGUE
- %define regsize mmsize
- DIFF_BYTES_LOOP_PREP .skip_main_aa
- test dstq, regsize - 1
- jnz .loop_uu
- test src1q, regsize - 1
- jnz .loop_ua
- DIFF_BYTES_BODY a, a
- DIFF_BYTES_BODY u, a
- DIFF_BYTES_BODY u, u
-%undef i
-
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-DIFF_BYTES_PROLOGUE
- %define regsize mmsize
- ; Directly using unaligned SSE2 version is marginally faster than
- ; branching based on arguments.
- DIFF_BYTES_LOOP_PREP .skip_main_uu
- test dstq, regsize - 1
- jnz .loop_uu
- test src1q, regsize - 1
- jnz .loop_ua
- DIFF_BYTES_BODY a, a
- DIFF_BYTES_BODY u, a
- DIFF_BYTES_BODY u, u
-%undef i
-%endif
-
+; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; unsigned mask, int w);
%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
movd m4, maskd
SPLATW m4, m4