aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Michael Niedermayer <michaelni@gmx.at> 2014-01-22 19:41:21 +0100
committer Michael Niedermayer <michaelni@gmx.at> 2014-01-22 19:41:21 +0100
commit 631939bde6e29e29131a0ca389e5e8dea4c3d038 (patch)
tree 5a6390ac41514a2d4a8f8c6906cd1ab89fa1211b
parent 7b89e24151eaabc1b526cfd199dd2a67cdec6154 (diff)
download ffmpeg-631939bde6e29e29131a0ca389e5e8dea4c3d038.tar.gz
avcodec/x86/lossless_videodsp: add diff_int16_mmx/sse2
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r--libavcodec/x86/lossless_videodsp.asm66
-rw-r--r--libavcodec/x86/lossless_videodsp_init.c4
2 files changed, 70 insertions, 0 deletions
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index e496c804c8..37663d70df 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -93,6 +93,72 @@ cglobal add_int16, 4,4,5, dst, src, mask, w
.unaligned:
ADD_INT16_LOOP 0
+; DIFF_INT16_LOOP is_aligned
+; Emits the body of diff_int16 and returns:
+;     dst[i] = (src1[i] - src2[i]) & mask      for w 16-bit elements
+;   %1 = 1: vector loads/stores use mova; %1 = 0: they use movu.
+; Relies on the cglobal argument names dst, src1, src2, mask, w and uses
+; m0-m4. Work proceeds from the end of the arrays: a scalar loop first trims
+; the byte count down to a multiple of 2*mmsize, then the vector loop covers
+; what is left, two vector registers per iteration.
+%macro DIFF_INT16_LOOP 1 ; %1 = is_aligned
+    movd    m4, maskd
+    SPLATW  m4, m4                  ; broadcast the 16-bit mask to every word lane
+    add     wd, wd                  ; elements -> bytes; w is a 32-bit int, so use a
+                                    ; 32-bit op to leave bits 32-63 of wq cleared
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop
+    push    r5q                     ; scratch for the scalar loop; ax must not be
+                                    ; used because r0 (dst) is eax on x86-32
+%%.wordloop:
+    sub     wq, 2
+    mov     r5w, [src1q+wq]
+    sub     r5w, [src2q+wq]
+    and     r5w, maskw
+    mov     [dstq+wq], r5w
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+    pop     r5q                     ; pop leaves the flags untouched
+%%.tomainloop:
+    ; Point all three arrays at the end of the remaining region and run a
+    ; negative byte offset up towards zero.
+    add     src1q, wq
+    add     src2q, wq
+    add     dstq, wq
+    neg     wq
+    jz %%.end                       ; nothing left for the vector loop
+%%.loop:
+%if %1
+    mova    m0, [src1q+wq]
+    mova    m1, [src2q+wq]
+    mova    m2, [src1q+wq+mmsize]
+    mova    m3, [src2q+wq+mmsize]
+%else
+    movu    m0, [src1q+wq]
+    movu    m1, [src2q+wq]
+    movu    m2, [src1q+wq+mmsize]
+    movu    m3, [src2q+wq+mmsize]
+%endif
+    psubw   m0, m1
+    psubw   m2, m3
+    pand    m0, m4
+    pand    m2, m4
+%if %1
+    mova    [dstq+wq]       , m0
+    mova    [dstq+wq+mmsize], m2
+%else
+    movu    [dstq+wq]       , m0
+    movu    [dstq+wq+mmsize], m2
+%endif
+    add     wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET
+%endmacro
+
+; diff_int16, MMX build: dst[i] = (src1[i] - src2[i]) & mask for w elements.
+; Always expands the loop with %1 = 1 (mova loads/stores); no pointer
+; alignment test is performed in this variant.
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w
+ DIFF_INT16_LOOP 1
+
+; diff_int16, SSE2 build: dst[i] = (src1[i] - src2[i]) & mask for w elements.
+; If any of src1, src2 or dst is not a multiple of mmsize, control jumps to
+; the movu-based expansion at .unaligned; otherwise the mova-based expansion
+; runs. Each DIFF_INT16_LOOP expansion ends in RET, so the first one does not
+; fall through into .unaligned.
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w
+ test src1q, mmsize-1
+ jnz .unaligned
+ test src2q, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ DIFF_INT16_LOOP 1
+.unaligned:
+ DIFF_INT16_LOOP 0
+
+
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
add wq, wq
add srcq, wq
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 88424ba1f9..9927ca38f3 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -23,6 +23,8 @@
void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+/* x86 assembly versions of diff_int16 (lossless_videodsp.asm):
+ * dst[i] = (src1[i] - src2[i]) & mask for each of the w 16-bit elements. */
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
int ff_add_hfyu_left_prediction_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc);
int ff_add_hfyu_left_prediction_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc);
@@ -32,10 +34,12 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
if (EXTERNAL_MMX(cpu_flags)) {
c->add_int16 = ff_add_int16_mmx;
+ c->diff_int16 = ff_diff_int16_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->add_int16 = ff_add_int16_sse2;
+ c->diff_int16 = ff_diff_int16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {