diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2012-08-03 16:58:26 +0200 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-08-03 11:09:14 -0700 |
commit | da6505ad2fc8ef045401a3d9f980586ac5cf808c (patch) | |
tree | ecb499ab5873b038842b8ec382644edb57b0564e | |
parent | 9cc74c9f6e8b645e67d45b2070db004caca09af7 (diff) | |
download | ffmpeg-da6505ad2fc8ef045401a3d9f980586ac5cf808c.tar.gz |
dsputil: make add_hfyu_left_prediction_sse4() support unaligned src.
This makes add_hfyu_left_prediction_sse4() handle sources that are not
16-byte aligned in its own function rather than by proxying the call to
add_hfyu_left_prediction_ssse3(). This fixes a crash on Win64, since the
sse4 version clobbers xmm6, but the ssse3 version (which uses MMX regs)
does not restore it, thus leading to XMM clobbering and RSP being off.
Fixes bug 342.
-rw-r--r-- | libavcodec/x86/dsputil_yasm.asm | 20 |
1 files changed, 13 insertions, 7 deletions
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 70a0aa12e7..af2de15a25 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -388,12 +388,16 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to RET -%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned +%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned add srcq, wq add dstq, wq neg wq %%.loop: +%if %2 mova m1, [srcq+wq] +%else + movu m1, [srcq+wq] +%endif mova m2, m1 psllw m1, 8 paddb m1, m2 @@ -435,7 +439,7 @@ cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left mova m3, [pb_zz11zz55zz99zzdd] movd m0, leftm psllq m0, 56 - ADD_HFYU_LEFT_LOOP 1 + ADD_HFYU_LEFT_LOOP 1, 1 INIT_XMM cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left @@ -446,12 +450,14 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left movd m0, leftm pslldq m0, 15 test srcq, 15 - jnz add_hfyu_left_prediction_ssse3.skip_prologue + jnz .src_unaligned test dstq, 15 - jnz .unaligned - ADD_HFYU_LEFT_LOOP 1 -.unaligned: - ADD_HFYU_LEFT_LOOP 0 + jnz .dst_unaligned + ADD_HFYU_LEFT_LOOP 1, 1 +.dst_unaligned: + ADD_HFYU_LEFT_LOOP 0, 1 +.src_unaligned: + ADD_HFYU_LEFT_LOOP 0, 0 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) |