diff options
author | James Almer <jamrial@gmail.com> | 2014-05-23 04:05:43 -0300 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-05-23 22:59:55 +0200 |
commit | 518cbf9b4a0a5d17e61fa9ee71330c02bd339f32 (patch) | |
tree | 77a2eee5c0819c65bc4434a5ced15c885d6d5b3e | |
parent | 6b88f22e895314164eef00ccbdbf11ad97a7c56f (diff) | |
download | ffmpeg-518cbf9b4a0a5d17e61fa9ee71330c02bd339f32.tar.gz |
x86/dsputil: fix VECTOR_CLIP_INT32 macro
The inline loop was incrementing and using the value of %%i
the wrong way.
Disassembly of ff_vector_clip_int32_sse2 before and after
this patch:
movdqa (%rdx),%xmm0 | movdqa (%rdx),%xmm0
movdqa 0x10(%rdx),%xmm1 | movdqa 0x10(%rdx),%xmm1
movdqa 0x20(%rdx),%xmm2 | movdqa 0x20(%rdx),%xmm2
movdqa 0x30(%rdx),%xmm3 | movdqa 0x30(%rdx),%xmm3
[...] |
movdqa %xmm0,(%rcx) | movdqa %xmm0,(%rcx)
movdqa %xmm1,0x10(%rcx) | movdqa %xmm1,0x10(%rcx)
movdqa %xmm2,0x20(%rcx) | movdqa %xmm2,0x20(%rcx)
movdqa %xmm3,0x30(%rcx) | movdqa %xmm3,0x30(%rcx)
movdqa (%rdx),%xmm0 | movdqa 0x40(%rdx),%xmm0
movdqa 0x20(%rdx),%xmm1 | movdqa 0x50(%rdx),%xmm1
movdqa 0x40(%rdx),%xmm2 | movdqa 0x60(%rdx),%xmm2
movdqa 0x60(%rdx),%xmm3 | movdqa 0x70(%rdx),%xmm3
[...] |
movdqa %xmm0,(%rcx) | movdqa %xmm0,0x40(%rcx)
movdqa %xmm1,0x20(%rcx) | movdqa %xmm1,0x50(%rcx)
movdqa %xmm2,0x40(%rcx) | movdqa %xmm2,0x60(%rcx)
movdqa %xmm3,0x60(%rcx) | movdqa %xmm3,0x70(%rcx)
add $0x80,%rdx | add $0x80,%rdx
add $0x80,%rcx | add $0x80,%rcx
Other versions were unaffected.
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/x86/dsputil.asm | 36 |
1 files changed, 18 insertions, 18 deletions
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 8ebc9a06d2..bba60e5364 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -351,17 +351,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len SPLATD m4 SPLATD m5 .loop: -%assign %%i 1 +%assign %%i 0 %rep %2 - mova m0, [srcq+mmsize*0*%%i] - mova m1, [srcq+mmsize*1*%%i] - mova m2, [srcq+mmsize*2*%%i] - mova m3, [srcq+mmsize*3*%%i] + mova m0, [srcq+mmsize*(0+%%i)] + mova m1, [srcq+mmsize*(1+%%i)] + mova m2, [srcq+mmsize*(2+%%i)] + mova m3, [srcq+mmsize*(3+%%i)] %if %3 - mova m7, [srcq+mmsize*4*%%i] - mova m8, [srcq+mmsize*5*%%i] - mova m9, [srcq+mmsize*6*%%i] - mova m10, [srcq+mmsize*7*%%i] + mova m7, [srcq+mmsize*(4+%%i)] + mova m8, [srcq+mmsize*(5+%%i)] + mova m9, [srcq+mmsize*(6+%%i)] + mova m10, [srcq+mmsize*(7+%%i)] %endif CLIPD m0, m4, m5, m6 CLIPD m1, m4, m5, m6 @@ -373,17 +373,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len CLIPD m9, m4, m5, m6 CLIPD m10, m4, m5, m6 %endif - mova [dstq+mmsize*0*%%i], m0 - mova [dstq+mmsize*1*%%i], m1 - mova [dstq+mmsize*2*%%i], m2 - mova [dstq+mmsize*3*%%i], m3 + mova [dstq+mmsize*(0+%%i)], m0 + mova [dstq+mmsize*(1+%%i)], m1 + mova [dstq+mmsize*(2+%%i)], m2 + mova [dstq+mmsize*(3+%%i)], m3 %if %3 - mova [dstq+mmsize*4*%%i], m7 - mova [dstq+mmsize*5*%%i], m8 - mova [dstq+mmsize*6*%%i], m9 - mova [dstq+mmsize*7*%%i], m10 + mova [dstq+mmsize*(4+%%i)], m7 + mova [dstq+mmsize*(5+%%i)], m8 + mova [dstq+mmsize*(6+%%i)], m9 + mova [dstq+mmsize*(7+%%i)], m10 %endif -%assign %%i %%i+1 +%assign %%i %%i+4*(%3+1) %endrep add srcq, mmsize*4*(%2+%3) add dstq, mmsize*4*(%2+%3) |