diff options
author | Michael Niedermayer <michael@niedermayer.cc> | 2016-02-17 00:14:56 +0100 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2016-04-29 14:23:40 +0200 |
commit | 7bf84d1c2b4e5f334e0704bb33d152891f0463b6 (patch) | |
tree | 5697facf662435948c15c40cfde1a63e7241d3e3 | |
parent | bd641685f4ce73369c158d4abcb3eb138a38a891 (diff) | |
download | ffmpeg-7bf84d1c2b4e5f334e0704bb33d152891f0463b6.tar.gz |
swscale/x86/output: Fix yuv2planeX_16* with unaligned destination
Reviewed-by: BBB
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
(cherry picked from commit f6492a2ea8df80be0ed9591aee4019cef0e36e99)
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r-- | libswscale/x86/output.asm | 23 |
1 files changed, 16 insertions, 7 deletions
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index 9570969cea..133817cb71 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -54,8 +54,8 @@ SECTION .text ; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple ; of 2. $offset is either 0 or 3. $dither holds 8 values. ;----------------------------------------------------------------------------- -%macro yuv2planeX_mainloop 1 -.pixelloop: +%macro yuv2planeX_mainloop 2 +.pixelloop_%2: %assign %%i 0 ; the rep here is for the 8bit output mmx case, where dither covers ; 8 pixels but we can only handle 2 pixels per register, and thus 4 @@ -82,7 +82,7 @@ SECTION .text mova m2, m1 %endif ; %1 == 8/9/10/16 movsx cntr_reg, fltsizem -.filterloop_ %+ %%i: +.filterloop_%2_ %+ %%i: ; input pixels mov r6, [srcq+gprsize*cntr_reg-2*gprsize] %if %1 == 16 @@ -129,7 +129,7 @@ SECTION .text %endif ; %1 == 8/9/10/16 sub cntr_reg, 2 - jg .filterloop_ %+ %%i + jg .filterloop_%2_ %+ %%i %if %1 == 16 psrad m2, 31 - %1 @@ -156,7 +156,7 @@ SECTION .text %endif ; mmxext/sse2/sse4/avx pminsw m2, [yuv2yuvX_%1_upper] %endif ; %1 == 9/10/16 - mova [dstq+r5*2], m2 + mov%2 [dstq+r5*2], m2 %endif ; %1 == 8/9/10/16 add r5, mmsize/2 @@ -164,7 +164,7 @@ SECTION .text %assign %%i %%i+2 %endrep - jg .pixelloop + jg .pixelloop_%2 %endmacro %macro yuv2planeX_fn 3 @@ -235,7 +235,16 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset xor r5, r5 -yuv2planeX_mainloop %1 +%if mmsize == 8 || %1 == 8 + yuv2planeX_mainloop %1, a +%else ; mmsize == 16 + test dstq, 15 + jnz .unaligned + yuv2planeX_mainloop %1, a + REP_RET +.unaligned: + yuv2planeX_mainloop %1, u +%endif ; mmsize == 8/16 %if %1 == 8 %if ARCH_X86_32 |