diff options
author | Alan Kelly <alankelly-at-google.com@ffmpeg.org> | 2022-04-26 10:00:02 +0200 |
---|---|---|
committer | Anton Khirnov <anton@khirnov.net> | 2022-08-18 16:24:48 +0200 |
commit | a6724285fd45111436dd5242eab2c489182aa5c2 (patch) | |
tree | fbe3a7a9261b5e857f8147fe717112016e0a0250 | |
parent | 51a34e8525fea2bbc29b42831d7a17f34e8518d3 (diff) | |
download | ffmpeg-a6724285fd45111436dd5242eab2c489182aa5c2.tar.gz |
sws: allow avx2 hscale to process inputs of any size.
The main loop processes blocks of 16 pixels. The tail processes blocks
of size 4.
Signed-off-by: Anton Khirnov <anton@khirnov.net>
-rw-r--r-- | libswscale/x86/scale_avx2.asm | 44 |
1 files changed, 43 insertions, 1 deletions
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 20acdbd633..37095e596a 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, mova m14, [four] shr fltsized, 2 %endif + cmp wq, 0x10 + jl .tail_loop + sub wq, 0x10 .loop: movu m1, [fltposq] movu m2, [fltposq+32] @@ -101,7 +104,46 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, add fltposq, 0x40 add countq, 0x10 cmp countq, wq - jl .loop + jle .loop + + add wq, 0x10 + cmp countq, wq + jge .end + +.tail_loop: + movu xm1, [fltposq] +%ifidn %1, X4 + pxor xm9, xm9 + pxor xm10, xm10 + xor innerq, innerq +.tail_innerloop: +%endif + vpcmpeqd xm13, xm13 + vpgatherdd xm3,[srcmemq + xm1], xm13 + vpunpcklbw xm5, xm3, xm0 + vpunpckhbw xm6, xm3, xm0 + vpmaddwd xm5, xm5, [filterq] + vpmaddwd xm6, xm6, [filterq + 0x10] + add filterq, 0x20 +%ifidn %1, X4 + paddd xm9, xm5 + paddd xm10, xm6 + paddd xm1, xm14 + add innerq, 1 + cmp innerq, fltsizeq + jl .tail_innerloop + vphaddd xm5, xm9, xm10 +%else + vphaddd xm5, xm5, xm6 +%endif + vpsrad xm5, 7 + vpackssdw xm5, xm5, xm5 + vmovq [dstq + countq * 2], xm5 + add fltposq, 0x10 + add countq, 0x4 + cmp countq, wq + jl .tail_loop +.end: REP_RET %endmacro |