diff options
author | Timothy Gu <timothygu99@gmail.com> | 2016-02-08 19:04:00 +0000 |
---|---|---|
committer | Timothy Gu <timothygu99@gmail.com> | 2016-02-08 13:35:24 -0800 |
commit | 253209ac444947d4735be84469c582df2718a59e (patch) | |
tree | 5c9f80bc40f7b82f990b172b9ccc264c3c0c42c3 /libavfilter/x86/vf_blend.asm | |
parent | a25c5dbb5ee0f54c474d9caf43359cd0f61ae1bf (diff) | |
download | ffmpeg-253209ac444947d4735be84469c582df2718a59e.tar.gz |
vf_blend: Add SSE2 optimization for multiply
5 times faster than C, 3 times overall.
Diffstat (limited to 'libavfilter/x86/vf_blend.asm')
-rw-r--r-- | libavfilter/x86/vf_blend.asm | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 730be77d00..9388a74250 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@

 SECTION_RODATA

+pw_1:   times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
 pb_127: times 16 db 127
@@ -101,6 +102,34 @@ BLEND_INIT difference128, 4
     jl .loop
 BLEND_END

+BLEND_INIT multiply, 4
+    pxor       m2, m2
+    mova       m3, [pw_1]
+.nextrow:
+    mov        xq, widthq
+
+    .loop:
+                                                ; word
+                                                ; |--|
+        movh            m0, [topq + xq]         ; 0000xxxx
+        movh            m1, [bottomq + xq]
+        punpcklbw       m0, m2                  ; 00xx00xx
+        punpcklbw       m1, m2
+
+        pmullw          m0, m1                  ; xxxxxxxx a * b
+        paddw           m0, m3
+        mova            m1, m0
+        psrlw           m1, 8
+        paddw           m0, m1
+        psrlw           m0, 8                   ; 00xx00xx a * b / 255
+
+        packuswb        m0, m0                  ; 0000xxxx
+        movh   [dstq + xq], m0
+        add             xq, mmsize / 2
+
+    jl .loop
+BLEND_END
+
 BLEND_INIT average, 3
     pxor       m2, m2
 .nextrow: