diff options
author | Timothy Gu <timothygu99@gmail.com> | 2016-02-10 09:04:51 +0000 |
---|---|---|
committer | Timothy Gu <timothygu99@gmail.com> | 2016-02-28 08:19:09 -0800 |
commit | 222e6da605eadd9afa386f0a6c3142b16e16cf74 (patch) | |
tree | e701f0c4485279d065245ae87571a7109a224177 /libavfilter/x86 | |
parent | 1c9215e580b6436d1aff3c0118ef01269712ebd9 (diff) | |
download | ffmpeg-222e6da605eadd9afa386f0a6c3142b16e16cf74.tar.gz |
x86/vf_blend: Add SSE2 optimization for divide
4.5x faster than C float version with autovectorization
10x faster than C int version
25x faster than C float version without autovectorization
Diffstat (limited to 'libavfilter/x86')
-rw-r--r-- | libavfilter/x86/vf_blend.asm | 30 | ||||
-rw-r--r-- | libavfilter/x86/vf_blend_init.c | 2 |
2 files changed, 32 insertions, 0 deletions
```diff
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 47471aaaaf..33b1ad1496 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+ps_255: times 4 dd 255.0
 pw_1: times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
@@ -218,6 +219,35 @@ BLEND_INIT hardmix, 5
     jl .loop
 BLEND_END
 
+BLEND_INIT divide, 4
+    pxor m2, m2
+    mova m3, [ps_255]
+.nextrow:
+    mov xq, widthq
+
+    .loop:
+        movd m0, [topq + xq] ; 000000xx
+        movd m1, [bottomq + xq]
+        punpcklbw m0, m2 ; 00000x0x
+        punpcklbw m1, m2
+        punpcklwd m0, m2 ; 000x000x
+        punpcklwd m1, m2
+
+        cvtdq2ps m0, m0
+        cvtdq2ps m1, m1
+        divps m0, m1 ; a / b
+        mulps m0, m3 ; a / b * 255
+        minps m0, m3
+        cvttps2dq m0, m0
+
+        packssdw m0, m0 ; 00000x0x
+        packuswb m0, m0 ; 000000xx
+        movd [dstq + xq], m0
+        add xq, mmsize / 4
+
+    jl .loop
+BLEND_END
+
 BLEND_INIT phoenix, 4
     mova m3, [pb_255]
 .nextrow:
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index 555e1e54dc..ed053ba272 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -39,6 +39,7 @@ BLEND_FUNC(difference128, sse2)
 BLEND_FUNC(multiply, sse2)
 BLEND_FUNC(screen, sse2)
 BLEND_FUNC(hardmix, sse2)
+BLEND_FUNC(divide, sse2)
 BLEND_FUNC(lighten, sse2)
 BLEND_FUNC(or, sse2)
 BLEND_FUNC(phoenix, sse2)
@@ -61,6 +62,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         case BLEND_AVERAGE: param->blend = ff_blend_average_sse2; break;
         case BLEND_DARKEN: param->blend = ff_blend_darken_sse2; break;
         case BLEND_DIFFERENCE128: param->blend = ff_blend_difference128_sse2; break;
+        case BLEND_DIVIDE: param->blend = ff_blend_divide_sse2; break;
         case BLEND_HARDMIX: param->blend = ff_blend_hardmix_sse2; break;
         case BLEND_LIGHTEN: param->blend = ff_blend_lighten_sse2; break;
         case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;
```