aboutsummaryrefslogtreecommitdiffstats
path: root/libavfilter/x86
diff options
context:
space:
mode:
author: Timothy Gu <timothygu99@gmail.com>, 2016-02-10 09:04:51 +0000
committer: Timothy Gu <timothygu99@gmail.com>, 2016-02-28 08:19:09 -0800
commit: 222e6da605eadd9afa386f0a6c3142b16e16cf74 (patch)
tree: e701f0c4485279d065245ae87571a7109a224177 /libavfilter/x86
parent: 1c9215e580b6436d1aff3c0118ef01269712ebd9 (diff)
download: ffmpeg-222e6da605eadd9afa386f0a6c3142b16e16cf74.tar.gz
x86/vf_blend: Add SSE2 optimization for divide
4.5x faster than C float version with autovectorization
10x faster than C int version
25x faster than C float version without autovectorization
Diffstat (limited to 'libavfilter/x86')
-rw-r--r-- libavfilter/x86/vf_blend.asm 30
-rw-r--r-- libavfilter/x86/vf_blend_init.c 2
2 files changed, 32 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 47471aaaaf..33b1ad1496 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
SECTION_RODATA
+ps_255: times 4 dd 255.0 ; four packed single-precision floats, each 255.0 (loaded into m3 by the divide blend)
pw_1: times 8 dw 1
pw_128: times 8 dw 128
pw_255: times 8 dw 255
@@ -218,6 +219,35 @@ BLEND_INIT hardmix, 5
jl .loop
BLEND_END
+BLEND_INIT divide, 4 ; divide blend: dst = min(top / bottom * 255, 255), computed in float, 4 bytes per iteration
+ pxor m2, m2 ; m2 = 0, the zero source interleaved by the unpack instructions below
+ mova m3, [ps_255] ; m3 = 4 x 255.0f, used both as the scale factor and as the upper clamp
+.nextrow:
+ mov xq, widthq ; restart the per-row index from widthq (NOTE(review): presumably negative, set up by BLEND_INIT -- not visible here)
+
+ .loop:
+ movd m0, [topq + xq] ; 000000xx -- load 4 bytes from top
+ movd m1, [bottomq + xq] ; load 4 bytes from bottom
+ punpcklbw m0, m2 ; 00000x0x -- interleave with zero: bytes -> words
+ punpcklbw m1, m2
+ punpcklwd m0, m2 ; 000x000x -- interleave with zero: words -> dwords
+ punpcklwd m1, m2
+
+ cvtdq2ps m0, m0 ; int32 -> float
+ cvtdq2ps m1, m1
+ divps m0, m1 ; a / b (b == 0 gives +inf for a > 0, NaN for a == 0)
+ mulps m0, m3 ; a / b * 255
+ minps m0, m3 ; clamp to 255; minps returns its second operand on NaN, so 0 / 0 also yields 255
+ cvttps2dq m0, m0 ; float -> int32, truncating toward zero
+
+ packssdw m0, m0 ; 00000x0x -- dwords -> words (signed saturation)
+ packuswb m0, m0 ; 000000xx -- words -> bytes (unsigned saturation)
+ movd [dstq + xq], m0 ; store the 4 result bytes
+ add xq, mmsize / 4 ; advance the index by mmsize / 4 (4 when mmsize is 16, matching the 4 bytes handled per movd)
+
+ jl .loop ; flags come from the add above: keep looping while xq is still negative
+BLEND_END
+
BLEND_INIT phoenix, 4
mova m3, [pb_255]
.nextrow:
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index 555e1e54dc..ed053ba272 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -39,6 +39,7 @@ BLEND_FUNC(difference128, sse2)
BLEND_FUNC(multiply, sse2)
BLEND_FUNC(screen, sse2)
BLEND_FUNC(hardmix, sse2)
+BLEND_FUNC(divide, sse2)
BLEND_FUNC(lighten, sse2)
BLEND_FUNC(or, sse2)
BLEND_FUNC(phoenix, sse2)
@@ -61,6 +62,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
case BLEND_AVERAGE: param->blend = ff_blend_average_sse2; break;
case BLEND_DARKEN: param->blend = ff_blend_darken_sse2; break;
case BLEND_DIFFERENCE128: param->blend = ff_blend_difference128_sse2; break;
+ case BLEND_DIVIDE: param->blend = ff_blend_divide_sse2; break;
case BLEND_HARDMIX: param->blend = ff_blend_hardmix_sse2; break;
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_sse2; break;
case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;