diff options
author | Martin Vignali <martin.vignali@gmail.com> | 2018-03-17 19:37:06 +0100 |
---|---|---|
committer | Martin Vignali <martin.vignali@gmail.com> | 2018-04-05 21:46:16 +0200 |
commit | f3df42e81d367547756e7955e36c8af7c9c18db2 (patch) | |
tree | f0c23de031d57c28ad7fc87516a7d763ba020d03 | |
parent | 8eb0bb11083320cc12bcc23104a384984c4a9d64 (diff) | |
download | ffmpeg-f3df42e81d367547756e7955e36c8af7c9c18db2.tar.gz |
avfilter/x86/vf_blend: add SIMD for the 16-bit versions of:
- grainextract
- grainmerge
- average
- extremity
- negation
-rw-r--r-- | libavfilter/x86/vf_blend.asm | 168 | ||||
-rw-r--r-- | libavfilter/x86/vf_blend_init.c | 20 |
2 files changed, 128 insertions, 60 deletions
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index 9cd5ee7acb..251bbb5a12 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -27,6 +27,8 @@ SECTION_RODATA ps_255: times 4 dd 255.0 +pd_32768 : times 4 dd 32768 +pd_65535 : times 4 dd 65535 pw_1: times 8 dw 1 pw_128: times 8 dw 128 pw_255: times 8 dw 255 @@ -79,26 +81,33 @@ BLEND_INIT %1, 2, %3 BLEND_END %endmacro -%macro GRAINEXTRACT 0 -BLEND_INIT grainextract, 6 +; %1 name , %2 src (b or w), %3 inter (w or d), %4 (1 if 16bit, not set if 8 bit) +%macro GRAINEXTRACT 3-4 +BLEND_INIT %1, 6, %4 pxor m4, m4 +%if %0 == 4 ; 16 bit + VBROADCASTI128 m5, [pd_32768] +%else VBROADCASTI128 m5, [pw_128] +%endif .nextrow: mov xq, widthq .loop: movu m1, [topq + xq] movu m3, [bottomq + xq] - punpcklbw m0, m1, m4 - punpckhbw m1, m4 - punpcklbw m2, m3, m4 - punpckhbw m3, m4 - paddw m0, m5 - paddw m1, m5 - psubw m0, m2 - psubw m1, m3 + punpckl%2%3 m0, m1, m4 + punpckh%2%3 m1, m4 + punpckl%2%3 m2, m3, m4 + punpckh%2%3 m3, m4 + + padd%3 m0, m5 + padd%3 m1, m5 + psub%3 m0, m2 + psub%3 m1, m3 + + packus%3%2 m0, m1 - packuswb m0, m1 mova [dstq + xq], m0 add xq, mmsize jl .loop @@ -172,8 +181,9 @@ BLEND_INIT screen, 7 BLEND_END %endmacro -%macro AVERAGE 0 -BLEND_INIT average, 3 +;%1 name, %2 (b or w), %3 (set if 16 bit) +%macro AVERAGE 2-3 +BLEND_INIT %1, 3, %3 pcmpeqb m2, m2 .nextrow: @@ -184,7 +194,7 @@ BLEND_INIT average, 3 movu m1, [bottomq + xq] pxor m0, m2 pxor m1, m2 - pavgb m0, m1 + pavg%2 m0, m1 pxor m0, m2 mova [dstq + xq], m0 add xq, mmsize @@ -192,29 +202,34 @@ BLEND_INIT average, 3 BLEND_END %endmacro - -%macro GRAINMERGE 0 -BLEND_INIT grainmerge, 6 +; %1 name , %2 src (b or w), %3 inter (w or d), %4 (1 if 16bit, not set if 8 bit) +%macro GRAINMERGE 3-4 +BLEND_INIT %1, 6, %4 pxor m4, m4 - +%if %0 == 4 ; 16 bit + VBROADCASTI128 m5, [pd_32768] +%else VBROADCASTI128 m5, [pw_128] +%endif .nextrow: mov xq, widthq .loop: movu m1, [topq + xq] movu m3, [bottomq + xq] - punpcklbw 
m0, m1, m4 - punpckhbw m1, m4 - punpcklbw m2, m3, m4 - punpckhbw m3, m4 - paddw m0, m2 - paddw m1, m3 - psubw m0, m5 - psubw m1, m5 + punpckl%2%3 m0, m1, m4 + punpckh%2%3 m1, m4 + punpckl%2%3 m2, m3, m4 + punpckh%2%3 m3, m4 + + padd%3 m0, m2 + padd%3 m1, m3 + psub%3 m0, m5 + psub%3 m1, m5 + + packus%3%2 m0, m1 - packuswb m0, m1 mova [dstq + xq], m0 add xq, mmsize jl .loop @@ -324,52 +339,73 @@ BLEND_INIT %1, 5, %4 BLEND_END %endmacro -%macro BLEND_ABS 0 -BLEND_INIT extremity, 8 +; %1 name , %2 src (b or w), %3 inter (w or d), %4 (1 if 16bit, not set if 8 bit) +%macro EXTREMITY 3-4 +BLEND_INIT %1, 8, %4 pxor m2, m2 +%if %0 == 4; 16 bit + VBROADCASTI128 m4, [pd_65535] +%else VBROADCASTI128 m4, [pw_255] +%endif .nextrow: mov xq, widthq .loop: movu m0, [topq + xq] movu m1, [bottomq + xq] - punpckhbw m5, m0, m2 - punpcklbw m0, m2 - punpckhbw m6, m1, m2 - punpcklbw m1, m2 - psubw m3, m4, m0 - psubw m7, m4, m5 - psubw m3, m1 - psubw m7, m6 + punpckh%2%3 m5, m0, m2 + punpckl%2%3 m0, m2 + punpckh%2%3 m6, m1, m2 + punpckl%2%3 m1, m2 + psub%3 m3, m4, m0 + psub%3 m7, m4, m5 + psub%3 m3, m1 + psub%3 m7, m6 +%if %0 == 4; 16 bit + pabsd m3, m3 + pabsd m7, m7 +%else ABS2 m3, m7, m1, m6 - packuswb m3, m7 +%endif + packus%3%2 m3, m7 mova [dstq + xq], m3 add xq, mmsize jl .loop BLEND_END +%endmacro -BLEND_INIT negation, 8 +%macro NEGATION 3-4 +BLEND_INIT %1, 8, %4 pxor m2, m2 +%if %0 == 4; 16 bit + VBROADCASTI128 m4, [pd_65535] +%else VBROADCASTI128 m4, [pw_255] +%endif .nextrow: mov xq, widthq .loop: movu m0, [topq + xq] movu m1, [bottomq + xq] - punpckhbw m5, m0, m2 - punpcklbw m0, m2 - punpckhbw m6, m1, m2 - punpcklbw m1, m2 - psubw m3, m4, m0 - psubw m7, m4, m5 - psubw m3, m1 - psubw m7, m6 + punpckh%2%3 m5, m0, m2 + punpckl%2%3 m0, m2 + punpckh%2%3 m6, m1, m2 + punpckl%2%3 m1, m2 + psub%3 m3, m4, m0 + psub%3 m7, m4, m5 + psub%3 m3, m1 + psub%3 m7, m6 +%if %0 == 4; 16 bit + pabsd m3, m3 + pabsd m7, m7 +%else ABS2 m3, m7, m1, m6 - psubw m0, m4, m3 - psubw m1, m4, m7 - packuswb m0, 
m1 +%endif + psub%3 m0, m4, m3 + psub%3 m1, m4, m7 + packus%3%2 m0, m1 mova [dstq + xq], m0 add xq, mmsize jl .loop @@ -384,17 +420,17 @@ BLEND_SIMPLE addition, addusb BLEND_SIMPLE subtract, subusb BLEND_SIMPLE darken, minub BLEND_SIMPLE lighten, maxub -GRAINEXTRACT +GRAINEXTRACT grainextract, b, w BLEND_MULTIPLY BLEND_SCREEN -AVERAGE -GRAINMERGE +AVERAGE average, b +GRAINMERGE grainmerge, b, w HARDMIX PHOENIX phoenix, b DIFFERENCE difference, b, w DIVIDE - -BLEND_ABS +EXTREMITY extremity, b, w +NEGATION negation, b, w %if ARCH_X86_64 BLEND_SIMPLE addition_16, addusw, 1 @@ -402,18 +438,24 @@ BLEND_SIMPLE and_16, and, 1 BLEND_SIMPLE or_16, or, 1 BLEND_SIMPLE subtract_16, subusw, 1 BLEND_SIMPLE xor_16, xor, 1 +AVERAGE average_16, w, 1 %endif INIT_XMM ssse3 DIFFERENCE difference, b, w -BLEND_ABS +EXTREMITY extremity, b, w +NEGATION negation, b, w INIT_XMM sse4 %if ARCH_X86_64 BLEND_SIMPLE darken_16, minuw, 1 BLEND_SIMPLE lighten_16, maxuw, 1 +GRAINEXTRACT grainextract_16, w, d, 1 +GRAINMERGE grainmerge_16, w, d, 1 PHOENIX phoenix_16, w, 1 DIFFERENCE difference_16, w, d, 1 +EXTREMITY extremity_16, w, d, 1 +NEGATION negation_16, w, d, 1 %endif %if HAVE_AVX2_EXTERNAL @@ -425,16 +467,17 @@ BLEND_SIMPLE addition, addusb BLEND_SIMPLE subtract, subusb BLEND_SIMPLE darken, minub BLEND_SIMPLE lighten, maxub -GRAINEXTRACT +GRAINEXTRACT grainextract, b, w BLEND_MULTIPLY BLEND_SCREEN -AVERAGE -GRAINMERGE +AVERAGE average, b +GRAINMERGE grainmerge, b, w HARDMIX PHOENIX phoenix, b DIFFERENCE difference, b, w -BLEND_ABS +EXTREMITY extremity, b, w +NEGATION negation, b, w %if ARCH_X86_64 BLEND_SIMPLE addition_16, addusw, 1 @@ -444,7 +487,12 @@ BLEND_SIMPLE lighten_16, maxuw, 1 BLEND_SIMPLE or_16, or, 1 BLEND_SIMPLE subtract_16, subusw, 1 BLEND_SIMPLE xor_16, xor, 1 +GRAINEXTRACT grainextract_16, w, d, 1 +AVERAGE average_16, w, 1 +GRAINMERGE grainmerge_16, w, d, 1 PHOENIX phoenix_16, w, 1 DIFFERENCE difference_16, w, d, 1 +EXTREMITY extremity_16, w, d, 1 +NEGATION negation_16, w, d, 1 
%endif %endif diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c index 0962f6d7fd..acf28559ff 100644 --- a/libavfilter/x86/vf_blend_init.c +++ b/libavfilter/x86/vf_blend_init.c @@ -72,12 +72,22 @@ BLEND_FUNC(negation, avx2) #if ARCH_X86_64 BLEND_FUNC(addition_16, sse2) BLEND_FUNC(addition_16, avx2) +BLEND_FUNC(grainmerge_16, sse4) +BLEND_FUNC(grainmerge_16, avx2) +BLEND_FUNC(average_16, sse2) +BLEND_FUNC(average_16, avx2) BLEND_FUNC(and_16, sse2) BLEND_FUNC(and_16, avx2) BLEND_FUNC(darken_16, sse4) BLEND_FUNC(darken_16, avx2) +BLEND_FUNC(grainextract_16, sse4) +BLEND_FUNC(grainextract_16, avx2) BLEND_FUNC(difference_16, sse4) BLEND_FUNC(difference_16, avx2) +BLEND_FUNC(extremity_16, sse4) +BLEND_FUNC(extremity_16, avx2) +BLEND_FUNC(negation_16, sse4) +BLEND_FUNC(negation_16, avx2) BLEND_FUNC(lighten_16, sse4) BLEND_FUNC(lighten_16, avx2) BLEND_FUNC(or_16, sse2) @@ -152,6 +162,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) switch (param->mode) { case BLEND_ADDITION: param->blend = ff_blend_addition_16_sse2; break; case BLEND_AND: param->blend = ff_blend_and_16_sse2; break; + case BLEND_AVERAGE: param->blend = ff_blend_average_16_sse2; break; case BLEND_OR: param->blend = ff_blend_or_16_sse2; break; case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_sse2; break; case BLEND_XOR: param->blend = ff_blend_xor_16_sse2; break; @@ -159,8 +170,12 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) } if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1) { switch (param->mode) { + case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_16_sse4; break; case BLEND_DARKEN: param->blend = ff_blend_darken_16_sse4; break; + case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_16_sse4; break; case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_sse4; break; + case BLEND_EXTREMITY: param->blend = ff_blend_extremity_16_sse4; break; + case BLEND_NEGATION: param->blend = 
ff_blend_negation_16_sse4; break; case BLEND_LIGHTEN: param->blend = ff_blend_lighten_16_sse4; break; case BLEND_PHOENIX: param->blend = ff_blend_phoenix_16_sse4; break; } @@ -168,9 +183,14 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1) { switch (param->mode) { case BLEND_ADDITION: param->blend = ff_blend_addition_16_avx2; break; + case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_16_avx2; break; case BLEND_AND: param->blend = ff_blend_and_16_avx2; break; + case BLEND_AVERAGE: param->blend = ff_blend_average_16_avx2; break; case BLEND_DARKEN: param->blend = ff_blend_darken_16_avx2; break; + case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_16_avx2; break; case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_avx2; break; + case BLEND_EXTREMITY: param->blend = ff_blend_extremity_16_avx2; break; + case BLEND_NEGATION: param->blend = ff_blend_negation_16_avx2; break; case BLEND_LIGHTEN: param->blend = ff_blend_lighten_16_avx2; break; case BLEND_OR: param->blend = ff_blend_or_16_avx2; break; case BLEND_PHOENIX: param->blend = ff_blend_phoenix_16_avx2; break; |