diff options
author | Martin Vignali <martin.vignali@gmail.com> | 2018-01-17 20:59:58 +0100 |
---|---|---|
committer | Martin Vignali <martin.vignali@gmail.com> | 2018-01-28 20:21:32 +0100 |
commit | 3a230ce5fa10b21312236b362df9eeddd99e7ac2 (patch) | |
tree | c1ed49dc5130afd5d7748518cc862fc4ba292157 | |
parent | 4d95c6d5d7d8d79b5acafcf526a1b7c1797a1060 (diff) | |
download | ffmpeg-3a230ce5fa10b21312236b362df9eeddd99e7ac2.tar.gz |
avfilter/x86/vf_blend : avfilter/x86/vf_blend : add AVX2 version for each func except divide
and optimize average, grainextract, multiply, screen, grain merge
-rw-r--r-- | libavfilter/x86/vf_blend.asm | 229 | ||||
-rw-r--r-- | libavfilter/x86/vf_blend_init.c | 39 |
2 files changed, 184 insertions, 84 deletions
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index 4916aaf251..680e266348 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -2,6 +2,8 @@ ;* x86-optimized functions for blend filter ;* ;* Copyright (C) 2015 Paul B Mahol +;* Copyright (C) 2018 Henrik Gramner +;* Copyright (C) 2018 Jokyo Images ;* ;* This file is part of FFmpeg. ;* @@ -74,39 +76,36 @@ BLEND_INIT %1, 2 BLEND_END %endmacro -INIT_XMM sse2 -BLEND_SIMPLE xor, xor -BLEND_SIMPLE or, or -BLEND_SIMPLE and, and -BLEND_SIMPLE addition, addusb -BLEND_SIMPLE subtract, subusb -BLEND_SIMPLE darken, minub -BLEND_SIMPLE lighten, maxub - -BLEND_INIT grainextract, 4 - pxor m2, m2 - mova m3, [pw_128] +%macro GRAINEXTRACT 0 +BLEND_INIT grainextract, 6 + pxor m4, m4 + VBROADCASTI128 m5, [pw_128] .nextrow: mov xq, widthq - .loop: - movh m0, [topq + xq] - movh m1, [bottomq + xq] - punpcklbw m0, m2 - punpcklbw m1, m2 - paddw m0, m3 - psubw m0, m1 - packuswb m0, m0 - movh [dstq + xq], m0 - add xq, mmsize / 2 + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + paddw m0, m5 + paddw m1, m5 + psubw m0, m2 + psubw m1, m3 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro %macro MULTIPLY 3 ; a, b, pw_1 pmullw %1, %2 ; xxxxxxxx a * b paddw %1, %3 - mova %2, %1 - psrlw %2, 8 + psrlw %2, %1, 8 paddw %1, %2 psrlw %1, 8 ; 00xx00xx a * b / 255 %endmacro @@ -118,92 +117,112 @@ BLEND_END pxor %1, %4 ; 00xx00xx 255 - x / 255 %endmacro -BLEND_INIT multiply, 4 - pxor m2, m2 - mova m3, [pw_1] +%macro BLEND_MULTIPLY 0 +BLEND_INIT multiply, 6 + pxor m4, m4 + VBROADCASTI128 m5, [pw_1] .nextrow: mov xq, widthq .loop: - ; word - ; |--| - movh m0, [topq + xq] ; 0000xxxx - movh m1, [bottomq + xq] - punpcklbw m0, m2 ; 00xx00xx - punpcklbw m1, m2 - - MULTIPLY m0, m1, m3 - - packuswb m0, m0 ; 0000xxxx - movh [dstq + xq], m0 - add xq, mmsize / 2 - + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + MULTIPLY m0, m2, m5 + MULTIPLY m1, m3, m5 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro -BLEND_INIT screen, 5 - pxor m2, m2 - mova m3, [pw_1] - mova m4, [pw_255] +%macro BLEND_SCREEN 0 +BLEND_INIT screen, 7 + pxor m4, m4 + + VBROADCASTI128 m5, [pw_1] + VBROADCASTI128 m6, [pw_255] .nextrow: mov xq, widthq .loop: - movh m0, [topq + xq] ; 0000xxxx - movh m1, [bottomq + xq] - punpcklbw m0, m2 ; 00xx00xx - punpcklbw m1, m2 - - SCREEN m0, m1, m3, m4 - - packuswb m0, m0 ; 0000xxxx - movh [dstq + xq], m0 - add xq, mmsize / 2 - + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + SCREEN m0, m2, m5, m6 + SCREEN m1, m3, m5, m6 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro +%macro AVERAGE 0 BLEND_INIT average, 3 - pxor m2, m2 + pcmpeqb m2, m2 + .nextrow: mov xq, widthq - .loop: - movh m0, [topq + xq] - movh m1, [bottomq + xq] - punpcklbw m0, m2 - punpcklbw m1, m2 - paddw m0, m1 - psrlw m0, 1 - packuswb m0, m0 - movh [dstq + xq], m0 - add xq, mmsize / 2 +.loop: + movu m0, [topq + xq] + movu m1, [bottomq + xq] + pxor m0, m2 + pxor m1, m2 + pavgb m0, m1 + pxor m0, m2 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro -BLEND_INIT grainmerge, 4 - pxor m2, m2 - mova m3, [pw_128] + +%macro GRAINMERGE 0 +BLEND_INIT grainmerge, 6 + pxor m4, m4 + + VBROADCASTI128 m5, [pw_128] .nextrow: mov xq, widthq .loop: - movh m0, [topq + xq] - movh m1, [bottomq + xq] - punpcklbw m0, m2 - punpcklbw m1, m2 - paddw m0, m1 - psubw m0, m3 - packuswb m0, m0 - movh [dstq + xq], m0 - add xq, mmsize / 2 + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + paddw m0, m2 + paddw m1, m3 + psubw m0, m5 + psubw m1, m5 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro +%macro HARDMIX 0 BLEND_INIT hardmix, 5 - mova m2, [pb_255] - mova m3, [pb_128] - mova m4, [pb_127] + VBROADCASTI128 m2, [pb_255] + VBROADCASTI128 m3, [pb_128] + VBROADCASTI128 m4, [pb_127] .nextrow: mov xq, widthq @@ -218,7 +237,9 @@ BLEND_INIT hardmix, 5 add xq, mmsize jl .loop BLEND_END +%endmacro +%macro DIVIDE 0 BLEND_INIT divide, 4 pxor m2, m2 mova m3, [ps_255] @@ -247,9 +268,11 @@ BLEND_INIT divide, 4 jl .loop BLEND_END +%endmacro +%macro PHOENIX 0 BLEND_INIT phoenix, 4 - mova m3, [pb_255] + VBROADCASTI128 m3, [pb_255] .nextrow: mov xq, widthq @@ -266,6 +289,7 @@ BLEND_INIT phoenix, 4 add xq, mmsize jl .loop BLEND_END +%endmacro %macro BLEND_ABS 0 BLEND_INIT difference, 5 @@ -291,7 +315,7 @@ BLEND_END BLEND_INIT extremity, 8 pxor m2, m2 - mova m4, [pw_255] + VBROADCASTI128 m4, [pw_255] .nextrow: mov xq, widthq @@ -315,7 +339,7 @@ BLEND_END BLEND_INIT negation, 8 pxor m2, m2 - mova m4, [pw_255] + VBROADCASTI128 m4, [pw_255] .nextrow: mov xq, widthq @@ -341,6 +365,43 @@ BLEND_END %endmacro INIT_XMM sse2 +BLEND_SIMPLE xor, xor +BLEND_SIMPLE or, or +BLEND_SIMPLE and, and +BLEND_SIMPLE addition, addusb +BLEND_SIMPLE subtract, subusb +BLEND_SIMPLE darken, minub +BLEND_SIMPLE lighten, maxub +GRAINEXTRACT +BLEND_MULTIPLY +BLEND_SCREEN +AVERAGE +GRAINMERGE +HARDMIX +PHOENIX +DIVIDE + BLEND_ABS + INIT_XMM ssse3 BLEND_ABS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +BLEND_SIMPLE xor, xor +BLEND_SIMPLE or, or +BLEND_SIMPLE and, and +BLEND_SIMPLE addition, addusb +BLEND_SIMPLE subtract, subusb +BLEND_SIMPLE darken, minub +BLEND_SIMPLE lighten, maxub +GRAINEXTRACT +BLEND_MULTIPLY +BLEND_SCREEN +AVERAGE +GRAINMERGE +HARDMIX +PHOENIX + +BLEND_ABS +%endif diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c index a4fc9af246..6e782e4edb 100644 --- a/libavfilter/x86/vf_blend_init.c +++ b/libavfilter/x86/vf_blend_init.c @@ -31,26 +31,43 @@ void ff_blend_##name##_##opt(const uint8_t *top, ptrdiff_t top_linesize, \ struct FilterParams *param, double *values, int starty); BLEND_FUNC(addition, sse2) +BLEND_FUNC(addition, avx2) BLEND_FUNC(grainmerge, sse2) +BLEND_FUNC(grainmerge, avx2) BLEND_FUNC(average, sse2) +BLEND_FUNC(average, avx2) BLEND_FUNC(and, sse2) +BLEND_FUNC(and, avx2) BLEND_FUNC(darken, sse2) +BLEND_FUNC(darken, avx2) BLEND_FUNC(grainextract, sse2) +BLEND_FUNC(grainextract, avx2) BLEND_FUNC(multiply, sse2) +BLEND_FUNC(multiply, avx2) BLEND_FUNC(screen, sse2) +BLEND_FUNC(screen, avx2) BLEND_FUNC(hardmix, sse2) +BLEND_FUNC(hardmix, avx2) BLEND_FUNC(divide, sse2) BLEND_FUNC(lighten, sse2) +BLEND_FUNC(lighten, avx2) BLEND_FUNC(or, sse2) +BLEND_FUNC(or, avx2) BLEND_FUNC(phoenix, sse2) +BLEND_FUNC(phoenix, avx2) BLEND_FUNC(subtract, sse2) +BLEND_FUNC(subtract, avx2) BLEND_FUNC(xor, sse2) +BLEND_FUNC(xor, avx2) BLEND_FUNC(difference, sse2) BLEND_FUNC(difference, ssse3) +BLEND_FUNC(difference, avx2) BLEND_FUNC(extremity, sse2) BLEND_FUNC(extremity, ssse3) +BLEND_FUNC(extremity, avx2) BLEND_FUNC(negation, sse2) BLEND_FUNC(negation, ssse3) +BLEND_FUNC(negation, avx2) av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) { @@ -85,4 +102,26 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break; } } + + if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1 && !is_16bit) { + switch (param->mode) { + case BLEND_ADDITION: param->blend = ff_blend_addition_avx2; break; + case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_avx2; break; + case BLEND_AND: param->blend = ff_blend_and_avx2; break; + case BLEND_AVERAGE: param->blend = ff_blend_average_avx2; break; + case BLEND_DARKEN: param->blend = ff_blend_darken_avx2; break; + case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_avx2; break; + case BLEND_HARDMIX: param->blend = ff_blend_hardmix_avx2; break; + case BLEND_LIGHTEN: param->blend = ff_blend_lighten_avx2; break; + case BLEND_MULTIPLY: param->blend = ff_blend_multiply_avx2; break; + case BLEND_OR: param->blend = ff_blend_or_avx2; break; + case BLEND_PHOENIX: param->blend = ff_blend_phoenix_avx2; break; + case BLEND_SCREEN: param->blend = ff_blend_screen_avx2; break; + case BLEND_SUBTRACT: param->blend = ff_blend_subtract_avx2; break; + case BLEND_XOR: param->blend = ff_blend_xor_avx2; break; + case BLEND_DIFFERENCE: param->blend = ff_blend_difference_avx2; break; + case BLEND_EXTREMITY: param->blend = ff_blend_extremity_avx2; break; + case BLEND_NEGATION: param->blend = ff_blend_negation_avx2; break; + } + } } |