diff options
author | Paul B Mahol <onemda@gmail.com> | 2018-04-30 12:01:07 +0200 |
---|---|---|
committer | Paul B Mahol <onemda@gmail.com> | 2018-05-02 23:58:21 +0200 |
commit | 6d7c63588c81ba61b75701702b8680bd0063f36c (patch) | |
tree | 8afa4754f11330ea27ee3fa31071bae5176baeb6 /libavfilter/x86 | |
parent | a150b2e3a099fd539ecc6664050fd20617ce223c (diff) | |
download | ffmpeg-6d7c63588c81ba61b75701702b8680bd0063f36c.tar.gz |
avfilter/vf_overlay: add x86 SIMD
Specifically for the yuv444, yuv422 and yuv420 formats, when the main stream has no alpha
and the alpha format is straight.
Signed-off-by: Paul B Mahol <onemda@gmail.com>
Diffstat (limited to 'libavfilter/x86')
-rw-r--r-- | libavfilter/x86/Makefile | 2 | ||||
-rw-r--r-- | libavfilter/x86/vf_overlay.asm | 144 | ||||
-rw-r--r-- | libavfilter/x86/vf_overlay_init.c | 63 |
3 files changed, 209 insertions, 0 deletions
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index f60de3b73b..b484c8bd1c 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
 OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
+OBJS-$(CONFIG_OVERLAY_FILTER)                += x86/vf_overlay_init.o
 OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
 OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
@@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
 X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
 X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
 X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
+X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
 X86ASM-OBJS-$(CONFIG_PP7_FILTER)             += x86/vf_pp7.o
 X86ASM-OBJS-$(CONFIG_PSNR_FILTER)            += x86/vf_psnr.o
 X86ASM-OBJS-$(CONFIG_PULLUP_FILTER)          += x86/vf_pullup.o
diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
new file mode 100644
index 0000000000..14ec60ca34
--- /dev/null
+++ b/libavfilter/x86/vf_overlay.asm
@@ -0,0 +1,144 @@
+;*****************************************************************************
+;* x86-optimized functions for overlay filter
+;*
+;* Copyright (C) 2018 Paul B Mahol
+;* Copyright (C) 2018 Henrik Gramner
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1:   times 16 db 1
+pw_128: times  8 dw 128
+pw_255: times  8 dw 255
+pw_257: times  8 dw 257
+
+SECTION .text
+
+INIT_XMM sse4
+cglobal overlay_row_44, 5, 7, 6, 0, d, da, s, a, w, r, x
+    xor          xq, xq
+    movsxdifnidn wq, wd
+    mov          rq, wq
+    and          rq, mmsize/2 - 1
+    cmp          wq, mmsize/2
+    jl .end
+    sub          wq, rq
+    mova         m3, [pw_255]
+    mova         m4, [pw_128]
+    mova         m5, [pw_257]
+    .loop:
+        pmovzxbw    m0, [sq+xq]
+        pmovzxbw    m2, [aq+xq]
+        pmovzxbw    m1, [dq+xq]
+        pmullw      m0, m2
+        pxor        m2, m3
+        pmullw      m1, m2
+        paddw       m0, m4
+        paddw       m0, m1
+        pmulhuw     m0, m5
+        packuswb    m0, m0
+        movq   [dq+xq], m0
+        add         xq, mmsize/2
+        cmp         xq, wq
+        jl .loop
+
+    .end:
+    mov    eax, xd
+    RET
+
+INIT_XMM sse4
+cglobal overlay_row_22, 5, 7, 6, 0, d, da, s, a, w, r, x
+    xor          xq, xq
+    movsxdifnidn wq, wd
+    sub          wq, 1
+    mov          rq, wq
+    and          rq, mmsize/2 - 1
+    cmp          wq, mmsize/2
+    jl .end
+    sub          wq, rq
+    mova         m3, [pw_255]
+    mova         m4, [pw_128]
+    mova         m5, [pw_257]
+    .loop:
+        pmovzxbw    m0, [sq+xq]
+        movu        m1, [aq+2*xq]
+        pandn       m2, m3, m1
+        psllw       m1, 8
+        pavgw       m2, m1
+        pavgw       m2, m1
+        psrlw       m2, 8
+        pmovzxbw    m1, [dq+xq]
+        pmullw      m0, m2
+        pxor        m2, m3
+        pmullw      m1, m2
+        paddw       m0, m4
+        paddw       m0, m1
+        pmulhuw     m0, m5
+        packuswb    m0, m0
+        movq   [dq+xq], m0
+        add         xq, mmsize/2
+        cmp         xq, wq
+        jl .loop
+
+    .end:
+    mov    eax, xd
+    RET
+
+INIT_XMM sse4
+cglobal overlay_row_20, 6, 7, 7, 0, d, da, s, a, w, r, x
+    mov         daq, aq
+    add         daq, rmp
+    xor          xq, xq
+    movsxdifnidn wq, wd
+    sub          wq, 1
+    mov          rq, wq
+    and          rq, mmsize/2 - 1
+    cmp          wq, mmsize/2
+    jl .end
+    sub          wq, rq
+    mova         m3, [pw_255]
+    mova         m4, [pw_128]
+    mova         m5, [pw_257]
+    mova         m6, [pb_1]
+    .loop:
+        pmovzxbw    m0, [sq+xq]
+        movu        m2, [aq+2*xq]
+        movu        m1, [daq+2*xq]
+        pmaddubsw   m2, m6
+        pmaddubsw   m1, m6
+        paddw       m2, m1
+        psrlw       m2, 2
+        pmovzxbw    m1, [dq+xq]
+        pmullw      m0, m2
+        pxor        m2, m3
+        pmullw      m1, m2
+        paddw       m0, m4
+        paddw       m0, m1
+        pmulhuw     m0, m5
+        packuswb    m0, m0
+        movq   [dq+xq], m0
+        add         xq, mmsize/2
+        cmp         xq, wq
+        jl .loop
+
+    .end:
+    mov    eax, xd
+    RET
diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
new file mode 100644
index 0000000000..fec1629829
--- /dev/null
+++ b/libavfilter/x86/vf_overlay_init.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_overlay.h"
+
+int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+                           int w, ptrdiff_t alinesize);
+
+int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+                           int w, ptrdiff_t alinesize);
+
+int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+                           int w, ptrdiff_t alinesize);
+
+av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE4(cpu_flags) &&
+        (format == OVERLAY_FORMAT_YUV444 ||
+         format == OVERLAY_FORMAT_GBRP) &&
+        alpha_format == 0 && main_has_alpha == 0) {
+        s->blend_row[0] = ff_overlay_row_44_sse4;
+        s->blend_row[1] = ff_overlay_row_44_sse4;
+        s->blend_row[2] = ff_overlay_row_44_sse4;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags) &&
+        (format == OVERLAY_FORMAT_YUV420) &&
+        alpha_format == 0 && main_has_alpha == 0) {
+        s->blend_row[0] = ff_overlay_row_44_sse4;
+        s->blend_row[1] = ff_overlay_row_20_sse4;
+        s->blend_row[2] = ff_overlay_row_20_sse4;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags) &&
+        (format == OVERLAY_FORMAT_YUV422) &&
+        alpha_format == 0 && main_has_alpha == 0) {
+        s->blend_row[0] = ff_overlay_row_44_sse4;
+        s->blend_row[1] = ff_overlay_row_22_sse4;
+        s->blend_row[2] = ff_overlay_row_22_sse4;
+    }
+}