diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2016-04-06 14:09:08 -0400 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2016-04-12 16:42:48 -0400 |
commit | 5ce703a6bff77af1be9f2eb2698879c591b403c4 (patch) | |
tree | 60b8e725f53cdb999590c62e4b9d8149c4789326 /libavfilter | |
parent | 2e2e08a35b479c5a2049a0f7eaf20e00aa78e923 (diff) | |
download | ffmpeg-5ce703a6bff77af1be9f2eb2698879c591b403c4.tar.gz |
vf_colorspace: x86-64 SIMD (SSE2) optimizations.
Diffstat (limited to 'libavfilter')
-rw-r--r-- | libavfilter/colorspacedsp.c | 3 | ||||
-rw-r--r-- | libavfilter/colorspacedsp.h | 3 | ||||
-rw-r--r-- | libavfilter/x86/Makefile | 2 | ||||
-rw-r--r-- | libavfilter/x86/colorspacedsp.asm | 1097 | ||||
-rw-r--r-- | libavfilter/x86/colorspacedsp_init.c | 119 |
5 files changed, 1224 insertions, 0 deletions
diff --git a/libavfilter/colorspacedsp.c b/libavfilter/colorspacedsp.c index 51a7c1ddb7..d4c43c380f 100644 --- a/libavfilter/colorspacedsp.c +++ b/libavfilter/colorspacedsp.c @@ -128,4 +128,7 @@ void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp) init_yuv2yuv_fns(2, 12); dsp->multiply3x3 = multiply3x3_c; + + if (ARCH_X86) + ff_colorspacedsp_x86_init(dsp); } diff --git a/libavfilter/colorspacedsp.h b/libavfilter/colorspacedsp.h index 357111752b..4e70c6c938 100644 --- a/libavfilter/colorspacedsp.h +++ b/libavfilter/colorspacedsp.h @@ -48,4 +48,7 @@ typedef struct ColorSpaceDSPContext { void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp); +/* internal */ +void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp); + #endif /* AVFILTER_COLORSPACEDSP_H */ diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index ed294e0f92..4486b79c8a 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,5 +1,6 @@ OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o +OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o @@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o YASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o YASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o +YASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o diff --git a/libavfilter/x86/colorspacedsp.asm b/libavfilter/x86/colorspacedsp.asm new file mode 100644 index 0000000000..67d851abf4 --- /dev/null +++ b/libavfilter/x86/colorspacedsp.asm @@ -0,0 +1,1097 @@ +;***************************************************************************** +;* x86-optimized functions for colorspace filter +;* +;* Copyright (C) 
2016 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_64: times 8 dw 64 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_512: times 8 dw 512 +pw_1023: times 8 dw 1023 +pw_1024: times 8 dw 1024 +pw_2048: times 8 dw 2048 +pw_4095: times 8 dw 4095 +pw_8192: times 8 dw 8192 +pw_16384: times 8 dw 16384 + +pd_1: times 4 dd 1 +pd_2: times 4 dd 2 +pd_128: times 4 dd 128 +pd_512: times 4 dd 512 +pd_2048: times 4 dd 2048 +pd_8192: times 4 dd 8192 +pd_32768: times 4 dd 32768 +pd_131072: times 4 dd 131072 + +SECTION .text + +; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3], +; uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3], +; int w, int h, const int16_t yuv2yuv_coeffs[3][3][8], +; const int16_t yuv_offset[2][8]) + +%if ARCH_X86_64 +%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert) + +%assign %%sh (14 + %1 - %2) +%assign %%rnd (1 << (%%sh - 1)) +%assign %%uvinoff (128 << (%1 - 8)) +%assign %%uvoutoff (128 << 
(%2 - 8)) +%if %3 == 0 +%assign %%ss 444 +%elif %4 == 0 +%assign %%ss 422 +%else ; %4 == 1 +%assign %%ss 420 +%endif ; %3/%4 +%if %2 != 8 +%assign %%maxval (1 << %2) - 1 +%endif ; %2 != 8 + +%assign %%ypsh %%sh - 1 +%if %%ypsh > 14 +%assign %%yoffsh %%ypsh - 13 +%assign %%ypsh 14 +%else +%assign %%yoffsh 1 +%endif +%assign %%yprnd (1 << (%%yoffsh - 1)) +%assign %%ypmul (1 << %%ypsh) + +cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \ + yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo +%if %3 == 1 + inc wd + sar wd, 1 +%if %4 == 1 + inc hd + sar hd, 1 +%endif ; %4 == 1 +%endif ; %3 == 1 + mov [rsp+3*mmsize+0], wd + mov [rsp+3*mmsize+4], hd + + mova m10, [cq] + pxor m11, m11 + mova m12, [pd_ %+ %%uvoutoff] + pslld m12, %%sh + paddd m12, [pd_ %+ %%rnd] + mova m13, [pw_ %+ %%uvinoff] + mova m14, [yoffq+ 0] ; y_off_in + mova m15, [yoffq+16] ; y_off_out +%if %%yoffsh != 0 + psllw m15, %%yoffsh +%endif + paddw m15, [pw_ %+ %%yprnd] + punpcklwd m10, m15 + mova m15, [pw_ %+ %%ypmul] + movh m0, [cq+1*16] ; cyu + movh m1, [cq+2*16] ; cyv + movh m2, [cq+4*16] ; cuu + movh m3, [cq+5*16] ; cuv + movh m4, [cq+7*16] ; cvu + movh m5, [cq+8*16] ; cvv + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + mova [rsp+0*mmsize], m0 + mova [rsp+1*mmsize], m2 + mova [rsp+2*mmsize], m4 + + DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp + + mov uiq, [yiq+gprsize*1] + mov viq, [yiq+gprsize*2] + mov yiq, [yiq+gprsize*0] + mov uoq, [yoq+gprsize*1] + mov voq, [yoq+gprsize*2] + mov yoq, [yoq+gprsize*0] + mov uisq, [yisq+gprsize*1] + mov visq, [yisq+gprsize*2] + mov yisq, [yisq+gprsize*0] + mov uosq, [yosq+gprsize*1] + mov vosq, [yosq+gprsize*2] + mov yosq, [yosq+gprsize*0] + +.loop_v: + xor xq, xq + +.loop_h: +%if %4 == 1 + lea tmpq, [yiq+yisq] +%endif ; %4 == 1 +%if %1 == 8 + movu m0, [yiq+xq*(1<<%3)] ; y00/01 +%if %4 == 1 + movu m2, [tmpq+xq*2] ; y10/11 +%endif ; %4 == 1 +%if %3 == 1 + movh m4, [uiq+xq] ; u + movh m5, [viq+xq] ; v +%else ; %3 != 
1 + movu m4, [uiq+xq] ; u + movu m5, [viq+xq] ; v +%endif ; %3 ==/!= 1 + punpckhbw m1, m0, m11 + punpcklbw m0, m11 +%if %4 == 1 + punpckhbw m3, m2, m11 + punpcklbw m2, m11 +%endif ; %4 == 1 +%if %3 == 0 + punpckhbw m2, m4, m11 + punpckhbw m3, m5, m11 +%endif ; %3 == 0 + punpcklbw m4, m11 + punpcklbw m5, m11 +%else ; %1 != 8 + movu m0, [yiq+xq*(2<<%3)] ; y00/01 + movu m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01 +%if %4 == 1 + movu m2, [tmpq+xq*4] ; y10/11 + movu m3, [tmpq+xq*4+mmsize] ; y10/11 +%endif ; %4 == 1 + movu m4, [uiq+xq*2] ; u + movu m5, [viq+xq*2] ; v +%if %3 == 0 + movu m2, [uiq+xq*2+mmsize] + movu m3, [viq+xq*2+mmsize] +%endif ; %3 == 0 +%endif ; %1 ==/!= 8 + psubw m0, m14 + psubw m1, m14 +%if %4 == 1 + psubw m2, m14 + psubw m3, m14 +%endif ; %4 == 1 + psubw m4, m13 + psubw m5, m13 +%if %3 == 0 + psubw m2, m13 + psubw m3, m13 +%endif ; %3 == 0 + + SBUTTERFLY wd, 4, 5, 6 + pmaddwd m6, m4, [rsp+1*mmsize] + pmaddwd m7, m5, [rsp+1*mmsize] +%if %3 == 0 + SBUTTERFLY wd, 2, 3, 8 + pmaddwd m8, m2, [rsp+1*mmsize] + pmaddwd m9, m3, [rsp+1*mmsize] +%else ; %3 != 0 + pmaddwd m8, m4, [rsp+2*mmsize] + pmaddwd m9, m5, [rsp+2*mmsize] +%endif + paddd m6, m12 + paddd m7, m12 + paddd m8, m12 + paddd m9, m12 + psrad m6, %%sh + psrad m7, %%sh + psrad m8, %%sh + psrad m9, %%sh + packssdw m6, m7 + packssdw m8, m9 +%if %2 == 8 + packuswb m6, m8 +%if %3 == 0 + movu [uoq+xq], m6 +%else ; %3 != 0 + movh [uoq+xq], m6 + movhps [voq+xq], m6 +%endif ; %3 ==/!= 0 +%else ; %2 != 8 + CLIPW m6, m11, [pw_ %+ %%maxval] + CLIPW m8, m11, [pw_ %+ %%maxval] + movu [uoq+xq*2], m6 +%if %3 == 0 + movu [uoq+xq*2+mmsize], m8 +%else ; %3 != 0 + movu [voq+xq*2], m8 +%endif ; %3 ==/!= 0 +%endif ; %2 ==/!= 8 + +%if %3 == 0 + pmaddwd m6, m4, [rsp+2*mmsize] + pmaddwd m7, m5, [rsp+2*mmsize] + pmaddwd m8, m2, [rsp+2*mmsize] + pmaddwd m9, m3, [rsp+2*mmsize] + paddd m6, m12 + paddd m7, m12 + paddd m8, m12 + paddd m9, m12 + psrad m6, %%sh + psrad m7, %%sh + psrad m8, %%sh + psrad m9, %%sh + packssdw m6, m7 + 
packssdw m8, m9 +%if %2 == 8 + packuswb m6, m8 + movu [voq+xq], m6 +%else ; %2 != 8 + CLIPW m6, m11, [pw_ %+ %%maxval] + CLIPW m8, m11, [pw_ %+ %%maxval] + movu [voq+xq*2], m6 + movu [voq+xq*2+mmsize], m8 +%endif ; %2 ==/!= 8 +%endif ; %3 == 0 + + pmaddwd m4, [rsp+0*mmsize] + pmaddwd m5, [rsp+0*mmsize] ; uv_val +%if %3 == 0 + pmaddwd m2, [rsp+0*mmsize] + pmaddwd m3, [rsp+0*mmsize] +%endif ; %3 == 0 + + ; unpack y pixels with m15 (shifted round + offset), then multiply + ; by m10, add uv pixels, and we're done! +%if %3 == 1 + punpckhdq m8, m4, m4 + punpckldq m4, m4 + punpckhdq m9, m5, m5 + punpckldq m5, m5 +%else ; %3 != 1 + SWAP 8, 5, 2 + SWAP 3, 9 +%endif ; %3 ==/!= 1 +%if %4 == 1 + punpckhwd m6, m2, m15 + punpcklwd m2, m15 + punpckhwd m7, m3, m15 + punpcklwd m3, m15 + pmaddwd m2, m10 + pmaddwd m6, m10 + pmaddwd m3, m10 + pmaddwd m7, m10 + paddd m2, m4 + paddd m6, m8 + paddd m3, m5 + paddd m7, m9 + psrad m2, %%sh + psrad m6, %%sh + psrad m3, %%sh + psrad m7, %%sh + packssdw m2, m6 + packssdw m3, m7 + + lea tmpq, [yoq+yosq] +%if %2 == 8 + packuswb m2, m3 + movu [tmpq+xq*2], m2 +%else ; %2 != 8 + CLIPW m2, m11, [pw_ %+ %%maxval] + CLIPW m3, m11, [pw_ %+ %%maxval] + movu [tmpq+xq*4], m2 + movu [tmpq+xq*4+mmsize], m3 +%endif ; %2 ==/!= 8 +%endif ; %4 == 1 + + punpckhwd m6, m0, m15 + punpcklwd m0, m15 + punpckhwd m7, m1, m15 + punpcklwd m1, m15 + pmaddwd m0, m10 + pmaddwd m6, m10 + pmaddwd m1, m10 + pmaddwd m7, m10 + paddd m0, m4 + paddd m6, m8 + paddd m1, m5 + paddd m7, m9 + psrad m0, %%sh + psrad m6, %%sh + psrad m1, %%sh + psrad m7, %%sh + packssdw m0, m6 + packssdw m1, m7 + +%if %2 == 8 + packuswb m0, m1 + movu [yoq+xq*(1<<%3)], m0 +%else ; %2 != 8 + CLIPW m0, m11, [pw_ %+ %%maxval] + CLIPW m1, m11, [pw_ %+ %%maxval] + movu [yoq+xq*(2<<%3)], m0 + movu [yoq+xq*(2<<%3)+mmsize], m1 +%endif ; %2 ==/!= 8 + + add xq, mmsize >> %3 + cmp xd, dword [rsp+3*mmsize+0] + jl .loop_h + +%if %4 == 1 + lea yiq, [yiq+yisq*2] + lea yoq, [yoq+yosq*2] +%else ; %4 != 1 + add yiq, yisq + 
add yoq, yosq +%endif ; %4 ==/!= 1 + add uiq, uisq + add viq, visq + add uoq, uosq + add voq, vosq + dec dword [rsp+3*mmsize+4] + jg .loop_v + + RET +%endmacro + +%macro YUV2YUV_FNS 2 ; ss_w, ss_h +YUV2YUV_FN 8, 8, %1, %2 +YUV2YUV_FN 10, 8, %1, %2 +YUV2YUV_FN 12, 8, %1, %2 +YUV2YUV_FN 8, 10, %1, %2 +YUV2YUV_FN 10, 10, %1, %2 +YUV2YUV_FN 12, 10, %1, %2 +YUV2YUV_FN 8, 12, %1, %2 +YUV2YUV_FN 10, 12, %1, %2 +YUV2YUV_FN 12, 12, %1, %2 +%endmacro + +INIT_XMM sse2 +YUV2YUV_FNS 0, 0 +YUV2YUV_FNS 1, 0 +YUV2YUV_FNS 1, 1 + +; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride, +; uint8_t *yuv[3], ptrdiff_t yuv_stride[3], +; int w, int h, const int16_t yuv2rgb_coeffs[3][3][8], +; const int16_t yuv_offset[8]) +%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert) +%assign %%sh (%1 - 1) +%assign %%rnd (1 << (%%sh - 1)) +%assign %%uvoff (1 << (%1 - 1)) +%if %2 == 0 +%assign %%ss 444 +%elif %3 == 0 +%assign %%ss 422 +%else ; %3 == 1 +%assign %%ss 420 +%endif ; %2/%3 + +cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \ + rgb, rgbs, yuv, yuvs, ww, h, c, yoff +%if %2 == 1 + inc wwd + sar wwd, 1 +%endif ; %2 == 1 +%if %3 == 1 + inc hd + sar hd, 1 +%endif ; %3 == 1 + pxor m11, m11 + mova m15, [yoffq] ; yoff + movh m14, [cq+ 0] ; cy + movh m10, [cq+ 32] ; crv + movh m13, [cq+112] ; cbu + movh m12, [cq+ 64] ; cgu + movh m9, [cq+ 80] ; cgv + punpcklwd m14, [pw_ %+ %%rnd] ; cy, rnd + punpcklwd m13, m11 ; cbu, 0 + punpcklwd m11, m10 ; 0, crv + punpcklwd m12, m9 ; cgu, cgv + mova [rsp+0*mmsize], m11 + mova [rsp+1*mmsize], m12 + mova [rsp+2*mmsize], m13 + mova [rsp+3*mmsize], m14 + pxor m14, m14 + + DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp + + mov gq, [rq+1*gprsize] + mov bq, [rq+2*gprsize] + mov rq, [rq+0*gprsize] + mov uq, [yq+1*gprsize] + mov vq, [yq+2*gprsize] + mov yq, [yq+0*gprsize] + mov usq, [ysq+1*gprsize] + mov vsq, [ysq+2*gprsize] + mov ysq, [ysq+0*gprsize] + +.loop_v: + xor xq, xq + +.loop_h: +%if %3 == 1 + lea 
tmpq, [yq+ysq] +%endif ; %3 == 1 +%if %1 == 8 + movu m0, [yq+xq*(1<<%2)] +%if %3 == 1 + movu m2, [tmpq+xq*2] +%endif ; %3 == 1 +%if %2 == 1 + movh m4, [uq+xq] + movh m5, [vq+xq] +%else ; %2 != 1 + movu m4, [uq+xq] + movu m5, [vq+xq] +%endif ; %2 ==/!= 1 + punpckhbw m1, m0, m14 + punpcklbw m0, m14 +%if %3 == 1 + punpckhbw m3, m2, m14 + punpcklbw m2, m14 +%endif ; %3 == 1 +%if %2 == 0 + punpckhbw m2, m4, m14 + punpckhbw m3, m5, m14 +%endif ; %2 == 0 + punpcklbw m4, m14 + punpcklbw m5, m14 +%else ; %1 != 8 + movu m0, [yq+xq*(2<<%2)] + movu m1, [yq+xq*(2<<%2)+mmsize] +%if %3 == 1 + movu m2, [tmpq+xq*4] + movu m3, [tmpq+xq*4+mmsize] +%endif ; %3 == 1 + movu m4, [uq+xq*2] + movu m5, [vq+xq*2] +%if %2 == 0 + movu m2, [uq+xq*2+mmsize] + movu m3, [vq+xq*2+mmsize] +%endif ; %2 == 0 +%endif ; %1 ==/!= 8 + psubw m0, m15 + psubw m1, m15 +%if %3 == 1 + psubw m2, m15 + psubw m3, m15 +%endif ; %3 == 1 + psubw m4, [pw_ %+ %%uvoff] + psubw m5, [pw_ %+ %%uvoff] + SBUTTERFLY wd, 4, 5, 6 +%if %2 == 0 + psubw m2, [pw_ %+ %%uvoff] + psubw m3, [pw_ %+ %%uvoff] + SBUTTERFLY wd, 2, 3, 6 +%endif ; %2 == 0 + + ; calculate y+rnd full-resolution [0-3,6-9] + punpckhwd m6, m0, [pw_1] ; y, 1 + punpcklwd m0, [pw_1] ; y, 1 + punpckhwd m7, m1, [pw_1] ; y, 1 + punpcklwd m1, [pw_1] ; y, 1 + pmaddwd m0, [rsp+3*mmsize] + pmaddwd m6, [rsp+3*mmsize] + pmaddwd m1, [rsp+3*mmsize] + pmaddwd m7, [rsp+3*mmsize] +%if %3 == 1 + punpckhwd m8, m2, [pw_1] ; y, 1 + punpcklwd m2, [pw_1] ; y, 1 + punpckhwd m9, m3, [pw_1] ; y, 1 + punpcklwd m3, [pw_1] ; y, 1 + pmaddwd m2, [rsp+3*mmsize] + pmaddwd m8, [rsp+3*mmsize] + pmaddwd m3, [rsp+3*mmsize] + pmaddwd m9, [rsp+3*mmsize] + mova [rsp+4*mmsize], m2 + mova [rsp+5*mmsize], m8 + mova [rsp+6*mmsize], m3 + mova [rsp+7*mmsize], m9 +%endif ; %3 == 1 + + ; calculate r offsets (un-subsampled, then duplicate) + pmaddwd m10, m4, [rsp+0*mmsize] +%if %2 == 1 + pmaddwd m12, m5, [rsp+0*mmsize] + punpckhdq m11, m10, m10 + punpckldq m10, m10 + punpckhdq m13, m12, m12 + punpckldq m12, m12 
+%else ; %2 != 1 + pmaddwd m11, m5, [rsp+0*mmsize] + pmaddwd m12, m2, [rsp+0*mmsize] + pmaddwd m13, m3, [rsp+0*mmsize] +%endif ; %2 ==/!= 1 +%if %3 == 1 + paddd m2, m10, [rsp+4*mmsize] + paddd m3, m11, [rsp+5*mmsize] + paddd m8, m12, [rsp+6*mmsize] + paddd m9, m13, [rsp+7*mmsize] +%endif + paddd m10, m0 + paddd m11, m6 + paddd m12, m1 + paddd m13, m7 +%if %3 == 1 + psrad m2, %%sh + psrad m3, %%sh + psrad m8, %%sh + psrad m9, %%sh +%endif ; %3 == 1 + psrad m10, %%sh + psrad m11, %%sh + psrad m12, %%sh + psrad m13, %%sh +%if %3 == 1 + lea tmpq, [rq+rgbsq*2] + packssdw m2, m3 + packssdw m8, m9 + mova [tmpq+xq*4], m2 + mova [tmpq+xq*4+mmsize], m8 +%endif ; %3 == 1 + packssdw m10, m11 + packssdw m12, m13 + mova [rq+xq*(2 << %2)], m10 + mova [rq+xq*(2 << %2)+mmsize], m12 + + ; calculate g offsets (un-subsampled, then duplicate) + pmaddwd m10, m4, [rsp+1*mmsize] +%if %2 == 1 + pmaddwd m12, m5, [rsp+1*mmsize] + punpckhdq m11, m10, m10 + punpckldq m10, m10 + punpckhdq m13, m12, m12 + punpckldq m12, m12 +%else ; %2 != 1 + pmaddwd m11, m5, [rsp+1*mmsize] + pmaddwd m12, m2, [rsp+1*mmsize] + pmaddwd m13, m3, [rsp+1*mmsize] +%endif ; %2 ==/!= 1 +%if %3 == 1 + paddd m2, m10, [rsp+4*mmsize] + paddd m3, m11, [rsp+5*mmsize] + paddd m8, m12, [rsp+6*mmsize] + paddd m9, m13, [rsp+7*mmsize] +%endif ; %3 == 1 + paddd m10, m0 + paddd m11, m6 + paddd m12, m1 + paddd m13, m7 +%if %3 == 1 + psrad m2, %%sh + psrad m3, %%sh + psrad m8, %%sh + psrad m9, %%sh +%endif ; %3 == 1 + psrad m10, %%sh + psrad m11, %%sh + psrad m12, %%sh + psrad m13, %%sh +%if %3 == 1 + lea tmpq, [gq+rgbsq*2] + packssdw m2, m3 + packssdw m8, m9 + mova [tmpq+xq*4], m2 + mova [tmpq+xq*4+mmsize], m8 +%endif ; %3 == 1 + packssdw m10, m11 + packssdw m12, m13 + mova [gq+xq*(2 << %2)], m10 + mova [gq+xq*(2 << %2)+mmsize], m12 + + ; calculate b offsets (un-subsampled, then duplicate) + pmaddwd m4, [rsp+2*mmsize] + pmaddwd m5, [rsp+2*mmsize] +%if %2 == 1 + punpckhdq m2, m4, m4 + punpckldq m4, m4 + punpckhdq m3, m5, m5 + 
punpckldq m5, m5 +%else ; %2 != 1 + pmaddwd m2, [rsp+2*mmsize] + pmaddwd m3, [rsp+2*mmsize] + SWAP 2, 5 +%endif ; %2 ==/!= 1 + paddd m0, m4 + paddd m6, m2 + paddd m1, m5 + paddd m7, m3 +%if %3 == 1 + paddd m4, [rsp+4*mmsize] + paddd m2, [rsp+5*mmsize] + paddd m5, [rsp+6*mmsize] + paddd m3, [rsp+7*mmsize] +%endif ; %3 == 1 + psrad m0, %%sh + psrad m6, %%sh + psrad m1, %%sh + psrad m7, %%sh +%if %3 == 1 + psrad m4, %%sh + psrad m2, %%sh + psrad m5, %%sh + psrad m3, %%sh +%endif ; %3 == 1 + packssdw m0, m6 + packssdw m1, m7 + movu [bq+xq*(2 << %2)], m0 + movu [bq+xq*(2 << %2)+mmsize], m1 +%if %3 == 1 + lea tmpq, [bq+rgbsq*2] + packssdw m4, m2 + packssdw m5, m3 + movu [tmpq+xq*4], m4 + movu [tmpq+xq*4+mmsize], m5 +%endif ; %3 == 1 + + add xd, mmsize >> %2 + cmp xd, wwd + jl .loop_h + + lea rq, [rq+rgbsq*(2 << %3)] + lea gq, [gq+rgbsq*(2 << %3)] + lea bq, [bq+rgbsq*(2 << %3)] +%if %3 == 1 + lea yq, [yq+ysq*2] +%else ; %3 != 1 + add yq, ysq +%endif ; %3 ==/!= 1 + add uq, usq + add vq, vsq + dec hd + jg .loop_v + + RET +%endmacro + +%macro YUV2RGB_FNS 2 +YUV2RGB_FN 8, %1, %2 +YUV2RGB_FN 10, %1, %2 +YUV2RGB_FN 12, %1, %2 +%endmacro + +INIT_XMM sse2 +YUV2RGB_FNS 0, 0 +YUV2RGB_FNS 1, 0 +YUV2RGB_FNS 1, 1 + +%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert) +%assign %%sh 29 - %1 +%assign %%rnd (1 << (%%sh - 15)) +%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14)) +%if %1 != 8 +%assign %%maxval ((1 << %1) - 1) +%endif ; %1 != 8 +%if %2 == 0 +%assign %%ss 444 +%elif %3 == 0 +%assign %%ss 422 +%else ; %3 == 1 +%assign %%ss 420 +%endif ; %2/%3 + +cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \ + yuv, yuvs, rgb, rgbs, ww, h, c, off +%if %2 == 1 + inc wwd + sar wwd, 1 +%endif ; %2 == 1 +%if %3 == 1 + inc hd + sar hd, 1 +%endif ; %3 == 1 + + ; prepare coeffs + movh m8, [offq] + movh m9, [pw_ %+ %%uvrnd] + psllw m8, %%sh - 14 + paddw m9, [pw_ %+ %%rnd] + paddw m8, [pw_ %+ %%rnd] + movh m0, [cq+ 0] + movh m1, [cq+ 16] + movh m2, [cq+ 32] + movh m3, 
[cq+ 48] + movh m4, [cq+ 64] + movh m5, [cq+ 80] + movh m6, [cq+112] + movh m7, [cq+128] + punpcklwd m0, m1 + punpcklwd m2, m8 + punpcklwd m3, m4 + punpcklwd m4, m5, m9 + punpcklwd m5, m6 + punpcklwd m7, m9 + + mova [rsp+0*mmsize], m0 ; cry, cgy + mova [rsp+1*mmsize], m2 ; cby, off + rnd + mova [rsp+2*mmsize], m3 ; cru, cgu + mova [rsp+3*mmsize], m4 ; cburv, uvoff + rnd + mova [rsp+4*mmsize], m5 ; cburv, cgv + mova [rsp+5*mmsize], m7 ; cbv, uvoff + rnd + + + DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x + mov gq, [rq+gprsize*1] + mov bq, [rq+gprsize*2] + mov rq, [rq+gprsize*0] + mov uq, [yq+gprsize*1] + mov vq, [yq+gprsize*2] + mov yq, [yq+gprsize*0] + mov usq, [ysq+gprsize*1] + mov vsq, [ysq+gprsize*2] + mov ysq, [ysq+gprsize*0] + + pxor m15, m15 +.loop_v: + xor xd, xd + +.loop_h: + ; top line y + mova m0, [rq+xq*(2<<%2)] + mova m3, [rq+xq*(2<<%2)+mmsize] + mova m1, [gq+xq*(2<<%2)] + mova m4, [gq+xq*(2<<%2)+mmsize] + mova m2, [bq+xq*(2<<%2)] + mova m5, [bq+xq*(2<<%2)+mmsize] + + punpcklwd m6, m0, m1 + punpckhwd m7, m0, m1 + punpcklwd m8, m3, m4 + punpckhwd m9, m3, m4 + punpcklwd m10, m2, [pw_16384] + punpckhwd m11, m2, [pw_16384] + punpcklwd m12, m5, [pw_16384] + punpckhwd m13, m5, [pw_16384] + + pmaddwd m6, [rsp+0*mmsize] + pmaddwd m7, [rsp+0*mmsize] + pmaddwd m8, [rsp+0*mmsize] + pmaddwd m9, [rsp+0*mmsize] + pmaddwd m10, [rsp+1*mmsize] + pmaddwd m11, [rsp+1*mmsize] + pmaddwd m12, [rsp+1*mmsize] + pmaddwd m13, [rsp+1*mmsize] + paddd m6, m10 + paddd m7, m11 + paddd m8, m12 + paddd m9, m13 + psrad m6, %%sh + psrad m7, %%sh + psrad m8, %%sh + psrad m9, %%sh + packssdw m6, m7 + packssdw m8, m9 +%if %1 == 8 + packuswb m6, m8 + movu [yq+xq*(1<<%2)], m6 +%else + CLIPW m6, m15, [pw_ %+ %%maxval] + CLIPW m8, m15, [pw_ %+ %%maxval] + movu [yq+xq*(2<<%2)], m6 + movu [yq+xq*(2<<%2)+mmsize], m8 +%endif + +%if %2 == 1 + ; subsampling cached data + pmaddwd m0, [pw_1] + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + pmaddwd m3, [pw_1] + pmaddwd m4, [pw_1] + pmaddwd 
m5, [pw_1] + +%if %3 == 1 + ; bottom line y, r/g portion only + lea tmpq, [rgbsq+xq*2] + mova m6, [rq+tmpq*2] + mova m9, [rq+tmpq*2+mmsize] + mova m7, [gq+tmpq*2] + mova m10, [gq+tmpq*2+mmsize] + mova m8, [bq+tmpq*2] + mova m11, [bq+tmpq*2+mmsize] + + punpcklwd m12, m6, m7 + punpckhwd m13, m6, m7 + punpcklwd m14, m9, m10 + punpckhwd m15, m9, m10 + + ; release two more registers + pmaddwd m6, [pw_1] + pmaddwd m7, [pw_1] + pmaddwd m9, [pw_1] + pmaddwd m10, [pw_1] + paddd m0, m6 + paddd m3, m9 + paddd m1, m7 + paddd m4, m10 + + ; bottom line y, b/rnd portion only + punpcklwd m6, m8, [pw_16384] + punpckhwd m7, m8, [pw_16384] + punpcklwd m9, m11, [pw_16384] + punpckhwd m10, m11, [pw_16384] + + pmaddwd m12, [rsp+0*mmsize] + pmaddwd m13, [rsp+0*mmsize] + pmaddwd m14, [rsp+0*mmsize] + pmaddwd m15, [rsp+0*mmsize] + pmaddwd m6, [rsp+1*mmsize] + pmaddwd m7, [rsp+1*mmsize] + pmaddwd m9, [rsp+1*mmsize] + pmaddwd m10, [rsp+1*mmsize] + paddd m12, m6 + paddd m13, m7 + paddd m14, m9 + paddd m15, m10 + psrad m12, %%sh + psrad m13, %%sh + psrad m14, %%sh + psrad m15, %%sh + packssdw m12, m13 + packssdw m14, m15 + lea tmpq, [yq+ysq] +%if %1 == 8 + packuswb m12, m14 + movu [tmpq+xq*2], m12 +%else + pxor m15, m15 + CLIPW m12, m15, [pw_ %+ %%maxval] + CLIPW m14, m15, [pw_ %+ %%maxval] + movu [tmpq+xq*4], m12 + movu [tmpq+xq*4+mmsize], m14 +%endif + + ; complete subsampling of r/g/b pixels for u/v + pmaddwd m8, [pw_1] + pmaddwd m11, [pw_1] + paddd m2, m8 + paddd m5, m11 + paddd m0, [pd_2] + paddd m1, [pd_2] + paddd m2, [pd_2] + paddd m3, [pd_2] + paddd m4, [pd_2] + paddd m5, [pd_2] + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + psrad m4, 2 + psrad m5, 2 +%else ; %3 != 1 + paddd m0, [pd_1] + paddd m1, [pd_1] + paddd m2, [pd_1] + paddd m3, [pd_1] + paddd m4, [pd_1] + paddd m5, [pd_1] + psrad m0, 1 + psrad m1, 1 + psrad m2, 1 + psrad m3, 1 + psrad m4, 1 + psrad m5, 1 +%endif ; %3 ==/!= 1 + packssdw m0, m3 + packssdw m1, m4 + packssdw m2, m5 +%endif ; %2 == 1 + + ; convert u/v 
pixels + SBUTTERFLY wd, 0, 1, 6 + punpckhwd m6, m2, [pw_16384] + punpcklwd m2, [pw_16384] + + pmaddwd m7, m0, [rsp+2*mmsize] + pmaddwd m8, m1, [rsp+2*mmsize] + pmaddwd m9, m2, [rsp+3*mmsize] + pmaddwd m10, m6, [rsp+3*mmsize] + pmaddwd m0, [rsp+4*mmsize] + pmaddwd m1, [rsp+4*mmsize] + pmaddwd m2, [rsp+5*mmsize] + pmaddwd m6, [rsp+5*mmsize] + paddd m7, m9 + paddd m8, m10 + paddd m0, m2 + paddd m1, m6 + psrad m7, %%sh + psrad m8, %%sh + psrad m0, %%sh + psrad m1, %%sh + packssdw m7, m8 + packssdw m0, m1 +%if %2 == 1 +%if %1 == 8 + packuswb m7, m0 + movh [uq+xq], m7 + movhps [vq+xq], m7 +%else + CLIPW m7, m15, [pw_ %+ %%maxval] + CLIPW m0, m15, [pw_ %+ %%maxval] + movu [uq+xq*2], m7 + movu [vq+xq*2], m0 +%endif +%else ; %2 != 1 + ; second set of u/v pixels + SBUTTERFLY wd, 3, 4, 6 + punpckhwd m6, m5, [pw_16384] + punpcklwd m5, [pw_16384] + + pmaddwd m8, m3, [rsp+2*mmsize] + pmaddwd m9, m4, [rsp+2*mmsize] + pmaddwd m10, m5, [rsp+3*mmsize] + pmaddwd m11, m6, [rsp+3*mmsize] + pmaddwd m3, [rsp+4*mmsize] + pmaddwd m4, [rsp+4*mmsize] + pmaddwd m5, [rsp+5*mmsize] + pmaddwd m6, [rsp+5*mmsize] + paddd m8, m10 + paddd m9, m11 + paddd m3, m5 + paddd m4, m6 + psrad m8, %%sh + psrad m9, %%sh + psrad m3, %%sh + psrad m4, %%sh + packssdw m8, m9 + packssdw m3, m4 + +%if %1 == 8 + packuswb m7, m8 + packuswb m0, m3 + movu [uq+xq], m7 + movu [vq+xq], m0 +%else + CLIPW m7, m15, [pw_ %+ %%maxval] + CLIPW m0, m15, [pw_ %+ %%maxval] + CLIPW m8, m15, [pw_ %+ %%maxval] + CLIPW m3, m15, [pw_ %+ %%maxval] + movu [uq+xq*2], m7 + movu [uq+xq*2+mmsize], m8 + movu [vq+xq*2], m0 + movu [vq+xq*2+mmsize], m3 +%endif +%endif ; %2 ==/!= 1 + + add xq, mmsize >> %2 + cmp xd, wwd + jl .loop_h + +%if %3 == 0 + add yq, ysq +%else ; %3 != 0 + lea yq, [yq+ysq*2] +%endif ; %3 ==/!= 0 + add uq, usq + add vq, vsq + lea rq, [rq+rgbsq*(2<<%3)] + lea gq, [gq+rgbsq*(2<<%3)] + lea bq, [bq+rgbsq*(2<<%3)] + dec hd + jg .loop_v + + RET +%endmacro + +%macro RGB2YUV_FNS 2 +RGB2YUV_FN 8, %1, %2 +RGB2YUV_FN 10, %1, %2 
+RGB2YUV_FN 12, %1, %2
+%endmacro
+
+INIT_XMM sse2
+RGB2YUV_FNS 0, 0
+RGB2YUV_FNS 1, 0
+RGB2YUV_FNS 1, 1
+
+; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
+;                          int w, int h, const int16_t coeff[3][3][8])
+INIT_XMM sse2
+cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
+    movh         m0, [cq+  0]
+    movh         m1, [cq+ 32]
+    movh         m2, [cq+ 48]
+    movh         m3, [cq+ 80]
+    movh         m4, [cq+ 96]
+    movh         m5, [cq+128]
+    punpcklwd    m0, [cq+ 16]
+    punpcklwd    m1, [pw_8192]
+    punpcklwd    m2, [cq+ 64]
+    punpcklwd    m3, [pw_8192]
+    punpcklwd    m4, [cq+112]
+    punpcklwd    m5, [pw_8192]
+
+    DEFINE_ARGS data0, stride, ww, h, data1, data2, x
+    shl          strideq, 1
+    mov          data1q, [data0q+gprsize*1]
+    mov          data2q, [data0q+gprsize*2]
+    mov          data0q, [data0q+gprsize*0]
+
+.loop_v:
+    xor          xd, xd
+
+.loop_h:
+    mova         m6, [data0q+xq*2]
+    mova         m7, [data1q+xq*2]
+    mova         m8, [data2q+xq*2]
+    SBUTTERFLY   wd, 6, 7, 9
+    punpckhwd    m9, m8, [pw_1]
+    punpcklwd    m8, [pw_1]
+
+    pmaddwd      m10, m6, m0
+    pmaddwd      m11, m7, m0
+    pmaddwd      m12, m8, m1
+    pmaddwd      m13, m9, m1
+    paddd        m10, m12
+    paddd        m11, m13
+    psrad        m10, 14
+    psrad        m11, 14
+
+    pmaddwd      m12, m6, m2
+    pmaddwd      m13, m7, m2
+    pmaddwd      m14, m8, m3
+    pmaddwd      m15, m9, m3
+    paddd        m12, m14
+    paddd        m13, m15
+    psrad        m12, 14
+    psrad        m13, 14
+
+    pmaddwd      m6, m4
+    pmaddwd      m7, m4
+    pmaddwd      m8, m5
+    pmaddwd      m9, m5
+    paddd        m6, m8
+    paddd        m7, m9
+    psrad        m6, 14
+    psrad        m7, 14
+
+    packssdw     m10, m11
+    packssdw     m12, m13
+    packssdw     m6, m7
+
+    mova         [data0q+xq*2], m10
+    mova         [data1q+xq*2], m12
+    mova         [data2q+xq*2], m6
+
+    add          xd, mmsize / 2
+    cmp          xd, wwd
+    jl .loop_h
+
+    add          data0q, strideq
+    add          data1q, strideq
+    add          data2q, strideq
+    dec          hd
+    jg .loop_v
+
+    RET
+%endif
diff --git a/libavfilter/x86/colorspacedsp_init.c b/libavfilter/x86/colorspacedsp_init.c
new file mode 100644
index 0000000000..78d34bcc14
--- /dev/null
+++ b/libavfilter/x86/colorspacedsp_init.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/colorspacedsp.h"
+/* Prototype for one SSE2 yuv2yuv routine from x86/colorspacedsp.asm; t is the "<ss>p<in>to<out>" name tag. */
+#define decl_yuv2yuv_fn(t) \
+void ff_yuv2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3], \
+                           uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3], \
+                           int w, int h, const int16_t yuv2yuv_coeffs[3][3][8], \
+                           const int16_t yuv_offset[2][8])
+/* Declares all nine input/output depth pairs (8, 10, 12 bit each way) for one subsampling tag ss. */
+#define decl_yuv2yuv_fns(ss) \
+decl_yuv2yuv_fn(ss##p8to8); \
+decl_yuv2yuv_fn(ss##p10to8); \
+decl_yuv2yuv_fn(ss##p12to8); \
+decl_yuv2yuv_fn(ss##p8to10); \
+decl_yuv2yuv_fn(ss##p10to10); \
+decl_yuv2yuv_fn(ss##p12to10); \
+decl_yuv2yuv_fn(ss##p8to12); \
+decl_yuv2yuv_fn(ss##p10to12); \
+decl_yuv2yuv_fn(ss##p12to12)
+
+decl_yuv2yuv_fns(420);
+decl_yuv2yuv_fns(422);
+decl_yuv2yuv_fns(444);
+/* Prototype for one SSE2 yuv2rgb routine: three YUV pointers in, three int16_t pointers out; t is "<ss>p<depth>". */
+#define decl_yuv2rgb_fn(t) \
+void ff_yuv2rgb_##t##_sse2(int16_t *rgb_out[3], ptrdiff_t rgb_stride, \
+                           uint8_t *yuv_in[3], ptrdiff_t yuv_stride[3], \
+                           int w, int h, const int16_t coeff[3][3][8], \
+                           const int16_t yuv_offset[8])
+/* Declares the 8, 10 and 12 bit yuv2rgb variants for one subsampling tag ss. */
+#define decl_yuv2rgb_fns(ss) \
+decl_yuv2rgb_fn(ss##p8); \
+decl_yuv2rgb_fn(ss##p10); \
+decl_yuv2rgb_fn(ss##p12)
+
+decl_yuv2rgb_fns(420);
+decl_yuv2rgb_fns(422);
+decl_yuv2rgb_fns(444);
+/* Prototype for one SSE2 rgb2yuv routine: three int16_t pointers in, three YUV pointers out; t is "<ss>p<depth>". */
+#define decl_rgb2yuv_fn(t) \
+void ff_rgb2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3], \
+                           int16_t *rgb_in[3], ptrdiff_t rgb_stride, \
+                           int w, int h, const int16_t coeff[3][3][8], \
+                           const int16_t yuv_offset[8])
+/* Declares the 8, 10 and 12 bit rgb2yuv variants for one subsampling tag ss. */
+#define decl_rgb2yuv_fns(ss) \
+decl_rgb2yuv_fn(ss##p8); \
+decl_rgb2yuv_fn(ss##p10); \
+decl_rgb2yuv_fn(ss##p12)
+
+decl_rgb2yuv_fns(420);
+decl_rgb2yuv_fns(422);
+decl_rgb2yuv_fns(444);
+/* SSE2 3x3 multiply from the asm file; it loads from and stores back to the same three data pointers. */
+void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, int w, int h,
+                         const int16_t coeff[3][3][8]);
+/* Points dsp's function pointers at the SSE2 routines when ARCH_X86_64 is set and EXTERNAL_SSE2(cpu_flags) holds. */
+void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+    /* The asm bodies are wrapped in %if ARCH_X86_64, so the same compile-time condition is tested here. */
+    if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
+#define assign_yuv2yuv_fns(idx, ss) \
+    dsp->yuv2yuv[0][0][idx] = ff_yuv2yuv_##ss##p8to8_sse2; \
+    dsp->yuv2yuv[0][1][idx] = ff_yuv2yuv_##ss##p8to10_sse2; \
+    dsp->yuv2yuv[0][2][idx] = ff_yuv2yuv_##ss##p8to12_sse2; \
+    dsp->yuv2yuv[1][0][idx] = ff_yuv2yuv_##ss##p10to8_sse2; \
+    dsp->yuv2yuv[1][1][idx] = ff_yuv2yuv_##ss##p10to10_sse2; \
+    dsp->yuv2yuv[1][2][idx] = ff_yuv2yuv_##ss##p10to12_sse2; \
+    dsp->yuv2yuv[2][0][idx] = ff_yuv2yuv_##ss##p12to8_sse2; \
+    dsp->yuv2yuv[2][1][idx] = ff_yuv2yuv_##ss##p12to10_sse2; \
+    dsp->yuv2yuv[2][2][idx] = ff_yuv2yuv_##ss##p12to12_sse2
+        /* yuv2yuv[in][out][idx]: in/out 0, 1, 2 = 8, 10, 12 bit; idx 0, 1, 2 = 444, 422, 420. */
+        assign_yuv2yuv_fns(2, 420);
+        assign_yuv2yuv_fns(1, 422);
+        assign_yuv2yuv_fns(0, 444);
+
+#define assign_yuv2rgb_fns(idx, ss) \
+    dsp->yuv2rgb[0][idx] = ff_yuv2rgb_##ss##p8_sse2; \
+    dsp->yuv2rgb[1][idx] = ff_yuv2rgb_##ss##p10_sse2; \
+    dsp->yuv2rgb[2][idx] = ff_yuv2rgb_##ss##p12_sse2
+        /* yuv2rgb[depth][idx]: depth 0, 1, 2 = 8, 10, 12 bit; idx 0, 1, 2 = 444, 422, 420. */
+        assign_yuv2rgb_fns(2, 420);
+        assign_yuv2rgb_fns(1, 422);
+        assign_yuv2rgb_fns(0, 444);
+
+#define assign_rgb2yuv_fns(idx, ss) \
+    dsp->rgb2yuv[0][idx] = ff_rgb2yuv_##ss##p8_sse2; \
+    dsp->rgb2yuv[1][idx] = ff_rgb2yuv_##ss##p10_sse2; \
+    dsp->rgb2yuv[2][idx] = ff_rgb2yuv_##ss##p12_sse2
+        /* rgb2yuv[depth][idx]: depth 0, 1, 2 = 8, 10, 12 bit; idx 0, 1, 2 = 444, 422, 420. */
+        assign_rgb2yuv_fns(2, 420);
+        assign_rgb2yuv_fns(1, 422);
+        assign_rgb2yuv_fns(0, 444);
+        /* Replaces the multiply3x3_c pointer that ff_colorspacedsp_init() stored before calling this function. */
+        dsp->multiply3x3 = ff_multiply3x3_sse2;
+    }
+} |