diff options
| field | value | date |
|---|---|---|
| author | Ronald S. Bultje <rsbultje@gmail.com> | 2015-09-25 17:24:07 -0400 |
| committer | Ronald S. Bultje <rsbultje@gmail.com> | 2015-10-03 14:42:39 -0400 |
| commit | 26ece7a511f8905a5ddfc19c7cd4ecdca7056138 (patch) | |
| tree | 1efff584577934d49bd98d2089896f7c695ffce7 /libavcodec/x86 | |
| parent | db7786e8ffa2c8f5c7da062054962ca81cf09349 (diff) | |
| download | ffmpeg-26ece7a511f8905a5ddfc19c7cd4ecdca7056138.tar.gz | |
vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.
Diffstat (limited to 'libavcodec/x86')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | libavcodec/x86/Makefile | 1 |
| -rw-r--r-- | libavcodec/x86/constants.c | 4 |
| -rw-r--r-- | libavcodec/x86/constants.h | 2 |
| -rw-r--r-- | libavcodec/x86/h264_idct_10bit.asm | 5 |
| -rw-r--r-- | libavcodec/x86/h264_intrapred_10bit.asm | 2 |
| -rw-r--r-- | libavcodec/x86/vp9dsp_init.h | 23 |
| -rw-r--r-- | libavcodec/x86/vp9dsp_init_16bpp.c | 15 |
| -rw-r--r-- | libavcodec/x86/vp9dsp_init_16bpp_template.c | 7 |
| -rw-r--r-- | libavcodec/x86/vp9intrapred_16bpp.asm | 615 |
9 files changed, 669 insertions, 5 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 01e5f18783..5ff3a77e37 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -158,6 +158,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ + x86/vp9intrapred_16bpp.o \ x86/vp9itxfm.o \ x86/vp9lpf.o \ x86/vp9lpf_16bpp.o \ diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 9f3c8b4165..19345f56e4 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -81,3 +81,7 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x800 DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL, 0x0000000100000001ULL, 0x0000000100000001ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL, + 0x0000001000000010ULL, 0x0000001000000010ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL, + 0x0000002000000020ULL, 0x0000002000000020ULL }; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 37a1869641..4a2451d520 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -63,5 +63,7 @@ extern const uint64_t ff_pb_FC; extern const xmm_reg ff_ps_neg; extern const ymm_reg ff_pd_1; +extern const ymm_reg ff_pd_16; +extern const ymm_reg ff_pd_32; #endif /* AVCODEC_X86_CONSTANTS_H */ diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index cc115b0ff9..f1c2c81ef8 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -24,14 +24,11 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA - -pd_32: times 4 dd 32 - SECTION .text cextern pw_1023 %define pw_pixel_max pw_1023 +cextern pd_32 
;----------------------------------------------------------------------------- ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride) diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 9aeb70242b..9e40cfe24b 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -34,11 +34,11 @@ cextern pw_8 cextern pw_4 cextern pw_2 cextern pw_1 +cextern pd_16 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 8 dw -3 pd_17: times 4 dd 17 -pd_16: times 4 dd 16 SECTION .text diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h index d1a9514c09..47d22461ae 100644 --- a/libavcodec/x86/vp9dsp_init.h +++ b/libavcodec/x86/vp9dsp_init.h @@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \ decl_mc_func(put, sz, v, opt, type, fsz, bpp); \ decl_mc_func(avg, sz, v, opt, type, fsz, bpp) +#define decl_ipred_fn(type, sz, bpp, opt) \ +void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + const uint8_t *l, \ + const uint8_t *a) + +#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \ +decl_ipred_fn(type, 4, bpp, opt4); \ +decl_ipred_fn(type, 8, bpp, opt8_16_32); \ +decl_ipred_fn(type, 16, bpp, opt8_16_32); \ +decl_ipred_fn(type, 32, bpp, opt8_16_32) + #define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \ static av_always_inline void \ ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ @@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt) init_subpel3_8to64(idx, type, bpp, opt); \ init_subpel2(4, idx, 4, type, bpp, opt) +#define cat(a, bpp, b) a##bpp##b + +#define init_ipred_func(type, enum, sz, bpp, opt) \ + dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \ + cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt) + +#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \ + init_ipred_func(type, enum, 8, bpp, opt); \ + 
init_ipred_func(type, enum, 16, bpp, opt); \ + init_ipred_func(type, enum, 32, bpp, opt) + void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp); diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index bd61e24288..f4a4a5d891 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -46,6 +46,11 @@ decl_fpel_func(avg, 32, _16, avx2); decl_fpel_func(avg, 64, _16, avx2); decl_fpel_func(avg, 128, _16, avx2); +decl_ipred_fns(v, 16, mmx, sse); +decl_ipred_fns(h, 16, mmxext, sse2); +decl_ipred_fns(dc, 16, mmxext, sse2); +decl_ipred_fns(dc_top, 16, mmxext, sse2); +decl_ipred_fns(dc_left, 16, mmxext, sse2); #endif /* HAVE_YASM */ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) @@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) if (EXTERNAL_MMX(cpu_flags)) { init_fpel_func(4, 0, 8, put, , mmx); + init_ipred_func(v, VERT, 4, 16, mmx); } if (EXTERNAL_MMXEXT(cpu_flags)) { init_fpel_func(4, 1, 8, avg, _16, mmxext); + init_ipred_func(h, HOR, 4, 16, mmxext); + init_ipred_func(dc, DC, 4, 16, mmxext); + init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext); + init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext); } if (EXTERNAL_SSE(cpu_flags)) { @@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(2, 0, 32, put, , sse); init_fpel_func(1, 0, 64, put, , sse); init_fpel_func(0, 0, 128, put, , sse); + init_8_16_32_ipred_funcs(v, VERT, 16, sse); } if (EXTERNAL_SSE2(cpu_flags)) { @@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(2, 1, 32, avg, _16, sse2); init_fpel_func(1, 1, 64, avg, _16, sse2); init_fpel_func(0, 1, 128, avg, _16, sse2); + init_8_16_32_ipred_funcs(h, HOR, 16, sse2); + init_8_16_32_ipred_funcs(dc, DC, 16, sse2); + init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2); + init_8_16_32_ipred_funcs(dc_left, 
LEFT_DC, 16, sse2); } if (EXTERNAL_AVX_FAST(cpu_flags)) { diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c index 56cd79e7a4..f486caf1a1 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp_template.c +++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \ lpf_mix2_wrappers_set(BPC, sse2); lpf_mix2_wrappers_set(BPC, ssse3); lpf_mix2_wrappers_set(BPC, avx); + +decl_ipred_fns(tm, BPC, mmxext, sse2); #endif /* HAVE_YASM */ av_cold void INIT_FUNC(VP9DSPContext *dsp) @@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp) init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \ init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt) + if (EXTERNAL_MMXEXT(cpu_flags)) { + init_ipred_func(tm, TM_VP8, 4, BPC, mmxext); + } + if (EXTERNAL_SSE2(cpu_flags)) { init_subpel3(0, put, BPC, sse2); init_subpel3(1, avg, BPC, sse2); init_lpf_funcs(BPC, sse2); + init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2); } if (EXTERNAL_SSSE3(cpu_flags)) { diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm new file mode 100644 index 0000000000..018d92de58 --- /dev/null +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -0,0 +1,615 @@ +;****************************************************************************** +;* VP9 Intra prediction SIMD optimizations +;* +;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> +;* Copyright (c) 2015 Henrik Gramner <henrik gramner com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pd_2: times 8 dd 2 +pd_4: times 8 dd 4 +pd_8: times 8 dd 8 + +cextern pw_1 +cextern pw_1023 +cextern pw_4095 +cextern pd_16 +cextern pd_32 + +SECTION .text + +INIT_MMX mmx +cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + mova m1, [aq+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse +cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] + mova m1, [aq+mmsize*1] + mova m2, 
[aq+mmsize*2] + mova m3, [aq+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m2 + mova [dstq+strideq*1+48], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a + mova m3, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a + mova m2, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + punpckhwd m3, m2, m2 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m3, q1111 + pshufd m1, m3, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + punpcklwd m2, m2 + pshufd m0, m2, q3333 + pshufd m1, m2, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m2, q1111 + pshufd m1, m2, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 3 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_XMM sse2 
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 7 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m1 + mova [dstq+strideq*1+48], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+strideq*2+32], m2 + mova [dstq+strideq*2+48], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + mova [dstq+stride3q +32], m3 + mova [dstq+stride3q +48], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [lq+mmsize] + paddw m0, [aq] + paddw m0, [aq+mmsize] + DEFINE_ARGS dst, stride, 
stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq+mmsize*0] + paddw m0, [lq+mmsize*1] + paddw m0, [lq+mmsize*2] + paddw m0, [lq+mmsize*3] + paddw m0, [aq+mmsize*0] + paddw m0, [aq+mmsize*1] + paddw m0, [aq+mmsize*2] + paddw m0, [aq+mmsize*3] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_32] + paddd m0, m1 + psrad m0, 6 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +%macro DC_1D_FNS 2 +INIT_MMX mmxext +cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_2] + paddd m0, m1 + psrad m0, 2 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, 
m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + paddw m0, [%2+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2+mmsize*0] + paddw m0, [%2+mmsize*1] + paddw m0, [%2+mmsize*2] + paddw m0, [%2+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +DC_1D_FNS top, aq +DC_1D_FNS left, lq + +INIT_MMX mmxext +cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a + mova m5, [pw_1023] +.body: + mova m4, [aq] + mova m3, [lq] + movd 
m0, [aq-4] + pshufw m0, m0, q1111 + psubw m4, m0 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + paddw m0, m4 + paddw m1, m4 + paddw m2, m4 + paddw m3, m4 + pxor m4, m4 + pmaxsw m0, m4 + pmaxsw m1, m4 + pmaxsw m2, m4 + pmaxsw m3, m4 + pminsw m0, m5 + pminsw m1, m5 + pminsw m2, m5 + pminsw m3, m5 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a + mova m4, [pw_1023] +.body: + pxor m6, m6 + mova m5, [aq] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 1 +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m1, m6 + pmaxsw m2, m6 + pmaxsw m3, m6 + pminsw m0, m4 + pminsw m1, m4 + pminsw m2, m4 + pminsw m3, m4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a + mova m4, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a + mova m7, [pw_1023] +.body: + pxor m6, m6 + mova m4, [aq] + mova m5, [aq+mmsize] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 7 +.loop: + movd m3, [lq+cntq*4] + punpcklwd m3, m3 + pshufd m2, m3, q1111 + 
pshufd m3, m3, q0000 + paddw m0, m2, m4 + paddw m2, m5 + paddw m1, m3, m4 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m2, m6 + pmaxsw m1, m6 + pmaxsw m3, m6 + pminsw m0, m7 + pminsw m2, m7 + pminsw m1, m7 + pminsw m3, m7 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m2 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a + mova m7, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a + mova m0, [pw_1023] +.body: + pxor m1, m1 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 +%define reg_min m9 +%define reg_max m8 +%else + mova [rsp+ 0], m0 + mova [rsp+16], m1 +%define reg_min [rsp+16] +%define reg_max [rsp+ 0] +%endif + + mova m4, [aq+mmsize*0] + mova m5, [aq+mmsize*1] + mova m6, [aq+mmsize*2] + mova m7, [aq+mmsize*3] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + psubw m6, m0 + psubw m7, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 31 +.loop: + pinsrw m3, [lq+cntq*2], 0 + punpcklwd m3, m3 + pshufd m3, m3, q0000 + paddw m0, m3, m4 + paddw m1, m3, m5 + paddw m2, m3, m6 + paddw m3, m7 + pmaxsw m0, reg_min + pmaxsw m1, reg_min + pmaxsw m2, reg_min + pmaxsw m3, reg_min + pminsw m0, reg_max + pminsw m1, reg_max + pminsw m2, reg_max + pminsw m3, reg_max + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + add dstq, strideq + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body |