diff options
-rw-r--r-- | libavcodec/x86/Makefile | 1 | ||||
-rw-r--r-- | libavcodec/x86/h264_weight_10bit.asm | 321 | ||||
-rw-r--r-- | libavcodec/x86/h264dsp_mmx.c | 61 |
3 files changed, 383 insertions, 0 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index ea57bd1db6..022ab27766 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -15,6 +15,7 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ x86/h264_idct.o \ x86/h264_idct_10bit.o \ x86/h264_weight.o \ + x86/h264_weight_10bit.o \ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ x86/h264_intrapred_10bit.o diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm new file mode 100644 index 0000000000..1c58d72d94 --- /dev/null +++ b/libavcodec/x86/h264_weight_10bit.asm @@ -0,0 +1,321 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Daniel Kang <daniel.d.kang@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +pw_pixel_max: times 8 dw ((1 << 10)-1) +sq_1: dq 1 + dq 0 + +cextern pw_1 + +SECTION .text + +;----------------------------------------------------------------------------- +; void h264_weight(uint8_t *dst, int stride, int log2_denom, +; int weight, int offset); +;----------------------------------------------------------------------------- +%ifdef ARCH_X86_32 +DECLARE_REG_TMP 2 +%else +DECLARE_REG_TMP 10 +%endif + +%macro WEIGHT_PROLOGUE 1 + mov t0, %1 +.prologue + PROLOGUE 0,5,8 + movifnidn r0, r0mp + movifnidn r1d, r1m + movifnidn r3d, r3m + movifnidn r4d, r4m +%endmacro + +%macro WEIGHT_SETUP 1 + mova m0, [pw_1] + movd m2, r2m + pslld m0, m2 ; 1<<log2_denom + SPLATW m0, m0 + shl r4, 19 ; *8, move to upper half of dword + lea r4, [r4+r3*2+0x10000] + movd m3, r4d ; weight<<1 | 1+(offset<<(3)) + pshufd m3, m3, 0 + mova m4, [pw_pixel_max] + paddw m2, [sq_1] ; log2_denom+1 +%ifnidn %1, sse4 + pxor m7, m7 +%endif +%endmacro + +%macro WEIGHT_OP 2-3 +%if %0==2 + mova m5, [r0+%2] + punpckhwd m6, m5, m0 + punpcklwd m5, m0 +%else + movq m5, [r0+%2] + movq m6, [r0+%3] + punpcklwd m5, m0 + punpcklwd m6, m0 +%endif + pmaddwd m5, m3 + pmaddwd m6, m3 + psrad m5, m2 + psrad m6, m2 +%ifidn %1, sse4 + packusdw m5, m6 + pminsw m5, m4 +%else + packssdw m5, m6 + CLIPW m5, m7, m4 +%endif +%endmacro + +%macro WEIGHT_FUNC_DBL 1 +cglobal h264_weight_16x16_10_%1 + WEIGHT_PROLOGUE 16 + WEIGHT_SETUP %1 +.nextrow + WEIGHT_OP %1, 0 + mova [r0 ], m5 + WEIGHT_OP %1, 16 + mova [r0+16], m5 + add r0, r1 + dec t0 + jnz .nextrow + REP_RET + +cglobal h264_weight_16x8_10_%1 + mov t0, 8 + jmp mangle(ff_h264_weight_16x16_10_%1.prologue) +%endmacro + +INIT_XMM +WEIGHT_FUNC_DBL sse2 +WEIGHT_FUNC_DBL sse4 + + +%macro WEIGHT_FUNC_MM 1 +cglobal h264_weight_8x16_10_%1 + WEIGHT_PROLOGUE 16 + WEIGHT_SETUP %1 +.nextrow + WEIGHT_OP %1, 0 + mova [r0], m5 + add r0, r1 + dec t0 + jnz .nextrow + REP_RET + +cglobal h264_weight_8x8_10_%1 + mov t0, 8 + jmp mangle(ff_h264_weight_8x16_10_%1.prologue) + +cglobal h264_weight_8x4_10_%1 + mov t0, 4 + jmp mangle(ff_h264_weight_8x16_10_%1.prologue) +%endmacro + +INIT_XMM +WEIGHT_FUNC_MM sse2 +WEIGHT_FUNC_MM sse4 + + +%macro WEIGHT_FUNC_HALF_MM 1 +cglobal h264_weight_4x8_10_%1 + WEIGHT_PROLOGUE 4 + WEIGHT_SETUP %1 + lea r3, [r1*2] +.nextrow + WEIGHT_OP %1, 0, r1 + movh [r0], m5 + movhps [r0+r1], m5 + add r0, r3 + dec t0 + jnz .nextrow + REP_RET + +cglobal h264_weight_4x4_10_%1 + mov t0, 2 + jmp mangle(ff_h264_weight_4x8_10_%1.prologue) + +cglobal h264_weight_4x2_10_%1 + mov t0, 1 + jmp mangle(ff_h264_weight_4x8_10_%1.prologue) +%endmacro + +INIT_XMM +WEIGHT_FUNC_HALF_MM sse2 +WEIGHT_FUNC_HALF_MM sse4 + + +;----------------------------------------------------------------------------- +; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom, +; int weightd, int weights, int offset); +;----------------------------------------------------------------------------- +%ifdef ARCH_X86_32 +DECLARE_REG_TMP 2,3 +%else +DECLARE_REG_TMP 10,2 +%endif + +%macro BIWEIGHT_PROLOGUE 1 + mov t0, %1 +.prologue + PROLOGUE 0,7,8 + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn t1d, r2m + movifnidn r4d, r4m + movifnidn r5d, r5m + movifnidn r6d, r6m +%endmacro + +%macro BIWEIGHT_SETUP 1 + lea r6, [r6*4+1] ; (offset<<2)+1 + or r6, 1 + shl r5, 16 + or r4, r5 + movd m4, r4d ; weightd | weights + movd m5, r6d ; (offset+1)|1 + movd m6, r3m ; log2_denom + pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom + paddd m6, [sq_1] + pshufd m4, m4, 0 + pshufd m5, m5, 0 + mova m3, [pw_pixel_max] +%ifnidn %1, sse4 + pxor m7, m7 +%endif +%endmacro + +%macro BIWEIGHT 2-3 +%if %0==2 + mova m0, [r0+%2] + mova m1, [r1+%2] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 +%else + movq m0, [r0+%2] + movq m1, [r1+%2] + punpcklwd m0, m1 + movq m2, [r0+%3] + movq m1, [r1+%3] + punpcklwd m2, m1 +%endif + pmaddwd m0, m4 + pmaddwd m2, m4 + paddd m0, m5 + paddd m2, m5 + psrad m0, m6 + psrad m2, m6 +%ifidn %1, sse4 + packusdw m0, m2 + pminsw m0, m3 +%else + packssdw m0, m2 + CLIPW m0, m7, m3 +%endif +%endmacro + +%macro BIWEIGHT_FUNC_DBL 1 +cglobal h264_biweight_16x16_10_%1 + BIWEIGHT_PROLOGUE 16 + BIWEIGHT_SETUP %1 +.nextrow + BIWEIGHT %1, 0 + mova [r0 ], m0 + BIWEIGHT %1, 16 + mova [r0+16], m0 + add r0, t1 + add r1, t1 + dec t0 + jnz .nextrow + REP_RET + +cglobal h264_biweight_16x8_10_%1 + mov t0, 8 + jmp mangle(ff_h264_biweight_16x16_10_%1.prologue) +%endmacro + +INIT_XMM +BIWEIGHT_FUNC_DBL sse2 +BIWEIGHT_FUNC_DBL sse4 + +%macro BIWEIGHT_FUNC 1 +cglobal h264_biweight_8x16_10_%1 + BIWEIGHT_PROLOGUE 16 + BIWEIGHT_SETUP %1 +.nextrow + BIWEIGHT %1, 0 + mova [r0], m0 + add r0, t1 + add r1, t1 + dec t0 + jnz .nextrow + REP_RET + +cglobal h264_biweight_8x8_10_%1 + mov t0, 8 + jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) + +cglobal h264_biweight_8x4_10_%1 + mov t0, 4 + jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) +%endmacro + +INIT_XMM +BIWEIGHT_FUNC sse2 +BIWEIGHT_FUNC sse4 + +%macro BIWEIGHT_FUNC_HALF 1 +cglobal h264_biweight_4x8_10_%1 + BIWEIGHT_PROLOGUE 4 + BIWEIGHT_SETUP %1 + lea r4, [t1*2] +.nextrow + BIWEIGHT %1, 0, t1 + movh [r0 ], m0 + movhps [r0+t1], m0 + add r0, r4 + add r1, r4 + dec t0 + jnz .nextrow + REP_RET + +cglobal h264_biweight_4x4_10_%1 + mov t0, 2 + jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) + +cglobal h264_biweight_4x2_10_%1 + mov t0, 1 + jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) +%endmacro + +INIT_XMM +BIWEIGHT_FUNC_HALF sse2 +BIWEIGHT_FUNC_HALF sse4 diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 3fccd081d5..1042552948 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -326,6 +326,30 @@ H264_BIWEIGHT_MMX ( 4, 8) H264_BIWEIGHT_MMX ( 4, 4) H264_BIWEIGHT_MMX ( 4, 2) +#define H264_WEIGHT_10(W, H, DEPTH, OPT) \ +void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ + int stride, int log2_denom, int weight, int offset); + +#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \ +void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \ + (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \ + int weightd, int weights, int offset); + +#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \ +H264_WEIGHT_10 (W, H, DEPTH, sse2) \ +H264_WEIGHT_10 (W, H, DEPTH, sse4) \ +H264_BIWEIGHT_10(W, H, DEPTH, sse2) \ +H264_BIWEIGHT_10(W, H, DEPTH, sse4) + +H264_BIWEIGHT_10_SSE(16, 16, 10) +H264_BIWEIGHT_10_SSE(16, 8, 10) +H264_BIWEIGHT_10_SSE( 8, 16, 10) +H264_BIWEIGHT_10_SSE( 8, 8, 10) +H264_BIWEIGHT_10_SSE( 8, 4, 10) +H264_BIWEIGHT_10_SSE( 4, 8, 10) +H264_BIWEIGHT_10_SSE( 4, 4, 10) +H264_BIWEIGHT_10_SSE( 4, 2, 10) + void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) { int mm_flags = av_get_cpu_flags(); @@ -454,6 +478,24 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; #endif + c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; + c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; + c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2; + c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; + c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; + c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; + c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; + c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2; + c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2; + c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2; + c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2; + c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; #if HAVE_ALIGNED_STACK @@ -463,6 +505,25 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; #endif } + if (mm_flags&AV_CPU_FLAG_SSE4) { + c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; + c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; + c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; + c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4; + c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; + c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; + c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; + c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; + c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4; + c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4; + c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4; + c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4; + c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4; + } #if HAVE_AVX if (mm_flags&AV_CPU_FLAG_AVX) { c->h264_idct_dc_add = |