diff options
author | Daniel Kang <daniel.d.kang@gmail.com> | 2011-07-02 22:18:39 -0400 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2011-07-03 07:43:38 -0700 |
commit | 9bfa5363da21a5ba7bed174a82c86f2a089fc917 (patch) | |
tree | 99da032075f99024c444431c0bf1d90c92f94606 | |
parent | 10dde477c77e0ac0fecda49fdb1dc71329aa7513 (diff) | |
download | ffmpeg-9bfa5363da21a5ba7bed174a82c86f2a089fc917.tar.gz |
H.264: Add x86 assembly for 10-bit H.264 qpel functions.
Mainly ported from 8-bit H.264 qpel.
Some code ported from x264. LGPL ok by author.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
-rw-r--r-- | libavcodec/x86/Makefile | 1 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 136 | ||||
-rw-r--r-- | libavcodec/x86/h264_qpel_10bit.asm | 891 | ||||
-rw-r--r-- | libavcodec/x86/h264_qpel_mmx.c | 98 |
4 files changed, 1076 insertions, 50 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 022ab27766..d3cf0da72b 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -46,6 +46,7 @@ MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ x86/fmtconvert.o \ x86/h264_chromamc.o \ x86/h264_chromamc_10bit.o \ + x86/h264_qpel_10bit.o \ $(YASM-OBJS-yes) MMX-OBJS-$(CONFIG_FFT) += x86/fft.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 03c094533f..d6eed82dd8 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2627,44 +2627,56 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; } -#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU - - SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2); +#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ + c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU + + SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, ); if (!high_bit_depth) { - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2); + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, ); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, ); } +#if HAVE_YASM + else if (bit_depth == 10) { +#if !ARCH_X86_64 + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); +#endif + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); + } +#endif - SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); + SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, ); #if HAVE_YASM c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2; @@ -2725,26 +2737,26 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow; } - SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow); + SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, ); if (!high_bit_depth) { - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow); + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, ); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, ); } - SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); + SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, ); + SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, ); #if HAVE_YASM if (!high_bit_depth) { @@ -2788,7 +2800,20 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 3, sse2); } #if HAVE_YASM +#define H264_QPEL_FUNCS_10(x, y, CPU)\ + c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\ + c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\ + c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\ + c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU; if (bit_depth == 10) { + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); + H264_QPEL_FUNCS_10(1, 0, sse2_cache64) + H264_QPEL_FUNCS_10(2, 0, sse2_cache64) + H264_QPEL_FUNCS_10(3, 0, sse2_cache64) + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2; c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2; } @@ -2810,6 +2835,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 2, ssse3); H264_QPEL_FUNCS(3, 3, ssse3); } + else if (bit_depth == 10) { + H264_QPEL_FUNCS_10(1, 0, ssse3_cache64) + H264_QPEL_FUNCS_10(2, 0, ssse3_cache64) + H264_QPEL_FUNCS_10(3, 0, ssse3_cache64) + } c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; #if HAVE_YASM if (!high_bit_depth) { @@ -2906,6 +2936,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #if HAVE_AVX && HAVE_YASM if (mm_flags & AV_CPU_FLAG_AVX) { if (bit_depth == 10) { + //AVX implies !cache64. + //TODO: Port cache(32|64) detection from x264. + H264_QPEL_FUNCS_10(1, 0, sse2) + H264_QPEL_FUNCS_10(2, 0, sse2) + H264_QPEL_FUNCS_10(3, 0, sse2) + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx; c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx; } diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm new file mode 100644 index 0000000000..15dd72ca36 --- /dev/null +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -0,0 +1,891 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code +;***************************************************************************** +;* Copyright (C) 2011 x264 project +;* +;* Authors: Daniel Kang <daniel.d.kang@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +cextern pw_16 +cextern pw_1 +cextern pb_0 + +pw_pixel_max: times 8 dw ((1 << 10)-1) + +pad10: times 8 dw 10*1023 +pad20: times 8 dw 20*1023 +pad30: times 8 dw 30*1023 +depad: times 4 dd 32*20*1023 + 512 +depad2: times 8 dw 20*1023 + 16*1022 + 16 +unpad: times 8 dw 16*1022/32 ; needs to be mod 16 + +tap1: times 4 dw 1, -5 +tap2: times 4 dw 20, 20 +tap3: times 4 dw -5, 1 +pd_0f: times 4 dd 0xffff + +SECTION .text + + +%macro AVG_MOV 2 + pavgw %2, %1 + mova %1, %2 +%endmacro + +%macro ADDW 3 +%if mmsize == 8 + paddw %1, %2 +%else + movu %3, %2 + paddw %1, %3 +%endif +%endmacro + +%macro FILT_H 4 + paddw %1, %4 + psubw %1, %2 ; a-b + psraw %1, 2 ; (a-b)/4 + psubw %1, %2 ; (a-b)/4-b + paddw %1, %3 ; (a-b)/4-b+c + psraw %1, 2 ; ((a-b)/4-b+c)/4 + paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 +%endmacro + +%macro PRELOAD_V 0 + lea r3, [r2*3] + sub r1, r3 + movu m0, [r1+r2] + movu m1, [r1+r2*2] + add r1, r3 + movu m2, [r1] + movu m3, [r1+r2] + movu m4, [r1+r2*2] + add r1, r3 +%endmacro + +%macro FILT_V 8 + movu %6, [r1] + paddw %1, %6 + mova %7, %2 + paddw %7, %5 + mova %8, %3 + paddw %8, %4 + FILT_H %1, %7, %8, [pw_16] + psraw %1, 1 + CLIPW %1, [pb_0], [pw_pixel_max] +%endmacro + +%macro MC 1 +%define OP_MOV mova +INIT_MMX +%1 mmxext, put, 4 +INIT_XMM +%1 sse2 , put, 8 + +%define OP_MOV AVG_MOV +INIT_MMX +%1 mmxext, avg, 4 +INIT_XMM +%1 sse2 , avg, 8 +%endmacro + +%macro MCAxA 8 +%ifdef ARCH_X86_64 +%ifnidn %1,mmxext +MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 +%endif +%else +MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 +%endif +%endmacro + +%macro MCAxA_OP 8 +cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 +%ifdef ARCH_X86_32 + call stub_%2_h264_qpel%4_%3_10_%1 + mov r0, r0m + mov r1, r1m + add r0, %4*2 + add r1, %4*2 + call stub_%2_h264_qpel%4_%3_10_%1 + mov r0, r0m + mov r1, r1m + lea r0, [r0+r2*%4] + lea r1, [r1+r2*%4] + call stub_%2_h264_qpel%4_%3_10_%1 + mov r0, r0m + mov r1, r1m + lea r0, [r0+r2*%4+%4*2] + lea r1, [r1+r2*%4+%4*2] + call stub_%2_h264_qpel%4_%3_10_%1 + RET +%else ; ARCH_X86_64 + mov r10, r0 + mov r11, r1 + call stub_%2_h264_qpel%4_%3_10_%1 + lea r0, [r10+%4*2] + lea r1, [r11+%4*2] + call stub_%2_h264_qpel%4_%3_10_%1 + lea r0, [r10+r2*%4] + lea r1, [r11+r2*%4] + call stub_%2_h264_qpel%4_%3_10_%1 + lea r0, [r10+r2*%4+%4*2] + lea r1, [r11+r2*%4+%4*2] +%ifndef UNIX64 ; fall through to function + call stub_%2_h264_qpel%4_%3_10_%1 + RET +%endif +%endif +%endmacro + +;cpu, put/avg, mc, 4/8, ... +%macro cglobal_mc 7 +%assign i %4*2 +MCAxA %1, %2, %3, %4, i, %5,%6,%7 + +cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7 +%ifndef UNIX64 ; no prologue or epilogue for UNIX64 + call stub_%2_h264_qpel%4_%3_10_%1 + RET +%endif + +stub_%2_h264_qpel%4_%3_10_%1: +%endmacro + +;----------------------------------------------------------------------------- +; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro COPY4 0 + movu m0, [r1 ] + OP_MOV [r0 ], m0 + movu m0, [r1+r2 ] + OP_MOV [r0+r2 ], m0 + movu m0, [r1+r2*2] + OP_MOV [r0+r2*2], m0 + movu m0, [r1+r3 ] + OP_MOV [r0+r3 ], m0 +%endmacro + +%macro MC00 1 +INIT_MMX +cglobal_mc mmxext, %1, mc00, 4, 3,4,0 + lea r3, [r2*3] + COPY4 + ret + +INIT_XMM +cglobal %1_h264_qpel8_mc00_10_sse2, 3,4 + lea r3, [r2*3] + COPY4 + lea r0, [r0+r2*4] + lea r1, [r1+r2*4] + COPY4 + RET + +cglobal %1_h264_qpel16_mc00_10_sse2, 3,4 + mov r3d, 8 +.loop: + movu m0, [r1 ] + movu m1, [r1 +16] + OP_MOV [r0 ], m0 + OP_MOV [r0 +16], m1 + movu m0, [r1+r2 ] + movu m1, [r1+r2+16] + OP_MOV [r0+r2 ], m0 + OP_MOV [r0+r2+16], m1 + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + dec r3d + jg .loop + REP_RET +%endmacro + +%define OP_MOV mova +MC00 put + +%define OP_MOV AVG_MOV +MC00 avg + +;----------------------------------------------------------------------------- +; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC_CACHE 1 +%define OP_MOV mova +%define PALIGNR PALIGNR_MMX +INIT_MMX +%1 mmxext , put, 4 +INIT_XMM +%1 sse2_cache64 , put, 8 +%define PALIGNR PALIGNR_SSSE3 +%1 ssse3_cache64, put, 8 +%1 sse2 , put, 8, 0 + +%define OP_MOV AVG_MOV +%define PALIGNR PALIGNR_MMX +INIT_MMX +%1 mmxext , avg, 4 +INIT_XMM +%1 sse2_cache64 , avg, 8 +%define PALIGNR PALIGNR_SSSE3 +%1 ssse3_cache64, avg, 8 +%1 sse2 , avg, 8, 0 +%endmacro + +%macro MC20 3-4 +cglobal_mc %1, %2, mc20, %3, 3,4,9 + mov r3d, %3 + mova m1, [pw_pixel_max] +%if num_mmregs > 8 + mova m8, [pw_16] + %define p16 m8 +%else + %define p16 [pw_16] +%endif +.nextrow +%if %0 == 4 + movu m2, [r1-4] + movu m3, [r1-2] + movu m4, [r1+0] + ADDW m2, [r1+6], m5 + ADDW m3, [r1+4], m5 + ADDW m4, [r1+2], m5 +%else ; movu is slow on these processors +%if mmsize==16 + movu m2, [r1-4] + movu m0, [r1+6] + mova m6, m0 + psrldq m0, 6 + + paddw m6, m2 + PALIGNR m3, m0, m2, 2, m5 + PALIGNR m7, m0, m2, 8, m5 + paddw m3, m7 + PALIGNR m4, m0, m2, 4, m5 + PALIGNR m7, m0, m2, 6, m5 + paddw m4, m7 + SWAP 2, 6 +%else + movu m2, [r1-4] + movu m6, [r1+4] + PALIGNR m3, m6, m2, 2, m5 + paddw m3, m6 + PALIGNR m4, m6, m2, 4, m5 + PALIGNR m7, m6, m2, 6, m5 + paddw m4, m7 + paddw m2, [r1+6] +%endif +%endif + + FILT_H m2, m3, m4, p16 + psraw m2, 1 + pxor m0, m0 + CLIPW m2, m0, m1 + OP_MOV [r0], m2 + add r0, r2 + add r1, r2 + dec r3d + jg .nextrow + rep ret +%endmacro + +MC_CACHE MC20 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC30 3-4 +cglobal_mc %1, %2, mc30, %3, 3,5,9 + lea r4, [r1+2] + jmp stub_%2_h264_qpel%3_mc10_10_%1.body +%endmacro + +MC_CACHE MC30 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC10 3-4 +cglobal_mc %1, %2, mc10, %3, 3,5,9 + mov r4, r1 +.body + mov r3d, %3 + mova m1, [pw_pixel_max] +%if num_mmregs > 8 + mova m8, [pw_16] + %define p16 m8 +%else + %define p16 [pw_16] +%endif +.nextrow +%if %0 == 4 + movu m2, [r1-4] + movu m3, [r1-2] + movu m4, [r1+0] + ADDW m2, [r1+6], m5 + ADDW m3, [r1+4], m5 + ADDW m4, [r1+2], m5 +%else ; movu is slow on these processors +%if mmsize==16 + movu m2, [r1-4] + movu m0, [r1+6] + mova m6, m0 + psrldq m0, 6 + + paddw m6, m2 + PALIGNR m3, m0, m2, 2, m5 + PALIGNR m7, m0, m2, 8, m5 + paddw m3, m7 + PALIGNR m4, m0, m2, 4, m5 + PALIGNR m7, m0, m2, 6, m5 + paddw m4, m7 + SWAP 2, 6 +%else + movu m2, [r1-4] + movu m6, [r1+4] + PALIGNR m3, m6, m2, 2, m5 + paddw m3, m6 + PALIGNR m4, m6, m2, 4, m5 + PALIGNR m7, m6, m2, 6, m5 + paddw m4, m7 + paddw m2, [r1+6] +%endif +%endif + + FILT_H m2, m3, m4, p16 + psraw m2, 1 + pxor m0, m0 + CLIPW m2, m0, m1 + movu m3, [r4] + pavgw m2, m3 + OP_MOV [r0], m2 + add r0, r2 + add r1, r2 + add r4, r2 + dec r3d + jg .nextrow + rep ret +%endmacro + +MC_CACHE MC10 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro V_FILT 11 +v_filt%9_%10_10_%11: + add r4, r2 +.no_addr4: + FILT_V m0, m1, m2, m3, m4, m5, m6, m7 + add r1, r2 + add r0, r2 + ret +%endmacro + +INIT_MMX +RESET_MM_PERMUTATION +%assign i 0 +%rep 4 +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext +SWAP 0,1,2,3,4,5 +%assign i i+1 +%endrep + +INIT_XMM +RESET_MM_PERMUTATION +%assign i 0 +%rep 6 +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2 +SWAP 0,1,2,3,4,5 +%assign i i+1 +%endrep + +%macro MC02 3 +cglobal_mc %1, %2, mc02, %3, 3,4,8 + PRELOAD_V + + sub r0, r2 +%assign j 0 +%rep %3 + %assign i (j % 6) + call v_filt%3_ %+ i %+ _10_%1.no_addr4 + OP_MOV [r0], m0 + SWAP 0,1,2,3,4,5 + %assign j j+1 +%endrep + ret +%endmacro + +MC MC02 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC01 3 +cglobal_mc %1, %2, mc01, %3, 3,5,8 + mov r4, r1 +.body + PRELOAD_V + + sub r4, r2 + sub r0, r2 +%assign j 0 +%rep %3 + %assign i (j % 6) + call v_filt%3_ %+ i %+ _10_%1 + movu m7, [r4] + pavgw m0, m7 + OP_MOV [r0], m0 + SWAP 0,1,2,3,4,5 + %assign j j+1 +%endrep + ret +%endmacro + +MC MC01 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC03 3 +cglobal_mc %1, %2, mc03, %3, 3,5,8 + lea r4, [r1+r2] + jmp stub_%2_h264_qpel%3_mc01_10_%1.body +%endmacro + +MC MC03 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro H_FILT_AVG 3-4 +h_filt%2_%3_10_%1: +;FILT_H with fewer registers and averaged with the FILT_V result +;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration +;unfortunately I need three registers, so m5 will have to be re-read from memory + movu m5, [r4-4] + ADDW m5, [r4+6], m7 + movu m6, [r4-2] + ADDW m6, [r4+4], m7 + paddw m5, [pw_16] + psubw m5, m6 ; a-b + psraw m5, 2 ; (a-b)/4 + psubw m5, m6 ; (a-b)/4-b + movu m6, [r4+0] + ADDW m6, [r4+2], m7 + paddw m5, m6 ; (a-b)/4-b+c + psraw m5, 2 ; ((a-b)/4-b+c)/4 + paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + psraw m5, 1 + CLIPW m5, [pb_0], [pw_pixel_max] +;avg FILT_V, FILT_H + pavgw m0, m5 +%if %0!=4 + movu m5, [r1+r5] +%endif + ret +%endmacro + +INIT_MMX +RESET_MM_PERMUTATION +%assign i 0 +%rep 3 +H_FILT_AVG mmxext, 4, i +SWAP 0,1,2,3,4,5 +%assign i i+1 +%endrep +H_FILT_AVG mmxext, 4, i, 0 + +INIT_XMM +RESET_MM_PERMUTATION +%assign i 0 +%rep 6 +%if i==1 +H_FILT_AVG sse2, 8, i, 0 +%else +H_FILT_AVG sse2, 8, i +%endif +SWAP 0,1,2,3,4,5 +%assign i i+1 +%endrep + +%macro MC11 3 +; this REALLY needs x86_64 +cglobal_mc %1, %2, mc11, %3, 3,6,8 + mov r4, r1 +.body + PRELOAD_V + + sub r0, r2 + sub r4, r2 + mov r5, r2 + neg r5 +%assign j 0 +%rep %3 + %assign i (j % 6) + call v_filt%3_ %+ i %+ _10_%1 + call h_filt%3_ %+ i %+ _10_%1 +%if %3==8 && i==1 + movu m5, [r1+r5] +%endif + OP_MOV [r0], m0 + SWAP 0,1,2,3,4,5 + %assign j j+1 +%endrep + ret +%endmacro + +MC MC11 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC31 3 +cglobal_mc %1, %2, mc31, %3, 3,6,8 + mov r4, r1 + add r1, 2 + jmp stub_%2_h264_qpel%3_mc11_10_%1.body +%endmacro + +MC MC31 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC13 3 +cglobal_mc %1, %2, mc13, %3, 3,7,12 + lea r4, [r1+r2] + jmp stub_%2_h264_qpel%3_mc11_10_%1.body +%endmacro + +MC MC13 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC33 3 +cglobal_mc %1, %2, mc33, %3, 3,6,8 + lea r4, [r1+r2] + add r1, 2 + jmp stub_%2_h264_qpel%3_mc11_10_%1.body +%endmacro + +MC MC33 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro FILT_H2 3 + psubw %1, %2 ; a-b + psubw %2, %3 ; b-c + psllw %2, 2 + psubw %1, %2 ; a-5*b+4*c + psllw %3, 4 + paddw %1, %3 ; a-5*b+20*c +%endmacro + +%macro FILT_VNRD 8 + movu %6, [r1] + paddw %1, %6 + mova %7, %2 + paddw %7, %5 + mova %8, %3 + paddw %8, %4 + FILT_H2 %1, %7, %8 +%endmacro + +%macro HV 2 +%ifidn %1,sse2 +%define PAD 12 +%define COUNT 2 +%else +%define PAD 0 +%define COUNT 3 +%endif +put_hv%2_10_%1: + neg r2 ; This actually saves instructions + lea r1, [r1+r2*2-mmsize+PAD] + lea r4, [rsp+PAD+gprsize] + mov r3d, COUNT +.v_loop: + movu m0, [r1] + sub r1, r2 + movu m1, [r1] + sub r1, r2 + movu m2, [r1] + sub r1, r2 + movu m3, [r1] + sub r1, r2 + movu m4, [r1] + sub r1, r2 +%assign i 0 +%rep %2-1 + FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 + psubw m0, [pad20] + movu [r4+i*mmsize*3], m0 + sub r1, r2 + SWAP 0,1,2,3,4,5 +%assign i i+1 +%endrep + FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 + psubw m0, [pad20] + movu [r4+i*mmsize*3], m0 + add r4, mmsize + lea r1, [r1+r2*8+mmsize] +%if %2==8 + lea r1, [r1+r2*4] +%endif + dec r3d + jg .v_loop + neg r2 + ret +%endmacro + +INIT_MMX +HV mmxext, 4 +INIT_XMM +HV sse2 , 8 + +%macro H_LOOP 2 +%if num_mmregs > 8 + %define s1 m8 + %define s2 m9 + %define s3 m10 + %define d1 m11 +%else + %define s1 [tap1] + %define s2 [tap2] + %define s3 [tap3] + %define d1 [depad] +%endif +h%2_loop_op_%1: + movu m1, [r1+mmsize-4] + movu m2, [r1+mmsize-2] + mova m3, [r1+mmsize+0] + movu m4, [r1+mmsize+2] + movu m5, [r1+mmsize+4] + movu m6, [r1+mmsize+6] +%if num_mmregs > 8 + pmaddwd m1, s1 + pmaddwd m2, s1 + pmaddwd m3, s2 + pmaddwd m4, s2 + pmaddwd m5, s3 + pmaddwd m6, s3 + paddd m1, d1 + paddd m2, d1 +%else + mova m0, s1 + pmaddwd m1, m0 + pmaddwd m2, m0 + mova m0, s2 + pmaddwd m3, m0 + pmaddwd m4, m0 + mova m0, s3 + pmaddwd m5, m0 + pmaddwd m6, m0 + mova m0, d1 + paddd m1, m0 + paddd m2, m0 +%endif + paddd m3, m5 + paddd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 10 + psrad m2, 10 + pslld m2, 16 + pand m1, [pd_0f] + por m1, m2 +%if num_mmregs <= 8 + pxor m0, m0 +%endif + CLIPW m1, m0, m7 + add r1, mmsize*3 + ret +%endmacro + +INIT_MMX +H_LOOP mmxext, 4 +INIT_XMM +H_LOOP sse2 , 8 + +%macro MC22 3 +cglobal_mc %1, %2, mc22, %3, 3,7,12 +%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) + mov r6, rsp ; backup stack pointer + and rsp, ~(mmsize-1) ; align stack + sub rsp, PAD + + call put_hv%3_10_%1 + + mov r3d, %3 + mova m7, [pw_pixel_max] +%if num_mmregs > 8 + pxor m0, m0 + mova m8, [tap1] + mova m9, [tap2] + mova m10, [tap3] + mova m11, [depad] +%endif + mov r1, rsp +.h_loop: + call h%3_loop_op_%1 + + OP_MOV [r0], m1 + add r0, r2 + dec r3d + jg .h_loop + + mov rsp, r6 ; restore stack pointer + ret +%endmacro + +MC MC22 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC12 3 +cglobal_mc %1, %2, mc12, %3, 3,7,12 +%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) + mov r6, rsp ; backup stack pointer + and rsp, ~(mmsize-1) ; align stack + sub rsp, PAD + + call put_hv%3_10_%1 + + xor r4d, r4d +.body + mov r3d, %3 + pxor m0, m0 + mova m7, [pw_pixel_max] +%if num_mmregs > 8 + mova m8, [tap1] + mova m9, [tap2] + mova m10, [tap3] + mova m11, [depad] +%endif + mov r1, rsp +.h_loop: + call h%3_loop_op_%1 + + movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc + paddw m3, [depad2] + psrlw m3, 5 + psubw m3, [unpad] + CLIPW m3, m0, m7 + pavgw m1, m3 + + OP_MOV [r0], m1 + add r0, r2 + dec r3d + jg .h_loop + + mov rsp, r6 ; restore stack pointer + ret +%endmacro + +MC MC12 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC32 3 +cglobal_mc %1, %2, mc32, %3, 3,7,12 +%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) + mov r6, rsp ; backup stack pointer + and rsp, ~(mmsize-1) ; align stack + sub rsp, PAD + + call put_hv%3_10_%1 + + mov r4d, 2 ; sizeof(pixel) + jmp stub_%2_h264_qpel%3_mc12_10_%1.body +%endmacro + +MC MC32 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro H_NRD 2 +put_h%2_10_%1: + add rsp, gprsize + mov r3d, %2 + xor r4d, r4d + mova m6, [pad20] +.nextrow + movu m2, [r5-4] + movu m3, [r5-2] + movu m4, [r5+0] + ADDW m2, [r5+6], m5 + ADDW m3, [r5+4], m5 + ADDW m4, [r5+2], m5 + + FILT_H2 m2, m3, m4 + psubw m2, m6 + mova [rsp+r4], m2 + add r4d, mmsize*3 + add r5, r2 + dec r3d + jg .nextrow + sub rsp, gprsize + ret +%endmacro + +INIT_MMX +H_NRD mmxext, 4 +INIT_XMM +H_NRD sse2 , 8 + +%macro MC21 3 +cglobal_mc %1, %2, mc21, %3, 3,7,12 + mov r5, r1 +.body +%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) + mov r6, rsp ; backup stack pointer + and rsp, ~(mmsize-1) ; align stack + + sub rsp, PAD + call put_h%3_10_%1 + + sub rsp, PAD + call put_hv%3_10_%1 + + mov r4d, PAD-mmsize ; H buffer + jmp stub_%2_h264_qpel%3_mc12_10_%1.body +%endmacro + +MC MC21 + +;----------------------------------------------------------------------------- +; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride) +;----------------------------------------------------------------------------- +%macro MC23 3 +cglobal_mc %1, %2, mc23, %3, 3,7,12 + lea r5, [r1+r2] + jmp stub_%2_h264_qpel%3_mc21_10_%1.body +%endmacro + +MC MC23 diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c index 066f794fb6..c313c0e079 100644 --- a/libavcodec/x86/h264_qpel_mmx.c +++ b/libavcodec/x86/h264_qpel_mmx.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt + * Copyright (c) 2011 Daniel Kang * * This file is part of Libav. * @@ -1199,3 +1200,100 @@ H264_MC_816(H264_MC_HV, sse2) H264_MC_816(H264_MC_H, ssse3) H264_MC_816(H264_MC_HV, ssse3) #endif + + + +//10bit +#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ +void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \ + (uint8_t *dst, uint8_t *src, int stride); + +#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) + +#define LUMA_MC_816(DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) + +LUMA_MC_ALL(10, mc00, mmxext) +LUMA_MC_ALL(10, mc10, mmxext) +LUMA_MC_ALL(10, mc20, mmxext) +LUMA_MC_ALL(10, mc30, mmxext) +LUMA_MC_ALL(10, mc01, mmxext) +LUMA_MC_ALL(10, mc11, mmxext) +LUMA_MC_ALL(10, mc21, mmxext) +LUMA_MC_ALL(10, mc31, mmxext) +LUMA_MC_ALL(10, mc02, mmxext) +LUMA_MC_ALL(10, mc12, mmxext) +LUMA_MC_ALL(10, mc22, mmxext) +LUMA_MC_ALL(10, mc32, mmxext) +LUMA_MC_ALL(10, mc03, mmxext) +LUMA_MC_ALL(10, mc13, mmxext) +LUMA_MC_ALL(10, mc23, mmxext) +LUMA_MC_ALL(10, mc33, mmxext) + +LUMA_MC_816(10, mc00, sse2) +LUMA_MC_816(10, mc10, sse2) +LUMA_MC_816(10, mc10, sse2_cache64) +LUMA_MC_816(10, mc10, ssse3_cache64) +LUMA_MC_816(10, mc20, sse2) +LUMA_MC_816(10, mc20, sse2_cache64) +LUMA_MC_816(10, mc20, ssse3_cache64) +LUMA_MC_816(10, mc30, sse2) +LUMA_MC_816(10, mc30, sse2_cache64) +LUMA_MC_816(10, mc30, ssse3_cache64) +LUMA_MC_816(10, mc01, sse2) +LUMA_MC_816(10, mc11, sse2) +LUMA_MC_816(10, mc21, sse2) +LUMA_MC_816(10, mc31, sse2) +LUMA_MC_816(10, mc02, sse2) +LUMA_MC_816(10, mc12, sse2) +LUMA_MC_816(10, mc22, sse2) +LUMA_MC_816(10, mc32, sse2) +LUMA_MC_816(10, mc03, sse2) +LUMA_MC_816(10, mc13, sse2) +LUMA_MC_816(10, mc23, sse2) +LUMA_MC_816(10, mc33, sse2) + +#define QPEL16_OPMC(OP, MC, MMX)\ +void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ + src += 8*stride;\ + dst += 8*stride;\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ +} + +#define QPEL16_OP(MC, MMX)\ +QPEL16_OPMC(put, MC, MMX)\ +QPEL16_OPMC(avg, MC, MMX) + +#define QPEL16(MMX)\ +QPEL16_OP(mc00, MMX)\ +QPEL16_OP(mc01, MMX)\ +QPEL16_OP(mc02, MMX)\ +QPEL16_OP(mc03, MMX)\ +QPEL16_OP(mc10, MMX)\ +QPEL16_OP(mc11, MMX)\ +QPEL16_OP(mc12, MMX)\ +QPEL16_OP(mc13, MMX)\ +QPEL16_OP(mc20, MMX)\ +QPEL16_OP(mc21, MMX)\ +QPEL16_OP(mc22, MMX)\ +QPEL16_OP(mc23, MMX)\ +QPEL16_OP(mc30, MMX)\ +QPEL16_OP(mc31, MMX)\ +QPEL16_OP(mc32, MMX)\ +QPEL16_OP(mc33, MMX) + +#if ARCH_X86_32 // ARCH_X86_64 implies sse2+ +QPEL16(mmxext) +#endif |