diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2010-09-01 20:56:16 +0000 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2010-09-01 20:56:16 +0000 |
commit | a33a2562c19aebee0aea1a4fb3c555e6d34b573f (patch) | |
tree | 57f3c8920a38ee88e6bf7bd68763fac46876fbdb /libavcodec/x86/h264_weight.asm | |
parent | 14bc1f24858a8e83a59dd61a88bdd2bc65993e2b (diff) | |
download | ffmpeg-a33a2562c19aebee0aea1a4fb3c555e6d34b573f.tar.gz |
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
biweight code to sse2/ssse3; add sse2 weight code; and use that same code to
create mmx2 functions also, so that the inline asm in h264dsp_mmx.c can be
removed. OK'ed by Jason on IRC.
Originally committed as revision 25019 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/h264_weight.asm')
-rw-r--r-- | libavcodec/x86/h264_weight.asm | 375 |
1 file changed, 375 insertions, 0 deletions
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm new file mode 100644 index 0000000000..864f708a0e --- /dev/null +++ b/libavcodec/x86/h264_weight.asm @@ -0,0 +1,375 @@ +;***************************************************************************** +;* SSE2-optimized weighted prediction code +;***************************************************************************** +;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; biweight pred: +; +; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, +; int log2_denom, int weightd, int weights, +; int offset); +; and +; void h264_weight_16x16_sse2(uint8_t *dst, int stride, +; int log2_denom, int weight, +; int offset); +;----------------------------------------------------------------------------- + +%macro WEIGHT_SETUP 0 + add r4, r4 + inc r4 + movd m3, r3 + movd m5, r4 + movd m6, r2 + pslld m5, m6 + psrld m5, 1 +%if mmsize == 16 + pshuflw m3, m3, 0 + pshuflw m5, m5, 0 + punpcklqdq m3, m3 + punpcklqdq m5, m5 +%else + pshufw m3, m3, 0 + pshufw m5, m5, 0 +%endif + pxor m7, m7 +%endmacro + +%macro WEIGHT_OP 2 + movh m0, [r0+%1] + movh m1, [r0+%2] + punpcklbw m0, m7 + punpcklbw m1, m7 + pmullw m0, m3 + pmullw m1, m3 + paddsw m0, m5 + paddsw m1, m5 + psraw m0, m6 + psraw m1, m6 + packuswb m0, m1 +%endmacro + +%macro WEIGHT_FUNC_DBL_MM 1 +cglobal h264_weight_16x%1_mmx2, 5, 5, 0 + WEIGHT_SETUP + mov r2, %1 +%if %1 == 16 +.nextrow + WEIGHT_OP 0, 4 + mova [r0 ], m0 + WEIGHT_OP 8, 12 + mova [r0+8], m0 + add r0, r1 + dec r2 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_weight_16x16_mmx2.nextrow +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_DBL_MM 16 +WEIGHT_FUNC_DBL_MM 8 + +%macro WEIGHT_FUNC_MM 4 +cglobal h264_weight_%1x%2_%4, 5, 5, %3 + WEIGHT_SETUP + mov r2, %2 +%if %2 == 16 +.nextrow + WEIGHT_OP 0, mmsize/2 + mova [r0], m0 + add r0, r1 + dec r2 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_weight_%1x16_%4.nextrow +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_MM 8, 16, 0, mmx2 +WEIGHT_FUNC_MM 8, 8, 0, mmx2 
+WEIGHT_FUNC_MM 8, 4, 0, mmx2 +INIT_XMM +WEIGHT_FUNC_MM 16, 16, 8, sse2 +WEIGHT_FUNC_MM 16, 8, 8, sse2 + +%macro WEIGHT_FUNC_HALF_MM 5 +cglobal h264_weight_%1x%2_%5, 5, 5, %4 + WEIGHT_SETUP + mov r2, %2/2 + lea r3, [r1*2] +%if %2 == mmsize +.nextrow + WEIGHT_OP 0, r1 + movh [r0], m0 +%if mmsize == 16 + movhps [r0+r1], m0 +%else + psrlq m0, 32 + movh [r0+r1], m0 +%endif + add r0, r3 + dec r2 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_weight_%1x%3_%5.nextrow +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +INIT_XMM +WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 + +%macro BIWEIGHT_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m3, r4 + movd m4, r5 + movd m5, r6 + movd m6, r3 + pslld m5, m6 + psrld m5, 1 +%if mmsize == 16 + pshuflw m3, m3, 0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m3, m3 + punpcklqdq m4, m4 + punpcklqdq m5, m5 +%else + pshufw m3, m3, 0 + pshufw m4, m4, 0 + pshufw m5, m5, 0 +%endif + pxor m7, m7 +%endmacro + +%macro BIWEIGHT_STEPA 3 + movh m%1, [r0+%3] + movh m%2, [r1+%3] + punpcklbw m%1, m7 + punpcklbw m%2, m7 + pmullw m%1, m3 + pmullw m%2, m4 + paddsw m%1, m%2 +%endmacro + +%macro BIWEIGHT_STEPB 0 + paddsw m0, m5 + paddsw m1, m5 + psraw m0, m6 + psraw m1, m6 + packuswb m0, m1 +%endmacro + +%macro BIWEIGHT_FUNC_DBL_MM 1 +cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 + BIWEIGHT_SETUP + mov r3, %1 +%if %1 == 16 +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, 4 + BIWEIGHT_STEPB + mova [r0], m0 + BIWEIGHT_STEPA 0, 1, 8 + BIWEIGHT_STEPA 1, 2, 12 + BIWEIGHT_STEPB + mova [r0+8], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_biweight_16x16_mmx2.nextrow +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_DBL_MM 16 +BIWEIGHT_FUNC_DBL_MM 8 + +%macro BIWEIGHT_FUNC_MM 4 +cglobal h264_biweight_%1x%2_%4, 7, 7, %3 + BIWEIGHT_SETUP + mov r3, %2 
+%if %2 == 16 +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, mmsize/2 + BIWEIGHT_STEPB + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_biweight_%1x16_%4.nextrow +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 +INIT_XMM +BIWEIGHT_FUNC_MM 16, 16, 8, sse2 +BIWEIGHT_FUNC_MM 16, 8, 8, sse2 + +%macro BIWEIGHT_FUNC_HALF_MM 5 +cglobal h264_biweight_%1x%2_%5, 7, 7, %4 + BIWEIGHT_SETUP + mov r3, %2/2 + lea r4, [r2*2] +%if %2 == mmsize +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, r2 + BIWEIGHT_STEPB + movh [r0], m0 +%if mmsize == 16 + movhps [r0+r2], m0 +%else + psrlq m0, 32 + movh [r0+r2], m0 +%endif + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_biweight_%1x%3_%5.nextrow +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +INIT_XMM +BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 + +%macro BIWEIGHT_SSSE3_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m4, r4 + movd m0, r5 + movd m5, r6 + movd m6, r3 + pslld m5, m6 + psrld m5, 1 + punpcklbw m4, m0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m4, m4 + punpcklqdq m5, m5 +%endmacro + +%macro BIWEIGHT_SSSE3_OP 0 + pmaddubsw m0, m4 + pmaddubsw m2, m4 + paddsw m0, m5 + paddsw m2, m5 + psraw m0, m6 + psraw m2, m6 + packuswb m0, m2 +%endmacro + +%macro BIWEIGHT_SSSE3_16 1 +cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, %1 + +%if %1 == 16 +.nextrow + movh m0, [r0] + movh m2, [r0+8] + movh m3, [r1+8] + punpcklbw m0, [r1] + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_biweight_16x16_ssse3.nextrow +%endif +%endmacro + +INIT_XMM +BIWEIGHT_SSSE3_16 16 
+BIWEIGHT_SSSE3_16 8 + +%macro BIWEIGHT_SSSE3_8 1 +cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, %1/2 + lea r4, [r2*2] + +%if %1 == 16 +.nextrow + movh m0, [r0] + movh m1, [r1] + movh m2, [r0+r2] + movh m3, [r1+r2] + punpcklbw m0, m1 + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + movh [r0], m0 + movhps [r0+r2], m0 + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET +%else + jmp _ff_h264_biweight_8x16_ssse3.nextrow +%endif +%endmacro + +INIT_XMM +BIWEIGHT_SSSE3_8 16 +BIWEIGHT_SSSE3_8 8 +BIWEIGHT_SSSE3_8 4 |