author | Daniel Kang <daniel.d.kang@gmail.com> | 2012-10-13 10:04:50 -0500
committer | Diego Biurrun <diego@biurrun.de> | 2012-11-25 20:38:35 +0100
commit | 610e00b3594bf0f2a75713f20e9c4edf0d03a818 (patch)
tree | 695d331acaafefb6832caf9b421a581573f129d4 /libavcodec/x86/h264_qpel_8bit.asm
parent | ad01ba6ceaea7d71c4b9887795523438689b5a96 (diff)
download | ffmpeg-610e00b3594bf0f2a75713f20e9c4edf0d03a818.tar.gz
x86: h264: Convert 8-bit QPEL inline assembly to YASM
Signed-off-by: Diego Biurrun <diego@biurrun.de>
Diffstat (limited to 'libavcodec/x86/h264_qpel_8bit.asm')
-rw-r--r-- | libavcodec/x86/h264_qpel_8bit.asm | 862
1 files changed, 862 insertions, 0 deletions
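Every routine in this file is a variant of the same H.264 luma 6-tap lowpass (half-pel) filter, with taps (1, -5, 20, 20, -5, 1) and rounding before the final shift by 5. As a reading aid, here is a minimal scalar C sketch of the horizontal case; it is not part of the commit, and the function name is made up for illustration:

```c
#include <stdint.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* Reference for what the SIMD loops below compute per output pixel:
 * out = clip((src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3] + 16) >> 5) */
static void h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                          int dst_stride, int src_stride,
                          int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = (src[x - 2] + src[x + 3])
                  -  5 * (src[x - 1] + src[x + 2])
                  + 20 * (src[x]     + src[x + 1]);
            dst[x] = clip_u8((v + 16) >> 5);
        }
        dst += dst_stride;
        src += src_stride;
    }
}
```

In the assembly, pw_5 and pw_16 are the filter multiplier and the rounding constant: the (b+e) sum is subtracted from 4*(c+d) and the result multiplied by pw_5, yielding 20*(c+d) - 5*(b+e) with a single pmullw, while psraw by 5 plus packuswb performs the final shift and clip. The "_l2" variants additionally average the filtered result with a second prediction (pavgb with src2), and the "hv" variants keep 16-bit intermediates in a tmp buffer between the vertical and horizontal passes.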
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
new file mode 100644
index 0000000000..eef07157c9
--- /dev/null
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -0,0 +1,862 @@
+;*****************************************************************************
+;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
+;*****************************************************************************
+;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+;* Copyright (C) 2012 Daniel Kang
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pw_16
+cextern pw_5
+cextern pb_0
+
+SECTION .text
+
+
+%macro op_avgh 3
+    movh   %3, %2
+    pavgb  %1, %3
+    movh   %2, %1
+%endmacro
+
+%macro op_avg 2-3
+    pavgb  %1, %2
+    mova   %2, %1
+%endmacro
+
+%macro op_puth 2-3
+    movh   %2, %1
+%endmacro
+
+%macro op_put 2-3
+    mova   %2, %1
+%endmacro
+
+%macro QPEL4_H_LOWPASS_OP 1
+cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    pxor      m7, m7
+    mova      m4, [pw_5]
+    mova      m5, [pw_16]
+    mov      r4d, 4
+.loop:
+    movh      m1, [r1-1]
+    movh      m2, [r1+0]
+    movh      m3, [r1+1]
+    movh      m0, [r1+2]
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m0, m7
+    paddw     m1, m0
+    paddw     m2, m3
+    movh      m0, [r1-2]
+    movh      m3, [r1+3]
+    punpcklbw m0, m7
+    punpcklbw m3, m7
+    paddw     m0, m3
+    psllw     m2, 2
+    psubw     m2, m1
+    pmullw    m2, m4
+    paddw     m0, m5
+    paddw     m0, m2
+    psraw     m0, 5
+    packuswb  m0, m0
+    op_%1h    m0, [r0], m6
+    add       r0, r2
+    add       r1, r3
+    dec      r4d
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_H_LOWPASS_OP put
+QPEL4_H_LOWPASS_OP avg
+
+%macro QPEL8_H_LOWPASS_OP 1
+cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    mov      r4d, 8
+    pxor      m7, m7
+    mova      m6, [pw_5]
+.loop:
+    mova      m0, [r1]
+    mova      m2, [r1+1]
+    mova      m1, m0
+    mova      m3, m2
+    punpcklbw m0, m7
+    punpckhbw m1, m7
+    punpcklbw m2, m7
+    punpckhbw m3, m7
+    paddw     m0, m2
+    paddw     m1, m3
+    psllw     m0, 2
+    psllw     m1, 2
+    mova      m2, [r1-1]
+    mova      m4, [r1+2]
+    mova      m3, m2
+    mova      m5, m4
+    punpcklbw m2, m7
+    punpckhbw m3, m7
+    punpcklbw m4, m7
+    punpckhbw m5, m7
+    paddw     m2, m4
+    paddw     m5, m3
+    psubw     m0, m2
+    psubw     m1, m5
+    pmullw    m0, m6
+    pmullw    m1, m6
+    movd      m2, [r1-2]
+    movd      m5, [r1+7]
+    punpcklbw m2, m7
+    punpcklbw m5, m7
+    paddw     m2, m3
+    paddw     m4, m5
+    mova      m5, [pw_16]
+    paddw     m2, m5
+    paddw     m4, m5
+    paddw     m0, m2
+    paddw     m1, m4
+    psraw     m0, 5
+    psraw     m1, 5
+    packuswb  m0, m1
+    op_%1     m0, [r0], m4
+    add       r0, r2
+    add       r1, r3
+    dec      r4d
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8_H_LOWPASS_OP put
+QPEL8_H_LOWPASS_OP avg
+
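+; The SSSE3 variant below loads the source window once with an unaligned load
+; and builds the five shifted tap windows with palignr, instead of the six
+; separate loads used by the MMXEXT versions above.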
+%macro QPEL8_H_LOWPASS_OP_XMM 1
+cglobal %1_h264_qpel8_h_lowpass, 4,5,7 ; dst, src, dstStride, srcStride
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    mov      r4d, 8
+    pxor      m7, m7
+    mova      m6, [pw_5]
+.loop:
+    movu      m1, [r1-2]
+    mova      m0, m1
+    punpckhbw m1, m7
+    punpcklbw m0, m7
+    mova      m2, m1
+    mova      m3, m1
+    mova      m4, m1
+    mova      m5, m1
+    palignr   m4, m0, 2
+    palignr   m3, m0, 4
+    palignr   m2, m0, 6
+    palignr   m1, m0, 8
+    palignr   m5, m0, 10
+    paddw     m0, m5
+    paddw     m2, m3
+    paddw     m1, m4
+    psllw     m2, 2
+    psubw     m2, m1
+    paddw     m0, [pw_16]
+    pmullw    m2, m6
+    paddw     m2, m0
+    psraw     m2, 5
+    packuswb  m2, m2
+    op_%1h    m2, [r0], m4
+    add       r1, r3
+    add       r0, r2
+    dec      r4d
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8_H_LOWPASS_OP_XMM put
+QPEL8_H_LOWPASS_OP_XMM avg
+
+
+%macro QPEL4_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
+    pxor      m7, m7
+    mova      m4, [pw_5]
+    mova      m5, [pw_16]
+    mov      r5d, 4
+.loop:
+    movh      m1, [r1-1]
+    movh      m2, [r1+0]
+    movh      m3, [r1+1]
+    movh      m0, [r1+2]
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m0, m7
+    paddw     m1, m0
+    paddw     m2, m3
+    movh      m0, [r1-2]
+    movh      m3, [r1+3]
+    punpcklbw m0, m7
+    punpcklbw m3, m7
+    paddw     m0, m3
+    psllw     m2, 2
+    psubw     m2, m1
+    pmullw    m2, m4
+    paddw     m0, m5
+    paddw     m0, m2
+    movh      m3, [r2]
+    psraw     m0, 5
+    packuswb  m0, m0
+    pavgb     m0, m3
+    op_%1h    m0, [r0], m6
+    add       r0, r3
+    add       r1, r3
+    add       r2, r4
+    dec      r5d
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_H_LOWPASS_L2_OP put
+QPEL4_H_LOWPASS_L2_OP avg
+
+
+%macro QPEL8_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
+    mov      r5d, 8
+    pxor      m7, m7
+    mova      m6, [pw_5]
+.loop:
+    mova      m0, [r1]
+    mova      m2, [r1+1]
+    mova      m1, m0
+    mova      m3, m2
+    punpcklbw m0, m7
+    punpckhbw m1, m7
+    punpcklbw m2, m7
+    punpckhbw m3, m7
+    paddw     m0, m2
+    paddw     m1, m3
+    psllw     m0, 2
+    psllw     m1, 2
+    mova      m2, [r1-1]
+    mova      m4, [r1+2]
+    mova      m3, m2
+    mova      m5, m4
+    punpcklbw m2, m7
+    punpckhbw m3, m7
+    punpcklbw m4, m7
+    punpckhbw m5, m7
+    paddw     m2, m4
+    paddw     m5, m3
+    psubw     m0, m2
+    psubw     m1, m5
+    pmullw    m0, m6
+    pmullw    m1, m6
+    movd      m2, [r1-2]
+    movd      m5, [r1+7]
+    punpcklbw m2, m7
+    punpcklbw m5, m7
+    paddw     m2, m3
+    paddw     m4, m5
+    mova      m5, [pw_16]
+    paddw     m2, m5
+    paddw     m4, m5
+    paddw     m0, m2
+    paddw     m1, m4
+    psraw     m0, 5
+    psraw     m1, 5
+    mova      m4, [r2]
+    packuswb  m0, m1
+    pavgb     m0, m4
+    op_%1     m0, [r0], m4
+    add       r0, r3
+    add       r1, r3
+    add       r2, r4
+    dec      r5d
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8_H_LOWPASS_L2_OP put
+QPEL8_H_LOWPASS_L2_OP avg
+
+
+%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
+cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,7 ; dst, src, src2, dstStride, src2Stride
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
+    mov      r5d, 8
+    pxor      m7, m7
+    mova      m6, [pw_5]
+.loop:
+    lddqu     m1, [r1-2]
+    mova      m0, m1
+    punpckhbw m1, m7
+    punpcklbw m0, m7
+    mova      m2, m1
+    mova      m3, m1
+    mova      m4, m1
+    mova      m5, m1
+    palignr   m4, m0, 2
+    palignr   m3, m0, 4
+    palignr   m2, m0, 6
+    palignr   m1, m0, 8
+    palignr   m5, m0, 10
+    paddw     m0, m5
+    paddw     m2, m3
+    paddw     m1, m4
+    psllw     m2, 2
+    movh      m3, [r2]
+    psubw     m2, m1
+    paddw     m0, [pw_16]
+    pmullw    m2, m6
+    paddw     m2, m0
+    psraw     m2, 5
+    packuswb  m2, m2
+    pavgb     m2, m3
+    op_%1h    m2, [r0], m4
+    add       r1, r3
+    add       r0, r3
+    add       r2, r4
+    dec      r5d
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8_H_LOWPASS_L2_OP_XMM put
+QPEL8_H_LOWPASS_L2_OP_XMM avg
+
+
+; All functions that call this are required to have function arguments of
+; dst, src, dstStride, srcStride
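+; FILT_V emits one output row of the vertical 6-tap filter: with rows in
+; m0-m4 and a sixth row loaded from [r1], it computes
+; (20*(m2+m3) - 5*(m1+m4) + m0 + m5 + 16) >> 5, stores it through op_%1h and
+; rotates the row registers with SWAP so the next call reuses five of them.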
+%macro FILT_V 1
+    mova      m6, m2
+    movh      m5, [r1]
+    paddw     m6, m3
+    psllw     m6, 2
+    psubw     m6, m1
+    psubw     m6, m4
+    punpcklbw m5, m7
+    pmullw    m6, [pw_5]
+    paddw     m0, [pw_16]
+    add       r1, r3
+    paddw     m0, m5
+    paddw     m6, m0
+    psraw     m6, 5
+    packuswb  m6, m6
+    op_%1h    m6, [r0], m0 ; 1
+    add       r0, r2
+    SWAP 0, 1, 2, 3, 4, 5
+%endmacro
+
+%macro QPEL4_V_LOWPASS_OP 1
+cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    sub       r1, r3
+    sub       r1, r3
+    pxor      m7, m7
+    movh      m0, [r1]
+    movh      m1, [r1+r3]
+    lea       r1, [r1+2*r3]
+    movh      m2, [r1]
+    movh      m3, [r1+r3]
+    lea       r1, [r1+2*r3]
+    movh      m4, [r1]
+    add       r1, r3
+    punpcklbw m0, m7
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_V_LOWPASS_OP put
+QPEL4_V_LOWPASS_OP avg
+
+
+%macro QPEL8OR16_V_LOWPASS_OP 1
+%if cpuflag(sse2)
+cglobal %1_h264_qpel8or16_v_lowpass, 5,5,7 ; dst, src, dstStride, srcStride, h
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    sub       r1, r3
+    sub       r1, r3
+%else
+cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,7 ; dst, src, dstStride, srcStride, h
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+%endif
+    pxor      m7, m7
+    movh      m0, [r1]
+    movh      m1, [r1+r3]
+    lea       r1, [r1+2*r3]
+    movh      m2, [r1]
+    movh      m3, [r1+r3]
+    lea       r1, [r1+2*r3]
+    movh      m4, [r1]
+    add       r1, r3
+    punpcklbw m0, m7
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    cmp      r4d, 16
+    jne .end
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+    FILT_V %1
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_V_LOWPASS_OP put
+QPEL8OR16_V_LOWPASS_OP avg
+
+INIT_XMM sse2
+QPEL8OR16_V_LOWPASS_OP put
+QPEL8OR16_V_LOWPASS_OP avg
+
+
+; All functions that use this are required to have args:
+; src, tmp, srcSize
+%macro FILT_HV 1 ; offset
+    mova      m6, m2
+    movh      m5, [r0]
+    paddw     m6, m3
+    psllw     m6, 2
+    paddw     m0, [pw_16]
+    psubw     m6, m1
+    psubw     m6, m4
+    punpcklbw m5, m7
+    pmullw    m6, [pw_5]
+    paddw     m0, m5
+    add       r0, r2
+    paddw     m6, m0
+    mova      [r1+%1], m6
+    SWAP 0, 1, 2, 3, 4, 5
+%endmacro
+
+%macro QPEL4_HV1_LOWPASS_OP 1
+cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
+    movsxdifnidn r2, r2d
+    pxor      m7, m7
+    movh      m0, [r0]
+    movh      m1, [r0+r2]
+    lea       r0, [r0+2*r2]
+    movh      m2, [r0]
+    movh      m3, [r0+r2]
+    lea       r0, [r0+2*r2]
+    movh      m4, [r0]
+    add       r0, r2
+    punpcklbw m0, m7
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+    FILT_HV 0*24
+    FILT_HV 1*24
+    FILT_HV 2*24
+    FILT_HV 3*24
+    RET
+
+cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
+    movsxdifnidn r2, r2d
+    mov      r3d, 4
+.loop:
+    mova      m0, [r0]
+    paddw     m0, [r0+10]
+    mova      m1, [r0+2]
+    paddw     m1, [r0+8]
+    mova      m2, [r0+4]
+    paddw     m2, [r0+6]
+    psubw     m0, m1
+    psraw     m0, 2
+    psubw     m0, m1
+    paddsw    m0, m2
+    psraw     m0, 2
+    paddw     m0, m2
+    psraw     m0, 6
+    packuswb  m0, m0
+    op_%1h    m0, [r1], m7
+    add       r0, 24
+    add       r1, r2
+    dec      r3d
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_HV1_LOWPASS_OP put
+QPEL4_HV1_LOWPASS_OP avg
+
+%macro QPEL8OR16_HV1_LOWPASS_OP 1
+cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,7 ; src, tmp, srcStride, size
+    movsxdifnidn r2, r2d
+    pxor      m7, m7
+    movh      m0, [r0]
+    movh      m1, [r0+r2]
+    lea       r0, [r0+2*r2]
+    movh      m2, [r0]
+    movh      m3, [r0+r2]
+    lea       r0, [r0+2*r2]
+    movh      m4, [r0]
+    add       r0, r2
+    punpcklbw m0, m7
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
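+    ; first (vertical) pass of the 8/16-wide HV filter: each FILT_HV call
+    ; writes one row of 16-bit intermediate sums to tmp, 48 bytes per row.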
+    FILT_HV 0*48
+    FILT_HV 1*48
+    FILT_HV 2*48
+    FILT_HV 3*48
+    FILT_HV 4*48
+    FILT_HV 5*48
+    FILT_HV 6*48
+    FILT_HV 7*48
+    cmp      r3d, 16
+    jne .end
+    FILT_HV 8*48
+    FILT_HV 9*48
+    FILT_HV 10*48
+    FILT_HV 11*48
+    FILT_HV 12*48
+    FILT_HV 13*48
+    FILT_HV 14*48
+    FILT_HV 15*48
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_HV1_LOWPASS_OP put
+QPEL8OR16_HV1_LOWPASS_OP avg
+
+INIT_XMM sse2
+QPEL8OR16_HV1_LOWPASS_OP put
+
+
+%macro QPEL8OR16_HV2_LOWPASS_OP 1
+; unused is to match ssse3 and mmxext args
+cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
+    movsxdifnidn r2, r2d
+.loop:
+    mova      m0, [r1]
+    mova      m3, [r1+8]
+    mova      m1, [r1+2]
+    mova      m4, [r1+10]
+    paddw     m0, m4
+    paddw     m1, m3
+    paddw     m3, [r1+18]
+    paddw     m4, [r1+16]
+    mova      m2, [r1+4]
+    mova      m5, [r1+12]
+    paddw     m2, [r1+6]
+    paddw     m5, [r1+14]
+    psubw     m0, m1
+    psubw     m3, m4
+    psraw     m0, 2
+    psraw     m3, 2
+    psubw     m0, m1
+    psubw     m3, m4
+    paddsw    m0, m2
+    paddsw    m3, m5
+    psraw     m0, 2
+    psraw     m3, 2
+    paddw     m0, m2
+    paddw     m3, m5
+    psraw     m0, 6
+    psraw     m3, 6
+    packuswb  m0, m3
+    op_%1     m0, [r0], m7
+    add       r1, 48
+    add       r0, r2
+    dec      r4d
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_HV2_LOWPASS_OP put
+QPEL8OR16_HV2_LOWPASS_OP avg
+
+%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
+cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,7 ; dst, tmp, dstStride, tmpStride, size
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    cmp      r4d, 16
+    je .op16
+.loop8:
+    mova      m1, [r1+16]
+    mova      m0, [r1]
+    mova      m2, m1
+    mova      m3, m1
+    mova      m4, m1
+    mova      m5, m1
+    palignr   m5, m0, 10
+    palignr   m4, m0, 8
+    palignr   m3, m0, 6
+    palignr   m2, m0, 4
+    palignr   m1, m0, 2
+    paddw     m0, m5
+    paddw     m1, m4
+    paddw     m2, m3
+    psubw     m0, m1
+    psraw     m0, 2
+    psubw     m0, m1
+    paddw     m0, m2
+    psraw     m0, 2
+    paddw     m0, m2
+    psraw     m0, 6
+    packuswb  m0, m0
+    op_%1h    m0, [r0], m7
+    add       r1, 48
+    add       r0, r2
+    dec      r4d
+    jne .loop8
+    jmp .done
+.op16:
+    mova      m4, [r1+32]
+    mova      m5, [r1+16]
+    mova      m7, [r1]
+    mova      m3, m4
+    mova      m2, m4
+    mova      m1, m4
+    mova      m0, m4
+    palignr   m0, m5, 10
+    palignr   m1, m5, 8
+    palignr   m2, m5, 6
+    palignr   m3, m5, 4
+    palignr   m4, m5, 2
+    paddw     m0, m5
+    paddw     m1, m4
+    paddw     m2, m3
+    mova      m6, m5
+    mova      m4, m5
+    mova      m3, m5
+    palignr   m4, m7, 8
+    palignr   m6, m7, 2
+    palignr   m3, m7, 10
+    paddw     m4, m6
+    mova      m6, m5
+    palignr   m5, m7, 6
+    palignr   m6, m7, 4
+    paddw     m3, m7
+    paddw     m5, m6
+    psubw     m0, m1
+    psubw     m3, m4
+    psraw     m0, 2
+    psraw     m3, 2
+    psubw     m0, m1
+    psubw     m3, m4
+    paddw     m0, m2
+    paddw     m3, m5
+    psraw     m0, 2
+    psraw     m3, 2
+    paddw     m0, m2
+    paddw     m3, m5
+    psraw     m0, 6
+    psraw     m3, 6
+    packuswb  m3, m0
+    op_%1     m3, [r0], m7
+    add       r1, 48
+    add       r0, r2
+    dec      r4d
+    jne .op16
+.done:
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8OR16_HV2_LOWPASS_OP_XMM put
+QPEL8OR16_HV2_LOWPASS_OP_XMM avg
+
+
+%macro PIXELS4_L2_SHIFT5 1
+cglobal %1_pixels4_l2_shift5, 6,6 ; dst, src16, src8, dstStride, src8Stride, h
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
+    mova      m0, [r1]
+    mova      m1, [r1+24]
+    psraw     m0, 5
+    psraw     m1, 5
+    packuswb  m0, m0
+    packuswb  m1, m1
+    pavgb     m0, [r2]
+    pavgb     m1, [r2+r4]
+    op_%1h    m0, [r0], m4
+    op_%1h    m1, [r0+r3], m5
+    lea       r2, [r2+r4*2]
+    lea       r0, [r0+r3*2]
+    mova      m0, [r1+48]
+    mova      m1, [r1+72]
+    psraw     m0, 5
+    psraw     m1, 5
+    packuswb  m0, m0
+    packuswb  m1, m1
+    pavgb     m0, [r2]
+    pavgb     m1, [r2+r4]
+    op_%1h    m0, [r0], m4
+    op_%1h    m1, [r0+r3], m5
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS4_L2_SHIFT5 put
+PIXELS4_L2_SHIFT5 avg
+
+
+%macro PIXELS8_L2_SHIFT5 1
+cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
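+    ; src16 holds 16-bit intermediate sums (rows 48 bytes apart) that still
+    ; need the final shift: scale down, pack to bytes and average with the
+    ; 8-bit src2 rows before storing.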
+.loop:
+    mova      m0, [r1]
+    mova      m1, [r1+8]
+    mova      m2, [r1+48]
+    mova      m3, [r1+48+8]
+    psraw     m0, 5
+    psraw     m1, 5
+    psraw     m2, 5
+    psraw     m3, 5
+    packuswb  m0, m1
+    packuswb  m2, m3
+    pavgb     m0, [r2]
+    pavgb     m2, [r2+r4]
+    op_%1     m0, [r0], m4
+    op_%1     m2, [r0+r3], m5
+    lea       r2, [r2+2*r4]
+    add       r1, 48*2
+    lea       r0, [r0+2*r3]
+    sub      r5d, 2
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS8_L2_SHIFT5 put
+PIXELS8_L2_SHIFT5 avg
+
+
+%if ARCH_X86_64
+%macro QPEL16_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
+    mov      r5d, 16
+    pxor     m15, m15
+    mova     m14, [pw_5]
+    mova     m13, [pw_16]
+.loop:
+    lddqu     m1, [r1+6]
+    lddqu     m7, [r1-2]
+    mova      m0, m1
+    punpckhbw m1, m15
+    punpcklbw m0, m15
+    punpcklbw m7, m15
+    mova      m2, m1
+    mova      m6, m0
+    mova      m3, m1
+    mova      m8, m0
+    mova      m4, m1
+    mova      m9, m0
+    mova     m12, m0
+    mova     m11, m1
+    palignr  m11, m0, 10
+    palignr  m12, m7, 10
+    palignr   m4, m0, 2
+    palignr   m9, m7, 2
+    palignr   m3, m0, 4
+    palignr   m8, m7, 4
+    palignr   m2, m0, 6
+    palignr   m6, m7, 6
+    paddw    m11, m0
+    palignr   m1, m0, 8
+    palignr   m0, m7, 8
+    paddw     m7, m12
+    paddw     m2, m3
+    paddw     m6, m8
+    paddw     m1, m4
+    paddw     m0, m9
+    psllw     m2, 2
+    psllw     m6, 2
+    psubw     m2, m1
+    psubw     m6, m0
+    paddw    m11, m13
+    paddw     m7, m13
+    pmullw    m2, m14
+    pmullw    m6, m14
+    lddqu     m3, [r2]
+    paddw     m2, m11
+    paddw     m6, m7
+    psraw     m2, 5
+    psraw     m6, 5
+    packuswb  m6, m2
+    pavgb     m6, m3
+    op_%1     m6, [r0], m11
+    add       r1, r3
+    add       r0, r3
+    add       r2, r4
+    dec      r5d
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL16_H_LOWPASS_L2_OP put
+QPEL16_H_LOWPASS_L2_OP avg
+%endif