;******************************************************************************
;* MMX-optimized H.263 loop filter
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength
SECTION .text
;------------------------------------------------------------------------------
; H263_LOOP_FILTER p0, p1, p2, p3, strength2
;
; Filters one register-width of pixels across a block edge. p0..p3 are four
; consecutive lines of pixels; the edge lies between p1 and p2.
;
; In:
;   %1..%4  address expressions (no brackets) of p0, p1, p2, p3. They are only
;           read by this macro, never written.
;   %5      32-bit register whose low byte is broadcast to every byte lane and
;           used as the filter limit S. Both callers in this file pass
;           2 * h263_loop_filter_strength[qscale].
;
; Per byte lane the code below computes:
;   d   = (p0 - p3 + 4*(p2 - p1)) / 8          (magnitude shifted, sign kept)
;   a1  = max(0, S - |d|) - max(0, S - 2*|d|)  (ramp up, then back down to 0)
;   p1' = p1 + sign(d)*a1   (unsigned-saturated)
;   p2' = p2 - sign(d)*a1   (unsigned-saturated)
;   d2  = sign(p0 - p3) * (min(|sat8(p0 - p3)|, satu8(2*a1)) >> 2)
;   p0' = p0 - d2           (wrapping byte arithmetic)
;   p3' = p3 + d2           (wrapping byte arithmetic)
;
; Out:
;   m3 = p1', m4 = p2', m5 = p0', m6 = p3'
; Clobbers: m0-m7 (m0, m1, m2, m7 hold intermediates on exit).
;------------------------------------------------------------------------------
%macro H263_LOOP_FILTER 5
; m7 = 0, used as the zero source for byte -> word unpacking
pxor m7, m7
; m0:m1 = p0 - p3 as 16-bit words (low half : high half of the register)
mova m0, [%1]
mova m1, [%1]
mova m2, [%4]
mova m3, [%4]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
; m4:m5 = 4*(p2 - p1) + (p0 - p3) as 16-bit words
mova m2, [%2]
mova m3, [%2]
mova m4, [%3]
mova m5, [%3]
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
psubw m4, m2
psubw m5, m3
psllw m4, 2
psllw m5, 2
paddw m4, m0
paddw m5, m1
; m6:m7 = per-word sign masks (all ones where the sum is negative);
; m7 is still zero here, so it can be the comparison source for the high half
pxor m6, m6
pcmpgtw m6, m4
pcmpgtw m7, m5
; absolute value via (x ^ mask) - mask, then >> 3 gives |d|
pxor m4, m6
pxor m5, m7
psubw m4, m6
psubw m5, m7
psrlw m4, 3
psrlw m5, 3
; m4 = |d| as unsigned bytes, m6 = sign(d) as byte masks (0x00 / 0xFF)
packuswb m4, m5
packsswb m6, m7
; m7 is needed as zero again further down
pxor m7, m7
; broadcast the low byte of %5 to every byte lane of m2 (S)
movd m2, %5
punpcklbw m2, m2
punpcklbw m2, m2
punpcklbw m2, m2
; m2 = a1 = satsub(S, |d|) - satsub(satsub(S, |d|), |d|)
psubusb m2, m4
mova m3, m2
psubusb m3, m4
psubb m2, m3
; Apply a1 to p1 (add) and p2 (subtract) in the direction of d: where d is
; negative the pixels are bit-inverted before and after, which turns the
; saturating add into a saturating subtract and vice versa.
mova m3, [%2]
mova m4, [%3]
pxor m3, m6
pxor m4, m6
paddusb m3, m2
psubusb m4, m2
pxor m3, m6
pxor m4, m6
; m2 = 2*a1 (unsigned-saturated): upper bound for the p0/p3 correction
paddusb m2, m2
; m0 = p0 - p3 saturated to signed bytes; m7 = its sign mask; m0 = magnitude
packsswb m0, m1
pcmpgtb m7, m0
pxor m0, m7
psubb m0, m7
; m1 = min(|p0 - p3|, 2*a1), computed as x - satsub(x, limit)
mova m1, m0
psubusb m0, m2
psubb m1, m0
; Per-byte >> 2 using a 16-bit shift: the mask clears the two low bits of
; each byte first so nothing leaks into the neighbouring byte.
; NOTE(review): assumes pb_FC is 0xFC in every byte - confirm at its definition.
pand m1, [pb_FC]
psrlw m1, 2
; restore the sign of (p0 - p3): m1 = d2
pxor m1, m7
psubb m1, m7
; m5 = p0 - d2, m6 = p3 + d2
mova m5, [%1]
mova m6, [%4]
psubb m5, m1
paddb m6, m1
%endmacro
INIT_MMX mmx
;------------------------------------------------------------------------------
; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
;
; Filters across a horizontal block edge: the four rows src - 2*stride,
; src - stride, src and src + stride are read, filtered by H263_LOOP_FILTER
; and written back, one MMX register of pixels per row.
;
; In:   r0 = src, r1 = stride, r2 = qscale
; Uses: r3, r4 as scratch / row pointers, m0-m7 via H263_LOOP_FILTER
;------------------------------------------------------------------------------
cglobal h263_v_loop_filter, 3,5
    ; widen the two int arguments to pointer width
    movsxdifnidn    r2, r2d
    movsxdifnidn    r1, r1d

    ; r2 = 2 * h263_loop_filter_strength[qscale]
    ; (only the low byte of r2d is consumed by H263_LOOP_FILTER)
    lea             r4, [h263_loop_filter_strength]
    movzx           r3d, BYTE [r4+r2]
    movsx           r2, r3b
    shl             r2, 1

    ; r3 = src - stride, r4 = src - 2*stride
    mov             r3, r0
    sub             r3, r1
    mov             r4, r3
    sub             r4, r1

    ; rows in p0, p1, p2, p3 order
    H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d

    ; write back: m3 = p1', m4 = p2', m5 = p0', m6 = p3'
    mova            [r3], m3
    mova            [r0], m4
    mova            [r4], m5
    mova            [r0+r1], m6
    RET
;------------------------------------------------------------------------------
; TRANSPOSE4X4 src, dst
;
; Reads a 4x4 block of bytes (four rows of 4 bytes each) starting at src and
; stores its four columns as 4-byte groups at dst+0, dst+8, dst+16, dst+24.
;
; In:   %1 = address of the first row, %2 = destination base address
;       r1 = row stride and r3 = 3 * row stride, taken from the caller's scope
; Clobbers: m0-m3
;------------------------------------------------------------------------------
%macro TRANSPOSE4X4 2
    ; fetch rows a, b, c, d (4 bytes each)
    movd            m0, [%1]
    movd            m1, [%1+r1]
    movd            m2, [%1+r1*2]
    movd            m3, [%1+r3]

    ; byte interleave: m0 = a0 b0 a1 b1 a2 b2 a3 b3, m2 = c0 d0 c1 d1 ...
    punpcklbw       m0, m1
    punpcklbw       m2, m3

    ; word interleave: m0 = column 0 | column 1, m1 = column 2 | column 3
    mova            m1, m0
    punpcklwd       m0, m2
    punpckhwd       m1, m2

    ; low dwords hold columns 0 and 2
    movd            [%2+ 0], m0
    movd            [%2+16], m1

    ; bring the high dwords down: columns 1 and 3
    punpckhdq       m0, m0
    punpckhdq       m1, m1
    movd            [%2+ 8], m0
    movd            [%2+24], m1
%endmacro
;------------------------------------------------------------------------------
; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
;
; Filters across a vertical block edge. For 8 consecutive rows, the 4 bytes
; at src-2 .. src+1 are gathered, transposed into a stack buffer so that each
; pixel column becomes one 8-byte line, run through H263_LOOP_FILTER, and then
; transposed back and stored over the same 8x4 region.
;
; In:   r0 = src, r1 = stride, r2 = qscale
; Uses: r3, r4 as scratch, m0-m7, and the bytes rsp+0 .. rsp+31.
; NOTE(review): the trailing "0,32" on cglobal is presumably the x86inc
; xmm-register count and stack reservation - confirm against x86inc.asm.
; NOTE(review): no emms is visible in this function; presumably the caller is
; responsible for resetting MMX state - confirm.
;------------------------------------------------------------------------------
INIT_MMX mmx
cglobal h263_h_loop_filter, 3,5,0,32
; widen the two int arguments to pointer width
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
; r2 = 2 * h263_loop_filter_strength[qscale]
; (only the low byte of r2d is consumed by H263_LOOP_FILTER)
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
; r0 = first of the four pixel columns (two on each side of src),
; r3 = 3*stride as expected by TRANSPOSE4X4
sub r0, 2
lea r3, [r1*3]
; rows 0-3 fill bytes 0-3 of each 8-byte stack line
TRANSPOSE4X4 r0, rsp
; r4 = row 4; rows 4-7 fill bytes 4-7 of each 8-byte stack line
lea r4, [r0+r1*4]
TRANSPOSE4X4 r4, rsp+4
; stack lines rsp+0/8/16/24 now hold columns p0, p1, p2, p3 for all 8 rows
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
; Transpose back. On entry: m5 = p0', m3 = p1', m4 = p2', m6 = p3'.
; Keep copies of p0' and p2' for the high (rows 4-7) halves.
mova m1, m5
mova m0, m4
; byte interleave: m5 = p0/p1 rows 0-3, m4 = p2/p3 rows 0-3,
;                  m1 = p0/p1 rows 4-7, m0 = p2/p3 rows 4-7
punpcklbw m5, m3
punpcklbw m4, m6
punpckhbw m1, m3
punpckhbw m0, m6
mova m3, m5
mova m6, m1
; word interleave: each dword is now one row's p0 p1 p2 p3
; m5 = rows 0,1   m3 = rows 2,3   m1 = rows 4,5   m6 = rows 6,7
punpcklwd m5, m4
punpcklwd m1, m0
punpckhwd m3, m4
punpckhwd m6, m0
; store rows 0-3 relative to r0; punpckhdq moves the odd row into the low dword
movd [r0], m5
punpckhdq m5, m5
movd [r0+r1*1], m5
movd [r0+r1*2], m3
punpckhdq m3, m3
movd [r0+r3], m3
; store rows 4-7 relative to r4
movd [r4], m1
punpckhdq m1, m1
movd [r4+r1*1], m1
movd [r4+r1*2], m6
punpckhdq m6, m6
movd [r4+r3], m6
RET