diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-05-12 04:51:24 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-05-12 04:51:24 +0200 |
commit | 612122b187d711257eecd517e4049cef3bb0b7f0 (patch) | |
tree | 2e0ed86f6f73bbc993a0e7787f331e21d1c7c064 /libavcodec/x86/h264_deblock_10bit.asm | |
parent | 4ea216e761e02d3f6973b316feaf3484be91a14f (diff) | |
parent | 5705b02079449c685a3dd337fcc3a8b440dca4a0 (diff) | |
download | ffmpeg-612122b187d711257eecd517e4049cef3bb0b7f0.tar.gz |
Merge remote branch 'qatar/master'
* qatar/master: (32 commits)
10-bit H.264 x86 chroma v loopfilter asm
Port SMPTE S302M audio decoder from FFmbc 0.3. [Copyright headers corrected]
Fix crash of interlaced MPEG2 decoding
h264pred: fix one more aliasing violation.
doc/APIchanges: fill in missing hashes and dates.
flacenc: use proper initializers for AVOption default values.
lavc: deprecate named constants for deprecated antialias_algo.
aac: workaround for compilation on cygwin
swscale: extend YUV422p support to 10bits depth
tiff: add support for inverted FillOrder for uncompressed data
Remove unused softfloat implementation.
h264pred: fix aliasing violations.
rotozoom: Eliminate French variable name.
rotozoom: Check return value of fread().
rotozoom: Return an error value instead of calling exit().
rotozoom: Make init_demo() return int and check for errors on invocation.
rotozoom: Drop silly UINT8 typedef.
rotozoom: Drop some unnecessary parentheses.
rotozoom: K&R coding style cosmetics
rtsp: Only do keepalive using GET_PARAMETER if the server supports it
...
Conflicts:
Changelog
cmdutils.c
doc/APIchanges
doc/general.texi
ffmpeg.c
ffplay.c
libavcodec/h264pred_template.c
libavcodec/resample.c
libavutil/pixfmt.h
libavutil/softfloat.c
libavutil/softfloat.h
tests/rotozoom.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/h264_deblock_10bit.asm')
-rw-r--r-- | libavcodec/x86/h264_deblock_10bit.asm | 910 |
1 files changed, 910 insertions, 0 deletions
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm new file mode 100644 index 0000000000..c253d02954 --- /dev/null +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -0,0 +1,910 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Oskar Arvidsson <oskar@irock.se> +;* Loren Merritt <lorenm@u.washington.edu> +;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +pw_pixel_max: times 8 dw ((1 << 10)-1) + +SECTION .text + +cextern pw_2 +cextern pw_3 +cextern pw_4 + +; out: %4 = |%1-%2|-%3 +; clobbers: %5 +%macro ABS_SUB 5 + psubusw %5, %2, %1 + psubusw %4, %1, %2 + por %4, %5 + psubw %4, %3 +%endmacro + +; out: %4 = |%1-%2|<%3 +%macro DIFF_LT 5 + psubusw %4, %2, %1 + psubusw %5, %1, %2 + por %5, %4 ; |%1-%2| + pxor %4, %4 + psubw %5, %3 ; |%1-%2|-%3 + pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 +%endmacro + +%macro LOAD_AB 4 + movd %1, %3 + movd %2, %4 + SPLATW %1, %1 + SPLATW %2, %2 +%endmacro + +; in: %2=tc reg +; out: %1=splatted tc +%macro LOAD_TC 2 + movd %1, [%2] + punpcklbw %1, %1 +%if mmsize == 8 + pshufw %1, %1, 0 +%else + pshuflw %1, %1, 01010000b + pshufd %1, %1, 01010000b +%endif + psraw %1, 6 +%endmacro + +; in: %1=p1, %2=p0, %3=q0, %4=q1 +; %5=alpha, %6=beta, %7-%9=tmp +; out: %7=mask +%macro LOAD_MASK 9 + ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha + ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta + pand %8, %9 + ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta + pxor %7, %7 + pand %8, %9 + pcmpgtw %7, %8 +%endmacro + +; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp +; out: %1=p0', m2=q0' +%macro DEBLOCK_P0_Q0 7 + psubw %3, %4 + pxor %7, %7 + paddw %3, [pw_4] + psubw %7, %5 + psubw %6, %2, %1 + psllw %6, 2 + paddw %3, %6 + psraw %3, 3 + mova %6, [pw_pixel_max] + CLIPW %3, %7, %5 + pxor %7, %7 + paddw %1, %3 + psubw %2, %3 + CLIPW %1, %7, %6 + CLIPW %2, %7, %6 +%endmacro + +; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp +%macro LUMA_Q1 6 + pavgw %6, %3, %4 ; (p0+q0+1)>>1 + paddw %1, %6 + pxor %6, %6 + psraw %1, 1 + psubw %6, %5 + psubw %1, %2 + CLIPW %1, %6, %5 + paddw %1, %2 +%endmacro + +%macro LUMA_DEBLOCK_ONE 3 + DIFF_LT m5, %1, bm, m4, m6 + pxor m6, m6 + mova %3, m4 + pcmpgtw m6, tcm + pand m4, tcm + pandn m6, m7 + pand m4, m6 + LUMA_Q1 m5, %2, m1, m2, m4, m6 +%endmacro + +%macro LUMA_H_STORE 2 +%if mmsize == 8 + movq [r0-4], m0 + movq [r0+r1-4], m1 + movq [r0+r1*2-4], m2 + movq [r0+%2-4], m3 +%else + movq [r0-4], m0 + movhps [r0+r1-4], m0 + movq [r0+r1*2-4], m1 + movhps [%1-4], m1 + movq [%1+r1-4], m2 + movhps [%1+r1*2-4], m2 + movq [%1+%2-4], m3 + movhps [%1+r1*4-4], m3 +%endif +%endmacro + +%macro DEBLOCK_LUMA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16) + %assign pad 5*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define am [rsp+mmsize*3] + %define bm [rsp+mmsize*4] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, 32/mmsize + mov r2, r0 + sub r0, r1 + mova am, m4 + sub r0, r1 + mova bm, m5 + sub r0, r1 +.loop: + mova m0, [r0+r1] + mova m1, [r0+r1*2] + mova m2, [r2] + mova m3, [r2+r1] + + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + mova m5, [r0] + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova [r0+r1], m5 + + mova m5, [r2+r1*2] + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova [r2+r1], m5 + + pxor m5, m5 + mova m6, tcm + pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova [r0+r1*2], m1 + mova [r2], m2 + + add r0, mmsize + add r2, mmsize + add r4, mmsize/8 + dec r3 + jg .loop + ADD rsp, pad + RET + +cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16) + %assign pad 7*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define p1m [rsp+mmsize*3] + %define p2m [rsp+mmsize*4] + %define am [rsp+mmsize*5] + %define bm [rsp+mmsize*6] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, r1 + mova am, m4 + add r3, r1 + mov r5, 32/mmsize + mova bm, m5 + add r3, r1 +%if mmsize == 16 + mov r2, r0 + add r2, r3 +%endif +.loop: +%if mmsize == 8 + movq m2, [r0-8] ; y q2 q1 q0 + movq m7, [r0+0] + movq m5, [r0+r1-8] + movq m3, [r0+r1+0] + movq m0, [r0+r1*2-8] + movq m6, [r0+r1*2+0] + movq m1, [r0+r3-8] + TRANSPOSE4x4W 2, 5, 0, 1, 4 + SWAP 2, 7 + movq m7, [r0+r3] + TRANSPOSE4x4W 2, 3, 6, 7, 4 +%else + movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m3, [r2-8] + TRANSPOSE4x4W 5, 0, 2, 3, 6 + mova tcm, m3 + + movu m4, [r2+r1-8] + movu m1, [r2+r1*2-8] + movu m3, [r2+r3-8] + movu m7, [r2+r1*4-8] + TRANSPOSE4x4W 4, 1, 3, 7, 6 + + mova m6, tcm + punpcklqdq m6, m7 + punpckhqdq m5, m4 + SBUTTERFLY qdq, 0, 1, 7 + SBUTTERFLY qdq, 2, 3, 7 +%endif + + mova p2m, m6 + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova p1m, m5 + + mova m5, p2m + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova p2m, m5 + + pxor m5, m5 + mova m6, tcm + pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova m0, p1m + mova m3, p2m + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r2, r3 + + add r4, mmsize/8 + lea r0, [r0+r1*(mmsize/2)] + lea r2, [r2+r1*(mmsize/2)] + dec r5 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM +%ifdef ARCH_X86_64 +; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 +; m12=alpha, m13=beta +; out: m0=p1', m3=q1', m1=p0', m2=q0' +; clobbers: m4, m5, m6, m7, m10, m11, m14 +%macro DEBLOCK_LUMA_INTER_SSE2 0 + LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 + LOAD_TC m6, r4 + DIFF_LT m8, m1, m13, m10, m4 + DIFF_LT m9, m2, m13, m11, m4 + pand m6, m7 + + mova m14, m6 + pxor m4, m4 + pcmpgtw m6, m4 + pand m6, m14 + + mova m5, m10 + pand m5, m6 + LUMA_Q1 m8, m0, m1, m2, m5, m4 + + mova m5, m11 + pand m5, m6 + LUMA_Q1 m9, m3, m1, m2, m5, m4 + + pxor m4, m4 + psubw m6, m10 + pcmpgtw m4, m14 + pandn m4, m7 + psubw m6, m11 + pand m4, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 + + SWAP 0, 8 + SWAP 3, 9 +%endmacro + +%macro DEBLOCK_LUMA_64 1 +cglobal deblock_v_luma_10_%1, 5,5,15 + %define p2 m8 + %define p1 m0 + %define p0 m1 + %define q0 m2 + %define q1 m3 + %define q2 m9 + %define mask0 m7 + %define mask1 m10 + %define mask2 m11 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r0 + sub r0, r1 + sub r0, r1 + sub r0, r1 + mov r3, 2 +.loop: + mova p2, [r0] + mova p1, [r0+r1] + mova p0, [r0+r1*2] + mova q0, [r2] + mova q1, [r2+r1] + mova q2, [r2+r1*2] + DEBLOCK_LUMA_INTER_SSE2 + mova [r0+r1], p1 + mova [r0+r1*2], p0 + mova [r2], q0 + mova [r2+r1], q1 + add r0, mmsize + add r2, mmsize + add r4, 2 + dec r3 + jg .loop + REP_RET + +cglobal deblock_h_luma_10_%1, 5,7,15 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r1 + add r2, r1 + add r2, r1 + mov r5, r0 + add r5, r2 + mov r6, 2 +.loop: + movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m9, [r5-8] + movu m5, [r5+r1-8] + movu m1, [r5+r1*2-8] + movu m3, [r5+r2-8] + movu m7, [r5+r1*4-8] + + TRANSPOSE4x4W 8, 0, 2, 9, 10 + TRANSPOSE4x4W 5, 1, 3, 7, 10 + + punpckhqdq m8, m5 + SBUTTERFLY qdq, 0, 1, 10 + SBUTTERFLY qdq, 2, 3, 10 + punpcklqdq m9, m7 + + DEBLOCK_LUMA_INTER_SSE2 + + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r5, r2 + add r4, 2 + lea r0, [r0+r1*8] + lea r5, [r5+r1*8] + dec r6 + jg .loop + REP_RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_64 avx +%endif + +%macro SWAPMOVA 2 +%ifid %1 + SWAP %1, %2 +%else + mova %1, %2 +%endif +%endmacro + +; in: t0-t2: tmp registers +; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 +; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' +%macro LUMA_INTRA_P012 12 ; p0..p3 in memory +%ifdef ARCH_X86_64 + paddw t0, %3, %2 + mova t2, %4 + paddw t2, %3 +%else + mova t0, %3 + mova t2, %4 + paddw t0, %2 + paddw t2, %3 +%endif + paddw t0, %1 + paddw t2, t2 + paddw t0, %5 + paddw t2, %9 + paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) + paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) + + psrlw t2, 3 + psrlw t1, t0, 2 + psubw t2, %3 + psubw t1, %2 + pand t2, %8 + pand t1, %8 + paddw t2, %3 + paddw t1, %2 + SWAPMOVA %11, t1 + + psubw t1, t0, %3 + paddw t0, t0 + psubw t1, %5 + psubw t0, %3 + paddw t1, %6 + paddw t1, %2 + paddw t0, %6 + psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 + psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 + + pxor t0, t1 + pxor t1, %1 + pand t0, %8 + pand t1, %7 + pxor t0, t1 + pxor t0, %1 + SWAPMOVA %10, t0 + SWAPMOVA %12, t2 +%endmacro + +%macro LUMA_INTRA_INIT 1 + %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) + %define t0 m4 + %define t1 m5 + %define t2 m6 + %define t3 m7 + %assign i 4 +%rep %1 + CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] + %assign i i+1 +%endrep + SUB rsp, pad +%endmacro + +; in: %1-%3=tmp, %4=p2, %5=q2 +%macro LUMA_INTRA_INTER 5 + LOAD_AB t0, t1, r2d, r3d + mova %1, t0 + LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 +%ifdef ARCH_X86_64 + mova %2, t0 ; mask0 + psrlw t3, %1, 2 +%else + mova t3, %1 + mova %2, t0 ; mask0 + psrlw t3, 2 +%endif + paddw t3, [pw_2] ; alpha/4+2 + DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 + pand t2, %2 + mova t3, %5 ; q2 + mova %1, t2 ; mask1 + DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta + pand t2, %1 + mova t3, %4 ; p2 + mova %3, t2 ; mask1q + DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta + pand t2, %1 + mova %1, t2 ; mask1p +%endmacro + +%macro LUMA_H_INTRA_LOAD 0 +%if mmsize == 8 + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] + movu m1, [r0+r4-8] + TRANSPOSE4x4W 4, 5, 0, 1, 2 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + + movu m2, [r0] + movu m3, [r0+r1] + movu t0, [r0+r1*2] + movu t1, [r0+r4] + TRANSPOSE4x4W 2, 3, 4, 5, 6 + mova t6, t0 ; q2 + mova t7, t1 ; q3 +%else + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] + movu m1, [r0+r5-8] + movu m2, [r4-8] + movu m3, [r4+r1-8] + movu t2, [r4+r1*2-8] + movu t3, [r4+r5-8] + TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + mova t6, t2 ; q2 + mova t7, t3 ; q3 +%endif +%endmacro + +; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp +%macro LUMA_H_INTRA_STORE 9 +%if mmsize == 8 + TRANSPOSE4x4W %1, %2, %3, %4, %9 + movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r4-8], m%4 + movq m%1, %8 + TRANSPOSE4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r4], m%1 +%else + TRANSPOSE2x4x4W %1, %2, %3, %4, %9 + movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r5-8], m%4 + movhps [r4-8], m%1 + movhps [r4+r1-8], m%2 + movhps [r4+r1*2-8], m%3 + movhps [r4+r5-8], m%4 +%ifnum %8 + SWAP %1, %8 +%else + mova m%1, %8 +%endif + TRANSPOSE2x4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r5], m%1 + movhps [r4], m%5 + movhps [r4+r1], m%6 + movhps [r4+r1*2], m%7 + movhps [r4+r5], m%1 +%endif +%endmacro + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +%macro DEBLOCK_LUMA_INTRA_64 1 +cglobal deblock_v_luma_intra_10_%1, 4,7,16 + %define t0 m1 + %define t1 m2 + %define t2 m4 + %define p2 m8 + %define p1 m9 + %define p0 m10 + %define q0 m11 + %define q1 m12 + %define q2 m13 + %define aa m5 + %define bb m14 + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + neg r4 + add r4, r0 ; pix-4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 + LOAD_AB aa, bb, r2d, r3d +.loop + mova p2, [r4+r1] + mova p1, [r4+2*r1] + mova p0, [r4+r5] + mova q0, [r0] + mova q1, [r0+r1] + mova q2, [r0+2*r1] + + LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 + mova t2, aa + psrlw t2, 2 + paddw t2, m0 ; alpha/4+2 + DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta + DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] + LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,16 + %define t0 m15 + %define t1 m14 + %define t2 m2 + %define q3 m5 + %define q2 m8 + %define q1 m9 + %define q0 m10 + %define p0 m11 + %define p1 m12 + %define p2 m13 + %define p3 m4 + %define spill [rsp] + %assign pad 24-(stack_offset&15) + SUB rsp, pad + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 +.loop + movu q3, [r0-8] + movu q2, [r0+r1-8] + movu q1, [r0+r1*2-8] + movu q0, [r0+r5-8] + movu p0, [r4-8] + movu p1, [r4+r1-8] + movu p2, [r4+r1*2-8] + movu p3, [r4+r5-8] + TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 + + LOAD_AB m1, m2, r2d, r3d + LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 + psrlw m1, 2 + paddw m1, m0 ; alpha/4+2 + DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta + DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + + mova spill, q3 + LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 + LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 + mova m7, spill + + LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 + + lea r0, [r0+r1*8] + lea r4, [r4+r1*8] + dec r6 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA_INTRA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_INTRA_64 avx + +%endif + +%macro DEBLOCK_LUMA_INTRA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 3 + lea r4, [r1*4] + lea r5, [r1*3] + neg r4 + add r4, r0 + mov r6, 32/mmsize + shl r2d, 2 + shl r3d, 2 +.loop: + mova m0, [r4+r1*2] ; p1 + mova m1, [r4+r5] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] + LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] + mova t3, [r0+r1*2] ; q2 + LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + ADD rsp, pad + RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 8 +%if mmsize == 8 + lea r4, [r1*3] + mov r5, 32/mmsize +%else + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 32/mmsize +%endif + shl r2d, 2 + shl r3d, 2 +.loop: + LUMA_H_INTRA_LOAD + LUMA_INTRA_INTER t8, t9, t10, t5, t6 + + LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 + mova t3, t6 ; q2 + LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 + + mova m2, t4 + mova m0, t11 + mova m1, t5 + mova m3, t8 + mova m6, t6 + + LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 + + lea r0, [r0+r1*(mmsize/2)] +%if mmsize == 8 + dec r5 +%else + lea r4, [r4+r1*(mmsize/2)] + dec r6 +%endif + jg .loop + ADD rsp, pad + RET +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_LUMA mmxext +DEBLOCK_LUMA_INTRA mmxext +INIT_XMM +DEBLOCK_LUMA sse2 +DEBLOCK_LUMA_INTRA sse2 +INIT_AVX +DEBLOCK_LUMA avx +DEBLOCK_LUMA_INTRA avx +%endif + +; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp +; out: %1=p0', %2=q0' +%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 + mova %6, [pw_2] + paddw %6, %3 + paddw %6, %4 + paddw %7, %6, %2 + paddw %6, %1 + paddw %6, %3 + paddw %7, %4 + psraw %6, 2 + psraw %7, 2 + psubw %6, %1 + psubw %7, %2 + pand %6, %5 + pand %7, %5 + paddw %1, %6 + paddw %2, %7 +%endmacro + +%macro CHROMA_V_LOAD 1 + mova m0, [r0] ; p1 + mova m1, [r0+r1] ; p0 + mova m2, [%1] ; q0 + mova m3, [%1+r1] ; q1 +%endmacro + +%macro CHROMA_V_STORE 0 + mova [r0+1*r1], m1 + mova [r0+2*r1], m2 +%endmacro + +%macro DEBLOCK_CHROMA 1 +;----------------------------------------------------------------------------- +; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16) + mov r5, r0 + sub r0, r1 + sub r0, r1 + shl r2d, 2 + shl r3d, 2 +%if mmsize < 16 + mov r6, 16/mmsize +.loop: +%endif + CHROMA_V_LOAD r5 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + pxor m4, m4 + LOAD_TC m6, r4 + psubw m6, [pw_3] + pmaxsw m6, m4 + pand m7, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + CHROMA_V_STORE +%if mmsize < 16 + add r0, mmsize + add r5, mmsize + add r4, mmsize/8 + dec r6 + jg .loop + REP_RET +%else + RET +%endif + +;----------------------------------------------------------------------------- +; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16) + mov r4, r0 + sub r0, r1 + sub r0, r1 + shl r2d, 2 + shl r3d, 2 +%if mmsize < 16 + mov r5, 16/mmsize +.loop: +%endif + CHROMA_V_LOAD r4 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 + CHROMA_V_STORE +%if mmsize < 16 + add r0, mmsize + add r4, mmsize + dec r5 + jg .loop + REP_RET +%else + RET +%endif +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA mmxext +%endif +INIT_XMM +DEBLOCK_CHROMA sse2 +INIT_AVX +DEBLOCK_CHROMA avx |