| author    | Jason Garrett-Glaser <jason@x264.com> | 2011-05-10 07:08:24 -0700 |
|-----------|---------------------------------------|---------------------------|
| committer | Jason Garrett-Glaser <jason@x264.com> | 2011-05-10 20:01:58 -0700 |
| commit    | 8ad77b65b548a6b2f4707265ebd7e97f956acf0b (patch) | |
| tree      | e0d9053df2b4130023fc4b8960dc8c3e3d139fdc | |
| parent    | b66752790a94820c23b0ac994d6190dd9048582d (diff) | |
| download  | ffmpeg-8ad77b65b548a6b2f4707265ebd7e97f956acf0b.tar.gz | |
Update x86 H.264 deblock asm
Includes AVX versions from x264.
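For context, the routines this patch touches implement the standard H.264 normal (bS < 4) luma edge filter; the asm comments below (LUMA_Q1, DEBLOCK_P0_Q0) refer to the same formulas. Here is a minimal scalar sketch of the 8-bit per-pixel math, paraphrased from the spec rather than taken from the patch; the function and variable names are illustrative only:

```c
#include <stdint.h>
#include <stdlib.h>

static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }

/* One pixel position across a luma edge, normal filter (bS < 4).
 * p2,p1,p0 lie on one side of the edge, q0,q1,q2 on the other; tc0 >= 0. */
static void filter_luma_edge(uint8_t *p2, uint8_t *p1, uint8_t *p0,
                             uint8_t *q0, uint8_t *q1, uint8_t *q2,
                             int alpha, int beta, int tc0)
{
    int P1 = *p1, Q1 = *q1;                     /* delta uses the original p1/q1 */

    /* LOAD_MASK/DIFF_GT build this condition as a byte mask */
    if (abs(*p0 - *q0) >= alpha || abs(P1 - *p0) >= beta || abs(Q1 - *q0) >= beta)
        return;

    int tc  = tc0;
    int avg = (*p0 + *q0 + 1) >> 1;

    if (abs(*p2 - *p0) < beta) {                /* DIFF_GT2 + LUMA_Q1, p side */
        *p1 = clip3((*p2 + avg) >> 1, P1 - tc0, P1 + tc0);
        tc++;
    }
    if (abs(*q2 - *q0) < beta) {                /* same macros, q side */
        *q1 = clip3((*q2 + avg) >> 1, Q1 - tc0, Q1 + tc0);
        tc++;
    }

    int delta = clip3(((*q0 - *p0) * 4 + (P1 - Q1) + 4) >> 3, -tc, tc);
    *p0 = clip3(*p0 + delta, 0, 255);           /* DEBLOCK_P0_Q0 reaches the same */
    *q0 = clip3(*q0 - delta, 0, 255);           /* result via pavgb/pxor tricks   */
}
```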
| -rw-r--r-- | libavcodec/x86/h264_deblock.asm | 395 |
| -rw-r--r-- | libavcodec/x86/h264dsp_mmx.c    |  56 |
| -rw-r--r-- | libavcodec/x86/x86util.asm      |  19 |
3 files changed, 277 insertions(+), 193 deletions(-)
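Many of the mask computations in the reworked h264_deblock.asm (the DIFF_GT and DIFF_GT2 macros in the diff below) rest on an unsigned saturating-subtraction trick, and the AVX variants only swap in the non-destructive three-operand forms of the same instructions. A small, self-contained SSE2-intrinsics illustration of the trick (my own example and naming, not code from the patch):

```c
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Per byte: result is nonzero iff |a - b| > thresh.
 * This is what DIFF_GT does with psubusb/por: the two saturating
 * subtractions yield max(a-b, 0) and max(b-a, 0); their OR is |a-b|
 * (one of the two is always zero), and a final saturating subtract
 * of the threshold leaves zero exactly when |a-b| <= thresh. */
static __m128i diff_gt(__m128i a, __m128i b, __m128i thresh)
{
    __m128i d0 = _mm_subs_epu8(a, b);   /* psubusb */
    __m128i d1 = _mm_subs_epu8(b, a);
    __m128i d  = _mm_or_si128(d0, d1);  /* por     */
    return _mm_subs_epu8(d, thresh);    /* psubusb */
}

int main(void)
{
    uint8_t a[16] = { 10, 200, 50, 0 }, b[16] = { 12, 190, 50, 255 };
    uint8_t out[16];
    __m128i t = _mm_set1_epi8(5);       /* threshold, e.g. alpha-1 */

    _mm_storeu_si128((__m128i *)out,
                     diff_gt(_mm_loadu_si128((const __m128i *)a),
                             _mm_loadu_si128((const __m128i *)b), t));
    for (int i = 0; i < 4; i++)
        printf("|%3d-%3d| > 5 ? %s\n", a[i], b[i], out[i] ? "yes" : "no");
    return 0;
}
```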
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 01778a45cb..081c0e1aef 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -1,10 +1,11 @@ ;***************************************************************************** -;* MMX/SSE2-optimized H.264 deblocking code +;* MMX/SSE2/AVX-optimized H.264 deblocking code ;***************************************************************************** -;* Copyright (C) 2005-2008 x264 project +;* Copyright (C) 2005-2011 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Oskar Arvidsson <oskar@irock.se> ;* ;* This file is part of Libav. ;* @@ -26,96 +27,135 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION .text cextern pb_0 cextern pb_1 cextern pb_3 cextern pb_A1 -SECTION .text - ; expands to [base],...,[base+7*stride] %define PASS8ROWS(base, base3, stride, stride3) \ [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 +%macro TRANSPOSE4x8_LOAD 11 + movh m0, %4 + movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 + + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 + + punpckh%3 m1, m0, m4 + punpckh%3 m3, m2, m6 + punpckl%3 m0, m4 + punpckl%3 m2, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 - punpckhdq m4, m4 - punpckhdq m5, m5 - punpckhdq m6, m6 +%macro TRANSPOSE8x4B_STORE 8 + punpckhdq m4, m0, m0 + punpckhdq m5, m1, m1 + punpckhdq m6, m2, m2 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd %1, m0 - punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + movh %1, m1 punpckhdq m1, m1 - movd %4, m1 + movh %2, m1 + movh %3, m0 + punpckhdq m0, m0 + movh %4, m0 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 - punpcklwd m4, m6 - punpckhwd m5, m6 - movd %5, m4 - punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + movh %5, m5 punpckhdq m5, m5 - movd %8, m5 + movh %6, m5 + movh %7, m4 + punpckhdq m4, m4 + movh %8, m4 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 +%endmacro + +%macro TRANSPOSE4x8W_LOAD 8 +%if mmsize==16 + TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 +%else + SWAP 1, 4, 2, 3 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [t5+r1*2] + mova m3, [t5+t6] + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif +%endmacro + +%macro TRANSPOSE8x2W_STORE 8 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 +%if mmsize==8 + movd %3, m0 + 
movd %1, m1 + psrlq m1, 32 + psrlq m0, 32 + movd %2, m1 + movd %4, m0 +%else + movd %5, m0 + movd %1, m1 + psrldq m1, 4 + psrldq m0, 4 + movd %2, m1 + movd %6, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %3, m1 + movd %7, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %4, m1 + movd %8, m0 +%endif %endmacro %macro SBUTTERFLY3 4 - movq %4, %2 + punpckh%1 %4, %2, %3 punpckl%1 %2, %3 - punpckh%1 %4, %3 %endmacro ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -123,30 +163,32 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY3 bw, m6, %8, m5 - SBUTTERFLY3 wd, m0, m2, m1 - SBUTTERFLY3 wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY3 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY3 wd, m7, [%9+0x10], m6 - SBUTTERFLY3 wd, m3, m5, m4 - SBUTTERFLY3 dq, m7, m3, m0 - SBUTTERFLY3 dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY3 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -154,38 +196,44 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - SBUTTERFLY3 bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY3 wd, m0, m2, m3 - SBUTTERFLY3 wd, m4, m6, m2 - SBUTTERFLY3 wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY3 wd, m2, m5, m1 - SBUTTERFLY3 dq, m0, m4, m5 - SBUTTERFLY3 dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY3 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY3 dq, m3, %11, m0 - SBUTTERFLY3 dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY3 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT 5 +%if avx_enabled == 0 mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%else + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%endif por %4, %5 psubusb %4, %3 %endmacro @@ -193,32 +241,28 @@ SECTION .text ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT2 5 +%ifdef ARCH_X86_64 + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%else mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro -%macro SPLATW 1 -%ifidn m0, xmm0 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %1, 0 -%endif -%endmacro - ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 movd m4, %1 movd 
m5, %2 - SPLATW m4 - SPLATW m5 + SPLATW m4, m4 + SPLATW m5, m5 packuswb m4, m4 ; 16x alpha-1 packuswb m5, m5 ; 16x beta-1 %if %0>2 @@ -237,8 +281,7 @@ SECTION .text ; out: m1=p0' m2=q0' ; clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 - mova m5, m1 - pxor m5, m2 ; p0^q0 + pxor m5, m1, m2 ; p0^q0 pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 @@ -264,14 +307,12 @@ SECTION .text ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) ; clobbers: q2, tmp, tc0 %macro LUMA_Q1 6 - mova %6, m1 - pavgb %6, m2 + pavgb %6, m1, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - mova %6, %1 - psubusb %6, %5 + psubusb %6, %1, %5 paddusb %5, %1 pmaxub %2, %6 pminub %2, %5 @@ -280,10 +321,10 @@ SECTION .text %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal x264_deblock_v_luma_sse2, 5,5,10 +%macro DEBLOCK_LUMA 1 +cglobal deblock_v_luma_%1, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -307,8 +348,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 movdqa m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - mova m7, m8 - psubb m7, m6 + psubb m7, m8, m6 pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -326,10 +366,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2, 5,7 +cglobal deblock_h_luma_%1, 5,7 movsxd r10, r1d lea r11, [r10+r10*2] lea r6, [r0-4] @@ -350,13 +390,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %ifdef WIN64 mov [rsp+0x20], r4 %endif - call x264_deblock_v_luma_sse2 + call deblock_v_luma_%1 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -365,7 +405,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -375,7 +415,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -383,14 +423,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7 add rsp, 0x68 %endif RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA sse2 +INIT_AVX +DEBLOCK_LUMA avx %else %macro DEBLOCK_LUMA 3 ;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 
;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 +cglobal deblock_%2_luma_%1, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -419,8 +465,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 pand m4, [esp+%3] ; tc - mova m7, m4 - psubb m7, m6 + psubb m7, m4, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -441,10 +486,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 +cglobal deblock_h_luma_%1, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -467,11 +512,11 @@ cglobal x264_deblock_h_luma_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %ifidn %2, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %endif ADD esp, 20 @@ -484,7 +529,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -492,7 +537,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -502,22 +547,34 @@ INIT_MMX DEBLOCK_LUMA mmxext, v8, 8 INIT_XMM DEBLOCK_LUMA sse2, v, 16 +INIT_AVX +DEBLOCK_LUMA avx, v, 16 %endif ; ARCH %macro LUMA_INTRA_P012 4 ; p0..p3 in memory +%ifdef ARCH_X86_64 + pavgb t0, p2, p1 + pavgb t1, p0, q0 +%else mova t0, p2 mova t1, p0 pavgb t0, p1 pavgb t1, q0 +%endif pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 mova t5, t1 +%ifdef ARCH_X86_64 + paddb t2, p2, p1 + paddb t3, p0, q0 +%else mova t2, p2 mova t3, p0 paddb t2, p1 paddb t3, q0 +%endif paddb t2, t3 mova t3, t2 mova t4, t2 @@ -527,10 +584,15 @@ DEBLOCK_LUMA sse2, v, 16 pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; +%ifdef ARCH_X86_64 + pavgb t1, p2, q1 + psubb t2, p2, q1 +%else mova t1, p2 mova t2, p2 pavgb t1, q1 psubb t2, q1 +%endif paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 pand t2, mpb_1 @@ -543,10 +605,8 @@ DEBLOCK_LUMA sse2, v, 16 pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - mova t3, p0 - mova t2, p0 - pxor t3, q1 - pavgb t2, q1 + pxor t3, p0, q1 + pavgb t2, p0, q1 pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 @@ -560,9 +620,8 @@ DEBLOCK_LUMA sse2, v, 16 mova %1, t1 ; store p0 mova t1, %4 ; p3 - mova t2, t1 + paddb t2, t1, p2 pavgb t1, p2 - paddb t2, p2 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 @@ -624,9 +683,9 @@ DEBLOCK_LUMA sse2, v, 16 %endif ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 +cglobal deblock_%2_luma_intra_%1, 4,6,16 
%ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -686,9 +745,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 INIT_MMX %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1, 4,7 +cglobal deblock_h_luma_intra_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] lea r6, [r0-4] @@ -704,7 +763,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 lea r0, [pix_tmp+0x40] mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 + call deblock_v_luma_intra_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r11] @@ -717,7 +776,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 add rsp, 0x88 RET %else -cglobal x264_deblock_h_luma_intra_%1, 2,4 +cglobal deblock_h_luma_intra_%1, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -736,10 +795,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 PUSH dword r2m PUSH dword 16 PUSH r0 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %ifidn %2, v8 add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %endif ADD esp, 16 @@ -760,13 +819,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 INIT_XMM DEBLOCK_LUMA_INTRA sse2, v +INIT_AVX +DEBLOCK_LUMA_INTRA avx , v %ifndef ARCH_X86_64 INIT_MMX DEBLOCK_LUMA_INTRA mmxext, v8 %endif - - INIT_MMX %macro CHROMA_V_START 0 @@ -790,23 +849,23 @@ INIT_MMX %define t6 r6 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_mmxext, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 +cglobal deblock_h_chroma_mmxext, 5,7 %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -815,17 +874,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7 %define buf1 r2m %endif CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq m0, buf0 movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_inter_body_mmxext: +ff_chroma_inter_body_mmxext: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 @@ -850,31 +909,31 @@ x264_chroma_inter_body_mmxext: %define t6 r5 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void 
ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_mmxext, 4,5 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_intra_body_mmxext + call ff_chroma_intra_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_mmxext, 4,6 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call x264_chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) + call ff_chroma_intra_body_mmxext + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_intra_body_mmxext: +ff_chroma_intra_body_mmxext: LOAD_MASK r2d, r3d movq m5, m1 movq m6, m2 diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 3a783a39ab..7d27c02ea2 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -219,11 +219,11 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] } #define LF_FUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta, int8_t *tc0); +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta, int8_t *tc0); #define LF_IFUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta); +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta); LF_FUNC (h, chroma, mmxext) LF_IFUNC(h, chroma_intra, mmxext) @@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext) LF_IFUNC(h, luma_intra, mmxext) #if HAVE_YASM && ARCH_X86_32 LF_FUNC (v8, luma, mmxext) -static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { if((tc0[0] & tc0[1]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); + ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); if((tc0[2] & tc0[3]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); + ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); } LF_IFUNC(v8, luma_intra, mmxext) -static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) +static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) { - ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); + ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); + ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); } #endif @@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2) LF_IFUNC(h, luma_intra, sse2) LF_FUNC (v, luma, sse2) LF_IFUNC(v, luma_intra, sse2) +LF_FUNC (h, luma, avx) +LF_IFUNC(h, luma_intra, avx) +LF_FUNC (v, luma, avx) +LF_IFUNC(v, luma_intra, avx) /***********************************/ /* weighted 
prediction */ @@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct_add8 = ff_h264_idct_add8_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; - c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; - c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; - c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext; + c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext; + c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext; #if ARCH_X86_32 - c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; - c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext; #endif c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; @@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2; #endif c->h264_idct_add16 = ff_h264_idct_add16_sse2; @@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; } + if (mm_flags&AV_CPU_FLAG_AVX) { +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx; +#endif + } } } #endif diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm index f731616270..28baf7a96a 100644 --- a/libavcodec/x86/x86util.asm +++ b/libavcodec/x86/x86util.asm @@ -24,16 +24,20 @@ ;****************************************************************************** %macro SBUTTERFLY 4 +%if avx_enabled == 0 mova m%4, m%2 punpckl%1 m%2, m%3 punpckh%1 m%4, m%3 +%else + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 +%endif SWAP %3, %4 %endmacro %macro SBUTTERFLY2 4 - mova m%4, m%2 - punpckh%1 m%2, m%3 - punpckl%1 m%4, m%3 + punpckl%1 m%4, m%2, m%3 + punpckh%1 m%2, m%2, m%3 SWAP %2, %4, %3 %endmacro @@ -444,3 +448,12 @@ %macro PMINUB_MMXEXT 3 ; dst, src, ignored pminub %1, %2 %endmacro + +%macro SPLATW 2-3 0 +%if mmsize == 16 + pshuflw %1, %2, (%3)*0x55 + punpcklqdq %1, %1 +%else + pshufw %1, %2, (%3)*0x55 +%endif +%endmacro |