diff options
author | Mickaël Raulet <mraulet@insa-rennes.fr> | 2014-07-25 17:55:40 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-07-26 01:19:42 +0200 |
commit | 7bdcf5c934f085fe4643a049a931500b42a8b24b (patch) | |
tree | 5c35819237f9abe56b260b72690aafa7796c2886 | |
parent | a06fac353ce2bd055f920e7f6f5e2145736d2d2c (diff) | |
download | ffmpeg-7bdcf5c934f085fe4643a049a931500b42a8b24b.tar.gz |
x86/hevc: add 12bits support for deblocking filter
cherry picked from commit 97d46afe320c7d61d7b9525e5f5588355cde4bb0
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/x86/hevc_deblock.asm | 128 | ||||
-rw-r--r-- | libavcodec/x86/hevcdsp_init.c | 16 |
2 files changed, 116 insertions, 28 deletions
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm index b263dca0d2..f1fc7235e9 100644 --- a/libavcodec/x86/hevc_deblock.asm +++ b/libavcodec/x86/hevc_deblock.asm @@ -26,10 +26,12 @@ SECTION_RODATA -pw_pixel_max: times 8 dw ((1 << 10)-1) -pw_m1: times 8 dw -1 -pw_m2: times 8 dw -2 -pd_1 : times 4 dd 1 +pw_pixel_max_12: times 8 dw ((1 << 12)-1) +pw_pixel_max_10: times 8 dw ((1 << 10)-1) +pw_pixel_max: times 8 dw ((1 << 10)-1) +pw_m1: times 8 dw -1 +pw_m2: times 8 dw -2 +pd_1 : times 4 dd 1 cextern pw_4 cextern pw_8 @@ -136,12 +138,12 @@ INIT_XMM sse2 ; in: 4 rows of 8 words in m0..m3 ; out: 8 rows of 4 words in %1..%8 -%macro TRANSPOSE8x4W_STORE 8 +%macro TRANSPOSE8x4W_STORE 9 pxor m5, m5; zeros reg - CLIPW m0, m5, [pw_pixel_max] - CLIPW m1, m5, [pw_pixel_max] - CLIPW m2, m5, [pw_pixel_max] - CLIPW m3, m5, [pw_pixel_max] + CLIPW m0, m5, %9 + CLIPW m1, m5, %9 + CLIPW m2, m5, %9 + CLIPW m3, m5, %9 punpckhwd m4, m0, m1 punpcklwd m0, m1 @@ -264,18 +266,18 @@ INIT_XMM sse2 ; in: 8 rows of 8 words in m0..m8 ; out: 8 rows of 8 words in %1..%8 -%macro TRANSPOSE8x8W_STORE 8 +%macro TRANSPOSE8x8W_STORE 9 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 pxor m8, m8 - CLIPW m0, m8, [pw_pixel_max] - CLIPW m1, m8, [pw_pixel_max] - CLIPW m2, m8, [pw_pixel_max] - CLIPW m3, m8, [pw_pixel_max] - CLIPW m4, m8, [pw_pixel_max] - CLIPW m5, m8, [pw_pixel_max] - CLIPW m6, m8, [pw_pixel_max] - CLIPW m7, m8, [pw_pixel_max] + CLIPW m0, m8, %9 + CLIPW m1, m8, %9 + CLIPW m2, m8, %9 + CLIPW m3, m8, %9 + CLIPW m4, m8, %9 + CLIPW m5, m8, %9 + CLIPW m6, m8, %9 + CLIPW m7, m8, %9 movdqu %1, m0 movdqu %2, m1 @@ -678,7 +680,17 @@ cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride add pixq, r3strideq TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq) CHROMA_DEBLOCK_BODY 10 - TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq) + TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10] + RET + +cglobal 
hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride + sub pixq, 4 + lea r3strideq, [3*strideq] + mov pix0q, pixq + add pixq, r3strideq + TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq) + CHROMA_DEBLOCK_BODY 12 + TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12] RET ;----------------------------------------------------------------------------- @@ -713,8 +725,24 @@ cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0 movu m3, [pixq+strideq]; q1 CHROMA_DEBLOCK_BODY 10 pxor m5, m5; zeros reg - CLIPW m1, m5, [pw_pixel_max] - CLIPW m2, m5, [pw_pixel_max] + CLIPW m1, m5, [pw_pixel_max_10] + CLIPW m2, m5, [pw_pixel_max_10] + movu [pix0q+strideq], m1 + movu [pixq], m2 + RET + +cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0 + mov pix0q, pixq + sub pix0q, strideq + sub pix0q, strideq + movu m0, [pix0q]; p1 + movu m1, [pix0q+strideq]; p0 + movu m2, [pixq]; q0 + movu m3, [pixq+strideq]; q1 + CHROMA_DEBLOCK_BODY 12 + pxor m5, m5; zeros reg + CLIPW m1, m5, [pw_pixel_max_12] + CLIPW m2, m5, [pw_pixel_max_12] movu [pix0q+strideq], m1 movu [pixq], m2 RET @@ -744,7 +772,19 @@ cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5) LUMA_DEBLOCK_BODY 10, v .store: - TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5) + TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5), [pw_pixel_max_10] +.bypassluma: + RET + +cglobal hevc_v_loop_filter_luma_12, 4, 15, 16, pix, stride, beta, tc + sub pixq, 8 + lea r5, [3 * strideq] + mov r6, pixq + add pixq, r5 + TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5) + LUMA_DEBLOCK_BODY 12, v +.store: + TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5), [pw_pixel_max_12] .bypassluma: RET @@ -803,12 +843,43 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix LUMA_DEBLOCK_BODY 10, h .store: pxor m8, m8; zeros reg - CLIPW m1, m8, [pw_pixel_max] - CLIPW m2, m8, [pw_pixel_max] - 
CLIPW m3, m8, [pw_pixel_max] - CLIPW m4, m8, [pw_pixel_max] - CLIPW m5, m8, [pw_pixel_max] - CLIPW m6, m8, [pw_pixel_max] + CLIPW m1, m8, [pw_pixel_max_10] + CLIPW m2, m8, [pw_pixel_max_10] + CLIPW m3, m8, [pw_pixel_max_10] + CLIPW m4, m8, [pw_pixel_max_10] + CLIPW m5, m8, [pw_pixel_max_10] + CLIPW m6, m8, [pw_pixel_max_10] + movdqu [pix0q + strideq], m1; p2 + movdqu [pix0q + 2 * strideq], m2; p1 + movdqu [pix0q + src3strideq], m3; p0 + movdqu [pixq ], m4; q0 + movdqu [pixq + strideq], m5; q1 + movdqu [pixq + 2 * strideq], m6; q2 +.bypassluma: + RET + +cglobal hevc_h_loop_filter_luma_12, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride + lea src3strideq, [3 * strideq] + mov pix0q, pixq + sub pix0q, src3strideq + sub pix0q, strideq + movdqu m0, [pix0q]; p3 + movdqu m1, [pix0q + strideq]; p2 + movdqu m2, [pix0q + 2 * strideq]; p1 + movdqu m3, [pix0q + src3strideq]; p0 + movdqu m4, [pixq]; q0 + movdqu m5, [pixq + strideq]; q1 + movdqu m6, [pixq + 2 * strideq]; q2 + movdqu m7, [pixq + src3strideq]; q3 + LUMA_DEBLOCK_BODY 12, h +.store: + pxor m8, m8; zeros reg + CLIPW m1, m8, [pw_pixel_max_12] + CLIPW m2, m8, [pw_pixel_max_12] + CLIPW m3, m8, [pw_pixel_max_12] + CLIPW m4, m8, [pw_pixel_max_12] + CLIPW m5, m8, [pw_pixel_max_12] + CLIPW m6, m8, [pw_pixel_max_12] movdqu [pix0q + strideq], m1; p2 movdqu [pix0q + 2 * strideq], m2; p1 movdqu [pix0q + src3strideq], m3; p0 @@ -817,6 +888,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix movdqu [pixq + 2 * strideq], m6; q2 .bypassluma: RET + %endmacro INIT_XMM sse2 diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index f7c35fd07b..ebe9847048 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -46,10 +46,13 @@ LFL_FUNC(v, depth, opt) LFC_FUNCS(uint8_t, 8, sse2) LFC_FUNCS(uint8_t, 10, sse2) +LFC_FUNCS(uint8_t, 12, sse2) LFL_FUNCS(uint8_t, 8, sse2) LFL_FUNCS(uint8_t, 10, sse2) +LFL_FUNCS(uint8_t, 12, sse2) LFL_FUNCS(uint8_t, 8, 
ssse3) LFL_FUNCS(uint8_t, 10, ssse3) +LFL_FUNCS(uint8_t, 12, ssse3) #if HAVE_SSE2_EXTERNAL void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) @@ -499,5 +502,18 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx2; } + } else if (bit_depth == 12) { + if (EXTERNAL_SSE2(mm_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2; + } + } + if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3; + } } } |