diff options
author | James Almer <jamrial@gmail.com> | 2015-02-04 20:21:56 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2015-02-05 15:02:27 -0300 |
commit | 042c1159fcf8431725c718a504b1fb40364bf434 (patch) | |
tree | f80a1ec51d5d10feccdbcc6be4cc1a652c26e4ce /libavcodec/x86/hevc_sao.asm | |
parent | 1f1c7c8a57e81803a034b00077bd43e7a6e4ed82 (diff) | |
download | ffmpeg-042c1159fcf8431725c718a504b1fb40364bf434.tar.gz |
x86/hevcdsp: add ff_hevc_sao_edge_filter_8_{ssse3,avx2}
Original x86 intrinsics code and initial yasm port by Pierre-Edouard Lepere.
Refactoring and optimizations by James Almer.
Benchmarks of BQTerrace_1920x1080_60_qp22.bin with an Intel Core i5-4200U
Width 32
158583 decicycles in edge, sao_edge_filter_8 runs, 0 skips
5205 decicycles in ff_hevc_sao_edge_filter_32_8_ssse3, 32767 runs, 1 skips
2942 decicycles in ff_hevc_sao_edge_filter_32_8_avx2, 32767 runs, 1 skips
Width 64
705639 decicycles in sao_edge_filter_8, 262144 runs, 0 skips
19224 decicycles in ff_hevc_sao_edge_filter_64_8_ssse3, 262111 runs, 33 skips
10433 decicycles in ff_hevc_sao_edge_filter_64_8_avx2, 262115 runs, 29 skips
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/hevc_sao.asm')
-rw-r--r-- | libavcodec/x86/hevc_sao.asm | 171 |
1 files changed, 170 insertions, 1 deletions
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm index b389a9f68d..a306a84e81 100644 --- a/libavcodec/x86/hevc_sao.asm +++ b/libavcodec/x86/hevc_sao.asm @@ -23,14 +23,25 @@ %include "libavutil/x86/x86util.asm" -%if ARCH_X86_64 SECTION_RODATA 32 pw_mask10: times 16 dw 0x03FF pw_mask12: times 16 dw 0x0FFF +pb_2: times 32 db 2 +pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1 +cextern pb_1 SECTION_TEXT +%define MAX_PB_SIZE 64 +%define PADDING_SIZE 32 ; FF_INPUT_BUFFER_PADDING_SIZE + +;****************************************************************************** +;SAO Band Filter +;****************************************************************************** + +%if ARCH_X86_64 %macro HEVC_SAO_BAND_FILTER_INIT 1 and leftq, 31 movd xm0, leftd @@ -239,3 +250,161 @@ HEVC_SAO_BAND_FILTER_16 12, 48, 1 HEVC_SAO_BAND_FILTER_16 12, 64, 2 %endif %endif + +;****************************************************************************** +;SAO Edge Filter +;****************************************************************************** + +%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE + +%macro HEVC_SAO_EDGE_FILTER_COMPUTE_8 1 + pminub m4, m1, m2 + pminub m5, m1, m3 + pcmpeqb m2, m4 + pcmpeqb m3, m5 + pcmpeqb m4, m1 + pcmpeqb m5, m1 + psubb m4, m2 + psubb m5, m3 + paddb m4, m6 + paddb m4, m5 + + pshufb m2, m0, m4 +%if %1 > 8 + punpckhbw m5, m7, m1 + punpckhbw m4, m2, m7 + punpcklbw m3, m7, m1 + punpcklbw m2, m7 + pmaddubsw m5, m4 + pmaddubsw m3, m2 + packuswb m3, m5 +%else + punpcklbw m3, m7, m1 + punpcklbw m2, m7 + pmaddubsw m3, m2 + packuswb m3, m3 +%endif +%endmacro + +;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, +; int eo, int width, int height); +%macro HEVC_SAO_EDGE_FILTER_8 2-3 +%if WIN64 +cglobal hevc_sao_edge_filter_%1_8, 4, 8, 8, dst, src, dststride, offset, a_stride, b_stride, height, tmp +%define eoq heightq + movsxd eoq, dword r4m + movsx a_strideq, byte [pb_eo+eoq*4+1] + movsx b_strideq, byte [pb_eo+eoq*4+3] + imul a_strideq, EDGE_SRCSTRIDE + imul b_strideq, EDGE_SRCSTRIDE + movsx tmpq, byte [pb_eo+eoq*4] + add a_strideq, tmpq + movsx tmpq, byte [pb_eo+eoq*4+2] + add b_strideq, tmpq + mov heightd, r6m + +%elif ARCH_X86_64 +cglobal hevc_sao_edge_filter_%1_8, 5, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp +%define tmp2q heightq + movsxd eoq, eod + lea tmp2q, [pb_eo] + movsx a_strideq, byte [tmp2q+eoq*4+1] + movsx b_strideq, byte [tmp2q+eoq*4+3] + imul a_strideq, EDGE_SRCSTRIDE + imul b_strideq, EDGE_SRCSTRIDE + movsx tmpq, byte [tmp2q+eoq*4] + add a_strideq, tmpq + movsx tmpq, byte [tmp2q+eoq*4+2] + add b_strideq, tmpq + mov heightd, r6m + +%else ; ARCH_X86_32 +cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height +%define eoq srcq +%define tmpq heightq +%define tmp2q dststrideq +%define offsetq heightq + mov eoq, r4m + lea tmp2q, [pb_eo] + movsx a_strideq, byte [tmp2q+eoq*4+1] + movsx b_strideq, byte [tmp2q+eoq*4+3] + imul a_strideq, EDGE_SRCSTRIDE + imul b_strideq, EDGE_SRCSTRIDE + movsx tmpq, byte [tmp2q+eoq*4] + add a_strideq, tmpq + movsx tmpq, byte [tmp2q+eoq*4+2] + add b_strideq, tmpq + + mov srcq, srcm + mov offsetq, r3m + mov dststrideq, dststridem +%endif ; ARCH + +%if mmsize > 16 + vbroadcasti128 m0, [offsetq] +%else + movu m0, [offsetq] +%endif + mova m1, [pb_edge_shuffle] + packsswb m0, m0 + mova m7, [pb_1] + pshufb m0, m1 + mova m6, [pb_2] +%if ARCH_X86_32 + mov heightd, r6m +%endif + +align 16 +.loop: + +%if %1 == 8 + movq m1, [srcq] + movq m2, [srcq + a_strideq] + movq m3, [srcq + b_strideq] + HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 + movq [dstq], m3 +%endif + +%assign i 0 +%rep %2 + mova m1, [srcq + i] + movu m2, [srcq + a_strideq + i] + movu m3, [srcq + b_strideq + i] + HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 + mov%3 [dstq + i], m3 +%assign i i+mmsize +%endrep + +%if %1 == 48 +INIT_XMM cpuname + + mova m1, [srcq + i] + movu m2, [srcq + a_strideq + i] + movu m3, [srcq + b_strideq + i] + HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 + mova [dstq + i], m3 +%if cpuflag(avx2) +INIT_YMM cpuname +%endif +%endif + + add dstq, dststrideq + add srcq, EDGE_SRCSTRIDE + dec heightd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +HEVC_SAO_EDGE_FILTER_8 8, 0 +HEVC_SAO_EDGE_FILTER_8 16, 1, a +HEVC_SAO_EDGE_FILTER_8 32, 2, a +HEVC_SAO_EDGE_FILTER_8 48, 2, a +HEVC_SAO_EDGE_FILTER_8 64, 4, a + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +HEVC_SAO_EDGE_FILTER_8 32, 1, a +HEVC_SAO_EDGE_FILTER_8 48, 1, u +HEVC_SAO_EDGE_FILTER_8 64, 2, a +%endif |