diff options
author | James Almer <jamrial@gmail.com> | 2015-02-09 20:38:20 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2015-02-09 20:41:21 -0300 |
commit | 06fe6dfe12d7248827722dde69c72970fb06280d (patch) | |
tree | 328ec40c8da291ce80e0a314a476723b1e50582d | |
parent | f5d32acc37572459cc16e00012057bb19fb7a054 (diff) | |
download | ffmpeg-06fe6dfe12d7248827722dde69c72970fb06280d.tar.gz |
x86/hevc_sao: make sao_band_filter work on x86_32
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/hevc_sao.asm | 40 | ||||
-rw-r--r-- | libavcodec/x86/hevcdsp_init.c | 24 |
2 files changed, 48 insertions, 16 deletions
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
index 8202236447..cd95f0b550 100644
--- a/libavcodec/x86/hevc_sao.asm
+++ b/libavcodec/x86/hevc_sao.asm
@@ -45,7 +45,6 @@ SECTION_TEXT
 ;SAO Band Filter
 ;******************************************************************************
 
-%if ARCH_X86_64
 %macro HEVC_SAO_BAND_FILTER_INIT 1
     and leftq, 31
     movd xm0, leftd
@@ -76,17 +75,37 @@ SECTION_TEXT
     SPLATW m7, m7, 3
 %endif
 
+%if ARCH_X86_64
 %if %1 > 8
     mova m13, [pw_mask %+ %1]
 %endif
     pxor m14, m14
+%else ; ARCH_X86_32
+    mova [rsp+mmsize*0], m0
+    mova [rsp+mmsize*1], m1
+    mova [rsp+mmsize*2], m2
+    mova [rsp+mmsize*3], m3
+    mova [rsp+mmsize*4], m4
+    mova [rsp+mmsize*5], m5
+    mova [rsp+mmsize*6], m6
+    pxor m0, m0
+%if %1 > 8
+    mova m1, [pw_mask %+ %1]
+%endif
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define m9 m2
+    %define m8 m3
+%endif ; ARCH
 
 DEFINE_ARGS dst, src, dststride, srcstride, offset, height
     mov heightd, r7m
 %endmacro
 
 %macro HEVC_SAO_BAND_FILTER_COMPUTE 3
     psraw %2, %3, %1-5
+%if ARCH_X86_64
     pcmpeqw m10, %2, m0
     pcmpeqw m11, %2, m1
     pcmpeqw m12, %2, m2
@@ -99,12 +118,26 @@ DEFINE_ARGS dst, src, dststride, srcstride, offset, height
     por m12, %2
     por m10, m12
     paddw %3, m10
+%else ; ARCH_X86_32
+    pcmpeqw m4, %2, [rsp+MMSIZE*0]
+    pcmpeqw m5, %2, [rsp+MMSIZE*1]
+    pcmpeqw m6, %2, [rsp+MMSIZE*2]
+    pcmpeqw %2, [rsp+MMSIZE*3]
+    pand m4, [rsp+MMSIZE*4]
+    pand m5, [rsp+MMSIZE*5]
+    pand m6, [rsp+MMSIZE*6]
+    pand %2, m7
+    por m4, m5
+    por m6, %2
+    por m4, m6
+    paddw %3, m4
+%endif ; ARCH
 %endmacro
 
 ;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 ; int16_t *sao_offset_val, int sao_left_class, int width, int height);
 %macro HEVC_SAO_BAND_FILTER_8 2
-cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, dst, src, dststride, srcstride, offset, left
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
     HEVC_SAO_BAND_FILTER_INIT 8
 
 align 16
@@ -154,7 +187,7 @@ INIT_YMM cpuname
 ;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 ; int16_t *sao_offset_val, int sao_left_class, int width, int height);
 %macro HEVC_SAO_BAND_FILTER_16 3
-cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, dst, src, dststride, srcstride, offset, left
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
     HEVC_SAO_BAND_FILTER_INIT %1
 
 align 16
@@ -253,7 +286,6 @@ HEVC_SAO_BAND_FILTER_16 12, 32, 1
 HEVC_SAO_BAND_FILTER_16 12, 48, 1
 HEVC_SAO_BAND_FILTER_16 12, 64, 2
 %endif
-%endif
 
 ;******************************************************************************
 ;SAO Edge Filter
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index f7b3d0fb46..8f7473d37c 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -714,8 +714,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 
-                SAO_BAND_INIT(8, sse2);
             }
+            SAO_BAND_INIT(8, sse2);
+
             c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
@@ -749,9 +750,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
-
-                SAO_BAND_INIT(8, avx);
             }
+            SAO_BAND_INIT(8, avx);
+
             c->transform_add[1] = ff_hevc_transform_add8_8_avx;
             c->transform_add[2] = ff_hevc_transform_add16_8_avx;
             c->transform_add[3] = ff_hevc_transform_add32_8_avx;
@@ -760,7 +761,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
             if (ARCH_X86_64) {
-                SAO_BAND_INIT(8, avx2);
                 c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
                 c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
                 c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
@@ -845,6 +845,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
                 c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
             }
+            SAO_BAND_INIT(8, avx2);
+
             c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
             c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
             c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
@@ -864,9 +866,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
 
-                SAO_BAND_INIT(10, sse2);
                 SAO_EDGE_INIT(10, sse2);
             }
+            SAO_BAND_INIT(10, sse2);
 
             c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
@@ -897,16 +899,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
-
-                SAO_BAND_INIT(10, avx);
             }
+            SAO_BAND_INIT(10, avx);
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
 
             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
             if (ARCH_X86_64) {
-                SAO_BAND_INIT(10, avx2);
                 c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2;
                 c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2;
                 c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2;
@@ -1054,6 +1054,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
                 c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
             }
+            SAO_BAND_INIT(10, avx2);
 
             c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
             c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
@@ -1071,9 +1072,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
 
-                SAO_BAND_INIT(12, sse2);
                 SAO_EDGE_INIT(12, sse2);
             }
+            SAO_BAND_INIT(12, sse2);
 
             c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
@@ -1100,19 +1101,18 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
-
-                SAO_BAND_INIT(12, avx);
             }
+            SAO_BAND_INIT(12, avx);
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
             if (ARCH_X86_64) {
-                SAO_BAND_INIT(12, avx2);
                 c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_12_avx2;
                 c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_12_avx2;
                 c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_12_avx2;
             }
+            SAO_BAND_INIT(12, avx2);
         }
     }
 }