diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2013-04-09 22:16:36 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2013-04-19 13:19:45 +0200 |
commit | 76c7277385120741914168d02b57a7a1acf87a9b (patch) | |
tree | 1a0821b6a1b2aae976515ed3423a05ac9cd245f5 /libavcodec/x86/sbrdsp_init.c | |
parent | 380cfce2b2138a0513f7c054134458b3b4c92fd4 (diff) | |
download | ffmpeg-76c7277385120741914168d02b57a7a1acf87a9b.tar.gz |
x86: sbrdsp: implement SSE2 hf_apply_noise
233 to 105 cycles on Arrandale and Win64.
Replacing the multiplication by s_m[m] with a pand and a pxor using
appropriate vectors is slower. Unrolling is a 15-cycle win.
An SSE version was 4 cycles slower.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/sbrdsp_init.c')
-rw-r--r-- | libavcodec/x86/sbrdsp_init.c | 17 |
1 file changed, 17 insertions, 0 deletions
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index f97479e9d7..0bc4a6183e 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -38,6 +38,19 @@ void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_pre_shuffle_sse2(float *z);
 
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+
 av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
     int mm_flags = av_get_cpu_flags();
@@ -55,5 +68,9 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
     if (EXTERNAL_SSE2(mm_flags)) {
         s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
         s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
     }
 }