diff options
author | Martin Storsjö <martin@martin.st> | 2017-01-14 13:22:30 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2017-02-24 00:03:09 +0200 |
commit | 575e31e931e4178e9f1e24407503c9b4ec0ef9ba (patch) | |
tree | f1d574cfcc51f393f8a4918929f7776fb97e5318 /libavcodec/arm/vp9dsp_init_arm.c | |
parent | 3bf9c48320f25f3d5557485b0202f22ae60748b0 (diff) | |
download | ffmpeg-575e31e931e4178e9f1e24407503c9b4ec0ef9ba.tar.gz |
arm: vp9lpf: Implement the mix2_44 function with one single filter pass
For this case, with 8 inputs but only changing 4 of them, we can fit
all 16 input pixels into a q register, and still have enough temporary
registers for doing the loop filter.
The wd=8 filters would require too many temporary registers for
processing all 16 pixels at once though.
Before: Cortex A7 A8 A9 A53
vp9_loop_filter_mix2_v_44_16_neon: 289.7 256.2 237.5 181.2
After:
vp9_loop_filter_mix2_v_44_16_neon: 221.2 150.5 177.7 138.0
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/arm/vp9dsp_init_arm.c')
-rw-r--r-- | libavcodec/arm/vp9dsp_init_arm.c | 7 |
1 files changed, 4 insertions, 3 deletions
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c index e99d931674..1ede1708e7 100644 --- a/libavcodec/arm/vp9dsp_init_arm.c +++ b/libavcodec/arm/vp9dsp_init_arm.c @@ -194,6 +194,8 @@ define_loop_filters(8, 8); define_loop_filters(16, 8); define_loop_filters(16, 16); +define_loop_filters(44, 16); + #define lf_mix_fn(dir, wd1, wd2, stridea) \ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, \ ptrdiff_t stride, \ @@ -207,7 +209,6 @@ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, lf_mix_fn(h, wd1, wd2, stride) \ lf_mix_fn(v, wd1, wd2, sizeof(uint8_t)) -lf_mix_fns(4, 4) lf_mix_fns(4, 8) lf_mix_fns(8, 4) lf_mix_fns(8, 8) @@ -227,8 +228,8 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon; dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon; - dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon; - dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon; + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon; + dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon; dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon; dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon; dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon; |