diff options
author | Mans Rullgard <mans@mansr.com> | 2011-12-08 04:26:12 +0000 |
---|---|---|
committer | Mans Rullgard <mans@mansr.com> | 2011-12-14 11:26:30 +0000 |
commit | 71ce76027d9df47f4ac80c2c114b47d51243ed8e (patch) | |
tree | 1e597e11e7e258bdf490bb689c733e992d959bf6 | |
parent | d8edf1b515ae9fbcea2103305241d130c16e1003 (diff) | |
download | ffmpeg-71ce76027d9df47f4ac80c2c114b47d51243ed8e.tar.gz |
rv40: NEON optimised loop filter strength selection
Signed-off-by: Mans Rullgard <mans@mansr.com>
-rw-r--r-- | libavcodec/arm/asm.S | 6 | ||||
-rw-r--r-- | libavcodec/arm/rv40dsp_init_neon.c | 10 | ||||
-rw-r--r-- | libavcodec/arm/rv40dsp_neon.S | 86 |
3 files changed, 102 insertions, 0 deletions
diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S index a124918c1c..aaf497e8c2 100644 --- a/libavcodec/arm/asm.S +++ b/libavcodec/arm/asm.S @@ -113,6 +113,12 @@ T add \rn, \rn, \rm T ldr \rt, [\rn] .endm +.macro ldr_dpre rt, rn, rm:vararg +A ldr \rt, [\rn, -\rm]! +T sub \rn, \rn, \rm +T ldr \rt, [\rn] +.endm + .macro ldr_post rt, rn, rm:vararg A ldr \rt, [\rn], \rm T ldr \rt, [\rn] diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c index 36d75e6fd8..59dddb6605 100644 --- a/libavcodec/arm/rv40dsp_init_neon.c +++ b/libavcodec/arm/rv40dsp_init_neon.c @@ -54,6 +54,13 @@ void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int); void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int); +int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, int stride, + int beta, int beta2, int edge, + int *p1, int *q1); +int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, int stride, + int beta, int beta2, int edge, + int *p1, int *q1); + void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) { c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon; @@ -116,4 +123,7 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon; c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon; + + c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon; + c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon; } diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S index 07ba8428c1..a4313d89f9 100644 --- a/libavcodec/arm/rv40dsp_neon.S +++ b/libavcodec/arm/rv40dsp_neon.S @@ -722,3 +722,89 @@ function ff_rv40_weight_func_8_neon, export=1 bne 1b bx lr endfunc + +function ff_rv40_h_loop_filter_strength_neon, export=1 + pkhbt r2, r3, r2, lsl #18 + + ldr r3, [r0] + ldr_dpre r12, r0, r1 + teq r3, r12 + beq 1f + + sub r0, r0, r1, lsl #1 + + vld1.32 {d4[]}, [r0,:32], r1 @ -3 + vld1.32 {d0[]}, [r0,:32], r1 @ -2 + vld1.32 {d4[1]}, [r0,:32], r1 @ -1 + vld1.32 {d5[]}, [r0,:32], r1 @ 0 + vld1.32 {d1[]}, [r0,:32], r1 @ 1 + vld1.32 {d5[0]}, [r0,:32], r1 @ 2 + + vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1 + vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0 + vdup.32 d30, r2 @ beta2, beta << 2 + vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1 + vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0 + vabd.u16 d16, d18, d16 + vclt.u16 d16, d16, d30 + + ldrd r2, r3, [sp, #4] + vmovl.u16 q12, d16 + vtrn.16 d16, d17 + vshr.u32 q12, q12, #15 + ldr r0, [sp] + vst1.32 {d24[1]}, [r2,:32] + vst1.32 {d25[1]}, [r3,:32] + + cmp r0, #0 + it eq + bxeq lr + + vand d18, d16, d17 + vtrn.32 d18, d19 + vand d18, d18, d19 + vmov.u16 r0, d18[0] + bx lr +1: + ldrd r2, r3, [sp, #4] + mov r0, #0 + str r0, [r2] + str r0, [r3] + bx lr +endfunc + +function ff_rv40_v_loop_filter_strength_neon, export=1 + sub r0, r0, #3 + pkhbt r2, r3, r2, lsl #18 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d3}, [r0], r1 + + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vdup.32 q15, r2 + vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2 + vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2 + vabd.u16 q0, q1, q0 + vclt.u16 q0, q0, q15 + + ldrd r2, r3, [sp, #4] + vmovl.u16 q1, d0 + vext.16 d1, d0, d1, #3 + vshr.u32 q1, q1, #15 + ldr r0, [sp] + vst1.32 {d2[1]}, [r2,:32] + vst1.32 {d3[1]}, [r3,:32] + + cmp r0, #0 + it eq + bxeq lr + + vand d0, d0, d1 + vtrn.16 d0, d1 + vand d0, d0, d1 + vmov.u16 r0, d0[0] + bx lr +endfunc |