diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-02-18 21:03:02 -0800 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2013-02-19 16:25:50 +0100 |
commit | 1acd7d594c15aa491729c837ad3519d3469e620a (patch) | |
tree | 4a515aa31c7483d8a2fdfa1d4399c9a9d2b29cae /libavcodec/arm/h264idct_neon.S | |
parent | a1f1ca96b42698002920467c0aab9e636893088a (diff) | |
download | ffmpeg-1acd7d594c15aa491729c837ad3519d3469e620a.tar.gz |
h264: integrate clear_blocks calls with IDCT.
The non-intra-PCM branch in hl_decode_mb (simple, 8bpp) goes from 700
to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb
(in the decode_slice loop) goes from 1759 to 1733 cycles on the clip
tested (cathedral), i.e. almost 30 cycles per macroblock faster.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/arm/h264idct_neon.S')
-rw-r--r-- | libavcodec/arm/h264idct_neon.S | 29 |
1 file changed, 23 insertions, 6 deletions
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S index 1b349ced58..fa5b90c23a 100644 --- a/libavcodec/arm/h264idct_neon.S +++ b/libavcodec/arm/h264idct_neon.S @@ -22,9 +22,12 @@ function ff_h264_idct_add_neon, export=1 vld1.64 {d0-d3}, [r1,:128] + vmov.i16 q15, #0 vswp d1, d2 + vst1.16 {q15}, [r1,:128]! vadd.i16 d4, d0, d1 + vst1.16 {q15}, [r1,:128]! vshr.s16 q8, q1, #1 vsub.i16 d5, d0, d1 vadd.i16 d6, d2, d17 @@ -65,11 +68,14 @@ function ff_h264_idct_add_neon, export=1 vst1.32 {d0[1]}, [r0,:32], r2 vst1.32 {d1[0]}, [r0,:32], r2 + sub r1, r1, #32 bx lr endfunc function ff_h264_idct_dc_add_neon, export=1 + mov r3, #0 vld1.16 {d2[],d3[]}, [r1,:16] + strh r3, [r1] vrshr.s16 q1, q1, #6 vld1.32 {d0[0]}, [r0,:32], r2 vld1.32 {d0[1]}, [r0,:32], r2 @@ -148,7 +154,7 @@ function ff_h264_idct_add8_neon, export=1 add r5, r1, #16*4 add r1, r2, #16*32 mov r2, r3 - mov r3, r1 + mov r10, r1 ldr r6, [sp, #32] movrel r7, scan8+16 mov r12, #0 @@ -156,7 +162,7 @@ function ff_h264_idct_add8_neon, export=1 ldr r0, [r5, r12, lsl #2] ldrb r8, [r6, r8] add r0, r0, r4 - add r1, r3, r12, lsl #5 + add r1, r10, r12, lsl #5 cmp r8, #0 ldrsh r8, [r1] iteet ne @@ -180,7 +186,9 @@ endfunc qb .req q14 vshr.s16 q2, q10, #1 vadd.i16 q0, q8, q12 - vld1.16 {q14-q15},[r1,:128]! + vld1.16 {q14-q15},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! vsub.i16 q1, q8, q12 vshr.s16 q3, q14, #1 vsub.i16 q2, q2, q14 @@ -259,9 +267,16 @@ endfunc .endm function ff_h264_idct8_add_neon, export=1 - vld1.16 {q8-q9}, [r1,:128]! - vld1.16 {q10-q11},[r1,:128]! - vld1.16 {q12-q13},[r1,:128]! + vmov.i16 q7, #0 + vld1.16 {q8-q9}, [r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! + vld1.16 {q10-q11},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! + vld1.16 {q12-q13},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! 
idct8x8_cols 0 idct8x8_cols 1 @@ -313,7 +328,9 @@ function ff_h264_idct8_add_neon, export=1 endfunc function ff_h264_idct8_dc_add_neon, export=1 + mov r3, #0 vld1.16 {d30[],d31[]},[r1,:16] + strh r3, [r1] vld1.32 {d0}, [r0,:64], r2 vrshr.s16 q15, q15, #6 vld1.32 {d1}, [r0,:64], r2 |