diff options
author | Mans Rullgard <mans@mansr.com> | 2012-10-19 13:39:11 +0100 |
---|---|---|
committer | Mans Rullgard <mans@mansr.com> | 2012-10-20 01:28:38 +0100 |
commit | 1846ddf0a7870e6862905121d4e9da1c283d6ff6 (patch) | |
tree | 7d3282533e754b9b5c50f165e4836a617a0f7f37 /libavcodec | |
parent | 2f41eaa9c6b5699ed7c3c09b1a39b3322d8e7175 (diff) | |
download | ffmpeg-1846ddf0a7870e6862905121d4e9da1c283d6ff6.tar.gz |
ARM: fix overreads in neon h264 chroma mc
The loops were reading ahead one line, which could end up outside the
buffer for reference blocks at the edge of the picture. Removing
this readahead has no measurable performance impact.
Signed-off-by: Mans Rullgard <mans@mansr.com>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/arm/h264cmc_neon.S | 86 |
1 files changed, 28 insertions, 58 deletions
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S index c7e54605bb..3427e36011 100644 --- a/libavcodec/arm/h264cmc_neon.S +++ b/libavcodec/arm/h264cmc_neon.S @@ -51,24 +51,20 @@ T cmp r7, #0 beq 2f - add r5, r1, r2 - vdup.8 d0, r4 - lsl r4, r2, #1 vdup.8 d1, r12 - vld1.8 {d4, d5}, [r1], r4 + vld1.8 {d4, d5}, [r1], r2 vdup.8 d2, r6 - vld1.8 {d6, d7}, [r5], r4 vdup.8 d3, r7 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 -1: pld [r5] +1: vld1.8 {d6, d7}, [r1], r2 vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 - vld1.8 {d4, d5}, [r1], r4 + vext.8 d7, d6, d7, #1 + vld1.8 {d4, d5}, [r1], r2 vmlal.u8 q8, d6, d2 + pld [r1] vext.8 d5, d4, d5, #1 vmlal.u8 q8, d7, d3 vmull.u8 q9, d6, d0 @@ -76,8 +72,7 @@ T cmp r7, #0 vmlal.u8 q9, d7, d1 vmlal.u8 q9, d4, d2 vmlal.u8 q9, d5, d3 - vld1.8 {d6, d7}, [r5], r4 - pld [r1] + pld [r1, r2] .ifc \codec,h264 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 @@ -92,7 +87,6 @@ T cmp r7, #0 vld1.8 {d21}, [lr,:64], r2 vrhadd.u8 q8, q8, q10 .endif - vext.8 d7, d6, d7, #1 vst1.8 {d16}, [r0,:64], r2 vst1.8 {d17}, [r0,:64], r2 bgt 1b @@ -106,18 +100,15 @@ T cmp r7, #0 beq 4f - add r5, r1, r2 - lsl r4, r2, #1 - vld1.8 {d4}, [r1], r4 - vld1.8 {d6}, [r5], r4 + vld1.8 {d4}, [r1], r2 -3: pld [r5] +3: vld1.8 {d6}, [r1], r2 vmull.u8 q8, d4, d0 vmlal.u8 q8, d6, d1 - vld1.8 {d4}, [r1], r4 + vld1.8 {d4}, [r1], r2 vmull.u8 q9, d6, d0 vmlal.u8 q9, d4, d1 - vld1.8 {d6}, [r5], r4 + pld [r1] .ifc \codec,h264 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 @@ -127,13 +118,13 @@ T cmp r7, #0 vshrn.u16 d16, q8, #6 vshrn.u16 d17, q9, #6 .endif + pld [r1, r2] .ifc \type,avg vld1.8 {d20}, [lr,:64], r2 vld1.8 {d21}, [lr,:64], r2 vrhadd.u8 q8, q8, q10 .endif subs r3, r3, #2 - pld [r1] vst1.8 {d16}, [r0,:64], r2 vst1.8 {d17}, [r0,:64], r2 bgt 3b @@ -144,16 +135,13 @@ T cmp r7, #0 vld1.8 {d6, d7}, [r1], r2 vext.8 d5, d4, d5, #1 vext.8 d7, d6, d7, #1 - -5: pld [r1] + pld [r1] subs r3, r3, #2 vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 - vld1.8 {d4, d5}, [r1], r2 vmull.u8 q9, d6, d0 vmlal.u8 q9, d7, d1 - pld [r1] - vext.8 d5, d4, d5, #1 + pld [r1, r2] .ifc \codec,h264 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 @@ -168,11 +156,9 @@ T cmp r7, #0 vld1.8 {d21}, [lr,:64], r2 vrhadd.u8 q8, q8, q10 .endif - vld1.8 {d6, d7}, [r1], r2 - vext.8 d7, d6, d7, #1 vst1.8 {d16}, [r0,:64], r2 vst1.8 {d17}, [r0,:64], r2 - bgt 5b + bgt 4b pop {r4-r7, pc} endfunc @@ -209,33 +195,29 @@ T cmp r7, #0 beq 2f - add r5, r1, r2 - vdup.8 d0, r4 - lsl r4, r2, #1 vdup.8 d1, r12 - vld1.8 {d4}, [r1], r4 + vld1.8 {d4}, [r1], r2 vdup.8 d2, r6 - vld1.8 {d6}, [r5], r4 vdup.8 d3, r7 vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 vtrn.32 d4, d5 - vtrn.32 d6, d7 vtrn.32 d0, d1 vtrn.32 d2, d3 -1: pld [r5] +1: vld1.8 {d6}, [r1], r2 + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 vmull.u8 q8, d4, d0 vmlal.u8 q8, d6, d2 - vld1.8 {d4}, [r1], r4 + vld1.8 {d4}, [r1], r2 vext.8 d5, d4, d5, #1 vtrn.32 d4, d5 + pld [r1] vmull.u8 q9, d6, d0 vmlal.u8 q9, d4, d2 - vld1.8 {d6}, [r5], r4 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 .ifc \codec,h264 @@ -245,14 +227,12 @@ T cmp r7, #0 vshrn.u16 d16, q8, #6 .endif subs r3, r3, #2 - pld [r1] + pld [r1, r2] .ifc \type,avg vld1.32 {d20[0]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2 vrhadd.u8 d16, d16, d20 .endif - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 vst1.32 {d16[0]}, [r0,:32], r2 vst1.32 {d16[1]}, [r0,:32], r2 bgt 1b @@ -268,18 +248,15 @@ T cmp r7, #0 beq 4f vext.32 d1, d0, d1, #1 - add r5, r1, r2 - lsl r4, r2, #1 - vld1.32 {d4[0]}, [r1], r4 - vld1.32 {d4[1]}, [r5], r4 + vld1.32 {d4[0]}, [r1], r2 -3: pld [r5] +3: vld1.32 {d4[1]}, [r1], r2 vmull.u8 q8, d4, d0 - vld1.32 {d4[0]}, [r1], r4 + vld1.32 {d4[0]}, [r1], r2 vmull.u8 q9, d4, d1 - vld1.32 {d4[1]}, [r5], r4 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 + pld [r1] .ifc \codec,h264 vrshrn.u16 d16, q8, #6 .else @@ -292,7 +269,7 @@ T cmp r7, #0 vrhadd.u8 d16, d16, d20 .endif subs r3, r3, #2 - pld [r1] + pld [r1, r2] vst1.32 {d16[0]}, [r0,:32], r2 vst1.32 {d16[1]}, [r0,:32], r2 bgt 3b @@ -305,13 +282,9 @@ T cmp r7, #0 vext.8 d7, d6, d7, #1 vtrn.32 d4, d5 vtrn.32 d6, d7 - -5: vmull.u8 q8, d4, d0 + vmull.u8 q8, d4, d0 vmull.u8 q9, d6, d0 subs r3, r3, #2 - vld1.8 {d4}, [r1], r2 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 pld [r1] @@ -326,13 +299,10 @@ T cmp r7, #0 vld1.32 {d20[1]}, [lr,:32], r2 vrhadd.u8 d16, d16, d20 .endif - vld1.8 {d6}, [r1], r2 - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 pld [r1] vst1.32 {d16[0]}, [r0,:32], r2 vst1.32 {d16[1]}, [r0,:32], r2 - bgt 5b + bgt 4b pop {r4-r7, pc} endfunc |