aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMans Rullgard <mans@mansr.com>2012-10-19 13:39:11 +0100
committerMans Rullgard <mans@mansr.com>2012-10-20 01:28:38 +0100
commit1846ddf0a7870e6862905121d4e9da1c283d6ff6 (patch)
tree7d3282533e754b9b5c50f165e4836a617a0f7f37
parent2f41eaa9c6b5699ed7c3c09b1a39b3322d8e7175 (diff)
downloadffmpeg-1846ddf0a7870e6862905121d4e9da1c283d6ff6.tar.gz
ARM: fix overreads in neon h264 chroma mc
The loops were reading ahead one line, which could end up outside the buffer for reference blocks at the edge of the picture. Removing this readahead has no measurable performance impact. Signed-off-by: Mans Rullgard <mans@mansr.com>
-rw-r--r--libavcodec/arm/h264cmc_neon.S86
1 files changed, 28 insertions, 58 deletions
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index c7e54605bb..3427e36011 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -51,24 +51,20 @@ T cmp r7, #0
beq 2f
- add r5, r1, r2
-
vdup.8 d0, r4
- lsl r4, r2, #1
vdup.8 d1, r12
- vld1.8 {d4, d5}, [r1], r4
+ vld1.8 {d4, d5}, [r1], r2
vdup.8 d2, r6
- vld1.8 {d6, d7}, [r5], r4
vdup.8 d3, r7
-
vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
-1: pld [r5]
+1: vld1.8 {d6, d7}, [r1], r2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
- vld1.8 {d4, d5}, [r1], r4
+ vext.8 d7, d6, d7, #1
+ vld1.8 {d4, d5}, [r1], r2
vmlal.u8 q8, d6, d2
+ pld [r1]
vext.8 d5, d4, d5, #1
vmlal.u8 q8, d7, d3
vmull.u8 q9, d6, d0
@@ -76,8 +72,7 @@ T cmp r7, #0
vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3
- vld1.8 {d6, d7}, [r5], r4
- pld [r1]
+ pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
@@ -92,7 +87,6 @@ T cmp r7, #0
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
- vext.8 d7, d6, d7, #1
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 1b
@@ -106,18 +100,15 @@ T cmp r7, #0
beq 4f
- add r5, r1, r2
- lsl r4, r2, #1
- vld1.8 {d4}, [r1], r4
- vld1.8 {d6}, [r5], r4
+ vld1.8 {d4}, [r1], r2
-3: pld [r5]
+3: vld1.8 {d6}, [r1], r2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d1
- vld1.8 {d4}, [r1], r4
+ vld1.8 {d4}, [r1], r2
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1
- vld1.8 {d6}, [r5], r4
+ pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
@@ -127,13 +118,13 @@ T cmp r7, #0
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
+ pld [r1, r2]
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
subs r3, r3, #2
- pld [r1]
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 3b
@@ -144,16 +135,13 @@ T cmp r7, #0
vld1.8 {d6, d7}, [r1], r2
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
-
-5: pld [r1]
+ pld [r1]
subs r3, r3, #2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
- vld1.8 {d4, d5}, [r1], r2
vmull.u8 q9, d6, d0
vmlal.u8 q9, d7, d1
- pld [r1]
- vext.8 d5, d4, d5, #1
+ pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
@@ -168,11 +156,9 @@ T cmp r7, #0
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
- vld1.8 {d6, d7}, [r1], r2
- vext.8 d7, d6, d7, #1
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
- bgt 5b
+ bgt 4b
pop {r4-r7, pc}
endfunc
@@ -209,33 +195,29 @@ T cmp r7, #0
beq 2f
- add r5, r1, r2
-
vdup.8 d0, r4
- lsl r4, r2, #1
vdup.8 d1, r12
- vld1.8 {d4}, [r1], r4
+ vld1.8 {d4}, [r1], r2
vdup.8 d2, r6
- vld1.8 {d6}, [r5], r4
vdup.8 d3, r7
vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
vtrn.32 d4, d5
- vtrn.32 d6, d7
vtrn.32 d0, d1
vtrn.32 d2, d3
-1: pld [r5]
+1: vld1.8 {d6}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d2
- vld1.8 {d4}, [r1], r4
+ vld1.8 {d4}, [r1], r2
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5
+ pld [r1]
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d2
- vld1.8 {d6}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
.ifc \codec,h264
@@ -245,14 +227,12 @@ T cmp r7, #0
vshrn.u16 d16, q8, #6
.endif
subs r3, r3, #2
- pld [r1]
+ pld [r1, r2]
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
- vext.8 d7, d6, d7, #1
- vtrn.32 d6, d7
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 1b
@@ -268,18 +248,15 @@ T cmp r7, #0
beq 4f
vext.32 d1, d0, d1, #1
- add r5, r1, r2
- lsl r4, r2, #1
- vld1.32 {d4[0]}, [r1], r4
- vld1.32 {d4[1]}, [r5], r4
+ vld1.32 {d4[0]}, [r1], r2
-3: pld [r5]
+3: vld1.32 {d4[1]}, [r1], r2
vmull.u8 q8, d4, d0
- vld1.32 {d4[0]}, [r1], r4
+ vld1.32 {d4[0]}, [r1], r2
vmull.u8 q9, d4, d1
- vld1.32 {d4[1]}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
+ pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
@@ -292,7 +269,7 @@ T cmp r7, #0
vrhadd.u8 d16, d16, d20
.endif
subs r3, r3, #2
- pld [r1]
+ pld [r1, r2]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 3b
@@ -305,13 +282,9 @@ T cmp r7, #0
vext.8 d7, d6, d7, #1
vtrn.32 d4, d5
vtrn.32 d6, d7
-
-5: vmull.u8 q8, d4, d0
+ vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
subs r3, r3, #2
- vld1.8 {d4}, [r1], r2
- vext.8 d5, d4, d5, #1
- vtrn.32 d4, d5
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r1]
@@ -326,13 +299,10 @@ T cmp r7, #0
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
- vld1.8 {d6}, [r1], r2
- vext.8 d7, d6, d7, #1
- vtrn.32 d6, d7
pld [r1]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
- bgt 5b
+ bgt 4b
pop {r4-r7, pc}
endfunc