aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/arm/vp9itxfm_neon.S
diff options
context:
space:
mode:
author: Martin Storsjö <martin@martin.st> 2017-01-04 13:08:51 +0200
committer: Martin Storsjö <martin@martin.st> 2017-02-11 00:31:52 +0200
commit: a76bf8cf1277ef6feb1580b578f5e6ca327e713c (patch)
tree: 6962dd26690bae991aa4dc8e514a20631cd1e98c /libavcodec/arm/vp9itxfm_neon.S
parent: 388e0d2515bc6bbc9d0c9af1d230bd16cf945fe7 (diff)
download: ffmpeg-a76bf8cf1277ef6feb1580b578f5e6ca327e713c.tar.gz
arm: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling
This work is sponsored by, and copyright, Google.

Before:                              Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub1_add_neon:     273.0    189.5    211.7    235.8
vp9_inv_dct_dct_32x32_sub1_add_neon:     752.0    459.2    862.2    553.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon:     226.5    145.0    225.1    171.8
vp9_inv_dct_dct_32x32_sub1_add_neon:     721.2    415.7    727.6    475.0

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/arm/vp9itxfm_neon.S')
-rw-r--r-- libavcodec/arm/vp9itxfm_neon.S | 54
1 files changed, 36 insertions, 18 deletions
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 3d0b0fab2e..8dc4bbfa55 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -542,16 +542,23 @@ function idct16x16_dc_add_neon
vrshr.s16 q8, q8, #6
+ mov r3, r0
mov r12, #16
1:
@ Loop to add the constant from q8 into all 16x16 outputs
- vld1.8 {q3}, [r0,:128]
- vaddw.u8 q10, q8, d6
- vaddw.u8 q11, q8, d7
- vqmovun.s16 d6, q10
- vqmovun.s16 d7, q11
- vst1.8 {q3}, [r0,:128], r1
- subs r12, r12, #1
+ subs r12, r12, #2
+ vld1.8 {q2}, [r0,:128], r1
+ vaddw.u8 q10, q8, d4
+ vld1.8 {q3}, [r0,:128], r1
+ vaddw.u8 q11, q8, d5
+ vaddw.u8 q12, q8, d6
+ vaddw.u8 q13, q8, d7
+ vqmovun.s16 d4, q10
+ vqmovun.s16 d5, q11
+ vqmovun.s16 d6, q12
+ vst1.8 {q2}, [r3,:128], r1
+ vqmovun.s16 d7, q13
+ vst1.8 {q3}, [r3,:128], r1
bne 1b
bx lr
@@ -1147,20 +1154,31 @@ function idct32x32_dc_add_neon
vrshr.s16 q8, q8, #6
+ mov r3, r0
mov r12, #32
1:
@ Loop to add the constant from q8 into all 32x32 outputs
- vld1.8 {q2-q3}, [r0,:128]
- vaddw.u8 q10, q8, d4
- vaddw.u8 q11, q8, d5
- vaddw.u8 q12, q8, d6
- vaddw.u8 q13, q8, d7
- vqmovun.s16 d4, q10
- vqmovun.s16 d5, q11
- vqmovun.s16 d6, q12
- vqmovun.s16 d7, q13
- vst1.8 {q2-q3}, [r0,:128], r1
- subs r12, r12, #1
+ subs r12, r12, #2
+ vld1.8 {q0-q1}, [r0,:128], r1
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vld1.8 {q2-q3}, [r0,:128], r1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vaddw.u8 q13, q8, d4
+ vaddw.u8 q14, q8, d5
+ vaddw.u8 q15, q8, d6
+ vqmovun.s16 d0, q9
+ vaddw.u8 q9, q8, d7
+ vqmovun.s16 d1, q10
+ vqmovun.s16 d2, q11
+ vqmovun.s16 d3, q12
+ vqmovun.s16 d4, q13
+ vqmovun.s16 d5, q14
+ vst1.8 {q0-q1}, [r3,:128], r1
+ vqmovun.s16 d6, q15
+ vqmovun.s16 d7, q9
+ vst1.8 {q2-q3}, [r3,:128], r1
bne 1b
bx lr