about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Martin Storsjö <martin@martin.st> 2017-01-09 00:04:19 +0200
committer Martin Storsjö <martin@martin.st> 2017-03-19 22:53:32 +0200
commit 21c89f3a26bb1331381b90e653277585447cfbb3 (patch)
tree e29745bb6f8296b5a7952721a48822ab645003b2
parent 70317b25aa35c0907720e4d2b7686408588c07aa (diff)
download ffmpeg-21c89f3a26bb1331381b90e653277585447cfbb3.tar.gz
arm/aarch64: vp9: Fix vertical alignment
Align the second/third operands as they usually are.

Due to the wildly varying sizes of the written-out operands in aarch64 assembly, the column alignment is usually not as clear as in arm assembly.

This is cherry-picked from libav commit 7995ebfad12002033c73feed422a1cfc62081e8f.

Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- libavcodec/aarch64/vp9itxfm_neon.S 36
-rw-r--r-- libavcodec/arm/vp9itxfm_neon.S 14
-rw-r--r-- libavcodec/arm/vp9lpf_neon.S 2
3 files changed, 26 insertions, 26 deletions
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 3e5da0880c..b12890f0db 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -380,7 +380,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
movrel x4, idct_coeffs
.else
- movrel x4, iadst8_coeffs
+ movrel x4, iadst8_coeffs
ld1 {v1.8h}, [x4], #16
.endif
ld1 {v0.8h}, [x4]
@@ -480,23 +480,23 @@ itxfm_func8x8 iadst, iadst
function idct16x16_dc_add_neon
- movrel x4, idct_coeffs
+ movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
- movi v1.4h, #0
+ movi v1.4h, #0
ld1 {v2.h}[0], [x2]
- smull v2.4s, v2.4h, v0.h[0]
- rshrn v2.4h, v2.4s, #14
- smull v2.4s, v2.4h, v0.h[0]
- rshrn v2.4h, v2.4s, #14
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
dup v2.8h, v2.h[0]
st1 {v1.h}[0], [x2]
- srshr v2.8h, v2.8h, #6
+ srshr v2.8h, v2.8h, #6
- mov x3, x0
- mov x4, #16
+ mov x3, x0
+ mov x4, #16
1:
// Loop to add the constant from v2 into all 16x16 outputs
subs x4, x4, #2
@@ -869,7 +869,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1,idct
ld1 {v0.8h,v1.8h}, [x10]
.endif
- mov x9, #32
+ mov x9, #32
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #10
@@ -1046,10 +1046,10 @@ idct16_partial quarter
idct16_partial half
function idct32x32_dc_add_neon
- movrel x4, idct_coeffs
+ movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
- movi v1.4h, #0
+ movi v1.4h, #0
ld1 {v2.h}[0], [x2]
smull v2.4s, v2.4h, v0.h[0]
@@ -1059,10 +1059,10 @@ function idct32x32_dc_add_neon
dup v2.8h, v2.h[0]
st1 {v1.h}[0], [x2]
- srshr v0.8h, v2.8h, #6
+ srshr v0.8h, v2.8h, #6
- mov x3, x0
- mov x4, #32
+ mov x3, x0
+ mov x4, #32
1:
// Loop to add the constant v0 into all 32x32 outputs
subs x4, x4, #2
@@ -1230,7 +1230,7 @@ endfunc
// x9 = double input stride
function idct32_1d_8x32_pass1\suffix\()_neon
mov x14, x30
- movi v2.8h, #0
+ movi v2.8h, #0
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
@@ -1295,7 +1295,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
.endif
add x2, x2, #64
- movi v2.8h, #0
+ movi v2.8h, #0
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 6d4d765c28..6c09922cae 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -530,7 +530,7 @@ function idct16x16_dc_add_neon
movrel r12, idct_coeffs
vld1.16 {d0}, [r12,:64]
- vmov.i16 q2, #0
+ vmov.i16 q2, #0
vld1.16 {d16[]}, [r2,:16]
vmull.s16 q8, d16, d0[0]
@@ -793,7 +793,7 @@ function \txfm\()16_1d_4x16_pass1_neon
push {lr}
mov r12, #32
- vmov.s16 q2, #0
+ vmov.s16 q2, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
@@ -1142,7 +1142,7 @@ function idct32x32_dc_add_neon
movrel r12, idct_coeffs
vld1.16 {d0}, [r12,:64]
- vmov.i16 q2, #0
+ vmov.i16 q2, #0
vld1.16 {d16[]}, [r2,:16]
vmull.s16 q8, d16, d0[0]
@@ -1330,7 +1330,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
@ Double stride of the input, since we only read every other line
mov r12, #128
- vmov.s16 d4, #0
+ vmov.s16 d4, #0
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
.ifb \suffix
@@ -1394,7 +1394,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
.endif
add r2, r2, #64
- vmov.s16 d8, #0
+ vmov.s16 d8, #0
@ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1533,9 +1533,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon
.endif
vld1.32 {d12[]}, [r0,:32], r1
vld1.32 {d12[1]}, [r0,:32], r1
- vrshr.s16 q4, q4, #6
+ vrshr.s16 q4, q4, #6
vld1.32 {d13[]}, [r0,:32], r1
- vrshr.s16 q5, q5, #6
+ vrshr.s16 q5, q5, #6
vld1.32 {d13[1]}, [r0,:32], r1
sub r0, r0, r1, lsl #2
vaddw.u8 q4, q4, d12
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index 8d44d58f32..4b3608064a 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -828,7 +828,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
endfunc
function vp9_loop_filter_h_16_neon
- sub r12, r0, #8
+ sub r12, r0, #8
vld1.8 {d16}, [r12,:64], r1
vld1.8 {d24}, [r0, :64], r1
vld1.8 {d17}, [r12,:64], r1