diff options
author | Martin Storsjö <martin@martin.st> | 2016-11-22 15:47:17 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2016-11-24 13:39:21 +0200 |
commit | 2f99117f6ff24ce5be2abb9e014cb8b86c2aa0e0 (patch) | |
tree | 1599843f816d435bd4719722011cb9175d648ad7 /libavcodec | |
parent | 2dbe2aa2c2d4f02d2669feae45dee4fc45414813 (diff) | |
download | ffmpeg-2f99117f6ff24ce5be2abb9e014cb8b86c2aa0e0.tar.gz |
aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/aarch64/vp9itxfm_neon.S | 26 |
1 files changed, 15 insertions, 11 deletions
```diff
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 2dc6b7524a..f4194a670f 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -599,9 +599,9 @@ endfunc
 // x1 = unused
 // x2 = src
 // x3 = slice offset
+// x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
-        mov             x9, #32
         movi            v2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         load_clear      \i, x2, x9
@@ -649,8 +649,8 @@ endfunc
 // x1 = dst stride
 // x2 = src (temp buffer)
 // x3 = slice offset
+// x9 = temp buffer stride
 function \txfm\()16_1d_8x16_pass2_neon
-        mov             x9, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         load            \i, x2, x9
 .endr
@@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
         ld1             {v0.8h,v1.8h}, [x10]
 .endif
+        mov             x9, #32
 
 .irp i, 0, 8
         add             x0, sp, #(\i*32)
@@ -882,13 +883,12 @@ endfunc
 // x0 = dst (temp buffer)
 // x1 = unused
 // x2 = src
+// x9 = double input stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1_neon
         ld1             {v0.8h,v1.8h}, [x10]
 
-        // Double stride of the input, since we only read every other line
-        mov             x9, #128
         movi            v4.8h, #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -987,12 +987,13 @@ endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2_neon
         ld1             {v0.8h,v1.8h}, [x10]
 
-        mov             x9, #128
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2], x9
@@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
 
         idct16
 
-        mov             x9, #128
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         st1             {v\i\().8h}, [x2], x9
 .endr
@@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
 
         idct32_odd
 
-        mov             x9, #128
 .macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
         ld1             {v4.8h}, [x2], x9
         ld1             {v5.8h}, [x2], x9
-.if \neg == 0
         add             v4.8h, v4.8h, v\a\().8h
         ld1             {v6.8h}, [x2], x9
         add             v5.8h, v5.8h, v\b\().8h
@@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
         add             v6.8h, v6.8h, v\c\().8h
         add             v7.8h, v7.8h, v\d\().8h
 .else
+        ld1             {v4.8h}, [x2], x7
+        ld1             {v5.8h}, [x2], x7
         sub             v4.8h, v4.8h, v\a\().8h
-        ld1             {v6.8h}, [x2], x9
+        ld1             {v6.8h}, [x2], x7
         sub             v5.8h, v5.8h, v\b\().8h
-        ld1             {v7.8h}, [x2], x9
+        ld1             {v7.8h}, [x2], x7
         sub             v6.8h, v6.8h, v\c\().8h
         sub             v7.8h, v7.8h, v\d\().8h
 .endif
@@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
         load_acc_store  23, 22, 21, 20
         load_acc_store  19, 18, 17, 16
         sub             x2, x2, x9
-        neg             x9, x9
         load_acc_store  16, 17, 18, 19, 1
         load_acc_store  20, 21, 22, 23, 1
         load_acc_store  24, 25, 26, 27, 1
@@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
         mov             x5, x1
         mov             x6, x2
 
+        // Double stride of the input, since we only read every other line
+        mov             x9, #128
+        neg             x7, x9
+
 .irp i, 0, 8, 16, 24
         add             x0, sp, #(\i*64)
         add             x2, x6, #(\i*2)
```