aarch64: vp9itxfm: Use a single lane ld1 instead of ld1r where possible

The ld1r is a leftover from the arm version, where this trick is beneficial on some cores.

Use a single-lane load where we don't need the semantics of ld1r.

Signed-off-by: Martin Storsjö <martin@martin.st>
author: Martin Storsjö <martin@martin.st> 2017-01-03 14:55:46 +0200
committer: Martin Storsjö <martin@martin.st> 2017-02-09 23:56:54 +0200
commit: ed8d293306e12c9b79022d37d39f48825ce7f2fa (patch)
tree: 0a148527a05de27bf481002b295529b8921734f2 /libavcodec
parent: 4da4b2b87f08a1331650c7e36eb7d4029a160776 (diff)
download: ffmpeg-ed8d293306e12c9b79022d37d39f48825ce7f2fa.tar.gz
1 files changed, 8 insertions, 8 deletions
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index a9c7626e65..e7b88364f1 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -255,7 +255,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
         cmp             w3,  #1
         b.ne            1f
         // DC-only for idct/idct
-        ld1r            {v2.4h},  [x2]
+        ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h, v0.h[0]
         rshrn           v2.4h,  v2.4s, #14
         smull           v2.4s,  v2.4h, v0.h[0]
@@ -287,8 +287,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
 
         \txfm2\()4      v4,  v5,  v6,  v7
 2:
-        ld1r            {v0.2s},   [x0], x1
-        ld1r            {v1.2s},   [x0], x1
+        ld1             {v0.s}[0],   [x0], x1
+        ld1             {v1.s}[0],   [x0], x1
 .ifnc \txfm1,iwht
         srshr           v4.4h,  v4.4h,  #4
         srshr           v5.4h,  v5.4h,  #4
@@ -297,8 +297,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
 .endif
         uaddw           v4.8h,  v4.8h,  v0.8b
         uaddw           v5.8h,  v5.8h,  v1.8b
-        ld1r            {v2.2s},   [x0], x1
-        ld1r            {v3.2s},   [x0], x1
+        ld1             {v2.s}[0],   [x0], x1
+        ld1             {v3.s}[0],   [x0], x1
         sqxtun          v0.8b,  v4.8h
         sqxtun          v1.8b,  v5.8h
         sub             x0,  x0,  x1, lsl #2
@@ -394,7 +394,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
         cmp             w3,  #1
         b.ne            1f
         // DC-only for idct/idct
-        ld1r            {v2.4h},  [x2]
+        ld1             {v2.h}[0],  [x2]
         smull           v2.4s,  v2.4h, v0.h[0]
         rshrn           v2.4h,  v2.4s, #14
         smull           v2.4s,  v2.4h, v0.h[0]
@@ -485,7 +485,7 @@ function idct16x16_dc_add_neon
 
         movi            v1.4h, #0
 
-        ld1r            {v2.4h}, [x2]
+        ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h, v0.h[0]
         rshrn           v2.4h,  v2.4s, #14
         smull           v2.4s,  v2.4h, v0.h[0]
@@ -1044,7 +1044,7 @@ function idct32x32_dc_add_neon
 
         movi            v1.4h, #0
 
-        ld1r            {v2.4h}, [x2]
+        ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h,  v0.h[0]
         rshrn           v2.4h,  v2.4s,  #14
         smull           v2.4s,  v2.4h,  v0.h[0]
author	Martin Storsjö <martin@martin.st>	2017-01-03 14:55:46 +0200
committer	Martin Storsjö <martin@martin.st>	2017-02-09 23:56:54 +0200
commit	ed8d293306e12c9b79022d37d39f48825ce7f2fa (patch)
tree	0a148527a05de27bf481002b295529b8921734f2 /libavcodec
parent	4da4b2b87f08a1331650c7e36eb7d4029a160776 (diff)
download	ffmpeg-ed8d293306e12c9b79022d37d39f48825ce7f2fa.tar.gz