aarch64: hevc_idct: Fix overflows in idct_dc

This is marginally slower, but correct for all input values.
The previous implementation failed with certain input seeds, e.g.
"checkasm --test=hevc_idct 98".

Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit f27e3ccf06ee19935d160164ca4a02f28cfc2a27)
author: Martin Storsjö <martin@martin.st> 2021-05-17 12:48:03 +0300
committer: Martin Storsjö <martin@martin.st> 2021-05-22 22:33:20 +0300
commit: c813f5e3436b5ba40b105cdaaaa7b1184baabde7 (patch)
tree: 2249ec01469d36c991001bf625912b600ec9c77b
parent: f7468a9c4037fd406847af4348c0deb2d521d0fc (diff)
download: ffmpeg-c813f5e3436b5ba40b105cdaaaa7b1184baabde7.tar.gz
1 files changed, 5 insertions, 6 deletions
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 28c11e632c..0869431294 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -573,14 +573,13 @@ idct_16x16 10
 // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
 .macro idct_dc size, bitdepth
 function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
-        movi          v1.8h,  #((1 << (14 - \bitdepth))+1)
         ld1r         {v4.8h}, [x0]
-        add           v4.8h,  v4.8h,  v1.8h
-        sshr          v0.8h,  v4.8h,  #(15 - \bitdepth)
-        sshr          v1.8h,  v4.8h,  #(15 - \bitdepth)
+        srshr         v4.8h,  v4.8h,  #1
+        srshr         v0.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr         v1.8h,  v4.8h,  #(14 - \bitdepth)
 .if \size > 4
-        sshr          v2.8h,  v4.8h,  #(15 - \bitdepth)
-        sshr          v3.8h,  v4.8h,  #(15 - \bitdepth)
+        srshr         v2.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr         v3.8h,  v4.8h,  #(14 - \bitdepth)
 .if \size > 16 /* dc 32x32 */
         mov              x2,  #4
 1:
author	Martin Storsjö <martin@martin.st>	2021-05-17 12:48:03 +0300
committer	Martin Storsjö <martin@martin.st>	2021-05-22 22:33:20 +0300
commit	c813f5e3436b5ba40b105cdaaaa7b1184baabde7 (patch)
tree	2249ec01469d36c991001bf625912b600ec9c77b
parent	f7468a9c4037fd406847af4348c0deb2d521d0fc (diff)
download	ffmpeg-c813f5e3436b5ba40b105cdaaaa7b1184baabde7.tar.gz