diff options
author | Ramiro Polla <ramiro.polla@gmail.com> | 2024-09-24 22:43:22 +0200 |
---|---|---|
committer | Ramiro Polla <ramiro.polla@gmail.com> | 2024-12-05 21:10:29 +0100 |
commit | 58bcdeb7425ed7b74f1aac20099cb3c025e6ce8d (patch) | |
tree | 0c6ed45501dcfaa7fa5caf1db0dd55c92af24496 | |
parent | 2d1358a84d7096bac98433bbfae46f7ffd4efbd0 (diff) | |
download | ffmpeg-58bcdeb7425ed7b74f1aac20099cb3c025e6ce8d.tar.gz |
swscale/aarch64/range_convert: saturate output instead of limiting input
aarch64 A55:
chrRangeFromJpeg8_1920_c: 28836.2 (1.00x)
chrRangeFromJpeg8_1920_neon: 5312.6 (5.43x) 5313.9 (5.43x)
chrRangeToJpeg8_1920_c: 44196.2 (1.00x)
chrRangeToJpeg8_1920_neon: 6034.6 (7.32x) 5551.3 (7.96x)
lumRangeFromJpeg8_1920_c: 15388.5 (1.00x)
lumRangeFromJpeg8_1920_neon: 3150.7 (4.88x) 3152.3 (4.88x)
lumRangeToJpeg8_1920_c: 23069.7 (1.00x)
lumRangeToJpeg8_1920_neon: 3873.2 (5.96x) 3628.7 (6.36x)
aarch64 A76:
chrRangeFromJpeg8_1920_c: 6334.7 (1.00x)
chrRangeFromJpeg8_1920_neon: 2264.5 (2.80x) 2344.5 (2.70x)
chrRangeToJpeg8_1920_c: 11474.5 (1.00x)
chrRangeToJpeg8_1920_neon: 2646.5 (4.34x) 2824.2 (4.06x)
lumRangeFromJpeg8_1920_c: 4453.2 (1.00x)
lumRangeFromJpeg8_1920_neon: 1104.8 (4.03x) 1104.5 (4.03x)
lumRangeToJpeg8_1920_c: 6645.0 (1.00x)
lumRangeToJpeg8_1920_neon: 1310.5 (5.07x) 1329.8 (5.00x)
-rw-r--r-- | libswscale/aarch64/range_convert_neon.S | 39 |
-rw-r--r-- | libswscale/aarch64/swscale.c | 5 |
2 files changed, 18 insertions, 26 deletions
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S index 30991ab2a6..2f418adb24 100644 --- a/libswscale/aarch64/range_convert_neon.S +++ b/libswscale/aarch64/range_convert_neon.S @@ -20,12 +20,8 @@ #include "libavutil/aarch64/asm.S" -.macro lumConvertRange name, max, mult, offset, shift +.macro lumConvertRange name, fromto, mult, offset, shift function ff_\name, export=1 -.if \max != 0 - mov w3, #\max - dup v24.8h, w3 -.endif mov w3, #\mult dup v25.4s, w3 movz w3, #(\offset & 0xffff) @@ -33,17 +29,19 @@ function ff_\name, export=1 dup v26.4s, w3 1: ld1 {v0.8h}, [x0] -.if \max != 0 - smin v0.8h, v0.8h, v24.8h -.endif mov v16.16b, v26.16b mov v18.16b, v26.16b sxtl v20.4s, v0.4h sxtl2 v22.4s, v0.8h mla v16.4s, v20.4s, v25.4s mla v18.4s, v22.4s, v25.4s +.ifc \fromto, To + sqshrn v0.4h, v16.4s, #\shift + sqshrn2 v0.8h, v18.4s, #\shift +.else shrn v0.4h, v16.4s, #\shift shrn2 v0.8h, v18.4s, #\shift +.endif subs w1, w1, #8 st1 {v0.8h}, [x0], #16 b.gt 1b @@ -51,12 +49,8 @@ function ff_\name, export=1 endfunc .endm -.macro chrConvertRange name, max, mult, offset, shift +.macro chrConvertRange name, fromto, mult, offset, shift function ff_\name, export=1 -.if \max != 0 - mov w3, #\max - dup v24.8h, w3 -.endif mov w3, #\mult dup v25.4s, w3 movz w3, #(\offset & 0xffff) @@ -65,10 +59,6 @@ function ff_\name, export=1 1: ld1 {v0.8h}, [x0] ld1 {v1.8h}, [x1] -.if \max != 0 - smin v0.8h, v0.8h, v24.8h - smin v1.8h, v1.8h, v24.8h -.endif mov v16.16b, v26.16b mov v17.16b, v26.16b mov v18.16b, v26.16b @@ -81,10 +71,17 @@ function ff_\name, export=1 mla v17.4s, v21.4s, v25.4s mla v18.4s, v22.4s, v25.4s mla v19.4s, v23.4s, v25.4s +.ifc \fromto, To + sqshrn v0.4h, v16.4s, #\shift + sqshrn v1.4h, v17.4s, #\shift + sqshrn2 v0.8h, v18.4s, #\shift + sqshrn2 v1.8h, v19.4s, #\shift +.else shrn v0.4h, v16.4s, #\shift shrn v1.4h, v17.4s, #\shift shrn2 v0.8h, v18.4s, #\shift shrn2 v1.8h, v19.4s, #\shift +.endif subs w2, w2, #8 st1 {v0.8h}, [x0], #16 st1 
{v1.8h}, [x1], #16 @@ -93,7 +90,7 @@ function ff_\name, export=1 endfunc .endm -lumConvertRange lumRangeToJpeg_neon, 30189, 19077, -39057361, 14 -chrConvertRange chrRangeToJpeg_neon, 30775, 4663, -9289992, 12 -lumConvertRange lumRangeFromJpeg_neon, 0, 14071, 33561947, 14 -chrConvertRange chrRangeFromJpeg_neon, 0, 1799, 4081085, 11 +lumConvertRange lumRangeToJpeg_neon, To, 19077, -39057361, 14 +chrConvertRange chrRangeToJpeg_neon, To, 4663, -9289992, 12 +lumConvertRange lumRangeFromJpeg_neon, From, 14071, 33561947, 14 +chrConvertRange chrRangeFromJpeg_neon, From, 1799, 4081085, 11 diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index 1fce77df26..5173359e09 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -225,10 +225,6 @@ void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width); av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) { - /* This code is currently disabled because of changes in the base - * implementation of these functions. This code should be enabled - * again once those changes are ported to this architecture. */ -#if 0 int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { @@ -242,7 +238,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) } } } -#endif } av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) |