diff options
author | Hubert Mazur <hum@semihalf.com> | 2022-09-08 11:25:03 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2022-09-09 10:19:46 +0300 |
commit | 200f5e578f2fbf70a966f08257e0500a6f1ddd6c (patch) | |
tree | 1a29c01c7f02071726a6a3d248a9761e1fb0cb3b | |
parent | a2fd553dd3079601249e56f6aa1a6fe711d0663c (diff) | |
download | ffmpeg-200f5e578f2fbf70a966f08257e0500a6f1ddd6c.tar.gz |
lavc/aarch64: Add neon implementation for vsad16
Provide optimized implementation of vsad16 function for arm64.
Performance comparison tests are shown below.
- vsad_0_c: 285.2
- vsad_0_neon: 39.5
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Co-authored-by: Martin Storsjö <martin@martin.st>
Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- | libavcodec/aarch64/me_cmp_init_aarch64.c | 5 | ||||
-rw-r--r-- | libavcodec/aarch64/me_cmp_neon.S | 65 |
2 files changed, 70 insertions, 0 deletions
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index fb7c3f5059..ddc5d05611 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -41,6 +41,9 @@ int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
 int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
               ptrdiff_t stride, int h);
 
+int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -57,5 +60,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->sse[0] = sse16_neon;
         c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
+
+        c->vsad[0] = vsad16_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4198985c6c..1d0b166d69 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -584,3 +584,68 @@ function sse4_neon, export=1
 
         ret
 endfunc
+
+function vsad16_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // w0 (result)  sum over rows y = 1 .. h-1 and bytes x = 0 .. 15 of abs(d[y-1][x] - d[y][x]), with d = pix1 - pix2
+        ld1             {v0.16b}, [x1], x3      // Load the first 16-byte row of pix1; x1 advances by stride
+        ld1             {v1.16b}, [x2], x3      // Load the first 16-byte row of pix2; x2 advances by stride
+
+        sub             w4, w4, #1              // h - 1 row-to-row comparisons remain
+        movi            v16.8h, #0              // Clear the 16-bit accumulator lanes. NOTE(review): each lane gains up to 2 * 510 per row, so it would wrap for h > 65 - presumably h is much smaller; confirm
+
+        cmp             w4, #3                  // Enough rows left for one pass of the 3-row loop?
+        usubl           v31.8h, v0.8b, v1.8b    // Previous-row difference pix1 - pix2, bytes 0-7, widened to 16 bits
+        usubl2          v30.8h, v0.16b, v1.16b  // Previous-row difference pix1 - pix2, bytes 8-15, widened to 16 bits
+
+        b.lt            2f                      // Fewer than 3 comparisons left: use the one-row loop
+
+1:
+        // Three rows (A, B, C) per pass; per byte: acc += abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+        ld1             {v0.16b}, [x1], x3      // Load next pix1 row (row A); x1 advances by stride
+        ld1             {v1.16b}, [x2], x3      // Load next pix2 row (row A); x2 advances by stride
+        ld1             {v2.16b}, [x1], x3      // Load next pix1 row (row B)
+        ld1             {v3.16b}, [x2], x3      // Load next pix2 row (row B)
+        usubl           v29.8h, v0.8b, v1.8b    // Row A difference pix1 - pix2, bytes 0-7
+        usubl2          v28.8h, v0.16b, v1.16b  // Row A difference pix1 - pix2, bytes 8-15
+        ld1             {v4.16b}, [x1], x3      // Load next pix1 row (row C)
+        ld1             {v5.16b}, [x2], x3      // Load next pix2 row (row C)
+        usubl           v27.8h, v2.8b, v3.8b    // Row B difference pix1 - pix2, bytes 0-7
+        saba            v16.8h, v31.8h, v29.8h  // acc += abs(previous row - row A), bytes 0-7
+        usubl2          v26.8h, v2.16b, v3.16b  // Row B difference pix1 - pix2, bytes 8-15
+        saba            v16.8h, v30.8h, v28.8h  // acc += abs(previous row - row A), bytes 8-15
+        usubl           v25.8h, v4.8b, v5.8b    // Row C difference pix1 - pix2, bytes 0-7
+        usubl2          v24.8h, v4.16b, v5.16b  // Row C difference pix1 - pix2, bytes 8-15
+        saba            v16.8h, v29.8h, v27.8h  // acc += abs(row A - row B), bytes 0-7
+        mov             v31.16b, v25.16b        // Row C becomes the previous row for what follows, bytes 0-7
+        saba            v16.8h, v28.8h, v26.8h  // acc += abs(row A - row B), bytes 8-15
+        sub             w4, w4, #3              // Three comparisons done
+        mov             v30.16b, v24.16b        // Row C becomes the previous row for what follows, bytes 8-15
+        saba            v16.8h, v27.8h, v25.8h  // acc += abs(row B - row C), bytes 0-7
+        cmp             w4, #3                  // Another full pass of three possible?
+        saba            v16.8h, v26.8h, v24.8h  // acc += abs(row B - row C), bytes 8-15
+
+        b.ge            1b
+        cbz             w4, 3f                  // Nothing left over: skip the one-row loop
+2:                                              // One row per pass. NOTE(review): entered with w4 <= 0 when h <= 1, where subs never reaches zero normally - presumably callers pass h >= 2; confirm
+        ld1             {v0.16b}, [x1], x3      // Load next pix1 row; x1 advances by stride
+        ld1             {v1.16b}, [x2], x3      // Load next pix2 row; x2 advances by stride
+        subs            w4, w4, #1              // One comparison consumed; sets the flags tested by b.ne below
+        usubl           v29.8h, v0.8b, v1.8b    // Current-row difference pix1 - pix2, bytes 0-7
+        usubl2          v28.8h, v0.16b, v1.16b  // Current-row difference pix1 - pix2, bytes 8-15
+        saba            v16.8h, v31.8h, v29.8h  // acc += abs(previous row - current row), bytes 0-7
+        mov             v31.16b, v29.16b        // Current row becomes the previous row, bytes 0-7
+        saba            v16.8h, v30.8h, v28.8h  // acc += abs(previous row - current row), bytes 8-15
+        mov             v30.16b, v28.16b        // Current row becomes the previous row, bytes 8-15
+
+        b.ne            2b                      // Repeat until the count from subs reaches zero
+3:
+        uaddlv          s17, v16.8h             // Add the eight 16-bit lanes into a single 32-bit sum
+        fmov            w0, s17                 // Move the sum into w0 as the return value
+
+        ret
+endfunc |