diff options
author | Hubert Mazur <hum@semihalf.com> | 2022-07-12 11:15:33 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2022-07-13 23:25:22 +0300 |
commit | 01e190dc9950f4c35a4d5d795736460577807e3f (patch) | |
tree | dfad7f7f0381f20ab6c3e38c9fa7380f76276eb4 | |
parent | a0994440e8e66ff4f67aae56c01ee2727c7d283c (diff) | |
download | ffmpeg-01e190dc9950f4c35a4d5d795736460577807e3f.tar.gz |
lavc/aarch64: Add pix_abs16_x2 neon implementation
Provide neon implementation for pix_abs16_x2 function.
Performance tests of implementation are below.
- pix_abs_0_1_c: 283.5
- pix_abs_0_1_neon: 39.0
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- | libavcodec/aarch64/me_cmp_init_aarch64.c | 3 | ||||
-rw-r--r-- | libavcodec/aarch64/me_cmp_neon.S | 75 |
2 files changed, 78 insertions, 0 deletions
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index bec9148a1a..136b008eb7 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -27,6 +27,8 @@ int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                       ptrdiff_t stride, int h);
 int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                       ptrdiff_t stride, int h);
+int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -34,6 +36,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
     if (have_neon(cpu_flags)) {
         c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index a7937bd8be..e49d049fc2 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -203,3 +203,78 @@ function ff_pix_abs16_xy2_neon, export=1
         fmov            w0, s0                      // copy result to general purpose register
         ret
 endfunc
+
+function ff_pix_abs16_x2_neon, export=1             // returns in w0 the sum, over h rows and i = 0..15, of abs(pix1[i] - avg2(pix2[i], pix2[i + 1]))
+        // x0           unused (MpegEncContext *v in the C prototype)
+        // x1           uint8_t *pix1, post-incremented by stride after each row load
+        // x2           uint8_t *pix2, post-incremented by stride after each row load
+        // x3           ptrdiff_t stride, the post-index step of every ld1 below
+        // w4           int h, number of rows still to process
+
+        cmp             w4, #4                      // flags are consumed by b.lt below; movi and add do not modify them
+        // initialize buffers: d20 is the running total that is returned at label 3
+        movi            d20, #0
+        add             x5, x2, #1                  // pix2 + 1, second read pointer one byte to the right of x2
+        b.lt            2f                          // fewer than 4 rows: go straight to the single-row loop
+
+// make 4 iterations at once; v16 collects bytes 0..7 and v17 bytes 8..15 of each row as 16-bit lanes
+1:
+
+        // abs(pix1[0] - avg2(pix2[0], pix2[1]))
+        // avg2(a,b) = (((a) + (b) + 1) >> 1)
+        // abs(x) = (x < 0 ? -x : x)
+
+        ld1             {v1.16b}, [x2], x3          // row 0: pix2[0..15]
+        ld1             {v2.16b}, [x5], x3          // row 0: pix2[1..16]
+        urhadd          v30.16b, v1.16b, v2.16b     // row 0: avg2 per byte (unsigned rounding halving add)
+        ld1             {v0.16b}, [x1], x3          // row 0: pix1[0..15]
+        uabdl           v16.8h, v0.8b, v30.8b       // row 0, bytes 0..7: absolute difference widened to 16 bit, overwrites v16
+        ld1             {v4.16b}, [x2], x3          // row 1: pix2[0..15]
+        uabdl2          v17.8h, v0.16b, v30.16b     // row 0, bytes 8..15: same, overwrites v17
+        ld1             {v5.16b}, [x5], x3          // row 1: pix2[1..16]
+        urhadd          v29.16b, v4.16b, v5.16b     // row 1: avg2 per byte
+        ld1             {v3.16b}, [x1], x3          // row 1: pix1[0..15]
+        uabal           v16.8h, v3.8b, v29.8b       // row 1, bytes 0..7: absolute difference accumulated into v16
+        ld1             {v7.16b}, [x2], x3          // row 2: pix2[0..15]
+        uabal2          v17.8h, v3.16b, v29.16b     // row 1, bytes 8..15: accumulated into v17
+        ld1             {v22.16b}, [x5], x3         // row 2: pix2[1..16]
+        urhadd          v28.16b, v7.16b, v22.16b    // row 2: avg2 per byte
+        ld1             {v6.16b}, [x1], x3          // row 2: pix1[0..15]
+        uabal           v16.8h, v6.8b, v28.8b       // row 2, bytes 0..7: accumulated into v16
+        ld1             {v24.16b}, [x2], x3         // row 3: pix2[0..15]
+        uabal2          v17.8h, v6.16b, v28.16b     // row 2, bytes 8..15: accumulated into v17
+        ld1             {v25.16b}, [x5], x3         // row 3: pix2[1..16]
+        urhadd          v27.16b, v24.16b, v25.16b   // row 3: avg2 per byte
+        ld1             {v23.16b}, [x1], x3         // row 3: pix1[0..15]
+        uabal           v16.8h, v23.8b, v27.8b      // row 3, bytes 0..7: accumulated into v16
+        uabal2          v17.8h, v23.16b, v27.16b    // row 3, bytes 8..15: accumulated into v17
+
+        sub             w4, w4, #4                  // four rows consumed
+
+        add             v16.8h, v16.8h, v17.8h      // merge the two half-row accumulators lane by lane
+        uaddlv          s16, v16.8h                 // sum the 8 lanes into one 32-bit scalar; the scalar write clears the rest of v16
+        cmp             w4, #4                      // at least 4 rows left?
+        add             d20, d20, d16               // add this batch of 4 rows to the running total
+
+        b.ge            1b                          // uses the flags of the cmp above; the SIMD add in between leaves them alone
+        cbz             w4, 3f                      // no leftover rows: skip the single-row loop
+
+// iterate by one; NOTE(review): entered with h <= 0 this still runs once and w4 wraps, presumably callers always pass h > 0 (confirm)
+2:
+        ld1             {v1.16b}, [x2], x3          // pix2[0..15]
+        ld1             {v2.16b}, [x5], x3          // pix2[1..16]
+        urhadd          v29.16b, v1.16b, v2.16b     // avg2 per byte
+        ld1             {v0.16b}, [x1], x3          // pix1[0..15]
+        uabd            v28.16b, v0.16b, v29.16b    // per-byte absolute difference, result stays 8 bit wide
+
+        uaddlv          h28, v28.16b                // sum the 16 bytes into a 16-bit scalar; the scalar write clears the rest of v28
+        subs            w4, w4, #1                  // one row done; Z is set when none remain
+
+        add             d20, d20, d28               // add this row to the running total
+        b.ne            2b                          // loop on the flags from subs
+
+3:
+        fmov            w0, s20                     // return value: low 32 bits of the running total
+
+        ret
+endfunc |