aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Storsjö <martin@martin.st>2019-02-01 10:04:56 +0200
committerMartin Storsjö <martin@martin.st>2019-02-19 11:46:04 +0200
commit52c9b0a6c0d02cff6caebcf6989e565e05b55200 (patch)
tree64ea5d55b27e4483e3c45b21c83b66a7977b6617
parentc513fcd7d235aa4cef45a6c3125bd4dcc03bf276 (diff)
downloadffmpeg-52c9b0a6c0d02cff6caebcf6989e565e05b55200.tar.gz
aarch64: vp8: Port vp8_luma_dc_wht and vp8_idct_dc_add4uv from arm version
Cortex A53 A72 A73 vp8_luma_dc_wht_c: 115.7 75.7 90.7 vp8_luma_dc_wht_neon: 60.7 41.2 45.7 vp8_idct_dc_add4uv_c: 376.1 262.9 282.5 vp8_idct_dc_add4uv_neon: 52.0 29.0 37.0 Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r--libavcodec/aarch64/vp8dsp_init_aarch64.c3
-rw-r--r--libavcodec/aarch64/vp8dsp_neon.S109
2 files changed, 112 insertions, 0 deletions
diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c
index 20bb607dfd..e296ccd893 100644
--- a/libavcodec/aarch64/vp8dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c
@@ -28,6 +28,7 @@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
VP8_LF(neon);
@@ -55,10 +56,12 @@ av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
{
if (!have_neon(av_get_cpu_flags()))
return;
+ dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
dsp->vp8_idct_add = ff_vp8_idct_add_neon;
dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
+ dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index 2b5b049075..4ea62c0644 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -4,6 +4,7 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
+ * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
*
* This file is part of Libav.
*
@@ -25,6 +26,62 @@
#include "libavutil/aarch64/asm.S"
#include "neon.S"
+function ff_vp8_luma_dc_wht_neon, export=1
+ ld1 {v0.4h - v3.4h}, [x1]
+ movi v30.8h, #0
+
+ add v4.4h, v0.4h, v3.4h
+ add v6.4h, v1.4h, v2.4h
+ st1 {v30.8h}, [x1], #16
+ sub v7.4h, v1.4h, v2.4h
+ sub v5.4h, v0.4h, v3.4h
+ st1 {v30.8h}, [x1]
+ add v0.4h, v4.4h, v6.4h
+ add v1.4h, v5.4h, v7.4h
+ sub v2.4h, v4.4h, v6.4h
+ sub v3.4h, v5.4h, v7.4h
+
+ movi v16.4h, #3
+
+ transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
+
+ add v0.4h, v0.4h, v16.4h
+
+ add v4.4h, v0.4h, v3.4h
+ add v6.4h, v1.4h, v2.4h
+ sub v7.4h, v1.4h, v2.4h
+ sub v5.4h, v0.4h, v3.4h
+ add v0.4h, v4.4h, v6.4h
+ add v1.4h, v5.4h, v7.4h
+ sub v2.4h, v4.4h, v6.4h
+ sub v3.4h, v5.4h, v7.4h
+
+ sshr v0.4h, v0.4h, #3
+ sshr v1.4h, v1.4h, #3
+ sshr v2.4h, v2.4h, #3
+ sshr v3.4h, v3.4h, #3
+
+ mov x3, #32
+ st1 {v0.h}[0], [x0], x3
+ st1 {v1.h}[0], [x0], x3
+ st1 {v2.h}[0], [x0], x3
+ st1 {v3.h}[0], [x0], x3
+ st1 {v0.h}[1], [x0], x3
+ st1 {v1.h}[1], [x0], x3
+ st1 {v2.h}[1], [x0], x3
+ st1 {v3.h}[1], [x0], x3
+ st1 {v0.h}[2], [x0], x3
+ st1 {v1.h}[2], [x0], x3
+ st1 {v2.h}[2], [x0], x3
+ st1 {v3.h}[2], [x0], x3
+ st1 {v0.h}[3], [x0], x3
+ st1 {v1.h}[3], [x0], x3
+ st1 {v2.h}[3], [x0], x3
+ st1 {v3.h}[3], [x0], x3
+
+ ret
+endfunc
+
function ff_vp8_idct_add_neon, export=1
ld1 {v0.8b - v3.8b}, [x1]
mov w4, #20091
@@ -102,6 +159,58 @@ function ff_vp8_idct_add_neon, export=1
ret
endfunc
+function ff_vp8_idct_dc_add4uv_neon, export=1
+ movi v0.4h, #0
+ mov x3, #32
+ ld1r {v16.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v17.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v18.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v19.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ mov x3, x0
+ srshr v16.8h, v16.8h, #3 // dc >>= 3
+ ld1 {v0.8b}, [x0], x2
+ srshr v18.8h, v18.8h, #3
+ ld1 {v1.8b}, [x0], x2
+ uaddw v20.8h, v16.8h, v0.8b
+ ld1 {v2.8b}, [x0], x2
+ uaddw v0.8h, v16.8h, v1.8b
+ ld1 {v3.8b}, [x0], x2
+ uaddw v22.8h, v16.8h, v2.8b
+ ld1 {v4.8b}, [x0], x2
+ uaddw v2.8h, v16.8h, v3.8b
+ ld1 {v5.8b}, [x0], x2
+ uaddw v24.8h, v18.8h, v4.8b
+ ld1 {v6.8b}, [x0], x2
+ uaddw v4.8h, v18.8h, v5.8b
+ ld1 {v7.8b}, [x0], x2
+ uaddw v26.8h, v18.8h, v6.8b
+ sqxtun v20.8b, v20.8h
+ uaddw v6.8h, v18.8h, v7.8b
+ sqxtun v21.8b, v0.8h
+ sqxtun v22.8b, v22.8h
+ st1 {v20.8b}, [x3], x2
+ sqxtun v23.8b, v2.8h
+ st1 {v21.8b}, [x3], x2
+ sqxtun v24.8b, v24.8h
+ st1 {v22.8b}, [x3], x2
+ sqxtun v25.8b, v4.8h
+ st1 {v23.8b}, [x3], x2
+ sqxtun v26.8b, v26.8h
+ st1 {v24.8b}, [x3], x2
+ sqxtun v27.8b, v6.8h
+ st1 {v25.8b}, [x3], x2
+ st1 {v26.8b}, [x3], x2
+ st1 {v27.8b}, [x3], x2
+
+ ret
+endfunc
+
function ff_vp8_idct_dc_add4y_neon, export=1
movi v0.16b, #0
mov x3, #32