author     Michael Niedermayer <michaelni@gmx.at>  2014-01-15 15:13:12 +0100
committer  Michael Niedermayer <michaelni@gmx.at>  2014-01-15 15:13:41 +0100
commit     19fc3c0122ebbffd5d8678cc70750503545fa7f8 (patch)
tree       d4181c5647f90d5d18c3f95484abc6073ec5e95d /libavcodec/aarch64
parent     fb1c786a9dc646b7fdd1d15ad98a4be87c8deb87 (diff)
parent     d5dd8c7bf0f0d77c581db3236e0d938f06fd5591 (diff)
download   ffmpeg-19fc3c0122ebbffd5d8678cc70750503545fa7f8.tar.gz
Merge commit 'd5dd8c7bf0f0d77c581db3236e0d938f06fd5591'
* commit 'd5dd8c7bf0f0d77c581db3236e0d938f06fd5591':
  aarch64: h264 qpel NEON optimizations

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r--  libavcodec/aarch64/Makefile                   2
-rw-r--r--  libavcodec/aarch64/h264qpel_init_aarch64.c  172
-rw-r--r--  libavcodec/aarch64/h264qpel_neon.S          934
-rw-r--r--  libavcodec/aarch64/neon.S                    64
4 files changed, 1172 insertions, 0 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 06e3c778ad..1d80d9a268 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,7 +1,9 @@
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
+OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
 
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264idct_neon.o
+NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
new file mode 100644
index 0000000000..570dee12d9
--- /dev/null
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -0,0 +1,172 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264qpel.h"
+
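+/* Prototypes for the NEON entry points defined in h264qpel_neon.S. The
+ * mcXY suffix encodes the quarter-pel phase, X horizontal and Y vertical,
+ * in units of 1/4 pixel. */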
+void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
+{
+ const int high_bit_depth = bit_depth > 8;
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags) && !high_bit_depth) {
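+ /* The mc00 (full-pel copy) entries are left commented out: the copy
+ * functions are declared above but not defined in h264qpel_neon.S in
+ * this change, so those cases presumably stay with the generic code. */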
+ /* c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; */
+ c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
+ c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
+ c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
+ c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
+ c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
+ c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
+ c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
+ c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
+ c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
+ c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
+ c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
+ c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
+
+ /* c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; */
+ c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
+ c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
+ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
+ c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
+ c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
+ c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
+ c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
+ c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
+ c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
+ c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
+ c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
+
+ /* c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; */
+ c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
+ c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
+ c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
+ c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
+ c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
+ c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
+ c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
+
+ /* c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; */
+ c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
+ c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
+ c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
+ c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
+ c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
+ c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
+ c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+ }
+}
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
new file mode 100644
index 0000000000..d27cfac494
--- /dev/null
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -0,0 +1,934 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+ /* H.264 qpel MC */
+
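+// The H.264 six-tap luma filter is (1, -5, 20, 20, -5, 1) followed by a
+// rounding shift by 5. lowpass_const packs the two multipliers into v6,
+// giving v6.H[0] = 5 and v6.H[1] = 20 for the mla/mls steps below.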
+.macro lowpass_const r
+ movz \r, #20, lsl #16
+ movk \r, #5
+ mov v6.S[0], \r
+.endm
+
+// trashes v0-v5
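+// Horizontal 6-tap filter over two 8-pixel rows: each output is
+// (s0 + s5) + 20*(s2 + s3) - 5*(s1 + s4), narrowed with a rounding shift
+// by 5 (sqrshrun) unless \narrow is 0.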
+.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
+ ext v2.8B, \r0\().8B, \r1\().8B, #2
+ ext v3.8B, \r0\().8B, \r1\().8B, #3
+ uaddl v2.8H, v2.8B, v3.8B
+ ext v4.8B, \r0\().8B, \r1\().8B, #1
+ ext v5.8B, \r0\().8B, \r1\().8B, #4
+ uaddl v4.8H, v4.8B, v5.8B
+ ext v1.8B, \r0\().8B, \r1\().8B, #5
+ uaddl \d0\().8H, \r0\().8B, v1.8B
+ ext v0.8B, \r2\().8B, \r3\().8B, #2
+ mla \d0\().8H, v2.8H, v6.H[1]
+ ext v1.8B, \r2\().8B, \r3\().8B, #3
+ uaddl v0.8H, v0.8B, v1.8B
+ ext v1.8B, \r2\().8B, \r3\().8B, #1
+ mls \d0\().8H, v4.8H, v6.H[0]
+ ext v3.8B, \r2\().8B, \r3\().8B, #4
+ uaddl v1.8H, v1.8B, v3.8B
+ ext v2.8B, \r2\().8B, \r3\().8B, #5
+ uaddl \d1\().8H, \r2\().8B, v2.8B
+ mla \d1\().8H, v0.8H, v6.H[1]
+ mls \d1\().8H, v1.8H, v6.H[0]
+ .if \narrow
+ sqrshrun \d0\().8B, \d0\().8H, #5
+ sqrshrun \d1\().8B, \d1\().8H, #5
+ .endif
+.endm
+
+// trashes v0-v5, v7, v30-v31
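+// As lowpass_8, but each input row sits wholly in one 16-byte register and
+// the 16-bit results are left unnarrowed for a second (vertical) pass.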
+.macro lowpass_8H r0, r1
+ ext v0.16B, \r0\().16B, \r0\().16B, #2
+ ext v1.16B, \r0\().16B, \r0\().16B, #3
+ uaddl v0.8H, v0.8B, v1.8B
+ ext v2.16B, \r0\().16B, \r0\().16B, #1
+ ext v3.16B, \r0\().16B, \r0\().16B, #4
+ uaddl v2.8H, v2.8B, v3.8B
+ ext v30.16B, \r0\().16B, \r0\().16B, #5
+ uaddl \r0\().8H, \r0\().8B, v30.8B
+ ext v4.16B, \r1\().16B, \r1\().16B, #2
+ mla \r0\().8H, v0.8H, v6.H[1]
+ ext v5.16B, \r1\().16B, \r1\().16B, #3
+ uaddl v4.8H, v4.8B, v5.8B
+ ext v7.16B, \r1\().16B, \r1\().16B, #1
+ mls \r0\().8H, v2.8H, v6.H[0]
+ ext v0.16B, \r1\().16B, \r1\().16B, #4
+ uaddl v7.8H, v7.8B, v0.8B
+ ext v31.16B, \r1\().16B, \r1\().16B, #5
+ uaddl \r1\().8H, \r1\().8B, v31.8B
+ mla \r1\().8H, v4.8H, v6.H[1]
+ mls \r1\().8H, v7.8H, v6.H[0]
+.endm
+
+// trashes v2-v5, v30
+.macro lowpass_8_1 r0, r1, d0, narrow=1
+ ext v2.8B, \r0\().8B, \r1\().8B, #2
+ ext v3.8B, \r0\().8B, \r1\().8B, #3
+ uaddl v2.8H, v2.8B, v3.8B
+ ext v4.8B, \r0\().8B, \r1\().8B, #1
+ ext v5.8B, \r0\().8B, \r1\().8B, #4
+ uaddl v4.8H, v4.8B, v5.8B
+ ext v30.8B, \r0\().8B, \r1\().8B, #5
+ uaddl \d0\().8H, \r0\().8B, v30.8B
+ mla \d0\().8H, v2.8H, v6.H[1]
+ mls \d0\().8H, v4.8H, v6.H[0]
+ .if \narrow
+ sqrshrun \d0\().8B, \d0\().8H, #5
+ .endif
+.endm
+
+// trashes v0-v7
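+// Second filter pass over 16-bit intermediates, widened to 32 bits. The
+// multiplies by 20 and 5 are built from shifts (16x + 4x, 4x + 1x), and
+// the scaling of both passes is removed with a single rounding shift by 10.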
+.macro lowpass_8.16 r0, r1, r2
+ ext v1.16B, \r0\().16B, \r1\().16B, #4
+ ext v0.16B, \r0\().16B, \r1\().16B, #6
+ saddl v5.4S, v1.4H, v0.4H
+ ext v2.16B, \r0\().16B, \r1\().16B, #2
+ saddl2 v1.4S, v1.8H, v0.8H
+ ext v3.16B, \r0\().16B, \r1\().16B, #8
+ saddl v6.4S, v2.4H, v3.4H
+ ext \r1\().16B, \r0\().16B, \r1\().16B, #10
+ saddl2 v2.4S, v2.8H, v3.8H
+ saddl v0.4S, \r0\().4H, \r1\().4H
+ saddl2 v4.4S, \r0\().8H, \r1\().8H
+
+ shl v3.4S, v5.4S, #4
+ shl v5.4S, v5.4S, #2
+ shl v7.4S, v6.4S, #2
+ add v5.4S, v5.4S, v3.4S
+ add v6.4S, v6.4S, v7.4S
+
+ shl v3.4S, v1.4S, #4
+ shl v1.4S, v1.4S, #2
+ shl v7.4S, v2.4S, #2
+ add v1.4S, v1.4S, v3.4S
+ add v2.4S, v2.4S, v7.4S
+
+ add v5.4S, v5.4S, v0.4S
+ sub v5.4S, v5.4S, v6.4S
+
+ add v1.4S, v1.4S, v4.4S
+ sub v1.4S, v1.4S, v2.4S
+
+ rshrn v5.4H, v5.4S, #10
+ rshrn2 v5.8H, v1.4S, #10
+
+ sqxtun \r2\().8B, v5.8H
+.endm
+
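+// Register use in the lowpass helpers below, as read from the code:
+// x0 = dst, x1 = src, x2/x3 = strides, x12 = remaining row count; x30 is
+// preserved in a scratch register around the internal bl calls.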
+function put_h264_qpel16_h_lowpass_neon_packed
+ mov x4, x30
+ mov x12, #16
+ mov x3, #8
+ bl put_h264_qpel8_h_lowpass_neon
+ sub x1, x1, x2, lsl #4
+ add x1, x1, #8
+ mov x12, #16
+ mov x30, x4
+ b put_h264_qpel8_h_lowpass_neon
+endfunc
+
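+// A 16-wide block is filtered as two 8-wide halves: the qpel16 entry runs
+// the qpel8 core, rewinds src and dst by 16 rows, steps 8 bytes to the
+// right and falls through into the qpel8 core again.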
+.macro h264_qpel_h_lowpass type
+function \type\()_h264_qpel16_h_lowpass_neon
+ mov x13, x30
+ mov x12, #16
+ bl \type\()_h264_qpel8_h_lowpass_neon
+ sub x0, x0, x3, lsl #4
+ sub x1, x1, x2, lsl #4
+ add x0, x0, #8
+ add x1, x1, #8
+ mov x12, #16
+ mov x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon
+1: ld1 {v28.8B, v29.8B}, [x1], x2
+ ld1 {v16.8B, v17.8B}, [x1], x2
+ subs x12, x12, #2
+ lowpass_8 v28, v29, v16, v17, v28, v16
+ .ifc \type,avg
+ ld1 {v2.8B}, [x0], x3
+ urhadd v28.8B, v28.8B, v2.8B
+ ld1 {v3.8B}, [x0]
+ urhadd v16.8B, v16.8B, v3.8B
+ sub x0, x0, x3
+ .endif
+ st1 {v28.8B}, [x0], x3
+ st1 {v16.8B}, [x0], x3
+ b.ne 1b
+ ret
+endfunc
+.endm
+
+ h264_qpel_h_lowpass put
+ h264_qpel_h_lowpass avg
+
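+// The _l2 variants round-average the filter output with a second reference
+// (urhadd) to form the quarter-pel positions that lie between two planes.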
+.macro h264_qpel_h_lowpass_l2 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon
+ mov x13, x30
+ mov x12, #16
+ bl \type\()_h264_qpel8_h_lowpass_l2_neon
+ sub x0, x0, x2, lsl #4
+ sub x1, x1, x2, lsl #4
+ sub x3, x3, x2, lsl #4
+ add x0, x0, #8
+ add x1, x1, #8
+ add x3, x3, #8
+ mov x12, #16
+ mov x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon
+1: ld1 {v26.8B, v27.8B}, [x1], x2
+ ld1 {v16.8B, v17.8B}, [x1], x2
+ ld1 {v28.8B}, [x3], x2
+ ld1 {v29.8B}, [x3], x2
+ subs x12, x12, #2
+ lowpass_8 v26, v27, v16, v17, v26, v27
+ urhadd v26.8B, v26.8B, v28.8B
+ urhadd v27.8B, v27.8B, v29.8B
+ .ifc \type,avg
+ ld1 {v2.8B}, [x0], x2
+ urhadd v26.8B, v26.8B, v2.8B
+ ld1 {v3.8B}, [x0]
+ urhadd v27.8B, v27.8B, v3.8B
+ sub x0, x0, x2
+ .endif
+ st1 {v26.8B}, [x0], x2
+ st1 {v27.8B}, [x0], x2
+ b.ne 1b
+ ret
+endfunc
+.endm
+
+ h264_qpel_h_lowpass_l2 put
+ h264_qpel_h_lowpass_l2 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed
+ mov x4, x30
+ mov x2, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub x1, x1, x3, lsl #2
+ bl put_h264_qpel8_v_lowpass_neon
+ sub x1, x1, x3, lsl #4
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub x1, x1, x3, lsl #2
+ mov x30, x4
+ b put_h264_qpel8_v_lowpass_neon
+endfunc
+
+.macro h264_qpel_v_lowpass type
+function \type\()_h264_qpel16_v_lowpass_neon
+ mov x4, x30
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ sub x1, x1, x3, lsl #2
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ sub x0, x0, x2, lsl #4
+ add x0, x0, #8
+ sub x1, x1, x3, lsl #4
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #8
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ sub x1, x1, x3, lsl #2
+ mov x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon
+ ld1 {v16.8B}, [x1], x3
+ ld1 {v18.8B}, [x1], x3
+ ld1 {v20.8B}, [x1], x3
+ ld1 {v22.8B}, [x1], x3
+ ld1 {v24.8B}, [x1], x3
+ ld1 {v26.8B}, [x1], x3
+ ld1 {v28.8B}, [x1], x3
+ ld1 {v30.8B}, [x1], x3
+ ld1 {v17.8B}, [x1], x3
+ ld1 {v19.8B}, [x1], x3
+ ld1 {v21.8B}, [x1], x3
+ ld1 {v23.8B}, [x1], x3
+ ld1 {v25.8B}, [x1]
+
+ transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
+ transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
+ lowpass_8 v16, v17, v18, v19, v16, v17
+ lowpass_8 v20, v21, v22, v23, v18, v19
+ lowpass_8 v24, v25, v26, v27, v20, v21
+ lowpass_8 v28, v29, v30, v31, v22, v23
+ transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+
+ .ifc \type,avg
+ ld1 {v24.8B}, [x0], x2
+ urhadd v16.8B, v16.8B, v24.8B
+ ld1 {v25.8B}, [x0], x2
+ urhadd v17.8B, v17.8B, v25.8B
+ ld1 {v26.8B}, [x0], x2
+ urhadd v18.8B, v18.8B, v26.8B
+ ld1 {v27.8B}, [x0], x2
+ urhadd v19.8B, v19.8B, v27.8B
+ ld1 {v28.8B}, [x0], x2
+ urhadd v20.8B, v20.8B, v28.8B
+ ld1 {v29.8B}, [x0], x2
+ urhadd v21.8B, v21.8B, v29.8B
+ ld1 {v30.8B}, [x0], x2
+ urhadd v22.8B, v22.8B, v30.8B
+ ld1 {v31.8B}, [x0], x2
+ urhadd v23.8B, v23.8B, v31.8B
+ sub x0, x0, x2, lsl #3
+ .endif
+
+ st1 {v16.8B}, [x0], x2
+ st1 {v17.8B}, [x0], x2
+ st1 {v18.8B}, [x0], x2
+ st1 {v19.8B}, [x0], x2
+ st1 {v20.8B}, [x0], x2
+ st1 {v21.8B}, [x0], x2
+ st1 {v22.8B}, [x0], x2
+ st1 {v23.8B}, [x0], x2
+
+ ret
+endfunc
+.endm
+
+ h264_qpel_v_lowpass put
+ h264_qpel_v_lowpass avg
+
+.macro h264_qpel_v_lowpass_l2 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon
+ mov x4, x30
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ sub x1, x1, x3, lsl #2
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ sub x0, x0, x3, lsl #4
+ sub x12, x12, x2, lsl #4
+ add x0, x0, #8
+ add x12, x12, #8
+ sub x1, x1, x3, lsl #4
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #8
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ sub x1, x1, x3, lsl #2
+ mov x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon
+ ld1 {v16.8B}, [x1], x3
+ ld1 {v18.8B}, [x1], x3
+ ld1 {v20.8B}, [x1], x3
+ ld1 {v22.8B}, [x1], x3
+ ld1 {v24.8B}, [x1], x3
+ ld1 {v26.8B}, [x1], x3
+ ld1 {v28.8B}, [x1], x3
+ ld1 {v30.8B}, [x1], x3
+ ld1 {v17.8B}, [x1], x3
+ ld1 {v19.8B}, [x1], x3
+ ld1 {v21.8B}, [x1], x3
+ ld1 {v23.8B}, [x1], x3
+ ld1 {v25.8B}, [x1]
+
+ transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
+ transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
+ lowpass_8 v16, v17, v18, v19, v16, v17
+ lowpass_8 v20, v21, v22, v23, v18, v19
+ lowpass_8 v24, v25, v26, v27, v20, v21
+ lowpass_8 v28, v29, v30, v31, v22, v23
+ transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+
+ ld1 {v24.8B}, [x12], x2
+ ld1 {v25.8B}, [x12], x2
+ ld1 {v26.8B}, [x12], x2
+ ld1 {v27.8B}, [x12], x2
+ ld1 {v28.8B}, [x12], x2
+ urhadd v16.8B, v24.8B, v16.8B
+ urhadd v17.8B, v25.8B, v17.8B
+ ld1 {v29.8B}, [x12], x2
+ urhadd v18.8B, v26.8B, v18.8B
+ urhadd v19.8B, v27.8B, v19.8B
+ ld1 {v30.8B}, [x12], x2
+ urhadd v20.8B, v28.8B, v20.8B
+ urhadd v21.8B, v29.8B, v21.8B
+ ld1 {v31.8B}, [x12], x2
+ urhadd v22.8B, v30.8B, v22.8B
+ urhadd v23.8B, v31.8B, v23.8B
+
+ .ifc \type,avg
+ ld1 {v24.8B}, [x0], x3
+ urhadd v16.8B, v16.8B, v24.8B
+ ld1 {v25.8B}, [x0], x3
+ urhadd v17.8B, v17.8B, v25.8B
+ ld1 {v26.8B}, [x0], x3
+ urhadd v18.8B, v18.8B, v26.8B
+ ld1 {v27.8B}, [x0], x3
+ urhadd v19.8B, v19.8B, v27.8B
+ ld1 {v28.8B}, [x0], x3
+ urhadd v20.8B, v20.8B, v28.8B
+ ld1 {v29.8B}, [x0], x3
+ urhadd v21.8B, v21.8B, v29.8B
+ ld1 {v30.8B}, [x0], x3
+ urhadd v22.8B, v22.8B, v30.8B
+ ld1 {v31.8B}, [x0], x3
+ urhadd v23.8B, v23.8B, v31.8B
+ sub x0, x0, x3, lsl #3
+ .endif
+
+ st1 {v16.8B}, [x0], x3
+ st1 {v17.8B}, [x0], x3
+ st1 {v18.8B}, [x0], x3
+ st1 {v19.8B}, [x0], x3
+ st1 {v20.8B}, [x0], x3
+ st1 {v21.8B}, [x0], x3
+ st1 {v22.8B}, [x0], x3
+ st1 {v23.8B}, [x0], x3
+
+ ret
+endfunc
+.endm
+
+ h264_qpel_v_lowpass_l2 put
+ h264_qpel_v_lowpass_l2 avg
+
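+// Centre (hv) positions for an 8x8 block: a horizontal pass over 13 input
+// rows keeps full 16-bit precision (lowpass_8H), then the block is
+// transposed, filtered vertically with lowpass_8.16 and transposed back,
+// leaving the 8x8 result in v16-v23.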
+function put_h264_qpel8_hv_lowpass_neon_top
+ lowpass_const w12
+ ld1 {v16.8H}, [x1], x3
+ ld1 {v17.8H}, [x1], x3
+ ld1 {v18.8H}, [x1], x3
+ ld1 {v19.8H}, [x1], x3
+ ld1 {v20.8H}, [x1], x3
+ ld1 {v21.8H}, [x1], x3
+ ld1 {v22.8H}, [x1], x3
+ ld1 {v23.8H}, [x1], x3
+ ld1 {v24.8H}, [x1], x3
+ ld1 {v25.8H}, [x1], x3
+ ld1 {v26.8H}, [x1], x3
+ ld1 {v27.8H}, [x1], x3
+ ld1 {v28.8H}, [x1]
+ lowpass_8H v16, v17
+ lowpass_8H v18, v19
+ lowpass_8H v20, v21
+ lowpass_8H v22, v23
+ lowpass_8H v24, v25
+ lowpass_8H v26, v27
+ lowpass_8H v28, v29
+
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ lowpass_8.16 v16, v24, v16
+ lowpass_8.16 v17, v25, v17
+
+ lowpass_8.16 v18, v26, v18
+ lowpass_8.16 v19, v27, v19
+
+ lowpass_8.16 v20, v28, v20
+ lowpass_8.16 v21, v29, v21
+
+ lowpass_8.16 v22, v30, v22
+ lowpass_8.16 v23, v31, v23
+
+ transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+
+ ret
+endfunc
+
+.macro h264_qpel8_hv_lowpass type
+function \type\()_h264_qpel8_hv_lowpass_neon
+ mov x10, x30
+ bl put_h264_qpel8_hv_lowpass_neon_top
+ .ifc \type,avg
+ ld1 {v0.8B}, [x0], x2
+ urhadd v16.8B, v16.8B, v0.8B
+ ld1 {v1.8B}, [x0], x2
+ urhadd v17.8B, v17.8B, v1.8B
+ ld1 {v2.8B}, [x0], x2
+ urhadd v18.8B, v18.8B, v2.8B
+ ld1 {v3.8B}, [x0], x2
+ urhadd v19.8B, v19.8B, v3.8B
+ ld1 {v4.8B}, [x0], x2
+ urhadd v20.8B, v20.8B, v4.8B
+ ld1 {v5.8B}, [x0], x2
+ urhadd v21.8B, v21.8B, v5.8B
+ ld1 {v6.8B}, [x0], x2
+ urhadd v22.8B, v22.8B, v6.8B
+ ld1 {v7.8B}, [x0], x2
+ urhadd v23.8B, v23.8B, v7.8B
+ sub x0, x0, x2, lsl #3
+ .endif
+
+ st1 {v16.8B}, [x0], x2
+ st1 {v17.8B}, [x0], x2
+ st1 {v18.8B}, [x0], x2
+ st1 {v19.8B}, [x0], x2
+ st1 {v20.8B}, [x0], x2
+ st1 {v21.8B}, [x0], x2
+ st1 {v22.8B}, [x0], x2
+ st1 {v23.8B}, [x0], x2
+
+ ret x10
+endfunc
+.endm
+
+ h264_qpel8_hv_lowpass put
+ h264_qpel8_hv_lowpass avg
+
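+// As above, but the hv result is additionally round-averaged with an 8x8
+// block read contiguously from [x2] (a previously computed half-pel plane).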
+.macro h264_qpel8_hv_lowpass_l2 type
+function \type\()_h264_qpel8_hv_lowpass_l2_neon
+ mov x10, x30
+ bl put_h264_qpel8_hv_lowpass_neon_top
+
+ ld1 {v0.8B, v1.8B}, [x2], #16
+ ld1 {v2.8B, v3.8B}, [x2], #16
+ urhadd v0.8B, v0.8B, v16.8B
+ urhadd v1.8B, v1.8B, v17.8B
+ ld1 {v4.8B, v5.8B}, [x2], #16
+ urhadd v2.8B, v2.8B, v18.8B
+ urhadd v3.8B, v3.8B, v19.8B
+ ld1 {v6.8B, v7.8B}, [x2], #16
+ urhadd v4.8B, v4.8B, v20.8B
+ urhadd v5.8B, v5.8B, v21.8B
+ urhadd v6.8B, v6.8B, v22.8B
+ urhadd v7.8B, v7.8B, v23.8B
+ .ifc \type,avg
+ ld1 {v16.8B}, [x0], x3
+ urhadd v0.8B, v0.8B, v16.8B
+ ld1 {v17.8B}, [x0], x3
+ urhadd v1.8B, v1.8B, v17.8B
+ ld1 {v18.8B}, [x0], x3
+ urhadd v2.8B, v2.8B, v18.8B
+ ld1 {v19.8B}, [x0], x3
+ urhadd v3.8B, v3.8B, v19.8B
+ ld1 {v20.8B}, [x0], x3
+ urhadd v4.8B, v4.8B, v20.8B
+ ld1 {v21.8B}, [x0], x3
+ urhadd v5.8B, v5.8B, v21.8B
+ ld1 {v22.8B}, [x0], x3
+ urhadd v6.8B, v6.8B, v22.8B
+ ld1 {v23.8B}, [x0], x3
+ urhadd v7.8B, v7.8B, v23.8B
+ sub x0, x0, x3, lsl #3
+ .endif
+ st1 {v0.8B}, [x0], x3
+ st1 {v1.8B}, [x0], x3
+ st1 {v2.8B}, [x0], x3
+ st1 {v3.8B}, [x0], x3
+ st1 {v4.8B}, [x0], x3
+ st1 {v5.8B}, [x0], x3
+ st1 {v6.8B}, [x0], x3
+ st1 {v7.8B}, [x0], x3
+
+ ret x10
+endfunc
+.endm
+
+ h264_qpel8_hv_lowpass_l2 put
+ h264_qpel8_hv_lowpass_l2 avg
+
+.macro h264_qpel16_hv type
+function \type\()_h264_qpel16_hv_lowpass_neon
+ mov x13, x30
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ sub x1, x1, x3, lsl #2
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ sub x1, x1, x3, lsl #4
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #8
+ sub x0, x0, x2, lsl #4
+ add x0, x0, #8
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ sub x1, x1, x3, lsl #2
+ mov x30, x13
+ b \type\()_h264_qpel8_hv_lowpass_neon
+endfunc
+
+function \type\()_h264_qpel16_hv_lowpass_l2_neon
+ mov x13, x30
+ sub x2, x4, #256
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ sub x1, x1, x3, lsl #2
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ sub x1, x1, x3, lsl #4
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #8
+ sub x0, x0, x3, lsl #4
+ add x0, x0, #8
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ sub x1, x1, x3, lsl #2
+ mov x30, x13
+ b \type\()_h264_qpel8_hv_lowpass_l2_neon
+endfunc
+.endm
+
+ h264_qpel16_hv put
+ h264_qpel16_hv avg
+
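+// Public mcXY entry points; X and Y are the horizontal and vertical
+// quarter-pel phases. mc20/mc02 are the pure half-pel filters, mc22 is the
+// centre position filtered in both directions, and the other positions are
+// built by averaging against a shifted source or a second filtered plane.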
+.macro h264_qpel8 type
+function ff_\type\()_h264_qpel8_mc10_neon, export=1
+ lowpass_const w3
+ mov x3, x1
+ sub x1, x1, #2
+ mov x12, #8
+ b \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon, export=1
+ lowpass_const w3
+ sub x1, x1, #2
+ mov x3, x2
+ mov x12, #8
+ b \type\()_h264_qpel8_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon, export=1
+ lowpass_const w3
+ add x3, x1, #1
+ sub x1, x1, #2
+ mov x12, #8
+ b \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon, export=1
+ mov x14, x30
+ mov x12, x1
+\type\()_h264_qpel8_mc01:
+ lowpass_const w3
+ mov x3, x2
+ sub x1, x1, x2, lsl #1
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ ret x14
+endfunc
+
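+// mc11: the horizontal half-pel plane is filtered into a stack buffer,
+// then the vertical pass runs in its _l2 form, averaging against that
+// buffer to produce the diagonal quarter-pel sample.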
+function ff_\type\()_h264_qpel8_mc11_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+\type\()_h264_qpel8_mc11:
+ lowpass_const w3
+ mov x11, sp
+ sub sp, sp, #64
+ mov x0, sp
+ sub x1, x1, #2
+ mov x3, #8
+ mov x12, #8
+ bl put_h264_qpel8_h_lowpass_neon
+ mov x0, x8
+ mov x3, x2
+ mov x12, sp
+ sub x1, x9, x2, lsl #1
+ mov x2, #8
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ mov sp, x11
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc21_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+\type\()_h264_qpel8_mc21:
+ lowpass_const w3
+ mov x11, sp
+ sub sp, sp, #(8*8+16*12)
+ sub x1, x1, #2
+ mov x3, #8
+ mov x0, sp
+ mov x12, #8
+ bl put_h264_qpel8_h_lowpass_neon
+ mov x4, x0
+ mov x0, x8
+ sub x1, x9, x2, lsl #1
+ sub x1, x1, #2
+ mov x3, x2
+ sub x2, x4, #64
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ mov sp, x11
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon, export=1
+ add x1, x1, #1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ sub x1, x1, #1
+ b \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon, export=1
+ mov x14, x30
+ lowpass_const w3
+ sub x1, x1, x2, lsl #1
+ mov x3, x2
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc12_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+\type\()_h264_qpel8_mc12:
+ lowpass_const w3
+ mov x11, sp
+ sub sp, sp, #(8*8+16*12)
+ sub x1, x1, x2, lsl #1
+ mov x3, x2
+ mov x2, #8
+ mov x0, sp
+ bl put_h264_qpel8_v_lowpass_neon
+ mov x4, x0
+ mov x0, x8
+ sub x1, x9, x3, lsl #1
+ sub x1, x1, #2
+ sub x2, x4, #64
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ mov sp, x11
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc22_neon, export=1
+ mov x14, x30
+ mov x11, sp
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, #2
+ mov x3, x2
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ mov sp, x11
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc32_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, #1
+ b \type\()_h264_qpel8_mc12
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon, export=1
+ mov x14, x30
+ add x12, x1, x2
+ b \type\()_h264_qpel8_mc01
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2
+ b \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc23_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2
+ b \type\()_h264_qpel8_mc21
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon, export=1
+ add x1, x1, #1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2
+ sub x1, x1, #1
+ b \type\()_h264_qpel8_mc11
+endfunc
+.endm
+
+ h264_qpel8 put
+ h264_qpel8 avg
+
+.macro h264_qpel16 type
+function ff_\type\()_h264_qpel16_mc10_neon, export=1
+ lowpass_const w3
+ mov x3, x1
+ sub x1, x1, #2
+ b \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon, export=1
+ lowpass_const w3
+ sub x1, x1, #2
+ mov x3, x2
+ b \type\()_h264_qpel16_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon, export=1
+ lowpass_const w3
+ add x3, x1, #1
+ sub x1, x1, #2
+ b \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon, export=1
+ mov x14, x30
+ mov x12, x1
+\type\()_h264_qpel16_mc01:
+ lowpass_const w3
+ mov x3, x2
+ sub x1, x1, x2, lsl #1
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+\type\()_h264_qpel16_mc11:
+ lowpass_const w3
+ mov x11, sp
+ sub sp, sp, #256
+ mov x0, sp
+ sub x1, x1, #2
+ mov x3, #16
+ bl put_h264_qpel16_h_lowpass_neon
+ mov x0, x8
+ mov x3, x2
+ mov x12, sp
+ sub x1, x9, x2, lsl #1
+ mov x2, #16
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon
+ mov sp, x11
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc21_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+\type\()_h264_qpel16_mc21:
+ lowpass_const w3
+ mov x11, sp
+ sub sp, sp, #(16*16+16*12)
+ sub x1, x1, #2
+ mov x0, sp
+ bl put_h264_qpel16_h_lowpass_neon_packed
+ mov x4, x0
+ mov x0, x8
+ sub x1, x9, x2, lsl #1
+ sub x1, x1, #2
+ mov x3, x2
+ bl \type\()_h264_qpel16_hv_lowpass_l2_neon
+ mov sp, x11
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon, export=1
+ add x1, x1, #1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ sub x1, x1, #1
+ b \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon, export=1
+ mov x14, x30
+ lowpass_const w3
+ sub x1, x1, x2, lsl #1
+ mov x3, x2
+ bl \type\()_h264_qpel16_v_lowpass_neon
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc12_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+\type\()_h264_qpel16_mc12:
+ lowpass_const w3
+ mov x11, sp
+ sub sp, sp, #(16*16+16*12)
+ sub x1, x1, x2, lsl #1
+ mov x0, sp
+ mov x3, x2
+ bl put_h264_qpel16_v_lowpass_neon_packed
+ mov x4, x0
+ mov x0, x8
+ sub x1, x9, x3, lsl #1
+ sub x1, x1, #2
+ mov x2, x3
+ bl \type\()_h264_qpel16_hv_lowpass_l2_neon
+ mov sp, x11
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc22_neon, export=1
+ mov x14, x30
+ lowpass_const w3
+ mov x11, sp
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, #2
+ mov x3, x2
+ bl \type\()_h264_qpel16_hv_lowpass_neon
+ mov sp, x11 // restore stack
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc32_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, #1
+ b \type\()_h264_qpel16_mc12
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon, export=1
+ mov x14, x30
+ add x12, x1, x2
+ b \type\()_h264_qpel16_mc01
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2
+ b \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc23_neon, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2
+ b \type\()_h264_qpel16_mc21
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon, export=1
+ add x1, x1, #1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2
+ sub x1, x1, #1
+ b \type\()_h264_qpel16_mc11
+endfunc
+.endm
+
+ h264_qpel16 put
+ h264_qpel16 avg
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 37d1244851..19ab6cb0bd 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -16,6 +16,70 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
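+// In-register 8x8 transpose of byte matrices: three rounds of trn1/trn2 at
+// 8-, 16- and 32-bit element size; \r8 and \r9 serve as scratch.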
+.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+ trn1 \r8\().8B, \r0\().8B, \r1\().8B
+ trn2 \r9\().8B, \r0\().8B, \r1\().8B
+ trn1 \r1\().8B, \r2\().8B, \r3\().8B
+ trn2 \r3\().8B, \r2\().8B, \r3\().8B
+ trn1 \r0\().8B, \r4\().8B, \r5\().8B
+ trn2 \r5\().8B, \r4\().8B, \r5\().8B
+ trn1 \r2\().8B, \r6\().8B, \r7\().8B
+ trn2 \r7\().8B, \r6\().8B, \r7\().8B
+
+ trn1 \r4\().4H, \r0\().4H, \r2\().4H
+ trn2 \r2\().4H, \r0\().4H, \r2\().4H
+ trn1 \r6\().4H, \r5\().4H, \r7\().4H
+ trn2 \r7\().4H, \r5\().4H, \r7\().4H
+ trn1 \r5\().4H, \r9\().4H, \r3\().4H
+ trn2 \r9\().4H, \r9\().4H, \r3\().4H
+ trn1 \r3\().4H, \r8\().4H, \r1\().4H
+ trn2 \r8\().4H, \r8\().4H, \r1\().4H
+
+ trn1 \r0\().2S, \r3\().2S, \r4\().2S
+ trn2 \r4\().2S, \r3\().2S, \r4\().2S
+
+ trn1 \r1\().2S, \r5\().2S, \r6\().2S
+ trn2 \r5\().2S, \r5\().2S, \r6\().2S
+
+ trn2 \r6\().2S, \r8\().2S, \r2\().2S
+ trn1 \r2\().2S, \r8\().2S, \r2\().2S
+
+ trn1 \r3\().2S, \r9\().2S, \r7\().2S
+ trn2 \r7\().2S, \r9\().2S, \r7\().2S
+.endm
+
+.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+ trn1 \t0\().16B, \r0\().16B, \r1\().16B
+ trn2 \t1\().16B, \r0\().16B, \r1\().16B
+ trn1 \r1\().16B, \r2\().16B, \r3\().16B
+ trn2 \r3\().16B, \r2\().16B, \r3\().16B
+ trn1 \r0\().16B, \r4\().16B, \r5\().16B
+ trn2 \r5\().16B, \r4\().16B, \r5\().16B
+ trn1 \r2\().16B, \r6\().16B, \r7\().16B
+ trn2 \r7\().16B, \r6\().16B, \r7\().16B
+
+ trn1 \r4\().8H, \r0\().8H, \r2\().8H
+ trn2 \r2\().8H, \r0\().8H, \r2\().8H
+ trn1 \r6\().8H, \r5\().8H, \r7\().8H
+ trn2 \r7\().8H, \r5\().8H, \r7\().8H
+ trn1 \r5\().8H, \t1\().8H, \r3\().8H
+ trn2 \t1\().8H, \t1\().8H, \r3\().8H
+ trn1 \r3\().8H, \t0\().8H, \r1\().8H
+ trn2 \t0\().8H, \t0\().8H, \r1\().8H
+
+ trn1 \r0\().4S, \r3\().4S, \r4\().4S
+ trn2 \r4\().4S, \r3\().4S, \r4\().4S
+
+ trn1 \r1\().4S, \r5\().4S, \r6\().4S
+ trn2 \r5\().4S, \r5\().4S, \r6\().4S
+
+ trn2 \r6\().4S, \t0\().4S, \r2\().4S
+ trn1 \r2\().4S, \t0\().4S, \r2\().4S
+
+ trn1 \r3\().4S, \t1\().4S, \r7\().4S
+ trn2 \r7\().4S, \t1\().4S, \r7\().4S
+.endm
+
 .macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
 trn1 \r4\().4H, \r0\().4H, \r1\().4H
 trn2 \r5\().4H, \r0\().4H, \r1\().4H