aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/arm
diff options
context:
space:
mode:
author: Mans Rullgard <mans@mansr.com> 2011-02-02 16:26:20 +0000
committer: Mans Rullgard <mans@mansr.com> 2011-02-07 16:08:23 +0000
commit: a1c1d3c003b0ec16fdb6574913781313fb2c7ab6 (patch)
tree: df33c3a7a54ec1a24675091eaf2e94bb9335d699 /libavcodec/arm
parent: 5bea615dc383cf3617c5057db4fbc6832fc64137 (diff)
download: ffmpeg-a1c1d3c003b0ec16fdb6574913781313fb2c7ab6.tar.gz
VP8: ARM NEON optimisations for dsp functions
This adds NEON optimised versions of all functions in VP8DSPContext. Based on initial work by Rob Clark.

Signed-off-by: Mans Rullgard <mans@mansr.com>
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- libavcodec/arm/Makefile 3
-rw-r--r-- libavcodec/arm/vp8dsp_init_arm.c 163
-rw-r--r-- libavcodec/arm/vp8dsp_neon.S 1910
3 files changed, 2076 insertions, 0 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 014456ee32..15269ea676 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -2,6 +2,7 @@ OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_init_arm.o
+OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
@@ -54,6 +55,8 @@ NEON-OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_neon.o \
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
arm/vp3dsp_neon.o \
+NEON-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_neon.o
+
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
arm/dsputil_neon.o \
arm/fmtconvert_neon.o \
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
new file mode 100644
index 0000000000..c970ca548c
--- /dev/null
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -0,0 +1,163 @@
+/**
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include "libavcodec/vp8dsp.h"
+
+/* Prototypes for the NEON routines installed by ff_vp8dsp_init_arm() below.
+ * This patch adds the matching assembly in vp8dsp_neon.S. */
+/* Luma DC transform: writes the 16 results into element 0 of each of the
+ * 16 coefficient blocks and clears the DC input. The _dc variant reads
+ * only dc[0]. */
+void ff_vp8_luma_dc_wht_neon(DCTELEM block[4][4][16], DCTELEM dc[16]);
+void ff_vp8_luma_dc_wht_dc_neon(DCTELEM block[4][4][16], DCTELEM dc[16]);
+
+/* Inverse transform plus add to dst. The coefficients that are read are
+ * cleared afterwards. The dc_add variants use only element 0 of each block;
+ * add4y / add4uv process four blocks per call. */
+void ff_vp8_idct_add_neon(uint8_t *dst, DCTELEM block[16], int stride);
+void ff_vp8_idct_dc_add_neon(uint8_t *dst, DCTELEM block[16], int stride);
+void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, DCTELEM block[4][16], int stride);
+void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, DCTELEM block[4][16], int stride);
+
+/* Normal loop filter: one 16-pixel edge, or one 8-pixel edge in each of
+ * the two planes dstU/dstV. */
+void ff_vp8_v_loop_filter16_neon(uint8_t *dst, int stride,
+ int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_h_loop_filter16_neon(uint8_t *dst, int stride,
+ int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_neon(uint8_t *dstU, uint8_t *dstV, int stride,
+ int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_h_loop_filter8uv_neon(uint8_t *dstU, uint8_t *dstV, int stride,
+ int flim_E, int flim_I, int hev_thresh);
+
+/* Inner-edge variants of the filters above (same arguments). */
+void ff_vp8_v_loop_filter16_inner_neon(uint8_t *dst, int stride,
+ int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_h_loop_filter16_inner_neon(uint8_t *dst, int stride,
+ int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_inner_neon(uint8_t *dstU, uint8_t *dstV,
+ int stride, int flim_E, int flim_I,
+ int hev_thresh);
+void ff_vp8_h_loop_filter8uv_inner_neon(uint8_t *dstU, uint8_t *dstV,
+ int stride, int flim_E, int flim_I,
+ int hev_thresh);
+
+/* Simple loop filter: a single limit, only P1..Q1 are read. */
+void ff_vp8_v_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim);
+void ff_vp8_h_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim);
+
+
+/* Declares one motion-compensation routine named ff_put_vp8_<n>_neon.
+ * h is the number of rows; x and y are the sub-pixel positions (the
+ * assembly refers to them as mx / my). */
+#define VP8_MC(n) \
+ void ff_put_vp8_##n##_neon(uint8_t *dst, int dststride, \
+ uint8_t *src, int srcstride, \
+ int h, int x, int y)
+
+/* Declares the full-pel copy and every 4-/6-tap horizontal/vertical
+ * combination for block width w. */
+#define VP8_EPEL(w) \
+ VP8_MC(pixels ## w); \
+ VP8_MC(epel ## w ## _h4); \
+ VP8_MC(epel ## w ## _h6); \
+ VP8_MC(epel ## w ## _v4); \
+ VP8_MC(epel ## w ## _h4v4); \
+ VP8_MC(epel ## w ## _h6v4); \
+ VP8_MC(epel ## w ## _v6); \
+ VP8_MC(epel ## w ## _h4v6); \
+ VP8_MC(epel ## w ## _h6v6)
+
+VP8_EPEL(16);
+VP8_EPEL(8);
+VP8_EPEL(4);
+
+/* Bilinear variants: horizontal only, vertical only, and both. */
+VP8_MC(bilin16_h);
+VP8_MC(bilin16_v);
+VP8_MC(bilin16_hv);
+VP8_MC(bilin8_h);
+VP8_MC(bilin8_v);
+VP8_MC(bilin8_hv);
+VP8_MC(bilin4_h);
+VP8_MC(bilin4_v);
+VP8_MC(bilin4_hv);
+
+/**
+ * Install the NEON implementations into a VP8DSPContext.
+ *
+ * Nothing is changed unless HAVE_NEON is non-zero. Entries that are not
+ * assigned here keep whatever value the caller put there beforehand.
+ *
+ * The motion-compensation tables are indexed [size][vertical][horizontal]:
+ * size 0/1/2 selects the 16/8/4-pixel-wide routines; for the epel table a
+ * filter index of 0 means no filtering in that direction, 1 the 4-tap and
+ * 2 the 6-tap filter.
+ *
+ * @param dsp context whose function pointers are overwritten
+ */
+av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
+{
+ if (HAVE_NEON) {
+ dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
+ dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_neon;
+
+ dsp->vp8_idct_add = ff_vp8_idct_add_neon;
+ dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
+ dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
+ dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
+
+ dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
+ dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
+ dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
+ dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
+
+ dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
+ dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
+ dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
+ dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
+
+ dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
+ dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
+
+ /* 16 wide: only the full-pel copy and the 6-tap variants are set;
+ * the entries involving a 4-tap filter are left untouched. */
+ dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
+ dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
+
+ /* 8 wide: all nine combinations. */
+ dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
+
+ /* 4 wide: all nine combinations. */
+ dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;
+
+ /* Bilinear tables: indices 1 and 2 of each direction share the same
+ * routine, so only "filtered or not" selects the function. */
+ dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
+ }
+}
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
new file mode 100644
index 0000000000..01c39593a0
--- /dev/null
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -0,0 +1,1910 @@
+/**
+ * VP8 NEON optimisations
+ *
+ * Copyright (c) 2010 Rob Clark <rob@ti.com>
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+@ void ff_vp8_luma_dc_wht_neon(DCTELEM block[4][4][16], DCTELEM dc[16])
+@ r0 = block, r1 = dc
+@ Runs two add/subtract butterfly passes over the 4x4 dc[] matrix with a
+@ transpose in between, computes (x + 3) >> 3 on the result and writes the
+@ 16 values to element 0 of 16 consecutive 32-byte coefficient blocks.
+@ dc[] is zeroed. Only q0-q3, q8 and q15 are used.
+function ff_vp8_luma_dc_wht_neon, export=1
+ vld1.16 {q0-q1}, [r1,:128]
+ vmov.i16 q15, #0
+
+ @ first pass; the two stores of q15 clear all 32 bytes of dc[]
+ vadd.i16 d4, d0, d3
+ vadd.i16 d6, d1, d2
+ vst1.16 {q15}, [r1,:128]!
+ vsub.i16 d7, d1, d2
+ vsub.i16 d5, d0, d3
+ vst1.16 {q15}, [r1,:128]
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vmov.i16 q8, #3
+
+ @ transpose the 4x4 matrix held in d0-d3
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ @ adding 3 to d0 only is enough: d0 feeds every output of the next pass
+ vadd.i16 d0, d0, d16
+
+ @ second pass
+ vadd.i16 d4, d0, d3
+ vadd.i16 d6, d1, d2
+ vsub.i16 d7, d1, d2
+ vsub.i16 d5, d0, d3
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vshr.s16 q0, q0, #3
+ vshr.s16 q1, q1, #3
+
+ @ r3 = 32 bytes = one 16-coefficient block; one halfword per block
+ mov r3, #32
+ vst1.16 {d0[0]}, [r0,:16], r3
+ vst1.16 {d1[0]}, [r0,:16], r3
+ vst1.16 {d2[0]}, [r0,:16], r3
+ vst1.16 {d3[0]}, [r0,:16], r3
+ vst1.16 {d0[1]}, [r0,:16], r3
+ vst1.16 {d1[1]}, [r0,:16], r3
+ vst1.16 {d2[1]}, [r0,:16], r3
+ vst1.16 {d3[1]}, [r0,:16], r3
+ vst1.16 {d0[2]}, [r0,:16], r3
+ vst1.16 {d1[2]}, [r0,:16], r3
+ vst1.16 {d2[2]}, [r0,:16], r3
+ vst1.16 {d3[2]}, [r0,:16], r3
+ vst1.16 {d0[3]}, [r0,:16], r3
+ vst1.16 {d1[3]}, [r0,:16], r3
+ vst1.16 {d2[3]}, [r0,:16], r3
+ vst1.16 {d3[3]}, [r0,:16], r3
+
+ bx lr
+endfunc
+
+@ void ff_vp8_luma_dc_wht_dc_neon(DCTELEM block[4][4][16], DCTELEM dc[16])
+@ r0 = block, r1 = dc
+@ DC-only variant: reads dc[0], clears it, and writes (dc[0] + 3) >> 3 to
+@ element 0 of 16 consecutive 32-byte coefficient blocks. Uses core
+@ registers only.
+function ff_vp8_luma_dc_wht_dc_neon, export=1
+ ldrsh r2, [r1]
+ mov r3, #0
+ add r2, r2, #3
+ strh r3, [r1]
+ asr r2, r2, #3
+ .rept 16
+ strh r2, [r0], #32
+ .endr
+ bx lr
+endfunc
+
+@ void ff_vp8_idct_add_neon(uint8_t *dst, DCTELEM block[16], int stride)
+@ r0 = dst, r1 = block, r2 = stride
+@ Two-pass 4x4 inverse transform of block[], rounded with (x + 4) >> 3,
+@ added to the 4x4 pixels at dst with unsigned saturation. block[] is
+@ zeroed. dst rows are accessed with a 32-bit alignment hint.
+function ff_vp8_idct_add_neon, export=1
+ vld1.16 {q0-q1}, [r1,:128]
+ @ d4[0] = 20091, d4[1] = 35468/2. The first constant is applied as
+ @ x + ((x * 20091) >> 16); the second goes through vqdmulh, which
+ @ doubles the product, hence the pre-halved value.
+ movw r3, #20091
+ movt r3, #35468/2
+ vdup.32 d4, r3
+
+ @ first pass
+ vmull.s16 q12, d1, d4[0]
+ vmull.s16 q13, d3, d4[0]
+ vqdmulh.s16 d20, d1, d4[1]
+ vqdmulh.s16 d23, d3, d4[1]
+ vshrn.s32 d21, q12, #16
+ vshrn.s32 d22, q13, #16
+ vadd.s16 d21, d21, d1
+ vadd.s16 d22, d22, d3
+
+ vadd.s16 d16, d0, d2
+ vsub.s16 d17, d0, d2
+ vadd.s16 d18, d21, d23
+ vsub.s16 d19, d20, d22
+ vadd.s16 q0, q8, q9
+ vsub.s16 q1, q8, q9
+
+ @ transpose (rows end up in the order d0, d1, d3, d2)
+ vtrn.32 d0, d3
+ vtrn.32 d1, d2
+ vtrn.16 d0, d1
+ vtrn.16 d3, d2
+
+ @ second pass, interleaved with clearing block[] (two stores of q15)
+ @ and with loading the four dst rows
+ vmov.i16 q15, #0
+ vmull.s16 q12, d1, d4[0]
+ vst1.16 {q15}, [r1,:128]!
+ vmull.s16 q13, d2, d4[0]
+ vst1.16 {q15}, [r1,:128]
+ vqdmulh.s16 d21, d1, d4[1]
+ vqdmulh.s16 d23, d2, d4[1]
+ vshrn.s32 d20, q12, #16
+ vshrn.s32 d22, q13, #16
+ vadd.i16 d20, d20, d1
+ vadd.i16 d22, d22, d2
+
+ vadd.i16 d16, d0, d3
+ vsub.i16 d17, d0, d3
+ vadd.i16 d18, d20, d23
+ vld1.32 {d20[]}, [r0,:32], r2
+ vsub.i16 d19, d21, d22
+ vld1.32 {d22[]}, [r0,:32], r2
+ vadd.s16 q0, q8, q9
+ vld1.32 {d23[]}, [r0,:32], r2
+ vsub.s16 q1, q8, q9
+ vld1.32 {d21[]}, [r0,:32], r2
+ vrshr.s16 q0, q0, #3
+ vtrn.32 q10, q11
+ vrshr.s16 q1, q1, #3
+
+ @ rewind dst by the four rows just loaded
+ sub r0, r0, r2, lsl #2
+
+ vtrn.32 d0, d3
+ vtrn.32 d1, d2
+ vtrn.16 d0, d1
+ vtrn.16 d3, d2
+
+ @ add the residual to the pixels and saturate to 8 bits
+ vaddw.u8 q0, q0, d20
+ vaddw.u8 q1, q1, d21
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+
+ @ the store order d0[0], d0[1], d1[1], d1[0] matches the load order above
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+
+ bx lr
+endfunc
+
+@ void ff_vp8_idct_dc_add_neon(uint8_t *dst, DCTELEM block[16], int stride)
+@ r0 = dst, r1 = block, r2 = stride
+@ DC-only case: adds (block[0] + 4) >> 3 to each of the 4x4 pixels at dst
+@ with unsigned saturation and clears block[0].
+function ff_vp8_idct_dc_add_neon, export=1
+ mov r3, #0
+ ldrsh r12, [r1]
+ strh r3, [r1]
+ vdup.16 q1, r12
+ vrshr.s16 q1, q1, #3
+ @ rows 0 and 2 go to d0, rows 1 and 3 to d1
+ vld1.32 {d0[]}, [r0,:32], r2
+ vld1.32 {d1[]}, [r0,:32], r2
+ vld1.32 {d0[1]}, [r0,:32], r2
+ vld1.32 {d1[1]}, [r0,:32], r2
+ vaddw.u8 q2, q1, d0
+ vaddw.u8 q3, q1, d1
+ sub r0, r0, r2, lsl #2
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q3
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ bx lr
+endfunc
+
+@ void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, DCTELEM block[4][16],
+@ int stride)
+@ r0 = dst, r1 = block, r2 = stride
+@ DC-only add for four 4x4 blocks laid out 2x2 in an 8x8 pixel area:
+@ rows 0-3 use the DCs of blocks 0 (left) and 1 (right), rows 4-7 those of
+@ blocks 2 and 3. Each DC is read, cleared and rounded with (dc + 4) >> 3.
+@ dst rows are accessed with a 64-bit alignment hint.
+function ff_vp8_idct_dc_add4uv_neon, export=1
+ vmov.i16 d0, #0
+ @ r3 = 32 bytes: distance between the DC coefficients of two blocks
+ mov r3, #32
+ vld1.16 {d16[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d17[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d18[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d19[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ @ r0 is used for the loads, r3 (a copy of dst) for the stores
+ mov r3, r0
+ vrshr.s16 q8, q8, #3 @ dc >>= 3
+ vld1.8 {d0}, [r0,:64], r2
+ vrshr.s16 q9, q9, #3
+ vld1.8 {d1}, [r0,:64], r2
+ vaddw.u8 q10, q8, d0
+ vld1.8 {d2}, [r0,:64], r2
+ vaddw.u8 q0, q8, d1
+ vld1.8 {d3}, [r0,:64], r2
+ vaddw.u8 q11, q8, d2
+ vld1.8 {d4}, [r0,:64], r2
+ vaddw.u8 q1, q8, d3
+ vld1.8 {d5}, [r0,:64], r2
+ vaddw.u8 q12, q9, d4
+ vld1.8 {d6}, [r0,:64], r2
+ vaddw.u8 q2, q9, d5
+ vld1.8 {d7}, [r0,:64], r2
+ vaddw.u8 q13, q9, d6
+ vqmovun.s16 d20, q10
+ vaddw.u8 q3, q9, d7
+ vqmovun.s16 d21, q0
+ vqmovun.s16 d22, q11
+ vst1.8 {d20}, [r3,:64], r2
+ vqmovun.s16 d23, q1
+ vst1.8 {d21}, [r3,:64], r2
+ vqmovun.s16 d24, q12
+ vst1.8 {d22}, [r3,:64], r2
+ vqmovun.s16 d25, q2
+ vst1.8 {d23}, [r3,:64], r2
+ vqmovun.s16 d26, q13
+ vst1.8 {d24}, [r3,:64], r2
+ vqmovun.s16 d27, q3
+ vst1.8 {d25}, [r3,:64], r2
+ vst1.8 {d26}, [r3,:64], r2
+ vst1.8 {d27}, [r3,:64], r2
+
+ bx lr
+endfunc
+
+@ void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, DCTELEM block[4][16],
+@ int stride)
+@ r0 = dst, r1 = block, r2 = stride
+@ DC-only add for four 4x4 blocks lying side by side in a 16x4 pixel area:
+@ pixels 0-7 of every row use the DCs of blocks 0 and 1 (q8), pixels 8-15
+@ those of blocks 2 and 3 (q9). Each DC is read, cleared and rounded with
+@ (dc + 4) >> 3. dst rows are accessed with a 128-bit alignment hint.
+function ff_vp8_idct_dc_add4y_neon, export=1
+ vmov.i16 d0, #0
+ @ r3 = 32 bytes: distance between the DC coefficients of two blocks
+ mov r3, #32
+ vld1.16 {d16[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d17[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d18[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d19[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vrshr.s16 q8, q8, #3 @ dc >>= 3
+ vld1.8 {q0}, [r0,:128], r2
+ vrshr.s16 q9, q9, #3
+ vld1.8 {q1}, [r0,:128], r2
+ vaddw.u8 q10, q8, d0
+ vld1.8 {q2}, [r0,:128], r2
+ vaddw.u8 q0, q9, d1
+ vld1.8 {q3}, [r0,:128], r2
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q1, q9, d3
+ vaddw.u8 q12, q8, d4
+ vaddw.u8 q2, q9, d5
+ vaddw.u8 q13, q8, d6
+ vaddw.u8 q3, q9, d7
+ @ rewind dst by the four rows just loaded
+ sub r0, r0, r2, lsl #2
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d21, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q1
+ vqmovun.s16 d24, q12
+ vst1.8 {q10}, [r0,:128], r2
+ vqmovun.s16 d25, q2
+ vst1.8 {q11}, [r0,:128], r2
+ vqmovun.s16 d26, q13
+ vst1.8 {q12}, [r0,:128], r2
+ vqmovun.s16 d27, q3
+ vst1.8 {q13}, [r0,:128], r2
+
+ bx lr
+endfunc
+
+@ Core of all loop filters: filters 16 pixel positions across one edge.
+@
+@ Register layout:
+@ P3..Q3 -> q0..q7
+@ flim_E -> q14
+@ flim_I -> q15
+@ hev_thresh -> r12
+@
+@ With simple=1 only P1..Q1 (q2..q5) and the single limit in q14 are read.
+@ Results are left in place: q1..q6 (P2..Q2) for the default variant,
+@ q2..q5 (P1..Q1) for inner=1 and simple=1. q8..q15 are clobbered.
+@
+.macro vp8_loop_filter, inner=0, simple=0
+ .if \simple
+ vabd.u8 q9, q3, q4 @ abs(P0-Q0)
+ vabd.u8 q15, q2, q5 @ abs(P1-Q1)
+ vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
+ vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
+ vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+ vmov.i8 q13, #0x80
+ vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
+ .else
+ @ calculate hev and normal_limit:
+ vabd.u8 q12, q2, q3 @ abs(P1-P0)
+ vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
+ vabd.u8 q10, q0, q1 @ abs(P3-P2)
+ vabd.u8 q11, q1, q2 @ abs(P2-P1)
+ vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
+ vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
+ vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
+ vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
+ vand q8, q8, q9
+ vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
+ vand q8, q8, q11
+ vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
+ vand q8, q8, q10
+ vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
+ vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
+ vabd.u8 q9, q3, q4 @ abs(P0-Q0)
+ vabd.u8 q15, q2, q5 @ abs(P1-Q1)
+ vand q8, q8, q10
+ vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
+ vand q8, q8, q11
+ vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
+ vdup.8 q15, r12 @ hev_thresh
+ vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+ vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
+ vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
+ vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
+ vand q8, q8, q11
+ vmov.i8 q13, #0x80
+ vorr q9, q12, q14
+ .endif
+
+ @ at this point:
+ @ q8: normal_limit
+ @ q9: hev (not computed when simple=1)
+
+ @ convert to signed value:
+ veor q3, q3, q13 @ PS0 = P0 ^ 0x80
+ veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
+
+ vmov.i16 q12, #3
+ vsubl.s8 q10, d8, d6 @ QS0 - PS0
+ vsubl.s8 q11, d9, d7 @ (widened to 16bit)
+ veor q2, q2, q13 @ PS1 = P1 ^ 0x80
+ veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
+ vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
+ vmul.i16 q11, q11, q12
+
+ vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
+ vmov.i8 q14, #4
+ vmov.i8 q15, #3
+ .if \inner
+ vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
+ .endif
+ vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
+ vaddw.s8 q11, q11, d25
+ vqmovn.s16 d20, q10 @ narrow result back into q10
+ vqmovn.s16 d21, q11
+ .if !\inner && !\simple
+ veor q1, q1, q13 @ PS2 = P2 ^ 0x80
+ veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
+ .endif
+ vand q10, q10, q8 @ w &= normal_limit
+
+ @ registers used at this point..
+ @ q0 -> P3 (don't corrupt)
+ @ q1-q6 -> PS2-QS2
+ @ q7 -> Q3 (don't corrupt)
+ @ q9 -> hev
+ @ q10 -> w
+ @ q13 -> #0x80
+ @ q14 -> #4
+ @ q15 -> #3
+ @ q8, q11, q12 -> unused
+
+ @ filter_common: is4tap==1
+ @ c1 = clamp(w + 4) >> 3;
+ @ c2 = clamp(w + 3) >> 3;
+ @ Q0 = s2u(QS0 - c1);
+ @ P0 = s2u(PS0 + c2);
+
+ .if \simple
+ @ no hev mask in this path: w is used as is
+ vqadd.s8 q11, q10, q14 @ c1 = clamp(w+4)
+ vqadd.s8 q12, q10, q15 @ c2 = clamp(w+3)
+ vshr.s8 q11, q11, #3 @ c1 >>= 3
+ vshr.s8 q12, q12, #3 @ c2 >>= 3
+ vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
+ vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
+ veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
+ veor q3, q3, q13 @ P0 = PS0 ^ 0x80
+ veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
+ veor q2, q2, q13 @ P1 = PS1 ^ 0x80
+ .elseif \inner
+ @ the !is4tap case of filter_common, only used for inner blocks
+ @ c3 = ((c1&~hev) + 1) >> 1;
+ @ Q1 = s2u(QS1 - c3);
+ @ P1 = s2u(PS1 + c3);
+ @ hev was already applied to the clamp(PS1-QS1) term above, so w is
+ @ used unmasked here
+ vqadd.s8 q11, q10, q14 @ c1 = clamp(w+4)
+ vqadd.s8 q12, q10, q15 @ c2 = clamp(w+3)
+ vshr.s8 q11, q11, #3 @ c1 >>= 3
+ vshr.s8 q12, q12, #3 @ c2 >>= 3
+ vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
+ vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
+ vbic q11, q11, q9 @ c1 & ~hev
+ veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
+ vrshr.s8 q11, q11, #1 @ c3 = ((c1 & ~hev) + 1) >> 1
+ veor q3, q3, q13 @ P0 = PS0 ^ 0x80
+ vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
+ vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
+ veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
+ veor q2, q2, q13 @ P1 = PS1 ^ 0x80
+ .else
+ vand q12, q10, q9 @ w & hev
+ vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
+ vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
+ vshr.s8 q11, q11, #3 @ c1 >>= 3
+ vshr.s8 q12, q12, #3 @ c2 >>= 3
+ vbic q10, q10, q9 @ w &= ~hev
+ vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
+ vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
+
+ @ filter_mbedge:
+ @ a = clamp((27*w + 63) >> 7);
+ @ Q0 = s2u(QS0 - a);
+ @ P0 = s2u(PS0 + a);
+ @ a = clamp((18*w + 63) >> 7);
+ @ Q1 = s2u(QS1 - a);
+ @ P1 = s2u(PS1 + a);
+ @ a = clamp((9*w + 63) >> 7);
+ @ Q2 = s2u(QS2 - a);
+ @ P2 = s2u(PS2 + a);
+ @ 9*w is built as (w << 3) + w; 18*w and 27*w by repeated addition
+ vmov.i16 q9, #63
+ vshll.s8 q14, d20, #3
+ vshll.s8 q15, d21, #3
+ vaddw.s8 q14, q14, d20
+ vaddw.s8 q15, q15, d21
+ vadd.s16 q8, q9, q14
+ vadd.s16 q9, q9, q15 @ 9*w + 63
+ vadd.s16 q11, q8, q14
+ vadd.s16 q12, q9, q15 @ 18*w + 63
+ vadd.s16 q14, q11, q14
+ vadd.s16 q15, q12, q15 @ 27*w + 63
+ vqshrn.s16 d16, q8, #7
+ vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
+ vqshrn.s16 d22, q11, #7
+ vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
+ vqshrn.s16 d28, q14, #7
+ vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
+ vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
+ vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
+ vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
+ vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
+ vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
+ vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
+ veor q3, q3, q13 @ P0 = PS0 ^ 0x80
+ veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
+ veor q2, q2, q13 @ P1 = PS1 ^ 0x80
+ veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
+ veor q1, q1, q13 @ P2 = PS2 ^ 0x80
+ veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
+ .endif
+.endm
+
+@ Transposes two 8x8 byte matrices at once: one held in the low halves
+@ (d0, d2, ..., d14) and one in the high halves (d1, d3, ..., d15) of
+@ q0..q7. Done in three vtrn stages (32-, 16- and 8-bit elements).
+@ The horizontal loop filters use it to turn 8-byte pixel rows into the
+@ P3..Q3 register layout and back.
+.macro transpose8x16matrix
+ vtrn.32 q0, q4
+ vtrn.32 q1, q5
+ vtrn.32 q2, q6
+ vtrn.32 q3, q7
+
+ vtrn.16 q0, q2
+ vtrn.16 q1, q3
+ vtrn.16 q4, q6
+ vtrn.16 q5, q7
+
+ vtrn.8 q0, q1
+ vtrn.8 q2, q3
+ vtrn.8 q4, q5
+ vtrn.8 q6, q7
+.endm
+
+@ Generates ff_vp8_v_loop_filter16<name>_neon: filters 16 pixels across the
+@ edge between the row above dst and the row at dst.
+@ r0 = dst, r1 = stride, r2 = flim_E (the only limit when simple=1),
+@ r3 = flim_I, fifth argument on the stack = hev_thresh.
+@ Rows are accessed with a 128-bit alignment hint. q4-q7 are saved and
+@ restored.
+.macro vp8_v_loop_filter16 name, inner=0, simple=0
+function ff_vp8_v_loop_filter16\name\()_neon, export=1
+ vpush {q4-q7}
+ @ start at P3 (dst - 4*stride), or at P1 (dst - 2*stride) when simple
+ sub r0, r0, r1, lsl #1+!\simple
+
+ @ Load pixels:
+ .if !\simple
+ @ the vpush above moved sp by 64 bytes, hence the offset
+ ldr r12, [sp, #64] @ hev_thresh
+ vld1.8 {q0}, [r0,:128], r1 @ P3
+ vld1.8 {q1}, [r0,:128], r1 @ P2
+ .endif
+ vld1.8 {q2}, [r0,:128], r1 @ P1
+ vld1.8 {q3}, [r0,:128], r1 @ P0
+ vld1.8 {q4}, [r0,:128], r1 @ Q0
+ vld1.8 {q5}, [r0,:128], r1 @ Q1
+ .if !\simple
+ vld1.8 {q6}, [r0,:128], r1 @ Q2
+ vld1.8 {q7}, [r0,:128] @ Q3
+ vdup.8 q15, r3 @ flim_I
+ .endif
+ vdup.8 q14, r2 @ flim_E
+
+ vp8_loop_filter inner=\inner, simple=\simple
+
+ @ back up to the first row to store: 4 rows (to P1) when simple,
+ @ otherwise 6 rows in total (to P2)
+ sub r0, r0, r1, lsl #2
+ .if !\simple
+ sub r0, r0, r1, lsl #1
+
+ @ Store pixels:
+ vst1.8 {q1}, [r0,:128], r1 @ P2
+ .endif
+ vst1.8 {q2}, [r0,:128], r1 @ P1
+ vst1.8 {q3}, [r0,:128], r1 @ P0
+ vst1.8 {q4}, [r0,:128], r1 @ Q0
+ vst1.8 {q5}, [r0,:128], r1 @ Q1
+ .if !\simple
+ vst1.8 {q6}, [r0,:128] @ Q2
+ .endif
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_v_loop_filter16
+vp8_v_loop_filter16 _inner, inner=1
+vp8_v_loop_filter16 _simple, simple=1
+
+@ Generates ff_vp8_v_loop_filter8uv<name>_neon: filters an 8-pixel wide
+@ horizontal edge in two planes at once. The first plane occupies the low
+@ half and the second the high half of each q register.
+@ r0 = dstU, r1 = dstV, r2 = stride, r3 = flim_E,
+@ stack arguments: flim_I, hev_thresh.
+@ Rows are accessed with a 64-bit alignment hint. q4-q7 are saved and
+@ restored.
+.macro vp8_v_loop_filter8uv name, inner=0
+function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
+ vpush {q4-q7}
+ @ start at P3, four rows above the edge
+ sub r0, r0, r2, lsl #2
+ sub r1, r1, r2, lsl #2
+ @ stack arguments sit 64 bytes higher because of the vpush above
+ ldr r12, [sp, #64] @ flim_I
+
+ @ Load pixels:
+ vld1.8 {d0}, [r0,:64], r2 @ P3
+ vld1.8 {d1}, [r1,:64], r2 @ P3
+ vld1.8 {d2}, [r0,:64], r2 @ P2
+ vld1.8 {d3}, [r1,:64], r2 @ P2
+ vld1.8 {d4}, [r0,:64], r2 @ P1
+ vld1.8 {d5}, [r1,:64], r2 @ P1
+ vld1.8 {d6}, [r0,:64], r2 @ P0
+ vld1.8 {d7}, [r1,:64], r2 @ P0
+ vld1.8 {d8}, [r0,:64], r2 @ Q0
+ vld1.8 {d9}, [r1,:64], r2 @ Q0
+ vld1.8 {d10}, [r0,:64], r2 @ Q1
+ vld1.8 {d11}, [r1,:64], r2 @ Q1
+ vld1.8 {d12}, [r0,:64], r2 @ Q2
+ vld1.8 {d13}, [r1,:64], r2 @ Q2
+ vld1.8 {d14}, [r0,:64] @ Q3
+ vld1.8 {d15}, [r1,:64] @ Q3
+
+ vdup.8 q14, r3 @ flim_E
+ vdup.8 q15, r12 @ flim_I
+ ldr r12, [sp, #68] @ hev_thresh
+
+ vp8_loop_filter inner=\inner
+
+ @ back up to P2: u,v -= stride * 6
+ sub r0, r0, r2, lsl #2
+ sub r1, r1, r2, lsl #2
+ sub r0, r0, r2, lsl #1
+ sub r1, r1, r2, lsl #1
+
+ @ Store pixels:
+ vst1.8 {d2}, [r0,:64], r2 @ P2
+ vst1.8 {d3}, [r1,:64], r2 @ P2
+ vst1.8 {d4}, [r0,:64], r2 @ P1
+ vst1.8 {d5}, [r1,:64], r2 @ P1
+ vst1.8 {d6}, [r0,:64], r2 @ P0
+ vst1.8 {d7}, [r1,:64], r2 @ P0
+ vst1.8 {d8}, [r0,:64], r2 @ Q0
+ vst1.8 {d9}, [r1,:64], r2 @ Q0
+ vst1.8 {d10}, [r0,:64], r2 @ Q1
+ vst1.8 {d11}, [r1,:64], r2 @ Q1
+ vst1.8 {d12}, [r0,:64] @ Q2
+ vst1.8 {d13}, [r1,:64] @ Q2
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_v_loop_filter8uv
+vp8_v_loop_filter8uv _inner, inner=1
+
+@ Generates ff_vp8_h_loop_filter16<name>_neon: filters a vertical edge
+@ over 16 rows. Each row contributes the 8 bytes dst-4 .. dst+3, which
+@ become P3..Q3 after the transpose.
+@ r0 = dst, r1 = stride, r2 = flim_E (the only limit when simple=1),
+@ r3 = flim_I, fifth argument on the stack = hev_thresh.
+@ Loads and stores carry no alignment hint. All 8 bytes of every row are
+@ written back, including the columns the filter leaves unchanged.
+@ q4-q7 are saved and restored.
+.macro vp8_h_loop_filter16 name, inner=0, simple=0
+function ff_vp8_h_loop_filter16\name\()_neon, export=1
+ vpush {q4-q7}
+ sub r0, r0, #4
+ .if !\simple
+ @ the vpush above moved sp by 64 bytes, hence the offset
+ ldr r12, [sp, #64] @ hev_thresh
+ .endif
+
+ @ Load pixels:
+ vld1.8 {d0}, [r0], r1 @ load first 8-line src data
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d8}, [r0], r1
+ vld1.8 {d10}, [r0], r1
+ vld1.8 {d12}, [r0], r1
+ vld1.8 {d14}, [r0], r1
+ vld1.8 {d1}, [r0], r1 @ load second 8-line src data
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d5}, [r0], r1
+ vld1.8 {d7}, [r0], r1
+ vld1.8 {d9}, [r0], r1
+ vld1.8 {d11}, [r0], r1
+ vld1.8 {d13}, [r0], r1
+ vld1.8 {d15}, [r0], r1
+
+ transpose8x16matrix
+
+ vdup.8 q14, r2 @ flim_E
+ .if !\simple
+ vdup.8 q15, r3 @ flim_I
+ .endif
+
+ vp8_loop_filter inner=\inner, simple=\simple
+
+ sub r0, r0, r1, lsl #4 @ backup 16 rows
+
+ @ transpose back into row order
+ transpose8x16matrix
+
+ @ Store pixels:
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d4}, [r0], r1
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d1}, [r0], r1
+ vst1.8 {d3}, [r0], r1
+ vst1.8 {d5}, [r0], r1
+ vst1.8 {d7}, [r0], r1
+ vst1.8 {d9}, [r0], r1
+ vst1.8 {d11}, [r0], r1
+ vst1.8 {d13}, [r0], r1
+ vst1.8 {d15}, [r0]
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_h_loop_filter16
+vp8_h_loop_filter16 _inner, inner=1
+vp8_h_loop_filter16 _simple, simple=1
+
+@ Generates ff_vp8_h_loop_filter8uv<name>_neon: filters a vertical edge
+@ over 8 rows in two planes at once. Rows of the first plane go to the
+@ low halves and rows of the second plane to the high halves of q0..q7.
+@ r0 = dstU, r1 = dstV, r2 = stride, r3 = flim_E,
+@ stack arguments: flim_I, hev_thresh.
+@ Loads and stores carry no alignment hint. q4-q7 are saved and restored.
+.macro vp8_h_loop_filter8uv name, inner=0
+function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
+ vpush {q4-q7}
+ @ each row covers the 8 bytes from 4 left of the edge to 3 right of it
+ sub r0, r0, #4
+ sub r1, r1, #4
+ @ stack arguments sit 64 bytes higher because of the vpush above
+ ldr r12, [sp, #64] @ flim_I
+
+ @ Load pixels:
+ vld1.8 {d0}, [r0], r2 @ load u
+ vld1.8 {d1}, [r1], r2 @ load v
+ vld1.8 {d2}, [r0], r2
+ vld1.8 {d3}, [r1], r2
+ vld1.8 {d4}, [r0], r2
+ vld1.8 {d5}, [r1], r2
+ vld1.8 {d6}, [r0], r2
+ vld1.8 {d7}, [r1], r2
+ vld1.8 {d8}, [r0], r2
+ vld1.8 {d9}, [r1], r2
+ vld1.8 {d10}, [r0], r2
+ vld1.8 {d11}, [r1], r2
+ vld1.8 {d12}, [r0], r2
+ vld1.8 {d13}, [r1], r2
+ vld1.8 {d14}, [r0], r2
+ vld1.8 {d15}, [r1], r2
+
+ transpose8x16matrix
+
+ vdup.8 q14, r3 @ flim_E
+ vdup.8 q15, r12 @ flim_I
+ ldr r12, [sp, #68] @ hev_thresh
+
+ vp8_loop_filter inner=\inner
+
+ sub r0, r0, r2, lsl #3 @ backup u 8 rows
+ sub r1, r1, r2, lsl #3 @ backup v 8 rows
+
+ @ transpose back into row order
+ transpose8x16matrix
+
+ @ Store pixels:
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r1], r2
+ vst1.8 {d2}, [r0], r2
+ vst1.8 {d3}, [r1], r2
+ vst1.8 {d4}, [r0], r2
+ vst1.8 {d5}, [r1], r2
+ vst1.8 {d6}, [r0], r2
+ vst1.8 {d7}, [r1], r2
+ vst1.8 {d8}, [r0], r2
+ vst1.8 {d9}, [r1], r2
+ vst1.8 {d10}, [r0], r2
+ vst1.8 {d11}, [r1], r2
+ vst1.8 {d12}, [r0], r2
+ vst1.8 {d13}, [r1], r2
+ vst1.8 {d14}, [r0]
+ vst1.8 {d15}, [r1]
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_h_loop_filter8uv
+vp8_h_loop_filter8uv _inner, inner=1
+
+@ void ff_put_vp8_pixels16_neon(uint8_t *dst, int dststride, uint8_t *src,
+@ int srcstride, int h, int x, int y)
+@ r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, [sp] = h
+@ Copies 16-byte rows from src to dst, four rows per iteration, until the
+@ row counter is no longer positive (h is effectively rounded up to a
+@ multiple of 4). x and y are not read. dst stores carry a 128-bit
+@ alignment hint; src loads carry none.
+function ff_put_vp8_pixels16_neon, export=1
+ ldr r12, [sp, #0] @ h
+1:
+ subs r12, r12, #4
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [r2], r3
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q3}, [r2], r3
+ vst1.8 {q0}, [r0,:128], r1
+ vst1.8 {q1}, [r0,:128], r1
+ vst1.8 {q2}, [r0,:128], r1
+ vst1.8 {q3}, [r0,:128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+@ void ff_put_vp8_pixels8_neon(uint8_t *dst, int dststride, uint8_t *src,
+@ int srcstride, int h, int x, int y)
+@ r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, [sp] = h
+@ Copies 8-byte rows from src to dst, four rows per iteration, until the
+@ row counter is no longer positive. x and y are not read. dst stores
+@ carry a 64-bit alignment hint; src loads carry none.
+function ff_put_vp8_pixels8_neon, export=1
+ ldr r12, [sp, #0] @ h
+1:
+ subs r12, r12, #4
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vst1.8 {d0}, [r0,:64], r1
+ vst1.8 {d1}, [r0,:64], r1
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+@ void ff_put_vp8_pixels4_neon(uint8_t *dst, int dststride, uint8_t *src,
+@ int srcstride, int h, int x, int y)
+@ r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, [sp] = h
+@ Copies 4-byte rows from src to dst, four rows per iteration, using core
+@ register word loads and stores (no NEON instructions). x and y are not
+@ read.
+@ NOTE(review): src is read with plain ldr; this presumably relies on
+@ unaligned word access being permitted on the target - confirm.
+function ff_put_vp8_pixels4_neon, export=1
+ @ h is fetched before the push, so it is still at offset 0
+ ldr r12, [sp, #0] @ h
+ push {r4-r6,lr}
+1:
+ subs r12, r12, #4
+ ldr r4, [r2], r3
+ ldr r5, [r2], r3
+ ldr r6, [r2], r3
+ ldr lr, [r2], r3
+ str r4, [r0], r1
+ str r5, [r0], r1
+ str r6, [r0], r1
+ str lr, [r0], r1
+ bgt 1b
+ pop {r4-r6,pc}
+endfunc
+
+/* 4/6-tap 8th-pel MC */
+
+@ Horizontal 6-tap filter producing 8 output pixels in \d.
+@ \a, \b: two d registers holding 16 consecutive source bytes; output
+@ pixel i uses source bytes i .. i+5.
+@ Filter coefficients f0..f5 are the 16-bit lanes d0[0..3], d1[0..1]:
+@ out = sat_u8((f0*s0 - f1*s1 + f2*s2 + f3*s3 - f4*s4 + f5*s5 + 64) >> 7)
+@ Clobbers q8-q13 and d27-d31.
+.macro vp8_epel8_h6 d, a, b
+ vext.8 d27, \a, \b, #1
+ vmovl.u8 q8, \a
+ vext.8 d28, \a, \b, #2
+ vmovl.u8 q9, d27
+ vext.8 d29, \a, \b, #3
+ vmovl.u8 q10, d28
+ vext.8 d30, \a, \b, #4
+ vmovl.u8 q11, d29
+ vext.8 d31, \a, \b, #5
+ vmovl.u8 q12, d30
+ vmul.u16 q10, q10, d0[2]
+ vmovl.u8 q13, d31
+ vmul.u16 q11, q11, d0[3]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q11, q12, d1[0]
+ vmla.u16 q10, q8, d0[0]
+ vmla.u16 q11, q13, d1[1]
+ @ the two partial sums are combined with a saturating signed add
+ vqadd.s16 q11, q10, q11
+ vqrshrun.s16 \d, q11, #7
+.endm
+
+@ Horizontal 6-tap filter producing 16 output pixels in \d0 (pixels 0-7)
+@ and \d1 (pixels 8-15).
+@ \q0, \q1: q registers holding the source bytes; \s0, \s1: the low and
+@ high d halves of \q0 (the unshifted tap). \s2 is accepted but not
+@ referenced in the body.
+@ Coefficients and rounding are the same as in vp8_epel8_h6.
+@ Clobbers q1-q3 and q8-q15. Note that the parameter names d0/d1/q0/q1 do
+@ not affect the literal d0[..]/d1[..] coefficient operands below; only
+@ the backslash forms are substituted.
+.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
+ vext.8 q14, \q0, \q1, #3
+ vext.8 q15, \q0, \q1, #4
+ vmovl.u8 q11, d28
+ vmovl.u8 q14, d29
+ vext.8 q3, \q0, \q1, #2
+ vmovl.u8 q12, d30
+ vmovl.u8 q15, d31
+ vext.8 q8, \q0, \q1, #1
+ vmovl.u8 q10, d6
+ vmovl.u8 q3, d7
+ vext.8 q2, \q0, \q1, #5
+ vmovl.u8 q13, d4
+ vmovl.u8 q2, d5
+ vmovl.u8 q9, d16
+ vmovl.u8 q8, d17
+ vmul.u16 q11, q11, d0[3]
+ vmul.u16 q10, q10, d0[2]
+ vmul.u16 q3, q3, d0[2]
+ vmul.u16 q14, q14, d0[3]
+ vmls.u16 q11, q12, d1[0]
+ vmovl.u8 q12, \s0
+ vmovl.u8 q1, \s1
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q3, q8, d0[1]
+ vmls.u16 q14, q15, d1[0]
+ vmla.u16 q10, q12, d0[0]
+ vmla.u16 q11, q13, d1[1]
+ vmla.u16 q3, q1, d0[0]
+ vmla.u16 q14, q2, d1[1]
+ vqadd.s16 q11, q10, q11
+ vqadd.s16 q14, q3, q14
+ vqrshrun.s16 \d0, q11, #7
+ vqrshrun.s16 \d1, q14, #7
+.endm
+
+@ Vertical 6-tap filter producing one row of 8 output pixels in \d0.
+@ \s0..\s5: d registers holding 8 pixels from six consecutive rows.
+@ Coefficients f0..f5 are the 16-bit lanes d0[0..3], d1[0..1]:
+@ out = sat_u8((f0*s0 - f1*s1 + f2*s2 + f3*s3 - f4*s4 + f5*s5 + 64) >> 7)
+@ Clobbers q8-q13.
+.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
+ vmovl.u8 q10, \s2
+ vmovl.u8 q11, \s3
+ vmovl.u8 q9, \s1
+ vmovl.u8 q12, \s4
+ vmovl.u8 q8, \s0
+ vmovl.u8 q13, \s5
+ vmul.u16 q10, q10, d0[2]
+ vmul.u16 q11, q11, d0[3]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q11, q12, d1[0]
+ vmla.u16 q10, q8, d0[0]
+ vmla.u16 q11, q13, d1[1]
+ vqadd.s16 q11, q10, q11
+ vqrshrun.s16 \d0, q11, #7
+.endm
+
+@ Vertical 6-tap filter producing two rows of 8 output pixels from seven
+@ consecutive input rows \s0..\s6:
+@ \d0 is filtered from \s0..\s5, \d1 from \s1..\s6.
+@ Coefficients and rounding are the same as in vp8_epel8_v6.
+@ Clobbers q8-q15.
+.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
+ vmovl.u8 q10, \s0
+ vmovl.u8 q11, \s3
+ vmovl.u8 q14, \s6
+ vmovl.u8 q9, \s1
+ vmovl.u8 q12, \s4
+ vmovl.u8 q8, \s2
+ vmovl.u8 q13, \s5
+ @ q10 + q15 accumulate the first output row, q11 + q14 the second
+ vmul.u16 q10, q10, d0[0]
+ vmul.u16 q15, q11, d0[3]
+ vmul.u16 q11, q11, d0[2]
+ vmul.u16 q14, q14, d1[1]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q15, q12, d1[0]
+ vmls.u16 q11, q8, d0[1]
+ vmls.u16 q14, q13, d1[0]
+ vmla.u16 q10, q8, d0[2]
+ vmla.u16 q15, q13, d1[1]
+ vmla.u16 q11, q9, d0[0]
+ vmla.u16 q14, q12, d0[3]
+ vqadd.s16 q15, q10, q15
+ vqadd.s16 q14, q11, q14
+ vqrshrun.s16 \d0, q15, #7
+ vqrshrun.s16 \d1, q14, #7
+.endm
+
+@ Horizontal 4-tap filter producing 8 output pixels in \d.
+@ \a, \b: two d registers holding 16 consecutive source bytes; output
+@ pixel i uses source bytes i .. i+3.
+@ Only the four middle coefficients f1..f4 (d0[1..3], d1[0]) are used:
+@ out = sat_u8((-f1*s0 + f2*s1 + f3*s2 - f4*s3 + 64) >> 7)
+@ Clobbers q9-q12 and d28-d30.
+.macro vp8_epel8_h4 d, a, b
+ vext.8 d28, \a, \b, #1
+ vmovl.u8 q9, \a
+ vext.8 d29, \a, \b, #2
+ vmovl.u8 q10, d28
+ vext.8 d30, \a, \b, #3
+ vmovl.u8 q11, d29
+ vmovl.u8 q12, d30
+ vmul.u16 q10, q10, d0[2]
+ vmul.u16 q11, q11, d0[3]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q11, q12, d1[0]
+ vqadd.s16 q11, q10, q11
+ vqrshrun.s16 \d, q11, #7
+.endm
+
+@ Vertical 4-tap filter producing two rows of 8 output pixels from five
+@ consecutive input rows \s0..\s4:
+@ \d0 is filtered from \s0..\s3, \d1 from \s1..\s4.
+@ Only the four middle coefficients f1..f4 (d0[1..3], d1[0]) are used:
+@ out = sat_u8((-f1*r0 + f2*r1 + f3*r2 - f4*r3 + 64) >> 7)
+@ Clobbers q8-q15.
+.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
+ vmovl.u8 q9, \s0
+ vmovl.u8 q10, \s1
+ vmovl.u8 q11, \s2
+ vmovl.u8 q12, \s3
+ vmovl.u8 q13, \s4
+ @ q8 + q14 accumulate the first output row, q11 + q15 the second
+ vmul.u16 q8, q10, d0[2]
+ vmul.u16 q14, q11, d0[3]
+ vmul.u16 q11, q11, d0[2]
+ vmul.u16 q15, q12, d0[3]
+ vmls.u16 q8, q9, d0[1]
+ vmls.u16 q14, q12, d1[0]
+ vmls.u16 q11, q10, d0[1]
+ vmls.u16 q15, q13, d1[0]
+ vqadd.s16 q8, q8, q14
+ vqadd.s16 q11, q11, q15
+ vqrshrun.s16 \d0, q8, #7
+ vqrshrun.s16 \d1, q11, #7
+.endm
+
+@ void ff_put_vp8_epel16_v6_neon(uint8_t *dst, int dststride, uint8_t *src,
+@ int srcstride, int h, int x, int y)
+@ r0 = dst, r1 = dststride, r2 = src, r3 = srcstride,
+@ stack arguments: h, x (not read), y (called my here)
+@ 16-pixel wide vertical 6-tap filter. Two output rows are produced per
+@ iteration and the loop ends when the counter reaches exactly zero, so h
+@ is presumably always a positive even number - confirm against callers.
+@ The coefficients are the 16 bytes at subpel_filters + (my - 1) * 16
+@ (the table is defined outside this view).
+function ff_put_vp8_epel16_v6_neon, export=1
+ @ the filter reaches two rows above the current one
+ sub r2, r2, r3, lsl #1
+ push {r4,lr}
+ vpush {d8-d15}
+
+ @ 8 bytes of core registers + 64 bytes of d8-d15 were pushed, so the
+ @ stack arguments start at sp + 72
+ ldr r4, [sp, #80] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #72] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ @ load seven rows, then step back four: net advance of two rows
+ vld1.8 {d2-d3}, [r2], r3
+ vld1.8 {d4-d5}, [r2], r3
+ vld1.8 {d6-d7}, [r2], r3
+ vld1.8 {d8-d9}, [r2], r3
+ vld1.8 {d10-d11},[r2], r3
+ vld1.8 {d12-d13},[r2], r3
+ vld1.8 {d14-d15},[r2]
+ sub r2, r2, r3, lsl #2
+
+ @ left and right 8-pixel halves are filtered separately
+ vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
+ vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
+
+ vst1.8 {d2-d3}, [r0,:128], r1
+ vst1.8 {d4-d5}, [r0,:128], r1
+ subs r12, r12, #2
+ bne 1b
+
+ vpop {d8-d15}
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel16_h6_neon: 6-tap horizontal filter, 16 bytes per row,
+@ one row per loop iteration.
+@ r0/r1: store pointer (16-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and mx are read from the caller's stack.
+@ NOTE(review): vp8_epel16_h6 is defined outside this section; it presumably
+@ leaves the 16 filtered pixels in d2-d3, which are stored afterwards.
+function ff_put_vp8_epel16_h6_neon, export=1
+ sub r2, r2, #2 @ start two bytes left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps mx=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2-d4}, [r2], r3 @ 24 source bytes per row
+
+ vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
+
+ vst1.8 {d2-d3}, [r0,:128], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel16_h6v6_neon: 6-tap horizontal then 6-tap vertical filter,
+@ 16 bytes per row.
+@ Pass 1 filters h+5 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack; pass 2 filters that buffer vertically, one output row per
+@ iteration, into r0 (step r1).
+@ The scratch area is 336 bytes (21 rows of 16) plus 16 bytes of alignment
+@ slack, so h must not exceed 16.
+@ NOTE(review): vp8_epel16_h6 and vp8_epel8_v6 are defined outside this
+@ section; their results are taken to land in the first macro argument(s).
+function ff_put_vp8_epel16_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ start two rows above ...
+ sub r2, r2, #2 @ ... and two bytes left of the given position
+ push {r4,lr}
+ vpush {d8-d9} @ d8-d9 are used in pass 2; restored on exit
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #28] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #24] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #336+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5 @ the vertical filter needs 5 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {d2,d3,d4}, [r2], r3
+
+ vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
+
+ vst1.8 {d2-d3}, [lr,:128]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #336+16+32] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #336+16+24] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d5}, [lr,:128]! @ rows 0-1
+ vld1.8 {d6-d9}, [lr,:128]! @ rows 2-3
+ vld1.8 {d28-d31},[lr,:128] @ rows 4-5
+ sub lr, lr, #48 @ net advance of 16 bytes = one row
+
+ vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
+ vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
+
+ vst1.8 {d2-d3}, [r0,:128], r1
+ subs r12, r12, #1
+ bne 2b
+
+ add sp, sp, #336+16
+ vpop {d8-d9}
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_v6_neon: 6-tap vertical filter, 8 bytes per row,
+@ two output rows per loop iteration.
+@ r0/r1: store pointer (8-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and my are read from the caller's stack; h must be even and non-zero
+@ because the loop counts down by 2 and exits only on exactly 0.
+function ff_put_vp8_epel8_v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ start two rows above the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps my=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d7}, [r2], r3
+ vld1.8 {d28}, [r2]
+
+ sub r2, r2, r3, lsl #2 @ net advance: 6 rows forward, 4 back = 2 rows
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_h6_neon: 6-tap horizontal filter, 8 bytes per row,
+@ one row per loop iteration.
+@ r0/r1: store pointer (8-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and mx are read from the caller's stack.
+@ NOTE(review): vp8_epel8_h6 is defined outside this section; it presumably
+@ leaves the 8 filtered pixels in its first argument (d2).
+function ff_put_vp8_epel8_h6_neon, export=1
+ sub r2, r2, #2 @ start two bytes left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps mx=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ 16 source bytes per row
+
+ vp8_epel8_h6 d2, d2, d3
+
+ vst1.8 {d2}, [r0,:64], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_h6v6_neon: 6-tap horizontal then 6-tap vertical filter,
+@ 8 bytes per row.
+@ Pass 1 filters h+5 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (8 bytes per row); pass 2 filters it vertically, two output
+@ rows per iteration, into r0 (step r1).
+@ The scratch area is 168 bytes (21 rows of 8) plus 16 bytes of alignment
+@ slack, so h must not exceed 16; pass 2 also requires h to be even.
+@ NOTE(review): vp8_epel8_h6 is defined outside this section.
+function ff_put_vp8_epel8_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ start two rows above ...
+ sub r2, r2, #2 @ ... and two bytes left of the given position
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5 @ the vertical filter needs 5 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h6 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d5}, [lr,:128]! @ rows 0-3
+ vld1.8 {d6-d7}, [lr,:128]! @ rows 4-5
+ vld1.8 {d30}, [lr,:64] @ row 6
+ sub lr, lr, #32 @ net advance of 16 bytes = two rows
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_v4_neon: 4-tap vertical filter, 8 bytes per row,
+@ two output rows per loop iteration.
+@ r0/r1: store pointer (8-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and my are read from the caller's stack; h must be even and non-zero
+@ because the loop counts down by 2 and exits only on exactly 0.
+function ff_put_vp8_epel8_v4_neon, export=1
+ sub r2, r2, r3 @ start one row above the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps my=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d6}, [r2]
+ sub r2, r2, r3, lsl #1 @ net advance: 4 rows forward, 2 back = 2 rows
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_h4_neon: 4-tap horizontal filter, 8 bytes per row,
+@ one row per loop iteration.
+@ r0/r1: store pointer (8-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and mx are read from the caller's stack.
+function ff_put_vp8_epel8_h4_neon, export=1
+ sub r2, r2, #1 @ start one byte left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps mx=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2,d3}, [r2], r3 @ 16 source bytes per row
+
+ vp8_epel8_h4 d2, d2, d3
+
+ vst1.8 {d2}, [r0,:64], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_h4v4_neon: 4-tap horizontal then 4-tap vertical filter,
+@ 8 bytes per row.
+@ Pass 1 filters h+3 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (8 bytes per row, 168 bytes plus 16 bytes alignment slack);
+@ pass 2 filters it vertically, two output rows per iteration, into r0
+@ (step r1). Pass 2 requires h to be even and non-zero.
+function ff_put_vp8_epel8_h4v4_neon, export=1
+ sub r2, r2, r3 @ start one row above ...
+ sub r2, r2, #1 @ ... and one byte left of the given position
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3 @ the vertical filter needs 3 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h4 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d5}, [lr,:128]! @ rows 0-3
+ vld1.8 {d6}, [lr,:64] @ row 4
+ sub lr, lr, #16 @ net advance of 16 bytes = two rows
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_h6v4_neon: 6-tap horizontal then 4-tap vertical filter,
+@ 8 bytes per row.
+@ Pass 1 filters h+3 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (8 bytes per row, 168 bytes plus 16 bytes alignment slack);
+@ pass 2 filters it vertically, two output rows per iteration, into r0
+@ (step r1). Pass 2 requires h to be even and non-zero.
+@ NOTE(review): vp8_epel8_h6 is defined outside this section.
+function ff_put_vp8_epel8_h6v4_neon, export=1
+ sub r2, r2, r3 @ start one row above ...
+ sub r2, r2, #2 @ ... and two bytes left of the given position
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3 @ the vertical filter needs 3 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h6 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d5}, [lr,:128]! @ rows 0-3
+ vld1.8 {d6}, [lr,:64] @ row 4
+ sub lr, lr, #16 @ net advance of 16 bytes = two rows
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel8_h4v6_neon: 4-tap horizontal then 6-tap vertical filter,
+@ 8 bytes per row.
+@ Pass 1 filters h+5 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (8 bytes per row); pass 2 filters it vertically, two output
+@ rows per iteration, into r0 (step r1).
+@ The scratch area is 168 bytes (21 rows of 8) plus 16 bytes of alignment
+@ slack, so h must not exceed 16; pass 2 also requires h to be even.
+function ff_put_vp8_epel8_h4v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ start two rows above ...
+ sub r2, r2, #1 @ ... and one byte left of the given position
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5 @ the vertical filter needs 5 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h4 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d5}, [lr,:128]! @ rows 0-3
+ vld1.8 {d6-d7}, [lr,:128]! @ rows 4-5
+ vld1.8 {d30}, [lr,:64] @ row 6
+ sub lr, lr, #32 @ net advance of 16 bytes = two rows
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_v6_neon: 6-tap vertical filter, 4 bytes per row,
+@ four output rows per loop iteration.
+@ Each d register carries two 4-byte rows: lane 0 holds row k and lane 1
+@ holds row k+2, so a single vp8_epel8_v6_y2 call yields output rows 0 and 2
+@ in d2 and rows 1 and 3 in d3.
+@ r0/r1: store pointer (4-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and my are read from the caller's stack; h must be a non-zero multiple
+@ of 4 because the loop counts down by 4 and exits only on exactly 0.
+function ff_put_vp8_epel4_v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ start two rows above the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps my=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d6[]}, [r2], r3
+ vld1.32 {d7[]}, [r2], r3
+ vld1.32 {d28[]}, [r2]
+ sub r2, r2, r3, lsl #2 @ back to row 2 for the lane-1 loads
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d5[1]}, [r2], r3
+ vld1.32 {d6[1]}, [r2], r3
+ vld1.32 {d7[1]}, [r2], r3
+ vld1.32 {d28[1]}, [r2]
+ sub r2, r2, r3, lsl #2 @ net advance over the iteration: 4 rows
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
+
+ @ interleave the lanes back into row order 0, 1, 2, 3
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_h6_neon: 6-tap horizontal filter, 4 bytes per row,
+@ one row per loop iteration. The 8-pixel macro is reused and only the low
+@ 4 result bytes are stored.
+@ r0/r1: store pointer (4-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and mx are read from the caller's stack.
+@ NOTE(review): vp8_epel8_h6 is defined outside this section.
+function ff_put_vp8_epel4_h6_neon, export=1
+ sub r2, r2, #2 @ start two bytes left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps mx=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {q1}, [r2], r3 @ reads 16 source bytes per row
+ vp8_epel8_h6 d2, d2, d3
+ vst1.32 {d2[0]}, [r0,:32], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_h6v6_neon: 6-tap horizontal then 6-tap vertical filter,
+@ 4 bytes per row.
+@ Pass 1 filters h+5 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (4 bytes per row). Pass 2 produces four output rows per
+@ iteration by packing rows k and k+2 into the two 32-bit lanes of each
+@ d register before calling vp8_epel8_v6_y2.
+@ The scratch area is 52 bytes (13 rows of 4) plus 16 bytes of alignment
+@ slack, so h must not exceed 8; pass 2 needs h to be a multiple of 4.
+@ NOTE(review): vp8_epel8_h6 is defined outside this section.
+function ff_put_vp8_epel4_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ start two rows above ...
+ sub r2, r2, #2 @ ... and two bytes left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #52+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5 @ the vertical filter needs 5 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {q1}, [r2], r3
+ vp8_epel8_h6 d2, d2, d3
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #52+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #52+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d3}, [lr,:128]! @ rows 0-3
+ vld1.8 {d6}, [lr,:64]! @ rows 4-5
+ vld1.32 {d28[]}, [lr,:32] @ row 6
+ sub lr, lr, #16 @ back to row 2
+ vld1.8 {d4-d5}, [lr]! @ rows 2-5
+ vld1.8 {d7}, [lr,:64]! @ rows 6-7
+ vld1.32 {d28[1]}, [lr,:32] @ row 8
+ sub lr, lr, #16 @ net advance of 16 bytes = four rows
+ vtrn.32 q1, q2 @ d2=rows 0,2 d4=rows 1,3 d3=rows 2,4 d5=rows 3,5
+ vtrn.32 d6, d7 @ d6=rows 4,6 d7=rows 5,7
+ vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #52+16
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_h4v6_neon: 4-tap horizontal then 6-tap vertical filter,
+@ 4 bytes per row.
+@ Pass 1 filters h+5 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (4 bytes per row). Pass 2 produces four output rows per
+@ iteration by packing rows k and k+2 into the two 32-bit lanes of each
+@ d register before calling vp8_epel8_v6_y2.
+@ The scratch area is 52 bytes (13 rows of 4) plus 16 bytes of alignment
+@ slack, so h must not exceed 8; pass 2 needs h to be a multiple of 4.
+function ff_put_vp8_epel4_h4v6_neon, export=1
+ sub r2, r2, r3, lsl #1 @ start two rows above ...
+ sub r2, r2, #1 @ ... and one byte left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #52+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5 @ the vertical filter needs 5 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {d2}, [r2], r3
+ vp8_epel8_h4 d2, d2, d2 @ only the low 4 result bytes are kept, which depend on d2 bytes 0-6 only
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #52+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #52+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d3}, [lr,:128]! @ rows 0-3
+ vld1.8 {d6}, [lr,:64]! @ rows 4-5
+ vld1.32 {d28[]}, [lr,:32] @ row 6
+ sub lr, lr, #16 @ back to row 2
+ vld1.8 {d4-d5}, [lr]! @ rows 2-5
+ vld1.8 {d7}, [lr,:64]! @ rows 6-7
+ vld1.32 {d28[1]}, [lr,:32] @ row 8
+ sub lr, lr, #16 @ net advance of 16 bytes = four rows
+ vtrn.32 q1, q2 @ d2=rows 0,2 d4=rows 1,3 d3=rows 2,4 d5=rows 3,5
+ vtrn.32 d6, d7 @ d6=rows 4,6 d7=rows 5,7
+ vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #52+16
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_h6v4_neon: 6-tap horizontal then 4-tap vertical filter,
+@ 4 bytes per row.
+@ Pass 1 filters h+3 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (4 bytes per row). Pass 2 produces four output rows per
+@ iteration by packing rows k and k+2 into the two 32-bit lanes of each
+@ d register before calling vp8_epel8_v4_y2.
+@ The scratch area is 44 bytes (11 rows of 4) plus 16 bytes of alignment
+@ slack, so h must not exceed 8; pass 2 needs h to be a multiple of 4.
+@ NOTE(review): vp8_epel8_h6 is defined outside this section.
+function ff_put_vp8_epel4_h6v4_neon, export=1
+ sub r2, r2, r3 @ start one row above ...
+ sub r2, r2, #2 @ ... and two bytes left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #44+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3 @ the vertical filter needs 3 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {q1}, [r2], r3
+ vp8_epel8_h6 d2, d2, d3
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #44+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #44+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d3}, [lr,:128]! @ rows 0-3
+ vld1.32 {d6[]}, [lr,:32] @ row 4
+ sub lr, lr, #8 @ back to row 2
+ vld1.8 {d4-d5}, [lr]! @ rows 2-5
+ vld1.32 {d6[1]}, [lr,:32] @ row 6
+ sub lr, lr, #8 @ net advance of 16 bytes = four rows
+ vtrn.32 q1, q2 @ d2=rows 0,2 d4=rows 1,3 d3=rows 2,4 d5=rows 3,5
+ vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #44+16
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_h4_neon: 4-tap horizontal filter, 4 bytes per row,
+@ one row per loop iteration. The 8-pixel macro is reused and only the low
+@ 4 result bytes are stored.
+@ r0/r1: store pointer (4-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and mx are read from the caller's stack.
+function ff_put_vp8_epel4_h4_neon, export=1
+ sub r2, r2, #1 @ start one byte left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps mx=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2}, [r2], r3
+ vp8_epel8_h4 d2, d2, d2 @ the stored low 4 bytes depend on d2 bytes 0-6 only
+ vst1.32 {d2[0]}, [r0,:32], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_v4_neon: 4-tap vertical filter, 4 bytes per row,
+@ four output rows per loop iteration.
+@ Each d register carries two 4-byte rows: lane 0 holds row k and lane 1
+@ holds row k+2, so a single vp8_epel8_v4_y2 call yields output rows 0 and 2
+@ in d2 and rows 1 and 3 in d3.
+@ r0/r1: store pointer (4-byte aligned stores) and its per-row step.
+@ r2/r3: load pointer and its per-row step.
+@ h and my are read from the caller's stack; h must be a non-zero multiple
+@ of 4 because the loop counts down by 4 and exits only on exactly 0.
+function ff_put_vp8_epel4_v4_neon, export=1
+ sub r2, r2, r3 @ start one row above the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4 @ 16 bytes per table row; the -16 bias maps my=1 to row 0
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d6[]}, [r2]
+ sub r2, r2, r3, lsl #1 @ back to row 2 for the lane-1 loads
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d5[1]}, [r2], r3
+ vld1.32 {d6[1]}, [r2]
+ sub r2, r2, r3, lsl #1 @ net advance over the iteration: 4 rows
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ @ interleave the lanes back into row order 0, 1, 2, 3
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+@ ff_put_vp8_epel4_h4v4_neon: 4-tap horizontal then 4-tap vertical filter,
+@ 4 bytes per row.
+@ Pass 1 filters h+3 rows horizontally into a 16-byte aligned scratch buffer
+@ on the stack (4 bytes per row). Pass 2 produces four output rows per
+@ iteration by packing rows k and k+2 into the two 32-bit lanes of each
+@ d register before calling vp8_epel8_v4_y2.
+@ The scratch area is 44 bytes (11 rows of 4) plus 16 bytes of alignment
+@ slack, so h must not exceed 8; pass 2 needs h to be a multiple of 4.
+function ff_put_vp8_epel4_h4v4_neon, export=1
+ sub r2, r2, r3 @ start one row above ...
+ sub r2, r2, #1 @ ... and one byte left of the given position
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #44+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3 @ the vertical filter needs 3 extra rows
+ bic lr, lr, #15 @ lr = scratch buffer rounded up to 16 bytes
+1:
+ vld1.8 {d2}, [r2], r3
+ vp8_epel8_h4 d2, d2, d3 @ d3 is not loaded here; it only affects result bytes 5-7, which are not stored
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #44+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #44+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15 @ recompute the aligned scratch buffer start
+2:
+ vld1.8 {d2-d3}, [lr,:128]! @ rows 0-3
+ vld1.32 {d6[]}, [lr,:32] @ row 4
+ sub lr, lr, #8 @ back to row 2
+ vld1.8 {d4-d5}, [lr]! @ rows 2-5
+ vld1.32 {d6[1]}, [lr,:32] @ row 6
+ sub lr, lr, #8 @ net advance of 16 bytes = four rows
+ vtrn.32 q1, q2 @ d2=rows 0,2 d4=rows 1,3 d3=rows 2,4 d5=rows 3,5
+ vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #44+16
+ pop {r4,pc}
+endfunc
+
+@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
+@ arithmetic can be used to apply filters
+@ subpel_filters: seven rows of eight halfwords (16 bytes per row); only the
+@ first six entries of each row are filter taps, the last two are padding.
+@ The epel functions address it as subpel_filters-16 + index*16, so index 1
+@ selects the first row. All taps are stored as magnitudes: the filter
+@ macros subtract the products of the second and fifth columns (vmls) and
+@ add the rest.
+const subpel_filters, align=4
+ .short 0, 6, 123, 12, 1, 0, 0, 0
+ .short 2, 11, 108, 36, 8, 1, 0, 0
+ .short 0, 9, 93, 50, 6, 0, 0, 0
+ .short 3, 16, 77, 77, 16, 3, 0, 0
+ .short 0, 6, 50, 93, 9, 0, 0, 0
+ .short 1, 8, 36, 108, 11, 2, 0, 0
+ .short 0, 1, 12, 123, 6, 0, 0, 0
+endconst
+
+/* Bilinear MC */
+
+@ ff_put_vp8_bilin16_h_neon: horizontal bilinear filter, 16 bytes per row,
+@ two rows per loop iteration.
+@ out[x] = (src[x] * (8 - mx) + src[x+1] * mx + 4) >> 3
+@ r0: store pointer (16-byte aligned stores), r2: load pointer.
+@ r3 is overwritten with mx, so both pointers are stepped by r1.
+@ h and mx are read from the stack. The loop runs while the counter stays
+@ positive after subtracting 2.
+function ff_put_vp8_bilin16_h_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = mx in every byte
+ vdup.8 d1, r12 @ d1 = 8 - mx in every byte
+ ldr r12, [sp] @ h
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {d2-d4}, [r2], r1
+ vext.8 q2, q1, q2, #1 @ q2 = first row shifted left by one byte
+ vmull.u8 q8, d2, d1
+ vmlal.u8 q8, d4, d0
+ vld1.8 {d18-d20},[r2], r1
+ vmull.u8 q3, d3, d1
+ vmlal.u8 q3, d5, d0
+ vext.8 q10, q9, q10, #1 @ q10 = second row shifted left by one byte
+ vmull.u8 q11, d18, d1
+ vmlal.u8 q11, d20, d0
+ vmull.u8 q12, d19, d1
+ vmlal.u8 q12, d21, d0
+ vrshrn.u16 d4, q8, #3
+ vrshrn.u16 d5, q3, #3
+ vrshrn.u16 d6, q11, #3
+ vrshrn.u16 d7, q12, #3
+ vst1.8 {q2}, [r0,:128], r1
+ vst1.8 {q3}, [r0,:128], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin16_v_neon: vertical bilinear filter, 16 bytes per row,
+@ two rows per loop iteration.
+@ out = (row[y] * (8 - my) + row[y+1] * my + 4) >> 3
+@ r0: store pointer (16-byte aligned stores), r2: load pointer.
+@ r3 is overwritten with my, so both pointers are stepped by r1.
+@ h and my are read from the stack. q1 carries the last loaded row into the
+@ next iteration.
+function ff_put_vp8_bilin16_v_neon, export=1
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = my in every byte
+ vdup.8 d1, r12 @ d1 = 8 - my in every byte
+ ldr r12, [sp] @ h
+ vld1.8 {q1}, [r2], r1
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {q2}, [r2], r1
+ vmull.u8 q3, d2, d1
+ vmlal.u8 q3, d4, d0
+ vmull.u8 q8, d3, d1
+ vmlal.u8 q8, d5, d0
+ vld1.8 {q1}, [r2], r1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d2, d0
+ vmull.u8 q10, d5, d1
+ vmlal.u8 q10, d3, d0
+ vrshrn.u16 d4, q3, #3
+ vrshrn.u16 d5, q8, #3
+ vrshrn.u16 d6, q9, #3
+ vrshrn.u16 d7, q10, #3
+ vst1.8 {q2}, [r0,:128], r1
+ vst1.8 {q3}, [r0,:128], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin16_hv_neon: horizontal then vertical bilinear filter,
+@ 16 bytes per row, two output rows per loop iteration.
+@ Each source row is first filtered horizontally with weights (8 - mx, mx)
+@ and rounded to bytes; consecutive filtered rows are then blended with
+@ weights (8 - my, my) and rounded again.
+@ r0: store pointer (16-byte aligned stores), r2: load pointer.
+@ r3 is overwritten, so both pointers are stepped by r1.
+@ h, mx and my are read from the stack. q2 carries the most recent
+@ horizontally filtered row into the next iteration.
+function ff_put_vp8_bilin16_hv_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = mx
+ vdup.8 d1, r12 @ d1 = 8 - mx
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d2, r3 @ d2 = my
+ vdup.8 d3, r12 @ d3 = 8 - my
+ ldr r12, [sp] @ h
+
+ @ horizontally filter the first row ahead of the loop
+ vld1.8 {d4-d6}, [r2], r1
+ vext.8 q3, q2, q3, #1
+ vmull.u8 q8, d4, d1
+ vmlal.u8 q8, d6, d0
+ vmull.u8 q9, d5, d1
+ vmlal.u8 q9, d7, d0
+ vrshrn.u16 d4, q8, #3
+ vrshrn.u16 d5, q9, #3
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {d18-d20},[r2], r1
+ vext.8 q10, q9, q10, #1
+ vmull.u8 q11, d18, d1
+ vmlal.u8 q11, d20, d0
+ vld1.8 {d26-d28},[r2], r1
+ vmull.u8 q12, d19, d1
+ vmlal.u8 q12, d21, d0
+ vext.8 q14, q13, q14, #1
+ vmull.u8 q8, d26, d1
+ vmlal.u8 q8, d28, d0
+ vmull.u8 q9, d27, d1
+ vmlal.u8 q9, d29, d0
+ vrshrn.u16 d6, q11, #3 @ q3 = first new filtered row
+ vrshrn.u16 d7, q12, #3
+ vmull.u8 q12, d4, d3 @ blend previous row (q2) with q3
+ vmlal.u8 q12, d6, d2
+ vmull.u8 q15, d5, d3
+ vmlal.u8 q15, d7, d2
+ vrshrn.u16 d4, q8, #3 @ q2 = second new filtered row
+ vrshrn.u16 d5, q9, #3
+ vmull.u8 q10, d6, d3 @ blend q3 with the new q2
+ vmlal.u8 q10, d4, d2
+ vmull.u8 q11, d7, d3
+ vmlal.u8 q11, d5, d2
+ vrshrn.u16 d24, q12, #3
+ vrshrn.u16 d25, q15, #3
+ vst1.8 {q12}, [r0,:128], r1
+ vrshrn.u16 d20, q10, #3
+ vrshrn.u16 d21, q11, #3
+ vst1.8 {q10}, [r0,:128], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin8_h_neon: horizontal bilinear filter, 8 bytes per row,
+@ two rows per loop iteration.
+@ out[x] = (src[x] * (8 - mx) + src[x+1] * mx + 4) >> 3
+@ r0: store pointer (8-byte aligned stores), r2: load pointer.
+@ r3 is overwritten with mx, so both pointers are stepped by r1.
+@ h and mx are read from the stack.
+function ff_put_vp8_bilin8_h_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = mx in every byte
+ vdup.8 d1, r12 @ d1 = 8 - mx in every byte
+ ldr r12, [sp] @ h
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {q1}, [r2], r1 @ reads 16 source bytes per row
+ vext.8 d3, d2, d3, #1 @ d3 = source bytes 1..8
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vld1.8 {q3}, [r2], r1
+ vext.8 d7, d6, d7, #1
+ vmull.u8 q8, d6, d1
+ vmlal.u8 q8, d7, d0
+ vrshrn.u16 d4, q2, #3
+ vrshrn.u16 d16, q8, #3
+ vst1.8 {d4}, [r0,:64], r1
+ vst1.8 {d16}, [r0,:64], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin8_v_neon: vertical bilinear filter, 8 bytes per row,
+@ two rows per loop iteration.
+@ out = (row[y] * (8 - my) + row[y+1] * my + 4) >> 3
+@ r0: store pointer (8-byte aligned stores), r2: load pointer.
+@ r3 is overwritten with my, so both pointers are stepped by r1.
+@ h and my are read from the stack. d2 carries the last loaded row into the
+@ next iteration.
+function ff_put_vp8_bilin8_v_neon, export=1
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = my in every byte
+ vdup.8 d1, r12 @ d1 = 8 - my in every byte
+ ldr r12, [sp] @ h
+ vld1.8 {d2}, [r2], r1
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {d3}, [r2], r1
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vld1.8 {d2}, [r2], r1
+ vmull.u8 q3, d3, d1
+ vmlal.u8 q3, d2, d0
+ vrshrn.u16 d4, q2, #3
+ vrshrn.u16 d6, q3, #3
+ vst1.8 {d4}, [r0,:64], r1
+ vst1.8 {d6}, [r0,:64], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin8_hv_neon: horizontal then vertical bilinear filter,
+@ 8 bytes per row, two output rows per loop iteration.
+@ Each source row is first filtered horizontally with weights (8 - mx, mx)
+@ and rounded to bytes; consecutive filtered rows are then blended with
+@ weights (8 - my, my) and rounded again.
+@ r0: store pointer (8-byte aligned stores), r2: load pointer.
+@ r3 is overwritten, so both pointers are stepped by r1.
+@ h, mx and my are read from the stack. d22 carries the most recent
+@ horizontally filtered row into the next iteration.
+function ff_put_vp8_bilin8_hv_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = mx
+ vdup.8 d1, r12 @ d1 = 8 - mx
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d2, r3 @ d2 = my
+ vdup.8 d3, r12 @ d3 = 8 - my
+ ldr r12, [sp] @ h
+
+ @ horizontally filter the first row ahead of the loop
+ vld1.8 {q2}, [r2], r1
+ vext.8 d5, d4, d5, #1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d5, d0
+ vrshrn.u16 d22, q9, #3
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {q3}, [r2], r1
+ vext.8 d7, d6, d7, #1
+ vmull.u8 q8, d6, d1
+ vmlal.u8 q8, d7, d0
+ vld1.8 {q2}, [r2], r1
+ vext.8 d5, d4, d5, #1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d5, d0
+ vrshrn.u16 d16, q8, #3 @ d16 = first new filtered row
+ vmull.u8 q10, d22, d3 @ blend previous row (d22) with d16
+ vmlal.u8 q10, d16, d2
+ vrshrn.u16 d22, q9, #3 @ d22 = second new filtered row
+ vmull.u8 q12, d16, d3 @ blend d16 with the new d22
+ vmlal.u8 q12, d22, d2
+ vrshrn.u16 d20, q10, #3
+ vst1.8 {d20}, [r0,:64], r1
+ vrshrn.u16 d23, q12, #3
+ vst1.8 {d23}, [r0,:64], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin4_h_neon: horizontal bilinear filter, 4 bytes per row,
+@ two rows per loop iteration, both packed into one d register.
+@ out[x] = (src[x] * (8 - mx) + src[x+1] * mx + 4) >> 3
+@ r0: store pointer (4-byte aligned stores), r2: load pointer.
+@ r3 is overwritten with mx, so both pointers are stepped by r1.
+@ h and mx are read from the stack.
+function ff_put_vp8_bilin4_h_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = mx in every byte
+ vdup.8 d1, r12 @ d1 = 8 - mx in every byte
+ ldr r12, [sp] @ h
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {d2}, [r2], r1
+ vext.8 d3, d2, d3, #1 @ d3 is not loaded; only its low 4 result bytes (from d2) are used
+ vld1.8 {d6}, [r2], r1
+ vext.8 d7, d6, d7, #1 @ same for d7
+ vtrn.32 q1, q3 @ d2 = low words of both rows, d3 = low words of both shifted rows
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vrshrn.u16 d4, q2, #3
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin4_v_neon: vertical bilinear filter, 4 bytes per row,
+@ two rows per loop iteration, both packed into one d register.
+@ out = (row[y] * (8 - my) + row[y+1] * my + 4) >> 3
+@ r0: store pointer (4-byte aligned stores), r2: load pointer.
+@ r3 is overwritten with my, so both pointers are stepped by r1.
+@ h and my are read from the stack. Lane 0 of d2 carries the last loaded
+@ row into the next iteration.
+function ff_put_vp8_bilin4_v_neon, export=1
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = my in every byte
+ vdup.8 d1, r12 @ d1 = 8 - my in every byte
+ ldr r12, [sp] @ h
+ vld1.32 {d2[]}, [r2], r1
+1:
+ vld1.32 {d3[]}, [r2] @ no writeback: the same row is loaded again below
+ vld1.32 {d2[1]}, [r2], r1 @ d2 = rows y, y+1
+ vld1.32 {d3[1]}, [r2], r1 @ d3 = rows y+1, y+2
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vtrn.32 d3, d2 @ moves row y+2 into d2 lane 0 for the next iteration
+ vrshrn.u16 d4, q2, #3
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ subs r12, r12, #2
+ bgt 1b
+
+ bx lr
+endfunc
+
+@ ff_put_vp8_bilin4_hv_neon: horizontal then vertical bilinear filter,
+@ 4 bytes per row, two output rows per loop iteration.
+@ Two source rows are packed into one d register, filtered horizontally
+@ with weights (8 - mx, mx) and rounded to bytes, then blended with the
+@ preceding filtered row using weights (8 - my, my) and rounded again.
+@ r0: store pointer (4-byte aligned stores), r2: load pointer.
+@ r3 is overwritten, so both pointers are stepped by r1.
+@ h, mx and my are read from the stack. Lane 0 of d22 carries the most
+@ recent horizontally filtered row into the next iteration.
+function ff_put_vp8_bilin4_hv_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3 @ d0 = mx
+ vdup.8 d1, r12 @ d1 = 8 - mx
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d2, r3 @ d2 = my
+ vdup.8 d3, r12 @ d3 = 8 - my
+ ldr r12, [sp] @ h
+
+ @ horizontally filter the first row ahead of the loop
+ vld1.8 {d4}, [r2], r1
+ vext.8 d5, d4, d4, #1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d5, d0
+ vrshrn.u16 d22, q9, #3
+1:
+ subs r12, r12, #2 @ sets the flags tested by the bgt at the loop end
+ vld1.8 {d6}, [r2], r1
+ vext.8 d7, d6, d6, #1
+ vld1.8 {d4}, [r2], r1
+ vext.8 d5, d4, d4, #1
+ vtrn.32 q3, q2 @ d6 = low words of both rows, d7 = low words of both shifted rows
+ vmull.u8 q8, d6, d1
+ vmlal.u8 q8, d7, d0
+ vrshrn.u16 d16, q8, #3 @ d16 = filtered rows y+1, y+2
+ vmull.u8 q10, d16, d2
+ vtrn.32 d22, d16 @ d22 = filtered rows y, y+1; row y+2 stays in d16 lane 1
+ vmlal.u8 q10, d22, d3
+ vrev64.32 d22, d16 @ move row y+2 into d22 lane 0 for the next iteration
+ vrshrn.u16 d20, q10, #3
+ vst1.32 {d20[0]}, [r0,:32], r1
+ vst1.32 {d20[1]}, [r0,:32], r1
+ bgt 1b
+
+ bx lr
+endfunc