about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Janne Grunau <janne-libav@jannau.net> 2011-12-05 21:18:05 +0000
committer: Mans Rullgard <mans@mansr.com> 2011-12-06 13:48:25 +0000
commit: f5c05b9aa5aeb6079b76f9da452f8ee4050e8955 (patch)
tree: 8e93cc8ae1cca551af6e6bce06f522464176976a
parent: f054a82727728e813861851648e109cd24574178 (diff)
download: ffmpeg-f5c05b9aa5aeb6079b76f9da452f8ee4050e8955.tar.gz
rv40: NEON optimised chroma MC
Signed-off-by: Mans Rullgard <mans@mansr.com>
-rw-r--r-- libavcodec/arm/Makefile 2
-rw-r--r-- libavcodec/arm/h264cmc_neon.S 80
-rw-r--r-- libavcodec/arm/rv40dsp_init_neon.c 38
-rw-r--r-- libavcodec/rv34dsp.h 1
-rw-r--r-- libavcodec/rv40dsp.c 2
5 files changed, 118 insertions, 5 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index c125a59078..a948e6db3f 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -68,6 +68,8 @@ NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_neon.o \
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_neon.o \
arm/rv34dsp_neon.o \
+ arm/rv40dsp_init_neon.o \
+ arm/h264cmc_neon.o \
NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index e10adaca10..a6feadd189 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -21,8 +21,8 @@
#include "asm.S"
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-.macro h264_chroma_mc8 type
-function ff_\type\()_h264_chroma_mc8_neon, export=1
+.macro h264_chroma_mc8 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
push {r4-r7, lr}
ldrd r4, [sp, #20]
.ifc \type,avg
@@ -31,6 +31,15 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
pld [r1]
pld [r1, r2]
+ .ifc \codec,rv40
+ movrel r6, rv40bias
+ lsr r7, r5, #1
+ add r6, r6, r7, lsl #3
+ lsr r7, r4, #1
+ add r6, r6, r7, lsl #1
+ vld1.16 {d22[],d23[]}, [r6,:16]
+ .endif
+
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
@@ -67,10 +76,17 @@ T cmp r7, #0
vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3
- vrshrn.u16 d16, q8, #6
vld1.8 {d6, d7}, [r5], r4
pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vadd.u16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
@@ -102,8 +118,15 @@ T cmp r7, #0
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1
vld1.8 {d6}, [r5], r4
+ .ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vadd.u16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
@@ -131,8 +154,15 @@ T cmp r7, #0
vmlal.u8 q9, d7, d1
pld [r1]
vext.8 d5, d4, d5, #1
+ .ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vadd.u16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
@@ -149,8 +179,8 @@ endfunc
.endm
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-.macro h264_chroma_mc4 type
-function ff_\type\()_h264_chroma_mc4_neon, export=1
+.macro h264_chroma_mc4 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
push {r4-r7, lr}
ldrd r4, [sp, #20]
.ifc \type,avg
@@ -159,6 +189,15 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
pld [r1]
pld [r1, r2]
+ .ifc \codec,rv40
+ movrel r6, rv40bias
+ lsr r7, r5, #1
+ add r6, r6, r7, lsl #3
+ lsr r7, r4, #1
+ add r6, r6, r7, lsl #1
+ vld1.16 {d22[],d23[]}, [r6,:16]
+ .endif
+
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
@@ -199,7 +238,12 @@ T cmp r7, #0
vld1.8 {d6}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
+ .ifc \codec,h264
vrshrn.u16 d16, q8, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
+ .endif
subs r3, r3, #2
pld [r1]
.ifc \type,avg
@@ -236,7 +280,12 @@ T cmp r7, #0
vld1.32 {d4[1]}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
+ .ifc \codec,h264
vrshrn.u16 d16, q8, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
+ .endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
@@ -266,7 +315,12 @@ T cmp r7, #0
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r1]
+ .ifc \codec,h264
vrshrn.u16 d16, q8, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
+ .endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
@@ -352,9 +406,25 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
endfunc
.endm
+#if CONFIG_H264_DECODER /* h264 functions; the mc8/mc4 macros are invoked without a codec argument and so use their default codec=h264 */
h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg
+#endif /* CONFIG_H264_DECODER */
+
+#if CONFIG_RV40_DECODER /* rv40 bias table and the rv40 instantiations of the chroma MC macros */
+const rv40bias
+ .short 0, 16, 32, 16 /* 16-bit entries, 4 per row (8 bytes); the rv40 macro paths load entry [y >> 1][x >> 1] */
+ .short 32, 28, 32, 28 /* the loaded entry is replicated across q11 and added to the sums before the non-rounding >> 6 */
+ .short 0, 32, 16, 32
+ .short 32, 28, 32, 28
+endconst
+/* rv40 variants are instantiated for mc8 and mc4 only; no rv40 mc2 is generated here */
+ h264_chroma_mc8 put, rv40
+ h264_chroma_mc8 avg, rv40
+ h264_chroma_mc4 put, rv40
+ h264_chroma_mc4 avg, rv40
+#endif /* CONFIG_RV40_DECODER */
diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c
new file mode 100644
index 0000000000..aa4a88da1a
--- /dev/null
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); /* args: dst, src, stride, h, x, y (per the macro comments in h264cmc_neon.S) */
+void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+/* avg variants: same argument list as the put variants above */
+void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+/* Store the NEON RV40 chroma MC routines into the put/avg chroma tables of c; dsp is not referenced in this body. */
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
+{
+ c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon; /* slot 0: mc8 variant */
+ c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon; /* slot 1: mc4 variant */
+ c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; /* avg table uses the same slot order */
+ c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
+}
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index a1636e6eb5..695af06970 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -59,5 +59,6 @@ void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp);
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
#endif /* AVCODEC_RV34DSP_H */
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index f193b6050d..06bdf18c42 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -534,4 +534,6 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) {
if (HAVE_MMX)
ff_rv40dsp_init_x86(c, dsp);
+ if (HAVE_NEON)
+ ff_rv40dsp_init_neon(c, dsp);
}