aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec
diff options
context:
space:
mode:
author: Kieran Kunhya <kieran@kunhya.com>, 2011-10-18 19:50:49 +0100
committer: Michael Niedermayer <michaelni@gmx.at>, 2011-10-19 20:26:55 +0200
commit: 44d27736fcd3c53ea102847368e609b96e6eda86 (patch)
tree: 4bb59978d56c8feb6f24ac1b878ccad198b36fe3 /libavcodec
parent: b1766c170c8fe3dfbc829625e8b162985f633389 (diff)
download: ffmpeg-44d27736fcd3c53ea102847368e609b96e6eda86.tar.gz
Add V210 SIMD
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- libavcodec/v210dec.c 89
-rw-r--r-- libavcodec/v210dec.h 34
-rw-r--r-- libavcodec/x86/Makefile 2
-rw-r--r-- libavcodec/x86/v210-init.c 48
-rw-r--r-- libavcodec/x86/v210.asm 85
5 files changed, 241 insertions, 17 deletions
diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ecd88be22b..4f40e0872f 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -22,10 +22,35 @@
*/
#include "avcodec.h"
+#include "v210dec.h"
#include "libavutil/bswap.h"
+#include "libavutil/x86/timer.h"
+
+#define READ_PIXELS(a, b, c) /* split one packed 32-bit word into three 10-bit samples; uses the caller's locals `src` and `val` */ \
+ do { \
+ val = av_le2ne32(*src++); /* fetch the next word and advance src */ \
+ *a++ = val & 0x3FF; /* bits 0-9 go to *a */ \
+ *b++ = (val >> 10) & 0x3FF; /* bits 10-19 go to *b */ \
+ *c++ = (val >> 20) & 0x3FF; /* bits 20-29 go to *c */ \
+ } while (0)
+
+/**
+ * Portable C unpacker: split packed v210 words into planar 10-bit samples.
+ *
+ * Works on whole groups of six luma samples. One group is four 32-bit words
+ * laid out as (u y v) (y u y) (v y u) (y v y) and produces six y, three u
+ * and three v samples. A trailing partial group is not touched here; the
+ * caller in decode_frame passes a width that is a multiple of six and
+ * handles the remainder itself.
+ *
+ * @param src   packed input words
+ * @param y     luma output
+ * @param u     first chroma output
+ * @param v     second chroma output
+ * @param width number of luma samples to unpack
+ */
+static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+{
+    uint32_t val; /* scratch word written by READ_PIXELS */
+    int remaining;
+
+    /* Keep going while at least one complete six-sample group is left. */
+    for (remaining = width; remaining > 5; remaining -= 6) {
+        READ_PIXELS(u, y, v);
+        READ_PIXELS(y, u, y);
+        READ_PIXELS(v, y, u);
+        READ_PIXELS(y, v, y);
+    }
+}
static av_cold int decode_init(AVCodecContext *avctx)
{
+ V210DecContext *s = avctx->priv_data;
+
if (avctx->width & 1) {
av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
return -1;
@@ -35,18 +60,37 @@ static av_cold int decode_init(AVCodecContext *avctx)
avctx->coded_frame = avcodec_alloc_frame();
+ s->unpack_frame = v210_planar_unpack_c;
+
+ if (HAVE_MMX)
+ v210_x86_init(s);
+
return 0;
}
static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
AVPacket *avpkt)
{
- int h, w;
+ V210DecContext *s = avctx->priv_data;
+
+ int h, w, stride, aligned_input;
AVFrame *pic = avctx->coded_frame;
const uint8_t *psrc = avpkt->data;
uint16_t *y, *u, *v;
- int aligned_width = ((avctx->width + 47) / 48) * 48;
- int stride = aligned_width * 8 / 3;
+
+ if (s->custom_stride )
+ stride = s->custom_stride;
+ else {
+ int aligned_width = ((avctx->width + 47) / 48) * 48;
+ stride = aligned_width * 8 / 3;
+ }
+
+ aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
+ if (aligned_input != s->aligned_input) {
+ s->aligned_input = aligned_input;
+ if (HAVE_MMX)
+ v210_x86_init(s);
+ }
if (pic->data[0])
avctx->release_buffer(avctx, pic);
@@ -66,23 +110,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
pic->pict_type = AV_PICTURE_TYPE_I;
pic->key_frame = 1;
-#define READ_PIXELS(a, b, c) \
- do { \
- val = av_le2ne32(*src++); \
- *a++ = val & 0x3FF; \
- *b++ = (val >> 10) & 0x3FF; \
- *c++ = (val >> 20) & 0x3FF; \
- } while (0)
-
for (h = 0; h < avctx->height; h++) {
const uint32_t *src = (const uint32_t*)psrc;
uint32_t val;
- for (w = 0; w < avctx->width - 5; w += 6) {
- READ_PIXELS(u, y, v);
- READ_PIXELS(y, u, y);
- READ_PIXELS(v, y, u);
- READ_PIXELS(y, v, y);
- }
+
+ w = (avctx->width / 6) * 6;
+ s->unpack_frame(src, y, u, v, w);
+
+ y += w;
+ u += w >> 1;
+ v += w >> 1;
+ src += (w << 1) / 3;
+
if (w < avctx->width - 1) {
READ_PIXELS(u, y, v);
@@ -120,13 +159,29 @@ static av_cold int decode_close(AVCodecContext *avctx)
return 0;
}
+/* Flags for the decoder's private options. These are decoding parameters (the
+ * codec only has a .decode callback), and the expansion is parenthesised so
+ * it stays a single operand wherever the macro is used. */
+#define V210DEC_FLAGS (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption v210dec_options[] = {
+    /* custom_stride: when non-zero, decode_frame uses it instead of the
+     * stride it would otherwise derive from the picture width. */
+    {"custom_stride", "Custom V210 stride", offsetof(V210DecContext, custom_stride), FF_OPT_TYPE_INT,
+     {.dbl = 0}, INT_MIN, INT_MAX, V210DEC_FLAGS},
+    {NULL}
+};
+
+/* AVClass that publishes v210dec_options; referenced from the codec's
+ * .priv_class field. */
+static const AVClass v210dec_class = {
+    .class_name = "V210 Decoder",
+    .item_name  = av_default_item_name,
+    .option     = v210dec_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
AVCodec ff_v210_decoder = {
.name = "v210",
.type = AVMEDIA_TYPE_VIDEO,
.id = CODEC_ID_V210,
+ .priv_data_size = sizeof(V210DecContext), /* size of the private state that decode_init/decode_frame read via avctx->priv_data */
.init = decode_init,
.close = decode_close,
.decode = decode_frame,
.capabilities = CODEC_CAP_DR1,
.long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+ .priv_class = &v210dec_class, /* AVClass carrying v210dec_options (the custom_stride option) */
};
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
new file mode 100644
index 0000000000..48be729a5f
--- /dev/null
+++ b/libavcodec/v210dec.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V210DEC_H
+#define AVCODEC_V210DEC_H
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+
+typedef struct { /* private state of the v210 decoder, reached through avctx->priv_data */
+ AVClass *av_class; /* NOTE(review): presumably must stay the first member for the AVOption system used via .priv_class - confirm */
+ int custom_stride; /* "custom_stride" option; when non-zero decode_frame uses it instead of the width-derived stride */
+ int aligned_input; /* last alignment result from decode_frame (source pointer and stride both multiples of 16); selects aligned vs unaligned kernels */
+ void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); /* row unpacker: C version set in decode_init, may be replaced by v210_x86_init */
+} V210DecContext;
+
+void v210_x86_init(V210DecContext *s);
+
+#endif /* AVCODEC_V210DEC_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 3ae63ececb..f4783bc4d2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -39,6 +39,8 @@ MMX-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp-init.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o
MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o
+YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
+MMX-OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp.o
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000000..4dd6d6de8a
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); /* x86/v210.asm kernel, src loaded with movu */
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); /* same kernel, avx instantiation */
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); /* x86/v210.asm kernel, src loaded with mova */
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); /* same kernel, avx instantiation */
+
+/**
+ * Pick an assembly row unpacker for the current CPU.
+ *
+ * Reads s->aligned_input to choose between the aligned and unaligned kernel
+ * families and overwrites s->unpack_frame when SSSE3 or AVX is reported.
+ * With neither flag set, or without HAVE_YASM, s->unpack_frame is left
+ * untouched.
+ *
+ * @param s decoder context whose unpack_frame pointer may be replaced
+ */
+av_cold void v210_x86_init(V210DecContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+    const int aligned = s->aligned_input;
+
+    if (cpu_flags & AV_CPU_FLAG_SSSE3)
+        s->unpack_frame = aligned ? ff_v210_planar_unpack_aligned_ssse3
+                                  : ff_v210_planar_unpack_unaligned_ssse3;
+
+    /* Checked last so the AVX kernel wins when both flags are present. */
+    if (cpu_flags & AV_CPU_FLAG_AVX)
+        s->unpack_frame = aligned ? ff_v210_planar_unpack_aligned_avx
+                                  : ff_v210_planar_unpack_unaligned_avx;
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..344bed0beb
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,85 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff ; per-dword 10-bit mask, applied after the psrld by 10 to keep the middle component
+v210_mult: dw 64,4,64,4,64,4,64,4 ; pmullw factors; with the following psrlw 6 each word keeps the first / third 10-bit component of its dword
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 ; pshufb control ordering luma as y0..y5; -1 entries are the `__` padding lanes
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 ; pshufb control giving u0 u1 u2 __ in the low half and v0 v1 v2 __ in the high half
+
+SECTION .text
+
+%macro v210_planar_unpack 2
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+;
+; %1 = aligned | unaligned (selects mova or movu for the src load)
+; %2 = suffix appended to the generated symbol name
+; One iteration reads 16 bytes of src and emits 6 luma, 3 u and 3 v samples.
+; The body uses m0-m6, so 7 vector registers are declared to cglobal; this
+; lets x86inc preserve callee-saved ones where the ABI has them (xmm6 on Win64).
+cglobal v210_planar_unpack_%1_%2, 5, 5, 7
+    movsxdifnidn r4, r4d
+    ; Move y/u/v to the end of their output runs (2*width bytes of luma,
+    ; width bytes per chroma plane) and let r4 count up from -width to 0.
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop:
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    ; NOTE(review): full-register store; only the first six words are luma,
+    ; so the y buffer needs room for the two padding lanes - confirm.
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+    ; Each 8-byte store carries three chroma samples plus one padding lane.
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM ; NOTE(review): x86inc register/ISA setup for the following instantiation - confirm against x86inc.asm
+v210_planar_unpack unaligned, ssse3 ; generates v210_planar_unpack_unaligned_ssse3 (movu src load)
+INIT_AVX
+v210_planar_unpack unaligned, avx ; generates v210_planar_unpack_unaligned_avx (movu src load)
+
+INIT_XMM
+v210_planar_unpack aligned, ssse3 ; generates v210_planar_unpack_aligned_ssse3 (mova src load)
+INIT_AVX
+v210_planar_unpack aligned, avx ; generates v210_planar_unpack_aligned_avx (mova src load)