diff options
author | Kieran Kunhya <kieran@kunhya.com> | 2011-10-18 19:50:49 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-10-19 20:26:55 +0200 |
commit | 44d27736fcd3c53ea102847368e609b96e6eda86 (patch) | |
tree | 4bb59978d56c8feb6f24ac1b878ccad198b36fe3 /libavcodec | |
parent | b1766c170c8fe3dfbc829625e8b162985f633389 (diff) | |
download | ffmpeg-44d27736fcd3c53ea102847368e609b96e6eda86.tar.gz |
Add V210 SIMD
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/v210dec.c | 89 |
-rw-r--r-- | libavcodec/v210dec.h | 34 |
-rw-r--r-- | libavcodec/x86/Makefile | 2 |
-rw-r--r-- | libavcodec/x86/v210-init.c | 48 |
-rw-r--r-- | libavcodec/x86/v210.asm | 85 |
5 files changed, 241 insertions, 17 deletions
diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ecd88be22b..4f40e0872f 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -22,10 +22,35 @@ */ #include "avcodec.h" +#include "v210dec.h" #include "libavutil/bswap.h" +#include "libavutil/x86/timer.h" + +#define READ_PIXELS(a, b, c) \ + do { \ + val = av_le2ne32(*src++); \ + *a++ = val & 0x3FF; \ + *b++ = (val >> 10) & 0x3FF; \ + *c++ = (val >> 20) & 0x3FF; \ + } while (0) + +static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) +{ + uint32_t val; + int i; + + for( i = 0; i < width-5; i += 6 ){ + READ_PIXELS(u, y, v); + READ_PIXELS(y, u, y); + READ_PIXELS(v, y, u); + READ_PIXELS(y, v, y); + } +} static av_cold int decode_init(AVCodecContext *avctx) { + V210DecContext *s = avctx->priv_data; + if (avctx->width & 1) { av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n"); return -1; @@ -35,18 +60,37 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->coded_frame = avcodec_alloc_frame(); + s->unpack_frame = v210_planar_unpack_c; + + if (HAVE_MMX) + v210_x86_init(s); + return 0; } static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { - int h, w; + V210DecContext *s = avctx->priv_data; + + int h, w, stride, aligned_input; AVFrame *pic = avctx->coded_frame; const uint8_t *psrc = avpkt->data; uint16_t *y, *u, *v; - int aligned_width = ((avctx->width + 47) / 48) * 48; - int stride = aligned_width * 8 / 3; + + if (s->custom_stride ) + stride = s->custom_stride; + else { + int aligned_width = ((avctx->width + 47) / 48) * 48; + stride = aligned_width * 8 / 3; + } + + aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf); + if (aligned_input != s->aligned_input) { + s->aligned_input = aligned_input; + if (HAVE_MMX) + v210_x86_init(s); + } if (pic->data[0]) avctx->release_buffer(avctx, pic); @@ -66,23 +110,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, 
pic->pict_type = AV_PICTURE_TYPE_I; pic->key_frame = 1; -#define READ_PIXELS(a, b, c) \ - do { \ - val = av_le2ne32(*src++); \ - *a++ = val & 0x3FF; \ - *b++ = (val >> 10) & 0x3FF; \ - *c++ = (val >> 20) & 0x3FF; \ - } while (0) - for (h = 0; h < avctx->height; h++) { const uint32_t *src = (const uint32_t*)psrc; uint32_t val; - for (w = 0; w < avctx->width - 5; w += 6) { - READ_PIXELS(u, y, v); - READ_PIXELS(y, u, y); - READ_PIXELS(v, y, u); - READ_PIXELS(y, v, y); - } + + w = (avctx->width / 6) * 6; + s->unpack_frame(src, y, u, v, w); + + y += w; + u += w >> 1; + v += w >> 1; + src += (w << 1) / 3; + if (w < avctx->width - 1) { READ_PIXELS(u, y, v); @@ -120,13 +159,29 @@ static av_cold int decode_close(AVCodecContext *avctx) return 0; } +#define V210DEC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM +static const AVOption v210dec_options[] = { + {"custom_stride", "Custom V210 stride", offsetof(V210DecContext, custom_stride), FF_OPT_TYPE_INT, + {.dbl = 0}, INT_MIN, INT_MAX, V210DEC_FLAGS}, + {NULL} +}; + +static const AVClass v210dec_class = { + "V210 Decoder", + av_default_item_name, + v210dec_options, + LIBAVUTIL_VERSION_INT, +}; + AVCodec ff_v210_decoder = { .name = "v210", .type = AVMEDIA_TYPE_VIDEO, .id = CODEC_ID_V210, + .priv_data_size = sizeof(V210DecContext), .init = decode_init, .close = decode_close, .decode = decode_frame, .capabilities = CODEC_CAP_DR1, .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"), + .priv_class = &v210dec_class, }; diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h new file mode 100644 index 0000000000..48be729a5f --- /dev/null +++ b/libavcodec/v210dec.h @@ -0,0 +1,34 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_V210DEC_H +#define AVCODEC_V210DEC_H + +#include "libavutil/log.h" +#include "libavutil/opt.h" + +typedef struct { + AVClass *av_class; + int custom_stride; + int aligned_input; + void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +} V210DecContext; + +void v210_x86_init(V210DecContext *s); + +#endif /* AVCODEC_V210DEC_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 3ae63ececb..f4783bc4d2 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -39,6 +39,8 @@ MMX-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp-init.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o +YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o +MMX-OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp.o diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c new file mode 100644 index 0000000000..4dd6d6de8a --- /dev/null +++ b/libavcodec/x86/v210-init.c @@ -0,0 +1,48 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavcodec/v210dec.h" + +extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + +extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + +av_cold void v210_x86_init(V210DecContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_YASM + if (s->aligned_input) { + if (cpu_flags & AV_CPU_FLAG_SSSE3) + s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3; + + if (cpu_flags & AV_CPU_FLAG_AVX) + s->unpack_frame = ff_v210_planar_unpack_aligned_avx; + } + else { + if (cpu_flags & AV_CPU_FLAG_SSSE3) + s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3; + + if (cpu_flags & AV_CPU_FLAG_AVX) + s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; + } +#endif +} diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm new file mode 100644 index 0000000000..344bed0beb --- /dev/null +++ b/libavcodec/x86/v210.asm @@ -0,0 +1,85 @@ +;****************************************************************************** +;* V210 SIMD unpack +;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu> +;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com> +;* +;* This file is part of Libav. 
+;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +v210_mask: times 4 dd 0x3ff +v210_mult: dw 64,4,64,4,64,4,64,4 +v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 +v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 + +SECTION .text + +%macro v210_planar_unpack 2 + +; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) +cglobal v210_planar_unpack_%1_%2, 5, 5 + movsxdifnidn r4, r4d + lea r1, [r1+2*r4] + add r2, r4 + add r3, r4 + neg r4 + + mova m3, [v210_mult] + mova m4, [v210_mask] + mova m5, [v210_luma_shuf] + mova m6, [v210_chroma_shuf] +.loop +%ifidn %1, unaligned + movu m0, [r0] +%else + mova m0, [r0] +%endif + + pmullw m1, m0, m3 + psrld m0, 10 + psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 + pand m0, m4 ; y0 __ u1 __ y3 __ v2 __ + + shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __ + pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __ + movu [r1+2*r4], m2 + + shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __ + pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __ + movq [r2+r4], m1 + movhps [r3+r4], m1 + + add r0, mmsize + add r4, 6 + jl .loop + + REP_RET +%endmacro + +INIT_XMM +v210_planar_unpack 
unaligned, ssse3 +INIT_AVX +v210_planar_unpack unaligned, avx + +INIT_XMM +v210_planar_unpack aligned, ssse3 +INIT_AVX +v210_planar_unpack aligned, avx |