diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2014-02-03 23:17:04 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-02-06 02:51:19 +0100 |
commit | 9c978f243a47a0906fb32d723fcdd37d7b8cee93 (patch) | |
tree | b19acd945ac1a94299f1929c26d75db08b9257d3 | |
parent | 4a37e2977cb24713fd36d04ec97e97adc2aaba87 (diff) | |
download | ffmpeg-9c978f243a47a0906fb32d723fcdd37d7b8cee93.tar.gz |
flac/x86: add ff_flac_lpc_32_sse4()
Benchmarked on Sandy Bridge x86_64:
1358232 decicycles in flac_lpc_32_c
1244575 decicycles in flac_lpc_32_sse4, James Almer's patch
650045 decicycles in flac_lpc_32_sse4, this patch
I haven't tested the edge cases, such as odd block lengths.
odd block length tested-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/flacdsp.c | 2 | ||||
-rw-r--r-- | libavcodec/flacdsp.h | 1 | ||||
-rw-r--r-- | libavcodec/x86/Makefile | 2 | ||||
-rw-r--r-- | libavcodec/x86/flacdsp.asm | 71 | ||||
-rw-r--r-- | libavcodec/x86/flacdsp_init.c | 39 |
5 files changed, 115 insertions, 0 deletions
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c index 02eba3ea8a..b15bc7476b 100644 --- a/libavcodec/flacdsp.c +++ b/libavcodec/flacdsp.c @@ -128,4 +128,6 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, if (ARCH_ARM) ff_flacdsp_init_arm(c, fmt, bps); + if (ARCH_X86) + ff_flacdsp_init_x86(c, fmt, bps); } diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h index 5e66dc2f10..272cf2a674 100644 --- a/libavcodec/flacdsp.h +++ b/libavcodec/flacdsp.h @@ -33,5 +33,6 @@ typedef struct FLACDSPContext { void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int bps); void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int bps); +void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int bps); #endif /* AVCODEC_FLACDSP_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index dddaae1f72..6d5d0081df 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -12,6 +12,7 @@ OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \ x86/fdct.o \ x86/motion_est.o OBJS-$(CONFIG_FFT) += x86/fft_init.o +OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp_init.o OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o @@ -70,6 +71,7 @@ YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \ x86/qpel.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o YASM-OBJS-$(CONFIG_FFT) += x86/fft.o +YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ x86/h264_chromamc_10bit.o diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm new file mode 100644 index 0000000000..e28f905c6f --- /dev/null +++ b/libavcodec/x86/flacdsp.asm @@ -0,0 +1,71 @@ +;****************************************************************************** +;* FLAC DSP SIMD optimizations +;* +;* Copyright (C) 2014 Loren Merritt +;* +;* This file is 
part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +INIT_XMM sse4 +cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j + sub lend, pred_orderd + jle .ret + lea decodedq, [decodedq+pred_orderq*4-8] + lea coeffsq, [coeffsq+pred_orderq*4] + neg pred_orderq + movd m4, qlevelm +ALIGN 16 +.loop_sample: + movd m0, [decodedq+pred_orderq*4+8] + add decodedq, 8 + movd m1, [coeffsq+pred_orderq*4] + pxor m2, m2 + pxor m3, m3 + lea jq, [pred_orderq+1] + test jq, jq + jz .end_order +.loop_order: + pmuldq m0, m1 + paddq m2, m0 + movd m0, [decodedq+jq*4] + pmuldq m1, m0 + paddq m3, m1 + movd m1, [coeffsq+jq*4] + inc jq + jl .loop_order +.end_order: + pmuldq m0, m1 + paddq m2, m0 + psrlq m2, m4 + movd m0, [decodedq] + paddd m0, m2 + movd [decodedq], m0 + sub lend, 2 + jl .ret + pmuldq m1, m0 + paddq m3, m1 + psrlq m3, m4 + movd m1, [decodedq+4] + paddd m1, m3 + movd [decodedq+4], m1 + jg .loop_sample +.ret: + REP_RET diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c new file mode 100644 index 0000000000..d30a41e9d4 --- /dev/null +++ b/libavcodec/x86/flacdsp_init.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 
James Almer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/flacdsp.h" +#include "libavutil/x86/cpu.h" +#include "config.h" + +void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + +av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, + int bps) +{ +#if HAVE_YASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE4(cpu_flags)) { + if (bps > 16) + c->lpc = ff_flac_lpc_32_sse4; + } +#endif +} |