diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-01-20 13:20:30 -0800 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2013-01-22 11:55:42 -0800 |
commit | 42d324694883cdf1fff1612ac70fa403692a1ad4 (patch) | |
tree | 99b6a8629388614ec2395c847b79a2c448583341 | |
parent | 55aa03b9f8f11ebb7535424cc0e5635558590f49 (diff) | |
download | ffmpeg-42d324694883cdf1fff1612ac70fa403692a1ad4.tar.gz |
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp.
Now, nellymoserenc and aacenc no longer depends on dsputil. Independent
of this patch, wmaprodec also does not depend on dsputil, so I removed
it from there also.
32 files changed, 231 insertions, 310 deletions
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c index d59dea4f0e..0c4e356d8f 100644 --- a/libavcodec/aacdec.c +++ b/libavcodec/aacdec.c @@ -2067,9 +2067,9 @@ static void windowing_and_mdct_ltp(AACContext *ac, float *out, ac->fdsp.vector_fmul(in + 448, in + 448, swindow_prev, 128); } if (ics->window_sequence[0] != LONG_START_SEQUENCE) { - ac->dsp.vector_fmul_reverse(in + 1024, in + 1024, lwindow, 1024); + ac->fdsp.vector_fmul_reverse(in + 1024, in + 1024, lwindow, 1024); } else { - ac->dsp.vector_fmul_reverse(in + 1024 + 448, in + 1024 + 448, swindow, 128); + ac->fdsp.vector_fmul_reverse(in + 1024 + 448, in + 1024 + 448, swindow, 128); memset(in + 1024 + 576, 0, 448 * sizeof(float)); } ac->mdct_ltp.mdct_calc(&ac->mdct_ltp, out, in); @@ -2122,17 +2122,17 @@ static void update_ltp(AACContext *ac, SingleChannelElement *sce) if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { memcpy(saved_ltp, saved, 512 * sizeof(float)); memset(saved_ltp + 576, 0, 448 * sizeof(float)); - ac->dsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64); + ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64); for (i = 0; i < 64; i++) saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i]; } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) { memcpy(saved_ltp, ac->buf_mdct + 512, 448 * sizeof(float)); memset(saved_ltp + 576, 0, 448 * sizeof(float)); - ac->dsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64); + ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64); for (i = 0; i < 64; i++) saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i]; } else { // LONG_STOP or ONLY_LONG - ac->dsp.vector_fmul_reverse(saved_ltp, ac->buf_mdct + 512, &lwindow[512], 512); + ac->fdsp.vector_fmul_reverse(saved_ltp, ac->buf_mdct + 512, &lwindow[512], 512); for (i = 0; i < 512; i++) saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * lwindow[511 - i]; } diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c index 6f582ca8aa..00a6d0397d 100644 --- a/libavcodec/aacenc.c +++ b/libavcodec/aacenc.c @@ -183,7 +183,7 @@ static void put_audio_specific_config(AVCodecContext *avctx) } #define WINDOW_FUNC(type) \ -static void apply_ ##type ##_window(DSPContext *dsp, AVFloatDSPContext *fdsp, \ +static void apply_ ##type ##_window(AVFloatDSPContext *fdsp, \ SingleChannelElement *sce, \ const float *audio) @@ -193,8 +193,8 @@ WINDOW_FUNC(only_long) const float *pwindow = sce->ics.use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024; float *out = sce->ret_buf; - fdsp->vector_fmul (out, audio, lwindow, 1024); - dsp->vector_fmul_reverse(out + 1024, audio + 1024, pwindow, 1024); + fdsp->vector_fmul (out, audio, lwindow, 1024); + fdsp->vector_fmul_reverse(out + 1024, audio + 1024, pwindow, 1024); } WINDOW_FUNC(long_start) @@ -205,7 +205,7 @@ WINDOW_FUNC(long_start) fdsp->vector_fmul(out, audio, lwindow, 1024); memcpy(out + 1024, audio + 1024, sizeof(out[0]) * 448); - dsp->vector_fmul_reverse(out + 1024 + 448, audio + 1024 + 448, swindow, 128); + fdsp->vector_fmul_reverse(out + 1024 + 448, audio + 1024 + 448, swindow, 128); memset(out + 1024 + 576, 0, sizeof(out[0]) * 448); } @@ -218,7 +218,7 @@ WINDOW_FUNC(long_stop) memset(out, 0, sizeof(out[0]) * 448); fdsp->vector_fmul(out + 448, audio + 448, swindow, 128); memcpy(out + 576, audio + 576, sizeof(out[0]) * 448); - dsp->vector_fmul_reverse(out + 1024, audio + 1024, lwindow, 1024); + fdsp->vector_fmul_reverse(out + 1024, audio + 1024, lwindow, 1024); } WINDOW_FUNC(eight_short) @@ -230,15 +230,15 @@ WINDOW_FUNC(eight_short) int w; for (w = 0; w < 8; w++) { - fdsp->vector_fmul (out, in, w ? pwindow : swindow, 128); + fdsp->vector_fmul (out, in, w ? pwindow : swindow, 128); out += 128; in += 128; - dsp->vector_fmul_reverse(out, in, swindow, 128); + fdsp->vector_fmul_reverse(out, in, swindow, 128); out += 128; } } -static void (*const apply_window[4])(DSPContext *dsp, AVFloatDSPContext *fdsp, +static void (*const apply_window[4])(AVFloatDSPContext *fdsp, SingleChannelElement *sce, const float *audio) = { [ONLY_LONG_SEQUENCE] = apply_only_long_window, @@ -253,7 +253,7 @@ static void apply_window_and_mdct(AACEncContext *s, SingleChannelElement *sce, int i; float *output = sce->ret_buf; - apply_window[sce->ics.window_sequence[0]](&s->dsp, &s->fdsp, sce, audio); + apply_window[sce->ics.window_sequence[0]](&s->fdsp, sce, audio); if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) s->mdct1024.mdct_calc(&s->mdct1024, sce->coeffs, output); @@ -694,7 +694,6 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s) { int ret = 0; - ff_dsputil_init(&s->dsp, avctx); avpriv_float_dsp_init(&s->fdsp, avctx->flags & CODEC_FLAG_BITEXACT); // window init diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h index 81ffb973d6..cc2eb7717d 100644 --- a/libavcodec/aacenc.h +++ b/libavcodec/aacenc.h @@ -58,7 +58,6 @@ typedef struct AACEncContext { PutBitContext pb; FFTContext mdct1024; ///< long (1024 samples) frame transform context FFTContext mdct128; ///< short (128 samples) frame transform context - DSPContext dsp; AVFloatDSPContext fdsp; float *planar_samples[6]; ///< saved preprocessed input diff --git a/libavcodec/aacsbr.c b/libavcodec/aacsbr.c index 0b96abb95b..8b290340d4 100644 --- a/libavcodec/aacsbr.c +++ b/libavcodec/aacsbr.c @@ -1150,7 +1150,7 @@ static void sbr_dequant(SpectralBandReplication *sbr, int id_aac) * @param x pointer to the beginning of the first sample window * @param W array of complex-valued samples split into subbands */ -static void sbr_qmf_analysis(DSPContext *dsp, FFTContext *mdct, +static void sbr_qmf_analysis(AVFloatDSPContext *dsp, FFTContext *mdct, SBRDSPContext *sbrdsp, const float *in, float *x, float z[320], float W[2][32][32][2], int buf_idx) { @@ -1663,7 +1663,7 @@ void ff_sbr_apply(AACContext *ac, SpectralBandReplication *sbr, int id_aac, } for (ch = 0; ch < nch; ch++) { /* decode channel */ - sbr_qmf_analysis(&ac->dsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples, + sbr_qmf_analysis(&ac->fdsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples, (float*)sbr->qmf_filter_scratch, sbr->data[ch].W, sbr->data[ch].Ypos); sbr_lf_gen(ac, sbr, sbr->X_low, sbr->data[ch].W, sbr->data[ch].Ypos); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 71048f9c4c..5ebda62cf6 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -56,9 +56,6 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \ VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o -VFP-OBJS += arm/dsputil_vfp.o \ - arm/dsputil_init_vfp.o \ - NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \ arm/fft_fixed_neon.o \ diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c index 0c1563df93..9feda140ff 100644 --- a/libavcodec/arm/dsputil_init_arm.c +++ b/libavcodec/arm/dsputil_init_arm.c @@ -121,6 +121,5 @@ void ff_dsputil_init_arm(DSPContext* c, AVCodecContext *avctx) if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx); if (have_armv6(cpu_flags)) ff_dsputil_init_armv6(c, avctx); - if (have_vfp(cpu_flags)) ff_dsputil_init_vfp(c, avctx); if (have_neon(cpu_flags)) ff_dsputil_init_neon(c, avctx); } diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 0d23b26826..bb0c4af69a 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -144,8 +144,6 @@ void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); void ff_butterflies_float_neon(float *v1, float *v2, int len); float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); -void ff_vector_fmul_reverse_neon(float *dst, const float *src0, - const float *src1, int len); void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, int len); @@ -298,7 +296,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) c->butterflies_float = ff_butterflies_float_neon; c->scalarproduct_float = ff_scalarproduct_float_neon; - c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; c->vector_clipf = ff_vector_clipf_neon; c->vector_clip_int32 = ff_vector_clip_int32_neon; diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c deleted file mode 100644 index d77d686578..0000000000 --- a/libavcodec/arm/dsputil_init_vfp.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, - const float *src1, int len); - -void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) -{ - c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; -} diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index 5e512a7a21..4ceecbcb4e 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -556,30 +556,6 @@ NOVFP vmov.32 r0, d0[0] bx lr endfunc -function ff_vector_fmul_reverse_neon, export=1 - add r2, r2, r3, lsl #2 - sub r2, r2, #32 - mov r12, #-32 - vld1.32 {q0-q1}, [r1,:128]! - vld1.32 {q2-q3}, [r2,:128], r12 -1: pld [r1, #32] - vrev64.32 q3, q3 - vmul.f32 d16, d0, d7 - vmul.f32 d17, d1, d6 - pld [r2, #-32] - vrev64.32 q2, q2 - vmul.f32 d18, d2, d5 - vmul.f32 d19, d3, d4 - subs r3, r3, #8 - beq 2f - vld1.32 {q0-q1}, [r1,:128]! - vld1.32 {q2-q3}, [r2,:128], r12 - vst1.32 {q8-q9}, [r0,:128]! - b 1b -2: vst1.32 {q8-q9}, [r0,:128]! - bx lr -endfunc - function ff_vector_clipf_neon, export=1 VFP vdup.32 q1, d0[1] VFP vdup.32 q0, d0[0] diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S deleted file mode 100644 index 9df955dbf9..0000000000 --- a/libavcodec/arm/dsputil_vfp.S +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/arm/asm.S" - -/* - * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle - * throughput for almost all the instructions (except for double precision - * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles - * for arithmetic operations. Scheduling code to avoid pipeline stalls is very - * important for performance. One more interesting feature is that VFP has - * independent load/store and arithmetics pipelines, so it is possible to make - * them work simultaneously and get more than 1 operation per cycle. Load/store - * pipeline can process 2 single precision floating point values per cycle and - * supports bulk loads and stores for large sets of registers. Arithmetic operations - * can be done on vectors, which allows to keep the arithmetics pipeline busy, - * while the processor may issue and execute other instructions. Detailed - * optimization manuals can be found at http://www.arm.com - */ - -/** - * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. - * Assume that len is a positive number and is multiple of 8 - */ -@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, -@ const float *src1, int len) -function ff_vector_fmul_reverse_vfp, export=1 - vpush {d8-d15} - add r2, r2, r3, lsl #2 - vldmdb r2!, {s0-s3} - vldmia r1!, {s8-s11} - vldmdb r2!, {s4-s7} - vldmia r1!, {s12-s15} - vmul.f32 s8, s3, s8 - vmul.f32 s9, s2, s9 - vmul.f32 s10, s1, s10 - vmul.f32 s11, s0, s11 -1: - subs r3, r3, #16 - it ge - vldmdbge r2!, {s16-s19} - vmul.f32 s12, s7, s12 - it ge - vldmiage r1!, {s24-s27} - vmul.f32 s13, s6, s13 - it ge - vldmdbge r2!, {s20-s23} - vmul.f32 s14, s5, s14 - it ge - vldmiage r1!, {s28-s31} - vmul.f32 s15, s4, s15 - it ge - vmulge.f32 s24, s19, s24 - it gt - vldmdbgt r2!, {s0-s3} - it ge - vmulge.f32 s25, s18, s25 - vstmia r0!, {s8-s13} - it ge - vmulge.f32 s26, s17, s26 - it gt - vldmiagt r1!, {s8-s11} - itt ge - vmulge.f32 s27, s16, s27 - vmulge.f32 s28, s23, s28 - it gt - vldmdbgt r2!, {s4-s7} - it ge - vmulge.f32 s29, s22, s29 - vstmia r0!, {s14-s15} - ittt ge - vmulge.f32 s30, s21, s30 - vmulge.f32 s31, s20, s31 - vmulge.f32 s8, s3, s8 - it gt - vldmiagt r1!, {s12-s15} - itttt ge - vmulge.f32 s9, s2, s9 - vmulge.f32 s10, s1, s10 - vstmiage r0!, {s24-s27} - vmulge.f32 s11, s0, s11 - it ge - vstmiage r0!, {s28-s31} - bgt 1b - - vpop {d8-d15} - bx lr -endfunc diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 4cbb48b0cd..0590e0c525 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -2353,13 +2353,6 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) WRAPPER8_16_SQ(rd8x8_c, rd16_c) WRAPPER8_16_SQ(bit8x8_c, bit16_c) -static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){ - int i; - src1 += len-1; - for(i=0; i<len; i++) - dst[i] = src0[i] * src1[-i]; -} - static void butterflies_float_c(float *restrict v1, float *restrict v2, int len) { @@ -2707,7 +2700,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx) c->try_8x8basis= try_8x8basis_c; c->add_8x8basis= add_8x8basis_c; - c->vector_fmul_reverse = vector_fmul_reverse_c; c->vector_clipf = vector_clipf_c; c->scalarproduct_int16 = scalarproduct_int16_c; c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index b49ea1e81d..c197041071 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -340,8 +340,6 @@ typedef struct DSPContext { void (*h261_loop_filter)(uint8_t *src, int stride); - /* assume len is a multiple of 16, and arrays are 32-byte aligned */ - void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); /* assume len is a multiple of 8, and arrays are 16-byte aligned */ void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); /** diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c index 122282623c..34a15e5a50 100644 --- a/libavcodec/nellymoserenc.c +++ b/libavcodec/nellymoserenc.c @@ -55,7 +55,6 @@ typedef struct NellyMoserEncodeContext { AVCodecContext *avctx; int last_frame; - DSPContext dsp; AVFloatDSPContext fdsp; FFTContext mdct_ctx; AudioFrameQueue afq; @@ -122,12 +121,12 @@ static void apply_mdct(NellyMoserEncodeContext *s) float *in1 = s->buf + NELLY_BUF_LEN; float *in2 = s->buf + 2 * NELLY_BUF_LEN; - s->fdsp.vector_fmul (s->in_buff, in0, ff_sine_128, NELLY_BUF_LEN); - s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN); + s->fdsp.vector_fmul (s->in_buff, in0, ff_sine_128, NELLY_BUF_LEN); + s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN); s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff); - s->fdsp.vector_fmul (s->in_buff, in1, ff_sine_128, NELLY_BUF_LEN); - s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN); + s->fdsp.vector_fmul (s->in_buff, in1, ff_sine_128, NELLY_BUF_LEN); + s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN); s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->in_buff); } @@ -173,7 +172,6 @@ static av_cold int encode_init(AVCodecContext *avctx) s->avctx = avctx; if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0) goto error; - ff_dsputil_init(&s->dsp, avctx); avpriv_float_dsp_init(&s->fdsp, avctx->flags & CODEC_FLAG_BITEXACT); /* Generate overlap window */ diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index e1ebf26d0f..cdc6435be1 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -15,7 +15,6 @@ ALTIVEC-OBJS-$(CONFIG_VP8_DECODER) += ppc/vp8dsp_altivec.o ALTIVEC-OBJS += ppc/dsputil_altivec.o \ ppc/fdct_altivec.o \ - ppc/float_altivec.o \ ppc/fmtconvert_altivec.o \ ppc/gmc_altivec.o \ ppc/idct_altivec.o \ diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index a3069402da..24dbb8c10e 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -160,7 +160,6 @@ void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { ff_dsputil_init_altivec(c, avctx); - ff_float_init_altivec(c, avctx); ff_int_init_altivec(c, avctx); c->gmc1 = ff_gmc1_altivec; diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c deleted file mode 100644 index e336cbed5f..0000000000 --- a/libavcodec/ppc/float_altivec.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/dsputil.h" - -#include "dsputil_altivec.h" - -static void vector_fmul_reverse_altivec(float *dst, const float *src0, - const float *src1, int len) -{ - int i; - vector float d, s0, s1, h0, l0, - s2, s3, zero = (vector float)vec_splat_u32(0); - src1 += len-4; - for(i=0; i<len-7; i+=8) { - s1 = vec_ld(0, src1-i); // [a,b,c,d] - s0 = vec_ld(0, src0+i); - l0 = vec_mergel(s1, s1); // [c,c,d,d] - s3 = vec_ld(-16, src1-i); - h0 = vec_mergeh(s1, s1); // [a,a,b,b] - s2 = vec_ld(16, src0+i); - s1 = vec_mergeh(vec_mergel(l0,h0), // [d,b,d,b] - vec_mergeh(l0,h0)); // [c,a,c,a] - // [d,c,b,a] - l0 = vec_mergel(s3, s3); - d = vec_madd(s0, s1, zero); - h0 = vec_mergeh(s3, s3); - vec_st(d, 0, dst+i); - s3 = vec_mergeh(vec_mergel(l0,h0), - vec_mergeh(l0,h0)); - d = vec_madd(s2, s3, zero); - vec_st(d, 16, dst+i); - } -} - -void ff_float_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ - c->vector_fmul_reverse = vector_fmul_reverse_altivec; -} diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c index 3da09c59b1..7d5ca1b606 100644 --- a/libavcodec/wmadec.c +++ b/libavcodec/wmadec.c @@ -401,7 +401,7 @@ static void wma_window(WMACodecContext *s, float *out) block_len = s->block_len; bsize = s->frame_len_bits - s->block_len_bits; - s->dsp.vector_fmul_reverse(out, in, s->windows[bsize], block_len); + s->fdsp.vector_fmul_reverse(out, in, s->windows[bsize], block_len); } else { block_len = 1 << s->next_block_len_bits; @@ -410,7 +410,7 @@ static void wma_window(WMACodecContext *s, float *out) memcpy(out, in, n*sizeof(float)); - s->dsp.vector_fmul_reverse(out+n, in+n, s->windows[bsize], block_len); + s->fdsp.vector_fmul_reverse(out+n, in+n, s->windows[bsize], block_len); memset(out+n+block_len, 0, n*sizeof(float)); } diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c index 044114b516..bf8c2674b9 100644 --- a/libavcodec/wmaenc.c +++ b/libavcodec/wmaenc.c @@ -112,7 +112,7 @@ static void apply_window_and_mdct(AVCodecContext * avctx, const AVFrame *frame) for (ch = 0; ch < avctx->channels; ch++) { memcpy(s->output, s->frame_out[ch], window_len * sizeof(*s->output)); s->fdsp.vector_fmul_scalar(s->frame_out[ch], audio[ch], n, len); - s->dsp.vector_fmul_reverse(&s->output[window_len], s->frame_out[ch], win, len); + s->fdsp.vector_fmul_reverse(&s->output[window_len], s->frame_out[ch], win, len); s->fdsp.vector_fmul(s->frame_out[ch], s->frame_out[ch], win, len); mdct->mdct_calc(mdct, s->coefs[ch], s->output); } diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c index f04b43fd61..b54484ea9d 100644 --- a/libavcodec/wmaprodec.c +++ b/libavcodec/wmaprodec.c @@ -170,7 +170,6 @@ typedef struct WMAProDecodeCtx { /* generic decoder variables */ AVCodecContext* avctx; ///< codec context for av_log AVFrame frame; ///< AVFrame for decoded output - DSPContext dsp; ///< accelerated DSP functions AVFloatDSPContext fdsp; uint8_t frame_data[MAX_FRAMESIZE + FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data @@ -281,7 +280,6 @@ static av_cold int decode_init(AVCodecContext *avctx) int num_possible_block_sizes; s->avctx = avctx; - ff_dsputil_init(&s->dsp, avctx); avpriv_float_dsp_init(&s->fdsp, avctx->flags & CODEC_FLAG_BITEXACT); init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE); diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 26b9dee5db..27e77d565d 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -567,43 +567,6 @@ VECTOR_CLIP_INT32 11, 1, 1, 0 VECTOR_CLIP_INT32 6, 1, 0, 0 %endif -;----------------------------------------------------------------------------- -; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, -; int len) -;----------------------------------------------------------------------------- -%macro VECTOR_FMUL_REVERSE 0 -cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len - lea lenq, [lend*4 - 2*mmsize] -ALIGN 16 -.loop: -%if cpuflag(avx) - vmovaps xmm0, [src1q + 16] - vinsertf128 m0, m0, [src1q], 1 - vshufps m0, m0, m0, q0123 - vmovaps xmm1, [src1q + mmsize + 16] - vinsertf128 m1, m1, [src1q + mmsize], 1 - vshufps m1, m1, m1, q0123 -%else - mova m0, [src1q] - mova m1, [src1q + mmsize] - shufps m0, m0, q0123 - shufps m1, m1, q0123 -%endif - mulps m0, m0, [src0q + lenq + mmsize] - mulps m1, m1, [src0q + lenq] - mova [dstq + lenq + mmsize], m0 - mova [dstq + lenq], m1 - add src1q, 2*mmsize - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse -VECTOR_FMUL_REVERSE -INIT_YMM avx -VECTOR_FMUL_REVERSE - ; %1 = aligned/unaligned %macro BSWAP_LOOPS 1 mov r3, r2 diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 4ed48d64a4..503764817a 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -1848,11 +1848,6 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); -void ff_vector_fmul_reverse_sse(float *dst, const float *src0, - const float *src1, int len); -void ff_vector_fmul_reverse_avx(float *dst, const float *src0, - const float *src1, int len); - void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len); void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, @@ -2135,8 +2130,6 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags) #endif /* HAVE_INLINE_ASM */ #if HAVE_YASM - c->vector_fmul_reverse = ff_vector_fmul_reverse_sse; - c->scalarproduct_float = ff_scalarproduct_float_sse; #endif /* HAVE_YASM */ } @@ -2288,7 +2281,6 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags) c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx; } } - c->vector_fmul_reverse = ff_vector_fmul_reverse_avx; #endif /* HAVE_AVX_EXTERNAL */ } diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c index 41e513fcdc..c6f02bd2c5 100644 --- a/libavutil/arm/float_dsp_init_neon.c +++ b/libavutil/arm/float_dsp_init_neon.c @@ -38,6 +38,9 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0, void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) { fdsp->vector_fmul = ff_vector_fmul_neon; @@ -45,4 +48,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon; fdsp->vector_fmul_window = ff_vector_fmul_window_neon; fdsp->vector_fmul_add = ff_vector_fmul_add_neon; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon; } diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index ef808d8a5b..7247d56762 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -25,10 +25,14 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp) { int cpu_flags = av_get_cpu_flags(); if (!have_vfpv3(cpu_flags)) fdsp->vector_fmul = ff_vector_fmul_vfp; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; } diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S index 100eb02455..d00e59de8f 100644 --- a/libavutil/arm/float_dsp_neon.S +++ b/libavutil/arm/float_dsp_neon.S @@ -220,3 +220,27 @@ function ff_vector_fmul_add_neon, export=1 2: vst1.32 {q12-q13},[r0,:128]! bx lr endfunc + +function ff_vector_fmul_reverse_neon, export=1 + add r2, r2, r3, lsl #2 + sub r2, r2, #32 + mov r12, #-32 + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 +1: pld [r1, #32] + vrev64.32 q3, q3 + vmul.f32 d16, d0, d7 + vmul.f32 d17, d1, d6 + pld [r2, #-32] + vrev64.32 q2, q2 + vmul.f32 d18, d2, d5 + vmul.f32 d19, d3, d4 + subs r3, r3, #8 + beq 2f + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 + vst1.32 {q8-q9}, [r0,:128]! + b 1b +2: vst1.32 {q8-q9}, [r0,:128]! + bx lr +endfunc diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index 3931828381..82952807de 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -66,3 +66,72 @@ function ff_vector_fmul_vfp, export=1 vpop {d8-d15} bx lr endfunc + +/** + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. + * Assume that len is a positive number and is multiple of 8 + */ +@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, +@ const float *src1, int len) +function ff_vector_fmul_reverse_vfp, export=1 + vpush {d8-d15} + add r2, r2, r3, lsl #2 + vldmdb r2!, {s0-s3} + vldmia r1!, {s8-s11} + vldmdb r2!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s3, s8 + vmul.f32 s9, s2, s9 + vmul.f32 s10, s1, s10 + vmul.f32 s11, s0, s11 +1: + subs r3, r3, #16 + it ge + vldmdbge r2!, {s16-s19} + vmul.f32 s12, s7, s12 + it ge + vldmiage r1!, {s24-s27} + vmul.f32 s13, s6, s13 + it ge + vldmdbge r2!, {s20-s23} + vmul.f32 s14, s5, s14 + it ge + vldmiage r1!, {s28-s31} + vmul.f32 s15, s4, s15 + it ge + vmulge.f32 s24, s19, s24 + it gt + vldmdbgt r2!, {s0-s3} + it ge + vmulge.f32 s25, s18, s25 + vstmia r0!, {s8-s13} + it ge + vmulge.f32 s26, s17, s26 + it gt + vldmiagt r1!, {s8-s11} + itt ge + vmulge.f32 s27, s16, s27 + vmulge.f32 s28, s23, s28 + it gt + vldmdbgt r2!, {s4-s7} + it ge + vmulge.f32 s29, s22, s29 + vstmia r0!, {s14-s15} + ittt ge + vmulge.f32 s30, s21, s30 + vmulge.f32 s31, s20, s31 + vmulge.f32 s8, s3, s8 + it gt + vldmiagt r1!, {s12-s15} + itttt ge + vmulge.f32 s9, s2, s9 + vmulge.f32 s10, s1, s10 + vstmiage r0!, {s24-s27} + vmulge.f32 s11, s0, s11 + it ge + vstmiage r0!, {s28-s31} + bgt 1b + + vpop {d8-d15} + bx lr +endfunc diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c index 456f83b656..aa5293710b 100644 --- a/libavutil/float_dsp.c +++ b/libavutil/float_dsp.c @@ -79,6 +79,16 @@ static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, dst[i] = src0[i] * src1[i] + src2[i]; } +static void vector_fmul_reverse_c(float *dst, const float *src0, + const float *src1, int len) +{ + int i; + + src1 += len-1; + for (i = 0; i < len; i++) + dst[i] = src0[i] * src1[-i]; +} + void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact) { fdsp->vector_fmul = vector_fmul_c; @@ -87,6 +97,7 @@ void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact) fdsp->vector_dmul_scalar = vector_dmul_scalar_c; fdsp->vector_fmul_window = vector_fmul_window_c; fdsp->vector_fmul_add = vector_fmul_add_c; + fdsp->vector_fmul_reverse = vector_fmul_reverse_c; #if ARCH_ARM ff_float_dsp_init_arm(fdsp); diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h index b45c4560ff..96fcdab1e6 100644 --- a/libavutil/float_dsp.h +++ b/libavutil/float_dsp.h @@ -118,6 +118,25 @@ typedef struct AVFloatDSPContext { */ void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len); + + /** + * Calculate the product of two vectors of floats, and store the result + * in a vector of floats. The second vector of floats is iterated over + * in reverse order. + * + * @param dst output vector + * constraints: 32-byte aligned + * @param src0 first input vector + * constraints: 32-byte aligned + * @param src1 second input vector + * constraints: 32-byte aligned + * @param src1 third input vector + * constraints: 32-byte aligned + * @param len number of elements in the input + * constraints: multiple of 16 + */ + void (*vector_fmul_reverse)(float *dst, const float *src0, + const float *src1, int len); } AVFloatDSPContext; /** diff --git a/libavutil/ppc/float_dsp_altivec.c b/libavutil/ppc/float_dsp_altivec.c index 87379e92f8..fee4e7c7db 100644 --- a/libavutil/ppc/float_dsp_altivec.c +++ b/libavutil/ppc/float_dsp_altivec.c @@ -93,3 +93,32 @@ void ff_vector_fmul_add_altivec(float *dst, const float *src0, vec_st(t0, 0, dst + i); } } + +void ff_vector_fmul_reverse_altivec(float *dst, const float *src0, + const float *src1, int len) +{ + int i; + vector float d, s0, s1, h0, l0, + s2, s3, zero = (vector float) vec_splat_u32(0); + + src1 += len-4; + for(i = 0; i < len - 7; i += 8) { + s1 = vec_ld(0, src1 - i); // [a,b,c,d] + s0 = vec_ld(0, src0 + i); + l0 = vec_mergel(s1, s1); // [c,c,d,d] + s3 = vec_ld(-16, src1 - i); + h0 = vec_mergeh(s1, s1); // [a,a,b,b] + s2 = vec_ld(16, src0 + i); + s1 = vec_mergeh(vec_mergel(l0, h0), // [d,b,d,b] + vec_mergeh(l0, h0)); // [c,a,c,a] + // [d,c,b,a] + l0 = vec_mergel(s3, s3); + d = vec_madd(s0, s1, zero); + h0 = vec_mergeh(s3, s3); + vec_st(d, 0, dst + i); + s3 = vec_mergeh(vec_mergel(l0, h0), + vec_mergeh(l0, h0)); + d = vec_madd(s2, s3, zero); + vec_st(d, 16, dst + i); + } +} diff --git a/libavutil/ppc/float_dsp_altivec.h b/libavutil/ppc/float_dsp_altivec.h index 38a9e15434..b484a630e8 100644 --- a/libavutil/ppc/float_dsp_altivec.h +++ b/libavutil/ppc/float_dsp_altivec.h @@ -32,4 +32,7 @@ extern void ff_vector_fmul_add_altivec(float *dst, const float *src0, const float *src1, const float *src2, int len); +extern void ff_vector_fmul_reverse_altivec(float *dst, const float *src0, + const float *src1, int len); + #endif /* AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H */ diff --git a/libavutil/ppc/float_dsp_init.c b/libavutil/ppc/float_dsp_init.c index f3d8a42f53..bd682f5a30 100644 --- a/libavutil/ppc/float_dsp_init.c +++ b/libavutil/ppc/float_dsp_init.c @@ -33,6 +33,7 @@ void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int bit_exact) fdsp->vector_fmul = ff_vector_fmul_altivec; fdsp->vector_fmul_add = ff_vector_fmul_add_altivec; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_altivec; if (!bit_exact) { fdsp->vector_fmul_window = ff_vector_fmul_window_altivec; diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 70fc1d0310..126f3495c4 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -190,3 +190,40 @@ INIT_XMM sse VECTOR_FMUL_ADD INIT_YMM avx VECTOR_FMUL_ADD + +;----------------------------------------------------------------------------- +; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, +; int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_REVERSE 0 +cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: +%if cpuflag(avx) + vmovaps xmm0, [src1q + 16] + vinsertf128 m0, m0, [src1q], 1 + vshufps m0, m0, m0, q0123 + vmovaps xmm1, [src1q + mmsize + 16] + vinsertf128 m1, m1, [src1q + mmsize], 1 + vshufps m1, m1, m1, q0123 +%else + mova m0, [src1q] + mova m1, [src1q + mmsize] + shufps m0, m0, q0123 + shufps m1, m1, q0123 +%endif + mulps m0, m0, [src0q + lenq + mmsize] + mulps m1, m1, [src0q + lenq] + mova [dstq + lenq + mmsize], m0 + mova [dstq + lenq], m1 + add src1q, 2*mmsize + sub lenq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_REVERSE +INIT_YMM avx +VECTOR_FMUL_REVERSE diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index 0a2ad5ba0e..9f63b4c057 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -46,6 +46,11 @@ void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1, void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_reverse_sse(float *dst, const float *src0, + const float *src1, int len); +void ff_vector_fmul_reverse_avx(float *dst, const float *src0, + const float *src1, int len); + #if HAVE_6REGS && HAVE_INLINE_ASM static void vector_fmul_window_3dnowext(float *dst, const float *src0, const float *src1, const float *win, @@ -129,6 +134,7 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse; fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse; fdsp->vector_fmul_add = ff_vector_fmul_add_sse; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse; } if (EXTERNAL_SSE2(mm_flags)) { fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; @@ -138,5 +144,6 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; fdsp->vector_fmul_add = ff_vector_fmul_add_avx; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } } |