diff options
author | Justin Ruggles <justin.ruggles@gmail.com> | 2011-10-30 01:13:55 -0400 |
---|---|---|
committer | Justin Ruggles <justin.ruggles@gmail.com> | 2011-11-11 14:13:58 -0500 |
commit | 9d06037d48041ad8ccbae6c12aa9f3a313a89c4e (patch) | |
tree | 9521e25c2c61b0397b86f99d026b624251ee0d14 | |
parent | 7b966566da24598a636a433a75a7842e272b18f6 (diff) | |
download | ffmpeg-9d06037d48041ad8ccbae6c12aa9f3a313a89c4e.tar.gz |
twinvq: add SSE/AVX optimized sum/difference stereo interleaving
-rw-r--r-- | libavcodec/dsputil.c | 13 | ||||
-rw-r--r-- | libavcodec/dsputil.h | 17 | ||||
-rw-r--r-- | libavcodec/twinvq.c | 34 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 7 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_yasm.asm | 48 |
5 files changed, 101 insertions, 18 deletions
```diff
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 182063ca2b..91238578b6 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2,
     }
 }
 
+static void butterflies_float_interleave_c(float *dst, const float *src0,
+                                           const float *src1, int len)
+{
+    int i;
+    for (i = 0; i < len; i++) {
+        float f1 = src0[i];
+        float f2 = src1[i];
+        dst[2*i    ] = f1 + f2;
+        dst[2*i + 1] = f1 - f2;
+    }
+}
+
 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
 {
     float p = 0.0;
@@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_clip_int32 = vector_clip_int32_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
+    c->butterflies_float_interleave = butterflies_float_interleave_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
     c->vector_fmac_scalar = vector_fmac_scalar_c;
 
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index acb2041460..98b7b1eeaa 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -453,6 +453,23 @@ typedef struct DSPContext {
      */
     void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
 
+    /**
+     * Calculate the sum and difference of two vectors of floats and interleave
+     * results into a separate output vector of floats, with each sum
+     * positioned before the corresponding difference.
+     *
+     * @param dst  output vector
+     *             constraints: 16-byte aligned
+     * @param src0 first input vector
+     *             constraints: 32-byte aligned
+     * @param src1 second input vector
+     *             constraints: 32-byte aligned
+     * @param len  number of elements in the input
+     *             constraints: multiple of 8
+     */
+    void (*butterflies_float_interleave)(float *dst, const float *src0,
+                                         const float *src1, int len);
+
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
     void (*fdct248)(DCTELEM *block/* align 16*/);
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index 73eb7c1499..a2851562ee 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
                          float *out)
 {
     const ModeTab *mtab = tctx->mtab;
+    int size1, size2;
     float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
-    int i, j;
+    int i;
 
     for (i = 0; i < tctx->avctx->channels; i++) {
         imdct_and_window(tctx, ftype, wtype,
@@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
                          i);
     }
 
+    size2 = tctx->last_block_pos[0];
+    size1 = mtab->size - size2;
     if (tctx->avctx->channels == 2) {
-        for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
-            float f1 = prev_buf[               i];
-            float f2 = prev_buf[2*mtab->size + i];
-            out[2*i    ] = f1 + f2;
-            out[2*i + 1] = f1 - f2;
-        }
-        for (j = 0; i < mtab->size; j++,i++) {
-            float f1 = tctx->curr_frame[               j];
-            float f2 = tctx->curr_frame[2*mtab->size + j];
-            out[2*i    ] = f1 + f2;
-            out[2*i + 1] = f1 - f2;
-        }
+        tctx->dsp.butterflies_float_interleave(out, prev_buf,
+                                               &prev_buf[2*mtab->size],
+                                               size1);
+
+        out += 2 * size1;
+
+        tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
+                                               &tctx->curr_frame[2*mtab->size],
+                                               size2);
     } else {
-        memcpy(out, prev_buf,
-               (mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
+        memcpy(out, prev_buf, size1 * sizeof(*out));
 
-        out += mtab->size - tctx->last_block_pos[0];
+        out += size1;
 
-        memcpy(out, tctx->curr_frame,
-               (tctx->last_block_pos[0]) * sizeof(*out));
+        memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
     }
 
 }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index dd6cbf51ad..f0de05a763 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min
 void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
                                    int32_t max, unsigned int len);
 
+extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
+                                                const float *src1, int len);
+extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
+                                                const float *src1, int len);
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
@@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->vector_clipf = vector_clipf_sse;
 #if HAVE_YASM
             c->scalarproduct_float = ff_scalarproduct_float_sse;
+            c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
 #endif
         }
         if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
@@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
                 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
             }
+            c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
         }
 #endif
     }
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 8e3cbdc03d..f2894cd501 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif
+
+;-----------------------------------------------------------------------------
+; void ff_butterflies_float_interleave(float *dst, const float *src0,
+;                                      const float *src1, int len);
+;-----------------------------------------------------------------------------
+
+%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
+cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
+%ifdef ARCH_X86_64
+    movsxd    lenq, lend
+%endif
+    test      lenq, lenq
+    jz .end
+    shl       lenq, 2
+    lea      src0q, [src0q +   lenq]
+    lea      src1q, [src1q +   lenq]
+    lea       dstq, [ dstq + 2*lenq]
+    neg       lenq
+.loop:
+    mova        m0, [src0q + lenq]
+    mova        m1, [src1q + lenq]
+    subps       m2, m0, m1
+    addps       m0, m0, m1
+    unpcklps    m1, m0, m2
+    unpckhps    m0, m0, m2
+%if cpuflag(avx)
+    vextractf128 [dstq + 2*lenq     ], m1, 0
+    vextractf128 [dstq + 2*lenq + 16], m0, 0
+    vextractf128 [dstq + 2*lenq + 32], m1, 1
+    vextractf128 [dstq + 2*lenq + 48], m0, 1
+%else
+    mova [dstq + 2*lenq         ], m1
+    mova [dstq + 2*lenq + mmsize], m0
+%endif
+    add       lenq, mmsize
+    jl .loop
+%if mmsize == 32
+    vzeroupper
+    RET
+%endif
+.end:
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+BUTTERFLIES_FLOAT_INTERLEAVE
+INIT_YMM avx
+BUTTERFLIES_FLOAT_INTERLEAVE
```