diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-06-19 20:52:00 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-06-19 20:53:27 +0200 |
commit | cabbd271a5f37042291c06b9f8bd6c641fbddfde (patch) | |
tree | 110238d357631f95c4849d0d99d978a61b2a1ee7 /libavutil | |
parent | 6b9446e93296ed236d497fe3f493d8956571f888 (diff) | |
parent | 4cc2920dd2c0ce4e64e709da4f78508e1ec9871e (diff) | |
download | ffmpeg-cabbd271a5f37042291c06b9f8bd6c641fbddfde.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master: (24 commits)
flvdec: remove incomplete, disabled seeking code
mem: add support for _aligned_malloc() as found on Windows
lavc: Extend the documentation for avcodec_init_packet
flvdec: remove incomplete, disabled seeking code
http: replace atoll() with strtoll()
mpegts: remove unused/incomplete/broken seeking code
af_amix: allow float planar sample format as input
af_amix: use AVFloatDSPContext.vector_fmac_scalar()
float_dsp: add x86-optimized functions for vector_fmac_scalar()
float_dsp: Move vector_fmac_scalar() from libavcodec to libavutil
lavr: Add x86-optimized function for flt to s32 conversion
lavr: Add x86-optimized function for flt to s16 conversion
lavr: Add x86-optimized functions for s32 to flt conversion
lavr: Add x86-optimized functions for s32 to s16 conversion
lavr: Add x86-optimized functions for s16 to flt conversion
lavr: Add x86-optimized function for s16 to s32 conversion
rtpenc: Support packetizing iLBC
rtpdec: Add a depacketizer for iLBC
Implement the iLBC storage file format
mov: Support muxing/demuxing iLBC
...
Conflicts:
Changelog
configure
libavcodec/avcodec.h
libavcodec/dsputil.c
libavcodec/version.h
libavformat/movenc.c
libavformat/mpegts.c
libavformat/version.h
libavutil/mem.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavutil')
-rw-r--r-- | libavutil/arm/float_dsp_init_neon.c | 4 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_neon.S | 48 | ||||
-rw-r--r-- | libavutil/float_dsp.c | 9 | ||||
-rw-r--r-- | libavutil/float_dsp.h | 16 | ||||
-rw-r--r-- | libavutil/mem.c | 6 | ||||
-rw-r--r-- | libavutil/x86/float_dsp.asm | 47 | ||||
-rw-r--r-- | libavutil/x86/float_dsp_init.c | 7 |
7 files changed, 137 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c index fa6d0d7d15..3ca0288b31 100644 --- a/libavutil/arm/float_dsp_init_neon.c +++ b/libavutil/arm/float_dsp_init_neon.c @@ -26,7 +26,11 @@ void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul, + int len); + void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) { fdsp->vector_fmul = ff_vector_fmul_neon; + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon; } diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S index d66fa09424..03b164388f 100644 --- a/libavutil/arm/float_dsp_neon.S +++ b/libavutil/arm/float_dsp_neon.S @@ -62,3 +62,51 @@ function ff_vector_fmul_neon, export=1 3: vst1.32 {d16-d19},[r0,:128]! bx lr endfunc + +function ff_vector_fmac_scalar_neon, export=1 +VFP len .req r2 +VFP acc .req r3 +NOVFP len .req r3 +NOVFP acc .req r2 +VFP vdup.32 q15, d0[0] +NOVFP vdup.32 q15, r2 + bics r12, len, #15 + mov acc, r0 + beq 3f + vld1.32 {q0}, [r1,:128]! + vld1.32 {q8}, [acc,:128]! + vld1.32 {q1}, [r1,:128]! + vld1.32 {q9}, [acc,:128]! +1: vmla.f32 q8, q0, q15 + vld1.32 {q2}, [r1,:128]! + vld1.32 {q10}, [acc,:128]! + vmla.f32 q9, q1, q15 + vld1.32 {q3}, [r1,:128]! + vld1.32 {q11}, [acc,:128]! + vmla.f32 q10, q2, q15 + vst1.32 {q8}, [r0,:128]! + vmla.f32 q11, q3, q15 + vst1.32 {q9}, [r0,:128]! + subs r12, r12, #16 + beq 2f + vld1.32 {q0}, [r1,:128]! + vld1.32 {q8}, [acc,:128]! + vst1.32 {q10}, [r0,:128]! + vld1.32 {q1}, [r1,:128]! + vld1.32 {q9}, [acc,:128]! + vst1.32 {q11}, [r0,:128]! + b 1b +2: vst1.32 {q10}, [r0,:128]! + vst1.32 {q11}, [r0,:128]! + ands len, len, #15 + it eq + bxeq lr +3: vld1.32 {q0}, [r1,:128]! + vld1.32 {q8}, [acc,:128]! + vmla.f32 q8, q0, q15 + vst1.32 {q8}, [r0,:128]! + subs len, len, #4 + bgt 3b + bx lr + .unreq len +endfunc diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c index 87cfd88268..f5a8360c86 100644 --- a/libavutil/float_dsp.c +++ b/libavutil/float_dsp.c @@ -31,9 +31,18 @@ static void vector_fmul_c(float *dst, const float *src0, const float *src1, dst[i] = src0[i] * src1[i]; } +static void vector_fmac_scalar_c(float *dst, const float *src, float mul, + int len) +{ + int i; + for (i = 0; i < len; i++) + dst[i] += src[i] * mul; +} + void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact) { fdsp->vector_fmul = vector_fmul_c; + fdsp->vector_fmac_scalar = vector_fmac_scalar_c; #if ARCH_ARM ff_float_dsp_init_arm(fdsp); diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h index 02c4ab7bde..735eb34c36 100644 --- a/libavutil/float_dsp.h +++ b/libavutil/float_dsp.h @@ -35,6 +35,22 @@ typedef struct AVFloatDSPContext { */ void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len); + + /** + * Multiply a vector of floats by a scalar float and add to + * destination vector. Source and destination vectors must + * overlap exactly or not at all. + * + * @param dst result vector + * constraints: 32-byte aligned + * @param src input vector + * constraints: 32-byte aligned + * @param mul scalar value + * @param len length of vector + * constraints: multiple of 16 + */ + void (*vector_fmac_scalar)(float *dst, const float *src, float mul, + int len); } AVFloatDSPContext; /** diff --git a/libavutil/mem.c b/libavutil/mem.c index de22ad8db8..385ace0702 100644 --- a/libavutil/mem.c +++ b/libavutil/mem.c @@ -94,6 +94,8 @@ void *av_malloc(size_t size) if (size) //OS X on SDK 10.6 has a broken posix_memalign implementation if (posix_memalign(&ptr,ALIGN,size)) ptr = NULL; +#elif HAVE_ALIGNED_MALLOC + ptr = _aligned_malloc(size, ALIGN); #elif HAVE_MEMALIGN ptr = memalign(ALIGN,size); /* Why 64? @@ -145,6 +147,8 @@ void *av_realloc(void *ptr, size_t size) ptr= realloc((char*)ptr - diff, size + diff); if(ptr) ptr = (char*)ptr + diff; return ptr; +#elif HAVE_ALIGNED_MALLOC + return _aligned_realloc(ptr, size + !size, ALIGN); #else return realloc(ptr, size + !size); #endif @@ -170,6 +174,8 @@ void av_free(void *ptr) #if CONFIG_MEMALIGN_HACK if (ptr) free((char*)ptr - ((char*)ptr)[-1]); +#elif HAVE_ALIGNED_MALLOC + _aligned_free(ptr); #else free(ptr); #endif diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 6ed716c026..f68e0bfe2d 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -21,6 +21,7 @@ ;****************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION .text @@ -55,3 +56,49 @@ VECTOR_FMUL INIT_YMM avx VECTOR_FMUL %endif + +;------------------------------------------------------------------------------ +; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_FMAC_SCALAR 0 +%if UNIX64 +cglobal vector_fmac_scalar, 3,3,3, dst, src, len +%else +cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len +%endif +%if WIN64 + SWAP 0, 2 +%endif +%if ARCH_X86_32 + VBROADCASTSS m0, mulm +%else + shufps xmm0, xmm0, 0 +%if cpuflag(avx) + vinsertf128 m0, m0, xmm0, 1 +%endif +%endif + lea lenq, [lend*4-2*mmsize] +.loop + mulps m1, m0, [srcq+lenq ] + mulps m2, m0, [srcq+lenq+mmsize] + addps m1, m1, [dstq+lenq ] + addps m2, m2, [dstq+lenq+mmsize] + mova [dstq+lenq ], m1 + mova [dstq+lenq+mmsize], m2 + sub lenq, 2*mmsize + jge .loop +%if mmsize == 32 + vzeroupper + RET +%else + REP_RET +%endif +%endmacro + +INIT_XMM sse +VECTOR_FMAC_SCALAR +%if HAVE_AVX +INIT_YMM avx +VECTOR_FMAC_SCALAR +%endif diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index 8f6980cbc2..3e05b9d4ca 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -26,6 +26,11 @@ extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1, extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1, int len); +extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul, + int len); +extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul, + int len); + void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) { #if HAVE_YASM @@ -33,9 +38,11 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { fdsp->vector_fmul = ff_vector_fmul_sse; + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse; } if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { fdsp->vector_fmul = ff_vector_fmul_avx; + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; } #endif } |