diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2013-01-23 14:04:50 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2013-01-23 14:04:50 +0100 |
commit | 6e6e1708984e45881b9a5d4e26c3e7de852c54d5 (patch) | |
tree | 5e04d38f8e152faf98921843ca5e4530cbdc46a4 /libavutil/arm | |
parent | b1b870fbd7185bffbe27c5918001b40a8ff8b920 (diff) | |
parent | 42d324694883cdf1fff1612ac70fa403692a1ad4 (diff) | |
download | ffmpeg-6e6e1708984e45881b9a5d4e26c3e7de852c54d5.tar.gz |
Merge commit '42d324694883cdf1fff1612ac70fa403692a1ad4'
* commit '42d324694883cdf1fff1612ac70fa403692a1ad4':
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp.
Conflicts:
libavcodec/arm/dsputil_init_vfp.c
libavcodec/arm/dsputil_vfp.S
libavcodec/dsputil.c
libavcodec/ppc/float_altivec.c
libavcodec/x86/dsputil.asm
libavutil/x86/float_dsp.asm
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavutil/arm')
-rw-r--r-- | libavutil/arm/float_dsp_init_neon.c | 4 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_init_vfp.c | 4 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_neon.S | 24 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_vfp.S | 69 |
4 files changed, 101 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c index 41e513fcdc..c6f02bd2c5 100644 --- a/libavutil/arm/float_dsp_init_neon.c +++ b/libavutil/arm/float_dsp_init_neon.c @@ -38,6 +38,9 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0, void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) { fdsp->vector_fmul = ff_vector_fmul_neon; @@ -45,4 +48,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon; fdsp->vector_fmul_window = ff_vector_fmul_window_neon; fdsp->vector_fmul_add = ff_vector_fmul_add_neon; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon; } diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index 7abc3322cf..f7e2f54601 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -25,10 +25,14 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp) { int cpu_flags = av_get_cpu_flags(); if (!have_vfpv3(cpu_flags)) fdsp->vector_fmul = ff_vector_fmul_vfp; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; } diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S index 100eb02455..d00e59de8f 100644 --- a/libavutil/arm/float_dsp_neon.S +++ b/libavutil/arm/float_dsp_neon.S @@ -220,3 +220,27 @@ function ff_vector_fmul_add_neon, export=1 2: vst1.32 {q12-q13},[r0,:128]! bx lr endfunc + +function ff_vector_fmul_reverse_neon, export=1 + add r2, r2, r3, lsl #2 + sub r2, r2, #32 + mov r12, #-32 + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 +1: pld [r1, #32] + vrev64.32 q3, q3 + vmul.f32 d16, d0, d7 + vmul.f32 d17, d1, d6 + pld [r2, #-32] + vrev64.32 q2, q2 + vmul.f32 d18, d2, d5 + vmul.f32 d19, d3, d4 + subs r3, r3, #8 + beq 2f + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 + vst1.32 {q8-q9}, [r0,:128]! + b 1b +2: vst1.32 {q8-q9}, [r0,:128]! + bx lr +endfunc diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index db63e5a675..8695fbd981 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -66,3 +66,72 @@ function ff_vector_fmul_vfp, export=1 vpop {d8-d15} bx lr endfunc + +/** + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. + * Assume that len is a positive number and is multiple of 8 + */ +@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, +@ const float *src1, int len) +function ff_vector_fmul_reverse_vfp, export=1 + vpush {d8-d15} + add r2, r2, r3, lsl #2 + vldmdb r2!, {s0-s3} + vldmia r1!, {s8-s11} + vldmdb r2!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s3, s8 + vmul.f32 s9, s2, s9 + vmul.f32 s10, s1, s10 + vmul.f32 s11, s0, s11 +1: + subs r3, r3, #16 + it ge + vldmdbge r2!, {s16-s19} + vmul.f32 s12, s7, s12 + it ge + vldmiage r1!, {s24-s27} + vmul.f32 s13, s6, s13 + it ge + vldmdbge r2!, {s20-s23} + vmul.f32 s14, s5, s14 + it ge + vldmiage r1!, {s28-s31} + vmul.f32 s15, s4, s15 + it ge + vmulge.f32 s24, s19, s24 + it gt + vldmdbgt r2!, {s0-s3} + it ge + vmulge.f32 s25, s18, s25 + vstmia r0!, {s8-s13} + it ge + vmulge.f32 s26, s17, s26 + it gt + vldmiagt r1!, {s8-s11} + itt ge + vmulge.f32 s27, s16, s27 + vmulge.f32 s28, s23, s28 + it gt + vldmdbgt r2!, {s4-s7} + it ge + vmulge.f32 s29, s22, s29 + vstmia r0!, {s14-s15} + ittt ge + vmulge.f32 s30, s21, s30 + vmulge.f32 s31, s20, s31 + vmulge.f32 s8, s3, s8 + it gt + vldmiagt r1!, {s12-s15} + itttt ge + vmulge.f32 s9, s2, s9 + vmulge.f32 s10, s1, s10 + vstmiage r0!, {s24-s27} + vmulge.f32 s11, s0, s11 + it ge + vstmiage r0!, {s28-s31} + bgt 1b + + vpop {d8-d15} + bx lr +endfunc |