diff options
author | Ben Avison <bavison@riscosopen.org> | 2013-07-15 18:28:17 +0100 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2013-07-22 10:15:44 +0300 |
commit | ff30d121595e21f2fe07fbe31afefad0c719b5d7 (patch) | |
tree | f38765ea6be93f152a045ea5e88bbe72cbf48511 | |
parent | 800ffab48a7844dd5dc0a33b8f6b8e5ed718cf2e (diff) | |
download | ffmpeg-ff30d121595e21f2fe07fbe31afefad0c719b5d7.tar.gz |
arm: Add VFP-accelerated version of qmf_32_subbands
                   Before              After
                   Mean      StdDev    Mean      StdDev    Change
This function      1323.0    98.0      746.2     60.6      +77.3%
Overall            15400.0   336.4     14147.5   288.4     +8.9%
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- | libavcodec/arm/dcadsp_init_arm.c | 10 | ||||
-rw-r--r-- | libavcodec/arm/dcadsp_vfp.S | 273 |
2 files changed, 282 insertions, 1 deletions
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c index 824b909aae..c25bb83c80 100644 --- a/libavcodec/arm/dcadsp_init_arm.c +++ b/libavcodec/arm/dcadsp_init_arm.c @@ -26,6 +26,12 @@ void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs, int decifactor, float scale); +void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, + SynthFilterContext *synth, FFTContext *imdct, + float synth_buf_ptr[512], + int *synth_buf_offset, float synth_buf2[32], + const float window[512], float *samples_out, + float raXin[32], float scale); void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, int decifactor, float scale); @@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s) { int cpu_flags = av_get_cpu_flags(); - if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) + if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) { s->lfe_fir = ff_dca_lfe_fir_vfp; + s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp; + } if (have_neon(cpu_flags)) s->lfe_fir = ff_dca_lfe_fir_neon; } diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S index 57e16196f7..6039e87dfc 100644 --- a/libavcodec/arm/dcadsp_vfp.S +++ b/libavcodec/arm/dcadsp_vfp.S @@ -218,3 +218,276 @@ endfunc .unreq POST1 .unreq POST2 .unreq POST3 + + +IN .req a1 +SBACT .req a2 +OLDFPSCR .req a3 +IMDCT .req a4 +WINDOW .req v1 +OUT .req v2 +BUF .req v3 +SCALEINT .req v4 @ only used in softfp case +COUNT .req v5 + +SCALE .req s0 + +/* Stack layout differs in softfp and hardfp cases: + * + * hardfp + * fp -> 6 arg words saved by caller + * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes) + * s16-s23 on entry + * align 16 + * buf -> 8*32*4 bytes buffer + * s0 on entry + * sp -> 3 arg words for callee + * + * softfp + * fp -> 7 arg words saved by caller + * a4,v1-v5,fp,lr on entry + * s16-s23 on entry + * align 16 + * buf -> 8*32*4 bytes buffer + * sp -> 4 arg words for callee + */ + +/* void ff_dca_qmf_32_subbands_vfp(float 
samples_in[32][8], int sb_act, + * SynthFilterContext *synth, FFTContext *imdct, + * float (*synth_buf_ptr)[512], + * int *synth_buf_offset, float (*synth_buf2)[32], + * const float (*window)[512], float *samples_out, + * float (*raXin)[32], float scale); + */ +function ff_dca_qmf_32_subbands_vfp, export=1 +VFP push {a3-a4,v1-v3,v5,fp,lr} +NOVFP push {a4,v1-v5,fp,lr} + add fp, sp, #8*4 + vpush {s16-s23} + @ The buffer pointed at by raXin isn't big enough for us to do a + @ complete matrix transposition as we want to, so allocate an + @ alternative buffer from the stack. Align to 4 words for speed. + sub BUF, sp, #8*32*4 + bic BUF, BUF, #15 + mov sp, BUF + ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2 + fmrx OLDFPSCR, FPSCR + fmxr FPSCR, lr + @ COUNT is used to count down 2 things at once: + @ bits 0-4 are the number of word pairs remaining in the output row + @ bits 5-31 are the number of words to copy (with possible negation) + @ from the source matrix before we start zeroing the remainder + mov COUNT, #(-4 << 5) + 16 + adds COUNT, COUNT, SBACT, lsl #5 + bmi 2f +1: + vldr s8, [IN, #(0*8+0)*4] + vldr s10, [IN, #(0*8+1)*4] + vldr s12, [IN, #(0*8+2)*4] + vldr s14, [IN, #(0*8+3)*4] + vldr s16, [IN, #(0*8+4)*4] + vldr s18, [IN, #(0*8+5)*4] + vldr s20, [IN, #(0*8+6)*4] + vldr s22, [IN, #(0*8+7)*4] + vneg.f s8, s8 + vldr s9, [IN, #(1*8+0)*4] + vldr s11, [IN, #(1*8+1)*4] + vldr s13, [IN, #(1*8+2)*4] + vldr s15, [IN, #(1*8+3)*4] + vneg.f s16, s16 + vldr s17, [IN, #(1*8+4)*4] + vldr s19, [IN, #(1*8+5)*4] + vldr s21, [IN, #(1*8+6)*4] + vldr s23, [IN, #(1*8+7)*4] + vstr d4, [BUF, #(0*32+0)*4] + vstr d5, [BUF, #(1*32+0)*4] + vstr d6, [BUF, #(2*32+0)*4] + vstr d7, [BUF, #(3*32+0)*4] + vstr d8, [BUF, #(4*32+0)*4] + vstr d9, [BUF, #(5*32+0)*4] + vstr d10, [BUF, #(6*32+0)*4] + vstr d11, [BUF, #(7*32+0)*4] + vldr s9, [IN, #(3*8+0)*4] + vldr s11, [IN, #(3*8+1)*4] + vldr s13, [IN, #(3*8+2)*4] + vldr s15, [IN, #(3*8+3)*4] + vldr s17, [IN, #(3*8+4)*4] + vldr 
s19, [IN, #(3*8+5)*4] + vldr s21, [IN, #(3*8+6)*4] + vldr s23, [IN, #(3*8+7)*4] + vneg.f s9, s9 + vldr s8, [IN, #(2*8+0)*4] + vldr s10, [IN, #(2*8+1)*4] + vldr s12, [IN, #(2*8+2)*4] + vldr s14, [IN, #(2*8+3)*4] + vneg.f s17, s17 + vldr s16, [IN, #(2*8+4)*4] + vldr s18, [IN, #(2*8+5)*4] + vldr s20, [IN, #(2*8+6)*4] + vldr s22, [IN, #(2*8+7)*4] + vstr d4, [BUF, #(0*32+2)*4] + vstr d5, [BUF, #(1*32+2)*4] + vstr d6, [BUF, #(2*32+2)*4] + vstr d7, [BUF, #(3*32+2)*4] + vstr d8, [BUF, #(4*32+2)*4] + vstr d9, [BUF, #(5*32+2)*4] + vstr d10, [BUF, #(6*32+2)*4] + vstr d11, [BUF, #(7*32+2)*4] + add IN, IN, #4*8*4 + add BUF, BUF, #4*4 + subs COUNT, COUNT, #(4 << 5) + 2 + bpl 1b +2: @ Now deal with trailing < 4 samples + adds COUNT, COUNT, #3 << 5 + bmi 4f @ sb_act was a multiple of 4 + bics lr, COUNT, #0x1F + bne 3f + @ sb_act was n*4+1 + vldr s8, [IN, #(0*8+0)*4] + vldr s10, [IN, #(0*8+1)*4] + vldr s12, [IN, #(0*8+2)*4] + vldr s14, [IN, #(0*8+3)*4] + vldr s16, [IN, #(0*8+4)*4] + vldr s18, [IN, #(0*8+5)*4] + vldr s20, [IN, #(0*8+6)*4] + vldr s22, [IN, #(0*8+7)*4] + vneg.f s8, s8 + vldr s9, zero + vldr s11, zero + vldr s13, zero + vldr s15, zero + vneg.f s16, s16 + vldr s17, zero + vldr s19, zero + vldr s21, zero + vldr s23, zero + vstr d4, [BUF, #(0*32+0)*4] + vstr d5, [BUF, #(1*32+0)*4] + vstr d6, [BUF, #(2*32+0)*4] + vstr d7, [BUF, #(3*32+0)*4] + vstr d8, [BUF, #(4*32+0)*4] + vstr d9, [BUF, #(5*32+0)*4] + vstr d10, [BUF, #(6*32+0)*4] + vstr d11, [BUF, #(7*32+0)*4] + add BUF, BUF, #2*4 + sub COUNT, COUNT, #1 + b 4f +3: @ sb_act was n*4+2 or n*4+3, so do the first 2 + vldr s8, [IN, #(0*8+0)*4] + vldr s10, [IN, #(0*8+1)*4] + vldr s12, [IN, #(0*8+2)*4] + vldr s14, [IN, #(0*8+3)*4] + vldr s16, [IN, #(0*8+4)*4] + vldr s18, [IN, #(0*8+5)*4] + vldr s20, [IN, #(0*8+6)*4] + vldr s22, [IN, #(0*8+7)*4] + vneg.f s8, s8 + vldr s9, [IN, #(1*8+0)*4] + vldr s11, [IN, #(1*8+1)*4] + vldr s13, [IN, #(1*8+2)*4] + vldr s15, [IN, #(1*8+3)*4] + vneg.f s16, s16 + vldr s17, [IN, #(1*8+4)*4] + vldr s19, 
[IN, #(1*8+5)*4] + vldr s21, [IN, #(1*8+6)*4] + vldr s23, [IN, #(1*8+7)*4] + vstr d4, [BUF, #(0*32+0)*4] + vstr d5, [BUF, #(1*32+0)*4] + vstr d6, [BUF, #(2*32+0)*4] + vstr d7, [BUF, #(3*32+0)*4] + vstr d8, [BUF, #(4*32+0)*4] + vstr d9, [BUF, #(5*32+0)*4] + vstr d10, [BUF, #(6*32+0)*4] + vstr d11, [BUF, #(7*32+0)*4] + add BUF, BUF, #2*4 + sub COUNT, COUNT, #(2 << 5) + 1 + bics lr, COUNT, #0x1F + bne 4f + @ sb_act was n*4+3 + vldr s8, [IN, #(2*8+0)*4] + vldr s10, [IN, #(2*8+1)*4] + vldr s12, [IN, #(2*8+2)*4] + vldr s14, [IN, #(2*8+3)*4] + vldr s16, [IN, #(2*8+4)*4] + vldr s18, [IN, #(2*8+5)*4] + vldr s20, [IN, #(2*8+6)*4] + vldr s22, [IN, #(2*8+7)*4] + vldr s9, zero + vldr s11, zero + vldr s13, zero + vldr s15, zero + vldr s17, zero + vldr s19, zero + vldr s21, zero + vldr s23, zero + vstr d4, [BUF, #(0*32+0)*4] + vstr d5, [BUF, #(1*32+0)*4] + vstr d6, [BUF, #(2*32+0)*4] + vstr d7, [BUF, #(3*32+0)*4] + vstr d8, [BUF, #(4*32+0)*4] + vstr d9, [BUF, #(5*32+0)*4] + vstr d10, [BUF, #(6*32+0)*4] + vstr d11, [BUF, #(7*32+0)*4] + add BUF, BUF, #2*4 + sub COUNT, COUNT, #1 +4: @ Now fill the remainder with 0 + vldr s8, zero + vldr s9, zero + ands COUNT, COUNT, #0x1F + beq 6f +5: vstr d4, [BUF, #(0*32+0)*4] + vstr d4, [BUF, #(1*32+0)*4] + vstr d4, [BUF, #(2*32+0)*4] + vstr d4, [BUF, #(3*32+0)*4] + vstr d4, [BUF, #(4*32+0)*4] + vstr d4, [BUF, #(5*32+0)*4] + vstr d4, [BUF, #(6*32+0)*4] + vstr d4, [BUF, #(7*32+0)*4] + add BUF, BUF, #2*4 + subs COUNT, COUNT, #1 + bne 5b +6: + fmxr FPSCR, OLDFPSCR + ldr WINDOW, [fp, #3*4] + ldr OUT, [fp, #4*4] + sub BUF, BUF, #32*4 +NOVFP ldr SCALEINT, [fp, #6*4] + mov COUNT, #8 +VFP vpush {SCALE} +VFP sub sp, sp, #3*4 +NOVFP sub sp, sp, #4*4 +7: +VFP ldr a1, [fp, #-7*4] @ imdct +NOVFP ldr a1, [fp, #-8*4] + ldmia fp, {a2-a4} +VFP stmia sp, {WINDOW, OUT, BUF} +NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT} +VFP vldr SCALE, [sp, #3*4] + bl ff_synth_filter_float_vfp + add OUT, OUT, #32*4 + add BUF, BUF, #32*4 + subs COUNT, COUNT, #1 + bne 7b + +A sub sp, 
fp, #(8+8)*4 +T sub fp, fp, #(8+8)*4 +T mov sp, fp + vpop {s16-s23} +VFP pop {a3-a4,v1-v3,v5,fp,pc} +NOVFP pop {a4,v1-v5,fp,pc} +endfunc + + .unreq IN + .unreq SBACT + .unreq OLDFPSCR + .unreq IMDCT + .unreq WINDOW + .unreq OUT + .unreq BUF + .unreq SCALEINT + .unreq COUNT + + .unreq SCALE + + .align 2 +zero: .word 0 |