diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-06-16 03:53:58 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-06-16 03:53:58 +0200 |
commit | 7a02527b05e2ae5ffab579062dbe3c888758335f (patch) | |
tree | c52ec9666f9436a6c15df3c6b32d08d897771aba /libavcodec | |
parent | a0bafaabb0656ca3bb3591beba0de79f6153fdac (diff) | |
parent | b203f65451646b1555d458a3601159f7d89a3397 (diff) | |
download | ffmpeg-7a02527b05e2ae5ffab579062dbe3c888758335f.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
  ac3enc: use correct alignment and length in channel coupling dsp functions.
  ffmpeg: don't abuse a global for passing framerate from input to output
  ffmpeg: don't abuse a global for passing channels from input to output
  ffmpeg: don't abuse a global for passing samplerate from input to output
  ARM: update ff_h264_idct8_add4_neon for 4:4:4 changes
  swscale: use SwsContext for av_log when available
  swscale: Remove HAVE_MMX from files that are only compiled with MMX enabled.
  swscale: Fix compilation with --disable-mmx2.
Conflicts:
  ffmpeg.c
  libswscale/utils.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/ac3enc_template.c | 24 |
-rw-r--r-- | libavcodec/arm/h264dsp_init_arm.c | 3 |
-rw-r--r-- | libavcodec/arm/h264idct_neon.S | 41 |
3 files changed, 38 insertions, 30 deletions
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c index 0547165aaf..f6248a82c9 100644 --- a/libavcodec/ac3enc_template.c +++ b/libavcodec/ac3enc_template.c @@ -134,36 +134,38 @@ void AC3_NAME(apply_channel_coupling)(AC3EncodeContext *s) LOCAL_ALIGNED_16(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]); int blk, ch, bnd, i, j; CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}}; - int num_cpl_coefs = s->num_cpl_subbands * 12; + int cpl_start, num_cpl_coefs; memset(cpl_coords, 0, AC3_MAX_BLOCKS * sizeof(*cpl_coords)); memset(fixed_cpl_coords, 0, AC3_MAX_BLOCKS * sizeof(*fixed_cpl_coords)); + /* align start to 16-byte boundary. align length to multiple of 32. + note: coupling start bin % 4 will always be 1 */ + cpl_start = s->start_freq[CPL_CH] - 1; + num_cpl_coefs = FFALIGN(s->num_cpl_subbands * 12 + 1, 32); + cpl_start = FFMIN(256, cpl_start + num_cpl_coefs) - num_cpl_coefs; + /* calculate coupling channel from fbw channels */ for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) { AC3Block *block = &s->blocks[blk]; - CoefType *cpl_coef = &block->mdct_coef[CPL_CH][s->start_freq[CPL_CH]]; + CoefType *cpl_coef = &block->mdct_coef[CPL_CH][cpl_start]; if (!block->cpl_in_use) continue; - memset(cpl_coef-1, 0, (num_cpl_coefs+4) * sizeof(*cpl_coef)); + memset(cpl_coef, 0, num_cpl_coefs * sizeof(*cpl_coef)); for (ch = 1; ch <= s->fbw_channels; ch++) { - CoefType *ch_coef = &block->mdct_coef[ch][s->start_freq[CPL_CH]]; + CoefType *ch_coef = &block->mdct_coef[ch][cpl_start]; if (!block->channel_in_cpl[ch]) continue; for (i = 0; i < num_cpl_coefs; i++) cpl_coef[i] += ch_coef[i]; } - /* note: coupling start bin % 4 will always be 1 and num_cpl_coefs - will always be a multiple of 12, so we need to subtract 1 from - the start and add 4 to the length when using optimized - functions which require 16-byte alignment. 
*/ /* coefficients must be clipped to +/- 1.0 in order to be encoded */ - s->dsp.vector_clipf(cpl_coef-1, cpl_coef-1, -1.0f, 1.0f, num_cpl_coefs+4); + s->dsp.vector_clipf(cpl_coef, cpl_coef, -1.0f, 1.0f, num_cpl_coefs); /* scale coupling coefficients from float to 24-bit fixed-point */ - s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][s->start_freq[CPL_CH]-1], - cpl_coef-1, num_cpl_coefs+4); + s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][cpl_start], + cpl_coef, num_cpl_coefs); } /* calculate energy in each band in coupling channel and each fbw channel */ diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index b344584799..e9146405c2 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -122,8 +122,7 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth) c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; c->h264_idct_add16 = ff_h264_idct_add16_neon; c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; - //FIXME: reenable when asm is updated. 
- //c->h264_idct_add8 = ff_h264_idct_add8_neon; + c->h264_idct_add8 = ff_h264_idct_add8_neon; c->h264_idct8_add = ff_h264_idct8_add_neon; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; c->h264_idct8_add4 = ff_h264_idct8_add4_neon; diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S index 6b6a669f35..afd3718518 100644 --- a/libavcodec/arm/h264idct_neon.S +++ b/libavcodec/arm/h264idct_neon.S @@ -148,24 +148,27 @@ function ff_h264_idct_add8_neon, export=1 add r5, r1, #16*4 add r1, r2, #16*32 mov r2, r3 + mov r3, r1 ldr r6, [sp, #32] movrel r7, scan8+16 - mov ip, #7 -1: ldrb r8, [r7], #1 - ldr r0, [r5], #4 + mov r12, #0 +1: ldrb r8, [r7, r12] + ldr r0, [r5, r12, lsl #2] ldrb r8, [r6, r8] - tst ip, #4 - addne r0, r0, r4 - addeq r0, r0, r9 + add r0, r0, r4 + add r1, r3, r12, lsl #5 cmp r8, #0 ldrsh r8, [r1] adrne lr, ff_h264_idct_add_neon adreq lr, ff_h264_idct_dc_add_neon cmpeq r8, #0 blxne lr - subs ip, ip, #1 - add r1, r1, #32 - bge 1b + add r12, r12, #1 + cmp r12, #4 + moveq r12, #16 + moveq r4, r9 + cmp r12, #20 + blt 1b pop {r4-r10,pc} endfunc @@ -374,11 +377,15 @@ function ff_h264_idct8_add4_neon, export=1 endfunc .section .rodata -scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8 - .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8 - .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8 - .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8 - .byte 1+1*8, 2+1*8 - .byte 1+2*8, 2+2*8 - .byte 1+4*8, 2+4*8 - .byte 1+5*8, 2+5*8 +scan8: .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 + .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 + .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 + .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 + .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 + .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 + .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 + .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 + .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8 + .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8 + .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8 + .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8 |