diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-01-10 02:50:54 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-01-10 03:50:41 +0100 |
commit | dd3ca3ea15392da8636c06764e2da31e6ca700f0 (patch) | |
tree | 97d3fc3bdb9463a99728e14d3cd4a0062aa3af19 /libavcodec | |
parent | 4805a33043e9356fc344aa53c7df747d41ce6b37 (diff) | |
parent | a67b8c86d06eb5b78a0fe4cb9be4e93b29726db1 (diff) | |
download | ffmpeg-dd3ca3ea15392da8636c06764e2da31e6ca700f0.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
fate: Add tests for more AAC features.
aacps: Add missing newline in error message.
fate: Add tests for vc1/wmapro in ism.
aacdec: Add a fate test for 5.1 channel SBR.
aacdec: Turn off PS for multichannel files that use PCE based configs.
cabac: remove put_cabac_u/ueg from cabac-test.
swscale: RGB4444 and BGR444 input
FATE: add test for xWMA demuxer.
FATE: add test for SMJPEG demuxer and associated IMA ADPCM audio decoder.
mpegaudiodec: optimized iMDCT transform
mpegaudiodec: change imdct window arrangement for better pointer alignment
mpegaudiodec: move imdct and windowing function to mpegaudiodsp
mpegaudiodec: interleave iMDCT buffer to simplify future SIMD implementations
swscale: convert yuy2/uyvy/nv12/nv21ToY/UV from inline asm to yasm.
FATE: test to exercise WTV demuxer.
mjpegdec: K&R formatting cosmetics
swscale: K&R formatting cosmetics for code examples
swscale: K&R reformatting cosmetics for header files
FATE test: cvid-grayscale; ensures that the grayscale Cinepak variant is exercised.
Conflicts:
libavcodec/cabac.c
libavcodec/mjpegdec.c
libavcodec/mpegaudiodec.c
libavcodec/mpegaudiodsp.c
libavcodec/mpegaudiodsp.h
libavcodec/mpegaudiodsp_template.c
libavcodec/x86/Makefile
libavcodec/x86/imdct36_sse.asm
libavcodec/x86/mpegaudiodec_mmx.c
libswscale/swscale-test.c
libswscale/swscale.c
libswscale/swscale_internal.h
libswscale/x86/swscale_template.c
tests/fate/demux.mak
tests/fate/microsoft.mak
tests/fate/video.mak
tests/fate/wma.mak
tests/ref/lavfi/pixfmts_scale
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/aac.h | 1 | ||||
-rw-r--r-- | libavcodec/aacdec.c | 21 | ||||
-rw-r--r-- | libavcodec/aacps.c | 2 | ||||
-rw-r--r-- | libavcodec/cabac.c | 74 | ||||
-rw-r--r-- | libavcodec/mpegaudiodec.c | 89 | ||||
-rw-r--r-- | libavcodec/mpegaudiodsp.c | 6 | ||||
-rw-r--r-- | libavcodec/mpegaudiodsp.h | 24 | ||||
-rw-r--r-- | libavcodec/mpegaudiodsp_template.c | 121 | ||||
-rw-r--r-- | libavcodec/x86/imdct36_sse.asm | 538 | ||||
-rw-r--r-- | libavcodec/x86/mpegaudiodec_mmx.c | 87 |
10 files changed, 681 insertions, 282 deletions
diff --git a/libavcodec/aac.h b/libavcodec/aac.h index 631fdd405f..6fdeedea21 100644 --- a/libavcodec/aac.h +++ b/libavcodec/aac.h @@ -84,6 +84,7 @@ enum BandType { #define IS_CODEBOOK_UNSIGNED(x) ((x - 1) & 10) enum ChannelPosition { + AAC_CHANNEL_OFF = 0, AAC_CHANNEL_FRONT = 1, AAC_CHANNEL_SIDE = 2, AAC_CHANNEL_BACK = 3, diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c index c0798429af..b7f76a61da 100644 --- a/libavcodec/aacdec.c +++ b/libavcodec/aacdec.c @@ -163,6 +163,19 @@ static ChannelElement *get_che(AACContext *ac, int type, int elem_id) } } +static int count_channels(enum ChannelPosition che_pos[4][MAX_ELEM_ID]) +{ + int i, type, sum = 0; + for (i = 0; i < MAX_ELEM_ID; i++) { + for (type = 0; type < 4; type++) { + sum += (1 + (type == TYPE_CPE)) * + (che_pos[type][i] != AAC_CHANNEL_OFF && + che_pos[type][i] != AAC_CHANNEL_CC); + } + } + return sum; +} + /** * Check for the channel element in the current channel position configuration. * If it exists, make sure the appropriate element is allocated and map the @@ -437,6 +450,12 @@ static int decode_ga_specific_config(AACContext *ac, AVCodecContext *avctx, if ((ret = set_default_channel_config(avctx, new_che_pos, channel_config))) return ret; } + + if (count_channels(new_che_pos) > 1) { + m4ac->ps = 0; + } else if (m4ac->sbr == 1 && m4ac->ps == -1) + m4ac->ps = 1; + if (ac && (ret = output_configure(ac, ac->che_pos, new_che_pos, channel_config, OC_GLOBAL_HDR))) return ret; @@ -495,8 +514,6 @@ static int decode_audio_specific_config(AACContext *ac, av_log(avctx, AV_LOG_ERROR, "invalid sampling rate index %d\n", m4ac->sampling_index); return -1; } - if (m4ac->sbr == 1 && m4ac->ps == -1) - m4ac->ps = 1; skip_bits_long(&gb, i); diff --git a/libavcodec/aacps.c b/libavcodec/aacps.c index 6ef8347ee5..3b5aa58109 100644 --- a/libavcodec/aacps.c +++ b/libavcodec/aacps.c @@ -223,7 +223,7 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps cnt -= 2 + ps_read_extension_data(gb, ps, 
ps_extension_id); } if (cnt < 0) { - av_log(avctx, AV_LOG_ERROR, "ps extension overflow %d", cnt); + av_log(avctx, AV_LOG_ERROR, "ps extension overflow %d\n", cnt); goto err; } skip_bits(gb, cnt); diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c index 983614581d..6cabfb9e73 100644 --- a/libavcodec/cabac.c +++ b/libavcodec/cabac.c @@ -252,67 +252,6 @@ static int put_cabac_terminate(CABACContext *c, int bit){ return (put_bits_count(&c->pb)+7)>>3; } -/** - * put (truncated) unary binarization. - */ -static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ - int i; - - assert(v <= max); - - for(i=0; i<v; i++){ - put_cabac(c, state, 1); - if(i < max_index) state++; - } - if(truncated==0 || v<max) - put_cabac(c, state, 0); -} - -/** - * put unary exp golomb k-th order binarization. - */ -static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ - int i; - - if(v==0) - put_cabac(c, state, 0); - else{ - const int sign= v < 0; - - if(is_signed) v= FFABS(v); - - if(v<max){ - for(i=0; i<v; i++){ - put_cabac(c, state, 1); - if(i < max_index) state++; - } - - put_cabac(c, state, 0); - }else{ - int m= 1<<k; - - for(i=0; i<max; i++){ - put_cabac(c, state, 1); - if(i < max_index) state++; - } - - v -= max; - while(v >= m){ //FIXME optimize - put_cabac_bypass(c, 1); - v-= m; - m+= m; - } - put_cabac_bypass(c, 0); - while(m>>=1){ - put_cabac_bypass(c, v&m); - } - } - - if(is_signed) - put_cabac_bypass(c, sign); - } -} - int main(void){ CABACContext c; uint8_t b[9*SIZE]; @@ -342,19 +281,6 @@ START_TIMER STOP_TIMER("put_cabac") } -#if 0 - for(i=0; i<SIZE; i++){ -START_TIMER - put_cabac_u(&c, state, r[i], 6, 3, i&1); -STOP_TIMER("put_cabac_u") - } - - for(i=0; i<SIZE; i++){ -START_TIMER - put_cabac_ueg(&c, state, r[i], 3, 0, 1, 2); -STOP_TIMER("put_cabac_ueg") - } -#endif put_cabac_terminate(&c, 1); ff_init_cabac_decoder(&c, b, SIZE); diff --git a/libavcodec/mpegaudiodec.c 
b/libavcodec/mpegaudiodec.c index eae030ce44..18ef40e011 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -132,10 +132,6 @@ static uint16_t band_index_long[9][23]; static INTFLOAT is_table[2][16]; static INTFLOAT is_table_lsf[2][2][16]; static INTFLOAT csa_table[8][4]; -/** Window for MDCT. Note that only the component [0,17] and [20,37] are used, - the components 18 and 19 are there only to assure 128-bit alignment for asm - */ -DECLARE_ALIGNED(16, static INTFLOAT, mdct_win)[8][40]; static int16_t division_tab3[1<<6 ]; static int16_t division_tab5[1<<8 ]; @@ -422,45 +418,6 @@ static av_cold void decode_init_static(void) csa_table[i][3] = ca - cs; #endif } - - /* compute mdct windows */ - for (i = 0; i < 36; i++) { - for (j = 0; j < 4; j++) { - double d; - - if (j == 2 && i % 3 != 1) - continue; - - d = sin(M_PI * (i + 0.5) / 36.0); - if (j == 1) { - if (i >= 30) d = 0; - else if (i >= 24) d = sin(M_PI * (i - 18 + 0.5) / 12.0); - else if (i >= 18) d = 1; - } else if (j == 3) { - if (i < 6) d = 0; - else if (i < 12) d = sin(M_PI * (i - 6 + 0.5) / 12.0); - else if (i < 18) d = 1; - } - //merge last stage of imdct into the window coefficients - d *= 0.5 / cos(M_PI * (2 * i + 19) / 72); - - if (j == 2) - mdct_win[j][i/3] = FIXHR((d / (1<<5))); - else { - int idx = i < 18 ? i : i + 2; - mdct_win[j][idx] = FIXHR((d / (1<<5))); - } - } - } - - /* NOTE: we do frequency inversion adter the MDCT by changing - the sign of the right window coefs */ - for (j = 0; j < 4; j++) { - for (i = 0; i < 40; i += 2) { - mdct_win[j + 4][i ] = mdct_win[j][i ]; - mdct_win[j + 4][i + 1] = -mdct_win[j][i + 1]; - } - } } static av_cold int decode_init(AVCodecContext * avctx) @@ -1284,59 +1241,53 @@ static void compute_imdct(MPADecodeContext *s, GranuleDef *g, mdct_long_end = sblimit; } - buf = mdct_buf; - ptr = g->sb_hybrid; - for (j = 0; j < mdct_long_end; j++) { - int win_idx = (g->switch_point && j < 2) ? 
0 : g->block_type; - /* apply window & overlap with previous buffer */ - out_ptr = sb_samples + j; - /* select window */ - win = mdct_win[win_idx + (4 & -(j & 1))]; - s->mpadsp.RENAME(imdct36)(out_ptr, buf, ptr, win); - out_ptr += 18 * SBLIMIT; - ptr += 18; - buf += 18; - } + s->mpadsp.RENAME(imdct36_blocks)(sb_samples, mdct_buf, g->sb_hybrid, + mdct_long_end, g->switch_point, + g->block_type); + + buf = mdct_buf + 4*18*(mdct_long_end >> 2) + (mdct_long_end & 3); + ptr = g->sb_hybrid + 18 * mdct_long_end; + for (j = mdct_long_end; j < sblimit; j++) { /* select frequency inversion */ - win = mdct_win[2 + (4 & -(j & 1))]; + win = RENAME(ff_mdct_win)[2 + (4 & -(j & 1))]; out_ptr = sb_samples + j; for (i = 0; i < 6; i++) { - *out_ptr = buf[i]; + *out_ptr = buf[4*i]; out_ptr += SBLIMIT; } imdct12(out2, ptr + 0); for (i = 0; i < 6; i++) { - *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[i + 6*1]; - buf[i + 6*2] = MULH3(out2[i + 6], win[i + 6], 1); + *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*1)]; + buf[4*(i + 6*2)] = MULH3(out2[i + 6], win[i + 6], 1); out_ptr += SBLIMIT; } imdct12(out2, ptr + 1); for (i = 0; i < 6; i++) { - *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[i + 6*2]; - buf[i + 6*0] = MULH3(out2[i + 6], win[i + 6], 1); + *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*2)]; + buf[4*(i + 6*0)] = MULH3(out2[i + 6], win[i + 6], 1); out_ptr += SBLIMIT; } imdct12(out2, ptr + 2); for (i = 0; i < 6; i++) { - buf[i + 6*0] = MULH3(out2[i ], win[i ], 1) + buf[i + 6*0]; - buf[i + 6*1] = MULH3(out2[i + 6], win[i + 6], 1); - buf[i + 6*2] = 0; + buf[4*(i + 6*0)] = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*0)]; + buf[4*(i + 6*1)] = MULH3(out2[i + 6], win[i + 6], 1); + buf[4*(i + 6*2)] = 0; } ptr += 18; - buf += 18; + buf += (j&3) != 3 ? 
1 : (4*18-3); } /* zero bands */ for (j = sblimit; j < SBLIMIT; j++) { /* overlap */ out_ptr = sb_samples + j; for (i = 0; i < 18; i++) { - *out_ptr = buf[i]; - buf[i] = 0; + *out_ptr = buf[4*i]; + buf[4*i] = 0; out_ptr += SBLIMIT; } - buf += 18; + buf += (j&3) != 3 ? 1 : (4*18-3); } } diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c index 212610d887..cc12dd9cee 100644 --- a/libavcodec/mpegaudiodsp.c +++ b/libavcodec/mpegaudiodsp.c @@ -28,6 +28,8 @@ void ff_mpadsp_init(MPADSPContext *s) DCTContext dct; ff_dct_init(&dct, 5, DCT_II); + ff_init_mpadsp_tabs_float(); + ff_init_mpadsp_tabs_fixed(); s->apply_window_float = ff_mpadsp_apply_window_float; s->apply_window_fixed = ff_mpadsp_apply_window_fixed; @@ -35,8 +37,8 @@ void ff_mpadsp_init(MPADSPContext *s) s->dct32_float = dct.dct32; s->dct32_fixed = ff_dct32_fixed; - s->imdct36_float = ff_imdct36_float; - s->imdct36_fixed = ff_imdct36_fixed; + s->imdct36_blocks_float = ff_imdct36_blocks_float; + s->imdct36_blocks_fixed = ff_imdct36_blocks_fixed; if (ARCH_ARM) ff_mpadsp_init_arm(s); if (HAVE_MMX) ff_mpadsp_init_mmx(s); diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h index 9e84784403..da53fe56e8 100644 --- a/libavcodec/mpegaudiodsp.h +++ b/libavcodec/mpegaudiodsp.h @@ -20,6 +20,7 @@ #define AVCODEC_MPEGAUDIODSP_H #include <stdint.h> +#include "libavutil/common.h" typedef struct MPADSPContext { void (*apply_window_float)(float *synth_buf, float *window, @@ -28,8 +29,11 @@ typedef struct MPADSPContext { int *dither_state, int16_t *samples, int incr); void (*dct32_float)(float *dst, const float *src); void (*dct32_fixed)(int *dst, const int *src); - void (*imdct36_float)(float *out, float *buf, float *in, float *win); - void (*imdct36_fixed)(int *out, int *buf, int *in, int *win); + + void (*imdct36_blocks_float)(float *out, float *buf, float *in, + int count, int switch_point, int block_type); + void (*imdct36_blocks_fixed)(int *out, int *buf, int *in, + int count, int switch_point, 
int block_type); } MPADSPContext; void ff_mpadsp_init(MPADSPContext *s); @@ -63,7 +67,19 @@ void ff_mpadsp_apply_window_fixed(int32_t *synth_buf, int32_t *window, int *dither_state, int16_t *samples, int incr); -void ff_imdct36_fixed(int *out, int *buf, int *in, int *win); -void ff_imdct36_float(float *out, float *buf, float *in, float *win); +void ff_imdct36_blocks_float(float *out, float *buf, float *in, + int count, int switch_point, int block_type); + +void ff_imdct36_blocks_fixed(int *out, int *buf, int *in, + int count, int switch_point, int block_type); + +void ff_init_mpadsp_tabs_float(void); +void ff_init_mpadsp_tabs_fixed(void); + +/** For SSE implementation, MDCT_BUF_SIZE/2 should be 128-bit aligned */ +#define MDCT_BUF_SIZE FFALIGN(36, 2*4) + +extern int ff_mdct_win_fixed[8][MDCT_BUF_SIZE]; +extern float ff_mdct_win_float[8][MDCT_BUF_SIZE]; #endif /* AVCODEC_MPEGAUDIODSP_H */ diff --git a/libavcodec/mpegaudiodsp_template.c b/libavcodec/mpegaudiodsp_template.c index 40eca76b81..53b6139da3 100644 --- a/libavcodec/mpegaudiodsp_template.c +++ b/libavcodec/mpegaudiodsp_template.c @@ -69,6 +69,12 @@ static inline int round_sample(int64_t *sum) # define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5)) #endif +/** Window for MDCT. Actually only the elements in [0,17] and + [MDCT_BUF_SIZE/2, MDCT_BUF_SIZE/2 + 17] are actually used. The rest + is just to preserve alignment for SIMD implementations. 
+*/ +DECLARE_ALIGNED(16, INTFLOAT, RENAME(ff_mdct_win))[8][MDCT_BUF_SIZE]; + DECLARE_ALIGNED(16, MPA_INT, RENAME(ff_mpa_synth_window))[512+256]; #define SUM8(op, sum, w, p) \ @@ -204,6 +210,7 @@ void av_cold RENAME(ff_mpa_synth_init)(MPA_INT *window) window[512 - i] = v; } + // Needed for avoiding shuffles in ASM implementations for(i=0; i < 8; i++) for(j=0; j < 16; j++) @@ -214,6 +221,48 @@ void av_cold RENAME(ff_mpa_synth_init)(MPA_INT *window) window[512+128+16*i+j] = window[64*i+48-j]; } +void RENAME(ff_init_mpadsp_tabs)(void) +{ + int i, j; + /* compute mdct windows */ + for (i = 0; i < 36; i++) { + for (j = 0; j < 4; j++) { + double d; + + if (j == 2 && i % 3 != 1) + continue; + + d = sin(M_PI * (i + 0.5) / 36.0); + if (j == 1) { + if (i >= 30) d = 0; + else if (i >= 24) d = sin(M_PI * (i - 18 + 0.5) / 12.0); + else if (i >= 18) d = 1; + } else if (j == 3) { + if (i < 6) d = 0; + else if (i < 12) d = sin(M_PI * (i - 6 + 0.5) / 12.0); + else if (i < 18) d = 1; + } + //merge last stage of imdct into the window coefficients + d *= 0.5 / cos(M_PI * (2 * i + 19) / 72); + + if (j == 2) + RENAME(ff_mdct_win)[j][i/3] = FIXHR((d / (1<<5))); + else { + int idx = i < 18 ? 
i : i + (MDCT_BUF_SIZE/2 - 18); + RENAME(ff_mdct_win)[j][idx] = FIXHR((d / (1<<5))); + } + } + } + + /* NOTE: we do frequency inversion after the MDCT by changing + the sign of the right window coefs */ + for (j = 0; j < 4; j++) { + for (i = 0; i < MDCT_BUF_SIZE; i += 2) { + RENAME(ff_mdct_win)[j + 4][i ] = RENAME(ff_mdct_win)[j][i ]; + RENAME(ff_mdct_win)[j + 4][i + 1] = -RENAME(ff_mdct_win)[j][i + 1]; + } + } +} /* cos(pi*i/18) */ #define C1 FIXHR(0.98480775301220805936/2) #define C2 FIXHR(0.93969262078590838405/2) @@ -227,43 +276,42 @@ void av_cold RENAME(ff_mpa_synth_init)(MPA_INT *window) /* 0.5 / cos(pi*(2*i+1)/36) */ static const INTFLOAT icos36[9] = { FIXR(0.50190991877167369479), - FIXR(0.51763809020504152469), + FIXR(0.51763809020504152469), //0 FIXR(0.55168895948124587824), FIXR(0.61038729438072803416), - FIXR(0.70710678118654752439), + FIXR(0.70710678118654752439), //1 FIXR(0.87172339781054900991), FIXR(1.18310079157624925896), - FIXR(1.93185165257813657349), + FIXR(1.93185165257813657349), //2 FIXR(5.73685662283492756461), }; /* 0.5 / cos(pi*(2*i+1)/36) */ static const INTFLOAT icos36h[9] = { FIXHR(0.50190991877167369479/2), - FIXHR(0.51763809020504152469/2), + FIXHR(0.51763809020504152469/2), //0 FIXHR(0.55168895948124587824/2), FIXHR(0.61038729438072803416/2), - FIXHR(0.70710678118654752439/2), + FIXHR(0.70710678118654752439/2), //1 FIXHR(0.87172339781054900991/2), FIXHR(1.18310079157624925896/4), - FIXHR(1.93185165257813657349/4), + FIXHR(1.93185165257813657349/4), //2 +// FIXHR(5.73685662283492756461), }; - /* using Lee like decomposition followed by hand coded 9 points DCT */ -void RENAME(ff_imdct36)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, - INTFLOAT *win) +static void imdct36(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, INTFLOAT *win) { int i, j; INTFLOAT t0, t1, t2, t3, s0, s1, s2, s3; INTFLOAT tmp[18], *tmp1, *in1; - for(i=17;i>=1;i--) + for (i = 17; i >= 1; i--) in[i] += in[i-1]; - for(i=17;i>=3;i-=2) + for (i = 17; i >= 3; i -= 2) in[i] += 
in[i-2]; - for(j=0;j<2;j++) { + for (j = 0; j < 2; j++) { tmp1 = tmp + j; in1 = in + j; @@ -295,7 +343,7 @@ void RENAME(ff_imdct36)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, } i = 0; - for(j=0;j<4;j++) { + for (j = 0; j < 4; j++) { t0 = tmp[i]; t1 = tmp[i + 2]; s0 = t1 + t0; @@ -303,22 +351,22 @@ void RENAME(ff_imdct36)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, t2 = tmp[i + 1]; t3 = tmp[i + 3]; - s1 = MULH3(t3 + t2, icos36h[j], 2); - s3 = MULLx(t3 - t2, icos36[8 - j], FRAC_BITS); + s1 = MULH3(t3 + t2, icos36h[ j], 2); + s3 = MULLx(t3 - t2, icos36 [8 - j], FRAC_BITS); t0 = s0 + s1; t1 = s0 - s1; - out[(9 + j)*SBLIMIT] = MULH3(t1, win[9 + j], 1) + buf[9 + j]; - out[(8 - j)*SBLIMIT] = MULH3(t1, win[8 - j], 1) + buf[8 - j]; - buf[9 + j] = MULH3(t0, win[20 + 9 + j], 1); - buf[8 - j] = MULH3(t0, win[20 + 8 - j], 1); + out[(9 + j) * SBLIMIT] = MULH3(t1, win[ 9 + j], 1) + buf[4*(9 + j)]; + out[(8 - j) * SBLIMIT] = MULH3(t1, win[ 8 - j], 1) + buf[4*(8 - j)]; + buf[4 * ( 9 + j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 9 + j], 1); + buf[4 * ( 8 - j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 8 - j], 1); t0 = s2 + s3; t1 = s2 - s3; - out[(9 + 8 - j)*SBLIMIT] = MULH3(t1, win[9 + 8 - j], 1) + buf[9 + 8 - j]; - out[( j)*SBLIMIT] = MULH3(t1, win[ j], 1) + buf[ j]; - buf[9 + 8 - j] = MULH3(t0, win[20 + 9 + 8 - j], 1); - buf[ + j] = MULH3(t0, win[20 + j], 1); + out[(9 + 8 - j) * SBLIMIT] = MULH3(t1, win[ 9 + 8 - j], 1) + buf[4*(9 + 8 - j)]; + out[ j * SBLIMIT] = MULH3(t1, win[ j], 1) + buf[4*( j)]; + buf[4 * ( 9 + 8 - j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 9 + 8 - j], 1); + buf[4 * ( j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + j], 1); i += 4; } @@ -326,9 +374,28 @@ void RENAME(ff_imdct36)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, s1 = MULH3(tmp[17], icos36h[4], 2); t0 = s0 + s1; t1 = s0 - s1; - out[(9 + 4)*SBLIMIT] = MULH3(t1, win[9 + 4], 1) + buf[9 + 4]; - out[(8 - 4)*SBLIMIT] = MULH3(t1, win[8 - 4], 1) + buf[8 - 4]; - buf[9 + 4] = MULH3(t0, win[20 + 9 + 4], 1); - buf[8 - 4] = MULH3(t0, 
win[20 + 8 - 4], 1); + out[(9 + 4) * SBLIMIT] = MULH3(t1, win[ 9 + 4], 1) + buf[4*(9 + 4)]; + out[(8 - 4) * SBLIMIT] = MULH3(t1, win[ 8 - 4], 1) + buf[4*(8 - 4)]; + buf[4 * ( 9 + 4 )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 9 + 4], 1); + buf[4 * ( 8 - 4 )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 8 - 4], 1); +} + +void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, + int count, int switch_point, int block_type) +{ + int j; + for (j=0 ; j < count; j++) { + /* apply window & overlap with previous buffer */ + + /* select window */ + int win_idx = (switch_point && j < 2) ? 0 : block_type; + INTFLOAT *win = RENAME(ff_mdct_win)[win_idx + (4 & -(j & 1))]; + + imdct36(out, buf, in, win); + + in += 18; + buf += ((j&3) != 3 ? 1 : (72-3)); + out++; + } } diff --git a/libavcodec/x86/imdct36_sse.asm b/libavcodec/x86/imdct36_sse.asm index 2b8fe57e0b..2908459db7 100644 --- a/libavcodec/x86/imdct36_sse.asm +++ b/libavcodec/x86/imdct36_sse.asm @@ -53,92 +53,118 @@ ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 dd 1.0, 0.70710678118654752439, 0.0, 0.0 +costabs: times 4 dd 0.98480773 + times 4 dd 0.93969262 + times 4 dd 0.86602539 + times 4 dd -0.76604444 + times 4 dd -0.64278764 + times 4 dd 0.50000000 + times 4 dd -0.50000000 + times 4 dd -0.34202015 + times 4 dd -0.17364818 + times 4 dd 0.50190992 + times 4 dd 0.51763808 + times 4 dd 0.55168896 + times 4 dd 0.61038726 + times 4 dd 0.70710677 + times 4 dd 0.87172341 + times 4 dd 1.18310082 + times 4 dd 1.93185163 + times 4 dd 5.73685646 + %define SBLIMIT 32 SECTION_TEXT -%macro PSHUFD_SSE_AVX 3 - shufps %1, %2, %2, %3 -%endmacro -%macro PSHUFD_SSE2 3 +%macro PSHUFD 3 +%if cpuflag(sse2) && notcpuflag(avx) pshufd %1, %2, %3 +%else + shufps %1, %2, %2, %3 +%endif %endmacro -; input %1={x1,x2,x3,x4}, %2={y1,y2,y3,y4} -; output %3={x3,x4,y1,y2} -%macro BUILDINVHIGHLOW_SSE 3 - movlhps %3, %2 - movhlps %3, %1 -%endmacro -%macro 
BUILDINVHIGHLOW_AVX 3 - shufps %3, %1, %2, 0x4e +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x3,x4,y1,y2} +%macro BUILDINVHIGHLOW 3 +%if cpuflag(avx) + shufps %1, %2, %3, 0x4e +%else + movlhps %1, %3 + movhlps %1, %2 +%endif %endmacro -; input %1={x1,x2,x3,x4}, %2={y1,y2,y3,y4} -; output %3={x4,y1,y2,y3} -%macro ROTLEFT_SSE 3 +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x4,y1,y2,y3} +%macro ROTLEFT 3 +%if cpuflag(ssse3) + palignr %1, %3, %2, 12 +%else BUILDINVHIGHLOW %1, %2, %3 - shufps %3, %3, %2, 0x99 -%endmacro - -%macro ROTLEFT_SSSE3 3 - palignr %3, %2, %1, 12 + shufps %1, %1, %3, 0x99 +%endif %endmacro -%macro INVERTHL_SSE1 2 +%macro INVERTHL 2 +%if cpuflag(sse2) + PSHUFD %1, %2, 0x4e +%else movhlps %1, %2 movlhps %1, %2 +%endif %endmacro -%macro INVERTHL_SSE2 2 - PSHUFD %1, %2, 0x4e -%endmacro - -%macro BUTTERF_SSE12 3 +%macro BUTTERF 3 INVERTHL %2, %1 xorps %1, [ps_p1p1m1m1] addps %1, %2 +%if cpuflag(sse3) + mulps %1, %1, [ps_cosh_sse3 + %3] + PSHUFD %2, %1, 0xb1 + addsubps %1, %1, %2 +%else mulps %1, [ps_cosh + %3] PSHUFD %2, %1, 0xb1 xorps %1, [ps_p1m1p1m1] addps %1, %2 -%endmacro -%macro BUTTERF_SSE3 3 - INVERTHL %2, %1 - xorps %1, %1, [ps_p1p1m1m1] - addps %1, %1, %2 - mulps %1, %1, [ps_cosh_sse3 + %3] - PSHUFD %2, %1, 0xb1 - addsubps %1, %1, %2 +%endif %endmacro -%macro STORE 3 +%macro STORE 4 movhlps %2, %1 - movss [%3 ], %1 - movss [%3 + 8*SBLIMIT], %2 + movss [%3 ], %1 + movss [%3 + 2*%4], %2 shufps %1, %1, 0xb1 - movss [%3 + 4*SBLIMIT], %1 + movss [%3 + %4], %1 movhlps %2, %1 - movss [%3 + 12*SBLIMIT], %2 + movss [%3 + 3*%4], %2 +%endmacro + +%macro LOAD 4 + movlps %1, [%3 ] + movhps %1, [%3 + %4] + movlps %2, [%3 + 2*%4] + movhps %2, [%3 + 3*%4] + shufps %1, %2, 0x88 %endmacro %macro LOADA64 2 +%if cpuflag(avx) + movu %1, [%2] +%else movlps %1, [%2] movhps %1, [%2 + 8] +%endif %endmacro -%macro STOREA64 2 - movlps [%1 ], %2 - movhps [%1 + 8], %2 -%endmacro - -%macro DEFINE_IMDCT 1 -cglobal imdct36_float_%1, 4,4,9, out, 
buf, in, win +%macro DEFINE_IMDCT 0 +cglobal imdct36_float, 4,4,9, out, buf, in, win ; for(i=17;i>=1;i--) in[i] += in[i-1]; LOADA64 m0, inq LOADA64 m1, inq + 16 - ROTLEFT m0, m1, m5 + ROTLEFT m5, m0, m1 PSHUFD m6, m0, 0x93 andps m6, m6, [ps_mask] @@ -146,16 +172,16 @@ cglobal imdct36_float_%1, 4,4,9, out, buf, in, win LOADA64 m2, inq + 32 - ROTLEFT m1, m2, m7 + ROTLEFT m7, m1, m2 addps m1, m1, m5 LOADA64 m3, inq + 48 - ROTLEFT m2, m3, m5 + ROTLEFT m5, m2, m3 xorps m4, m4, m4 movlps m4, [inq+64] - BUILDINVHIGHLOW m3, m4, m6 + BUILDINVHIGHLOW m6, m3, m4 shufps m6, m6, m4, 0xa9 addps m4, m4, m6 @@ -166,17 +192,17 @@ cglobal imdct36_float_%1, 4,4,9, out, buf, in, win movlhps m5, m5, m0 andps m5, m5, [ps_mask3] - BUILDINVHIGHLOW m0, m1, m7 + BUILDINVHIGHLOW m7, m0, m1 andps m7, m7, [ps_mask2] addps m0, m0, m5 - BUILDINVHIGHLOW m1, m2, m6 + BUILDINVHIGHLOW m6, m1, m2 andps m6, m6, [ps_mask2] addps m1, m1, m7 - BUILDINVHIGHLOW m2, m3, m7 + BUILDINVHIGHLOW m7, m2, m3 andps m7, m7, [ps_mask2] addps m2, m2, m6 @@ -245,7 +271,7 @@ cglobal imdct36_float_%1, 4,4,9, out, buf, in, win subps m0, m0, m1 addps m0, m0, m6 - BUILDINVHIGHLOW m2, m3, m4 + BUILDINVHIGHLOW m4, m2, m3 shufps m3, m3, m2, 0x4e ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5} @@ -280,7 +306,9 @@ cglobal imdct36_float_%1, 4,4,9, out, buf, in, win ; permutation done PSHUFD m6, m2, 0xb1 - movlps m7, [bufq + 64] + movss m4, [bufq + 4*68] + movss m7, [bufq + 4*64] + unpcklps m7, m7, m4 mulps m6, m6, [winq + 16*4] addps m6, m6, m7 movss [outq + 64*SBLIMIT], m6 @@ -288,74 +316,406 @@ cglobal imdct36_float_%1, 4,4,9, out, buf, in, win movss [outq + 68*SBLIMIT], m6 mulps m6, m3, [winq + 4*4] - LOADA64 m4, bufq + 16 + LOAD m4, m7, bufq + 4*16, 16 addps m6, m6, m4 - STORE m6, m7, outq + 16*SBLIMIT + STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT shufps m4, m0, m3, 0xb5 mulps m4, m4, [winq + 8*4] - LOADA64 m7, bufq + 32 + LOAD m7, m6, bufq + 4*32, 16 addps m4, m4, m7 - STORE m4, m6, outq + 32*SBLIMIT + STORE m4, m6, outq 
+ 32*SBLIMIT, 4*SBLIMIT shufps m3, m3, m2, 0xb1 mulps m3, m3, [winq + 12*4] - LOADA64 m7, bufq + 48 + LOAD m7, m6, bufq + 4*48, 16 addps m3, m3, m7 - STORE m3, m7, outq + 48*SBLIMIT + STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT mulps m2, m2, [winq] - LOADA64 m6, bufq + LOAD m6, m7, bufq, 16 addps m2, m2, m6 - STORE m2, m7, outq + STORE m2, m7, outq, 4*SBLIMIT mulps m4, m1, [winq + 20*4] - STOREA64 bufq, m4 + STORE m4, m7, bufq, 16 mulps m3, m5, [winq + 24*4] - STOREA64 bufq + 16, m3 + STORE m3, m7, bufq + 4*16, 16 shufps m0, m0, m5, 0xb0 mulps m0, m0, [winq + 28*4] - STOREA64 bufq + 32, m0 + STORE m0, m7, bufq + 4*32, 16 shufps m5, m5, m1, 0xb1 mulps m5, m5, [winq + 32*4] - STOREA64 bufq + 48, m5 + STORE m5, m7, bufq + 4*48, 16 shufps m1, m1, m1, 0xb1 mulps m1, m1, [winq + 36*4] - movlps [bufq + 64], m1 + movss [bufq + 4*64], m1 + shufps m1, m1, 0xb1 + movss [bufq + 4*68], m1 RET %endmacro -%define PSHUFD PSHUFD_SSE_AVX -%define INVERTHL INVERTHL_SSE1 -%define BUTTERF BUTTERF_SSE12 -%define BUTTERF0 BUTTERF0_SSE12 -%define BUILDINVHIGHLOW BUILDINVHIGHLOW_SSE -%define ROTLEFT ROTLEFT_SSE +INIT_XMM sse +DEFINE_IMDCT -INIT_XMM +INIT_XMM sse2 +DEFINE_IMDCT -DEFINE_IMDCT sse +INIT_XMM sse3 +DEFINE_IMDCT -%define PSHUFD PSHUFD_SSE2 -%define INVERTHL INVERTHL_SSE2 +INIT_XMM ssse3 +DEFINE_IMDCT -DEFINE_IMDCT sse2 +INIT_XMM avx +DEFINE_IMDCT -%define BUTTERF BUTTERF_SSE3 -%define BUTTERF0 BUTTERF0_SSE3 +INIT_XMM sse -DEFINE_IMDCT sse3 - -%define ROTLEFT ROTLEFT_SSSE3 +%ifdef ARCH_X86_64 +%define SPILL SWAP +%define UNSPILL SWAP +%define SPILLED(x) m %+ x +%else +%define SPILLED(x) [tmpq+(x-8)*16 + 32*4] +%macro SPILL 2 ; xmm#, mempos + movaps SPILLED(%2), m%1 +%endmacro +%macro UNSPILL 2 + movaps m%1, SPILLED(%2) +%endmacro +%endif -DEFINE_IMDCT ssse3 +%macro DEFINE_FOUR_IMDCT 0 +cglobal four_imdct36_float, 5,5,8, out, buf, in, win, tmp + movlps m0, [inq+64] + movhps m0, [inq+64 + 72] + movlps m3, [inq+64 + 2*72] + movhps m3, [inq+64 + 3*72] + + shufps m5, m0, m3, 0xdd + 
shufps m0, m0, m3, 0x88 + + mova m1, [inq+48] + movu m6, [inq+48 + 72] + mova m7, [inq+48 + 2*72] + movu m3, [inq+48 + 3*72] + + TRANSPOSE4x4PS 1, 6, 7, 3, 4 + + addps m4, m6, m7 + mova [tmpq+4*28], m4 + + addps m7, m3 + addps m6, m1 + addps m3, m0 + addps m0, m5 + addps m0, m7 + addps m7, m6 + mova [tmpq+4*12], m7 + SPILL 3, 12 + + mova m4, [inq+32] + movu m5, [inq+32 + 72] + mova m2, [inq+32 + 2*72] + movu m7, [inq+32 + 3*72] + + TRANSPOSE4x4PS 4, 5, 2, 7, 3 + + addps m1, m7 + SPILL 1, 11 + + addps m3, m5, m2 + SPILL 3, 13 + + addps m7, m2 + addps m5, m4 + addps m6, m7 + mova [tmpq], m6 + addps m7, m5 + mova [tmpq+4*16], m7 + + mova m2, [inq+16] + movu m7, [inq+16 + 72] + mova m1, [inq+16 + 2*72] + movu m6, [inq+16 + 3*72] + + TRANSPOSE4x4PS 2, 7, 1, 6, 3 + + addps m4, m6 + addps m6, m1 + addps m1, m7 + addps m7, m2 + addps m5, m6 + SPILL 5, 15 + addps m6, m7 + mulps m6, [costabs + 16*2] + mova [tmpq+4*8], m6 + SPILL 1, 10 + SPILL 0, 14 + + mova m1, [inq] + movu m6, [inq + 72] + mova m3, [inq + 2*72] + movu m5, [inq + 3*72] + + TRANSPOSE4x4PS 1, 6, 3, 5, 0 + + addps m2, m5 + addps m5, m3 + addps m7, m5 + addps m3, m6 + addps m6, m1 + SPILL 7, 8 + addps m5, m6 + SPILL 6, 9 + addps m6, m4, SPILLED(12) + subps m6, m2 + UNSPILL 7, 11 + SPILL 5, 11 + subps m5, m1, m7 + mulps m7, [costabs + 16*5] + addps m7, m1 + mulps m0, m6, [costabs + 16*6] + addps m0, m5 + mova [tmpq+4*24], m0 + addps m6, m5 + mova [tmpq+4*4], m6 + addps m6, m4, m2 + mulps m6, [costabs + 16*1] + subps m4, SPILLED(12) + mulps m4, [costabs + 16*8] + addps m2, SPILLED(12) + mulps m2, [costabs + 16*3] + subps m5, m7, m6 + subps m5, m2 + addps m6, m7 + addps m6, m4 + addps m7, m2 + subps m7, m4 + mova [tmpq+4*20], m7 + mova m2, [tmpq+4*28] + mova [tmpq+4*28], m5 + UNSPILL 7, 13 + subps m5, m7, m2 + mulps m5, [costabs + 16*7] + UNSPILL 1, 10 + mulps m1, [costabs + 16*2] + addps m4, m3, m2 + mulps m4, [costabs + 16*4] + addps m2, m7 + addps m7, m3 + mulps m7, [costabs] + subps m3, m2 + mulps m3, [costabs 
+ 16*2] + addps m2, m7, m5 + addps m2, m1 + SPILL 2, 10 + addps m7, m4 + subps m7, m1 + SPILL 7, 12 + subps m5, m4 + subps m5, m1 + UNSPILL 0, 14 + SPILL 5, 13 + addps m1, m0, SPILLED(15) + subps m1, SPILLED(8) + mova m4, [costabs + 16*5] + mulps m4, [tmpq] + UNSPILL 2, 9 + addps m4, m2 + subps m2, [tmpq] + mulps m5, m1, [costabs + 16*6] + addps m5, m2 + SPILL 5, 9 + addps m2, m1 + SPILL 2, 14 + UNSPILL 5, 15 + subps m7, m5, m0 + addps m5, SPILLED(8) + mulps m5, [costabs + 16*1] + mulps m7, [costabs + 16*8] + addps m0, SPILLED(8) + mulps m0, [costabs + 16*3] + subps m2, m4, m5 + subps m2, m0 + SPILL 2, 15 + addps m5, m4 + addps m5, m7 + addps m4, m0 + subps m4, m7 + SPILL 4, 8 + mova m7, [tmpq+4*16] + mova m2, [tmpq+4*12] + addps m0, m7, m2 + subps m0, SPILLED(11) + mulps m0, [costabs + 16*2] + addps m4, m7, SPILLED(11) + mulps m4, [costabs] + subps m7, m2 + mulps m7, [costabs + 16*7] + addps m2, SPILLED(11) + mulps m2, [costabs + 16*4] + addps m1, m7, [tmpq+4*8] + addps m1, m4 + addps m4, m2 + subps m4, [tmpq+4*8] + SPILL 4, 11 + subps m7, m2 + subps m7, [tmpq+4*8] + addps m4, m6, SPILLED(10) + subps m6, SPILLED(10) + addps m2, m5, m1 + mulps m2, [costabs + 16*9] + subps m5, m1 + mulps m5, [costabs + 16*17] + subps m1, m4, m2 + addps m4, m2 + mulps m2, m1, [winq+4*36] + addps m2, [bufq+4*36] + mova [outq+1152], m2 + mulps m1, [winq+4*32] + addps m1, [bufq+4*32] + mova [outq+1024], m1 + mulps m1, m4, [winq+4*116] + mova [bufq+4*36], m1 + mulps m4, [winq+4*112] + mova [bufq+4*32], m4 + addps m2, m6, m5 + subps m6, m5 + mulps m1, m6, [winq+4*68] + addps m1, [bufq+4*68] + mova [outq+2176], m1 + mulps m6, [winq] + addps m6, [bufq] + mova [outq], m6 + mulps m1, m2, [winq+4*148] + mova [bufq+4*68], m1 + mulps m2, [winq+4*80] + mova [bufq], m2 + addps m5, m3, [tmpq+4*24] + mova m2, [tmpq+4*24] + subps m2, m3 + mova m1, SPILLED(9) + subps m1, m0 + mulps m1, [costabs + 16*10] + addps m0, SPILLED(9) + mulps m0, [costabs + 16*16] + addps m6, m5, m1 + subps m5, m1 + mulps m3, 
m5, [winq+4*40] + addps m3, [bufq+4*40] + mova [outq+1280], m3 + mulps m5, [winq+4*28] + addps m5, [bufq+4*28] + mova [outq+896], m5 + mulps m1, m6, [winq+4*120] + mova [bufq+4*40], m1 + mulps m6, [winq+4*108] + mova [bufq+4*28], m6 + addps m1, m2, m0 + subps m2, m0 + mulps m5, m2, [winq+4*64] + addps m5, [bufq+4*64] + mova [outq+2048], m5 + mulps m2, [winq+4*4] + addps m2, [bufq+4*4] + mova [outq+128], m2 + mulps m0, m1, [winq+4*144] + mova [bufq+4*64], m0 + mulps m1, [winq+4*84] + mova [bufq+4*4], m1 + mova m1, [tmpq+4*28] + mova m5, m1 + addps m1, SPILLED(13) + subps m5, SPILLED(13) + UNSPILL 3, 15 + addps m2, m7, m3 + mulps m2, [costabs + 16*11] + subps m3, m7 + mulps m3, [costabs + 16*15] + addps m0, m2, m1 + subps m1, m2 + SWAP m0, m2 + mulps m6, m1, [winq+4*44] + addps m6, [bufq+4*44] + mova [outq+1408], m6 + mulps m1, [winq+4*24] + addps m1, [bufq+4*24] + mova [outq+768], m1 + mulps m0, m2, [winq+4*124] + mova [bufq+4*44], m0 + mulps m2, [winq+4*104] + mova [bufq+4*24], m2 + addps m0, m5, m3 + subps m5, m3 + mulps m1, m5, [winq+4*60] + addps m1, [bufq+4*60] + mova [outq+1920], m1 + mulps m5, [winq+4*8] + addps m5, [bufq+4*8] + mova [outq+256], m5 + mulps m1, m0, [winq+4*140] + mova [bufq+4*60], m1 + mulps m0, [winq+4*88] + mova [bufq+4*8], m0 + mova m1, [tmpq+4*20] + addps m1, SPILLED(12) + mova m2, [tmpq+4*20] + subps m2, SPILLED(12) + UNSPILL 7, 8 + subps m0, m7, SPILLED(11) + addps m7, SPILLED(11) + mulps m4, m7, [costabs + 16*12] + mulps m0, [costabs + 16*14] + addps m5, m1, m4 + subps m1, m4 + mulps m7, m1, [winq+4*48] + addps m7, [bufq+4*48] + mova [outq+1536], m7 + mulps m1, [winq+4*20] + addps m1, [bufq+4*20] + mova [outq+640], m1 + mulps m1, m5, [winq+4*128] + mova [bufq+4*48], m1 + mulps m5, [winq+4*100] + mova [bufq+4*20], m5 + addps m6, m2, m0 + subps m2, m0 + mulps m1, m2, [winq+4*56] + addps m1, [bufq+4*56] + mova [outq+1792], m1 + mulps m2, [winq+4*12] + addps m2, [bufq+4*12] + mova [outq+384], m2 + mulps m0, m6, [winq+4*136] + mova 
[bufq+4*56], m0 + mulps m6, [winq+4*92] + mova [bufq+4*12], m6 + UNSPILL 0, 14 + mulps m0, [costabs + 16*13] + mova m3, [tmpq+4*4] + addps m2, m0, m3 + subps m3, m0 + mulps m0, m3, [winq+4*52] + addps m0, [bufq+4*52] + mova [outq+1664], m0 + mulps m3, [winq+4*16] + addps m3, [bufq+4*16] + mova [outq+512], m3 + mulps m0, m2, [winq+4*132] + mova [bufq+4*52], m0 + mulps m2, [winq+4*96] + mova [bufq+4*16], m2 + RET +%endmacro -%define BUILDINVHIGHLOW BUILDINVHIGHLOW_AVX -%define PSHUFD PSHUFD_SSE_AVX +INIT_XMM sse +DEFINE_FOUR_IMDCT -INIT_AVX -DEFINE_IMDCT avx +INIT_XMM avx +DEFINE_FOUR_IMDCT diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c index 980faf9cde..b1fa6573b8 100644 --- a/libavcodec/x86/mpegaudiodec_mmx.c +++ b/libavcodec/x86/mpegaudiodec_mmx.c @@ -29,6 +29,12 @@ void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win); void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win); void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win); void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win); +void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, + float *tmpbuf); +void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, + float *tmpbuf); + +DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; #define MACS(rt, ra, rb) rt+=(ra)*(rb) #define MLSS(rt, ra, rb) rt-=(ra)*(rb) @@ -153,26 +159,79 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out, *out = sum; } + +#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ +static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ + int count, int switch_point, int block_type) \ +{ \ + int align_end = count - (count & 3); \ + int j; \ + for (j = 0; j < align_end; j+= 4) { \ + LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ + float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ + /* apply window & overlap with previous buffer */ \ + \ + /* 
select window */ \ + ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ + in += 4*18; \ + buf += 4*18; \ + out += 4; \ + } \ + for (; j < count; j++) { \ + /* apply window & overlap with previous buffer */ \ + \ + /* select window */ \ + int win_idx = (switch_point && j < 2) ? 0 : block_type; \ + float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ + \ + ff_imdct36_float_ ## CPU1(out, buf, in, win); \ + \ + in += 18; \ + buf++; \ + out++; \ + } \ +} + +DECL_IMDCT_BLOCKS(sse,sse) +DECL_IMDCT_BLOCKS(sse2,sse) +DECL_IMDCT_BLOCKS(sse3,sse) +DECL_IMDCT_BLOCKS(ssse3,sse) +DECL_IMDCT_BLOCKS(avx,avx) + void ff_mpadsp_init_mmx(MPADSPContext *s) { int mm_flags = av_get_cpu_flags(); + int i, j; + for (j = 0; j < 4; j++) { + for (i = 0; i < 40; i ++) { + mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; + mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; + mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + } + } + if (mm_flags & AV_CPU_FLAG_SSE2) { s->apply_window_float = apply_window_mp3; } - if (HAVE_YASM && mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { - s->imdct36_float = ff_imdct36_float_avx; - } - else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSE) { - s->imdct36_float = ff_imdct36_float_ssse3; - } - else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE3 && HAVE_SSE) { - s->imdct36_float = ff_imdct36_float_sse3; - } - else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { - s->imdct36_float = ff_imdct36_float_sse2; - } - else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { - s->imdct36_float = ff_imdct36_float_sse; +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { + s->imdct36_blocks_float = imdct36_blocks_avx; 
+#if HAVE_SSE + } else if (mm_flags & AV_CPU_FLAG_SSSE3) { + s->imdct36_blocks_float = imdct36_blocks_ssse3; + } else if (mm_flags & AV_CPU_FLAG_SSE3) { + s->imdct36_blocks_float = imdct36_blocks_sse3; + } else if (mm_flags & AV_CPU_FLAG_SSE2) { + s->imdct36_blocks_float = imdct36_blocks_sse2; + } else if (mm_flags & AV_CPU_FLAG_SSE) { + s->imdct36_blocks_float = imdct36_blocks_sse; +#endif /* HAVE_SSE */ } +#endif /* HAVE_YASM */ } |