author    | Michael Niedermayer <michaelni@gmx.at> | 2012-08-03 22:43:44 +0200
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-08-03 23:13:06 +0200
commit    | a7acab6cda69d55c3d9d75a3ebb6cff68b15e689 (patch)
tree      | fb79107f7f9e93451092d1f246243b9022a9ee2e
parent    | a763cafc0c69e3fad91f97867b942182804f79b0 (diff)
parent    | 9cc74c9f6e8b645e67d45b2070db004caca09af7 (diff)
download  | ffmpeg-a7acab6cda69d55c3d9d75a3ebb6cff68b15e689.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master:
vc1dec: Remove separate scaling function for interlaced field MVs
vc1dec: Invoke edge_emulation regardless of MV precision
x86: Use consistent 3dnowext function and macro name suffixes
g723_1: scale output as supposed for the case with postfilter disabled
g723_1: increase excitation storage by 4
g723_1: fix upper bound parameter from inverse maximum autocorrelation
g723_1: make scale_vector() behave like the reference
g723_1: fix off-by-one error in normalize_bits()
g723_1: save/restore excitation with offset to store LPC history
wmapro: prevent division by zero when sample rate is unspecified
x86: proresdsp: improve SIGNEXTEND macro comments
x86: h264dsp: K&R formatting cosmetics
LICENSE: Document all GPL files
Conflicts:
libavcodec/g723_1.c
libavcodec/wmaprodec.c
libavcodec/x86/h264dsp_mmx.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
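
Two of the g723_1 entries above (the normalize_bits() off-by-one and the scale_vector() reference-behaviour fix) concern fixed-point headroom counting. As an illustration only — a minimal standalone sketch, not FFmpeg's code, with a hypothetical helper name — the computation in question looks like this:

```c
#include <stdio.h>

/* Count how many left shifts bring the most significant set bit of a
 * positive magnitude up to |target_bit| (15 for Q15 samples). An
 * off-by-one in this count rescales the whole frame by a factor of two,
 * which is the class of bug the normalize_bits() fix addresses. */
static int normalize_bits_sketch(int v, int target_bit)
{
    int bits = 0;

    if (v <= 0)
        return target_bit; /* degenerate input: treat as full headroom */
    while (!(v & (1 << target_bit))) {
        v <<= 1;
        bits++;
    }
    return bits;
}

int main(void)
{
    /* 0x0100 (bit 8 set) needs 7 shifts to reach bit 15. */
    printf("%d\n", normalize_bits_sketch(0x0100, 15));
    return 0;
}
```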
-rw-r--r-- | LICENSE                         |   8
-rw-r--r-- | libavcodec/g723_1.c             |  28
-rw-r--r-- | libavcodec/vc1dec.c             |  34
-rw-r--r-- | libavcodec/wmaprodec.c          |   5
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c    |  14
-rw-r--r-- | libavcodec/x86/fft.c            |   6
-rw-r--r-- | libavcodec/x86/fft.h            |   6
-rw-r--r-- | libavcodec/x86/fft_mmx.asm      |  46
-rw-r--r-- | libavcodec/x86/fmtconvert.asm   |   6
-rw-r--r-- | libavcodec/x86/fmtconvert_mmx.c |  10
-rw-r--r-- | libavcodec/x86/h264dsp_mmx.c    | 472
-rw-r--r-- | libavcodec/x86/proresdsp.asm    |   6
-rw-r--r-- | libavutil/x86/x86inc.asm        |   2
-rw-r--r-- | tests/ref/fate/acodec-g723_1    |   4
14 files changed, 340 insertions, 307 deletions
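
One functional change worth noting before the diff: when the postfilter is disabled, the g723_1 decoder now doubles each output sample with 16-bit saturation (see the loop added in g723_1.c below). As a self-contained illustration — a plain-C restatement of the av_clip_int16() behaviour that change relies on, not the FFmpeg macro itself:

```c
#include <stdint.h>
#include <stdio.h>

/* Saturating clip to the int16_t range, as used when doubling the
 * unfiltered G.723.1 output in the diff below. */
static int16_t clip_int16_sketch(int v)
{
    if (v >  32767) return  32767;
    if (v < -32768) return -32768;
    return (int16_t)v;
}

int main(void)
{
    int sample = 20000;
    /* Doubling 20000 gives 40000, which saturates to 32767. */
    printf("%d\n", clip_int16_sketch(sample << 1));
    return 0;
}
```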
diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -17,6 +17,14 @@ Specifically, the GPL parts of FFmpeg are
 - optional x86 optimizations in the files
   libavcodec/x86/idct_mmx.c
 - the X11 grabber in libavdevice/x11grab.c
+- the texi2pod.pl tool
+- the following filters in libavfilter:
+    - vf_blackframe.c
+    - vf_boxblur.c
+    - vf_cropdetect.c
+    - vf_delogo.c
+    - vf_hqdn3d.c
+    - vf_yadif.c
 
 There are a handful of files under other licensing terms, namely:
diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index c7f6ac9301..80d1a04698 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -52,7 +52,7 @@ typedef struct g723_1_context {
     int16_t prev_lsp[LPC_ORDER];
     int16_t prev_excitation[PITCH_MAX];
-    int16_t excitation[PITCH_MAX + FRAME_LEN];
+    int16_t excitation[PITCH_MAX + FRAME_LEN + 4];
     int16_t synth_mem[LPC_ORDER];
     int16_t fir_mem[LPC_ORDER];
     int     iir_mem[LPC_ORDER];
@@ -267,8 +267,10 @@ static int scale_vector(int16_t *vector, int length)
     bits  = normalize_bits(max, 15);
     scale = shift_table[bits];
 
-    for (i = 0; i < length; i++)
+    for (i = 0; i < length; i++) {
+        av_assert2(av_clipl_int32(vector[i] * (int64_t)scale << 1) == vector[i] * (int64_t)scale << 1);
         vector[i] = (vector[i] * scale) >> 3;
+    }
 
     return bits - 3;
 }
@@ -592,7 +594,10 @@ static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max,
     int i;
 
     pitch_lag = FFMIN(PITCH_MAX - 3, pitch_lag);
-    limit     = FFMIN(FRAME_LEN + PITCH_MAX - offset - length, pitch_lag + 3);
+    if (dir > 0)
+        limit = FFMIN(FRAME_LEN + PITCH_MAX - offset - length, pitch_lag + 3);
+    else
+        limit = pitch_lag + 3;
 
     for (i = pitch_lag - 3; i <= limit; i++) {
         ccr = ff_dot_product(buf, buf + dir * i, length)<<1;
@@ -967,7 +972,6 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     G723_1_Context *p  = avctx->priv_data;
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
-    int16_t *out;
     int dec_mode       = buf[0] & 3;
 
     PPFParam ppf[SUBFRAMES];
@@ -975,6 +979,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     int16_t lpc[SUBFRAMES * LPC_ORDER];
     int16_t acb_vector[SUBFRAME_LEN];
     int16_t *vector_ptr;
+    int16_t *out;
     int bad_frame = 0, i, j, ret;
 
     if (!buf_size || buf_size < frame_size[dec_mode]) {
@@ -995,8 +1000,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         return ret;
     }
-    out= (int16_t*)p->frame.data[0];
 
+    out = (int16_t *)p->frame.data[0];
     if (p->cur_frame_type == ACTIVE_FRAME) {
         if (!bad_frame)
@@ -1079,7 +1084,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
                PITCH_MAX * sizeof(*p->excitation));
     } else {
-        memset(out, 0, sizeof(int16_t)*FRAME_LEN);
+        memset(out, 0, FRAME_LEN * 2);
         av_log(avctx, AV_LOG_WARNING,
                "G.723.1: Comfort noise generation not supported yet\n");
         return frame_size[dec_mode];
@@ -1094,13 +1099,18 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                            0, 1, 1 << 12);
     memcpy(p->synth_mem, out + FRAME_LEN, LPC_ORDER * sizeof(int16_t));
 
-    if (p->postfilter)
+    if (p->postfilter) {
         formant_postfilter(p, lpc, out);
+    } else { // if output is not postfiltered it should be scaled by 2
+        for (i = 0; i < FRAME_LEN; i++)
+            out[LPC_ORDER + i] = av_clip_int16(out[LPC_ORDER + i] << 1);
+    }
 
     memmove(out, out + LPC_ORDER, sizeof(int16_t)*FRAME_LEN);
 
     p->frame.nb_samples = FRAME_LEN;
-    *(AVFrame*)data = p->frame;
-    *got_frame_ptr = 1;
+
+    *got_frame_ptr   = 1;
+    *(AVFrame *)data = p->frame;
 
     return frame_size[dec_mode];
 }
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 55d695bee2..8929a72107 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -1882,8 +1882,8 @@ static void vc1_interp_mc(VC1Context *v)
     }
 
     if (v->rangeredfrm || s->h_edge_pos < 22 || v_edge_pos < 22
-        || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 16 - s->mspel * 3
-        || (unsigned)(src_y - s->mspel) > v_edge_pos - (my & 3) - 16 - s->mspel * 3) {
+        || (unsigned)(src_x - 1) > s->h_edge_pos - (mx & 3) - 16 - 3
+        || (unsigned)(src_y - 1) > v_edge_pos - (my & 3) - 16 - 3) {
         uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize;
 
         srcY -= s->mspel * (1 + s->linesize);
@@ -1979,20 +1979,6 @@ static av_always_inline int scale_mv(int value, int bfrac, int inv, int qs)
 #endif
 }
 
-static av_always_inline int scale_mv_intfi(int value, int bfrac, int inv,
-                                           int qs, int qs_last)
-{
-    int n = bfrac;
-
-    if (inv)
-        n -= 256;
-    n <<= !qs_last;
-    if (!qs)
-        return (value * n + 255) >> 9;
-    else
-        return (value * n + 128) >> 8;
-}
-
 /** Reconstruct motion vector for B-frame and do motion compensation
  */
 static inline void vc1_b_mc(VC1Context *v, int dmv_x[2], int dmv_y[2],
@@ -2246,14 +2232,14 @@ static inline void vc1_pred_b_mv_intfi(VC1Context *v, int n, int *dmv_x, int *dm
     if (v->bmvtype == BMV_TYPE_DIRECT) {
         int total_opp, k, f;
         if (s->next_picture.f.mb_type[mb_pos + v->mb_off] != MB_TYPE_INTRA) {
-            s->mv[0][0][0] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
-                                            v->bfraction, 0, s->quarter_sample, v->qs_last);
-            s->mv[0][0][1] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
-                                            v->bfraction, 0, s->quarter_sample, v->qs_last);
-            s->mv[1][0][0] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
-                                            v->bfraction, 1, s->quarter_sample, v->qs_last);
-            s->mv[1][0][1] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
-                                            v->bfraction, 1, s->quarter_sample, v->qs_last);
+            s->mv[0][0][0] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
+                                      v->bfraction, 0, s->quarter_sample);
+            s->mv[0][0][1] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
+                                      v->bfraction, 0, s->quarter_sample);
+            s->mv[1][0][0] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
+                                      v->bfraction, 1, s->quarter_sample);
+            s->mv[1][0][1] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
+                                      v->bfraction, 1, s->quarter_sample);
 
             total_opp = v->mv_f_next[0][s->block_index[0] + v->blocks_off]
                       + v->mv_f_next[0][s->block_index[1] + v->blocks_off]
diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index b60a257329..788c8fa3e7 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -341,6 +341,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->avctx->sample_rate <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "invalid sample rate\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     s->num_channels = avctx->channels;
 
     if (s->num_channels < 0) {
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 85753e07f5..47b8ef1fc3 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2485,9 +2485,9 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
 }
 
 #if HAVE_6REGS
-static void vector_fmul_window_3dnow2(float *dst, const float *src0,
-                                      const float *src1, const float *win,
-                                      int len)
+static void vector_fmul_window_3dnowext(float *dst, const float *src0,
+                                        const float *src1, const float *win,
+                                        int len)
 {
     x86_reg i = -len * 4;
     x86_reg j =  len * 4 - 8;
@@ -2939,11 +2939,11 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
 #endif
 }
 
-static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
-                                int mm_flags)
+static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
+                                  int mm_flags)
 {
 #if HAVE_6REGS && HAVE_INLINE_ASM
-    c->vector_fmul_window = vector_fmul_window_3dnow2;
+    c->vector_fmul_window = vector_fmul_window_3dnowext;
 #endif
 }
 
@@ -3194,7 +3194,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
             dsputil_init_3dnow(c, avctx, mm_flags);
 
         if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
-            dsputil_init_3dnow2(c, avctx, mm_flags);
+            dsputil_init_3dnowext(c, avctx, mm_flags);
 
         if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
             dsputil_init_sse(c, avctx, mm_flags);
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 3b6588ae8e..852c6b8d1e 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -34,9 +34,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
     }
     if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
         /* 3DNowEx for K7 */
-        s->imdct_calc = ff_imdct_calc_3dnow2;
-        s->imdct_half = ff_imdct_half_3dnow2;
-        s->fft_calc   = ff_fft_calc_3dnow2;
+        s->imdct_calc = ff_imdct_calc_3dnowext;
+        s->imdct_half = ff_imdct_half_3dnowext;
+        s->fft_calc   = ff_fft_calc_3dnowext;
     }
 #endif
     if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 405cabe517..3f8b21d95b 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -25,12 +25,12 @@ void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
 
 void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 73234388ab..f41381760d 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -93,14 +93,14 @@ cextern cos_ %+ i
 
 SECTION_TEXT
 
-%macro T2_3DN 4 ; z0, z1, mem0, mem1
+%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
     mova     %1, %3
     mova     %2, %1
     pfadd    %1, %4
     pfsub    %2, %4
 %endmacro
 
-%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
+%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
     mova     %5, %3
     pfsub    %3, %4
     pfadd    %5, %4 ; {t6,t5}
@@ -445,13 +445,13 @@ fft16_sse:
     ret
 
-%macro FFT48_3DN 0
+%macro FFT48_3DNOW 0
 align 16
 fft4 %+ SUFFIX:
-    T2_3DN   m0, m1, Z(0), Z(1)
+    T2_3DNOW m0, m1, Z(0), Z(1)
     mova     m2, Z(2)
     mova     m3, Z(3)
-    T4_3DN   m0, m1, m2, m3, m4, m5
+    T4_3DNOW m0, m1, m2, m3, m4, m5
     PUNPCK   m0, m1, m4
     PUNPCK   m2, m3, m5
     mova     Z(0), m0
@@ -462,14 +462,14 @@ fft4 %+ SUFFIX:
 
 align 16
 fft8 %+ SUFFIX:
-    T2_3DN   m0, m1, Z(0), Z(1)
+    T2_3DNOW m0, m1, Z(0), Z(1)
     mova     m2, Z(2)
     mova     m3, Z(3)
-    T4_3DN   m0, m1, m2, m3, m4, m5
+    T4_3DNOW m0, m1, m2, m3, m4, m5
     mova     Z(0), m0
     mova     Z(2), m2
-    T2_3DN   m4, m5, Z(4), Z(5)
-    T2_3DN   m6, m7, Z2(6), Z2(7)
+    T2_3DNOW m4, m5, Z(4), Z(5)
+    T2_3DNOW m6, m7, Z2(6), Z2(7)
     PSWAPD   m0, m5
     PSWAPD   m2, m7
     pxor     m0, [ps_m1p1]
@@ -478,12 +478,12 @@ fft8 %+ SUFFIX:
     pfadd    m7, m2
     pfmul    m5, [ps_root2]
     pfmul    m7, [ps_root2]
-    T4_3DN   m1, m3, m5, m7, m0, m2
+    T4_3DNOW m1, m3, m5, m7, m0, m2
     mova     Z(5), m5
     mova     Z2(7), m7
     mova     m0, Z(0)
     mova     m2, Z(2)
-    T4_3DN   m0, m2, m4, m6, m5, m7
+    T4_3DNOW m0, m2, m4, m6, m5, m7
     PUNPCK   m0, m1, m5
     PUNPCK   m2, m3, m7
     mova     Z(0), m0
@@ -501,7 +501,7 @@ fft8 %+ SUFFIX:
 
 %if ARCH_X86_32
 %macro PSWAPD 2
-%if cpuflag(3dnow2)
+%if cpuflag(3dnowext)
     pswapd %1, %2
 %elifidn %1, %2
     movd [r0+12], %1
@@ -513,11 +513,11 @@ fft8 %+ SUFFIX:
 %endif
 %endmacro
 
-INIT_MMX 3dnow2
-FFT48_3DN
+INIT_MMX 3dnowext
+FFT48_3DNOW
 
 INIT_MMX 3dnow
-FFT48_3DN
+FFT48_3DNOW
 %endif
 
 %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
@@ -634,7 +634,7 @@ cglobal fft_calc, 2,5,8
 %if ARCH_X86_32
 INIT_MMX 3dnow
 FFT_CALC_FUNC
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 FFT_CALC_FUNC
 %endif
 
 INIT_XMM sse
@@ -728,7 +728,7 @@ cglobal imdct_calc, 3,5,3
 %if ARCH_X86_32
 INIT_MMX 3dnow
 IMDCT_CALC_FUNC
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 IMDCT_CALC_FUNC
 %endif
 
@@ -744,8 +744,8 @@ INIT_MMX 3dnow
 %define unpckhps punpckhdq
 DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
-%define pass_3dnow2 pass_3dnow
-%define pass_interleave_3dnow2 pass_interleave_3dnow
+%define pass_3dnowext pass_3dnow
+%define pass_interleave_3dnowext pass_interleave_3dnow
 %endif
 
 %ifdef PIC
@@ -814,7 +814,7 @@ DECL_FFT 5, _interleave
 INIT_MMX 3dnow
 DECL_FFT 4
 DECL_FFT 4, _interleave
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 DECL_FFT 4
 DECL_FFT 4, _interleave
 %endif
@@ -846,7 +846,7 @@ INIT_XMM sse
     PSWAPD   m5, m3
     pfmul    m2, m3
     pfmul    m6, m5
-%if cpuflag(3dnow2)
+%if cpuflag(3dnowext)
     pfpnacc  m0, m4
     pfpnacc  m2, m6
 %else
@@ -1019,7 +1019,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     xor  r4, r4
     sub  r4, r3
 %endif
-%if notcpuflag(3dnow2) && mmsize == 8
+%if notcpuflag(3dnowext) && mmsize == 8
     movd m7, [ps_m1m1m1m1]
 %endif
 .pre:
@@ -1103,7 +1103,7 @@ DECL_IMDCT POSROTATESHUF
 INIT_MMX 3dnow
 DECL_IMDCT POSROTATESHUF_3DNOW
 
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 DECL_IMDCT POSROTATESHUF_3DNOW
 %endif
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 9499a9e3a7..7368b8f518 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -249,7 +249,7 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
 %macro PSWAPD_SSE 2
     pshufw %1, %2, 0x4e
 %endmacro
-%macro PSWAPD_3DN1 2
+%macro PSWAPD_3DNOW 2
     movq %1, %2
     psrlq %1, 32
     punpckldq %1, %2
@@ -306,10 +306,10 @@ cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4,
 %define pswapd PSWAPD_SSE
 FLOAT_TO_INT16_INTERLEAVE6 sse
 %define cvtps2pi pf2id
-%define pswapd PSWAPD_3DN1
+%define pswapd PSWAPD_3DNOW
 FLOAT_TO_INT16_INTERLEAVE6 3dnow
 %undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
+FLOAT_TO_INT16_INTERLEAVE6 3dnowext
 %undef cvtps2pi
 
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 8c9c43f662..814a17f631 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -46,7 +46,7 @@ void ff_float_to_int16_interleave2_sse2(int16_t *dst, const float **src, long l
 
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
 
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
@@ -74,9 +74,11 @@ FLOAT_TO_INT16_INTERLEAVE(3dnow)
 FLOAT_TO_INT16_INTERLEAVE(sse)
 FLOAT_TO_INT16_INTERLEAVE(sse2)
 
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
+static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
+                                               long len, int channels)
+{
     if(channels==6)
-        ff_float_to_int16_interleave6_3dn2(dst, src, len);
+        ff_float_to_int16_interleave6_3dnowext(dst, src, len);
     else
         float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
@@ -126,7 +128,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
     }
     if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) {
         if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-            c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
+            c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
         }
     }
     if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index c18a4f56ce..bb77a96999 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -25,8 +25,10 @@
 /***********************************/
 /* IDCT */
 
-#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride);
+#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
+void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,   \
+                                                       int16_t *block, \
+                                                       int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 10, sse2)
@@ -44,10 +46,10 @@ IDCT_ADD_FUNC(8, 10, avx)
 #endif
 
-#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t *dst, const int *block_offset, \
-     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)                   \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t *dst, const int *block_offset,                       \
+     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
 
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
@@ -68,10 +70,11 @@ IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
 #endif
 
-#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t **dst, const int *block_offset, \
-     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)                  \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t **dst, const int *block_offset,                      \
+     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
+
 IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
 IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
 IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
@@ -80,7 +83,7 @@ IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, avx)
 #endif
 
-void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
+void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul);
 void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
 
 /***********************************/
@@ -91,273 +94,292 @@ void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
                                        int bidir, int edges, int step,
                                        int mask_mv0, int mask_mv1, int field);
 
-#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
-                                                                int alpha, int beta, int8_t *tc0);
+#define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
+                                                               int stride,   \
+                                                               int alpha,    \
+                                                               int beta,     \
+                                                               int8_t *tc0);
 #define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
-                                                                int alpha, int beta);
-
-#define LF_FUNCS(type, depth)\
-LF_FUNC (h, chroma, depth, mmx2)\
-LF_IFUNC(h, chroma_intra, depth, mmx2)\
-LF_FUNC (v, chroma, depth, mmx2)\
-LF_IFUNC(v, chroma_intra, depth, mmx2)\
-LF_FUNC (h, luma, depth, mmx2)\
-LF_IFUNC(h, luma_intra, depth, mmx2)\
-LF_FUNC (h, luma, depth, sse2)\
-LF_IFUNC(h, luma_intra, depth, sse2)\
-LF_FUNC (v, luma, depth, sse2)\
-LF_IFUNC(v, luma_intra, depth, sse2)\
-LF_FUNC (h, chroma, depth, sse2)\
-LF_IFUNC(h, chroma_intra, depth, sse2)\
-LF_FUNC (v, chroma, depth, sse2)\
-LF_IFUNC(v, chroma_intra, depth, sse2)\
-LF_FUNC (h, luma, depth, avx)\
-LF_IFUNC(h, luma_intra, depth, avx)\
-LF_FUNC (v, luma, depth, avx)\
-LF_IFUNC(v, luma_intra, depth, avx)\
-LF_FUNC (h, chroma, depth, avx)\
-LF_IFUNC(h, chroma_intra, depth, avx)\
-LF_FUNC (v, chroma, depth, avx)\
-LF_IFUNC(v, chroma_intra, depth, avx)
-
-LF_FUNCS( uint8_t, 8)
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
+                                                               int stride,   \
+                                                               int alpha,    \
+                                                               int beta);
+
+#define LF_FUNCS(type, depth)          \
+LF_FUNC(h, chroma, depth, mmx2)        \
+LF_IFUNC(h, chroma_intra, depth, mmx2) \
+LF_FUNC(v, chroma, depth, mmx2)        \
+LF_IFUNC(v, chroma_intra, depth, mmx2) \
+LF_FUNC(h, luma, depth, mmx2)          \
+LF_IFUNC(h, luma_intra, depth, mmx2)   \
+LF_FUNC(h, luma, depth, sse2)          \
+LF_IFUNC(h, luma_intra, depth, sse2)   \
+LF_FUNC(v, luma, depth, sse2)          \
+LF_IFUNC(v, luma_intra, depth, sse2)   \
+LF_FUNC(h, chroma, depth, sse2)        \
+LF_IFUNC(h, chroma_intra, depth, sse2) \
+LF_FUNC(v, chroma, depth, sse2)        \
+LF_IFUNC(v, chroma_intra, depth, sse2) \
+LF_FUNC(h, luma, depth, avx)           \
+LF_IFUNC(h, luma_intra, depth, avx)    \
+LF_FUNC(v, luma, depth, avx)           \
+LF_IFUNC(v, luma_intra, depth, avx)    \
+LF_FUNC(h, chroma, depth, avx)         \
+LF_IFUNC(h, chroma_intra, depth, avx)  \
+LF_FUNC(v, chroma, depth, avx)         \
+LF_IFUNC(v, chroma_intra, depth, avx)
+
+LF_FUNCS(uint8_t, 8)
 LF_FUNCS(uint16_t, 10)
 
 #if ARCH_X86_32 && HAVE_YASM
-LF_FUNC (v8, luma, 8, mmx2)
-static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+LF_FUNC(v8, luma, 8, mmx2)
+static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0)
 {
-    if((tc0[0] & tc0[1]) >= 0)
-        ff_deblock_v8_luma_8_mmx2(pix+0, stride, alpha, beta, tc0);
-    if((tc0[2] & tc0[3]) >= 0)
-        ff_deblock_v8_luma_8_mmx2(pix+8, stride, alpha, beta, tc0+2);
+    if ((tc0[0] & tc0[1]) >= 0)
+        ff_deblock_v8_luma_8_mmx2(pix + 0, stride, alpha, beta, tc0);
+    if ((tc0[2] & tc0[3]) >= 0)
+        ff_deblock_v8_luma_8_mmx2(pix + 8, stride, alpha, beta, tc0 + 2);
 }
-LF_IFUNC(v8, luma_intra, 8, mmx2)
-static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+
+LF_IFUNC(v8, luma_intra, 8, mmx2)
+static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride,
+                                           int alpha, int beta)
 {
-    ff_deblock_v8_luma_intra_8_mmx2(pix+0, stride, alpha, beta);
-    ff_deblock_v8_luma_intra_8_mmx2(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmx2(pix + 0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmx2(pix + 8, stride, alpha, beta);
 }
 #endif /* ARCH_X86_32 */
 
-LF_FUNC (v, luma, 10, mmx2)
-LF_IFUNC(v, luma_intra, 10, mmx2)
+LF_FUNC(v, luma, 10, mmx2)
+LF_IFUNC(v, luma_intra, 10, mmx2)
 
 /***********************************/
 /* weighted prediction */
 
-#define H264_WEIGHT(W, OPT) \
-void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
-    int stride, int height, int log2_denom, int weight, int offset);
+#define H264_WEIGHT(W, OPT)                                      \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride,  \
+                                      int height, int log2_denom, \
+                                      int weight, int offset);
 
-#define H264_BIWEIGHT(W, OPT) \
-void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
-    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
-    int weights, int offset);
+#define H264_BIWEIGHT(W, OPT)                                        \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,  \
+                                        int stride, int height,      \
+                                        int log2_denom, int weightd, \
+                                        int weights, int offset);
 
-#define H264_BIWEIGHT_MMX(W) \
-H264_WEIGHT (W, mmx2) \
-H264_BIWEIGHT(W, mmx2)
+#define H264_BIWEIGHT_MMX(W) \
+    H264_WEIGHT(W, mmx2)     \
+    H264_BIWEIGHT(W, mmx2)
 
-#define H264_BIWEIGHT_MMX_SSE(W) \
-H264_BIWEIGHT_MMX(W) \
-H264_WEIGHT (W, sse2) \
-H264_BIWEIGHT (W, sse2) \
-H264_BIWEIGHT (W, ssse3)
+#define H264_BIWEIGHT_MMX_SSE(W) \
+    H264_BIWEIGHT_MMX(W)         \
+    H264_WEIGHT(W, sse2)         \
+    H264_BIWEIGHT(W, sse2)       \
+    H264_BIWEIGHT(W, ssse3)
 
 H264_BIWEIGHT_MMX_SSE(16)
-H264_BIWEIGHT_MMX_SSE( 8)
-H264_BIWEIGHT_MMX ( 4)
-
-#define H264_WEIGHT_10(W, DEPTH, OPT) \
-void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
-    int stride, int height, int log2_denom, int weight, int offset);
-
-#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
-void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
-     int weightd, int weights, int offset);
-
-#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
-H264_WEIGHT_10 (W, DEPTH, sse2) \
-H264_WEIGHT_10 (W, DEPTH, sse4) \
-H264_BIWEIGHT_10(W, DEPTH, sse2) \
-H264_BIWEIGHT_10(W, DEPTH, sse4)
+H264_BIWEIGHT_MMX_SSE(8)
+H264_BIWEIGHT_MMX(4)
+
+#define H264_WEIGHT_10(W, DEPTH, OPT)                               \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,   \
+                                                    int stride,     \
+                                                    int height,     \
+                                                    int log2_denom, \
+                                                    int weight,     \
+                                                    int offset);
+
+#define H264_BIWEIGHT_10(W, DEPTH, OPT)                               \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,   \
+                                                      uint8_t *src,   \
+                                                      int stride,     \
+                                                      int height,     \
+                                                      int log2_denom, \
+                                                      int weightd,    \
+                                                      int weights,    \
+                                                      int offset);
+
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
+    H264_WEIGHT_10(W, DEPTH, sse2)     \
+    H264_WEIGHT_10(W, DEPTH, sse4)     \
+    H264_BIWEIGHT_10(W, DEPTH, sse2)   \
+    H264_BIWEIGHT_10(W, DEPTH, sse4)
 
 H264_BIWEIGHT_10_SSE(16, 10)
-H264_BIWEIGHT_10_SSE( 8, 10)
-H264_BIWEIGHT_10_SSE( 4, 10)
+H264_BIWEIGHT_10_SSE(8, 10)
+H264_BIWEIGHT_10_SSE(4, 10)
 
-void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
+void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
+                         const int chroma_format_idc)
 {
 #if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
 
-    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
+    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2)
         c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2;
-    }
 
     if (bit_depth == 8) {
-    if (mm_flags & AV_CPU_FLAG_MMX) {
-        c->h264_idct_dc_add  =
-        c->h264_idct_add     = ff_h264_idct_add_8_mmx;
-        c->h264_idct8_dc_add =
-        c->h264_idct8_add    = ff_h264_idct8_add_8_mmx;
-
-        c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
-        c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
-        if (chroma_format_idc == 1)
-            c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
-        c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
-        if (mm_flags & AV_CPU_FLAG_CMOV)
-            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
-
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
-            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmx2;
-            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
-            c->h264_idct_add16   = ff_h264_idct_add16_8_mmx2;
-            c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmx2;
+        if (mm_flags & AV_CPU_FLAG_MMX) {
+            c->h264_idct_dc_add  =
+            c->h264_idct_add     = ff_h264_idct_add_8_mmx;
+            c->h264_idct8_dc_add =
+            c->h264_idct8_add    = ff_h264_idct8_add_8_mmx;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
             if (chroma_format_idc == 1)
-                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2;
-            c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2;
-
-            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmx2;
-            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmx2;
-            if (chroma_format_idc == 1) {
-                c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmx2;
-                c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmx2;
-            }
+                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
+            if (mm_flags & AV_CPU_FLAG_CMOV)
+                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
+
+            if (mm_flags & AV_CPU_FLAG_MMX2) {
+                c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmx2;
+                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
+                c->h264_idct_add16   = ff_h264_idct_add16_8_mmx2;
+                c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmx2;
+                if (chroma_format_idc == 1)
+                    c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2;
+                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx2;
+
+                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmx2;
+                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmx2;
+                if (chroma_format_idc == 1) {
+                    c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmx2;
+                    c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmx2;
+                }
 #if ARCH_X86_32
-            c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmx2;
-            c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmx2;
-            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2;
-            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
-#endif
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
+                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_mmx2;
+                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmx2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
+#endif /* ARCH_X86_32 */
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmx2;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmx2;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmx2;
 
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmx2;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmx2;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmx2;
 
-            if (mm_flags&AV_CPU_FLAG_SSE2) {
-                c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+                    c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
 
-                c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
-                c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
-                if (chroma_format_idc == 1)
-                    c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
-                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
-                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
+                    c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
+                    c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
+                    if (chroma_format_idc == 1)
+                        c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
+                    c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
+                    c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
 
-                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
-                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
+                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
+                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
 
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
 
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
-#endif
-            }
-            if (mm_flags&AV_CPU_FLAG_SSSE3) {
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
-            }
-            if (HAVE_AVX && mm_flags&AV_CPU_FLAG_AVX) {
+                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
+                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+                }
+                if (mm_flags & AV_CPU_FLAG_SSSE3) {
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
+                }
+                if (HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
-#endif
+                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
+                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#endif /* HAVE_ALIGNED_STACK */
+                }
             }
         }
-    }
     } else if (bit_depth == 10) {
-    if (mm_flags & AV_CPU_FLAG_MMX) {
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMX) {
+            if (mm_flags & AV_CPU_FLAG_MMX2) {
 #if ARCH_X86_32
-            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmx2;
-            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmx2;
-            c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmx2;
-            c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmx2;
-            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmx2;
-            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmx2;
-#endif
-            c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2;
-            if (mm_flags&AV_CPU_FLAG_SSE2) {
-                c->h264_idct_add     = ff_h264_idct_add_10_sse2;
-                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
-
-                c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
-                if (chroma_format_idc == 1)
-                    c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
-                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;
+                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmx2;
+                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2;
+                c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmx2;
+                c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmx2;
+                c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmx2;
+                c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmx2;
+#endif /* ARCH_X86_32 */
+                c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmx2;
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+                    c->h264_idct_add     = ff_h264_idct_add_10_sse2;
+                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
+
+                    c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
+                    if (chroma_format_idc == 1)
+                        c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
+                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
 #if HAVE_ALIGNED_STACK
-                c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
-                c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
-#endif
+                    c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
+                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
 
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
 
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
 
-                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
-                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
+                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
+                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
-#endif
-            }
-            if (mm_flags&AV_CPU_FLAG_SSE4) {
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
-
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
-            }
+                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
+                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+                }
+                if (mm_flags & AV_CPU_FLAG_SSE4) {
+                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
+                }
 #if HAVE_AVX
-            if (mm_flags&AV_CPU_FLAG_AVX) {
-                c->h264_idct_dc_add  =
-                c->h264_idct_add     = ff_h264_idct_add_10_avx;
-                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
-
-                c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
-                if (chroma_format_idc == 1)
-                    c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
-                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;
+                if (mm_flags & AV_CPU_FLAG_AVX) {
+                    c->h264_idct_dc_add  =
+                    c->h264_idct_add     = ff_h264_idct_add_10_avx;
+                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
+
+                    c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
+                    if (chroma_format_idc == 1)
+                        c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
+                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
 #if HAVE_ALIGNED_STACK
-                c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
-                c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
-#endif
+                    c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
+                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
 
-                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
-                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
+                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
+                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
-#endif
-            }
+                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_avx;
+                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_avx;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
+                }
 #endif /* HAVE_AVX */
+            }
         }
     }
-    }
-#endif
+#endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 01e5deec93..d626fdce08 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -301,12 +301,12 @@ cglobal prores_idct_put_10, 4, 4, %1
     RET
 %endmacro
 
-%macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp
-%if cpuflag(sse4)
+%macro SIGNEXTEND 2-3
+%if cpuflag(sse4) ; dstlow, dsthigh
     movhlps     %2, %1
     pmovsxwd    %1, %1
     pmovsxwd    %2, %2
-%else ; sse2
+%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
     pxor        %3, %3
     pcmpgtw     %3, %1
     mova        %2, %1
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 95d707bd7b..d29740f278 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -590,7 +590,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %assign cpuflags_mmx      (1<<0)
 %assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
 %assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
-%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
diff --git a/tests/ref/fate/acodec-g723_1 b/tests/ref/fate/acodec-g723_1
index 38b1a1155a..1060042657 100644
--- a/tests/ref/fate/acodec-g723_1
+++ b/tests/ref/fate/acodec-g723_1
@@ -1,4 +1,4 @@
 dec0deb2425e908d232d2471acff04a3 *tests/data/fate/acodec-g723_1.g723_1
 4800 tests/data/fate/acodec-g723_1.g723_1
-90b20555c962b638dad0e98ac2c05b25 *tests/data/fate/acodec-g723_1.out.wav
-stddev: 8418.34 PSNR: 17.82 MAXDIFF:52968 bytes: 95992/ 96000
+87fd529c9e41914f73a865d147cc9516 *tests/data/fate/acodec-g723_1.out.wav
+stddev: 8425.98 PSNR: 17.82 MAXDIFF:53268 bytes: 95992/ 96000
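
A note on the x86inc.asm hunk: each cpuflags_* constant ORs in the flags it implies, so the assembler's cpuflag(x) test is a single mask compare. A hedged C mirror of that scheme — the enum and function names here are illustrative, not part of FFmpeg:

```c
#include <stdio.h>

/* Hierarchical capability masks, mirroring the x86inc.asm cpuflags
 * scheme renamed in the hunk above: each flag includes its prerequisites. */
enum {
    CPUFLAG_MMX      = 1 << 0,
    CPUFLAG_MMX2     = (1 << 1) | CPUFLAG_MMX,
    CPUFLAG_3DNOW    = (1 << 2) | CPUFLAG_MMX,
    CPUFLAG_3DNOWEXT = (1 << 3) | CPUFLAG_3DNOW, /* formerly "3dnow2" */
    CPUFLAG_SSE      = (1 << 4) | CPUFLAG_MMX2,
};

/* Containment test: does |target| include everything |flag| implies?
 * This is, in spirit, what x86inc.asm's cpuflag(x) macro expands to. */
static int cpuflag(int target, int flag)
{
    return (target & flag) == flag;
}

int main(void)
{
    printf("3dnowext implies 3dnow: %d\n",
           cpuflag(CPUFLAG_3DNOWEXT, CPUFLAG_3DNOW)); /* prints 1 */
    printf("sse implies 3dnow:      %d\n",
           cpuflag(CPUFLAG_SSE, CPUFLAG_3DNOW));      /* prints 0 */
    return 0;
}
```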