author     Michael Niedermayer <michaelni@gmx.at>    2012-06-30 22:44:18 +0200
committer  Michael Niedermayer <michaelni@gmx.at>    2012-06-30 22:44:18 +0200
commit     64b25938e90253432d28ffd7d971f085c560a523 (patch)
tree       56ba6a39d7e8f14ebccdcc4db935164b5da624a2 /libavcodec
parent     be24f85176d8e46c3154f1f51013f235b273183e (diff)
parent     ceabc13f129cd6344b1eebdbe10119083fe5520e (diff)
Merge remote-tracking branch 'qatar/master'
* qatar/master:
dsputilenc_mmx: split assignment of ff_sse16_sse2 to SSE2 section.
dnxhdenc: add space between function argument type and comment.
x86: fmtconvert: add special asm for float_to_int16_interleave_misc_*
attributes: Add a definition of av_always_inline for MSVC
cmdutils: Pass the actual chosen encoder to filter_codec_opts
os_support: Add fallback definitions for stat flags
os_support: Rename the poll fallback function to ff_poll
network: Check for struct pollfd
os_support: Don't compare a negative number against socket descriptors
os_support: Include all the necessary headers for the win32 open function
x86: vc1: fix and enable optimised loop filter
Conflicts:
cmdutils.c
cmdutils.h
ffmpeg.c
ffplay.c
libavformat/os_support.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
 libavcodec/dnxhdenc.h           |  2
 libavcodec/x86/dsputilenc_mmx.c |  9
 libavcodec/x86/fmtconvert.asm   | 78
 libavcodec/x86/fmtconvert_mmx.c | 12
 libavcodec/x86/vc1dsp_mmx.c     |  2
 libavcodec/x86/vc1dsp_yasm.asm  |  5
 6 files changed, 96 insertions, 12 deletions
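The point of the new float_to_int16_step_* kernels added below in fmtconvert.asm is to write each converted sample at a caller-chosen stride, which is what lets float_to_int16_interleave_misc_* in fmtconvert_mmx.c drop its temporary buffer and scatter loop. As a rough scalar model only (not part of the patch; the *_c names are hypothetical, and the code assumes FFmpeg's convention that the input floats are already scaled to the int16 range):

#include <math.h>
#include <stdint.h>

/* Hypothetical scalar reference for the strided conversion: convert len
 * floats and store them every `step` output samples, saturating the same
 * way packssdw does in the asm. */
void float_to_int16_step_c(int16_t *dst, const float *src, long len, long step)
{
    long i;
    for (i = 0; i < len; i++) {
        long v = lrintf(src[i]);            /* round to nearest integer */
        if (v >  32767) v =  32767;         /* saturate to int16 range  */
        if (v < -32768) v = -32768;
        dst[i * step] = (int16_t)v;
    }
}

/* With such a kernel, interleaving `channels` planar inputs needs no
 * temporary buffer: each channel is converted straight into its slots. */
void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                 long len, int channels)
{
    int c;
    for (c = 0; c < channels; c++)
        float_to_int16_step_c(dst + c, src[c], len, channels);
}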
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index 279a978cd3..640bbd3995 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -90,7 +90,7 @@ typedef struct DNXHDEncContext {
     RCCMPEntry *mb_cmp;
     RCEntry (*mb_rc)[8160];
 
-    void (*get_pixels_8x4_sym)(DCTELEM */*align 16*/, const uint8_t *, int);
+    void (*get_pixels_8x4_sym)(DCTELEM * /*align 16*/, const uint8_t *, int);
 } DNXHDEncContext;
 
 void ff_dnxhd_init_mmx(DNXHDEncContext *ctx);
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index bf439ca5df..00e0a3fc37 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -1128,8 +1128,8 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #endif
 
         c->pix_norm1 = pix_norm1_mmx;
-        c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
-        c->sse[1] = sse8_mmx;
+        c->sse[0]  = sse16_mmx;
+        c->sse[1]  = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
 
         c->nsse[0] = nsse16_mmx;
@@ -1165,10 +1165,13 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
             if (bit_depth <= 8)
                 c->get_pixels = get_pixels_sse2;
             c->sum_abs_dctelem= sum_abs_dctelem_sse2;
-#if HAVE_YASM && HAVE_ALIGNED_STACK
+#if HAVE_YASM
+            c->sse[0] = ff_sse16_sse2;
+#if HAVE_ALIGNED_STACK
             c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
             c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
 #endif
+#endif
         }
 
 #if HAVE_SSSE3
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 72bee55669..9499a9e3a7 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -115,6 +115,84 @@ FLOAT_TO_INT16 sse, 0
 FLOAT_TO_INT16 3dnow, 0
 %undef cvtps2pi
 
+;------------------------------------------------------------------------------
+; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_STEP 2
+cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
+    add       lenq, lenq
+    lea       srcq, [srcq+2*lenq]
+    lea     step3q, [stepq*3]
+    neg       lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq    m0, [srcq+2*lenq   ]
+    cvtps2dq    m1, [srcq+2*lenq+16]
+    packssdw    m0, m1
+    movd       v1d, m0
+    psrldq      m0, 4
+    movd       v2d, m0
+    psrldq      m0, 4
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+    movd       v1d, m0
+    psrldq      m0, 4
+    movd       v2d, m0
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+%else
+    cvtps2pi    m0, [srcq+2*lenq   ]
+    cvtps2pi    m1, [srcq+2*lenq+ 8]
+    cvtps2pi    m2, [srcq+2*lenq+16]
+    cvtps2pi    m3, [srcq+2*lenq+24]
+    packssdw    m0, m1
+    packssdw    m2, m3
+    movd       v1d, m0
+    psrlq       m0, 32
+    movd       v2d, m0
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+    movd       v1d, m2
+    psrlq       m2, 32
+    movd       v2d, m2
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+%endif
+    add       lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16_STEP sse2, 2
+INIT_MMX
+FLOAT_TO_INT16_STEP sse, 0
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_STEP 3dnow, 0
+%undef cvtps2pi
 ;-------------------------------------------------------------------------------
 ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index ca0b29344a..8c9c43f662 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -25,6 +25,7 @@
 #include "libavutil/cpu.h"
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/fmtconvert.h"
+#include "libavcodec/dsputil.h"
 
 #if HAVE_YASM
 
@@ -35,6 +36,10 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
+void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse  (int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
+
 void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
 void ff_float_to_int16_interleave2_sse  (int16_t *dst, const float **src, long len);
 void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
@@ -48,12 +53,9 @@ void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len
 #define FLOAT_TO_INT16_INTERLEAVE(cpu) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
-    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
-    int i,j,c;\
+    int c;\
     for(c=0; c<channels; c++){\
-        ff_float_to_int16_##cpu(tmp, src[c], len);\
-        for(i=0, j=c; i<len; i++, j+=channels)\
-            dst[j] = tmp[i];\
+        ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
     }\
 }\
 \
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 4adeabca3b..bddac5ec77 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -809,7 +809,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 #if HAVE_YASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
     }
-    return;
+
     if (mm_flags & AV_CPU_FLAG_MMX2) {
         ASSIGN_LF(mmx2);
     }
diff --git a/libavcodec/x86/vc1dsp_yasm.asm b/libavcodec/x86/vc1dsp_yasm.asm
index b897580b76..590aa509a7 100644
--- a/libavcodec/x86/vc1dsp_yasm.asm
+++ b/libavcodec/x86/vc1dsp_yasm.asm
@@ -119,7 +119,9 @@ section .text
    pand       m2, m6
    pand       m3, m2           ; d final
 
-   PSIGNW     m3, m7
+   psraw      m7, 15
+   pxor       m3, m7
+   psubw      m3, m7
    psubw      m0, m3
    paddw      m1, m3
    packuswb   m0, m0
@@ -284,7 +286,6 @@ cglobal vc1_h_loop_filter8_sse2, 3,6,8
    RET
 
 %define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
 
 INIT_MMX
 ; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
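The vc1dsp_yasm.asm hunks above drop the SSSE3-only PSIGNW macro from the loop filter and open-code the sign transfer with psraw/pxor/psubw, which is what allows the optimised filter to be enabled on plain MMX2/SSE2 CPUs. A scalar sketch of the per-lane identity being used (illustrative only, not taken from the patch):

#include <stdint.h>

/* Model of psraw m7,15 / pxor m3,m7 / psubw m3,m7 on one 16-bit lane:
 * returns a unchanged when b >= 0 and -a (with wraparound) when b < 0. */
int16_t sign_apply(int16_t a, int16_t b)
{
    int16_t mask = b < 0 ? -1 : 0;       /* psraw b, 15: all-ones if b < 0  */
    return (int16_t)((a ^ mask) - mask); /* pxor + psubw: conditional negate */
}

Unlike the real psignw instruction, this sequence does not zero the lane when b is zero; the filter presumably does not depend on that case, which is why the three-instruction replacement can stand in for PSIGNW here.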