author     Michael Niedermayer <michaelni@gmx.at>    2012-06-30 22:44:18 +0200
committer  Michael Niedermayer <michaelni@gmx.at>    2012-06-30 22:44:18 +0200
commit     64b25938e90253432d28ffd7d971f085c560a523 (patch)
tree       56ba6a39d7e8f14ebccdcc4db935164b5da624a2 /libavcodec
parent     be24f85176d8e46c3154f1f51013f235b273183e (diff)
parent     ceabc13f129cd6344b1eebdbe10119083fe5520e (diff)
Merge remote-tracking branch 'qatar/master'
* qatar/master:
dsputilenc_mmx: split assignment of ff_sse16_sse2 to SSE2 section.
dnxhdenc: add space between function argument type and comment.
x86: fmtconvert: add special asm for float_to_int16_interleave_misc_*
attributes: Add a definition of av_always_inline for MSVC
cmdutils: Pass the actual chosen encoder to filter_codec_opts
os_support: Add fallback definitions for stat flags
os_support: Rename the poll fallback function to ff_poll
network: Check for struct pollfd
os_support: Don't compare a negative number against socket descriptors
os_support: Include all the necessary headers for the win32 open function
x86: vc1: fix and enable optimised loop filter
Conflicts:
cmdutils.c
cmdutils.h
ffmpeg.c
ffplay.c
libavformat/os_support.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
 libavcodec/dnxhdenc.h           |  2
 libavcodec/x86/dsputilenc_mmx.c |  9
 libavcodec/x86/fmtconvert.asm   | 78
 libavcodec/x86/fmtconvert_mmx.c | 12
 libavcodec/x86/vc1dsp_mmx.c     |  2
 libavcodec/x86/vc1dsp_yasm.asm  |  5
 6 files changed, 96 insertions, 12 deletions
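The point of the new float_to_int16_step_* kernels added below in fmtconvert.asm is to write each converted sample at a caller-chosen stride, which is what lets float_to_int16_interleave_misc_* in fmtconvert_mmx.c drop its temporary buffer and scatter loop. As a rough scalar model only (not part of the patch; the *_c names are hypothetical, and the code assumes FFmpeg's convention that the input floats are already scaled to the int16 range):

#include <math.h>
#include <stdint.h>

/* Hypothetical scalar reference for the strided conversion: convert len
 * floats and store them every `step` output samples, saturating the same
 * way packssdw does in the asm. */
void float_to_int16_step_c(int16_t *dst, const float *src, long len, long step)
{
    long i;
    for (i = 0; i < len; i++) {
        long v = lrintf(src[i]);            /* round to nearest integer */
        if (v >  32767) v =  32767;         /* saturate to int16 range  */
        if (v < -32768) v = -32768;
        dst[i * step] = (int16_t)v;
    }
}

/* With such a kernel, interleaving `channels` planar inputs needs no
 * temporary buffer: each channel is converted straight into its slots. */
void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                 long len, int channels)
{
    int c;
    for (c = 0; c < channels; c++)
        float_to_int16_step_c(dst + c, src[c], len, channels);
}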
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index 279a978cd3..640bbd3995 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -90,7 +90,7 @@ typedef struct DNXHDEncContext {
     RCCMPEntry *mb_cmp;
     RCEntry (*mb_rc)[8160];
 
-    void (*get_pixels_8x4_sym)(DCTELEM */*align 16*/, const uint8_t *, int);
+    void (*get_pixels_8x4_sym)(DCTELEM * /*align 16*/, const uint8_t *, int);
 } DNXHDEncContext;
 
 void ff_dnxhd_init_mmx(DNXHDEncContext *ctx);
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index bf439ca5df..00e0a3fc37 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -1128,8 +1128,8 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #endif
 
         c->pix_norm1 = pix_norm1_mmx;
-        c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
-        c->sse[1] = sse8_mmx;
+        c->sse[0]  = sse16_mmx;
+        c->sse[1]  = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
 
         c->nsse[0] = nsse16_mmx;
@@ -1165,10 +1165,13 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
             if (bit_depth <= 8)
                 c->get_pixels = get_pixels_sse2;
             c->sum_abs_dctelem= sum_abs_dctelem_sse2;
-#if HAVE_YASM && HAVE_ALIGNED_STACK
+#if HAVE_YASM
+            c->sse[0] = ff_sse16_sse2;
+#if HAVE_ALIGNED_STACK
             c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
             c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
 #endif
+#endif
         }
 
 #if HAVE_SSSE3
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 72bee55669..9499a9e3a7 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -115,6 +115,84 @@ FLOAT_TO_INT16 sse, 0
 FLOAT_TO_INT16 3dnow, 0
 %undef cvtps2pi
 
+;------------------------------------------------------------------------------
+; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_STEP 2
+cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
+    add       lenq, lenq
+    lea       srcq, [srcq+2*lenq]
+    lea     step3q, [stepq*3]
+    neg       lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq    m0, [srcq+2*lenq   ]
+    cvtps2dq    m1, [srcq+2*lenq+16]
+    packssdw    m0, m1
+    movd       v1d, m0
+    psrldq      m0, 4
+    movd       v2d, m0
+    psrldq      m0, 4
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+    movd       v1d, m0
+    psrldq      m0, 4
+    movd       v2d, m0
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+%else
+    cvtps2pi    m0, [srcq+2*lenq   ]
+    cvtps2pi    m1, [srcq+2*lenq+ 8]
+    cvtps2pi    m2, [srcq+2*lenq+16]
+    cvtps2pi    m3, [srcq+2*lenq+24]
+    packssdw    m0, m1
+    packssdw    m2, m3
+    movd       v1d, m0
+    psrlq       m0, 32
+    movd       v2d, m0
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+    movd       v1d, m2
+    psrlq       m2, 32
+    movd       v2d, m2
+    mov     [dstq], v1w
+    mov  [dstq+stepq*4], v2w
+    shr        v1d, 16
+    shr        v2d, 16
+    mov  [dstq+stepq*2], v1w
+    mov  [dstq+step3q*2], v2w
+    lea       dstq, [dstq+stepq*8]
+%endif
+    add       lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16_STEP sse2, 2
+INIT_MMX
+FLOAT_TO_INT16_STEP sse, 0
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_STEP 3dnow, 0
+%undef cvtps2pi
 ;-------------------------------------------------------------------------------
 ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index ca0b29344a..8c9c43f662 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -25,6 +25,7 @@
 #include "libavutil/cpu.h"
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/fmtconvert.h"
+#include "libavcodec/dsputil.h"
 
 #if HAVE_YASM
 
@@ -35,6 +36,10 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
+void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse  (int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
+
 void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
 void ff_float_to_int16_interleave2_sse  (int16_t *dst, const float **src, long len);
 void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
@@ -48,12 +53,9 @@ void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len
 #define FLOAT_TO_INT16_INTERLEAVE(cpu) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
-    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
-    int i,j,c;\
+    int c;\
     for(c=0; c<channels; c++){\
-        ff_float_to_int16_##cpu(tmp, src[c], len);\
-        for(i=0, j=c; i<len; i++, j+=channels)\
-            dst[j] = tmp[i];\
+        ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
     }\
 }\
 \
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 4adeabca3b..bddac5ec77 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -809,7 +809,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 #if HAVE_YASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
     }
-    return;
+
     if (mm_flags & AV_CPU_FLAG_MMX2) {
         ASSIGN_LF(mmx2);
     }
diff --git a/libavcodec/x86/vc1dsp_yasm.asm b/libavcodec/x86/vc1dsp_yasm.asm
index b897580b76..590aa509a7 100644
--- a/libavcodec/x86/vc1dsp_yasm.asm
+++ b/libavcodec/x86/vc1dsp_yasm.asm
@@ -119,7 +119,9 @@ section .text
    pand       m2, m6
    pand       m3, m2           ; d final
 
-   PSIGNW     m3, m7
+   psraw      m7, 15
+   pxor       m3, m7
+   psubw      m3, m7
    psubw      m0, m3
    paddw      m1, m3
    packuswb   m0, m0
@@ -284,7 +286,6 @@ cglobal vc1_h_loop_filter8_sse2, 3,6,8
    RET
 
 %define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
 
 INIT_MMX
 ; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
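The vc1dsp_yasm.asm hunks above drop the SSSE3-only PSIGNW macro from the loop filter and open-code the sign transfer with psraw/pxor/psubw, which is what allows the optimised filter to be enabled on plain MMX2/SSE2 CPUs. A scalar sketch of the per-lane identity being used (illustrative only, not taken from the patch):

#include <stdint.h>

/* Model of psraw m7,15 / pxor m3,m7 / psubw m3,m7 on one 16-bit lane:
 * returns a unchanged when b >= 0 and -a (with wraparound) when b < 0. */
int16_t sign_apply(int16_t a, int16_t b)
{
    int16_t mask = b < 0 ? -1 : 0;       /* psraw b, 15: all-ones if b < 0  */
    return (int16_t)((a ^ mask) - mask); /* pxor + psubw: conditional negate */
}

Unlike the real psignw instruction, this sequence does not zero the lane when b is zero; the filter presumably does not depend on that case, which is why the three-instruction replacement can stand in for PSIGNW here.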