diff options
author | Diego Biurrun <diego@biurrun.de> | 2005-12-17 18:14:38 +0000 |
---|---|---|
committer | Diego Biurrun <diego@biurrun.de> | 2005-12-17 18:14:38 +0000 |
commit | 115329f16062074e11ccf3b89ead6176606c9696 (patch) | |
tree | e98aa993905a702688bf821737ab9a443969fc28 /libavcodec/i386 | |
parent | d76319b1ab716320f6e6a4d690b85fe4504ebd5b (diff) | |
download | ffmpeg-115329f16062074e11ccf3b89ead6176606c9696.tar.gz |
COSMETICS: Remove all trailing whitespace.
Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r-- | libavcodec/i386/cputest.c | 28 | ||||
-rw-r--r-- | libavcodec/i386/dsputil_mmx.c | 228 | ||||
-rw-r--r-- | libavcodec/i386/dsputil_mmx_avg.h | 22 | ||||
-rw-r--r-- | libavcodec/i386/dsputil_mmx_rnd.h | 2 | ||||
-rw-r--r-- | libavcodec/i386/fdct_mmx.c | 174 | ||||
-rw-r--r-- | libavcodec/i386/fft_sse.c | 20 | ||||
-rw-r--r-- | libavcodec/i386/h264dsp_mmx.c | 4 | ||||
-rw-r--r-- | libavcodec/i386/idct_mmx_xvid.c | 12 | ||||
-rw-r--r-- | libavcodec/i386/motion_est_mmx.c | 2 | ||||
-rw-r--r-- | libavcodec/i386/mpegvideo_mmx.c | 32 | ||||
-rw-r--r-- | libavcodec/i386/mpegvideo_mmx_template.c | 218 | ||||
-rw-r--r-- | libavcodec/i386/simple_idct_mmx.c | 26 | ||||
-rw-r--r-- | libavcodec/i386/vp3dsp_mmx.c | 2 | ||||
-rw-r--r-- | libavcodec/i386/vp3dsp_sse2.c | 24 |
14 files changed, 397 insertions, 397 deletions
diff --git a/libavcodec/i386/cputest.c b/libavcodec/i386/cputest.c index 593e0550db..f02c63d449 100644 --- a/libavcodec/i386/cputest.c +++ b/libavcodec/i386/cputest.c @@ -29,28 +29,28 @@ int mm_support(void) int eax, ebx, ecx, edx; int max_std_level, max_ext_level, std_caps=0, ext_caps=0; long a, c; - + __asm__ __volatile__ ( /* See if CPUID instruction is supported ... */ /* ... Get copies of EFLAGS into eax and ecx */ "pushf\n\t" "pop %0\n\t" "mov %0, %1\n\t" - + /* ... Toggle the ID bit in one copy and store */ /* to the EFLAGS reg */ "xor $0x200000, %0\n\t" "push %0\n\t" "popf\n\t" - + /* ... Get the (hopefully modified) EFLAGS */ "pushf\n\t" "pop %0\n\t" : "=a" (a), "=c" (c) : - : "cc" + : "cc" ); - + if (a == c) return 0; /* CPUID not supported */ @@ -60,9 +60,9 @@ int mm_support(void) cpuid(1, eax, ebx, ecx, std_caps); if (std_caps & (1<<23)) rval |= MM_MMX; - if (std_caps & (1<<25)) + if (std_caps & (1<<25)) rval |= MM_MMXEXT | MM_SSE; - if (std_caps & (1<<26)) + if (std_caps & (1<<26)) rval |= MM_SSE2; } @@ -103,18 +103,18 @@ int mm_support(void) According to the table, the only CPU which supports level 2 is also the only one which supports extended CPUID levels. */ - if (eax < 2) + if (eax < 2) return rval; if (ext_caps & (1<<24)) rval |= MM_MMXEXT; } #if 0 - av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s\n", - (rval&MM_MMX) ? "MMX ":"", - (rval&MM_MMXEXT) ? "MMX2 ":"", - (rval&MM_SSE) ? "SSE ":"", - (rval&MM_SSE2) ? "SSE2 ":"", - (rval&MM_3DNOW) ? "3DNow ":"", + av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s\n", + (rval&MM_MMX) ? "MMX ":"", + (rval&MM_MMXEXT) ? "MMX2 ":"", + (rval&MM_SSE) ? "SSE ":"", + (rval&MM_SSE2) ? "SSE2 ":"", + (rval&MM_3DNOW) ? "3DNow ":"", (rval&MM_3DNOWEXT) ? "3DNowExt ":""); #endif return rval; diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index d8e655269b..7566b5d16a 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -602,9 +602,9 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ const int strength= ff_h263_loop_filter_strength[qscale]; asm volatile( - + H263_LOOP_FILTER - + "movq %%mm3, %1 \n\t" "movq %%mm4, %2 \n\t" "movq %%mm5, %0 \n\t" @@ -634,7 +634,7 @@ static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int "movd %%mm1, %2 \n\t" "punpckhdq %%mm1, %%mm1 \n\t" "movd %%mm1, %3 \n\t" - + : "=m" (*(uint32_t*)(dst + 0*dst_stride)), "=m" (*(uint32_t*)(dst + 1*dst_stride)), "=m" (*(uint32_t*)(dst + 2*dst_stride)), @@ -650,14 +650,14 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ const int strength= ff_h263_loop_filter_strength[qscale]; uint64_t temp[4] __attribute__ ((aligned(8))); uint8_t *btemp= (uint8_t*)temp; - + src -= 2; transpose4x4(btemp , src , 8, stride); transpose4x4(btemp+4, src + 4*stride, 8, stride); asm volatile( H263_LOOP_FILTER // 5 3 4 6 - + : "+m" (temp[0]), "+m" (temp[1]), "+m" (temp[2]), @@ -796,7 +796,7 @@ static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int "psrlq $32, %%mm7\n" /* shift hi dword to lo */ "paddd %%mm7,%%mm1\n" "movd %%mm1,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp; @@ -856,7 +856,7 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int "psrlq $32, %%mm7\n" /* shift hi dword to lo */ "paddd %%mm7,%%mm1\n" "movd %%mm1,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : 
"%ecx"); return tmp; @@ -919,7 +919,7 @@ static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ "paddd %%xmm1,%%xmm7\n" "movd %%xmm7,%3\n" - : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) + : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) : "r" ((long)line_size)); return tmp; } @@ -930,7 +930,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { "movl %3,%%ecx\n" "pxor %%mm7,%%mm7\n" "pxor %%mm6,%%mm6\n" - + "movq (%0),%%mm0\n" "movq %%mm0, %%mm1\n" "psllq $8, %%mm0\n" @@ -944,9 +944,9 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { "punpckhbw %%mm7,%%mm3\n" "psubw %%mm1, %%mm0\n" "psubw %%mm3, %%mm2\n" - + "add %2,%0\n" - + "movq (%0),%%mm4\n" "movq %%mm4, %%mm1\n" "psllq $8, %%mm4\n" @@ -968,14 +968,14 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { "pcmpgtw %%mm2, %%mm1\n\t" "pxor %%mm3, %%mm0\n" "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" + "psubw %%mm3, %%mm0\n" "psubw %%mm1, %%mm2\n" "paddw %%mm0, %%mm2\n" "paddw %%mm2, %%mm6\n" "add %2,%0\n" "1:\n" - + "movq (%0),%%mm0\n" "movq %%mm0, %%mm1\n" "psllq $8, %%mm0\n" @@ -997,13 +997,13 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { "pcmpgtw %%mm5, %%mm1\n\t" "pxor %%mm3, %%mm4\n" "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" + "psubw %%mm3, %%mm4\n" "psubw %%mm1, %%mm5\n" "paddw %%mm4, %%mm5\n" "paddw %%mm5, %%mm6\n" - + "add %2,%0\n" - + "movq (%0),%%mm4\n" "movq %%mm4, %%mm1\n" "psllq $8, %%mm4\n" @@ -1025,7 +1025,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { "pcmpgtw %%mm2, %%mm1\n\t" "pxor %%mm3, %%mm0\n" "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" + "psubw %%mm3, %%mm0\n" "psubw %%mm1, %%mm2\n" "paddw %%mm0, %%mm2\n" "paddw %%mm2, %%mm6\n" @@ -1038,12 +1038,12 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { "punpcklwd %%mm7,%%mm0\n" "punpckhwd %%mm7,%%mm6\n" "paddd %%mm0, %%mm6\n" - + "movq %%mm6,%%mm0\n" "psrlq $32, %%mm6\n" "paddd %%mm6,%%mm0\n" "movd %%mm0,%1\n" - : "+r" (pix1), "=r"(tmp) + : "+r" (pix1), "=r"(tmp) : "r" ((long)line_size) , "g" (h-2) : "%ecx"); return tmp; @@ -1056,7 +1056,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { "movl %3,%%ecx\n" "pxor %%mm7,%%mm7\n" "pxor %%mm6,%%mm6\n" - + "movq (%0),%%mm0\n" "movq 1(%0),%%mm1\n" "movq %%mm0, %%mm2\n" @@ -1067,9 +1067,9 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { "punpckhbw %%mm7,%%mm3\n" "psubw %%mm1, %%mm0\n" "psubw %%mm3, %%mm2\n" - + "add %2,%0\n" - + "movq (%0),%%mm4\n" "movq 1(%0),%%mm1\n" "movq %%mm4, %%mm5\n" @@ -1088,14 +1088,14 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { "pcmpgtw %%mm2, %%mm1\n\t" "pxor %%mm3, %%mm0\n" "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" + "psubw %%mm3, %%mm0\n" "psubw %%mm1, %%mm2\n" "paddw %%mm0, %%mm2\n" "paddw %%mm2, %%mm6\n" "add %2,%0\n" "1:\n" - + "movq (%0),%%mm0\n" "movq 1(%0),%%mm1\n" "movq %%mm0, %%mm2\n" @@ -1118,9 +1118,9 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { "psubw %%mm1, %%mm5\n" "paddw %%mm4, %%mm5\n" "paddw %%mm5, %%mm6\n" - + "add %2,%0\n" - + "movq (%0),%%mm4\n" "movq 1(%0),%%mm1\n" "movq %%mm4, %%mm5\n" @@ -1139,7 +1139,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { "pcmpgtw %%mm2, %%mm1\n\t" "pxor %%mm3, %%mm0\n" "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" + "psubw %%mm3, %%mm0\n" "psubw %%mm1, %%mm2\n" "paddw %%mm0, %%mm2\n" "paddw %%mm2, %%mm6\n" @@ -1152,12 +1152,12 @@ static int 
hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { "punpcklwd %%mm7,%%mm0\n" "punpckhwd %%mm7,%%mm6\n" "paddd %%mm0, %%mm6\n" - + "movq %%mm6,%%mm0\n" "psrlq $32, %%mm6\n" "paddd %%mm6,%%mm0\n" "movd %%mm0,%1\n" - : "+r" (pix1), "=r"(tmp) + : "+r" (pix1), "=r"(tmp) : "r" ((long)line_size) , "g" (h-2) : "%ecx"); return tmp + hf_noise8_mmx(pix+8, line_size, h); @@ -1186,10 +1186,10 @@ static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { int tmp; - + assert( (((int)pix) & 7) == 0); assert((line_size &7) ==0); - + #define SUM(in0, in1, out0, out1) \ "movq (%0), %%mm2\n"\ "movq 8(%0), %%mm3\n"\ @@ -1213,7 +1213,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si "paddw %%mm2, " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" - + asm volatile ( "movl %3,%%ecx\n" "pxor %%mm6,%%mm6\n" @@ -1224,11 +1224,11 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si "subl $2, %%ecx\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" - + SUM(%%mm4, %%mm5, %%mm0, %%mm1) - + SUM(%%mm0, %%mm1, %%mm4, %%mm5) - + "subl $2, %%ecx\n" "jnz 1b\n" @@ -1239,7 +1239,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si "psrlq $16, %%mm0\n" "paddw %%mm6,%%mm0\n" "movd %%mm0,%1\n" - : "+r" (pix), "=r"(tmp) + : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp & 0xFFFF; @@ -1248,10 +1248,10 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { int tmp; - + assert( (((int)pix) & 7) == 0); assert((line_size &7) ==0); - + #define SUM(in0, in1, out0, out1) \ "movq (%0), " #out0 "\n"\ "movq 8(%0), " #out1 "\n"\ @@ -1271,16 +1271,16 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s "subl $2, %%ecx\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" - + SUM(%%mm4, %%mm5, %%mm0, %%mm1) - + SUM(%%mm0, %%mm1, %%mm4, %%mm5) - + "subl $2, %%ecx\n" "jnz 1b\n" "movd %%mm6,%1\n" - : "+r" (pix), "=r"(tmp) + : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp; @@ -1289,11 +1289,11 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; - + assert( (((int)pix1) & 7) == 0); assert( (((int)pix2) & 7) == 0); assert((line_size &7) ==0); - + #define SUM(in0, in1, out0, out1) \ "movq (%0),%%mm2\n"\ "movq (%1)," #out0 "\n"\ @@ -1324,7 +1324,7 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in "paddw %%mm2, " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" - + asm volatile ( "movl %4,%%ecx\n" "pxor %%mm6,%%mm6\n" @@ -1344,11 +1344,11 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in "pxor %%mm7, %%mm1\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" - + SUM(%%mm4, %%mm5, %%mm0, %%mm1) - + SUM(%%mm0, %%mm1, %%mm4, %%mm5) - + "subl $2, %%ecx\n" "jnz 1b\n" @@ -1359,7 +1359,7 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in "psrlq $16, %%mm0\n" "paddw %%mm6,%%mm0\n" "movd %%mm0,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp & 0x7FFF; @@ -1368,11 +1368,11 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in static int 
vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; - + assert( (((int)pix1) & 7) == 0); assert( (((int)pix2) & 7) == 0); assert((line_size &7) ==0); - + #define SUM(in0, in1, out0, out1) \ "movq (%0)," #out0 "\n"\ "movq (%1),%%mm2\n"\ @@ -1408,16 +1408,16 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i "pxor %%mm7, %%mm1\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" - + SUM(%%mm4, %%mm5, %%mm0, %%mm1) - + SUM(%%mm0, %%mm1, %%mm4, %%mm5) - + "subl $2, %%ecx\n" "jnz 1b\n" "movd %%mm6,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp; @@ -1449,7 +1449,7 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ long i=0; uint8_t l, lt; - + asm volatile( "1: \n\t" "movq -1(%1, %0), %%mm0 \n\t" // LT @@ -1462,7 +1462,7 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t "movq %%mm4, %%mm5 \n\t" // L "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) "pminub %%mm5, %%mm1 \n\t" // min(T, L) - "pminub %%mm2, %%mm4 \n\t" + "pminub %%mm2, %%mm4 \n\t" "pmaxub %%mm1, %%mm4 \n\t" "psubb %%mm4, %%mm3 \n\t" // dst - pred "movq %%mm3, (%3, %0) \n\t" @@ -1475,9 +1475,9 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t l= *left; lt= *left_top; - + dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); - + *left_top= src1[w-1]; *left = src2[w-1]; } @@ -1521,7 +1521,7 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t "psubw " #a ", " #z " \n\t"\ "pmaxsw " #z ", " #a " \n\t"\ "paddusw " #a ", " #sum " \n\t" - + #define SBUTTERFLY(a,b,t,n)\ "movq " #a ", " #t " \n\t" /* abcd */\ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ @@ -1548,7 +1548,7 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ uint64_t temp[16] __align8; int sum=0; - + assert(h==8); diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); @@ -1556,38 +1556,38 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, asm volatile( LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 - + "movq %%mm7, 112(%1) \n\t" - + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) - + "movq 112(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 - + "movq %%mm7, 120(%1) \n\t" - + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) - + "movq 120(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) "movq %%mm7, %%mm5 \n\t"//FIXME remove "movq %%mm6, %%mm7 \n\t" "movq %%mm0, %%mm6 \n\t" // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove - + LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3) // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 "movq %%mm7, 64(%1) \n\t" MMABS(%%mm0, %%mm7) @@ -1600,10 +1600,10 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, "movq 64(%1), %%mm1 \n\t" MMABS_SUM(%%mm1, %%mm7, %%mm0) "movq %%mm0, 64(%1) \n\t" - + LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 "movq %%mm7, 
(%1) \n\t" MMABS(%%mm0, %%mm7) @@ -1617,7 +1617,7 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, MMABS_SUM(%%mm1, %%mm7, %%mm0) "movq 64(%1), %%mm1 \n\t" MMABS_SUM(%%mm1, %%mm7, %%mm0) - + "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" "paddusw %%mm1, %%mm0 \n\t" @@ -1625,7 +1625,7 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, "psrlq $16, %%mm0 \n\t" "paddusw %%mm1, %%mm0 \n\t" "movd %%mm0, %0 \n\t" - + : "=r" (sum) : "r"(temp) ); @@ -1635,7 +1635,7 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ uint64_t temp[16] __align8; int sum=0; - + assert(h==8); diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); @@ -1643,38 +1643,38 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride asm volatile( LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 - + "movq %%mm7, 112(%1) \n\t" - + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) - + "movq 112(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 - + "movq %%mm7, 120(%1) \n\t" - + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) - + "movq 120(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) "movq %%mm7, %%mm5 \n\t"//FIXME remove "movq %%mm6, %%mm7 \n\t" "movq %%mm0, %%mm6 \n\t" // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove - + LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3) // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 "movq %%mm7, 64(%1) \n\t" MMABS_MMX2(%%mm0, %%mm7) @@ -1687,10 +1687,10 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride "movq 64(%1), %%mm1 \n\t" MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) "movq %%mm0, 64(%1) \n\t" - + LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) - + HADAMARD48 "movq %%mm7, (%1) \n\t" MMABS_MMX2(%%mm0, %%mm7) @@ -1704,13 +1704,13 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) "movq 64(%1), %%mm1 \n\t" MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) - + "pshufw $0x0E, %%mm0, %%mm1 \n\t" "paddusw %%mm1, %%mm0 \n\t" "pshufw $0x01, %%mm0, %%mm1 \n\t" "paddusw %%mm1, %%mm0 \n\t" "movd %%mm0, %0 \n\t" - + : "=r" (sum) : "r"(temp) ); @@ -2405,7 +2405,7 @@ static void just_return() { return; } static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ long i=0; - + assert(ABS(scale) < 256); scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; @@ -2413,11 +2413,11 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w "psrlw $15, %%mm6 \n\t" // 1w "pxor %%mm7, %%mm7 \n\t" - "movd %4, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" + "movd %4, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" + "movq (%1, %0), %%mm0 \n\t" "movq 8(%1, %0), %%mm1 \n\t" "pmulhw %%mm5, %%mm0 \n\t" "pmulhw %%mm5, %%mm1 \n\t" @@ -2444,7 +2444,7 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6 "paddd %%mm6, %%mm7 \n\t" "psrld $2, %%mm7 \n\t" "movd %%mm7, %0 \n\t" - + : "+r" (i) : "r"(basis), "r"(rem), 
"r"(weight), "g"(scale) ); @@ -2453,21 +2453,21 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ long i=0; - + if(ABS(scale) < 256){ scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; asm volatile( "pcmpeqw %%mm6, %%mm6 \n\t" // -1w "psrlw $15, %%mm6 \n\t" // 1w - "movd %3, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" + "movd %3, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" + "movq (%1, %0), %%mm0 \n\t" "movq 8(%1, %0), %%mm1 \n\t" "pmulhw %%mm5, %%mm0 \n\t" "pmulhw %%mm5, %%mm1 \n\t" - "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm6, %%mm0 \n\t" "paddw %%mm6, %%mm1 \n\t" "psraw $1, %%mm0 \n\t" "psraw $1, %%mm1 \n\t" @@ -2478,19 +2478,19 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ "add $16, %0 \n\t" "cmp $128, %0 \n\t" //FIXME optimize & bench " jb 1b \n\t" - + : "+r" (i) : "r"(basis), "r"(rem), "g"(scale) ); }else{ for(i=0; i<8*8; i++){ rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); - } + } } } #include "h264dsp_mmx.c" - + /* external functions, from idct_mmx.c */ void ff_mmx_idct(DCTELEM *block); void ff_mmxext_idct(DCTELEM *block); @@ -2563,7 +2563,7 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) add_pixels_clamped_mmx(block, dest, line_size); } #endif - + void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) { mm_flags = mm_support(); @@ -2701,14 +2701,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; - + c->add_bytes= add_bytes_mmx; #ifdef CONFIG_ENCODERS c->diff_bytes= diff_bytes_mmx; - + c->hadamard8_diff[0]= hadamard8_diff16_mmx; c->hadamard8_diff[1]= hadamard8_diff_mmx; - + c->pix_norm1 = pix_norm1_mmx; c->sse[0] = (mm_flags & MM_SSE2) ? 
sse16_sse2 : sse16_mmx; c->sse[1] = sse8_mmx; @@ -2719,19 +2719,19 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->vsad[0] = vsad16_mmx; } - + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->try_8x8basis= try_8x8basis_mmx; } c->add_8x8basis= add_8x8basis_mmx; - + #endif //CONFIG_ENCODERS c->h263_v_loop_filter= h263_v_loop_filter_mmx; - c->h263_h_loop_filter= h263_h_loop_filter_mmx; + c->h263_h_loop_filter= h263_h_loop_filter_mmx; c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx; c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; - + if (mm_flags & MM_MMXEXT) { c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; @@ -2945,7 +2945,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; } } - + #ifdef CONFIG_ENCODERS dsputil_init_pix_mmx(c, avctx); #endif //CONFIG_ENCODERS diff --git a/libavcodec/i386/dsputil_mmx_avg.h b/libavcodec/i386/dsputil_mmx_avg.h index c708913048..434bc3a0e8 100644 --- a/libavcodec/i386/dsputil_mmx_avg.h +++ b/libavcodec/i386/dsputil_mmx_avg.h @@ -21,7 +21,7 @@ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> * and improved by Zdenek Kabelac <kabi@users.sf.net> */ - + /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm clobber bug - now it will work with 2.95.2 and also with -fPIC */ @@ -100,7 +100,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); } @@ -147,7 +147,7 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); //the following should be used, though better not with gcc ... /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) :"r"(src1Stride), "r"(dstStride) @@ -217,7 +217,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); //the following should be used, though better not with gcc ... /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) :"r"(src1Stride), "r"(dstStride) @@ -272,7 +272,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); } @@ -324,7 +324,7 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); //the following should be used, though better not with gcc ... /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) :"r"(src1Stride), "r"(dstStride) @@ -412,7 +412,7 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); //the following should be used, though better not with gcc ... 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) :"r"(src1Stride), "r"(dstStride) @@ -466,7 +466,7 @@ static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); //the following should be used, though better not with gcc ... /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) :"r"(src1Stride), "r"(dstStride) @@ -539,13 +539,13 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); //the following should be used, though better not with gcc ... /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) :"r"(src1Stride), "r"(dstStride) :"memory");*/ } - + /* GL: this function does incorrect rounding if overflow */ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { @@ -746,7 +746,7 @@ static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_ :"%"REG_a, "memory"); } -// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter +// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BONE(mm6); diff --git a/libavcodec/i386/dsputil_mmx_rnd.h b/libavcodec/i386/dsputil_mmx_rnd.h index a56374b63e..6d93f9d55f 100644 --- a/libavcodec/i386/dsputil_mmx_rnd.h +++ b/libavcodec/i386/dsputil_mmx_rnd.h @@ -197,7 +197,7 @@ static void attribute_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif :"S"((long)src1Stride), "D"((long)dstStride) - :"memory"); + :"memory"); } static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) diff --git a/libavcodec/i386/fdct_mmx.c b/libavcodec/i386/fdct_mmx.c index 6a13090a13..f3023549a2 100644 --- a/libavcodec/i386/fdct_mmx.c +++ b/libavcodec/i386/fdct_mmx.c @@ -5,7 +5,7 @@ * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. 
* * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT - * + * * Intel Application Note AP-922 - fast, precise implementation of DCT * http://developer.intel.com/vtune/cbts/appnotes.htm * @@ -51,7 +51,7 @@ static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; -struct +struct { const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16); } fdct_r_row_sse2 ATTR_ALIGN(16)= @@ -61,90 +61,90 @@ struct //static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, -22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, -22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, + 16384, 16384, 22725, 19266, + 16384, 16384, 12873, 4520, + 21407, 8867, 19266, -4520, + -8867, -21407, -22725, -12873, + 16384, -16384, 12873, -22725, + -16384, 16384, 4520, 19266, + 8867, -21407, 4520, -12873, + 21407, -8867, 19266, -22725, + + 22725, 22725, 31521, 26722, + 22725, 22725, 17855, 6270, + 29692, 12299, 26722, -6270, + -12299, -29692, -31521, -17855, + 22725, -22725, 17855, -31521, + -22725, 22725, 6270, 26722, + 12299, -29692, 6270, -17855, + 29692, -12299, 26722, -31521, + + 21407, 21407, 29692, 25172, + 21407, 21407, 16819, 5906, + 27969, 11585, 25172, -5906, + -11585, -27969, -29692, -16819, + 21407, -21407, 16819, -29692, + -21407, 21407, 5906, 25172, + 11585, -27969, 5906, -16819, + 27969, -11585, 25172, -29692, + + 19266, 19266, 26722, 22654, + 19266, 19266, 15137, 5315, + 25172, 10426, 22654, -5315, + -10426, -25172, -26722, -15137, + 19266, -19266, 
15137, -26722, + -19266, 19266, 5315, 22654, + 10426, -25172, 5315, -15137, + 25172, -10426, 22654, -26722, + + 16384, 16384, 22725, 19266, + 16384, 16384, 12873, 4520, + 21407, 8867, 19266, -4520, + -8867, -21407, -22725, -12873, + 16384, -16384, 12873, -22725, + -16384, 16384, 4520, 19266, + 8867, -21407, 4520, -12873, + 21407, -8867, 19266, -22725, + + 19266, 19266, 26722, 22654, + 19266, 19266, 15137, 5315, + 25172, 10426, 22654, -5315, + -10426, -25172, -26722, -15137, + 19266, -19266, 15137, -26722, + -19266, 19266, 5315, 22654, + 10426, -25172, 5315, -15137, + 25172, -10426, 22654, -26722, + + 21407, 21407, 29692, 25172, + 21407, 21407, 16819, 5906, + 27969, 11585, 25172, -5906, + -11585, -27969, -29692, -16819, + 21407, -21407, 16819, -29692, + -21407, 21407, 5906, 25172, + 11585, -27969, 5906, -16819, + 27969, -11585, 25172, -29692, + + 22725, 22725, 31521, 26722, + 22725, 22725, 17855, 6270, + 29692, 12299, 26722, -6270, + -12299, -29692, -31521, -17855, + 22725, -22725, 17855, -31521, + -22725, 22725, 6270, 26722, + 12299, -29692, 6270, -17855, + 29692, -12299, 26722, -31521, }; -struct +struct { const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16); } tab_frw_01234567_sse2 ATTR_ALIGN(16) = {{ -//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table +//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ C4, C4, C5, C7, C2, C6, C3, -C7, \ -C4, C4, C7, C3, C6, -C2, C7, -C5, \ - C4, -C4, C5, -C1, C2, -C6, C3, -C1, -// c1..c7 * cos(pi/4) * 2^15 + C4, -C4, C5, -C1, C2, -C6, C3, -C1, +// c1..c7 * cos(pi/4) * 2^15 #define C1 22725 #define C2 21407 #define C3 19266 @@ -355,17 +355,17 @@ static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) "movq \\i(%0), %%xmm2 \n\t" "movq \\i+8(%0), %%xmm0 \n\t" "movdqa \\t+32(%1), %%xmm3 \n\t" - "movdqa \\t+48(%1), %%xmm7 \n\t" + "movdqa \\t+48(%1), %%xmm7 \n\t" "movdqa \\t(%1), %%xmm4 \n\t" - "movdqa \\t+16(%1), %%xmm5 \n\t" + "movdqa \\t+16(%1), %%xmm5 \n\t" ".endm \n\t" ".macro FDCT_ROW_SSE2_H2 i t \n\t" "movq \\i(%0), %%xmm2 \n\t" "movq \\i+8(%0), %%xmm0 \n\t" "movdqa \\t+32(%1), %%xmm3 \n\t" - "movdqa \\t+48(%1), %%xmm7 \n\t" + "movdqa \\t+48(%1), %%xmm7 \n\t" ".endm \n\t" - ".macro FDCT_ROW_SSE2 i \n\t" + ".macro FDCT_ROW_SSE2 i \n\t" "movq %%xmm2, %%xmm1 \n\t" "pshuflw $27, %%xmm0, %%xmm0 \n\t" "paddsw %%xmm0, %%xmm1 \n\t" @@ -376,7 +376,7 @@ static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) "pmaddwd %%xmm1, %%xmm7 \n\t" "pmaddwd %%xmm5, %%xmm2 \n\t" "pmaddwd %%xmm4, %%xmm1 \n\t" - "paddd %%xmm7, %%xmm3 \n\t" + "paddd %%xmm7, %%xmm3 \n\t" "paddd %%xmm2, %%xmm1 \n\t" "paddd %%xmm6, %%xmm3 \n\t" "paddd %%xmm6, %%xmm1 \n\t" @@ -384,8 +384,8 @@ static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) "psrad %3, %%xmm1 \n\t" "packssdw %%xmm3, %%xmm1 \n\t" "movdqa %%xmm1, \\i(%4) \n\t" - ".endm \n\t" - "movdqa (%2), %%xmm6 \n\t" + ".endm \n\t" + "movdqa (%2), %%xmm6 \n\t" "FDCT_ROW_SSE2_H1 0 0 \n\t" "FDCT_ROW_SSE2 0 \n\t" "FDCT_ROW_SSE2_H2 64 0 \n\t" @@ -411,7 +411,7 @@ static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) } static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) -{ +{ pshufw_m2r(*(in + 4), mm5, 0x1B); movq_m2r(*(in + 0), mm0); movq_r2r(mm0, mm1); @@ -454,7 +454,7 @@ static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const i } static always_inline void 
fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) -{ +{ //FIXME reorder (i dont have a old mmx only cpu here to benchmark ...) movd_m2r(*(in + 6), mm1); punpcklwd_m2r(*(in + 4), mm1); @@ -547,7 +547,7 @@ void ff_fdct_mmx2(int16_t *block) } } -void ff_fdct_sse2(int16_t *block) +void ff_fdct_sse2(int16_t *block) { int64_t align_tmp[16] ATTR_ALIGN(8); int16_t * const block_tmp= (int16_t*)align_tmp; diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c index d07c943e91..f8be644a3b 100644 --- a/libavcodec/i386/fft_sse.c +++ b/libavcodec/i386/fft_sse.c @@ -23,13 +23,13 @@ #include <xmmintrin.h> -static const float p1p1p1m1[4] __attribute__((aligned(16))) = +static const float p1p1p1m1[4] __attribute__((aligned(16))) = { 1.0, 1.0, 1.0, -1.0 }; -static const float p1p1m1p1[4] __attribute__((aligned(16))) = +static const float p1p1m1p1[4] __attribute__((aligned(16))) = { 1.0, 1.0, -1.0, 1.0 }; -static const float p1p1m1m1[4] __attribute__((aligned(16))) = +static const float p1p1m1m1[4] __attribute__((aligned(16))) = { 1.0, 1.0, -1.0, -1.0 }; #if 0 @@ -107,27 +107,27 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) a = *(__m128 *)p; b = *(__m128 *)q; - + /* complex mul */ c = *(__m128 *)cptr; /* cre*re cim*re */ - t1 = _mm_mul_ps(c, - _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); + t1 = _mm_mul_ps(c, + _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); c = *(__m128 *)(cptr + 2); /* -cim*im cre*im */ t2 = _mm_mul_ps(c, - _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); + _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); b = _mm_add_ps(t1, t2); - + /* butterfly */ *(__m128 *)p = _mm_add_ps(a, b); *(__m128 *)q = _mm_sub_ps(a, b); - + p += 2; q += 2; cptr += 4; } while (--k); - + p += nloops; q += nloops; } while (--j); diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c index b5e9baa3a6..45a3c02f35 100644 --- a/libavcodec/i386/h264dsp_mmx.c +++ b/libavcodec/i386/h264dsp_mmx.c @@ -384,7 +384,7 @@ static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int a "psraw $5, %%mm6 \n\t"\ "packuswb %%mm6, %%mm6 \n\t"\ OP(%%mm6, (%1), A, d)\ - "add %3, %1 \n\t" + "add %3, %1 \n\t" #define QPEL_H264HV(A,B,C,D,E,F,OF)\ "movd (%0), "#F" \n\t"\ @@ -399,7 +399,7 @@ static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int a "paddw "#F", "#A" \n\t"\ "paddw "#A", %%mm6 \n\t"\ "movq %%mm6, "#OF"(%1) \n\t" - + #define QPEL_H264(OPNAME, OP, MMX)\ static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ int h=4;\ diff --git a/libavcodec/i386/idct_mmx_xvid.c b/libavcodec/i386/idct_mmx_xvid.c index 943c50f92b..219260ed88 100644 --- a/libavcodec/i386/idct_mmx_xvid.c +++ b/libavcodec/i386/idct_mmx_xvid.c @@ -72,13 +72,13 @@ //----------------------------------------------------------------------------- -static const int16_t tg_1_16[4*4] attribute_used __attribute__ ((aligned(8))) = { +static const int16_t tg_1_16[4*4] attribute_used __attribute__ ((aligned(8))) = { 13036,13036,13036,13036, // tg * (2<<16) + 0.5 27146,27146,27146,27146, // tg * (2<<16) + 0.5 -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 -static const int32_t rounder_0[2*8] attribute_used __attribute__ ((aligned(8))) = { +static const int32_t rounder_0[2*8] attribute_used __attribute__ ((aligned(8))) = { 65536,65536, 3597,3597, 2260,2260, @@ -148,7 +148,7 @@ static const int32_t rounder_0[2*8] attribute_used __attribute__ ((aligned(8))) 
//----------------------------------------------------------------------------- // Table for rows 0,4 - constants are multiplied by cos_4_16 -static const int16_t tab_i_04_mmx[32*4] attribute_used __attribute__ ((aligned(8))) = { +static const int16_t tab_i_04_mmx[32*4] attribute_used __attribute__ ((aligned(8))) = { 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 21407,8867,8867,-21407, // w07 w05 w03 w01 16384,-16384,16384,16384, // w14 w12 w10 w08 @@ -190,7 +190,7 @@ static const int16_t tab_i_04_mmx[32*4] attribute_used __attribute__ ((aligned(8 //----------------------------------------------------------------------------- // %3 for rows 0,4 - constants are multiplied by cos_4_16 -static const int16_t tab_i_04_xmm[32*4] attribute_used __attribute__ ((aligned(8))) = { +static const int16_t tab_i_04_xmm[32*4] attribute_used __attribute__ ((aligned(8))) = { 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 16384,8867,-16384,-21407, // w07 w06 w03 w02 16384,-8867,16384,-21407, // w13 w12 w09 w08 @@ -501,7 +501,7 @@ asm volatile( DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - + //# Process the columns (4 at a time) DCT_8_INV_COL(0(%0), 0(%0)) DCT_8_INV_COL(8(%0), 8(%0)) @@ -524,7 +524,7 @@ asm volatile( DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - + //# Process the columns (4 at a time) DCT_8_INV_COL(0(%0), 0(%0)) DCT_8_INV_COL(8(%0), 8(%0)) diff --git a/libavcodec/i386/motion_est_mmx.c b/libavcodec/i386/motion_est_mmx.c index 1b90f8e40f..69e10f628b 100644 --- a/libavcodec/i386/motion_est_mmx.c +++ b/libavcodec/i386/motion_est_mmx.c @@ -393,7 +393,7 @@ void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) c->sad[0]= sad16_mmx2; c->sad[1]= sad8_mmx2; - + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->pix_abs[0][1] = sad16_x2_mmx2; c->pix_abs[0][2] = sad16_y2_mmx2; diff --git a/libavcodec/i386/mpegvideo_mmx.c b/libavcodec/i386/mpegvideo_mmx.c index 70c81f6754..af799b6b00 100644 --- a/libavcodec/i386/mpegvideo_mmx.c +++ b/libavcodec/i386/mpegvideo_mmx.c @@ -40,7 +40,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, qmul = qscale << 1; assert(s->block_last_index[n]>=0 || s->h263_aic); - + if (!s->h263_aic) { if (n < 4) level = block[0] * s->y_dc_scale; @@ -116,7 +116,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, qadd = (qscale - 1) | 1; assert(s->block_last_index[n]>=0 || s->h263_aic); - + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; //printf("%d %d ", qmul, qadd); asm volatile( @@ -209,7 +209,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - if (n < 4) + if (n < 4) block0 = block[0] * s->y_dc_scale; else block0 = block[0] * s->c_dc_scale; @@ -263,7 +263,7 @@ asm volatile( "js 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%"REG_a, "memory" - ); + ); block[0]= block0; } @@ -339,13 +339,13 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, long nCoeffs; const uint16_t *quant_matrix; int block0; - + assert(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - if (n < 4) + if (n < 4) block0 = block[0] * s->y_dc_scale; else block0 = block[0] * 
s->c_dc_scale; @@ -394,7 +394,7 @@ asm volatile( "jng 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%"REG_a, "memory" - ); + ); block[0]= block0; //Note, we dont do mismatch control for intra as errors cannot accumulate } @@ -404,7 +404,7 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, { long nCoeffs; const uint16_t *quant_matrix; - + assert(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME @@ -470,13 +470,13 @@ asm volatile( "psrlq $15, %%mm7 \n\t" "pxor %%mm7, %%mm0 \n\t" "movd %%mm0, 124(%0, %3) \n\t" - + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) : "%"REG_a, "memory" ); } -/* draw the edges of width 'w' of an image of size width, height +/* draw the edges of width 'w' of an image of size width, height this mmx version can only handle w==8 || w==16 */ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) { @@ -491,7 +491,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) asm volatile( "1: \n\t" "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" "punpcklwd %%mm0, %%mm0 \n\t" "punpckldq %%mm0, %%mm0 \n\t" "movq %%mm0, -8(%0) \n\t" @@ -512,7 +512,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) asm volatile( "1: \n\t" "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" "punpcklwd %%mm0, %%mm0 \n\t" "punpckldq %%mm0, %%mm0 \n\t" "movq %%mm0, -8(%0) \n\t" @@ -525,12 +525,12 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) "movq %%mm1, 8(%0, %2) \n\t" "add %1, %0 \n\t" "cmp %3, %0 \n\t" - " jb 1b \n\t" + " jb 1b \n\t" : "+r" (ptr) : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) ); } - + for(i=0;i<w;i+=4) { /* top and bottom (and hopefully also the corners) */ ptr= buf - (i + 1) * wrap - w; @@ -694,7 +694,7 @@ void MPV_common_init_mmx(MpegEncContext *s) { if (mm_flags & MM_MMX) { const int dct_algo = s->avctx->dct_algo; - + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; @@ -703,7 +703,7 @@ void MPV_common_init_mmx(MpegEncContext *s) s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; draw_edges = draw_edges_mmx; - + if (mm_flags & MM_SSE2) { s->denoise_dct= denoise_dct_sse2; } else { diff --git a/libavcodec/i386/mpegvideo_mmx_template.c b/libavcodec/i386/mpegvideo_mmx_template.c index 93f156ee55..28afdeef0c 100644 --- a/libavcodec/i386/mpegvideo_mmx_template.c +++ b/libavcodec/i386/mpegvideo_mmx_template.c @@ -52,7 +52,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, int level=0, q; //=0 is cuz gcc says uninitalized ... const uint16_t *qmat, *bias; __align8 int16_t temp_block[64]; - + assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? 
//s->fdct (block); @@ -88,7 +88,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, } else /* For AIC we skip quant/dequant of INTRADC */ level = (block[0] + 4)>>3; - + block[0]=0; //avoid fake overflow // temp_block[0] = (block[0] + (q >> 1)) / q; last_non_zero_p1 = 1; @@ -101,7 +101,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, } if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ - + asm volatile( "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 SPREADW(%%mm3) @@ -116,16 +116,16 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "pxor %%mm1, %%mm1 \n\t" // 0 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 - "pxor %%mm1, %%mm0 \n\t" + "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 - "por %%mm0, %%mm4 \n\t" - "pxor %%mm1, %%mm0 \n\t" + "por %%mm0, %%mm4 \n\t" + "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "movq %%mm0, (%5, %%"REG_a") \n\t" "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 - "movq (%4, %%"REG_a"), %%mm1 \n\t" + "movq (%4, %%"REG_a"), %%mm1 \n\t" "movq %%mm7, (%1, %%"REG_a") \n\t" // 0 "pandn %%mm1, %%mm0 \n\t" PMAXW(%%mm0, %%mm3) @@ -142,7 +142,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, asm volatile( "movd %1, %%mm1 \n\t" // max_qcoeff SPREADW(%%mm1) - "psubusw %%mm1, %%mm4 \n\t" + "psubusw %%mm1, %%mm4 \n\t" "packuswb %%mm4, %%mm4 \n\t" "movd %%mm4, %0 \n\t" // *overflow : "=g" (*overflow) @@ -160,18 +160,18 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "pxor %%mm1, %%mm1 \n\t" // 0 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 - "pxor %%mm1, %%mm0 \n\t" + "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) "movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0] "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 - "por %%mm0, %%mm4 \n\t" - "pxor %%mm1, %%mm0 \n\t" + "por %%mm0, %%mm4 \n\t" + "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "movq %%mm0, (%5, %%"REG_a") \n\t" "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 
0xFF : 0x00 - "movq (%4, %%"REG_a"), %%mm1 \n\t" + "movq (%4, %%"REG_a"), %%mm1 \n\t" "movq %%mm7, (%1, %%"REG_a") \n\t" // 0 "pandn %%mm1, %%mm0 \n\t" PMAXW(%%mm0, %%mm3) @@ -188,7 +188,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, asm volatile( "movd %1, %%mm1 \n\t" // max_qcoeff SPREADW(%%mm1) - "psubusw %%mm1, %%mm4 \n\t" + "psubusw %%mm1, %%mm4 \n\t" "packuswb %%mm4, %%mm4 \n\t" "movd %%mm4, %0 \n\t" // *overflow : "=g" (*overflow) @@ -201,135 +201,135 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ if(last_non_zero_p1 <= 1) goto end; - block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; - block[0x20] = temp_block[0x10]; + block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; + block[0x20] = temp_block[0x10]; if(last_non_zero_p1 <= 4) goto end; - block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; - block[0x09] = temp_block[0x03]; + block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; + block[0x09] = temp_block[0x03]; if(last_non_zero_p1 <= 7) goto end; - block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; - block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; + block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; + block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; if(last_non_zero_p1 <= 11) goto end; - block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; - block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; - block[0x0C] = temp_block[0x05]; + block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; + block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; + block[0x0C] = temp_block[0x05]; if(last_non_zero_p1 <= 16) goto end; - block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; - block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; - block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; - block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; + block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; + block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; + block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; + block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; if(last_non_zero_p1 <= 24) goto end; - block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; - block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; - block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; - block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; + block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; + block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; + block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; + block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; if(last_non_zero_p1 <= 32) goto end; - block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; - block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; - block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; - block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; + block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; + block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; + block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; + block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; if(last_non_zero_p1 <= 40) goto end; - block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; - block[0x1D] = 
temp_block[0x0F]; block[0x2D] = temp_block[0x17]; - block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; - block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; + block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; + block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17]; + block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; + block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; if(last_non_zero_p1 <= 48) goto end; - block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; - block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; - block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; + block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; + block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; + block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; if(last_non_zero_p1 <= 56) goto end; - block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; - block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; - block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; + block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; + block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; + block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){ if(last_non_zero_p1 <= 1) goto end; - block[0x04] = temp_block[0x01]; - block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; + block[0x04] = temp_block[0x01]; + block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; if(last_non_zero_p1 <= 4) goto end; - block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; - block[0x05] = temp_block[0x03]; + block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; + block[0x05] = temp_block[0x03]; if(last_non_zero_p1 <= 7) goto end; - block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; - block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; + block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; + block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; if(last_non_zero_p1 <= 11) goto end; - block[0x1C] = temp_block[0x19]; - block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; - block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; + block[0x1C] = temp_block[0x19]; + block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; + block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; if(last_non_zero_p1 <= 16) goto end; - block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; - block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; - block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; - block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; + block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; + block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; + block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; + block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; if(last_non_zero_p1 <= 24) goto end; - block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; - block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; - block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; - block[0x16] = temp_block[0x15]; 
block[0x1A] = temp_block[0x1C]; + block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; + block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; + block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; + block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; if(last_non_zero_p1 <= 32) goto end; - block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; - block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; - block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; - block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; + block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; + block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; + block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; + block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; if(last_non_zero_p1 <= 40) goto end; - block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; - block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; - block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; - block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; + block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; + block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; + block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; + block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; if(last_non_zero_p1 <= 48) goto end; - block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; - block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; - block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; + block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; + block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; + block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; if(last_non_zero_p1 <= 56) goto end; - block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; - block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; - block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; + block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; + block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; + block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; }else{ if(last_non_zero_p1 <= 1) goto end; - block[0x01] = temp_block[0x01]; - block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; + block[0x01] = temp_block[0x01]; + block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; if(last_non_zero_p1 <= 4) goto end; - block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; - block[0x03] = temp_block[0x03]; + block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; + block[0x03] = temp_block[0x03]; if(last_non_zero_p1 <= 7) goto end; - block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; - block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; + block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; + block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; if(last_non_zero_p1 <= 11) goto end; - block[0x19] = temp_block[0x19]; - block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B]; - block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05]; + block[0x19] = temp_block[0x19]; + block[0x12] = temp_block[0x12]; block[0x0B] = 
+ block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
- block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
- block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
- block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
- block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
+ block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
+ block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
- block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
- block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
- block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
- block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
+ block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
+ block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
+ block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
- block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
- block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
- block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
- block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
+ block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
+ block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
+ block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
- block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
- block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
- block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
- block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
+ block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
+ block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
- block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
- block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
- block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
- block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
+ block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
+ block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
+ block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
- block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
- block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
- block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
+ block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}
end:
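Note: the unrolled stores above come from the quantizer in mpegvideo_mmx_template.c. They scatter the coefficients of temp_block (held in raster order) into block in the order required by the active IDCT permutation, one branch per value of s->dsp.idct_permutation_type, and the if(last_non_zero_p1 <= N) goto end; checks stop copying as soon as all coefficients produced by the scan have been written, which for typical sparse blocks skips most of the stores. A minimal scalar sketch of the same operation, assuming block[] is pre-zeroed; permute_block and its perm/zigzag parameters are hypothetical names standing in for FFmpeg's dsp.idct_permutation and ff_zigzag_direct tables:

    #include <stdint.h>

    /* Sketch only, not the generated code: copy the first
     * last_non_zero_p1 coefficients (in scan order) from the
     * raster-order temp_block into block, applying the IDCT
     * permutation. The unrolled branches trade this loop and its
     * two table lookups for straight-line code. */
    static void permute_block(int16_t *block, const int16_t *temp_block,
                              const uint8_t *perm, const uint8_t *zigzag,
                              int last_non_zero_p1)
    {
        int i;
        for (i = 0; i < last_non_zero_p1; i++) {
            int j = zigzag[i];   /* raster position of the i-th scan coeff */
            block[perm[j]] = temp_block[j];
        }
    }

In the identity-permutation branch above, perm[j] == j, which is why that branch reduces to block[j] = temp_block[j] with j walking the zigzag order.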
diff --git a/libavcodec/i386/simple_idct_mmx.c b/libavcodec/i386/simple_idct_mmx.c
index 92a366f217..7b5084c7eb 100644
--- a/libavcodec/i386/simple_idct_mmx.c
+++ b/libavcodec/i386/simple_idct_mmx.c
@@ -60,19 +60,19 @@ static const int16_t __attribute__((aligned(8))) coeffs[]= {
C4, C4, C4, C4,
C4, -C4, C4, -C4,
-
+
C2, C6, C2, C6,
C6, -C2, C6, -C2,
-
+
C1, C3, C1, C3,
C5, C7, C5, C7,
-
+
C3, -C7, C3, -C7,
-C1, -C5, -C1, -C5,
-
+
C5, -C1, C5, -C1,
C7, C3, C7, C3,
-
+
C7, -C5, C7, -C5,
C3, -C1, C3, -C1
};
@@ -357,7 +357,7 @@ static inline void idct(int16_t *block)
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"\
-
+
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
@@ -857,7 +857,7 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
+ "movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
@@ -924,7 +924,7 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
+ "movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
@@ -1137,8 +1137,8 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
"packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movq %%mm6, 48+" #dst " \n\t"\
"movq %%mm6, 64+" #dst " \n\t"\
- "movq %%mm5, 80+" #dst " \n\t"
-
+ "movq %%mm5, 80+" #dst " \n\t"
+
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
@@ -1214,7 +1214,7 @@ IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
-
+
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
@@ -1256,7 +1256,7 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
"movq %%mm0, 32+" #dst " \n\t"\
"movq %%mm4, 48+" #dst " \n\t"\
"movq %%mm4, 64+" #dst " \n\t"\
- "movq %%mm0, 80+" #dst " \n\t"
+ "movq %%mm0, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
@@ -1277,7 +1277,7 @@ Input
12 32 16 36 52 72 56 76
05 45 07 47 25 65 27 67
15 35 17 37 55 75 57 77
-
+
Temp
00 04 10 14 20 24 30 34
40 44 50 54 60 64 70 74
diff --git a/libavcodec/i386/vp3dsp_mmx.c b/libavcodec/i386/vp3dsp_mmx.c
index 3d220c1d45..4aa1a5f403 100644
--- a/libavcodec/i386/vp3dsp_mmx.c
+++ b/libavcodec/i386/vp3dsp_mmx.c
@@ -208,7 +208,7 @@ static const uint16_t idct_cosine_table[7] = {
I(1) = d1 c1 b1 a1
I(2) = d2 c2 b2 a2
I(3) = d3 c3 b3 a3
-
+
J(4) = h0 g0 f0 e0
J(5) = h1 g1 f1 e1
J(6) = h2 g2 f2 e2
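Note: both VP3 IDCT implementations keep their cosine constants in 16-bit fixed point. The seven distinct values in SSE2_idct_data in the next hunk (each repeated eight times so a full XMM register holds one constant) match cos(k*pi/16) * 65536 for k = 1..7, rounded or, for the last entry, truncated to an integer. A stand-alone check, illustrative only, that prints the table values next to the exact products:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* The seven distinct constants from SSE2_idct_data, one per row. */
        static const unsigned short tab[7] =
            { 64277, 60547, 54491, 46341, 36410, 25080, 12785 };
        int k;
        for (k = 1; k <= 7; k++)   /* compare against cos(k*pi/16) * 2^16 */
            printf("k=%d  table=%5u  exact=%9.2f\n",
                   k, (unsigned)tab[k - 1], cos(k * M_PI / 16.0) * 65536.0);
        return 0;
    }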
diff --git a/libavcodec/i386/vp3dsp_sse2.c b/libavcodec/i386/vp3dsp_sse2.c
index ed17891bfa..fcc511b651 100644
--- a/libavcodec/i386/vp3dsp_sse2.c
+++ b/libavcodec/i386/vp3dsp_sse2.c
@@ -36,21 +36,21 @@ static const unsigned short __align16 SSE2_dequant_const[] =
};
static const unsigned int __align16 eight_data[] =
-{
- 0x00080008,
+{
+ 0x00080008,
+ 0x00080008,
0x00080008,
- 0x00080008,
- 0x00080008
-};
+ 0x00080008
+};
static const unsigned short __align16 SSE2_idct_data[7 * 8] =
{
- 64277,64277,64277,64277,64277,64277,64277,64277,
- 60547,60547,60547,60547,60547,60547,60547,60547,
- 54491,54491,54491,54491,54491,54491,54491,54491,
- 46341,46341,46341,46341,46341,46341,46341,46341,
- 36410,36410,36410,36410,36410,36410,36410,36410,
- 25080,25080,25080,25080,25080,25080,25080,25080,
+ 64277,64277,64277,64277,64277,64277,64277,64277,
+ 60547,60547,60547,60547,60547,60547,60547,60547,
+ 54491,54491,54491,54491,54491,54491,54491,54491,
+ 46341,46341,46341,46341,46341,46341,46341,46341,
+ 36410,36410,36410,36410,36410,36410,36410,36410,
+ 25080,25080,25080,25080,25080,25080,25080,25080,
12785,12785,12785,12785,12785,12785,12785,12785
};
@@ -820,6 +820,6 @@ void ff_vp3_idct_sse2(int16_t *input_data)
SSE2_Row_IDCT();
SSE2_Transpose();
-
+
SSE2_Column_IDCT();
}
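Note: the final hunk touches ff_vp3_idct_sse2, which, judging by the macro names, computes the 8x8 IDCT separably: a 1-D pass over the rows (SSE2_Row_IDCT), a transpose (SSE2_Transpose), then a 1-D pass over what are now the columns (SSE2_Column_IDCT). A scalar outline of that row/transpose/column structure, with a deliberately naive floating-point 1-D stage rather than the fixed-point SSE2 math; idct_1d and idct_2d are illustrative names:

    #include <math.h>
    #include <string.h>

    /* Naive orthonormal 8-point 1-D IDCT (DCT-III), for illustration. */
    static void idct_1d(float v[8])
    {
        float out[8];
        int x, u;
        for (x = 0; x < 8; x++) {
            float s = 0.0f;
            for (u = 0; u < 8; u++) {
                float cu = u ? 1.0f : (float)M_SQRT1_2;
                s += cu * v[u] * (float)cos((2 * x + 1) * u * M_PI / 16.0);
            }
            out[x] = s / 2.0f;
        }
        memcpy(v, out, sizeof(out));
    }

    /* Row pass, transpose, column pass - the same overall structure as
     * SSE2_Row_IDCT() / SSE2_Transpose() / SSE2_Column_IDCT() above. */
    static void idct_2d(float block[64])
    {
        float t[64];
        int i, j;
        for (i = 0; i < 8; i++)          /* 1-D IDCT on each row */
            idct_1d(block + 8 * i);
        for (i = 0; i < 8; i++)          /* transpose the block */
            for (j = 0; j < 8; j++)
                t[8 * j + i] = block[8 * i + j];
        memcpy(block, t, sizeof(t));
        for (i = 0; i < 8; i++)          /* rows are now the original columns */
            idct_1d(block + 8 * i);
    }

Working row-wise in both passes keeps every load and store contiguous, which is what makes the transpose worth its cost in the SIMD version.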