diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2003-02-18 19:22:34 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2003-02-18 19:22:34 +0000 |
commit | aeae5d537b35356a783e156fb218eb161d7eb93e (patch) | |
tree | 32f3c8a2567d78eecdc16a2b267cc23535b835fc /postproc/rgb2rgb_template.c | |
parent | 64094f3799921698a5d16f8941aa91f7f9ee3574 (diff) | |
download | ffmpeg-aeae5d537b35356a783e156fb218eb161d7eb93e.tar.gz |
optimize
Originally committed as revision 9455 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
Diffstat (limited to 'postproc/rgb2rgb_template.c')
-rw-r--r-- | postproc/rgb2rgb_template.c | 74 |
1 files changed, 72 insertions, 2 deletions
diff --git a/postproc/rgb2rgb_template.c b/postproc/rgb2rgb_template.c index e299b0c12e..01ba6ed6f2 100644 --- a/postproc/rgb2rgb_template.c +++ b/postproc/rgb2rgb_template.c @@ -318,12 +318,46 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned uint16_t *d = (uint16_t *)dst; end = s + src_size; #ifdef HAVE_MMX + mm_end = end - 15; +#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) + asm volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + ".balign 16 \n\t" + "1: \n\t" + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $5, %%mm0 \n\t" + "pslld $11, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "addl $16, %1 \n\t" + "addl $8, %0 \n\t" + "cmpl %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) + ); +#else __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 15; while(s < mm_end) { __asm __volatile( @@ -359,6 +393,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 16; } +#endif __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); #endif @@ -441,12 +476,46 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned uint16_t *d = (uint16_t *)dst; end = s + src_size; #ifdef HAVE_MMX + mm_end = end - 15; +#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) + asm volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + ".balign 16 \n\t" + "1: \n\t" + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $6, %%mm0 \n\t" + "pslld $10, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "addl $16, %1 \n\t" + "addl $8, %0 \n\t" + "cmpl %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) + ); +#else __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 15; while(s < mm_end) { __asm __volatile( @@ -482,6 +551,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 16; } +#endif __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); #endif |