diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2001-11-04 23:04:26 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2001-11-04 23:04:26 +0000 |
commit | 9b2c28e6edbb43e00e0b2d99b95567189cd46e91 (patch) | |
tree | 25bb077af644845b0666c20a0e5b533065ffa8af | |
parent | 9ce6584e496b5a4a33edaefe32921d6eb0e56f37 (diff) | |
download | ffmpeg-9b2c28e6edbb43e00e0b2d99b95567189cd46e91.tar.gz |
faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
Originally committed as revision 2699 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
-rw-r--r-- | postproc/rgb2rgb.c | 31 | ||||
-rw-r--r-- | postproc/rgb2rgb_template.c | 31 |
2 files changed, 42 insertions, 20 deletions
diff --git a/postproc/rgb2rgb.c b/postproc/rgb2rgb.c index 923837a134..7e978fd54f 100644 --- a/postproc/rgb2rgb.c +++ b/postproc/rgb2rgb.c @@ -14,6 +14,7 @@ static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFU static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ +static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; #endif void rgb24to32(const uint8_t *src,uint8_t *dst,uint32_t src_size) @@ -119,6 +120,7 @@ void rgb32to24(const uint8_t *src,uint8_t *dst,uint32_t src_size) Original by Strepto/Astral ported to gcc & bugfixed : A'rpi MMX2, 3DNOW optimization by Nick Kurshev + 32bit c version, and and&add trick by Michael Niedermayer */ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) { @@ -126,11 +128,10 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) register const char* s=src+src_size; register char* d=dst+src_size; register int offs=-src_size; - __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)):"memory"); + __asm __volatile(PREFETCH" %0"::"m"(*(s+offs))); __asm __volatile( "movq %0, %%mm4\n\t" - "movq %1, %%mm5" - ::"m"(mask15b), "m"(mask15rg):"memory"); + ::"m"(mask15s)); while(offs<0) { __asm __volatile( @@ -140,23 +141,20 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) "movq %%mm0, %%mm1\n\t" "movq %%mm2, %%mm3\n\t" "pand %%mm4, %%mm0\n\t" - "pand %%mm5, %%mm1\n\t" "pand %%mm4, %%mm2\n\t" - "pand %%mm5, %%mm3\n\t" - "psllq $1, %%mm1\n\t" - "psllq $1, %%mm3\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm3, %%mm2\n\t" + "paddw %%mm1, %%mm0\n\t" + "paddw %%mm3, %%mm2\n\t" MOVNTQ" %%mm0, %0\n\t" MOVNTQ" %%mm2, 8%0" :"=m"(*(d+offs)) :"m"(*(s+offs)) - :"memory"); + ); offs+=16; } __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); #else +#if 0 const uint16_t *s1=( uint16_t * )src; uint16_t *d1=( uint16_t * )dst; uint16_t *e=((uint8_t *)s1)+src_size; @@ -168,6 +166,19 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) 00000000000001 1111=0x001F */ *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 ); } +#else + const uint32_t *s1=( uint32_t * )src; + uint32_t *d1=( uint32_t * )dst; + int i; + int size= src_size>>2; + for(i=0; i<size; i++) + { + register int x= s1[i]; +// d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true + d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0); + + } +#endif #endif } diff --git a/postproc/rgb2rgb_template.c b/postproc/rgb2rgb_template.c index 923837a134..7e978fd54f 100644 --- a/postproc/rgb2rgb_template.c +++ b/postproc/rgb2rgb_template.c @@ -14,6 +14,7 @@ static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFU static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ +static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; #endif void rgb24to32(const uint8_t *src,uint8_t *dst,uint32_t src_size) @@ -119,6 +120,7 @@ void rgb32to24(const uint8_t *src,uint8_t *dst,uint32_t src_size) Original by Strepto/Astral ported to gcc & bugfixed : A'rpi MMX2, 3DNOW optimization by Nick Kurshev + 32bit c version, and and&add trick by Michael Niedermayer */ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) { @@ -126,11 +128,10 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) register const char* s=src+src_size; register char* d=dst+src_size; register int offs=-src_size; - __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)):"memory"); + __asm __volatile(PREFETCH" %0"::"m"(*(s+offs))); __asm __volatile( "movq %0, %%mm4\n\t" - "movq %1, %%mm5" - ::"m"(mask15b), "m"(mask15rg):"memory"); + ::"m"(mask15s)); while(offs<0) { __asm __volatile( @@ -140,23 +141,20 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) "movq %%mm0, %%mm1\n\t" "movq %%mm2, %%mm3\n\t" "pand %%mm4, %%mm0\n\t" - "pand %%mm5, %%mm1\n\t" "pand %%mm4, %%mm2\n\t" - "pand %%mm5, %%mm3\n\t" - "psllq $1, %%mm1\n\t" - "psllq $1, %%mm3\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm3, %%mm2\n\t" + "paddw %%mm1, %%mm0\n\t" + "paddw %%mm3, %%mm2\n\t" MOVNTQ" %%mm0, %0\n\t" MOVNTQ" %%mm2, 8%0" :"=m"(*(d+offs)) :"m"(*(s+offs)) - :"memory"); + ); offs+=16; } __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); #else +#if 0 const uint16_t *s1=( uint16_t * )src; uint16_t *d1=( uint16_t * )dst; uint16_t *e=((uint8_t *)s1)+src_size; @@ -168,6 +166,19 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size) 00000000000001 1111=0x001F */ *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 ); } +#else + const uint32_t *s1=( uint32_t * )src; + uint32_t *d1=( uint32_t * )dst; + int i; + int size= src_size>>2; + for(i=0; i<size; i++) + { + register int x= s1[i]; +// d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true + d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0); + + } +#endif #endif } |