author     Diego Biurrun <diego@biurrun.de>    2007-04-28 11:44:49 +0000
committer  Diego Biurrun <diego@biurrun.de>    2007-04-28 11:44:49 +0000
commit     6e42e6c4b410dbef8b593c2d796a5dad95f89ee4
tree       a85ea55f0fcc2a85fb2001b954df839cd5b715d1
parent     1da87823db5dbeed72a50fd0194f9397052a5130
cosmetics attack, part I: Remove all tabs and prettyprint/reindent the code.
Originally committed as revision 23158 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
 libswscale/cs_test.c                  |    6
 libswscale/rgb2rgb.c                  |  609
 libswscale/rgb2rgb.h                  |   69
 libswscale/rgb2rgb_template.c         | 4619
 libswscale/swscale_altivec_template.c |  852
 5 files changed, 3074 insertions(+), 3081 deletions(-)
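The patch below is whitespace-only: hard tabs are expanded and function bodies are reindented to four spaces, with no functional change. For orientation, this is what the CPU-dispatch helper in rgb2rgb.c looks like after the reindent (an excerpt reconstructed from the diff for readability, not code added by the commit; the rgb2rgb_init_* functions are defined elsewhere in the file via the template mechanism):

void sws_rgb2rgb_init(int flags){
#if (defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX)) && defined(CONFIG_GPL)
    if (flags & SWS_CPU_CAPS_MMX2)
        rgb2rgb_init_MMX2();
    else if (flags & SWS_CPU_CAPS_3DNOW)
        rgb2rgb_init_3DNOW();
    else if (flags & SWS_CPU_CAPS_MMX)
        rgb2rgb_init_MMX();
    else
#endif /* defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX) */
        rgb2rgb_init_C();
}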
diff --git a/libswscale/cs_test.c b/libswscale/cs_test.c index 552fafaf6a..876f270411 100644 --- a/libswscale/cs_test.c +++ b/libswscale/cs_test.c @@ -142,7 +142,7 @@ int main(int argc, char **argv) for(i=0; i<SIZE; i++){ if(srcBuffer[i]!=srcByte){ av_log(NULL, AV_LOG_INFO, "src damaged at %d w:%d src:%d dst:%d %s\n", - i, width, srcOffset, dstOffset, name); + i, width, srcOffset, dstOffset, name); failed=1; break; } @@ -150,7 +150,7 @@ int main(int argc, char **argv) for(i=0; i<dstOffset; i++){ if(dstBuffer[i]!=dstByte){ av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n", - i, width, srcOffset, dstOffset, name); + i, width, srcOffset, dstOffset, name); failed=1; break; } @@ -158,7 +158,7 @@ int main(int argc, char **argv) for(i=dstOffset + width*dstBpp; i<SIZE; i++){ if(dstBuffer[i]!=dstByte){ av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n", - i, width, srcOffset, dstOffset, name); + i, width, srcOffset, dstOffset, name); failed=1; break; } diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c index 9eadbe095c..d363a2857b 100644 --- a/libswscale/rgb2rgb.c +++ b/libswscale/rgb2rgb.c @@ -1,5 +1,4 @@ /* - * * rgb2rgb.c, Software RGB to RGB convertor * pluralize by Software PAL8 to RGB convertor * Software YUV to YUV convertor @@ -61,83 +60,83 @@ void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size); void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size); void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride); + long width, long height, + long lumStride, long chromStride, long dstStride); void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride); + long width, long height, + long lumStride, long chromStride, long dstStride); void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride); + long width, long height, + long lumStride, long chromStride, long dstStride); void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, - long lumStride, long chromStride, long srcStride); + long width, long height, + long lumStride, long chromStride, long srcStride); void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, - long lumStride, long chromStride, long srcStride); + long width, long height, + long lumStride, long chromStride, long srcStride); void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height, - long srcStride, long dstStride); + long srcStride, long dstStride); void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst, - long width, long height, long src1Stride, - long src2Stride, long dstStride); + long width, long height, long src1Stride, + long src2Stride, long dstStride); void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, - uint8_t *dst1, uint8_t *dst2, - long width, long height, - long srcStride1, long srcStride2, - long dstStride1, long dstStride2); + uint8_t *dst1, uint8_t *dst2, + long width, long height, + long srcStride1, long srcStride2, + long dstStride1, long dstStride2); void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, - uint8_t *dst, - long width, long height, - long 
srcStride1, long srcStride2, - long srcStride3, long dstStride); + uint8_t *dst, + long width, long height, + long srcStride1, long srcStride2, + long srcStride3, long dstStride); #if defined(ARCH_X86) && defined(CONFIG_GPL) -static const uint64_t mmx_null __attribute__((aligned(8))) = 0x0000000000000000ULL; -static const uint64_t mmx_one __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL; -static const uint64_t mask32b attribute_used __attribute__((aligned(8))) = 0x000000FF000000FFULL; -static const uint64_t mask32g attribute_used __attribute__((aligned(8))) = 0x0000FF000000FF00ULL; -static const uint64_t mask32r attribute_used __attribute__((aligned(8))) = 0x00FF000000FF0000ULL; -static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL; -static const uint64_t mask3216br __attribute__((aligned(8)))=0x00F800F800F800F8ULL; -static const uint64_t mask3216g __attribute__((aligned(8)))=0x0000FC000000FC00ULL; -static const uint64_t mask3215g __attribute__((aligned(8)))=0x0000F8000000F800ULL; -static const uint64_t mul3216 __attribute__((aligned(8))) = 0x2000000420000004ULL; -static const uint64_t mul3215 __attribute__((aligned(8))) = 0x2000000820000008ULL; -static const uint64_t mask24b attribute_used __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL; -static const uint64_t mask24g attribute_used __attribute__((aligned(8))) = 0xFF0000FF0000FF00ULL; -static const uint64_t mask24r attribute_used __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL; -static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; -static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; -static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL; -static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL; -static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL; -static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ -static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ -static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; -static const uint64_t mask15g __attribute__((aligned(8))) = 0x03E003E003E003E0ULL; -static const uint64_t mask15r __attribute__((aligned(8))) = 0x7C007C007C007C00ULL; +static const uint64_t mmx_null __attribute__((aligned(8))) = 0x0000000000000000ULL; +static const uint64_t mmx_one __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL; +static const uint64_t mask32b attribute_used __attribute__((aligned(8))) = 0x000000FF000000FFULL; +static const uint64_t mask32g attribute_used __attribute__((aligned(8))) = 0x0000FF000000FF00ULL; +static const uint64_t mask32r attribute_used __attribute__((aligned(8))) = 0x00FF000000FF0000ULL; +static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL; +static const uint64_t mask3216br __attribute__((aligned(8))) = 0x00F800F800F800F8ULL; +static const uint64_t mask3216g __attribute__((aligned(8))) = 0x0000FC000000FC00ULL; +static const uint64_t mask3215g __attribute__((aligned(8))) = 0x0000F8000000F800ULL; +static const uint64_t mul3216 __attribute__((aligned(8))) = 0x2000000420000004ULL; +static const uint64_t mul3215 __attribute__((aligned(8))) = 0x2000000820000008ULL; +static const uint64_t mask24b attribute_used __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL; +static const uint64_t mask24g attribute_used __attribute__((aligned(8))) = 
0xFF0000FF0000FF00ULL; +static const uint64_t mask24r attribute_used __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL; +static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; +static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; +static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL; +static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL; +static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL; +static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ +static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ +static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; +static const uint64_t mask15g __attribute__((aligned(8))) = 0x03E003E003E003E0ULL; +static const uint64_t mask15r __attribute__((aligned(8))) = 0x7C007C007C007C00ULL; #define mask16b mask15b -static const uint64_t mask16g __attribute__((aligned(8))) = 0x07E007E007E007E0ULL; -static const uint64_t mask16r __attribute__((aligned(8))) = 0xF800F800F800F800ULL; -static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL; -static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL; -static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; -static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c0000007c00ULL; -static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000003e0ULL; -static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; +static const uint64_t mask16g __attribute__((aligned(8))) = 0x07E007E007E007E0ULL; +static const uint64_t mask16r __attribute__((aligned(8))) = 0xF800F800F800F800ULL; +static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL; +static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL; +static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; +static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c0000007c00ULL; +static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000003e0ULL; +static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; #ifdef FAST_BGR2YV12 -static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000000210041000DULL; -static const uint64_t bgr2UCoeff attribute_used __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL; -static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL; +static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000000210041000DULL; +static const uint64_t bgr2UCoeff attribute_used __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL; +static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL; #else -static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000020E540830C8BULL; -static const uint64_t bgr2UCoeff attribute_used __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL; -static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL; +static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000020E540830C8BULL; +static const uint64_t bgr2UCoeff attribute_used 
__attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL; +static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL; #endif -static const uint64_t bgr2YOffset attribute_used __attribute__((aligned(8))) = 0x1010101010101010ULL; -static const uint64_t bgr2UVOffset attribute_used __attribute__((aligned(8)))= 0x8080808080808080ULL; -static const uint64_t w1111 attribute_used __attribute__((aligned(8))) = 0x0001000100010001ULL; +static const uint64_t bgr2YOffset attribute_used __attribute__((aligned(8))) = 0x1010101010101010ULL; +static const uint64_t bgr2UVOffset attribute_used __attribute__((aligned(8))) = 0x8080808080808080ULL; +static const uint64_t w1111 attribute_used __attribute__((aligned(8))) = 0x0001000100010001ULL; #if 0 static volatile uint64_t __attribute__((aligned(8))) b5Dither; @@ -146,12 +145,12 @@ static volatile uint64_t __attribute__((aligned(8))) g6Dither; static volatile uint64_t __attribute__((aligned(8))) r5Dither; static uint64_t __attribute__((aligned(8))) dither4[2]={ - 0x0103010301030103LL, - 0x0200020002000200LL,}; + 0x0103010301030103LL, + 0x0200020002000200LL,}; static uint64_t __attribute__((aligned(8))) dither8[2]={ - 0x0602060206020602LL, - 0x0004000400040004LL,}; + 0x0602060206020602LL, + 0x0004000400040004LL,}; #endif #endif /* defined(ARCH_X86) */ @@ -215,15 +214,15 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={ void sws_rgb2rgb_init(int flags){ #if (defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX)) && defined(CONFIG_GPL) - if(flags & SWS_CPU_CAPS_MMX2) - rgb2rgb_init_MMX2(); - else if(flags & SWS_CPU_CAPS_3DNOW) - rgb2rgb_init_3DNOW(); - else if(flags & SWS_CPU_CAPS_MMX) - rgb2rgb_init_MMX(); - else + if (flags & SWS_CPU_CAPS_MMX2) + rgb2rgb_init_MMX2(); + else if (flags & SWS_CPU_CAPS_3DNOW) + rgb2rgb_init_3DNOW(); + else if (flags & SWS_CPU_CAPS_MMX) + rgb2rgb_init_MMX(); + else #endif /* defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX) */ - rgb2rgb_init_C(); + rgb2rgb_init_C(); } /** @@ -231,49 +230,49 @@ void sws_rgb2rgb_init(int flags){ */ void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; + long i; /* - for(i=0; i<num_pixels; i++) - ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ]; + for (i=0; i<num_pixels; i++) + ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ]; */ - for(i=0; i<num_pixels; i++) - { - #ifdef WORDS_BIGENDIAN - dst[3]= palette[ src[i]*4+2 ]; - dst[2]= palette[ src[i]*4+1 ]; - dst[1]= palette[ src[i]*4+0 ]; - #else - //FIXME slow? - dst[0]= palette[ src[i]*4+2 ]; - dst[1]= palette[ src[i]*4+1 ]; - dst[2]= palette[ src[i]*4+0 ]; - //dst[3]= 0; /* do we need this cleansing? */ - #endif - dst+= 4; - } + for (i=0; i<num_pixels; i++) + { + #ifdef WORDS_BIGENDIAN + dst[3]= palette[ src[i]*4+2 ]; + dst[2]= palette[ src[i]*4+1 ]; + dst[1]= palette[ src[i]*4+0 ]; + #else + //FIXME slow? + dst[0]= palette[ src[i]*4+2 ]; + dst[1]= palette[ src[i]*4+1 ]; + dst[2]= palette[ src[i]*4+0 ]; + //dst[3]= 0; /* do we need this cleansing? */ + #endif + dst+= 4; + } } void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; - for(i=0; i<num_pixels; i++) - { - #ifdef WORDS_BIGENDIAN - dst[3]= palette[ src[i]*4+0 ]; - dst[2]= palette[ src[i]*4+1 ]; - dst[1]= palette[ src[i]*4+2 ]; - #else - //FIXME slow? - dst[0]= palette[ src[i]*4+0 ]; - dst[1]= palette[ src[i]*4+1 ]; - dst[2]= palette[ src[i]*4+2 ]; - //dst[3]= 0; /* do we need this cleansing? 
*/ - #endif - - dst+= 4; - } + long i; + for (i=0; i<num_pixels; i++) + { + #ifdef WORDS_BIGENDIAN + dst[3]= palette[ src[i]*4+0 ]; + dst[2]= palette[ src[i]*4+1 ]; + dst[1]= palette[ src[i]*4+2 ]; + #else + //FIXME slow? + dst[0]= palette[ src[i]*4+0 ]; + dst[1]= palette[ src[i]*4+1 ]; + dst[2]= palette[ src[i]*4+2 ]; + //dst[3]= 0; /* do we need this cleansing? */ + #endif + + dst+= 4; + } } /** @@ -281,38 +280,38 @@ void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const ui */ void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; + long i; /* - writes 1 byte o much and might cause alignment issues on some architectures? - for(i=0; i<num_pixels; i++) - ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ]; + writes 1 byte o much and might cause alignment issues on some architectures? + for (i=0; i<num_pixels; i++) + ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ]; */ - for(i=0; i<num_pixels; i++) - { - //FIXME slow? - dst[0]= palette[ src[i]*4+2 ]; - dst[1]= palette[ src[i]*4+1 ]; - dst[2]= palette[ src[i]*4+0 ]; - dst+= 3; - } + for (i=0; i<num_pixels; i++) + { + //FIXME slow? + dst[0]= palette[ src[i]*4+2 ]; + dst[1]= palette[ src[i]*4+1 ]; + dst[2]= palette[ src[i]*4+0 ]; + dst+= 3; + } } void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; + long i; /* - writes 1 byte o much and might cause alignment issues on some architectures? - for(i=0; i<num_pixels; i++) - ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ]; + writes 1 byte o much and might cause alignment issues on some architectures? + for (i=0; i<num_pixels; i++) + ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ]; */ - for(i=0; i<num_pixels; i++) - { - //FIXME slow? - dst[0]= palette[ src[i]*4+0 ]; - dst[1]= palette[ src[i]*4+1 ]; - dst[2]= palette[ src[i]*4+2 ]; - dst+= 3; - } + for (i=0; i<num_pixels; i++) + { + //FIXME slow? 
+ dst[0]= palette[ src[i]*4+0 ]; + dst[1]= palette[ src[i]*4+1 ]; + dst[2]= palette[ src[i]*4+2 ]; + dst+= 3; + } } /** @@ -320,15 +319,15 @@ void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui */ void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; - for(i=0; i<num_pixels; i++) - ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ]; + long i; + for (i=0; i<num_pixels; i++) + ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ]; } void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; - for(i=0; i<num_pixels; i++) - ((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]); + long i; + for (i=0; i<num_pixels; i++) + ((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]); } /** @@ -336,216 +335,216 @@ void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const ui */ void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; - for(i=0; i<num_pixels; i++) - ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ]; + long i; + for (i=0; i<num_pixels; i++) + ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ]; } void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) { - long i; - for(i=0; i<num_pixels; i++) - ((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]); + long i; + for (i=0; i<num_pixels; i++) + ((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]); } void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size) { - long i; - long num_pixels = src_size >> 2; - for(i=0; i<num_pixels; i++) - { - #ifdef WORDS_BIGENDIAN - /* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */ - dst[3*i + 0] = src[4*i + 1]; - dst[3*i + 1] = src[4*i + 2]; - dst[3*i + 2] = src[4*i + 3]; - #else - dst[3*i + 0] = src[4*i + 2]; - dst[3*i + 1] = src[4*i + 1]; - dst[3*i + 2] = src[4*i + 0]; - #endif - } + long i; + long num_pixels = src_size >> 2; + for (i=0; i<num_pixels; i++) + { + #ifdef WORDS_BIGENDIAN + /* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */ + dst[3*i + 0] = src[4*i + 1]; + dst[3*i + 1] = src[4*i + 2]; + dst[3*i + 2] = src[4*i + 3]; + #else + dst[3*i + 0] = src[4*i + 2]; + dst[3*i + 1] = src[4*i + 1]; + dst[3*i + 2] = src[4*i + 0]; + #endif + } } void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size) { - long i; - for(i=0; 3*i<src_size; i++) - { - #ifdef WORDS_BIGENDIAN - /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */ - dst[4*i + 0] = 0; - dst[4*i + 1] = src[3*i + 0]; - dst[4*i + 2] = src[3*i + 1]; - dst[4*i + 3] = src[3*i + 2]; - #else - dst[4*i + 0] = src[3*i + 2]; - dst[4*i + 1] = src[3*i + 1]; - dst[4*i + 2] = src[3*i + 0]; - dst[4*i + 3] = 0; - #endif - } + long i; + for (i=0; 3*i<src_size; i++) + { + #ifdef WORDS_BIGENDIAN + /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */ + dst[4*i + 0] = 0; + dst[4*i + 1] = src[3*i + 0]; + dst[4*i + 2] = src[3*i + 1]; + dst[4*i + 3] = src[3*i + 2]; + #else + dst[4*i + 0] = src[3*i + 2]; + dst[4*i + 1] = src[3*i + 1]; + dst[4*i + 2] = src[3*i + 0]; + dst[4*i + 3] = 0; + #endif + } } void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (uint16_t *)src; - end = s + src_size/2; - while(s < end) - { - register uint16_t bgr; - bgr = *s++; - #ifdef WORDS_BIGENDIAN - *d++ = 0; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; - #else - *d++ = (bgr&0xF800)>>8; - *d++ = 
(bgr&0x7E0)>>3; - *d++ = (bgr&0x1F)<<3; - *d++ = 0; - #endif - } + const uint16_t *end; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (uint16_t *)src; + end = s + src_size/2; + while (s < end) + { + register uint16_t bgr; + bgr = *s++; + #ifdef WORDS_BIGENDIAN + *d++ = 0; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x7E0)>>3; + *d++ = (bgr&0xF800)>>8; + #else + *d++ = (bgr&0xF800)>>8; + *d++ = (bgr&0x7E0)>>3; + *d++ = (bgr&0x1F)<<3; + *d++ = 0; + #endif + } } void rgb16tobgr24(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (const uint16_t *)src; - end = s + src_size/2; - while(s < end) - { - register uint16_t bgr; - bgr = *s++; - *d++ = (bgr&0xF800)>>8; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0x1F)<<3; - } + const uint16_t *end; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; + while (s < end) + { + register uint16_t bgr; + bgr = *s++; + *d++ = (bgr&0xF800)>>8; + *d++ = (bgr&0x7E0)>>3; + *d++ = (bgr&0x1F)<<3; + } } void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size) { - long i; - long num_pixels = src_size >> 1; - - for(i=0; i<num_pixels; i++) - { - unsigned b,g,r; - register uint16_t rgb; - rgb = src[2*i]; - r = rgb&0x1F; - g = (rgb&0x7E0)>>5; - b = (rgb&0xF800)>>11; - dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11); - } + long i; + long num_pixels = src_size >> 1; + + for (i=0; i<num_pixels; i++) + { + unsigned b,g,r; + register uint16_t rgb; + rgb = src[2*i]; + r = rgb&0x1F; + g = (rgb&0x7E0)>>5; + b = (rgb&0xF800)>>11; + dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11); + } } void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size) { - long i; - long num_pixels = src_size >> 1; - - for(i=0; i<num_pixels; i++) - { - unsigned b,g,r; - register uint16_t rgb; - rgb = src[2*i]; - r = rgb&0x1F; - g = (rgb&0x7E0)>>5; - b = (rgb&0xF800)>>11; - dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10); - } + long i; + long num_pixels = src_size >> 1; + + for (i=0; i<num_pixels; i++) + { + unsigned b,g,r; + register uint16_t rgb; + rgb = src[2*i]; + r = rgb&0x1F; + g = (rgb&0x7E0)>>5; + b = (rgb&0xF800)>>11; + dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10); + } } void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (const uint16_t *)src; - end = s + src_size/2; - while(s < end) - { - register uint16_t bgr; - bgr = *s++; - #ifdef WORDS_BIGENDIAN - *d++ = 0; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; - #else - *d++ = (bgr&0x7C00)>>7; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x1F)<<3; - *d++ = 0; - #endif - } + const uint16_t *end; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; + while (s < end) + { + register uint16_t bgr; + bgr = *s++; + #ifdef WORDS_BIGENDIAN + *d++ = 0; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x7C00)>>7; + #else + *d++ = (bgr&0x7C00)>>7; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x1F)<<3; + *d++ = 0; + #endif + } } void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (uint16_t *)src; - end = s + src_size/2; - while(s < end) - { - register uint16_t bgr; - bgr = *s++; - *d++ = (bgr&0x7C00)>>7; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x1F)<<3; - } + const uint16_t *end; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (uint16_t 
*)src; + end = s + src_size/2; + while (s < end) + { + register uint16_t bgr; + bgr = *s++; + *d++ = (bgr&0x7C00)>>7; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x1F)<<3; + } } void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size) { - long i; - long num_pixels = src_size >> 1; - - for(i=0; i<num_pixels; i++) - { - unsigned b,g,r; - register uint16_t rgb; - rgb = src[2*i]; - r = rgb&0x1F; - g = (rgb&0x3E0)>>5; - b = (rgb&0x7C00)>>10; - dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11); - } + long i; + long num_pixels = src_size >> 1; + + for (i=0; i<num_pixels; i++) + { + unsigned b,g,r; + register uint16_t rgb; + rgb = src[2*i]; + r = rgb&0x1F; + g = (rgb&0x3E0)>>5; + b = (rgb&0x7C00)>>10; + dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11); + } } void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size) { - long i; - long num_pixels = src_size >> 1; - - for(i=0; i<num_pixels; i++) - { - unsigned b,g,r; - register uint16_t rgb; - rgb = src[2*i]; - r = rgb&0x1F; - g = (rgb&0x3E0)>>5; - b = (rgb&0x7C00)>>10; - dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10); - } + long i; + long num_pixels = src_size >> 1; + + for (i=0; i<num_pixels; i++) + { + unsigned b,g,r; + register uint16_t rgb; + rgb = src[2*i]; + r = rgb&0x1F; + g = (rgb&0x3E0)>>5; + b = (rgb&0x7C00)>>10; + dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10); + } } void rgb8tobgr8(const uint8_t *src, uint8_t *dst, long src_size) { - long i; - long num_pixels = src_size; - for(i=0; i<num_pixels; i++) - { - unsigned b,g,r; - register uint8_t rgb; - rgb = src[i]; - r = (rgb&0x07); - g = (rgb&0x38)>>3; - b = (rgb&0xC0)>>6; - dst[i] = ((b<<1)&0x07) | ((g&0x07)<<3) | ((r&0x03)<<6); - } + long i; + long num_pixels = src_size; + for (i=0; i<num_pixels; i++) + { + unsigned b,g,r; + register uint8_t rgb; + rgb = src[i]; + r = (rgb&0x07); + g = (rgb&0x38)>>3; + b = (rgb&0xC0)>>6; + dst[i] = ((b<<1)&0x07) | ((g&0x07)<<3) | ((r&0x03)<<6); + } } diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h index 769811fb8f..5ad3ec71b4 100644 --- a/libswscale/rgb2rgb.h +++ b/libswscale/rgb2rgb.h @@ -1,5 +1,4 @@ /* - * * rgb2rgb.h, Software RGB to RGB convertor * pluralize by Software PAL8 to RGB convertor * Software YUV to YUV convertor @@ -30,18 +29,18 @@ // Note: do not fix the dependence on stdio.h /* A full collection of rgb to rgb(bgr) convertors */ -extern void (*rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb24to16)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb24to15)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb32to16)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb32to15)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb15to24)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb15to32)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb16to24)(const uint8_t *src,uint8_t *dst,long src_size); -extern void (*rgb16to32)(const uint8_t *src,uint8_t *dst,long src_size); +extern void (*rgb24to32) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb32to24) (const uint8_t *src, uint8_t *dst, long 
src_size); +extern void (*rgb32to16) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb32to15) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb15to16) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb15to24) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb15to32) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb16to15) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb16to24) (const uint8_t *src, uint8_t *dst, long src_size); +extern void (*rgb16to32) (const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size); @@ -59,7 +58,7 @@ extern void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size); extern void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size); extern void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size); extern void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size); -extern void rgb8tobgr8(const uint8_t *src, uint8_t *dst, long src_size); +extern void rgb8tobgr8 (const uint8_t *src, uint8_t *dst, long src_size); extern void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); @@ -85,16 +84,16 @@ extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, c * problem for anyone then tell me, and ill fix it) */ extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride); + long width, long height, + long lumStride, long chromStride, long dstStride); /** * * width should be a multiple of 16 */ extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride); + long width, long height, + long lumStride, long chromStride, long dstStride); /** * @@ -102,8 +101,8 @@ extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uin * problem for anyone then tell me, and ill fix it) */ extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, - long lumStride, long chromStride, long srcStride); + long width, long height, + long lumStride, long chromStride, long srcStride); /** * @@ -111,8 +110,8 @@ extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint * problem for anyone then tell me, and ill fix it) */ extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride); + long width, long height, + long lumStride, long chromStride, long dstStride); /** * @@ -121,26 +120,26 @@ extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_ * chrominance data is only taken from every secound line others are ignored FIXME write HQ version */ extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, - long lumStride, long chromStride, long srcStride); + long width, long height, + long lumStride, long chromStride, long srcStride); extern void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height, - long 
srcStride, long dstStride); + long srcStride, long dstStride); extern void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst, - long width, long height, long src1Stride, - long src2Stride, long dstStride); + long width, long height, long src1Stride, + long src2Stride, long dstStride); extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, - uint8_t *dst1, uint8_t *dst2, - long width, long height, - long srcStride1, long srcStride2, - long dstStride1, long dstStride2); + uint8_t *dst1, uint8_t *dst2, + long width, long height, + long srcStride1, long srcStride2, + long dstStride1, long dstStride2); extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, - uint8_t *dst, - long width, long height, - long srcStride1, long srcStride2, - long srcStride3, long dstStride); + uint8_t *dst, + long width, long height, + long srcStride1, long srcStride2, + long srcStride3, long dstStride); void sws_rgb2rgb_init(int flags); diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c index c089e39fa7..6dab0aa539 100644 --- a/libswscale/rgb2rgb_template.c +++ b/libswscale/rgb2rgb_template.c @@ -1,5 +1,4 @@ /* - * * rgb2rgb.c, Software RGB to RGB convertor * pluralize by Software PAL8 to RGB convertor * Software YUV to YUV convertor @@ -53,11 +52,11 @@ #ifdef HAVE_3DNOW #define PREFETCH "prefetch" #define PREFETCHW "prefetchw" -#define PAVGB "pavgusb" +#define PAVGB "pavgusb" #elif defined ( HAVE_MMX2 ) #define PREFETCH "prefetchnta" #define PREFETCHW "prefetcht0" -#define PAVGB "pavgb" +#define PAVGB "pavgb" #else #ifdef __APPLE__ #define PREFETCH "#" @@ -85,148 +84,148 @@ static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size) { - uint8_t *dest = dst; - const uint8_t *s = src; - const uint8_t *end; -#ifdef HAVE_MMX - const uint8_t *mm_end; -#endif - end = s + src_size; -#ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 23; - __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "punpckldq 3%1, %%mm0\n\t" - "movd 6%1, %%mm1\n\t" - "punpckldq 9%1, %%mm1\n\t" - "movd 12%1, %%mm2\n\t" - "punpckldq 15%1, %%mm2\n\t" - "movd 18%1, %%mm3\n\t" - "punpckldq 21%1, %%mm3\n\t" - "pand %%mm7, %%mm0\n\t" - "pand %%mm7, %%mm1\n\t" - "pand %%mm7, %%mm2\n\t" - "pand %%mm7, %%mm3\n\t" - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm1, 8%0\n\t" - MOVNTQ" %%mm2, 16%0\n\t" - MOVNTQ" %%mm3, 24%0" - :"=m"(*dest) - :"m"(*s) - :"memory"); - dest += 32; - s += 24; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { -#ifdef WORDS_BIGENDIAN - /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ - *dest++ = 0; - *dest++ = s[2]; - *dest++ = s[1]; - *dest++ = s[0]; - s+=3; -#else - *dest++ = *s++; - *dest++ = *s++; - *dest++ = *s++; - *dest++ = 0; -#endif - } + uint8_t *dest = dst; + const uint8_t *s = src; + const uint8_t *end; + #ifdef HAVE_MMX + const uint8_t *mm_end; + #endif + end = s + src_size; + #ifdef HAVE_MMX + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 23; + __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "punpckldq 3%1, %%mm0 \n\t" + "movd 6%1, %%mm1 \n\t" + "punpckldq 9%1, %%mm1 \n\t" + "movd 12%1, %%mm2 \n\t" + "punpckldq 15%1, %%mm2 \n\t" + "movd 18%1, %%mm3 \n\t" + "punpckldq 21%1, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + 
"pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm3 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm1, 8%0 \n\t" + MOVNTQ" %%mm2, 16%0 \n\t" + MOVNTQ" %%mm3, 24%0" + :"=m"(*dest) + :"m"(*s) + :"memory"); + dest += 32; + s += 24; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); + #endif + while (s < end) + { + #ifdef WORDS_BIGENDIAN + /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ + *dest++ = 0; + *dest++ = s[2]; + *dest++ = s[1]; + *dest++ = s[0]; + s+=3; + #else + *dest++ = *s++; + *dest++ = *s++; + *dest++ = *s++; + *dest++ = 0; + #endif + } } static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size) { - uint8_t *dest = dst; - const uint8_t *s = src; - const uint8_t *end; + uint8_t *dest = dst; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - end = s + src_size; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 31; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq 8%1, %%mm1\n\t" - "movq 16%1, %%mm4\n\t" - "movq 24%1, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm6\n\t" - "movq %%mm5, %%mm7\n\t" - "psrlq $8, %%mm2\n\t" - "psrlq $8, %%mm3\n\t" - "psrlq $8, %%mm6\n\t" - "psrlq $8, %%mm7\n\t" - "pand %2, %%mm0\n\t" - "pand %2, %%mm1\n\t" - "pand %2, %%mm4\n\t" - "pand %2, %%mm5\n\t" - "pand %3, %%mm2\n\t" - "pand %3, %%mm3\n\t" - "pand %3, %%mm6\n\t" - "pand %3, %%mm7\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm3, %%mm1\n\t" - "por %%mm6, %%mm4\n\t" - "por %%mm7, %%mm5\n\t" - - "movq %%mm1, %%mm2\n\t" - "movq %%mm4, %%mm3\n\t" - "psllq $48, %%mm2\n\t" - "psllq $32, %%mm3\n\t" - "pand %4, %%mm2\n\t" - "pand %5, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "psrlq $16, %%mm1\n\t" - "psrlq $32, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm3, %%mm1\n\t" - "pand %6, %%mm5\n\t" - "por %%mm5, %%mm4\n\t" - - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm1, 8%0\n\t" - MOVNTQ" %%mm4, 16%0" - :"=m"(*dest) - :"m"(*s),"m"(mask24l), - "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) - :"memory"); - dest += 24; - s += 32; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 31; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq 8%1, %%mm1 \n\t" + "movq 16%1, %%mm4 \n\t" + "movq 24%1, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "movq %%mm4, %%mm6 \n\t" + "movq %%mm5, %%mm7 \n\t" + "psrlq $8, %%mm2 \n\t" + "psrlq $8, %%mm3 \n\t" + "psrlq $8, %%mm6 \n\t" + "psrlq $8, %%mm7 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm1 \n\t" + "pand %2, %%mm4 \n\t" + "pand %2, %%mm5 \n\t" + "pand %3, %%mm2 \n\t" + "pand %3, %%mm3 \n\t" + "pand %3, %%mm6 \n\t" + "pand %3, %%mm7 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm3, %%mm1 \n\t" + "por %%mm6, %%mm4 \n\t" + "por %%mm7, %%mm5 \n\t" + + "movq %%mm1, %%mm2 \n\t" + "movq %%mm4, %%mm3 \n\t" + "psllq $48, %%mm2 \n\t" + "psllq $32, %%mm3 \n\t" + "pand %4, %%mm2 \n\t" + "pand %5, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "psrlq $16, %%mm1 \n\t" + "psrlq $32, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm3, %%mm1 \n\t" + "pand %6, %%mm5 \n\t" + "por %%mm5, %%mm4 \n\t" + + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm1, 8%0 \n\t" + MOVNTQ" %%mm4, 16%0" + :"=m"(*dest) + :"m"(*s),"m"(mask24l), + 
"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) + :"memory"); + dest += 24; + s += 32; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { #ifdef WORDS_BIGENDIAN - /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */ - s++; - dest[2] = *s++; - dest[1] = *s++; - dest[0] = *s++; - dest += 3; + /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */ + s++; + dest[2] = *s++; + dest[1] = *s++; + dest[0] = *s++; + dest += 3; #else - *dest++ = *s++; - *dest++ = *s++; - *dest++ = *s++; - s++; + *dest++ = *s++; + *dest++ = *s++; + *dest++ = *s++; + s++; #endif - } + } } /* @@ -237,677 +236,677 @@ static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_si */ static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size) { - register const uint8_t* s=src; - register uint8_t* d=dst; - register const uint8_t *end; - const uint8_t *mm_end; - end = s + src_size; + register const uint8_t* s=src; + register uint8_t* d=dst; + register const uint8_t *end; + const uint8_t *mm_end; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s)); - __asm __volatile("movq %0, %%mm4"::"m"(mask15s)); - mm_end = end - 15; - while(s<mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq 8%1, %%mm2\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "pand %%mm4, %%mm0\n\t" - "pand %%mm4, %%mm2\n\t" - "paddw %%mm1, %%mm0\n\t" - "paddw %%mm3, %%mm2\n\t" - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm2, 8%0" - :"=m"(*d) - :"m"(*s) - ); - d+=16; - s+=16; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); + __asm __volatile(PREFETCH" %0"::"m"(*s)); + __asm __volatile("movq %0, %%mm4"::"m"(mask15s)); + mm_end = end - 15; + while (s<mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq 8%1, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "pand %%mm4, %%mm0 \n\t" + "pand %%mm4, %%mm2 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm3, %%mm2 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm2, 8%0" + :"=m"(*d) + :"m"(*s) + ); + d+=16; + s+=16; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); #endif mm_end = end - 3; - while(s < mm_end) + while (s < mm_end) { - register unsigned x= *((uint32_t *)s); - *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); - d+=4; - s+=4; + register unsigned x= *((uint32_t *)s); + *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); + d+=4; + s+=4; } - if(s < end) + if (s < end) { - register unsigned short x= *((uint16_t *)s); - *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); + register unsigned short x= *((uint16_t *)s); + *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); } } static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size) { - register const uint8_t* s=src; - register uint8_t* d=dst; - register const uint8_t *end; - const uint8_t *mm_end; - end = s + src_size; + register const uint8_t* s=src; + register uint8_t* d=dst; + register const uint8_t *end; + const uint8_t *mm_end; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s)); - __asm __volatile("movq %0, %%mm7"::"m"(mask15rg)); - __asm __volatile("movq %0, %%mm6"::"m"(mask15b)); - mm_end = end - 15; - while(s<mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq 8%1, %%mm2\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "psrlq $1, %%mm0\n\t" - "psrlq $1, %%mm2\n\t" - "pand %%mm7, %%mm0\n\t" - "pand %%mm7, %%mm2\n\t" 
- "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm3\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm3, %%mm2\n\t" - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm2, 8%0" - :"=m"(*d) - :"m"(*s) - ); - d+=16; - s+=16; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); + __asm __volatile(PREFETCH" %0"::"m"(*s)); + __asm __volatile("movq %0, %%mm7"::"m"(mask15rg)); + __asm __volatile("movq %0, %%mm6"::"m"(mask15b)); + mm_end = end - 15; + while (s<mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq 8%1, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlq $1, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm3 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm3, %%mm2 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm2, 8%0" + :"=m"(*d) + :"m"(*s) + ); + d+=16; + s+=16; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); #endif mm_end = end - 3; - while(s < mm_end) + while (s < mm_end) { - register uint32_t x= *((uint32_t *)s); - *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); - s+=4; - d+=4; + register uint32_t x= *((uint32_t *)s); + *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); + s+=4; + d+=4; } - if(s < end) + if (s < end) { - register uint16_t x= *((uint16_t *)s); - *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); - s+=2; - d+=2; + register uint16_t x= *((uint16_t *)s); + *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); + s+=2; + d+=2; } } static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - mm_end = end - 15; + mm_end = end - 15; #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) - asm volatile( - "movq %3, %%mm5 \n\t" - "movq %4, %%mm6 \n\t" - "movq %5, %%mm7 \n\t" - "jmp 2f \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 4(%1), %%mm3 \n\t" - "punpckldq 8(%1), %%mm0 \n\t" - "punpckldq 12(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pand %%mm6, %%mm0 \n\t" - "pand %%mm6, %%mm3 \n\t" - "pmaddwd %%mm7, %%mm0 \n\t" - "pmaddwd %%mm7, %%mm3 \n\t" - "pand %%mm5, %%mm1 \n\t" - "pand %%mm5, %%mm4 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "psrld $5, %%mm0 \n\t" - "pslld $11, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - "add $16, %1 \n\t" - "add $8, %0 \n\t" - "2: \n\t" - "cmp %2, %1 \n\t" - " jb 1b \n\t" - : "+r" (d), "+r"(s) - : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) - ); + asm volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + "jmp 2f \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $5, %%mm0 \n\t" + "pslld $11, %%mm3 \n\t" + "por 
%%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "add $16, %1 \n\t" + "add $8, %0 \n\t" + "2: \n\t" + "cmp %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) + ); #else - __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_16mask),"m"(green_16mask)); - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 4%1, %%mm3\n\t" - "punpckldq 8%1, %%mm0\n\t" - "punpckldq 12%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psrlq $3, %%mm0\n\t" - "psrlq $3, %%mm3\n\t" - "pand %2, %%mm0\n\t" - "pand %2, %%mm3\n\t" - "psrlq $5, %%mm1\n\t" - "psrlq $5, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $8, %%mm2\n\t" - "psrlq $8, %%mm5\n\t" - "pand %%mm7, %%mm2\n\t" - "pand %%mm7, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); - d += 4; - s += 16; - } -#endif - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - register int rgb = *(uint32_t*)s; s += 4; - *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_16mask),"m"(green_16mask)); + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 4%1, %%mm3 \n\t" + "punpckldq 8%1, %%mm0 \n\t" + "punpckldq 12%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psrlq $3, %%mm0 \n\t" + "psrlq $3, %%mm3 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm3 \n\t" + "psrlq $5, %%mm1 \n\t" + "psrlq $5, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $8, %%mm2 \n\t" + "psrlq $8, %%mm5 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); + d += 4; + s += 16; + } +#endif + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + register int rgb = *(uint32_t*)s; s += 4; + *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); + } } static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 15; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 4%1, %%mm3\n\t" - "punpckldq 8%1, %%mm0\n\t" - "punpckldq 12%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psllq $8, %%mm0\n\t" 
- "psllq $8, %%mm3\n\t" - "pand %%mm7, %%mm0\n\t" - "pand %%mm7, %%mm3\n\t" - "psrlq $5, %%mm1\n\t" - "psrlq $5, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $19, %%mm2\n\t" - "psrlq $19, %%mm5\n\t" - "pand %2, %%mm2\n\t" - "pand %2, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); - d += 4; - s += 16; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - register int rgb = *(uint32_t*)s; s += 4; - *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_16mask),"m"(green_16mask)); + mm_end = end - 15; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 4%1, %%mm3 \n\t" + "punpckldq 8%1, %%mm0 \n\t" + "punpckldq 12%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $8, %%mm0 \n\t" + "psllq $8, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $5, %%mm1 \n\t" + "psrlq $5, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); + d += 4; + s += 16; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + register int rgb = *(uint32_t*)s; s += 4; + *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); + } } static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - mm_end = end - 15; + mm_end = end - 15; #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) - asm volatile( - "movq %3, %%mm5 \n\t" - "movq %4, %%mm6 \n\t" - "movq %5, %%mm7 \n\t" - "jmp 2f \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 4(%1), %%mm3 \n\t" - "punpckldq 8(%1), %%mm0 \n\t" - "punpckldq 12(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pand %%mm6, %%mm0 \n\t" - "pand %%mm6, %%mm3 \n\t" - "pmaddwd %%mm7, %%mm0 \n\t" - "pmaddwd %%mm7, %%mm3 \n\t" - "pand %%mm5, %%mm1 \n\t" - "pand %%mm5, %%mm4 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "psrld $6, %%mm0 \n\t" - "pslld $10, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - "add $16, %1 \n\t" - "add $8, %0 \n\t" - "2: \n\t" - "cmp %2, %1 \n\t" - " jb 1b \n\t" - : "+r" (d), "+r"(s) - : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) - ); + asm volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + "jmp 2f \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 32(%1) \n\t" + 
"movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $6, %%mm0 \n\t" + "pslld $10, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "add $16, %1 \n\t" + "add $8, %0 \n\t" + "2: \n\t" + "cmp %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) + ); #else - __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_15mask),"m"(green_15mask)); - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 4%1, %%mm3\n\t" - "punpckldq 8%1, %%mm0\n\t" - "punpckldq 12%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psrlq $3, %%mm0\n\t" - "psrlq $3, %%mm3\n\t" - "pand %2, %%mm0\n\t" - "pand %2, %%mm3\n\t" - "psrlq $6, %%mm1\n\t" - "psrlq $6, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $9, %%mm2\n\t" - "psrlq $9, %%mm5\n\t" - "pand %%mm7, %%mm2\n\t" - "pand %%mm7, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); - d += 4; - s += 16; - } -#endif - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - register int rgb = *(uint32_t*)s; s += 4; - *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_15mask),"m"(green_15mask)); + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 4%1, %%mm3 \n\t" + "punpckldq 8%1, %%mm0 \n\t" + "punpckldq 12%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psrlq $3, %%mm0 \n\t" + "psrlq $3, %%mm3 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm3 \n\t" + "psrlq $6, %%mm1 \n\t" + "psrlq $6, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $9, %%mm2 \n\t" + "psrlq $9, %%mm5 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); + d += 4; + s += 16; + } +#endif + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + register int rgb = *(uint32_t*)s; s += 4; + *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); + } } static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" 
%0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 15; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 4%1, %%mm3\n\t" - "punpckldq 8%1, %%mm0\n\t" - "punpckldq 12%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psllq $7, %%mm0\n\t" - "psllq $7, %%mm3\n\t" - "pand %%mm7, %%mm0\n\t" - "pand %%mm7, %%mm3\n\t" - "psrlq $6, %%mm1\n\t" - "psrlq $6, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $19, %%mm2\n\t" - "psrlq $19, %%mm5\n\t" - "pand %2, %%mm2\n\t" - "pand %2, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); - d += 4; - s += 16; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - register int rgb = *(uint32_t*)s; s += 4; - *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_15mask),"m"(green_15mask)); + mm_end = end - 15; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 4%1, %%mm3 \n\t" + "punpckldq 8%1, %%mm0 \n\t" + "punpckldq 12%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $7, %%mm0 \n\t" + "psllq $7, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $6, %%mm1 \n\t" + "psrlq $6, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); + d += 4; + s += 16; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + register int rgb = *(uint32_t*)s; s += 4; + *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); + } } static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 11; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 3%1, %%mm3\n\t" - "punpckldq 6%1, %%mm0\n\t" - "punpckldq 9%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psrlq $3, %%mm0\n\t" - "psrlq $3, %%mm3\n\t" - "pand %2, %%mm0\n\t" - "pand %2, %%mm3\n\t" - "psrlq $5, %%mm1\n\t" - "psrlq $5, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $8, %%mm2\n\t" - "psrlq $8, %%mm5\n\t" - "pand %%mm7, 
%%mm2\n\t" - "pand %%mm7, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); - d += 4; - s += 12; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - const int b= *s++; - const int g= *s++; - const int r= *s++; - *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_16mask),"m"(green_16mask)); + mm_end = end - 11; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 3%1, %%mm3 \n\t" + "punpckldq 6%1, %%mm0 \n\t" + "punpckldq 9%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psrlq $3, %%mm0 \n\t" + "psrlq $3, %%mm3 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm3 \n\t" + "psrlq $5, %%mm1 \n\t" + "psrlq $5, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $8, %%mm2 \n\t" + "psrlq $8, %%mm5 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); + d += 4; + s += 12; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + const int b = *s++; + const int g = *s++; + const int r = *s++; + *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); + } } static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 15; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 3%1, %%mm3\n\t" - "punpckldq 6%1, %%mm0\n\t" - "punpckldq 9%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psllq $8, %%mm0\n\t" - "psllq $8, %%mm3\n\t" - "pand %%mm7, %%mm0\n\t" - "pand %%mm7, %%mm3\n\t" - "psrlq $5, %%mm1\n\t" - "psrlq $5, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $19, %%mm2\n\t" - "psrlq $19, %%mm5\n\t" - "pand %2, %%mm2\n\t" - "pand %2, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); - d += 4; - s += 12; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - const int r= *s++; - const int g= *s++; - const int b= *s++; - *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_16mask),"m"(green_16mask)); + mm_end = end - 15; 
+ while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 3%1, %%mm3 \n\t" + "punpckldq 6%1, %%mm0 \n\t" + "punpckldq 9%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $8, %%mm0 \n\t" + "psllq $8, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $5, %%mm1 \n\t" + "psrlq $5, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); + d += 4; + s += 12; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + const int r = *s++; + const int g = *s++; + const int b = *s++; + *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); + } } static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 11; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 3%1, %%mm3\n\t" - "punpckldq 6%1, %%mm0\n\t" - "punpckldq 9%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psrlq $3, %%mm0\n\t" - "psrlq $3, %%mm3\n\t" - "pand %2, %%mm0\n\t" - "pand %2, %%mm3\n\t" - "psrlq $6, %%mm1\n\t" - "psrlq $6, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $9, %%mm2\n\t" - "psrlq $9, %%mm5\n\t" - "pand %%mm7, %%mm2\n\t" - "pand %%mm7, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); - d += 4; - s += 12; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - const int b= *s++; - const int g= *s++; - const int r= *s++; - *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_15mask),"m"(green_15mask)); + mm_end = end - 11; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 3%1, %%mm3 \n\t" + "punpckldq 6%1, %%mm0 \n\t" + "punpckldq 9%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psrlq $3, %%mm0 \n\t" + "psrlq $3, %%mm3 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm3 \n\t" + "psrlq $6, %%mm1 \n\t" + "psrlq $6, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $9, %%mm2 \n\t" + "psrlq $9, %%mm5 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 
\n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); + d += 4; + s += 12; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + const int b = *s++; + const int g = *s++; + const int r = *s++; + *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); + } } static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint8_t *s = src; - const uint8_t *end; + const uint8_t *s = src; + const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + const uint8_t *mm_end; #endif - uint16_t *d = (uint16_t *)dst; - end = s + src_size; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm __volatile( - "movq %0, %%mm7\n\t" - "movq %1, %%mm6\n\t" - ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 15; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movd %1, %%mm0\n\t" - "movd 3%1, %%mm3\n\t" - "punpckldq 6%1, %%mm0\n\t" - "punpckldq 9%1, %%mm3\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm3, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "psllq $7, %%mm0\n\t" - "psllq $7, %%mm3\n\t" - "pand %%mm7, %%mm0\n\t" - "pand %%mm7, %%mm3\n\t" - "psrlq $6, %%mm1\n\t" - "psrlq $6, %%mm4\n\t" - "pand %%mm6, %%mm1\n\t" - "pand %%mm6, %%mm4\n\t" - "psrlq $19, %%mm2\n\t" - "psrlq $19, %%mm5\n\t" - "pand %2, %%mm2\n\t" - "pand %2, %%mm5\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm5, %%mm3\n\t" - "psllq $16, %%mm3\n\t" - "por %%mm3, %%mm0\n\t" - MOVNTQ" %%mm0, %0\n\t" - :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); - d += 4; - s += 12; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - const int r= *s++; - const int g= *s++; - const int b= *s++; - *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); - } + __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm __volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_15mask),"m"(green_15mask)); + mm_end = end - 15; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movd %1, %%mm0 \n\t" + "movd 3%1, %%mm3 \n\t" + "punpckldq 6%1, %%mm0 \n\t" + "punpckldq 9%1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $7, %%mm0 \n\t" + "psllq $7, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $6, %%mm1 \n\t" + "psrlq $6, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); + d += 4; + s += 12; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + const int r = *s++; + const int g = *s++; + const int b = *s++; + *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); + } } /* @@ -935,706 +934,706 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s */ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; + const uint16_t *end; #ifdef HAVE_MMX - const uint16_t *mm_end; + const uint16_t 
*mm_end; #endif - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (uint16_t *)src; - end = s + src_size/2; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (uint16_t *)src; + end = s + src_size/2; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 7; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq %1, %%mm1\n\t" - "movq %1, %%mm2\n\t" - "pand %2, %%mm0\n\t" - "pand %3, %%mm1\n\t" - "pand %4, %%mm2\n\t" - "psllq $3, %%mm0\n\t" - "psrlq $2, %%mm1\n\t" - "psrlq $7, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "movq %%mm1, %%mm4\n\t" - "movq %%mm2, %%mm5\n\t" - "punpcklwd %5, %%mm0\n\t" - "punpcklwd %5, %%mm1\n\t" - "punpcklwd %5, %%mm2\n\t" - "punpckhwd %5, %%mm3\n\t" - "punpckhwd %5, %%mm4\n\t" - "punpckhwd %5, %%mm5\n\t" - "psllq $8, %%mm1\n\t" - "psllq $16, %%mm2\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm2, %%mm0\n\t" - "psllq $8, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm5, %%mm3\n\t" - - "movq %%mm0, %%mm6\n\t" - "movq %%mm3, %%mm7\n\t" - - "movq 8%1, %%mm0\n\t" - "movq 8%1, %%mm1\n\t" - "movq 8%1, %%mm2\n\t" - "pand %2, %%mm0\n\t" - "pand %3, %%mm1\n\t" - "pand %4, %%mm2\n\t" - "psllq $3, %%mm0\n\t" - "psrlq $2, %%mm1\n\t" - "psrlq $7, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "movq %%mm1, %%mm4\n\t" - "movq %%mm2, %%mm5\n\t" - "punpcklwd %5, %%mm0\n\t" - "punpcklwd %5, %%mm1\n\t" - "punpcklwd %5, %%mm2\n\t" - "punpckhwd %5, %%mm3\n\t" - "punpckhwd %5, %%mm4\n\t" - "punpckhwd %5, %%mm5\n\t" - "psllq $8, %%mm1\n\t" - "psllq $16, %%mm2\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm2, %%mm0\n\t" - "psllq $8, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm5, %%mm3\n\t" - - :"=m"(*d) - :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) - :"memory"); - /* Borrowed 32 to 24 */ - __asm __volatile( - "movq %%mm0, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "movq %%mm6, %%mm0\n\t" - "movq %%mm7, %%mm1\n\t" - - "movq %%mm4, %%mm6\n\t" - "movq %%mm5, %%mm7\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - - "psrlq $8, %%mm2\n\t" - "psrlq $8, %%mm3\n\t" - "psrlq $8, %%mm6\n\t" - "psrlq $8, %%mm7\n\t" - "pand %2, %%mm0\n\t" - "pand %2, %%mm1\n\t" - "pand %2, %%mm4\n\t" - "pand %2, %%mm5\n\t" - "pand %3, %%mm2\n\t" - "pand %3, %%mm3\n\t" - "pand %3, %%mm6\n\t" - "pand %3, %%mm7\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm3, %%mm1\n\t" - "por %%mm6, %%mm4\n\t" - "por %%mm7, %%mm5\n\t" - - "movq %%mm1, %%mm2\n\t" - "movq %%mm4, %%mm3\n\t" - "psllq $48, %%mm2\n\t" - "psllq $32, %%mm3\n\t" - "pand %4, %%mm2\n\t" - "pand %5, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "psrlq $16, %%mm1\n\t" - "psrlq $32, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm3, %%mm1\n\t" - "pand %6, %%mm5\n\t" - "por %%mm5, %%mm4\n\t" - - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm1, 8%0\n\t" - MOVNTQ" %%mm4, 16%0" - - :"=m"(*d) - :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) - :"memory"); - d += 24; - s += 8; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - register uint16_t bgr; - bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; - } + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 7; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + "movq %1, %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $3, %%mm0 \n\t" + "psrlq $2, %%mm1 \n\t" + 
"psrlq $7, %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + + "movq %%mm0, %%mm6 \n\t" + "movq %%mm3, %%mm7 \n\t" + + "movq 8%1, %%mm0 \n\t" + "movq 8%1, %%mm1 \n\t" + "movq 8%1, %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $3, %%mm0 \n\t" + "psrlq $2, %%mm1 \n\t" + "psrlq $7, %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + + :"=m"(*d) + :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) + :"memory"); + /* Borrowed 32 to 24 */ + __asm __volatile( + "movq %%mm0, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "movq %%mm6, %%mm0 \n\t" + "movq %%mm7, %%mm1 \n\t" + + "movq %%mm4, %%mm6 \n\t" + "movq %%mm5, %%mm7 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + + "psrlq $8, %%mm2 \n\t" + "psrlq $8, %%mm3 \n\t" + "psrlq $8, %%mm6 \n\t" + "psrlq $8, %%mm7 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm1 \n\t" + "pand %2, %%mm4 \n\t" + "pand %2, %%mm5 \n\t" + "pand %3, %%mm2 \n\t" + "pand %3, %%mm3 \n\t" + "pand %3, %%mm6 \n\t" + "pand %3, %%mm7 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm3, %%mm1 \n\t" + "por %%mm6, %%mm4 \n\t" + "por %%mm7, %%mm5 \n\t" + + "movq %%mm1, %%mm2 \n\t" + "movq %%mm4, %%mm3 \n\t" + "psllq $48, %%mm2 \n\t" + "psllq $32, %%mm3 \n\t" + "pand %4, %%mm2 \n\t" + "pand %5, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "psrlq $16, %%mm1 \n\t" + "psrlq $32, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm3, %%mm1 \n\t" + "pand %6, %%mm5 \n\t" + "por %%mm5, %%mm4 \n\t" + + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm1, 8%0 \n\t" + MOVNTQ" %%mm4, 16%0" + + :"=m"(*d) + :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) + :"memory"); + d += 24; + s += 8; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + register uint16_t bgr; + bgr = *s++; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x7C00)>>7; + } } static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; + const uint16_t *end; #ifdef HAVE_MMX - const uint16_t *mm_end; + const uint16_t *mm_end; #endif - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (const uint16_t *)src; - end = s + src_size/2; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 7; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq %1, %%mm1\n\t" - "movq %1, %%mm2\n\t" - "pand %2, %%mm0\n\t" - "pand %3, %%mm1\n\t" - "pand %4, %%mm2\n\t" - "psllq $3, %%mm0\n\t" - "psrlq $3, %%mm1\n\t" - "psrlq $8, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" 
- "movq %%mm1, %%mm4\n\t" - "movq %%mm2, %%mm5\n\t" - "punpcklwd %5, %%mm0\n\t" - "punpcklwd %5, %%mm1\n\t" - "punpcklwd %5, %%mm2\n\t" - "punpckhwd %5, %%mm3\n\t" - "punpckhwd %5, %%mm4\n\t" - "punpckhwd %5, %%mm5\n\t" - "psllq $8, %%mm1\n\t" - "psllq $16, %%mm2\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm2, %%mm0\n\t" - "psllq $8, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm5, %%mm3\n\t" - - "movq %%mm0, %%mm6\n\t" - "movq %%mm3, %%mm7\n\t" - - "movq 8%1, %%mm0\n\t" - "movq 8%1, %%mm1\n\t" - "movq 8%1, %%mm2\n\t" - "pand %2, %%mm0\n\t" - "pand %3, %%mm1\n\t" - "pand %4, %%mm2\n\t" - "psllq $3, %%mm0\n\t" - "psrlq $3, %%mm1\n\t" - "psrlq $8, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "movq %%mm1, %%mm4\n\t" - "movq %%mm2, %%mm5\n\t" - "punpcklwd %5, %%mm0\n\t" - "punpcklwd %5, %%mm1\n\t" - "punpcklwd %5, %%mm2\n\t" - "punpckhwd %5, %%mm3\n\t" - "punpckhwd %5, %%mm4\n\t" - "punpckhwd %5, %%mm5\n\t" - "psllq $8, %%mm1\n\t" - "psllq $16, %%mm2\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm2, %%mm0\n\t" - "psllq $8, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm5, %%mm3\n\t" - :"=m"(*d) - :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) - :"memory"); - /* Borrowed 32 to 24 */ - __asm __volatile( - "movq %%mm0, %%mm4\n\t" - "movq %%mm3, %%mm5\n\t" - "movq %%mm6, %%mm0\n\t" - "movq %%mm7, %%mm1\n\t" - - "movq %%mm4, %%mm6\n\t" - "movq %%mm5, %%mm7\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - - "psrlq $8, %%mm2\n\t" - "psrlq $8, %%mm3\n\t" - "psrlq $8, %%mm6\n\t" - "psrlq $8, %%mm7\n\t" - "pand %2, %%mm0\n\t" - "pand %2, %%mm1\n\t" - "pand %2, %%mm4\n\t" - "pand %2, %%mm5\n\t" - "pand %3, %%mm2\n\t" - "pand %3, %%mm3\n\t" - "pand %3, %%mm6\n\t" - "pand %3, %%mm7\n\t" - "por %%mm2, %%mm0\n\t" - "por %%mm3, %%mm1\n\t" - "por %%mm6, %%mm4\n\t" - "por %%mm7, %%mm5\n\t" - - "movq %%mm1, %%mm2\n\t" - "movq %%mm4, %%mm3\n\t" - "psllq $48, %%mm2\n\t" - "psllq $32, %%mm3\n\t" - "pand %4, %%mm2\n\t" - "pand %5, %%mm3\n\t" - "por %%mm2, %%mm0\n\t" - "psrlq $16, %%mm1\n\t" - "psrlq $32, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm3, %%mm1\n\t" - "pand %6, %%mm5\n\t" - "por %%mm5, %%mm4\n\t" - - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm1, 8%0\n\t" - MOVNTQ" %%mm4, 16%0" - - :"=m"(*d) - :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) - :"memory"); - d += 24; - s += 8; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { - register uint16_t bgr; - bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; - } + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 7; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + "movq %1, %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $3, %%mm0 \n\t" + "psrlq $3, %%mm1 \n\t" + "psrlq $8, %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + + "movq %%mm0, %%mm6 \n\t" + "movq %%mm3, %%mm7 \n\t" + + "movq 8%1, %%mm0 \n\t" + "movq 8%1, %%mm1 \n\t" + "movq 
8%1, %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $3, %%mm0 \n\t" + "psrlq $3, %%mm1 \n\t" + "psrlq $8, %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + :"=m"(*d) + :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) + :"memory"); + /* Borrowed 32 to 24 */ + __asm __volatile( + "movq %%mm0, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "movq %%mm6, %%mm0 \n\t" + "movq %%mm7, %%mm1 \n\t" + + "movq %%mm4, %%mm6 \n\t" + "movq %%mm5, %%mm7 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + + "psrlq $8, %%mm2 \n\t" + "psrlq $8, %%mm3 \n\t" + "psrlq $8, %%mm6 \n\t" + "psrlq $8, %%mm7 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm1 \n\t" + "pand %2, %%mm4 \n\t" + "pand %2, %%mm5 \n\t" + "pand %3, %%mm2 \n\t" + "pand %3, %%mm3 \n\t" + "pand %3, %%mm6 \n\t" + "pand %3, %%mm7 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm3, %%mm1 \n\t" + "por %%mm6, %%mm4 \n\t" + "por %%mm7, %%mm5 \n\t" + + "movq %%mm1, %%mm2 \n\t" + "movq %%mm4, %%mm3 \n\t" + "psllq $48, %%mm2 \n\t" + "psllq $32, %%mm3 \n\t" + "pand %4, %%mm2 \n\t" + "pand %5, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "psrlq $16, %%mm1 \n\t" + "psrlq $32, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm3, %%mm1 \n\t" + "pand %6, %%mm5 \n\t" + "por %%mm5, %%mm4 \n\t" + + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm1, 8%0 \n\t" + MOVNTQ" %%mm4, 16%0" + + :"=m"(*d) + :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) + :"memory"); + d += 24; + s += 8; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + register uint16_t bgr; + bgr = *s++; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x7E0)>>3; + *d++ = (bgr&0xF800)>>8; + } } static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; + const uint16_t *end; #ifdef HAVE_MMX - const uint16_t *mm_end; + const uint16_t *mm_end; #endif - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (const uint16_t *)src; - end = s + src_size/2; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); - mm_end = end - 3; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq %1, %%mm1\n\t" - "movq %1, %%mm2\n\t" - "pand %2, %%mm0\n\t" - "pand %3, %%mm1\n\t" - "pand %4, %%mm2\n\t" - "psllq $3, %%mm0\n\t" - "psrlq $2, %%mm1\n\t" - "psrlq $7, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "movq %%mm1, %%mm4\n\t" - "movq %%mm2, %%mm5\n\t" - "punpcklwd %%mm7, %%mm0\n\t" - "punpcklwd %%mm7, %%mm1\n\t" - "punpcklwd %%mm7, %%mm2\n\t" - "punpckhwd %%mm7, %%mm3\n\t" - "punpckhwd %%mm7, %%mm4\n\t" - "punpckhwd %%mm7, %%mm5\n\t" - "psllq $8, %%mm1\n\t" - "psllq $16, %%mm2\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm2, %%mm0\n\t" - "psllq $8, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm5, %%mm3\n\t" - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm3, 8%0\n\t" - :"=m"(*d) - 
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) - :"memory"); - d += 16; - s += 4; - } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#endif - while(s < end) - { + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); + mm_end = end - 3; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + "movq %1, %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $3, %%mm0 \n\t" + "psrlq $2, %%mm1 \n\t" + "psrlq $7, %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %%mm7, %%mm0 \n\t" + "punpcklwd %%mm7, %%mm1 \n\t" + "punpcklwd %%mm7, %%mm2 \n\t" + "punpckhwd %%mm7, %%mm3 \n\t" + "punpckhwd %%mm7, %%mm4 \n\t" + "punpckhwd %%mm7, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm3, 8%0 \n\t" + :"=m"(*d) + :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) + :"memory"); + d += 16; + s += 4; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { #if 0 //slightly slower on athlon - int bgr= *s++; - *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9); + int bgr= *s++; + *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9); #else - register uint16_t bgr; - bgr = *s++; + register uint16_t bgr; + bgr = *s++; #ifdef WORDS_BIGENDIAN - *d++ = 0; - *d++ = (bgr&0x7C00)>>7; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x1F)<<3; + *d++ = 0; + *d++ = (bgr&0x7C00)>>7; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x1F)<<3; #else - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; - *d++ = 0; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x7C00)>>7; + *d++ = 0; #endif #endif - } + } } static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size) { - const uint16_t *end; + const uint16_t *end; #ifdef HAVE_MMX - const uint16_t *mm_end; + const uint16_t *mm_end; #endif - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (uint16_t *)src; - end = s + src_size/2; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (uint16_t *)src; + end = s + src_size/2; #ifdef HAVE_MMX - __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); - mm_end = end - 3; - while(s < mm_end) - { - __asm __volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq %1, %%mm1\n\t" - "movq %1, %%mm2\n\t" - "pand %2, %%mm0\n\t" - "pand %3, %%mm1\n\t" - "pand %4, %%mm2\n\t" - "psllq $3, %%mm0\n\t" - "psrlq $3, %%mm1\n\t" - "psrlq $8, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "movq %%mm1, %%mm4\n\t" - "movq %%mm2, %%mm5\n\t" - "punpcklwd %%mm7, %%mm0\n\t" - "punpcklwd %%mm7, %%mm1\n\t" - "punpcklwd %%mm7, %%mm2\n\t" - "punpckhwd %%mm7, %%mm3\n\t" - "punpckhwd %%mm7, %%mm4\n\t" - "punpckhwd %%mm7, %%mm5\n\t" - "psllq $8, %%mm1\n\t" - "psllq $16, %%mm2\n\t" - "por %%mm1, %%mm0\n\t" - "por %%mm2, %%mm0\n\t" - "psllq $8, %%mm4\n\t" - "psllq $16, %%mm5\n\t" - "por %%mm4, %%mm3\n\t" - "por %%mm5, %%mm3\n\t" - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm3, 8%0\n\t" - :"=m"(*d) - :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) - :"memory"); - d += 16; - s += 4; - } - __asm __volatile(SFENCE:::"memory"); - __asm 
__volatile(EMMS:::"memory"); -#endif - while(s < end) - { - register uint16_t bgr; - bgr = *s++; + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); + mm_end = end - 3; + while (s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + "movq %1, %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $3, %%mm0 \n\t" + "psrlq $3, %%mm1 \n\t" + "psrlq $8, %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %%mm7, %%mm0 \n\t" + "punpcklwd %%mm7, %%mm1 \n\t" + "punpcklwd %%mm7, %%mm2 \n\t" + "punpckhwd %%mm7, %%mm3 \n\t" + "punpckhwd %%mm7, %%mm4 \n\t" + "punpckhwd %%mm7, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm3, 8%0 \n\t" + :"=m"(*d) + :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) + :"memory"); + d += 16; + s += 4; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while (s < end) + { + register uint16_t bgr; + bgr = *s++; #ifdef WORDS_BIGENDIAN - *d++ = 0; - *d++ = (bgr&0xF800)>>8; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0x1F)<<3; + *d++ = 0; + *d++ = (bgr&0xF800)>>8; + *d++ = (bgr&0x7E0)>>3; + *d++ = (bgr&0x1F)<<3; #else - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; - *d++ = 0; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x7E0)>>3; + *d++ = (bgr&0xF800)>>8; + *d++ = 0; #endif - } + } } static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) { - long idx = 15 - src_size; - uint8_t *s = (uint8_t *) src-idx, *d = dst-idx; + long idx = 15 - src_size; + uint8_t *s = (uint8_t *) src-idx, *d = dst-idx; #ifdef HAVE_MMX - __asm __volatile( - "test %0, %0 \n\t" - "jns 2f \n\t" - PREFETCH" (%1, %0) \n\t" - "movq %3, %%mm7 \n\t" - "pxor %4, %%mm7 \n\t" - "movq %%mm7, %%mm6 \n\t" - "pxor %5, %%mm7 \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 32(%1, %0) \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" + __asm __volatile( + "test %0, %0 \n\t" + "jns 2f \n\t" + PREFETCH" (%1, %0) \n\t" + "movq %3, %%mm7 \n\t" + "pxor %4, %%mm7 \n\t" + "movq %%mm7, %%mm6 \n\t" + "pxor %5, %%mm7 \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 32(%1, %0) \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq 8(%1, %0), %%mm1 \n\t" # ifdef HAVE_MMX2 - "pshufw $177, %%mm0, %%mm3 \n\t" - "pshufw $177, %%mm1, %%mm5 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm6, %%mm3 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm6, %%mm5 \n\t" - "por %%mm3, %%mm0 \n\t" - "por %%mm5, %%mm1 \n\t" + "pshufw $177, %%mm0, %%mm3 \n\t" + "pshufw $177, %%mm1, %%mm5 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm6, %%mm5 \n\t" + "por %%mm3, %%mm0 \n\t" + "por %%mm5, %%mm1 \n\t" # else - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm4 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm6, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "pslld $16, %%mm2 \n\t" - "psrld $16, %%mm3 \n\t" - "pslld $16, %%mm4 \n\t" - "psrld $16, %%mm5 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm4, %%mm1 \n\t" - "por %%mm3, %%mm0 \n\t" - "por %%mm5, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm4 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm6, 
%%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "pslld $16, %%mm2 \n\t" + "psrld $16, %%mm3 \n\t" + "pslld $16, %%mm4 \n\t" + "psrld $16, %%mm5 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm4, %%mm1 \n\t" + "por %%mm3, %%mm0 \n\t" + "por %%mm5, %%mm1 \n\t" # endif - MOVNTQ" %%mm0, (%2, %0) \n\t" - MOVNTQ" %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "js 1b \n\t" - SFENCE" \n\t" - EMMS" \n\t" - "2: \n\t" - : "+&r"(idx) - : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) - : "memory"); -#endif - for (; idx<15; idx+=4) { - register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00; - v &= 0xff00ff; - *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); - } + MOVNTQ" %%mm0, (%2, %0) \n\t" + MOVNTQ" %%mm1, 8(%2, %0) \n\t" + "add $16, %0 \n\t" + "js 1b \n\t" + SFENCE" \n\t" + EMMS" \n\t" + "2: \n\t" + : "+&r"(idx) + : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) + : "memory"); +#endif + for (; idx<15; idx+=4) { + register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00; + v &= 0xff00ff; + *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); + } } static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) { - unsigned i; + unsigned i; #ifdef HAVE_MMX - long mmx_size= 23 - src_size; - asm volatile ( - "test %%"REG_a", %%"REG_a" \n\t" - "jns 2f \n\t" - "movq "MANGLE(mask24r)", %%mm5 \n\t" - "movq "MANGLE(mask24g)", %%mm6 \n\t" - "movq "MANGLE(mask24b)", %%mm7 \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 32(%1, %%"REG_a") \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG - "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG - "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B - "psllq $16, %%mm0 \n\t" // 00 BGR BGR - "pand %%mm5, %%mm0 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "por %%mm0, %%mm1 \n\t" - "por %%mm2, %%mm1 \n\t" - "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG - MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG - "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B - "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR - "pand %%mm7, %%mm0 \n\t" - "pand %%mm5, %%mm1 \n\t" - "pand %%mm6, %%mm2 \n\t" - "por %%mm0, %%mm1 \n\t" - "por %%mm2, %%mm1 \n\t" - "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B - MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R - "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR - "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG - "pand %%mm6, %%mm0 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm5, %%mm2 \n\t" - "por %%mm0, %%mm1 \n\t" - "por %%mm2, %%mm1 \n\t" - MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t" - "add $24, %%"REG_a" \n\t" - " js 1b \n\t" - "2: \n\t" - : "+a" (mmx_size) - : "r" (src-mmx_size), "r"(dst-mmx_size) - ); - - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); - - if(mmx_size==23) return; //finihsed, was multiple of 8 - - src+= src_size; - dst+= src_size; - src_size= 23-mmx_size; - src-= src_size; - dst-= src_size; -#endif - for(i=0; i<src_size; i+=3) - { - register uint8_t x; - x = src[i + 2]; - dst[i + 1] = src[i + 1]; - dst[i + 2] = src[i + 0]; - dst[i + 0] = x; - } + long mmx_size= 23 - src_size; + asm volatile ( + "test %%"REG_a", %%"REG_a" \n\t" + "jns 2f \n\t" + "movq "MANGLE(mask24r)", %%mm5 \n\t" + "movq "MANGLE(mask24g)", %%mm6 \n\t" + "movq "MANGLE(mask24b)", %%mm7 \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 32(%1, %%"REG_a") \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG + "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG + "movq 2(%1, %%"REG_a"), %%mm2 \n\t" 
// R BGR BGR B + "psllq $16, %%mm0 \n\t" // 00 BGR BGR + "pand %%mm5, %%mm0 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "por %%mm0, %%mm1 \n\t" + "por %%mm2, %%mm1 \n\t" + "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG + MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG + "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B + "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR + "pand %%mm7, %%mm0 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm6, %%mm2 \n\t" + "por %%mm0, %%mm1 \n\t" + "por %%mm2, %%mm1 \n\t" + "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B + MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R + "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR + "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG + "pand %%mm6, %%mm0 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm5, %%mm2 \n\t" + "por %%mm0, %%mm1 \n\t" + "por %%mm2, %%mm1 \n\t" + MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" + "add $24, %%"REG_a" \n\t" + " js 1b \n\t" + "2: \n\t" + : "+a" (mmx_size) + : "r" (src-mmx_size), "r"(dst-mmx_size) + ); + + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); + + if (mmx_size==23) return; //finihsed, was multiple of 8 + + src+= src_size; + dst+= src_size; + src_size= 23-mmx_size; + src-= src_size; + dst-= src_size; +#endif + for (i=0; i<src_size; i+=3) + { + register uint8_t x; + x = src[i + 2]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 0]; + dst[i + 0] = x; + } } static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride, long vertLumPerChroma) + long width, long height, + long lumStride, long chromStride, long dstStride, long vertLumPerChroma) { - long y; - const long chromWidth= width>>1; - for(y=0; y<height; y++) - { + long y; + const long chromWidth= width>>1; + for (y=0; y<height; y++) + { #ifdef HAVE_MMX //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) - asm volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 32(%1, %%"REG_a", 2) \n\t" - PREFETCH" 32(%2, %%"REG_a") \n\t" - PREFETCH" 32(%3, %%"REG_a") \n\t" - "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) - "movq %%mm0, %%mm2 \n\t" // U(0) - "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) - "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) - - "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) - "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) - "movq %%mm3, %%mm4 \n\t" // Y(0) - "movq %%mm5, %%mm6 \n\t" // Y(8) - "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) - "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) - "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) - "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) - - MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t" - MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" - MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t" - MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) - : "%"REG_a - ); + asm volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 32(%1, %%"REG_a", 2) \n\t" + PREFETCH" 32(%2, %%"REG_a") \n\t" + PREFETCH" 32(%3, %%"REG_a") \n\t" + "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) + "movq %%mm0, %%mm2 \n\t" // U(0) + "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) + "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) + + "movq (%1, 
%%"REG_a",2), %%mm3 \n\t" // Y(0) + "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) + "movq %%mm3, %%mm4 \n\t" // Y(0) + "movq %%mm5, %%mm6 \n\t" // Y(8) + "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) + "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) + "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) + "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) + + MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) + : "%"REG_a + ); #else #if defined ARCH_ALPHA && defined HAVE_MVI -#define pl2yuy2(n) \ - y1 = yc[n]; \ - y2 = yc2[n]; \ - u = uc[n]; \ - v = vc[n]; \ - asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \ - asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \ - asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \ - asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \ - yuv1 = (u << 8) + (v << 24); \ - yuv2 = yuv1 + y2; \ - yuv1 += y1; \ - qdst[n] = yuv1; \ - qdst2[n] = yuv2; - - int i; - uint64_t *qdst = (uint64_t *) dst; - uint64_t *qdst2 = (uint64_t *) (dst + dstStride); - const uint32_t *yc = (uint32_t *) ysrc; - const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride); - const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc; - for(i = 0; i < chromWidth; i += 8){ - uint64_t y1, y2, yuv1, yuv2; - uint64_t u, v; - /* Prefetch */ - asm("ldq $31,64(%0)" :: "r"(yc)); - asm("ldq $31,64(%0)" :: "r"(yc2)); - asm("ldq $31,64(%0)" :: "r"(uc)); - asm("ldq $31,64(%0)" :: "r"(vc)); - - pl2yuy2(0); - pl2yuy2(1); - pl2yuy2(2); - pl2yuy2(3); - - yc += 4; - yc2 += 4; - uc += 4; - vc += 4; - qdst += 4; - qdst2 += 4; - } - y++; - ysrc += lumStride; - dst += dstStride; +#define pl2yuy2(n) \ + y1 = yc[n]; \ + y2 = yc2[n]; \ + u = uc[n]; \ + v = vc[n]; \ + asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \ + asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \ + asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \ + asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \ + yuv1 = (u << 8) + (v << 24); \ + yuv2 = yuv1 + y2; \ + yuv1 += y1; \ + qdst[n] = yuv1; \ + qdst2[n] = yuv2; + + int i; + uint64_t *qdst = (uint64_t *) dst; + uint64_t *qdst2 = (uint64_t *) (dst + dstStride); + const uint32_t *yc = (uint32_t *) ysrc; + const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride); + const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc; + for (i = 0; i < chromWidth; i += 8){ + uint64_t y1, y2, yuv1, yuv2; + uint64_t u, v; + /* Prefetch */ + asm("ldq $31,64(%0)" :: "r"(yc)); + asm("ldq $31,64(%0)" :: "r"(yc2)); + asm("ldq $31,64(%0)" :: "r"(uc)); + asm("ldq $31,64(%0)" :: "r"(vc)); + + pl2yuy2(0); + pl2yuy2(1); + pl2yuy2(2); + pl2yuy2(3); + + yc += 4; + yc2 += 4; + uc += 4; + vc += 4; + qdst += 4; + qdst2 += 4; + } + y++; + ysrc += lumStride; + dst += dstStride; #elif __WORDSIZE >= 64 - int i; - uint64_t *ldst = (uint64_t *) dst; - const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; - for(i = 0; i < chromWidth; i += 2){ - uint64_t k, l; - k = yc[0] + (uc[0] << 8) + - (yc[1] << 16) + (vc[0] << 24); - l = yc[2] + (uc[1] << 8) + - (yc[3] << 16) + (vc[1] << 24); - *ldst++ = k + (l << 32); - yc += 4; - uc += 2; - vc += 2; - } + int i; + uint64_t *ldst = (uint64_t *) dst; + const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; + for (i = 0; i < chromWidth; i += 2){ + uint64_t k, l; + k = yc[0] + (uc[0] << 8) + + (yc[1] << 16) + (vc[0] << 24); + l = yc[2] + (uc[1] << 8) + + (yc[3] << 16) + (vc[1] << 24); + *ldst++ = k + (l << 32); + yc += 4; 
+ uc += 2; + vc += 2; + } #else - int i, *idst = (int32_t *) dst; - const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; - for(i = 0; i < chromWidth; i++){ + int i, *idst = (int32_t *) dst; + const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; + for (i = 0; i < chromWidth; i++){ #ifdef WORDS_BIGENDIAN - *idst++ = (yc[0] << 24)+ (uc[0] << 16) + - (yc[1] << 8) + (vc[0] << 0); + *idst++ = (yc[0] << 24)+ (uc[0] << 16) + + (yc[1] << 8) + (vc[0] << 0); #else - *idst++ = yc[0] + (uc[0] << 8) + - (yc[1] << 16) + (vc[0] << 24); -#endif - yc += 2; - uc++; - vc++; - } -#endif -#endif - if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) - { - usrc += chromStride; - vsrc += chromStride; - } - ysrc += lumStride; - dst += dstStride; - } + *idst++ = yc[0] + (uc[0] << 8) + + (yc[1] << 16) + (vc[0] << 24); +#endif + yc += 2; + uc++; + vc++; + } +#endif +#endif + if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) + { + usrc += chromStride; + vsrc += chromStride; + } + ysrc += lumStride; + dst += dstStride; + } #ifdef HAVE_MMX -asm( EMMS" \n\t" - SFENCE" \n\t" +asm( EMMS" \n\t" + SFENCE" \n\t" :::"memory"); #endif } @@ -1645,103 +1644,103 @@ asm( EMMS" \n\t" * problem for anyone then tell me, and ill fix it) */ static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride) + long width, long height, + long lumStride, long chromStride, long dstStride) { - //FIXME interpolate chroma - RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); + //FIXME interpolate chroma + RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); } static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride, long vertLumPerChroma) + long width, long height, + long lumStride, long chromStride, long dstStride, long vertLumPerChroma) { - long y; - const long chromWidth= width>>1; - for(y=0; y<height; y++) - { + long y; + const long chromWidth= width>>1; + for (y=0; y<height; y++) + { #ifdef HAVE_MMX //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) - asm volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 32(%1, %%"REG_a", 2) \n\t" - PREFETCH" 32(%2, %%"REG_a") \n\t" - PREFETCH" 32(%3, %%"REG_a") \n\t" - "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) - "movq %%mm0, %%mm2 \n\t" // U(0) - "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) - "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) - - "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) - "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) - "movq %%mm0, %%mm4 \n\t" // Y(0) - "movq %%mm2, %%mm6 \n\t" // Y(8) - "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) - "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) - "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) - "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) - - MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t" - MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" - MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t" - MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) - : "%"REG_a - ); + asm volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 32(%1, %%"REG_a", 2) \n\t" 
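/* Compared with the YUY2 routine above, the interleave sequence later in this
 * loop essentially swaps the punpcklbw/punpckhbw operand order, so the packed
 * output byte order becomes U0 Y0 V0 Y1 (UYVY) instead of Y0 U0 Y1 V0. */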
+ PREFETCH" 32(%2, %%"REG_a") \n\t" + PREFETCH" 32(%3, %%"REG_a") \n\t" + "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) + "movq %%mm0, %%mm2 \n\t" // U(0) + "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) + "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) + + "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) + "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) + "movq %%mm0, %%mm4 \n\t" // Y(0) + "movq %%mm2, %%mm6 \n\t" // Y(8) + "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) + "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) + "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) + "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) + + MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) + : "%"REG_a + ); #else //FIXME adapt the alpha asm code from yv12->yuy2 #if __WORDSIZE >= 64 - int i; - uint64_t *ldst = (uint64_t *) dst; - const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; - for(i = 0; i < chromWidth; i += 2){ - uint64_t k, l; - k = uc[0] + (yc[0] << 8) + - (vc[0] << 16) + (yc[1] << 24); - l = uc[1] + (yc[2] << 8) + - (vc[1] << 16) + (yc[3] << 24); - *ldst++ = k + (l << 32); - yc += 4; - uc += 2; - vc += 2; - } + int i; + uint64_t *ldst = (uint64_t *) dst; + const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; + for (i = 0; i < chromWidth; i += 2){ + uint64_t k, l; + k = uc[0] + (yc[0] << 8) + + (vc[0] << 16) + (yc[1] << 24); + l = uc[1] + (yc[2] << 8) + + (vc[1] << 16) + (yc[3] << 24); + *ldst++ = k + (l << 32); + yc += 4; + uc += 2; + vc += 2; + } #else - int i, *idst = (int32_t *) dst; - const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; - for(i = 0; i < chromWidth; i++){ + int i, *idst = (int32_t *) dst; + const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; + for (i = 0; i < chromWidth; i++){ #ifdef WORDS_BIGENDIAN - *idst++ = (uc[0] << 24)+ (yc[0] << 16) + - (vc[0] << 8) + (yc[1] << 0); + *idst++ = (uc[0] << 24)+ (yc[0] << 16) + + (vc[0] << 8) + (yc[1] << 0); #else - *idst++ = uc[0] + (yc[0] << 8) + - (vc[0] << 16) + (yc[1] << 24); -#endif - yc += 2; - uc++; - vc++; - } -#endif -#endif - if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) - { - usrc += chromStride; - vsrc += chromStride; - } - ysrc += lumStride; - dst += dstStride; - } + *idst++ = uc[0] + (yc[0] << 8) + + (vc[0] << 16) + (yc[1] << 24); +#endif + yc += 2; + uc++; + vc++; + } +#endif +#endif + if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) + { + usrc += chromStride; + vsrc += chromStride; + } + ysrc += lumStride; + dst += dstStride; + } #ifdef HAVE_MMX -asm( EMMS" \n\t" - SFENCE" \n\t" +asm( EMMS" \n\t" + SFENCE" \n\t" :::"memory"); #endif } @@ -1752,11 +1751,11 @@ asm( EMMS" \n\t" * problem for anyone then tell me, and ill fix it) */ static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride) + long width, long height, + long lumStride, long chromStride, long dstStride) { - //FIXME interpolate chroma - RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); + //FIXME interpolate chroma + RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); } /** @@ -1764,10 +1763,10 @@ static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, 
const uint8_t *usrc, * width should be a multiple of 16 */ static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - long width, long height, - long lumStride, long chromStride, long dstStride) + long width, long height, + long lumStride, long chromStride, long dstStride) { - RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); + RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); } /** @@ -1776,234 +1775,234 @@ static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usr * problem for anyone then tell me, and ill fix it) */ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, - long lumStride, long chromStride, long srcStride) + long width, long height, + long lumStride, long chromStride, long srcStride) { - long y; - const long chromWidth= width>>1; - for(y=0; y<height; y+=2) - { + long y; + const long chromWidth= width>>1; + for (y=0; y<height; y+=2) + { #ifdef HAVE_MMX - asm volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%"REG_a", 4) \n\t" - "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) - "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) - "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) - "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) - "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) - "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) - "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) - "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) - "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) - - MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t" - - "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8) - "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12) - "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) - "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) - "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) - "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) - "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) - "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) - "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t" - - "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) - "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) - "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) - "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) - "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) - "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) - "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) - "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) - - MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" - MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%"REG_a - ); - - ydst += lumStride; - src += srcStride; - - asm volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%"REG_a", 4) \n\t" - "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) - "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) - "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8) - "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12) - "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) - "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) - "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) - "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) - 
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t" - MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%"REG_a - ); + asm volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... + ASMALIGN(4) + "1: \n\t" + PREFETCH" 64(%0, %%"REG_a", 4) \n\t" + "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) + "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) + "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) + "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) + "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) + "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) + "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) + "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) + "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) + + MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" + + "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) + "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) + "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) + "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) + "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) + "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) + "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) + "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) + "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" + + "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) + "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) + "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) + "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) + "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) + "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) + "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) + "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) + + MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" + MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%"REG_a + ); + + ydst += lumStride; + src += srcStride; + + asm volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 64(%0, %%"REG_a", 4) \n\t" + "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) + "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) + "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) + "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) + "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) + "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) + "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) + "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%"REG_a + ); #else - long i; - for(i=0; i<chromWidth; i++) - { - ydst[2*i+0] = src[4*i+0]; - udst[i] = src[4*i+1]; - ydst[2*i+1] = src[4*i+2]; - vdst[i] = src[4*i+3]; - } - ydst += lumStride; - src += srcStride; - - for(i=0; i<chromWidth; i++) - { - ydst[2*i+0] = src[4*i+0]; - ydst[2*i+1] = src[4*i+2]; - } -#endif - udst += chromStride; - vdst += chromStride; - ydst += lumStride; - src += srcStride; - } + long i; + for (i=0; i<chromWidth; i++) + { + ydst[2*i+0] = src[4*i+0]; + udst[i] = src[4*i+1]; + ydst[2*i+1] = src[4*i+2]; + vdst[i] = src[4*i+3]; + } 
+ ydst += lumStride; + src += srcStride; + + for (i=0; i<chromWidth; i++) + { + ydst[2*i+0] = src[4*i+0]; + ydst[2*i+1] = src[4*i+2]; + } +#endif + udst += chromStride; + vdst += chromStride; + ydst += lumStride; + src += srcStride; + } #ifdef HAVE_MMX -asm volatile( EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); +asm volatile( EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); #endif } static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, - uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, long lumStride, long chromStride) + uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + long width, long height, long lumStride, long chromStride) { - /* Y Plane */ - memcpy(ydst, ysrc, width*height); + /* Y Plane */ + memcpy(ydst, ysrc, width*height); - /* XXX: implement upscaling for U,V */ + /* XXX: implement upscaling for U,V */ } static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride) { - long x,y; + long x,y; - dst[0]= src[0]; + dst[0]= src[0]; - // first line - for(x=0; x<srcWidth-1; x++){ - dst[2*x+1]= (3*src[x] + src[x+1])>>2; - dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; - } - dst[2*srcWidth-1]= src[srcWidth-1]; + // first line + for (x=0; x<srcWidth-1; x++){ + dst[2*x+1]= (3*src[x] + src[x+1])>>2; + dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; + } + dst[2*srcWidth-1]= src[srcWidth-1]; dst+= dstStride; - for(y=1; y<srcHeight; y++){ + for (y=1; y<srcHeight; y++){ #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - const long mmxSize= srcWidth&~15; - asm volatile( - "mov %4, %%"REG_a" \n\t" - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq (%1, %%"REG_a"), %%mm1 \n\t" - "movq 1(%0, %%"REG_a"), %%mm2 \n\t" - "movq 1(%1, %%"REG_a"), %%mm3 \n\t" - "movq -1(%0, %%"REG_a"), %%mm4 \n\t" - "movq -1(%1, %%"REG_a"), %%mm5 \n\t" - PAVGB" %%mm0, %%mm5 \n\t" - PAVGB" %%mm0, %%mm3 \n\t" - PAVGB" %%mm0, %%mm5 \n\t" - PAVGB" %%mm0, %%mm3 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - "movq %%mm5, %%mm7 \n\t" - "movq %%mm4, %%mm6 \n\t" - "punpcklbw %%mm3, %%mm5 \n\t" - "punpckhbw %%mm3, %%mm7 \n\t" - "punpcklbw %%mm2, %%mm4 \n\t" - "punpckhbw %%mm2, %%mm6 \n\t" + const long mmxSize= srcWidth&~15; + asm volatile( + "mov %4, %%"REG_a" \n\t" + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq 1(%0, %%"REG_a"), %%mm2 \n\t" + "movq 1(%1, %%"REG_a"), %%mm3 \n\t" + "movq -1(%0, %%"REG_a"), %%mm4 \n\t" + "movq -1(%1, %%"REG_a"), %%mm5 \n\t" + PAVGB" %%mm0, %%mm5 \n\t" + PAVGB" %%mm0, %%mm3 \n\t" + PAVGB" %%mm0, %%mm5 \n\t" + PAVGB" %%mm0, %%mm3 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + "movq %%mm5, %%mm7 \n\t" + "movq %%mm4, %%mm6 \n\t" + "punpcklbw %%mm3, %%mm5 \n\t" + "punpckhbw %%mm3, %%mm7 \n\t" + "punpcklbw %%mm2, %%mm4 \n\t" + "punpckhbw %%mm2, %%mm6 \n\t" #if 1 - MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t" - MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t" - MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t" - MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t" + MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" #else - "movq %%mm5, (%2, %%"REG_a", 2) \n\t" - "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t" - "movq %%mm4, (%3, %%"REG_a", 2) \n\t" - "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t" -#endif - "add $8, %%"REG_a" \n\t" - " js 1b 
\n\t" - :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), - "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), - "g" (-mmxSize) - : "%"REG_a - - ); + "movq %%mm5, (%2, %%"REG_a", 2) \n\t" + "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t" + "movq %%mm4, (%3, %%"REG_a", 2) \n\t" + "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t" +#endif + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), + "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), + "g" (-mmxSize) + : "%"REG_a + + ); #else - const long mmxSize=1; -#endif - dst[0 ]= (3*src[0] + src[srcStride])>>2; - dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; - - for(x=mmxSize-1; x<srcWidth-1; x++){ - dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; - dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; - dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; - dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; - } - dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; - dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; - - dst+=dstStride*2; - src+=srcStride; - } + const long mmxSize=1; +#endif + dst[0 ]= (3*src[0] + src[srcStride])>>2; + dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; + + for (x=mmxSize-1; x<srcWidth-1; x++){ + dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; + dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; + dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; + dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; + } + dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; + dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; + + dst+=dstStride*2; + src+=srcStride; + } - // last line + // last line #if 1 - dst[0]= src[0]; + dst[0]= src[0]; - for(x=0; x<srcWidth-1; x++){ - dst[2*x+1]= (3*src[x] + src[x+1])>>2; - dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; - } - dst[2*srcWidth-1]= src[srcWidth-1]; + for (x=0; x<srcWidth-1; x++){ + dst[2*x+1]= (3*src[x] + src[x+1])>>2; + dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; + } + dst[2*srcWidth-1]= src[srcWidth-1]; #else - for(x=0; x<srcWidth; x++){ - dst[2*x+0]= - dst[2*x+1]= src[x]; - } + for (x=0; x<srcWidth; x++){ + dst[2*x+0]= + dst[2*x+1]= src[x]; + } #endif #ifdef HAVE_MMX -asm volatile( EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); +asm volatile( EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); #endif } @@ -2014,122 +2013,122 @@ asm volatile( EMMS" \n\t" * chrominance data is only taken from every secound line others are ignored FIXME write HQ version */ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, - long lumStride, long chromStride, long srcStride) + long width, long height, + long lumStride, long chromStride, long srcStride) { - long y; - const long chromWidth= width>>1; - for(y=0; y<height; y+=2) - { + long y; + const long chromWidth= width>>1; + for (y=0; y<height; y+=2) + { #ifdef HAVE_MMX - asm volatile( - "xorl %%eax, %%eax \n\t" - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 
- ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%eax, 4) \n\t" - "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) - "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) - "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) - "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) - "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) - "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) - "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) - "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) - "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) - - MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" - - "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) - "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) - "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) - "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) - "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) - "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) - "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) - "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) - "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" - - "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) - "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) - "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) - "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) - "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) - "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) - "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) - "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) - - MOVNTQ" %%mm0, (%3, %%eax) \n\t" - MOVNTQ" %%mm2, (%2, %%eax) \n\t" - - "addl $8, %%eax \n\t" - "cmpl %4, %%eax \n\t" - " jb 1b \n\t" - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%eax" - ); - - ydst += lumStride; - src += srcStride; - - asm volatile( - "xorl %%eax, %%eax \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%eax, 4) \n\t" - "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) - "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) - "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) - "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) - "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) - "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) - "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) - "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) - "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" - MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" - - "addl $8, %%eax \n\t" - "cmpl %4, %%eax \n\t" - " jb 1b \n\t" - - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%eax" - ); + asm volatile( + "xorl %%eax, %%eax \n\t" + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 
+ ASMALIGN(4) + "1: \n\t" + PREFETCH" 64(%0, %%eax, 4) \n\t" + "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) + "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) + "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) + "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) + "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) + "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) + "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) + "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) + "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) + + MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" + + "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) + "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) + "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) + "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) + "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) + "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) + "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) + "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) + "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" + + "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) + "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) + "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) + "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) + "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) + "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) + "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) + "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) + + MOVNTQ" %%mm0, (%3, %%eax) \n\t" + MOVNTQ" %%mm2, (%2, %%eax) \n\t" + + "addl $8, %%eax \n\t" + "cmpl %4, %%eax \n\t" + " jb 1b \n\t" + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%eax" + ); + + ydst += lumStride; + src += srcStride; + + asm volatile( + "xorl %%eax, %%eax \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 64(%0, %%eax, 4) \n\t" + "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) + "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) + "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) + "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) + "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) + "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) + "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) + "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" + MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" + + "addl $8, %%eax \n\t" + "cmpl %4, %%eax \n\t" + " jb 1b \n\t" + + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%eax" + ); #else - long i; - for(i=0; i<chromWidth; i++) - { - udst[i] = src[4*i+0]; - ydst[2*i+0] = src[4*i+1]; - vdst[i] = src[4*i+2]; - ydst[2*i+1] = src[4*i+3]; - } - ydst += lumStride; - src += srcStride; - - for(i=0; i<chromWidth; i++) - { - ydst[2*i+0] = src[4*i+1]; - ydst[2*i+1] = src[4*i+3]; - } -#endif - udst += chromStride; - vdst += chromStride; - ydst += lumStride; - src += srcStride; - } + long i; + for (i=0; i<chromWidth; i++) + { + udst[i] = src[4*i+0]; + ydst[2*i+0] = src[4*i+1]; + vdst[i] = src[4*i+2]; + ydst[2*i+1] = src[4*i+3]; + } + ydst += lumStride; + src += srcStride; + + for (i=0; i<chromWidth; i++) + { + ydst[2*i+0] = src[4*i+1]; + ydst[2*i+1] = src[4*i+3]; + } +#endif + udst += chromStride; + vdst += chromStride; + ydst += lumStride; + src += srcStride; + } #ifdef HAVE_MMX -asm volatile( EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); +asm volatile( EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); #endif } @@ -2140,604 +2139,604 @@ asm volatile( EMMS" \n\t" * chrominance data is only taken from every secound line others are ignored in 
the C version FIXME write HQ version */ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - long width, long height, - long lumStride, long chromStride, long srcStride) + long width, long height, + long lumStride, long chromStride, long srcStride) { - long y; - const long chromWidth= width>>1; + long y; + const long chromWidth= width>>1; #ifdef HAVE_MMX - for(y=0; y<height-2; y+=2) - { - long i; - for(i=0; i<2; i++) - { - asm volatile( - "mov %2, %%"REG_a" \n\t" - "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" - "movq "MANGLE(w1111)", %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%"REG_d") \n\t" - "movd (%0, %%"REG_d"), %%mm0 \n\t" - "movd 3(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 6(%0, %%"REG_d"), %%mm2 \n\t" - "movd 9(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" + for (y=0; y<height-2; y+=2) + { + long i; + for (i=0; i<2; i++) + { + asm volatile( + "mov %2, %%"REG_a" \n\t" + "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" + "movq "MANGLE(w1111)", %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 64(%0, %%"REG_d") \n\t" + "movd (%0, %%"REG_d"), %%mm0 \n\t" + "movd 3(%0, %%"REG_d"), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "movd 6(%0, %%"REG_d"), %%mm2 \n\t" + "movd 9(%0, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm1 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" #ifndef FAST_BGR2YV12 - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "packssdw %%mm2, %%mm0 \n\t" - "psraw $7, %%mm0 \n\t" - - "movd 12(%0, %%"REG_d"), %%mm4 \n\t" - "movd 15(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 18(%0, %%"REG_d"), %%mm2 \n\t" - "movd 21(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" + "psrad $8, %%mm0 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "pmaddwd %%mm5, %%mm0 \n\t" + "pmaddwd %%mm5, %%mm2 \n\t" + "packssdw %%mm2, %%mm0 \n\t" + "psraw $7, %%mm0 \n\t" + + "movd 12(%0, %%"REG_d"), %%mm4 \n\t" + "movd 15(%0, %%"REG_d"), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "movd 18(%0, %%"REG_d"), %%mm2 \n\t" + "movd 21(%0, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm4 \n\t" + "pmaddwd %%mm6, %%mm1 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" #ifndef FAST_BGR2YV12 - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm1, %%mm4 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "add $24, %%"REG_d" \n\t" - 
"packssdw %%mm2, %%mm4 \n\t" - "psraw $7, %%mm4 \n\t" - - "packuswb %%mm4, %%mm0 \n\t" - "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" - - MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+width*3), "r" (ydst+width), "g" (-width) - : "%"REG_a, "%"REG_d - ); - ydst += lumStride; - src += srcStride; - } - src -= srcStride*2; - asm volatile( - "mov %4, %%"REG_a" \n\t" - "movq "MANGLE(w1111)", %%mm5 \n\t" - "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t" - "add %%"REG_d", %%"REG_d" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%"REG_d") \n\t" - PREFETCH" 64(%1, %%"REG_d") \n\t" + "psrad $8, %%mm4 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm1, %%mm4 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "pmaddwd %%mm5, %%mm4 \n\t" + "pmaddwd %%mm5, %%mm2 \n\t" + "add $24, %%"REG_d" \n\t" + "packssdw %%mm2, %%mm4 \n\t" + "psraw $7, %%mm4 \n\t" + + "packuswb %%mm4, %%mm0 \n\t" + "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" + + MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + : : "r" (src+width*3), "r" (ydst+width), "g" (-width) + : "%"REG_a, "%"REG_d + ); + ydst += lumStride; + src += srcStride; + } + src -= srcStride*2; + asm volatile( + "mov %4, %%"REG_a" \n\t" + "movq "MANGLE(w1111)", %%mm5 \n\t" + "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" + "add %%"REG_d", %%"REG_d" \n\t" + ASMALIGN(4) + "1: \n\t" + PREFETCH" 64(%0, %%"REG_d") \n\t" + PREFETCH" 64(%1, %%"REG_d") \n\t" #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - "movq (%0, %%"REG_d"), %%mm0 \n\t" - "movq (%1, %%"REG_d"), %%mm1 \n\t" - "movq 6(%0, %%"REG_d"), %%mm2 \n\t" - "movq 6(%1, %%"REG_d"), %%mm3 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm0 \n\t" - "psrlq $24, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" + "movq (%0, %%"REG_d"), %%mm0 \n\t" + "movq (%1, %%"REG_d"), %%mm1 \n\t" + "movq 6(%0, %%"REG_d"), %%mm2 \n\t" + "movq 6(%1, %%"REG_d"), %%mm3 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlq $24, %%mm0 \n\t" + "psrlq $24, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" #else - "movd (%0, %%"REG_d"), %%mm0 \n\t" - "movd (%1, %%"REG_d"), %%mm1 \n\t" - "movd 3(%0, %%"REG_d"), %%mm2 \n\t" - "movd 3(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "movd 6(%0, %%"REG_d"), %%mm4 \n\t" - "movd 6(%1, %%"REG_d"), %%mm1 \n\t" - "movd 9(%0, %%"REG_d"), %%mm2 \n\t" - "movd 9(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw %%mm1, %%mm4 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm4, %%mm2 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm2 \n\t" -#endif - "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" - - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, 
%%mm2 \n\t" + "movd (%0, %%"REG_d"), %%mm0 \n\t" + "movd (%1, %%"REG_d"), %%mm1 \n\t" + "movd 3(%0, %%"REG_d"), %%mm2 \n\t" + "movd 3(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "movd 6(%0, %%"REG_d"), %%mm4 \n\t" + "movd 6(%1, %%"REG_d"), %%mm1 \n\t" + "movd 9(%0, %%"REG_d"), %%mm2 \n\t" + "movd 9(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm4 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm4, %%mm2 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm2 \n\t" +#endif + "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" + "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" + + "pmaddwd %%mm0, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" #ifndef FAST_BGR2YV12 - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm2, %%mm0 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 - "psraw $7, %%mm0 \n\t" + "psrad $8, %%mm0 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm2, %%mm0 \n\t" + "packssdw %%mm3, %%mm1 \n\t" + "pmaddwd %%mm5, %%mm0 \n\t" + "pmaddwd %%mm5, %%mm1 \n\t" + "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 + "psraw $7, %%mm0 \n\t" #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - "movq 12(%0, %%"REG_d"), %%mm4 \n\t" - "movq 12(%1, %%"REG_d"), %%mm1 \n\t" - "movq 18(%0, %%"REG_d"), %%mm2 \n\t" - "movq 18(%1, %%"REG_d"), %%mm3 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm4, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm4 \n\t" - "psrlq $24, %%mm2 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" + "movq 12(%0, %%"REG_d"), %%mm4 \n\t" + "movq 12(%1, %%"REG_d"), %%mm1 \n\t" + "movq 18(%0, %%"REG_d"), %%mm2 \n\t" + "movq 18(%1, %%"REG_d"), %%mm3 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm4, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlq $24, %%mm4 \n\t" + "psrlq $24, %%mm2 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" #else - "movd 12(%0, %%"REG_d"), %%mm4 \n\t" - "movd 12(%1, %%"REG_d"), %%mm1 \n\t" - "movd 15(%0, %%"REG_d"), %%mm2 \n\t" - "movd 15(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw %%mm1, %%mm4 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm4 \n\t" - "movd 18(%0, %%"REG_d"), %%mm5 \n\t" - "movd 18(%1, %%"REG_d"), %%mm1 \n\t" - "movd 21(%0, %%"REG_d"), %%mm2 \n\t" - "movd 21(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw %%mm1, %%mm5 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm5, %%mm2 \n\t" - "movq "MANGLE(w1111)", %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm2 \n\t" -#endif - "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" - - "pmaddwd %%mm4, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, 
%%mm4 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" + "movd 12(%0, %%"REG_d"), %%mm4 \n\t" + "movd 12(%1, %%"REG_d"), %%mm1 \n\t" + "movd 15(%0, %%"REG_d"), %%mm2 \n\t" + "movd 15(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm4 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm2, %%mm4 \n\t" + "movd 18(%0, %%"REG_d"), %%mm5 \n\t" + "movd 18(%1, %%"REG_d"), %%mm1 \n\t" + "movd 21(%0, %%"REG_d"), %%mm2 \n\t" + "movd 21(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm5 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm5, %%mm2 \n\t" + "movq "MANGLE(w1111)", %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm2 \n\t" +#endif + "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" + "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" + + "pmaddwd %%mm4, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm4 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" #ifndef FAST_BGR2YV12 - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm2, %%mm4 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "add $24, %%"REG_d" \n\t" - "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 - "psraw $7, %%mm4 \n\t" - - "movq %%mm0, %%mm1 \n\t" - "punpckldq %%mm4, %%mm0 \n\t" - "punpckhdq %%mm4, %%mm1 \n\t" - "packsswb %%mm1, %%mm0 \n\t" - "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" - "movd %%mm0, (%2, %%"REG_a") \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, (%3, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) - : "%"REG_a, "%"REG_d - ); - - udst += chromStride; - vdst += chromStride; - src += srcStride*2; - } - - asm volatile( EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); + "psrad $8, %%mm4 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm2, %%mm4 \n\t" + "packssdw %%mm3, %%mm1 \n\t" + "pmaddwd %%mm5, %%mm4 \n\t" + "pmaddwd %%mm5, %%mm1 \n\t" + "add $24, %%"REG_d" \n\t" + "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 + "psraw $7, %%mm4 \n\t" + + "movq %%mm0, %%mm1 \n\t" + "punpckldq %%mm4, %%mm0 \n\t" + "punpckhdq %%mm4, %%mm1 \n\t" + "packsswb %%mm1, %%mm0 \n\t" + "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" + "movd %%mm0, (%2, %%"REG_a") \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movd %%mm0, (%3, %%"REG_a") \n\t" + "add $4, %%"REG_a" \n\t" + " js 1b \n\t" + : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) + : "%"REG_a, "%"REG_d + ); + + udst += chromStride; + vdst += chromStride; + src += srcStride*2; + } + + asm volatile( EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); #else - y=0; -#endif - for(; y<height; y+=2) - { - long i; - for(i=0; i<chromWidth; i++) - { - unsigned int b= src[6*i+0]; - unsigned int g= src[6*i+1]; - unsigned int r= src[6*i+2]; - - unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; - unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; - unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; - - udst[i] = U; - vdst[i] = V; - ydst[2*i] = Y; - - b= src[6*i+3]; - g= src[6*i+4]; - r= src[6*i+5]; - - Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; - ydst[2*i+1] = Y; - } - ydst += lumStride; - 
src += srcStride; - - for(i=0; i<chromWidth; i++) - { - unsigned int b= src[6*i+0]; - unsigned int g= src[6*i+1]; - unsigned int r= src[6*i+2]; - - unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; - - ydst[2*i] = Y; - - b= src[6*i+3]; - g= src[6*i+4]; - r= src[6*i+5]; - - Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; - ydst[2*i+1] = Y; - } - udst += chromStride; - vdst += chromStride; - ydst += lumStride; - src += srcStride; - } + y=0; +#endif + for (; y<height; y+=2) + { + long i; + for (i=0; i<chromWidth; i++) + { + unsigned int b = src[6*i+0]; + unsigned int g = src[6*i+1]; + unsigned int r = src[6*i+2]; + + unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; + unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; + unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; + + udst[i] = U; + vdst[i] = V; + ydst[2*i] = Y; + + b = src[6*i+3]; + g = src[6*i+4]; + r = src[6*i+5]; + + Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; + ydst[2*i+1] = Y; + } + ydst += lumStride; + src += srcStride; + + for (i=0; i<chromWidth; i++) + { + unsigned int b = src[6*i+0]; + unsigned int g = src[6*i+1]; + unsigned int r = src[6*i+2]; + + unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; + + ydst[2*i] = Y; + + b = src[6*i+3]; + g = src[6*i+4]; + r = src[6*i+5]; + + Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; + ydst[2*i+1] = Y; + } + udst += chromStride; + vdst += chromStride; + ydst += lumStride; + src += srcStride; + } } void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, - long width, long height, long src1Stride, - long src2Stride, long dstStride){ - long h; + long width, long height, long src1Stride, + long src2Stride, long dstStride){ + long h; - for(h=0; h < height; h++) - { - long w; + for (h=0; h < height; h++) + { + long w; #ifdef HAVE_MMX #ifdef HAVE_SSE2 - asm( - "xor %%"REG_a", %%"REG_a" \n\t" - "1: \n\t" - PREFETCH" 64(%1, %%"REG_a") \n\t" - PREFETCH" 64(%2, %%"REG_a") \n\t" - "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" - "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" - "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" - "punpcklbw %%xmm2, %%xmm0 \n\t" - "punpckhbw %%xmm2, %%xmm1 \n\t" - "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t" - "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t" - "add $16, %%"REG_a" \n\t" - "cmp %3, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) - : "memory", "%"REG_a"" - ); + asm( + "xor %%"REG_a", %%"REG_a" \n\t" + "1: \n\t" + PREFETCH" 64(%1, %%"REG_a") \n\t" + PREFETCH" 64(%2, %%"REG_a") \n\t" + "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" + "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" + "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" + "punpcklbw %%xmm2, %%xmm0 \n\t" + "punpckhbw %%xmm2, %%xmm1 \n\t" + "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" + "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" + "add $16, %%"REG_a" \n\t" + "cmp %3, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) + : "memory", "%"REG_a"" + ); #else - asm( - "xor %%"REG_a", %%"REG_a" \n\t" - "1: \n\t" - PREFETCH" 64(%1, %%"REG_a") \n\t" - PREFETCH" 64(%2, %%"REG_a") \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 8(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - "movq 8(%2, %%"REG_a"), %%mm5 \n\t" - "punpcklbw %%mm4, %%mm0 \n\t" - "punpckhbw %%mm4, %%mm1 \n\t" - "punpcklbw %%mm5, %%mm2 \n\t" - "punpckhbw %%mm5, %%mm3 \n\t" - MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t" - MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t" - MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t" - 
MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t" - "add $16, %%"REG_a" \n\t" - "cmp %3, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) - : "memory", "%"REG_a - ); -#endif - for(w= (width&(~15)); w < width; w++) - { - dest[2*w+0] = src1[w]; - dest[2*w+1] = src2[w]; - } + asm( + "xor %%"REG_a", %%"REG_a" \n\t" + "1: \n\t" + PREFETCH" 64(%1, %%"REG_a") \n\t" + PREFETCH" 64(%2, %%"REG_a") \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 8(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + "movq 8(%2, %%"REG_a"), %%mm5 \n\t" + "punpcklbw %%mm4, %%mm0 \n\t" + "punpckhbw %%mm4, %%mm1 \n\t" + "punpcklbw %%mm5, %%mm2 \n\t" + "punpckhbw %%mm5, %%mm3 \n\t" + MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" + "add $16, %%"REG_a" \n\t" + "cmp %3, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) + : "memory", "%"REG_a + ); +#endif + for (w= (width&(~15)); w < width; w++) + { + dest[2*w+0] = src1[w]; + dest[2*w+1] = src2[w]; + } #else - for(w=0; w < width; w++) - { - dest[2*w+0] = src1[w]; - dest[2*w+1] = src2[w]; - } + for (w=0; w < width; w++) + { + dest[2*w+0] = src1[w]; + dest[2*w+1] = src2[w]; + } #endif - dest += dstStride; + dest += dstStride; src1 += src1Stride; src2 += src2Stride; - } + } #ifdef HAVE_MMX - asm( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); + asm( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); #endif } static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, - uint8_t *dst1, uint8_t *dst2, - long width, long height, - long srcStride1, long srcStride2, - long dstStride1, long dstStride2) + uint8_t *dst1, uint8_t *dst2, + long width, long height, + long srcStride1, long srcStride2, + long dstStride1, long dstStride2) { long y,x,w,h; w=width/2; h=height/2; #ifdef HAVE_MMX asm volatile( - PREFETCH" %0\n\t" - PREFETCH" %1\n\t" - ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); -#endif - for(y=0;y<h;y++){ - const uint8_t* s1=src1+srcStride1*(y>>1); - uint8_t* d=dst1+dstStride1*y; - x=0; + PREFETCH" %0 \n\t" + PREFETCH" %1 \n\t" + ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); +#endif + for (y=0;y<h;y++){ + const uint8_t* s1=src1+srcStride1*(y>>1); + uint8_t* d=dst1+dstStride1*y; + x=0; #ifdef HAVE_MMX - for(;x<w-31;x+=32) - { - asm volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq 8%1, %%mm2\n\t" - "movq 16%1, %%mm4\n\t" - "movq 24%1, %%mm6\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "movq %%mm6, %%mm7\n\t" - "punpcklbw %%mm0, %%mm0\n\t" - "punpckhbw %%mm1, %%mm1\n\t" - "punpcklbw %%mm2, %%mm2\n\t" - "punpckhbw %%mm3, %%mm3\n\t" - "punpcklbw %%mm4, %%mm4\n\t" - "punpckhbw %%mm5, %%mm5\n\t" - "punpcklbw %%mm6, %%mm6\n\t" - "punpckhbw %%mm7, %%mm7\n\t" - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm1, 8%0\n\t" - MOVNTQ" %%mm2, 16%0\n\t" - MOVNTQ" %%mm3, 24%0\n\t" - MOVNTQ" %%mm4, 32%0\n\t" - MOVNTQ" %%mm5, 40%0\n\t" - MOVNTQ" %%mm6, 48%0\n\t" - MOVNTQ" %%mm7, 56%0" - :"=m"(d[2*x]) - :"m"(s1[x]) - :"memory"); - } -#endif - for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; + for (;x<w-31;x+=32) + { + asm volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq 8%1, %%mm2 \n\t" + "movq 16%1, %%mm4 \n\t" + "movq 24%1, %%mm6 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "movq %%mm6, %%mm7 \n\t" + 
"punpcklbw %%mm0, %%mm0 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpcklbw %%mm2, %%mm2 \n\t" + "punpckhbw %%mm3, %%mm3 \n\t" + "punpcklbw %%mm4, %%mm4 \n\t" + "punpckhbw %%mm5, %%mm5 \n\t" + "punpcklbw %%mm6, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm7 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm1, 8%0 \n\t" + MOVNTQ" %%mm2, 16%0 \n\t" + MOVNTQ" %%mm3, 24%0 \n\t" + MOVNTQ" %%mm4, 32%0 \n\t" + MOVNTQ" %%mm5, 40%0 \n\t" + MOVNTQ" %%mm6, 48%0 \n\t" + MOVNTQ" %%mm7, 56%0" + :"=m"(d[2*x]) + :"m"(s1[x]) + :"memory"); + } +#endif + for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; } - for(y=0;y<h;y++){ - const uint8_t* s2=src2+srcStride2*(y>>1); - uint8_t* d=dst2+dstStride2*y; - x=0; + for (y=0;y<h;y++){ + const uint8_t* s2=src2+srcStride2*(y>>1); + uint8_t* d=dst2+dstStride2*y; + x=0; #ifdef HAVE_MMX - for(;x<w-31;x+=32) - { - asm volatile( - PREFETCH" 32%1\n\t" - "movq %1, %%mm0\n\t" - "movq 8%1, %%mm2\n\t" - "movq 16%1, %%mm4\n\t" - "movq 24%1, %%mm6\n\t" - "movq %%mm0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "movq %%mm6, %%mm7\n\t" - "punpcklbw %%mm0, %%mm0\n\t" - "punpckhbw %%mm1, %%mm1\n\t" - "punpcklbw %%mm2, %%mm2\n\t" - "punpckhbw %%mm3, %%mm3\n\t" - "punpcklbw %%mm4, %%mm4\n\t" - "punpckhbw %%mm5, %%mm5\n\t" - "punpcklbw %%mm6, %%mm6\n\t" - "punpckhbw %%mm7, %%mm7\n\t" - MOVNTQ" %%mm0, %0\n\t" - MOVNTQ" %%mm1, 8%0\n\t" - MOVNTQ" %%mm2, 16%0\n\t" - MOVNTQ" %%mm3, 24%0\n\t" - MOVNTQ" %%mm4, 32%0\n\t" - MOVNTQ" %%mm5, 40%0\n\t" - MOVNTQ" %%mm6, 48%0\n\t" - MOVNTQ" %%mm7, 56%0" - :"=m"(d[2*x]) - :"m"(s2[x]) - :"memory"); - } -#endif - for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; + for (;x<w-31;x+=32) + { + asm volatile( + PREFETCH" 32%1 \n\t" + "movq %1, %%mm0 \n\t" + "movq 8%1, %%mm2 \n\t" + "movq 16%1, %%mm4 \n\t" + "movq 24%1, %%mm6 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "movq %%mm6, %%mm7 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpcklbw %%mm2, %%mm2 \n\t" + "punpckhbw %%mm3, %%mm3 \n\t" + "punpcklbw %%mm4, %%mm4 \n\t" + "punpckhbw %%mm5, %%mm5 \n\t" + "punpcklbw %%mm6, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm7 \n\t" + MOVNTQ" %%mm0, %0 \n\t" + MOVNTQ" %%mm1, 8%0 \n\t" + MOVNTQ" %%mm2, 16%0 \n\t" + MOVNTQ" %%mm3, 24%0 \n\t" + MOVNTQ" %%mm4, 32%0 \n\t" + MOVNTQ" %%mm5, 40%0 \n\t" + MOVNTQ" %%mm6, 48%0 \n\t" + MOVNTQ" %%mm7, 56%0" + :"=m"(d[2*x]) + :"m"(s2[x]) + :"memory"); + } +#endif + for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; } #ifdef HAVE_MMX - asm( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); + asm( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); #endif } static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, - uint8_t *dst, - long width, long height, - long srcStride1, long srcStride2, - long srcStride3, long dstStride) + uint8_t *dst, + long width, long height, + long srcStride1, long srcStride2, + long srcStride3, long dstStride) { long y,x,w,h; w=width/2; h=height; - for(y=0;y<h;y++){ - const uint8_t* yp=src1+srcStride1*y; - const uint8_t* up=src2+srcStride2*(y>>2); - const uint8_t* vp=src3+srcStride3*(y>>2); - uint8_t* d=dst+dstStride*y; - x=0; + for (y=0;y<h;y++){ + const uint8_t* yp=src1+srcStride1*y; + const uint8_t* up=src2+srcStride2*(y>>2); + const uint8_t* vp=src3+srcStride3*(y>>2); + uint8_t* d=dst+dstStride*y; + x=0; #ifdef HAVE_MMX - for(;x<w-7;x+=8) - { - asm volatile( - PREFETCH" 32(%1, %0)\n\t" - PREFETCH" 32(%2, %0)\n\t" - PREFETCH" 32(%3, %0)\n\t" - "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ - "movq (%2, %0), %%mm1\n\t" /* 
U0U1U2U3U4U5U6U7 */ - "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */ - "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ - "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */ - "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */ - "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */ - "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */ - "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */ - "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */ - - "movq %%mm1, %%mm6\n\t" - "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/ - "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ - "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ - MOVNTQ" %%mm0, (%4, %0, 8)\n\t" - MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t" - - "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/ - "movq 8(%1, %0, 4), %%mm0\n\t" - "movq %%mm0, %%mm3\n\t" - "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/ - "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/ - MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t" - MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t" - - "movq %%mm4, %%mm6\n\t" - "movq 16(%1, %0, 4), %%mm0\n\t" - "movq %%mm0, %%mm3\n\t" - "punpcklbw %%mm5, %%mm4\n\t" - "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/ - "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/ - MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t" - MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t" - - "punpckhbw %%mm5, %%mm6\n\t" - "movq 24(%1, %0, 4), %%mm0\n\t" - "movq %%mm0, %%mm3\n\t" - "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/ - "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/ - MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t" - MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t" - - : "+r" (x) - : "r"(yp), "r" (up), "r"(vp), "r"(d) - :"memory"); - } -#endif - for(; x<w; x++) - { - const long x2= x<<2; - d[8*x+0]=yp[x2]; - d[8*x+1]=up[x]; - d[8*x+2]=yp[x2+1]; - d[8*x+3]=vp[x]; - d[8*x+4]=yp[x2+2]; - d[8*x+5]=up[x]; - d[8*x+6]=yp[x2+3]; - d[8*x+7]=vp[x]; - } + for (;x<w-7;x+=8) + { + asm volatile( + PREFETCH" 32(%1, %0) \n\t" + PREFETCH" 32(%2, %0) \n\t" + PREFETCH" 32(%3, %0) \n\t" + "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ + "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ + "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ + "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ + "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ + "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ + "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ + "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ + "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ + "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ + + "movq %%mm1, %%mm6 \n\t" + "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ + "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ + "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ + MOVNTQ" %%mm0, (%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" + + "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ + "movq 8(%1, %0, 4), %%mm0 \n\t" + "movq %%mm0, %%mm3 \n\t" + "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ + "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ + MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" + + "movq %%mm4, %%mm6 \n\t" + "movq 16(%1, %0, 4), %%mm0 \n\t" + "movq %%mm0, %%mm3 \n\t" + "punpcklbw %%mm5, %%mm4 \n\t" + "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ + "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ + MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" + + "punpckhbw %%mm5, %%mm6 \n\t" + "movq 24(%1, %0, 4), %%mm0 \n\t" + "movq %%mm0, %%mm3 \n\t" + "punpcklbw %%mm6, 
%%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ + "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ + MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" + + : "+r" (x) + : "r"(yp), "r" (up), "r"(vp), "r"(d) + :"memory"); + } +#endif + for (; x<w; x++) + { + const long x2 = x<<2; + d[8*x+0] = yp[x2]; + d[8*x+1] = up[x]; + d[8*x+2] = yp[x2+1]; + d[8*x+3] = vp[x]; + d[8*x+4] = yp[x2+2]; + d[8*x+5] = up[x]; + d[8*x+6] = yp[x2+3]; + d[8*x+7] = vp[x]; + } } #ifdef HAVE_MMX - asm( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); + asm( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); #endif } static inline void RENAME(rgb2rgb_init)(void){ - rgb15to16= RENAME(rgb15to16); - rgb15to24= RENAME(rgb15to24); - rgb15to32= RENAME(rgb15to32); - rgb16to24= RENAME(rgb16to24); - rgb16to32= RENAME(rgb16to32); - rgb16to15= RENAME(rgb16to15); - rgb24to16= RENAME(rgb24to16); - rgb24to15= RENAME(rgb24to15); - rgb24to32= RENAME(rgb24to32); - rgb32to16= RENAME(rgb32to16); - rgb32to15= RENAME(rgb32to15); - rgb32to24= RENAME(rgb32to24); - rgb24tobgr15= RENAME(rgb24tobgr15); - rgb24tobgr16= RENAME(rgb24tobgr16); - rgb24tobgr24= RENAME(rgb24tobgr24); - rgb32tobgr32= RENAME(rgb32tobgr32); - rgb32tobgr16= RENAME(rgb32tobgr16); - rgb32tobgr15= RENAME(rgb32tobgr15); - yv12toyuy2= RENAME(yv12toyuy2); - yv12touyvy= RENAME(yv12touyvy); - yuv422ptoyuy2= RENAME(yuv422ptoyuy2); - yuy2toyv12= RENAME(yuy2toyv12); -// uyvytoyv12= RENAME(uyvytoyv12); -// yvu9toyv12= RENAME(yvu9toyv12); - planar2x= RENAME(planar2x); - rgb24toyv12= RENAME(rgb24toyv12); - interleaveBytes= RENAME(interleaveBytes); - vu9_to_vu12= RENAME(vu9_to_vu12); - yvu9_to_yuy2= RENAME(yvu9_to_yuy2); + rgb15to16 = RENAME(rgb15to16); + rgb15to24 = RENAME(rgb15to24); + rgb15to32 = RENAME(rgb15to32); + rgb16to24 = RENAME(rgb16to24); + rgb16to32 = RENAME(rgb16to32); + rgb16to15 = RENAME(rgb16to15); + rgb24to16 = RENAME(rgb24to16); + rgb24to15 = RENAME(rgb24to15); + rgb24to32 = RENAME(rgb24to32); + rgb32to16 = RENAME(rgb32to16); + rgb32to15 = RENAME(rgb32to15); + rgb32to24 = RENAME(rgb32to24); + rgb24tobgr15 = RENAME(rgb24tobgr15); + rgb24tobgr16 = RENAME(rgb24tobgr16); + rgb24tobgr24 = RENAME(rgb24tobgr24); + rgb32tobgr32 = RENAME(rgb32tobgr32); + rgb32tobgr16 = RENAME(rgb32tobgr16); + rgb32tobgr15 = RENAME(rgb32tobgr15); + yv12toyuy2 = RENAME(yv12toyuy2); + yv12touyvy = RENAME(yv12touyvy); + yuv422ptoyuy2 = RENAME(yuv422ptoyuy2); + yuy2toyv12 = RENAME(yuy2toyv12); +// uyvytoyv12 = RENAME(uyvytoyv12); +// yvu9toyv12 = RENAME(yvu9toyv12); + planar2x = RENAME(planar2x); + rgb24toyv12 = RENAME(rgb24toyv12); + interleaveBytes = RENAME(interleaveBytes); + vu9_to_vu12 = RENAME(vu9_to_vu12); + yvu9_to_yuy2 = RENAME(yvu9_to_yuy2); } diff --git a/libswscale/swscale_altivec_template.c b/libswscale/swscale_altivec_template.c index d5c0e1f34e..e9b51a808e 100644 --- a/libswscale/swscale_altivec_template.c +++ b/libswscale/swscale_altivec_template.c @@ -31,518 +31,514 @@ static inline void altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) { - register int i; - vector unsigned int altivec_vectorShiftInt19 = - vec_add(vec_splat_u32(10),vec_splat_u32(9)); - if ((unsigned long)dest % 16) { - /* badly aligned store, we force store alignement */ - /* and will handle load misalignement on val w/ vec_perm */ - vector unsigned char perm1; - vector signed int v1; - for (i = 0 ; (i < dstW) && - (((unsigned long)dest + i) % 16) ; i++) { - int t = val[i] >> 19; - dest[i] = (t < 0) ? 0 : ((t > 255) ? 
255 : t); - } - perm1 = vec_lvsl(i << 2, val); - v1 = vec_ld(i << 2, val); - for ( ; i < (dstW - 15); i+=16) { - int offset = i << 2; - vector signed int v2 = vec_ld(offset + 16, val); - vector signed int v3 = vec_ld(offset + 32, val); - vector signed int v4 = vec_ld(offset + 48, val); - vector signed int v5 = vec_ld(offset + 64, val); - vector signed int v12 = vec_perm(v1,v2,perm1); - vector signed int v23 = vec_perm(v2,v3,perm1); - vector signed int v34 = vec_perm(v3,v4,perm1); - vector signed int v45 = vec_perm(v4,v5,perm1); - - vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19); - vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19); - vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19); - vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19); - vector unsigned short vs1 = vec_packsu(vA, vB); - vector unsigned short vs2 = vec_packsu(vC, vD); - vector unsigned char vf = vec_packsu(vs1, vs2); - vec_st(vf, i, dest); - v1 = v5; + register int i; + vector unsigned int altivec_vectorShiftInt19 = + vec_add(vec_splat_u32(10),vec_splat_u32(9)); + if ((unsigned long)dest % 16) { + /* badly aligned store, we force store alignement */ + /* and will handle load misalignement on val w/ vec_perm */ + vector unsigned char perm1; + vector signed int v1; + for (i = 0 ; (i < dstW) && + (((unsigned long)dest + i) % 16) ; i++) { + int t = val[i] >> 19; + dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t); + } + perm1 = vec_lvsl(i << 2, val); + v1 = vec_ld(i << 2, val); + for ( ; i < (dstW - 15); i+=16) { + int offset = i << 2; + vector signed int v2 = vec_ld(offset + 16, val); + vector signed int v3 = vec_ld(offset + 32, val); + vector signed int v4 = vec_ld(offset + 48, val); + vector signed int v5 = vec_ld(offset + 64, val); + vector signed int v12 = vec_perm(v1,v2,perm1); + vector signed int v23 = vec_perm(v2,v3,perm1); + vector signed int v34 = vec_perm(v3,v4,perm1); + vector signed int v45 = vec_perm(v4,v5,perm1); + + vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19); + vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19); + vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19); + vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19); + vector unsigned short vs1 = vec_packsu(vA, vB); + vector unsigned short vs2 = vec_packsu(vC, vD); + vector unsigned char vf = vec_packsu(vs1, vs2); + vec_st(vf, i, dest); + v1 = v5; + } + } else { // dest is properly aligned, great + for (i = 0; i < (dstW - 15); i+=16) { + int offset = i << 2; + vector signed int v1 = vec_ld(offset, val); + vector signed int v2 = vec_ld(offset + 16, val); + vector signed int v3 = vec_ld(offset + 32, val); + vector signed int v4 = vec_ld(offset + 48, val); + vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19); + vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19); + vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19); + vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19); + vector unsigned short vs1 = vec_packsu(v5, v6); + vector unsigned short vs2 = vec_packsu(v7, v8); + vector unsigned char vf = vec_packsu(vs1, vs2); + vec_st(vf, i, dest); + } } - } else { // dest is properly aligned, great - for (i = 0; i < (dstW - 15); i+=16) { - int offset = i << 2; - vector signed int v1 = vec_ld(offset, val); - vector signed int v2 = vec_ld(offset + 16, val); - vector signed int v3 = vec_ld(offset + 32, val); - vector signed int v4 = vec_ld(offset + 48, val); - vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19); - vector signed int 
v6 = vec_sra(v2, altivec_vectorShiftInt19); - vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19); - vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19); - vector unsigned short vs1 = vec_packsu(v5, v6); - vector unsigned short vs2 = vec_packsu(v7, v8); - vector unsigned char vf = vec_packsu(vs1, vs2); - vec_st(vf, i, dest); + for ( ; i < dstW ; i++) { + int t = val[i] >> 19; + dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t); } - } - for ( ; i < dstW ; i++) { - int t = val[i] >> 19; - dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t); - } } static inline void yuv2yuvX_altivec_real(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, - int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, - uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW) + int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, + uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW) { - const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)}; - register int i, j; - { - int __attribute__ ((aligned (16))) val[dstW]; + const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)}; + register int i, j; + { + int __attribute__ ((aligned (16))) val[dstW]; - for (i = 0; i < (dstW -7); i+=4) { - vec_st(vini, i << 2, val); - } - for (; i < dstW; i++) { - val[i] = (1 << 18); - } + for (i = 0; i < (dstW -7); i+=4) { + vec_st(vini, i << 2, val); + } + for (; i < dstW; i++) { + val[i] = (1 << 18); + } - for (j = 0; j < lumFilterSize; j++) { - vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter); - vector unsigned char perm, perm0 = vec_lvsl(j << 1, lumFilter); - vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0); - vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter + for (j = 0; j < lumFilterSize; j++) { + vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter); + vector unsigned char perm, perm0 = vec_lvsl(j << 1, lumFilter); + vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0); + vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter - perm = vec_lvsl(0, lumSrc[j]); - l1 = vec_ld(0, lumSrc[j]); + perm = vec_lvsl(0, lumSrc[j]); + l1 = vec_ld(0, lumSrc[j]); - for (i = 0; i < (dstW - 7); i+=8) { - int offset = i << 2; - vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]); + for (i = 0; i < (dstW - 7); i+=8) { + int offset = i << 2; + vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]); - vector signed int v1 = vec_ld(offset, val); - vector signed int v2 = vec_ld(offset + 16, val); + vector signed int v1 = vec_ld(offset, val); + vector signed int v2 = vec_ld(offset + 16, val); - vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7] + vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7] - vector signed int i1 = vec_mule(vLumFilter, ls); - vector signed int i2 = vec_mulo(vLumFilter, ls); + vector signed int i1 = vec_mule(vLumFilter, ls); + vector signed int i2 = vec_mulo(vLumFilter, ls); - vector signed int vf1 = vec_mergeh(i1, i2); - vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j] + vector signed int vf1 = vec_mergeh(i1, i2); + vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... 
lumSrc[j][i+7] * lumFilter[j] - vector signed int vo1 = vec_add(v1, vf1); - vector signed int vo2 = vec_add(v2, vf2); + vector signed int vo1 = vec_add(v1, vf1); + vector signed int vo2 = vec_add(v2, vf2); - vec_st(vo1, offset, val); - vec_st(vo2, offset + 16, val); + vec_st(vo1, offset, val); + vec_st(vo2, offset + 16, val); - l1 = l2; - } - for ( ; i < dstW; i++) { - val[i] += lumSrc[j][i] * lumFilter[j]; - } - } - altivec_packIntArrayToCharArray(val,dest,dstW); - } - if (uDest != 0) { - int __attribute__ ((aligned (16))) u[chrDstW]; - int __attribute__ ((aligned (16))) v[chrDstW]; - - for (i = 0; i < (chrDstW -7); i+=4) { - vec_st(vini, i << 2, u); - vec_st(vini, i << 2, v); - } - for (; i < chrDstW; i++) { - u[i] = (1 << 18); - v[i] = (1 << 18); + l1 = l2; + } + for ( ; i < dstW; i++) { + val[i] += lumSrc[j][i] * lumFilter[j]; + } + } + altivec_packIntArrayToCharArray(val,dest,dstW); } + if (uDest != 0) { + int __attribute__ ((aligned (16))) u[chrDstW]; + int __attribute__ ((aligned (16))) v[chrDstW]; + + for (i = 0; i < (chrDstW -7); i+=4) { + vec_st(vini, i << 2, u); + vec_st(vini, i << 2, v); + } + for (; i < chrDstW; i++) { + u[i] = (1 << 18); + v[i] = (1 << 18); + } - for (j = 0; j < chrFilterSize; j++) { - vector signed short l1, l1_V, vChrFilter = vec_ld(j << 1, chrFilter); - vector unsigned char perm, perm0 = vec_lvsl(j << 1, chrFilter); - vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0); - vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter - - perm = vec_lvsl(0, chrSrc[j]); - l1 = vec_ld(0, chrSrc[j]); - l1_V = vec_ld(2048 << 1, chrSrc[j]); - - for (i = 0; i < (chrDstW - 7); i+=8) { - int offset = i << 2; - vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]); - vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]); - - vector signed int v1 = vec_ld(offset, u); - vector signed int v2 = vec_ld(offset + 16, u); - vector signed int v1_V = vec_ld(offset, v); - vector signed int v2_V = vec_ld(offset + 16, v); - - vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7] - vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055] - - vector signed int i1 = vec_mule(vChrFilter, ls); - vector signed int i2 = vec_mulo(vChrFilter, ls); - vector signed int i1_V = vec_mule(vChrFilter, ls_V); - vector signed int i2_V = vec_mulo(vChrFilter, ls_V); - - vector signed int vf1 = vec_mergeh(i1, i2); - vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j] - vector signed int vf1_V = vec_mergeh(i1_V, i2_V); - vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i] * chrFilter[j] ... 
chrSrc[j][i+7] * chrFilter[j] - - vector signed int vo1 = vec_add(v1, vf1); - vector signed int vo2 = vec_add(v2, vf2); - vector signed int vo1_V = vec_add(v1_V, vf1_V); - vector signed int vo2_V = vec_add(v2_V, vf2_V); - - vec_st(vo1, offset, u); - vec_st(vo2, offset + 16, u); - vec_st(vo1_V, offset, v); - vec_st(vo2_V, offset + 16, v); - - l1 = l2; - l1_V = l2_V; - } - for ( ; i < chrDstW; i++) { - u[i] += chrSrc[j][i] * chrFilter[j]; - v[i] += chrSrc[j][i + 2048] * chrFilter[j]; - } + for (j = 0; j < chrFilterSize; j++) { + vector signed short l1, l1_V, vChrFilter = vec_ld(j << 1, chrFilter); + vector unsigned char perm, perm0 = vec_lvsl(j << 1, chrFilter); + vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0); + vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter + + perm = vec_lvsl(0, chrSrc[j]); + l1 = vec_ld(0, chrSrc[j]); + l1_V = vec_ld(2048 << 1, chrSrc[j]); + + for (i = 0; i < (chrDstW - 7); i+=8) { + int offset = i << 2; + vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]); + vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]); + + vector signed int v1 = vec_ld(offset, u); + vector signed int v2 = vec_ld(offset + 16, u); + vector signed int v1_V = vec_ld(offset, v); + vector signed int v2_V = vec_ld(offset + 16, v); + + vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7] + vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055] + + vector signed int i1 = vec_mule(vChrFilter, ls); + vector signed int i2 = vec_mulo(vChrFilter, ls); + vector signed int i1_V = vec_mule(vChrFilter, ls_V); + vector signed int i2_V = vec_mulo(vChrFilter, ls_V); + + vector signed int vf1 = vec_mergeh(i1, i2); + vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j] + vector signed int vf1_V = vec_mergeh(i1_V, i2_V); + vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i] * chrFilter[j] ... 
chrSrc[j][i+7] * chrFilter[j] + + vector signed int vo1 = vec_add(v1, vf1); + vector signed int vo2 = vec_add(v2, vf2); + vector signed int vo1_V = vec_add(v1_V, vf1_V); + vector signed int vo2_V = vec_add(v2_V, vf2_V); + + vec_st(vo1, offset, u); + vec_st(vo2, offset + 16, u); + vec_st(vo1_V, offset, v); + vec_st(vo2_V, offset + 16, v); + + l1 = l2; + l1_V = l2_V; + } + for ( ; i < chrDstW; i++) { + u[i] += chrSrc[j][i] * chrFilter[j]; + v[i] += chrSrc[j][i + 2048] * chrFilter[j]; + } + } + altivec_packIntArrayToCharArray(u,uDest,chrDstW); + altivec_packIntArrayToCharArray(v,vDest,chrDstW); } - altivec_packIntArrayToCharArray(u,uDest,chrDstW); - altivec_packIntArrayToCharArray(v,vDest,chrDstW); - } } static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) { - register int i; - int __attribute__ ((aligned (16))) tempo[4]; - - if (filterSize % 4) { - for(i=0; i<dstW; i++) { - register int j; - register int srcPos = filterPos[i]; - register int val = 0; - for(j=0; j<filterSize; j++) { - val += ((int)src[srcPos + j])*filter[filterSize*i + j]; - } - dst[i] = av_clip(val>>7, 0, (1<<15)-1); + register int i; + int __attribute__ ((aligned (16))) tempo[4]; + + if (filterSize % 4) { + for (i=0; i<dstW; i++) { + register int j; + register int srcPos = filterPos[i]; + register int val = 0; + for (j=0; j<filterSize; j++) { + val += ((int)src[srcPos + j])*filter[filterSize*i + j]; + } + dst[i] = av_clip(val>>7, 0, (1<<15)-1); + } } - } - else - switch (filterSize) { - case 4: + else + switch (filterSize) { + case 4: { - for(i=0; i<dstW; i++) { - register int srcPos = filterPos[i]; - - vector unsigned char src_v0 = vec_ld(srcPos, src); - vector unsigned char src_v1, src_vF; - vector signed short src_v, filter_v; - vector signed int val_vEven, val_s; - if ((((int)src + srcPos)% 16) > 12) { - src_v1 = vec_ld(srcPos + 16, src); - } - src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); - - src_v = // vec_unpackh sign-extends... - (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); - // now put our elements in the even slots - src_v = vec_mergeh(src_v, (vector signed short)vzero); - - filter_v = vec_ld(i << 3, filter); + for (i=0; i<dstW; i++) { + register int srcPos = filterPos[i]; + + vector unsigned char src_v0 = vec_ld(srcPos, src); + vector unsigned char src_v1, src_vF; + vector signed short src_v, filter_v; + vector signed int val_vEven, val_s; + if ((((int)src + srcPos)% 16) > 12) { + src_v1 = vec_ld(srcPos + 16, src); + } + src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); + + src_v = // vec_unpackh sign-extends... 
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); + // now put our elements in the even slots + src_v = vec_mergeh(src_v, (vector signed short)vzero); + + filter_v = vec_ld(i << 3, filter); // the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2) // the neat trick : we only care for half the elements, // high or low depending on (i<<3)%16 (it's 0 or 8 here), // and we're going to use vec_mule, so we chose // carefully how to "unpack" the elements into the even slots - if ((i << 3) % 16) - filter_v = vec_mergel(filter_v,(vector signed short)vzero); - else - filter_v = vec_mergeh(filter_v,(vector signed short)vzero); - - val_vEven = vec_mule(src_v, filter_v); - val_s = vec_sums(val_vEven, vzero); - vec_st(val_s, 0, tempo); - dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1); - } + if ((i << 3) % 16) + filter_v = vec_mergel(filter_v,(vector signed short)vzero); + else + filter_v = vec_mergeh(filter_v,(vector signed short)vzero); + + val_vEven = vec_mule(src_v, filter_v); + val_s = vec_sums(val_vEven, vzero); + vec_st(val_s, 0, tempo); + dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1); + } } break; - case 8: + case 8: { - for(i=0; i<dstW; i++) { - register int srcPos = filterPos[i]; - - vector unsigned char src_v0 = vec_ld(srcPos, src); - vector unsigned char src_v1, src_vF; - vector signed short src_v, filter_v; - vector signed int val_v, val_s; - if ((((int)src + srcPos)% 16) > 8) { - src_v1 = vec_ld(srcPos + 16, src); - } - src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); - - src_v = // vec_unpackh sign-extends... - (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); - filter_v = vec_ld(i << 4, filter); + for (i=0; i<dstW; i++) { + register int srcPos = filterPos[i]; + + vector unsigned char src_v0 = vec_ld(srcPos, src); + vector unsigned char src_v1, src_vF; + vector signed short src_v, filter_v; + vector signed int val_v, val_s; + if ((((int)src + srcPos)% 16) > 8) { + src_v1 = vec_ld(srcPos + 16, src); + } + src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); + + src_v = // vec_unpackh sign-extends... + (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); + filter_v = vec_ld(i << 4, filter); // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2) - val_v = vec_msums(src_v, filter_v, (vector signed int)vzero); - val_s = vec_sums(val_v, vzero); - vec_st(val_s, 0, tempo); - dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1); - } + val_v = vec_msums(src_v, filter_v, (vector signed int)vzero); + val_s = vec_sums(val_v, vzero); + vec_st(val_s, 0, tempo); + dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1); + } } break; - case 16: + case 16: { - for(i=0; i<dstW; i++) { - register int srcPos = filterPos[i]; + for (i=0; i<dstW; i++) { + register int srcPos = filterPos[i]; - vector unsigned char src_v0 = vec_ld(srcPos, src); - vector unsigned char src_v1 = vec_ld(srcPos + 16, src); - vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); + vector unsigned char src_v0 = vec_ld(srcPos, src); + vector unsigned char src_v1 = vec_ld(srcPos + 16, src); + vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); - vector signed short src_vA = // vec_unpackh sign-extends... - (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); - vector signed short src_vB = // vec_unpackh sign-extends... - (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); + vector signed short src_vA = // vec_unpackh sign-extends... 
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); + vector signed short src_vB = // vec_unpackh sign-extends... + (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); - vector signed short filter_v0 = vec_ld(i << 5, filter); - vector signed short filter_v1 = vec_ld((i << 5) + 16, filter); - // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2) + vector signed short filter_v0 = vec_ld(i << 5, filter); + vector signed short filter_v1 = vec_ld((i << 5) + 16, filter); + // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2) - vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero); - vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc); + vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero); + vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc); - vector signed int val_s = vec_sums(val_v, vzero); + vector signed int val_s = vec_sums(val_v, vzero); - vec_st(val_s, 0, tempo); - dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1); - } + vec_st(val_s, 0, tempo); + dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1); + } } break; - default: + default: { - for(i=0; i<dstW; i++) { - register int j; - register int srcPos = filterPos[i]; + for (i=0; i<dstW; i++) { + register int j; + register int srcPos = filterPos[i]; vector signed int val_s, val_v = (vector signed int)vzero; - vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter); + vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter); vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter); vector unsigned char src_v0 = vec_ld(srcPos, src); vector unsigned char permS = vec_lvsl(srcPos, src); for (j = 0 ; j < filterSize - 15; j += 16) { - vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src); - vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS); + vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src); + vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS); - vector signed short src_vA = // vec_unpackh sign-extends... - (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); - vector signed short src_vB = // vec_unpackh sign-extends... - (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); + vector signed short src_vA = // vec_unpackh sign-extends... + (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); + vector signed short src_vB = // vec_unpackh sign-extends... 
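Editorial note: in the case-16 and default branches, each vec_msums call folds eight byte-by-coefficient products (plus the running accumulator) into four partial sums, and the final vec_sums reduces those partials to the single value stored in tempo[3]. A rough scalar picture of one such step, viewed after that final reduction (hypothetical helper, for illustration only):

#include <stdint.h>

/* Net contribution of one 8-tap vec_msums step once the later vec_sums
 * reduction has folded the four partial sums together. */
static int msum8_ref(const uint8_t *src, const int16_t *coeff, int acc)
{
    int k;
    for (k = 0; k < 8; k++)
        acc += src[k] * coeff[k];
    return acc;
}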
+ (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); - vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); - vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter); - vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF); - vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF); + vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); + vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter); + vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF); + vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF); - vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v); - val_v = vec_msums(src_vB, filter_v1, val_acc); + vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v); + val_v = vec_msums(src_vB, filter_v1, val_acc); - filter_v0R = filter_v2R; - src_v0 = src_v1; + filter_v0R = filter_v2R; + src_v0 = src_v1; } if (j < (filterSize-7)) { - // loading src_v0 is useless, it's already done above - //vector unsigned char src_v0 = vec_ld(srcPos + j, src); - vector unsigned char src_v1, src_vF; - vector signed short src_v, filter_v1R, filter_v; - if ((((int)src + srcPos)% 16) > 8) { - src_v1 = vec_ld(srcPos + j + 16, src); - } - src_vF = vec_perm(src_v0, src_v1, permS); - - src_v = // vec_unpackh sign-extends... - (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); - // loading filter_v0R is useless, it's already done above - //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter); - filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); - filter_v = vec_perm(filter_v0R, filter_v1R, permF); - - val_v = vec_msums(src_v, filter_v, val_v); + // loading src_v0 is useless, it's already done above + //vector unsigned char src_v0 = vec_ld(srcPos + j, src); + vector unsigned char src_v1, src_vF; + vector signed short src_v, filter_v1R, filter_v; + if ((((int)src + srcPos)% 16) > 8) { + src_v1 = vec_ld(srcPos + j + 16, src); + } + src_vF = vec_perm(src_v0, src_v1, permS); + + src_v = // vec_unpackh sign-extends... 
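Editorial note: the leftover check "if (j < (filterSize-7))" after the 16-tap loop handles one final block of eight taps, reusing the src_v0 and filter_v0R vectors the loop left behind (hence the "loading ... is useless" comments). A residue of fewer than eight taps is not processed here, presumably because callers round filterSize so that no four-tap remainder reaches this path. A scalar mirror of the tail, for illustration only:

#include <stdint.h>

/* Hypothetical scalar mirror of the 8-tap tail: after processing taps in
 * blocks of 16, add one remaining block of 8 (if any) to the accumulator. */
static int hscale_tail8_ref(const uint8_t *src, const int16_t *coeff,
                            int filterSize, int j, int acc)
{
    if (j < filterSize - 7) {          /* at least 8 taps left */
        int k;
        for (k = 0; k < 8; k++)
            acc += src[j + k] * coeff[j + k];
    }
    return acc;
}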
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); + // loading filter_v0R is useless, it's already done above + //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter); + filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); + filter_v = vec_perm(filter_v0R, filter_v1R, permF); + + val_v = vec_msums(src_v, filter_v, val_v); } val_s = vec_sums(val_v, vzero); vec_st(val_s, 0, tempo); dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1); - } + } } - } + } } static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) { - uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY; - // yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] ); - uint8_t *ysrc = src[0]; - uint8_t *usrc = src[1]; - uint8_t *vsrc = src[2]; - const int width = c->srcW; - const int height = srcSliceH; - const int lumStride = srcStride[0]; - const int chromStride = srcStride[1]; - const int dstStride = dstStride_a[0]; - const vector unsigned char yperm = vec_lvsl(0, ysrc); - const int vertLumPerChroma = 2; - register unsigned int y; - - if(width&15){ - yv12toyuy2( ysrc, usrc, vsrc, dst,c->srcW,srcSliceH, lumStride, chromStride, dstStride); - return srcSliceH; - } - - /* this code assume: - - 1) dst is 16 bytes-aligned - 2) dstStride is a multiple of 16 - 3) width is a multiple of 16 - 4) lum&chrom stride are multiple of 8 - */ + int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) { + uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY; + // yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] ); + uint8_t *ysrc = src[0]; + uint8_t *usrc = src[1]; + uint8_t *vsrc = src[2]; + const int width = c->srcW; + const int height = srcSliceH; + const int lumStride = srcStride[0]; + const int chromStride = srcStride[1]; + const int dstStride = dstStride_a[0]; + const vector unsigned char yperm = vec_lvsl(0, ysrc); + const int vertLumPerChroma = 2; + register unsigned int y; + + if (width&15) { + yv12toyuy2( ysrc, usrc, vsrc, dst,c->srcW,srcSliceH, lumStride, chromStride, dstStride); + return srcSliceH; + } - for(y=0; y<height; y++) - { - int i; - for (i = 0; i < width - 31; i+= 32) { - const unsigned int j = i >> 1; - vector unsigned char v_yA = vec_ld(i, ysrc); - vector unsigned char v_yB = vec_ld(i + 16, ysrc); - vector unsigned char v_yC = vec_ld(i + 32, ysrc); - vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm); - vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm); - vector unsigned char v_uA = vec_ld(j, usrc); - vector unsigned char v_uB = vec_ld(j + 16, usrc); - vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc)); - vector unsigned char v_vA = vec_ld(j, vsrc); - vector unsigned char v_vB = vec_ld(j + 16, vsrc); - vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc)); - vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); - vector unsigned char v_uv_b = vec_mergel(v_u, v_v); - vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a); - vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a); - vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b); - vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b); - vec_st(v_yuy2_0, (i << 1), dst); - vec_st(v_yuy2_1, (i << 1) + 16, dst); - vec_st(v_yuy2_2, (i << 1) + 32, dst); - vec_st(v_yuy2_3, (i << 1) + 48, dst); - } - if (i < width) { - const unsigned int j = i >> 1; - vector unsigned char v_y1 = vec_ld(i, ysrc); 
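Editorial note: the four assumptions listed in the comment (16-byte-aligned dst, dstStride and width multiples of 16, luma/chroma strides multiples of 8) are what let this loop use plain vec_ld/vec_st with no edge handling; anything else falls back to the generic yv12toyuy2 above. For reference, the byte order the mergeh/mergel sequence emits is ordinary YUY2, as in this scalar sketch (pack_yuy2_pixel_pair is an illustrative name, not a libswscale function):

#include <stdint.h>

/* Pack two adjacent luma samples and one shared U/V pair into the 4-byte
 * YUY2 (YUYV) macropixel produced by mergeh(v_y1, v_uv_a) above. */
static void pack_yuy2_pixel_pair(uint8_t *dst, uint8_t y0, uint8_t y1,
                                 uint8_t u, uint8_t v)
{
    dst[0] = y0;   /* Y0 */
    dst[1] = u;    /* U  */
    dst[2] = y1;   /* Y1 */
    dst[3] = v;    /* V  */
}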
- vector unsigned char v_u = vec_ld(j, usrc); - vector unsigned char v_v = vec_ld(j, vsrc); - vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); - vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a); - vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a); - vec_st(v_yuy2_0, (i << 1), dst); - vec_st(v_yuy2_1, (i << 1) + 16, dst); - } - if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) - { - usrc += chromStride; - vsrc += chromStride; - } - ysrc += lumStride; - dst += dstStride; + /* this code assume: + + 1) dst is 16 bytes-aligned + 2) dstStride is a multiple of 16 + 3) width is a multiple of 16 + 4) lum&chrom stride are multiple of 8 + */ + + for (y=0; y<height; y++) { + int i; + for (i = 0; i < width - 31; i+= 32) { + const unsigned int j = i >> 1; + vector unsigned char v_yA = vec_ld(i, ysrc); + vector unsigned char v_yB = vec_ld(i + 16, ysrc); + vector unsigned char v_yC = vec_ld(i + 32, ysrc); + vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm); + vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm); + vector unsigned char v_uA = vec_ld(j, usrc); + vector unsigned char v_uB = vec_ld(j + 16, usrc); + vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc)); + vector unsigned char v_vA = vec_ld(j, vsrc); + vector unsigned char v_vB = vec_ld(j + 16, vsrc); + vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc)); + vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); + vector unsigned char v_uv_b = vec_mergel(v_u, v_v); + vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a); + vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a); + vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b); + vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b); + vec_st(v_yuy2_0, (i << 1), dst); + vec_st(v_yuy2_1, (i << 1) + 16, dst); + vec_st(v_yuy2_2, (i << 1) + 32, dst); + vec_st(v_yuy2_3, (i << 1) + 48, dst); + } + if (i < width) { + const unsigned int j = i >> 1; + vector unsigned char v_y1 = vec_ld(i, ysrc); + vector unsigned char v_u = vec_ld(j, usrc); + vector unsigned char v_v = vec_ld(j, vsrc); + vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); + vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a); + vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a); + vec_st(v_yuy2_0, (i << 1), dst); + vec_st(v_yuy2_1, (i << 1) + 16, dst); + } + if ( (y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) { + usrc += chromStride; + vsrc += chromStride; + } + ysrc += lumStride; + dst += dstStride; } - return srcSliceH; + return srcSliceH; } static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) { - uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY; - // yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] ); - uint8_t *ysrc = src[0]; - uint8_t *usrc = src[1]; - uint8_t *vsrc = src[2]; - const int width = c->srcW; - const int height = srcSliceH; - const int lumStride = srcStride[0]; - const int chromStride = srcStride[1]; - const int dstStride = dstStride_a[0]; - const int vertLumPerChroma = 2; - const vector unsigned char yperm = vec_lvsl(0, ysrc); - register unsigned int y; - - if(width&15){ - yv12touyvy( ysrc, usrc, vsrc, dst,c->srcW,srcSliceH, lumStride, chromStride, dstStride); - return srcSliceH; - } - - /* this code assume: - - 1) dst is 16 bytes-aligned - 2) dstStride is a multiple of 16 - 3) width is a multiple of 16 - 4) lum&chrom stride are multiple of 8 - */ + int srcSliceH, uint8_t* 
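Editorial note: the "y & (vertLumPerChroma-1)" test encodes the 4:2:0 geometry: with vertLumPerChroma == 2, the U and V pointers advance by chromStride only after every second luma row, so each chroma row feeds two output rows. A minimal sketch of that row bookkeeping under the same assumption (illustrative helper only):

#include <stdint.h>

/* Advance planar YV12 row pointers: luma every row, chroma only every
 * vertLumPerChroma rows (2 for 4:2:0). */
static void advance_yv12_rows(const uint8_t **ysrc, const uint8_t **usrc,
                              const uint8_t **vsrc, unsigned y,
                              long lumStride, long chromStride,
                              int vertLumPerChroma)
{
    if ((y & (vertLumPerChroma - 1)) == (unsigned)(vertLumPerChroma - 1)) {
        *usrc += chromStride;
        *vsrc += chromStride;
    }
    *ysrc += lumStride;
}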
dstParam[], int dstStride_a[]) { + uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY; + // yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] ); + uint8_t *ysrc = src[0]; + uint8_t *usrc = src[1]; + uint8_t *vsrc = src[2]; + const int width = c->srcW; + const int height = srcSliceH; + const int lumStride = srcStride[0]; + const int chromStride = srcStride[1]; + const int dstStride = dstStride_a[0]; + const int vertLumPerChroma = 2; + const vector unsigned char yperm = vec_lvsl(0, ysrc); + register unsigned int y; + + if (width&15) { + yv12touyvy( ysrc, usrc, vsrc, dst,c->srcW,srcSliceH, lumStride, chromStride, dstStride); + return srcSliceH; + } - for(y=0; y<height; y++) - { - int i; - for (i = 0; i < width - 31; i+= 32) { - const unsigned int j = i >> 1; - vector unsigned char v_yA = vec_ld(i, ysrc); - vector unsigned char v_yB = vec_ld(i + 16, ysrc); - vector unsigned char v_yC = vec_ld(i + 32, ysrc); - vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm); - vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm); - vector unsigned char v_uA = vec_ld(j, usrc); - vector unsigned char v_uB = vec_ld(j + 16, usrc); - vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc)); - vector unsigned char v_vA = vec_ld(j, vsrc); - vector unsigned char v_vB = vec_ld(j + 16, vsrc); - vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc)); - vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); - vector unsigned char v_uv_b = vec_mergel(v_u, v_v); - vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1); - vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1); - vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2); - vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2); - vec_st(v_uyvy_0, (i << 1), dst); - vec_st(v_uyvy_1, (i << 1) + 16, dst); - vec_st(v_uyvy_2, (i << 1) + 32, dst); - vec_st(v_uyvy_3, (i << 1) + 48, dst); - } - if (i < width) { - const unsigned int j = i >> 1; - vector unsigned char v_y1 = vec_ld(i, ysrc); - vector unsigned char v_u = vec_ld(j, usrc); - vector unsigned char v_v = vec_ld(j, vsrc); - vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); - vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1); - vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1); - vec_st(v_uyvy_0, (i << 1), dst); - vec_st(v_uyvy_1, (i << 1) + 16, dst); - } - if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) - { - usrc += chromStride; - vsrc += chromStride; - } - ysrc += lumStride; - dst += dstStride; + /* this code assume: + + 1) dst is 16 bytes-aligned + 2) dstStride is a multiple of 16 + 3) width is a multiple of 16 + 4) lum&chrom stride are multiple of 8 + */ + + for (y=0; y<height; y++) { + int i; + for (i = 0; i < width - 31; i+= 32) { + const unsigned int j = i >> 1; + vector unsigned char v_yA = vec_ld(i, ysrc); + vector unsigned char v_yB = vec_ld(i + 16, ysrc); + vector unsigned char v_yC = vec_ld(i + 32, ysrc); + vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm); + vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm); + vector unsigned char v_uA = vec_ld(j, usrc); + vector unsigned char v_uB = vec_ld(j + 16, usrc); + vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc)); + vector unsigned char v_vA = vec_ld(j, vsrc); + vector unsigned char v_vB = vec_ld(j + 16, vsrc); + vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc)); + vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); + vector unsigned char v_uv_b = vec_mergel(v_u, v_v); + vector unsigned char v_uyvy_0 = 
vec_mergeh(v_uv_a, v_y1); + vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1); + vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2); + vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2); + vec_st(v_uyvy_0, (i << 1), dst); + vec_st(v_uyvy_1, (i << 1) + 16, dst); + vec_st(v_uyvy_2, (i << 1) + 32, dst); + vec_st(v_uyvy_3, (i << 1) + 48, dst); + } + if (i < width) { + const unsigned int j = i >> 1; + vector unsigned char v_y1 = vec_ld(i, ysrc); + vector unsigned char v_u = vec_ld(j, usrc); + vector unsigned char v_v = vec_ld(j, vsrc); + vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); + vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1); + vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1); + vec_st(v_uyvy_0, (i << 1), dst); + vec_st(v_uyvy_1, (i << 1) + 16, dst); + } + if ( (y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) { + usrc += chromStride; + vsrc += chromStride; + } + ysrc += lumStride; + dst += dstStride; } - return srcSliceH; + return srcSliceH; } |
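Editorial note: the UYVY variant differs from the YUY2 one only in the merge order: the chroma vector is passed as the first operand of vec_mergeh/vec_mergel, so the packed byte order becomes U Y0 V Y1 instead of Y0 U Y1 V. A scalar sketch of the difference (illustrative name, matching pack_yuy2_pixel_pair above):

#include <stdint.h>

/* UYVY macropixel: same samples as YUY2, chroma bytes first. */
static void pack_uyvy_pixel_pair(uint8_t *dst, uint8_t y0, uint8_t y1,
                                 uint8_t u, uint8_t v)
{
    dst[0] = u;    /* U  */
    dst[1] = y0;   /* Y0 */
    dst[2] = v;    /* V  */
    dst[3] = y1;   /* Y1 */
}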