diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2001-10-18 22:27:13 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2001-10-18 22:27:13 +0000 |
commit | 783e9cc9af091dca768a9b6103af4e528ea2406e (patch) | |
tree | 44f238736e5b306fbc82aa269db79979d5dfa4fd | |
parent | acced5532e85f718be9d06494928c454aaab2621 (diff) | |
download | ffmpeg-783e9cc9af091dca768a9b6103af4e528ea2406e.tar.gz |
increased precision of s_xinc and s_xinc2 (needed for the mmx2 bugfix)
moved mmx variables to top to avoid alignment issues
mmx2 code should work fine now if and only if the input width is a multiple of 16 (width%16==0) and the output width is a multiple of 32 (width%32==0)
reordered some code (5% faster with a simple -benchmark)
first line bug fixed (I hope I didn't introduce any new bugs with that ...)
changed a lot of the vertical scale setup code, I hope I fixed something and didn't mess it up :)
a few known bugs left (rightmost line is wrong)
MMX2 code will only be used for upscaling & acceptable widths
16bit dithering can be disabled
Originally committed as revision 2265 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
-rw-r--r-- | postproc/swscale.c | 477 | ||||
-rw-r--r-- | postproc/swscale_template.c | 477 |
2 files changed, 542 insertions, 412 deletions
diff --git a/postproc/swscale.c b/postproc/swscale.c index b02fe41152..049099abcb 100644 --- a/postproc/swscale.c +++ b/postproc/swscale.c @@ -1,28 +1,29 @@ // Software scaling and colorspace conversion routines for MPlayer +// Orginal C implementation by ? +// current version mostly by Michael Niedermayer (michaelni@gmx.at) + #include <inttypes.h> #include "../config.h" -#undef HAVE_MMX2 //code is buggy +//#undef HAVE_MMX2 //#undef HAVE_MMX +//#undef ARCH_X86 +#define DITHER16BPP +#define ALT_ERROR #define RET 0xC3 //near return opcode +/* +NOTES -// temporary storage for 4 yuv lines: -// 16bit for now (mmx likes it more compact) -static uint16_t pix_buf_y[4][2048]; -static uint16_t pix_buf_uv[2][2048*2]; +known BUGS with known cause (no bugreports please!) +line at the right (c,asm and mmx2) +code reads 1 sample too much (might cause a sig11) -// clipping helper table for C implementations: -static unsigned char clip_table[768]; - -// yuv->rgb conversion tables: -static int yuvtab_2568[256]; -static int yuvtab_3343[256]; -static int yuvtab_0c92[256]; -static int yuvtab_1a1e[256]; -static int yuvtab_40cf[256]; +TODO +check alignment off everything +*/ static uint64_t yCoeff= 0x2568256825682568LL; static uint64_t ubCoeff= 0x3343334333433343LL; @@ -46,11 +47,27 @@ static uint64_t g16Mask= 0x07E007E007E007E0LL; static uint64_t r16Mask= 0xF800F800F800F800LL; static uint64_t temp0; + +// temporary storage for 4 yuv lines: +// 16bit for now (mmx likes it more compact) +static uint16_t pix_buf_y[4][2048]; +static uint16_t pix_buf_uv[2][2048*2]; + +// clipping helper table for C implementations: +static unsigned char clip_table[768]; + +// yuv->rgb conversion tables: +static int yuvtab_2568[256]; +static int yuvtab_3343[256]; +static int yuvtab_0c92[256]; +static int yuvtab_1a1e[256]; +static int yuvtab_40cf[256]; + + static uint8_t funnyYCode[10000]; static uint8_t funnyUVCode[10000]; - // *** bilinear scaling and yuv->rgb conversion of yv12 slices: // *** Note: it's 
called multiple times while decoding a frame, first time y==0 // *** Designed to upscale, but may work for downscale too. @@ -64,27 +81,43 @@ void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h, //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; -unsigned int s_xinc2=s_xinc>>1; +unsigned int s_xinc2; -static int s_srcypos; +static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) static int s_ypos; + +// last horzontally interpolated lines, used to avoid unnecessary calculations static int s_last_ypos; +static int s_last_y1pos; + static int static_dstw; #ifdef HAVE_MMX2 +// used to detect a horizontal size change static int old_dstw= -1; static int old_s_xinc= -1; + +// difference between the requested xinc and the required one for the mmx2 routine +static int s_xinc_diff=0; +static int s_xinc2_diff=0; #endif +int canMMX2BeUsed; -s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each other +// we need that precission at least for the mmx2 code +s_xinc*= 256; +s_xinc2=s_xinc>>1; +canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0; if(y==0){ - s_srcypos=-2*s_yinc; - s_ypos=-2; - s_last_ypos=-2; + s_srcypos= s_yinc/2 - 0x8000; + s_ypos=0; + + // force calculation of the horizontal interpolation of the first line + s_last_ypos=-99; + s_last_y1pos=-99; #ifdef HAVE_MMX2 // cant downscale !!! 
- if(old_s_xinc != s_xinc || old_dstw!=dstw) + if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed) { uint8_t *fragment; int imm8OfPShufW1; @@ -102,32 +135,30 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe //code fragment -// fragmentLength=0; -// printf("%d, %d\n", fragmentLength,imm8OfPShufW1); - asm volatile( "jmp 9f \n\t" // Begin "0: \n\t" - "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment + "movq (%%esi), %%mm0 \n\t" //FIXME Alignment "movq %%mm0, %%mm1 \n\t" "psrlq $8, %%mm0 \n\t" "punpcklbw %%mm7, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" + "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF "pshufw $0xFF, %%mm1, %%mm1 \n\t" "1: \n\t" + "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry "pshufw $0xFF, %%mm0, %%mm0 \n\t" "2: \n\t" + "psrlw $9, %%mm3 \n\t" "psubw %%mm1, %%mm0 \n\t" - "psraw $1, %%mm0 \n\t" - "pmullw %%mm2, %%mm0 \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF "psllw $7, %%mm1 \n\t" "paddw %%mm1, %%mm0 \n\t" - "movq %%mm0, (%%edi, %%eax) \n\t" - "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF - "addb %%ch, %%cl \n\t" //2*xalpha += (4*s_xinc)&0xFF - "adcl %%edx, %%ebx \n\t" //xx+= (4*s_xinc)>>8 + carry + "movq %%mm0, (%%edi, %%eax) \n\t" "addl $8, %%eax \n\t" // End @@ -147,17 +178,28 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe ); xpos= xx=xalpha= 0; - //FIXME choose size and or xinc so that they fit exactly + + /* choose xinc so that all 8 parts fit exactly + Note: we cannot use just 1 part because it would not fit in the code cache */ + s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10; +// s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8)); +#ifdef ALT_ERROR + s_xinc2_diff+= ((0x10000/(dstw/8))); +#endif + s_xinc_diff= s_xinc2_diff*2; + + s_xinc2+= s_xinc2_diff; + s_xinc+= s_xinc_diff; for(i=0; i<dstw/8; i++) { - int xx=xpos>>8; + int xx=xpos>>16; if((i&3) == 0) { 
int a=0; - int b=((xpos+s_xinc)>>8) - xx; - int c=((xpos+s_xinc*2)>>8) - xx; - int d=((xpos+s_xinc*3)>>8) - xx; + int b=((xpos+s_xinc)>>16) - xx; + int c=((xpos+s_xinc*2)>>16) - xx; + int d=((xpos+s_xinc*3)>>16) - xx; memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); @@ -174,14 +216,14 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe //FIXME choose size and or xinc so that they fit exactly for(i=0; i<dstw/8; i++) { - int xx=xpos>>8; + int xx=xpos>>16; if((i&3) == 0) { int a=0; - int b=((xpos+s_xinc2)>>8) - xx; - int c=((xpos+s_xinc2*2)>>8) - xx; - int d=((xpos+s_xinc2*3)>>8) - xx; + int b=((xpos+s_xinc2)>>16) - xx; + int c=((xpos+s_xinc2*2)>>16) - xx; + int d=((xpos+s_xinc2*3)>>16) - xx; memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); @@ -197,86 +239,117 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe } -#endif + + if(canMMX2BeUsed) + { + s_xinc+= s_xinc_diff; + s_xinc2+= s_xinc2_diff; + } +#endif // HAVE_MMX2 } // reset counters while(1){ unsigned char *dest=dstptr+dststride*s_ypos; - int y0=2+(s_srcypos>>16); - int y1=1+(s_srcypos>>17); + int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line + // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) + int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; + int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line int yalpha=(s_srcypos&0xFFFF)>>7; int yalpha1=yalpha^511; - int uvalpha=((s_srcypos>>1)&0xFFFF)>>7; + int uvalpha=(srcuvpos&0x1FFFF)>>8; int uvalpha1=uvalpha^511; - uint16_t *buf0=pix_buf_y[y0&3]; - uint16_t *buf1=pix_buf_y[((y0+1)&3)]; - uint16_t *uvbuf0=pix_buf_uv[y1&1]; - uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1]; + uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice + uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice + uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top 
line of the interpolated slice + uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice int i; - if(y0>=y+h) break; + // if this is before the first line than use only the first src line + if(y0==0) buf0= buf1; + if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, its not the same as y0==0 + + if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are dupliactes anyway + + // if this is after the last line than use only the last src line + if(y0>=y+h) + { + buf1= buf0; + s_last_ypos=y0; + } + if(y1>=(y+h)/2) + { + uvbuf1= uvbuf0; + s_last_y1pos=y1; + } + s_ypos++; s_srcypos+=s_yinc; + //only interpolate the src line horizontally if we didnt do it allready if(s_last_ypos!=y0){ unsigned char *src=srcptr[0]+(y0-y)*stride[0]; unsigned int xpos=0; s_last_ypos=y0; // *** horizontal scale Y line to temp buffer - // this loop should be rewritten in MMX assembly!!!! -#ifdef HAVE_MMX2 - asm volatile( - "pxor %%mm7, %%mm7 \n\t" - "pxor %%mm2, %%mm2 \n\t" // 2*xalpha - "movd %5, %%mm6 \n\t" // s_xinc&0xFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "movq %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF - "movq %%mm2, temp0 \n\t" - "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "xorl %%eax, %%eax \n\t" // i - "xorl %%ebx, %%ebx \n\t" // xx - "movl %0, %%esi \n\t" // src - "movl %1, %%edi \n\t" // buf1 - "movl %3, %%edx \n\t" // (s_xinc*4)>>8 - "xorl %%ecx, %%ecx \n\t" - "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF -// "int $3\n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - 
"call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8), - "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF) - : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" - ); +#ifdef ARCH_X86 -#elif defined (ARCH_X86) +#ifdef HAVE_MMX2 + if(canMMX2BeUsed) + { + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "pxor %%mm2, %%mm2 \n\t" // 2*xalpha + "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF + "punpcklwd %%mm6, %%mm6 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" + "movq %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" + "paddw %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" + "paddw %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF + "movq %%mm2, temp0 \n\t" + "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF + "punpcklwd %%mm6, %%mm6 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" + "xorl %%eax, %%eax \n\t" // i + "movl %0, %%esi \n\t" // src + "movl %1, %%edi \n\t" // buf1 + "movl %3, %%edx \n\t" // (s_xinc*4)>>16 + "xorl %%ecx, %%ecx \n\t" + "xorl %%ebx, %%ebx \n\t" + "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF + // "int $3\n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), + "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) + : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" 
+ ); + } + else + { +#endif //NO MMX just normal asm ... FIXME try/write funny MMX2 variant //FIXME add prefetch asm volatile( @@ -288,24 +361,24 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, (%%edi, %%eax, 2) \n\t" - "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF + "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, 2(%%edi, %%eax, 2) \n\t" - "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF + "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry @@ -314,106 +387,96 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe " jb 1b \n\t" - :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF) + :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF) : "%eax", "%ebx", "%ecx", "%edi", "%esi" ); +#ifdef HAVE_MMX2 + } //if MMX2 cant be used +#endif #else for(i=0;i<dstw;i++){ - register unsigned int xx=xpos>>8; - register unsigned int xalpha=(xpos&0xFF)>>1; + register unsigned int xx=xpos>>16; + register unsigned int xalpha=(xpos&0xFFFF)>>9; buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); xpos+=s_xinc; } #endif + } // *** horizontal scale U and V lines to 
temp buffer - if(!(y0&1)){ + if(s_last_y1pos!=y1){ unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; - xpos=0; - // this loop should be rewritten in MMX assembly!!!! + int xpos=0; + s_last_y1pos= y1; +#ifdef ARCH_X86 #ifdef HAVE_MMX2 - asm volatile( + if(canMMX2BeUsed) + { + asm volatile( "pxor %%mm7, %%mm7 \n\t" "pxor %%mm2, %%mm2 \n\t" // 2*xalpha - "movd %5, %%mm6 \n\t" // s_xinc&0xFF + "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF "punpcklwd %%mm6, %%mm6 \n\t" "punpcklwd %%mm6, %%mm6 \n\t" "movq %%mm6, %%mm2 \n\t" "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" + "paddw %%mm6, %%mm2 \n\t" "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF + "paddw %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFFFF "movq %%mm2, temp0 \n\t" - "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF + "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF "punpcklwd %%mm6, %%mm6 \n\t" "punpcklwd %%mm6, %%mm6 \n\t" "xorl %%eax, %%eax \n\t" // i - "xorl %%ebx, %%ebx \n\t" // xx "movl %0, %%esi \n\t" // src "movl %1, %%edi \n\t" // buf1 - "movl %3, %%edx \n\t" // (s_xinc*4)>>8 + "movl %3, %%edx \n\t" // (s_xinc*4)>>16 "xorl %%ecx, %%ecx \n\t" - "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF + "xorl %%ebx, %%ebx \n\t" + "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF + // "int $3\n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" +#define FUNNYUVCODE \ + "call funnyUVCode \n\t"\ + "movq 
temp0, %%mm2 \n\t"\ + "xorl %%ecx, %%ecx \n\t" + +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + + "xorl %%eax, %%eax \n\t" // i - "xorl %%ebx, %%ebx \n\t" // xx "movl %6, %%esi \n\t" // src "movl %1, %%edi \n\t" // buf1 "addl $4096, %%edi \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - - :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8), - "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2) +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + + :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), + "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" ); - -#elif defined (ARCH_X86) + } + else + { +#endif asm volatile( "xorl %%eax, %%eax \n\t" // i "xorl %%ebx, %%ebx \n\t" // xx @@ -424,46 +487,48 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, (%%edi, %%eax, 2) \n\t" "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, 
%%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, 4096(%%edi, %%eax, 2)\n\t" - "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF + "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry "addl $1, %%eax \n\t" "cmpl %2, %%eax \n\t" " jb 1b \n\t" - :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF), + :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), "r" (src2) : "%eax", "%ebx", "%ecx", "%edi", "%esi" ); +#ifdef HAVE_MMX2 + } //if MMX2 cant be used +#endif #else - for(i=0;i<dstw;i++){ - register unsigned int xx=xpos>>8; - register unsigned int xalpha=(xpos&0xFF)>>1; + for(i=0;i<dstw;i++){ + register unsigned int xx=xpos>>16; + register unsigned int xalpha=(xpos&0xFFFF)>>9; uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); xpos+=s_xinc2; - } -#endif } - if(!y0) continue; +#endif } + // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... // Re: Note1: ok n*4 for now // Note2: instead of using lookup tabs, mmx version could do the multiply... 
@@ -489,47 +554,47 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe "1: \n\t"\ "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ - "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ - "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ - "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ - "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ - "psubw w10, %%mm1 \n\t" /* Y-16*/\ - "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ - "pmulhw yCoeff, %%mm1 \n\t"\ -\ "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ + "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ + "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ + "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ + "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ + "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ + "psubw w10, %%mm1 \n\t" /* Y-16*/\ "psubw w80, %%mm3 \n\t" /* (U-128)*/\ + "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ + "pmulhw yCoeff, %%mm1 \n\t"\ +\ \ - "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ - "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ + "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ + "pmulhw ubCoeff, %%mm3 \n\t"\ "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - 
uvbuf1[eax+2048] >>7*/\ + "pmulhw ugCoeff, %%mm2 \n\t"\ "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ "psubw w80, %%mm0 \n\t" /* (V-128)*/\ "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ \ - "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ - "pmulhw ubCoeff, %%mm3 \n\t"\ - "paddw %%mm1, %%mm3 \n\t" /* B*/\ \ "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ "pmulhw vrCoeff, %%mm0 \n\t"\ + "pmulhw vgCoeff, %%mm4 \n\t"\ + "paddw %%mm1, %%mm3 \n\t" /* B*/\ "paddw %%mm1, %%mm0 \n\t" /* R*/\ + "packuswb %%mm3, %%mm3 \n\t"\ \ - "pmulhw ugCoeff, %%mm2 \n\t"\ - "pmulhw vgCoeff, %%mm4 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ "paddw %%mm4, %%mm2 \n\t"\ "paddw %%mm2, %%mm1 \n\t" /* G*/\ \ - "packuswb %%mm3, %%mm3 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ "packuswb %%mm1, %%mm1 \n\t" YSCALEYUV2RGB @@ -610,9 +675,11 @@ YSCALEYUV2RGB asm volatile( YSCALEYUV2RGB +#ifdef DITHER16BPP "paddusb g16Dither, %%mm1 \n\t" "paddusb b16Dither, %%mm0 \n\t" "paddusb b16Dither, %%mm3 \n\t" +#endif "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R @@ -699,8 +766,6 @@ YSCALEYUV2RGB #elif defined (HAVE_MMX) asm volatile("emms"); #endif - - } diff --git a/postproc/swscale_template.c b/postproc/swscale_template.c index b02fe41152..049099abcb 100644 --- a/postproc/swscale_template.c +++ b/postproc/swscale_template.c @@ -1,28 +1,29 @@ // Software scaling and colorspace conversion routines for MPlayer +// Orginal C implementation by ? 
+// current version mostly by Michael Niedermayer (michaelni@gmx.at) + #include <inttypes.h> #include "../config.h" -#undef HAVE_MMX2 //code is buggy +//#undef HAVE_MMX2 //#undef HAVE_MMX +//#undef ARCH_X86 +#define DITHER16BPP +#define ALT_ERROR #define RET 0xC3 //near return opcode +/* +NOTES -// temporary storage for 4 yuv lines: -// 16bit for now (mmx likes it more compact) -static uint16_t pix_buf_y[4][2048]; -static uint16_t pix_buf_uv[2][2048*2]; +known BUGS with known cause (no bugreports please!) +line at the right (c,asm and mmx2) +code reads 1 sample too much (might cause a sig11) -// clipping helper table for C implementations: -static unsigned char clip_table[768]; - -// yuv->rgb conversion tables: -static int yuvtab_2568[256]; -static int yuvtab_3343[256]; -static int yuvtab_0c92[256]; -static int yuvtab_1a1e[256]; -static int yuvtab_40cf[256]; +TODO +check alignment off everything +*/ static uint64_t yCoeff= 0x2568256825682568LL; static uint64_t ubCoeff= 0x3343334333433343LL; @@ -46,11 +47,27 @@ static uint64_t g16Mask= 0x07E007E007E007E0LL; static uint64_t r16Mask= 0xF800F800F800F800LL; static uint64_t temp0; + +// temporary storage for 4 yuv lines: +// 16bit for now (mmx likes it more compact) +static uint16_t pix_buf_y[4][2048]; +static uint16_t pix_buf_uv[2][2048*2]; + +// clipping helper table for C implementations: +static unsigned char clip_table[768]; + +// yuv->rgb conversion tables: +static int yuvtab_2568[256]; +static int yuvtab_3343[256]; +static int yuvtab_0c92[256]; +static int yuvtab_1a1e[256]; +static int yuvtab_40cf[256]; + + static uint8_t funnyYCode[10000]; static uint8_t funnyUVCode[10000]; - // *** bilinear scaling and yuv->rgb conversion of yv12 slices: // *** Note: it's called multiple times while decoding a frame, first time y==0 // *** Designed to upscale, but may work for downscale too. 
@@ -64,27 +81,43 @@ void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h, //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; -unsigned int s_xinc2=s_xinc>>1; +unsigned int s_xinc2; -static int s_srcypos; +static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) static int s_ypos; + +// last horzontally interpolated lines, used to avoid unnecessary calculations static int s_last_ypos; +static int s_last_y1pos; + static int static_dstw; #ifdef HAVE_MMX2 +// used to detect a horizontal size change static int old_dstw= -1; static int old_s_xinc= -1; + +// difference between the requested xinc and the required one for the mmx2 routine +static int s_xinc_diff=0; +static int s_xinc2_diff=0; #endif +int canMMX2BeUsed; -s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each other +// we need that precission at least for the mmx2 code +s_xinc*= 256; +s_xinc2=s_xinc>>1; +canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0; if(y==0){ - s_srcypos=-2*s_yinc; - s_ypos=-2; - s_last_ypos=-2; + s_srcypos= s_yinc/2 - 0x8000; + s_ypos=0; + + // force calculation of the horizontal interpolation of the first line + s_last_ypos=-99; + s_last_y1pos=-99; #ifdef HAVE_MMX2 // cant downscale !!! 
- if(old_s_xinc != s_xinc || old_dstw!=dstw) + if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed) { uint8_t *fragment; int imm8OfPShufW1; @@ -102,32 +135,30 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe //code fragment -// fragmentLength=0; -// printf("%d, %d\n", fragmentLength,imm8OfPShufW1); - asm volatile( "jmp 9f \n\t" // Begin "0: \n\t" - "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment + "movq (%%esi), %%mm0 \n\t" //FIXME Alignment "movq %%mm0, %%mm1 \n\t" "psrlq $8, %%mm0 \n\t" "punpcklbw %%mm7, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" + "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF "pshufw $0xFF, %%mm1, %%mm1 \n\t" "1: \n\t" + "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry "pshufw $0xFF, %%mm0, %%mm0 \n\t" "2: \n\t" + "psrlw $9, %%mm3 \n\t" "psubw %%mm1, %%mm0 \n\t" - "psraw $1, %%mm0 \n\t" - "pmullw %%mm2, %%mm0 \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF "psllw $7, %%mm1 \n\t" "paddw %%mm1, %%mm0 \n\t" - "movq %%mm0, (%%edi, %%eax) \n\t" - "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF - "addb %%ch, %%cl \n\t" //2*xalpha += (4*s_xinc)&0xFF - "adcl %%edx, %%ebx \n\t" //xx+= (4*s_xinc)>>8 + carry + "movq %%mm0, (%%edi, %%eax) \n\t" "addl $8, %%eax \n\t" // End @@ -147,17 +178,28 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe ); xpos= xx=xalpha= 0; - //FIXME choose size and or xinc so that they fit exactly + + /* choose xinc so that all 8 parts fit exactly + Note: we cannot use just 1 part because it would not fit in the code cache */ + s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10; +// s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8)); +#ifdef ALT_ERROR + s_xinc2_diff+= ((0x10000/(dstw/8))); +#endif + s_xinc_diff= s_xinc2_diff*2; + + s_xinc2+= s_xinc2_diff; + s_xinc+= s_xinc_diff; for(i=0; i<dstw/8; i++) { - int xx=xpos>>8; + int xx=xpos>>16; if((i&3) == 0) { 
int a=0; - int b=((xpos+s_xinc)>>8) - xx; - int c=((xpos+s_xinc*2)>>8) - xx; - int d=((xpos+s_xinc*3)>>8) - xx; + int b=((xpos+s_xinc)>>16) - xx; + int c=((xpos+s_xinc*2)>>16) - xx; + int d=((xpos+s_xinc*3)>>16) - xx; memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); @@ -174,14 +216,14 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe //FIXME choose size and or xinc so that they fit exactly for(i=0; i<dstw/8; i++) { - int xx=xpos>>8; + int xx=xpos>>16; if((i&3) == 0) { int a=0; - int b=((xpos+s_xinc2)>>8) - xx; - int c=((xpos+s_xinc2*2)>>8) - xx; - int d=((xpos+s_xinc2*3)>>8) - xx; + int b=((xpos+s_xinc2)>>16) - xx; + int c=((xpos+s_xinc2*2)>>16) - xx; + int d=((xpos+s_xinc2*3)>>16) - xx; memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); @@ -197,86 +239,117 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe } -#endif + + if(canMMX2BeUsed) + { + s_xinc+= s_xinc_diff; + s_xinc2+= s_xinc2_diff; + } +#endif // HAVE_MMX2 } // reset counters while(1){ unsigned char *dest=dstptr+dststride*s_ypos; - int y0=2+(s_srcypos>>16); - int y1=1+(s_srcypos>>17); + int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line + // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) + int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; + int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line int yalpha=(s_srcypos&0xFFFF)>>7; int yalpha1=yalpha^511; - int uvalpha=((s_srcypos>>1)&0xFFFF)>>7; + int uvalpha=(srcuvpos&0x1FFFF)>>8; int uvalpha1=uvalpha^511; - uint16_t *buf0=pix_buf_y[y0&3]; - uint16_t *buf1=pix_buf_y[((y0+1)&3)]; - uint16_t *uvbuf0=pix_buf_uv[y1&1]; - uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1]; + uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice + uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice + uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top 
line of the interpolated slice + uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice int i; - if(y0>=y+h) break; + // if this is before the first line than use only the first src line + if(y0==0) buf0= buf1; + if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, its not the same as y0==0 + + if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are dupliactes anyway + + // if this is after the last line than use only the last src line + if(y0>=y+h) + { + buf1= buf0; + s_last_ypos=y0; + } + if(y1>=(y+h)/2) + { + uvbuf1= uvbuf0; + s_last_y1pos=y1; + } + s_ypos++; s_srcypos+=s_yinc; + //only interpolate the src line horizontally if we didnt do it allready if(s_last_ypos!=y0){ unsigned char *src=srcptr[0]+(y0-y)*stride[0]; unsigned int xpos=0; s_last_ypos=y0; // *** horizontal scale Y line to temp buffer - // this loop should be rewritten in MMX assembly!!!! -#ifdef HAVE_MMX2 - asm volatile( - "pxor %%mm7, %%mm7 \n\t" - "pxor %%mm2, %%mm2 \n\t" // 2*xalpha - "movd %5, %%mm6 \n\t" // s_xinc&0xFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "movq %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF - "movq %%mm2, temp0 \n\t" - "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "xorl %%eax, %%eax \n\t" // i - "xorl %%ebx, %%ebx \n\t" // xx - "movl %0, %%esi \n\t" // src - "movl %1, %%edi \n\t" // buf1 - "movl %3, %%edx \n\t" // (s_xinc*4)>>8 - "xorl %%ecx, %%ecx \n\t" - "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF -// "int $3\n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - 
"call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyYCode \n\t" - :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8), - "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF) - : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" - ); +#ifdef ARCH_X86 -#elif defined (ARCH_X86) +#ifdef HAVE_MMX2 + if(canMMX2BeUsed) + { + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "pxor %%mm2, %%mm2 \n\t" // 2*xalpha + "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF + "punpcklwd %%mm6, %%mm6 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" + "movq %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" + "paddw %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" + "paddw %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF + "movq %%mm2, temp0 \n\t" + "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF + "punpcklwd %%mm6, %%mm6 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" + "xorl %%eax, %%eax \n\t" // i + "movl %0, %%esi \n\t" // src + "movl %1, %%edi \n\t" // buf1 + "movl %3, %%edx \n\t" // (s_xinc*4)>>16 + "xorl %%ecx, %%ecx \n\t" + "xorl %%ebx, %%ebx \n\t" + "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF + // "int $3\n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + "movq temp0, %%mm2 \n\t" + "xorl %%ecx, %%ecx \n\t" + "call funnyYCode \n\t" + :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), + "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) + : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" 
+ ); + } + else + { +#endif //NO MMX just normal asm ... FIXME try/write funny MMX2 variant //FIXME add prefetch asm volatile( @@ -288,24 +361,24 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, (%%edi, %%eax, 2) \n\t" - "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF + "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, 2(%%edi, %%eax, 2) \n\t" - "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF + "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry @@ -314,106 +387,96 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe " jb 1b \n\t" - :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF) + :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF) : "%eax", "%ebx", "%ecx", "%edi", "%esi" ); +#ifdef HAVE_MMX2 + } //if MMX2 cant be used +#endif #else for(i=0;i<dstw;i++){ - register unsigned int xx=xpos>>8; - register unsigned int xalpha=(xpos&0xFF)>>1; + register unsigned int xx=xpos>>16; + register unsigned int xalpha=(xpos&0xFFFF)>>9; buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); xpos+=s_xinc; } #endif + } // *** horizontal scale U and V lines to 
temp buffer - if(!(y0&1)){ + if(s_last_y1pos!=y1){ unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; - xpos=0; - // this loop should be rewritten in MMX assembly!!!! + int xpos=0; + s_last_y1pos= y1; +#ifdef ARCH_X86 #ifdef HAVE_MMX2 - asm volatile( + if(canMMX2BeUsed) + { + asm volatile( "pxor %%mm7, %%mm7 \n\t" "pxor %%mm2, %%mm2 \n\t" // 2*xalpha - "movd %5, %%mm6 \n\t" // s_xinc&0xFF + "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF "punpcklwd %%mm6, %%mm6 \n\t" "punpcklwd %%mm6, %%mm6 \n\t" "movq %%mm6, %%mm2 \n\t" "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" + "paddw %%mm6, %%mm2 \n\t" "psllq $16, %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF + "paddw %%mm6, %%mm2 \n\t" + "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFFFF "movq %%mm2, temp0 \n\t" - "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF + "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF "punpcklwd %%mm6, %%mm6 \n\t" "punpcklwd %%mm6, %%mm6 \n\t" "xorl %%eax, %%eax \n\t" // i - "xorl %%ebx, %%ebx \n\t" // xx "movl %0, %%esi \n\t" // src "movl %1, %%edi \n\t" // buf1 - "movl %3, %%edx \n\t" // (s_xinc*4)>>8 + "movl %3, %%edx \n\t" // (s_xinc*4)>>16 "xorl %%ecx, %%ecx \n\t" - "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF + "xorl %%ebx, %%ebx \n\t" + "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF + // "int $3\n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" +#define FUNNYUVCODE \ + "call funnyUVCode \n\t"\ + "movq 
temp0, %%mm2 \n\t"\ + "xorl %%ecx, %%ecx \n\t" + +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + + "xorl %%eax, %%eax \n\t" // i - "xorl %%ebx, %%ebx \n\t" // xx "movl %6, %%esi \n\t" // src "movl %1, %%edi \n\t" // buf1 "addl $4096, %%edi \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - "movq temp0, %%mm2 \n\t" - "xorb %%cl, %%cl \n\t" - "call funnyUVCode \n\t" - - :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8), - "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2) +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE +FUNNYUVCODE + + :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), + "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" ); - -#elif defined (ARCH_X86) + } + else + { +#endif asm volatile( "xorl %%eax, %%eax \n\t" // i "xorl %%ebx, %%ebx \n\t" // xx @@ -424,46 +487,48 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, (%%edi, %%eax, 2) \n\t" "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] "subl %%edi, 
%%esi \n\t" //src[xx+1] - src[xx] "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $8, %%edi \n\t" + "shll $16, %%edi \n\t" "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) "movl %1, %%edi \n\t" - "shrl $1, %%esi \n\t" + "shrl $9, %%esi \n\t" "movw %%si, 4096(%%edi, %%eax, 2)\n\t" - "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF + "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry "addl $1, %%eax \n\t" "cmpl %2, %%eax \n\t" " jb 1b \n\t" - :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF), + :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), "r" (src2) : "%eax", "%ebx", "%ecx", "%edi", "%esi" ); +#ifdef HAVE_MMX2 + } //if MMX2 cant be used +#endif #else - for(i=0;i<dstw;i++){ - register unsigned int xx=xpos>>8; - register unsigned int xalpha=(xpos&0xFF)>>1; + for(i=0;i<dstw;i++){ + register unsigned int xx=xpos>>16; + register unsigned int xalpha=(xpos&0xFFFF)>>9; uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); xpos+=s_xinc2; - } -#endif } - if(!y0) continue; +#endif } + // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... // Re: Note1: ok n*4 for now // Note2: instead of using lookup tabs, mmx version could do the multiply... 
@@ -489,47 +554,47 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe "1: \n\t"\ "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ - "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ - "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ - "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ - "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ - "psubw w10, %%mm1 \n\t" /* Y-16*/\ - "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ - "pmulhw yCoeff, %%mm1 \n\t"\ -\ "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ + "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ + "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ + "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ + "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ + "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ + "psubw w10, %%mm1 \n\t" /* Y-16*/\ "psubw w80, %%mm3 \n\t" /* (U-128)*/\ + "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ + "pmulhw yCoeff, %%mm1 \n\t"\ +\ \ - "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ - "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ + "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ + "pmulhw ubCoeff, %%mm3 \n\t"\ "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - 
uvbuf1[eax+2048] >>7*/\ + "pmulhw ugCoeff, %%mm2 \n\t"\ "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ "psubw w80, %%mm0 \n\t" /* (V-128)*/\ "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ \ - "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ - "pmulhw ubCoeff, %%mm3 \n\t"\ - "paddw %%mm1, %%mm3 \n\t" /* B*/\ \ "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ "pmulhw vrCoeff, %%mm0 \n\t"\ + "pmulhw vgCoeff, %%mm4 \n\t"\ + "paddw %%mm1, %%mm3 \n\t" /* B*/\ "paddw %%mm1, %%mm0 \n\t" /* R*/\ + "packuswb %%mm3, %%mm3 \n\t"\ \ - "pmulhw ugCoeff, %%mm2 \n\t"\ - "pmulhw vgCoeff, %%mm4 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ "paddw %%mm4, %%mm2 \n\t"\ "paddw %%mm2, %%mm1 \n\t" /* G*/\ \ - "packuswb %%mm3, %%mm3 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ "packuswb %%mm1, %%mm1 \n\t" YSCALEYUV2RGB @@ -610,9 +675,11 @@ YSCALEYUV2RGB asm volatile( YSCALEYUV2RGB +#ifdef DITHER16BPP "paddusb g16Dither, %%mm1 \n\t" "paddusb b16Dither, %%mm0 \n\t" "paddusb b16Dither, %%mm3 \n\t" +#endif "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R @@ -699,8 +766,6 @@ YSCALEYUV2RGB #elif defined (HAVE_MMX) asm volatile("emms"); #endif - - } |