diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2001-12-02 20:03:26 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2001-12-02 20:03:26 +0000 |
commit | 077ea8a7f462b132765adfdcf53b898a6034d99b (patch) | |
tree | 4288f0da243d494c60e3f6c29bf3d7d38acbd7f6 | |
parent | af91b8b3c854fdfc34bc81dfd20faa439fe667e9 (diff) | |
download | ffmpeg-077ea8a7f462b132765adfdcf53b898a6034d99b.tar.gz |
Horizontal up/downscaling with linear and cubic filters.
The -sws command line option selects the scaler: 0 -> fast_bilinear, 1 -> bilinear, 2 -> bicubic.
Originally committed as revision 3273 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
-rw-r--r-- | postproc/swscale.c | 19 | ||||
-rw-r--r-- | postproc/swscale.h | 4 | ||||
-rw-r--r-- | postproc/swscale_template.c | 377 |
3 files changed, 372 insertions, 28 deletions
diff --git a/postproc/swscale.c b/postproc/swscale.c index 580fe53c3d..0383c37fc5 100644 --- a/postproc/swscale.c +++ b/postproc/swscale.c @@ -7,6 +7,7 @@ #include <inttypes.h> #include <string.h> +#include <math.h> //#include <stdio.h> //FOR DEBUG ONLY #include "../config.h" #include "swscale.h" @@ -60,6 +61,7 @@ static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL; static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL; static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL; static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL; +static uint64_t __attribute__((aligned(8))) w02= 0x0002000200020002LL; static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL; static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL; static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL; @@ -96,9 +98,17 @@ static uint64_t __attribute__((aligned(8))) asm_uvalpha1; // 16bit for now (mmx likes it more compact) static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048]; static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2]; +static int16_t __attribute__((aligned(8))) hLumFilter[8000]; +static int16_t __attribute__((aligned(8))) hLumFilterPos[2000]; +static int16_t __attribute__((aligned(8))) hChrFilter[8000]; +static int16_t __attribute__((aligned(8))) hChrFilterPos[2000]; #else static uint16_t pix_buf_y[4][2048]; static uint16_t pix_buf_uv[2][2048*2]; +static int16_t hLumFilter[8000]; +static int16_t hLumFilterPos[2000]; +static int16_t hChrFilter[8000]; +static int16_t hChrFilterPos[2000]; #endif // clipping helper table for C implementations: @@ -118,6 +128,11 @@ static int yuvtab_0c92[256]; static int yuvtab_1a1e[256]; static int yuvtab_40cf[256]; +static int hLumFilterSize; +static int hChrFilterSize; + +int sws_flags=0; + #ifdef CAN_COMPILE_X86_ASM static uint8_t funnyYCode[10000]; static uint8_t funnyUVCode[10000]; @@ -128,9 +143,9 @@ static 
int canMMX2BeUsed=0; #ifdef CAN_COMPILE_X86_ASM void in_asm_used_var_warning_killer() { - int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+ + volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+ bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+ - M24A+M24B+M24C; + M24A+M24B+M24C+w02 + funnyYCode[0]+ funnyUVCode[0]+b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]; if(i) i=0; } #endif diff --git a/postproc/swscale.h b/postproc/swscale.h index 7e104f7dc0..2c75a9007c 100644 --- a/postproc/swscale.h +++ b/postproc/swscale.h @@ -1,4 +1,8 @@ +#define SWS_FAST_BILINEAR 0 +#define SWS_BILINEAR 1 +#define SWS_BICUBIC 2 + // *** bilinear scaling and yuv->rgb & yuv->yuv conversion of yv12 slices: // *** Note: it's called multiple times while decoding a frame, first time y==0 // *** Designed to upscale, but may work for downscale too. diff --git a/postproc/swscale_template.c b/postproc/swscale_template.c index 0094c8bdf1..20bd8f9fb0 100644 --- a/postproc/swscale_template.c +++ b/postproc/swscale_template.c @@ -5,6 +5,11 @@ // current version mostly by Michael Niedermayer (michaelni@gmx.at) // the parts written by michael are under GNU GPL +/* TODO +Move static / global vars into a struct so multiple scalers can be used +write vertical cubic upscale / linear downscale stuff +*/ + #undef MOVNTQ #undef PAVGB #undef PREFETCH @@ -1154,10 +1159,176 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *buf1, uint16_t *uv #endif } - -static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) +// Bilinear / Bicubic scaling +static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, + int16_t *filter, int16_t *filterPos, int filterSize) { +#ifdef HAVE_MMX + if(filterSize==4) // allways true for upscaling, sometimes for down too + { + int counter= -2*dstW; + filter-= counter*2; + filterPos-= 
counter/2; + dst-= counter/2; + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "movq w02, %%mm6 \n\t" + "pushl %%ebp \n\t" // we use 7 regs here ... + "movl %%eax, %%ebp \n\t" + ".balign 16 \n\t" + "1: \n\t" + "movzwl (%2, %%ebp), %%eax \n\t" + "movzwl 2(%2, %%ebp), %%ebx \n\t" + "movq (%1, %%ebp, 4), %%mm1 \n\t" + "movq 8(%1, %%ebp, 4), %%mm3 \n\t" + "movd (%3, %%eax), %%mm0 \n\t" + "movd (%3, %%ebx), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "psrad $8, %%mm0 \n\t" + "psrad $8, %%mm3 \n\t" + "packssdw %%mm3, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm0 \n\t" + "packssdw %%mm0, %%mm0 \n\t" + "movd %%mm0, (%4, %%ebp) \n\t" + "addl $4, %%ebp \n\t" + " jnc 1b \n\t" + + "popl %%ebp \n\t" + : "+a" (counter) + : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) + : "%ebx" + ); + } + else if(filterSize==8) + { + int counter= -2*dstW; + filter-= counter*4; + filterPos-= counter/2; + dst-= counter/2; + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "movq w02, %%mm6 \n\t" + "pushl %%ebp \n\t" // we use 7 regs here ... 
+ "movl %%eax, %%ebp \n\t" + ".balign 16 \n\t" + "1: \n\t" + "movzwl (%2, %%ebp), %%eax \n\t" + "movzwl 2(%2, %%ebp), %%ebx \n\t" + "movq (%1, %%ebp, 8), %%mm1 \n\t" + "movq 16(%1, %%ebp, 8), %%mm3 \n\t" + "movd (%3, %%eax), %%mm0 \n\t" + "movd (%3, %%ebx), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + + "movq 8(%1, %%ebp, 8), %%mm1 \n\t" + "movq 24(%1, %%ebp, 8), %%mm5 \n\t" + "movd 4(%3, %%eax), %%mm4 \n\t" + "movd 4(%3, %%ebx), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm4 \n\t" + "pmaddwd %%mm2, %%mm5 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "paddd %%mm5, %%mm3 \n\t" + + "psrad $8, %%mm0 \n\t" + "psrad $8, %%mm3 \n\t" + "packssdw %%mm3, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm0 \n\t" + "packssdw %%mm0, %%mm0 \n\t" + "movd %%mm0, (%4, %%ebp) \n\t" + "addl $4, %%ebp \n\t" + " jnc 1b \n\t" + + "popl %%ebp \n\t" + : "+a" (counter) + : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) + : "%ebx" + ); + } + else + { + int counter= -2*dstW; +// filter-= counter*filterSize/2; + filterPos-= counter/2; + dst-= counter/2; + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "movq w02, %%mm6 \n\t" + ".balign 16 \n\t" + "1: \n\t" + "movl %2, %%ecx \n\t" + "movzwl (%%ecx, %0), %%eax \n\t" + "movzwl 2(%%ecx, %0), %%ebx \n\t" + "movl %5, %%ecx \n\t" + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" + "2: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1, %6), %%mm3 \n\t" + "movd (%%ecx, %%eax), %%mm0 \n\t" + "movd (%%ecx, %%ebx), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "paddd %%mm3, %%mm5 \n\t" + "paddd %%mm0, %%mm4 \n\t" + "addl $8, %1 \n\t" + "addl $4, %%ecx \n\t" + "cmpl %4, %%ecx \n\t" + " jb 2b \n\t" + "addl %6, %1 \n\t" + "psrad $8, %%mm4 \n\t" + "psrad $8, %%mm5 \n\t" + "packssdw %%mm5, %%mm4 \n\t" + "pmaddwd %%mm6, %%mm4 \n\t" + "packssdw %%mm4, 
%%mm4 \n\t" + "movl %3, %%eax \n\t" + "movd %%mm4, (%%eax, %0) \n\t" + "addl $4, %0 \n\t" + " jnc 1b \n\t" + + : "+r" (counter) + : "r" (filter), "m" (filterPos), "m" (dst), "m"(src+filterSize), + "m" (src), "r" (filterSize*2) + : "%ebx", "%eax", "%ecx", "%edx" + ); + } +#else + int i; + for(i=0; i<dstW; i++) + { + int j; + int srcPos= filterPos[i]; + int val=0; +// printf("filterPos: %d\n", hFilterPos[i]); + for(j=0; j<filterSize; j++) + { +// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); + val += ((int)src[srcPos + j])*filter[filterSize*i + j]; + } +// filter += hFilterSize; + dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... +// dst[i] = val>>7; + } +#endif +} // *** horizontal scale Y line to temp buffer +static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) +{ + if(sws_flags != SWS_FAST_BILINEAR) + { + RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); + } + else // Fast Bilinear upscale / crap downscale + { #ifdef ARCH_X86 #ifdef HAVE_MMX2 int i; @@ -1267,11 +1438,19 @@ FUNNY_Y_CODE xpos+=xInc; } #endif + } } inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, int srcW, int xInc) { + if(sws_flags != SWS_FAST_BILINEAR) + { + RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); + RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); + } + else // Fast Bilinear upscale / crap downscale + { #ifdef ARCH_X86 #ifdef HAVE_MMX2 int i; @@ -1402,6 +1581,162 @@ FUNNYUVCODE xpos+=xInc; } #endif + } +} + +static void inline RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc, + int srcW, int dstW) +{ + int i; +#ifdef HAVE_MMX + asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS +#endif + + if(xInc <= (1<<16)) // upscale / cubic interpolate + { + int i; + int 
xDstInSrc; + if(sws_flags==SWS_BICUBIC) *filterSize= 4; + else *filterSize= 2; +// printf("%d %d %d\n", filterSize, srcW, dstW); +#ifdef HAVE_MMX + *filterSize= (*filterSize +3) & (~3); // -> *filterSize %4 == 0 +#endif + xDstInSrc= xInc - 0x8000; + for(i=0; i<dstW; i++) + { + int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1; + int j; + + filterPos[i]= xx; + if(sws_flags == SWS_BICUBIC) + { + double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); +// int coeff; + int y1,y2,y3,y4; + double A= -0.75; + // Equation is from VirtualDub + y1 = (int)floor(0.5 + ( + A*d - 2.0*A*d*d + A*d*d*d) * 16384.0); + y2 = (int)floor(0.5 + (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0); + y3 = (int)floor(0.5 + ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0); + y4 = (int)floor(0.5 + ( + A*d*d - A*d*d*d) * 16384.0); + +// printf("%d %d %d \n", coeff, (int)d, xDstInSrc); + filter[i*(*filterSize) + 0]= y1; + filter[i*(*filterSize) + 1]= y2; + filter[i*(*filterSize) + 2]= y3; + filter[i*(*filterSize) + 3]= y4; +// printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4); + } + else + { + for(j=0; j<*filterSize; j++) + { + double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); + int coeff; + coeff= (int)(0.5 + (1.0 - d)*(1<<14)); + if(coeff<0) coeff=0; + // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); + filter[i*(*filterSize) + j]= coeff; + xx++; + } + } + xDstInSrc+= xInc; + } + } + else // downscale + { + int xDstInSrc; + if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW); + else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW); +// printf("%d %d %d\n", *filterSize, srcW, dstW); +#ifdef HAVE_MMX + *filterSize= (*filterSize +3) & (~3); // -> *filterSize %4 == 0 +#endif + xDstInSrc= xInc - 0x8000; + for(i=0; i<dstW; i++) + { + int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5); + int j; + + filterPos[i]= xx; + for(j=0; j<*filterSize; j++) + { + double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; + int coeff; + if(sws_flags == 
SWS_BICUBIC) + { + double A= -0.75; +// d*=2; + // Equation is from VirtualDub + if(d<1.0) + coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d + + (A+2.0)*d*d*d) * (1<<14)); + else if(d<2.0) + coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d + - 5.0*A*d*d + A*d*d*d) * (1<<14)); + else + coeff=0; + } + else + { + coeff= (int)(0.5 + (1.0 - d)*(1<<14)); + if(coeff<0) coeff=0; + } +// printf("%d %d %d \n", coeff, (int)d, xDstInSrc); + filter[i*(*filterSize) + j]= coeff; + xx++; + } + xDstInSrc+= xInc; + } + } + + //fix borders + for(i=0; i<dstW; i++) + { + int j; + if(filterPos[i] < 0) + { + // Move filter coeffs left to compensate for filterPos + for(j=1; j<*filterSize; j++) + { + int left= MAX(j + filterPos[i], 0); + filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j]; + filter[i*(*filterSize) + j]=0; + } + filterPos[i]= 0; + } + + if(filterPos[i] + *filterSize > srcW) + { + int shift= filterPos[i] + *filterSize - srcW; + // Move filter coeffs right to compensate for filterPos + for(j=*filterSize-2; j>=0; j--) + { + int right= MIN(j + shift, *filterSize-1); + filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; + filter[i*(*filterSize) +j]=0; + } + filterPos[i]= srcW - *filterSize; + } + } + + //Normalize + for(i=0; i<dstW; i++) + { + int j; + double sum=0; + double scale=1<<14; + for(j=0; j<*filterSize; j++) + { + sum+= filter[i*(*filterSize) + j]; + } + scale/= sum; + for(j=0; j<*filterSize; j++) + { + filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); + } + } } static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY , @@ -1421,11 +1756,8 @@ static int dstY; static int lastLumSrcY; static int lastChrSrcY; -#ifdef HAVE_MMX2 -// used to detect a horizontal size change -static int old_dstW= -1; -static int old_s_xinc= -1; -#endif +static int oldDstW= -1; +static int oldSrcW= -1; int dstUVw; int i; @@ -1469,10 +1801,19 @@ else s_xinc2= s_xinc; = pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128*128; 
pix_buf_y[0][i]= pix_buf_y[1][i]= 0; } - + + //precalculate horizontal scaler filter coefficients + if(oldDstW!=dstW || oldSrcW!=srcW) + { +// int i; + oldDstW= dstW; oldSrcW= srcW; + + RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, s_xinc, srcW, dstW); + RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, s_xinc2, srcW, dstW); + #ifdef HAVE_MMX2 // cant downscale !!! - if((old_s_xinc != s_xinc || old_dstW!=dstW) && canMMX2BeUsed) + if(canMMX2BeUsed) { uint8_t *fragment; int imm8OfPShufW1; @@ -1481,9 +1822,6 @@ else s_xinc2= s_xinc; int xpos, i; - old_s_xinc= s_xinc; - old_dstW= dstW; - // create an optimized horizontal scaling routine //code fragment @@ -1532,20 +1870,6 @@ else s_xinc2= s_xinc; xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers - /* choose xinc so that all 8 parts fit exactly - Note: we cannot use just 1 part because it would not fit in the code cache */ -// s_xinc2_diff= -((((s_xinc2*(dstW/8))&0xFFFF))/(dstW/8))-10; -// s_xinc_diff= -((((s_xinc*(dstW/8))&0xFFFF))/(dstW/8)); -#ifdef ALT_ERROR -// s_xinc2_diff+= ((0x10000/(dstW/8))); -#endif -// s_xinc_diff= s_xinc2_diff*2; - -// s_xinc2+= s_xinc2_diff; -// s_xinc+= s_xinc_diff; - -// old_s_xinc= s_xinc; - for(i=0; i<dstW/8; i++) { int xx=xpos>>16; @@ -1602,6 +1926,7 @@ else s_xinc2= s_xinc; } #endif // HAVE_MMX2 + } // Init stuff } // reset counters while(1){ |