aboutsummaryrefslogtreecommitdiffstats
path: root/postproc
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2001-10-24 00:05:30 +0000
committerMichael Niedermayer <michaelni@gmx.at>2001-10-24 00:05:30 +0000
commit73d33554000dd37188c10123a9f7963f8c78b45a (patch)
tree506275a245ae996b1133d3b06a2bd4dd9f95d6dd /postproc
parent2d83f323d63332e5ecaa481d8f9301c0ea92b6ba (diff)
downloadffmpeg-73d33554000dd37188c10123a9f7963f8c78b45a.tar.gz
more speed
Originally committed as revision 2438 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
Diffstat (limited to 'postproc')
-rw-r--r--postproc/postprocess.c66
-rw-r--r--postproc/postprocess_template.c66
2 files changed, 112 insertions, 20 deletions
diff --git a/postproc/postprocess.c b/postproc/postprocess.c
index ac5aa24cf7..b9ccde0f37 100644
--- a/postproc/postprocess.c
+++ b/postproc/postprocess.c
@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
after watching a black picture for 5 hours*/
static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture
+ int QPCorrecture= 256;
/* Temporary buffers for handling the last row(s) */
static uint8_t *tempDst= NULL;
@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
packedYOffset= 0;
}
+ if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
+ else QPCorrecture= 256;
+
/* copy first row of 8x8 blocks */
for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
-
+#ifdef ARCH_X86
+ int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
+ int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
+ int QPFrac= QPDelta;
+#endif
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
than use a temporary buffer */
if(y+15 >= height)
@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE)
{
const int stride= dstStride;
- int QP;
- if(isColor)
- {
- QP=QPs[(y>>3)*QPStride + (x>>3)];
- }
- else
+#ifdef ARCH_X86
+ int QP= *QPptr;
+ asm volatile(
+ "addl %2, %1 \n\t"
+ "sbbl %%eax, %%eax \n\t"
+ "shll $2, %%eax \n\t"
+ "subl %%eax, %0 \n\t"
+ : "+r" (QPptr), "+m" (QPFrac)
+ : "r" (QPDelta)
+ : "%eax"
+ );
+#else
+ int QP= isColor ?
+ QPs[(y>>3)*QPStride + (x>>3)]:
+ QPs[(y>>4)*QPStride + (x>>4)];
+#endif
+ if(!isColor)
{
- QP= QPs[(y>>4)*QPStride + (x>>4)];
- if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
- yHistogram[ srcBlock[srcStride*5] ]++;
+ QP= (QP* QPCorrecture)>>8;
+ yHistogram[ srcBlock[srcStride*4 + 4] ]++;
}
#ifdef HAVE_MMX
asm volatile(
@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
#endif
#ifdef HAVE_MMX2
+/*
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
+*/
+/*
+ prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
+ prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
+ prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
+ prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
+*/
+
+ asm(
+ "movl %4, %%eax \n\t"
+ "shrl $2, %%eax \n\t"
+ "andl $6, %%eax \n\t"
+ "addl $5, %%eax \n\t"
+ "movl %%eax, %%ebx \n\t"
+ "imul %1, %%eax \n\t"
+ "imul %3, %%ebx \n\t"
+ "prefetchnta 32(%%eax, %0) \n\t"
+ "prefetcht0 32(%%ebx, %2) \n\t"
+ "addl %1, %%eax \n\t"
+ "addl %3, %%ebx \n\t"
+ "prefetchnta 32(%%eax, %0) \n\t"
+ "prefetcht0 32(%%ebx, %2) \n\t"
+ :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
+ "m" (x)
+ : "%eax", "%ebx"
+ );
+
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
diff --git a/postproc/postprocess_template.c b/postproc/postprocess_template.c
index ac5aa24cf7..b9ccde0f37 100644
--- a/postproc/postprocess_template.c
+++ b/postproc/postprocess_template.c
@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
after watching a black picture for 5 hours*/
static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture
+ int QPCorrecture= 256;
/* Temporary buffers for handling the last row(s) */
static uint8_t *tempDst= NULL;
@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
packedYOffset= 0;
}
+ if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
+ else QPCorrecture= 256;
+
/* copy first row of 8x8 blocks */
for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
-
+#ifdef ARCH_X86
+ int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
+ int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
+ int QPFrac= QPDelta;
+#endif
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
than use a temporary buffer */
if(y+15 >= height)
@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE)
{
const int stride= dstStride;
- int QP;
- if(isColor)
- {
- QP=QPs[(y>>3)*QPStride + (x>>3)];
- }
- else
+#ifdef ARCH_X86
+ int QP= *QPptr;
+ asm volatile(
+ "addl %2, %1 \n\t"
+ "sbbl %%eax, %%eax \n\t"
+ "shll $2, %%eax \n\t"
+ "subl %%eax, %0 \n\t"
+ : "+r" (QPptr), "+m" (QPFrac)
+ : "r" (QPDelta)
+ : "%eax"
+ );
+#else
+ int QP= isColor ?
+ QPs[(y>>3)*QPStride + (x>>3)]:
+ QPs[(y>>4)*QPStride + (x>>4)];
+#endif
+ if(!isColor)
{
- QP= QPs[(y>>4)*QPStride + (x>>4)];
- if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
- yHistogram[ srcBlock[srcStride*5] ]++;
+ QP= (QP* QPCorrecture)>>8;
+ yHistogram[ srcBlock[srcStride*4 + 4] ]++;
}
#ifdef HAVE_MMX
asm volatile(
@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
#endif
#ifdef HAVE_MMX2
+/*
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
+*/
+/*
+ prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
+ prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
+ prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
+ prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
+*/
+
+ asm(
+ "movl %4, %%eax \n\t"
+ "shrl $2, %%eax \n\t"
+ "andl $6, %%eax \n\t"
+ "addl $5, %%eax \n\t"
+ "movl %%eax, %%ebx \n\t"
+ "imul %1, %%eax \n\t"
+ "imul %3, %%ebx \n\t"
+ "prefetchnta 32(%%eax, %0) \n\t"
+ "prefetcht0 32(%%ebx, %2) \n\t"
+ "addl %1, %%eax \n\t"
+ "addl %3, %%ebx \n\t"
+ "prefetchnta 32(%%eax, %0) \n\t"
+ "prefetcht0 32(%%ebx, %2) \n\t"
+ :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
+ "m" (x)
+ : "%eax", "%ebx"
+ );
+
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);