aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2006-10-19 01:19:03 +0000
committerMichael Niedermayer <michaelni@gmx.at>2006-10-19 01:19:03 +0000
commitebd624b66256c93f66fa8d86a22fa50a3566ffd2 (patch)
treea448a634dd66bfeb23a9166ca40cc9441d2b49a5 /libavcodec
parent99fd05cbdd0460b3f0afc88532f1cf45815d18e2 (diff)
downloadffmpeg-ebd624b66256c93f66fa8d86a22fa50a3566ffd2.tar.gz
optimize sign decoding code in decode_residual()
x86 is 4% faster on P3. C sign stuff + x86 code for everything else is also faster than before (sorry, forgot to test pure C) ... and if I replace the second occurrence of the sign decoding in decode_residual by the asm too, then everything gets slower. I am starting to think that it might be best to write the whole function in asm; playing this avoid-random-deoptimizations game with gcc is not fun at all. Originally committed as revision 6732 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec')
-rw-r--r--libavcodec/cabac.h81
-rw-r--r--libavcodec/h264.c6
2 files changed, 83 insertions, 4 deletions
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 8059ba65d6..a2828770c5 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -575,6 +575,36 @@ static int get_cabac(CABACContext *c, uint8_t * const state){
}
static int get_cabac_bypass(CABACContext *c){
+#if 0 //not faster
+ int bit;
+ asm volatile(
+ "movl "RANGE "(%1), %%ebx \n\t"
+ "movl "LOW "(%1), %%eax \n\t"
+ "shl $17, %%ebx \n\t"
+ "add %%eax, %%eax \n\t"
+ "sub %%ebx, %%eax \n\t"
+ "cdq \n\t"
+ "and %%edx, %%ebx \n\t"
+ "add %%ebx, %%eax \n\t"
+ "test %%ax, %%ax \n\t"
+ " jnz 1f \n\t"
+ "movl "BYTE "(%1), %%ebx \n\t"
+ "subl $0xFFFF, %%eax \n\t"
+ "movzwl (%%ebx), %%ecx \n\t"
+ "bswap %%ecx \n\t"
+ "shrl $15, %%ecx \n\t"
+ "addl $2, %%ebx \n\t"
+ "addl %%ecx, %%eax \n\t"
+ "movl %%ebx, "BYTE "(%1) \n\t"
+ "1: \n\t"
+ "movl %%eax, "LOW "(%1) \n\t"
+
+ :"=&d"(bit)
+ :"r"(c)
+ : "%eax", "%ebx", "%ecx", "memory"
+ );
+ return bit+1;
+#else
int range;
c->low += c->low;
@@ -588,7 +618,58 @@ static int get_cabac_bypass(CABACContext *c){
c->low -= range;
return 1;
}
+#endif
}
+
+
+static always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ // decodes one bypass bin from c; returns val unchanged or negated depending on the resulting sign mask
+#ifdef ARCH_X86
+ int bit; // NOTE(review): not referenced by the asm below or the return statement
+ asm volatile(
+ "movl "RANGE "(%1), %%ebx \n\t" // ebx = field at RANGE(c); RANGE is defined elsewhere, presumably the offset of c->range
+ "movl "LOW "(%1), %%eax \n\t" // eax = field at LOW(c); presumably c->low
+ "shl $17, %%ebx \n\t" // ebx = range << 17 (same scaling as the C fallback)
+ "add %%eax, %%eax \n\t" // eax = low * 2
+ "sub %%ebx, %%eax \n\t" // eax = low - (range << 17)
+ "cdq \n\t" // edx = sign mask of eax: all ones if eax went negative, else 0
+ "and %%edx, %%ebx \n\t" // keep the scaled range only when eax went negative
+ "add %%ebx, %%eax \n\t" // undo the subtraction in that case
+ "xor %%edx, %%ecx \n\t" // with the next line: ecx = (val ^ mask) - mask,
+ "sub %%edx, %%ecx \n\t" // i.e. val when mask is 0, -val when mask is -1
+ "test %%ax, %%ax \n\t" // are the low 16 bits of low all zero?
+ " jnz 1f \n\t" // no: skip the refill
+ "movl "BYTE "(%1), %%ebx \n\t" // ebx = field at BYTE(c); used as the read pointer below
+ "subl $0xFFFF, %%eax \n\t" // low -= 0xFFFF
+ "movzwl (%%ebx), %%edx \n\t" // load two bytes from the read pointer, zero-extended
+ "bswap %%edx \n\t" // byte-reverse: the two loaded bytes move to the top 16 bits
+ "shrl $15, %%edx \n\t" // edx = (first byte << 9) | (second byte << 1)
+ "addl $2, %%ebx \n\t" // advance the read pointer past the two consumed bytes
+ "addl %%edx, %%eax \n\t" // merge the new bits into low
+ "movl %%ebx, "BYTE "(%1) \n\t" // store the advanced read pointer back
+ "1: \n\t"
+ "movl %%eax, "LOW "(%1) \n\t" // store the updated low back
+
+ :"+c"(val) // val is read and written in ecx
+ :"r"(c) // %1: context pointer in any general register
+ : "%eax", "%ebx", "%edx", "memory" // scratch registers used above; "memory" because fields of *c are written
+ );
+ return val;
+#else
+ int range, mask;
+ c->low += c->low; // low *= 2
+
+ if(!(c->low & CABAC_MASK)) // refill() is defined elsewhere; NOTE(review): here the check runs before the range subtraction, in the asm path after it
+ refill(c);
+
+ range= c->range<<17;
+ c->low -= range;
+ mask= c->low >> 31; // all ones if low went negative, else 0 -- assumes c->low is a 32-bit signed int with arithmetic right shift; TODO confirm
+ range &= mask; // keep range only when low went negative
+ c->low += range; // undo the subtraction in that case
+ return (val^mask)-mask; // val when mask is 0, -val when mask is -1
+#endif
+}
+
//FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note I'll kill you if you move my code away from under my fingers before I am finished with it!)
//FIXME use some macros to avoid duplicating get_cabac (can't be done yet as that would make optimization work hard)
#ifdef ARCH_X86
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index d99ceb2de3..26382cb7f6 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -6168,11 +6168,9 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
if( get_cabac( CC, ctx ) == 0 ) {
if( !qmul ) {
- if( get_cabac_bypass( CC ) ) block[j] = -1;
- else block[j] = 1;
+ block[j] = get_cabac_bypass_sign( CC, -1);
}else{
- if( get_cabac_bypass( CC ) ) block[j] = (-qmul[j] + 32) >> 6;
- else block[j] = ( qmul[j] + 32) >> 6;
+ block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
}
abslevel1++;