Roll back 4:4:4 H.264 for now

The 4:4:4 support needs some ARM/PPC asm modifications first.
author: Jason Garrett-Glaser <jason@x264.com> 2011-06-13 13:38:46 -0700
committer: Jason Garrett-Glaser <jason@x264.com> 2011-06-13 13:38:46 -0700
commit: 504811baeacf8bac400962e84fca678b79068ceb (patch)
tree: b32cdbea17132514b3f7beace314d039be6a8117 /libavcodec/x86
parent: 295f0a2503550088a5ffddc5754b9fba2fa6ee60 (diff)
download: ffmpeg-504811baeacf8bac400962e84fca678b79068ceb.tar.gz
4 files changed, 40 insertions, 58 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 214c6a3945..1cc6991666 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -784,7 +784,7 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 
 /* draw the edges of width 'w' of an image of size width, height
    this mmx version can only handle w==8 || w==16 */
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
 {
     uint8_t *ptr, *last_line;
     int i;
@@ -839,7 +839,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w,
 
     /* top and bottom (and hopefully also the corners) */
     if (sides&EDGE_TOP) {
-        for(i = 0; i < h; i += 4) {
+        for(i = 0; i < w; i += 4) {
             ptr= buf - (i + 1) * wrap - w;
             __asm__ volatile(
                     "1:                             \n\t"
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index b5f77c90d5..c850dc2ef3 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -36,7 +36,7 @@
 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
-                                   int *index, int last_off){
+                                   int *index){
     void *end= significant_coeff_ctx_base + max_coeff - 1;
     int minusstart= -(int)significant_coeff_ctx_base;
     int minusindex= 4-(int)index;
@@ -52,12 +52,10 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
 
         "test $1, %%edx                         \n\t"
         " jz 3f                                 \n\t"
-        "add  %7, %1                            \n\t"
 
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx",
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx",
                              "%%bx", "%%esi", "%%eax", "%%al")
 
-        "sub  %7, %1                            \n\t"
         "mov  %2, %%"REG_a"                     \n\t"
         "movl %4, %%ecx                         \n\t"
         "add  %1, %%"REG_c"                     \n\t"
@@ -84,7 +82,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
         "movl %%esi, "RANGE    "(%3)            \n\t"
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)
-        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
+        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)
         : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
     );
     return coeff_count;
@@ -92,7 +90,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
 
 static int decode_significance_8x8_x86(CABACContext *c,
                                        uint8_t *significant_coeff_ctx_base,
-                                       int *index, int last_off, const uint8_t *sig_off){
+                                       int *index, const uint8_t *sig_off){
     int minusindex= 4-(int)index;
     int coeff_count;
     x86_reg last=0;
@@ -116,9 +114,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
 
         "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
         "add %5, %%"REG_D"                      \n\t"
-        "add %7, %%"REG_D"                      \n\t"
 
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx",
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx",
                              "%%bx", "%%esi", "%%eax", "%%al")
 
         "mov %2, %%"REG_a"                      \n\t"
@@ -145,7 +142,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "movl %%esi, "RANGE    "(%3)            \n\t"
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count),"+m"(last), "+m"(index)
-        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_off)
+        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)
         : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"
     );
     return coeff_count;
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 4788da98e0..f90f41c4bc 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -32,18 +32,14 @@
 SECTION_RODATA
 
 ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
-scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
-           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
-           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
-           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
-           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
-           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
-           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
-           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
-           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
-           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
-           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
-           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
+scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
+           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
+           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
+           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
+           db 1+1*8, 2+1*8
+           db 1+2*8, 2+2*8
+           db 1+4*8, 2+4*8
+           db 1+5*8, 2+5*8
 %ifdef PIC
 %define scan8 r11
 %else
@@ -621,8 +617,6 @@ cglobal h264_idct_add8_8_mmx, 5, 7, 0
     mov         r10, r0
 %endif
     call         h264_idct_add8_mmx_plane
-    mov          r5, 32
-    add          r2, 384
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
@@ -684,8 +678,6 @@ cglobal h264_idct_add8_8_mmx2, 5, 7, 0
     lea         r11, [scan8_mem]
 %endif
     call h264_idct_add8_mmx2_plane
-    mov          r5, 32
-    add          r2, 384
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
@@ -818,12 +810,12 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
     test        r0, r0
     jz .try%1dc
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    mov        r0d, dword [r1+%1*8+64]
     add         r0, [r10]
 %else
     mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    add         r0, dword [r1+%1*8+64]
 %endif
     call        x264_add8x4_idct_sse2
     jmp .cycle%1end
@@ -832,18 +824,16 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
     or         r0w, word [r2+32]
     jz .cycle%1end
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    mov        r0d, dword [r1+%1*8+64]
     add         r0, [r10]
 %else
     mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    add         r0, dword [r1+%1*8+64]
 %endif
     call        h264_idct_dc_add8_mmx2
 .cycle%1end
-%if %1 == 1
-    add         r2, 384+64
-%elif %1 < 3
+%if %1 < 3
     add         r2, 64
 %endif
 %endmacro
@@ -855,15 +845,15 @@ cglobal h264_idct_add8_8_sse2, 5, 7, 8
 %ifdef ARCH_X86_64
     mov         r10, r0
 %endif
-    add8_sse2_cycle 0, 0x34
-    add8_sse2_cycle 1, 0x3c
+    add8_sse2_cycle 0, 0x09
+    add8_sse2_cycle 1, 0x11
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
     add        r0mp, gprsize
 %endif
-    add8_sse2_cycle 2, 0x5c
-    add8_sse2_cycle 3, 0x64
+    add8_sse2_cycle 2, 0x21
+    add8_sse2_cycle 3, 0x29
     RET
 
 ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 54636a95d0..3f7cf4cefc 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -29,18 +29,14 @@ SECTION_RODATA
 
 pw_pixel_max: times 8 dw ((1 << 10)-1)
 pd_32:        times 4 dd 32
-scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
-           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
-           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
-           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
-           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
-           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
-           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
-           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
-           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
-           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
-           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
-           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
+scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
+           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
+           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
+           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
+           db 1+1*8, 2+1*8
+           db 1+2*8, 2+2*8
+           db 1+4*8, 2+4*8
+           db 1+5*8, 2+5*8
 
 %ifdef PIC
 %define scan8 r11
@@ -310,7 +306,7 @@ INIT_AVX
 IDCT_ADD16INTRA_10 avx
 %endif
 
-%assign last_block 36
+%assign last_block 24
 ;-----------------------------------------------------------------------------
 ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
 ;-----------------------------------------------------------------------------
@@ -321,22 +317,21 @@ cglobal h264_idct_add8_10_%1,5,7
 %endif
     add      r2, 1024
     mov      r0, [r0]
-    ADD16_OP_INTRA %1, 16, 4+ 6*8
-    ADD16_OP_INTRA %1, 18, 4+ 7*8
-    add      r2, 1024-128*2
+    ADD16_OP_INTRA %1, 16, 1+1*8
+    ADD16_OP_INTRA %1, 18, 1+2*8
 %ifdef ARCH_X86_64
     mov      r0, [r10+gprsize]
 %else
     mov      r0, r0m
     mov      r0, [r0+gprsize]
 %endif
-    ADD16_OP_INTRA %1, 32, 4+11*8
-    ADD16_OP_INTRA %1, 34, 4+12*8
+    ADD16_OP_INTRA %1, 20, 1+4*8
+    ADD16_OP_INTRA %1, 22, 1+5*8
     REP_RET
     AC %1, 16
     AC %1, 18
-    AC %1, 32
-    AC %1, 34
+    AC %1, 20
+    AC %1, 22
 
 %endmacro ; IDCT_ADD8
author	Jason Garrett-Glaser <jason@x264.com>	2011-06-13 13:38:46 -0700
committer	Jason Garrett-Glaser <jason@x264.com>	2011-06-13 13:38:46 -0700
commit	504811baeacf8bac400962e84fca678b79068ceb (patch)
tree	b32cdbea17132514b3f7beace314d039be6a8117 /libavcodec/x86
parent	295f0a2503550088a5ffddc5754b9fba2fa6ee60 (diff)
download	ffmpeg-504811baeacf8bac400962e84fca678b79068ceb.tar.gz