diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-04-13 00:39:19 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-04-13 00:39:19 +0200 |
commit | ca19862d3879dfadfd6c1a1a443eac40fc433385 (patch) | |
tree | dc114d719849d6c3645063e02378dfddecbc991a /libavcodec/x86/h264_idct.asm | |
parent | 5eeecde859ebfa384fbe3c8a1aa62a6b8f6c58e4 (diff) | |
parent | 18e8fef7c9292041e3266487d9bc6b59e5eaa798 (diff) | |
download | ffmpeg-ca19862d3879dfadfd6c1a1a443eac40fc433385.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
libxvid: remove disabled code
qdm2: make a table static const
qdm2: simplify bitstream reader setup for some subpacket types
qdm2: use get_bits_left()
build: Consistently handle conditional compilation for all optimization OBJS.
avpacket, bfi, bgmc, rawenc: K&R prettyprinting cosmetics
msrle: convert MS RLE decoding function to bytestream2.
x86inc improvements for 64-bit
Conflicts:
common.mak
libavcodec/avpacket.c
libavcodec/bfi.c
libavcodec/msrledec.c
libavcodec/qdm2.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/h264_idct.asm')
-rw-r--r-- | libavcodec/x86/h264_idct.asm | 156 |
1 file changed, 73 insertions, 83 deletions
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 25f4755fa1..da045f71e2 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -45,8 +45,10 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 db 4+13*8, 5+13*8, 4+14*8, 5+14*8 db 6+13*8, 7+13*8, 6+14*8, 7+14*8 %ifdef PIC -%define scan8 r11 +%define npicregs 1 +%define scan8 picregq %else +%define npicregs 0 %define scan8 scan8_mem %endif @@ -301,10 +303,10 @@ cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0 ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8_mmx, 5, 7, 0 +cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -323,13 +325,13 @@ cglobal h264_idct_add16_8_mmx, 5, 7, 0 ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8_mmx, 5, 7, 0 +cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg %assign pad 128+4-(stack_offset&7) SUB rsp, pad xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -355,10 +357,10 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0 ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8_mmx2, 5, 7, 0 +cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -371,16 +373,13 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 test r6, r6 jz .no_dc DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 
-%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d %endif - mov dst_regd, dword [r1+r5*4] - lea dst_reg, [r0+dst_reg] - DC_ADD_MMX2_OP movh, dst_reg, r3, r6 + mov dst2d, dword [r1+r5*4] + lea dst2q, [r0+dst2q] + DC_ADD_MMX2_OP movh, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -402,10 +401,10 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 +cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -425,10 +424,10 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 +cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -448,16 +447,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 test r6, r6 jz .skipblock DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 -%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d %endif - mov dst_regd, dword [r1+r5*4] - add dst_reg, r0 - DC_ADD_MMX2_OP movh, dst_reg, r3, r6 + mov dst2d, dword [r1+r5*4] + add dst2q, r0 + DC_ADD_MMX2_OP movh, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -470,13 +466,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t 
nnzc[6*8]) -cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 +cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg %assign pad 128+4-(stack_offset&7) SUB rsp, pad xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -489,18 +485,15 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 test r6, r6 jz .no_dc DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 -%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d -%endif - mov dst_regd, dword [r1+r5*4] - lea dst_reg, [r0+dst_reg] - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 - lea dst_reg, [dst_reg+r3*4] - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d +%endif + mov dst2d, dword [r1+r5*4] + lea dst2q, [r0+dst2q] + DC_ADD_MMX2_OP mova, dst2q, r3, r6 + lea dst2q, [dst2q+r3*4] + DC_ADD_MMX2_OP mova, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -533,10 +526,10 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 INIT_XMM ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8_sse2, 5, 7, 10 +cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -550,18 +543,15 @@ cglobal h264_idct8_add4_8_sse2, 5, 7, 10 jz .no_dc INIT_MMX DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 -%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d -%endif - mov dst_regd, dword [r1+r5*4] - add dst_reg, r0 - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 - lea dst_reg, [dst_reg+r3*4] - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d +%endif + mov dst2d, dword [r1+r5*4] + add dst2q, r0 + DC_ADD_MMX2_OP mova, dst2q, r3, r6 
+ lea dst2q, [dst2q+r3*4] + DC_ADD_MMX2_OP mova, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -572,9 +562,9 @@ INIT_MMX REP_RET .no_dc INIT_XMM - mov dst_regd, dword [r1+r5*4] - add dst_reg, r0 - IDCT8_ADD_SSE dst_reg, r2, r3, r6 + mov dst2d, dword [r1+r5*4] + add dst2q, r0 + IDCT8_ADD_SSE dst2q, r2, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -595,7 +585,7 @@ h264_idct_add8_mmx_plane: jz .skipblock %if ARCH_X86_64 mov r0d, dword [r1+r5*4] - add r0, [r10] + add r0, [dst2q] %else mov r0, r1m ; XXX r1m here is actually r0m of the calling func mov r0, [r0] @@ -611,20 +601,20 @@ h264_idct_add8_mmx_plane: ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8_mmx, 5, 7, 0 +cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg mov r5, 16 add r2, 512 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif %if ARCH_X86_64 - mov r10, r0 + mov dst2q, r0 %endif call h264_idct_add8_mmx_plane mov r5, 32 add r2, 384 %if ARCH_X86_64 - add r10, gprsize + add dst2q, gprsize %else add r0mp, gprsize %endif @@ -639,7 +629,7 @@ h264_idct_add8_mmx2_plane jz .try_dc %if ARCH_X86_64 mov r0d, dword [r1+r5*4] - add r0, [r10] + add r0, [dst2q] %else mov r0, r1m ; XXX r1m here is actually r0m of the calling func mov r0, [r0] @@ -658,7 +648,7 @@ h264_idct_add8_mmx2_plane DC_ADD_MMX2_INIT r2, r3, r6 %if ARCH_X86_64 mov r0d, dword [r1+r5*4] - add r0, [r10] + add r0, [dst2q] %else mov r0, r1m ; XXX r1m here is actually r0m of the calling func mov r0, [r0] @@ -674,20 +664,20 @@ h264_idct_add8_mmx2_plane ; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8_mmx2, 5, 7, 0 +cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg mov r5, 16 add r2, 512 %if ARCH_X86_64 
- mov r10, r0 + mov dst2q, r0 %endif %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif call h264_idct_add8_mmx2_plane mov r5, 32 add r2, 384 %if ARCH_X86_64 - add r10, gprsize + add dst2q, gprsize %else add r0mp, gprsize %endif @@ -739,7 +729,7 @@ x264_add8x4_idct_sse2: jz .cycle%1end mov r0d, dword [r1+%1*8] %if ARCH_X86_64 - add r0, r10 + add r0, r5 %else add r0, r0m %endif @@ -752,9 +742,9 @@ x264_add8x4_idct_sse2: ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8_sse2, 5, 5, 8 +cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8 %if ARCH_X86_64 - mov r10, r0 + mov r5, r0 %endif ; unrolling of the loop leads to an average performance gain of ; 20-25% @@ -774,7 +764,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 jz .try%1dc mov r0d, dword [r1+%1*8] %if ARCH_X86_64 - add r0, r10 + add r0, r7 %else add r0, r0m %endif @@ -786,7 +776,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 jz .cycle%1end mov r0d, dword [r1+%1*8] %if ARCH_X86_64 - add r0, r10 + add r0, r7 %else add r0, r0m %endif @@ -799,9 +789,9 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 +cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8 %if ARCH_X86_64 - mov r10, r0 + mov r7, r0 %endif add16intra_sse2_cycle 0, 0xc add16intra_sse2_cycle 1, 0x14 @@ -819,7 +809,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 jz .try%1dc %if ARCH_X86_64 mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] - add r0, [r10] + add r0, [r7] %else mov r0, r0m mov r0, [r0] @@ -833,7 +823,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 jz .cycle%1end %if ARCH_X86_64 mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] - add r0, [r10] + add r0, [r7] %else mov r0, r0m mov r0, [r0] @@ -850,15 +840,15 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 ; 
ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8_sse2, 5, 7, 8 +cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8 add r2, 512 %if ARCH_X86_64 - mov r10, r0 + mov r7, r0 %endif add8_sse2_cycle 0, 0x34 add8_sse2_cycle 1, 0x3c %if ARCH_X86_64 - add r10, gprsize + add r7, gprsize %else add r0mp, gprsize %endif |