diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2011-06-01 21:53:15 -0400 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2011-06-02 07:07:02 -0700 |
commit | 994c3550ffe032385833c21876fb121f59516079 (patch) | |
tree | ae8a05c1b8b0252711631a6b7075ae51c47fca95 /libavcodec/x86 | |
parent | cc9947ffbe8a847a49d092c9253910cfc53279ad (diff) | |
download | ffmpeg-994c3550ffe032385833c21876fb121f59516079.tar.gz |
2x faster h264_idct_add8_10.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/h264_idct_10bit.asm | 55 |
1 file changed, 21 insertions, 34 deletions
```diff
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 3d0004e09e..64089001e5 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -249,16 +249,17 @@ IDCT8_DC_ADD avx
     jmp .skipadd%2
 %endmacro
 
+%assign last_block 16
 %macro ADD16_OP_INTRA 3
     cmp word [r4+%3], 0
     jnz .ac%2
-    mov r6d, dword [r2+ 0]
-    or r6d, dword [r2+64]
+    mov r5d, dword [r2+ 0]
+    or r5d, dword [r2+64]
     jz .skipblock%2
-    mov r5d, dword [r1+(%2+0)*4]
+    mov r5d, dword [r1+(%2+0)*4]
     call idct_dc_add_%1
 .skipblock%2:
-%if %2<15
+%if %2<last_block-2
     add r2, 128
 %endif
 .skipadd%2:
@@ -302,47 +303,33 @@ INIT_AVX
 IDCT_ADD16INTRA_10 avx
 %endif
 
+%assign last_block 24
 ;-----------------------------------------------------------------------------
 ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
 ;-----------------------------------------------------------------------------
 %macro IDCT_ADD8 1
 cglobal h264_idct_add8_10_%1,5,7
-    mov r5, 16
-    add r2, 1024
-%ifdef PIC
-    lea r11, [scan8_mem]
-%endif
 %ifdef ARCH_X86_64
-    mov r10, r0
+    mov r10, r0
 %endif
-.nextblock:
-    movzx r6, byte [scan8+r5]
-    movzx r6, byte [r4+r6]
-    or r6d, dword [r2]
-    test r6, r6
-    jz .skipblock
+    add r2, 1024
+    mov r0, [r0]
+    ADD16_OP_INTRA %1, 16, 1+1*8
+    ADD16_OP_INTRA %1, 18, 1+2*8
 %ifdef ARCH_X86_64
-    mov r0d, dword [r1+r5*4]
-    add r0, [r10]
+    mov r0, [r10+gprsize]
 %else
-    mov r0, r0m
-    mov r0, [r0]
-    add r0, dword [r1+r5*4]
+    mov r0, r0m
+    mov r0, [r0+gprsize]
 %endif
-    IDCT4_ADD_10 r0, r2, r3
-.skipblock:
-    inc r5
-    add r2, 64
-    test r5, 3
-    jnz .nextblock
-%ifdef ARCH_X86_64
-    add r10, gprsize
-%else
-    add r0mp, gprsize
-%endif
-    test r5, 4
-    jnz .nextblock
+    ADD16_OP_INTRA %1, 20, 1+4*8
+    ADD16_OP_INTRA %1, 22, 1+5*8
     REP_RET
+    AC %1, 16
+    AC %1, 18
+    AC %1, 20
+    AC %1, 22
+
 %endmacro ; IDCT_ADD8
 
 INIT_XMM
```