diff options
author | Henrik Gramner <hengar-6@student.ltu.se> | 2012-04-04 20:03:15 +0000 |
---|---|---|
committer | Justin Ruggles <justin.ruggles@gmail.com> | 2012-04-11 15:47:00 -0400 |
commit | 729f90e26802057f06905ab15a34612168eeac80 (patch) | |
tree | 41f8c4cedf10851b5b437aeeb558ce3d0f8db1dc /libavcodec/x86/h264_deblock.asm | |
parent | e1ce756844e684876318570dcebc74bc66c084f0 (diff) | |
download | ffmpeg-729f90e26802057f06905ab15a34612168eeac80.tar.gz |
x86inc improvements for 64-bit
Add support for all x86-64 registers
Prefer caller-saved register over callee-saved on WIN64
Support up to 15 function arguments
Also (by Ronald S. Bultje)
Fix up our asm to work with new x86inc.asm.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
Diffstat (limited to 'libavcodec/x86/h264_deblock.asm')
-rw-r--r-- | libavcodec/x86/h264_deblock.asm | 60 |
1 files changed, 30 insertions, 30 deletions
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 0f61922276..1982dc4bd3 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -328,11 +328,11 @@ cglobal deblock_v_luma_8_%1, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal deblock_h_luma_8_%1, 5,7
-    movsxd r10, r1d
-    lea r11, [r10+r10*2]
+cglobal deblock_h_luma_8_%1, 5,9
+    movsxd r7, r1d
+    lea r8, [r7+r7*2]
     lea r6, [r0-4]
-    lea r5, [r0-4+r11]
+    lea r5, [r0-4+r8]
 %if WIN64
     sub rsp, 0x98
     %define pix_tmp rsp+0x30
@@ -342,14 +342,14 @@ cglobal deblock_h_luma_8_%1, 5,7
 %endif
 
     ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
-    lea r6, [r6+r10*8]
-    lea r5, [r5+r10*8]
-    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
+    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
+    lea r6, [r6+r7*8]
+    lea r5, [r5+r7*8]
+    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
 
     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
     lea r0, [pix_tmp+0x30]
     mov r1d, 0x10
 %if WIN64
@@ -364,17 +364,17 @@ cglobal deblock_h_luma_8_%1, 5,7
     movq m1, [pix_tmp+0x28]
     movq m2, [pix_tmp+0x38]
     movq m3, [pix_tmp+0x48]
-    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
 
-    shl r10, 3
-    sub r6, r10
-    sub r5, r10
-    shr r10, 3
+    shl r7, 3
+    sub r6, r7
+    sub r5, r7
+    shr r7, 3
     movq m0, [pix_tmp+0x10]
     movq m1, [pix_tmp+0x20]
     movq m2, [pix_tmp+0x30]
     movq m3, [pix_tmp+0x40]
-    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
 
 %if WIN64
     add rsp, 0x98
@@ -705,32 +705,32 @@ INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8_%1, 4,7
-    movsxd r10, r1d
-    lea r11, [r10*3]
+cglobal deblock_h_luma_intra_8_%1, 4,9
+    movsxd r7, r1d
+    lea r8, [r7*3]
     lea r6, [r0-4]
-    lea r5, [r0-4+r11]
+    lea r5, [r0-4+r8]
     sub rsp, 0x88
     %define pix_tmp rsp
 
     ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea r6, [r6+r10*8]
-    lea r5, [r5+r10*8]
-    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+    lea r6, [r6+r7*8]
+    lea r5, [r5+r7*8]
+    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
 
     lea r0, [pix_tmp+0x40]
     mov r1, 0x10
     call deblock_v_luma_intra_8_%1
 
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    lea r5, [r6+r11]
-    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
-    shl r10, 3
-    sub r6, r10
-    sub r5, r10
-    shr r10, 3
-    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
+    lea r5, [r6+r8]
+    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+    shl r7, 3
+    sub r6, r7
+    sub r5, r7
+    shr r7, 3
+    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
     add rsp, 0x88
     RET
 %else