| author | Henrik Gramner <hengar-6@student.ltu.se> | 2012-04-04 20:03:15 +0000 |
|---|---|---|
| committer | Justin Ruggles <justin.ruggles@gmail.com> | 2012-04-11 15:47:00 -0400 |
| commit | 729f90e26802057f06905ab15a34612168eeac80 (patch) | |
| tree | 41f8c4cedf10851b5b437aeeb558ce3d0f8db1dc /libavcodec/x86/dsputil_yasm.asm | |
| parent | e1ce756844e684876318570dcebc74bc66c084f0 (diff) | |
| download | ffmpeg-729f90e26802057f06905ab15a34612168eeac80.tar.gz | |
x86inc improvements for 64-bit
- Add support for all x86-64 registers
- Prefer caller-saved registers over callee-saved ones on WIN64
- Support up to 15 function arguments

Also (by Ronald S. Bultje): fix up our asm to work with the new x86inc.asm.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
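
For context on what the new conventions look like in use, here is a minimal sketch of an x86inc-style routine written against the updated macros. The function `example_copy8` and its argument layout are invented for illustration; the point is that `cglobal` declares how many argument and scratch GPRs a function uses, the body refers to registers by index (`r0`-`r14` on x86-64 with the new x86inc), and the macros map each index to a machine register appropriate for the target ABI.

```nasm
; Hypothetical sketch assuming the updated x86inc.asm; ARCH_X86_64 etc.
; are expected to come from the build system, as in FFmpeg's config.asm.
%include "x86inc.asm"

SECTION .text

%if ARCH_X86_64
; cglobal name, num_args, num_gprs, num_xmm_regs
cglobal example_copy8, 3, 9, 0  ; args in r0/r1/r2: dst, src, qword count
.loop:
    mov    r7, [r1]    ; r7/r8 are only valid names under the new numbering;
    mov  [r0], r7      ; x86inc decides which machine registers they are
    add    r1, 8
    add    r0, 8
    dec    r2          ; assumes count > 0
    jg .loop
    RET                ; x86inc's RET also restores any saved registers
%endif
```

Declaring 9 GPRs, as the patched `emu_edge_core` now does, is what makes `r7`/`r8` legal here: on WIN64 those indices can land on callee-saved machine registers, which `cglobal` then saves in the prologue and `RET` restores.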
Diffstat (limited to 'libavcodec/x86/dsputil_yasm.asm')
-rw-r--r-- | libavcodec/x86/dsputil_yasm.asm | 36
1 file changed, 18 insertions, 18 deletions
```diff
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 00dc18b469..bec4063260 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -497,9 +497,9 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
 %macro EMU_EDGE_FUNC 0
 %if ARCH_X86_64
-%define w_reg r10
-cglobal emu_edge_core, 6, 7, 1
-    mov r11, r5 ; save block_h
+%define w_reg r7
+cglobal emu_edge_core, 6, 9, 1
+    mov r8, r5 ; save block_h
 %else
 %define w_reg r6
 cglobal emu_edge_core, 2, 7, 0
@@ -536,7 +536,7 @@ cglobal emu_edge_core, 2, 7, 0
     sub r0, w_reg
 %if ARCH_X86_64
     mov r3, r0 ; backup of buf+block_h*linesize
-    mov r5, r11
+    mov r5, r8
 %else
     mov r0m, r0 ; backup of buf+block_h*linesize
     mov r5, r5m
@@ -550,7 +550,7 @@ cglobal emu_edge_core, 2, 7, 0
 ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
     sar w_reg, 1
     sal w_reg, 6
-    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
+    ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
     ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
 %ifdef PIC
     lea rax, [.emuedge_extend_left_2]
@@ -560,7 +560,7 @@ cglobal emu_edge_core, 2, 7, 0
 %endif
     call w_reg

-    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
+    ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
 .right_extend:
 %if ARCH_X86_32
     mov r0, r0m
@@ -591,7 +591,7 @@ cglobal emu_edge_core, 2, 7, 0
 %define vall al
 %define valh ah
 %define valw ax
-%define valw2 r10w
+%define valw2 r7w
 %define valw3 r3w
 %if WIN64
 %define valw4 r4w
@@ -618,7 +618,7 @@ cglobal emu_edge_core, 2, 7, 0
 ; - else if (%2 & 8) fills 8 bytes into mm0
 ; - if (%2 & 7 == 4) fills the last 4 bytes into rax
 ; - else if (%2 & 4) fills 4 bytes into mm0-1
-; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
+; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax
 ;   (note that we're using r3 for body/bottom because it's a shorter
 ;    opcode, and then the loop fits in 128 bytes)
 ; - else fills remaining bytes into rax
@@ -848,7 +848,7 @@ ALIGN 64
 %endrep
 %endmacro ; LEFT_EXTEND

-; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
+; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
 %macro RIGHT_EXTEND 0
 %assign %%n 2
 %rep 11
@@ -858,7 +858,7 @@ ALIGN 64
     sub r3, r2 ; dst -= linesize
     READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
     WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
-    dec r11
+    dec r8
 %else ; ARCH_X86_32
     sub r0, r2 ; dst -= linesize
     READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
@@ -937,11 +937,11 @@ ALIGN 64
 %macro SLOW_V_EXTEND 0
 .slow_v_extend_loop:
 ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
-; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
+; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
 %if ARCH_X86_64
-    push r11 ; save old value of block_h
+    push r8 ; save old value of block_h
     test r3, r3
-%define cnt_reg r11
+%define cnt_reg r8
     jz .do_body_copy ; if (!start_y) goto do_body_copy
     V_COPY_ROW top, r3
 %else
@@ -955,7 +955,7 @@ ALIGN 64
     V_COPY_ROW body, r4

 %if ARCH_X86_64
-    pop r11 ; restore old value of block_h
+    pop r8 ; restore old value of block_h
 %define cnt_reg r3
 %endif
     test r5, r5
@@ -974,7 +974,7 @@ ALIGN 64

 %macro SLOW_LEFT_EXTEND 0
 .slow_left_extend_loop:
-; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
+; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
     mov r4, 8
     sub r0, linesize
     READ_V_PIXEL 8, [r0+w_reg]
@@ -1002,11 +1002,11 @@ ALIGN 64

 %macro SLOW_RIGHT_EXTEND 0
 .slow_right_extend_loop:
-; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
-; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
+; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
+; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
 %if ARCH_X86_64
 %define buf_reg r3
-%define bh_reg r11
+%define bh_reg r8
 %else
 %define buf_reg r0
 %define bh_reg r5
```
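
The diff itself is a mechanical rename: everywhere the old code reached for the raw machine registers r10/r11 (caller-saved in both 64-bit ABIs, hence safe to use behind the old seven-register x86inc's back), it now uses the managed names r7/r8 and declares 9 GPRs so x86inc knows about them. The sketch below illustrates the WIN64 point from the commit message; the functions are hypothetical, and the register mappings described in the comments are my reading of the updated x86inc rather than something stated on this page.

```nasm
; Hypothetical sketch assuming the updated x86inc.asm: managed indices are
; ordered caller-saved first on WIN64, so a function that fits in r0-r6
; needs no register saves in its prologue at all.
%include "x86inc.asm"

SECTION .text

%if ARCH_X86_64
cglobal example_no_saves, 2, 7, 0   ; stays within r0-r6: empty prologue
    lea    r6, [r0+r1]              ; r6 maps to rax on both 64-bit ABIs,
    RET                             ; so the sum is already the return value

cglobal example_with_saves, 2, 9, 0 ; r7/r8 can be callee-saved on WIN64;
    mov    r7, r0                   ; cglobal saves them, RET restores them
    mov    r8, r1
    lea   rax, [r7+r8]
    RET
%endif
```

On UNIX64, r7/r8 appear to map straight back to the machine registers r10/r11, so the renamed emu_edge_core should assemble to essentially the same code as before on that ABI. The rename still matters because, once x86inc manages all GPRs, hardcoded machine registers can collide with cglobal's own assignments (on WIN64 the machine registers r10/r11 back other managed indices). The `6, 9, 1` in the new declaration reads as: six arguments, nine GPRs, one XMM register.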