aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/dsputil_yasm.asm
diff options
context:
space:
mode:
author: Henrik Gramner <hengar-6@student.ltu.se> 2012-04-04 20:03:15 +0000
committer: Justin Ruggles <justin.ruggles@gmail.com> 2012-04-11 15:47:00 -0400
commit: 729f90e26802057f06905ab15a34612168eeac80 (patch)
tree: 41f8c4cedf10851b5b437aeeb558ce3d0f8db1dc /libavcodec/x86/dsputil_yasm.asm
parent: e1ce756844e684876318570dcebc74bc66c084f0 (diff)
download: ffmpeg-729f90e26802057f06905ab15a34612168eeac80.tar.gz
x86inc improvements for 64-bit
Add support for all x86-64 registers. Prefer caller-saved register over callee-saved on WIN64. Support up to 15 function arguments. Also (by Ronald S. Bultje): Fix up our asm to work with new x86inc.asm. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>; Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
Diffstat (limited to 'libavcodec/x86/dsputil_yasm.asm')
-rw-r--r-- libavcodec/x86/dsputil_yasm.asm 36
1 files changed, 18 insertions, 18 deletions
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 00dc18b469..bec4063260 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -497,9 +497,9 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
-%define w_reg r10
-cglobal emu_edge_core, 6, 7, 1
- mov r11, r5 ; save block_h
+%define w_reg r7
+cglobal emu_edge_core, 6, 9, 1
+ mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
@@ -536,7 +536,7 @@ cglobal emu_edge_core, 2, 7, 0
sub r0, w_reg
%if ARCH_X86_64
mov r3, r0 ; backup of buf+block_h*linesize
- mov r5, r11
+ mov r5, r8
%else
mov r0m, r0 ; backup of buf+block_h*linesize
mov r5, r5m
@@ -550,7 +550,7 @@ cglobal emu_edge_core, 2, 7, 0
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
sar w_reg, 1
sal w_reg, 6
- ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
+ ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
lea rax, [.emuedge_extend_left_2]
@@ -560,7 +560,7 @@ cglobal emu_edge_core, 2, 7, 0
%endif
call w_reg
- ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
+ ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
mov r0, r0m
@@ -591,7 +591,7 @@ cglobal emu_edge_core, 2, 7, 0
%define vall al
%define valh ah
%define valw ax
-%define valw2 r10w
+%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r4w
@@ -618,7 +618,7 @@ cglobal emu_edge_core, 2, 7, 0
; - else if (%2 & 8) fills 8 bytes into mm0
; - if (%2 & 7 == 4) fills the last 4 bytes into rax
; - else if (%2 & 4) fills 4 bytes into mm0-1
-; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
+; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax
; (note that we're using r3 for body/bottom because it's a shorter
; opcode, and then the loop fits in 128 bytes)
; - else fills remaining bytes into rax
@@ -848,7 +848,7 @@ ALIGN 64
%endrep
%endmacro ; LEFT_EXTEND
-; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
+; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
@@ -858,7 +858,7 @@ ALIGN 64
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
- dec r11
+ dec r8
%else ; ARCH_X86_32
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
@@ -937,11 +937,11 @@ ALIGN 64
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
-; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
+; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
- push r11 ; save old value of block_h
+ push r8 ; save old value of block_h
test r3, r3
-%define cnt_reg r11
+%define cnt_reg r8
jz .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, r3
%else
@@ -955,7 +955,7 @@ ALIGN 64
V_COPY_ROW body, r4
%if ARCH_X86_64
- pop r11 ; restore old value of block_h
+ pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
test r5, r5
@@ -974,7 +974,7 @@ ALIGN 64
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
-; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
+; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
mov r4, 8
sub r0, linesize
READ_V_PIXEL 8, [r0+w_reg]
@@ -1002,11 +1002,11 @@ ALIGN 64
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
-; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
-; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
+; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
+; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
-%define bh_reg r11
+%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5