diff options
author | Henrik Gramner <hengar-6@student.ltu.se> | 2012-04-04 20:03:15 +0000 |
---|---|---|
committer | Justin Ruggles <justin.ruggles@gmail.com> | 2012-04-11 15:47:00 -0400 |
commit | 729f90e26802057f06905ab15a34612168eeac80 (patch) | |
tree | 41f8c4cedf10851b5b437aeeb558ce3d0f8db1dc | |
parent | e1ce756844e684876318570dcebc74bc66c084f0 (diff) | |
download | ffmpeg-729f90e26802057f06905ab15a34612168eeac80.tar.gz |
x86inc improvements for 64-bit
Add support for all x86-64 registers
Prefer caller-saved registers over callee-saved registers on WIN64
Support up to 15 function arguments
Also (by Ronald S. Bultje):
Fix up our asm to work with the new x86inc.asm.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
-rw-r--r-- | libavcodec/x86/dsputil_yasm.asm | 36 | ||||
-rw-r--r-- | libavcodec/x86/fft_mmx.asm | 25 | ||||
-rw-r--r-- | libavcodec/x86/fmtconvert.asm | 6 | ||||
-rw-r--r-- | libavcodec/x86/h264_chromamc.asm | 48 | ||||
-rw-r--r-- | libavcodec/x86/h264_deblock.asm | 60 | ||||
-rw-r--r-- | libavcodec/x86/h264_idct.asm | 156 | ||||
-rw-r--r-- | libavcodec/x86/h264_idct_10bit.asm | 24 | ||||
-rw-r--r-- | libavcodec/x86/h264_intrapred.asm | 30 | ||||
-rw-r--r-- | libavcodec/x86/h264_qpel_10bit.asm | 20 | ||||
-rw-r--r-- | libavcodec/x86/h264_weight.asm | 14 | ||||
-rw-r--r-- | libavutil/x86/x86inc.asm | 218 | ||||
-rw-r--r-- | libswscale/x86/output.asm | 4 | ||||
-rw-r--r-- | libswscale/x86/scale.asm | 18 |
13 files changed, 318 insertions, 341 deletions
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 00dc18b469..bec4063260 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -497,9 +497,9 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset %macro EMU_EDGE_FUNC 0 %if ARCH_X86_64 -%define w_reg r10 -cglobal emu_edge_core, 6, 7, 1 - mov r11, r5 ; save block_h +%define w_reg r7 +cglobal emu_edge_core, 6, 9, 1 + mov r8, r5 ; save block_h %else %define w_reg r6 cglobal emu_edge_core, 2, 7, 0 @@ -536,7 +536,7 @@ cglobal emu_edge_core, 2, 7, 0 sub r0, w_reg %if ARCH_X86_64 mov r3, r0 ; backup of buf+block_h*linesize - mov r5, r11 + mov r5, r8 %else mov r0m, r0 ; backup of buf+block_h*linesize mov r5, r5m @@ -550,7 +550,7 @@ cglobal emu_edge_core, 2, 7, 0 ; FIXME we can do a if size == 1 here if that makes any speed difference, test me sar w_reg, 1 sal w_reg, 6 - ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs + ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h %ifdef PIC lea rax, [.emuedge_extend_left_2] @@ -560,7 +560,7 @@ cglobal emu_edge_core, 2, 7, 0 %endif call w_reg - ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w + ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w .right_extend: %if ARCH_X86_32 mov r0, r0m @@ -591,7 +591,7 @@ cglobal emu_edge_core, 2, 7, 0 %define vall al %define valh ah %define valw ax -%define valw2 r10w +%define valw2 r7w %define valw3 r3w %if WIN64 %define valw4 r4w @@ -618,7 +618,7 @@ cglobal emu_edge_core, 2, 7, 0 ; - else if (%2 & 8) fills 8 bytes into mm0 ; - if (%2 & 7 == 4) fills the last 4 bytes into rax ; - else if (%2 & 4) fills 4 bytes into mm0-1 -; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax +; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax ; (note that we're using r3 for body/bottom because it's a shorter ; opcode, 
and then the loop fits in 128 bytes) ; - else fills remaining bytes into rax @@ -848,7 +848,7 @@ ALIGN 64 %endrep %endmacro ; LEFT_EXTEND -; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val +; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val %macro RIGHT_EXTEND 0 %assign %%n 2 %rep 11 @@ -858,7 +858,7 @@ ALIGN 64 sub r3, r2 ; dst -= linesize READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels - dec r11 + dec r8 %else ; ARCH_X86_32 sub r0, r2 ; dst -= linesize READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels @@ -937,11 +937,11 @@ ALIGN 64 %macro SLOW_V_EXTEND 0 .slow_v_extend_loop: ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h -; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x +; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x %if ARCH_X86_64 - push r11 ; save old value of block_h + push r8 ; save old value of block_h test r3, r3 -%define cnt_reg r11 +%define cnt_reg r8 jz .do_body_copy ; if (!start_y) goto do_body_copy V_COPY_ROW top, r3 %else @@ -955,7 +955,7 @@ ALIGN 64 V_COPY_ROW body, r4 %if ARCH_X86_64 - pop r11 ; restore old value of block_h + pop r8 ; restore old value of block_h %define cnt_reg r3 %endif test r5, r5 @@ -974,7 +974,7 @@ ALIGN 64 %macro SLOW_LEFT_EXTEND 0 .slow_left_extend_loop: -; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x +; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x mov r4, 8 sub r0, linesize READ_V_PIXEL 8, [r0+w_reg] @@ -1002,11 +1002,11 @@ ALIGN 64 %macro SLOW_RIGHT_EXTEND 0 .slow_right_extend_loop: -; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h, -; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr +; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h, +; 
r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr %if ARCH_X86_64 %define buf_reg r3 -%define bh_reg r11 +%define bh_reg r8 %else %define buf_reg r0 %define bh_reg r5 diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index a2f26cca33..225c66635d 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -749,14 +749,11 @@ INIT_XMM %endmacro %macro DECL_IMDCT 2 -cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input +cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input %if ARCH_X86_64 -%define rrevtab r10 -%define rtcos r11 -%define rtsin r12 - push r12 - push r13 - push r14 +%define rrevtab r7 +%define rtcos r8 +%define rtsin r9 %else %define rrevtab r6 %define rtsin r6 @@ -798,12 +795,12 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample * %if ARCH_X86_64 movzx r5, word [rrevtab+r4-4] movzx r6, word [rrevtab+r4-2] - movzx r13, word [rrevtab+r3] - movzx r14, word [rrevtab+r3+2] + movzx r10, word [rrevtab+r3] + movzx r11, word [rrevtab+r3+2] movlps [r1+r5 *8], xmm0 movhps [r1+r6 *8], xmm0 - movlps [r1+r13*8], xmm1 - movhps [r1+r14*8], xmm1 + movlps [r1+r10*8], xmm1 + movhps [r1+r11*8], xmm1 add r4, 4 %else mov r6, [esp] @@ -839,11 +836,7 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample * mov r1, -mmsize sub r1, r0 %2 r0, r1, r6, rtcos, rtsin -%if ARCH_X86_64 - pop r14 - pop r13 - pop r12 -%else +%if ARCH_X86_64 == 0 add esp, 12 %endif %ifidn avx_enabled, 1 diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index 3f39c7e564..63befc94f6 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -179,9 +179,8 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2 %macro FLOAT_TO_INT16_INTERLEAVE6 1 ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) -cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 +cglobal 
float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len %if ARCH_X86_64 - %define lend r10d mov lend, r2d %else %define lend dword r2m @@ -240,9 +239,8 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2 ;----------------------------------------------------------------------------- %macro FLOAT_INTERLEAVE6 2 -cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 +cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len %if ARCH_X86_64 - %define lend r10d mov lend, r2d %else %define lend dword r2m diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm index 8b621fa8bb..64a4efe057 100644 --- a/libavcodec/x86/h264_chromamc.asm +++ b/libavcodec/x86/h264_chromamc.asm @@ -91,9 +91,22 @@ SECTION .text %endmacro %macro chroma_mc8_mmx_func 3 +%ifidn %2, rv40 +%ifdef PIC +%define rnd_1d_rv40 r8 +%define rnd_2d_rv40 r8 +%define extra_regs 2 +%else ; no-PIC +%define rnd_1d_rv40 rnd_rv40_1d_tbl +%define rnd_2d_rv40 rnd_rv40_2d_tbl +%define extra_regs 1 +%endif ; PIC +%else +%define extra_regs 0 +%endif ; rv40 ; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ; int stride, int h, int mx, int my) -cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 +cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0 %if ARCH_X86_64 movsxd r2, r2d %endif @@ -106,19 +119,12 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 .at_least_one_non_zero %ifidn %2, rv40 -%ifdef PIC -%define rnd_1d_rv40 r11 -%define rnd_2d_rv40 r11 -%else ; no-PIC -%define rnd_1d_rv40 rnd_rv40_1d_tbl -%define rnd_2d_rv40 rnd_rv40_2d_tbl -%endif %if ARCH_X86_64 - mov r10, r5 - and r10, 6 ; &~1 for mx/my=[0,7] - lea r10, [r10*4+r4] - sar r10d, 1 -%define rnd_bias r10 + mov r7, r5 + and r7, 6 ; &~1 for mx/my=[0,7] + lea r7, [r7*4+r4] + sar r7d, 1 +%define rnd_bias r7 %define dest_reg r0 %else ; x86-32 mov r0, r5 @@ -145,7 +151,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 %ifidn %2, rv40 %ifdef PIC - lea r11, [rnd_rv40_1d_tbl] + lea r8, 
[rnd_rv40_1d_tbl] %endif %if ARCH_X86_64 == 0 mov r5, r0m @@ -196,7 +202,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 movd m6, r5d ; y %ifidn %2, rv40 %ifdef PIC - lea r11, [rnd_rv40_2d_tbl] + lea r8, [rnd_rv40_2d_tbl] %endif %if ARCH_X86_64 == 0 mov r5, r0m @@ -278,7 +284,13 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 %endmacro %macro chroma_mc4_mmx_func 3 -cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 +%define extra_regs 0 +%ifidn %2, rv40 +%ifdef PIC +%define extra_regs 1 +%endif ; PIC +%endif ; rv40 +cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0 %if ARCH_X86_64 movsxd r2, r2d %endif @@ -296,8 +308,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 %ifidn %2, rv40 %ifdef PIC - lea r11, [rnd_rv40_2d_tbl] -%define rnd_2d_rv40 r11 + lea r6, [rnd_rv40_2d_tbl] +%define rnd_2d_rv40 r6 %else %define rnd_2d_rv40 rnd_rv40_2d_tbl %endif diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 0f61922276..1982dc4bd3 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -328,11 +328,11 @@ cglobal deblock_v_luma_8_%1, 5,5,10 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal deblock_h_luma_8_%1, 5,7 - movsxd r10, r1d - lea r11, [r10+r10*2] +cglobal deblock_h_luma_8_%1, 5,9 + movsxd r7, r1d + lea r8, [r7+r7*2] lea r6, [r0-4] - lea r5, [r0-4+r11] + lea r5, [r0-4+r8] %if WIN64 sub rsp, 0x98 %define pix_tmp rsp+0x30 @@ -342,14 +342,14 @@ cglobal deblock_h_luma_8_%1, 5,7 %endif ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp + lea r6, [r6+r7*8] + lea r5, [r5+r7*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because 
deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %if WIN64 @@ -364,17 +364,17 @@ cglobal deblock_h_luma_8_%1, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) - shl r10, 3 - sub r6, r10 - sub r5, r10 - shr r10, 3 + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) %if WIN64 add rsp, 0x98 @@ -705,32 +705,32 @@ INIT_MMX ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_8_%1, 4,7 - movsxd r10, r1d - lea r11, [r10*3] +cglobal deblock_h_luma_intra_8_%1, 4,9 + movsxd r7, r1d + lea r8, [r7*3] lea r6, [r0-4] - lea r5, [r0-4+r11] + lea r5, [r0-4+r8] sub rsp, 0x88 %define pix_tmp rsp ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r7*8] + lea r5, [r5+r7*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) lea r0, [pix_tmp+0x40] mov r1, 0x10 call deblock_v_luma_intra_8_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r5, [r6+r11] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) - shl r10, 3 - sub r6, r10 - sub 
r5, r10 - shr r10, 3 - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) + lea r5, [r6+r8] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) add rsp, 0x88 RET %else diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 5e8c0edfa6..cc83806884 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -45,8 +45,10 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 db 4+13*8, 5+13*8, 4+14*8, 5+14*8 db 6+13*8, 7+13*8, 6+14*8, 7+14*8 %ifdef PIC -%define scan8 r11 +%define npicregs 1 +%define scan8 picregq %else +%define npicregs 0 %define scan8 scan8_mem %endif @@ -301,10 +303,10 @@ cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0 ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8_mmx, 5, 7, 0 +cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -323,13 +325,13 @@ cglobal h264_idct_add16_8_mmx, 5, 7, 0 ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8_mmx, 5, 7, 0 +cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg %assign pad 128+4-(stack_offset&7) SUB rsp, pad xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -355,10 +357,10 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0 ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8_mmx2, 5, 7, 0 
+cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -371,16 +373,13 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 test r6, r6 jz .no_dc DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 -%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d %endif - mov dst_regd, dword [r1+r5*4] - lea dst_reg, [r0+dst_reg] - DC_ADD_MMX2_OP movh, dst_reg, r3, r6 + mov dst2d, dword [r1+r5*4] + lea dst2q, [r0+dst2q] + DC_ADD_MMX2_OP movh, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -402,10 +401,10 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 +cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -425,10 +424,10 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 +cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -448,16 +447,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 test r6, r6 jz .skipblock DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 -%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d %endif - mov 
dst_regd, dword [r1+r5*4] - add dst_reg, r0 - DC_ADD_MMX2_OP movh, dst_reg, r3, r6 + mov dst2d, dword [r1+r5*4] + add dst2q, r0 + DC_ADD_MMX2_OP movh, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -470,13 +466,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 +cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg %assign pad 128+4-(stack_offset&7) SUB rsp, pad xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -489,18 +485,15 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 test r6, r6 jz .no_dc DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 -%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d -%endif - mov dst_regd, dword [r1+r5*4] - lea dst_reg, [r0+dst_reg] - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 - lea dst_reg, [dst_reg+r3*4] - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d +%endif + mov dst2d, dword [r1+r5*4] + lea dst2q, [r0+dst2q] + DC_ADD_MMX2_OP mova, dst2q, r3, r6 + lea dst2q, [dst2q+r3*4] + DC_ADD_MMX2_OP mova, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -533,10 +526,10 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 INIT_XMM ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8_sse2, 5, 7, 10 +cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif .nextblock movzx r6, byte [scan8+r5] @@ -550,18 +543,15 @@ cglobal h264_idct8_add4_8_sse2, 5, 7, 10 jz .no_dc INIT_MMX DC_ADD_MMX2_INIT r2, r3, r6 -%if ARCH_X86_64 -%define dst_reg r10 
-%define dst_regd r10d -%else -%define dst_reg r1 -%define dst_regd r1d -%endif - mov dst_regd, dword [r1+r5*4] - add dst_reg, r0 - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 - lea dst_reg, [dst_reg+r3*4] - DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%if ARCH_X86_64 == 0 +%define dst2q r1 +%define dst2d r1d +%endif + mov dst2d, dword [r1+r5*4] + add dst2q, r0 + DC_ADD_MMX2_OP mova, dst2q, r3, r6 + lea dst2q, [dst2q+r3*4] + DC_ADD_MMX2_OP mova, dst2q, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -572,9 +562,9 @@ INIT_MMX REP_RET .no_dc INIT_XMM - mov dst_regd, dword [r1+r5*4] - add dst_reg, r0 - IDCT8_ADD_SSE dst_reg, r2, r3, r6 + mov dst2d, dword [r1+r5*4] + add dst2q, r0 + IDCT8_ADD_SSE dst2q, r2, r3, r6 %if ARCH_X86_64 == 0 mov r1, r1m %endif @@ -595,7 +585,7 @@ h264_idct_add8_mmx_plane: jz .skipblock %if ARCH_X86_64 mov r0d, dword [r1+r5*4] - add r0, [r10] + add r0, [dst2q] %else mov r0, r1m ; XXX r1m here is actually r0m of the calling func mov r0, [r0] @@ -611,20 +601,20 @@ h264_idct_add8_mmx_plane: ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8_mmx, 5, 7, 0 +cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg mov r5, 16 add r2, 512 %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif %if ARCH_X86_64 - mov r10, r0 + mov dst2q, r0 %endif call h264_idct_add8_mmx_plane mov r5, 32 add r2, 384 %if ARCH_X86_64 - add r10, gprsize + add dst2q, gprsize %else add r0mp, gprsize %endif @@ -639,7 +629,7 @@ h264_idct_add8_mmx2_plane jz .try_dc %if ARCH_X86_64 mov r0d, dword [r1+r5*4] - add r0, [r10] + add r0, [dst2q] %else mov r0, r1m ; XXX r1m here is actually r0m of the calling func mov r0, [r0] @@ -658,7 +648,7 @@ h264_idct_add8_mmx2_plane DC_ADD_MMX2_INIT r2, r3, r6 %if ARCH_X86_64 mov r0d, dword [r1+r5*4] - add r0, [r10] + add r0, [dst2q] %else mov r0, r1m ; XXX r1m here is actually r0m of the 
calling func mov r0, [r0] @@ -674,20 +664,20 @@ h264_idct_add8_mmx2_plane ; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8_mmx2, 5, 7, 0 +cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg mov r5, 16 add r2, 512 %if ARCH_X86_64 - mov r10, r0 + mov dst2q, r0 %endif %ifdef PIC - lea r11, [scan8_mem] + lea picregq, [scan8_mem] %endif call h264_idct_add8_mmx2_plane mov r5, 32 add r2, 384 %if ARCH_X86_64 - add r10, gprsize + add dst2q, gprsize %else add r0mp, gprsize %endif @@ -739,7 +729,7 @@ x264_add8x4_idct_sse2: jz .cycle%1end mov r0d, dword [r1+%1*8] %if ARCH_X86_64 - add r0, r10 + add r0, r5 %else add r0, r0m %endif @@ -752,9 +742,9 @@ x264_add8x4_idct_sse2: ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8_sse2, 5, 5, 8 +cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8 %if ARCH_X86_64 - mov r10, r0 + mov r5, r0 %endif ; unrolling of the loop leads to an average performance gain of ; 20-25% @@ -774,7 +764,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 jz .try%1dc mov r0d, dword [r1+%1*8] %if ARCH_X86_64 - add r0, r10 + add r0, r7 %else add r0, r0m %endif @@ -786,7 +776,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 jz .cycle%1end mov r0d, dword [r1+%1*8] %if ARCH_X86_64 - add r0, r10 + add r0, r7 %else add r0, r0m %endif @@ -799,9 +789,9 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 +cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8 %if ARCH_X86_64 - mov r10, r0 + mov r7, r0 %endif add16intra_sse2_cycle 0, 0xc add16intra_sse2_cycle 1, 0x14 @@ -819,7 +809,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 jz .try%1dc %if ARCH_X86_64 
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] - add r0, [r10] + add r0, [r7] %else mov r0, r0m mov r0, [r0] @@ -833,7 +823,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 jz .cycle%1end %if ARCH_X86_64 mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] - add r0, [r10] + add r0, [r7] %else mov r0, r0m mov r0, [r0] @@ -850,15 +840,15 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8_sse2, 5, 7, 8 +cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8 add r2, 512 %if ARCH_X86_64 - mov r10, r0 + mov r7, r0 %endif add8_sse2_cycle 0, 0x34 add8_sse2_cycle 1, 0x3c %if ARCH_X86_64 - add r10, gprsize + add r7, gprsize %else add r0mp, gprsize %endif diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 501c2a4da1..934a7ff633 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -29,24 +29,6 @@ SECTION_RODATA pw_pixel_max: times 8 dw ((1 << 10)-1) pd_32: times 4 dd 32 -scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 - db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 - db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 - db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 - db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 - db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 - db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 - db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 - db 4+11*8, 5+11*8, 4+12*8, 5+12*8 - db 6+11*8, 7+11*8, 6+12*8, 7+12*8 - db 4+13*8, 5+13*8, 4+14*8, 5+14*8 - db 6+13*8, 7+13*8, 6+14*8, 7+14*8 - -%ifdef PIC -%define scan8 r11 -%else -%define scan8 scan8_mem -%endif SECTION .text @@ -315,9 +297,9 @@ IDCT_ADD16INTRA_10 avx ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- %macro IDCT_ADD8 1 -cglobal h264_idct_add8_10_%1,5,7,7 +cglobal h264_idct_add8_10_%1,5,8,7 %if ARCH_X86_64 - mov r10, r0 + mov r7, r0 %endif add r2, 1024 mov 
r0, [r0] @@ -325,7 +307,7 @@ cglobal h264_idct_add8_10_%1,5,7,7 ADD16_OP_INTRA %1, 18, 4+ 7*8 add r2, 1024-128*2 %if ARCH_X86_64 - mov r0, [r10+gprsize] + mov r0, [r7+gprsize] %else mov r0, r0m mov r0, [r0+gprsize] diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index c6b4386627..3beb3b9d6d 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -289,7 +289,7 @@ cglobal pred16x16_tm_vp8_sse2, 2,6,6 ;----------------------------------------------------------------------------- %macro H264_PRED16x16_PLANE 3 -cglobal pred16x16_plane_%3_%1, 2, 7, %2 +cglobal pred16x16_plane_%3_%1, 2, 9, %2 mov r2, r1 ; +stride neg r1 ; -stride @@ -349,7 +349,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 add r4, r2 %if ARCH_X86_64 -%define e_reg r11 +%define e_reg r8 %else %define e_reg r0 %endif @@ -370,8 +370,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 movzx e_reg, byte [r3 ] %if ARCH_X86_64 - movzx r10, byte [r4+r2 ] - sub r10, e_reg + movzx r7, byte [r4+r2 ] + sub r7, e_reg %else movzx r6, byte [r4+r2 ] sub r6, e_reg @@ -386,7 +386,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 movzx r6, byte [r3 ] sub r6, r4 %if ARCH_X86_64 - lea r6, [r10+r6*2] + lea r6, [r7+r6*2] lea r5, [r5+r6*2] add r5, r6 %else @@ -396,9 +396,9 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 movzx r4, byte [e_reg ] %if ARCH_X86_64 - movzx r10, byte [r3 +r2 ] - sub r10, r4 - sub r5, r10 + movzx r7, byte [r3 +r2 ] + sub r7, r4 + sub r5, r7 %else movzx r6, byte [r3 +r2 ] sub r6, r4 @@ -410,7 +410,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 movzx r6, byte [r3 +r2*2] sub r6, r4 %if ARCH_X86_64 - add r6, r10 + add r6, r7 %endif lea r5, [r5+r6*8] @@ -588,7 +588,7 @@ H264_PRED16x16_PLANE ssse3, 8, svq3 ;----------------------------------------------------------------------------- %macro H264_PRED8x8_PLANE 2 -cglobal pred8x8_plane_%1, 2, 7, %2 +cglobal pred8x8_plane_%1, 2, 9, %2 mov r2, r1 ; +stride neg r1 ; -stride @@ -642,7 +642,7 @@ cglobal 
pred8x8_plane_%1, 2, 7, %2 add r4, r2 %if ARCH_X86_64 -%define e_reg r11 +%define e_reg r8 %else %define e_reg r0 %endif @@ -653,9 +653,9 @@ cglobal pred8x8_plane_%1, 2, 7, %2 movzx e_reg, byte [r3 ] %if ARCH_X86_64 - movzx r10, byte [r4+r2 ] - sub r10, e_reg - sub r5, r10 + movzx r7, byte [r4+r2 ] + sub r7, e_reg + sub r5, r7 %else movzx r6, byte [r4+r2 ] sub r6, e_reg @@ -667,7 +667,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2 movzx r6, byte [r4+r2*2 ] sub r6, e_reg %if ARCH_X86_64 - add r6, r10 + add r6, r7 %endif lea r5, [r5+r6*4] diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index bdacf9f472..788d715d61 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -121,8 +121,8 @@ MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 %endmacro %macro MCAxA_OP 8 -cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 %if ARCH_X86_32 +cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 call stub_%2_h264_qpel%4_%3_10_%1 mov r0, r0m mov r1, r1m @@ -141,17 +141,19 @@ cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 call stub_%2_h264_qpel%4_%3_10_%1 RET %else ; ARCH_X86_64 - mov r10, r0 - mov r11, r1 +cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8 + mov r%7, r0 +%assign p1 %7+1 + mov r %+ p1, r1 call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r10+%4*2] - lea r1, [r11+%4*2] + lea r0, [r%7+%4*2] + lea r1, [r %+ p1+%4*2] call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r10+r2*%4] - lea r1, [r11+r2*%4] + lea r0, [r%7+r2*%4] + lea r1, [r %+ p1+r2*%4] call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r10+r2*%4+%4*2] - lea r1, [r11+r2*%4+%4*2] + lea r0, [r%7+r2*%4+%4*2] + lea r1, [r %+ p1+r2*%4+%4*2] %if UNIX64 == 0 ; fall through to function call stub_%2_h264_qpel%4_%3_10_%1 RET diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index 1c40e49eaa..22ce72d19f 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -127,7 +127,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2 %macro BIWEIGHT_SETUP 0 %if ARCH_X86_64 -%define off_regd 
r11d +%define off_regd r7d %else %define off_regd r3d %endif @@ -175,7 +175,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2 %endmacro INIT_MMX -cglobal h264_biweight_16_mmx2, 7, 7, 0 +cglobal h264_biweight_16_mmx2, 7, 8, 0 BIWEIGHT_SETUP movifnidn r3d, r3m .nextrow @@ -194,7 +194,7 @@ cglobal h264_biweight_16_mmx2, 7, 7, 0 REP_RET %macro BIWEIGHT_FUNC_MM 3 -cglobal h264_biweight_%1_%3, 7, 7, %2 +cglobal h264_biweight_%1_%3, 7, 8, %2 BIWEIGHT_SETUP movifnidn r3d, r3m .nextrow @@ -215,7 +215,7 @@ INIT_XMM BIWEIGHT_FUNC_MM 16, 8, sse2 %macro BIWEIGHT_FUNC_HALF_MM 3 -cglobal h264_biweight_%1_%3, 7, 7, %2 +cglobal h264_biweight_%1_%3, 7, 8, %2 BIWEIGHT_SETUP movifnidn r3d, r3m sar r3, 1 @@ -245,7 +245,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 %macro BIWEIGHT_SSSE3_SETUP 0 %if ARCH_X86_64 -%define off_regd r11d +%define off_regd r7d %else %define off_regd r3d %endif @@ -277,7 +277,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 %endmacro INIT_XMM -cglobal h264_biweight_16_ssse3, 7, 7, 8 +cglobal h264_biweight_16_ssse3, 7, 8, 8 BIWEIGHT_SSSE3_SETUP movifnidn r3d, r3m @@ -296,7 +296,7 @@ cglobal h264_biweight_16_ssse3, 7, 7, 8 REP_RET INIT_XMM -cglobal h264_biweight_8_ssse3, 7, 7, 8 +cglobal h264_biweight_8_ssse3, 7, 8, 8 BIWEIGHT_SSSE3_SETUP movifnidn r3d, r3m sar r3, 1 diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 7db1e9c311..ea9f9a1550 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -1,11 +1,12 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project +;* Copyright (C) 2005-2012 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <hengar-6@student.ltu.se> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose 
with or without fee is hereby granted, provided that the above @@ -95,6 +96,9 @@ default rel %endif +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPU amdnop + ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that @@ -128,18 +132,20 @@ ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size -%macro DECLARE_REG 6 +%macro DECLARE_REG 5-6 %define r%1q %2 %define r%1d %3 %define r%1w %4 %define r%1b %5 - %define r%1m %6 - %ifid %6 ; i.e. it's a register + %if %0 == 5 + %define r%1m %3 %define r%1mp %2 %elif ARCH_X86_64 ; memory - %define r%1mp qword %6 + %define r%1m [rsp + stack_offset + %6] + %define r%1mp qword r %+ %1m %else - %define r%1mp dword %6 + %define r%1m [esp + stack_offset + %6] + %define r%1mp dword r %+ %1m %endif %define r%1 %2 %endmacro @@ -187,7 +193,7 @@ DECLARE_REG_SIZE bp, bpl %endrep %endmacro -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if ARCH_X86_64 %define gprsize 8 @@ -205,6 +211,33 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 %assign stack_offset stack_offset-gprsize %endmacro +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + %macro SUB 2 sub %1, %2 %ifidn %1, rsp @@ -272,39 +305,34 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 %if WIN64 ; Windows x64 ;================================================= -DECLARE_REG 0, rcx, ecx, cx, cl, ecx -DECLARE_REG 1, rdx, edx, dx, dl, edx -DECLARE_REG 2, r8, r8d, r8w, r8b, r8d -DECLARE_REG 3, r9, r9d, r9w, r9b, r9d -DECLARE_REG 
4, rdi, edi, di, dil, [rsp + stack_offset + 40] -DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48] -DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] -%define r7m [rsp + stack_offset + 64] -%define r8m [rsp + stack_offset + 72] - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [rsp + stack_offset + 8 + %1*8] - %endif -%endmacro +DECLARE_REG 0, rcx, ecx, cx, cl +DECLARE_REG 1, rdx, edx, dx, dl +DECLARE_REG 2, R8, R8D, R8W, R8B +DECLARE_REG 3, R9, R9D, R9W, R9B +DECLARE_REG 4, R10, R10D, R10W, R10B, 40 +DECLARE_REG 5, R11, R11D, R11W, R11B, 48 +DECLARE_REG 6, rax, eax, ax, al, 56 +DECLARE_REG 7, rdi, edi, di, dil, 64 +DECLARE_REG 8, rsi, esi, si, sil, 72 +DECLARE_REG 9, rbx, ebx, bx, bl, 80 +DECLARE_REG 10, rbp, ebp, bp, bpl, 88 +DECLARE_REG 11, R12, R12D, R12W, R12B, 96 +DECLARE_REG 12, R13, R13D, R13W, R13B, 104 +DECLARE_REG 13, R14, R14D, R14W, R14B, 112 +DECLARE_REG 14, R15, R15D, R15W, R15B, 120 %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... 
- ASSERT %2 >= %1 + %assign num_args %1 %assign regs_used %2 - ASSERT regs_used <= 7 - %if regs_used > 4 - push r4 - push r5 - %assign stack_offset stack_offset+16 - %endif + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 %if mmsize == 8 %assign xmm_regs_used 0 %else WIN64_SPILL_XMM %3 %endif - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS %4 %endmacro @@ -312,12 +340,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 6 - sub rsp, (xmm_regs_used-6)*16+16 - %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 + SUB rsp, (xmm_regs_used-6)*16+16 %assign %%i xmm_regs_used %rep (xmm_regs_used-6) %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i + movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i %endrep %endif %endmacro @@ -327,7 +354,7 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %assign %%i xmm_regs_used %rep (xmm_regs_used-6) %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8] + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] %endrep add %1, (xmm_regs_used-6)*16+16 %endif @@ -341,15 +368,12 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp - %if regs_used > 4 - pop r5 - pop r4 - %endif + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 ret %endmacro %macro REP_RET 0 - %if regs_used > 4 || xmm_regs_used > 6 + %if regs_used > 7 || xmm_regs_used > 6 RET %else rep ret @@ -358,92 +382,80 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %elif ARCH_X86_64 ; *nix x64 ;============================================= -DECLARE_REG 0, rdi, edi, di, dil, edi -DECLARE_REG 1, rsi, esi, si, sil, esi -DECLARE_REG 2, rdx, edx, dx, dl, edx -DECLARE_REG 3, rcx, ecx, cx, cl, ecx -DECLARE_REG 4, r8, r8d, r8w, r8b, r8d -DECLARE_REG 5, r9, r9d, r9w, r9b, r9d -DECLARE_REG 
6, rax, eax, ax, al, [rsp + stack_offset + 8] -%define r7m [rsp + stack_offset + 16] -%define r8m [rsp + stack_offset + 24] - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [rsp - 40 + %1*8] - %endif -%endmacro +DECLARE_REG 0, rdi, edi, di, dil +DECLARE_REG 1, rsi, esi, si, sil +DECLARE_REG 2, rdx, edx, dx, dl +DECLARE_REG 3, rcx, ecx, cx, cl +DECLARE_REG 4, R8, R8D, R8W, R8B +DECLARE_REG 5, R9, R9D, R9W, R9B +DECLARE_REG 6, rax, eax, ax, al, 8 +DECLARE_REG 7, R10, R10D, R10W, R10B, 16 +DECLARE_REG 8, R11, R11D, R11W, R11B, 24 +DECLARE_REG 9, rbx, ebx, bx, bl, 32 +DECLARE_REG 10, rbp, ebp, bp, bpl, 40 +DECLARE_REG 11, R12, R12D, R12W, R12B, 48 +DECLARE_REG 12, R13, R13D, R13W, R13B, 56 +DECLARE_REG 13, R14, R14D, R14W, R14B, 64 +DECLARE_REG 14, R15, R15D, R15W, R15B, 72 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... - ASSERT %2 >= %1 - ASSERT %2 <= 7 - LOAD_IF_USED 6, %1 + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS %4 %endmacro %macro RET 0 + POP_IF_USED 14, 13, 12, 11, 10, 9 ret %endmacro %macro REP_RET 0 - rep ret + %if regs_used > 9 + RET + %else + rep ret + %endif %endmacro %else ; X86_32 ;============================================================== -DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4] -DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8] -DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12] -DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16] -DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] -DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] -DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] -%define r7m [esp + stack_offset + 32] -%define r8m [esp + stack_offset + 36] +DECLARE_REG 0, eax, eax, ax, al, 4 +DECLARE_REG 1, ecx, ecx, cx, cl, 8 +DECLARE_REG 2, edx, edx, dx, dl, 12 +DECLARE_REG 3, ebx, 
ebx, bx, bl, 16 +DECLARE_REG 4, esi, esi, si, null, 20 +DECLARE_REG 5, edi, edi, di, null, 24 +DECLARE_REG 6, ebp, ebp, bp, null, 28 %define rsp esp -%macro PUSH_IF_USED 1 ; reg_id - %if %1 < regs_used - push r%1 - %assign stack_offset stack_offset+4 - %endif -%endmacro - -%macro POP_IF_USED 1 ; reg_id - %if %1 < regs_used - pop r%1 - %endif +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep %endmacro -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [esp + stack_offset + 4 + %1*4] - %endif -%endmacro +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... - ASSERT %2 >= %1 + %assign num_args %1 %assign regs_used %2 - ASSERT regs_used <= 7 - PUSH_IF_USED 3 - PUSH_IF_USED 4 - PUSH_IF_USED 5 - PUSH_IF_USED 6 - LOAD_IF_USED 0, %1 - LOAD_IF_USED 1, %1 - LOAD_IF_USED 2, %1 - LOAD_IF_USED 3, %1 - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 + %if regs_used > 7 + %assign regs_used 7 + %endif + ASSERT regs_used >= num_args + PUSH_IF_USED 3, 4, 5, 6 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 DEFINE_ARGS %4 %endmacro %macro RET 0 - POP_IF_USED 6 - POP_IF_USED 5 - POP_IF_USED 4 - POP_IF_USED 3 + POP_IF_USED 6, 5, 4, 3 ret %endmacro @@ -464,8 +476,6 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endmacro %endif - - ;============================================================================= ; arch-independent part ;============================================================================= diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index 68dbf51b02..9b0b01253a 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -62,11 +62,11 @@ SECTION .text %define cntr_reg fltsizeq %define movsx mov %else -%define cntr_reg r11 +%define cntr_reg r7 %define movsx movsxd %endif -cglobal yuv2planeX_%1, %3, 7, %2, filter, fltsize, src, dst, w, dither, offset +cglobal 
yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset %if %1 == 8 || %1 == 9 || %1 == 10 pxor m6, m6 %endif ; %1 == 8/9/10 diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index f7ed45fcf3..d56e253afa 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -53,7 +53,7 @@ SECTION .text %ifnidn %3, X cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1 %else -cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize +cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize %endif %if ARCH_X86_64 movsxd wq, wd @@ -245,10 +245,9 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz %define dlt 0 %endif ; %4 ==/!= X4 %if ARCH_X86_64 - push r12 -%define srcq r11 -%define pos1q r10 -%define srcendq r12 +%define srcq r8 +%define pos1q r7 +%define srcendq r9 movsxd fltsizeq, fltsized ; filterSize lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] %else ; x86-32 @@ -388,16 +387,7 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz add wq, 2 %endif ; %3 ==/!= X jl .loop -%ifnidn %3, X REP_RET -%else ; %3 == X -%if ARCH_X86_64 - pop r12 - RET -%else ; x86-32 - REP_RET -%endif ; x86-32/64 -%endif ; %3 ==/!= X %endmacro ; SCALE_FUNCS source_width, intermediate_nbits, n_xmm |