author     Ronald S. Bultje <rsbultje@gmail.com>  2012-12-08 16:12:38 -0800
committer  Luca Barbato <lu_zero@gentoo.org>      2012-12-12 05:23:46 +0100
commit     6f40e9f070f7a6ccf745561409ddbcc2be5e47e5
tree       488441177422bcdb279f5f95337b647a68f31a43 /libavcodec
parent     14758e3211d34a97c42b07acae117ce5627d7f57
download   ffmpeg-6f40e9f070f7a6ccf745561409ddbcc2be5e47e5.tar.gz
x86inc: support stack mem allocation and re-alignment in PROLOGUE
Use this in the VP8 and 8-bit H.264 loopfilter functions so they can be
used when no aligned stack is available (e.g. 32-bit MSVC or ICC 10.x).
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
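The heart of the change is easiest to see in a single converted function. Previously, every x86-32 function that needed scratch memory computed an alignment pad from stack_offset and adjusted esp by hand; with this patch it declares the requirement as a new fourth numeric argument to cglobal (after the argument, GPR and XMM counts), and the PROLOGUE reserves the space, re-aligning the stack pointer first when the ABI does not guarantee 16-byte alignment. A minimal before/after sketch, condensed from the deblock_%1_luma_8 hunk below (function bodies elided):

; before: manual pad, correct only if the incoming stack is aligned
cglobal deblock_%1_luma_8, 5,5
    %assign pad 2*%2+12-(stack_offset&15)
    SUB     esp, pad           ; reserve scratch space by hand
    ; ... body addresses its temporaries relative to esp ...
    ADD     esp, pad           ; undo before every return
    RET

; after: args, GPRs, XMMs, stack bytes; the PROLOGUE allocates and,
; where the ABI gives no alignment guarantee, re-aligns the stack
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    ; ... body addresses the PROLOGUE-allocated scratch space ...
    RET                        ; stack is restored automatically

The VP8 macros pass the size as a negative value (stack_size mmsize * -4 and similar). In the companion x86inc.asm change (not shown here, since this diffstat is limited to libavcodec), the sign appears to act as a hint: a negative size lets the saved stack pointer live in stack memory instead of occupying a general-purpose register, which suits these register-starved x86-32 functions. Because allocation and re-alignment are now handled centrally, the HAVE_ALIGNED_STACK guards around the function-pointer assignments in h264dsp_init.c and vp8dsp_init.c below can be dropped.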
Diffstat (limited to 'libavcodec')
-rw-r--r--  libavcodec/x86/h264_deblock.asm  27
-rw-r--r--  libavcodec/x86/h264dsp_init.c     4
-rw-r--r--  libavcodec/x86/vp8dsp.asm        68
-rw-r--r--  libavcodec/x86/vp8dsp_init.c      8
4 files changed, 42 insertions, 65 deletions
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index c124c4daa0..1f1dbc6a2f 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -398,14 +398,12 @@ DEBLOCK_LUMA
 ;-----------------------------------------------------------------------------
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_8, 5,5
+cglobal deblock_%1_luma_8, 5,5,8,2*%2
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4
     dec     r3     ; beta-1
     add     r4, r0 ; pix-3*stride
-    %assign pad 2*%2+12-(stack_offset&15)
-    SUB     esp, pad
 
     mova    m0, [r4+r1]   ; p1
     mova    m1, [r4+2*r1] ; p0
@@ -443,22 +441,19 @@ cglobal deblock_%1_luma_8, 5,5
     DEBLOCK_P0_Q0
     mova    [r4+2*r1], m1
     mova    [r0], m2
-    ADD     esp, pad
     RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma_8, 0,5
+cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
     sub    r0, 4
     lea    r1, [r0+r4]
-    %assign pad 0x78-(stack_offset&15)
-    SUB    esp, pad
-%define pix_tmp esp+12
+%define pix_tmp esp+12*HAVE_ALIGNED_STACK
 
     ; transpose 6x16 -> tmp space
     TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
@@ -500,7 +495,6 @@ cglobal deblock_h_luma_8, 0,5
     movq   m3, [pix_tmp+0x48]
     TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
 
-    ADD    esp, pad
     RET
 %endmacro ; DEBLOCK_LUMA
@@ -631,7 +625,7 @@ DEBLOCK_LUMA v, 16
     %define mpb_0 m14
     %define mpb_1 m15
 %else
-    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
+    %define spill(x) [esp+16*x]
     %define p2 [r4+r1]
     %define q2 [r0+2*r1]
     %define t4 spill(0)
@@ -646,10 +640,7 @@ DEBLOCK_LUMA v, 16
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_intra_8, 4,6,16
-%if ARCH_X86_64 == 0
-    sub     esp, 0x60
-%endif
+cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
     lea     r4, [r1*4]
     lea     r5, [r1*3] ; 3*stride
     dec     r2d     ; alpha-1
@@ -698,9 +689,6 @@ cglobal deblock_%1_luma_intra_8, 4,6,16
     LUMA_INTRA_SWAP_PQ
     LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
 .end:
-%if ARCH_X86_64 == 0
-    add     esp, 0x60
-%endif
     RET
 
 INIT_MMX cpuname
@@ -737,12 +725,10 @@ cglobal deblock_h_luma_intra_8, 4,9
     add  rsp, 0x88
     RET
 %else
-cglobal deblock_h_luma_intra_8, 2,4
+cglobal deblock_h_luma_intra_8, 2,4,8,0x80
     lea    r3,  [r1*3]
     sub    r0,  4
    lea    r2,  [r0+r3]
-%assign pad 0x8c-(stack_offset&15)
-    SUB    rsp, pad
 %define pix_tmp rsp
 
     ; transpose 8x16 -> tmp space
@@ -773,7 +759,6 @@ cglobal deblock_h_luma_intra_8, 2,4
     lea    r0,  [r0+r1*8]
     lea    r2,  [r2+r1*8]
     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
-    ADD    rsp, pad
     RET
 %endif ; ARCH_X86_64
 %endmacro ; DEBLOCK_LUMA_INTRA
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index ac231cbb08..73d4990899 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -275,18 +275,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
             c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
 
-#if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
             c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
-#endif /* HAVE_ALIGNED_STACK */
         }
         if (EXTERNAL_SSSE3(mm_flags)) {
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
             c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
         }
-        if (EXTERNAL_AVX(mm_flags) && HAVE_ALIGNED_STACK) {
+        if (EXTERNAL_AVX(mm_flags)) {
             c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
             c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index d732bf45eb..af8403e991 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1631,28 +1631,31 @@ SIMPLE_LOOPFILTER h, 5
 ;-----------------------------------------------------------------------------
 
 %macro INNER_LOOPFILTER 2
+%define stack_size 0
+%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%ifidn %1, v ;               [3]=hev() result
+%define stack_size mmsize * -4
+%else ; h    ; extra storage space for transposes
+%define stack_size mmsize * -5
+%endif
+%endif
+
 %if %2 == 8 ; chroma
-cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
+cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
 %else ; luma
-cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
+cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
 %endif
 
 %if cpuflag(ssse3)
     pxor             m7, m7
 %endif
-%ifndef m8        ; stack layout: [0]=E, [1]=I, [2]=hev_thr
-%ifidn %1, v      ;               [3]=hev() result
-%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
-%else ; h         ; extra storage space for transposes
-%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
-%endif
+
+%ifndef m8
     ; splat function arguments
     SPLATB_REG       m0, flimEq, m7 ; E
     SPLATB_REG       m1, flimIq, m7 ; I
     SPLATB_REG       m2, hevthrq, m7 ; hev_thresh
-    SUB             rsp, pad
-
 %define m_flimE [rsp]
 %define m_flimI [rsp+mmsize]
 %define m_hevthr [rsp+mmsize*2]
@@ -2082,12 +2085,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
     dec          cntrq
     jg .next8px
 %endif
-%endif
-
-%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    ADD            rsp, pad
-%endif
+    REP_RET
+%else ; mmsize == 16
     RET
+%endif
 %endmacro
 
 %if ARCH_X86_32
@@ -2122,31 +2123,34 @@ INNER_LOOPFILTER h,  8
 ;-----------------------------------------------------------------------------
 
 %macro MBEDGE_LOOPFILTER 2
-%if %2 == 8 ; chroma
-cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
-%else ; luma
-cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
-%endif
-
-%if cpuflag(ssse3)
-    pxor             m7, m7
-%endif
+%define stack_size 0
 %ifndef m8        ; stack layout: [0]=E, [1]=I, [2]=hev_thr
 %if mmsize == 16  ;               [3]=hev() result
                   ;               [4]=filter tmp result
                   ;               [5]/[6] = p2/q2 backup
                   ;               [7]=lim_res sign result
-%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
+%define stack_size mmsize * -7
 %else ; 8         ; extra storage space for transposes
-%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
+%define stack_size mmsize * -8
+%endif
 %endif
+
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
+%else ; luma
+cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
+%endif
+
+%if cpuflag(ssse3)
+    pxor             m7, m7
+%endif
+
+%ifndef m8
     ; splat function arguments
     SPLATB_REG       m0, flimEq, m7 ; E
     SPLATB_REG       m1, flimIq, m7 ; I
     SPLATB_REG       m2, hevthrq, m7 ; hev_thresh
-    SUB             rsp, pad
-
 %define m_flimE [rsp]
 %define m_flimI [rsp+mmsize]
 %define m_hevthr [rsp+mmsize*2]
@@ -2740,12 +2744,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
     dec          cntrq
     jg .next8px
 %endif
-%endif
-
-%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    ADD            rsp, pad
-%endif
+    REP_RET
+%else ; mmsize == 16
     RET
+%endif
 %endmacro
 
 %if ARCH_X86_32
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index bf5463f080..a9d3541807 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -390,13 +390,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
 
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
 
         c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
         c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
-#endif
     }
 
     if (mm_flags & AV_CPU_FLAG_SSE2) {
@@ -404,13 +402,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
 
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
 
         c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
         c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
-#endif
     }
 
     if (mm_flags & AV_CPU_FLAG_SSSE3) {
@@ -424,7 +420,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
 
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
@@ -434,17 +429,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
         c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
         c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
-#endif
     }
 
     if (mm_flags & AV_CPU_FLAG_SSE4) {
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
 
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
         c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
-#endif
     }
 #endif /* HAVE_YASM */
 }