author      Ronald S. Bultje <rsbultje@gmail.com>    2015-10-06 08:38:17 -0400
committer   Ronald S. Bultje <rsbultje@gmail.com>    2015-10-07 08:55:19 -0400
commit      ce7872903345c8637c2cfc4245cba4b34f6bbe40 (patch)
tree        d0fe941d262f15c3605685a661409c23a50791c7 /libavcodec/x86/vp9intrapred_16bpp.asm
parent      b3b6665c605328174ef014ae42719d7ab462d0a2 (diff)
download    ffmpeg-ce7872903345c8637c2cfc4245cba4b34f6bbe40.tar.gz
vp9: don't keep a stack pointer if we don't need it.
This saves one register in a few cases on 32-bit builds with an unaligned
stack (e.g. MSVC), making the code slightly easier to maintain.
(Can someone please test this on 32-bit MSVC and confirm that make fate-vp9
and tests/checkasm/checkasm still pass after this patch?)
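
Background on the idiom (a sketch, not part of this commit): in x86inc.asm, the fifth cglobal parameter is the stack size, and passing a negative value requests the same amount of aligned scratch space while asking x86inc not to reserve an extra GPR for the original stack pointer; the old rsp is instead spilled into the allocated stack area. On targets whose ABI already guarantees alignment the sign makes no difference, which is why the %if ARCH_X86_64 || HAVE_ALIGNED_STACK forks removed below become unnecessary. A minimal illustrative function (the name and body are invented, and the include assumes the FFmpeg build environment):

    %include "libavutil/x86/x86util.asm"

    SECTION .text

    INIT_XMM sse2
    ; -2 * mmsize: 32 bytes of scratch; the minus sign tells x86inc not to
    ; pin a GPR to the original (possibly unaligned) stack pointer -- it is
    ; stored in the allocated stack area instead, freeing one register.
    cglobal example_scratch_copy, 2, 2, 2, -2 * mmsize, dst, src
        mova                m0, [srcq]
        mova   [rsp+0*mmsize], m0     ; rsp is still mmsize-aligned here
        mova                m1, [rsp+0*mmsize]
        mova            [dstq], m1
        RET

A related consequence is visible in the hu_32x32 hunk below: the unaligned-stack path can no longer alias cntd to the stack argument (dword r0m), so the function uniformly uses 7 GPRs and keeps cnt in a register.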
Diffstat (limited to 'libavcodec/x86/vp9intrapred_16bpp.asm')
-rw-r--r--   libavcodec/x86/vp9intrapred_16bpp.asm   20
1 file changed, 5 insertions, 15 deletions
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 3653469d9f..c0ac16d3eb 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -601,7 +601,7 @@ cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
     jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
 
 INIT_XMM sse2
-cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
     mova                    m0, [pw_1023]
 .body:
     pxor                    m1, m1
@@ -655,7 +655,7 @@ cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
     jge .loop
     RET
 
-cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
     mova                    m0, [pw_4095]
     jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
 
@@ -945,7 +945,7 @@ cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
     RET
 
 cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
-                               %1 * ARCH_X86_32 * mmsize, dst, stride, l, a
+                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
     mova                    m0, [aq+mmsize*3]   ; a[24-31]
     movu                    m1, [aq+mmsize*3-2] ; a[23-30]
     psrldq                  m2, m0, 2           ; a[25-31].
@@ -1634,13 +1634,8 @@ cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
     jg .loop
     RET
 
-%if ARCH_X86_64 || HAVE_ALIGNED_STACK
 cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
-                               %1 * mmsize * ARCH_X86_32, dst, stride, l, a
-%else
-cglobal vp9_ipred_hu_32x32_16, 3, 6, 10 + notcpuflag(ssse3), \
-                               %1 * mmsize * ARCH_X86_32, dst, stride, l, a
-%endif
+                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
     mova                    m2, [lq+mmsize*0+0]
     movu                    m1, [lq+mmsize*0+2]
     movu                    m0, [lq+mmsize*0+4]
@@ -1671,12 +1666,7 @@ cglobal vp9_ipred_hu_32x32_16, 3, 6, 10 + notcpuflag(ssse3), \
     SBUTTERFLY              wd, 7, 6, 0
     pshufd                  m1, m1, q3333
     UNSCRATCH                0, 9, rsp+1*mmsize
-%if ARCH_X86_64 || HAVE_ALIGNED_STACK
     DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
-%else
-    DEFINE_ARGS dst, stride, stride3, stride4, stride20, stride28
-%define cntd dword r0m
-%endif
     lea               stride3q, [strideq*3]
     lea               stride4q, [strideq*4]
     lea              stride28q, [stride4q*8]
@@ -1902,7 +1892,7 @@ cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
     RET
 
 cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
-                               10 * mmsize * ARCH_X86_32, dst, stride, l, a
+                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a
     mova                    m2, [lq+mmsize*0+0]
     movu                    m1, [lq+mmsize*0+2]
     movu                    m0, [lq+mmsize*0+4]
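
For reference, a rough sketch of the prologues x86inc emits in the unaligned-stack case (a hand-written illustration, not generated output; the exact instructions, scratch register choice, and padding depend on the x86inc version):

    ; stack_size > 0: the original rsp stays pinned in rstk for the whole
    ; function, costing one GPR:
    ;     mov   rstk, rsp
    ;     and   rsp, ~(mmsize - 1)
    ;     sub   rsp, stack_size_padded
    ;     ...
    ;     mov   rsp, rstk
    ;
    ; stack_size < 0: the original rsp is saved into the allocated area, so
    ; every GPR is available to the function body:
    ;     mov   r5, rsp                  ; scratch use, prologue/epilogue only
    ;     and   rsp, ~(mmsize - 1)
    ;     sub   rsp, stack_size_padded
    ;     mov   [rsp + stack_size], r5
    ;     ...
    ;     mov   rsp, [rsp + stack_size]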