aboutsummaryrefslogtreecommitdiffstats
path: root/libavutil/x86
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2013-06-18 21:30:43 +0000
committerLuca Barbato <lu_zero@gentoo.org>2013-06-29 13:23:57 +0200
commitb545179fdff1ccfbbb9d422e4e9720cb6c6d9191 (patch)
tree0476bc87fd03fd512c49103a36ff05681a000e00 /libavutil/x86
parent502ab21af0ca68f76d6112722c46d2f35c004053 (diff)
downloadffmpeg-b545179fdff1ccfbbb9d422e4e9720cb6c6d9191.tar.gz
x86: lpc: simd av_evaluate_lls
1.5x-1.8x faster on sandybridge

Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
Diffstat (limited to 'libavutil/x86')
-rw-r--r--libavutil/x86/lls.asm38
-rw-r--r--libavutil/x86/lls_init.c3
2 files changed, 41 insertions, 0 deletions
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index 92c00fcda1..92b7f955c2 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -194,3 +194,41 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
jle .loop2x1
.ret:
REP_RET
+
+
+INIT_XMM sse2
+;-----------------------------------------------------------------------------
+; double evaluate_lls(LLSModel *ctx, double *var, int order)
+;
+; Returns sum(var[k] * coeff[order][k]) for k = 0..order, where coeff[order]
+; is the row of LLSModel.coeff at element offset order*MAX_VARS.
+;
+; Preconditions:
+;   - order >= 3: elements 0..1 and one loop iteration (elements 2..3) are
+;     processed unconditionally, so at least four doubles are always read.
+;   - every coefficient row is 16-byte aligned, because the coefficients are
+;     used directly as memory operands of mulpd.
+;     NOTE(review): this relies on the alignment of ctx and on MAX_VARS*8
+;     being a multiple of 16 - confirm against the LLSModel definition.
+;   - var only needs the natural alignment of a double (see below).
+;
+; num_args is 3 so that the prologue also loads `order`; with 2, orderd would
+; be an uninitialised register on x86_32, where arguments live on the stack.
+;-----------------------------------------------------------------------------
+cglobal evaluate_lls, 3,4,2, ctx, var, order, i
+    ; This function is often called on the same buffer as update_lls, but with
+    ; an offset. They can't both be aligned.
+    ; Load halves rather than movu to avoid store-forwarding stalls, since the
+    ; input was initialized immediately prior to this function using scalar math.
+    %define coefsq ctxq                     ; ctx is dead once the row address is formed
+    mov     id, orderd                      ; i = order (kept before order is scaled)
+    imul    orderd, MAX_VARS                ; element offset of row `order`
+    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8]
+    ; m0 = { var[0]*coefs[0], var[1]*coefs[1] }
+    movsd   m0, [varq]
+    movhpd  m0, [varq + 8]
+    mulpd   m0, [coefsq]
+    ; Bias both pointers by `order` elements and count i upwards from
+    ; 2 - order, so [ptr + i*8] addresses element 2 on the first iteration
+    ; and the flags of `add iq, 2` double as the loop condition.
+    lea     coefsq, [coefsq + iq*8]
+    lea     varq, [varq + iq*8]
+    neg     iq
+    add     iq, 2
+.loop:
+    ; accumulate two products per iteration
+    movsd   m1, [varq + iq*8]
+    movhpd  m1, [varq + iq*8 + 8]
+    mulpd   m1, [coefsq + iq*8]
+    addpd   m0, m1
+    add     iq, 2
+    jl .loop                                ; i < 0: at least one full pair remains
+    jg .skip1                               ; i > 0: element `order` was already consumed
+    ; i == 0: exactly one element (index `order`) is left.
+    ; movsd from memory clears the upper half of m1, so addpd leaves the
+    ; high lane of the accumulator unchanged.
+    movsd   m1, [varq + iq*8]
+    mulsd   m1, [coefsq + iq*8]
+    addpd   m0, m1
+.skip1:
+    ; horizontal add: low lane of m0 += high lane of m0
+    movhlps m1, m0
+    addsd   m0, m1
+%if ARCH_X86_32
+    ; On x86_32 the double is handed back on the x87 stack: bounce the result
+    ; through the (no longer needed) argument slots to move it from xmm to st0.
+    movsd  r0m, m0
+    fld   qword r0m
+%endif
+    RET
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 8a80f83002..888bc54a39 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,12 +25,15 @@
void ff_update_lls_sse2(LLSModel *m, double *var);
void ff_update_lls_avx(LLSModel *m, double *var);
+double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);
av_cold void ff_init_lls_x86(LLSModel *m)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
m->update_lls = ff_update_lls_sse2;
+ if (m->indep_count >= 4)
+ m->evaluate_lls = ff_evaluate_lls_sse2;
}
if (EXTERNAL_AVX(cpu_flags)) {
m->update_lls = ff_update_lls_avx;