about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com> 2014-06-30 13:06:01 -0300
committer: Michael Niedermayer <michaelni@gmx.at> 2014-07-02 01:09:53 +0200
commit: 1a69224f44853f0a54c17a81449b63b61c3f2503 (patch)
tree: 92585842f242bf4bdbe5e1319aef80a82b0bbe8f
parent: a441a2437bc2663b05513bb223a8dfd08721a9ee (diff)
download: ffmpeg-1a69224f44853f0a54c17a81449b63b61c3f2503.tar.gz
x86/swr: add ff_resample_{common, linear}_float_fma
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- libswresample/x86/resample.asm 43
-rw-r--r-- libswresample/x86/resample_x86_dsp.c 43
2 files changed, 51 insertions, 35 deletions
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index bce1389bec..17f6169d71 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -179,17 +179,16 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+ fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
addp%4 m0, m0, m1
+%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
-%if cpuflag(avx)
- vextractf128 xm1, m0, 0x1
- addps xm0, xm1
-%endif
-
%ifidn %1, int16
%if mmsize == 16
pshufd m1, m0, q0032
@@ -206,6 +205,10 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
movd [dstq], m0
%else ; float/double
; horizontal sum & store
+%if mmsize == 32
+ vextractf128 xm1, m0, 0x1
+ addps xm0, xm1
+%endif
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
@@ -429,21 +432,19 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
paddd m2, m3
paddd m0, m1
%else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+ fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
+ fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
+%else
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
addp%4 m2, m2, m3
addp%4 m0, m0, m1
+%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
-%if cpuflag(avx)
- vextractf128 xm1, m0, 0x1
- vextractf128 xm3, m2, 0x1
- addps xm0, xm1
- addps xm2, xm3
-%endif
-
%ifidn %1, int16
%if mmsize == 16
pshufd m3, m2, q0032
@@ -479,12 +480,22 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double
; val += (v2 - val) * (FELEML) frac / c->src_incr;
+%if mmsize == 32
+ vextractf128 xm1, m0, 0x1
+ vextractf128 xm3, m2, 0x1
+ addps xm0, xm1
+ addps xm2, xm3
+%endif
cvtsi2s%4 xm1, fracd
subp%4 xm2, xm0
mulp%4 xm1, xm4
shufp%4 xm1, xm1, q0000
+%if cpuflag(fma4) || cpuflag(fma3)
+ fmaddp%4 xm0, xm2, xm1, xm0
+%else
mulp%4 xm2, xm1
addp%4 xm0, xm2
+%endif ; cpuflag
; horizontal sum & store
movhlps xm1, xm0
@@ -564,6 +575,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
%if ARCH_X86_32
INIT_MMX mmxext
diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c
index 9049da6951..ff9f1ec83e 100644
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,30 +27,19 @@
#include "libswresample/resample.h"
-int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
+#define RESAMPLE_FUNCS(type, opt) \
+int ff_resample_common_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+ const uint8_t *src, int sz, int upd); \
+int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+ const uint8_t *src, int sz, int upd)
-int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
+RESAMPLE_FUNCS(int16, mmxext);
+RESAMPLE_FUNCS(int16, sse2);
+RESAMPLE_FUNCS(float, sse);
+RESAMPLE_FUNCS(float, avx);
+RESAMPLE_FUNCS(float, fma3);
+RESAMPLE_FUNCS(float, fma4);
+RESAMPLE_FUNCS(double, sse2);
void swresample_dsp_x86_init(ResampleContext *c)
{
@@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
}
+ if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
+ c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
+ c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
+ }
+ if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
+ c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
+ c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
+ }
}