about summary refs log tree commit diff stats
path: root/libswscale/x86/swscale_mmx.c
diff options
context:
space:
mode:
author Ronald S. Bultje <rsbultje@gmail.com> 2011-09-13 09:53:42 -0700
committer Ronald S. Bultje <rsbultje@gmail.com> 2011-09-13 09:53:42 -0700
commit e0c3e0738757a92c2910bac83f2ef830b428ba11 (patch)
tree 4947bcc2e79f0b9393c12be1d9a6d4bba77dc049 /libswscale/x86/swscale_mmx.c
parent 3ed78609438af404d6738cc3bb3eefae93a7c2d4 (diff)
download ffmpeg-e0c3e0738757a92c2910bac83f2ef830b428ba11.tar.gz
sws: implement MMX/SSE2/SSSE3/SSE4 versions for horizontal scaling.
Speed: from 3.9x to 9.6x speed improvement over C, and some small (up to 15%) speed improvements over existing MMX code (particularly for bigger filters).
Diffstat (limited to 'libswscale/x86/swscale_mmx.c')
-rw-r--r-- libswscale/x86/swscale_mmx.c 86
1 files changed, 86 insertions, 0 deletions
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index f855a75212..dd7aea1492 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -176,6 +176,41 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
}
}
+#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) /* declares one external routine named ff_hscale<from_bpc>to<to_bpc>_<filter_n>_<opt>, built by token pasting */ \
+extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
+ SwsContext *c, int16_t *data, \
+ int dstW, const uint8_t *src, \
+ const int16_t *filter, \
+ const int16_t *filterPos, int filterSize); /* NOTE(review): the expansion already ends in ';' and every use appends another one, leaving an empty declaration at file scope */
+
+#define SCALE_FUNCS(filter_n, opt) /* declares the eight from/to combinations (from 8, 9, 10, 16; to 15, 19) for one filter_n and one opt suffix */ \
+ SCALE_FUNC(filter_n, 8, 15, opt); \
+ SCALE_FUNC(filter_n, 9, 15, opt); \
+ SCALE_FUNC(filter_n, 10, 15, opt); \
+ SCALE_FUNC(filter_n, 16, 15, opt); \
+ SCALE_FUNC(filter_n, 8, 19, opt); \
+ SCALE_FUNC(filter_n, 9, 19, opt); \
+ SCALE_FUNC(filter_n, 10, 19, opt); \
+ SCALE_FUNC(filter_n, 16, 19, opt)
+
+#define SCALE_FUNCS_MMX(opt) /* MMX-style set: declares the 4, 8 and X filter_n variants for the given opt suffix */ \
+ SCALE_FUNCS(4, opt); \
+ SCALE_FUNCS(8, opt); \
+ SCALE_FUNCS(X, opt)
+
+#define SCALE_FUNCS_SSE(opt) /* SSE-style set: declares the 4, 8, X4 and X8 filter_n variants (two generic variants instead of the single X used for MMX) */ \
+ SCALE_FUNCS(4, opt); \
+ SCALE_FUNCS(8, opt); \
+ SCALE_FUNCS(X4, opt); \
+ SCALE_FUNCS(X8, opt)
+
+#if ARCH_X86_32 /* the mmx prototypes are only declared when ARCH_X86_32 is non-zero */
+SCALE_FUNCS_MMX(mmx);
+#endif
+SCALE_FUNCS_SSE(sse2); /* prototypes for every ff_hscale*_{4,8,X4,X8}_sse2 routine */
+SCALE_FUNCS_SSE(ssse3); /* same set with the ssse3 suffix */
+SCALE_FUNCS_SSE(sse4); /* same set with the sse4 suffix */
+
void ff_sws_init_swScale_mmx(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -186,4 +221,55 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
if (cpu_flags & AV_CPU_FLAG_MMX2)
sws_init_swScale_MMX2(c);
#endif
+
+#if HAVE_YASM
+#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { /* stores into hscalefn the routine selected by c->srcBpc and c->dstBpc; filtersize is pasted into the name as a literal token, and 'c' must be in scope where the macro is expanded */ \
+ if (c->srcBpc == 8) { \
+ hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : /* dstBpc <= 10 selects the to15 routine, which takes the opt2 suffix */ \
+ ff_hscale8to19_ ## filtersize ## _ ## opt1; /* otherwise the to19 routine, which takes the opt1 suffix */ \
+ } else if (c->srcBpc == 9) { \
+ hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale9to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 10) { \
+ hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale10to19_ ## filtersize ## _ ## opt1; \
+ } else /* c->srcBpc == 16 is expected here; any value other than 8, 9 or 10 also takes this branch */ { \
+ hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale16to19_ ## filtersize ## _ ## opt1; \
+ } \
+} while (0)
+#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) /* maps the run-time filtersize onto the name token 4, 8 or X and assigns the matching routine; expands to a bare switch, not a do/while(0) wrapper */ \
+ switch (filtersize) { \
+ case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+ case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+ default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; /* every size other than 4 or 8 uses the X routine */ \
+ }
+#if ARCH_X86_32 /* same guard as the mmx prototypes: the mmx routines are only referenced when ARCH_X86_32 is non-zero */
+ if (cpu_flags & AV_CPU_FLAG_MMX) {
+ ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); /* sets c->hyScale from c->hLumFilterSize */
+ ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); /* sets c->hcScale from c->hChrFilterSize */
+ }
+#endif
+#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) /* like the MMX selector, but sizes other than 4 and 8 are split between the X4 and X8 routines; expands to a bare switch */ \
+ switch (filtersize) { \
+ case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+ case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+ default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); /* bit 2 of filtersize set: X4 routine */ \
+ else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); /* bit 2 clear: X8 routine; presumably filtersize is a multiple of 4 here - TODO confirm */ \
+ break; \
+ }
+ if (cpu_flags & AV_CPU_FLAG_SSE2) { /* these three checks are independent ifs, not else-ifs: when several flags are set, the assignment from the last matching block is the one that remains */
+ ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
+ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
+ }
+ if (cpu_flags & AV_CPU_FLAG_SSSE3) {
+ ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
+ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
+ }
+ if (cpu_flags & AV_CPU_FLAG_SSE4) {
+ /* The Xto15 routines don't need special sse4 versions, so opt2 is ssse3; only the Xto19 routines (opt1) use the sse4 suffix. */
+ ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
+ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+ }
+#endif
}