aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/dsputil_mmx.c
diff options
context:
space:
mode:
author Loren Merritt <lorenm@u.washington.edu> 2009-12-05 15:09:10 +0000
committer Loren Merritt <lorenm@u.washington.edu> 2009-12-05 15:09:10 +0000
commit b1159ad92818cd8f0885d252b0800f5960fe7241 (patch)
tree a9d4177c61a9a89b4ac78a4a5b8a95f962a858a0 /libavcodec/x86/dsputil_mmx.c
parent e470691aa8798004bf5589871865a765cb791014 (diff)
download ffmpeg-b1159ad92818cd8f0885d252b0800f5960fe7241.tar.gz
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/dsputil_mmx.c')
-rw-r--r-- libavcodec/x86/dsputil_mmx.c 19
1 files changed, 9 insertions, 10 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 93d4af5565..66c3a00aa9 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2384,12 +2384,11 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
-int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
-int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2951,9 +2950,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
if(mm_flags & FF_MM_MMX2){
#if HAVE_YASM
- c->add_int16 = ff_add_int16_mmx2;
- c->sub_int16 = ff_sub_int16_mmx2;
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
#endif
}
if(mm_flags & FF_MM_SSE){
@@ -2975,11 +2973,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->float_to_int16 = float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#if HAVE_YASM
- c->add_int16 = ff_add_int16_sse2;
- c->sub_int16 = ff_sub_int16_sse2;
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
#endif
}
+ if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
}
if (CONFIG_ENCODERS)