refactor and optimize scalarproduct

29-105% faster apply_filter, 6-90% faster APE decoding on Core 2. (Any x86 CPU other than Core 2 probably gains much less, since this is mostly due to SSSE3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster APE decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Loren Merritt <lorenm@u.washington.edu> 2009-12-05 15:09:10 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2009-12-05 15:09:10 +0000
commit: b1159ad92818cd8f0885d252b0800f5960fe7241 (patch)
tree: a9d4177c61a9a89b4ac78a4a5b8a95f962a858a0 /libavcodec/dsputil.c
parent: e470691aa8798004bf5589871865a765cb791014 (diff)
download: ffmpeg-b1159ad92818cd8f0885d252b0800f5960fe7241.tar.gz
1 files changed, 11 insertions, 14 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index a04b8a400a..ffa8cecd4a 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4298,18 +4298,6 @@ void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, i
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-       *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
@@ -4320,6 +4308,16 @@ static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4848,9 +4846,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
author	Loren Merritt <lorenm@u.washington.edu>	2009-12-05 15:09:10 +0000
committer	Loren Merritt <lorenm@u.washington.edu>	2009-12-05 15:09:10 +0000
commit	b1159ad92818cd8f0885d252b0800f5960fe7241 (patch)
tree	a9d4177c61a9a89b4ac78a4a5b8a95f962a858a0 /libavcodec/dsputil.c
parent	e470691aa8798004bf5589871865a765cb791014 (diff)
download	ffmpeg-b1159ad92818cd8f0885d252b0800f5960fe7241.tar.gz