refactor and optimize scalarproduct

29-105% faster apply_filter and 6-90% faster APE decoding on Core 2. (Any x86 CPU other than Core 2 probably gains much less, since the speedup is mostly due to SSSE3 cache-split avoidance and I haven't written the full gamut of other cache-split modes.) 9-123% faster APE decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Loren Merritt <lorenm@u.washington.edu> 2009-12-05 15:09:10 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2009-12-05 15:09:10 +0000
commit: b1159ad92818cd8f0885d252b0800f5960fe7241 (patch)
tree: a9d4177c61a9a89b4ac78a4a5b8a95f962a858a0 /libavcodec/dsputil.h
parent: e470691aa8798004bf5589871865a765cb791014 (diff)
download: ffmpeg-b1159ad92818cd8f0885d252b0800f5960fe7241.tar.gz
1 files changed, 7 insertions, 11 deletions
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 4079396ff5..e483276070 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -560,23 +560,19 @@ typedef struct DSPContext {
     void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
            int * range, int * sum,  int edges);
 
-    /* ape functions */
-    /**
-     * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 16
-     */
-    void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
-    /**
-     * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 16
-     */
-    void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
     /**
      * Calculate scalar product of two vectors.
      * @param len length of vectors, should be multiple of 16
      * @param shift number of bits to discard from product
      */
     int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
+    /* ape functions */
+    /**
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
+     * @param len length of vectors, should be multiple of 16
+     */
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);
 
     /* rv30 functions */
     qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
author	Loren Merritt <lorenm@u.washington.edu>	2009-12-05 15:09:10 +0000
committer	Loren Merritt <lorenm@u.washington.edu>	2009-12-05 15:09:10 +0000
commit	b1159ad92818cd8f0885d252b0800f5960fe7241 (patch)
tree	a9d4177c61a9a89b4ac78a4a5b8a95f962a858a0 /libavcodec/dsputil.h
parent	e470691aa8798004bf5589871865a765cb791014 (diff)
download	ffmpeg-b1159ad92818cd8f0885d252b0800f5960fe7241.tar.gz