diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2009-12-05 15:09:10 +0000 |
---|---|---|
committer | Loren Merritt <lorenm@u.washington.edu> | 2009-12-05 15:09:10 +0000 |
commit | b1159ad92818cd8f0885d252b0800f5960fe7241 (patch) | |
tree | a9d4177c61a9a89b4ac78a4a5b8a95f962a858a0 /libavcodec/dsputil.c | |
parent | e470691aa8798004bf5589871865a765cb791014 (diff) | |
download | ffmpeg-b1159ad92818cd8f0885d252b0800f5960fe7241.tar.gz |
refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
9-123% faster ape decoding on G4.
Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/dsputil.c')
-rw-r--r-- | libavcodec/dsputil.c | 25 |
1 files changed, 11 insertions, 14 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index a04b8a400a..ffa8cecd4a 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -4298,18 +4298,6 @@ void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, i } } -static void add_int16_c(int16_t * v1, int16_t * v2, int order) -{ - while (order--) - *v1++ += *v2++; -} - -static void sub_int16_c(int16_t * v1, int16_t * v2, int order) -{ - while (order--) - *v1++ -= *v2++; -} - static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift) { int res = 0; @@ -4320,6 +4308,16 @@ static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int return res; } +static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) +{ + int res = 0; + while (order--) { + res += *v1 * *v2++; + *v1++ += mul * *v3++; + } + return res; +} + #define W0 2048 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ @@ -4848,9 +4846,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) c->vector_clipf = vector_clipf_c; c->float_to_int16 = ff_float_to_int16_c; c->float_to_int16_interleave = ff_float_to_int16_interleave_c; - c->add_int16 = add_int16_c; - c->sub_int16 = sub_int16_c; c->scalarproduct_int16 = scalarproduct_int16_c; + c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; c->scalarproduct_float = scalarproduct_float_c; c->butterflies_float = butterflies_float_c; c->vector_fmul_scalar = vector_fmul_scalar_c; |