Merge remote-tracking branch 'origin/master' into combined

author: rogerdpack <rogerpack2005@gmail.com> 2013-01-15 19:09:15 -0700
committer: rogerdpack <rogerpack2005@gmail.com> 2013-01-15 19:09:15 -0700
commit: c540312ac3b58e0bbd751844fc2c47c6e3713cf5 (patch)
tree: fcf92b1c0f1772b379828125c2555a47d1c81c6b /libavcodec
parent: 47e88486b4b3b3de992b07f89dfaedf410a8bd5e (diff)
parent: 2b20397e1fbe52db800ef5deb810f7bc2602f248 (diff)
download: ffmpeg-c540312ac3b58e0bbd751844fc2c47c6e3713cf5.tar.gz
61 files changed, 1583 insertions, 328 deletions
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index 5b98856e67..d17366c8ee 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -257,10 +257,12 @@ typedef struct ChannelElement {
     SpectralBandReplication sbr;
 } ChannelElement;
 
+typedef struct AACContext AACContext;
+
 /**
  * main AAC context
  */
-typedef struct AACContext {
+struct AACContext {
     AVClass        *class;
     AVCodecContext *avctx;
     AVFrame frame;
@@ -317,6 +319,18 @@ typedef struct AACContext {
 
     OutputConfiguration oc[2];
     int warned_num_aac_frames;
-} AACContext;
+
+    /* aacdec function pointers */
+    void (*imdct_and_windowing)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_ltp)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_tns)(float coef[1024], TemporalNoiseShaping *tns,
+                      IndividualChannelStream *ics, int decode);
+    void (*windowing_and_mdct_ltp)(AACContext *ac, float *out,
+                                   float *in, IndividualChannelStream *ics);
+    void (*update_ltp)(AACContext *ac, SingleChannelElement *sce);
+
+};
+
+void ff_aacdec_init_mips(AACContext *c);
 
 #endif /* AVCODEC_AAC_H */
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c
index 2538948edd..d4ea115cd5 100644
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -108,6 +108,8 @@
 
 #if ARCH_ARM
 #   include "arm/aac.h"
+#elif ARCH_MIPS
+#   include "mips/aacdec_mips.h"
 #endif
 
 static VLC vlc_scalefactors;
@@ -872,6 +874,8 @@ static void reset_predictor_group(PredictorState *ps, int group_num)
         ff_aac_spectral_codes[num], sizeof(ff_aac_spectral_codes[num][0]), sizeof(ff_aac_spectral_codes[num][0]), \
         size);
 
+static void aacdec_init(AACContext *ac);
+
 static av_cold int aac_decode_init(AVCodecContext *avctx)
 {
     AACContext *ac = avctx->priv_data;
@@ -879,6 +883,8 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
     ac->avctx = avctx;
     ac->oc[1].m4ac.sample_rate = avctx->sample_rate;
 
+    aacdec_init(ac);
+
     avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
     if (avctx->extradata_size > 0) {
@@ -2165,10 +2171,10 @@ static void apply_ltp(AACContext *ac, SingleChannelElement *sce)
             predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
         memset(&predTime[i], 0, (2048 - i) * sizeof(float));
 
-        windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
 
         if (sce->tns.present)
-            apply_tns(predFreq, &sce->tns, &sce->ics, 0);
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
 
         for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
             if (ltp->used[sfb])
@@ -2380,25 +2386,25 @@ static void spectral_to_sample(AACContext *ac)
                 if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
                     if (che->ch[0].ics.predictor_present) {
                         if (che->ch[0].ics.ltp.present)
-                            apply_ltp(ac, &che->ch[0]);
+                            ac->apply_ltp(ac, &che->ch[0]);
                         if (che->ch[1].ics.ltp.present && type == TYPE_CPE)
-                            apply_ltp(ac, &che->ch[1]);
+                            ac->apply_ltp(ac, &che->ch[1]);
                     }
                 }
                 if (che->ch[0].tns.present)
-                    apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
+                    ac->apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
                 if (che->ch[1].tns.present)
-                    apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
+                    ac->apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
                 if (type <= TYPE_CPE)
                     apply_channel_coupling(ac, che, type, i, BETWEEN_TNS_AND_IMDCT, apply_dependent_coupling);
                 if (type != TYPE_CCE || che->coup.coupling_point == AFTER_IMDCT) {
-                    imdct_and_windowing(ac, &che->ch[0]);
+                    ac->imdct_and_windowing(ac, &che->ch[0]);
                     if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                        update_ltp(ac, &che->ch[0]);
+                        ac->update_ltp(ac, &che->ch[0]);
                     if (type == TYPE_CPE) {
-                        imdct_and_windowing(ac, &che->ch[1]);
+                        ac->imdct_and_windowing(ac, &che->ch[1]);
                         if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                            update_ltp(ac, &che->ch[1]);
+                            ac->update_ltp(ac, &che->ch[1]);
                     }
                     if (ac->oc[1].m4ac.sbr > 0) {
                         ff_sbr_apply(ac, &che->sbr, type, che->ch[0].ret, che->ch[1].ret);
@@ -2979,6 +2985,17 @@ static av_cold int latm_decode_init(AVCodecContext *avctx)
     return ret;
 }
 
+static void aacdec_init(AACContext *c)
+{
+    c->imdct_and_windowing                      = imdct_and_windowing;
+    c->apply_ltp                                = apply_ltp;
+    c->apply_tns                                = apply_tns;
+    c->windowing_and_mdct_ltp                   = windowing_and_mdct_ltp;
+    c->update_ltp                               = update_ltp;
+
+    if(ARCH_MIPS)
+        ff_aacdec_init_mips(c);
+}
 /**
  * AVOptions for Japanese DTV specific extensions (ADTS only)
  */
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index 904e0bb9ef..b1174dc219 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -336,7 +336,7 @@ static void compute_rematrixing_strategy(AC3EncodeContext *s)
 {
     int nb_coefs;
     int blk, bnd;
-    AC3Block *block, *block0;
+    AC3Block *block, *block0 = NULL;
 
     if (s->channel_mode != AC3_CHMODE_STEREO)
         return;
diff --git a/libavcodec/arm/videodsp_init_armv5te.c b/libavcodec/arm/videodsp_init_armv5te.c
index c6ca1146ea..d11a07061b 100644
--- a/libavcodec/arm/videodsp_init_armv5te.c
+++ b/libavcodec/arm/videodsp_init_armv5te.c
@@ -19,7 +19,7 @@
  */
 
 #include "libavutil/arm/cpu.h"
-#include <libavcodec/videodsp.h>
+#include "libavcodec/videodsp.h"
 #include "videodsp_arm.h"
 
 void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
diff --git a/libavcodec/bfin/Makefile b/libavcodec/bfin/Makefile
index be81e6c39a..d1b41bc7f4 100644
--- a/libavcodec/bfin/Makefile
+++ b/libavcodec/bfin/Makefile
@@ -2,7 +2,7 @@ OBJS += bfin/dsputil_bfin.o                                             \
         bfin/fdct_bfin.o                                                \
         bfin/idct_bfin.o                                                \
         bfin/pixels_bfin.o                                              \
-        bfin/vp3_bfin.o                                                 \
-        bfin/vp3_idct_bfin.o                                            \
 
 OBJS-$(CONFIG_MPEGVIDEOENC)             += bfin/mpegvideo_bfin.o
+OBJS-$(CONFIG_VP3DSP)                   += bfin/vp3_bfin.o              \
+                                           bfin/vp3_idct_bfin.o
diff --git a/libavcodec/bfin/dsputil_bfin.c b/libavcodec/bfin/dsputil_bfin.c
index 955aea01f6..da6edb5e5e 100644
--- a/libavcodec/bfin/dsputil_bfin.c
+++ b/libavcodec/bfin/dsputil_bfin.c
@@ -257,13 +257,7 @@ void ff_dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
         if (avctx->dct_algo == FF_DCT_AUTO)
             c->fdct                  = ff_bfin_fdct;
 
-        // FIXME convert to VP3DSPContext
-        if (0) { // avctx->idct_algo == FF_IDCT_VP3) {
-            c->idct_permutation_type = FF_NO_IDCT_PERM;
-            c->idct                  = ff_bfin_vp3_idct;
-            c->idct_add              = ff_bfin_vp3_idct_add;
-            c->idct_put              = ff_bfin_vp3_idct_put;
-        } else if (avctx->idct_algo == FF_IDCT_AUTO) {
+    if (avctx->idct_algo == FF_IDCT_AUTO) {
             c->idct_permutation_type = FF_NO_IDCT_PERM;
             c->idct                  = ff_bfin_idct;
             c->idct_add              = bfin_idct_add;
diff --git a/libavcodec/bfin/dsputil_bfin.h b/libavcodec/bfin/dsputil_bfin.h
index 04f87cc2b8..293092363b 100644
--- a/libavcodec/bfin/dsputil_bfin.h
+++ b/libavcodec/bfin/dsputil_bfin.h
@@ -38,9 +38,6 @@
 
 void ff_bfin_idct (int16_t *block) attribute_l1_text;
 void ff_bfin_fdct (int16_t *block) attribute_l1_text;
-void ff_bfin_vp3_idct (int16_t *block);
-void ff_bfin_vp3_idct_put (uint8_t *dest, int line_size, int16_t *block);
-void ff_bfin_vp3_idct_add (uint8_t *dest, int line_size, int16_t *block);
 void ff_bfin_add_pixels_clamped (const int16_t *block, uint8_t *dest, int line_size) attribute_l1_text;
 void ff_bfin_put_pixels_clamped (const int16_t *block, uint8_t *dest, int line_size) attribute_l1_text;
 void ff_bfin_diff_pixels (int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)  attribute_l1_text;
diff --git a/libavcodec/bfin/vp3_bfin.c b/libavcodec/bfin/vp3_bfin.c
index 06d3443a2f..d9364790f1 100644
--- a/libavcodec/bfin/vp3_bfin.c
+++ b/libavcodec/bfin/vp3_bfin.c
@@ -18,9 +18,13 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <string.h>
+
 #include "libavcodec/avcodec.h"
+#include "libavcodec/vp3dsp.h"
 #include "libavcodec/dsputil.h"
 #include "dsputil_bfin.h"
+#include "vp3_bfin.h"
 
 /* Intra iDCT offset 128 */
 void ff_bfin_vp3_idct_put (uint8_t *dest, int line_size, int16_t *block)
@@ -33,6 +37,8 @@ void ff_bfin_vp3_idct_put (uint8_t *dest, int line_size, int16_t *block)
     for (i=0;i<8;i++)
         for (j=0;j<8;j++)
             dest[line_size*i+j]=cm[block[i*8+j]];
+
+    memset(block, 0, 128);
 }
 
 /* Inter iDCT */
@@ -40,4 +46,12 @@ void ff_bfin_vp3_idct_add (uint8_t *dest, int line_size, int16_t *block)
 {
     ff_bfin_vp3_idct (block);
     ff_bfin_add_pixels_clamped (block, dest, line_size);
+
+    memset(block, 0, 128);
+}
+
+void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags)
+{
+    c->idct_add = ff_bfin_vp3_idct_add;
+    c->idct_put = ff_bfin_vp3_idct_put;
 }
diff --git a/libavcodec/bfin/vp3_bfin.h b/libavcodec/bfin/vp3_bfin.h
new file mode 100644
index 0000000000..f0bf824951
--- /dev/null
+++ b/libavcodec/bfin/vp3_bfin.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#ifndef AVCODEC_BFIN_VP3_BFIN_H
+#define AVCODEC_BFIN_VP3_BFIN_H
+
+#include <stdint.h>
+
+void ff_bfin_vp3_idct(int16_t *block);
+void ff_bfin_vp3_idct_put(uint8_t *dest, int line_size, int16_t *block);
+void ff_bfin_vp3_idct_add(uint8_t *dest, int line_size, int16_t *block);
+
+#endif /* AVCODEC_BFIN_VP3_BFIN_H */
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index 35fe962e8d..c2c1ea1b0c 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -1209,7 +1209,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
             if ((ret = bink_decode_plane(c, &gb, plane_idx, !!plane)) < 0)
                 return ret;
         } else {
-            if ((ret = binkb_decode_plane(c, &gb, plane_idx, !pkt->pts, !!plane)) < 0)
+            if ((ret = binkb_decode_plane(c, &gb, plane_idx,
+                                          !avctx->frame_number, !!plane)) < 0)
                 return ret;
         }
         if (get_bits_count(&gb) >= bits_count)
diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c
index 6e94d32ecc..0af685e28b 100644
--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -180,7 +180,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, const uint8_t *buf, int buf_si
     for (i = 0; i < ctx->mb_height; i++) {
         ctx->mb_scan_index[i] = AV_RB32(buf + 0x170 + (i<<2));
         av_dlog(ctx->avctx, "mb scan index %d\n", ctx->mb_scan_index[i]);
-        if (buf_size < ctx->mb_scan_index[i] + 0x280) {
+        if (buf_size < ctx->mb_scan_index[i] + 0x280LL) {
             av_log(ctx->avctx, AV_LOG_ERROR, "invalid mb scan index\n");
             return -1;
         }
diff --git a/libavcodec/dsputil_template.c b/libavcodec/dsputil_template.c
index 2922883de5..7e20995a1d 100644
--- a/libavcodec/dsputil_template.c
+++ b/libavcodec/dsputil_template.c
@@ -29,54 +29,6 @@
 
 #include "bit_depth_template.c"
 
-static inline void FUNC(copy_block2)(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN2P(dst   , AV_RN2P(src   ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void FUNC(copy_block4)(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN4P(dst   , AV_RN4P(src   ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void FUNC(copy_block8)(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN4P(dst                , AV_RN4P(src                ));
-        AV_WN4P(dst+4*sizeof(pixel), AV_RN4P(src+4*sizeof(pixel)));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN4P(dst                 , AV_RN4P(src                 ));
-        AV_WN4P(dst+ 4*sizeof(pixel), AV_RN4P(src+ 4*sizeof(pixel)));
-        AV_WN4P(dst+ 8*sizeof(pixel), AV_RN4P(src+ 8*sizeof(pixel)));
-        AV_WN4P(dst+12*sizeof(pixel), AV_RN4P(src+12*sizeof(pixel)));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
 /* draw the edges of width 'w' of an image of size width, height */
 //FIXME check that this is ok for mpeg4 interlaced
 static void FUNCC(draw_edges)(uint8_t *p_buf, int p_wrap, int width, int height, int w, int h, int sides)
diff --git a/libavcodec/eatgv.c b/libavcodec/eatgv.c
index c75f1f3b80..7092ba679b 100644
--- a/libavcodec/eatgv.c
+++ b/libavcodec/eatgv.c
@@ -42,10 +42,10 @@ typedef struct TgvContext {
     AVFrame frame;
     AVFrame last_frame;
     int width,height;
-    unsigned int palette[AVPALETTE_COUNT];
+    uint32_t palette[AVPALETTE_COUNT];
 
     int (*mv_codebook)[2];
-    unsigned char (*block_codebook)[16];
+    uint8_t (*block_codebook)[16];
     int num_mvs;           ///< current length of mv_codebook
     int num_blocks_packed; ///< current length of block_codebook
 } TgvContext;
@@ -66,11 +66,11 @@ static av_cold int tgv_decode_init(AVCodecContext *avctx)
  * @return 0 on success, -1 on critical buffer underflow
  */
 static int unpack(const uint8_t *src, const uint8_t *src_end,
-                  unsigned char *dst, int width, int height)
+                  uint8_t *dst, int width, int height)
 {
-    unsigned char *dst_end = dst + width*height;
+    uint8_t *dst_end = dst + width*height;
     int size, size1, size2, offset, run;
-    unsigned char *dst_start = dst;
+    uint8_t *dst_start = dst;
 
     if (src[0] & 0x01)
         src += 5;
@@ -150,7 +150,7 @@ static int tgv_decode_inter(TgvContext *s, const uint8_t *buf,
     int i,j,x,y;
     GetBitContext gb;
     int mvbits;
-    const unsigned char *blocks_raw;
+    const uint8_t *blocks_raw;
 
     if(buf_end - buf < 12)
         return AVERROR_INVALIDDATA;
@@ -174,7 +174,7 @@ static int tgv_decode_inter(TgvContext *s, const uint8_t *buf,
     }
 
     if (num_blocks_packed > s->num_blocks_packed) {
-        s->block_codebook = av_realloc(s->block_codebook, num_blocks_packed*16*sizeof(unsigned char));
+        s->block_codebook = av_realloc(s->block_codebook, num_blocks_packed*16);
         s->num_blocks_packed = num_blocks_packed;
     }
 
@@ -213,7 +213,7 @@ static int tgv_decode_inter(TgvContext *s, const uint8_t *buf,
     for (y = 0; y < s->avctx->height / 4; y++)
         for (x = 0; x < s->avctx->width / 4; x++) {
             unsigned int vector = get_bits(&gb, vector_bits);
-            const unsigned char *src;
+            const uint8_t *src;
             int src_stride;
 
             if (vector < num_mvs) {
diff --git a/libavcodec/ffv1.c b/libavcodec/ffv1.c
index 074be4cfbb..29d76ed62e 100644
--- a/libavcodec/ffv1.c
+++ b/libavcodec/ffv1.c
@@ -123,6 +123,10 @@ av_cold int ffv1_init_slice_contexts(FFV1Context *f)
         int sxe         = f->avctx->width  * (sx + 1) / f->num_h_slices;
         int sys         = f->avctx->height *  sy      / f->num_v_slices;
         int sye         = f->avctx->height * (sy + 1) / f->num_v_slices;
+
+        if (!fs)
+            return AVERROR(ENOMEM);
+
         f->slice_context[i] = fs;
         memcpy(fs, f, sizeof(*fs));
         memset(fs->rc_stat2, 0, sizeof(fs->rc_stat2));
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index 78beb9a47e..a012df334f 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -896,6 +896,8 @@ slices_ok:
 #define STATS_OUT_SIZE 1024 * 1024 * 6
     if (avctx->flags & CODEC_FLAG_PASS1) {
         avctx->stats_out = av_mallocz(STATS_OUT_SIZE);
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
         for (i = 0; i < s->quant_table_count; i++)
             for (j = 0; j < s->slice_count; j++) {
                 FFV1Context *sf = s->slice_context[j];
diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
index 9d5ecd04cf..0e887e6f89 100644
--- a/libavcodec/flacdec.c
+++ b/libavcodec/flacdec.c
@@ -254,7 +254,7 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
                                  int pred_order, int bps)
 {
     const int blocksize = s->blocksize;
-    int a, b, c, d, i;
+    int av_uninit(a), av_uninit(b), av_uninit(c), av_uninit(d), i;
 
     /* warm up samples */
     for (i = 0; i < pred_order; i++) {
@@ -505,6 +505,16 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
                                        FLAC_MAX_CHANNELS, 32);
     }
 
+    if (buf_size > 5 && !memcmp(buf, "\177FLAC", 5)) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skiping flac header packet 1\n");
+        return buf_size;
+    }
+
+    if (buf_size > 0 && (*buf & 0x7F) == FLAC_METADATA_TYPE_VORBIS_COMMENT) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skiping vorbis comment\n");
+        return buf_size;
+    }
+
     /* check that there is at least the smallest decodable amount of data.
        this amount corresponds to the smallest valid FLAC frame possible.
        FF F8 69 02 00 00 9A 00 00 34 46 */
diff --git a/libavcodec/flashsv.c b/libavcodec/flashsv.c
index 7855416567..21464ed6b4 100644
--- a/libavcodec/flashsv.c
+++ b/libavcodec/flashsv.c
@@ -245,6 +245,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     FlashSVContext *s  = avctx->priv_data;
     int h_blocks, v_blocks, h_part, v_part, i, j;
     GetBitContext gb;
+    int last_blockwidth = s->block_width;
+    int last_blockheight= s->block_height;
 
     /* no supplementary picture */
     if (buf_size == 0)
@@ -260,6 +262,10 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     s->block_height = 16 * (get_bits(&gb,  4) + 1);
     s->image_height =       get_bits(&gb, 12);
 
+    if (   last_blockwidth != s->block_width
+        || last_blockheight!= s->block_height)
+        av_freep(&s->blocks);
+
     if (s->ver == 2) {
         skip_bits(&gb, 6);
         if (get_bits1(&gb)) {
@@ -323,9 +329,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
         s->keyframedata = av_realloc(s->keyframedata, avpkt->size);
         memcpy(s->keyframedata, avpkt->data, avpkt->size);
     }
-    if(s->ver == 2)
-        s->blocks = av_realloc(s->blocks,
-                                (v_blocks + !!v_part) * (h_blocks + !!h_part)
+    if(s->ver == 2 && !s->blocks)
+        s->blocks = av_mallocz((v_blocks + !!v_part) * (h_blocks + !!h_part)
                                 * sizeof(s->blocks[0]));
 
     av_dlog(avctx, "image: %dx%d block: %dx%d num: %dx%d part: %dx%d\n",
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 8436032b18..34cd8c0658 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -1462,7 +1462,6 @@ static void decode_postinit(H264Context *h, int setup_finished)
             cur->f.repeat_pict = 1;
             break;
         case SEI_PIC_STRUCT_FRAME_DOUBLING:
-            // Force progressive here, doubling interlaced frame is a bad idea.
             cur->f.repeat_pict = 2;
             break;
         case SEI_PIC_STRUCT_FRAME_TRIPLING:
@@ -2368,7 +2367,7 @@ static int field_end(H264Context *h, int in_setup)
      * past end by one (callers fault) and resync_mb_y != 0
      * causes problems for the first MB line, too.
      */
-    if (!FIELD_PICTURE && h->current_slice)
+    if (!FIELD_PICTURE && h->current_slice && !h->sps.new)
         ff_er_frame_end(s);
 
     ff_MPV_frame_end(s);
@@ -2776,7 +2775,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
                      || s->avctx->bits_per_raw_sample != h->sps.bit_depth_luma
                      || h->cur_chroma_format_idc != h->sps.chroma_format_idc
                      || av_cmp_q(h->sps.sar, s->avctx->sample_aspect_ratio)));
-
+    if (h0->s.avctx->pix_fmt != get_pixel_format(h0))
+        must_reinit = 1;
 
     s->mb_width  = h->sps.mb_width;
     s->mb_height = h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
@@ -3906,6 +3906,7 @@ static int execute_decode_slices(H264Context *h, int context_count)
     if (context_count == 1) {
         return decode_slice(avctx, &h);
     } else {
+        av_assert0(context_count > 0);
         for (i = 1; i < context_count; i++) {
             hx                    = h->thread_context[i];
             hx->s.err_recognition = avctx->err_recognition;
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index bbca26ab4d..54b3775761 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1712,7 +1712,7 @@ decode_cabac_residual_internal(H264Context *h, int16_t *block,
 \
             if( coeff_abs >= 15 ) { \
                 int j = 0; \
-                while( get_cabac_bypass( CC ) ) { \
+                while(get_cabac_bypass( CC ) && j<30) { \
                     j++; \
                 } \
 \
diff --git a/libavcodec/huffyuvdec.c b/libavcodec/huffyuvdec.c
index 9c92bf3875..2c56bfa493 100644
--- a/libavcodec/huffyuvdec.c
+++ b/libavcodec/huffyuvdec.c
@@ -107,23 +107,26 @@ static int read_len_table(uint8_t *dst, GetBitContext *gb)
     return 0;
 }
 
-static void generate_joint_tables(HYuvContext *s)
+static int generate_joint_tables(HYuvContext *s)
 {
     uint16_t symbols[1 << VLC_BITS];
     uint16_t bits[1 << VLC_BITS];
     uint8_t len[1 << VLC_BITS];
+    int ret;
+
     if (s->bitstream_bpp < 24) {
         int p, i, y, u;
         for (p = 0; p < 3; p++) {
             for (i = y = 0; y < 256; y++) {
                 int len0 = s->len[0][y];
                 int limit = VLC_BITS - len0;
-                if(limit <= 0)
+                if(limit <= 0 || !len0)
                     continue;
                 for (u = 0; u < 256; u++) {
                     int len1 = s->len[p][u];
-                    if (len1 > limit)
+                    if (len1 > limit || !len1)
                         continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i] = len0 + len1;
                     bits[i] = (s->bits[0][y] << len1) + s->bits[p][u];
                     symbols[i] = (y << 8) + u;
@@ -132,8 +135,9 @@ static void generate_joint_tables(HYuvContext *s)
                 }
             }
             ff_free_vlc(&s->vlc[3 + p]);
-            ff_init_vlc_sparse(&s->vlc[3 + p], VLC_BITS, i, len, 1, 1,
-                               bits, 2, 2, symbols, 2, 2, 0);
+            if ((ret = ff_init_vlc_sparse(&s->vlc[3 + p], VLC_BITS, i, len, 1, 1,
+                                          bits, 2, 2, symbols, 2, 2, 0)) < 0)
+                return ret;
         }
     } else {
         uint8_t (*map)[4] = (uint8_t(*)[4])s->pix_bgr_map;
@@ -146,18 +150,19 @@ static void generate_joint_tables(HYuvContext *s)
         for (i = 0, g = -16; g < 16; g++) {
             int len0 = s->len[p0][g & 255];
             int limit0 = VLC_BITS - len0;
-            if (limit0 < 2)
+            if (limit0 < 2 || !len0)
                 continue;
             for (b = -16; b < 16; b++) {
                 int len1 = s->len[p1][b & 255];
                 int limit1 = limit0 - len1;
-                if (limit1 < 1)
+                if (limit1 < 1 || !len1)
                     continue;
                 code = (s->bits[p0][g & 255] << len1) + s->bits[p1][b & 255];
                 for (r = -16; r < 16; r++) {
                     int len2 = s->len[2][r & 255];
-                    if (len2 > limit1)
+                    if (len2 > limit1 || !len2)
                         continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i] = len0 + len1 + len2;
                     bits[i] = (code << len2) + s->bits[2][r & 255];
                     if (s->decorrelate) {
@@ -174,14 +179,17 @@ static void generate_joint_tables(HYuvContext *s)
             }
         }
         ff_free_vlc(&s->vlc[3]);
-        init_vlc(&s->vlc[3], VLC_BITS, i, len, 1, 1, bits, 2, 2, 0);
+        if ((ret = init_vlc(&s->vlc[3], VLC_BITS, i, len, 1, 1, bits, 2, 2, 0)) < 0)
+            return ret;
     }
+    return 0;
 }
 
 static int read_huffman_tables(HYuvContext *s, const uint8_t *src, int length)
 {
     GetBitContext gb;
     int i;
+    int ret;
 
     init_get_bits(&gb, src, length * 8);
 
@@ -192,11 +200,13 @@ static int read_huffman_tables(HYuvContext *s, const uint8_t *src, int length)
             return -1;
         }
         ff_free_vlc(&s->vlc[i]);
-        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
-                 s->bits[i], 4, 4, 0);
+        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
+                           s->bits[i], 4, 4, 0)) < 0)
+            return ret;
     }
 
-    generate_joint_tables(s);
+    if ((ret = generate_joint_tables(s)) < 0)
+        return ret;
 
     return (get_bits_count(&gb) + 7) / 8;
 }
@@ -205,6 +215,7 @@ static int read_old_huffman_tables(HYuvContext *s)
 {
     GetBitContext gb;
     int i;
+    int ret;
 
     init_get_bits(&gb, classic_shift_luma,
                   classic_shift_luma_table_size * 8);
@@ -228,11 +239,13 @@ static int read_old_huffman_tables(HYuvContext *s)
 
     for (i = 0; i < 3; i++) {
         ff_free_vlc(&s->vlc[i]);
-        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
-                 s->bits[i], 4, 4, 0);
+        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
+                            s->bits[i], 4, 4, 0)) < 0)
+            return ret;
     }
 
-    generate_joint_tables(s);
+    if ((ret = generate_joint_tables(s)) < 0)
+        return ret;
 
     return 0;
 }
diff --git a/libavcodec/iff.c b/libavcodec/iff.c
index 6b1fd89d6b..8e7f8cae46 100644
--- a/libavcodec/iff.c
+++ b/libavcodec/iff.c
@@ -576,7 +576,7 @@ static void decode_deep_rle32(uint8_t *dst, const uint8_t *src, int src_size, in
             }
         } else {
             int size = -opcode + 1;
-            uint32_t pixel = AV_RL32(src);
+            uint32_t pixel = AV_RN32(src);
             for (i = 0; i < size; i++) {
                 *(uint32_t *)(dst + y*linesize + x * 4) = pixel;
                 x += 1;
diff --git a/libavcodec/interplayvideo.c b/libavcodec/interplayvideo.c
index 3285578a70..e0550a702b 100644
--- a/libavcodec/interplayvideo.c
+++ b/libavcodec/interplayvideo.c
@@ -969,6 +969,13 @@ static int ipvideo_decode_frame(AVCodecContext *avctx,
     if (buf_size < s->decoding_map_size)
         return buf_size;
 
+    if (s->last_frame.data[0] && av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, NULL)) {
+        if (s->last_frame.data[0])
+            avctx->release_buffer(avctx, &s->last_frame);
+        if (s->second_last_frame.data[0])
+            avctx->release_buffer(avctx, &s->second_last_frame);
+    }
+
     s->decoding_map = buf;
     bytestream2_init(&s->stream_ptr, buf + s->decoding_map_size,
                      buf_size - s->decoding_map_size);
diff --git a/libavcodec/kbdwin.c b/libavcodec/kbdwin.c
index 2722312f11..5a62e9da2b 100644
--- a/libavcodec/kbdwin.c
+++ b/libavcodec/kbdwin.c
@@ -16,9 +16,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-
-#include <libavutil/avassert.h>
-#include <libavutil/mathematics.h>
+#include "libavutil/avassert.h"
+#include "libavutil/mathematics.h"
 #include "libavutil/attributes.h"
 #include "kbdwin.h"
 
diff --git a/libavcodec/libavcodec.v b/libavcodec/libavcodec.v
index f3e6db446a..826a547565 100644
--- a/libavcodec/libavcodec.v
+++ b/libavcodec/libavcodec.v
@@ -1,9 +1,10 @@
 LIBAVCODEC_$MAJOR {
         global: av*;
+                #deprecated, remove after next bump
                 audio_resample;
                 audio_resample_close;
-                #deprecated, remove after next bump
                 dsputil_init;
+                ff_dsputil_init;
                 ff_find_pix_fmt;
                 ff_framenum_to_drop_timecode;
                 ff_framenum_to_smtpe_timecode;
diff --git a/libavcodec/libopenjpegenc.c b/libavcodec/libopenjpegenc.c
index 13e8ef914c..c35508376b 100644
--- a/libavcodec/libopenjpegenc.c
+++ b/libavcodec/libopenjpegenc.c
@@ -100,6 +100,12 @@ static opj_image_t *mj2_create_image(AVCodecContext *avctx, opj_cparameters_t *p
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_RGB48:
     case AV_PIX_FMT_RGBA64:
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
         color_space = CLRSPC_SRGB;
         break;
     case AV_PIX_FMT_YUV410P:
@@ -182,6 +188,35 @@ static av_cold int libopenjpeg_encode_init(AVCodecContext *avctx)
     ctx->enc_params.tcp_numlayers = ctx->numlayers;
     ctx->enc_params.tcp_rates[0] = FFMAX(avctx->compression_level, 0) * 2;
 
+    if (ctx->cinema_mode > 0) {
+        ctx->enc_params.irreversible = 1;
+        ctx->enc_params.tcp_mct = 1;
+        ctx->enc_params.tile_size_on = 0;
+        /* no subsampling */
+        ctx->enc_params.cp_tdx=1;
+        ctx->enc_params.cp_tdy=1;
+        ctx->enc_params.subsampling_dx = 1;
+        ctx->enc_params.subsampling_dy = 1;
+        /* Tile and Image shall be at (0,0) */
+        ctx->enc_params.cp_tx0 = 0;
+        ctx->enc_params.cp_ty0 = 0;
+        ctx->enc_params.image_offset_x0 = 0;
+        ctx->enc_params.image_offset_y0 = 0;
+        /* Codeblock size= 32*32 */
+        ctx->enc_params.cblockw_init = 32;
+        ctx->enc_params.cblockh_init = 32;
+        ctx->enc_params.csty |= 0x01;
+        /* No ROI */
+        ctx->enc_params.roi_compno = -1;
+
+        if (ctx->enc_params.prog_order != CPRL) {
+            av_log(avctx, AV_LOG_ERROR, "prog_order forced to CPRL\n");
+            ctx->enc_params.prog_order = CPRL;
+        }
+        ctx->enc_params.tp_flag = 'C';
+        ctx->enc_params.tp_on = 1;
+    }
+
     ctx->compress = opj_create_compress(ctx->format);
     if (!ctx->compress) {
         av_log(avctx, AV_LOG_ERROR, "Error creating the compressor\n");
@@ -351,6 +386,7 @@ static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     opj_cio_t *stream;
     int cpyresult = 0;
     int ret, len;
+    AVFrame gbrframe;
 
     // x0, y0 is the top left corner of the image
     // x1, y1 is the width, height of the reference grid
@@ -369,6 +405,25 @@ static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_RGBA64:
         cpyresult = libopenjpeg_copy_packed16(avctx, frame, image);
         break;
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+        gbrframe = *frame;
+        gbrframe.data[0] = frame->data[2]; // reorder planes from G,B,R to R,G,B
+        gbrframe.data[1] = frame->data[0];
+        gbrframe.data[2] = frame->data[1];
+        gbrframe.linesize[0] = frame->linesize[2];
+        gbrframe.linesize[1] = frame->linesize[0];
+        gbrframe.linesize[2] = frame->linesize[1];
+        if (avctx->pix_fmt == AV_PIX_FMT_GBR24P) {
+            cpyresult = libopenjpeg_copy_unpacked8(avctx, &gbrframe, image);
+        } else {
+            cpyresult = libopenjpeg_copy_unpacked16(avctx, &gbrframe, image);
+        }
+        break;
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_YUV410P:
     case AV_PIX_FMT_YUV411P:
@@ -505,6 +560,8 @@ AVCodec ff_libopenjpeg_encoder = {
     .capabilities   = 0,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA, AV_PIX_FMT_RGB48, AV_PIX_FMT_RGBA64,
+        AV_PIX_FMT_GBR24P,
+        AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
         AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A, AV_PIX_FMT_GRAY16,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVA420P,
         AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA422P,
diff --git a/libavcodec/loco.c b/libavcodec/loco.c
index 93377e2220..9958c148c7 100644
--- a/libavcodec/loco.c
+++ b/libavcodec/loco.c
@@ -287,7 +287,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case LOCO_CRGBA:
     case LOCO_RGBA:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_BGRA;
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "Unknown colorspace, index = %i\n", l->mode);
diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c
index 68e850c60e..cd96fa4d82 100644
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -33,7 +33,7 @@
 #include "mpeg12.h"
 #include "thread.h"
 
-typedef struct MDECContext{
+typedef struct MDECContext {
     AVCodecContext *avctx;
     DSPContext dsp;
     AVFrame picture;
@@ -57,36 +57,36 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
     int level, diff, i, j, run;
     int component;
     RLTable *rl = &ff_rl_mpeg1;
-    uint8_t * const scantable= a->scantable.permutated;
-    const uint16_t *quant_matrix= ff_mpeg1_default_intra_matrix;
-    const int qscale= a->qscale;
+    uint8_t * const scantable = a->scantable.permutated;
+    const uint16_t *quant_matrix = ff_mpeg1_default_intra_matrix;
+    const int qscale = a->qscale;
 
     /* DC coefficient */
-    if(a->version==2){
-        block[0]= 2*get_sbits(&a->gb, 10) + 1024;
-    }else{
+    if (a->version == 2) {
+        block[0] = 2 * get_sbits(&a->gb, 10) + 1024;
+    } else {
         component = (n <= 3 ? 0 : n - 4 + 1);
         diff = decode_dc(&a->gb, component);
         if (diff >= 0xffff)
-            return -1;
-        a->last_dc[component]+= diff;
-        block[0] = a->last_dc[component]<<3;
+            return AVERROR_INVALIDDATA;
+        a->last_dc[component] += diff;
+        block[0] = a->last_dc[component] << 3;
     }
 
     i = 0;
     {
         OPEN_READER(re, &a->gb);
         /* now quantify & encode AC coefficients */
-        for(;;) {
+        for (;;) {
             UPDATE_CACHE(re, &a->gb);
             GET_RL_VLC(level, run, re, &a->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
 
-            if(level == 127){
+            if (level == 127) {
                 break;
-            } else if(level != 0) {
-                i += run;
-                j = scantable[i];
-                level= (level*qscale*quant_matrix[j])>>3;
+            } else if (level != 0) {
+                i    += run;
+                j     = scantable[i];
+                level = (level * qscale * quant_matrix[j]) >> 3;
                 level = (level ^ SHOW_SBITS(re, &a->gb, 1)) - SHOW_SBITS(re, &a->gb, 1);
                 LAST_SKIP_BITS(re, &a->gb, 1);
             } else {
@@ -94,21 +94,21 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
                 run = SHOW_UBITS(re, &a->gb, 6)+1; LAST_SKIP_BITS(re, &a->gb, 6);
                 UPDATE_CACHE(re, &a->gb);
                 level = SHOW_SBITS(re, &a->gb, 10); SKIP_BITS(re, &a->gb, 10);
-                i += run;
-                j = scantable[i];
-                if(level<0){
-                    level= -level;
-                    level= (level*qscale*quant_matrix[j])>>3;
-                    level= (level-1)|1;
-                    level= -level;
-                }else{
-                    level= (level*qscale*quant_matrix[j])>>3;
-                    level= (level-1)|1;
+                i    += run;
+                j     = scantable[i];
+                if (level < 0) {
+                    level = -level;
+                    level = (level * qscale * quant_matrix[j]) >> 3;
+                    level = (level - 1) | 1;
+                    level = -level;
+                } else {
+                    level = (level * qscale * quant_matrix[j]) >> 3;
+                    level = (level - 1) | 1;
                 }
             }
-            if (i > 63){
+            if (i > 63) {
                 av_log(a->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", a->mb_x, a->mb_y);
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
 
             block[j] = level;
@@ -119,34 +119,38 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
     return 0;
 }
 
-static inline int decode_mb(MDECContext *a, int16_t block[6][64]){
-    int i;
-    const int block_index[6]= {5,4,0,1,2,3};
+static inline int decode_mb(MDECContext *a, int16_t block[6][64])
+{
+    int i, ret;
+    const int block_index[6] = { 5, 4, 0, 1, 2, 3 };
 
     a->dsp.clear_blocks(block[0]);
 
-    for(i=0; i<6; i++){
-        if( mdec_decode_block_intra(a, block[ block_index[i] ], block_index[i]) < 0 ||
-            get_bits_left(&a->gb) < 0)
-            return -1;
+    for (i = 0; i < 6; i++) {
+        if ((ret = mdec_decode_block_intra(a, block[block_index[i]],
+                                           block_index[i])) < 0)
+            return ret;
+        if (get_bits_left(&a->gb) < 0)
+            return AVERROR_INVALIDDATA;
     }
     return 0;
 }
 
-static inline void idct_put(MDECContext *a, int mb_x, int mb_y){
-    int16_t (*block)[64]= a->block;
-    int linesize= a->picture.linesize[0];
+static inline void idct_put(MDECContext *a, int mb_x, int mb_y)
+{
+    int16_t (*block)[64] = a->block;
+    int linesize = a->picture.linesize[0];
 
-    uint8_t *dest_y  = a->picture.data[0] + (mb_y * 16* linesize              ) + mb_x * 16;
-    uint8_t *dest_cb = a->picture.data[1] + (mb_y * 8 * a->picture.linesize[1]) + mb_x * 8;
-    uint8_t *dest_cr = a->picture.data[2] + (mb_y * 8 * a->picture.linesize[2]) + mb_x * 8;
+    uint8_t *dest_y  = a->picture.data[0] + (mb_y * 16 * linesize              ) + mb_x * 16;
+    uint8_t *dest_cb = a->picture.data[1] + (mb_y * 8  * a->picture.linesize[1]) + mb_x * 8;
+    uint8_t *dest_cr = a->picture.data[2] + (mb_y * 8  * a->picture.linesize[2]) + mb_x * 8;
 
-    a->dsp.idct_put(dest_y                 , linesize, block[0]);
-    a->dsp.idct_put(dest_y              + 8, linesize, block[1]);
-    a->dsp.idct_put(dest_y + 8*linesize    , linesize, block[2]);
-    a->dsp.idct_put(dest_y + 8*linesize + 8, linesize, block[3]);
+    a->dsp.idct_put(dest_y,                    linesize, block[0]);
+    a->dsp.idct_put(dest_y                + 8, linesize, block[1]);
+    a->dsp.idct_put(dest_y + 8 * linesize,     linesize, block[2]);
+    a->dsp.idct_put(dest_y + 8 * linesize + 8, linesize, block[3]);
 
-    if(!(a->avctx->flags&CODEC_FLAG_GRAY)){
+    if (!(a->avctx->flags & CODEC_FLAG_GRAY)) {
         a->dsp.idct_put(dest_cb, a->picture.linesize[1], block[4]);
         a->dsp.idct_put(dest_cr, a->picture.linesize[2], block[5]);
     }
@@ -156,112 +160,106 @@ static int decode_frame(AVCodecContext *avctx,
                         void *data, int *got_frame,
                         AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     MDECContext * const a = avctx->priv_data;
-    AVFrame *picture = data;
-    AVFrame * const p= &a->picture;
-    int i;
+    const uint8_t *buf    = avpkt->data;
+    int buf_size          = avpkt->size;
+    AVFrame *picture      = data;
+    AVFrame * const p     = &a->picture;
+    int i, ret;
 
-    if(p->data[0])
+    if (p->data[0])
         ff_thread_release_buffer(avctx, p);
 
-    p->reference= 0;
-    if(ff_thread_get_buffer(avctx, p) < 0){
+    p->reference = 0;
+    if ((ret = ff_thread_get_buffer(avctx, p)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
+        return ret;
     }
-    p->pict_type= AV_PICTURE_TYPE_I;
-    p->key_frame= 1;
+    p->pict_type = AV_PICTURE_TYPE_I;
+    p->key_frame = 1;
 
     av_fast_malloc(&a->bitstream_buffer, &a->bitstream_buffer_size, buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
     if (!a->bitstream_buffer)
         return AVERROR(ENOMEM);
-    for(i=0; i<buf_size; i+=2){
-        a->bitstream_buffer[i]  = buf[i+1];
-        a->bitstream_buffer[i+1]= buf[i  ];
+    for (i = 0; i < buf_size; i += 2) {
+        a->bitstream_buffer[i]     = buf[i + 1];
+        a->bitstream_buffer[i + 1] = buf[i];
     }
-    init_get_bits(&a->gb, a->bitstream_buffer, buf_size*8);
+    init_get_bits(&a->gb, a->bitstream_buffer, buf_size * 8);
 
     /* skip over 4 preamble bytes in stream (typically 0xXX 0xXX 0x00 0x38) */
     skip_bits(&a->gb, 32);
 
-    a->qscale=  get_bits(&a->gb, 16);
-    a->version= get_bits(&a->gb, 16);
+    a->qscale  = get_bits(&a->gb, 16);
+    a->version = get_bits(&a->gb, 16);
 
-    a->last_dc[0]=
-    a->last_dc[1]=
-    a->last_dc[2]= 128;
+    a->last_dc[0] = a->last_dc[1] = a->last_dc[2] = 128;
 
-    for(a->mb_x=0; a->mb_x<a->mb_width; a->mb_x++){
-        for(a->mb_y=0; a->mb_y<a->mb_height; a->mb_y++){
-            if( decode_mb(a, a->block) <0)
-                return -1;
+    for (a->mb_x = 0; a->mb_x < a->mb_width; a->mb_x++) {
+        for (a->mb_y = 0; a->mb_y < a->mb_height; a->mb_y++) {
+            if ((ret = decode_mb(a, a->block)) < 0)
+                return ret;
 
             idct_put(a, a->mb_x, a->mb_y);
         }
     }
 
-    p->quality= a->qscale * FF_QP2LAMBDA;
+    p->quality = a->qscale * FF_QP2LAMBDA;
     memset(p->qscale_table, a->qscale, a->mb_width);
 
     *picture   = a->picture;
     *got_frame = 1;
 
-    return (get_bits_count(&a->gb)+31)/32*4;
+    return (get_bits_count(&a->gb) + 31) / 32 * 4;
 }
 
-static av_cold void mdec_common_init(AVCodecContext *avctx){
+static av_cold int decode_init(AVCodecContext *avctx)
+{
     MDECContext * const a = avctx->priv_data;
+    AVFrame *p            = &a->picture;
 
-    ff_dsputil_init(&a->dsp, avctx);
-
-    a->mb_width   = (avctx->coded_width  + 15) / 16;
-    a->mb_height  = (avctx->coded_height + 15) / 16;
+    a->mb_width  = (avctx->coded_width  + 15) / 16;
+    a->mb_height = (avctx->coded_height + 15) / 16;
 
     avcodec_get_frame_defaults(&a->picture);
-    avctx->coded_frame= &a->picture;
-    a->avctx= avctx;
-}
+    avctx->coded_frame = &a->picture;
+    a->avctx           = avctx;
 
-static av_cold int decode_init(AVCodecContext *avctx){
-    MDECContext * const a = avctx->priv_data;
-    AVFrame *p= &a->picture;
-
-    mdec_common_init(avctx);
+    ff_dsputil_init(&a->dsp, avctx);
     ff_mpeg12_init_vlcs();
     ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_zigzag_direct);
 
-    if( avctx->idct_algo == FF_IDCT_AUTO )
+    if (avctx->idct_algo == FF_IDCT_AUTO)
         avctx->idct_algo = FF_IDCT_SIMPLE;
-    p->qstride= 0;
-    p->qscale_table= av_mallocz(a->mb_width);
-    avctx->pix_fmt= AV_PIX_FMT_YUVJ420P;
+    p->qstride      = 0;
+    p->qscale_table = av_mallocz(a->mb_width);
+    avctx->pix_fmt  = AV_PIX_FMT_YUVJ420P;
 
     return 0;
 }
 
-static av_cold int decode_init_thread_copy(AVCodecContext *avctx){
+static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
+{
     MDECContext * const a = avctx->priv_data;
-    AVFrame *p = &a->picture;
+    AVFrame *p            = &a->picture;
 
-    avctx->coded_frame= p;
-    a->avctx= avctx;
+    avctx->coded_frame = p;
+    a->avctx           = avctx;
 
     p->qscale_table= av_mallocz(a->mb_width);
 
     return 0;
 }
 
-
-static av_cold int decode_end(AVCodecContext *avctx){
+static av_cold int decode_end(AVCodecContext *avctx)
+{
     MDECContext * const a = avctx->priv_data;
 
-    if(a->picture.data[0])
+    if (a->picture.data[0])
         avctx->release_buffer(avctx, &a->picture);
     av_freep(&a->bitstream_buffer);
     av_freep(&a->picture.qscale_table);
-    a->bitstream_buffer_size=0;
+    a->bitstream_buffer_size = 0;
 
     return 0;
 }
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index d30bac3d76..54d27e8af3 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -297,7 +297,7 @@ static int decode(MimicContext *ctx, int quality, int num_coeffs,
  * Flip the buffer upside-down and put it in the YVU order to match the
  * way Mimic encodes frames.
  */
-static void prepare_avpic(MimicContext *ctx, AVPicture *dst, AVPicture *src)
+static void prepare_avpic(MimicContext *ctx, AVPicture *dst, AVFrame *src)
 {
     int i;
     dst->data[0] = src->data[0] + ( ctx->avctx->height       - 1) * src->linesize[0];
@@ -374,7 +374,7 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     ctx->next_cur_index  = (ctx->cur_index - 1) & 15;
 
     prepare_avpic(ctx, &ctx->flipped_ptrs[ctx->cur_index],
-                  (AVPicture*) &ctx->buf_ptrs[ctx->cur_index]);
+                  &ctx->buf_ptrs[ctx->cur_index]);
 
     ff_thread_finish_setup(avctx);
 
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 01811975c6..609939ded7 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -13,3 +13,4 @@ OBJS-$(CONFIG_FFT)                        += mips/fft_init_table.o
 MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
 MIPSFPU-OBJS                              += mips/fmtconvert_mips.o
 OBJS-$(CONFIG_AC3DSP)                     += mips/ac3dsp_mips.o
+OBJS-$(CONFIG_AAC_DECODER)                += mips/aacdec_mips.o
diff --git a/libavcodec/mips/aacdec_mips.c b/libavcodec/mips/aacdec_mips.c
new file mode 100644
index 0000000000..e4033668da
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#include "libavcodec/aac.h"
+#include "aacdec_mips.h"
+#include "libavcodec/aactab.h"
+#include "libavcodec/sinewin.h"
+
+#if HAVE_INLINE_ASM
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
+    return v.s;
+}
+
+static void imdct_and_windowing_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    float *in    = sce->coeffs;
+    float *out   = sce->ret;
+    float *saved = sce->saved;
+    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
+    float *buf  = ac->buf_mdct;
+    int i;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 1024; i += 128)
+            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
+    } else
+        ac->mdct.imdct_half(&ac->mdct, buf, in);
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+    } else {
+        {
+            float *buf1 = saved;
+            float *buf2 = out;
+            int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+            int loop_end;
+
+            /* copy 448 floats (1792 bytes) from saved to out; loop unrolled 8 times */
+            __asm__ volatile (
+                ".set push                               \n\t"
+                ".set noreorder                          \n\t"
+                "addiu   %[loop_end], %[src],      1792  \n\t"
+            "1:                                          \n\t"
+                "lw      %[temp0],    0(%[src])          \n\t"
+                "lw      %[temp1],    4(%[src])          \n\t"
+                "lw      %[temp2],    8(%[src])          \n\t"
+                "lw      %[temp3],    12(%[src])         \n\t"
+                "lw      %[temp4],    16(%[src])         \n\t"
+                "lw      %[temp5],    20(%[src])         \n\t"
+                "lw      %[temp6],    24(%[src])         \n\t"
+                "lw      %[temp7],    28(%[src])         \n\t"
+                "addiu   %[src],      %[src],      32    \n\t"
+                "sw      %[temp0],    0(%[dst])          \n\t"
+                "sw      %[temp1],    4(%[dst])          \n\t"
+                "sw      %[temp2],    8(%[dst])          \n\t"
+                "sw      %[temp3],    12(%[dst])         \n\t"
+                "sw      %[temp4],    16(%[dst])         \n\t"
+                "sw      %[temp5],    20(%[dst])         \n\t"
+                "sw      %[temp6],    24(%[dst])         \n\t"
+                "sw      %[temp7],    28(%[dst])         \n\t"
+                "bne     %[src],      %[loop_end], 1b    \n\t"
+                " addiu  %[dst],      %[dst],      32    \n\t"
+                ".set pop                                \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
+                  [dst]"+r"(buf2)
+                :
+                : "memory"
+            );
+        }
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            {
+                float wi;
+                float wj;
+                int i;
+                float temp0, temp1, temp2, temp3;
+                float *dst0 = out + 448 + 0*128;
+                float *dst1 = dst0 + 64 + 63;
+                float *dst2 = saved + 63;
+                float *win0 = (float*)swindow;
+                float *win1 = win0 + 64 + 63;
+                float *win0_prev = (float*)swindow_prev;
+                float *win1_prev = win0_prev + 64 + 63;
+                float *src0_prev = saved + 448;
+                float *src1_prev = buf + 0*128 + 63;
+                float *src0 = buf + 0*128 + 64;
+                float *src1 = buf + 1*128 + 63;
+
+                for(i = 0; i < 64; i++)
+                {
+                    temp0 = src0_prev[0];
+                    temp1 = src1_prev[0];
+                    wi = *win0_prev;
+                    wj = *win1_prev;
+                    temp2 = src0[0];
+                    temp3 = src1[0];
+                    dst0[0] = temp0 * wj - temp1 * wi;
+                    dst1[0] = temp0 * wi + temp1 * wj;
+
+                    wi = *win0;
+                    wj = *win1;
+
+                    temp0 = src0[128];
+                    temp1 = src1[128];
+                    dst0[128] = temp2 * wj - temp3 * wi;
+                    dst1[128] = temp2 * wi + temp3 * wj;
+
+                    temp2 = src0[256];
+                    temp3 = src1[256];
+                    dst0[256] = temp0 * wj - temp1 * wi;
+                    dst1[256] = temp0 * wi + temp1 * wj;
+                    dst0[384] = temp2 * wj - temp3 * wi;
+                    dst1[384] = temp2 * wi + temp3 * wj;
+
+                    temp0 = src0[384];
+                    temp1 = src1[384];
+                    dst0[512] = temp0 * wj - temp1 * wi;
+                    dst2[0] = temp0 * wi + temp1 * wj;
+
+                    src0++;
+                    src1--;
+                    src0_prev++;
+                    src1_prev--;
+                    win0++;
+                    win1--;
+                    win0_prev++;
+                    win1_prev--;
+                    dst0++;
+                    dst1--;
+                    dst2--;
+                }
+            }
+        } else {
+            ac->fdsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            {
+                float *buf1 = buf + 64;
+                float *buf2 = out + 576;
+                int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+                int loop_end;
+
+                /* copy 448 floats (1792 bytes) from buf + 64 to out + 576; loop unrolled 8 times */
+                __asm__ volatile (
+                    ".set push                               \n\t"
+                    ".set noreorder                          \n\t"
+                    "addiu   %[loop_end], %[src],      1792  \n\t"
+                "1:                                          \n\t"
+                    "lw      %[temp0],    0(%[src])          \n\t"
+                    "lw      %[temp1],    4(%[src])          \n\t"
+                    "lw      %[temp2],    8(%[src])          \n\t"
+                    "lw      %[temp3],    12(%[src])         \n\t"
+                    "lw      %[temp4],    16(%[src])         \n\t"
+                    "lw      %[temp5],    20(%[src])         \n\t"
+                    "lw      %[temp6],    24(%[src])         \n\t"
+                    "lw      %[temp7],    28(%[src])         \n\t"
+                    "addiu   %[src],      %[src],      32    \n\t"
+                    "sw      %[temp0],    0(%[dst])          \n\t"
+                    "sw      %[temp1],    4(%[dst])          \n\t"
+                    "sw      %[temp2],    8(%[dst])          \n\t"
+                    "sw      %[temp3],    12(%[dst])         \n\t"
+                    "sw      %[temp4],    16(%[dst])         \n\t"
+                    "sw      %[temp5],    20(%[dst])         \n\t"
+                    "sw      %[temp6],    24(%[dst])         \n\t"
+                    "sw      %[temp7],    28(%[dst])         \n\t"
+                    "bne     %[src],      %[loop_end], 1b    \n\t"
+                    " addiu  %[dst],      %[dst],      32    \n\t"
+                    ".set pop                                \n\t"
+
+                    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                      [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
+                      [dst]"+r"(buf2)
+                    :
+                    : "memory"
+                );
+            }
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        ac->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        {
+            float *buf1 = buf + 7*128 + 64;
+            float *buf2 = saved + 448;
+            int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+            int loop_end;
+
+            /* copy 64 floats (256 bytes) from buf + 7*128 + 64 to saved + 448; loop unrolled 8 times */
+            __asm__ volatile (
+                ".set push                                \n\t"
+                ".set noreorder                           \n\t"
+                "addiu   %[loop_end], %[src],       256   \n\t"
+            "1:                                           \n\t"
+                "lw      %[temp0],    0(%[src])           \n\t"
+                "lw      %[temp1],    4(%[src])           \n\t"
+                "lw      %[temp2],    8(%[src])           \n\t"
+                "lw      %[temp3],    12(%[src])          \n\t"
+                "lw      %[temp4],    16(%[src])          \n\t"
+                "lw      %[temp5],    20(%[src])          \n\t"
+                "lw      %[temp6],    24(%[src])          \n\t"
+                "lw      %[temp7],    28(%[src])          \n\t"
+                "addiu   %[src],      %[src],       32    \n\t"
+                "sw      %[temp0],    0(%[dst])           \n\t"
+                "sw      %[temp1],    4(%[dst])           \n\t"
+                "sw      %[temp2],    8(%[dst])           \n\t"
+                "sw      %[temp3],    12(%[dst])          \n\t"
+                "sw      %[temp4],    16(%[dst])          \n\t"
+                "sw      %[temp5],    20(%[dst])          \n\t"
+                "sw      %[temp6],    24(%[dst])          \n\t"
+                "sw      %[temp7],    28(%[dst])          \n\t"
+                "bne     %[src],      %[loop_end],  1b    \n\t"
+                " addiu  %[dst],      %[dst],       32    \n\t"
+                ".set pop                                 \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
+                  [dst]"+r"(buf2)
+                :
+                : "memory"
+            );
+        }
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float *buf1 = buf + 512;
+        float *buf2 = saved;
+        int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+        int loop_end;
+
+        /* copy 448 floats (1792 bytes) from buf + 512 to saved; loop unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                \n\t"
+            ".set noreorder                           \n\t"
+            "addiu   %[loop_end], %[src],       1792  \n\t"
+        "1:                                           \n\t"
+            "lw      %[temp0],    0(%[src])           \n\t"
+            "lw      %[temp1],    4(%[src])           \n\t"
+            "lw      %[temp2],    8(%[src])           \n\t"
+            "lw      %[temp3],    12(%[src])          \n\t"
+            "lw      %[temp4],    16(%[src])          \n\t"
+            "lw      %[temp5],    20(%[src])          \n\t"
+            "lw      %[temp6],    24(%[src])          \n\t"
+            "lw      %[temp7],    28(%[src])          \n\t"
+            "addiu   %[src],      %[src],       32    \n\t"
+            "sw      %[temp0],    0(%[dst])           \n\t"
+            "sw      %[temp1],    4(%[dst])           \n\t"
+            "sw      %[temp2],    8(%[dst])           \n\t"
+            "sw      %[temp3],    12(%[dst])          \n\t"
+            "sw      %[temp4],    16(%[dst])          \n\t"
+            "sw      %[temp5],    20(%[dst])          \n\t"
+            "sw      %[temp6],    24(%[dst])          \n\t"
+            "sw      %[temp7],    28(%[dst])          \n\t"
+            "bne     %[src],      %[loop_end],  1b    \n\t"
+            " addiu  %[dst],      %[dst],       32    \n\t"
+            ".set pop                                 \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
+              [dst]"+r"(buf2)
+            :
+            : "memory"
+        );
+        {
+            float *buf1 = buf + 7*128 + 64;
+            float *buf2 = saved + 448;
+            int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+            int loop_end;
+
+            /* copy 64 floats (256 bytes) from buf + 7*128 + 64 to saved + 448; loop unrolled 8 times */
+            __asm__ volatile (
+                ".set push                                 \n\t"
+                ".set noreorder                            \n\t"
+                "addiu   %[loop_end], %[src],        256   \n\t"
+            "1:                                            \n\t"
+                "lw      %[temp0],    0(%[src])            \n\t"
+                "lw      %[temp1],    4(%[src])            \n\t"
+                "lw      %[temp2],    8(%[src])            \n\t"
+                "lw      %[temp3],    12(%[src])           \n\t"
+                "lw      %[temp4],    16(%[src])           \n\t"
+                "lw      %[temp5],    20(%[src])           \n\t"
+                "lw      %[temp6],    24(%[src])           \n\t"
+                "lw      %[temp7],    28(%[src])           \n\t"
+                "addiu   %[src],      %[src],        32    \n\t"
+                "sw      %[temp0],    0(%[dst])            \n\t"
+                "sw      %[temp1],    4(%[dst])            \n\t"
+                "sw      %[temp2],    8(%[dst])            \n\t"
+                "sw      %[temp3],    12(%[dst])           \n\t"
+                "sw      %[temp4],    16(%[dst])           \n\t"
+                "sw      %[temp5],    20(%[dst])           \n\t"
+                "sw      %[temp6],    24(%[dst])           \n\t"
+                "sw      %[temp7],    28(%[dst])           \n\t"
+                "bne     %[src],      %[loop_end],   1b    \n\t"
+                " addiu  %[dst],      %[dst],        32    \n\t"
+                ".set pop                                  \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
+                  [dst]"+r"(buf2)
+                :
+                : "memory"
+            );
+        }
+    } else { // LONG_STOP or ONLY_LONG
+        float *buf1 = buf + 512;
+        float *buf2 = saved;
+        int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+        int loop_end;
+
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                 \n\t"
+            ".set noreorder                            \n\t"
+            "addiu   %[loop_end], %[src],        2048  \n\t"
+        "1:                                            \n\t"
+            "lw      %[temp0],    0(%[src])            \n\t"
+            "lw      %[temp1],    4(%[src])            \n\t"
+            "lw      %[temp2],    8(%[src])            \n\t"
+            "lw      %[temp3],    12(%[src])           \n\t"
+            "lw      %[temp4],    16(%[src])           \n\t"
+            "lw      %[temp5],    20(%[src])           \n\t"
+            "lw      %[temp6],    24(%[src])           \n\t"
+            "lw      %[temp7],    28(%[src])           \n\t"
+            "addiu   %[src],      %[src],        32    \n\t"
+            "sw      %[temp0],    0(%[dst])            \n\t"
+            "sw      %[temp1],    4(%[dst])            \n\t"
+            "sw      %[temp2],    8(%[dst])            \n\t"
+            "sw      %[temp3],    12(%[dst])           \n\t"
+            "sw      %[temp4],    16(%[dst])           \n\t"
+            "sw      %[temp5],    20(%[dst])           \n\t"
+            "sw      %[temp6],    24(%[dst])           \n\t"
+            "sw      %[temp7],    28(%[dst])           \n\t"
+            "bne     %[src],      %[loop_end],   1b    \n\t"
+            " addiu  %[dst],      %[dst],        32    \n\t"
+            ".set pop                                  \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [loop_end]"=&r"(loop_end), [src]"+r"(buf1),
+              [dst]"+r"(buf2)
+            :
+            : "memory"
+        );
+    }
+}
+
+static void apply_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    const uint16_t *offsets = sce->ics.swb_offset;
+    int i, sfb;
+    int j, k; /* zero fill below: j groups of four floats, then k single floats */
+    /* Builds a prediction from sce->ltp_state in sce->ret, runs it through ac->windowing_and_mdct_ltp and adds it to sce->coeffs. */
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) { /* nothing is done for eight-short windows */
+        float *predTime = sce->ret;      /* input of windowing_and_mdct_ltp */
+        float *predFreq = ac->buf_mdct;  /* output of windowing_and_mdct_ltp */
+        float *p_predTime;               /* write cursor of the zero fill */
+        int16_t num_samples = 2048;
+        /* Only num_samples depends on the lag test; the fill bookkeeping after it always runs. */
+        if (ltp->lag < 1024)
+            num_samples = ltp->lag + 1024;
+        j = (2048 - num_samples) >> 2;
+        k = (2048 - num_samples) & 3;
+        p_predTime = &predTime[num_samples];
+
+        for (i = 0; i < num_samples; i++)
+            predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
+        for (i = 0; i < j; i++) {
+            /* clear predTime from index num_samples upwards, 16 bytes per pass */
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                "sw      $0,              4(%[p_predTime])        \n\t"
+                "sw      $0,              8(%[p_predTime])        \n\t"
+                "sw      $0,              12(%[p_predTime])       \n\t"
+                "addiu   %[p_predTime],   %[p_predTime],     16   \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+        for (i = 0; i < k; i++) {
+            /* clear the remaining (2048 - num_samples) & 3 floats one word at a time */
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                "addiu   %[p_predTime],   %[p_predTime],     4    \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+
+        if (sce->tns.present)
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
+        /* add the prediction only in the bands flagged in ltp->used */
+        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
+            if (ltp->used[sfb])
+                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
+                    sce->coeffs[i] += predFreq[i];
+    }
+}
+
+#if HAVE_MIPSFPU
+static void update_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    float *saved     = sce->saved;
+    float *saved_ltp = sce->coeffs;
+    const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    int i;
+    int loop_end, loop_end1, loop_end2;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11;
+    /* Fills saved_ltp (sce->coeffs) according to the window sequence, then shifts sce->ltp_state and appends sce->ret and saved_ltp. */
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        float *buf = saved;
+        float *buf0 = saved_ltp;
+        float *p_saved_ltp = saved_ltp + 576;
+        float *ptr1 = &saved_ltp[512];
+        float *ptr2 = &ac->buf_mdct[1023];
+        float *ptr3 = (float*)&swindow[63];
+        float *zero_end = p_saved_ltp + 448; /* end of the zero fill; kept as a pointer instead of being cast to int */
+
+        /* copy 2048 bytes from saved to saved_ltp; loop unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                     \n\t"
+            ".set noreorder                                \n\t"
+            "addiu   %[loop_end],   %[src],         2048   \n\t"
+        "1:                                                \n\t"
+            "lw      %[temp0],      0(%[src])              \n\t"
+            "lw      %[temp1],      4(%[src])              \n\t"
+            "lw      %[temp2],      8(%[src])              \n\t"
+            "lw      %[temp3],      12(%[src])             \n\t"
+            "lw      %[temp4],      16(%[src])             \n\t"
+            "lw      %[temp5],      20(%[src])             \n\t"
+            "lw      %[temp6],      24(%[src])             \n\t"
+            "lw      %[temp7],      28(%[src])             \n\t"
+            "addiu   %[src],        %[src],         32     \n\t"
+            "sw      %[temp0],      0(%[dst])              \n\t"
+            "sw      %[temp1],      4(%[dst])              \n\t"
+            "sw      %[temp2],      8(%[dst])              \n\t"
+            "sw      %[temp3],      12(%[dst])             \n\t"
+            "sw      %[temp4],      16(%[dst])             \n\t"
+            "sw      %[temp5],      20(%[dst])             \n\t"
+            "sw      %[temp6],      24(%[dst])             \n\t"
+            "sw      %[temp7],      28(%[dst])             \n\t"
+            "bne     %[src],        %[loop_end],    1b     \n\t"
+            " addiu  %[dst],        %[dst],         32     \n\t"
+            ".set pop                                      \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [loop_end]"=&r"(loop_end), [src]"+r"(buf),
+              [dst]"+r"(buf0)
+            :
+            : "memory"
+        );
+
+        /* zero saved_ltp from element 576 up to zero_end; loop unrolled 8 times */
+        __asm__ volatile (
+        "1:                                                   \n\t"
+            "sw     $0,              0(%[p_saved_ltp])        \n\t"
+            "sw     $0,              4(%[p_saved_ltp])        \n\t"
+            "sw     $0,              8(%[p_saved_ltp])        \n\t"
+            "sw     $0,              12(%[p_saved_ltp])       \n\t"
+            "sw     $0,              16(%[p_saved_ltp])       \n\t"
+            "sw     $0,              20(%[p_saved_ltp])       \n\t"
+            "sw     $0,              24(%[p_saved_ltp])       \n\t"
+            "sw     $0,              28(%[p_saved_ltp])       \n\t"
+            "addiu  %[p_saved_ltp],  %[p_saved_ltp],     32   \n\t"
+            "bne    %[p_saved_ltp],  %[loop_end1],       1b   \n\t"
+
+            : [p_saved_ltp]"+r"(p_saved_ltp)
+            : [loop_end1]"r"(zero_end)
+            : "memory"
+        );
+
+        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        for (i = 0; i < 16; i++){
+            /* saved_ltp[512 + n] = buf_mdct[1023 - n] * swindow[63 - n] for n = 0..63; loop unrolled 4 times */
+            __asm__ volatile (
+                "lwc1    %[temp0],    0(%[ptr2])                \n\t"
+                "lwc1    %[temp1],    -4(%[ptr2])               \n\t"
+                "lwc1    %[temp2],    -8(%[ptr2])               \n\t"
+                "lwc1    %[temp3],    -12(%[ptr2])              \n\t"
+                "lwc1    %[temp4],    0(%[ptr3])                \n\t"
+                "lwc1    %[temp5],    -4(%[ptr3])               \n\t"
+                "lwc1    %[temp6],    -8(%[ptr3])               \n\t"
+                "lwc1    %[temp7],    -12(%[ptr3])              \n\t"
+                "mul.s   %[temp8],    %[temp0],     %[temp4]    \n\t"
+                "mul.s   %[temp9],    %[temp1],     %[temp5]    \n\t"
+                "mul.s   %[temp10],   %[temp2],     %[temp6]    \n\t"
+                "mul.s   %[temp11],   %[temp3],     %[temp7]    \n\t"
+                "swc1    %[temp8],    0(%[ptr1])                \n\t"
+                "swc1    %[temp9],    4(%[ptr1])                \n\t"
+                "swc1    %[temp10],   8(%[ptr1])                \n\t"
+                "swc1    %[temp11],   12(%[ptr1])               \n\t"
+                "addiu   %[ptr1],     %[ptr1],      16          \n\t"
+                "addiu   %[ptr2],     %[ptr2],      -16         \n\t"
+                "addiu   %[ptr3],     %[ptr3],      -16         \n\t"
+
+                : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+                  [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+                  [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+                  [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+                  [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+                  [ptr1]"+r"(ptr1), [ptr2]"+r"(ptr2), [ptr3]"+r"(ptr3)
+                :
+                : "memory"
+            );
+        }
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float *buff0 = saved;
+        float *buff1 = saved_ltp;
+        float *ptr1 = &saved_ltp[512];
+        float *ptr2 = &ac->buf_mdct[1023];
+        float *ptr3 = (float*)&swindow[63];
+        float *copy_end = saved + 448; /* end of the copy source; kept as a pointer instead of being cast to int */
+
+        /* copy saved up to copy_end into saved_ltp and zero the words 2304 bytes further on; loop unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+        "1:                                             \n\t"
+            "lw      %[temp0],    0(%[src])             \n\t"
+            "lw      %[temp1],    4(%[src])             \n\t"
+            "lw      %[temp2],    8(%[src])             \n\t"
+            "lw      %[temp3],    12(%[src])            \n\t"
+            "lw      %[temp4],    16(%[src])            \n\t"
+            "lw      %[temp5],    20(%[src])            \n\t"
+            "lw      %[temp6],    24(%[src])            \n\t"
+            "lw      %[temp7],    28(%[src])            \n\t"
+            "addiu   %[src],      %[src],         32    \n\t"
+            "sw      %[temp0],    0(%[dst])             \n\t"
+            "sw      %[temp1],    4(%[dst])             \n\t"
+            "sw      %[temp2],    8(%[dst])             \n\t"
+            "sw      %[temp3],    12(%[dst])            \n\t"
+            "sw      %[temp4],    16(%[dst])            \n\t"
+            "sw      %[temp5],    20(%[dst])            \n\t"
+            "sw      %[temp6],    24(%[dst])            \n\t"
+            "sw      %[temp7],    28(%[dst])            \n\t"
+            "sw      $0,          2304(%[dst])          \n\t"
+            "sw      $0,          2308(%[dst])          \n\t"
+            "sw      $0,          2312(%[dst])          \n\t"
+            "sw      $0,          2316(%[dst])          \n\t"
+            "sw      $0,          2320(%[dst])          \n\t"
+            "sw      $0,          2324(%[dst])          \n\t"
+            "sw      $0,          2328(%[dst])          \n\t"
+            "sw      $0,          2332(%[dst])          \n\t"
+            "bne     %[src],      %[loop_end],    1b    \n\t"
+            " addiu  %[dst],      %[dst],         32    \n\t"
+            ".set pop                                   \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [src]"+r"(buff0), [dst]"+r"(buff1)
+            : [loop_end]"r"(copy_end)
+            : "memory"
+        );
+        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        for (i = 0; i < 16; i++){
+            /* saved_ltp[512 + n] = buf_mdct[1023 - n] * swindow[63 - n] for n = 0..63; loop unrolled 4 times */
+            __asm__ volatile (
+                "lwc1    %[temp0],    0(%[ptr2])                \n\t"
+                "lwc1    %[temp1],    -4(%[ptr2])               \n\t"
+                "lwc1    %[temp2],    -8(%[ptr2])               \n\t"
+                "lwc1    %[temp3],    -12(%[ptr2])              \n\t"
+                "lwc1    %[temp4],    0(%[ptr3])                \n\t"
+                "lwc1    %[temp5],    -4(%[ptr3])               \n\t"
+                "lwc1    %[temp6],    -8(%[ptr3])               \n\t"
+                "lwc1    %[temp7],    -12(%[ptr3])              \n\t"
+                "mul.s   %[temp8],    %[temp0],     %[temp4]    \n\t"
+                "mul.s   %[temp9],    %[temp1],     %[temp5]    \n\t"
+                "mul.s   %[temp10],   %[temp2],     %[temp6]    \n\t"
+                "mul.s   %[temp11],   %[temp3],     %[temp7]    \n\t"
+                "swc1    %[temp8],    0(%[ptr1])                \n\t"
+                "swc1    %[temp9],    4(%[ptr1])                \n\t"
+                "swc1    %[temp10],   8(%[ptr1])                \n\t"
+                "swc1    %[temp11],   12(%[ptr1])               \n\t"
+                "addiu   %[ptr1],     %[ptr1],      16          \n\t"
+                "addiu   %[ptr2],     %[ptr2],      -16         \n\t"
+                "addiu   %[ptr3],     %[ptr3],      -16         \n\t"
+
+                : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+                  [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+                  [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+                  [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+                  [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+                  [ptr1]"+r"(ptr1), [ptr2]"+r"(ptr2), [ptr3]"+r"(ptr3)
+                :
+                : "memory"
+            );
+        }
+    } else { // LONG_STOP or ONLY_LONG
+        float *ptr1, *ptr2, *ptr3;
+        ac->fdsp.vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
+
+        ptr1 = &saved_ltp[512];
+        ptr2 = &ac->buf_mdct[1023];
+        ptr3 = (float*)&lwindow[511];
+
+        for (i = 0; i < 512; i+=4){
+            /* saved_ltp[512 + n] = buf_mdct[1023 - n] * lwindow[511 - n] for n = 0..511; loop unrolled 4 times */
+            __asm__ volatile (
+                "lwc1    %[temp0],    0(%[ptr2])                \n\t"
+                "lwc1    %[temp1],    -4(%[ptr2])               \n\t"
+                "lwc1    %[temp2],    -8(%[ptr2])               \n\t"
+                "lwc1    %[temp3],    -12(%[ptr2])              \n\t"
+                "lwc1    %[temp4],    0(%[ptr3])                \n\t"
+                "lwc1    %[temp5],    -4(%[ptr3])               \n\t"
+                "lwc1    %[temp6],    -8(%[ptr3])               \n\t"
+                "lwc1    %[temp7],    -12(%[ptr3])              \n\t"
+                "mul.s   %[temp8],    %[temp0],     %[temp4]    \n\t"
+                "mul.s   %[temp9],    %[temp1],     %[temp5]    \n\t"
+                "mul.s   %[temp10],   %[temp2],     %[temp6]    \n\t"
+                "mul.s   %[temp11],   %[temp3],     %[temp7]    \n\t"
+                "swc1    %[temp8],    0(%[ptr1])                \n\t"
+                "swc1    %[temp9],    4(%[ptr1])                \n\t"
+                "swc1    %[temp10],   8(%[ptr1])                \n\t"
+                "swc1    %[temp11],   12(%[ptr1])               \n\t"
+                "addiu   %[ptr1],     %[ptr1],      16          \n\t"
+                "addiu   %[ptr2],     %[ptr2],      -16         \n\t"
+                "addiu   %[ptr3],     %[ptr3],      -16         \n\t"
+
+                : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+                  [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+                  [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+                  [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+                  [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+                  [ptr1]"+r"(ptr1), [ptr2]"+r"(ptr2),
+                  [ptr3]"+r"(ptr3)
+                :
+                : "memory"
+            );
+        }
+    }
+
+    {
+        float *buf1 = sce->ltp_state+1024;
+        float *buf2 = sce->ltp_state;
+        float *buf3 = sce->ret;
+        float *buf4 = sce->ltp_state+1024;
+        float *buf5 = saved_ltp;
+        float *buf6 = sce->ltp_state+2048;
+
+        /* three 4096-byte copies in order: buf1 -> buf2, buf3 -> buf4, buf5 -> buf6; loops unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                    \n\t"
+            ".set noreorder                               \n\t"
+            "addiu   %[loop_end],   %[src],         4096  \n\t"
+            "addiu   %[loop_end1],  %[src1],        4096  \n\t"
+            "addiu   %[loop_end2],  %[src2],        4096  \n\t"
+        "1:                                               \n\t"
+            "lw      %[temp0],      0(%[src])             \n\t"
+            "lw      %[temp1],      4(%[src])             \n\t"
+            "lw      %[temp2],      8(%[src])             \n\t"
+            "lw      %[temp3],      12(%[src])            \n\t"
+            "lw      %[temp4],      16(%[src])            \n\t"
+            "lw      %[temp5],      20(%[src])            \n\t"
+            "lw      %[temp6],      24(%[src])            \n\t"
+            "lw      %[temp7],      28(%[src])            \n\t"
+            "addiu   %[src],        %[src],         32    \n\t"
+            "sw      %[temp0],      0(%[dst])             \n\t"
+            "sw      %[temp1],      4(%[dst])             \n\t"
+            "sw      %[temp2],      8(%[dst])             \n\t"
+            "sw      %[temp3],      12(%[dst])            \n\t"
+            "sw      %[temp4],      16(%[dst])            \n\t"
+            "sw      %[temp5],      20(%[dst])            \n\t"
+            "sw      %[temp6],      24(%[dst])            \n\t"
+            "sw      %[temp7],      28(%[dst])            \n\t"
+            "bne     %[src],        %[loop_end],    1b    \n\t"
+            " addiu  %[dst],        %[dst],         32    \n\t"
+        "2:                                               \n\t"
+            "lw      %[temp0],      0(%[src1])            \n\t"
+            "lw      %[temp1],      4(%[src1])            \n\t"
+            "lw      %[temp2],      8(%[src1])            \n\t"
+            "lw      %[temp3],      12(%[src1])           \n\t"
+            "lw      %[temp4],      16(%[src1])           \n\t"
+            "lw      %[temp5],      20(%[src1])           \n\t"
+            "lw      %[temp6],      24(%[src1])           \n\t"
+            "lw      %[temp7],      28(%[src1])           \n\t"
+            "addiu   %[src1],       %[src1],        32    \n\t"
+            "sw      %[temp0],      0(%[dst1])            \n\t"
+            "sw      %[temp1],      4(%[dst1])            \n\t"
+            "sw      %[temp2],      8(%[dst1])            \n\t"
+            "sw      %[temp3],      12(%[dst1])           \n\t"
+            "sw      %[temp4],      16(%[dst1])           \n\t"
+            "sw      %[temp5],      20(%[dst1])           \n\t"
+            "sw      %[temp6],      24(%[dst1])           \n\t"
+            "sw      %[temp7],      28(%[dst1])           \n\t"
+            "bne     %[src1],       %[loop_end1],   2b    \n\t"
+            " addiu  %[dst1],       %[dst1],        32    \n\t"
+        "3:                                               \n\t"
+            "lw      %[temp0],      0(%[src2])            \n\t"
+            "lw      %[temp1],      4(%[src2])            \n\t"
+            "lw      %[temp2],      8(%[src2])            \n\t"
+            "lw      %[temp3],      12(%[src2])           \n\t"
+            "lw      %[temp4],      16(%[src2])           \n\t"
+            "lw      %[temp5],      20(%[src2])           \n\t"
+            "lw      %[temp6],      24(%[src2])           \n\t"
+            "lw      %[temp7],      28(%[src2])           \n\t"
+            "addiu   %[src2],       %[src2],        32    \n\t"
+            "sw      %[temp0],      0(%[dst2])            \n\t"
+            "sw      %[temp1],      4(%[dst2])            \n\t"
+            "sw      %[temp2],      8(%[dst2])            \n\t"
+            "sw      %[temp3],      12(%[dst2])           \n\t"
+            "sw      %[temp4],      16(%[dst2])           \n\t"
+            "sw      %[temp5],      20(%[dst2])           \n\t"
+            "sw      %[temp6],      24(%[dst2])           \n\t"
+            "sw      %[temp7],      28(%[dst2])           \n\t"
+            "bne     %[src2],       %[loop_end2],   3b    \n\t"
+            " addiu  %[dst2],       %[dst2],        32    \n\t"
+            ".set pop                                     \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [loop_end]"=&r"(loop_end), [loop_end1]"=&r"(loop_end1),
+              [loop_end2]"=&r"(loop_end2), [src]"+r"(buf1),
+              [dst]"+r"(buf2), [src1]"+r"(buf3), [dst1]"+r"(buf4),
+              [src2]"+r"(buf5), [dst2]"+r"(buf6)
+            :
+            : "memory"
+        );
+    }
+}
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+/* Point the AACContext decoder hooks at the MIPS routines that this build configuration enables. */
+void ff_aacdec_init_mips(AACContext *c)
+{
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSFPU
+    c->update_ltp = update_ltp_mips;
+#endif /* HAVE_MIPSFPU */
+    c->apply_ltp = apply_ltp_mips;
+    c->imdct_and_windowing = imdct_and_windowing_mips;
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacdec_mips.h b/libavcodec/mips/aacdec_mips.h
new file mode 100644
index 0000000000..9ba307962f
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * AAC decoding functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#ifndef AVCODEC_MIPS_AACDEC_FLOAT_H
+#define AVCODEC_MIPS_AACDEC_FLOAT_H
+
+#include "libavcodec/aac.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+static inline float *VMUL2_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    float temp0, temp1, temp2;
+    int temp3, temp4;
+    float *ret;
+    /* Stores v[idx & 15] * *scale and v[(idx >> 4) & 15] * *scale at dst[0..1]; returns dst advanced by 8 bytes. */
+    __asm__ volatile(
+        "andi    %[temp3],  %[idx],       15           \n\t" /* first index: idx bits 0-3 */
+        "ext     %[temp4],  %[idx],       4,      4    \n\t" /* second index: idx bits 4-7 */
+        "sll     %[temp3],  %[temp3],     2            \n\t" /* indices -> byte offsets (x4) */
+        "sll     %[temp4],  %[temp4],     2            \n\t"
+        "lwc1    %[temp2],  0(%[scale])                \n\t" /* temp2 = *scale */
+        "lwxc1   %[temp0],  %[temp3](%[v])             \n\t" /* indexed loads from v */
+        "lwxc1   %[temp1],  %[temp4](%[v])             \n\t"
+        "mul.s   %[temp0],  %[temp0],     %[temp2]     \n\t"
+        "mul.s   %[temp1],  %[temp1],     %[temp2]     \n\t"
+        "addiu   %[ret],    %[dst],       8            \n\t" /* return value: dst + 8 bytes */
+        "swc1    %[temp0],  0(%[dst])                  \n\t"
+        "swc1    %[temp1],  4(%[dst])                  \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    int temp0, temp1, temp2, temp3;
+    float temp4, temp5, temp6, temp7, temp8;
+    float *ret;
+    /* Stores v[(idx >> 2*n) & 3] * *scale at dst[n] for n = 0..3; returns dst advanced by 16 bytes. */
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       3           \n\t" /* index 0: idx bits 0-1 */
+        "ext     %[temp1],  %[idx],       2,      2   \n\t" /* index 1: idx bits 2-3 */
+        "ext     %[temp2],  %[idx],       4,      2   \n\t" /* index 2: idx bits 4-5 */
+        "ext     %[temp3],  %[idx],       6,      2   \n\t" /* index 3: idx bits 6-7 */
+        "sll     %[temp0],  %[temp0],     2           \n\t" /* indices -> byte offsets (x4) */
+        "sll     %[temp1],  %[temp1],     2           \n\t"
+        "sll     %[temp2],  %[temp2],     2           \n\t"
+        "sll     %[temp3],  %[temp3],     2           \n\t"
+        "lwc1    %[temp4],  0(%[scale])               \n\t" /* temp4 = *scale */
+        "lwxc1   %[temp5],  %[temp0](%[v])            \n\t" /* indexed loads from v */
+        "lwxc1   %[temp6],  %[temp1](%[v])            \n\t"
+        "lwxc1   %[temp7],  %[temp2](%[v])            \n\t"
+        "lwxc1   %[temp8],  %[temp3](%[v])            \n\t"
+        "mul.s   %[temp5],  %[temp5],     %[temp4]    \n\t"
+        "mul.s   %[temp6],  %[temp6],     %[temp4]    \n\t"
+        "mul.s   %[temp7],  %[temp7],     %[temp4]    \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp4]    \n\t"
+        "addiu   %[ret],    %[dst],       16          \n\t" /* return value: dst + 16 bytes */
+        "swc1    %[temp5],  0(%[dst])                 \n\t"
+        "swc1    %[temp6],  4(%[dst])                 \n\t"
+        "swc1    %[temp7],  8(%[dst])                 \n\t"
+        "swc1    %[temp8],  12(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL2S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9;
+    float *ret;
+    /* Like a two-element scaled lookup, but bit 31 of the scale word is XORed with sign bit 1 (dst[0]) or sign bit 0 (dst[1]). */
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       15         \n\t" /* first index: idx bits 0-3 */
+        "ext     %[temp1],  %[idx],       4,     4   \n\t" /* second index: idx bits 4-7 */
+        "lw      %[temp4],  0(%[scale])              \n\t" /* raw 32-bit word of *scale */
+        "srl     %[temp2],  %[sign],      1          \n\t"
+        "sll     %[temp3],  %[sign],      31         \n\t" /* temp3 = sign << 31 */
+        "sll     %[temp2],  %[temp2],     31         \n\t" /* temp2 = (sign >> 1) << 31 */
+        "sll     %[temp0],  %[temp0],     2          \n\t" /* indices -> byte offsets (x4) */
+        "sll     %[temp1],  %[temp1],     2          \n\t"
+        "lwxc1   %[temp8],  %[temp0](%[v])           \n\t" /* indexed loads from v */
+        "lwxc1   %[temp9],  %[temp1](%[v])           \n\t"
+        "xor     %[temp5],  %[temp4],     %[temp2]   \n\t" /* scale word for dst[0] */
+        "xor     %[temp4],  %[temp4],     %[temp3]   \n\t" /* scale word for dst[1] */
+        "mtc1    %[temp5],  %[temp6]                 \n\t" /* move both words to FPU registers */
+        "mtc1    %[temp4],  %[temp7]                 \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp6]   \n\t"
+        "mul.s   %[temp9],  %[temp9],     %[temp7]   \n\t"
+        "addiu   %[ret],    %[dst],       8          \n\t" /* return value: dst + 8 bytes */
+        "swc1    %[temp8],  0(%[dst])                \n\t"
+        "swc1    %[temp9],  4(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+          [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [sign]"r"(sign)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4;
+    float temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+    float *ret;
+    unsigned int mask = 1U << 31;
+    /* Stores dst[k] = v[(idx >> 2*k) & 3] * s_k for k = 0..3, where s_k is *scale with bit 31 XORed by the current top bit of sign; sign is shifted left by idx bits 12, 13 and 14 between elements. Returns dst + 4. */
+    __asm__ volatile(
+        "lw      %[temp0],   0(%[scale])               \n\t" /* temp0 = bit pattern of *scale, loaded as an integer */
+        "and     %[temp1],   %[idx],       3           \n\t" /* temp1 = idx & 3 */
+        "ext     %[temp2],   %[idx],       2,      2   \n\t" /* temp2 = (idx >> 2) & 3 */
+        "ext     %[temp3],   %[idx],       4,      2   \n\t" /* temp3 = (idx >> 4) & 3 */
+        "ext     %[temp4],   %[idx],       6,      2   \n\t" /* temp4 = (idx >> 6) & 3 */
+        "sll     %[temp1],   %[temp1],     2           \n\t" /* element indices -> byte offsets (4-byte elements) */
+        "sll     %[temp2],   %[temp2],     2           \n\t"
+        "sll     %[temp3],   %[temp3],     2           \n\t"
+        "sll     %[temp4],   %[temp4],     2           \n\t"
+        "lwxc1   %[temp10],  %[temp1](%[v])            \n\t" /* temp10 = v[idx & 3] */
+        "lwxc1   %[temp11],  %[temp2](%[v])            \n\t" /* temp11 = v[(idx >> 2) & 3] */
+        "lwxc1   %[temp12],  %[temp3](%[v])            \n\t" /* temp12 = v[(idx >> 4) & 3] */
+        "lwxc1   %[temp13],  %[temp4](%[v])            \n\t" /* temp13 = v[(idx >> 6) & 3] */
+        "and     %[temp1],   %[sign],      %[mask]     \n\t" /* temp1 = sign & (1 << 31): sign mask for element 0 */
+        "ext     %[temp2],   %[idx],       12,     1   \n\t" /* temp2 = bit 12 of idx */
+        "ext     %[temp3],   %[idx],       13,     1   \n\t" /* temp3 = bit 13 of idx */
+        "ext     %[temp4],   %[idx],       14,     1   \n\t" /* temp4 = bit 14 of idx */
+        "sllv    %[sign],    %[sign],      %[temp2]    \n\t" /* sign <<= bit 12 of idx */
+        "xor     %[temp1],   %[temp0],     %[temp1]    \n\t" /* scale bits with element 0's sign mask applied */
+        "and     %[temp2],   %[sign],      %[mask]     \n\t" /* sign mask for element 1 */
+        "mtc1    %[temp1],   %[temp14]                 \n\t" /* adjusted scale for element 0 -> FP register */
+        "xor     %[temp2],   %[temp0],     %[temp2]    \n\t" /* scale bits with element 1's sign mask applied */
+        "sllv    %[sign],    %[sign],      %[temp3]    \n\t" /* sign <<= bit 13 of idx */
+        "mtc1    %[temp2],   %[temp15]                 \n\t" /* adjusted scale for element 1 -> FP register */
+        "and     %[temp3],   %[sign],      %[mask]     \n\t" /* sign mask for element 2 */
+        "sllv    %[sign],    %[sign],      %[temp4]    \n\t" /* sign <<= bit 14 of idx */
+        "xor     %[temp3],   %[temp0],     %[temp3]    \n\t" /* scale bits with element 2's sign mask applied */
+        "and     %[temp4],   %[sign],      %[mask]     \n\t" /* sign mask for element 3 */
+        "mtc1    %[temp3],   %[temp16]                 \n\t" /* adjusted scale for element 2 -> FP register */
+        "xor     %[temp4],   %[temp0],     %[temp4]    \n\t" /* scale bits with element 3's sign mask applied */
+        "mtc1    %[temp4],   %[temp17]                 \n\t" /* adjusted scale for element 3 -> FP register */
+        "mul.s   %[temp10],  %[temp10],    %[temp14]   \n\t" /* multiply each loaded value by its adjusted scale */
+        "mul.s   %[temp11],  %[temp11],    %[temp15]   \n\t"
+        "mul.s   %[temp12],  %[temp12],    %[temp16]   \n\t"
+        "mul.s   %[temp13],  %[temp13],    %[temp17]   \n\t"
+        "addiu   %[ret],     %[dst],       16          \n\t" /* ret = dst advanced by 16 bytes (four floats) */
+        "swc1    %[temp10],  0(%[dst])                 \n\t" /* dst[0..3] = the four products */
+        "swc1    %[temp11],  4(%[dst])                 \n\t"
+        "swc1    %[temp12],  8(%[dst])                 \n\t"
+        "swc1    %[temp13],  12(%[dst])                \n\t"
+        /* temp0-temp4 and ret are early-clobber integer scratch, temp10-temp17 early-clobber FP scratch. */
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp10]"=&f"(temp10),
+          [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+          [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+          [temp15]"=&f"(temp15), [temp16]"=&f"(temp16),
+          [temp17]"=&f"(temp17), [ret]"=&r"(ret),
+          [sign]"+r"(sign) /* read-write: the asm shifts the local copy of sign in place */
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [mask]"r"(mask)
+        : "memory" /* the asm loads through v/scale and stores through dst */
+    );
+    return ret;
+}
+
+#define VMUL2 VMUL2_mips /* the four VMUL* names expand to the _mips inline-asm versions defined in this header */
+#define VMUL4 VMUL4_mips
+#define VMUL2S VMUL2S_mips
+#define VMUL4S VMUL4S_mips
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+
+#endif /* AVCODEC_MIPS_AACDEC_FLOAT_H */
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index 2c16c9ad22..ce61cab15b 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -1092,7 +1092,6 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
     vard = s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16);
 
     pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = (vard+128)>>8;
-//    pic->mb_cmp_score[s->mb_stride * mb_y + mb_x] = dmin;
     c->mc_mb_var_sum_temp += (vard+128)>>8;
 
     if(mb_type){
@@ -1171,7 +1170,6 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             }
         }
 
-//        pic->mb_cmp_score[s->mb_stride * mb_y + mb_x] = dmin;
         set_p_mv_tables(s, mx, my, mb_type!=CANDIDATE_MB_TYPE_INTER4V);
 
         /* get intra luma score */
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 03f4f9ff1e..0675004948 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -94,10 +94,6 @@ struct MpegEncContext;
 typedef struct Picture{
     struct AVFrame f;
 
-    /**
-     * halfpel luma planes.
-     */
-    uint8_t *interpolated[3];
     int8_t *qscale_table_base;
     int16_t (*motion_val_base[2])[2];
     uint32_t *mb_type_base;
@@ -144,7 +140,6 @@ typedef struct Picture{
     uint16_t *mb_var;           ///< Table for MB variances
     uint16_t *mc_mb_var;        ///< Table for motion compensated MB variances
     uint8_t *mb_mean;           ///< Table for MB luminance
-    int32_t *mb_cmp_score;      ///< Table for MB cmp scores, for mb decision FIXME remove
     int b_frame_score;          /* */
     struct MpegEncContext *owner2; ///< pointer to the MpegEncContext that allocated this picture
     int needs_realloc;          ///< Picture needs to be reallocated (eg due to a frame size change)
@@ -729,10 +724,10 @@ typedef struct MpegEncContext {
     int context_reinit;
 } MpegEncContext;
 
-#define REBASE_PICTURE(pic, new_ctx, old_ctx) (pic ? \
-    (pic >= old_ctx->picture && pic < old_ctx->picture+old_ctx->picture_count ?\
-        &new_ctx->picture[pic - old_ctx->picture] : pic - (Picture*)old_ctx + (Picture*)new_ctx)\
-    : NULL)
+#define REBASE_PICTURE(pic, new_ctx, old_ctx) /* pic's counterpart in new_ctx */ \
+    ((pic && pic >= old_ctx->picture &&       /* pic is non-NULL and points */  \
+      pic < old_ctx->picture + old_ctx->picture_count) ? /* into old_ctx->picture[] */ \
+        &new_ctx->picture[pic - old_ctx->picture] : NULL) /* same index in new_ctx->picture[], otherwise NULL */
 
 /* mpegvideo_enc common options */
 #define FF_MPV_FLAG_SKIP_RD      0x0001
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 11607d22c8..87531d7cd9 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -2844,6 +2844,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         int16_t ac[6][16];
                         const int mvdir= (best_s.mv_dir&MV_DIR_BACKWARD) ? 1 : 0;
                         static const int dquant_tab[4]={-1,1,-2,2};
+                        int storecoefs = s->mb_intra && s->dc_val[0];
 
                         av_assert2(backup_s.dquant == 0);
 
@@ -2863,7 +2864,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             if(qp < s->avctx->qmin || qp > s->avctx->qmax)
                                 continue;
                             backup_s.dquant= dquant;
-                            if(s->mb_intra && s->dc_val[0]){
+                            if(storecoefs){
                                 for(i=0; i<6; i++){
                                     dc[i]= s->dc_val[0][ s->block_index[i] ];
                                     memcpy(ac[i], s->ac_val[0][s->block_index[i]], sizeof(int16_t)*16);
@@ -2873,7 +2874,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_INTER /* wrong but unused */, pb, pb2, tex_pb,
                                          &dmin, &next_block, s->mv[mvdir][0][0], s->mv[mvdir][0][1]);
                             if(best_s.qscale != qp){
-                                if(s->mb_intra && s->dc_val[0]){
+                                if(storecoefs){
                                     for(i=0; i<6; i++){
                                         s->dc_val[0][ s->block_index[i] ]= dc[i];
                                         memcpy(s->ac_val[0][s->block_index[i]], ac[i], sizeof(int16_t)*16);
diff --git a/libavcodec/msrledec.c b/libavcodec/msrledec.c
index 36a46b5978..e969994875 100644
--- a/libavcodec/msrledec.c
+++ b/libavcodec/msrledec.c
@@ -203,36 +203,40 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic,
             pos += p2;
         } else { //run of pixels
             uint8_t pix[3]; //original pixel
-            switch(depth){
-            case  8: pix[0] = bytestream2_get_byte(gb);
-                     break;
-            case 16: pix16  = bytestream2_get_le16(gb);
-                     break;
-            case 24: pix[0] = bytestream2_get_byte(gb);
-                     pix[1] = bytestream2_get_byte(gb);
-                     pix[2] = bytestream2_get_byte(gb);
-                     break;
-            case 32: pix32  = bytestream2_get_le32(gb);
-                     break;
-            }
             if ((pic->linesize[0] > 0 && output + p1 * (depth >> 3) > output_end) ||
                 (pic->linesize[0] < 0 && output + p1 * (depth >> 3) < output_end))
                 continue;
-            for(i = 0; i < p1; i++) {
-                switch(depth){
-                case  8: *output++ = pix[0];
-                         break;
-                case 16: *(uint16_t*)output = pix16;
-                         output += 2;
-                         break;
-                case 24: *output++ = pix[0];
-                         *output++ = pix[1];
-                         *output++ = pix[2];
-                         break;
-                case 32: *(uint32_t*)output = pix32;
-                         output += 4;
-                         break;
+
+            switch(depth){
+            case  8:
+                pix[0] = bytestream2_get_byte(gb);
+                for(i = 0; i < p1; i++)
+                        *output++ = pix[0];
+                break;
+            case 16:
+                pix16  = bytestream2_get_le16(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint16_t*)output = pix16;
+                        output += 2;
+                }
+                break;
+            case 24:
+                pix[0] = bytestream2_get_byte(gb);
+                pix[1] = bytestream2_get_byte(gb);
+                pix[2] = bytestream2_get_byte(gb);
+                for(i = 0; i < p1; i++) {
+                        *output++ = pix[0];
+                        *output++ = pix[1];
+                        *output++ = pix[2];
+                }
+                break;
+            case 32:
+                pix32  = bytestream2_get_le32(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint32_t*)output = pix32;
+                        output += 4;
                 }
+                break;
             }
             pos += p1;
         }
diff --git a/libavcodec/mss3.c b/libavcodec/mss3.c
index 545fefc47a..d5bb2d4e35 100644
--- a/libavcodec/mss3.c
+++ b/libavcodec/mss3.c
@@ -295,7 +295,7 @@ static void rac_normalise(RangeCoder *c)
             c->low |= *c->src++;
         } else if (!c->low) {
             c->got_error = 1;
-            return;
+            c->low = 1;
         }
         if (c->range >= RAC_BOTTOM)
             return;
diff --git a/libavcodec/pngdec.c b/libavcodec/pngdec.c
index 593df60294..97c0ad19b6 100644
--- a/libavcodec/pngdec.c
+++ b/libavcodec/pngdec.c
@@ -765,10 +765,14 @@ static int decode_frame(AVCodecContext *avctx,
  exit_loop:
 
     if (s->bits_per_pixel == 1 && s->color_type == PNG_COLOR_TYPE_PALETTE){
-        int i, j;
+        int i, j, k;
         uint8_t *pd = s->current_picture->data[0];
         for (j = 0; j < s->height; j++) {
-            for (i = s->width/8-1; i >= 0; i--) {
+            i = s->width / 8;
+            for (k = 7; k >= 1; k--)
+                if ((s->width&7) >= k)
+                    pd[8*i + k - 1] = (pd[i]>>8-k) & 1;
+            for (i--; i >= 0; i--) {
                 pd[8*i + 7]=  pd[i]     & 1;
                 pd[8*i + 6]= (pd[i]>>1) & 1;
                 pd[8*i + 5]= (pd[i]>>2) & 1;
diff --git a/libavcodec/ppc/fmtconvert_altivec.c b/libavcodec/ppc/fmtconvert_altivec.c
index d40ce07583..740614d335 100644
--- a/libavcodec/ppc/fmtconvert_altivec.c
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@@ -93,7 +93,7 @@ static void float_to_int16_stride_altivec(int16_t *dst, const float *src,
                                           long len, int stride)
 {
     int i;
-    vector signed short d, s;
+    vector signed short d;
 
     for (i = 0; i < len - 7; i += 8) {
         d = float_to_int16_one_altivec(src + i);
diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h
index a9df91851f..706162af52 100644
--- a/libavcodec/proresdsp.h
+++ b/libavcodec/proresdsp.h
@@ -24,6 +24,7 @@
 #define AVCODEC_PRORESDSP_H
 
 #include <stdint.h>
+#include "dsputil.h"
 
 #define PRORES_BITS_PER_SAMPLE 10 ///< output precision of prores decoder
 
diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index 2bd8df8e0a..1ca72b44f2 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -1023,6 +1023,9 @@ void ff_thread_release_buffer(AVCodecContext *avctx, AVFrame *f)
     PerThreadContext *p = avctx->thread_opaque;
     FrameThreadContext *fctx;
 
+    if (!f->data[0])
+        return;
+
     if (!(avctx->active_thread_type&FF_THREAD_FRAME)) {
         avctx->release_buffer(avctx, f);
         return;
diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c
index ca2aab7b56..4deea07baf 100644
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@@ -649,7 +649,8 @@ static void fill_coding_method_array (sb_int8_array tone_level_idx, sb_int8_arra
 
     if (!superblocktype_2_3) {
         /* This case is untested, no samples available */
-        SAMPLES_NEEDED
+        av_log_ask_for_sample(NULL, "!superblocktype_2_3");
+        return;
         for (ch = 0; ch < nb_channels; ch++)
             for (sb = 0; sb < 30; sb++) {
                 for (j = 1; j < 63; j++) {  // The loop only iterates to 63 so the code doesn't overflow the buffer
@@ -685,7 +686,7 @@ static void fill_coding_method_array (sb_int8_array tone_level_idx, sb_int8_arra
                     for (j = 0; j < 64; j++)
                         acc += tone_level_idx_temp[ch][sb][j];
 
-            multres = 0x66666667 * (acc * 10);
+            multres = 0x66666667LL * (acc * 10);
             esp_40 = (multres >> 32) / 8 + ((multres & 0xffffffff) >> 31);
             for (ch = 0;  ch < nb_channels; ch++)
                 for (sb = 0; sb < 30; sb++)
diff --git a/libavcodec/qdm2_tablegen.h b/libavcodec/qdm2_tablegen.h
index 585edfdd65..a90682f43c 100644
--- a/libavcodec/qdm2_tablegen.h
+++ b/libavcodec/qdm2_tablegen.h
@@ -37,7 +37,7 @@
 #include "libavcodec/qdm2_tables.h"
 #else
 static uint16_t softclip_table[HARDCLIP_THRESHOLD - SOFTCLIP_THRESHOLD + 1];
-static float noise_table[4096];
+static float noise_table[4096 + 20];
 static uint8_t random_dequant_index[256][5];
 static uint8_t random_dequant_type24[128][3];
 static float noise_samples[128];
diff --git a/libavcodec/sanm.c b/libavcodec/sanm.c
index 70ad1f8fed..c9284920b6 100644
--- a/libavcodec/sanm.c
+++ b/libavcodec/sanm.c
@@ -29,6 +29,7 @@
 #include "libavutil/imgutils.h"
 #include "libavcodec/dsputil.h"
 #include "sanm_data.h"
+#include "libavutil/avassert.h"
 
 #define NGLYPHS 256
 
@@ -613,6 +614,16 @@ static int process_block(SANMVideoContext *ctx, uint8_t *dst, uint8_t *prev1,
     } else {
         int mx = motion_vectors[code][0];
         int my = motion_vectors[code][1];
+        int index = prev2 - (const uint8_t*)ctx->frm2;
+
+        av_assert2(index >= 0 && index < (ctx->buf_size>>1));
+
+        if (index < - mx - my*stride ||
+            (ctx->buf_size>>1) - index < mx + size + (my + size - 1)*stride) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "MV is invalid \n");
+            return AVERROR_INVALIDDATA;
+        }
+
         for (k = 0; k < size; k++)
             memcpy(dst + k * stride, prev2 + mx + (my + k) * stride, size);
     }
diff --git a/libavcodec/takdec.c b/libavcodec/takdec.c
index 86c1a7dda5..af6454304c 100644
--- a/libavcodec/takdec.c
+++ b/libavcodec/takdec.c
@@ -693,7 +693,8 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
     if (pkt->size < TAK_MIN_FRAME_HEADER_BYTES)
         return AVERROR_INVALIDDATA;
 
-    init_get_bits(gb, pkt->data, pkt->size * 8);
+    if ((ret = init_get_bits8(gb, pkt->data, pkt->size)) < 0)
+        return ret;
 
     if ((ret = ff_tak_decode_frame_header(avctx, gb, &s->ti, 0)) < 0)
         return ret;
diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c
index 7167278351..acb25c3899 100644
--- a/libavcodec/utvideoenc.c
+++ b/libavcodec/utvideoenc.c
@@ -594,7 +594,6 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
      * At least currently Ut Video is IDR only.
      * Set flags accordingly.
      */
-    avctx->coded_frame->reference = 0;
     avctx->coded_frame->key_frame = 1;
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
 
diff --git a/libavcodec/v410enc.c b/libavcodec/v410enc.c
index 67d8fc9d77..9661c7c8ea 100644
--- a/libavcodec/v410enc.c
+++ b/libavcodec/v410enc.c
@@ -54,7 +54,6 @@ static int v410_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         return ret;
     dst = pkt->data;
 
-    avctx->coded_frame->reference = 0;
     avctx->coded_frame->key_frame = 1;
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
 
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index e056ffb525..ef763fa675 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -80,7 +80,7 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     int topleft_mb_pos, top_mb_pos;
-    int stride_y, fieldtx;
+    int stride_y, fieldtx = 0;
     int v_dist;
 
     /* The put pixels loop is always one MB row behind the decoding loop,
@@ -93,7 +93,8 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
     if (!s->first_slice_line) {
         if (s->mb_x) {
             topleft_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x - 1;
-            fieldtx        = v->fieldtx_plane[topleft_mb_pos];
+            if (v->fcm == ILACE_FRAME)
+                fieldtx = v->fieldtx_plane[topleft_mb_pos];
             stride_y       = s->linesize << fieldtx;
             v_dist         = (16 - fieldtx) >> (fieldtx == 0);
             s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0],
@@ -117,7 +118,8 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
         }
         if (s->mb_x == s->mb_width - 1) {
             top_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x;
-            fieldtx    = v->fieldtx_plane[top_mb_pos];
+            if (v->fcm == ILACE_FRAME)
+                fieldtx = v->fieldtx_plane[top_mb_pos];
             stride_y   = s->linesize << fieldtx;
             v_dist     = fieldtx ? 15 : 8;
             s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0],
@@ -2726,7 +2728,7 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int a_avail = v->a_avail, c_avail = v->c_avail;
@@ -2938,7 +2940,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
@@ -5672,7 +5674,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     continue;
                 }
                 v->second_field = 1;
-                v->blocks_off   = s->mb_width  * s->mb_height << 1;
+                v->blocks_off   = s->b8_stride * (s->mb_height&~1);
                 v->mb_off       = s->mb_stride * s->mb_height >> 1;
             } else {
                 v->second_field = 0;
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 792dbb2a61..00a77e9e78 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -1928,11 +1928,11 @@ static int vp3_decode_frame(AVCodecContext *avctx,
         int type = get_bits(&gb, 7);
         skip_bits_long(&gb, 6*8); /* "theora" */
 
+        if (s->avctx->active_thread_type&FF_THREAD_FRAME) {
+            av_log(avctx, AV_LOG_ERROR, "midstream reconfiguration with multithreading is unsupported, try -threads 1\n");
+            return AVERROR_PATCHWELCOME;
+        }
         if (type == 0) {
-            if (s->avctx->active_thread_type&FF_THREAD_FRAME) {
-                av_log(avctx, AV_LOG_ERROR, "midstream reconfiguration with multithreading is unsupported, try -threads 1\n");
-                return AVERROR_PATCHWELCOME;
-            }
             vp3_decode_end(avctx);
             ret = theora_decode_header(avctx, &gb);
 
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index 2c22130642..f3f9bd231d 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -310,6 +310,8 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
 
     if (ARCH_ARM)
         ff_vp3dsp_init_arm(c, flags);
+    if (ARCH_BFIN)
+        ff_vp3dsp_init_bfin(c, flags);
     if (ARCH_PPC)
         ff_vp3dsp_init_ppc(c, flags);
     if (ARCH_X86)
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 8ffecca780..558077f908 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -49,6 +49,7 @@ typedef struct VP3DSPContext {
 
 void ff_vp3dsp_init(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags);
+void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags);
 
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index 7df5adade3..6cc377036c 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -394,8 +394,6 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
         mb_type = vp56_decode_mv(s, row, col);
     ref_frame = vp56_reference_frame[mb_type];
 
-    s->dsp.clear_blocks(*s->block_coeff);
-
     s->parse_coeff(s);
 
     vp56_add_predictors_dc(s, ref_frame);
@@ -448,6 +446,11 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
             }
             break;
     }
+
+    if (is_alpha) {
+        s->block_coeff[4][0] = 0;
+        s->block_coeff[5][0] = 0;
+    }
 }
 
 static int vp56_size_changed(VP56Context *s)
diff --git a/libavcodec/wma.c b/libavcodec/wma.c
index 5af20739a5..1e6ca61047 100644
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@@ -134,6 +134,10 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
 
     bps = (float)avctx->bit_rate / (float)(avctx->channels * avctx->sample_rate);
     s->byte_offset_bits = av_log2((int)(bps * s->frame_len / 8.0 + 0.5)) + 2;
+    if (s->byte_offset_bits + 3 > MIN_CACHE_BITS) {
+        av_log(avctx, AV_LOG_ERROR, "byte_offset_bits %d is too large\n", s->byte_offset_bits);
+        return AVERROR_PATCHWELCOME;
+    }
 
     /* compute high frequency value and choose if noise coding should
        be activated */
diff --git a/libavcodec/wmavoice.c b/libavcodec/wmavoice.c
index c3b6ab3b5f..7ddc434d78 100644
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -25,8 +25,6 @@
  * @author Ronald S. Bultje <rsbultje@gmail.com>
  */
 
-#define UNCHECKED_BITSTREAM_READER 1
-
 #include <math.h>
 
 #include "libavutil/channel_layout.h"
@@ -1442,8 +1440,8 @@ static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
                        float *excitation, float *synth)
 {
     WMAVoiceContext *s = ctx->priv_data;
-    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
-    int pitch[MAX_BLOCKS], last_block_pitch;
+    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
+    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
 
     /* Parse frame type ("frame header"), see frame_descs */
     int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c
index ee89295c94..cb6465f950 100644
--- a/libavcodec/x86/diracdsp_mmx.c
+++ b/libavcodec/x86/diracdsp_mmx.c
@@ -60,6 +60,9 @@ void ff_diracdsp_init_mmx(DiracDSPContext* c)
 {
     int mm_flags = av_get_cpu_flags();
 
+    if (!(mm_flags & AV_CPU_FLAG_MMX))
+        return;
+
 #if HAVE_YASM
     c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
 #if !ARCH_X86_64
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 6a76655a8b..587d5ee968 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -648,46 +648,3 @@ BSWAP32_BUF
 
 INIT_XMM ssse3
 BSWAP32_BUF
-
-INIT_XMM sse2
-; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-cglobal put_pixels16, 4,5,4
-    movsxdifnidn r2, r2d
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
-
-; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-cglobal avg_pixels16, 4,5,4
-    movsxdifnidn r2, r2d
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 303d36d109..2e8300a788 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1612,40 +1612,67 @@ void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
-    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
 }\
 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
-    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
 }\
 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
-    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
-    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+    if (h&3) {\
+        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+    } else {\
+        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
+        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+    }\
 }
 
+#if HAVE_MMX_INLINE
 DIRAC_PIXOP(put, put, mmx)
 DIRAC_PIXOP(avg, avg, mmx)
-DIRAC_PIXOP(avg, ff_avg, mmxext)
+#endif
 
 #if HAVE_YASM
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
 {
+    if (h&3) /* the SSE2 put_pixels16 loop handles 4 rows per pass and exits only when its counter reaches 0, so other heights use the C version */
+        ff_put_dirac_pixels16_c(dst, src, stride, h);
+    else /* h is a multiple of 4: only src[0] is passed to the SSE2 routine */
     ff_put_pixels16_sse2(dst, src[0], stride, h);
 }
 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
 {
+    if (h&3) /* the SSE2 avg_pixels16 loop handles 4 rows per pass and exits only when its counter reaches 0, so other heights use the C version */
+        ff_avg_dirac_pixels16_c(dst, src, stride, h);
+    else /* h is a multiple of 4: only src[0] is passed to the SSE2 routine */
     ff_avg_pixels16_sse2(dst, src[0], stride, h);
 }
 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
 {
+    if (h&3) { /* heights that are not a multiple of 4 go to the C version (the SSE2 loop consumes 4 rows per pass) */
+        ff_put_dirac_pixels32_c(dst, src, stride, h);
+    } else { /* 32-byte-wide rows are written as two 16-byte-wide SSE2 calls, the second offset by 16 bytes */
     ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
     ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
 }
 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
 {
+    if (h&3) { /* heights that are not a multiple of 4 go to the C version (the SSE2 loop consumes 4 rows per pass) */
+        ff_avg_dirac_pixels32_c(dst, src, stride, h);
+    } else { /* 32-byte-wide rows are averaged as two 16-byte-wide SSE2 calls, the second offset by 16 bytes */
     ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
     ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
 }
 #endif
 #endif
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 01172afbf1..9e959c3850 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -544,7 +544,7 @@ void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
             SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
             SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
         } else if (bit_depth == 10) {
-#if !ARCH_X86_64
+#if ARCH_X86_32
             SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
             SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
             SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 72bc111baa..7f0c285fa3 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -4,6 +4,7 @@
 ;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
 ;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
 ;*
 ;* MMX optimized hpel functions
 ;*
@@ -469,3 +470,46 @@ INIT_MMX mmxext
 AVG_PIXELS8_XY2
 INIT_MMX 3dnow
 AVG_PIXELS8_XY2
+
+INIT_XMM sse2
+; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- copies h rows of 16 bytes from pixels to block, 4 rows per loop pass; h must be a nonzero multiple of 4
+cglobal put_pixels16, 4,5,4             ; 4 arguments (r0=block, r1=pixels, r2=line_size, r3=h), 5 GPRs, 4 XMM registers
+    movsxdifnidn r2, r2d                ; sign-extend the int line_size to full register width where the two differ
+    lea          r4, [r2*3]             ; r4 = 3 * line_size (offset of the fourth row)
+.loop:
+    movu         m0, [r1]               ; unaligned loads of source rows 0..3
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r2*2]
+    movu         m3, [r1+r4]
+    lea          r1, [r1+r2*4]          ; pixels += 4 rows
+    mova       [r0], m0                 ; aligned stores to destination rows 0..3
+    mova    [r0+r2], m1
+    mova  [r0+r2*2], m2
+    mova    [r0+r4], m3
+    sub         r3d, 4                  ; h -= 4; sets the flags tested by jnz below
+    lea          r0, [r0+r2*4]          ; block += 4 rows (lea leaves the flags from sub untouched)
+    jnz       .loop                     ; repeat until the row counter is exactly 0
+    REP_RET
+
+; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -- averages h rows of 16 bytes from pixels into block, 4 rows per loop pass; h must be a nonzero multiple of 4
+cglobal avg_pixels16, 4,5,4             ; 4 arguments (r0=block, r1=pixels, r2=line_size, r3=h), 5 GPRs, 4 XMM registers
+    movsxdifnidn r2, r2d                ; sign-extend the int line_size to full register width where the two differ
+    lea          r4, [r2*3]             ; r4 = 3 * line_size (offset of the fourth row)
+.loop:
+    movu         m0, [r1]               ; unaligned loads of source rows 0..3
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r2*2]
+    movu         m3, [r1+r4]
+    lea          r1, [r1+r2*4]          ; pixels += 4 rows
+    pavgb        m0, [r0]               ; per-byte rounded average of each source row with the bytes already in block
+    pavgb        m1, [r0+r2]
+    pavgb        m2, [r0+r2*2]
+    pavgb        m3, [r0+r4]
+    mova       [r0], m0                 ; aligned stores of the averaged rows back to block
+    mova    [r0+r2], m1
+    mova  [r0+r2*2], m2
+    mova    [r0+r4], m3
+    sub         r3d, 4                  ; h -= 4; sets the flags tested by jnz below
+    lea          r0, [r0+r2*4]          ; block += 4 rows (lea leaves the flags from sub untouched)
+    jnz       .loop                     ; repeat until the row counter is exactly 0
+    REP_RET
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index d121b25031..09b9e40364 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -376,7 +376,7 @@ inline void ff_idct_xvid_sse2(short *block)
     JZ("%%esi", "1f")
     "5:                                                          \n\t"
     iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
-#if !ARCH_X86_64
+#if ARCH_X86_32
     iLLM_HEAD
 #endif
     iLLM_PASS("%0")
diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm
index fe47bf302e..afbdf351c7 100644
--- a/libavcodec/x86/mpeg4qpel.asm
+++ b/libavcodec/x86/mpeg4qpel.asm
@@ -2,6 +2,7 @@
 ;* mpeg4 qpel
 ;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
 ;*
 ;* This file is part of FFmpeg.
 ;*
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 1df570ec0b..288dbcc6f3 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -64,7 +64,7 @@ void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
     "paddb "#regb", "#regr"             \n\t"                    \
     "paddb "#regd", "#regp"             \n\t"
 
-static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
+static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
 {
 //    START_TIMER
     MOVQ_BFE(mm6);
author	rogerdpack <rogerpack2005@gmail.com>	2013-01-15 19:09:15 -0700
committer	rogerdpack <rogerpack2005@gmail.com>	2013-01-15 19:09:15 -0700
commit	c540312ac3b58e0bbd751844fc2c47c6e3713cf5 (patch)
tree	fcf92b1c0f1772b379828125c2555a47d1c81c6b /libavcodec
parent	47e88486b4b3b3de992b07f89dfaedf410a8bd5e (diff)
parent	2b20397e1fbe52db800ef5deb810f7bc2602f248 (diff)
download	ffmpeg-c540312ac3b58e0bbd751844fc2c47c6e3713cf5.tar.gz