Merge remote-tracking branch 'qatar/master'

* qatar/master: (29 commits)
  fate: add golomb-test
  golomb-test: K&R formatting cosmetics
  h264: Split h264-test off into a separate file - golomb-test.c.
  h264-test: cleanup: drop timer invocations, commented out code and other cruft
  h264-test: Remove unused DSP and AVCodec contexts and related init calls.
  adpcm: Add missing stdint.h #include to fix standalone header compilation.
  lavf: add functions for accessing the fourcc<->CodecID mapping tables.
  lavc: set AVCodecContext.codec in avcodec_get_context_defaults3().
  lavc: make avcodec_close() work properly on unopened codecs.
  lavc: add avcodec_is_open().
  lavf: rename AVInputFormat.value to raw_codec_id.
  lavf: remove the pointless value field from flv and iv8
  lavc/lavf: remove unnecessary symbols from the symbol version script.
  lavc: reorder AVCodec fields.
  lavf: reorder AVInput/OutputFormat fields.
  mp3dec: Fix a heap-buffer-overflow
  adpcmenc: remove some unneeded casts
  adpcmenc: use int16_t and uint8_t instead of short and unsigned char.
  adpcmenc: fix adpcm_ms extradata allocation
  adpcmenc: return proper AVERROR codes instead of -1
  ...

Conflicts:
	doc/APIchanges
	libavcodec/Makefile
	libavcodec/adpcmenc.c
	libavcodec/avcodec.h
	libavcodec/h264.c
	libavcodec/libavcodec.v
	libavcodec/mpc7.c
	libavcodec/mpegaudiodec.c
	libavcodec/options.c
	libavformat/Makefile
	libavformat/avformat.h
	libavformat/flvdec.c
	libavformat/libavformat.v

Merged-by: Michael Niedermayer <michaelni@gmx.at>
author: Michael Niedermayer <michaelni@gmx.at> 2012-02-01 02:08:23 +0100
committer: Michael Niedermayer <michaelni@gmx.at> 2012-02-01 02:36:09 +0100
commit: a369a6b85819890b21a87af3ce983ce533b7169b (patch)
tree: 838f9821dc09bd99b59ce4a2d8123d5fd6868b91 /libavcodec/x86
parent: 0a3a69e8d77146b53a1112c715a78e7d293883b1 (diff)
parent: 52afc9716849e6fb6c2420674d790d374061c663 (diff)
download: ffmpeg-a369a6b85819890b21a87af3ce983ce533b7169b.tar.gz
4 files changed, 231 insertions, 4 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 282bc916bd..3b8ee56a49 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -29,8 +29,9 @@ MMX-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred_init.o
 MMX-OBJS-$(CONFIG_RV30_DECODER)        += x86/rv34dsp_init.o
 YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 MMX-OBJS-$(CONFIG_RV40_DECODER)        += x86/rv34dsp_init.o            \
+                                          x86/rv40dsp_init.o
+YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o                 \
                                           x86/rv40dsp.o
-YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o
 
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp_yasm.o
 
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index a3d8f89816..ca0b29344a 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -110,9 +110,9 @@ static void float_interleave_sse(float *dst, const float **src,
 
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
+#if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
 
-#if HAVE_YASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
         c->float_interleave = float_interleave_mmx;
 
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
new file mode 100644
index 0000000000..bff3e7b96a
--- /dev/null
+++ b/libavcodec/x86/rv40dsp.asm
@@ -0,0 +1,207 @@
+;******************************************************************************
+;* MMX/SSE2-optimized functions for the RV40 decoder
+;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+
+align 16
+shift_round:   times 8 dw 1 << (16 - 6)
+cextern pw_16
+
+SECTION .text
+
+; %1=1 for 5-bit weights (0 for 14-bit), %2=dst, %3=src1, %4=src2, %5=stride (optional; passed for 8x8 blocks with xmm registers)
+%macro RV40_WCORE  4-5
+    movh       m4, [%3 + 0]              ; m4 = first half of src1
+    movh       m5, [%4 + 0]              ; m5 = first half of src2
+%if %0 == 4
+%define OFFSET mmsize / 2
+%else
+    ; 8x8 block with xmm registers: stride was provided, so the second half is the next row
+%define OFFSET %5
+%endif
+    movh       m6, [%3 + OFFSET]         ; m6 = second half of src1
+    movh       m7, [%4 + OFFSET]         ; m7 = second half of src2
+
+%if %1 == 0
+    ; 14-bit weights
+    punpcklbw  m4, m0                    ; widen bytes to words (m0 is zeroed by RV40_WEIGHT)
+    punpcklbw  m5, m0
+    punpcklbw  m6, m0
+    punpcklbw  m7, m0
+
+    psllw      m4, 7                     ; pre-shift so pmulhw's >> 16 yields (pixel * weight) >> 9
+    psllw      m5, 7
+    psllw      m6, 7
+    psllw      m7, 7
+    pmulhw     m4, m3                    ; src1 is scaled by m3
+    pmulhw     m5, m2                    ; src2 is scaled by m2
+    pmulhw     m6, m3
+    pmulhw     m7, m2
+
+    paddw      m4, m5                    ; sum the two weighted sources
+    paddw      m6, m7
+%else
+    ; 5-bit weights
+%if cpuflag(ssse3)
+    punpcklbw  m4, m5                    ; interleave src1/src2 bytes for pmaddubsw
+    punpcklbw  m6, m7
+
+    pmaddubsw  m4, m3                    ; m3 holds the interleaved weight bytes (set up in RV40_WEIGHT)
+    pmaddubsw  m6, m3
+%else
+    punpcklbw  m4, m0                    ; widen bytes to words (m0 is zeroed by RV40_WEIGHT)
+    punpcklbw  m5, m0
+    punpcklbw  m6, m0
+    punpcklbw  m7, m0
+
+    pmullw     m4, m3                    ; src1 is scaled by m3
+    pmullw     m5, m2                    ; src2 is scaled by m2
+    pmullw     m6, m3
+    pmullw     m7, m2
+    paddw      m4, m5                    ; sum the two weighted sources
+    paddw      m6, m7
+%endif
+
+%endif
+
+    ; add rounding bias and shift down by 5
+%if cpuflag(ssse3)
+    pmulhrsw   m4, m1                    ; m1 = 1 << 10 (shift_round), so this is (x + 16) >> 5
+    pmulhrsw   m6, m1
+%else
+    paddw      m4, m1                    ; m1 = pw_16 here (loaded in RV40_WEIGHT)
+    paddw      m6, m1
+    psrlw      m4, 5
+    psrlw      m6, 5
+%endif
+
+    packuswb   m4, m6                    ; narrow both halves back to bytes with unsigned saturation
+%if %0 == 5
+    ; Only called for 8x8 blocks and xmm registers: the two halves are two consecutive rows
+    movh       [%2 +  0], m4
+    movhps     [%2 + %5], m4
+%else
+    mova       [%2], m4
+%endif
+%endmacro
+
+
+%macro MAIN_LOOP   2 ; %1=block size (8 or 16), %2=forwarded to RV40_WCORE as its weight-mode flag
+%if mmsize == 8
+    RV40_WCORE %2, r0, r1, r2            ; r0=dst, r1=src1, r2=src2
+%if %1 == 16
+    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8 ; second 8-byte half of a 16-wide row
+%endif
+
+    ; Prepare for next loop
+    add        r0, r5                    ; r5 = stride
+    add        r1, r5
+    add        r2, r5
+%else
+%ifidn %1, 8
+    RV40_WCORE %2, r0, r1, r2, r5        ; stride passed: two rows are handled per call
+    ; Prepare 2 next lines
+    lea        r0, [r0 + 2 * r5]
+    lea        r1, [r1 + 2 * r5]
+    lea        r2, [r2 + 2 * r5]
+%else
+    RV40_WCORE %2, r0, r1, r2
+    ; Prepare single next line
+    add        r0, r5
+    add        r1, r5
+    add        r2, r5
+%endif
+%endif
+
+    dec        r6                        ; loop counter; the jnz following each MAIN_LOOP use tests this result
+%endmacro
+
+; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
+; %1=block size (8 or 16)  %2=number of xmm registers (passed as the third cglobal argument)
+%macro RV40_WEIGHT  2
+cglobal rv40_weight_func_%1, 6, 7, %2
+%if cpuflag(ssse3)
+    mova       m1, [shift_round]         ; 1 << 10 per word, consumed by pmulhrsw in RV40_WCORE
+%else
+    mova       m1, [pw_16]               ; rounding bias added before the >> 5 in RV40_WCORE
+%endif
+    pxor       m0, m0                    ; m0 = 0, used by RV40_WCORE to widen bytes to words
+    mov        r6, r3
+    or         r6, r4
+    ; The weights are FP0.14 notation of fractions depending on pts.
+    ; For timebases without rounding error (i.e. PAL), the fractions
+    ; can be simplified, and several operations can be avoided.
+    ; Therefore, we check here whether they are multiples of 2^9 for
+    ; those simplifications to occur.
+    and        r6, 0x1FF                 ; ZF is set when the low 9 bits of both w1 and w2 are zero
+    ; Set loop counter and increments
+%if mmsize == 8
+    mov        r6, %1                    ; mov does not modify flags, so ZF from the 'and' survives
+%else
+    mov        r6, (%1 * %1) / mmsize    ; mov does not modify flags, so ZF from the 'and' survives
+%endif
+
+    ; Use result of test now
+    jz .loop_512
+    movd       m2, r3                    ; NOTE(review): r3/r4/r5 hold int arguments but are used at full register width - confirm this is safe on x86-64
+    movd       m3, r4
+    SPLATW     m2, m2                    ; m2 = w1 in every word
+    SPLATW     m3, m3                    ; m3 = w2 in every word
+
+.loop:
+    MAIN_LOOP  %1, 0                     ; 14-bit weight path
+    jnz        .loop
+    REP_RET
+
+    ; Weights are multiple of 512, which allows some shortcuts
+.loop_512:
+    sar        r3, 9                     ; drop the 9 zero low bits of each weight
+    sar        r4, 9
+    movd       m2, r3
+    movd       m3, r4
+%if cpuflag(ssse3)
+    punpcklbw  m3, m2                    ; pair the w2 and w1 bytes into one word for pmaddubsw
+    SPLATW     m3, m3
+%else
+    SPLATW     m2, m2
+    SPLATW     m3, m3
+%endif
+.loop2:
+    MAIN_LOOP  %1, 1                     ; 5-bit weight path
+    jnz        .loop2
+    REP_RET
+
+%endmacro
+
+INIT_MMX mmx
+RV40_WEIGHT    8, 0                      ; second argument 0: no xmm registers declared for the MMX versions
+RV40_WEIGHT   16, 0
+
+INIT_XMM sse2
+RV40_WEIGHT    8, 8                      ; second argument 8: declares 8 xmm registers (the macros use m0-m7)
+RV40_WEIGHT   16, 8
+
+INIT_XMM ssse3
+RV40_WEIGHT    8, 8                      ; same macro; the cpuflag(ssse3) branches are selected here
+RV40_WEIGHT   16, 8
diff --git a/libavcodec/x86/rv40dsp.c b/libavcodec/x86/rv40dsp_init.c
index 9f90ad8bb6..3d6c6f0fa0 100644
--- a/libavcodec/x86/rv40dsp.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -40,14 +40,25 @@ void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
 void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
 
+#define DECLARE_WEIGHT(opt) \
+void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                  int w1, int w2, int stride); \
+void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                  int w1, int w2, int stride);
+DECLARE_WEIGHT(mmx)   /* prototypes for ff_rv40_weight_func_{16,8}_mmx */
+DECLARE_WEIGHT(sse2)  /* prototypes for ff_rv40_weight_func_{16,8}_sse2 */
+DECLARE_WEIGHT(ssse3) /* prototypes for ff_rv40_weight_func_{16,8}_ssse3 */
+
 void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
 {
-    av_unused int mm_flags = av_get_cpu_flags();
-
 #if HAVE_YASM
+    int mm_flags = av_get_cpu_flags();
+
     if (mm_flags & AV_CPU_FLAG_MMX) {
         c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
         c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
+        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx;
+        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx;
     }
     if (mm_flags & AV_CPU_FLAG_MMX2) {
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
@@ -56,5 +67,13 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
         c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE2) {
+        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2;
+        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2;
+    }
+    if (mm_flags & AV_CPU_FLAG_SSSE3) {
+        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3;
+        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3;
+    }
 #endif
 }
author	Michael Niedermayer <michaelni@gmx.at>	2012-02-01 02:08:23 +0100
committer	Michael Niedermayer <michaelni@gmx.at>	2012-02-01 02:36:09 +0100
commit	a369a6b85819890b21a87af3ce983ce533b7169b (patch)
tree	838f9821dc09bd99b59ce4a2d8123d5fd6868b91 /libavcodec/x86
parent	0a3a69e8d77146b53a1112c715a78e7d293883b1 (diff)
parent	52afc9716849e6fb6c2420674d790d374061c663 (diff)
download	ffmpeg-a369a6b85819890b21a87af3ce983ce533b7169b.tar.gz