Merge remote-tracking branch 'qatar/master'

* qatar/master: libx264: fix indentation. vorbis: fix overflows in floor1[] vector and inverse db table index. win64: add a XMM clobber test configure option. movdec: Parse the dvc1 atom ARM: ac3: fix ac3_bit_alloc_calc_bap_armv6 swscale: K&R formatting cosmetics for Blackfin code frwu: lowercase the FRWU codec name movdec: fix dts generation in fragmented files fate: make acodec-ac3_fixed test output raw AC3 APIchanges: add missing commit hashes swscale: implement MMX, SSE2 and AVX functions for RGB32 input. ra144enc: drop pointless "encoder" from .long_name bethsoftvideo: fix palette reading. mpc7: use av_fast_padded_malloc() mpc7: simplify handling of packet sizes that are not a multiple of 4 bytes doc: decoding Forward Uncompressed is supported Fix a typo in the x86 asm version of ff_vector_clip_int32() pcmenc: Do not set avpkt->size. ff_alloc_packet: modify the size of the packet to match the requested size Conflicts: doc/APIchanges libavcodec/libx264.c libavcodec/mpc7.c libavformat/isom.h libswscale/Makefile libswscale/bfin/yuv2rgb_bfin.c tests/ref/fate/bethsoft-vid tests/ref/seek/ac3_ac3 Merged-by: Michael Niedermayer <michaelni@gmx.at>
author: Michael Niedermayer <michaelni@gmx.at> 2012-02-03 02:41:47 +0100
committer: Michael Niedermayer <michaelni@gmx.at> 2012-02-03 03:51:32 +0100
commit: d77294c5e404c8a214da0e74f7836390b48b2dba (patch)
tree: 9c894cf54b1e18f285cc04eaf7e021e9976f4f2b /libswscale
parent: 9477fa094b89645b3a34ef3bc52c4f18719ab4b3 (diff)
parent: e15e2a6d2a886aa9944ac9798687104c829d1541 (diff)
download: ffmpeg-d77294c5e404c8a214da0e74f7836390b48b2dba.tar.gz
7 files changed, 283 insertions, 72 deletions
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 77d896a76b..b761470fd1 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -25,6 +25,8 @@ MMX-OBJS-$(HAVE_YASM)      +=  x86/input.o              \
 
 $(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
 
+OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
+
 TESTPROGS = colorspace swscale
 
 DIRS = bfin mlib ppc sparc x86
diff --git a/libswscale/bfin/internal_bfin.S b/libswscale/bfin/internal_bfin.S
index cb8d71253c..eab30aa6ce 100644
--- a/libswscale/bfin/internal_bfin.S
+++ b/libswscale/bfin/internal_bfin.S
@@ -30,11 +30,11 @@ and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
 
 The following calculation is used for the conversion:
 
-  r = clipz((y-oy)*cy  + crv*(v-128))
-  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
-  b = clipz((y-oy)*cy  + cbu*(u-128))
+  r = clipz((y - oy) * cy  + crv * (v - 128))
+  g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
+  b = clipz((y - oy) * cy  + cbu * (u - 128))
 
-y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
+y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
 
 
 New factorization to eliminate the truncation error which was
@@ -47,7 +47,7 @@ occurring due to the byteop3p.
 2) Scale operands up by a factor of 4 not 8 because Blackfin
    multiplies include a shift.
 
-3) Compute into the accumulators cy*yx0, cy*yx1.
+3) Compute into the accumulators cy * yx0, cy * yx1.
 
 4) Compute each of the linear equations:
      r = clipz((y - oy) * cy  + crv * (v - 128))
@@ -73,7 +73,7 @@ occurring due to the byteop3p.
 
 Where coeffs have the following layout in memory.
 
-uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
+uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv;
 
 coeffs is a pointer to oy.
 
diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c
index 870636ea05..3cd4f28387 100644
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -27,32 +27,34 @@
 #include <assert.h>
 #include "config.h"
 #include <unistd.h>
+
 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 
 #if defined (__FDPIC__) && CONFIG_SRAM
-#define L1CODE __attribute__ ((l1_text))
+#define L1CODE __attribute__((l1_text))
 #else
 #define L1CODE
 #endif
 
-int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-                       int width, int height,
+int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                       uint8_t *vdst, int width, int height,
                        int lumStride, int chromStride, int srcStride) L1CODE;
 
-int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-                       int width, int height,
+int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                       uint8_t *vdst, int width, int height,
                        int lumStride, int chromStride, int srcStride) L1CODE;
 
-static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                               int srcSliceH, uint8_t* dst[], int dstStride[])
+static int uyvytoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[],
+                               int srcSliceY, int srcSliceH, uint8_t *dst[],
+                               int dstStride[])
 {
-    uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
-    uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
-    uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
-    uint8_t *ip   = src[0] + srcStride[0]*srcSliceY;
-    int w         = dstStride[0];
+    uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
+    uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
+    uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
+    uint8_t *ip   = src[0] + srcStride[0] * srcSliceY;
+    int w = dstStride[0];
 
     ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
                        dstStride[0], dstStride[1], srcStride[0]);
@@ -60,14 +62,15 @@ static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
     return srcSliceH;
 }
 
-static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                               int srcSliceH, uint8_t* dst[], int dstStride[])
+static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[],
+                               int srcSliceY, int srcSliceH, uint8_t *dst[],
+                               int dstStride[])
 {
-    uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
-    uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
-    uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
-    uint8_t *ip   = src[0] + srcStride[0]*srcSliceY;
-    int w         = dstStride[0];
+    uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
+    uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
+    uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
+    uint8_t *ip   = src[0] + srcStride[0] * srcSliceY;
+    int w = dstStride[0];
 
     ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
                        dstStride[0], dstStride[1], srcStride[0]);
@@ -75,15 +78,16 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
     return srcSliceH;
 }
 
-
 void ff_bfin_get_unscaled_swscale(SwsContext *c)
 {
     if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) {
-        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
+        av_log(NULL, AV_LOG_VERBOSE,
+               "selecting Blackfin optimized uyvytoyv12_unscaled\n");
         c->swScale = uyvytoyv12_unscaled;
     }
     if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) {
-        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
+        av_log(NULL, AV_LOG_VERBOSE,
+               "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
         c->swScale = yuyvtoyv12_unscaled;
     }
 }
diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c
index 7a7dc7f0e6..e7f657fe00 100644
--- a/libswscale/bfin/yuv2rgb_bfin.c
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -26,15 +26,16 @@
 #include <string.h>
 #include <inttypes.h>
 #include <assert.h>
-#include "config.h"
 #include <unistd.h>
 #include "libavutil/pixdesc.h"
+
+#include "config.h"
 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 
 #if defined(__FDPIC__) && CONFIG_SRAM
-#define L1CODE __attribute__ ((l1_text))
+#define L1CODE __attribute__((l1_text))
 #else
 #define L1CODE
 #endif
@@ -48,21 +49,20 @@ void ff_bfin_yuv2rgb565_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
 void ff_bfin_yuv2rgb24_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                             int w, uint32_t *coeffs) L1CODE;
 
-typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                            int w, uint32_t *coeffs);
-
+typedef void (*ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                           int w, uint32_t *coeffs);
 
 static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
 {
     int oy;
-    oy      = c->yOffset&0xffff;
-    oy      = oy >> 3; // keep everything U8.0 for offset calculation
+    oy = c->yOffset & 0xffff;
+    oy = oy >> 3;      // keep everything U8.0 for offset calculation
 
-    c->oc   = 128*0x01010101U;
-    c->oy   =  oy*0x01010101U;
+    c->oc = 128 * 0x01010101U;
+    c->oy = oy * 0x01010101U;
 
     /* copy 64bit vector coeffs down to 32bit vector coeffs */
-    c->cy  = c->yCoeff;
+    c->cy   = c->yCoeff;
     c->zero = 0;
 
     if (rgb) {
@@ -77,7 +77,6 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
         c->cgv = c->ugCoeff;
     }
 
-
     if (masks == 555) {
         c->rmask = 0x001f * 0x00010001U;
         c->gmask = 0x03e0 * 0x00010001U;
@@ -89,27 +88,25 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
     }
 }
 
-static int core_yuv420_rgb(SwsContext *c,
-                           uint8_t **in, int *instrides,
-                           int srcSliceY, int srcSliceH,
-                           uint8_t **oplanes, int *outstrides,
-                           ltransform lcscf, int rgb, int masks)
+static int core_yuv420_rgb(SwsContext *c, uint8_t **in, int *instrides,
+                           int srcSliceY, int srcSliceH, uint8_t **oplanes,
+                           int *outstrides, ltransform lcscf,
+                           int rgb, int masks)
 {
-    uint8_t *py,*pu,*pv,*op;
+    uint8_t *py, *pu, *pv, *op;
     int w  = instrides[0];
-    int h2 = srcSliceH>>1;
+    int h2 = srcSliceH >> 1;
     int i;
 
     bfin_prepare_coefficients(c, rgb, masks);
 
     py = in[0];
-    pu = in[1+(1^rgb)];
-    pv = in[1+(0^rgb)];
-
-    op = oplanes[0] + srcSliceY*outstrides[0];
+    pu = in[1 + (1 ^ rgb)];
+    pv = in[1 + (0 ^ rgb)];
 
-    for (i=0;i<h2;i++) {
+    op = oplanes[0] + srcSliceY * outstrides[0];
 
+    for (i = 0; i < h2; i++) {
         lcscf(py, pu, pv, op, w, &c->oy);
 
         py += instrides[0];
@@ -126,9 +123,7 @@ static int core_yuv420_rgb(SwsContext *c,
     return srcSliceH;
 }
 
-
-static int bfin_yuv420_rgb555(SwsContext *c,
-                              uint8_t **in, int *instrides,
+static int bfin_yuv420_rgb555(SwsContext *c, uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
@@ -136,8 +131,7 @@ static int bfin_yuv420_rgb555(SwsContext *c,
                            outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
 }
 
-static int bfin_yuv420_bgr555(SwsContext *c,
-                              uint8_t **in, int *instrides,
+static int bfin_yuv420_bgr555(SwsContext *c, uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
@@ -145,8 +139,7 @@ static int bfin_yuv420_bgr555(SwsContext *c,
                            outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
 }
 
-static int bfin_yuv420_rgb24(SwsContext *c,
-                             uint8_t **in, int *instrides,
+static int bfin_yuv420_rgb24(SwsContext *c, uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
@@ -154,8 +147,7 @@ static int bfin_yuv420_rgb24(SwsContext *c,
                            outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
 }
 
-static int bfin_yuv420_bgr24(SwsContext *c,
-                             uint8_t **in, int *instrides,
+static int bfin_yuv420_bgr24(SwsContext *c, uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
@@ -163,8 +155,7 @@ static int bfin_yuv420_bgr24(SwsContext *c,
                            outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
 }
 
-static int bfin_yuv420_rgb565(SwsContext *c,
-                              uint8_t **in, int *instrides,
+static int bfin_yuv420_rgb565(SwsContext *c, uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
@@ -172,8 +163,7 @@ static int bfin_yuv420_rgb565(SwsContext *c,
                            outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
 }
 
-static int bfin_yuv420_bgr565(SwsContext *c,
-                              uint8_t **in, int *instrides,
+static int bfin_yuv420_bgr565(SwsContext *c, uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
@@ -181,18 +171,29 @@ static int bfin_yuv420_bgr565(SwsContext *c,
                            outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
 }
 
-
 SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
 {
     SwsFunc f;
 
-    switch(c->dstFormat) {
-    case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break;
-    case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break;
-    case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break;
-    case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break;
-    case PIX_FMT_RGB24:  f = bfin_yuv420_rgb24;  break;
-    case PIX_FMT_BGR24:  f = bfin_yuv420_bgr24;  break;
+    switch (c->dstFormat) {
+    case PIX_FMT_RGB555:
+        f = bfin_yuv420_rgb555;
+        break;
+    case PIX_FMT_BGR555:
+        f = bfin_yuv420_bgr555;
+        break;
+    case PIX_FMT_RGB565:
+        f = bfin_yuv420_rgb565;
+        break;
+    case PIX_FMT_BGR565:
+        f = bfin_yuv420_bgr565;
+        break;
+    case PIX_FMT_RGB24:
+        f = bfin_yuv420_rgb24;
+        break;
+    case PIX_FMT_BGR24:
+        f = bfin_yuv420_bgr24;
+        break;
     default:
         return 0;
     }
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 9a8e24f0b0..50e071a89a 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -51,6 +51,19 @@ bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
 rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
 rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
 
+rgba_Ycoeff_rb:  times 4 dw RY, BY
+rgba_Ycoeff_br:  times 4 dw BY, RY
+rgba_Ycoeff_ga:  times 4 dw GY, 0
+rgba_Ycoeff_ag:  times 4 dw 0,  GY
+rgba_Ucoeff_rb:  times 4 dw RU, BU
+rgba_Ucoeff_br:  times 4 dw BU, RU
+rgba_Ucoeff_ga:  times 4 dw GU, 0
+rgba_Ucoeff_ag:  times 4 dw 0,  GU
+rgba_Vcoeff_rb:  times 4 dw RV, BV
+rgba_Vcoeff_br:  times 4 dw BV, RV
+rgba_Vcoeff_ga:  times 4 dw GV, 0
+rgba_Vcoeff_ag:  times 4 dw 0,  GV
+
 shuf_rgb_12x4:   db 0, 0x80, 1, 0x80,  2, 0x80,  3, 0x80, \
                     6, 0x80, 7, 0x80,  8, 0x80,  9, 0x80
 shuf_rgb_3x56:   db 2, 0x80, 3, 0x80,  4, 0x80,  5, 0x80, \
@@ -294,6 +307,150 @@ RGB24_FUNCS 11, 13
 INIT_XMM avx
 RGB24_FUNCS 11, 13
 
+; %1 = nr. of XMM registers
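+; %2-5 = rgba, bgra, argb or abgr (in individual characters); optional %6 = name of the function (e.g. rgba) whose .body is jumped to after loading the coefficients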
+%macro RGB32_TO_Y_FN 5-6
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
+    mova           m5, [rgba_Ycoeff_%2%4]
+    mova           m6, [rgba_Ycoeff_%3%5]
+%if %0 == 6
+    jmp mangle(program_name %+ _ %+ %6 %+ ToY %+ SUFFIX).body
+%else ; %0 == 6
+.body:
+%if ARCH_X86_64
+    movsxd         wq, wd
+%endif
+    lea          srcq, [srcq+wq*4]
+    add            wq, wq
+    add          dstq, wq
+    neg            wq
+    mova           m4, [rgb_Yrnd]
+    pcmpeqb        m7, m7
+    psrlw          m7, 8                  ; (word) { 0x00ff } x4
+.loop:
+    ; FIXME check alignment and use mova
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
+    pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
+    pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
+    pmaddwd        m3, m5                 ; (dword) { Bx*BY + Rx*RY }[4-7]
+    pmaddwd        m2, m6                 ; (dword) { Gx*GY }[4-7]
+    paddd          m0, m4                 ; += rgb_Yrnd
+    paddd          m2, m4                 ; += rgb_Yrnd
+    paddd          m0, m1                 ; (dword) { Y[0-3] }
+    paddd          m2, m3                 ; (dword) { Y[4-7] }
+    psrad          m0, 9
+    psrad          m2, 9
+    packssdw       m0, m2                 ; (word) { Y[0-7] }
+    mova    [dstq+wq], m0
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers
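+; %2-5 = rgba, bgra, argb or abgr (in individual characters); optional %6 = name of the function (e.g. rgba) whose .body is jumped to on x86-64 after loading the coefficients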
+%macro RGB32_TO_UV_FN 5-6
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
+%if ARCH_X86_64
+    mova           m8, [rgba_Ucoeff_%2%4]
+    mova           m9, [rgba_Ucoeff_%3%5]
+    mova          m10, [rgba_Vcoeff_%2%4]
+    mova          m11, [rgba_Vcoeff_%3%5]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [rgba_Ucoeff_%2%4]
+%define coeffU2 [rgba_Ucoeff_%3%5]
+%define coeffV1 [rgba_Vcoeff_%2%4]
+%define coeffV2 [rgba_Vcoeff_%3%5]
+%endif ; x86-64/32
+%if ARCH_X86_64 && %0 == 6
+    jmp mangle(program_name %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
+%else ; ARCH_X86_64 && %0 == 6
+.body:
+%if ARCH_X86_64
+    movsxd         wq, dword r5m
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add            wq, wq
+    add         dstUq, wq
+    add         dstVq, wq
+    lea          srcq, [srcq+wq*2]
+    neg            wq
+    pcmpeqb        m7, m7
+    psrlw          m7, 8                  ; (word) { 0x00ff } x4
+    mova           m6, [rgb_UVrnd]
+.loop:
+    ; FIXME check alignment and use mova
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
+    pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
+    pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
+    pmaddwd        m1, coeffU1            ; (dword) { Bx*BU + Rx*RU }[0-3]
+    pmaddwd        m0, coeffU2            ; (dword) { Gx*GU }[0-3]
+    paddd          m3, m6                 ; += rgb_UVrnd
+    paddd          m1, m6                 ; += rgb_UVrnd
+    paddd          m2, m3                 ; (dword) { V[0-3] }
+    paddd          m0, m1                 ; (dword) { U[0-3] }
+    pmaddwd        m3, m5, coeffV1        ; (dword) { Bx*BV + Rx*RV }[4-7]
+    pmaddwd        m1, m4, coeffV2        ; (dword) { Gx*GV }[4-7]
+    pmaddwd        m5, coeffU1            ; (dword) { Bx*BU + Rx*RU }[4-7]
+    pmaddwd        m4, coeffU2            ; (dword) { Gx*GU }[4-7]
+    paddd          m3, m6                 ; += rgb_UVrnd
+    paddd          m5, m6                 ; += rgb_UVrnd
+    psrad          m0, 9
+    paddd          m1, m3                 ; (dword) { V[4-7] }
+    paddd          m4, m5                 ; (dword) { U[4-7] }
+    psrad          m2, 9
+    psrad          m4, 9
+    psrad          m1, 9
+    packssdw       m0, m4                 ; (word) { U[0-7] }
+    packssdw       m2, m1                 ; (word) { V[0-7] }
+%if mmsize == 8
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%else ; mmsize == 16
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%endif ; mmsize == 8/16
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; ARCH_X86_64 && %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB32_FUNCS 2
+RGB32_TO_Y_FN %1, r, g, b, a
+RGB32_TO_Y_FN %1, b, g, r, a, rgba
+RGB32_TO_Y_FN %1, a, r, g, b, rgba
+RGB32_TO_Y_FN %1, a, b, g, r, rgba
+
+RGB32_TO_UV_FN %2, r, g, b, a
+RGB32_TO_UV_FN %2, b, g, r, a, rgba
+RGB32_TO_UV_FN %2, a, r, g, b, rgba
+RGB32_TO_UV_FN %2, a, b, g, r, rgba
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB32_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB32_FUNCS 8, 12
+
+INIT_XMM avx
+RGB32_FUNCS 8, 12
+
 ;-----------------------------------------------------------------------------
 ; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
 ;
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index ab5b68fb0b..1118515164 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -308,6 +308,10 @@ extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
     INPUT_FUNC(yuyv, opt); \
     INPUT_UV_FUNC(nv12, opt); \
     INPUT_UV_FUNC(nv21, opt); \
+    INPUT_FUNC(rgba, opt); \
+    INPUT_FUNC(bgra, opt); \
+    INPUT_FUNC(argb, opt); \
+    INPUT_FUNC(abgr, opt); \
     INPUT_FUNC(rgb24, opt); \
     INPUT_FUNC(bgr24, opt)
 
@@ -406,6 +410,10 @@ switch(c->dstBpc){ \
             break;
         case_rgb(rgb24, RGB24, mmx);
         case_rgb(bgr24, BGR24, mmx);
+        case_rgb(bgra,  BGRA,  mmx);
+        case_rgb(rgba,  RGBA,  mmx);
+        case_rgb(abgr,  ABGR,  mmx);
+        case_rgb(argb,  ARGB,  mmx);
         default:
             break;
         }
@@ -450,6 +458,10 @@ switch(c->dstBpc){ \
             break;
         case_rgb(rgb24, RGB24, sse2);
         case_rgb(bgr24, BGR24, sse2);
+        case_rgb(bgra,  BGRA,  sse2);
+        case_rgb(rgba,  RGBA,  sse2);
+        case_rgb(abgr,  ABGR,  sse2);
+        case_rgb(argb,  ARGB,  sse2);
         default:
             break;
         }
@@ -493,6 +505,10 @@ switch(c->dstBpc){ \
             break;
         case_rgb(rgb24, RGB24, avx);
         case_rgb(bgr24, BGR24, avx);
+        case_rgb(bgra,  BGRA,  avx);
+        case_rgb(rgba,  RGBA,  avx);
+        case_rgb(abgr,  ABGR,  avx);
+        case_rgb(argb,  ARGB,  avx);
         default:
             break;
         }
diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c
new file mode 100644
index 0000000000..dd9a2a4378
--- /dev/null
+++ b/libswscale/x86/w64xmmtest.c
@@ -0,0 +1,31 @@
+/*
+ * check XMM registers for clobbers on Win64
+ * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/w64xmmtest.h"
+#include "libswscale/swscale.h"
+
+wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
+               const int srcStride[], int srcSliceY, int srcSliceH,
+               uint8_t *const dst[], const int dstStride[]))
+{
+    testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
+                    srcSliceH, dst, dstStride);
+}
author	Michael Niedermayer <michaelni@gmx.at>	2012-02-03 02:41:47 +0100
committer	Michael Niedermayer <michaelni@gmx.at>	2012-02-03 03:51:32 +0100
commit	d77294c5e404c8a214da0e74f7836390b48b2dba (patch)
tree	9c894cf54b1e18f285cc04eaf7e021e9976f4f2b /libswscale
parent	9477fa094b89645b3a34ef3bc52c4f18719ab4b3 (diff)
parent	e15e2a6d2a886aa9944ac9798687104c829d1541 (diff)
download	ffmpeg-d77294c5e404c8a214da0e74f7836390b48b2dba.tar.gz