aboutsummaryrefslogtreecommitdiffstats
path: root/libswscale
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2012-02-03 02:41:47 +0100
committerMichael Niedermayer <michaelni@gmx.at>2012-02-03 03:51:32 +0100
commitd77294c5e404c8a214da0e74f7836390b48b2dba (patch)
tree9c894cf54b1e18f285cc04eaf7e021e9976f4f2b /libswscale
parent9477fa094b89645b3a34ef3bc52c4f18719ab4b3 (diff)
parente15e2a6d2a886aa9944ac9798687104c829d1541 (diff)
downloadffmpeg-d77294c5e404c8a214da0e74f7836390b48b2dba.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master: libx264: fix indentation. vorbis: fix overflows in floor1[] vector and inverse db table index. win64: add a XMM clobber test configure option. movdec: Parse the dvc1 atom ARM: ac3: fix ac3_bit_alloc_calc_bap_armv6 swscale: K&R formatting cosmetics for Blackfin code frwu: lowercase the FRWU codec name movdec: fix dts generation in fragmented files fate: make acodec-ac3_fixed test output raw AC3 APIchanges: add missing commit hashes swscale: implement MMX, SSE2 and AVX functions for RGB32 input. ra144enc: drop pointless "encoder" from .long_name bethsoftvideo: fix palette reading. mpc7: use av_fast_padded_malloc() mpc7: simplify handling of packet sizes that are not a multiple of 4 bytes doc: decoding Forward Uncompressed is supported Fix a typo in the x86 asm version of ff_vector_clip_int32() pcmenc: Do not set avpkt->size. ff_alloc_packet: modify the size of the packet to match the requested size Conflicts: doc/APIchanges libavcodec/libx264.c libavcodec/mpc7.c libavformat/isom.h libswscale/Makefile libswscale/bfin/yuv2rgb_bfin.c tests/ref/fate/bethsoft-vid tests/ref/seek/ac3_ac3 Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale')
-rw-r--r--libswscale/Makefile2
-rw-r--r--libswscale/bfin/internal_bfin.S12
-rw-r--r--libswscale/bfin/swscale_bfin.c48
-rw-r--r--libswscale/bfin/yuv2rgb_bfin.c89
-rw-r--r--libswscale/x86/input.asm157
-rw-r--r--libswscale/x86/swscale_mmx.c16
-rw-r--r--libswscale/x86/w64xmmtest.c31
7 files changed, 283 insertions, 72 deletions
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 77d896a76b..b761470fd1 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -25,6 +25,8 @@ MMX-OBJS-$(HAVE_YASM) += x86/input.o \
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
+
TESTPROGS = colorspace swscale
DIRS = bfin mlib ppc sparc x86
diff --git a/libswscale/bfin/internal_bfin.S b/libswscale/bfin/internal_bfin.S
index cb8d71253c..eab30aa6ce 100644
--- a/libswscale/bfin/internal_bfin.S
+++ b/libswscale/bfin/internal_bfin.S
@@ -30,11 +30,11 @@ and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
The following calculation is used for the conversion:
- r = clipz((y-oy)*cy + crv*(v-128))
- g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
- b = clipz((y-oy)*cy + cbu*(u-128))
+ r = clipz((y - oy) * cy + crv * (v - 128))
+ g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
+ b = clipz((y - oy) * cy + cbu * (u - 128))
-y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
+y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
New factorization to eliminate the truncation error which was
@@ -47,7 +47,7 @@ occurring due to the byteop3p.
2) Scale operands up by a factor of 4 not 8 because Blackfin
multiplies include a shift.
-3) Compute into the accumulators cy*yx0, cy*yx1.
+3) Compute into the accumulators cy * yx0, cy * yx1.
4) Compute each of the linear equations:
r = clipz((y - oy) * cy + crv * (v - 128))
@@ -73,7 +73,7 @@ occurring due to the byteop3p.
Where coeffs have the following layout in memory.
-uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
+uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv;
coeffs is a pointer to oy.
diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c
index 870636ea05..3cd4f28387 100644
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -27,32 +27,34 @@
#include <assert.h>
#include "config.h"
#include <unistd.h>
+
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if defined (__FDPIC__) && CONFIG_SRAM
-#define L1CODE __attribute__ ((l1_text))
+#define L1CODE __attribute__((l1_text))
#else
#define L1CODE
#endif
-int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- int width, int height,
+int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height,
int lumStride, int chromStride, int srcStride) L1CODE;
-int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- int width, int height,
+int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height,
int lumStride, int chromStride, int srcStride) L1CODE;
-static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
- int srcSliceH, uint8_t* dst[], int dstStride[])
+static int uyvytoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[],
+ int dstStride[])
{
- uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
- uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
- uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
- uint8_t *ip = src[0] + srcStride[0]*srcSliceY;
- int w = dstStride[0];
+ uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
+ uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
+ uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
+ uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
+ int w = dstStride[0];
ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
@@ -60,14 +62,15 @@ static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
return srcSliceH;
}
-static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
- int srcSliceH, uint8_t* dst[], int dstStride[])
+static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[],
+ int dstStride[])
{
- uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
- uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
- uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
- uint8_t *ip = src[0] + srcStride[0]*srcSliceY;
- int w = dstStride[0];
+ uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
+ uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
+ uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
+ uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
+ int w = dstStride[0];
ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
@@ -75,15 +78,16 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
return srcSliceH;
}
-
void ff_bfin_get_unscaled_swscale(SwsContext *c)
{
if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) {
- av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
+ av_log(NULL, AV_LOG_VERBOSE,
+ "selecting Blackfin optimized uyvytoyv12_unscaled\n");
c->swScale = uyvytoyv12_unscaled;
}
if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) {
- av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
+ av_log(NULL, AV_LOG_VERBOSE,
+ "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
c->swScale = yuyvtoyv12_unscaled;
}
}
diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c
index 7a7dc7f0e6..e7f657fe00 100644
--- a/libswscale/bfin/yuv2rgb_bfin.c
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -26,15 +26,16 @@
#include <string.h>
#include <inttypes.h>
#include <assert.h>
-#include "config.h"
#include <unistd.h>
#include "libavutil/pixdesc.h"
+
+#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if defined(__FDPIC__) && CONFIG_SRAM
-#define L1CODE __attribute__ ((l1_text))
+#define L1CODE __attribute__((l1_text))
#else
#define L1CODE
#endif
@@ -48,21 +49,20 @@ void ff_bfin_yuv2rgb565_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
void ff_bfin_yuv2rgb24_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) L1CODE;
-typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
- int w, uint32_t *coeffs);
-
+typedef void (*ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs);
static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
{
int oy;
- oy = c->yOffset&0xffff;
- oy = oy >> 3; // keep everything U8.0 for offset calculation
+ oy = c->yOffset & 0xffff;
+ oy = oy >> 3; // keep everything U8.0 for offset calculation
- c->oc = 128*0x01010101U;
- c->oy = oy*0x01010101U;
+ c->oc = 128 * 0x01010101U;
+ c->oy = oy * 0x01010101U;
/* copy 64bit vector coeffs down to 32bit vector coeffs */
- c->cy = c->yCoeff;
+ c->cy = c->yCoeff;
c->zero = 0;
if (rgb) {
@@ -77,7 +77,6 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
c->cgv = c->ugCoeff;
}
-
if (masks == 555) {
c->rmask = 0x001f * 0x00010001U;
c->gmask = 0x03e0 * 0x00010001U;
@@ -89,27 +88,25 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
}
}
-static int core_yuv420_rgb(SwsContext *c,
- uint8_t **in, int *instrides,
- int srcSliceY, int srcSliceH,
- uint8_t **oplanes, int *outstrides,
- ltransform lcscf, int rgb, int masks)
+static int core_yuv420_rgb(SwsContext *c, uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH, uint8_t **oplanes,
+ int *outstrides, ltransform lcscf,
+ int rgb, int masks)
{
- uint8_t *py,*pu,*pv,*op;
+ uint8_t *py, *pu, *pv, *op;
int w = instrides[0];
- int h2 = srcSliceH>>1;
+ int h2 = srcSliceH >> 1;
int i;
bfin_prepare_coefficients(c, rgb, masks);
py = in[0];
- pu = in[1+(1^rgb)];
- pv = in[1+(0^rgb)];
-
- op = oplanes[0] + srcSliceY*outstrides[0];
+ pu = in[1 + (1 ^ rgb)];
+ pv = in[1 + (0 ^ rgb)];
- for (i=0;i<h2;i++) {
+ op = oplanes[0] + srcSliceY * outstrides[0];
+ for (i = 0; i < h2; i++) {
lcscf(py, pu, pv, op, w, &c->oy);
py += instrides[0];
@@ -126,9 +123,7 @@ static int core_yuv420_rgb(SwsContext *c,
return srcSliceH;
}
-
-static int bfin_yuv420_rgb555(SwsContext *c,
- uint8_t **in, int *instrides,
+static int bfin_yuv420_rgb555(SwsContext *c, uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
@@ -136,8 +131,7 @@ static int bfin_yuv420_rgb555(SwsContext *c,
outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
}
-static int bfin_yuv420_bgr555(SwsContext *c,
- uint8_t **in, int *instrides,
+static int bfin_yuv420_bgr555(SwsContext *c, uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
@@ -145,8 +139,7 @@ static int bfin_yuv420_bgr555(SwsContext *c,
outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
}
-static int bfin_yuv420_rgb24(SwsContext *c,
- uint8_t **in, int *instrides,
+static int bfin_yuv420_rgb24(SwsContext *c, uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
@@ -154,8 +147,7 @@ static int bfin_yuv420_rgb24(SwsContext *c,
outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
}
-static int bfin_yuv420_bgr24(SwsContext *c,
- uint8_t **in, int *instrides,
+static int bfin_yuv420_bgr24(SwsContext *c, uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
@@ -163,8 +155,7 @@ static int bfin_yuv420_bgr24(SwsContext *c,
outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
}
-static int bfin_yuv420_rgb565(SwsContext *c,
- uint8_t **in, int *instrides,
+static int bfin_yuv420_rgb565(SwsContext *c, uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
@@ -172,8 +163,7 @@ static int bfin_yuv420_rgb565(SwsContext *c,
outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
}
-static int bfin_yuv420_bgr565(SwsContext *c,
- uint8_t **in, int *instrides,
+static int bfin_yuv420_bgr565(SwsContext *c, uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
@@ -181,18 +171,29 @@ static int bfin_yuv420_bgr565(SwsContext *c,
outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
}
-
SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
{
SwsFunc f;
- switch(c->dstFormat) {
- case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break;
- case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break;
- case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break;
- case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break;
- case PIX_FMT_RGB24: f = bfin_yuv420_rgb24; break;
- case PIX_FMT_BGR24: f = bfin_yuv420_bgr24; break;
+ switch (c->dstFormat) {
+ case PIX_FMT_RGB555:
+ f = bfin_yuv420_rgb555;
+ break;
+ case PIX_FMT_BGR555:
+ f = bfin_yuv420_bgr555;
+ break;
+ case PIX_FMT_RGB565:
+ f = bfin_yuv420_rgb565;
+ break;
+ case PIX_FMT_BGR565:
+ f = bfin_yuv420_bgr565;
+ break;
+ case PIX_FMT_RGB24:
+ f = bfin_yuv420_rgb24;
+ break;
+ case PIX_FMT_BGR24:
+ f = bfin_yuv420_bgr24;
+ break;
default:
return 0;
}
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 9a8e24f0b0..50e071a89a 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -51,6 +51,19 @@ bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
+rgba_Ycoeff_rb: times 4 dw RY, BY
+rgba_Ycoeff_br: times 4 dw BY, RY
+rgba_Ycoeff_ga: times 4 dw GY, 0
+rgba_Ycoeff_ag: times 4 dw 0, GY
+rgba_Ucoeff_rb: times 4 dw RU, BU
+rgba_Ucoeff_br: times 4 dw BU, RU
+rgba_Ucoeff_ga: times 4 dw GU, 0
+rgba_Ucoeff_ag: times 4 dw 0, GU
+rgba_Vcoeff_rb: times 4 dw RV, BV
+rgba_Vcoeff_br: times 4 dw BV, RV
+rgba_Vcoeff_ga: times 4 dw GV, 0
+rgba_Vcoeff_ag: times 4 dw 0, GV
+
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
@@ -294,6 +307,150 @@ RGB24_FUNCS 11, 13
INIT_XMM avx
RGB24_FUNCS 11, 13
+; %1 = nr. of XMM registers
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_Y_FN 5-6
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
+ mova m5, [rgba_Ycoeff_%2%4]
+ mova m6, [rgba_Ycoeff_%3%5]
+%if %0 == 6
+ jmp mangle(program_name %+ _ %+ %6 %+ ToY %+ SUFFIX).body
+%else ; %0 == 6
+.body:
+%if ARCH_X86_64
+ movsxd wq, wd
+%endif
+ lea srcq, [srcq+wq*4]
+ add wq, wq
+ add dstq, wq
+ neg wq
+ mova m4, [rgb_Yrnd]
+ pcmpeqb m7, m7
+ psrlw m7, 8 ; (word) { 0x00ff } x4
+.loop:
+ ; FIXME check alignment and use mova
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
+ pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
+ pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
+ pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
+ pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
+ paddd m0, m4 ; += rgb_Yrnd
+ paddd m2, m4 ; += rgb_Yrnd
+ paddd m0, m1 ; (dword) { Y[0-3] }
+ paddd m2, m3 ; (dword) { Y[4-7] }
+ psrad m0, 9
+ psrad m2, 9
+ packssdw m0, m2 ; (word) { Y[0-7] }
+ mova [dstq+wq], m0
+ add wq, mmsize
+ jl .loop
+ REP_RET
+%endif ; %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_UV_FN 5-6
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
+%if ARCH_X86_64
+ mova m8, [rgba_Ucoeff_%2%4]
+ mova m9, [rgba_Ucoeff_%3%5]
+ mova m10, [rgba_Vcoeff_%2%4]
+ mova m11, [rgba_Vcoeff_%3%5]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [rgba_Ucoeff_%2%4]
+%define coeffU2 [rgba_Ucoeff_%3%5]
+%define coeffV1 [rgba_Vcoeff_%2%4]
+%define coeffV2 [rgba_Vcoeff_%3%5]
+%endif ; x86-64/32
+%if ARCH_X86_64 && %0 == 6
+ jmp mangle(program_name %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
+%else ; ARCH_X86_64 && %0 == 6
+.body:
+%if ARCH_X86_64
+ movsxd wq, dword r5m
+%else ; x86-32
+ mov wq, r5m
+%endif
+ add wq, wq
+ add dstUq, wq
+ add dstVq, wq
+ lea srcq, [srcq+wq*2]
+ neg wq
+ pcmpeqb m7, m7
+ psrlw m7, 8 ; (word) { 0x00ff } x4
+ mova m6, [rgb_UVrnd]
+.loop:
+ ; FIXME check alignment and use mova
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
+ pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
+ pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
+ pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
+ pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
+ paddd m3, m6 ; += rgb_UVrnd
+ paddd m1, m6 ; += rgb_UVrnd
+ paddd m2, m3 ; (dword) { V[0-3] }
+ paddd m0, m1 ; (dword) { U[0-3] }
+ pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
+ pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
+ pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
+ pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
+ paddd m3, m6 ; += rgb_UVrnd
+ paddd m5, m6 ; += rgb_UVrnd
+ psrad m0, 9
+ paddd m1, m3 ; (dword) { V[4-7] }
+ paddd m4, m5 ; (dword) { U[4-7] }
+ psrad m2, 9
+ psrad m4, 9
+ psrad m1, 9
+ packssdw m0, m4 ; (word) { U[0-7] }
+ packssdw m2, m1 ; (word) { V[0-7] }
+%if mmsize == 8
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
+%else ; mmsize == 16
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
+%endif ; mmsize == 8/16
+ add wq, mmsize
+ jl .loop
+ REP_RET
+%endif ; ARCH_X86_64 && %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB32_FUNCS 2
+RGB32_TO_Y_FN %1, r, g, b, a
+RGB32_TO_Y_FN %1, b, g, r, a, rgba
+RGB32_TO_Y_FN %1, a, r, g, b, rgba
+RGB32_TO_Y_FN %1, a, b, g, r, rgba
+
+RGB32_TO_UV_FN %2, r, g, b, a
+RGB32_TO_UV_FN %2, b, g, r, a, rgba
+RGB32_TO_UV_FN %2, a, r, g, b, rgba
+RGB32_TO_UV_FN %2, a, b, g, r, rgba
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB32_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB32_FUNCS 8, 12
+
+INIT_XMM avx
+RGB32_FUNCS 8, 12
+
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index ab5b68fb0b..1118515164 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -308,6 +308,10 @@ extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
INPUT_FUNC(yuyv, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv21, opt); \
+ INPUT_FUNC(rgba, opt); \
+ INPUT_FUNC(bgra, opt); \
+ INPUT_FUNC(argb, opt); \
+ INPUT_FUNC(abgr, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
@@ -406,6 +410,10 @@ switch(c->dstBpc){ \
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
+ case_rgb(bgra, BGRA, mmx);
+ case_rgb(rgba, RGBA, mmx);
+ case_rgb(abgr, ABGR, mmx);
+ case_rgb(argb, ARGB, mmx);
default:
break;
}
@@ -450,6 +458,10 @@ switch(c->dstBpc){ \
break;
case_rgb(rgb24, RGB24, sse2);
case_rgb(bgr24, BGR24, sse2);
+ case_rgb(bgra, BGRA, sse2);
+ case_rgb(rgba, RGBA, sse2);
+ case_rgb(abgr, ABGR, sse2);
+ case_rgb(argb, ARGB, sse2);
default:
break;
}
@@ -493,6 +505,10 @@ switch(c->dstBpc){ \
break;
case_rgb(rgb24, RGB24, avx);
case_rgb(bgr24, BGR24, avx);
+ case_rgb(bgra, BGRA, avx);
+ case_rgb(rgba, RGBA, avx);
+ case_rgb(abgr, ABGR, avx);
+ case_rgb(argb, ARGB, avx);
default:
break;
}
diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c
new file mode 100644
index 0000000000..dd9a2a4378
--- /dev/null
+++ b/libswscale/x86/w64xmmtest.c
@@ -0,0 +1,31 @@
+/*
+ * check XMM registers for clobbers on Win64
+ * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/w64xmmtest.h"
+#include "libswscale/swscale.h"
+
+wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
+ const int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *const dst[], const int dstStride[]))
+{
+ testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
+ srcSliceH, dst, dstStride);
+}