diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-02-03 02:41:47 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-02-03 03:51:32 +0100 |
commit | d77294c5e404c8a214da0e74f7836390b48b2dba (patch) | |
tree | 9c894cf54b1e18f285cc04eaf7e021e9976f4f2b /libswscale | |
parent | 9477fa094b89645b3a34ef3bc52c4f18719ab4b3 (diff) | |
parent | e15e2a6d2a886aa9944ac9798687104c829d1541 (diff) | |
download | ffmpeg-d77294c5e404c8a214da0e74f7836390b48b2dba.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
libx264: fix indentation.
vorbis: fix overflows in floor1[] vector and inverse db table index.
win64: add a XMM clobber test configure option.
movdec: Parse the dvc1 atom
ARM: ac3: fix ac3_bit_alloc_calc_bap_armv6
swscale: K&R formatting cosmetics for Blackfin code
frwu: lowercase the FRWU codec name
movdec: fix dts generation in fragmented files
fate: make acodec-ac3_fixed test output raw AC3
APIchanges: add missing commit hashes
swscale: implement MMX, SSE2 and AVX functions for RGB32 input.
ra144enc: drop pointless "encoder" from .long_name
bethsoftvideo: fix palette reading.
mpc7: use av_fast_padded_malloc()
mpc7: simplify handling of packet sizes that are not a multiple of 4 bytes
doc: decoding Forward Uncompressed is supported
Fix a typo in the x86 asm version of ff_vector_clip_int32()
pcmenc: Do not set avpkt->size.
ff_alloc_packet: modify the size of the packet to match the requested size
Conflicts:
doc/APIchanges
libavcodec/libx264.c
libavcodec/mpc7.c
libavformat/isom.h
libswscale/Makefile
libswscale/bfin/yuv2rgb_bfin.c
tests/ref/fate/bethsoft-vid
tests/ref/seek/ac3_ac3
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale')
-rw-r--r-- | libswscale/Makefile | 2 | ||||
-rw-r--r-- | libswscale/bfin/internal_bfin.S | 12 | ||||
-rw-r--r-- | libswscale/bfin/swscale_bfin.c | 48 | ||||
-rw-r--r-- | libswscale/bfin/yuv2rgb_bfin.c | 89 | ||||
-rw-r--r-- | libswscale/x86/input.asm | 157 | ||||
-rw-r--r-- | libswscale/x86/swscale_mmx.c | 16 | ||||
-rw-r--r-- | libswscale/x86/w64xmmtest.c | 31 |
7 files changed, 283 insertions, 72 deletions
diff --git a/libswscale/Makefile b/libswscale/Makefile index 77d896a76b..b761470fd1 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -25,6 +25,8 @@ MMX-OBJS-$(HAVE_YASM) += x86/input.o \ $(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) +OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o + TESTPROGS = colorspace swscale DIRS = bfin mlib ppc sparc x86 diff --git a/libswscale/bfin/internal_bfin.S b/libswscale/bfin/internal_bfin.S index cb8d71253c..eab30aa6ce 100644 --- a/libswscale/bfin/internal_bfin.S +++ b/libswscale/bfin/internal_bfin.S @@ -30,11 +30,11 @@ and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts. The following calculation is used for the conversion: - r = clipz((y-oy)*cy + crv*(v-128)) - g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) - b = clipz((y-oy)*cy + cbu*(u-128)) + r = clipz((y - oy) * cy + crv * (v - 128)) + g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) + b = clipz((y - oy) * cy + cbu * (u - 128)) -y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision. +y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision. New factorization to eliminate the truncation error which was @@ -47,7 +47,7 @@ occurring due to the byteop3p. 2) Scale operands up by a factor of 4 not 8 because Blackfin multiplies include a shift. -3) Compute into the accumulators cy*yx0, cy*yx1. +3) Compute into the accumulators cy * yx0, cy * yx1. 4) Compute each of the linear equations: r = clipz((y - oy) * cy + crv * (v - 128)) @@ -73,7 +73,7 @@ occurring due to the byteop3p. Where coeffs have the following layout in memory. -uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv; +uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv; coeffs is a pointer to oy. diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c index 870636ea05..3cd4f28387 100644 --- a/libswscale/bfin/swscale_bfin.c +++ b/libswscale/bfin/swscale_bfin.c @@ -27,32 +27,34 @@ #include <assert.h> #include "config.h" #include <unistd.h> + #include "libswscale/rgb2rgb.h" #include "libswscale/swscale.h" #include "libswscale/swscale_internal.h" #if defined (__FDPIC__) && CONFIG_SRAM -#define L1CODE __attribute__ ((l1_text)) +#define L1CODE __attribute__((l1_text)) #else #define L1CODE #endif -int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, +int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride) L1CODE; -int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, +int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride) L1CODE; -static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]) +static int uyvytoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[], + int srcSliceY, int srcSliceH, uint8_t *dst[], + int dstStride[]) { - uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY; - uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2; - uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2; - uint8_t *ip = src[0] + srcStride[0]*srcSliceY; - int w = dstStride[0]; + uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY; + uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2; + uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2; + uint8_t *ip = src[0] + srcStride[0] * srcSliceY; + int w = dstStride[0]; ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]); @@ -60,14 +62,15 @@ static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i return srcSliceH; } -static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]) +static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[], + int srcSliceY, int srcSliceH, uint8_t *dst[], + int dstStride[]) { - uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY; - uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2; - uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2; - uint8_t *ip = src[0] + srcStride[0]*srcSliceY; - int w = dstStride[0]; + uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY; + uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2; + uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2; + uint8_t *ip = src[0] + srcStride[0] * srcSliceY; + int w = dstStride[0]; ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]); @@ -75,15 +78,16 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i return srcSliceH; } - void ff_bfin_get_unscaled_swscale(SwsContext *c) { if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) { - av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n"); + av_log(NULL, AV_LOG_VERBOSE, + "selecting Blackfin optimized uyvytoyv12_unscaled\n"); c->swScale = uyvytoyv12_unscaled; } if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) { - av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n"); + av_log(NULL, AV_LOG_VERBOSE, + "selecting Blackfin optimized yuyvtoyv12_unscaled\n"); c->swScale = yuyvtoyv12_unscaled; } } diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c index 7a7dc7f0e6..e7f657fe00 100644 --- a/libswscale/bfin/yuv2rgb_bfin.c +++ b/libswscale/bfin/yuv2rgb_bfin.c @@ -26,15 +26,16 @@ #include <string.h> #include <inttypes.h> #include <assert.h> -#include "config.h" #include <unistd.h> #include "libavutil/pixdesc.h" + +#include "config.h" #include "libswscale/rgb2rgb.h" #include "libswscale/swscale.h" #include "libswscale/swscale_internal.h" #if defined(__FDPIC__) && CONFIG_SRAM -#define L1CODE __attribute__ ((l1_text)) +#define L1CODE __attribute__((l1_text)) #else #define L1CODE #endif @@ -48,21 +49,20 @@ void ff_bfin_yuv2rgb565_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, void ff_bfin_yuv2rgb24_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, int w, uint32_t *coeffs) L1CODE; -typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, - int w, uint32_t *coeffs); - +typedef void (*ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, + int w, uint32_t *coeffs); static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks) { int oy; - oy = c->yOffset&0xffff; - oy = oy >> 3; // keep everything U8.0 for offset calculation + oy = c->yOffset & 0xffff; + oy = oy >> 3; // keep everything U8.0 for offset calculation - c->oc = 128*0x01010101U; - c->oy = oy*0x01010101U; + c->oc = 128 * 0x01010101U; + c->oy = oy * 0x01010101U; /* copy 64bit vector coeffs down to 32bit vector coeffs */ - c->cy = c->yCoeff; + c->cy = c->yCoeff; c->zero = 0; if (rgb) { @@ -77,7 +77,6 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks) c->cgv = c->ugCoeff; } - if (masks == 555) { c->rmask = 0x001f * 0x00010001U; c->gmask = 0x03e0 * 0x00010001U; @@ -89,27 +88,25 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks) } } -static int core_yuv420_rgb(SwsContext *c, - uint8_t **in, int *instrides, - int srcSliceY, int srcSliceH, - uint8_t **oplanes, int *outstrides, - ltransform lcscf, int rgb, int masks) +static int core_yuv420_rgb(SwsContext *c, uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, uint8_t **oplanes, + int *outstrides, ltransform lcscf, + int rgb, int masks) { - uint8_t *py,*pu,*pv,*op; + uint8_t *py, *pu, *pv, *op; int w = instrides[0]; - int h2 = srcSliceH>>1; + int h2 = srcSliceH >> 1; int i; bfin_prepare_coefficients(c, rgb, masks); py = in[0]; - pu = in[1+(1^rgb)]; - pv = in[1+(0^rgb)]; - - op = oplanes[0] + srcSliceY*outstrides[0]; + pu = in[1 + (1 ^ rgb)]; + pv = in[1 + (0 ^ rgb)]; - for (i=0;i<h2;i++) { + op = oplanes[0] + srcSliceY * outstrides[0]; + for (i = 0; i < h2; i++) { lcscf(py, pu, pv, op, w, &c->oy); py += instrides[0]; @@ -126,9 +123,7 @@ static int core_yuv420_rgb(SwsContext *c, return srcSliceH; } - -static int bfin_yuv420_rgb555(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_rgb555(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -136,8 +131,7 @@ static int bfin_yuv420_rgb555(SwsContext *c, outstrides, ff_bfin_yuv2rgb555_line, 1, 555); } -static int bfin_yuv420_bgr555(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_bgr555(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -145,8 +139,7 @@ static int bfin_yuv420_bgr555(SwsContext *c, outstrides, ff_bfin_yuv2rgb555_line, 0, 555); } -static int bfin_yuv420_rgb24(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_rgb24(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -154,8 +147,7 @@ static int bfin_yuv420_rgb24(SwsContext *c, outstrides, ff_bfin_yuv2rgb24_line, 1, 888); } -static int bfin_yuv420_bgr24(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_bgr24(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -163,8 +155,7 @@ static int bfin_yuv420_bgr24(SwsContext *c, outstrides, ff_bfin_yuv2rgb24_line, 0, 888); } -static int bfin_yuv420_rgb565(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_rgb565(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -172,8 +163,7 @@ static int bfin_yuv420_rgb565(SwsContext *c, outstrides, ff_bfin_yuv2rgb565_line, 1, 565); } -static int bfin_yuv420_bgr565(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_bgr565(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -181,18 +171,29 @@ static int bfin_yuv420_bgr565(SwsContext *c, outstrides, ff_bfin_yuv2rgb565_line, 0, 565); } - SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c) { SwsFunc f; - switch(c->dstFormat) { - case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break; - case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break; - case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break; - case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break; - case PIX_FMT_RGB24: f = bfin_yuv420_rgb24; break; - case PIX_FMT_BGR24: f = bfin_yuv420_bgr24; break; + switch (c->dstFormat) { + case PIX_FMT_RGB555: + f = bfin_yuv420_rgb555; + break; + case PIX_FMT_BGR555: + f = bfin_yuv420_bgr555; + break; + case PIX_FMT_RGB565: + f = bfin_yuv420_rgb565; + break; + case PIX_FMT_BGR565: + f = bfin_yuv420_bgr565; + break; + case PIX_FMT_RGB24: + f = bfin_yuv420_rgb24; + break; + case PIX_FMT_BGR24: + f = bfin_yuv420_bgr24; + break; default: return 0; } diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 9a8e24f0b0..50e071a89a 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -51,6 +51,19 @@ bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV +rgba_Ycoeff_rb: times 4 dw RY, BY +rgba_Ycoeff_br: times 4 dw BY, RY +rgba_Ycoeff_ga: times 4 dw GY, 0 +rgba_Ycoeff_ag: times 4 dw 0, GY +rgba_Ucoeff_rb: times 4 dw RU, BU +rgba_Ucoeff_br: times 4 dw BU, RU +rgba_Ucoeff_ga: times 4 dw GU, 0 +rgba_Ucoeff_ag: times 4 dw 0, GU +rgba_Vcoeff_rb: times 4 dw RV, BV +rgba_Vcoeff_br: times 4 dw BV, RV +rgba_Vcoeff_ga: times 4 dw GV, 0 +rgba_Vcoeff_ag: times 4 dw 0, GV + shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \ @@ -294,6 +307,150 @@ RGB24_FUNCS 11, 13 INIT_XMM avx RGB24_FUNCS 11, 13 +; %1 = nr. of XMM registers +; %2-5 = rgba, bgra, argb or abgr (in individual characters) +%macro RGB32_TO_Y_FN 5-6 +cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3 + mova m5, [rgba_Ycoeff_%2%4] + mova m6, [rgba_Ycoeff_%3%5] +%if %0 == 6 + jmp mangle(program_name %+ _ %+ %6 %+ ToY %+ SUFFIX).body +%else ; %0 == 6 +.body: +%if ARCH_X86_64 + movsxd wq, wd +%endif + lea srcq, [srcq+wq*4] + add wq, wq + add dstq, wq + neg wq + mova m4, [rgb_Yrnd] + pcmpeqb m7, m7 + psrlw m7, 8 ; (word) { 0x00ff } x4 +.loop: + ; FIXME check alignment and use mova + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] + pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] + pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] + pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7] + pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7] + paddd m0, m4 ; += rgb_Yrnd + paddd m2, m4 ; += rgb_Yrnd + paddd m0, m1 ; (dword) { Y[0-3] } + paddd m2, m3 ; (dword) { Y[4-7] } + psrad m0, 9 + psrad m2, 9 + packssdw m0, m2 ; (word) { Y[0-7] } + mova [dstq+wq], m0 + add wq, mmsize + jl .loop + REP_RET +%endif ; %0 == 3 +%endmacro + +; %1 = nr. of XMM registers +; %2-5 = rgba, bgra, argb or abgr (in individual characters) +%macro RGB32_TO_UV_FN 5-6 +cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3 +%if ARCH_X86_64 + mova m8, [rgba_Ucoeff_%2%4] + mova m9, [rgba_Ucoeff_%3%5] + mova m10, [rgba_Vcoeff_%2%4] + mova m11, [rgba_Vcoeff_%3%5] +%define coeffU1 m8 +%define coeffU2 m9 +%define coeffV1 m10 +%define coeffV2 m11 +%else ; x86-32 +%define coeffU1 [rgba_Ucoeff_%2%4] +%define coeffU2 [rgba_Ucoeff_%3%5] +%define coeffV1 [rgba_Vcoeff_%2%4] +%define coeffV2 [rgba_Vcoeff_%3%5] +%endif ; x86-64/32 +%if ARCH_X86_64 && %0 == 6 + jmp mangle(program_name %+ _ %+ %6 %+ ToUV %+ SUFFIX).body +%else ; ARCH_X86_64 && %0 == 6 +.body: +%if ARCH_X86_64 + movsxd wq, dword r5m +%else ; x86-32 + mov wq, r5m +%endif + add wq, wq + add dstUq, wq + add dstVq, wq + lea srcq, [srcq+wq*2] + neg wq + pcmpeqb m7, m7 + psrlw m7, 8 ; (word) { 0x00ff } x4 + mova m6, [rgb_UVrnd] +.loop: + ; FIXME check alignment and use mova + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] + pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] + pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] + pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] + pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] + paddd m3, m6 ; += rgb_UVrnd + paddd m1, m6 ; += rgb_UVrnd + paddd m2, m3 ; (dword) { V[0-3] } + paddd m0, m1 ; (dword) { U[0-3] } + pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7] + pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7] + pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7] + pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] + paddd m3, m6 ; += rgb_UVrnd + paddd m5, m6 ; += rgb_UVrnd + psrad m0, 9 + paddd m1, m3 ; (dword) { V[4-7] } + paddd m4, m5 ; (dword) { U[4-7] } + psrad m2, 9 + psrad m4, 9 + psrad m1, 9 + packssdw m0, m4 ; (word) { U[0-7] } + packssdw m2, m1 ; (word) { V[0-7] } +%if mmsize == 8 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%else ; mmsize == 16 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%endif ; mmsize == 8/16 + add wq, mmsize + jl .loop + REP_RET +%endif ; ARCH_X86_64 && %0 == 3 +%endmacro + +; %1 = nr. of XMM registers for rgb-to-Y func +; %2 = nr. of XMM registers for rgb-to-UV func +%macro RGB32_FUNCS 2 +RGB32_TO_Y_FN %1, r, g, b, a +RGB32_TO_Y_FN %1, b, g, r, a, rgba +RGB32_TO_Y_FN %1, a, r, g, b, rgba +RGB32_TO_Y_FN %1, a, b, g, r, rgba + +RGB32_TO_UV_FN %2, r, g, b, a +RGB32_TO_UV_FN %2, b, g, r, a, rgba +RGB32_TO_UV_FN %2, a, r, g, b, rgba +RGB32_TO_UV_FN %2, a, b, g, r, rgba +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +RGB32_FUNCS 0, 0 +%endif + +INIT_XMM sse2 +RGB32_FUNCS 8, 12 + +INIT_XMM avx +RGB32_FUNCS 8, 12 + ;----------------------------------------------------------------------------- ; YUYV/UYVY/NV12/NV21 packed pixel shuffling. ; diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c index ab5b68fb0b..1118515164 100644 --- a/libswscale/x86/swscale_mmx.c +++ b/libswscale/x86/swscale_mmx.c @@ -308,6 +308,10 @@ extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ INPUT_FUNC(yuyv, opt); \ INPUT_UV_FUNC(nv12, opt); \ INPUT_UV_FUNC(nv21, opt); \ + INPUT_FUNC(rgba, opt); \ + INPUT_FUNC(bgra, opt); \ + INPUT_FUNC(argb, opt); \ + INPUT_FUNC(abgr, opt); \ INPUT_FUNC(rgb24, opt); \ INPUT_FUNC(bgr24, opt) @@ -406,6 +410,10 @@ switch(c->dstBpc){ \ break; case_rgb(rgb24, RGB24, mmx); case_rgb(bgr24, BGR24, mmx); + case_rgb(bgra, BGRA, mmx); + case_rgb(rgba, RGBA, mmx); + case_rgb(abgr, ABGR, mmx); + case_rgb(argb, ARGB, mmx); default: break; } @@ -450,6 +458,10 @@ switch(c->dstBpc){ \ break; case_rgb(rgb24, RGB24, sse2); case_rgb(bgr24, BGR24, sse2); + case_rgb(bgra, BGRA, sse2); + case_rgb(rgba, RGBA, sse2); + case_rgb(abgr, ABGR, sse2); + case_rgb(argb, ARGB, sse2); default: break; } @@ -493,6 +505,10 @@ switch(c->dstBpc){ \ break; case_rgb(rgb24, RGB24, avx); case_rgb(bgr24, BGR24, avx); + case_rgb(bgra, BGRA, avx); + case_rgb(rgba, RGBA, avx); + case_rgb(abgr, ABGR, avx); + case_rgb(argb, ARGB, avx); default: break; } diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c new file mode 100644 index 0000000000..dd9a2a4378 --- /dev/null +++ b/libswscale/x86/w64xmmtest.c @@ -0,0 +1,31 @@ +/* + * check XMM registers for clobbers on Win64 + * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/w64xmmtest.h" +#include "libswscale/swscale.h" + +wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[], + const int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *const dst[], const int dstStride[])) +{ + testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY, + srcSliceH, dst, dstStride); +} |