aboutsummaryrefslogtreecommitdiffstats
path: root/libswscale/x86
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2012-03-16 07:47:27 +0100
committerMichael Niedermayer <michaelni@gmx.at>2012-03-16 09:01:08 +0100
commit568e9062bd29e13e0bfa42f2ac8411d01608634d (patch)
treea78351d75b3dee8257909ccffde46880099c91bb /libswscale/x86
parent5dbc75870f486fb9c0237870eafa834a8a2066c8 (diff)
parent5effcfa76792470677a1f6bc9aa73347a87ef720 (diff)
downloadffmpeg-568e9062bd29e13e0bfa42f2ac8411d01608634d.tar.gz
Merge remote-tracking branch 'qatar/release/0.8' into release/0.10
* qatar/release/0.8: (154 commits) Update Changelog for the 0.8.1 Release dca: include libavutil/mathematics.h for possibly missing M_SQRT1_2 dca: don't use av_clip_uintp2(). snow: check reference frame indices. snow: reject unsupported chroma shifts. xa_adpcm: limit filter to prevent xa_adpcm_table[] array bounds overruns. h264: increase reference poc list from 16 to 32. h264: stricter reference limit enforcement. h264: improve parsing of broken AVC SPS Replace computations of remaining bits with calls to get_bits_left(). png: convert to bytestream2 API. roqvideo: convert to bytestream2 API. smc: port to bytestream2 API. tgq: convert to bytestream2 API. algmm: convert to bytestream2 API. jvdec: unbreak video decoding h264: Fix invalid interlaced/progressive MB combinations for direct mode prediction. libx264: add 'stats' private option for setting 2pass stats filename. libx264: fix help text for slice-max-size option. avconv: reindent ... Conflicts: Changelog RELEASE avconv.c doc/APIchanges ffplay.c libavcodec/Makefile libavcodec/aacdec.c libavcodec/alsdec.c libavcodec/atrac3.c libavcodec/avcodec.h libavcodec/dvdata.c libavcodec/fraps.c libavcodec/golomb.h libavcodec/h264.c libavcodec/h264.h libavcodec/h264_cabac.c libavcodec/h264_cavlc.c libavcodec/h264_direct.c libavcodec/h264_parser.c libavcodec/h264_ps.c libavcodec/h264idct_template.c libavcodec/indeo3.c libavcodec/kgv1dec.c libavcodec/kmvc.c libavcodec/mjpegbdec.c libavcodec/mmvideo.c libavcodec/mpegaudiodec.c libavcodec/mpegvideo.h libavcodec/options.c libavcodec/pngdec.c libavcodec/roqvideodec.c libavcodec/shorten.c libavcodec/svq3.c libavcodec/utils.c libavcodec/version.h libavcodec/wmadec.c libavcodec/xxan.c libavformat/Makefile libavformat/asfdec.c libavformat/dv.c libavformat/mov.c libavformat/nsvdec.c libavformat/utils.c libavformat/version.h libavutil/avutil.h libavutil/error.c libavutil/error.h libswscale/swscale.c libswscale/utils.c libswscale/x86/swscale_template.c tests/ref/acodec/g722 Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale/x86')
-rw-r--r--libswscale/x86/scale.asm31
-rw-r--r--libswscale/x86/swscale_mmx.c44
-rw-r--r--libswscale/x86/swscale_template.c28
3 files changed, 72 insertions, 31 deletions
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 09313b926f..692a88fe39 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -38,7 +38,7 @@ SECTION .text
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
-; const int16_t *filterPos, int filterSize);
+; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bits width or 16-bits width
; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
@@ -53,6 +53,9 @@ SECTION .text
cglobal hscale%1to%2_%4_%5, %6, 7, %7
%ifdef ARCH_X86_64
movsxd r2, r2d
+%define mov32 movsxd
+%else ; x86-32
+%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
@@ -95,14 +98,14 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%else ; %2 == 19
lea r1, [r1+r2*(4>>r2shr)]
%endif ; %2 == 15/19
- lea r5, [r5+r2*(2>>r2shr)]
+ lea r5, [r5+r2*(4>>r2shr)]
neg r2
.loop:
%if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1
- movsx r0, word [r5+r2*2+0] ; filterPos[0]
- movsx r6, word [r5+r2*2+2] ; filterPos[1]
+ mov32 r0, dword [r5+r2*4+0] ; filterPos[0]
+ mov32 r6, dword [r5+r2*4+4] ; filterPos[1]
movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}]
@@ -112,8 +115,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%else ; %1 == 8
movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%endif
- movsx r0, word [r5+r2*2+4] ; filterPos[2]
- movsx r6, word [r5+r2*2+6] ; filterPos[3]
+ mov32 r0, dword [r5+r2*4+8] ; filterPos[2]
+ mov32 r6, dword [r5+r2*4+12] ; filterPos[3]
movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}]
@@ -156,8 +159,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
- movsx r0, word [r5+r2*1+0] ; filterPos[0]
- movsx r6, word [r5+r2*1+2] ; filterPos[1]
+ mov32 r0, dword [r5+r2*2+0] ; filterPos[0]
+ mov32 r6, dword [r5+r2*2+4] ; filterPos[1]
movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
@@ -165,8 +168,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
- movsx r0, word [r5+r2*1+4] ; filterPos[2]
- movsx r6, word [r5+r2*1+6] ; filterPos[3]
+ mov32 r0, dword [r5+r2*2+8] ; filterPos[2]
+ mov32 r6, dword [r5+r2*2+12] ; filterPos[3]
movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
@@ -251,7 +254,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%define r1x r1
%define filter2 r6m
%endif ; x86-32/64
- lea r5, [r5+r2*2]
+ lea r5, [r5+r2*4]
%if %2 == 15
lea r1, [r1+r2*2]
%else ; %2 == 19
@@ -261,8 +264,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
neg r2
.loop:
- movsx r0, word [r5+r2*2+0] ; filterPos[0]
- movsx r1x, word [r5+r2*2+2] ; filterPos[1]
+ mov32 r0, dword [r5+r2*4+0] ; filterPos[0]
+ mov32 r1x, dword [r5+r2*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
pxor m4, m4
pxor m5, m5
@@ -293,7 +296,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
jl .innerloop
%ifidn %4, X4
- movsx r1x, word [r5+r2*2+2] ; filterPos[1]
+ mov32 r1x, dword [r5+r2*4+4] ; filterPos[1]
movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0]
sub r1x, r6 ; and first 4 srcpx of dstpx[1]
%if %1 > 8
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index d0bb861618..fdfd1f215a 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -108,8 +108,8 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize;
- int16_t *vLumFilterPos= c->vLumFilterPos;
- int16_t *vChrFilterPos= c->vChrFilterPos;
+ int32_t *vLumFilterPos= c->vLumFilterPos;
+ int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter;
@@ -132,6 +132,44 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
int i;
+
+ if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
+ const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
+ int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
+ for (i = 0; i < neg; i++)
+ tmpY[i] = lumSrcPtr[neg];
+ for ( ; i < end; i++)
+ tmpY[i] = lumSrcPtr[i];
+ for ( ; i < vLumFilterSize; i++)
+ tmpY[i] = tmpY[i-1];
+ lumSrcPtr = tmpY;
+
+ if (alpSrcPtr) {
+ const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
+ for (i = 0; i < neg; i++)
+ tmpA[i] = alpSrcPtr[neg];
+ for ( ; i < end; i++)
+ tmpA[i] = alpSrcPtr[i];
+ for ( ; i < vLumFilterSize; i++)
+ tmpA[i] = tmpA[i - 1];
+ alpSrcPtr = tmpA;
+ }
+ }
+ if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
+ const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
+ int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
+ for (i = 0; i < neg; i++) {
+ tmpU[i] = chrUSrcPtr[neg];
+ }
+ for ( ; i < end; i++) {
+ tmpU[i] = chrUSrcPtr[i];
+ }
+ for ( ; i < vChrFilterSize; i++) {
+ tmpU[i] = tmpU[i - 1];
+ }
+ chrUSrcPtr = tmpU;
+ }
+
if (flags & SWS_ACCURATE_RND) {
int s= APCK_SIZE / 8;
for (i=0; i<vLumFilterSize; i+=2) {
@@ -242,7 +280,7 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt(
SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \
const int16_t *filter, \
- const int16_t *filterPos, int filterSize)
+ const int32_t *filterPos, int filterSize)
#define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt); \
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index e92d927440..61ee3ebae0 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -762,10 +762,10 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
@@ -993,10 +993,10 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
@@ -1048,9 +1048,9 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
@@ -1101,10 +1101,10 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
@@ -1368,9 +1368,9 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
@@ -1386,10 +1386,10 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
@@ -1579,7 +1579,7 @@ static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
int srcW, int xInc)
{
- int16_t *filterPos = c->hLumFilterPos;
+ int32_t *filterPos = c->hLumFilterPos;
int16_t *filter = c->hLumFilter;
void *mmx2FilterCode= c->lumMmx2FilterCode;
int i;
@@ -1675,7 +1675,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
- int16_t *filterPos = c->hChrFilterPos;
+ int32_t *filterPos = c->hChrFilterPos;
int16_t *filter = c->hChrFilter;
void *mmx2FilterCode= c->chrMmx2FilterCode;
int i;