diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-03-16 07:47:27 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-03-16 09:01:08 +0100 |
commit | 568e9062bd29e13e0bfa42f2ac8411d01608634d (patch) | |
tree | a78351d75b3dee8257909ccffde46880099c91bb /libswscale/x86 | |
parent | 5dbc75870f486fb9c0237870eafa834a8a2066c8 (diff) | |
parent | 5effcfa76792470677a1f6bc9aa73347a87ef720 (diff) | |
download | ffmpeg-568e9062bd29e13e0bfa42f2ac8411d01608634d.tar.gz |
Merge remote-tracking branch 'qatar/release/0.8' into release/0.10
* qatar/release/0.8: (154 commits)
Update Changelog for the 0.8.1 Release
dca: include libavutil/mathematics.h for possibly missing M_SQRT1_2
dca: don't use av_clip_uintp2().
snow: check reference frame indices.
snow: reject unsupported chroma shifts.
xa_adpcm: limit filter to prevent xa_adpcm_table[] array bounds overruns.
h264: increase reference poc list from 16 to 32.
h264: stricter reference limit enforcement.
h264: improve parsing of broken AVC SPS
Replace computations of remaining bits with calls to get_bits_left().
png: convert to bytestream2 API.
roqvideo: convert to bytestream2 API.
smc: port to bytestream2 API.
tgq: convert to bytestream2 API.
algmm: convert to bytestream2 API.
jvdec: unbreak video decoding
h264: Fix invalid interlaced/progressive MB combinations for direct mode prediction.
libx264: add 'stats' private option for setting 2pass stats filename.
libx264: fix help text for slice-max-size option.
avconv: reindent
...
Conflicts:
Changelog
RELEASE
avconv.c
doc/APIchanges
ffplay.c
libavcodec/Makefile
libavcodec/aacdec.c
libavcodec/alsdec.c
libavcodec/atrac3.c
libavcodec/avcodec.h
libavcodec/dvdata.c
libavcodec/fraps.c
libavcodec/golomb.h
libavcodec/h264.c
libavcodec/h264.h
libavcodec/h264_cabac.c
libavcodec/h264_cavlc.c
libavcodec/h264_direct.c
libavcodec/h264_parser.c
libavcodec/h264_ps.c
libavcodec/h264idct_template.c
libavcodec/indeo3.c
libavcodec/kgv1dec.c
libavcodec/kmvc.c
libavcodec/mjpegbdec.c
libavcodec/mmvideo.c
libavcodec/mpegaudiodec.c
libavcodec/mpegvideo.h
libavcodec/options.c
libavcodec/pngdec.c
libavcodec/roqvideodec.c
libavcodec/shorten.c
libavcodec/svq3.c
libavcodec/utils.c
libavcodec/version.h
libavcodec/wmadec.c
libavcodec/xxan.c
libavformat/Makefile
libavformat/asfdec.c
libavformat/dv.c
libavformat/mov.c
libavformat/nsvdec.c
libavformat/utils.c
libavformat/version.h
libavutil/avutil.h
libavutil/error.c
libavutil/error.h
libswscale/swscale.c
libswscale/utils.c
libswscale/x86/swscale_template.c
tests/ref/acodec/g722
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale/x86')
-rw-r--r-- | libswscale/x86/scale.asm | 31 | ||||
-rw-r--r-- | libswscale/x86/swscale_mmx.c | 44 | ||||
-rw-r--r-- | libswscale/x86/swscale_template.c | 28 |
3 files changed, 72 insertions, 31 deletions
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 09313b926f..692a88fe39 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -38,7 +38,7 @@ SECTION .text ; (SwsContext *c, int{16,32}_t *dst, ; int dstW, const uint{8,16}_t *src, ; const int16_t *filter, -; const int16_t *filterPos, int filterSize); +; const int32_t *filterPos, int filterSize); ; ; Scale one horizontal line. Input is either 8-bits width or 16-bits width ; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to @@ -53,6 +53,9 @@ SECTION .text cglobal hscale%1to%2_%4_%5, %6, 7, %7 %ifdef ARCH_X86_64 movsxd r2, r2d +%define mov32 movsxd +%else ; x86-32 +%define mov32 mov %endif ; x86-64 %if %2 == 19 %if mmsize == 8 ; mmx @@ -95,14 +98,14 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %else ; %2 == 19 lea r1, [r1+r2*(4>>r2shr)] %endif ; %2 == 15/19 - lea r5, [r5+r2*(2>>r2shr)] + lea r5, [r5+r2*(4>>r2shr)] neg r2 .loop: %if %3 == 4 ; filterSize == 4 scaling ; load 2x4 or 4x4 source pixels into m0/m1 - movsx r0, word [r5+r2*2+0] ; filterPos[0] - movsx r6, word [r5+r2*2+2] ; filterPos[1] + mov32 r0, dword [r5+r2*4+0] ; filterPos[0] + mov32 r6, dword [r5+r2*4+4] ; filterPos[1] movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}] %if mmsize == 8 movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] @@ -112,8 +115,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %else ; %1 == 8 movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] %endif - movsx r0, word [r5+r2*2+4] ; filterPos[2] - movsx r6, word [r5+r2*2+6] ; filterPos[3] + mov32 r0, dword [r5+r2*4+8] ; filterPos[2] + mov32 r6, dword [r5+r2*4+12] ; filterPos[3] movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}] %if %1 > 8 movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] @@ -156,8 +159,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %endif ; mmx/sse2/ssse3/sse4 %else ; %3 == 8, i.e. filterSize == 8 scaling ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 - movsx r0, word [r5+r2*1+0] ; filterPos[0] - movsx r6, word [r5+r2*1+2] ; filterPos[1] + mov32 r0, dword [r5+r2*2+0] ; filterPos[0] + mov32 r6, dword [r5+r2*2+4] ; filterPos[1] movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] %if mmsize == 8 movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] @@ -165,8 +168,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] %else ; mmsize == 16 movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] - movsx r0, word [r5+r2*1+4] ; filterPos[2] - movsx r6, word [r5+r2*1+6] ; filterPos[3] + mov32 r0, dword [r5+r2*2+8] ; filterPos[2] + mov32 r6, dword [r5+r2*2+12] ; filterPos[3] movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] %endif ; mmsize == 8/16 @@ -251,7 +254,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %define r1x r1 %define filter2 r6m %endif ; x86-32/64 - lea r5, [r5+r2*2] + lea r5, [r5+r2*4] %if %2 == 15 lea r1, [r1+r2*2] %else ; %2 == 19 @@ -261,8 +264,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 neg r2 .loop: - movsx r0, word [r5+r2*2+0] ; filterPos[0] - movsx r1x, word [r5+r2*2+2] ; filterPos[1] + mov32 r0, dword [r5+r2*4+0] ; filterPos[0] + mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? pxor m4, m4 pxor m5, m5 @@ -293,7 +296,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 jl .innerloop %ifidn %4, X4 - movsx r1x, word [r5+r2*2+2] ; filterPos[1] + mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0] sub r1x, r6 ; and first 4 srcpx of dstpx[1] %if %1 > 8 diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c index d0bb861618..fdfd1f215a 100644 --- a/libswscale/x86/swscale_mmx.c +++ b/libswscale/x86/swscale_mmx.c @@ -108,8 +108,8 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI int16_t **alpPixBuf= c->alpPixBuf; const int vLumBufSize= c->vLumBufSize; const int vChrBufSize= c->vChrBufSize; - int16_t *vLumFilterPos= c->vLumFilterPos; - int16_t *vChrFilterPos= c->vChrFilterPos; + int32_t *vLumFilterPos= c->vLumFilterPos; + int32_t *vChrFilterPos= c->vChrFilterPos; int16_t *vLumFilter= c->vLumFilter; int16_t *vChrFilter= c->vChrFilter; int32_t *lumMmxFilter= c->lumMmxFilter; @@ -132,6 +132,44 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; int i; + + if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) { + const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize; + int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize); + for (i = 0; i < neg; i++) + tmpY[i] = lumSrcPtr[neg]; + for ( ; i < end; i++) + tmpY[i] = lumSrcPtr[i]; + for ( ; i < vLumFilterSize; i++) + tmpY[i] = tmpY[i-1]; + lumSrcPtr = tmpY; + + if (alpSrcPtr) { + const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize; + for (i = 0; i < neg; i++) + tmpA[i] = alpSrcPtr[neg]; + for ( ; i < end; i++) + tmpA[i] = alpSrcPtr[i]; + for ( ; i < vLumFilterSize; i++) + tmpA[i] = tmpA[i - 1]; + alpSrcPtr = tmpA; + } + } + if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) { + const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize; + int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize); + for (i = 0; i < neg; i++) { + tmpU[i] = chrUSrcPtr[neg]; + } + for ( ; i < end; i++) { + tmpU[i] = chrUSrcPtr[i]; + } + for ( ; i < vChrFilterSize; i++) { + tmpU[i] = tmpU[i - 1]; + } + chrUSrcPtr = tmpU; + } + if (flags & SWS_ACCURATE_RND) { int s= APCK_SIZE / 8; for (i=0; i<vLumFilterSize; i+=2) { @@ -242,7 +280,7 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( SwsContext *c, int16_t *data, \ int dstW, const uint8_t *src, \ const int16_t *filter, \ - const int16_t *filterPos, int filterSize) + const int32_t *filterPos, int filterSize) #define SCALE_FUNCS(filter_n, opt) \ SCALE_FUNC(filter_n, 8, 15, opt); \ diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index e92d927440..61ee3ebae0 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -762,10 +762,10 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ @@ -993,10 +993,10 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ @@ -1048,9 +1048,9 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], ".p2align 4 \n\t"\ "1: \n\t"\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ @@ -1101,10 +1101,10 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ @@ -1368,9 +1368,9 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, ".p2align 4 \n\t"\ "1: \n\t"\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "psraw $7, %%mm3 \n\t" \ "psraw $7, %%mm4 \n\t" \ "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ @@ -1386,10 +1386,10 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "psrlw $8, %%mm3 \n\t" \ @@ -1579,7 +1579,7 @@ static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc) { - int16_t *filterPos = c->hLumFilterPos; + int32_t *filterPos = c->hLumFilterPos; int16_t *filter = c->hLumFilter; void *mmx2FilterCode= c->lumMmx2FilterCode; int i; @@ -1675,7 +1675,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc) { - int16_t *filterPos = c->hChrFilterPos; + int32_t *filterPos = c->hChrFilterPos; int16_t *filter = c->hChrFilter; void *mmx2FilterCode= c->chrMmx2FilterCode; int i; |