diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2014-07-19 05:09:57 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-07-19 05:36:27 +0200 |
commit | 6532a1a8286e3ecc80c30ef74ff08663f237424d (patch) | |
tree | cc85616835b34f73cf0ffc4a8c0258d372e2dec9 /libswscale/utils.c | |
parent | e9f7c7aef96dd8ca3519f0cfaa52573cf63a1d74 (diff) | |
download | ffmpeg-6532a1a8286e3ecc80c30ef74ff08663f237424d.tar.gz |
sws/x86: split mmxext fast bilinear scaler out
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale/utils.c')
-rw-r--r-- | libswscale/utils.c | 174 |
1 files changed, 4 insertions, 170 deletions
diff --git a/libswscale/utils.c b/libswscale/utils.c index 7274153453..e4a4200767 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -681,172 +681,6 @@ fail: return ret; } -#if HAVE_MMXEXT_INLINE -static av_cold int init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, - int16_t *filter, int32_t *filterPos, - int numSplits) -{ - uint8_t *fragmentA; - x86_reg imm8OfPShufW1A; - x86_reg imm8OfPShufW2A; - x86_reg fragmentLengthA; - uint8_t *fragmentB; - x86_reg imm8OfPShufW1B; - x86_reg imm8OfPShufW2B; - x86_reg fragmentLengthB; - int fragmentPos; - - int xpos, i; - - // create an optimized horizontal scaling routine - /* This scaler is made of runtime-generated MMXEXT code using specially tuned - * pshufw instructions. For every four output pixels, if four input pixels - * are enough for the fast bilinear scaling, then a chunk of fragmentB is - * used. If five input pixels are needed, then a chunk of fragmentA is used. - */ - - // code fragment - - __asm__ volatile ( - "jmp 9f \n\t" - // Begin - "0: \n\t" - "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" - "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" - "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "pshufw $0xFF, %%mm1, %%mm1 \n\t" - "1: \n\t" - "pshufw $0xFF, %%mm0, %%mm0 \n\t" - "2: \n\t" - "psubw %%mm1, %%mm0 \n\t" - "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" - "pmullw %%mm3, %%mm0 \n\t" - "psllw $7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - - "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" - - "add $8, %%"REG_a" \n\t" - // End - "9: \n\t" - // "int $3 \n\t" - "lea " LOCAL_MANGLE(0b) ", %0 \n\t" - "lea " LOCAL_MANGLE(1b) ", %1 \n\t" - "lea " LOCAL_MANGLE(2b) ", %2 \n\t" - "dec %1 \n\t" - "dec %2 \n\t" - "sub %0, %1 \n\t" - "sub %0, %2 \n\t" - "lea " LOCAL_MANGLE(9b) ", %3 \n\t" - "sub %0, %3 \n\t" - - - : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), - "=r" (fragmentLengthA) - ); - - __asm__ volatile ( - "jmp 9f \n\t" - // Begin - "0: \n\t" - "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" - "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "pshufw $0xFF, %%mm0, %%mm1 \n\t" - "1: \n\t" - "pshufw $0xFF, %%mm0, %%mm0 \n\t" - "2: \n\t" - "psubw %%mm1, %%mm0 \n\t" - "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" - "pmullw %%mm3, %%mm0 \n\t" - "psllw $7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - - "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" - - "add $8, %%"REG_a" \n\t" - // End - "9: \n\t" - // "int $3 \n\t" - "lea " LOCAL_MANGLE(0b) ", %0 \n\t" - "lea " LOCAL_MANGLE(1b) ", %1 \n\t" - "lea " LOCAL_MANGLE(2b) ", %2 \n\t" - "dec %1 \n\t" - "dec %2 \n\t" - "sub %0, %1 \n\t" - "sub %0, %2 \n\t" - "lea " LOCAL_MANGLE(9b) ", %3 \n\t" - "sub %0, %3 \n\t" - - - : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), - "=r" (fragmentLengthB) - ); - - xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers - fragmentPos = 0; - - for (i = 0; i < dstW / numSplits; i++) { - int xx = xpos >> 16; - - if ((i & 3) == 0) { - int a = 0; - int b = ((xpos + xInc) >> 16) - xx; - int c = ((xpos + xInc * 2) >> 16) - xx; - int d = ((xpos + xInc * 3) >> 16) - xx; - int inc = (d + 1 < 4); - uint8_t *fragment = inc ? fragmentB : fragmentA; - x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A; - x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A; - x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA; - int maxShift = 3 - (d + inc); - int shift = 0; - - if (filterCode) { - filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9; - filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9; - filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9; - filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9; - filterPos[i / 2] = xx; - - memcpy(filterCode + fragmentPos, fragment, fragmentLength); - - filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) | - ((b + inc) << 2) | - ((c + inc) << 4) | - ((d + inc) << 6); - filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) | - (c << 4) | - (d << 6); - - if (i + 4 - inc >= dstW) - shift = maxShift; // avoid overread - else if ((filterPos[i / 2] & 3) <= maxShift) - shift = filterPos[i / 2] & 3; // align - - if (shift && i >= shift) { - filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift; - filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift; - filterPos[i / 2] -= shift; - } - } - - fragmentPos += fragmentLength; - - if (filterCode) - filterCode[fragmentPos] = RET; - } - xpos += xInc; - } - if (filterCode) - filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part - - return fragmentPos + 1; -} -#endif /* HAVE_MMXEXT_INLINE */ - static void fill_rgb2yuv_table(SwsContext *c, const int table[4], int dstRange) { int64_t W, V, Z, Cy, Cu, Cv; @@ -1400,9 +1234,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, #if HAVE_MMXEXT_INLINE // can't downscale !!! if (c->canMMXEXTBeUsed && (flags & SWS_FAST_BILINEAR)) { - c->lumMmxextFilterCodeSize = init_hscaler_mmxext(dstW, c->lumXInc, NULL, + c->lumMmxextFilterCodeSize = ff_init_hscaler_mmxext(dstW, c->lumXInc, NULL, NULL, NULL, 8); - c->chrMmxextFilterCodeSize = init_hscaler_mmxext(c->chrDstW, c->chrXInc, + c->chrMmxextFilterCodeSize = ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc, NULL, NULL, NULL, 4); #if USE_MMAP @@ -1443,9 +1277,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW / 2 / 8 + 8) * sizeof(int32_t), fail); FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail); - init_hscaler_mmxext( dstW, c->lumXInc, c->lumMmxextFilterCode, + ff_init_hscaler_mmxext( dstW, c->lumXInc, c->lumMmxextFilterCode, c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8); - init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode, + ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode, c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4); #if USE_MMAP |