diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-08-13 22:23:40 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-08-13 22:24:47 +0200 |
commit | ca1dfea12771b585846fb86aa08c3d7f066a3cc4 (patch) | |
tree | 6db034b36245b9fe51d9a41d41be1718a3be8b89 /libswscale | |
parent | 75af0e6a1601a4246d6409ca28dc80a3ba0e8d6e (diff) | |
parent | 3304a1e69a8a050eb66d2304acd2d01354fa1aac (diff) | |
download | ffmpeg-ca1dfea12771b585846fb86aa08c3d7f066a3cc4.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
swscale: add dithering to yuv2yuvX_altivec_real
rv34: free+allocate buffer instead of reallocating it to preserve alignment
h264: add missing brackets.
swscale: use 15-bit intermediates for 9/10-bit scaling.
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale')
-rw-r--r-- | libswscale/ppc/swscale_altivec.c | 23 | ||||
-rw-r--r-- | libswscale/swscale.c | 259 | ||||
-rw-r--r-- | libswscale/swscale_internal.h | 108 | ||||
-rw-r--r-- | libswscale/utils.c | 22 | ||||
-rw-r--r-- | libswscale/x86/swscale_template.c | 4 |
5 files changed, 246 insertions, 170 deletions
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index 8bc0ddd9d8..8a5bac308e 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -92,6 +92,7 @@ altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) } } +//FIXME remove the usage of scratch buffers. static void yuv2yuvX_altivec_real(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, @@ -101,17 +102,13 @@ yuv2yuvX_altivec_real(SwsContext *c, uint8_t *dest[4], int dstW, int chrDstW) { uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2]; - const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)}; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; register int i, j; { DECLARE_ALIGNED(16, int, val)[dstW]; - for (i = 0; i < (dstW -7); i+=4) { - vec_st(vini, i << 2, val); - } - for (; i < dstW; i++) { - val[i] = (1 << 18); - } + for (i=0; i<dstW; i++) + val[i] = lumDither[i & 7] << 12; for (j = 0; j < lumFilterSize; j++) { vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter); @@ -155,13 +152,9 @@ yuv2yuvX_altivec_real(SwsContext *c, DECLARE_ALIGNED(16, int, u)[chrDstW]; DECLARE_ALIGNED(16, int, v)[chrDstW]; - for (i = 0; i < (chrDstW -7); i+=4) { - vec_st(vini, i << 2, u); - vec_st(vini, i << 2, v); - } - for (; i < chrDstW; i++) { - u[i] = (1 << 18); - v[i] = (1 << 18); + for (i=0; i<chrDstW; i++) { + u[i] = chrDither[i & 7] << 12; + v[i] = chrDither[(i + 3) & 7] << 12; } for (j = 0; j < chrFilterSize; j++) { @@ -406,7 +399,7 @@ void ff_sws_init_swScale_altivec(SwsContext *c) if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) return; - if (c->scalingBpp == 8) { + if (c->srcBpc == 8 && c->dstBpc <= 10) { c->hScale = hScale_altivec_real; } if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 9d17868019..9897f2fd89 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -307,17 +307,9 @@ 
yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, #define output_pixel(pos, val) \ if (big_endian) { \ - if (output_bits == 16) { \ - AV_WB16(pos, av_clip_uint16(val >> shift)); \ - } else { \ - AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \ - } \ + AV_WB16(pos, av_clip_uint16(val >> shift)); \ } else { \ - if (output_bits == 16) { \ - AV_WL16(pos, av_clip_uint16(val >> shift)); \ - } else { \ - AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \ - } \ + AV_WL16(pos, av_clip_uint16(val >> shift)); \ } for (i = 0; i < dstW; i++) { int val = 1 << (26-output_bits + 4*dword - 1); @@ -359,7 +351,67 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, #undef output_pixel } -#define yuv2NBPS(bits, BE_LE, is_be) \ +static av_always_inline void +yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc, + int lumFilterSize, const int16_t *chrFilter, + const int16_t **chrUSrc, const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint16_t *dest[4], int dstW, int chrDstW, + int big_endian, int output_bits) +{ + //FIXME Optimize (just quickly written not optimized..) + int i; + uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], + *aDest = CONFIG_SWSCALE_ALPHA ? 
dest[3] : NULL; + int shift = 11 + 16 - output_bits - 1; + +#define output_pixel(pos, val) \ + if (big_endian) { \ + AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \ + } else { \ + AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \ + } + for (i = 0; i < dstW; i++) { + int val = 1 << (26-output_bits - 1); + int j; + + for (j = 0; j < lumFilterSize; j++) + val += (lumSrc[j][i] * lumFilter[j]) >> 1; + + output_pixel(&yDest[i], val); + } + + if (uDest) { + for (i = 0; i < chrDstW; i++) { + int u = 1 << (26-output_bits - 1); + int v = 1 << (26-output_bits - 1); + int j; + + for (j = 0; j < chrFilterSize; j++) { + u += (chrUSrc[j][i] * chrFilter[j]) >> 1; + v += (chrVSrc[j][i] * chrFilter[j]) >> 1; + } + + output_pixel(&uDest[i], u); + output_pixel(&vDest[i], v); + } + } + + if (CONFIG_SWSCALE_ALPHA && aDest) { + for (i = 0; i < dstW; i++) { + int val = 1 << (26-output_bits - 1); + int j; + + for (j = 0; j < lumFilterSize; j++) + val += (alpSrc[j][i] * lumFilter[j]) >> 1; + + output_pixel(&aDest[i], val); + } + } +#undef output_pixel +} + +#define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \ static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \ const int16_t **_lumSrc, int lumFilterSize, \ const int16_t *chrFilter, const int16_t **_chrUSrc, \ @@ -367,21 +419,21 @@ static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFil int chrFilterSize, const int16_t **_alpSrc, \ uint8_t *_dest[4], int dstW, int chrDstW) \ { \ - const int32_t **lumSrc = (const int32_t **) _lumSrc, \ - **chrUSrc = (const int32_t **) _chrUSrc, \ - **chrVSrc = (const int32_t **) _chrVSrc, \ - **alpSrc = (const int32_t **) _alpSrc; \ - yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \ - chrFilter, chrUSrc, chrVSrc, chrFilterSize, \ - alpSrc, (uint16_t **) _dest, \ - dstW, chrDstW, is_be, bits); \ -} -yuv2NBPS( 9, BE, 1); -yuv2NBPS( 9, LE, 0); -yuv2NBPS(10, BE, 1); -yuv2NBPS(10, LE, 0); -yuv2NBPS(16, BE, 1); 
-yuv2NBPS(16, LE, 0); + const typeX_t **lumSrc = (const typeX_t **) _lumSrc, \ + **chrUSrc = (const typeX_t **) _chrUSrc, \ + **chrVSrc = (const typeX_t **) _chrVSrc, \ + **alpSrc = (const typeX_t **) _alpSrc; \ + yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \ + chrFilter, chrUSrc, chrVSrc, chrFilterSize, \ + alpSrc, (uint16_t **) _dest, \ + dstW, chrDstW, is_be, bits); \ +} +yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t); +yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t); +yuv2NBPS(10, BE, 1, yuv2yuvX10_c_template, int16_t); +yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t); +yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t); +yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t); static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, @@ -1971,15 +2023,15 @@ static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1, } } -static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src, - const int16_t *filter, - const int16_t *filterPos, int filterSize) +static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src, + const int16_t *filter, + const int16_t *filterPos, int filterSize) { int i; int32_t *dst = (int32_t *) _dst; const uint16_t *src = (const uint16_t *) _src; int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1; - int sh = (bits <= 7) ? 
11 : (bits - 4); + int sh = bits - 4; if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15) sh= 9; @@ -1997,10 +2049,31 @@ static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_s } } +static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src, + const int16_t *filter, + const int16_t *filterPos, int filterSize) +{ + int i; + const uint16_t *src = (const uint16_t *) _src; + int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1; + + for (i = 0; i < dstW; i++) { + int j; + int srcPos = filterPos[i]; + int val = 0; + + for (j = 0; j < filterSize; j++) { + val += src[srcPos + j] * filter[filterSize * i + j]; + } + // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit + dst[i] = FFMIN(val >> sh, (1 << 15) - 1); + } +} + // bilinear / bicubic scaling -static void hScale_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, - const int16_t *filter, const int16_t *filterPos, - int filterSize) +static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, + const int16_t *filter, const int16_t *filterPos, + int filterSize) { int i; for (i=0; i<dstW; i++) { @@ -2045,6 +2118,25 @@ static inline void hScale16NX_c(int16_t *dst, int dstW, const uint16_t *src, int } } +static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src, + const int16_t *filter, const int16_t *filterPos, + int filterSize) +{ + int i; + int32_t *dst = (int32_t *) _dst; + for (i=0; i<dstW; i++) { + int j; + int srcPos= filterPos[i]; + int val=0; + for (j=0; j<filterSize; j++) { + val += ((int)src[srcPos + j])*filter[filterSize*i + j]; + } + //filter += hFilterSize; + dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ... 
+ //dst[i] = val>>7; + } +} + //FIXME all pal and rgb srcFormats could do this convertion as well //FIXME all scalers more complex than bilinear could do half of this transform static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width) @@ -2126,23 +2218,6 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth, dst[i] = src[srcW-1]*128; } -static void scale8To16Rv_c(uint16_t *_dst, const uint8_t *src, int len) -{ - int i; - uint8_t *dst = (uint8_t *) _dst; - for (i = len - 1; i >= 0; i--) { - dst[i * 2] = dst[i * 2 + 1] = src[i]; - } -} - -static void scale19To15Fw_c(int16_t *dst, const int32_t *src, int len) -{ - int i; - for (i = 0; i < len; i++) { - dst[i] = src[i] >> 4; - } -} - // *** horizontal scale Y line to temp buffer static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc, @@ -2159,11 +2234,6 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth, src= formatConvBuffer; } - if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16 && !isAnyRGB(c->srcFormat)) { - c->scale8To16Rv((uint16_t *) formatConvBuffer, src, srcW); - src = formatConvBuffer; - } - if (c->hScale16) { int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 
13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1; c->hScale16(dst, dstWidth, (const uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, shift); @@ -2175,10 +2245,6 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth, if (convertRange) convertRange(dst, dstWidth); - - if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 15 && c->scalingBpp == 16) { - c->scale19To15Fw(dst, (int32_t *) dst, dstWidth); - } } static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2, @@ -2213,14 +2279,6 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2 src2= buf2; } - if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16 && !isAnyRGB(c->srcFormat)) { - uint8_t *buf2 = (formatConvBuffer + FFALIGN(srcW * 2+78, 16)); - c->scale8To16Rv((uint16_t *) formatConvBuffer, src1, srcW); - c->scale8To16Rv((uint16_t *) buf2, src2, srcW); - src1 = formatConvBuffer; - src2 = buf2; - } - if (c->hScale16) { int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1; c->hScale16(dst1, dstWidth, (const uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift); @@ -2234,11 +2292,6 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2 if (c->chrConvertRange) c->chrConvertRange(dst1, dst2, dstWidth); - - if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 15 && c->scalingBpp == 16) { - c->scale19To15Fw(dst1, (int32_t *) dst1, dstWidth); - c->scale19To15Fw(dst2, (int32_t *) dst2, dstWidth); - } } static av_always_inline void @@ -2775,12 +2828,12 @@ static av_cold void sws_init_swScale_c(SwsContext *c) case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV444P10BE: case PIX_FMT_YUV422P10BE: - case PIX_FMT_YUV420P10BE: c->hScale16= HAVE_BIGENDIAN ? hScale16N_c : hScale16NX_c; break; + case PIX_FMT_YUV420P10BE: c->hScale16= HAVE_BIGENDIAN ? 
NULL : hScale16NX_c; break; case PIX_FMT_YUV444P9LE: case PIX_FMT_YUV420P9LE: case PIX_FMT_YUV422P10LE: case PIX_FMT_YUV420P10LE: - case PIX_FMT_YUV444P10LE: c->hScale16= HAVE_BIGENDIAN ? hScale16NX_c : hScale16N_c; break; + case PIX_FMT_YUV444P10LE: c->hScale16= HAVE_BIGENDIAN ? hScale16NX_c : NULL; break; #if HAVE_BIGENDIAN case PIX_FMT_YUV420P16LE: case PIX_FMT_YUV422P16LE: @@ -2889,37 +2942,45 @@ static av_cold void sws_init_swScale_c(SwsContext *c) } } - if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15) - || c->srcFormat == PIX_FMT_PAL8) - c->hScale16= hScale16N_c; - - if (c->scalingBpp == 8) { - c->hScale = hScale_c; - if (c->flags & SWS_FAST_BILINEAR) { - c->hyscale_fast = hyscale_fast_c; - c->hcscale_fast = hcscale_fast_c; - } - if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { - if (c->srcRange) { - c->lumConvertRange = lumRangeFromJpeg_c; - c->chrConvertRange = chrRangeFromJpeg_c; + if (c->srcBpc == 8) { + if (c->dstBpc <= 10) { + if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15) + || c->srcFormat == PIX_FMT_PAL8) + c->hScale16= hScale16N_c; + c->hScale = hScale8To15_c; + if (c->flags & SWS_FAST_BILINEAR) { + c->hyscale_fast = hyscale_fast_c; + c->hcscale_fast = hcscale_fast_c; + } } else { - c->lumConvertRange = lumRangeToJpeg_c; - c->chrConvertRange = chrRangeToJpeg_c; + c->hScale = hScale8To19_c; + av_assert0(c->hScale16 != hScale16N_c && c->hScale16 != hScale16NX_c); } - } } else { - if(c->hScale16 == hScale16NX_c && !isAnyRGB(c->srcFormat)){ - c->chrToYV12 = bswap16UV_c; - c->lumToYV12 = bswap16Y_c; + if(c->dstBpc > 10){ + if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15) + || c->srcFormat == PIX_FMT_PAL8) + c->hScale16= hScale16N_c; + if(c->hScale16 == hScale16NX_c && !isAnyRGB(c->srcFormat)){ + c->chrToYV12 = bswap16UV_c; + c->lumToYV12 = bswap16Y_c; + } + c->hScale16 = NULL; } - c->hScale16 = NULL; - c->hScale = 
hScale16_c; - c->scale19To15Fw = scale19To15Fw_c; - c->scale8To16Rv = scale8To16Rv_c; + c->hScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c; + } - if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { + if (c->dstBpc <= 10) { + if (c->srcRange) { + c->lumConvertRange = lumRangeFromJpeg_c; + c->chrConvertRange = chrRangeFromJpeg_c; + } else { + c->lumConvertRange = lumRangeToJpeg_c; + c->chrConvertRange = chrRangeToJpeg_c; + } + } else { if (c->srcRange) { c->lumConvertRange = lumRangeFromJpeg16_c; c->chrConvertRange = chrRangeFromJpeg16_c; diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 30dec99130..a936f96373 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -66,11 +66,16 @@ typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[], * without any additional vertical scaling (or point-scaling). * * @param c SWS scaling context - * @param lumSrc scaled luma (Y) source data, 15bit for 8bit output - * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output - * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output - * @param alpSrc scaled alpha (A) source data, 15bit for 8bit output - * @param dest pointer to the 4 output planes (Y/U/V/A) + * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param dest pointer to the 4 output planes (Y/U/V/A). 
For >8bit + * output, this is in uint16_t * @param dstW width of dest[0], dest[3], lumSrc and alpSrc in pixels * @param chrDstW width of dest[1], dest[2], chrUSrc and chrVSrc */ @@ -84,14 +89,19 @@ typedef void (*yuv2planar1_fn) (struct SwsContext *c, * * @param c SWS scaling context * @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096] - * @param lumSrc scaled luma (Y) source data, 15bit for 8bit output + * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) * @param lumFilterSize number of vertical luma/alpha input lines to scale * @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096] - * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output - * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output + * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) * @param chrFilterSize number of vertical chroma input lines to scale - * @param alpSrc scaled alpha (A) source data, 15bit for 8bit output - * @param dest pointer to the 4 output planes (Y/U/V/A) + * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param dest pointer to the 4 output planes (Y/U/V/A). For >8bit + * output, this is in uint16_t * @param dstW width of dest[0], dest[3], lumSrc and alpSrc in pixels * @param chrDstW width of dest[1], dest[2], chrUSrc and chrVSrc */ @@ -107,11 +117,16 @@ typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter, * that this function may do chroma scaling, see the "uvalpha" argument. 
* * @param c SWS scaling context - * @param lumSrc scaled luma (Y) source data, 15bit for 8bit output - * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output - * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output - * @param alpSrc scaled alpha (A) source data, 15bit for 8bit output - * @param dest pointer to the output plane + * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param dest pointer to the output plane. For 16bit output, this is + * uint16_t * @param dstW width of lumSrc and alpSrc in pixels, number of pixels * to write into dest[] * @param uvalpha chroma scaling coefficient for the second line of chroma @@ -134,11 +149,16 @@ typedef void (*yuv2packed1_fn) (struct SwsContext *c, const int16_t *lumSrc, * output by doing bilinear scaling between two input lines. 
* * @param c SWS scaling context - * @param lumSrc scaled luma (Y) source data, 15bit for 8bit output - * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output - * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output - * @param alpSrc scaled alpha (A) source data, 15bit for 8bit output - * @param dest pointer to the output plane + * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param dest pointer to the output plane. For 16bit output, this is + * uint16_t * @param dstW width of lumSrc and alpSrc in pixels, number of pixels * to write into dest[] * @param yalpha luma/alpha scaling coefficients for the second input line. 
@@ -162,14 +182,19 @@ typedef void (*yuv2packed2_fn) (struct SwsContext *c, const int16_t *lumSrc[2], * * @param c SWS scaling context * @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096] - * @param lumSrc scaled luma (Y) source data, 15bit for 8bit output + * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) * @param lumFilterSize number of vertical luma/alpha input lines to scale * @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096] - * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output - * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output + * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) * @param chrFilterSize number of vertical chroma input lines to scale - * @param alpSrc scaled alpha (A) source data, 15bit for 8bit output - * @param dest pointer to the output plane + * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param dest pointer to the output plane. For 16bit output, this is + * uint16_t * @param dstW width of lumSrc and alpSrc in pixels, number of pixels * to write into dest[] * @param y vertical line number for this output. This does not need @@ -209,7 +234,7 @@ typedef struct SwsContext { enum PixelFormat srcFormat; ///< Source pixel format. int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format. int srcFormatBpp; ///< Number of bits per pixel of the source pixel format. - int scalingBpp; + int dstBpc, srcBpc; int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image. 
int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image. int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image. @@ -433,17 +458,19 @@ typedef struct SwsContext { * lines, to produce one (differently sized) line of output data. * * @param dst pointer to destination buffer for horizontally scaled - * data. If the scaling depth (SwsContext->scalingBpp) is - * 8, data will be 15bpp in 16bits (int16_t) width. If - * scaling depth is 16, data will be 19bpp in 32bpp - * (int32_t) width. + * data. If the number of bits per component of one + * destination pixel (SwsContext->dstBpc) is <= 10, data + * will be 15bpc in 16bits (int16_t) width. Else (i.e. + * SwsContext->dstBpc == 16), data will be 19bpc in + * 32bits (int32_t) width. * @param dstW width of destination image - * @param src pointer to source data to be scaled. If scaling depth - * is 8, this is 8bpp in 8bpp (uint8_t) width. If scaling - * depth is 16, this is native depth in 16bbp (uint16_t) - * width. In other words, for 9-bit YUV input, this is - * 9bpp, for 10-bit YUV input, this is 10bpp, and for - * 16-bit RGB or YUV, this is 16bpp. + * @param src pointer to source data to be scaled. If the number of + * bits per component of a source pixel (SwsContext->srcBpc) + * is 8, this is 8bpc in 8bits (uint8_t) width. Else + * (i.e. SwsContext->srcBpc > 8), this is native depth + * in 16bits (uint16_t) width. In other words, for 9-bit + * YUV input, this is 9bpc, for 10-bit YUV input, this is + * 10bpc, and for 16-bit RGB or YUV, this is 16bpc. * @param filter filter coefficients to be used per output pixel for * scaling. This contains 14bpp filtering coefficients. * Guaranteed to contain dstW * filterSize entries.
@@ -467,15 +494,6 @@ typedef struct SwsContext { void (*lumConvertRange)(int16_t *dst, int width); ///< Color range conversion function for luma plane if needed. void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width); ///< Color range conversion function for chroma planes if needed. - /** - * dst[..] = (src[..] << 8) | src[..]; - */ - void (*scale8To16Rv)(uint16_t *dst, const uint8_t *src, int len); - /** - * dst[..] = src[..] >> 4; - */ - void (*scale19To15Fw)(int16_t *dst, const int32_t *src, int len); - int needs_hcscale; ///< Set if there are chroma planes to be converted. } SwsContext; diff --git a/libswscale/utils.c b/libswscale/utils.c index d3451e4bbd..54a4c0d42f 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -842,14 +842,18 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) } } - c->scalingBpp = FFMAX(av_pix_fmt_descriptors[srcFormat].comp[0].depth_minus1, - av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1) >= 15 ? 16 : 8; - - if (c->scalingBpp == 16) + c->srcBpc = 1 + av_pix_fmt_descriptors[srcFormat].comp[0].depth_minus1; + if (c->srcBpc < 8) + c->srcBpc = 8; + c->dstBpc = 1 + av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1; + if (c->dstBpc < 8) + c->dstBpc = 8; + if (isAnyRGB(srcFormat) && c->dstBpc == 16) + c->srcBpc = 16; + if (c->dstBpc == 16) dst_stride <<= 1; - av_assert0(c->scalingBpp<=16); FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail); - if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 && c->scalingBpp == 8) { + if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 && c->srcBpc == 8 && c->dstBpc <= 10) { c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 
1 : 0; if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) { if (flags&SWS_PRINT_INFO) @@ -875,7 +879,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) c->chrXInc+= 20; } //we don't use the x86 asm scaler if MMX is available - else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX && c->scalingBpp == 8) { + else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX && c->dstBpc <= 10) { c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20; } @@ -1007,7 +1011,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize]; } // 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate) - c->uv_off = (dst_stride>>1) + 64 / c->scalingBpp; + c->uv_off = (dst_stride>>1) + 64 / (c->dstBpc &~ 7); c->uv_offx2 = dst_stride + 16; for (i=0; i<c->vChrBufSize; i++) { FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i+c->vChrBufSize], dst_stride*2+32, fail); @@ -1023,7 +1027,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) //try to avoid drawing green stuff between the right end and the stride end for (i=0; i<c->vChrBufSize; i++) if(av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 == 15){ - av_assert0(c->scalingBpp == 16); + av_assert0(c->dstBpc > 10); for(j=0; j<dst_stride/2+1; j++) ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18; } else diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 87248e76fa..4163647fa0 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -2472,7 +2472,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } - if (c->scalingBpp == 8) { + if (c->srcBpc == 8 && c->dstBpc <= 10) { #if !COMPILE_TEMPLATE_MMX2 c->hScale = RENAME(hScale ); #endif /* !COMPILE_TEMPLATE_MMX2 */ @@ -2532,6 +2532,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) #endif /* 
!COMPILE_TEMPLATE_MMX2 */ if(isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15) c->hScale16= RENAME(hScale16); - if(c->scalingBpp != 8) + if(c->dstBpc > 10) c->hScale16 = NULL; } |