diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-11-07 02:41:01 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-11-07 03:01:43 +0100 |
commit | 13b7781ec8d475513c1ee40a6e481763b728a71e (patch) | |
tree | 953bee0a6461e74085e45e1f7793de6f850534e5 /libswscale | |
parent | ada8d485c0f77a4e79fac7f3f96031c4d0e6bc7a (diff) | |
parent | f2bd8a0786ded12c70d6877f16944b44ea731462 (diff) | |
download | ffmpeg-13b7781ec8d475513c1ee40a6e481763b728a71e.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master: (23 commits)
x86inc: use sse versions of common macros instead of sse2 when applicable
doc/APIchanges: add missing dates and hashes
lavf: don't return from void av_update_cur_dts()
Changelog: add more entries.
Changelog: update ffmpeg/avconv incompatibility list.
avconv: remove some redundant temporary variables.
avconv: fix broken indentation
avconv: move copy_initial_nonkeyframes to the options context.
avconv: use file:stream instead of file.stream in log messages.
doc/avconv: elaborate on basic functionality.
doc/avconv: -sample_fmts, not -help sample_fmts prints the sample formats
openssl: Only use CRYPTO_set_id_callback on OpenSSL < 1.0.0
Call avformat_network_init/deinit in the programs
Remove leftover includes of strings.h
avutil: Don't allow using strcasecmp/strncasecmp
Replace all usage of strcasecmp/strncasecmp
avstring: Add locale independent implementations of strcasecmp/strncasecmp
avstring: Add locale independent implementations of toupper/tolower
cosmetics: insert some spaces in explicit enum value assignments
move 8SVX audio codecs to the audio codec list part on the next bump
...
Conflicts:
avprobe.c
doc/APIchanges
ffplay.c
ffserver.c
libavcodec/avcodec.h
libavdevice/bktr.c
libavdevice/v4l.c
libavdevice/v4l2.c
libavformat/matroskaenc.c
libavformat/wtv.c
libavutil/avstring.c
libavutil/avstring.h
libavutil/avutil.h
libswscale/x86/swscale_template.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale')
-rw-r--r-- | libswscale/x86/scale.asm | 142 | ||||
-rw-r--r-- | libswscale/x86/swscale_mmx.c | 28 | ||||
-rw-r--r-- | libswscale/x86/swscale_template.c | 26 |
3 files changed, 170 insertions, 26 deletions
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 2e754862f0..5f831aaebf 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -34,6 +34,12 @@ yuv2yuvX_10_start: times 4 dd 0x10000 yuv2yuvX_9_start: times 4 dd 0x20000 yuv2yuvX_10_upper: times 8 dw 0x3ff yuv2yuvX_9_upper: times 8 dw 0x1ff +pd_4: times 4 dd 4 +pd_4min0x40000:times 4 dd 4 - (0x40000) +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +pw_512: times 8 dw 512 +pw_1024: times 8 dw 1024 SECTION .text @@ -665,3 +671,139 @@ INIT_AVX yuv2planeX_fn avx, 8, 10, 7 yuv2planeX_fn avx, 9, 7, 5 yuv2planeX_fn avx, 10, 7, 5 + +; %1=outout-bpc, %2=alignment (u/a) +%macro yuv2plane1_mainloop 2 +.loop_%2: +%if %1 == 8 + paddsw m0, m2, [r0+r2*2+mmsize*0] + paddsw m1, m3, [r0+r2*2+mmsize*1] + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 + mov%2 [r1+r2], m0 +%elif %1 == 16 + paddd m0, m4, [r0+r2*4+mmsize*0] + paddd m1, m4, [r0+r2*4+mmsize*1] + paddd m2, m4, [r0+r2*4+mmsize*2] + paddd m3, m4, [r0+r2*4+mmsize*3] + psrad m0, 3 + psrad m1, 3 + psrad m2, 3 + psrad m3, 3 +%if cpuflag(sse4) ; avx/sse4 + packusdw m0, m1 + packusdw m2, m3 +%else ; mmx/sse2 + packssdw m0, m1 + packssdw m2, m3 + paddw m0, m5 + paddw m2, m5 +%endif ; mmx/sse2/sse4/avx + mov%2 [r1+r2*2], m0 + mov%2 [r1+r2*2+mmsize], m2 +%else + paddsw m0, m2, [r0+r2*2+mmsize*0] + paddsw m1, m2, [r0+r2*2+mmsize*1] + psraw m0, 15 - %1 + psraw m1, 15 - %1 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m3 + pminsw m1, m3 + mov%2 [r1+r2*2], m0 + mov%2 [r1+r2*2+mmsize], m1 +%endif + add r2, mmsize + jl .loop_%2 +%endmacro + +%macro yuv2plane1_fn 3 +cglobal yuv2plane1_%1, %3, %3, %2 +%if %1 == 8 + add r1, r2 +%else ; %1 != 8 + lea r1, [r1+r2*2] +%endif ; %1 == 8 +%if %1 == 16 + lea r0, [r0+r2*4] +%else ; %1 != 16 + lea r0, [r0+r2*2] +%endif ; %1 == 16 + neg r2 + +%if %1 == 8 + pxor m4, m4 ; zero + + ; create registers holding dither + movq m3, [r3] ; dither + test r4d, r4d + jz .no_rot +%if mmsize == 16 + punpcklqdq m3, m3 +%endif ; mmsize == 16 + PALIGNR_MMX m3, m3, 3, m2 +.no_rot: +%if mmsize == 8 + mova m2, m3 + punpckhbw m3, m4 ; byte->word + punpcklbw m2, m4 ; byte->word +%else + punpcklbw m3, m4 + mova m2, m3 +%endif +%elif %1 == 9 + pxor m4, m4 + mova m3, [pw_512] + mova m2, [pw_32] +%elif %1 == 10 + pxor m4, m4 + mova m3, [pw_1024] + mova m2, [pw_16] +%else ; %1 == 16 +%if cpuflag(sse4) ; sse4/avx + mova m4, [pd_4] +%else ; mmx/sse2 + mova m4, [pd_4min0x40000] + mova m5, [minshort] +%endif ; mmx/sse2/sse4/avx +%endif ; %1 == .. + + ; actual pixel scaling +%if mmsize == 8 + yuv2plane1_mainloop %1, a +%else ; mmsize == 16 + test r1, 15 + jnz .unaligned + yuv2plane1_mainloop %1, a + REP_RET +.unaligned: + yuv2plane1_mainloop %1, u +%endif ; mmsize == 8/16 + REP_RET +%endmacro + +%ifdef ARCH_X86_32 +INIT_MMX mmx +yuv2plane1_fn 8, 0, 5 +yuv2plane1_fn 16, 0, 3 + +INIT_MMX mmx2 +yuv2plane1_fn 9, 0, 3 +yuv2plane1_fn 10, 0, 3 +%endif + +INIT_XMM sse2 +yuv2plane1_fn 8, 5, 5 +yuv2plane1_fn 9, 5, 3 +yuv2plane1_fn 10, 5, 3 +yuv2plane1_fn 16, 6, 3 + +INIT_XMM sse4 +yuv2plane1_fn 16, 5, 3 + +INIT_XMM avx +yuv2plane1_fn 8, 5, 5 +yuv2plane1_fn 9, 5, 3 +yuv2plane1_fn 10, 5, 3 +yuv2plane1_fn 16, 5, 3 diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c index c16be83f13..8869274dd4 100644 --- a/libswscale/x86/swscale_mmx.c +++ b/libswscale/x86/swscale_mmx.c @@ -289,6 +289,22 @@ VSCALEX_FUNCS(sse4, sse4); VSCALEX_FUNC(16, sse4); VSCALEX_FUNCS(avx, avx); +#define VSCALE_FUNC(size, opt) \ +extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \ + const uint8_t *dither, int offset) +#define VSCALE_FUNCS(opt1, opt2) \ + VSCALE_FUNC(8, opt1); \ + VSCALE_FUNC(9, opt2); \ + VSCALE_FUNC(10, opt2); \ + VSCALE_FUNC(16, opt1) + +#if ARCH_X86_32 +VSCALE_FUNCS(mmx, mmx2); +#endif +VSCALE_FUNCS(sse2, sse2); +VSCALE_FUNC(16, sse4); +VSCALE_FUNCS(avx, avx); + void ff_sws_init_swScale_mmx(SwsContext *c) { int cpu_flags = av_get_cpu_flags(); @@ -336,11 +352,19 @@ switch(c->dstBpc){ \ case 9: if (!isBE(c->dstFormat) && opt2chk) /*vscalefn = ff_yuv2planeX_9_ ## opt2;*/ break; \ default: /*vscalefn = ff_yuv2planeX_8_ ## opt1;*/ break; \ } +#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \ + switch(c->dstBpc){ \ + case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \ + case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \ + case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \ + default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + } #if ARCH_X86_32 if (cpu_flags & AV_CPU_FLAG_MMX) { ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2,); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2); } #endif #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ @@ -355,6 +379,7 @@ switch(c->dstBpc){ \ ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2); ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2); ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, sse2, 1,); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1); } if (cpu_flags & AV_CPU_FLAG_SSSE3) { ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3); @@ -366,10 +391,13 @@ switch(c->dstBpc){ \ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3); ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4, sse4, 1, if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4); + if (c->dstBpc == 16 && !isBE(c->dstFormat)) + c->yuv2plane1 = ff_yuv2plane1_16_sse4; } if (cpu_flags & AV_CPU_FLAG_AVX) { ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, avx, 1,); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1); } #endif } diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 0f177176cb..5be6072d3b 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -109,29 +109,6 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, ); } -static void RENAME(yuv2yuv1_ar)(const int16_t *src, uint8_t *dst, int dstW, const uint8_t *dither, int offset) -{ - dither_8to16(dither, offset); - __asm__ volatile( - "mov %2, %%"REG_a" \n\t" - ".p2align 4 \n\t" /* FIXME Unroll? */ - "1: \n\t" - "movq (%0, %%"REG_a", 2), %%mm0 \n\t" - "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" - "paddsw %%mm3, %%mm0 \n\t" - "paddsw %%mm4, %%mm1 \n\t" - "psraw $7, %%mm0 \n\t" - "psraw $7, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - MOVNTQ(%%mm0, (%1, %%REGa)) - "add $8, %%"REG_a" \n\t" - "jnc 1b \n\t" - :: "r" (src + dstW), "r" (dst + dstW), - "g" ((x86_reg)-dstW) - : "%"REG_a - ); -} - #define YSCALEYUV2PACKEDX_UV \ __asm__ volatile(\ "xor %%"REG_a", %%"REG_a" \n\t"\ @@ -1881,9 +1858,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) c->use_mmx_vfilter= 0; if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { - c->yuv2plane1 = RENAME(yuv2yuv1_ar ); if (c->flags & SWS_ACCURATE_RND) { - //c->yuv2yuv1 = RENAME(yuv2yuv1_ar ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; @@ -1896,7 +1871,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } else { int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat); - //c->yuv2plane1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 ); c->use_mmx_vfilter= 1; c->yuv2planeX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { |