diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-04-10 22:06:53 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-04-10 22:53:25 +0200 |
commit | e387c9d5dd56e1f29470ee933027ee3d92f9cfd6 (patch) | |
tree | daa5876aa5b6515b3c92b6ee45e552852345e35b /libavcodec/x86 | |
parent | b1ef4dc406e8a0bd9acea40d880aa4e74412075b (diff) | |
parent | 2130bd8f5b6504ea14cd41e33f5d4f431eb724f3 (diff) | |
download | ffmpeg-e387c9d5dd56e1f29470ee933027ee3d92f9cfd6.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master: (22 commits)
rv40dsp x86: use only one register, for both increment and loop counter
rv40dsp: implement prescaled versions for biweight.
avconv: use default channel layouts when they are unknown
avconv: parse channel layout string
nutdec: K&R formatting cosmetics
vda: Signal 4 byte NAL headers to the decoder regardless of what's in the extradata
mem: Consistently return NULL for av_malloc(0)
vf_overlay: implement poll_frame()
vf_scale: support named constants for sws flags.
lavc doxy: add all installed headers to doxy groups.
lavc doxy: add avfft to the main lavc group.
lavc doxy: add remaining avcodec.h functions to a misc doxygen group.
lavc doxy: add AVPicture functions to a doxy group.
lavc doxy: add resampling functions to a doxy group.
lavc doxy: replace \ with /
lavc doxy: add encoding functions to a doxy group.
lavc doxy: add decoding functions to a doxy group.
lavc doxy: fix formatting of AV_PKT_DATA_{PARAM_CHANGE,H263_MB_INFO}
lavc doxy: add AVPacket-related stuff to a separate doxy group.
lavc doxy: add core functions/definitions to a doxy group.
...
Conflicts:
ffmpeg.c
libavcodec/avcodec.h
libavcodec/vda.c
libavcodec/x86/rv40dsp.asm
libavfilter/vf_scale.c
libavformat/nutdec.c
libavutil/mem.c
tests/ref/acodec/pcm_s24daud
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/rv40dsp.asm | 107 | ||||
-rw-r--r-- | libavcodec/x86/rv40dsp_init.c | 30 |
2 files changed, 68 insertions, 69 deletions
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm index c13e9f03d9..e8acfb25fe 100644 --- a/libavcodec/x86/rv40dsp.asm +++ b/libavcodec/x86/rv40dsp.asm @@ -32,13 +32,14 @@ SECTION .text ; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2 %macro RV40_WCORE 4-5 - movh m4, [%3 + 0] - movh m5, [%4 + 0] + movh m4, [%3 + r6 + 0] + movh m5, [%4 + r6 + 0] %if %0 == 4 -%define OFFSET mmsize / 2 +%define OFFSET r6 + mmsize / 2 %else ; 8x8 block and sse2, stride was provided -%define OFFSET %5 +%define OFFSET r6 + add r6, r5 %endif movh m6, [%3 + OFFSET] movh m7, [%4 + OFFSET] @@ -99,10 +100,12 @@ SECTION .text packuswb m4, m6 %if %0 == 5 ; Only called for 8x8 blocks and sse2 - movh [%2 + 0], m4 - movhps [%2 + %5], m4 + sub r6, r5 + movh [%2 + r6], m4 + add r6, r5 + movhps [%2 + r6], m4 %else - mova [%2], m4 + mova [%2 + r6], m4 %endif %endmacro @@ -115,93 +118,79 @@ SECTION .text %endif ; Prepare for next loop - add r0, r5 - add r1, r5 - add r2, r5 + add r6, r5 %else %ifidn %1, 8 RV40_WCORE %2, r0, r1, r2, r5 ; Prepare 2 next lines - lea r0, [r0 + 2 * r5] - lea r1, [r1 + 2 * r5] - lea r2, [r2 + 2 * r5] + add r6, r5 %else RV40_WCORE %2, r0, r1, r2 ; Prepare single next line - add r0, r5 - add r1, r5 - add r2, r5 + add r6, r5 %endif %endif - dec r6 %endmacro ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) ; %1=size %2=num of xmm regs -%macro RV40_WEIGHT 2 -cglobal rv40_weight_func_%1, 6, 7, %2 +; The weights are FP0.14 notation of fractions depending on pts. +; For timebases without rounding error (i.e. PAL), the fractions +; can be simplified, and several operations can be avoided. +; Therefore, we check here whether they are multiples of 2^9 for +; those simplifications to occur. +%macro RV40_WEIGHT 3 +cglobal rv40_weight_func_%1_%2, 6, 7, 8 %if cpuflag(ssse3) mova m1, [shift_round] %else mova m1, [pw_16] %endif pxor m0, m0 - mov r6, r3 - or r6, r4 - ; The weights are FP0.14 notation of fractions depending on pts. - ; For timebases without rounding error (i.e. PAL), the fractions - ; can be simplified, and several operations can be avoided. - ; Therefore, we check here whether they are multiples of 2^9 for - ; those simplifications to occur. - and r6, 0x1FF ; Set loop counter and increments -%if mmsize == 8 - mov r6, %1 -%else - mov r6, (%1 * %1) / mmsize -%endif + mov r6, r5 + shl r6, %3 + add r0, r6 + add r1, r6 + add r2, r6 + neg r6 - ; Use result of test now - jz .loop_512 movd m2, r3d movd m3, r4d +%ifidn %1,rnd +%define RND 0 SPLATW m2, m2 - SPLATW m3, m3 - -.loop: - MAIN_LOOP %1, 0 - jnz .loop - REP_RET - - ; Weights are multiple of 512, which allows some shortcuts -.loop_512: - sar r3, 9 - sar r4, 9 - movd m2, r3d - movd m3, r4d +%else +%define RND 1 %if cpuflag(ssse3) punpcklbw m3, m2 - SPLATW m3, m3 %else SPLATW m2, m2 - SPLATW m3, m3 %endif -.loop2: - MAIN_LOOP %1, 1 - jnz .loop2 - REP_RET +%endif + SPLATW m3, m3 +.loop: + MAIN_LOOP %2, RND + jnz .loop + REP_RET %endmacro INIT_MMX mmx -RV40_WEIGHT 8, 0 -RV40_WEIGHT 16, 0 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 INIT_XMM sse2 -RV40_WEIGHT 8, 8 -RV40_WEIGHT 16, 8 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 INIT_XMM ssse3 -RV40_WEIGHT 8, 8 -RV40_WEIGHT 16, 8 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c index 79c70f78c3..df468aa9e5 100644 --- a/libavcodec/x86/rv40dsp_init.c +++ b/libavcodec/x86/rv40dsp_init.c @@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); #define DECLARE_WEIGHT(opt) \ -void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); \ -void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); +void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); \ +void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); \ +void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); \ +void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); DECLARE_WEIGHT(mmx) DECLARE_WEIGHT(sse2) DECLARE_WEIGHT(ssse3) @@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) if (mm_flags & AV_CPU_FLAG_MMX) { c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; - c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx; - c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx; + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx; + c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx; + c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx; } if (mm_flags & AV_CPU_FLAG_MMX2) { c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2; @@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; } if (mm_flags & AV_CPU_FLAG_SSE2) { - c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2; - c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2; + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; + c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; + c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; } if (mm_flags & AV_CPU_FLAG_SSSE3) { - c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3; - c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3; + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; + c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; + c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; } #endif } |