diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-03-19 22:46:28 +0100 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-04-10 10:06:48 -0700 |
commit | 272b252c0110225188c7d7f31167941210aac197 (patch) | |
tree | 47bea5996c88057a418e8872a655bac8f261736e /libavcodec/x86/rv40dsp.asm | |
parent | d3c59d5003a483f1a23e225fc71c19bd1116d11c (diff) | |
download | ffmpeg-272b252c0110225188c7d7f31167941210aac197.tar.gz |
rv40dsp: implement prescaled versions for biweight.
Quite often, the original weights are multiples of 512. By prescaling them
by 1/512 when they are computed (once per frame), no intermediate shifting
is needed, and no prescaling is needed on each call either.
The x86 code already used that trick.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/x86/rv40dsp.asm')
-rw-r--r-- | libavcodec/x86/rv40dsp.asm | 70 |
1 file changed, 31 insertions, 39 deletions
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm index bff3e7b96a..9028e74024 100644 --- a/libavcodec/x86/rv40dsp.asm +++ b/libavcodec/x86/rv40dsp.asm @@ -139,69 +139,61 @@ SECTION .text ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) ; %1=size %2=num of xmm regs -%macro RV40_WEIGHT 2 -cglobal rv40_weight_func_%1, 6, 7, %2 +; The weights are FP0.14 notation of fractions depending on pts. +; For timebases without rounding error (i.e. PAL), the fractions +; can be simplified, and several operations can be avoided. +; Therefore, we check here whether they are multiples of 2^9 for +; those simplifications to occur. +%macro RV40_WEIGHT 3 +cglobal rv40_weight_func_%1_%2, 6, 7, %3 %if cpuflag(ssse3) mova m1, [shift_round] %else mova m1, [pw_16] %endif pxor m0, m0 - mov r6, r3 - or r6, r4 - ; The weights are FP0.14 notation of fractions depending on pts. - ; For timebases without rounding error (i.e. PAL), the fractions - ; can be simplified, and several operations can be avoided. - ; Therefore, we check here whether they are multiples of 2^9 for - ; those simplifications to occur. 
- and r6, 0x1FF ; Set loop counter and increments %if mmsize == 8 - mov r6, %1 + mov r6, %2 %else - mov r6, (%1 * %1) / mmsize + mov r6, (%2 * %2) / mmsize %endif - ; Use result of test now - jz .loop_512 movd m2, r3 movd m3, r4 +%ifidn %1,rnd +%define RND 0 SPLATW m2, m2 - SPLATW m3, m3 - -.loop: - MAIN_LOOP %1, 0 - jnz .loop - REP_RET - - ; Weights are multiple of 512, which allows some shortcuts -.loop_512: - sar r3, 9 - sar r4, 9 - movd m2, r3 - movd m3, r4 +%else +%define RND 1 %if cpuflag(ssse3) punpcklbw m3, m2 - SPLATW m3, m3 %else SPLATW m2, m2 - SPLATW m3, m3 %endif -.loop2: - MAIN_LOOP %1, 1 - jnz .loop2 - REP_RET +%endif + SPLATW m3, m3 +.loop: + MAIN_LOOP %2, RND + jnz .loop + REP_RET %endmacro INIT_MMX mmx -RV40_WEIGHT 8, 0 -RV40_WEIGHT 16, 0 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 INIT_XMM sse2 -RV40_WEIGHT 8, 8 -RV40_WEIGHT 16, 8 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 INIT_XMM ssse3 -RV40_WEIGHT 8, 8 -RV40_WEIGHT 16, 8 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 |