diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-03-20 16:13:55 +0100 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-04-10 10:07:09 -0700 |
commit | 2130bd8f5b6504ea14cd41e33f5d4f431eb724f3 (patch) | |
tree | 48128b952d208c014d4b21db307c8d1c3b7e57ab /libavcodec | |
parent | 272b252c0110225188c7d7f31167941210aac197 (diff) | |
download | ffmpeg-2130bd8f5b6504ea14cd41e33f5d4f431eb724f3.tar.gz |
rv40dsp x86: use only one register, for both increment and loop counter
Around 10 cycles faster for luma.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/rv40dsp.asm | 43 |
1 files changed, 20 insertions, 23 deletions
```diff
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 9028e74024..721d3df094 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -32,13 +32,14 @@ SECTION .text
 ; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
 %macro RV40_WCORE 4-5
-    movh m4, [%3 + 0]
-    movh m5, [%4 + 0]
+    movh m4, [%3 + r6 + 0]
+    movh m5, [%4 + r6 + 0]
 %if %0 == 4
-%define OFFSET mmsize / 2
+%define OFFSET r6 + mmsize / 2
 %else
 ; 8x8 block and sse2, stride was provided
-%define OFFSET %5
+%define OFFSET r6
+    add r6, r5
 %endif
     movh m6, [%3 + OFFSET]
     movh m7, [%4 + OFFSET]
@@ -99,10 +100,12 @@ SECTION .text
     packuswb m4, m6
 %if %0 == 5
 ; Only called for 8x8 blocks and sse2
-    movh [%2 + 0], m4
-    movhps [%2 + %5], m4
+    sub r6, r5
+    movh [%2 + r6], m4
+    add r6, r5
+    movhps [%2 + r6], m4
 %else
-    mova [%2], m4
+    mova [%2 + r6], m4
 %endif
 %endmacro
@@ -115,26 +118,19 @@ SECTION .text
 %endif
     ; Prepare for next loop
-    add r0, r5
-    add r1, r5
-    add r2, r5
+    add r6, r5
 %else
 %ifidn %1, 8
     RV40_WCORE %2, r0, r1, r2, r5
     ; Prepare 2 next lines
-    lea r0, [r0 + 2 * r5]
-    lea r1, [r1 + 2 * r5]
-    lea r2, [r2 + 2 * r5]
+    add r6, r5
 %else
     RV40_WCORE %2, r0, r1, r2
     ; Prepare single next line
-    add r0, r5
-    add r1, r5
-    add r2, r5
+    add r6, r5
 %endif
 %endif
-    dec r6
 %endmacro
 ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
@@ -145,7 +141,7 @@ SECTION .text
 ; Therefore, we check here whether they are multiples of 2^9 for
 ; those simplifications to occur.
 %macro RV40_WEIGHT 3
-cglobal rv40_weight_func_%1_%2, 6, 7, %3
+cglobal rv40_weight_func_%1_%2, 6, 7, 8
 %if cpuflag(ssse3)
     mova m1, [shift_round]
 %else
@@ -153,11 +149,12 @@ cglobal rv40_weight_func_%1_%2, 6, 7, %3
 %endif
     pxor m0, m0
     ; Set loop counter and increments
-%if mmsize == 8
-    mov r6, %2
-%else
-    mov r6, (%2 * %2) / mmsize
-%endif
+    mov r6, r5
+    shl r6, %3
+    add r0, r6
+    add r1, r6
+    add r2, r6
+    neg r6
     movd m2, r3
     movd m3, r4
```