aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/rv40dsp.asm
diff options
context:
space:
mode:
author: Christophe GISQUET <christophe.gisquet@gmail.com> 2012-03-20 16:13:55 +0100
committer: Ronald S. Bultje <rsbultje@gmail.com> 2012-04-10 10:07:09 -0700
commit: 2130bd8f5b6504ea14cd41e33f5d4f431eb724f3 (patch)
tree: 48128b952d208c014d4b21db307c8d1c3b7e57ab /libavcodec/x86/rv40dsp.asm
parent: 272b252c0110225188c7d7f31167941210aac197 (diff)
download: ffmpeg-2130bd8f5b6504ea14cd41e33f5d4f431eb724f3.tar.gz
rv40dsp x86: use only one register, for both increment and loop counter
Around 10 cycles faster for luma.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/x86/rv40dsp.asm')
-rw-r--r-- libavcodec/x86/rv40dsp.asm 43
1 files changed, 20 insertions, 23 deletions
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 9028e74024..721d3df094 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -32,13 +32,14 @@ SECTION .text
; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
%macro RV40_WCORE 4-5
- movh m4, [%3 + 0]
- movh m5, [%4 + 0]
+ movh m4, [%3 + r6 + 0]
+ movh m5, [%4 + r6 + 0]
%if %0 == 4
-%define OFFSET mmsize / 2
+%define OFFSET r6 + mmsize / 2
%else
; 8x8 block and sse2, stride was provided
-%define OFFSET %5
+%define OFFSET r6
+ add r6, r5
%endif
movh m6, [%3 + OFFSET]
movh m7, [%4 + OFFSET]
@@ -99,10 +100,12 @@ SECTION .text
packuswb m4, m6
%if %0 == 5
; Only called for 8x8 blocks and sse2
- movh [%2 + 0], m4
- movhps [%2 + %5], m4
+ sub r6, r5
+ movh [%2 + r6], m4
+ add r6, r5
+ movhps [%2 + r6], m4
%else
- mova [%2], m4
+ mova [%2 + r6], m4
%endif
%endmacro
@@ -115,26 +118,19 @@ SECTION .text
%endif
; Prepare for next loop
- add r0, r5
- add r1, r5
- add r2, r5
+ add r6, r5
%else
%ifidn %1, 8
RV40_WCORE %2, r0, r1, r2, r5
; Prepare 2 next lines
- lea r0, [r0 + 2 * r5]
- lea r1, [r1 + 2 * r5]
- lea r2, [r2 + 2 * r5]
+ add r6, r5
%else
RV40_WCORE %2, r0, r1, r2
; Prepare single next line
- add r0, r5
- add r1, r5
- add r2, r5
+ add r6, r5
%endif
%endif
- dec r6
%endmacro
; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
@@ -145,7 +141,7 @@ SECTION .text
; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur.
%macro RV40_WEIGHT 3
-cglobal rv40_weight_func_%1_%2, 6, 7, %3
+cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
mova m1, [shift_round]
%else
@@ -153,11 +149,12 @@ cglobal rv40_weight_func_%1_%2, 6, 7, %3
%endif
pxor m0, m0
; Set loop counter and increments
-%if mmsize == 8
- mov r6, %2
-%else
- mov r6, (%2 * %2) / mmsize
-%endif
+ mov r6, r5
+ shl r6, %3
+ add r0, r6
+ add r1, r6
+ add r2, r6
+ neg r6
movd m2, r3
movd m3, r4