aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRonald S. Bultje <rsbultje@gmail.com>2013-01-18 16:43:04 +0100
committerRonald S. Bultje <rsbultje@gmail.com>2013-01-19 22:04:55 -0800
commitaeaf268e52fc11c1f64914a319e0edddf1346d6a (patch)
tree4596586d2adfda684defde76992d6fb4426b6089
parent992b03183819553a73b4f870a710ef500b4eb6d0 (diff)
downloadffmpeg-aeaf268e52fc11c1f64914a319e0edddf1346d6a.tar.gz
vp3: integrate clear_blocks with idct of previous block.
This is identical to what e.g. vp8 does, and prevents the function call overhead (plus dependency on dsputil for this particular function). Arm asm updated by Janne Grunau <janne-libav@jannau.net>. Signed-off-by: Janne Grunau <janne-libav@jannau.net>
-rw-r--r--libavcodec/arm/vp3dsp_neon.S22
-rw-r--r--libavcodec/ppc/vp3dsp_altivec.c2
-rw-r--r--libavcodec/vp3.c5
-rw-r--r--libavcodec/vp3dsp.c5
-rw-r--r--libavcodec/vp3dsp.h2
-rw-r--r--libavcodec/x86/vp3dsp.asm27
-rw-r--r--libavcodec/x86/vp3dsp_init.c2
7 files changed, 45 insertions, 20 deletions
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index e09de57281..e5ecfc337e 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -108,14 +108,20 @@ endfunc
function vp3_idct_start_neon
vpush {d8-d15}
+ vmov.i16 q4, #0
+ vmov.i16 q5, #0
movrel r3, vp3_idct_constants
vld1.64 {d0-d1}, [r3,:128]
- vld1.64 {d16-d19}, [r2,:128]!
- vld1.64 {d20-d23}, [r2,:128]!
- vld1.64 {d24-d27}, [r2,:128]!
+ vld1.64 {d16-d19}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+ vld1.64 {d20-d23}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+ vld1.64 {d24-d27}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
vadd.s16 q1, q8, q12
vsub.s16 q8, q8, q12
- vld1.64 {d28-d31}, [r2,:128]!
+ vld1.64 {d28-d31}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
vp3_idct_core_neon:
vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
@@ -345,10 +351,12 @@ function ff_vp3_idct_add_neon, export=1
endfunc
function ff_vp3_idct_dc_add_neon, export=1
- ldrsh r2, [r2]
+ ldrsh r12, [r2]
mov r3, r0
- add r2, r2, #15
- vdup.16 q15, r2
+ add r12, r12, #15
+ vdup.16 q15, r12
+ mov r12, 0
+ strh r12, [r2]
vshr.s16 q15, q15, #5
vld1.8 {d0}, [r0,:64], r1
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index 75a36779ce..6adf9aefac 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -140,6 +140,7 @@ static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
PUT(b5) dst += stride;
PUT(b6) dst += stride;
PUT(b7)
+ memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
@@ -171,6 +172,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
ADD(b5) dst += stride;
ADD(b6) dst += stride;
ADD(b7)
+ memset(block, 0, sizeof(*block) * 64);
}
#endif /* HAVE_ALTIVEC */
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 0340c22bb2..9417535314 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -138,6 +138,7 @@ typedef struct Vp3DecodeContext {
DSPContext dsp;
VideoDSPContext vdsp;
VP3DSPContext vp3dsp;
+ DECLARE_ALIGNED(16, DCTELEM, block)[64];
int flipped_image;
int last_slice_end;
int skip_loop_filter;
@@ -1458,7 +1459,7 @@ static void await_reference_row(Vp3DecodeContext *s, Vp3Fragment *fragment, int
static void render_slice(Vp3DecodeContext *s, int slice)
{
int x, y, i, j, fragment;
- LOCAL_ALIGNED_16(DCTELEM, block, [64]);
+ DCTELEM *block = s->block;
int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef;
int motion_halfpel_index;
uint8_t *motion_source;
@@ -1571,8 +1572,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
}
}
- s->dsp.clear_block(block);
-
/* invert DCT and place (or add) in final output */
if (s->all_fragments[i].coding_method == MODE_INTRA) {
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index 9b0b5d0a9c..9e6209dfdd 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -215,14 +215,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
static void vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
idct(dest, line_size, block, 1);
+ memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
idct(dest, line_size, block, 2);
+ memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
- const DCTELEM *block/*align 16*/){
+ DCTELEM *block/*align 16*/){
int i, dc = (block[0] + 15) >> 5;
for(i = 0; i < 8; i++){
@@ -236,6 +238,7 @@ static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
dest[7] = av_clip_uint8(dest[7] + dc);
dest += line_size;
}
+ block[0] = 0;
}
static void vp3_v_loop_filter_c(uint8_t *first_pixel, int stride,
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 3781bbf3a7..feb300017a 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -25,7 +25,7 @@
typedef struct VP3DSPContext {
void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block);
void (*idct_add)(uint8_t *dest, int line_size, DCTELEM *block);
- void (*idct_dc_add)(uint8_t *dest, int line_size, const DCTELEM *block);
+ void (*idct_dc_add)(uint8_t *dest, int line_size, DCTELEM *block);
void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index fc1e776a13..d2c464c5cf 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -562,6 +562,13 @@ cglobal vp3_idct_put, 3, 4, 9
%endif
%assign %%i %%i+64
%endrep
+
+ pxor m0, m0
+%assign %%offset 0
+%rep 128/mmsize
+ mova [r2+%%offset], m0
+%assign %%offset %%offset+mmsize
+%endrep
RET
cglobal vp3_idct_add, 3, 4, 9
@@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9
movhps [r0+r1], m0
%endif
lea r0, [r0+r1*2]
+%assign %%offset 0
+%rep 32/mmsize
+ mova [r2+%%offset], m4
+%assign %%offset %%offset+mmsize
+%endrep
add r2, 32
dec r3
jg .loop
@@ -620,7 +632,7 @@ vp3_idct_funcs
paddusb m2, m0
movq m4, [r0+r1*2]
paddusb m3, m0
- movq m5, [r0+r3 ]
+ movq m5, [r0+r2 ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
@@ -630,7 +642,7 @@ vp3_idct_funcs
movq [r0+r1 ], m3
psubusb m5, m1
movq [r0+r1*2], m4
- movq [r0+r3 ], m5
+ movq [r0+r2 ], m5
%endmacro
INIT_MMX mmxext
@@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
- lea r3, [r1*3]
- movsx r2, word [r2]
- add r2, 15
- sar r2, 5
- movd m0, r2d
+ movsx r3, word [r2]
+ mov word [r2], 0
+ lea r2, [r1*3]
+ add r3, 15
+ sar r3, 5
+ movd m0, r3d
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index bbe74ba44a..95beeabfec 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -32,7 +32,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
- const DCTELEM *block);
+ DCTELEM *block);
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);