diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-02-18 21:03:02 -0800 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2013-02-19 16:25:50 +0100 |
commit | 1acd7d594c15aa491729c837ad3519d3469e620a (patch) | |
tree | 4a515aa31c7483d8a2fdfa1d4399c9a9d2b29cae /libavcodec | |
parent | a1f1ca96b42698002920467c0aab9e636893088a (diff) | |
download | ffmpeg-1acd7d594c15aa491729c837ad3519d3469e620a.tar.gz |
h264: integrate clear_blocks calls with IDCT.
The non-intra-PCM branch in hl_decode_mb (simple, 8bpp) goes from 700
to 672 cycles (28 fewer), and the complete loop of decode_mb_cabac and
hl_decode_mb (in the decode_slice loop) goes from 1759 to 1733 cycles
(26 fewer) on the clip tested (cathedral), i.e. almost 30 cycles per
macroblock faster.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/arm/h264idct_neon.S | 29 | ||||
-rw-r--r-- | libavcodec/h264.c | 14 | ||||
-rw-r--r-- | libavcodec/h264_mb_template.c | 11 | ||||
-rw-r--r-- | libavcodec/h264addpx_template.c | 4 | ||||
-rw-r--r-- | libavcodec/h264dsp.c | 4 | ||||
-rw-r--r-- | libavcodec/h264dsp.h | 4 | ||||
-rw-r--r-- | libavcodec/h264idct_template.c | 16 | ||||
-rw-r--r-- | libavcodec/h264pred.h | 8 | ||||
-rw-r--r-- | libavcodec/h264pred_template.c | 28 | ||||
-rw-r--r-- | libavcodec/ppc/h264_altivec.c | 3 | ||||
-rw-r--r-- | libavcodec/svq3.c | 4 | ||||
-rw-r--r-- | libavcodec/x86/h264_idct.asm | 108 | ||||
-rw-r--r-- | libavcodec/x86/h264_idct_10bit.asm | 53 |
13 files changed, 209 insertions, 77 deletions
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S index 1b349ced58..fa5b90c23a 100644 --- a/libavcodec/arm/h264idct_neon.S +++ b/libavcodec/arm/h264idct_neon.S @@ -22,9 +22,12 @@ function ff_h264_idct_add_neon, export=1 vld1.64 {d0-d3}, [r1,:128] + vmov.i16 q15, #0 vswp d1, d2 + vst1.16 {q15}, [r1,:128]! vadd.i16 d4, d0, d1 + vst1.16 {q15}, [r1,:128]! vshr.s16 q8, q1, #1 vsub.i16 d5, d0, d1 vadd.i16 d6, d2, d17 @@ -65,11 +68,14 @@ function ff_h264_idct_add_neon, export=1 vst1.32 {d0[1]}, [r0,:32], r2 vst1.32 {d1[0]}, [r0,:32], r2 + sub r1, r1, #32 bx lr endfunc function ff_h264_idct_dc_add_neon, export=1 + mov r3, #0 vld1.16 {d2[],d3[]}, [r1,:16] + strh r3, [r1] vrshr.s16 q1, q1, #6 vld1.32 {d0[0]}, [r0,:32], r2 vld1.32 {d0[1]}, [r0,:32], r2 @@ -148,7 +154,7 @@ function ff_h264_idct_add8_neon, export=1 add r5, r1, #16*4 add r1, r2, #16*32 mov r2, r3 - mov r3, r1 + mov r10, r1 ldr r6, [sp, #32] movrel r7, scan8+16 mov r12, #0 @@ -156,7 +162,7 @@ function ff_h264_idct_add8_neon, export=1 ldr r0, [r5, r12, lsl #2] ldrb r8, [r6, r8] add r0, r0, r4 - add r1, r3, r12, lsl #5 + add r1, r10, r12, lsl #5 cmp r8, #0 ldrsh r8, [r1] iteet ne @@ -180,7 +186,9 @@ endfunc qb .req q14 vshr.s16 q2, q10, #1 vadd.i16 q0, q8, q12 - vld1.16 {q14-q15},[r1,:128]! + vld1.16 {q14-q15},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! vsub.i16 q1, q8, q12 vshr.s16 q3, q14, #1 vsub.i16 q2, q2, q14 @@ -259,9 +267,16 @@ endfunc .endm function ff_h264_idct8_add_neon, export=1 - vld1.16 {q8-q9}, [r1,:128]! - vld1.16 {q10-q11},[r1,:128]! - vld1.16 {q12-q13},[r1,:128]! + vmov.i16 q7, #0 + vld1.16 {q8-q9}, [r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! + vld1.16 {q10-q11},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! + vld1.16 {q12-q13},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! 
idct8x8_cols 0 idct8x8_cols 1 @@ -313,7 +328,9 @@ function ff_h264_idct8_add_neon, export=1 endfunc function ff_h264_idct8_dc_add_neon, export=1 + mov r3, #0 vld1.16 {d30[],d31[]},[r1,:16] + strh r3, [r1] vld1.32 {d0}, [r0,:64], r2 vrshr.s16 q15, q15, #6 vld1.32 {d1}, [r0,:64], r2 diff --git a/libavcodec/h264.c b/libavcodec/h264.c index fe12846f4f..ec709760e4 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -2191,7 +2191,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, if (IS_8x8DCT(mb_type)) { if (transform_bypass) { idct_dc_add = - idct_add = h->h264dsp.h264_add_pixels8; + idct_add = h->h264dsp.h264_add_pixels8_clear; } else { idct_dc_add = h->h264dsp.h264_idct8_dc_add; idct_add = h->h264dsp.h264_idct8_add; @@ -2216,7 +2216,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, } else { if (transform_bypass) { idct_dc_add = - idct_add = h->h264dsp.h264_add_pixels4; + idct_add = h->h264dsp.h264_add_pixels4_clear; } else { idct_dc_add = h->h264dsp.h264_idct_dc_add; idct_add = h->h264dsp.h264_idct_add; @@ -2313,9 +2313,9 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, for (i = 0; i < 16; i++) if (h->non_zero_count_cache[scan8[i + p * 16]] || dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) - h->h264dsp.h264_add_pixels4(dest_y + block_offset[i], - h->mb + (i * 16 + p * 256 << pixel_shift), - linesize); + h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[i], + h->mb + (i * 16 + p * 256 << pixel_shift), + linesize); } } else { h->h264dsp.h264_idct_add16intra(dest_y, block_offset, @@ -2326,8 +2326,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, } else if (h->cbp & 15) { if (transform_bypass) { const int di = IS_8x8DCT(mb_type) ? 4 : 1; - idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8 - : h->h264dsp.h264_add_pixels4; + idct_add = IS_8x8DCT(mb_type) ? 
h->h264dsp.h264_add_pixels8_clear + : h->h264dsp.h264_add_pixels4_clear; for (i = 0; i < 16; i += di) if (h->non_zero_count_cache[scan8[i + p * 16]]) idct_add(dest_y + block_offset[i], diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c index b617029a9f..6c62a7608b 100644 --- a/libavcodec/h264_mb_template.c +++ b/libavcodec/h264_mb_template.c @@ -204,7 +204,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h) h->mb + (16 * 16 * 2 << PIXEL_SHIFT), uvlinesize); } else { - idct_add = h->h264dsp.h264_add_pixels4; + idct_add = h->h264dsp.h264_add_pixels4_clear; for (j = 1; j < 3; j++) { for (i = j * 16; i < j * 16 + 4; i++) if (h->non_zero_count_cache[scan8[i]] || @@ -258,10 +258,6 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h) } } } - if (h->cbp || IS_INTRA(mb_type)) { - h->dsp.clear_blocks(h->mb); - h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT)); - } } } @@ -365,11 +361,6 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h) hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass, PIXEL_SHIFT, block_offset, linesize, dest[p], p); - - if (h->cbp || IS_INTRA(mb_type)) { - h->dsp.clear_blocks(h->mb); - h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT)); - } } } diff --git a/libavcodec/h264addpx_template.c b/libavcodec/h264addpx_template.c index 1173766805..046b6c2e19 100644 --- a/libavcodec/h264addpx_template.c +++ b/libavcodec/h264addpx_template.c @@ -43,6 +43,8 @@ static void FUNCC(ff_h264_add_pixels4)(uint8_t *_dst, int16_t *_src, int stride) dst += stride; src += 4; } + + memset(_src, 0, sizeof(dctcoef) * 16); } static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) @@ -65,4 +67,6 @@ static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) dst += stride; src += 8; } + + memset(_src, 0, sizeof(dctcoef) * 64); } diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index bb70d4e33e..da9e417e8e 100644 --- a/libavcodec/h264dsp.c +++ 
b/libavcodec/h264dsp.c @@ -66,8 +66,8 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo #define FUNC(a, depth) a ## _ ## depth ## _c #define ADDPX_DSP(depth) \ - c->h264_add_pixels4 = FUNC(ff_h264_add_pixels4, depth);\ - c->h264_add_pixels8 = FUNC(ff_h264_add_pixels8, depth) + c->h264_add_pixels4_clear = FUNC(ff_h264_add_pixels4, depth);\ + c->h264_add_pixels8_clear = FUNC(ff_h264_add_pixels8, depth) if (bit_depth > 8 && bit_depth <= 16) { ADDPX_DSP(16); diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index 82e147fe35..98ea15c330 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -103,8 +103,8 @@ typedef struct H264DSPContext { void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul); /* bypass-transform */ - void (*h264_add_pixels8)(uint8_t *dst, int16_t *block, int stride); - void (*h264_add_pixels4)(uint8_t *dst, int16_t *block, int stride); + void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride); + void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride); } H264DSPContext; void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c index b0ab847d70..9f16e1d2e0 100644 --- a/libavcodec/h264idct_template.c +++ b/libavcodec/h264idct_template.c @@ -61,6 +61,8 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride) dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6)); dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6)); } + + memset(block, 0, 16 * sizeof(dctcoef)); } void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){ @@ -133,14 +135,18 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){ dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) ); dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) ); } + + memset(block, 0, 64 * sizeof(dctcoef)); } // assumes 
all AC coefs are 0 -void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){ +void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride){ int i, j; - int dc = (((dctcoef*)block)[0] + 32) >> 6; pixel *dst = (pixel*)_dst; + dctcoef *block = (dctcoef*)_block; + int dc = (block[0] + 32) >> 6; stride >>= sizeof(pixel)-1; + block[0] = 0; for( j = 0; j < 4; j++ ) { for( i = 0; i < 4; i++ ) @@ -149,10 +155,12 @@ void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){ } } -void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *block, int stride){ +void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride){ int i, j; - int dc = (((dctcoef*)block)[0] + 32) >> 6; pixel *dst = (pixel*)_dst; + dctcoef *block = (dctcoef*)_block; + int dc = (block[0] + 32) >> 6; + block[0] = 0; stride >>= sizeof(pixel)-1; for( j = 0; j < 8; j++ ) { diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h index 36b542b7be..ed67d2ef43 100644 --- a/libavcodec/h264pred.h +++ b/libavcodec/h264pred.h @@ -98,15 +98,15 @@ typedef struct H264PredContext { void(*pred16x16[4 + 3 + 2])(uint8_t *src, ptrdiff_t stride); void(*pred4x4_add[2])(uint8_t *pix /*align 4*/, - const int16_t *block /*align 16*/, ptrdiff_t stride); + int16_t *block /*align 16*/, ptrdiff_t stride); void(*pred8x8l_add[2])(uint8_t *pix /*align 8*/, - const int16_t *block /*align 16*/, ptrdiff_t stride); + int16_t *block /*align 16*/, ptrdiff_t stride); void(*pred8x8_add[3])(uint8_t *pix /*align 8*/, const int *block_offset, - const int16_t *block /*align 16*/, ptrdiff_t stride); + int16_t *block /*align 16*/, ptrdiff_t stride); void(*pred16x16_add[3])(uint8_t *pix /*align 16*/, const int *block_offset, - const int16_t *block /*align 16*/, ptrdiff_t stride); + int16_t *block /*align 16*/, ptrdiff_t stride); } H264PredContext; void ff_h264_pred_init(H264PredContext *h, int codec_id, diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c index 
e78f2d423c..8d8d62e0b6 100644 --- a/libavcodec/h264pred_template.c +++ b/libavcodec/h264pred_template.c @@ -1132,7 +1132,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, #undef PL #undef SRC -static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block, +static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block, ptrdiff_t stride) { int i; @@ -1149,9 +1149,11 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block, pix++; block++; } + + memset(_block, 0, sizeof(dctcoef) * 16); } -static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block, +static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block, ptrdiff_t stride) { int i; @@ -1167,9 +1169,11 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block, pix+= stride; block+= 4; } + + memset(_block, 0, sizeof(dctcoef) * 16); } -static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block, +static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block, ptrdiff_t stride) { int i; @@ -1190,9 +1194,11 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block, pix++; block++; } + + memset(_block, 0, sizeof(dctcoef) * 64); } -static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block, +static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block, ptrdiff_t stride) { int i; @@ -1212,10 +1218,12 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block, pix+= stride; block+= 8; } + + memset(_block, 0, sizeof(dctcoef) * 64); } static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, - const int16_t *block, + int16_t *block, ptrdiff_t stride) { int i; @@ -1225,7 +1233,7 @@ static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, - const int16_t *block, + 
int16_t *block, ptrdiff_t stride) { int i; @@ -1234,7 +1242,7 @@ static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, } static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, - const int16_t *block, ptrdiff_t stride) + int16_t *block, ptrdiff_t stride) { int i; for(i=0; i<4; i++) @@ -1242,7 +1250,7 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, } static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, - const int16_t *block, ptrdiff_t stride) + int16_t *block, ptrdiff_t stride) { int i; for(i=0; i<4; i++) @@ -1252,7 +1260,7 @@ static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, } static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, - const int16_t *block, + int16_t *block, ptrdiff_t stride) { int i; @@ -1262,7 +1270,7 @@ static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, - const int16_t *block, ptrdiff_t stride) + int16_t *block, ptrdiff_t stride) { int i; for(i=0; i<4; i++) diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c index 46ee61c6f2..3c2bb4d630 100644 --- a/libavcodec/ppc/h264_altivec.c +++ b/libavcodec/ppc/h264_altivec.c @@ -87,6 +87,7 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride) vtmp1 = vec_sld(vtmp0, vtmp0, 8); vtmp2 = vec_ld(16,block); vtmp3 = vec_sld(vtmp2, vtmp2, 8); + memset(block, 0, 16 * sizeof(int16_t)); VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); @@ -206,6 +207,7 @@ static void ff_h264_idct8_add_altivec( uint8_t *dst, int16_t *dct, int stride ) s5 = vec_ld(0x50, (int16_t*)dct); s6 = vec_ld(0x60, (int16_t*)dct); s7 = vec_ld(0x70, (int16_t*)dct); + memset(dct, 0, 64 * sizeof(int16_t)); IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7); @@ 
-234,6 +236,7 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl int i; dc = (block[0] + 32) >> 6; + block[0] = 0; dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); if (size == 4) diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c index 7817766e8c..8f910b588f 100644 --- a/libavcodec/svq3.c +++ b/libavcodec/svq3.c @@ -219,6 +219,8 @@ void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block, dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20)); dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20)); } + + memset(block, 0, 16 * sizeof(int16_t)); } static inline int svq3_decode_block(GetBitContext *gb, int16_t *block, @@ -669,8 +671,6 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type) } if (!IS_SKIP(mb_type) || h->pict_type == AV_PICTURE_TYPE_B) { memset(h->non_zero_count_cache + 8, 0, 14 * 8 * sizeof(uint8_t)); - h->dsp.clear_blocks(h->mb + 0); - h->dsp.clear_blocks(h->mb + 384); } if (!IS_INTRA16x16(mb_type) && diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 8fef7b8199..7bb1653428 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -70,6 +70,10 @@ SECTION .text paddw m0, m6 IDCT4_1D w, 0, 1, 2, 3, 4, 5 pxor m7, m7 + movq [%2+ 0], m7 + movq [%2+ 8], m7 + movq [%2+16], m7 + movq [%2+24], m7 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 lea %1, [%1+%3*2] @@ -161,13 +165,31 @@ cglobal h264_idct_add_8, 3, 3, 0 %endmacro ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride -%macro IDCT8_ADD_MMX_END 3 +%macro IDCT8_ADD_MMX_END 3-4 IDCT8_1D_FULL %2 mova [%2 ], m5 mova [%2+16], m6 mova [%2+32], m7 pxor m7, m7 +%if %0 == 4 + movq [%4+ 0], m7 + movq [%4+ 8], m7 + movq [%4+ 16], m7 + movq [%4+ 24], m7 + movq [%4+ 32], m7 + movq [%4+ 40], m7 + movq [%4+ 48], m7 + movq [%4+ 56], m7 + movq [%4+ 64], m7 + movq [%4+ 72], m7 + movq [%4+ 80], m7 + movq [%4+ 88], m7 + movq [%4+ 96], m7 + movq [%4+104], m7 + movq [%4+112], 
m7 + movq [%4+120], m7 +%endif STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 lea %1, [%1+%3*2] STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 @@ -190,7 +212,7 @@ cglobal h264_idct8_add_8, 3, 4, 0 IDCT8_ADD_MMX_START r1 , rsp IDCT8_ADD_MMX_START r1+8, rsp+64 lea r3, [r0+4] - IDCT8_ADD_MMX_END r0 , rsp, r2 + IDCT8_ADD_MMX_END r0 , rsp, r2, r1 IDCT8_ADD_MMX_END r3 , rsp+8, r2 ADD rsp, pad @@ -233,6 +255,14 @@ cglobal h264_idct8_add_8, 3, 4, 0 SWAP 0, 8 SWAP 1, 9 %endif + mova [%2+ 0], m7 + mova [%2+ 16], m7 + mova [%2+ 32], m7 + mova [%2+ 48], m7 + mova [%2+ 64], m7 + mova [%2+ 80], m7 + mova [%2+ 96], m7 + mova [%2+112], m7 lea %1, [%1+%3*4] STORE_DIFF m4, m6, m7, [%1 ] STORE_DIFF m5, m6, m7, [%1+%3 ] @@ -246,19 +276,11 @@ cglobal h264_idct8_add_8, 3, 4, 10 IDCT8_ADD_SSE r0, r1, r2, r3 RET -%macro DC_ADD_MMXEXT_INIT 2-3 -%if %0 == 2 - movsx %1, word [%1] +%macro DC_ADD_MMXEXT_INIT 2 add %1, 32 sar %1, 6 movd m0, %1d lea %1, [%2*3] -%else - add %3, 32 - sar %3, 6 - movd m0, %3d - lea %3, [%2*3] -%endif pshufw m0, m0, 0 pxor m1, m1 psubw m1, m0 @@ -287,19 +309,44 @@ cglobal h264_idct8_add_8, 3, 4, 10 INIT_MMX mmxext ; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct_dc_add_8, 3, 3, 0 - DC_ADD_MMXEXT_INIT r1, r2 - DC_ADD_MMXEXT_OP movh, r0, r2, r1 +%if ARCH_X86_64 +cglobal h264_idct_dc_add_8, 3, 4, 0 + movsx r3, word [r1] + mov word [r1], 0 + DC_ADD_MMXEXT_INIT r3, r2 + DC_ADD_MMXEXT_OP movh, r0, r2, r3 RET ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct8_dc_add_8, 3, 3, 0 - DC_ADD_MMXEXT_INIT r1, r2 - DC_ADD_MMXEXT_OP mova, r0, r2, r1 +cglobal h264_idct8_dc_add_8, 3, 4, 0 + movsx r3, word [r1] + mov word [r1], 0 + DC_ADD_MMXEXT_INIT r3, r2 + DC_ADD_MMXEXT_OP mova, r0, r2, r3 lea r0, [r0+r2*4] - DC_ADD_MMXEXT_OP mova, r0, r2, r1 + DC_ADD_MMXEXT_OP mova, r0, r2, r3 + RET +%else +cglobal h264_idct_dc_add_8, 2, 3, 0 + movsx r2, word [r1] + mov word [r1], 0 + mov r1, r2m + DC_ADD_MMXEXT_INIT r2, r1 + 
DC_ADD_MMXEXT_OP movh, r0, r1, r2 RET +; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct8_dc_add_8, 2, 3, 0 + movsx r2, word [r1] + mov word [r1], 0 + mov r1, r2m + DC_ADD_MMXEXT_INIT r2, r1 + DC_ADD_MMXEXT_OP mova, r0, r1, r2 + lea r0, [r0+r1*4] + DC_ADD_MMXEXT_OP mova, r0, r1, r2 + RET +%endif + INIT_MMX mmx ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, ; int16_t *block, int stride, const uint8_t nnzc[6*8]) @@ -343,7 +390,7 @@ cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, add word [r2], 32 IDCT8_ADD_MMX_START r2 , rsp IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3 + IDCT8_ADD_MMX_END r6 , rsp, r3, r2 mov r6d, dword [r1+r5*4] lea r6, [r0+r6+4] IDCT8_ADD_MMX_END r6 , rsp+8, r3 @@ -373,7 +420,8 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride movsx r6, word [r2] test r6, r6 jz .no_dc - DC_ADD_MMXEXT_INIT r2, r3, r6 + mov word [r2], 0 + DC_ADD_MMXEXT_INIT r6, r3 %if ARCH_X86_64 == 0 %define dst2q r1 %define dst2d r1d @@ -450,7 +498,8 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s movsx r6, word [r2] test r6, r6 jz .skipblock - DC_ADD_MMXEXT_INIT r2, r3, r6 + mov word [r2], 0 + DC_ADD_MMXEXT_INIT r6, r3 %if ARCH_X86_64 == 0 %define dst2q r1 %define dst2d r1d @@ -489,7 +538,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride movsx r6, word [r2] test r6, r6 jz .no_dc - DC_ADD_MMXEXT_INIT r2, r3, r6 + mov word [r2], 0 + DC_ADD_MMXEXT_INIT r6, r3 %if ARCH_X86_64 == 0 %define dst2q r1 %define dst2d r1d @@ -515,7 +565,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride add word [r2], 32 IDCT8_ADD_MMX_START r2 , rsp IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3 + IDCT8_ADD_MMX_END r6 , rsp, r3, r2 mov r6d, dword [r1+r5*4] lea r6, [r0+r6+4] IDCT8_ADD_MMX_END r6 , rsp+8, r3 @@ -547,7 +597,8 @@ cglobal 
h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, strid test r6, r6 jz .no_dc INIT_MMX cpuname - DC_ADD_MMXEXT_INIT r2, r3, r6 + mov word [r2], 0 + DC_ADD_MMXEXT_INIT r6, r3 %if ARCH_X86_64 == 0 %define dst2q r1 %define dst2d r1d @@ -650,7 +701,8 @@ h264_idct_add8_mmxext_plane: movsx r6, word [r2] test r6, r6 jz .skipblock - DC_ADD_MMXEXT_INIT r2, r3, r6 + mov word [r2], 0 + DC_ADD_MMXEXT_INIT r6, r3 %if ARCH_X86_64 mov r0d, dword [r1+r5*4] add r0, [dst2q] @@ -693,7 +745,9 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered h264_idct_dc_add8_mmxext: movd m0, [r2 ] ; 0 0 X D + mov word [r2+ 0], 0 punpcklwd m0, [r2+32] ; x X d D + mov word [r2+32], 0 paddsw m0, [pw_32] psraw m0, 6 punpcklwd m0, m0 ; d d D D @@ -723,6 +777,10 @@ h264_add8x4_idct_sse2: paddw m0, [pw_32] IDCT4_1D w,0,1,2,3,4,5 pxor m7, m7 + mova [r2+ 0], m7 + mova [r2+16], m7 + mova [r2+32], m7 + mova [r2+48], m7 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 lea r0, [r0+r3*2] STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index eb375f9430..88fdb843a6 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -66,6 +66,10 @@ SECTION .text paddd m0, [pd_32] IDCT4_1D d,0,1,2,3,4,5 pxor m5, m5 + mova [%2+ 0], m5 + mova [%2+16], m5 + mova [%2+32], m5 + mova [%2+48], m5 STORE_DIFFx2 m0, m1, m4, m5, %1, %3 lea %1, [%1+%3*2] STORE_DIFFx2 m2, m3, m4, m5, %1, %3 @@ -100,6 +104,10 @@ add4x4_idct %+ SUFFIX: paddd m0, [pd_32] IDCT4_1D d,0,1,2,3,4,5 pxor m5, m5 + mova [r2+ 0], m5 + mova [r2+16], m5 + mova [r2+32], m5 + mova [r2+48], m5 STORE_DIFFx2 m0, m1, m4, m5, r5, r3 lea r5, [r5+r3*2] STORE_DIFFx2 m2, m3, m4, m5, r5, r3 @@ -187,6 +195,7 @@ IDCT_ADD16_10 INIT_MMX mmxext cglobal h264_idct_dc_add_10,3,3 movd m0, [r1] + mov dword [r1], 0 paddd m0, [pd_32] psrad m0, 6 lea r1, [r2*3] @@ 
-199,11 +208,11 @@ cglobal h264_idct_dc_add_10,3,3 ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) ;----------------------------------------------------------------------------- %macro IDCT8_DC_ADD 0 -cglobal h264_idct8_dc_add_10,3,3,7 - mov r1d, [r1] - add r1, 32 - sar r1, 6 - movd m0, r1d +cglobal h264_idct8_dc_add_10,3,4,7 + movd m0, [r1] + mov dword[r1], 0 + paddd m0, [pd_32] + psrad m0, 6 lea r1, [r2*3] SPLATW m0, m0, 0 mova m6, [pw_pixel_max] @@ -255,6 +264,8 @@ idct_dc_add %+ SUFFIX: add r5, r0 movq m0, [r2+ 0] movhps m0, [r2+64] + mov dword [r2+ 0], 0 + mov dword [r2+64], 0 paddd m0, [pd_32] psrad m0, 6 pshufhw m0, m0, 0 @@ -473,6 +484,22 @@ h264_idct8_add1_10 %+ SUFFIX: packssdw m8, m0 paddsw m8, [r0] pxor m0, m0 + mova [r1+ 0], m0 + mova [r1+ 16], m0 + mova [r1+ 32], m0 + mova [r1+ 48], m0 + mova [r1+ 64], m0 + mova [r1+ 80], m0 + mova [r1+ 96], m0 + mova [r1+112], m0 + mova [r1+128], m0 + mova [r1+144], m0 + mova [r1+160], m0 + mova [r1+176], m0 + mova [r1+192], m0 + mova [r1+208], m0 + mova [r1+224], m0 + mova [r1+240], m0 CLIPW m8, m0, [pw_pixel_max] mova [r0], m8 mova m8, [pw_pixel_max] @@ -492,6 +519,22 @@ h264_idct8_add1_10 %+ SUFFIX: lea r3, [r0+8] IDCT8_ADD_SSE_END r0, rsp, r2 IDCT8_ADD_SSE_END r3, rsp+16, r2 + mova [r1+ 0], m7 + mova [r1+ 16], m7 + mova [r1+ 32], m7 + mova [r1+ 48], m7 + mova [r1+ 64], m7 + mova [r1+ 80], m7 + mova [r1+ 96], m7 + mova [r1+112], m7 + mova [r1+128], m7 + mova [r1+144], m7 + mova [r1+160], m7 + mova [r1+176], m7 + mova [r1+192], m7 + mova [r1+208], m7 + mova [r1+224], m7 + mova [r1+240], m7 %endif ; ARCH_X86_64 add rsp, pad |