diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-03-19 22:46:28 +0100 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-04-10 10:06:48 -0700 |
commit | 272b252c0110225188c7d7f31167941210aac197 (patch) | |
tree | 47bea5996c88057a418e8872a655bac8f261736e | |
parent | d3c59d5003a483f1a23e225fc71c19bd1116d11c (diff) | |
download | ffmpeg-272b252c0110225188c7d7f31167941210aac197.tar.gz |
rv40dsp: implement prescaled versions for biweight.
Quite often, the original weights are multiples of 512. By prescaling them
by 1/512 when they are computed (once per frame), no intermediate shifting
is needed in the biweight functions, and no prescaling on each call either.
The x86 code already used that trick.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
-rw-r--r-- | libavcodec/arm/rv40dsp_init_neon.c | 4 | ||||
-rw-r--r-- | libavcodec/rv34.c | 58 | ||||
-rw-r--r-- | libavcodec/rv34.h | 2 | ||||
-rw-r--r-- | libavcodec/rv34dsp.h | 7 | ||||
-rw-r--r-- | libavcodec/rv40dsp.c | 20 | ||||
-rw-r--r-- | libavcodec/x86/rv40dsp.asm | 70 | ||||
-rw-r--r-- | libavcodec/x86/rv40dsp_init.c | 30 |
7 files changed, 112 insertions, 79 deletions
diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c index 650ef61878..2ce50a2073 100644 --- a/libavcodec/arm/rv40dsp_init_neon.c +++ b/libavcodec/arm/rv40dsp_init_neon.c @@ -128,8 +128,8 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon; - c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon; - c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon; + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon; c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon; c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon; diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c index 3ad1717d13..12475692c6 100644 --- a/libavcodec/rv34.c +++ b/libavcodec/rv34.c @@ -521,7 +521,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int */ static int calc_add_mv(RV34DecContext *r, int dir, int val) { - int mul = dir ? -r->weight2 : r->weight1; + int mul = dir ? 
-r->mv_weight2 : r->mv_weight1; return (val * mul + 0x2000) >> 14; } @@ -776,24 +776,24 @@ static void rv34_mc_1mv(RV34DecContext *r, const int block_type, static void rv4_weight(RV34DecContext *r) { - r->rdsp.rv40_weight_pixels_tab[0](r->s.dest[0], - r->tmp_b_block_y[0], - r->tmp_b_block_y[1], - r->weight1, - r->weight2, - r->s.linesize); - r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[1], - r->tmp_b_block_uv[0], - r->tmp_b_block_uv[2], - r->weight1, - r->weight2, - r->s.uvlinesize); - r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[2], - r->tmp_b_block_uv[1], - r->tmp_b_block_uv[3], - r->weight1, - r->weight2, - r->s.uvlinesize); + r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][0](r->s.dest[0], + r->tmp_b_block_y[0], + r->tmp_b_block_y[1], + r->weight1, + r->weight2, + r->s.linesize); + r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[1], + r->tmp_b_block_uv[0], + r->tmp_b_block_uv[2], + r->weight1, + r->weight2, + r->s.uvlinesize); + r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[2], + r->tmp_b_block_uv[1], + r->tmp_b_block_uv[3], + r->weight1, + r->weight2, + r->s.uvlinesize); } static void rv34_mc_2mv(RV34DecContext *r, const int block_type) @@ -1703,11 +1703,21 @@ int ff_rv34_decode_frame(AVCodecContext *avctx, int dist0 = GET_PTS_DIFF(r->cur_pts, r->last_pts); int dist1 = GET_PTS_DIFF(r->next_pts, r->cur_pts); - if (!refdist) { - r->weight1 = r->weight2 = 8192; - } else { - r->weight1 = (dist0 << 14) / refdist; - r->weight2 = (dist1 << 14) / refdist; + if(!refdist){ + r->mv_weight1 = r->mv_weight2 = r->weight1 = r->weight2 = 8192; + r->scaled_weight = 0; + }else{ + r->mv_weight1 = (dist0 << 14) / refdist; + r->mv_weight2 = (dist1 << 14) / refdist; + if((r->mv_weight1|r->mv_weight2) & 511){ + r->weight1 = r->mv_weight1; + r->weight2 = r->mv_weight2; + r->scaled_weight = 0; + }else{ + r->weight1 = r->mv_weight1 >> 9; + r->weight2 = r->mv_weight2 >> 9; + r->scaled_weight = 1; + } } } s->mb_x = s->mb_y = 0; diff --git a/libavcodec/rv34.h 
b/libavcodec/rv34.h index 76232145c5..e7a59c4bed 100644 --- a/libavcodec/rv34.h +++ b/libavcodec/rv34.h @@ -106,7 +106,9 @@ typedef struct RV34DecContext{ int rpr; ///< one field size in RV30 slice header int cur_pts, last_pts, next_pts; + int scaled_weight; int weight1, weight2; ///< B frame distance fractions (0.14) used in motion compensation + int mv_weight1, mv_weight2; uint16_t *cbp_luma; ///< CBP values for luma subblocks uint8_t *cbp_chroma; ///< CBP values for chroma subblocks diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h index c70194cc20..58da59f038 100644 --- a/libavcodec/rv34dsp.h +++ b/libavcodec/rv34dsp.h @@ -58,7 +58,12 @@ typedef struct RV34DSPContext { qpel_mc_func avg_pixels_tab[4][16]; h264_chroma_mc_func put_chroma_pixels_tab[3]; h264_chroma_mc_func avg_chroma_pixels_tab[3]; - rv40_weight_func rv40_weight_pixels_tab[2]; + /** + * Biweight functions, first dimension is transform size (16/8), + * second is whether the weight is prescaled by 1/512 to skip + * the intermediate shifting. 
+ */ + rv40_weight_func rv40_weight_pixels_tab[2][2]; rv34_inv_transform_func rv34_inv_transform; rv34_inv_transform_func rv34_inv_transform_dc; rv34_idct_add_func rv34_idct_add; diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c index c12958a89c..19a18d37a5 100644 --- a/libavcodec/rv40dsp.c +++ b/libavcodec/rv40dsp.c @@ -278,7 +278,7 @@ RV40_CHROMA_MC(put_, op_put) RV40_CHROMA_MC(avg_, op_avg) #define RV40_WEIGHT_FUNC(size) \ -static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\ +static void rv40_weight_func_rnd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\ {\ int i, j;\ \ @@ -289,6 +289,18 @@ static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src src2 += stride;\ dst += stride;\ }\ +}\ +static void rv40_weight_func_nornd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\ +{\ + int i, j;\ +\ + for (j = 0; j < size; j++) {\ + for (i = 0; i < size; i++)\ + dst[i] = (w2 * src1[i] + w1 * src2[i] + 0x10) >> 5;\ + src1 += stride;\ + src2 += stride;\ + dst += stride;\ + }\ } RV40_WEIGHT_FUNC(16) @@ -578,8 +590,10 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) { c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c; c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c; - c->rv40_weight_pixels_tab[0] = rv40_weight_func_16; - c->rv40_weight_pixels_tab[1] = rv40_weight_func_8; + c->rv40_weight_pixels_tab[0][0] = rv40_weight_func_rnd_16; + c->rv40_weight_pixels_tab[0][1] = rv40_weight_func_rnd_8; + c->rv40_weight_pixels_tab[1][0] = rv40_weight_func_nornd_16; + c->rv40_weight_pixels_tab[1][1] = rv40_weight_func_nornd_8; c->rv40_weak_loop_filter[0] = rv40_h_weak_loop_filter; c->rv40_weak_loop_filter[1] = rv40_v_weak_loop_filter; diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm index bff3e7b96a..9028e74024 100644 --- a/libavcodec/x86/rv40dsp.asm +++ 
b/libavcodec/x86/rv40dsp.asm @@ -139,69 +139,61 @@ SECTION .text ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) ; %1=size %2=num of xmm regs -%macro RV40_WEIGHT 2 -cglobal rv40_weight_func_%1, 6, 7, %2 +; The weights are FP0.14 notation of fractions depending on pts. +; For timebases without rounding error (i.e. PAL), the fractions +; can be simplified, and several operations can be avoided. +; Therefore, we check here whether they are multiples of 2^9 for +; those simplifications to occur. +%macro RV40_WEIGHT 3 +cglobal rv40_weight_func_%1_%2, 6, 7, %3 %if cpuflag(ssse3) mova m1, [shift_round] %else mova m1, [pw_16] %endif pxor m0, m0 - mov r6, r3 - or r6, r4 - ; The weights are FP0.14 notation of fractions depending on pts. - ; For timebases without rounding error (i.e. PAL), the fractions - ; can be simplified, and several operations can be avoided. - ; Therefore, we check here whether they are multiples of 2^9 for - ; those simplifications to occur. 
- and r6, 0x1FF ; Set loop counter and increments %if mmsize == 8 - mov r6, %1 + mov r6, %2 %else - mov r6, (%1 * %1) / mmsize + mov r6, (%2 * %2) / mmsize %endif - ; Use result of test now - jz .loop_512 movd m2, r3 movd m3, r4 +%ifidn %1,rnd +%define RND 0 SPLATW m2, m2 - SPLATW m3, m3 - -.loop: - MAIN_LOOP %1, 0 - jnz .loop - REP_RET - - ; Weights are multiple of 512, which allows some shortcuts -.loop_512: - sar r3, 9 - sar r4, 9 - movd m2, r3 - movd m3, r4 +%else +%define RND 1 %if cpuflag(ssse3) punpcklbw m3, m2 - SPLATW m3, m3 %else SPLATW m2, m2 - SPLATW m3, m3 %endif -.loop2: - MAIN_LOOP %1, 1 - jnz .loop2 - REP_RET +%endif + SPLATW m3, m3 +.loop: + MAIN_LOOP %2, RND + jnz .loop + REP_RET %endmacro INIT_MMX mmx -RV40_WEIGHT 8, 0 -RV40_WEIGHT 16, 0 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 INIT_XMM sse2 -RV40_WEIGHT 8, 8 -RV40_WEIGHT 16, 8 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 INIT_XMM ssse3 -RV40_WEIGHT 8, 8 -RV40_WEIGHT 16, 8 +RV40_WEIGHT rnd, 8, 3 +RV40_WEIGHT rnd, 16, 4 +RV40_WEIGHT nornd, 8, 3 +RV40_WEIGHT nornd, 16, 4 diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c index 79c70f78c3..df468aa9e5 100644 --- a/libavcodec/x86/rv40dsp_init.c +++ b/libavcodec/x86/rv40dsp_init.c @@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); #define DECLARE_WEIGHT(opt) \ -void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); \ -void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); +void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); \ +void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); \ +void 
ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); \ +void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ + int w1, int w2, ptrdiff_t stride); DECLARE_WEIGHT(mmx) DECLARE_WEIGHT(sse2) DECLARE_WEIGHT(ssse3) @@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) if (mm_flags & AV_CPU_FLAG_MMX) { c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; - c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx; - c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx; + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx; + c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx; + c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx; } if (mm_flags & AV_CPU_FLAG_MMX2) { c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2; @@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; } if (mm_flags & AV_CPU_FLAG_SSE2) { - c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2; - c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2; + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; + c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; + c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; } if (mm_flags & AV_CPU_FLAG_SSSE3) { - c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3; - c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3; + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; + c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; + 
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; } #endif } |