diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2006-04-04 09:23:45 +0000 |
---|---|---|
committer | Loren Merritt <lorenm@u.washington.edu> | 2006-04-04 09:23:45 +0000 |
commit | 703c8195a89d1784209d2167e1e9164d1d550e8f (patch) | |
tree | b3e9cfb63db68ef849cecba992d433d2ab43123e | |
parent | 841f65f25a14cfd7020567ae106e9c7ee60de785 (diff) | |
download | ffmpeg-703c8195a89d1784209d2167e1e9164d1d550e8f.tar.gz |
mmx implementation of 3-point GMC. (5x faster than C)
Originally committed as revision 5265 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- | libavcodec/dsputil.c | 4 | ||||
-rw-r--r-- | libavcodec/dsputil.h | 3 | ||||
-rw-r--r-- | libavcodec/i386/dsputil_mmx.c | 122 |
3 files changed, 127 insertions, 2 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 29ce97a593..ad1bfd482f 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -1144,7 +1144,7 @@ static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y
     }
 }
 
-static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 {
     int y, vx, vy;
@@ -3865,7 +3865,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->add_pixels8 = add_pixels8_c;
     c->add_pixels4 = add_pixels4_c;
     c->gmc1 = gmc1_c;
-    c->gmc = gmc_c;
+    c->gmc = ff_gmc_c;
     c->clear_blocks = clear_blocks_c;
     c->pix_sum = pix_sum_c;
     c->pix_norm1 = pix_norm1_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 9f0f952df5..44e6a9efc7 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -82,6 +82,9 @@ void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, i
 void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 
+void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
+
 /* minimum alignment rules ;)
 if u notice errors in the align stuff, need more alignment for some asm code for some cpu
 or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ...
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 1aa5c850b2..c315e196eb 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2403,6 +2403,126 @@ static void just_return() { return; }
     c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
     c->avg_ ## postfix1 = avg_ ## postfix2;
 
+static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
+    const int w = 8;
+    const int s = 1<<shift;
+    const int ix = ox>>(16+shift);
+    const int iy = oy>>(16+shift);
+    const int oxs = ox>>4;
+    const int oys = oy>>4;
+    const int dxxs = dxx>>4;
+    const int dxys = dxy>>4;
+    const int dyxs = dyx>>4;
+    const int dyys = dyy>>4;
+    const uint16_t r4[4] = {r,r,r,r};
+    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
+    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
+    const uint64_t shift2 = 2*shift;
+    uint8_t edge_buf[(h+1)*stride];
+    int x, y;
+
+    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
+    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
+    const int dxh = dxy*(h-1);
+    const int dyw = dyx*(w-1);
+    if( // non-constant fullpel offset (3% of blocks)
+        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
+         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
+        // uses more than 16 bits of subpel mv (only at huge resolution)
+        || (dxx|dxy|dyx|dyy)&15 )
+    {
+        //FIXME could still use mmx for some of the rows
+        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
+        return;
+    }
+
+    if( (unsigned)ix >= width-w ||
+        (unsigned)iy >= height-h )
+    {
+        ff_emulated_edge_mc(edge_buf, src+ix+iy*stride, stride, w+1, h+1, ix, iy, width, height);
+        src = edge_buf;
+    }
+    else
+        src += ix + iy*stride;
+
+    for(x=0; x<w; x+=4){
+        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
+                            oxs - dxys + dxxs*(x+1),
+                            oxs - dxys + dxxs*(x+2),
+                            oxs - dxys + dxxs*(x+3) };
+        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
+                            oys - dyys + dyxs*(x+1),
+                            oys - dyys + dyxs*(x+2),
+                            oys - dyys + dyxs*(x+3) };
+
+        asm volatile(
+            "movd %0, %%mm6 \n\t"
+            "pxor %%mm7, %%mm7 \n\t"
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            "punpcklwd %%mm6, %%mm6 \n\t"
+            :: "g"(s)
+        );
+
+        for(y=0; y<h; y++){
+            asm volatile(
+                "movq %0, %%mm4 \n\t"
+                "movq %1, %%mm5 \n\t"
+                "paddw %2, %%mm4 \n\t"
+                "paddw %3, %%mm5 \n\t"
+                "movq %%mm4, %0 \n\t"
+                "movq %%mm5, %1 \n\t"
+                "psrlw $12, %%mm4 \n\t"
+                "psrlw $12, %%mm5 \n\t"
+                : "+m"(*dx4), "+m"(*dy4)
+                : "m"(*dxy4), "m"(*dyy4)
+            );
+
+            asm volatile(
+                "movq %%mm6, %%mm2 \n\t"
+                "movq %%mm6, %%mm1 \n\t"
+                "psubw %%mm4, %%mm2 \n\t"
+                "psubw %%mm5, %%mm1 \n\t"
+                "movq %%mm2, %%mm0 \n\t"
+                "movq %%mm4, %%mm3 \n\t"
+                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
+                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
+                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
+                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
+
+                "movd %4, %%mm5 \n\t"
+                "movd %3, %%mm4 \n\t"
+                "punpcklbw %%mm7, %%mm5 \n\t"
+                "punpcklbw %%mm7, %%mm4 \n\t"
+                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
+                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
+
+                "movd %2, %%mm5 \n\t"
+                "movd %1, %%mm4 \n\t"
+                "punpcklbw %%mm7, %%mm5 \n\t"
+                "punpcklbw %%mm7, %%mm4 \n\t"
+                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
+                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
+                "paddw %%mm3, %%mm2 \n\t"
+                "paddw %%mm1, %%mm0 \n\t"
+                "paddw %%mm2, %%mm0 \n\t"
+                "paddw %5, %%mm0 \n\t"
+
+                "psrlw %6, %%mm0 \n\t"
+                "packuswb %%mm0, %%mm0 \n\t"
+                "movd %%mm0, %0 \n\t"
+
+                : "=m"(dst[x+y*stride])
+                : "m"(src[0]), "m"(src[1]),
+                  "m"(src[stride]), "m"(src[stride+1]),
+                  "m"(*r4), "m"(shift2)
+            );
+            src += stride;
+        }
+        src += 4-h*stride;
+    }
+}
+
 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     long i=0;
 
@@ -2725,6 +2845,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
         c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
 
+        c->gmc= gmc_mmx;
+
         c->add_bytes= add_bytes_mmx;
 #ifdef CONFIG_ENCODERS
         c->diff_bytes= diff_bytes_mmx;