author     Luca Barbato <lu_zero@gentoo.org>    2006-04-25 19:54:02 +0000
committer  Luca Barbato <lu_zero@gentoo.org>    2006-04-25 19:54:02 +0000
commit     5f22aa3cc0484e8fb7116f64a8d30a23f22accac (patch)
tree       2ff69147491cba2cc61a1249adff5af5d4eb37a8
parent     f420826c352c2ee84b75a82fefb07540b8cb6d1d (diff)
download   ffmpeg-5f22aa3cc0484e8fb7116f64a8d30a23f22accac.tar.gz
13% faster inner_add_yblock
Originally committed as revision 5316 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--   libavcodec/ppc/dsputil_snow_altivec.c   576
1 file changed, 267 insertions, 309 deletions
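The new LOAD_BLOCKS and LOAD_OBMCS macros introduced below are built around the usual AltiVec idiom for reading 16 bytes from an arbitrarily aligned address: two aligned vec_ld loads that straddle the address, spliced together with the permute mask returned by vec_lvsl. A minimal standalone sketch of that idiom follows (the helper name is illustrative and not part of this commit):

#include <altivec.h>
#include <stdint.h>

/* Read 16 bytes starting at an arbitrarily aligned address p.
 * vec_ld ignores the low four address bits, so the two aligned
 * quadwords covering p..p+15 are loaded and then spliced together
 * with a permute mask derived from the low bits of p. */
static inline vector unsigned char load_unaligned16(const uint8_t *p)
{
    vector unsigned char lo   = vec_ld(0,  p);  /* aligned quadword containing p      */
    vector unsigned char hi   = vec_ld(15, p);  /* aligned quadword containing p + 15 */
    vector unsigned char perm = vec_lvsl(0, p); /* permute mask built from p          */
    return vec_perm(lo, hi, perm);              /* the 16 bytes starting at p         */
}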
diff --git a/libavcodec/ppc/dsputil_snow_altivec.c b/libavcodec/ppc/dsputil_snow_altivec.c
index b2ae8734cd..06157e0cf2 100644
--- a/libavcodec/ppc/dsputil_snow_altivec.c
+++ b/libavcodec/ppc/dsputil_snow_altivec.c
@@ -413,6 +413,96 @@ void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
     }
 }
 
+#define LOAD_BLOCKS \
+        tmp1 = vec_ld(0, &block[3][y*src_stride]);\
+        align = vec_lvsl(0, &block[3][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[3][y*src_stride]);\
+\
+        b3 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, &block[2][y*src_stride]);\
+        align = vec_lvsl(0, &block[2][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[2][y*src_stride]);\
+\
+        b2 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, &block[1][y*src_stride]);\
+        align = vec_lvsl(0, &block[1][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[1][y*src_stride]);\
+\
+        b1 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, &block[0][y*src_stride]);\
+        align = vec_lvsl(0, &block[0][y*src_stride]);\
+        tmp2 = vec_ld(15, &block[0][y*src_stride]);\
+\
+        b0 = vec_perm(tmp1,tmp2,align);
+
+#define LOAD_OBMCS \
+        tmp1 = vec_ld(0, obmc1);\
+        align = vec_lvsl(0, obmc1);\
+        tmp2 = vec_ld(15, obmc1);\
+\
+        ob1 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, obmc2);\
+        align = vec_lvsl(0, obmc2);\
+        tmp2 = vec_ld(15, obmc2);\
+\
+        ob2 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, obmc3);\
+        align = vec_lvsl(0, obmc3);\
+        tmp2 = vec_ld(15, obmc3);\
+\
+        ob3 = vec_perm(tmp1,tmp2,align);\
+\
+        tmp1 = vec_ld(0, obmc4);\
+        align = vec_lvsl(0, obmc4);\
+        tmp2 = vec_ld(15, obmc4);\
+\
+        ob4 = vec_perm(tmp1,tmp2,align);
+
+/* interleave logic
+ * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
+ * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
+ * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
+ */
+
+#define STEPS_0_1\
+        h1 = (vector unsigned short)\
+            vec_mergeh(ob1, ob2);\
+\
+        h2 = (vector unsigned short)\
+            vec_mergeh(ob3, ob4);\
+\
+        ih = (vector unsigned char)\
+            vec_mergeh(h1,h2);\
+\
+        l1 = (vector unsigned short) vec_mergeh(b3, b2);\
+\
+        ih1 = (vector unsigned char) vec_mergel(h1, h2);\
+\
+        l2 = (vector unsigned short) vec_mergeh(b1, b0);\
+\
+        il = (vector unsigned char) vec_mergeh(l1, l2);\
+\
+        v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
+\
+        il1 = (vector unsigned char) vec_mergel(l1, l2);\
+\
+        v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
+
+#define FINAL_STEP_SCALAR\
+        for(x=0; x<b_w; x++)\
+            if(add){\
+                vbuf[x] += dst[x + src_x];\
+                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
+                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
+                dst8[x + y*src_stride] = vbuf[x];\
+            }else{\
+                dst[x + src_x] -= vbuf[x];\
+            }
 
 static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                   const int obmc_stride,
@@ -423,11 +513,13 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
 {
     int y, x;
     DWTELEM * dst;
-//    vector bool int mask;
-//    vector signed int vs;
     vector unsigned short h1, h2, l1, l2;
-    vector unsigned char ih, il, tmp1, tmp2, align;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
     vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
+
+    DECLARE_ALIGNED_16(int, vbuf[16]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
 
     for(y=0; y<b_h; y++){
         //FIXME ugly missue of obmc_stride
@@ -436,168 +528,177 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
-#if 1
-        vector unsigned char ob1;
-        vector unsigned char ob2;
-        vector unsigned char ob3;
-        vector unsigned char ob4;
-
-#endif
-        DECLARE_ALIGNED_16(int, vbuf[16]);
-        vector signed int *v = (vector signed int *)vbuf, *d;
 
         dst = slice_buffer_get_line(sb, src_y + y);
         d = (vector signed int *)(dst + src_x);
-#if 0
-        for(x=0; x<b_w; x++){
-            vbuf[x] = obmc1[x] * block[3][x + y*src_stride]
-                     +obmc2[x] * block[2][x + y*src_stride]
-                     +obmc3[x] * block[1][x + y*src_stride]
-                     +obmc4[x] * block[0][x + y*src_stride];
-        }
-#else
-
-
-// load blocks
-        //FIXME i could avoid some loads!
-        tmp1 = vec_ld(0, &block[3][y*src_stride]);
-        align = vec_lvsl(0, &block[3][y*src_stride]);
-        tmp2 = vec_ld(15, &block[3][y*src_stride]);
+//FIXME i could avoid some loads!
 
-        b3 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, &block[2][y*src_stride]);
-        align = vec_lvsl(0, &block[2][y*src_stride]);
-        tmp2 = vec_ld(15, &block[2][y*src_stride]);
+        // load blocks
+        LOAD_BLOCKS
 
-        b2 = vec_perm(tmp1,tmp2,align);
+        // load obmcs
+        LOAD_OBMCS
 
-        tmp1 = vec_ld(0, &block[1][y*src_stride]);
-        align = vec_lvsl(0, &block[1][y*src_stride]);
-        tmp2 = vec_ld(15, &block[1][y*src_stride]);
+        // steps 0 1
+        STEPS_0_1
 
-        b1 = vec_perm(tmp1,tmp2,align);
+        FINAL_STEP_SCALAR
 
-        tmp1 = vec_ld(0, &block[0][y*src_stride]);
-        align = vec_lvsl(0, &block[0][y*src_stride]);
-        tmp2 = vec_ld(15, &block[0][y*src_stride]);
+    }
 
-        b0 = vec_perm(tmp1,tmp2,align);
+}
 
-        // load obmcs
+#define STEPS_2_3\
+        h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
+\
+        h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
+\
+        ih = (vector unsigned char) vec_mergeh(h1,h2);\
+\
+        l1 = (vector unsigned short) vec_mergel(b3, b2);\
+\
+        l2 = (vector unsigned short) vec_mergel(b1, b0);\
+\
+        ih1 = (vector unsigned char) vec_mergel(h1,h2);\
+\
+        il = (vector unsigned char) vec_mergeh(l1,l2);\
+\
+        v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
+\
+        il1 = (vector unsigned char) vec_mergel(l1,l2);\
+\
+        v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
 
-        tmp1 = vec_ld(0, obmc1);
-        align = vec_lvsl(0, obmc1);
-        tmp2 = vec_ld(15, obmc1);
-
-        ob1 = vec_perm(tmp1,tmp2,align);
+static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
+                                                   const int obmc_stride,
+                                                   uint8_t * * block, int b_w,
+                                                   int b_h, int src_x, int src_y,
+                                                   int src_stride, slice_buffer * sb,
+                                                   int add, uint8_t * dst8)
+{
+    int y, x;
+    DWTELEM * dst;
+    vector unsigned short h1, h2, l1, l2;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
+    vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
+    DECLARE_ALIGNED_16(int, vbuf[b_w]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
 
-        tmp1 = vec_ld(0, obmc2);
-        align = vec_lvsl(0, obmc2);
-        tmp2 = vec_ld(15, obmc2);
+    for(y=0; y<b_h; y++){
+        //FIXME ugly missue of obmc_stride
 
-        ob2 = vec_perm(tmp1,tmp2,align);
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
 
-        tmp1 = vec_ld(0, obmc3);
-        align = vec_lvsl(0, obmc3);
-        tmp2 = vec_ld(15, obmc3);
+        dst = slice_buffer_get_line(sb, src_y + y);
+        d = (vector signed int *)(dst + src_x);
 
-        ob3 = vec_perm(tmp1,tmp2,align);
+        // load blocks
+        LOAD_BLOCKS
 
-        tmp1 = vec_ld(0, obmc4);
-        align = vec_lvsl(0, obmc4);
-        tmp2 = vec_ld(15, obmc4);
+        // load obmcs
+        LOAD_OBMCS
 
-        ob4 = vec_perm(tmp1,tmp2,align);
-        h1 = (vector unsigned short)
-            vec_mergeh(ob1, ob2); /*h1 <- [ a,b,a,b, a,b,a,b,
-                                            a,b,a,b, a,b,a,b ] */
-        h2 = (vector unsigned short)
-            vec_mergeh(ob3, ob4); /*h2 <- [ c,d,c,d, c,d,c,d,
-                                            c,d,c,d, c,d,c,d ] */
+        // steps 0 1 2 3
+        STEPS_0_1
 
-        ih = (vector unsigned char)
-            vec_mergeh(h1,h2); /*ih <- [ a,b,c,d, a,b,c,d,
-                                         a,b,c,d, a,b,c,d ]*/
+        STEPS_2_3
 
-        l1 = (vector unsigned short) vec_mergeh(b3, b2);
+        FINAL_STEP_SCALAR
 
-        l2 = (vector unsigned short) vec_mergeh(b1, b0);
+    }
+}
 
-        il = (vector unsigned char) vec_mergeh(l1, l2);
+#define FINAL_STEP_VEC \
+\
+    if(add)\
+    {\
+        for(x=0; x<b_w/4; x++)\
+        {\
+            v[x] = vec_add(v[x], d[x]);\
+            v[x] = vec_sra(vec_add(v[x],\
+                                   vec_sl( vec_splat_s32(1),\
+                                           vec_splat_u32(7))),\
+                           vec_splat_u32(8));\
+\
+            mask = vec_sl((vector signed int)\
+                          vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
+            mask = vec_and(v[x],vec_nor(mask,mask));\
+\
+            mask = (vector signed int)\
+                   vec_cmpeq((vector signed int)mask,\
+                             (vector signed int)vec_splat_u32(0));\
+\
+            vs = vec_sra(v[x],vec_splat_u32(8));\
+            vs = vec_sra(v[x],vec_splat_u32(8));\
+            vs = vec_sra(v[x],vec_splat_u32(15));\
+\
+            vs = vec_nor(vs,vs);\
+\
+            v[x]= vec_sel(v[x],vs,mask);\
+        }\
+\
+        for(x=0; x<b_w; x++)\
+            dst8[x + y*src_stride] = vbuf[x];\
+\
+    }\
+    else\
+        for(x=0; x<b_w/4; x++)\
+            d[x] = vec_sub(d[x], v[x]);
 
-        v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-//step1
+static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
+                                                    const int obmc_stride,
+                                                    uint8_t * * block, int b_w,
+                                                    int b_h, int src_x, int src_y,
+                                                    int src_stride, slice_buffer * sb,
+                                                    int add, uint8_t * dst8)
+{
+    int y, x;
+    DWTELEM * dst;
+    vector bool int mask;
+    vector signed int vs;
+    vector unsigned short h1, h2, l1, l2;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
+    vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
 
-        h1 = (vector unsigned short) vec_mergeh(ob1, ob2);
+    DECLARE_ALIGNED_16(int, vbuf[16]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
 
-        h2 = (vector unsigned short) vec_mergeh(ob3, ob4);
+    for(y=0; y<b_h; y++){
+        //FIXME ugly missue of obmc_stride
 
-        ih = (vector unsigned char) vec_mergel(h1, h2);
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
 
-        l1 = (vector unsigned short) vec_mergeh(b3, b2);
+        dst = slice_buffer_get_line(sb, src_y + y);
+        d = (vector signed int *)(dst + src_x);
 
-        l2 = (vector unsigned short) vec_mergeh(b1, b0);
+//FIXME i could avoid some loads!
 
-        il = (vector unsigned char) vec_mergel(l1, l2);
+        // load blocks
+        LOAD_BLOCKS
 
-        v[1] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
+        // load obmcs
+        LOAD_OBMCS
 
+        // steps 0 1
+        STEPS_0_1
 
-#endif
+        FINAL_STEP_VEC
 
-#if 1
-        for(x=0; x<b_w; x++)
-            if(add){
-                vbuf[x] += dst[x + src_x];
-                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
-                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
-                dst8[x + y*src_stride] = vbuf[x];
-            }else{
-                dst[x + src_x] -= vbuf[x];
-            }
-#else
-        if(add)
-        {
-            for(x=0; x<b_w/4; x++)
-            {
-                v[x] = vec_add(v[x], d[x]);
-                v[x] = vec_sra(vec_add(v[x],
-                                       vec_sl( vec_splat_s32(1),
-                                               vec_splat_u32(7))),
-                               vec_splat_u32(8));
-
-                mask = (vector bool int)
-                       vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),
-                              vec_splat_u32(8));
-                mask = (vector bool int)
-                       vec_and(v[x],vec_nor(mask,mask));
-
-                mask = (vector bool int)
-                       vec_cmpeq((vector signed int)mask, vec_splat_s32(0));
-
-                vs = vec_sra(v[x],vec_splat_u32(8));
-                vs = vec_sra(v[x],vec_splat_u32(8));
-                vs = vec_sra(v[x],vec_splat_u32(15));
-
-                vs = vec_nor(vs,vs);
-
-                v[x]= vec_sel(v[x],vs,mask);
-            }
-            for(x=0; x<b_w; x++)
-                dst8[x + y*src_stride] = vbuf[x];
-        }
-        else
-            for(x=0; x<b_w/4; x++)
-                d[x] = vec_sub(d[x], v[x]);
-#endif
     }
-
 }
 
-static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
+static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                    const int obmc_stride,
                                                    uint8_t * * block, int b_w,
                                                    int b_h, int src_x, int src_y,
@@ -606,9 +707,14 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
 {
     int y, x;
     DWTELEM * dst;
+    vector bool int mask;
+    vector signed int vs;
     vector unsigned short h1, h2, l1, l2;
-    vector unsigned char ih, il, tmp1, tmp2, align;
+    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
+    vector unsigned char ob1,ob2,ob3,ob4;
+    DECLARE_ALIGNED_16(int, vbuf[b_w]);
+    vector signed int *v = (vector signed int *)vbuf, *d;
 
     for(y=0; y<b_h; y++){
         //FIXME ugly missue of obmc_stride
@@ -618,183 +724,23 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
 
-        vector unsigned char ob1;
-        vector unsigned char ob2;
-        vector unsigned char ob3;
-        vector unsigned char ob4;
-
-        DECLARE_ALIGNED_16(int, vbuf[b_w]);
-        vector signed int *v = (vector signed int *)vbuf, *d;
-
         dst = slice_buffer_get_line(sb, src_y + y);
         d = (vector signed int *)(dst + src_x);
 
         // load blocks
+        LOAD_BLOCKS
 
-        tmp1 = vec_ld(0, &block[3][y*src_stride]);
-        align = vec_lvsl(0, &block[3][y*src_stride]);
-        tmp2 = vec_ld(15, &block[3][y*src_stride]);
-
-        b3 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, &block[2][y*src_stride]);
-        align = vec_lvsl(0, &block[2][y*src_stride]);
-        tmp2 = vec_ld(15, &block[2][y*src_stride]);
-
-        b2 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, &block[1][y*src_stride]);
-        align = vec_lvsl(0, &block[1][y*src_stride]);
-        tmp2 = vec_ld(15, &block[1][y*src_stride]);
-
-        b1 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, &block[0][y*src_stride]);
-        align = vec_lvsl(0, &block[0][y*src_stride]);
-        tmp2 = vec_ld(15, &block[0][y*src_stride]);
-
-        b0 = vec_perm(tmp1,tmp2,align);
-
-        // load obmcs
-
-        tmp1 = vec_ld(0, obmc1);
-        align = vec_lvsl(0, obmc1);
-        tmp2 = vec_ld(15, obmc1);
-
-        ob1 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, obmc2);
-        align = vec_lvsl(0, obmc2);
-        tmp2 = vec_ld(15, obmc2);
-
-        ob2 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, obmc3);
-        align = vec_lvsl(0, obmc3);
-        tmp2 = vec_ld(15, obmc3);
-
-        ob3 = vec_perm(tmp1,tmp2,align);
-
-        tmp1 = vec_ld(0, obmc4);
-        align = vec_lvsl(0, obmc4);
-        tmp2 = vec_ld(15, obmc4);
-
-        ob4 = vec_perm(tmp1,tmp2,align);
-
-//step0
-        h1 = (vector unsigned short)
-            vec_mergeh(ob1, ob2); /*h1 <- [ a,b,a,b,
-                                            a,b,a,b,
-                                            a,b,a,b,
-                                            a,b,a,b ] */
-        h2 = (vector unsigned short)
-            vec_mergeh(ob3, ob4); /*h2 <- [ c,d,c,d,
-                                            c,d,c,d,
-                                            c,d,c,d,
-                                            c,d,c,d ] */
-
-        ih = (vector unsigned char)
-            vec_mergeh(h1,h2); /*ih <- [ a,b,c,d,
-                                         a,b,c,d,
-                                         a,b,c,d,
-                                         a,b,c,d ]*/
-
-        l1 = (vector unsigned short) vec_mergeh(b3, b2);
-
-        l2 = (vector unsigned short) vec_mergeh(b1, b0);
-
-        il = (vector unsigned char) vec_mergeh(l1,l2);
-
-        v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-//step1
-
-        h1 = (vector unsigned short) vec_mergeh(ob1, ob2);
+        // load obmcs
+        LOAD_OBMCS
 
-        h2 = (vector unsigned short) vec_mergeh(ob3, ob4);
+        // steps 0 1 2 3
+        STEPS_0_1
 
-        ih = (vector unsigned char) vec_mergel(h1,h2);
+        STEPS_2_3
 
-        l1 = (vector unsigned short) vec_mergeh(b3, b2);
+        FINAL_STEP_VEC
 
-        l2 = (vector unsigned short) vec_mergeh(b1, b0);
-
-        il = (vector unsigned char) vec_mergel(l1,l2);
-
-        v[1] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-
-//step2
-        h1 = (vector unsigned short) vec_mergel(ob1, ob2);
-
-        h2 = (vector unsigned short) vec_mergel(ob3, ob4);
-
-        ih = (vector unsigned char) vec_mergeh(h1,h2);
-
-        l1 = (vector unsigned short) vec_mergel(b3, b2);
-
-        l2 = (vector unsigned short) vec_mergel(b1, b0);
-
-        il = (vector unsigned char) vec_mergeh(l1,l2);
-
-        v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-
-//step3
-        h1 = (vector unsigned short) vec_mergel(ob1, ob2);
-
-        h2 = (vector unsigned short) vec_mergel(ob3, ob4);
-
-        ih = (vector unsigned char) vec_mergel(h1,h2);
-
-        l1 = (vector unsigned short) vec_mergel(b3, b2);
-
-        l2 = (vector unsigned short) vec_mergel(b1, b0);
-
-        il = (vector unsigned char) vec_mergel(l1,l2);
-
-        v[3] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-#if 1
-        for(x=0; x<b_w; x++)
-            if(add){
-                vbuf[x] += dst[x + src_x];
-                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
-                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
-                dst8[x + y*src_stride] = vbuf[x];
-            }else{
-                dst[x + src_x] -= vbuf[x];
-            }
-#else
-        if(add)
-        {
-            for(x=0; x<b_w/4; x++)
-            {
-                v[x] = vec_add(v[x], d[x]);
-                v[x] = vec_sra(vec_add(v[x],
-                                       vec_sl( vec_splat_s32(1),
-                                               vec_splat_u32(7))),
-                               vec_splat_u32(8));
-
-                mask = vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),vec_splat_u32(8));
-                mask = vec_and(v[x],vec_nor(mask,mask));
-
-                mask = (vector signed int) vec_cmpeq((vector signed int)mask,(vector signed int)vec_splat_u32(0));
-
-                vs = vec_sra(v[x],vec_splat_u32(8));
-                vs = vec_sra(v[x],vec_splat_u32(8));
-                vs = vec_sra(v[x],vec_splat_u32(15));
-
-                vs = vec_nor(vs,vs);
-
-                v[x]= vec_sel(v[x],vs,mask);
-            }
-
-            for(x=0; x<b_w; x++)
-                dst8[x + y*src_stride] = vbuf[x];
-
-        }
-        else
-            for(x=0; x<b_w/4; x++)
-                d[x] = vec_sub(d[x], v[x]);
-#endif
-    }
+    }
 }
 
@@ -804,17 +750,29 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                       slice_buffer * sb, int add, uint8_t * dst8)
 {
-//FIXME implement src_x&15 cases later
-    if (b_w == 16)
-        inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, b_w,
-                                               b_h, src_x, src_y, src_stride,
-                                               sb, add, dst8);
-    else if (b_w == 8)
-        inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
-                                              b_w, b_h, src_x, src_y,
-                                              src_stride, sb, add, dst8);
-    else
-
-        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
-                                 src_y, src_stride, sb, add, dst8);
+    if (src_x&15) {
+        if (b_w == 16)
+            inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
+                                                   b_w, b_h, src_x, src_y,
+                                                   src_stride, sb, add, dst8);
+        else if (b_w == 8)
+            inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
+                                                  b_w, b_h, src_x, src_y,
+                                                  src_stride, sb, add, dst8);
+        else
+            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
+                                     src_y, src_stride, sb, add, dst8);
+    } else {
+        if (b_w == 16)
+            inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
+                                                     b_w, b_h, src_x, src_y,
+                                                     src_stride, sb, add, dst8);
+        else if (b_w == 8)
+            inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
+                                                    b_w, b_h, src_x, src_y,
+                                                    src_stride, sb, add, dst8);
+        else
+            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
+                                     src_y, src_stride, sb, add, dst8);
+    }
 }
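For reference, the scalar computation that the STEPS_0_1 and STEPS_2_3 macros vectorize is the one visible in the "#if 0" fallback removed above: every output sample is the sum of four 8-bit block samples weighted by four 8-bit OBMC window samples. Restated as plain C (this mirrors the removed reference loop; once the weights and samples are interleaved into a,b,c,d order, each vec_msum call produces four such 32-bit sums at a time):

/* Scalar form of one output row, as in the "#if 0" reference code removed by this patch. */
for (x = 0; x < b_w; x++)
    vbuf[x] = obmc1[x] * block[3][x + y*src_stride]
            + obmc2[x] * block[2][x + y*src_stride]
            + obmc3[x] * block[1][x + y*src_stride]
            + obmc4[x] * block[0][x + y*src_stride];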