author    Luca Barbato <lu_zero@gentoo.org>  2006-04-25 19:54:02 +0000
committer Luca Barbato <lu_zero@gentoo.org>  2006-04-25 19:54:02 +0000
commit    5f22aa3cc0484e8fb7116f64a8d30a23f22accac (patch)
tree      2ff69147491cba2cc61a1249adff5af5d4eb37a8
parent    f420826c352c2ee84b75a82fefb07540b8cb6d1d (diff)
download  ffmpeg-5f22aa3cc0484e8fb7116f64a8d30a23f22accac.tar.gz
13% faster inner_add_yblock
Originally committed as revision 5316 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--  libavcodec/ppc/dsputil_snow_altivec.c | 576
1 file changed, 267 insertions(+), 309 deletions(-)
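
For reference, every AltiVec routine in the diff below computes the same per-pixel result that the old scalar fallback (the removed "#if 0" block) spells out: each output sample is a sum of the four source blocks weighted by the four quadrants of the OBMC window, followed either by the add/round/clamp step of FINAL_STEP_SCALAR or by a subtraction from the slice buffer. A minimal scalar sketch of that inner sum, derived directly from the removed code, is given here; the helper name, parameter list, and standalone layout are illustrative only and are not part of the patch.

#include <stdint.h>

/* Hedged sketch: scalar equivalent of one row handled by the
 * LOAD_BLOCKS / LOAD_OBMCS / STEPS_* macros below, taken from the
 * scalar fallback removed in this commit.  Hypothetical helper name. */
static void obmc_weighted_row(const uint8_t *obmc1, const uint8_t *obmc2,
                              const uint8_t *obmc3, const uint8_t *obmc4,
                              uint8_t *const *block, int y, int src_stride,
                              int b_w, int *vbuf)
{
    int x;
    for (x = 0; x < b_w; x++)
        vbuf[x] = obmc1[x] * block[3][x + y*src_stride]   /* weight 1 × block 3 */
                + obmc2[x] * block[2][x + y*src_stride]   /* weight 2 × block 2 */
                + obmc3[x] * block[1][x + y*src_stride]   /* weight 3 × block 1 */
                + obmc4[x] * block[0][x + y*src_stride];  /* weight 4 × block 0 */
}

The AltiVec versions produce the same vbuf[] values four at a time, using vec_mergeh/vec_mergel to interleave the weights and samples and vec_msum to form the dot products.
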
diff --git a/libavcodec/ppc/dsputil_snow_altivec.c b/libavcodec/ppc/dsputil_snow_altivec.c
index b2ae8734cd..06157e0cf2 100644
--- a/libavcodec/ppc/dsputil_snow_altivec.c
+++ b/libavcodec/ppc/dsputil_snow_altivec.c
@@ -413,6 +413,96 @@ void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
}
}
+#define LOAD_BLOCKS \
+ tmp1 = vec_ld(0, &block[3][y*src_stride]);\
+ align = vec_lvsl(0, &block[3][y*src_stride]);\
+ tmp2 = vec_ld(15, &block[3][y*src_stride]);\
+\
+ b3 = vec_perm(tmp1,tmp2,align);\
+\
+ tmp1 = vec_ld(0, &block[2][y*src_stride]);\
+ align = vec_lvsl(0, &block[2][y*src_stride]);\
+ tmp2 = vec_ld(15, &block[2][y*src_stride]);\
+\
+ b2 = vec_perm(tmp1,tmp2,align);\
+\
+ tmp1 = vec_ld(0, &block[1][y*src_stride]);\
+ align = vec_lvsl(0, &block[1][y*src_stride]);\
+ tmp2 = vec_ld(15, &block[1][y*src_stride]);\
+\
+ b1 = vec_perm(tmp1,tmp2,align);\
+\
+ tmp1 = vec_ld(0, &block[0][y*src_stride]);\
+ align = vec_lvsl(0, &block[0][y*src_stride]);\
+ tmp2 = vec_ld(15, &block[0][y*src_stride]);\
+\
+ b0 = vec_perm(tmp1,tmp2,align);
+
+#define LOAD_OBMCS \
+ tmp1 = vec_ld(0, obmc1);\
+ align = vec_lvsl(0, obmc1);\
+ tmp2 = vec_ld(15, obmc1);\
+\
+ ob1 = vec_perm(tmp1,tmp2,align);\
+\
+ tmp1 = vec_ld(0, obmc2);\
+ align = vec_lvsl(0, obmc2);\
+ tmp2 = vec_ld(15, obmc2);\
+\
+ ob2 = vec_perm(tmp1,tmp2,align);\
+\
+ tmp1 = vec_ld(0, obmc3);\
+ align = vec_lvsl(0, obmc3);\
+ tmp2 = vec_ld(15, obmc3);\
+\
+ ob3 = vec_perm(tmp1,tmp2,align);\
+\
+ tmp1 = vec_ld(0, obmc4);\
+ align = vec_lvsl(0, obmc4);\
+ tmp2 = vec_ld(15, obmc4);\
+\
+ ob4 = vec_perm(tmp1,tmp2,align);
+
+/* interleave logic
+ * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
+ * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
+ * h <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
+ */
+
+#define STEPS_0_1\
+ h1 = (vector unsigned short)\
+ vec_mergeh(ob1, ob2);\
+\
+ h2 = (vector unsigned short)\
+ vec_mergeh(ob3, ob4);\
+\
+ ih = (vector unsigned char)\
+ vec_mergeh(h1,h2);\
+\
+ l1 = (vector unsigned short) vec_mergeh(b3, b2);\
+\
+ ih1 = (vector unsigned char) vec_mergel(h1, h2);\
+\
+ l2 = (vector unsigned short) vec_mergeh(b1, b0);\
+\
+ il = (vector unsigned char) vec_mergeh(l1, l2);\
+\
+ v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
+\
+ il1 = (vector unsigned char) vec_mergel(l1, l2);\
+\
+ v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
+
+#define FINAL_STEP_SCALAR\
+ for(x=0; x<b_w; x++)\
+ if(add){\
+ vbuf[x] += dst[x + src_x];\
+ vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
+ if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
+ dst8[x + y*src_stride] = vbuf[x];\
+ }else{\
+ dst[x + src_x] -= vbuf[x];\
+ }
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
const int obmc_stride,
@@ -423,11 +513,13 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
{
int y, x;
DWTELEM * dst;
-// vector bool int mask;
-// vector signed int vs;
vector unsigned short h1, h2, l1, l2;
- vector unsigned char ih, il, tmp1, tmp2, align;
+ vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
vector unsigned char b0,b1,b2,b3;
+ vector unsigned char ob1,ob2,ob3,ob4;
+
+ DECLARE_ALIGNED_16(int, vbuf[16]);
+ vector signed int *v = (vector signed int *)vbuf, *d;
for(y=0; y<b_h; y++){
//FIXME ugly missue of obmc_stride
@@ -436,168 +528,177 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
-#if 1
- vector unsigned char ob1;
- vector unsigned char ob2;
- vector unsigned char ob3;
- vector unsigned char ob4;
-
-#endif
- DECLARE_ALIGNED_16(int, vbuf[16]);
- vector signed int *v = (vector signed int *)vbuf, *d;
dst = slice_buffer_get_line(sb, src_y + y);
d = (vector signed int *)(dst + src_x);
-#if 0
- for(x=0; x<b_w; x++){
- vbuf[x] = obmc1[x] * block[3][x + y*src_stride]
- +obmc2[x] * block[2][x + y*src_stride]
- +obmc3[x] * block[1][x + y*src_stride]
- +obmc4[x] * block[0][x + y*src_stride];
- }
-#else
-
-
-// load blocks
- //FIXME i could avoid some loads!
- tmp1 = vec_ld(0, &block[3][y*src_stride]);
- align = vec_lvsl(0, &block[3][y*src_stride]);
- tmp2 = vec_ld(15, &block[3][y*src_stride]);
+//FIXME i could avoid some loads!
- b3 = vec_perm(tmp1,tmp2,align);
-
- tmp1 = vec_ld(0, &block[2][y*src_stride]);
- align = vec_lvsl(0, &block[2][y*src_stride]);
- tmp2 = vec_ld(15, &block[2][y*src_stride]);
+ // load blocks
+ LOAD_BLOCKS
- b2 = vec_perm(tmp1,tmp2,align);
+ // load obmcs
+ LOAD_OBMCS
- tmp1 = vec_ld(0, &block[1][y*src_stride]);
- align = vec_lvsl(0, &block[1][y*src_stride]);
- tmp2 = vec_ld(15, &block[1][y*src_stride]);
+ // steps 0 1
+ STEPS_0_1
- b1 = vec_perm(tmp1,tmp2,align);
+ FINAL_STEP_SCALAR
- tmp1 = vec_ld(0, &block[0][y*src_stride]);
- align = vec_lvsl(0, &block[0][y*src_stride]);
- tmp2 = vec_ld(15, &block[0][y*src_stride]);
+ }
- b0 = vec_perm(tmp1,tmp2,align);
+}
- // load obmcs
+#define STEPS_2_3\
+ h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
+\
+ h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
+\
+ ih = (vector unsigned char) vec_mergeh(h1,h2);\
+\
+ l1 = (vector unsigned short) vec_mergel(b3, b2);\
+\
+ l2 = (vector unsigned short) vec_mergel(b1, b0);\
+\
+ ih1 = (vector unsigned char) vec_mergel(h1,h2);\
+\
+ il = (vector unsigned char) vec_mergeh(l1,l2);\
+\
+ v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
+\
+ il1 = (vector unsigned char) vec_mergel(l1,l2);\
+\
+ v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
- tmp1 = vec_ld(0, obmc1);
- align = vec_lvsl(0, obmc1);
- tmp2 = vec_ld(15, obmc1);
- ob1 = vec_perm(tmp1,tmp2,align);
+static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
+ const int obmc_stride,
+ uint8_t * * block, int b_w,
+ int b_h, int src_x, int src_y,
+ int src_stride, slice_buffer * sb,
+ int add, uint8_t * dst8)
+{
+ int y, x;
+ DWTELEM * dst;
+ vector unsigned short h1, h2, l1, l2;
+ vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
+ vector unsigned char b0,b1,b2,b3;
+ vector unsigned char ob1,ob2,ob3,ob4;
+ DECLARE_ALIGNED_16(int, vbuf[b_w]);
+ vector signed int *v = (vector signed int *)vbuf, *d;
- tmp1 = vec_ld(0, obmc2);
- align = vec_lvsl(0, obmc2);
- tmp2 = vec_ld(15, obmc2);
+ for(y=0; y<b_h; y++){
+ //FIXME ugly missue of obmc_stride
- ob2 = vec_perm(tmp1,tmp2,align);
+ uint8_t *obmc1= obmc + y*obmc_stride;
+ uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+ uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+ uint8_t *obmc4= obmc3+ (obmc_stride>>1);
- tmp1 = vec_ld(0, obmc3);
- align = vec_lvsl(0, obmc3);
- tmp2 = vec_ld(15, obmc3);
+ dst = slice_buffer_get_line(sb, src_y + y);
+ d = (vector signed int *)(dst + src_x);
- ob3 = vec_perm(tmp1,tmp2,align);
+ // load blocks
+ LOAD_BLOCKS
- tmp1 = vec_ld(0, obmc4);
- align = vec_lvsl(0, obmc4);
- tmp2 = vec_ld(15, obmc4);
+ // load obmcs
+ LOAD_OBMCS
- ob4 = vec_perm(tmp1,tmp2,align);
- h1 = (vector unsigned short)
- vec_mergeh(ob1, ob2); /*h1 <- [ a,b,a,b, a,b,a,b,
- a,b,a,b, a,b,a,b ] */
- h2 = (vector unsigned short)
- vec_mergeh(ob3, ob4); /*h2 <- [ c,d,c,d, c,d,c,d,
- c,d,c,d, c,d,c,d ] */
+ // steps 0 1 2 3
+ STEPS_0_1
- ih = (vector unsigned char)
- vec_mergeh(h1,h2); /*ih <- [ a,b,c,d, a,b,c,d,
- a,b,c,d, a,b,c,d ]*/
+ STEPS_2_3
- l1 = (vector unsigned short) vec_mergeh(b3, b2);
+ FINAL_STEP_SCALAR
- l2 = (vector unsigned short) vec_mergeh(b1, b0);
+ }
+}
- il = (vector unsigned char) vec_mergeh(l1, l2);
+#define FINAL_STEP_VEC \
+\
+ if(add)\
+ {\
+ for(x=0; x<b_w/4; x++)\
+ {\
+ v[x] = vec_add(v[x], d[x]);\
+ v[x] = vec_sra(vec_add(v[x],\
+ vec_sl( vec_splat_s32(1),\
+ vec_splat_u32(7))),\
+ vec_splat_u32(8));\
+\
+ mask = vec_sl((vector signed int)\
+ vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
+ mask = vec_and(v[x],vec_nor(mask,mask));\
+\
+ mask = (vector signed int)\
+ vec_cmpeq((vector signed int)mask,\
+ (vector signed int)vec_splat_u32(0));\
+\
+ vs = vec_sra(v[x],vec_splat_u32(8));\
+ vs = vec_sra(v[x],vec_splat_u32(8));\
+ vs = vec_sra(v[x],vec_splat_u32(15));\
+\
+ vs = vec_nor(vs,vs);\
+\
+ v[x]= vec_sel(v[x],vs,mask);\
+ }\
+\
+ for(x=0; x<b_w; x++)\
+ dst8[x + y*src_stride] = vbuf[x];\
+\
+ }\
+ else\
+ for(x=0; x<b_w/4; x++)\
+ d[x] = vec_sub(d[x], v[x]);
- v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-//step1
+static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
+ const int obmc_stride,
+ uint8_t * * block, int b_w,
+ int b_h, int src_x, int src_y,
+ int src_stride, slice_buffer * sb,
+ int add, uint8_t * dst8)
+{
+ int y, x;
+ DWTELEM * dst;
+ vector bool int mask;
+ vector signed int vs;
+ vector unsigned short h1, h2, l1, l2;
+ vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
+ vector unsigned char b0,b1,b2,b3;
+ vector unsigned char ob1,ob2,ob3,ob4;
- h1 = (vector unsigned short) vec_mergeh(ob1, ob2);
+ DECLARE_ALIGNED_16(int, vbuf[16]);
+ vector signed int *v = (vector signed int *)vbuf, *d;
- h2 = (vector unsigned short) vec_mergeh(ob3, ob4);
+ for(y=0; y<b_h; y++){
+ //FIXME ugly missue of obmc_stride
- ih = (vector unsigned char) vec_mergel(h1, h2);
+ uint8_t *obmc1= obmc + y*obmc_stride;
+ uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+ uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+ uint8_t *obmc4= obmc3+ (obmc_stride>>1);
- l1 = (vector unsigned short) vec_mergeh(b3, b2);
+ dst = slice_buffer_get_line(sb, src_y + y);
+ d = (vector signed int *)(dst + src_x);
- l2 = (vector unsigned short) vec_mergeh(b1, b0);
+//FIXME i could avoid some loads!
- il = (vector unsigned char) vec_mergel(l1, l2);
+ // load blocks
+ LOAD_BLOCKS
- v[1] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
+ // load obmcs
+ LOAD_OBMCS
+ // steps 0 1
+ STEPS_0_1
-#endif
+ FINAL_STEP_VEC
-#if 1
- for(x=0; x<b_w; x++)
- if(add){
- vbuf[x] += dst[x + src_x];
- vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
- if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
- dst8[x + y*src_stride] = vbuf[x];
- }else{
- dst[x + src_x] -= vbuf[x];
- }
-#else
- if(add)
- {
- for(x=0; x<b_w/4; x++)
- {
- v[x] = vec_add(v[x], d[x]);
- v[x] = vec_sra(vec_add(v[x],
- vec_sl( vec_splat_s32(1),
- vec_splat_u32(7))),
- vec_splat_u32(8));
-
- mask = (vector bool int)
- vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),
- vec_splat_u32(8));
- mask = (vector bool int)
- vec_and(v[x],vec_nor(mask,mask));
-
- mask = (vector bool int)
- vec_cmpeq((vector signed int)mask, vec_splat_s32(0));
-
- vs = vec_sra(v[x],vec_splat_u32(8));
- vs = vec_sra(v[x],vec_splat_u32(8));
- vs = vec_sra(v[x],vec_splat_u32(15));
-
- vs = vec_nor(vs,vs);
-
- v[x]= vec_sel(v[x],vs,mask);
- }
- for(x=0; x<b_w; x++)
- dst8[x + y*src_stride] = vbuf[x];
- }
- else
- for(x=0; x<b_w/4; x++)
- d[x] = vec_sub(d[x], v[x]);
-#endif
}
-
}
-static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
+static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
const int obmc_stride,
uint8_t * * block, int b_w,
int b_h, int src_x, int src_y,
@@ -606,9 +707,14 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
{
int y, x;
DWTELEM * dst;
+ vector bool int mask;
+ vector signed int vs;
vector unsigned short h1, h2, l1, l2;
- vector unsigned char ih, il, tmp1, tmp2, align;
+ vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
vector unsigned char b0,b1,b2,b3;
+ vector unsigned char ob1,ob2,ob3,ob4;
+ DECLARE_ALIGNED_16(int, vbuf[b_w]);
+ vector signed int *v = (vector signed int *)vbuf, *d;
for(y=0; y<b_h; y++){
//FIXME ugly missue of obmc_stride
@@ -618,183 +724,23 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
- vector unsigned char ob1;
- vector unsigned char ob2;
- vector unsigned char ob3;
- vector unsigned char ob4;
-
- DECLARE_ALIGNED_16(int, vbuf[b_w]);
- vector signed int *v = (vector signed int *)vbuf, *d;
-
dst = slice_buffer_get_line(sb, src_y + y);
d = (vector signed int *)(dst + src_x);
// load blocks
+ LOAD_BLOCKS
- tmp1 = vec_ld(0, &block[3][y*src_stride]);
- align = vec_lvsl(0, &block[3][y*src_stride]);
- tmp2 = vec_ld(15, &block[3][y*src_stride]);
-
- b3 = vec_perm(tmp1,tmp2,align);
-
- tmp1 = vec_ld(0, &block[2][y*src_stride]);
- align = vec_lvsl(0, &block[2][y*src_stride]);
- tmp2 = vec_ld(15, &block[2][y*src_stride]);
-
- b2 = vec_perm(tmp1,tmp2,align);
-
- tmp1 = vec_ld(0, &block[1][y*src_stride]);
- align = vec_lvsl(0, &block[1][y*src_stride]);
- tmp2 = vec_ld(15, &block[1][y*src_stride]);
-
- b1 = vec_perm(tmp1,tmp2,align);
-
- tmp1 = vec_ld(0, &block[0][y*src_stride]);
- align = vec_lvsl(0, &block[0][y*src_stride]);
- tmp2 = vec_ld(15, &block[0][y*src_stride]);
-
- b0 = vec_perm(tmp1,tmp2,align);
-
- // load obmcs
-
- tmp1 = vec_ld(0, obmc1);
- align = vec_lvsl(0, obmc1);
- tmp2 = vec_ld(15, obmc1);
-
- ob1 = vec_perm(tmp1,tmp2,align);
-
- tmp1 = vec_ld(0, obmc2);
- align = vec_lvsl(0, obmc2);
- tmp2 = vec_ld(15, obmc2);
-
- ob2 = vec_perm(tmp1,tmp2,align);
-
- tmp1 = vec_ld(0, obmc3);
- align = vec_lvsl(0, obmc3);
- tmp2 = vec_ld(15, obmc3);
-
- ob3 = vec_perm(tmp1,tmp2,align);
-
- tmp1 = vec_ld(0, obmc4);
- align = vec_lvsl(0, obmc4);
- tmp2 = vec_ld(15, obmc4);
-
- ob4 = vec_perm(tmp1,tmp2,align);
-
-//step0
- h1 = (vector unsigned short)
- vec_mergeh(ob1, ob2); /*h1 <- [ a,b,a,b,
- a,b,a,b,
- a,b,a,b,
- a,b,a,b ] */
- h2 = (vector unsigned short)
- vec_mergeh(ob3, ob4); /*h2 <- [ c,d,c,d,
- c,d,c,d,
- c,d,c,d,
- c,d,c,d ] */
-
- ih = (vector unsigned char)
- vec_mergeh(h1,h2); /*ih <- [ a,b,c,d,
- a,b,c,d,
- a,b,c,d,
- a,b,c,d ]*/
-
- l1 = (vector unsigned short) vec_mergeh(b3, b2);
-
- l2 = (vector unsigned short) vec_mergeh(b1, b0);
-
- il = (vector unsigned char) vec_mergeh(l1,l2);
-
- v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-//step1
-
- h1 = (vector unsigned short) vec_mergeh(ob1, ob2);
+ // load obmcs
+ LOAD_OBMCS
- h2 = (vector unsigned short) vec_mergeh(ob3, ob4);
+ // steps 0 1 2 3
+ STEPS_0_1
- ih = (vector unsigned char) vec_mergel(h1,h2);
+ STEPS_2_3
- l1 = (vector unsigned short) vec_mergeh(b3, b2);
+ FINAL_STEP_VEC
- l2 = (vector unsigned short) vec_mergeh(b1, b0);
-
- il = (vector unsigned char) vec_mergel(l1,l2);
-
- v[1] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-
-//step2
- h1 = (vector unsigned short) vec_mergel(ob1, ob2);
-
- h2 = (vector unsigned short) vec_mergel(ob3, ob4);
-
- ih = (vector unsigned char) vec_mergeh(h1,h2);
-
- l1 = (vector unsigned short) vec_mergel(b3, b2);
-
- l2 = (vector unsigned short) vec_mergel(b1, b0);
-
- il = (vector unsigned char) vec_mergeh(l1,l2);
-
- v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-
-//step3
- h1 = (vector unsigned short) vec_mergel(ob1, ob2);
-
- h2 = (vector unsigned short) vec_mergel(ob3, ob4);
-
- ih = (vector unsigned char) vec_mergel(h1,h2);
-
- l1 = (vector unsigned short) vec_mergel(b3, b2);
-
- l2 = (vector unsigned short) vec_mergel(b1, b0);
-
- il = (vector unsigned char) vec_mergel(l1,l2);
-
- v[3] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
-#if 1
- for(x=0; x<b_w; x++)
- if(add){
- vbuf[x] += dst[x + src_x];
- vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
- if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
- dst8[x + y*src_stride] = vbuf[x];
- }else{
- dst[x + src_x] -= vbuf[x];
- }
-#else
- if(add)
- {
- for(x=0; x<b_w/4; x++)
- {
- v[x] = vec_add(v[x], d[x]);
- v[x] = vec_sra(vec_add(v[x],
- vec_sl( vec_splat_s32(1),
- vec_splat_u32(7))),
- vec_splat_u32(8));
-
- mask = vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),vec_splat_u32(8));
- mask = vec_and(v[x],vec_nor(mask,mask));
-
- mask = (vector signed int) vec_cmpeq((vector signed int)mask,(vector signed int)vec_splat_u32(0));
-
- vs = vec_sra(v[x],vec_splat_u32(8));
- vs = vec_sra(v[x],vec_splat_u32(8));
- vs = vec_sra(v[x],vec_splat_u32(15));
-
- vs = vec_nor(vs,vs);
-
- v[x]= vec_sel(v[x],vs,mask);
- }
-
- for(x=0; x<b_w; x++)
- dst8[x + y*src_stride] = vbuf[x];
-
- }
- else
- for(x=0; x<b_w/4; x++)
- d[x] = vec_sub(d[x], v[x]);
-#endif
- }
+ }
}
@@ -804,17 +750,29 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
slice_buffer * sb, int add,
uint8_t * dst8)
{
-//FIXME implement src_x&15 cases later
- if (b_w == 16)
- inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, b_w,
- b_h, src_x, src_y, src_stride,
- sb, add, dst8);
- else if (b_w == 8)
- inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
- b_w, b_h, src_x, src_y,
- src_stride, sb, add, dst8);
- else
-
- ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
- src_y, src_stride, sb, add, dst8);
+ if (src_x&15) {
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
+ b_w, b_h, src_x, src_y,
+ src_stride, sb, add, dst8);
+ else if (b_w == 8)
+ inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
+ b_w, b_h, src_x, src_y,
+ src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
+ src_y, src_stride, sb, add, dst8);
+ } else {
+ if (b_w == 16)
+ inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
+ b_w, b_h, src_x, src_y,
+ src_stride, sb, add, dst8);
+ else if (b_w == 8)
+ inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
+ b_w, b_h, src_x, src_y,
+ src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
+ src_y, src_stride, sb, add, dst8);
+ }
}
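
Two notes on the change above. The rewritten ff_snow_inner_add_yblock_altivec dispatcher now separates destinations where src_x&15 is non-zero (handled by the variants that finish with FINAL_STEP_SCALAR) from 16-byte-aligned ones (the new _a_ variants, which finish with FINAL_STEP_VEC). And the clamp used in FINAL_STEP_SCALAR, if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);, is a branch-light saturation to 0..255. A small standalone sketch of that trick, with a hypothetical helper name that is not part of the patch, follows.

#include <stdint.h>

/* Illustrative expansion of the clamp in FINAL_STEP_SCALAR.
 * When v is outside 0..255, v & ~255 is non-zero; v >> 31 is then -1 for
 * negative v, so ~(v >> 31) == 0, and 0 for v > 255, so ~(v >> 31) == -1,
 * which truncates to 255 on the store into a uint8_t.
 * Hypothetical helper name, assuming arithmetic right shift of int. */
static inline uint8_t clamp_to_uint8(int v)
{
    if (v & ~255)
        v = ~(v >> 31);     /* 0 if v was negative, all-ones otherwise */
    return (uint8_t)v;      /* all-ones truncates to 255 */
}
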