altivec patches by Romain Dolbeau

Originally committed as revision 1423 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Fabrice Bellard <fabrice@bellard.org> 2003-01-08 18:47:49 +0000
committer: Fabrice Bellard <fabrice@bellard.org> 2003-01-08 18:47:49 +0000
commit: 4013fcf4afa5790bc2f5f2b36101123576cd2b69 (patch)
tree: 6c4fd23496afda9bb83ea090957a96b99618d39e /libavcodec/ppc/dsputil_altivec.c
parent: e366e6795d6e2343a744458f945a396171b07bd8 (diff)
download: ffmpeg-4013fcf4afa5790bc2f5f2b36101123576cd2b69.tar.gz
1 files changed, 144 insertions, 33 deletions
diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c
index 5f14ed0eb9..cf0100d0a4 100644
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -26,15 +26,16 @@
 
 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 {
-    int s, i;
-    vector unsigned char *tv, zero;
+    int i;
+    int s __attribute__((aligned(16)));
+    const vector unsigned char zero = (const vector unsigned char)(0);
+    vector unsigned char *tv;
     vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
     vector unsigned int sad;
     vector signed int sumdiffs;
 
     s = 0;
-    zero = vec_splat_u8(0);
-    sad = vec_splat_u32(0);
+    sad = (vector unsigned int)(0);
     for(i=0;i<16;i++) {
         /*
            Read unaligned pixels into our vectors. The vectors are as follows:
@@ -72,16 +73,17 @@ int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 
 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 {
-    int s, i;
-    vector unsigned char *tv, zero;
+    int i;
+    int s __attribute__((aligned(16)));
+    const vector unsigned char zero = (const vector unsigned char)(0);
+    vector unsigned char *tv;
     vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
     vector unsigned int sad;
     vector signed int sumdiffs;
     uint8_t *pix3 = pix2 + line_size;
 
     s = 0;
-    zero = vec_splat_u8(0);
-    sad = vec_splat_u32(0);
+    sad = (vector unsigned int)(0);
 
     /*
        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
@@ -131,20 +133,21 @@ int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 
 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 {
-    int s, i;
+    int i;
+    int s __attribute__((aligned(16)));
     uint8_t *pix3 = pix2 + line_size;
-    vector unsigned char *tv, avgv, t5, zero;
+    const vector unsigned char zero = (const vector unsigned char)(0);
+    const vector unsigned short two = (const vector unsigned short)(2);
+    vector unsigned char *tv, avgv, t5;
     vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
     vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
-    vector unsigned short avghv, avglv, two;
+    vector unsigned short avghv, avglv;
     vector unsigned short t1, t2, t3, t4;
     vector unsigned int sad;
     vector signed int sumdiffs;
 
-    zero = vec_splat_u8(0);
-    two = vec_splat_u16(2);
-    sad = vec_splat_u32(0);
+    sad = (vector unsigned int)(0);
     
     s = 0;
 
@@ -231,13 +234,14 @@ int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 
 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 {
-    int i, s;
+    int i;
+    int s __attribute__((aligned(16)));
+    const vector unsigned int zero = (const vector unsigned int)(0);
     vector unsigned char perm1, perm2, *pix1v, *pix2v;
     vector unsigned char t1, t2, t3,t4, t5;
-    vector unsigned int sad, zero;
+    vector unsigned int sad;
     vector signed int sumdiffs;
     
-    zero = (vector unsigned int) (0);
     sad = (vector unsigned int) (0);
 
 
@@ -272,14 +276,15 @@ int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 
 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 {
-    int i, s;
+    int i;
+    int s __attribute__((aligned(16)));
+    const vector unsigned int zero = (const vector unsigned int)(0);
     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
     vector unsigned char t1, t2, t3,t4, t5;
-    vector unsigned int sad, zero;
+    vector unsigned int sad;
     vector signed int sumdiffs;
 
-    zero = (vector unsigned int) (0);
-    sad = (vector unsigned int) (0);
+    sad = (vector unsigned int)(0);
     permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 
     for(i=0;i<8;i++) {
@@ -315,14 +320,15 @@ int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 
 int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
-    int s, i;
-    vector unsigned char *tv, zero;
+    int i;
+    int s __attribute__((aligned(16)));
+    const vector unsigned int zero = (const vector unsigned int)(0);
+    vector unsigned char *tv;
     vector unsigned char pixv;
     vector unsigned int sv;
     vector signed int sum;
-        
-    zero = vec_splat_u8(0);
-    sv = vec_splat_u32(0);
+    
+    sv = (vector unsigned int)(0);
     
     s = 0;
     for (i = 0; i < 16; i++) {
@@ -343,17 +349,122 @@ int pix_norm1_altivec(uint8_t *pix, int line_size)
     return s;
 }
 
-int pix_sum_altivec(UINT8 * pix, int line_size)
+/**
+ * Sum of Squared Errors for an 8x8 block (8 rows, line_size bytes apart).
+ * AltiVec-enhanced. The v argument is not referenced by this function.
+ * It's the pix_abs8x8_altivec code above with squaring added.
+ */
+int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int i;
+    int s __attribute__((aligned(16)));
+    const vector unsigned int zero = (const vector unsigned int)(0);
+    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
+    vector unsigned char t1, t2, t3,t4, t5;
+    vector unsigned int sum;
+    vector signed int sumsqr;
+    
+    sum = (vector unsigned int)(0);
+    permclear = (vector unsigned char)(0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
+    
+    for(i=0;i<8;i++) {
+	/* Read potentially unaligned pixels into t1 and t2.
+	   Since we're reading 16 pixels, and actually only want 8,
+	   mask out the last 8 pixels. The 0s don't change the sum. */
+        perm1 = vec_lvsl(0, pix1);
+        pix1v = (vector unsigned char *) pix1;
+        perm2 = vec_lvsl(0, pix2);
+        pix2v = (vector unsigned char *) pix2;
+        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
+        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
+
+        /*
+          Since we want to use unsigned chars, we can take advantage
+          of the fact that abs(a-b)^2 = (a-b)^2.
+        */
+        
+	/* Calculate the absolute differences vector as max(a,b) - min(a,b) */ 
+        t3 = vec_max(t1, t2);
+        t4 = vec_min(t1, t2);
+        t5 = vec_sub(t3, t4);
+        
+        /* Square the differences and accumulate them into the partial sums */
+        sum = vec_msum(t5, t5, sum);
+        
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    
+    /* Sum up the four partial sums (total is taken from element 3), and put the result into s */
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
+    sumsqr = vec_splat(sumsqr, 3);
+    vec_ste(sumsqr, 0, &s);
+    
+    return s;
+}
+
+/**
+ * Sum of Squared Errors for a 16x16 block (16 rows, line_size bytes apart).
+ * AltiVec-enhanced. The v argument is not referenced by this function.
+ * It's the pix_abs16x16_altivec code above with squaring added.
+ */
+int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 {
+    int i;
+    int s __attribute__((aligned(16)));
+    const vector unsigned int zero = (const vector unsigned int)(0);
+    vector unsigned char perm1, perm2, *pix1v, *pix2v;
+    vector unsigned char t1, t2, t3,t4, t5;
+    vector unsigned int sum;
+    vector signed int sumsqr;
+    
+    sum = (vector unsigned int)(0);
+    
+    for(i=0;i<16;i++) {
+	/* Read 16 potentially unaligned pixels of each row into t1 and t2 */
+        perm1 = vec_lvsl(0, pix1);
+        pix1v = (vector unsigned char *) pix1;
+        perm2 = vec_lvsl(0, pix2);
+        pix2v = (vector unsigned char *) pix2;
+        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
+        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
+
+        /*
+          Since we want to use unsigned chars, we can take advantage
+          of the fact that abs(a-b)^2 = (a-b)^2.
+        */
+        
+	/* Calculate the absolute differences vector as max(a,b) - min(a,b) */ 
+        t3 = vec_max(t1, t2);
+        t4 = vec_min(t1, t2);
+        t5 = vec_sub(t3, t4);
+        
+        /* Square the differences and accumulate them into the partial sums */
+        sum = vec_msum(t5, t5, sum);
+        
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    
+    /* Sum up the four partial sums (total is taken from element 3), and put the result into s */
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
+    sumsqr = vec_splat(sumsqr, 3);
+    vec_ste(sumsqr, 0, &s);
+    
+    return s;
+}
 
+int pix_sum_altivec(UINT8 * pix, int line_size)
+{
+    const vector unsigned int zero = (const vector unsigned int)(0);
     vector unsigned char perm, *pixv;
     vector unsigned char t1;
-    vector unsigned int sad, zero;
+    vector unsigned int sad;
     vector signed int sumdiffs;
 
-    int s, i;
-
-    zero = (vector unsigned int) (0);
+    int i;
+    int s __attribute__((aligned(16)));
+    
     sad = (vector unsigned int) (0);
     
     for (i = 0; i < 16; i++) {
@@ -380,7 +491,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_s
 {
     int i;
     vector unsigned char perm, bytes, *pixv;
-    vector unsigned char zero = (vector unsigned char) (0);
+    const vector unsigned char zero = (const vector unsigned char) (0);
     vector signed short shorts;
 
     for(i=0;i<8;i++)
@@ -407,7 +518,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
 {
     int i;
     vector unsigned char perm, bytes, *pixv;
-    vector unsigned char zero = (vector unsigned char) (0);
+    const vector unsigned char zero = (const vector unsigned char) (0);
     vector signed short shorts1, shorts2;
 
     for(i=0;i<4;i++)
author	Fabrice Bellard <fabrice@bellard.org>	2003-01-08 18:47:49 +0000
committer	Fabrice Bellard <fabrice@bellard.org>	2003-01-08 18:47:49 +0000
commit	4013fcf4afa5790bc2f5f2b36101123576cd2b69 (patch)
tree	6c4fd23496afda9bb83ea090957a96b99618d39e /libavcodec/ppc/dsputil_altivec.c
parent	e366e6795d6e2343a744458f945a396171b07bd8 (diff)
download	ffmpeg-4013fcf4afa5790bc2f5f2b36101123576cd2b69.tar.gz