aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/ppc
diff options
context:
space:
mode:
authorRomain Dolbeau <dolbeau@irisa.fr>2004-04-22 13:21:59 +0000
committerMichael Niedermayer <michaelni@gmx.at>2004-04-22 13:21:59 +0000
commit9007f51460bfb78813f25b37d3484c7b40ceb0d1 (patch)
tree8d68fd44fa4ff6c7c401b09c8e6ee85db8a54030 /libavcodec/ppc
parent2750b827b3144a62fdc161a47341dd58764522b7 (diff)
downloadffmpeg-9007f51460bfb78813f25b37d3484c7b40ceb0d1.tar.gz
Better hadamard8_diff16 in AltiVec, and more; patch by Romain Dolbeau <dolbeau at irisa dot fr>
Originally committed as revision 3038 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/ppc')
-rw-r--r--libavcodec/ppc/dsputil_altivec.c350
-rw-r--r--libavcodec/ppc/dsputil_altivec.h1
-rw-r--r--libavcodec/ppc/dsputil_ppc.c7
-rw-r--r--libavcodec/ppc/dsputil_ppc.h34
4 files changed, 318 insertions, 74 deletions
diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c
index fff0b38106..1bc6fb009c 100644
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -1306,42 +1306,43 @@ int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
int sum;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
- {
- const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
+ register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
+ register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
#ifdef CONFIG_DARWIN
- const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
- const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
- const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
- const_vector unsigned char perm1 = (const_vector unsigned char)
+ {
+ register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+ register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+ register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+ register const_vector unsigned char perm1 = (const_vector unsigned char)
(0x02, 0x03, 0x00, 0x01,
0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09,
0x0E, 0x0F, 0x0C, 0x0D);
- const_vector unsigned char perm2 = (const_vector unsigned char)
+ register const_vector unsigned char perm2 = (const_vector unsigned char)
(0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F,
0x08, 0x09, 0x0A, 0x0B);
- const_vector unsigned char perm3 = (const_vector unsigned char)
+ register const_vector unsigned char perm3 = (const_vector unsigned char)
(0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07);
#else
- const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
- const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
- const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
- const_vector unsigned char perm1 = (const_vector unsigned char)
+ register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+ register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+ register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+ register const_vector unsigned char perm1 = (const_vector unsigned char)
{0x02, 0x03, 0x00, 0x01,
0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09,
0x0E, 0x0F, 0x0C, 0x0D};
- const_vector unsigned char perm2 = (const_vector unsigned char)
+ register const_vector unsigned char perm2 = (const_vector unsigned char)
{0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F,
0x08, 0x09, 0x0A, 0x0B};
- const_vector unsigned char perm3 = (const_vector unsigned char)
+ register const_vector unsigned char perm3 = (const_vector unsigned char)
{0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03,
@@ -1350,8 +1351,8 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
#define ONEITERBUTTERFLY(i, res) \
{ \
- vector unsigned char src1, src2, srcO; \
- vector unsigned char dst1, dst2, dstO; \
+ register vector unsigned char src1, src2, srcO; \
+ register vector unsigned char dst1, dst2, dstO; \
src1 = vec_ld(stride * i, src); \
if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
src2 = vec_ld((stride * i) + 16, src); \
@@ -1362,20 +1363,19 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
/* promote the unsigned chars to signed shorts */ \
/* we're in the 8x8 function, we only care for the first 8 */ \
- vector signed short srcV = \
+ register vector signed short srcV = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
- vector signed short dstV = \
+ register vector signed short dstV = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
/* substractions inside the first butterfly */ \
- vector signed short but0 = vec_sub(srcV, dstV); \
- vector signed short op1 = vec_perm(but0, but0, perm1); \
- vector signed short but1 = vec_mladd(but0, vprod1, op1); \
- vector signed short op2 = vec_perm(but1, but1, perm2); \
- vector signed short but2 = vec_mladd(but1, vprod2, op2); \
- vector signed short op3 = vec_perm(but2, but2, perm3); \
+ register vector signed short but0 = vec_sub(srcV, dstV); \
+ register vector signed short op1 = vec_perm(but0, but0, perm1); \
+ register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
+ register vector signed short op2 = vec_perm(but1, but1, perm2); \
+ register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
+ register vector signed short op3 = vec_perm(but2, but2, perm3); \
res = vec_mladd(but2, vprod3, op3); \
}
- vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
ONEITERBUTTERFLY(0, temp0);
ONEITERBUTTERFLY(1, temp1);
ONEITERBUTTERFLY(2, temp2);
@@ -1384,53 +1384,275 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
ONEITERBUTTERFLY(5, temp5);
ONEITERBUTTERFLY(6, temp6);
ONEITERBUTTERFLY(7, temp7);
+ }
#undef ONEITERBUTTERFLY
- {
- vector signed int vsum;
- vector signed short line0 = vec_add(temp0, temp1);
- vector signed short line1 = vec_sub(temp0, temp1);
- vector signed short line2 = vec_add(temp2, temp3);
- vector signed short line3 = vec_sub(temp2, temp3);
- vector signed short line4 = vec_add(temp4, temp5);
- vector signed short line5 = vec_sub(temp4, temp5);
- vector signed short line6 = vec_add(temp6, temp7);
- vector signed short line7 = vec_sub(temp6, temp7);
+ {
+ register vector signed int vsum;
+ register vector signed short line0 = vec_add(temp0, temp1);
+ register vector signed short line1 = vec_sub(temp0, temp1);
+ register vector signed short line2 = vec_add(temp2, temp3);
+ register vector signed short line3 = vec_sub(temp2, temp3);
+ register vector signed short line4 = vec_add(temp4, temp5);
+ register vector signed short line5 = vec_sub(temp4, temp5);
+ register vector signed short line6 = vec_add(temp6, temp7);
+ register vector signed short line7 = vec_sub(temp6, temp7);
+
+ register vector signed short line0B = vec_add(line0, line2);
+ register vector signed short line2B = vec_sub(line0, line2);
+ register vector signed short line1B = vec_add(line1, line3);
+ register vector signed short line3B = vec_sub(line1, line3);
+ register vector signed short line4B = vec_add(line4, line6);
+ register vector signed short line6B = vec_sub(line4, line6);
+ register vector signed short line5B = vec_add(line5, line7);
+ register vector signed short line7B = vec_sub(line5, line7);
+
+ register vector signed short line0C = vec_add(line0B, line4B);
+ register vector signed short line4C = vec_sub(line0B, line4B);
+ register vector signed short line1C = vec_add(line1B, line5B);
+ register vector signed short line5C = vec_sub(line1B, line5B);
+ register vector signed short line2C = vec_add(line2B, line6B);
+ register vector signed short line6C = vec_sub(line2B, line6B);
+ register vector signed short line3C = vec_add(line3B, line7B);
+ register vector signed short line7C = vec_sub(line3B, line7B);
+
+ vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+ vsum = vec_sum4s(vec_abs(line1C), vsum);
+ vsum = vec_sum4s(vec_abs(line2C), vsum);
+ vsum = vec_sum4s(vec_abs(line3C), vsum);
+ vsum = vec_sum4s(vec_abs(line4C), vsum);
+ vsum = vec_sum4s(vec_abs(line5C), vsum);
+ vsum = vec_sum4s(vec_abs(line6C), vsum);
+ vsum = vec_sum4s(vec_abs(line7C), vsum);
+ vsum = vec_sums(vsum, (vector signed int)vzero);
+ vsum = vec_splat(vsum, 3);
+ vec_ste(vsum, 0, &sum);
+ }
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
+ return sum;
+}
+
+/*
+ 16x8 works with 16 elements; it avoids replicating
+ loads, and gives the compiler more room for scheduling.
+ It's only used from inside hadamard8_diff16_altivec.
+
+ Unfortunately, it seems gcc-3.3 is a bit dumb, and
+ the compiled code has a LOT of spill code; it seems
+ gcc (unlike xlc) cannot keep everything in registers
+ by itself. The following code includes hand-made
+ register allocation. It's not clean, but on
+ a 7450 the resulting code is much faster (best case
+ falls from 700+ cycles to 550).
+
+ xlc doesn't add spill code, but it doesn't know how to
+ schedule for the 7450, and its code isn't much faster than
+ gcc-3.3 on the 7450 (but uses 25% fewer instructions...)
+
+ On the 970, the hand-made RA is still a win (around 690
+ vs. around 780), but xlc goes to around 660 on the
+ regular C code...
+*/
+
+static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
+ int sum;
+ register vector signed short
+ temp0 asm ("v0"),
+ temp1 asm ("v1"),
+ temp2 asm ("v2"),
+ temp3 asm ("v3"),
+ temp4 asm ("v4"),
+ temp5 asm ("v5"),
+ temp6 asm ("v6"),
+ temp7 asm ("v7");
+ register vector signed short
+ temp0S asm ("v8"),
+ temp1S asm ("v9"),
+ temp2S asm ("v10"),
+ temp3S asm ("v11"),
+ temp4S asm ("v12"),
+ temp5S asm ("v13"),
+ temp6S asm ("v14"),
+ temp7S asm ("v15");
+ register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
+ {
+#ifdef CONFIG_DARWIN
+ register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+ register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+ register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+ register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
+ (0x02, 0x03, 0x00, 0x01,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x0A, 0x0B, 0x08, 0x09,
+ 0x0E, 0x0F, 0x0C, 0x0D);
+ register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
+ (0x04, 0x05, 0x06, 0x07,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x08, 0x09, 0x0A, 0x0B);
+ register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
+ (0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07);
+#else
+ register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+ register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+ register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+ register const_vector unsigned char perm1 = (const_vector unsigned char)
+ {0x02, 0x03, 0x00, 0x01,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x0A, 0x0B, 0x08, 0x09,
+ 0x0E, 0x0F, 0x0C, 0x0D};
+ register const_vector unsigned char perm2 = (const_vector unsigned char)
+ {0x04, 0x05, 0x06, 0x07,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x08, 0x09, 0x0A, 0x0B};
+ register const_vector unsigned char perm3 = (const_vector unsigned char)
+ {0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07};
+#endif
+#define ONEITERBUTTERFLY(i, res1, res2) \
+ { \
+ register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
+ register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
+ src1 = vec_ld(stride * i, src); \
+ src2 = vec_ld((stride * i) + 16, src); \
+ register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
+ dst1 = vec_ld(stride * i, dst); \
+ dst2 = vec_ld((stride * i) + 16, dst); \
+ register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
+ /* promote the unsigned chars to signed shorts */ \
+ register vector signed short srcV asm ("v24") = \
+ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
+ register vector signed short dstV asm ("v25") = \
+ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
+ register vector signed short srcW asm ("v26") = \
+ (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
+ register vector signed short dstW asm ("v27") = \
+ (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
+ /* subtractions inside the first butterfly */ \
+ register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
+ register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
+ register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
+ register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
+ register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
+ register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
+ register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
+ register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
+ register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
+ register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
+ register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
+ res1 = vec_mladd(but2, vprod3, op3); \
+ register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
+ res2 = vec_mladd(but2S, vprod3, op3S); \
+ }
+ ONEITERBUTTERFLY(0, temp0, temp0S);
+ ONEITERBUTTERFLY(1, temp1, temp1S);
+ ONEITERBUTTERFLY(2, temp2, temp2S);
+ ONEITERBUTTERFLY(3, temp3, temp3S);
+ ONEITERBUTTERFLY(4, temp4, temp4S);
+ ONEITERBUTTERFLY(5, temp5, temp5S);
+ ONEITERBUTTERFLY(6, temp6, temp6S);
+ ONEITERBUTTERFLY(7, temp7, temp7S);
+ }
+#undef ONEITERBUTTERFLY
+ {
+ register vector signed int vsum;
+ register vector signed short line0 = vec_add(temp0, temp1);
+ register vector signed short line1 = vec_sub(temp0, temp1);
+ register vector signed short line2 = vec_add(temp2, temp3);
+ register vector signed short line3 = vec_sub(temp2, temp3);
+ register vector signed short line4 = vec_add(temp4, temp5);
+ register vector signed short line5 = vec_sub(temp4, temp5);
+ register vector signed short line6 = vec_add(temp6, temp7);
+ register vector signed short line7 = vec_sub(temp6, temp7);
- vector signed short line0B = vec_add(line0, line2);
- vector signed short line2B = vec_sub(line0, line2);
- vector signed short line1B = vec_add(line1, line3);
- vector signed short line3B = vec_sub(line1, line3);
- vector signed short line4B = vec_add(line4, line6);
- vector signed short line6B = vec_sub(line4, line6);
- vector signed short line5B = vec_add(line5, line7);
- vector signed short line7B = vec_sub(line5, line7);
+ register vector signed short line0B = vec_add(line0, line2);
+ register vector signed short line2B = vec_sub(line0, line2);
+ register vector signed short line1B = vec_add(line1, line3);
+ register vector signed short line3B = vec_sub(line1, line3);
+ register vector signed short line4B = vec_add(line4, line6);
+ register vector signed short line6B = vec_sub(line4, line6);
+ register vector signed short line5B = vec_add(line5, line7);
+ register vector signed short line7B = vec_sub(line5, line7);
- vector signed short line0C = vec_add(line0B, line4B);
- vector signed short line4C = vec_sub(line0B, line4B);
- vector signed short line1C = vec_add(line1B, line5B);
- vector signed short line5C = vec_sub(line1B, line5B);
- vector signed short line2C = vec_add(line2B, line6B);
- vector signed short line6C = vec_sub(line2B, line6B);
- vector signed short line3C = vec_add(line3B, line7B);
- vector signed short line7C = vec_sub(line3B, line7B);
+ register vector signed short line0C = vec_add(line0B, line4B);
+ register vector signed short line4C = vec_sub(line0B, line4B);
+ register vector signed short line1C = vec_add(line1B, line5B);
+ register vector signed short line5C = vec_sub(line1B, line5B);
+ register vector signed short line2C = vec_add(line2B, line6B);
+ register vector signed short line6C = vec_sub(line2B, line6B);
+ register vector signed short line3C = vec_add(line3B, line7B);
+ register vector signed short line7C = vec_sub(line3B, line7B);
- vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
- vsum = vec_sum4s(vec_abs(line1C), vsum);
- vsum = vec_sum4s(vec_abs(line2C), vsum);
- vsum = vec_sum4s(vec_abs(line3C), vsum);
- vsum = vec_sum4s(vec_abs(line4C), vsum);
- vsum = vec_sum4s(vec_abs(line5C), vsum);
- vsum = vec_sum4s(vec_abs(line6C), vsum);
- vsum = vec_sum4s(vec_abs(line7C), vsum);
- vsum = vec_sums(vsum, (vector signed int)vzero);
- vsum = vec_splat(vsum, 3);
- vec_ste(vsum, 0, &sum);
- }
+ vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+ vsum = vec_sum4s(vec_abs(line1C), vsum);
+ vsum = vec_sum4s(vec_abs(line2C), vsum);
+ vsum = vec_sum4s(vec_abs(line3C), vsum);
+ vsum = vec_sum4s(vec_abs(line4C), vsum);
+ vsum = vec_sum4s(vec_abs(line5C), vsum);
+ vsum = vec_sum4s(vec_abs(line6C), vsum);
+ vsum = vec_sum4s(vec_abs(line7C), vsum);
+
+ register vector signed short line0S = vec_add(temp0S, temp1S);
+ register vector signed short line1S = vec_sub(temp0S, temp1S);
+ register vector signed short line2S = vec_add(temp2S, temp3S);
+ register vector signed short line3S = vec_sub(temp2S, temp3S);
+ register vector signed short line4S = vec_add(temp4S, temp5S);
+ register vector signed short line5S = vec_sub(temp4S, temp5S);
+ register vector signed short line6S = vec_add(temp6S, temp7S);
+ register vector signed short line7S = vec_sub(temp6S, temp7S);
+
+ register vector signed short line0BS = vec_add(line0S, line2S);
+ register vector signed short line2BS = vec_sub(line0S, line2S);
+ register vector signed short line1BS = vec_add(line1S, line3S);
+ register vector signed short line3BS = vec_sub(line1S, line3S);
+ register vector signed short line4BS = vec_add(line4S, line6S);
+ register vector signed short line6BS = vec_sub(line4S, line6S);
+ register vector signed short line5BS = vec_add(line5S, line7S);
+ register vector signed short line7BS = vec_sub(line5S, line7S);
+
+ register vector signed short line0CS = vec_add(line0BS, line4BS);
+ register vector signed short line4CS = vec_sub(line0BS, line4BS);
+ register vector signed short line1CS = vec_add(line1BS, line5BS);
+ register vector signed short line5CS = vec_sub(line1BS, line5BS);
+ register vector signed short line2CS = vec_add(line2BS, line6BS);
+ register vector signed short line6CS = vec_sub(line2BS, line6BS);
+ register vector signed short line3CS = vec_add(line3BS, line7BS);
+ register vector signed short line7CS = vec_sub(line3BS, line7BS);
+
+ vsum = vec_sum4s(vec_abs(line0CS), vsum);
+ vsum = vec_sum4s(vec_abs(line1CS), vsum);
+ vsum = vec_sum4s(vec_abs(line2CS), vsum);
+ vsum = vec_sum4s(vec_abs(line3CS), vsum);
+ vsum = vec_sum4s(vec_abs(line4CS), vsum);
+ vsum = vec_sum4s(vec_abs(line5CS), vsum);
+ vsum = vec_sum4s(vec_abs(line6CS), vsum);
+ vsum = vec_sum4s(vec_abs(line7CS), vsum);
+ vsum = vec_sums(vsum, (vector signed int)vzero);
+ vsum = vec_splat(vsum, 3);
+ vec_ste(vsum, 0, &sum);
}
-POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
return sum;
}
+int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
+POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
+ int score;
+POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
+ score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+ if (h==16) {
+ dst += 8*stride;
+ src += 8*stride;
+ score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+ }
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
+ return score;
+}
+
int has_altivec(void)
{
#ifdef CONFIG_DARWIN
diff --git a/libavcodec/ppc/dsputil_altivec.h b/libavcodec/ppc/dsputil_altivec.h
index df632347ba..e2729ab22e 100644
--- a/libavcodec/ppc/dsputil_altivec.h
+++ b/libavcodec/ppc/dsputil_altivec.h
@@ -47,6 +47,7 @@ extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
extern int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
+extern int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder);
diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c
index c3d3e808f3..b70de73289 100644
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@@ -61,6 +61,7 @@ static unsigned char* perfname[] = {
"put_pixels16_xy2_altivec",
"put_no_rnd_pixels16_xy2_altivec",
"hadamard8_diff8x8_altivec",
+ "hadamard8_diff16_altivec",
"clear_blocks_dcbz32_ppc",
"clear_blocks_dcbz128_ppc"
};
@@ -226,12 +227,6 @@ long check_dcbzl_effect(void)
}
#endif
-#ifdef HAVE_ALTIVEC
-// can't put that in dsputil_altivec.c,
-// has WARPER8_16_SQ declare the function "static" ...
-WARPER8_16_SQ(hadamard8_diff8x8_altivec, hadamard8_diff16_altivec)
-#endif
-
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
// Common optimizations whether Altivec is available or not
diff --git a/libavcodec/ppc/dsputil_ppc.h b/libavcodec/ppc/dsputil_ppc.h
index 3be44fff8c..8b34c6b456 100644
--- a/libavcodec/ppc/dsputil_ppc.h
+++ b/libavcodec/ppc/dsputil_ppc.h
@@ -51,6 +51,7 @@ enum powerpc_perf_index {
altivec_put_pixels16_xy2_num,
altivec_put_no_rnd_pixels16_xy2_num,
altivec_hadamard8_diff8x8_num,
+ altivec_hadamard8_diff16_num,
powerpc_clear_blocks_dcbz32,
powerpc_clear_blocks_dcbz128,
powerpc_perf_total
@@ -64,6 +65,8 @@ enum powerpc_data_index {
};
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
+#ifndef POWERPC_MODE_64BITS
+#define POWERP_PMC_DATATYPE unsigned long
#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a))
#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
@@ -80,7 +83,30 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
-#define POWERPC_PERF_DECLARE(a, cond) unsigned long pmc_start[POWERPC_NUM_PMC_ENABLED], pmc_stop[POWERPC_NUM_PMC_ENABLED], pmc_loop_index;
+#else /* POWERPC_MODE_64BITS */
+#define POWERP_PMC_DATATYPE unsigned long long
+#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a))
+#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a))
+#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a))
+#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#endif /* POWERPC_MODE_64BITS */
+#define POWERPC_PERF_DECLARE(a, cond) \
+ POWERP_PMC_DATATYPE \
+ pmc_start[POWERPC_NUM_PMC_ENABLED], \
+ pmc_stop[POWERPC_NUM_PMC_ENABLED], \
+ pmc_loop_index;
#define POWERPC_PERF_START_COUNT(a, cond) do { \
POWERPC_GET_PMC6(pmc_start[5]); \
POWERPC_GET_PMC5(pmc_start[4]); \
@@ -102,9 +128,9 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
pmc_loop_index++) \
{ \
- if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
- { \
- unsigned long diff = \
+ if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
+ { \
+ POWERP_PMC_DATATYPE diff = \
pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \