diff options
author | Romain Dolbeau <dolbeau@irisa.fr> | 2004-04-22 13:21:59 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2004-04-22 13:21:59 +0000 |
commit | 9007f51460bfb78813f25b37d3484c7b40ceb0d1 (patch) | |
tree | 8d68fd44fa4ff6c7c401b09c8e6ee85db8a54030 /libavcodec/ppc | |
parent | 2750b827b3144a62fdc161a47341dd58764522b7 (diff) | |
download | ffmpeg-9007f51460bfb78813f25b37d3484c7b40ceb0d1.tar.gz |
Better hadamard8_diff16 in AltiVec, and more. Patch by Romain Dolbeau <dolbeau at irisa dot fr>.
Originally committed as revision 3038 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/ppc')
-rw-r--r-- | libavcodec/ppc/dsputil_altivec.c | 350 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_altivec.h | 1 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_ppc.c | 7 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_ppc.h | 34 |
4 files changed, 318 insertions, 74 deletions
diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index fff0b38106..1bc6fb009c 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -1306,42 +1306,43 @@ int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); int sum; POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); - { - const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0); + register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0); + register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; #ifdef CONFIG_DARWIN - const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); - const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); - const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); - const_vector unsigned char perm1 = (const_vector unsigned char) + { + register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); + register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); + register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); + register const_vector unsigned char perm1 = (const_vector unsigned char) (0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); - const_vector unsigned char perm2 = (const_vector unsigned char) + register const_vector unsigned char perm2 = (const_vector unsigned char) (0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); - const_vector unsigned char perm3 = (const_vector unsigned char) + register const_vector unsigned char perm3 = (const_vector unsigned char) (0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
0x07); #else - const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; - const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; - const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; - const_vector unsigned char perm1 = (const_vector unsigned char) + register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; + register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; + register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; + register const_vector unsigned char perm1 = (const_vector unsigned char) {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; - const_vector unsigned char perm2 = (const_vector unsigned char) + register const_vector unsigned char perm2 = (const_vector unsigned char) {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; - const_vector unsigned char perm3 = (const_vector unsigned char) + register const_vector unsigned char perm3 = (const_vector unsigned char) {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, @@ -1350,8 +1351,8 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); #define ONEITERBUTTERFLY(i, res) \ { \ - vector unsigned char src1, src2, srcO; \ - vector unsigned char dst1, dst2, dstO; \ + register vector unsigned char src1, src2, srcO; \ + register vector unsigned char dst1, dst2, dstO; \ src1 = vec_ld(stride * i, src); \ if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \ src2 = vec_ld((stride * i) + 16, src); \ @@ -1362,20 +1363,19 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ /* promote the unsigned chars to signed shorts */ \ /* we're in the 8x8 function, we only care for the first 8 */ \ - vector signed 
short srcV = \ + register vector signed short srcV = \ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ - vector signed short dstV = \ + register vector signed short dstV = \ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ /* substractions inside the first butterfly */ \ - vector signed short but0 = vec_sub(srcV, dstV); \ - vector signed short op1 = vec_perm(but0, but0, perm1); \ - vector signed short but1 = vec_mladd(but0, vprod1, op1); \ - vector signed short op2 = vec_perm(but1, but1, perm2); \ - vector signed short but2 = vec_mladd(but1, vprod2, op2); \ - vector signed short op3 = vec_perm(but2, but2, perm3); \ + register vector signed short but0 = vec_sub(srcV, dstV); \ + register vector signed short op1 = vec_perm(but0, but0, perm1); \ + register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ + register vector signed short op2 = vec_perm(but1, but1, perm2); \ + register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ + register vector signed short op3 = vec_perm(but2, but2, perm3); \ res = vec_mladd(but2, vprod3, op3); \ } - vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; ONEITERBUTTERFLY(0, temp0); ONEITERBUTTERFLY(1, temp1); ONEITERBUTTERFLY(2, temp2); @@ -1384,53 +1384,275 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); ONEITERBUTTERFLY(5, temp5); ONEITERBUTTERFLY(6, temp6); ONEITERBUTTERFLY(7, temp7); + } #undef ONEITERBUTTERFLY - { - vector signed int vsum; - vector signed short line0 = vec_add(temp0, temp1); - vector signed short line1 = vec_sub(temp0, temp1); - vector signed short line2 = vec_add(temp2, temp3); - vector signed short line3 = vec_sub(temp2, temp3); - vector signed short line4 = vec_add(temp4, temp5); - vector signed short line5 = vec_sub(temp4, temp5); - vector signed short line6 = vec_add(temp6, temp7); - vector signed short line7 = vec_sub(temp6, temp7); + { + register vector signed int vsum; 
+ register vector signed short line0 = vec_add(temp0, temp1); + register vector signed short line1 = vec_sub(temp0, temp1); + register vector signed short line2 = vec_add(temp2, temp3); + register vector signed short line3 = vec_sub(temp2, temp3); + register vector signed short line4 = vec_add(temp4, temp5); + register vector signed short line5 = vec_sub(temp4, temp5); + register vector signed short line6 = vec_add(temp6, temp7); + register vector signed short line7 = vec_sub(temp6, temp7); + + register vector signed short line0B = vec_add(line0, line2); + register vector signed short line2B = vec_sub(line0, line2); + register vector signed short line1B = vec_add(line1, line3); + register vector signed short line3B = vec_sub(line1, line3); + register vector signed short line4B = vec_add(line4, line6); + register vector signed short line6B = vec_sub(line4, line6); + register vector signed short line5B = vec_add(line5, line7); + register vector signed short line7B = vec_sub(line5, line7); + + register vector signed short line0C = vec_add(line0B, line4B); + register vector signed short line4C = vec_sub(line0B, line4B); + register vector signed short line1C = vec_add(line1B, line5B); + register vector signed short line5C = vec_sub(line1B, line5B); + register vector signed short line2C = vec_add(line2B, line6B); + register vector signed short line6C = vec_sub(line2B, line6B); + register vector signed short line3C = vec_add(line3B, line7B); + register vector signed short line7C = vec_sub(line3B, line7B); + + vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); + vsum = vec_sum4s(vec_abs(line1C), vsum); + vsum = vec_sum4s(vec_abs(line2C), vsum); + vsum = vec_sum4s(vec_abs(line3C), vsum); + vsum = vec_sum4s(vec_abs(line4C), vsum); + vsum = vec_sum4s(vec_abs(line5C), vsum); + vsum = vec_sum4s(vec_abs(line6C), vsum); + vsum = vec_sum4s(vec_abs(line7C), vsum); + vsum = vec_sums(vsum, (vector signed int)vzero); + vsum = vec_splat(vsum, 3); + vec_ste(vsum, 0, &sum); + } 
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); + return sum; +} + +/* + 16x8 works with 16 elements; it avoids replicating + loads, and gives the compiler more room for scheduling. + It's only used from inside hadamard8_diff16_altivec. + + Unfortunately, it seems gcc-3.3 is a bit dumb, and + the compiled code has a LOT of spill code; it seems + gcc (unlike xlc) cannot keep everything in registers + by itself. The following code includes hand-made + register allocation. It's not clean, but on + a 7450 the resulting code is much faster (best case + falls from 700+ cycles to 550). + + xlc doesn't add spill code, but it doesn't know how to + schedule for the 7450, and its code isn't much faster than + gcc-3.3 on the 7450 (but uses 25% fewer instructions...) + + On the 970, the hand-made RA is still a win (around 690 + vs. around 780), but xlc goes to around 660 on the + regular C code... +*/ + +static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { + int sum; + register vector signed short + temp0 asm ("v0"), + temp1 asm ("v1"), + temp2 asm ("v2"), + temp3 asm ("v3"), + temp4 asm ("v4"), + temp5 asm ("v5"), + temp6 asm ("v6"), + temp7 asm ("v7"); + register vector signed short + temp0S asm ("v8"), + temp1S asm ("v9"), + temp2S asm ("v10"), + temp3S asm ("v11"), + temp4S asm ("v12"), + temp5S asm ("v13"), + temp6S asm ("v14"), + temp7S asm ("v15"); + register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0); + { +#ifdef CONFIG_DARWIN + register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); + register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); + register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); + register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char) + 
(0x02, 0x03, 0x00, 0x01, + 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, + 0x0E, 0x0F, 0x0C, 0x0D); + register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char) + (0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x08, 0x09, 0x0A, 0x0B); + register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char) + (0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07); +#else + register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; + register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; + register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; + register const_vector unsigned char perm1 = (const_vector unsigned char) + {0x02, 0x03, 0x00, 0x01, + 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, + 0x0E, 0x0F, 0x0C, 0x0D}; + register const_vector unsigned char perm2 = (const_vector unsigned char) + {0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x08, 0x09, 0x0A, 0x0B}; + register const_vector unsigned char perm3 = (const_vector unsigned char) + {0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07}; +#endif +#define ONEITERBUTTERFLY(i, res1, res2) \ + { \ + register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \ + register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \ + src1 = vec_ld(stride * i, src); \ + src2 = vec_ld((stride * i) + 16, src); \ + register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ + dst1 = vec_ld(stride * i, dst); \ + dst2 = vec_ld((stride * i) + 16, dst); \ + register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ + /* promote the unsigned chars to signed shorts */ \ + register vector signed short srcV asm ("v24") 
= \ + (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ + register vector signed short dstV asm ("v25") = \ + (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ + register vector signed short srcW asm ("v26") = \ + (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \ + register vector signed short dstW asm ("v27") = \ + (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \ + /* substractions inside the first butterfly */ \ + register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \ + register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \ + register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \ + register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \ + register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \ + register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \ + register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \ + register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \ + register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \ + register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \ + register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \ + res1 = vec_mladd(but2, vprod3, op3); \ + register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \ + res2 = vec_mladd(but2S, vprod3, op3S); \ + } + ONEITERBUTTERFLY(0, temp0, temp0S); + ONEITERBUTTERFLY(1, temp1, temp1S); + ONEITERBUTTERFLY(2, temp2, temp2S); + ONEITERBUTTERFLY(3, temp3, temp3S); + ONEITERBUTTERFLY(4, temp4, temp4S); + ONEITERBUTTERFLY(5, temp5, temp5S); + ONEITERBUTTERFLY(6, temp6, temp6S); + ONEITERBUTTERFLY(7, temp7, temp7S); + } +#undef ONEITERBUTTERFLY + { + register vector signed int 
vsum; + register vector signed short line0 = vec_add(temp0, temp1); + register vector signed short line1 = vec_sub(temp0, temp1); + register vector signed short line2 = vec_add(temp2, temp3); + register vector signed short line3 = vec_sub(temp2, temp3); + register vector signed short line4 = vec_add(temp4, temp5); + register vector signed short line5 = vec_sub(temp4, temp5); + register vector signed short line6 = vec_add(temp6, temp7); + register vector signed short line7 = vec_sub(temp6, temp7); - vector signed short line0B = vec_add(line0, line2); - vector signed short line2B = vec_sub(line0, line2); - vector signed short line1B = vec_add(line1, line3); - vector signed short line3B = vec_sub(line1, line3); - vector signed short line4B = vec_add(line4, line6); - vector signed short line6B = vec_sub(line4, line6); - vector signed short line5B = vec_add(line5, line7); - vector signed short line7B = vec_sub(line5, line7); + register vector signed short line0B = vec_add(line0, line2); + register vector signed short line2B = vec_sub(line0, line2); + register vector signed short line1B = vec_add(line1, line3); + register vector signed short line3B = vec_sub(line1, line3); + register vector signed short line4B = vec_add(line4, line6); + register vector signed short line6B = vec_sub(line4, line6); + register vector signed short line5B = vec_add(line5, line7); + register vector signed short line7B = vec_sub(line5, line7); - vector signed short line0C = vec_add(line0B, line4B); - vector signed short line4C = vec_sub(line0B, line4B); - vector signed short line1C = vec_add(line1B, line5B); - vector signed short line5C = vec_sub(line1B, line5B); - vector signed short line2C = vec_add(line2B, line6B); - vector signed short line6C = vec_sub(line2B, line6B); - vector signed short line3C = vec_add(line3B, line7B); - vector signed short line7C = vec_sub(line3B, line7B); + register vector signed short line0C = vec_add(line0B, line4B); + register vector signed short line4C = 
vec_sub(line0B, line4B); + register vector signed short line1C = vec_add(line1B, line5B); + register vector signed short line5C = vec_sub(line1B, line5B); + register vector signed short line2C = vec_add(line2B, line6B); + register vector signed short line6C = vec_sub(line2B, line6B); + register vector signed short line3C = vec_add(line3B, line7B); + register vector signed short line7C = vec_sub(line3B, line7B); - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } + vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); + vsum = vec_sum4s(vec_abs(line1C), vsum); + vsum = vec_sum4s(vec_abs(line2C), vsum); + vsum = vec_sum4s(vec_abs(line3C), vsum); + vsum = vec_sum4s(vec_abs(line4C), vsum); + vsum = vec_sum4s(vec_abs(line5C), vsum); + vsum = vec_sum4s(vec_abs(line6C), vsum); + vsum = vec_sum4s(vec_abs(line7C), vsum); + + register vector signed short line0S = vec_add(temp0S, temp1S); + register vector signed short line1S = vec_sub(temp0S, temp1S); + register vector signed short line2S = vec_add(temp2S, temp3S); + register vector signed short line3S = vec_sub(temp2S, temp3S); + register vector signed short line4S = vec_add(temp4S, temp5S); + register vector signed short line5S = vec_sub(temp4S, temp5S); + register vector signed short line6S = vec_add(temp6S, temp7S); + register vector signed short line7S = vec_sub(temp6S, temp7S); + + register vector signed short line0BS = vec_add(line0S, line2S); + register vector signed short line2BS = vec_sub(line0S, line2S); + register vector signed short line1BS = vec_add(line1S, line3S); + register vector signed short line3BS = 
vec_sub(line1S, line3S); + register vector signed short line4BS = vec_add(line4S, line6S); + register vector signed short line6BS = vec_sub(line4S, line6S); + register vector signed short line5BS = vec_add(line5S, line7S); + register vector signed short line7BS = vec_sub(line5S, line7S); + + register vector signed short line0CS = vec_add(line0BS, line4BS); + register vector signed short line4CS = vec_sub(line0BS, line4BS); + register vector signed short line1CS = vec_add(line1BS, line5BS); + register vector signed short line5CS = vec_sub(line1BS, line5BS); + register vector signed short line2CS = vec_add(line2BS, line6BS); + register vector signed short line6CS = vec_sub(line2BS, line6BS); + register vector signed short line3CS = vec_add(line3BS, line7BS); + register vector signed short line7CS = vec_sub(line3BS, line7BS); + + vsum = vec_sum4s(vec_abs(line0CS), vsum); + vsum = vec_sum4s(vec_abs(line1CS), vsum); + vsum = vec_sum4s(vec_abs(line2CS), vsum); + vsum = vec_sum4s(vec_abs(line3CS), vsum); + vsum = vec_sum4s(vec_abs(line4CS), vsum); + vsum = vec_sum4s(vec_abs(line5CS), vsum); + vsum = vec_sum4s(vec_abs(line6CS), vsum); + vsum = vec_sum4s(vec_abs(line7CS), vsum); + vsum = vec_sums(vsum, (vector signed int)vzero); + vsum = vec_splat(vsum, 3); + vec_ste(vsum, 0, &sum); } -POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); return sum; } +int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ +POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); + int score; +POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); + score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + if (h==16) { + dst += 8*stride; + src += 8*stride; + score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + } +POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); + return score; +} + int has_altivec(void) { #ifdef CONFIG_DARWIN diff --git a/libavcodec/ppc/dsputil_altivec.h b/libavcodec/ppc/dsputil_altivec.h index 
df632347ba..e2729ab22e 100644 --- a/libavcodec/ppc/dsputil_altivec.h +++ b/libavcodec/ppc/dsputil_altivec.h @@ -47,6 +47,7 @@ extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h); extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h); extern int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h); +extern int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h); extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder); diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index c3d3e808f3..b70de73289 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -61,6 +61,7 @@ static unsigned char* perfname[] = { "put_pixels16_xy2_altivec", "put_no_rnd_pixels16_xy2_altivec", "hadamard8_diff8x8_altivec", + "hadamard8_diff16_altivec", "clear_blocks_dcbz32_ppc", "clear_blocks_dcbz128_ppc" }; @@ -226,12 +227,6 @@ long check_dcbzl_effect(void) } #endif -#ifdef HAVE_ALTIVEC -// can't put that in dsputil_altivec.c, -// has WARPER8_16_SQ declare the function "static" ... 
-WARPER8_16_SQ(hadamard8_diff8x8_altivec, hadamard8_diff16_altivec) -#endif - void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) { // Common optimizations whether Altivec is available or not diff --git a/libavcodec/ppc/dsputil_ppc.h b/libavcodec/ppc/dsputil_ppc.h index 3be44fff8c..8b34c6b456 100644 --- a/libavcodec/ppc/dsputil_ppc.h +++ b/libavcodec/ppc/dsputil_ppc.h @@ -51,6 +51,7 @@ enum powerpc_perf_index { altivec_put_pixels16_xy2_num, altivec_put_no_rnd_pixels16_xy2_num, altivec_hadamard8_diff8x8_num, + altivec_hadamard8_diff16_num, powerpc_clear_blocks_dcbz32, powerpc_clear_blocks_dcbz128, powerpc_perf_total @@ -64,6 +65,8 @@ enum powerpc_data_index { }; extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; +#ifndef POWERPC_MODE_64BITS +#define POWERP_PMC_DATATYPE unsigned long #define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a)) #define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a)) #if (POWERPC_NUM_PMC_ENABLED > 2) @@ -80,7 +83,30 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ #define POWERPC_GET_PMC5(a) do {} while (0) #define POWERPC_GET_PMC6(a) do {} while (0) #endif -#define POWERPC_PERF_DECLARE(a, cond) unsigned long pmc_start[POWERPC_NUM_PMC_ENABLED], pmc_stop[POWERPC_NUM_PMC_ENABLED], pmc_loop_index; +#else /* POWERPC_MODE_64BITS */ +#define POWERP_PMC_DATATYPE unsigned long long +#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a)) +#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a)) +#if (POWERPC_NUM_PMC_ENABLED > 2) +#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a)) +#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a)) +#else +#define POWERPC_GET_PMC3(a) do {} while (0) +#define POWERPC_GET_PMC4(a) do {} while (0) +#endif +#if (POWERPC_NUM_PMC_ENABLED > 4) +#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a)) +#define POWERPC_GET_PMC6(a) asm 
volatile("mfspr %0, 776" : "=r" (a)) +#else +#define POWERPC_GET_PMC5(a) do {} while (0) +#define POWERPC_GET_PMC6(a) do {} while (0) +#endif +#endif /* POWERPC_MODE_64BITS */ +#define POWERPC_PERF_DECLARE(a, cond) \ + POWERP_PMC_DATATYPE \ + pmc_start[POWERPC_NUM_PMC_ENABLED], \ + pmc_stop[POWERPC_NUM_PMC_ENABLED], \ + pmc_loop_index; #define POWERPC_PERF_START_COUNT(a, cond) do { \ POWERPC_GET_PMC6(pmc_start[5]); \ POWERPC_GET_PMC5(pmc_start[4]); \ @@ -102,9 +128,9 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ pmc_loop_index++) \ { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ - { \ - unsigned long diff = \ + if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ + { \ + POWERP_PMC_DATATYPE diff = \ pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ |