diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2003-01-16 21:54:55 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2003-01-16 21:54:55 +0000 |
commit | db40a39aba6a22729279ac8915b52b182473f209 (patch) | |
tree | bd67103c33705c172f5cac6838edc4899a1bfc80 /libavcodec | |
parent | f44a2df04fb553ef076594daca3ed4ccab315123 (diff) | |
download | ffmpeg-db40a39aba6a22729279ac8915b52b182473f209.tar.gz |
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/fft.c | 4 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_altivec.c | 235 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_altivec.h | 65 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_ppc.c | 18 | ||||
-rw-r--r-- | libavcodec/ppc/fft_altivec.c | 110 | ||||
-rw-r--r-- | libavcodec/ppc/gmc_altivec.c | 20 | ||||
-rw-r--r-- | libavcodec/ppc/idct_altivec.c | 25 | ||||
-rw-r--r-- | libavcodec/ppc/mpegvideo_altivec.c | 14 | ||||
-rw-r--r-- | libavcodec/ppc/mpegvideo_ppc.c | 4 |
9 files changed, 455 insertions, 40 deletions
diff --git a/libavcodec/fft.c b/libavcodec/fft.c index 079d2d6a6e..65eb575f36 100644 --- a/libavcodec/fft.c +++ b/libavcodec/fft.c @@ -53,12 +53,12 @@ int fft_init(FFTContext *s, int nbits, int inverse) /* compute constant table for HAVE_SSE version */ #if (defined(HAVE_MMX) && defined(HAVE_BUILTIN_VECTOR)) || defined(HAVE_ALTIVEC) { - int has_vectors; + int has_vectors = 0; #if defined(HAVE_MMX) has_vectors = mm_support() & MM_SSE; #endif -#if defined(HAVE_ALTIVEC) +#if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE) has_vectors = mm_support() & MM_ALTIVEC; #endif if (has_vectors) { diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index 55ea7371d3..e7f34ee16c 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -24,6 +24,22 @@ #include <sys/sysctl.h> #endif +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT +unsigned long long perfdata[altivec_perf_total][altivec_data_total]; +/* list below must match enum in dsputil_altivec.h */ +static unsigned char* perfname[] = { + "fft_calc", + "gmc1", + "dct_unquantize_h263", + "idct_add", + "idct_put", + "put_pixels_clamped", + "put_pixels16", + "avg_pixels16" +}; +#include <stdio.h> +#endif + int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; @@ -594,7 +610,7 @@ int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { } void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { -#if 0 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE int i; for(i=0; i+7<w; i++){ dst[i+0] += src[i+0]; @@ -608,38 +624,188 @@ void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { } for(; i<w; i++) dst[i+0] += src[i+0]; -#else +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ register int i; - register uint8_t *temp_src = src, *temp_dst = dst; - register vector unsigned char vdst, vsrc, temp1, temp2; - register vector unsigned char perm; - register int count = 0; - - for (i = 0; (i < w) && ((unsigned long)temp_dst & 0x0000000F) ; i++) - { - dst[i] = src[i]; - temp_src ++; - temp_dst ++; - } - /* temp_dst is a properly aligned pointer */ - /* we still need to deal with ill-aligned src */ - perm = vec_lvsl(0, temp_src); - temp1 = vec_ld(0, temp_src); - while ((i + 15) < w) + register vector unsigned char vdst, vsrc; + + /* dst and src are 16 bytes-aligned (guaranteed) */ + for(i = 0 ; (i + 15) < w ; i++) { - temp2 = vec_ld(count + 16, temp_src); - vdst = vec_ld(count, temp_dst); - vsrc = vec_perm(temp1, temp2, perm); - temp1 = temp2; + vdst = vec_ld(i << 4, (unsigned char*)dst); + vsrc = vec_ld(i << 4, (unsigned char*)src); vdst = vec_add(vsrc, vdst); - vec_st(vdst, count, temp_dst); - count += 16; + vec_st(vdst, i << 4, (unsigned char*)dst); } + /* if w is not a multiple of 16 */ for (; (i < w) ; i++) { dst[i] = src[i]; } -#endif +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +extern UINT8 cropTbl[]; +void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels, + int line_size) +{ +ALTIVEC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + UINT8 *cm = cropTbl + MAX_NEG_CROP; + +ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); + + /* read the pixels */ + for(i=0;i<8;i++) { + pixels[0] = cm[block[0]]; + pixels[1] = cm[block[1]]; + pixels[2] = cm[block[2]]; + pixels[3] = cm[block[3]]; + pixels[4] = cm[block[4]]; + pixels[5] = cm[block[5]]; + pixels[6] = cm[block[6]]; + pixels[7] = cm[block[7]]; + + pixels += line_size; + block += 8; + } + +ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register const vector short vczero = (const vector short)(0); + register vector short + blockv0, blockv1, blockv2, blockv3, + blockv4, blockv5, blockv6, blockv7; + register vector unsigned char + pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4, + pixelsv0old, pixelsv4old; + +ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1); + + blockv0 = vec_ld(0, block); + blockv1 = vec_ld(16, block); + blockv2 = vec_ld(32, block); + blockv3 = vec_ld(48, block); + blockv4 = vec_ld(64, block); + blockv5 = vec_ld(80, block); + blockv6 = vec_ld(96, block); + blockv7 = vec_ld(112, block); + if (((unsigned long)pixels) & 0x0000000F) + { + pixelsv0old = vec_ld(-8, pixels); + pixelsv4old = vec_ld(56, pixels); + pixelsv0 = vec_packsu(vczero, blockv0); + pixelsv1 = vec_packsu(blockv1, blockv2); + pixelsv2 = vec_packsu(blockv3, blockv4); + pixelsv3 = vec_packsu(blockv5, blockv6); + pixelsv4 = vec_packsu(blockv5, vczero); + pixelsv0 = vec_perm(pixelsv0old, pixelsv0, vcprm(0, 1, s2, s3)); + pixelsv4 = vec_perm(pixelsv4, pixelsv4old, vcprm(0, 1, s2, s3)); + vec_st(pixelsv0, -8, pixels); + vec_st(pixelsv1, 8, pixels); + vec_st(pixelsv2, 24, pixels); + vec_st(pixelsv3, 40, pixels); + vec_st(pixelsv4, 56, pixels); + } + else + { + pixelsv0 = vec_packsu(blockv0, blockv1); + pixelsv1 = vec_packsu(blockv2, blockv3); + pixelsv2 = vec_packsu(blockv4, blockv5); + pixelsv3 = vec_packsu(blockv6, blockv7); + vec_st(pixelsv0, 0, pixels); + vec_st(pixelsv1, 16, pixels); + vec_st(pixelsv2, 32, pixels); + vec_st(pixelsv3, 48, pixels); + } + +ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +ALTIVEC_TBL_DECLARE(altivec_put_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); + *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); + *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); + *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); + pixels+=line_size; + block +=line_size; + } + +ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + + register vector unsigned char perm = vec_lvsl(0, pixels); + register vector unsigned char pixelsv1, pixelsv2; + int i; + +ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + vec_st(vec_perm(pixelsv1, pixelsv2, perm), 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +ALTIVEC_TBL_DECLARE(altivec_avg_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); + op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); + op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); + op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); + pixels+=line_size; + block +=line_size; + } + +ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + + register vector unsigned char perm = vec_lvsl(0, pixels); + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + int i; + +ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + blockv = vec_ld(0, block); + pixelsv = vec_perm(pixelsv1, pixelsv2, perm); + blockv = vec_avg(blockv,pixelsv); + vec_st(blockv, 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } int has_altivec(void) @@ -656,3 +822,22 @@ int has_altivec(void) #endif return 0; } + +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT +void altivec_display_perf_report(void) +{ + int i; + fprintf(stderr, "AltiVec performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n"); + for(i = 0 ; i < altivec_perf_total ; i++) + { + if (perfdata[i][altivec_data_num] != (unsigned long long)0) + fprintf(stderr, " Function \"%s\":\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", + perfname[i], + perfdata[i][altivec_data_min], + perfdata[i][altivec_data_max], + (double)perfdata[i][altivec_data_sum] / + (double)perfdata[i][altivec_data_num], + perfdata[i][altivec_data_num]); + } +} +#endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */ diff --git a/libavcodec/ppc/dsputil_altivec.h b/libavcodec/ppc/dsputil_altivec.h index 4a6043fe2f..b7b4238db9 100644 --- a/libavcodec/ppc/dsputil_altivec.h +++ b/libavcodec/ppc/dsputil_altivec.h @@ -31,13 +31,16 @@ extern int pix_sum_altivec(UINT8 * pix, int line_size); extern void diff_pixels_altivec(DCTELEM* block, const UINT8* s1, const UINT8* s2, int stride); extern void get_pixels_altivec(DCTELEM* block, const UINT8 * pixels, int line_size); -extern void gmc1_altivec(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder); - extern void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w); +extern void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels, int line_size); +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); -extern int has_altivec(void); +extern void gmc1_altivec(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder); +extern int has_altivec(void); +#ifdef HAVE_ALTIVEC // used to build registers permutation vectors (vcprm) // the 's' are for words in the _s_econd vector @@ -63,3 +66,59 @@ extern int has_altivec(void); #define FLOAT_p 1. #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) + +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT +void altivec_display_perf_report(void); +/* if you add to the enum below, also add to the perfname array + in dsputil_altivec.c */ +enum altivec_perf_index { + altivec_fft_num = 0, + altivec_gmc1_num, + altivec_dct_unquantize_h263_num, + altivec_idct_add_num, + altivec_idct_put_num, + altivec_put_pixels_clamped_num, + altivec_put_pixels16_num, + altivec_avg_pixels16_num, + altivec_perf_total +}; +enum altivec_data_index { + altivec_data_min = 0, + altivec_data_max, + altivec_data_sum, + altivec_data_num, + altivec_data_total +}; +extern unsigned long long perfdata[altivec_perf_total][altivec_data_total]; +#define ALTIVEC_TBL_DECLARE(a, cond) register unsigned long tbl_start, tbl_stop +#define ALTIVEC_TBL_START_COUNT(a, cond) do { asm("mftb %0" : "=r" (tbl_start)); } while (0) +#define ALTIVEC_TBL_STOP_COUNT(a, cond) do { \ + asm volatile("mftb %0" : "=r" (tbl_stop)); \ + if (tbl_stop > tbl_start) \ + { \ + unsigned long diff = tbl_stop - tbl_start; \ + if (cond) \ + { \ + if (diff < perfdata[a][altivec_data_min]) \ + perfdata[a][altivec_data_min] = diff; \ + if (diff > perfdata[a][altivec_data_max]) \ + perfdata[a][altivec_data_max] = diff; \ + perfdata[a][altivec_data_sum] += diff; \ + perfdata[a][altivec_data_num] ++; \ + } \ + } \ +} while (0) +#else /* ALTIVEC_TBL_PERFORMANCE_REPORT */ +#define ALTIVEC_TBL_DECLARE(a, cond) +#define ALTIVEC_TBL_START_COUNT(a, cond) +#define ALTIVEC_TBL_STOP_COUNT(a, cond) +#endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */ + +#else /* HAVE_ALTIVEC */ +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +#error "I can't use ALTIVEC_USE_REFERENCE_C_CODE if I don't use HAVE_ALTIVEC" +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT +#error "I can't use ALTIVEC_TBL_PERFORMANCE_REPORT if I don't use HAVE_ALTIVEC" +#endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */ +#endif /* HAVE_ALTIVEC */ diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index 055a8f09fc..ec2dda4690 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -60,11 +60,27 @@ void dsputil_init_ppc(DSPContext* c, unsigned mask) c->pix_sum = pix_sum_altivec; c->diff_pixels = diff_pixels_altivec; c->get_pixels = get_pixels_altivec; -// next one disabled as it it untested. +// next two disabled as they're untested. #if 0 c->add_bytes= add_bytes_altivec; + c->put_pixels_clamped = put_pixels_clamped_altivec; #endif + c->put_pixels_tab[0][0] = put_pixels16_altivec; + c->avg_pixels_tab[0][0] = avg_pixels16_altivec; c->gmc1 = gmc1_altivec; + +#ifdef ALTIVEC_TBL_PERFORMANCE_REPORT + { + int i; + for (i = 0 ; i < altivec_perf_total ; i++) + { + perfdata[i][altivec_data_min] = 0xFFFFFFFFFFFFFFFF; + perfdata[i][altivec_data_max] = 0x0000000000000000; + perfdata[i][altivec_data_sum] = 0x0000000000000000; + perfdata[i][altivec_data_num] = 0x0000000000000000; + } + } +#endif } else #endif { diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c index 4abdd4f740..55bfcdf4d2 100644 --- a/libavcodec/ppc/fft_altivec.c +++ b/libavcodec/ppc/fft_altivec.c @@ -22,6 +22,31 @@ #include "dsputil_altivec.h" +/* + those three macros are from libavcodec/fft.c + and are required for the reference C code +*/ +/* butter fly op */ +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ +{\ + FFTSample ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ +} +#define MUL16(a,b) ((a) * (b)) +#define CMUL(pre, pim, are, aim, bre, bim) \ +{\ + pre = (MUL16(are, bre) - MUL16(aim, bim));\ + pim = (MUL16(are, bim) + MUL16(bre, aim));\ +} + + /** * Do a complex FFT with the parameters defined in fft_init(). The * input data must be permuted before with s->revtab table. No @@ -35,6 +60,84 @@ */ void fft_calc_altivec(FFTContext *s, FFTComplex *z) { +ALTIVEC_TBL_DECLARE(altivec_fft_num, s->nbits >= 6); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *exptab = s->exptab; + int l; + FFTSample tmp_re, tmp_im; + +ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); + + np = 1 << ln; + + /* pass 0 */ + + p=&z[0]; + j=(np >> 1); + do { + BF(p[0].re, p[0].im, p[1].re, p[1].im, + p[0].re, p[0].im, p[1].re, p[1].im); + p+=2; + } while (--j != 0); + + /* pass 1 */ + + + p=&z[0]; + j=np >> 2; + if (s->inverse) { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, -p[3].im, p[3].re); + p+=4; + } while (--j != 0); + } else { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, p[3].im, -p[3].re); + p+=4; + } while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + do { + p = z; + q = z + nloops; + for (j = 0; j < nblocks; ++j) { + BF(p->re, p->im, q->re, q->im, + p->re, p->im, q->re, q->im); + + p++; + q++; + for(l = nblocks; l < np2; l += nblocks) { + CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); + BF(p->re, p->im, q->re, q->im, + p->re, p->im, tmp_re, tmp_im); + p++; + q++; + } + + p += nloops; + q += nloops; + } + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); + +ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ register const vector float vczero = (const vector float)(0.); int ln = s->nbits; @@ -44,6 +147,8 @@ void fft_calc_altivec(FFTContext *s, FFTComplex *z) FFTComplex *cptr, *cptr1; int k; +ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); + np = 1 << ln; { @@ -129,5 +234,8 @@ void fft_calc_altivec(FFTContext *s, FFTComplex *z) nblocks = nblocks >> 1; nloops = nloops << 1; } while (nblocks != 0); -} +ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} diff --git a/libavcodec/ppc/gmc_altivec.c b/libavcodec/ppc/gmc_altivec.c index 3bda9ac16f..c2d908d039 100644 --- a/libavcodec/ppc/gmc_altivec.c +++ b/libavcodec/ppc/gmc_altivec.c @@ -28,13 +28,15 @@ */ void gmc1_altivec(UINT8 *dst /* align 8 */, UINT8 *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { -#if 0 +ALTIVEC_TBL_DECLARE(altivec_gmc1_num, h == 8); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE const int A=(16-x16)*(16-y16); const int B=( x16)*(16-y16); const int C=(16-x16)*( y16); const int D=( x16)*( y16); - int i; + +ALTIVEC_TBL_START_COUNT(altivec_gmc1_num, h == 8); for(i=0; i<h; i++) { @@ -49,7 +51,10 @@ void gmc1_altivec(UINT8 *dst /* align 8 */, UINT8 *src /* align1 */, int stride, dst+= stride; src+= stride; } -#else + +ALTIVEC_TBL_STOP_COUNT(altivec_gmc1_num, h == 8); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ const unsigned short __attribute__ ((aligned(16))) rounder_a[8] = {rounder, rounder, rounder, rounder, rounder, rounder, rounder, rounder}; @@ -61,7 +66,6 @@ void gmc1_altivec(UINT8 *dst /* align 8 */, UINT8 *src /* align1 */, int stride, ( x16)*( y16), /* D */ 0, 0, 0, 0 /* padding */ }; - register const vector unsigned char vczero = (const vector unsigned char)(0); register const vector unsigned short vcsr8 = (const vector unsigned short)(8); register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; @@ -70,6 +74,9 @@ void gmc1_altivec(UINT8 *dst /* align 8 */, UINT8 *src /* align1 */, int stride, unsigned long dst_odd = (unsigned long)dst & 0x0000000F; unsigned long src_really_odd = (unsigned long)src & 0x0000000F; + +ALTIVEC_TBL_START_COUNT(altivec_gmc1_num, h == 8); + tempA = vec_ld(0, (unsigned short*)ABCD); Av = vec_splat(tempA, 0); Bv = vec_splat(tempA, 1); @@ -155,5 +162,8 @@ void gmc1_altivec(UINT8 *dst /* align 8 */, UINT8 *src /* align1 */, int stride, dst += stride; src += stride; } -#endif + +ALTIVEC_TBL_STOP_COUNT(altivec_gmc1_num, h == 8); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } diff --git a/libavcodec/ppc/idct_altivec.c b/libavcodec/ppc/idct_altivec.c index 8036d403fa..533ab3e4c8 100644 --- a/libavcodec/ppc/idct_altivec.c +++ b/libavcodec/ppc/idct_altivec.c @@ -38,6 +38,7 @@ #include <stdlib.h> /* malloc(), free() */ #include <string.h> #include "../dsputil.h" +#include "dsputil_altivec.h" #define vector_s16_t vector signed short #define vector_u16_t vector unsigned short @@ -160,8 +161,17 @@ static const vector_s16_t constants[5] = { void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) { +ALTIVEC_TBL_DECLARE(altivec_idct_put_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +ALTIVEC_TBL_START_COUNT(altivec_idct_put_num, 1); + void simple_idct_put(UINT8 *dest, int line_size, INT16 *block); + simple_idct_put(dest, stride, (INT16*)block); +ALTIVEC_TBL_STOP_COUNT(altivec_idct_put_num, 1); +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ vector_u8_t tmp; +ALTIVEC_TBL_START_COUNT(altivec_idct_put_num, 1); + IDCT #define COPY(dest,src) \ @@ -177,16 +187,28 @@ void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) COPY (dest, vx5) dest += stride; COPY (dest, vx6) dest += stride; COPY (dest, vx7) + +ALTIVEC_TBL_STOP_COUNT(altivec_idct_put_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) { +ALTIVEC_TBL_DECLARE(altivec_idct_add_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +ALTIVEC_TBL_START_COUNT(altivec_idct_add_num, 1); + void simple_idct_add(UINT8 *dest, int line_size, INT16 *block); + simple_idct_add(dest, stride, (INT16*)block); +ALTIVEC_TBL_STOP_COUNT(altivec_idct_add_num, 1); +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ vector_u8_t tmp; vector_s16_t tmp2, tmp3; vector_u8_t perm0; vector_u8_t perm1; vector_u8_t p0, p1, p; +ALTIVEC_TBL_START_COUNT(altivec_idct_add_num, 1); + IDCT p0 = vec_lvsl (0, dest); @@ -212,5 +234,8 @@ void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) + +ALTIVEC_TBL_STOP_COUNT(altivec_idct_add_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c index a63df307c0..dfa4c0229a 100644 --- a/libavcodec/ppc/mpegvideo_altivec.c +++ b/libavcodec/ppc/mpegvideo_altivec.c @@ -20,6 +20,7 @@ #include <stdio.h> #include "../dsputil.h" #include "../mpegvideo.h" +#include "dsputil_altivec.h" // Swaps two variables (used for altivec registers) #define SWAP(a,b) \ @@ -510,10 +511,13 @@ int dct_quantize_altivec(MpegEncContext* s, void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { +ALTIVEC_TBL_DECLARE(altivec_dct_unquantize_h263_num, 1); int i, level, qmul, qadd; int nCoeffs; assert(s->block_last_index[n]>=0); + +ALTIVEC_TBL_START_COUNT(altivec_dct_unquantize_h263_num, 1); qadd = (qscale - 1) | 1; qmul = qscale << 1; @@ -533,7 +537,7 @@ void dct_unquantize_h263_altivec(MpegEncContext *s, nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; } -#if 0 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE for(;i<=nCoeffs;i++) { level = block[i]; if (level) { @@ -545,7 +549,7 @@ void dct_unquantize_h263_altivec(MpegEncContext *s, block[i] = level; } } -#else +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ { register const vector short vczero = (const vector short)(0); short __attribute__ ((aligned(16))) qmul8[] = @@ -572,6 +576,7 @@ void dct_unquantize_h263_altivec(MpegEncContext *s, qaddv = vec_ld(0, qadd8); nqaddv = vec_ld(0, nqadd8); +#if 0 // block *is* 16 bytes-aligned, it seems. // first make sure block[j] is 16 bytes-aligned for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { level = block[j]; @@ -584,6 +589,7 @@ void dct_unquantize_h263_altivec(MpegEncContext *s, block[j] = level; } } +#endif // vectorize all the 16 bytes-aligned blocks // of 8 elements @@ -622,5 +628,7 @@ void dct_unquantize_h263_altivec(MpegEncContext *s, block[0] = backup_0; } } -#endif +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ + +ALTIVEC_TBL_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); } diff --git a/libavcodec/ppc/mpegvideo_ppc.c b/libavcodec/ppc/mpegvideo_ppc.c index b1c834f828..78105526e1 100644 --- a/libavcodec/ppc/mpegvideo_ppc.c +++ b/libavcodec/ppc/mpegvideo_ppc.c @@ -44,7 +44,11 @@ void MPV_common_init_ppc(MpegEncContext *s) { s->idct_put = idct_put_altivec; s->idct_add = idct_add_altivec; +#ifndef ALTIVEC_USE_REFERENCE_C_CODE s->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + s->idct_permutation_type = FF_NO_IDCT_PERM; +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } // Test to make sure that the dct required alignments are met. |