diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-09-22 20:18:58 -0400 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2013-09-23 15:23:53 +0200 |
commit | 34b429d5ba7b823748e897f0295aa78aaa2ebb9d (patch) | |
tree | 12cecac2e1b1f3e8d3c9b9a52c17fa2c778a4660 | |
parent | 3f3867ca275233f696cea51d09b9390c21cb7de5 (diff) | |
download | ffmpeg-34b429d5ba7b823748e897f0295aa78aaa2ebb9d.tar.gz |
vp8: fix PPC assembly and bilinear C code to work if src_stride != dst_stride.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/ppc/vp8dsp_altivec.c | 49 | ||||
-rw-r--r-- | libavcodec/vp8dsp.c | 20 |
2 files changed, 52 insertions, 17 deletions
diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c index 2401d2a628..c858d8a45f 100644 --- a/libavcodec/ppc/vp8dsp_altivec.c +++ b/libavcodec/ppc/vp8dsp_altivec.c @@ -241,15 +241,15 @@ void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst } #define EPEL_HV(WIDTH, HTAPS, VTAPS) \ -static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) \ +static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \ { \ DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \ if (VTAPS == 6) { \ - put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*stride, stride, h+5, mx, my); \ - put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16, 16, h, mx, my); \ + put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \ + put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \ } else { \ - put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-stride, stride, h+4, mx, my); \ - put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+16, 16, h, mx, my); \ + put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \ + put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \ } \ } @@ -269,9 +269,44 @@ EPEL_HV(4, 4,6) EPEL_HV(4, 6,4) EPEL_HV(4, 4,4) -static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) +static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) { - ff_put_pixels16_altivec(dst, src, stride, h); + register vector unsigned char pixelsv1, pixelsv2; + register vector unsigned char pixelsv1B, pixelsv2B; + register 
vector unsigned char pixelsv1C, pixelsv2C; + register vector unsigned char pixelsv1D, pixelsv2D; + + register vector unsigned char perm = vec_lvsl(0, src); + int i; + register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1; + register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2; + register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2; + +// hand-unrolling the loop by 4 gains about 15% +// minimum execution time goes from 74 to 60 cycles +// it's faster than -funroll-loops, but using +// -funroll-loops w/ this is bad - 74 cycles again. +// all this is on a 7450, tuning for the 7450 + for (i = 0; i < h; i += 4) { + pixelsv1 = vec_ld( 0, src); + pixelsv2 = vec_ld(15, src); + pixelsv1B = vec_ld(sstride, src); + pixelsv2B = vec_ld(15 + sstride, src); + pixelsv1C = vec_ld(sstride2, src); + pixelsv2C = vec_ld(15 + sstride2, src); + pixelsv1D = vec_ld(sstride3, src); + pixelsv2D = vec_ld(15 + sstride3, src); + vec_st(vec_perm(pixelsv1, pixelsv2, perm), + 0, (unsigned char*)dst); + vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), + dstride, (unsigned char*)dst); + vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), + dstride2, (unsigned char*)dst); + vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), + dstride3, (unsigned char*)dst); + src += sstride4; + dst += dstride4; + } } #endif /* HAVE_ALTIVEC */ diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c index 3ead24e613..ba267fdcd2 100644 --- a/libavcodec/vp8dsp.c +++ b/libavcodec/vp8dsp.c @@ -415,7 +415,7 @@ VP8_EPEL_HV(8, 6, 6) VP8_EPEL_HV(4, 6, 6) #define VP8_BILINEAR(SIZE) \ -static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s2, int h, int mx, int my) \ +static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \ { \ int a = 8-mx, b = mx; \ int x, y; \ @@ -423,24 +423,24 @@ static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t stride, uin for 
(y = 0; y < h; y++) { \ for (x = 0; x < SIZE; x++) \ dst[x] = (a*src[x] + b*src[x+1] + 4) >> 3; \ - dst += stride; \ - src += stride; \ + dst += dstride; \ + src += sstride; \ } \ } \ -static void put_vp8_bilinear ## SIZE ## _v_c(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s2, int h, int mx, int my) \ +static void put_vp8_bilinear ## SIZE ## _v_c(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \ { \ int c = 8-my, d = my; \ int x, y; \ \ for (y = 0; y < h; y++) { \ for (x = 0; x < SIZE; x++) \ - dst[x] = (c*src[x] + d*src[x+stride] + 4) >> 3; \ - dst += stride; \ - src += stride; \ + dst[x] = (c*src[x] + d*src[x+sstride] + 4) >> 3; \ + dst += dstride; \ + src += sstride; \ } \ } \ \ -static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s2, int h, int mx, int my) \ +static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \ { \ int a = 8-mx, b = mx; \ int c = 8-my, d = my; \ @@ -452,7 +452,7 @@ static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t stride, ui for (x = 0; x < SIZE; x++) \ tmp[x] = (a*src[x] + b*src[x+1] + 4) >> 3; \ tmp += SIZE; \ - src += stride; \ + src += sstride; \ } \ \ tmp = tmp_array; \ @@ -460,7 +460,7 @@ static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t stride, ui for (y = 0; y < h; y++) { \ for (x = 0; x < SIZE; x++) \ dst[x] = (c*tmp[x] + d*tmp[x+SIZE] + 4) >> 3; \ - dst += stride; \ + dst += dstride; \ tmp += SIZE; \ } \ } |