add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)

Patch by David Conrad %lessen42 A gmail P com% Originally committed as revision 16443 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: David Conrad <[email protected]> 2009-01-05 23:15:14 +0000
committer: Guillaume Poirier <[email protected]> 2009-01-05 23:15:14 +0000
commit: df7fb435694e88336c7c9f7ac08c5b7710c3a937 (patch)
tree: 41840377ca277c366ad4991f8f14322eafdc118b
parent: c663cb0d4fa30be5e942c196714d1d659cfab7aa (diff)
1 files changed, 82 insertions, 0 deletions
diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index 67b20f062e..6a5ed44f54 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -935,6 +935,82 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
     write16x4(pix-2, stride, line1, line2, line3, line4);
 }
 
+
+static av_always_inline
+void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
+                               int weightd, int weights, int offset, int w, int h)
+{
+    int y, dst_aligned, src_aligned;
+    vec_u8 vsrc, vdst;
+    vec_u16 vtemp, vlog2_denom, vweights, vweightd, voffset, v0, v1, v2, v3;
+    DECLARE_ALIGNED_16(int32_t, temp[4]);
+    LOAD_ZERO;
+
+    offset = ((offset + 1) | 1) << log2_denom;
+    temp[0] = log2_denom+1;
+    temp[1] = weights;
+    temp[2] = weightd;
+    temp[3] = offset;
+
+    vtemp = vec_ld(0, temp);
+    vlog2_denom = vec_splat(vtemp, 1);
+    vweights = vec_splat(vtemp, 3);
+    vweightd = vec_splat(vtemp, 5);
+    voffset = vec_splat(vtemp, 7);
+    dst_aligned = !((unsigned long)dst & 0xf);
+    src_aligned = !((unsigned long)src & 0xf);
+
+    for (y=0; y<h; y++) {
+        vdst = vec_ld(0, dst);
+        vsrc = vec_ld(0, src);
+
+        v0 = vec_mergeh(zero_u8v, vdst);
+        v1 = vec_mergel(zero_u8v, vdst);
+        v2 = vec_mergeh(zero_u8v, vsrc);
+        v3 = vec_mergel(zero_u8v, vsrc);
+
+        if (w == 8) {
+            if (src_aligned)
+                v3 = v2;
+            else
+                v2 = v3;
+        }
+
+        if (w == 16 || dst_aligned) {
+            v0 = vec_mladd(v0, vweightd, zero_u16v);
+            v2 = vec_mladd(v2, vweights, zero_u16v);
+
+            v0 = vec_adds(v0, voffset);
+            v0 = vec_adds(v0, v2);
+            v0 = vec_sra(v0, vlog2_denom);
+        }
+        if (w == 16 || !dst_aligned) {
+            v1 = vec_mladd(v1, vweightd, zero_u16v);
+            v3 = vec_mladd(v3, vweights, zero_u16v);
+
+            v1 = vec_adds(v1, voffset);
+            v1 = vec_adds(v1, v3);
+            v1 = vec_sra(v1, vlog2_denom);
+        }
+        vdst = vec_packsu(v0, v1);
+        vec_st(vdst, 0, dst);
+
+        dst += stride;
+        src += stride;
+    }
+}
+
+#define H264_WEIGHT(W,H) \
+static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
 
     if (has_altivec()) {
@@ -973,5 +1049,11 @@ void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
         dspfunc(put_h264_qpel, 0, 16);
         dspfunc(avg_h264_qpel, 0, 16);
 #undef dspfunc
+
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
+        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
+        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
+        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
     }
 }
author	David Conrad <[email protected]>	2009-01-05 23:15:14 +0000
committer	Guillaume Poirier <[email protected]>	2009-01-05 23:15:14 +0000
commit	df7fb435694e88336c7c9f7ac08c5b7710c3a937 (patch)
tree	41840377ca277c366ad4991f8f14322eafdc118b
parent	c663cb0d4fa30be5e942c196714d1d659cfab7aa (diff)