mmx2 optimization of huffyuv median encoding

Originally committed as revision 2372 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Michael Niedermayer <michaelni@gmx.at> 2003-10-13 17:27:30 +0000
committer: Michael Niedermayer <michaelni@gmx.at> 2003-10-13 17:27:30 +0000
commit: 84705403444dc440a41d6aea2d49bac7bdfdedff (patch)
tree: 4250e821d043be1a0bf9849e8c768816a9631238 /libavcodec
parent: 8c812d73178de487c658e343505d2212ba55bcb1 (diff)
download: ffmpeg-84705403444dc440a41d6aea2d49bac7bdfdedff.tar.gz
4 files changed, 70 insertions, 26 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 9e7cca734f..270424706a 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2526,6 +2526,24 @@ static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
         dst[i+0] = src1[i+0]-src2[i+0];
 }
 
+static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
+    /* dst[k] = src2[k] - median(L, T, (L + T - LT) & 0xFF), where T = src1[k] and
+     * L / LT are the previous src2 / src1 bytes, seeded from *left / *left_top.
+     * The final L and LT are handed back through the same two pointers. */
+    uint8_t prev= *left;
+    uint8_t prev_top= *left_top;
+    int k= 0;
+    while(k < w){
+        const uint8_t top= src1[k];
+        const int pred= mid_pred(prev, top, (prev + top - prev_top)&0xFF);
+        prev_top= top;
+        prev= src2[k];
+        dst[k++]= prev - pred;
+    }
+    *left= prev;
+    *left_top= prev_top;
+}
+
 #define BUTTERFLY2(o1,o2,i1,i2) \
 o1= (i1)+(i2);\
 o2= (i1)-(i2);
@@ -3007,6 +3025,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
         
     c->add_bytes= add_bytes_c;
     c->diff_bytes= diff_bytes_c;
+    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
     c->bswap_buf= bswap_buf;
 
 #ifdef HAVE_MMX
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 402b99091a..0cc6b9be0f 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -234,6 +234,11 @@ typedef struct DSPContext {
     /* huffyuv specific */
     void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
     void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
+    /**
+     * subtract huffyuv's variant of median prediction
+     * note, this might read from src1[-1], src2[-1]
+     */
+    void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
     void (*bswap_buf)(uint32_t *dst, uint32_t *src, int w);
     
     /* (I)DCT */
diff --git a/libavcodec/huffyuv.c b/libavcodec/huffyuv.c
index a6220f0ac8..ee9790d989 100644
--- a/libavcodec/huffyuv.c
+++ b/libavcodec/huffyuv.c
@@ -153,25 +153,6 @@ static inline void add_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *d
     *left_top= lt;
 }
 
-//FIXME optimize
-static inline void sub_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
-    int i;
-    uint8_t l, lt;
-
-    l= *left;
-    lt= *left_top;
-
-    for(i=0; i<w; i++){
-        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
-        lt= src1[i];
-        l= src2[i];
-        dst[i]= l - pred;
-    }    
-
-    *left= l;
-    *left_top= lt;
-}
-
 static inline void add_left_prediction_bgr32(uint8_t *dst, uint8_t *src, int w, int *red, int *green, int *blue){
     int i;
     int r,g,b;
@@ -999,9 +980,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
             lefttopy= p->data[0][3];
             lefttopu= p->data[1][1];
             lefttopv= p->data[2][1];
-            sub_median_prediction(s->temp[0], p->data[0]+4, p->data[0] + fake_ystride+4, width-4 , &lefty, &lefttopy);
-            sub_median_prediction(s->temp[1], p->data[1]+2, p->data[1] + fake_ustride+2, width2-2, &leftu, &lefttopu);
-            sub_median_prediction(s->temp[2], p->data[2]+2, p->data[2] + fake_vstride+2, width2-2, &leftv, &lefttopv);
+            s->dsp.sub_hfyu_median_prediction(s->temp[0], p->data[0]+4, p->data[0] + fake_ystride+4, width-4 , &lefty, &lefttopy);
+            s->dsp.sub_hfyu_median_prediction(s->temp[1], p->data[1]+2, p->data[1] + fake_ustride+2, width2-2, &leftu, &lefttopu);
+            s->dsp.sub_hfyu_median_prediction(s->temp[2], p->data[2]+2, p->data[2] + fake_vstride+2, width2-2, &leftv, &lefttopv);
             encode_422_bitstream(s, width-4);
             y++; cy++;
 
@@ -1011,7 +992,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
                 if(s->bitstream_bpp==12){
                     while(2*cy > y){
                         ydst= p->data[0] + p->linesize[0]*y;
-                        sub_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
+                        s->dsp.sub_hfyu_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
                         encode_gray_bitstream(s, width);
                         y++;
                     }
@@ -1021,9 +1002,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
                 udst= p->data[1] + p->linesize[1]*cy;
                 vdst= p->data[2] + p->linesize[2]*cy;
 
-                sub_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
-                sub_median_prediction(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
-                sub_median_prediction(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
+                s->dsp.sub_hfyu_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
+                s->dsp.sub_hfyu_median_prediction(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
+                s->dsp.sub_hfyu_median_prediction(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
 
                 encode_422_bitstream(s, width);
             }
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index adf27d4a11..2447ab6abf 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -583,6 +583,43 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
     for(; i<w; i++)
         dst[i+0] = src1[i+0]-src2[i+0];
 }
+
+static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
+    // dst[i]= src2[i] - median(L, T, L+T-LT) with T=src1[i], L=src2[i-1], LT=src1[i-1], 8 bytes per pass.
+    // The loop loads L/LT for byte 0 from src2[-1]/src1[-1], so dst[0] is redone in C from *left / *left_top.
+    // It runs in whole 8 byte steps: up to 7 bytes past w are read from src1/src2 and written to dst.
+    // NOTE(review): no emms is issued here; presumably the caller resets the MMX state -- confirm.
+    int i=0;
+    const uint8_t l= *left, lt= *left_top;
+    if(w <= 0) return; // loop is bottom-tested with an unsigned compare (jb): it would not stop for w < 0
+    asm volatile(
+        "1:				\n\t"
+        "movq  -1(%1, %0), %%mm0	\n\t" // LT
+        "movq  (%1, %0), %%mm1		\n\t" // T
+        "movq  -1(%2, %0), %%mm2	\n\t" // L
+        "movq  (%2, %0), %%mm3		\n\t" // X
+        "movq %%mm2, %%mm4		\n\t" // L
+        "psubb %%mm0, %%mm2		\n\t"
+        "paddb %%mm1, %%mm2		\n\t" // L + T - LT
+        "movq %%mm4, %%mm5		\n\t" // L
+        "pmaxub %%mm1, %%mm4		\n\t" // max(T, L)
+        "pminub %%mm5, %%mm1		\n\t" // min(T, L)
+        "pminub %%mm2, %%mm4		\n\t" // min(max(T, L), L + T - LT)
+        "pmaxub %%mm1, %%mm4		\n\t" // median(L, T, L + T - LT)
+        "psubb %%mm4, %%mm3		\n\t" // X - pred
+        "movq %%mm3, (%3, %0)		\n\t"
+        "addl $8, %0			\n\t"
+        "cmpl %4, %0			\n\t"
+        " jb 1b				\n\t"
+        : "+r" (i)
+        : "r"(src1), "r"(src2), "r"(dst), "r"(w)
+        : "memory" // the asm stores to dst and loads src1/src2; keeps the dst[0] fix-up below ordered after it
+    );
+    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
+    *left_top= src1[w-1];
+    *left    = src2[w-1];
+}
+
 #define LBUTTERFLY2(a1,b1,a2,b2)\
     "paddw " #b1 ", " #a1 "		\n\t"\
     "paddw " #b2 ", " #a2 "		\n\t"\
@@ -1699,6 +1736,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
             SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
 #endif
+
+            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
         } else if (mm_flags & MM_3DNOW) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
             c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
author	Michael Niedermayer <michaelni@gmx.at>	2003-10-13 17:27:30 +0000
committer	Michael Niedermayer <michaelni@gmx.at>	2003-10-13 17:27:30 +0000
commit	84705403444dc440a41d6aea2d49bac7bdfdedff (patch)
tree	4250e821d043be1a0bf9849e8c768816a9631238 /libavcodec
parent	8c812d73178de487c658e343505d2212ba55bcb1 (diff)
download	ffmpeg-84705403444dc440a41d6aea2d49bac7bdfdedff.tar.gz