diff options
author | Baptiste Coudurier <baptiste.coudurier@gmail.com> | 2011-04-27 18:34:10 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-04-27 20:08:09 +0200 |
commit | 6d4c49a2afdb823418297bb9157890b80d45c0fe (patch) | |
tree | 840b69b53989450bfedf8ad38553092750a7ab5e | |
parent | d6f910ea47255b519e0b71c33d74c409a29ab3db (diff) | |
download | ffmpeg-6d4c49a2afdb823418297bb9157890b80d45c0fe.tar.gz |
Move png mmx functions into x86/png_mmx.c, remove them from DSPContext.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/dsputil.c | 16 |
-rw-r--r-- | libavcodec/dsputil.h | 2 |
-rw-r--r-- | libavcodec/png.h | 40 |
-rw-r--r-- | libavcodec/pngdec.c | 83 |
-rw-r--r-- | libavcodec/x86/Makefile | 1 |
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 99 |
-rw-r--r-- | libavcodec/x86/png_mmx.c | 141 |
7 files changed, 218 insertions, 164 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 90e6440a87..d8e83f8d7b 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -38,7 +38,6 @@ #include "config.h" #include "ac3dec.h" #include "vorbis.h" -#include "png.h" uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; uint32_t ff_squareTbl[512] = {0, }; @@ -1924,17 +1923,6 @@ static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ dst[i+0] += src[i+0]; } -static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ - long i; - for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ - long a = *(long*)(src1+i); - long b = *(long*)(src2+i); - *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); - } - for(; i<w; i++) - dst[i] = src1[i]+src2[i]; -} - static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ long i; #if !HAVE_FAST_UNALIGNED @@ -3087,7 +3075,6 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c; c->add_bytes= add_bytes_c; - c->add_bytes_l2= add_bytes_l2_c; c->diff_bytes= diff_bytes_c; c->add_hfyu_median_prediction= add_hfyu_median_prediction_c; c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c; @@ -3095,9 +3082,6 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c; c->bswap_buf= bswap_buf; c->bswap16_buf = bswap16_buf; -#if CONFIG_PNG_DECODER - c->add_png_paeth_prediction= ff_add_png_paeth_prediction; -#endif if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { c->h263_h_loop_filter= h263_h_loop_filter_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 8240372270..85c2da5031 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -371,7 +371,6 @@ typedef struct DSPContext { /* huffyuv specific */ void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w); - void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, 
int w); void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w); /** * subtract huffyuv's variant of median prediction @@ -382,7 +381,6 @@ typedef struct DSPContext { int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left); void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha); /* this might write to dst[w] */ - void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w); void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len); diff --git a/libavcodec/png.h b/libavcodec/png.h index bab5224851..d6fac3e673 100644 --- a/libavcodec/png.h +++ b/libavcodec/png.h @@ -23,6 +23,9 @@ #define AVCODEC_PNG_H #include <stdint.h> +#include <zlib.h> + +#include "avcodec.h" #define PNG_COLOR_MASK_PALETTE 1 #define PNG_COLOR_MASK_COLOR 2 @@ -69,4 +72,41 @@ int ff_png_pass_row_size(int pass, int bits_per_pixel, int width); void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); +typedef struct PNGDecContext { + const uint8_t *bytestream; + const uint8_t *bytestream_start; + const uint8_t *bytestream_end; + AVFrame picture1, picture2; + AVFrame *current_picture, *last_picture; + + int state; + int width, height; + int bit_depth; + int color_type; + int compression_type; + int interlace_type; + int filter_type; + int channels; + int bits_per_pixel; + int bpp; + + uint8_t *image_buf; + int image_linesize; + uint32_t palette[256]; + uint8_t *crow_buf; + uint8_t *last_row; + uint8_t *tmp_row; + int pass; + int crow_size; /* compressed row size (include filter type) */ + int row_size; /* decompressed row size */ + int pass_row_size; /* decompress row size of the current pass */ + int y; + z_stream zstream; + + void (*add_bytes_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w); + void 
(*add_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); +} PNGDecContext; + +void ff_png_init_mmx(PNGDecContext *s); + #endif /* AVCODEC_PNG_H */ diff --git a/libavcodec/pngdec.c b/libavcodec/pngdec.c index 2f9b343e5b..17fea6815b 100644 --- a/libavcodec/pngdec.c +++ b/libavcodec/pngdec.c @@ -18,11 +18,13 @@ * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ + +//#define DEBUG + #include "libavutil/imgutils.h" #include "avcodec.h" #include "bytestream.h" #include "png.h" -#include "dsputil.h" /* TODO: * - add 2, 4 and 16 bit depth support @@ -30,42 +32,6 @@ #include <zlib.h> -//#define DEBUG - -typedef struct PNGDecContext { - DSPContext dsp; - - const uint8_t *bytestream; - const uint8_t *bytestream_start; - const uint8_t *bytestream_end; - AVFrame picture1, picture2; - AVFrame *current_picture, *last_picture; - - int state; - int width, height; - int bit_depth; - int color_type; - int compression_type; - int interlace_type; - int filter_type; - int channels; - int bits_per_pixel; - int bpp; - - uint8_t *image_buf; - int image_linesize; - uint32_t palette[256]; - uint8_t *crow_buf; - uint8_t *last_row; - uint8_t *tmp_row; - int pass; - int crow_size; /* compressed row size (include filter type) */ - int row_size; /* decompressed row size */ - int pass_row_size; /* decompress row size of the current pass */ - int y; - z_stream zstream; -} PNGDecContext; - /* Mask to determine which y pixels can be written in a pass */ static const uint8_t png_pass_dsp_ymask[NB_PASSES] = { 0xff, 0xff, 0x0f, 0xcc, 0x33, 0xff, 0x55, @@ -134,7 +100,23 @@ static void png_put_interlaced_row(uint8_t *dst, int width, } } -void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp) +// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size +#define pb_7f (~0UL/255 * 0x7f) +#define pb_80 (~0UL/255 * 0x80) 
+ +static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w) +{ + long i; + for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ + long a = *(long*)(src1+i); + long b = *(long*)(src2+i); + *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); + } + for(; i<w; i++) + dst[i] = src1[i]+src2[i]; +} + +static void add_paeth_prediction_c(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp) { int i; for(i = 0; i < w; i++) { @@ -191,7 +173,7 @@ void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w } /* NOTE: 'dst' can be equal to 'last' */ -static void png_filter_row(DSPContext *dsp, uint8_t *dst, int filter_type, +static void png_filter_row(PNGDecContext *s, uint8_t *dst, int filter_type, uint8_t *src, uint8_t *last, int size, int bpp) { int i, p, r, g, b, a; @@ -217,7 +199,7 @@ static void png_filter_row(DSPContext *dsp, uint8_t *dst, int filter_type, } break; case PNG_FILTER_VALUE_UP: - dsp->add_bytes_l2(dst, src, last, size); + s->add_bytes_l2(dst, src, last, size); break; case PNG_FILTER_VALUE_AVG: for(i = 0; i < bpp; i++) { @@ -235,10 +217,10 @@ static void png_filter_row(DSPContext *dsp, uint8_t *dst, int filter_type, if(bpp > 1 && size > 4) { // would write off the end of the array if we let it process the last pixel with bpp=3 int w = bpp==4 ? 
size : size-3; - dsp->add_png_paeth_prediction(dst+i, src+i, last+i, w-i, bpp); + s->add_paeth_prediction(dst+i, src+i, last+i, w-i, bpp); i = w; } - ff_add_png_paeth_prediction(dst+i, src+i, last+i, size-i, bpp); + add_paeth_prediction_c(dst+i, src+i, last+i, size-i, bpp); break; } } @@ -291,7 +273,7 @@ static void png_handle_row(PNGDecContext *s) ptr = s->image_buf + s->image_linesize * s->y; /* need to swap bytes correctly for RGB_ALPHA */ if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) { - png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1, + png_filter_row(s, s->tmp_row, s->crow_buf[0], s->crow_buf + 1, s->last_row, s->row_size, s->bpp); convert_to_rgb32(ptr, s->tmp_row, s->width, s->filter_type == PNG_FILTER_TYPE_LOCO); FFSWAP(uint8_t*, s->last_row, s->tmp_row); @@ -302,7 +284,7 @@ static void png_handle_row(PNGDecContext *s) else last_row = ptr - s->image_linesize; - png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1, + png_filter_row(s, ptr, s->crow_buf[0], s->crow_buf + 1, last_row, s->row_size, s->bpp); } /* loco lags by 1 row so that it doesn't interfere with top prediction */ @@ -325,7 +307,7 @@ static void png_handle_row(PNGDecContext *s) wait for the next one */ if (got_line) break; - png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1, + png_filter_row(s, s->tmp_row, s->crow_buf[0], s->crow_buf + 1, s->last_row, s->pass_row_size, s->bpp); FFSWAP(uint8_t*, s->last_row, s->tmp_row); got_line = 1; @@ -633,14 +615,21 @@ static int decode_frame(AVCodecContext *avctx, goto the_end; } -static av_cold int png_dec_init(AVCodecContext *avctx){ +static av_cold int png_dec_init(AVCodecContext *avctx) +{ PNGDecContext *s = avctx->priv_data; s->current_picture = &s->picture1; s->last_picture = &s->picture2; avcodec_get_frame_defaults(&s->picture1); avcodec_get_frame_defaults(&s->picture2); - dsputil_init(&s->dsp, avctx); + + ff_png_init_mmx(s); + + if (!s->add_paeth_prediction) + s->add_paeth_prediction = 
add_paeth_prediction_c; + if (!s->add_bytes_l2) + s->add_bytes_l2 = add_bytes_l2_c; return 0; } diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index f8d456d3ea..6416e600a3 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -25,6 +25,7 @@ MMX-OBJS-$(CONFIG_MP2FLOAT_DECODER) += x86/mpegaudiodec_mmx.o MMX-OBJS-$(CONFIG_MP3FLOAT_DECODER) += x86/mpegaudiodec_mmx.o MMX-OBJS-$(CONFIG_MP3ON4FLOAT_DECODER) += x86/mpegaudiodec_mmx.o MMX-OBJS-$(CONFIG_MP3ADUFLOAT_DECODER) += x86/mpegaudiodec_mmx.o +MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/png_mmx.o MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 1b17d0ec7e..985a15d2f1 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -579,28 +579,6 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ dst[i+0] += src[i+0]; } -static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ - x86_reg i=0; - __asm__ volatile( - "jmp 2f \n\t" - "1: \n\t" - "movq (%2, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb (%3, %0), %%mm0 \n\t" - "paddb 8(%3, %0), %%mm1 \n\t" - "movq %%mm0, (%1, %0) \n\t" - "movq %%mm1, 8(%1, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %4, %0 \n\t" - " js 1b \n\t" - : "+r" (i) - : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) - ); - for(; i<w; i++) - dst[i] = src1[i] + src2[i]; -} - #if HAVE_7REGS && HAVE_TEN_OPERANDS static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) { x86_reg w2 = -w; @@ -876,80 +854,6 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, } } -#define PAETH(cpu, abs3)\ -static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ -{\ - x86_reg i = -bpp;\ - x86_reg end = 
w-3;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n"\ - "movd (%1,%0), %%mm0 \n"\ - "movd (%2,%0), %%mm1 \n"\ - "punpcklbw %%mm7, %%mm0 \n"\ - "punpcklbw %%mm7, %%mm1 \n"\ - "add %4, %0 \n"\ - "1: \n"\ - "movq %%mm1, %%mm2 \n"\ - "movd (%2,%0), %%mm1 \n"\ - "movq %%mm2, %%mm3 \n"\ - "punpcklbw %%mm7, %%mm1 \n"\ - "movq %%mm2, %%mm4 \n"\ - "psubw %%mm1, %%mm3 \n"\ - "psubw %%mm0, %%mm4 \n"\ - "movq %%mm3, %%mm5 \n"\ - "paddw %%mm4, %%mm5 \n"\ - abs3\ - "movq %%mm4, %%mm6 \n"\ - "pminsw %%mm5, %%mm6 \n"\ - "pcmpgtw %%mm6, %%mm3 \n"\ - "pcmpgtw %%mm5, %%mm4 \n"\ - "movq %%mm4, %%mm6 \n"\ - "pand %%mm3, %%mm4 \n"\ - "pandn %%mm3, %%mm6 \n"\ - "pandn %%mm0, %%mm3 \n"\ - "movd (%3,%0), %%mm0 \n"\ - "pand %%mm1, %%mm6 \n"\ - "pand %%mm4, %%mm2 \n"\ - "punpcklbw %%mm7, %%mm0 \n"\ - "movq %6, %%mm5 \n"\ - "paddw %%mm6, %%mm0 \n"\ - "paddw %%mm2, %%mm3 \n"\ - "paddw %%mm3, %%mm0 \n"\ - "pand %%mm5, %%mm0 \n"\ - "movq %%mm0, %%mm3 \n"\ - "packuswb %%mm3, %%mm3 \n"\ - "movd %%mm3, (%1,%0) \n"\ - "add %4, %0 \n"\ - "cmp %5, %0 \n"\ - "jle 1b \n"\ - :"+r"(i)\ - :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ - "m"(ff_pw_255)\ - :"memory"\ - );\ -} - -#define ABS3_MMX2\ - "psubw %%mm5, %%mm7 \n"\ - "pmaxsw %%mm7, %%mm5 \n"\ - "pxor %%mm6, %%mm6 \n"\ - "pxor %%mm7, %%mm7 \n"\ - "psubw %%mm3, %%mm6 \n"\ - "psubw %%mm4, %%mm7 \n"\ - "pmaxsw %%mm6, %%mm3 \n"\ - "pmaxsw %%mm7, %%mm4 \n"\ - "pxor %%mm7, %%mm7 \n" - -#define ABS3_SSSE3\ - "pabsw %%mm3, %%mm3 \n"\ - "pabsw %%mm4, %%mm4 \n"\ - "pabsw %%mm5, %%mm5 \n" - -PAETH(mmx2, ABS3_MMX2) -#if HAVE_SSSE3 -PAETH(ssse3, ABS3_SSSE3) -#endif - #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ @@ -2537,7 +2441,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #endif c->add_bytes= add_bytes_mmx; - c->add_bytes_l2= add_bytes_l2_mmx; if (!h264_high_depth) c->draw_edges = draw_edges_mmx; @@ -2658,7 
+2561,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; #endif - c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; } else if (mm_flags & AV_CPU_FLAG_3DNOW) { c->prefetch = prefetch_3dnow; @@ -2772,7 +2674,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 2, ssse3); H264_QPEL_FUNCS(3, 3, ssse3); } - c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; #if HAVE_YASM if (!h264_high_depth) { c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd; diff --git a/libavcodec/x86/png_mmx.c b/libavcodec/x86/png_mmx.c new file mode 100644 index 0000000000..cdf035962c --- /dev/null +++ b/libavcodec/x86/png_mmx.c @@ -0,0 +1,141 @@ +/* + * MMX optimized PNG utils + * Copyright (c) 2008 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/png.h" +#include "dsputil_mmx.h" + +//#undef NDEBUG +//#include <assert.h> + +static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w) +{ + x86_reg i=0; + __asm__ volatile( + "jmp 2f \n\t" + "1: \n\t" + "movq (%2, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "paddb (%3, %0), %%mm0 \n\t" + "paddb 8(%3, %0), %%mm1 \n\t" + "movq %%mm0, (%1, %0) \n\t" + "movq %%mm1, 8(%1, %0) \n\t" + "add $16, %0 \n\t" + "2: \n\t" + "cmp %4, %0 \n\t" + " js 1b \n\t" + : "+r" (i) + : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) + ); + for(; i<w; i++) + dst[i] = src1[i] + src2[i]; +} + +#define PAETH(cpu, abs3)\ +static void add_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ +{\ + x86_reg i = -bpp;\ + x86_reg end = w-3;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n"\ + "movd (%1,%0), %%mm0 \n"\ + "movd (%2,%0), %%mm1 \n"\ + "punpcklbw %%mm7, %%mm0 \n"\ + "punpcklbw %%mm7, %%mm1 \n"\ + "add %4, %0 \n"\ + "1: \n"\ + "movq %%mm1, %%mm2 \n"\ + "movd (%2,%0), %%mm1 \n"\ + "movq %%mm2, %%mm3 \n"\ + "punpcklbw %%mm7, %%mm1 \n"\ + "movq %%mm2, %%mm4 \n"\ + "psubw %%mm1, %%mm3 \n"\ + "psubw %%mm0, %%mm4 \n"\ + "movq %%mm3, %%mm5 \n"\ + "paddw %%mm4, %%mm5 \n"\ + abs3\ + "movq %%mm4, %%mm6 \n"\ + "pminsw %%mm5, %%mm6 \n"\ + "pcmpgtw %%mm6, %%mm3 \n"\ + "pcmpgtw %%mm5, %%mm4 \n"\ + "movq %%mm4, %%mm6 \n"\ + "pand %%mm3, %%mm4 \n"\ + "pandn %%mm3, %%mm6 \n"\ + "pandn %%mm0, %%mm3 \n"\ + "movd (%3,%0), %%mm0 \n"\ + "pand %%mm1, %%mm6 \n"\ + "pand %%mm4, %%mm2 \n"\ + "punpcklbw %%mm7, %%mm0 \n"\ + "movq %6, %%mm5 \n"\ + "paddw %%mm6, %%mm0 \n"\ + "paddw %%mm2, %%mm3 \n"\ + "paddw %%mm3, %%mm0 
\n"\ + "pand %%mm5, %%mm0 \n"\ + "movq %%mm0, %%mm3 \n"\ + "packuswb %%mm3, %%mm3 \n"\ + "movd %%mm3, (%1,%0) \n"\ + "add %4, %0 \n"\ + "cmp %5, %0 \n"\ + "jle 1b \n"\ + :"+r"(i)\ + :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ + "m"(ff_pw_255)\ + :"memory"\ + );\ +} + +#define ABS3_MMX2\ + "psubw %%mm5, %%mm7 \n"\ + "pmaxsw %%mm7, %%mm5 \n"\ + "pxor %%mm6, %%mm6 \n"\ + "pxor %%mm7, %%mm7 \n"\ + "psubw %%mm3, %%mm6 \n"\ + "psubw %%mm4, %%mm7 \n"\ + "pmaxsw %%mm6, %%mm3 \n"\ + "pmaxsw %%mm7, %%mm4 \n"\ + "pxor %%mm7, %%mm7 \n" + +#define ABS3_SSSE3\ + "pabsw %%mm3, %%mm3 \n"\ + "pabsw %%mm4, %%mm4 \n"\ + "pabsw %%mm5, %%mm5 \n" + +PAETH(mmx2, ABS3_MMX2) +#if HAVE_SSSE3 +PAETH(ssse3, ABS3_SSSE3) +#endif + +void ff_png_init_mmx(PNGDecContext *s) +{ + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & AV_CPU_FLAG_MMX2) { + s->add_bytes_l2 = add_bytes_l2_mmx; + s->add_paeth_prediction = add_paeth_prediction_mmx2; +#if HAVE_SSSE3 + if (mm_flags & AV_CPU_FLAG_SSSE3) + s->add_paeth_prediction = add_paeth_prediction_ssse3; +#endif + } +} |