diff options
author | Nolan L <nol888@gmail.com> | 2010-12-12 17:59:10 +0000 |
---|---|---|
committer | Stefano Sabatini <stefano.sabatini-lala@poste.it> | 2010-12-12 17:59:10 +0000 |
commit | d5f187fd3355ec6d4922d8479930c10d1b6f9ebf (patch) | |
tree | c95815e013f51c29f9bb1d2e44a2896d3d3f03b1 | |
parent | 9d845ca40cff56c8c0dc04cc76964b0573ef9796 (diff) | |
download | ffmpeg-d5f187fd3355ec6d4922d8479930c10d1b6f9ebf.tar.gz |
Add gradfun filter, ported from MPlayer.
Patch by Nolan L nol888 <=> gmail >=< com.
See thread:
Subject: [FFmpeg-devel] [PATCH] Port gradfun to libavfilter (GCI)
Date: Mon, 29 Nov 2010 07:18:14 -0500
Originally committed as revision 25942 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- | Changelog | 1 | ||||
-rw-r--r-- | doc/filters.texi | 29 | ||||
-rw-r--r-- | libavfilter/Makefile | 1 | ||||
-rw-r--r-- | libavfilter/allfilters.c | 1 | ||||
-rw-r--r-- | libavfilter/avfilter.h | 4 | ||||
-rw-r--r-- | libavfilter/gradfun.h | 48 | ||||
-rw-r--r-- | libavfilter/vf_gradfun.c | 253 | ||||
-rw-r--r-- | libavfilter/x86/Makefile | 1 | ||||
-rw-r--r-- | libavfilter/x86/gradfun.c | 162 |
9 files changed, 498 insertions, 2 deletions
@@ -64,6 +64,7 @@ version <next>: - hqdn3d filter added - RTP depacketization of QCELP - FLAC parser added +- gradfun filter added version 0.6: diff --git a/doc/filters.texi b/doc/filters.texi index c460aa951f..be3f8e180e 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -425,6 +425,35 @@ frei0r=perspective:0.2/0.2:0.8/0.2 For more information see: @url{http://piksel.org/frei0r} +@section gradfun + +Fix the banding artifacts that are sometimes introduced into nearly flat +regions by truncation to 8bit colordepth. +Interpolate the gradients that should go where the bands are, and +dither them. + +The filter takes two optional parameters, separated by ':': +@var{strength}:@var{radius} + +@var{strength} is the maximum amount by which the filter will change +any one pixel. Also the threshold for detecting nearly flat +regions. Acceptable values range from .51 to 255, default value is +1.2, out-of-range values will be clipped to the valid range. + +@var{radius} is the neighborhood to fit the gradient to. A larger +radius makes for smoother gradients, but also prevents the filter from +modifying the pixels near detailed regions. Acceptable values are +8-32, default value is 16, out-of-range values will be clipped to the +valid range. + +@example +# default parameters +gradfun=1.2:16 + +# omitting radius +gradfun=1.2 +@end example + @section hflip Flip the input video horizontally. 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile index aece3abd07..14666458f8 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -26,6 +26,7 @@ OBJS-$(CONFIG_DRAWBOX_FILTER) += vf_drawbox.o OBJS-$(CONFIG_FIFO_FILTER) += vf_fifo.o OBJS-$(CONFIG_FORMAT_FILTER) += vf_format.o OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o +OBJS-$(CONFIG_GRADFUN_FILTER) += vf_gradfun.o OBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o OBJS-$(CONFIG_HQDN3D_FILTER) += vf_hqdn3d.o OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 8ce4f1b168..7f7e46d448 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -47,6 +47,7 @@ void avfilter_register_all(void) REGISTER_FILTER (FIFO, fifo, vf); REGISTER_FILTER (FORMAT, format, vf); REGISTER_FILTER (FREI0R, frei0r, vf); + REGISTER_FILTER (GRADFUN, gradfun, vf); REGISTER_FILTER (HFLIP, hflip, vf); REGISTER_FILTER (HQDN3D, hqdn3d, vf); REGISTER_FILTER (NOFORMAT, noformat, vf); diff --git a/libavfilter/avfilter.h b/libavfilter/avfilter.h index 54cd9050f1..a4bc978eba 100644 --- a/libavfilter/avfilter.h +++ b/libavfilter/avfilter.h @@ -27,8 +27,8 @@ #include "libavcore/samplefmt.h" #define LIBAVFILTER_VERSION_MAJOR 1 -#define LIBAVFILTER_VERSION_MINOR 68 -#define LIBAVFILTER_VERSION_MICRO 1 +#define LIBAVFILTER_VERSION_MINOR 69 +#define LIBAVFILTER_VERSION_MICRO 0 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \ LIBAVFILTER_VERSION_MINOR, \ diff --git a/libavfilter/gradfun.h b/libavfilter/gradfun.h new file mode 100644 index 0000000000..3dacbcb252 --- /dev/null +++ b/libavfilter/gradfun.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 Nolan Lum <nol888@gmail.com> + * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_GRADFUN_H +#define AVFILTER_GRADFUN_H + +#include "avfilter.h" + +/// Holds instance-specific information for gradfun. +typedef struct { + int thresh; ///< threshold for gradient algorithm + int radius; ///< blur radius + int chroma_w; ///< width of the chroma planes + int chroma_h; ///< height of the chroma planes + int chroma_r; ///< blur radius for the chroma planes + uint16_t *buf; ///< holds image data for blur algorithm passed into filter. + /// DSP functions. 
+ void (*filter_line) (uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); + void (*blur_line) (uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width); +} GradFunContext; + +void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); +void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width); + +void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); +void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); + +void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width); + +#endif /* AVFILTER_GRADFUN_H */ diff --git a/libavfilter/vf_gradfun.c b/libavfilter/vf_gradfun.c new file mode 100644 index 0000000000..1cbf8d8c2a --- /dev/null +++ b/libavfilter/vf_gradfun.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2010 Nolan Lum <nol888@gmail.com> + * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * gradfun debanding filter, ported from MPlayer + * libmpcodecs/vf_gradfun.c + * + * Apply a boxblur debanding algorithm (based on the gradfun2db + * Avisynth filter by prunedtree). + * Foreach pixel, if it's within threshold of the blurred value, make it closer. + * So now we have a smoothed and higher bitdepth version of all the shallow + * gradients, while leaving detailed areas untouched. + * Dither it back to 8bit. + */ + +#include "libavcore/imgutils.h" +#include "libavutil/cpu.h" +#include "libavutil/pixdesc.h" +#include "avfilter.h" +#include "gradfun.h" + +DECLARE_ALIGNED(16, static const uint16_t, dither)[8][8] = { + {0x00,0x60,0x18,0x78,0x06,0x66,0x1E,0x7E}, + {0x40,0x20,0x58,0x38,0x46,0x26,0x5E,0x3E}, + {0x10,0x70,0x08,0x68,0x16,0x76,0x0E,0x6E}, + {0x50,0x30,0x48,0x28,0x56,0x36,0x4E,0x2E}, + {0x04,0x64,0x1C,0x7C,0x02,0x62,0x1A,0x7A}, + {0x44,0x24,0x5C,0x3C,0x42,0x22,0x5A,0x3A}, + {0x14,0x74,0x0C,0x6C,0x12,0x72,0x0A,0x6A}, + {0x54,0x34,0x4C,0x2C,0x52,0x32,0x4A,0x2A}, +}; + +void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) +{ + int x; + for (x = 0; x < width; x++, dc += x & 1) { + int pix = src[x] << 7; + int delta = dc[0] - pix; + int m = abs(delta) * thresh >> 16; + m = FFMAX(0, 127 - m); + m = m * m * delta >> 14; + pix += m + dithers[x & 7]; + dst[x] = av_clip_uint8(pix >> 7); + } +} + +void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width) +{ + int x, v, old; + for (x = 0; x < width; x++) { + v = buf1[x] + src[2 * x] + src[2 * x + 1] + src[2 * x + src_linesize] + src[2 * x + 1 + src_linesize]; + old = buf[x]; + buf[x] = v; + dc[x] = v - old; + } +} + +static void 
filter(GradFunContext *ctx, uint8_t *dst, uint8_t *src, int width, int height, int dst_linesize, int src_linesize, int r) +{ + int bstride = FFALIGN(width, 16) / 2; + int y; + uint32_t dc_factor = (1 << 21) / (r * r); + uint16_t *dc = ctx->buf + 16; + uint16_t *buf = ctx->buf + bstride + 32; + int thresh = ctx->thresh; + + memset(dc, 0, (bstride + 16) * sizeof(*buf)); + for (y = 0; y < r; y++) + ctx->blur_line(dc, buf + y * bstride, buf + (y - 1) * bstride, src + 2 * y * src_linesize, src_linesize, width / 2); + for (;;) { + if (y < height - r) { + int mod = ((y + r) / 2) % r; + uint16_t *buf0 = buf + mod * bstride; + uint16_t *buf1 = buf + (mod ? mod - 1 : r - 1) * bstride; + int x, v; + ctx->blur_line(dc, buf0, buf1, src + (y + r) * src_linesize, src_linesize, width / 2); + for (x = v = 0; x < r; x++) + v += dc[x]; + for (; x < width / 2; x++) { + v += dc[x] - dc[x-r]; + dc[x-r] = v * dc_factor >> 16; + } + for (; x < (width + r + 1) / 2; x++) + dc[x-r] = v * dc_factor >> 16; + for (x = -r / 2; x < 0; x++) + dc[x] = dc[0]; + } + if (y == r) { + for (y = 0; y < r; y++) + ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]); + } + ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]); + if (++y >= height) break; + ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]); + if (++y >= height) break; + } +} + +static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque) +{ + GradFunContext *gf = ctx->priv; + float thresh = 1.2; + int radius = 16; + av_unused int cpu_flags = av_get_cpu_flags(); + + if (args) + sscanf(args, "%f:%d", &thresh, &radius); + + thresh = av_clipf(thresh, 0.51, 255); + gf->thresh = (1 << 15) / thresh; + gf->radius = av_clip((radius + 1) & ~1, 4, 32); + + gf->blur_line = ff_gradfun_blur_line_c; + gf->filter_line = ff_gradfun_filter_line_c; + + if (HAVE_MMX && cpu_flags & 
AV_CPU_FLAG_MMX2) + gf->filter_line = ff_gradfun_filter_line_mmx2; + if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3) + gf->filter_line = ff_gradfun_filter_line_ssse3; + if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2) + gf->blur_line = ff_gradfun_blur_line_sse2; + + av_log(ctx, AV_LOG_INFO, "threshold:%.2f radius:%d\n", thresh, gf->radius); + + return 0; +} + +static av_cold void uninit(AVFilterContext *ctx) +{ + GradFunContext *gf = ctx->priv; + av_freep(&gf->buf); +} + +static int query_formats(AVFilterContext *ctx) +{ + static const enum PixelFormat pix_fmts[] = { + PIX_FMT_YUV410P, PIX_FMT_YUV420P, + PIX_FMT_GRAY8, PIX_FMT_NV12, + PIX_FMT_NV21, PIX_FMT_YUV444P, + PIX_FMT_YUV422P, PIX_FMT_YUV411P, + PIX_FMT_NONE + }; + + avfilter_set_common_formats(ctx, avfilter_make_format_list(pix_fmts)); + + return 0; +} + +static int config_input(AVFilterLink *inlink) +{ + GradFunContext *gf = inlink->dst->priv; + int hsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_w; + int vsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_h; + + gf->buf = av_mallocz((FFALIGN(inlink->w, 16) * (gf->radius + 1) / 2 + 32) * sizeof(uint16_t)); + if (!gf->buf) + return AVERROR(ENOMEM); + + gf->chroma_w = -((-inlink->w) >> hsub); + gf->chroma_h = -((-inlink->h) >> vsub); + gf->chroma_r = av_clip(((((gf->radius >> hsub) + (gf->radius >> vsub)) / 2 ) + 1) & ~1, 4, 32); + + return 0; +} + +static void start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref) +{ + AVFilterLink *outlink = inlink->dst->outputs[0]; + AVFilterBufferRef *outpicref; + + if (inpicref->perms & AV_PERM_PRESERVE) { + outpicref = avfilter_get_video_buffer(outlink, AV_PERM_WRITE, outlink->w, outlink->h); + avfilter_copy_buffer_ref_props(outpicref, inpicref); + outpicref->video->w = outlink->w; + outpicref->video->h = outlink->h; + } else + outpicref = inpicref; + + outlink->out_buf = outpicref; + avfilter_start_frame(outlink, avfilter_ref_buffer(outpicref, ~0)); +} + +static void null_draw_slice(AVFilterLink 
*link, int y, int h, int slice_dir) { } + +static void end_frame(AVFilterLink *inlink) +{ + GradFunContext *gf = inlink->dst->priv; + AVFilterBufferRef *inpic = inlink->cur_buf; + AVFilterLink *outlink = inlink->dst->outputs[0]; + AVFilterBufferRef *outpic = outlink->out_buf; + int p; + + for (p = 0; p < 4 && inpic->data[p]; p++) { + int w = inlink->w; + int h = inlink->h; + int r = gf->radius; + if (p) { + w = gf->chroma_w; + h = gf->chroma_h; + r = gf->chroma_r; + } + + if (FFMIN(w, h) > 2 * r) + filter(gf, outpic->data[p], inpic->data[p], w, h, outpic->linesize[p], inpic->linesize[p], r); + else if (outpic->data[p] != inpic->data[p]) + av_image_copy_plane(outpic->data[p], outpic->linesize[p], inpic->data[p], inpic->linesize[p], w, h); + } + + avfilter_draw_slice(outlink, 0, inlink->h, 1); + avfilter_end_frame(outlink); + avfilter_unref_buffer(inpic); + avfilter_unref_buffer(outpic); +} + +AVFilter avfilter_vf_gradfun = { + .name = "gradfun", + .description = NULL_IF_CONFIG_SMALL("Debands video quickly using gradients."), + .priv_size = sizeof(GradFunContext), + .init = init, + .uninit = uninit, + .query_formats = query_formats, + + .inputs = (AVFilterPad[]) {{ .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = config_input, + .start_frame = start_frame, + .draw_slice = null_draw_slice, + .end_frame = end_frame, + .min_perms = AV_PERM_READ, }, + { .name = NULL}}, + .outputs = (AVFilterPad[]) {{ .name = "default", + .type = AVMEDIA_TYPE_VIDEO, }, + { .name = NULL}}, +}; diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 716048cca2..e98693d654 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1 +1,2 @@ MMX-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o +MMX-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/gradfun.o diff --git a/libavfilter/x86/gradfun.c b/libavfilter/x86/gradfun.c new file mode 100644 index 0000000000..894a44b9ff --- /dev/null +++ b/libavfilter/x86/gradfun.c @@ -0,0 +1,162 @@ +/* + * This file is part of 
FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86_cpu.h" +#include "libavfilter/gradfun.h" + +DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; +DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; + +void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) +{ +#if HAVE_MMX + intptr_t x; + if (width & 3) { + x = width & ~3; + ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); + width = x; + } + x = -width; + __asm__ volatile( + "movd %4, %%mm5 \n" + "pxor %%mm7, %%mm7 \n" + "pshufw $0, %%mm5, %%mm5 \n" + "movq %6, %%mm6 \n" + "movq %5, %%mm4 \n" + "1: \n" + "movd (%2,%0), %%mm0 \n" + "movd (%3,%0), %%mm1 \n" + "punpcklbw %%mm7, %%mm0 \n" + "punpcklwd %%mm1, %%mm1 \n" + "psllw $7, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "psubw %%mm0, %%mm1 \n" // delta = dc - pix + "psubw %%mm1, %%mm2 \n" + "pmaxsw %%mm1, %%mm2 \n" + "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16 + "psubw %%mm6, %%mm2 \n" + "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m) + "pmullw %%mm2, %%mm2 \n" + "paddw %%mm4, %%mm0 \n" // pix += dither + "pmulhw %%mm2, %%mm1 \n" + "psllw $2, %%mm1 
\n" // m = m*m*delta >> 14 + "paddw %%mm1, %%mm0 \n" // pix += m + "psraw $7, %%mm0 \n" + "packuswb %%mm0, %%mm0 \n" + "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7) + "add $4, %0 \n" + "jl 1b \n" + "emms \n" + :"+r"(x) + :"r"(dst+width), "r"(src+width), "r"(dc+width/2), + "rm"(thresh), "m"(*dithers), "m"(*pw_7f) + :"memory" + ); +#endif +} + +void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) +{ +#if HAVE_SSSE3 + intptr_t x; + if (width & 7) { + // could be 10% faster if I somehow eliminated this + x = width & ~7; + ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); + width = x; + } + x = -width; + __asm__ volatile( + "movd %4, %%xmm5 \n" + "pxor %%xmm7, %%xmm7 \n" + "pshuflw $0,%%xmm5, %%xmm5 \n" + "movdqa %6, %%xmm6 \n" + "punpcklqdq %%xmm5, %%xmm5 \n" + "movdqa %5, %%xmm4 \n" + "1: \n" + "movq (%2,%0), %%xmm0 \n" + "movq (%3,%0), %%xmm1 \n" + "punpcklbw %%xmm7, %%xmm0 \n" + "punpcklwd %%xmm1, %%xmm1 \n" + "psllw $7, %%xmm0 \n" + "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix + "pabsw %%xmm1, %%xmm2 \n" + "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16 + "psubw %%xmm6, %%xmm2 \n" + "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m) + "pmullw %%xmm2, %%xmm2 \n" + "psllw $1, %%xmm2 \n" + "paddw %%xmm4, %%xmm0 \n" // pix += dither + "pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14 + "paddw %%xmm1, %%xmm0 \n" // pix += m + "psraw $7, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7) + "add $8, %0 \n" + "jl 1b \n" + :"+&r"(x) + :"r"(dst+width), "r"(src+width), "r"(dc+width/2), + "rm"(thresh), "m"(*dithers), "m"(*pw_7f) + :"memory" + ); +#endif // HAVE_SSSE3 +} + +void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width) +{ +#if HAVE_SSE +#define BLURV(load)\ + intptr_t x = -2*width;\ + __asm__ volatile(\ + "movdqa %6, %%xmm7 \n"\ + "1: \n"\ + load" 
(%4,%0), %%xmm0 \n"\ + load" (%5,%0), %%xmm1 \n"\ + "movdqa %%xmm0, %%xmm2 \n"\ + "movdqa %%xmm1, %%xmm3 \n"\ + "psrlw $8, %%xmm0 \n"\ + "psrlw $8, %%xmm1 \n"\ + "pand %%xmm7, %%xmm2 \n"\ + "pand %%xmm7, %%xmm3 \n"\ + "paddw %%xmm1, %%xmm0 \n"\ + "paddw %%xmm3, %%xmm2 \n"\ + "paddw %%xmm2, %%xmm0 \n"\ + "paddw (%2,%0), %%xmm0 \n"\ + "movdqa (%1,%0), %%xmm1 \n"\ + "movdqa %%xmm0, (%1,%0) \n"\ + "psubw %%xmm1, %%xmm0 \n"\ + "movdqa %%xmm0, (%3,%0) \n"\ + "add $16, %0 \n"\ + "jl 1b \n"\ + :"+&r"(x)\ + :"r"(buf+width),\ + "r"(buf1+width),\ + "r"(dc+width),\ + "r"(src+width*2),\ + "r"(src+width*2+src_linesize),\ + "m"(*pw_ff)\ + :"memory"\ + ); + if (((intptr_t) src | src_linesize) & 15) { + BLURV("movdqu"); + } else { + BLURV("movdqa"); + } +#endif // HAVE_SSE +} |