aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNolan L <nol888@gmail.com>2010-12-12 17:59:10 +0000
committerStefano Sabatini <stefano.sabatini-lala@poste.it>2010-12-12 17:59:10 +0000
commitd5f187fd3355ec6d4922d8479930c10d1b6f9ebf (patch)
treec95815e013f51c29f9bb1d2e44a2896d3d3f03b1
parent9d845ca40cff56c8c0dc04cc76964b0573ef9796 (diff)
downloadffmpeg-d5f187fd3355ec6d4922d8479930c10d1b6f9ebf.tar.gz
Add gradfun filter, ported from MPlayer.
Patch by Nolan L nol888 <=> gmail >=< com. See thread: Subject: [FFmpeg-devel] [PATCH] Port gradfun to libavfilter (GCI) Date: Mon, 29 Nov 2010 07:18:14 -0500 Originally committed as revision 25942 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--Changelog1
-rw-r--r--doc/filters.texi29
-rw-r--r--libavfilter/Makefile1
-rw-r--r--libavfilter/allfilters.c1
-rw-r--r--libavfilter/avfilter.h4
-rw-r--r--libavfilter/gradfun.h48
-rw-r--r--libavfilter/vf_gradfun.c253
-rw-r--r--libavfilter/x86/Makefile1
-rw-r--r--libavfilter/x86/gradfun.c162
9 files changed, 498 insertions, 2 deletions
diff --git a/Changelog b/Changelog
index 83c2e48c32..dcefc996c6 100644
--- a/Changelog
+++ b/Changelog
@@ -64,6 +64,7 @@ version <next>:
- hqdn3d filter added
- RTP depacketization of QCELP
- FLAC parser added
+- gradfun filter added
version 0.6:
diff --git a/doc/filters.texi b/doc/filters.texi
index c460aa951f..be3f8e180e 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -425,6 +425,35 @@ frei0r=perspective:0.2/0.2:0.8/0.2
For more information see:
@url{http://piksel.org/frei0r}
+@section gradfun
+
+Fix the banding artifacts that are sometimes introduced into nearly flat
+regions by truncation to 8-bit color depth.
+Interpolate the gradients that should go where the bands are, and
+dither them.
+
+The filter takes two optional parameters, separated by ':':
+@var{strength}:@var{radius}
+
+@var{strength} is the maximum amount by which the filter will change
+any one pixel. Also the threshold for detecting nearly flat
+regions. Acceptable values range from .51 to 255, default value is
+1.2, out-of-range values will be clipped to the valid range.
+
+@var{radius} is the neighborhood to fit the gradient to. A larger
+radius makes for smoother gradients, but also prevents the filter from
+modifying the pixels near detailed regions. Acceptable values are
+4-32, default value is 16. Odd values are rounded up to the next even
+value, and out-of-range values will be clipped to the valid range.
+
+@example
+# default parameters
+gradfun=1.2:16
+
+# omitting radius
+gradfun=1.2
+@end example
+
@section hflip
Flip the input video horizontally.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index aece3abd07..14666458f8 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -26,6 +26,7 @@ OBJS-$(CONFIG_DRAWBOX_FILTER) += vf_drawbox.o
OBJS-$(CONFIG_FIFO_FILTER) += vf_fifo.o
OBJS-$(CONFIG_FORMAT_FILTER) += vf_format.o
OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o
+OBJS-$(CONFIG_GRADFUN_FILTER) += vf_gradfun.o
OBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o
OBJS-$(CONFIG_HQDN3D_FILTER) += vf_hqdn3d.o
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 8ce4f1b168..7f7e46d448 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -47,6 +47,7 @@ void avfilter_register_all(void)
REGISTER_FILTER (FIFO, fifo, vf);
REGISTER_FILTER (FORMAT, format, vf);
REGISTER_FILTER (FREI0R, frei0r, vf);
+ REGISTER_FILTER (GRADFUN, gradfun, vf);
REGISTER_FILTER (HFLIP, hflip, vf);
REGISTER_FILTER (HQDN3D, hqdn3d, vf);
REGISTER_FILTER (NOFORMAT, noformat, vf);
diff --git a/libavfilter/avfilter.h b/libavfilter/avfilter.h
index 54cd9050f1..a4bc978eba 100644
--- a/libavfilter/avfilter.h
+++ b/libavfilter/avfilter.h
@@ -27,8 +27,8 @@
#include "libavcore/samplefmt.h"
#define LIBAVFILTER_VERSION_MAJOR 1
-#define LIBAVFILTER_VERSION_MINOR 68
-#define LIBAVFILTER_VERSION_MICRO 1
+#define LIBAVFILTER_VERSION_MINOR 69
+#define LIBAVFILTER_VERSION_MICRO 0
#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
LIBAVFILTER_VERSION_MINOR, \
diff --git a/libavfilter/gradfun.h b/libavfilter/gradfun.h
new file mode 100644
index 0000000000..3dacbcb252
--- /dev/null
+++ b/libavfilter/gradfun.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
+ * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_GRADFUN_H
+#define AVFILTER_GRADFUN_H
+
+#include "avfilter.h"
+
+/// Holds instance-specific information for gradfun.
+typedef struct {
+ int thresh; ///< threshold for gradient algorithm, stored as (1 << 15) / strength
+ int radius; ///< blur radius (luma plane)
+ int chroma_w; ///< width of the chroma planes
+ int chroma_h; ///< height of the chroma planes
+ int chroma_r; ///< blur radius for the chroma planes
+ uint16_t *buf; ///< scratch memory for the blur: the dc row followed by the rows of sums used by filter()
+ /// DSP functions.
+ void (*filter_line) (uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+ void (*blur_line) (uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+} GradFunContext;
+
+void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+
+void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+
+void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+
+#endif /* AVFILTER_GRADFUN_H */
diff --git a/libavfilter/vf_gradfun.c b/libavfilter/vf_gradfun.c
new file mode 100644
index 0000000000..1cbf8d8c2a
--- /dev/null
+++ b/libavfilter/vf_gradfun.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
+ * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * gradfun debanding filter, ported from MPlayer
+ * libmpcodecs/vf_gradfun.c
+ *
+ * Apply a boxblur debanding algorithm (based on the gradfun2db
+ * Avisynth filter by prunedtree).
+ * For each pixel, if it's within threshold of the blurred value, make it closer.
+ * So now we have a smoothed and higher bitdepth version of all the shallow
+ * gradients, while leaving detailed areas untouched.
+ * Dither it back to 8bit.
+ */
+
+#include "libavcore/imgutils.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "avfilter.h"
+#include "gradfun.h"
+
+DECLARE_ALIGNED(16, static const uint16_t, dither)[8][8] = { /* 8x8 dither offsets: filter() picks the row with y & 7, the C filter_line picks the entry with x & 7 */
+ {0x00,0x60,0x18,0x78,0x06,0x66,0x1E,0x7E},
+ {0x40,0x20,0x58,0x38,0x46,0x26,0x5E,0x3E},
+ {0x10,0x70,0x08,0x68,0x16,0x76,0x0E,0x6E},
+ {0x50,0x30,0x48,0x28,0x56,0x36,0x4E,0x2E},
+ {0x04,0x64,0x1C,0x7C,0x02,0x62,0x1A,0x7A},
+ {0x44,0x24,0x5C,0x3C,0x42,0x22,0x5A,0x3A},
+ {0x14,0x74,0x0C,0x6C,0x12,0x72,0x0A,0x6A},
+ {0x54,0x34,0x4C,0x2C,0x52,0x32,0x4A,0x2A},
+}; /* every entry is below 1 << 7, i.e. less than one 8-bit step in the << 7 fixed-point scale used by filter_line */
+
+void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{ /* Portable filter_line: move each of the width pixels of src toward the matching dc value, add dither, store the 8-bit result in dst. */
+ int x;
+ for (x = 0; x < width; x++, dc += x & 1) { /* dc steps forward whenever the incremented x is odd, so one dc entry serves two pixels */
+ int pix = src[x] << 7; /* pixel with 7 fractional bits */
+ int delta = dc[0] - pix; /* signed difference between the dc value and the pixel */
+ int m = abs(delta) * thresh >> 16; /* |delta| scaled by the threshold factor */
+ m = FFMAX(0, 127 - m); /* weight: 127 for delta == 0, falling to 0 as |delta| grows */
+ m = m * m * delta >> 14; /* correction = squared weight times delta; 0 once the weight hit 0 */
+ pix += m + dithers[x & 7]; /* the 8 dither entries repeat along the row */
+ dst[x] = av_clip_uint8(pix >> 7); /* drop the fractional bits and clamp to 0..255 */
+ }
+}
+
+void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
+{ /* Portable blur_line: for each of width outputs, add a 2x2 source block to buf1[x], store that in buf[x] and write the change of buf[x] to dc[x]. */
+ int x, v, old;
+ for (x = 0; x < width; x++) { /* width counts outputs; each one consumes 2 pixels from src and 2 from the row src_linesize further on */
+ v = buf1[x] + src[2 * x] + src[2 * x + 1] + src[2 * x + src_linesize] + src[2 * x + 1 + src_linesize]; /* buf1 value plus the 2x2 block sum */
+ old = buf[x]; /* value this slot held before being overwritten */
+ buf[x] = v;
+ dc[x] = v - old; /* stored as uint16_t, so the difference is truncated to 16 bits */
+ }
+}
+
+static void filter(GradFunContext *ctx, uint8_t *dst, uint8_t *src, int width, int height, int dst_linesize, int src_linesize, int r)
+{ /* Deband one width x height plane from src into dst with blur radius r, using ctx->buf as scratch and the ctx->blur_line / ctx->filter_line callbacks. */
+ int bstride = FFALIGN(width, 16) / 2; /* entries per scratch row: width rounded up to a multiple of 16, one entry per 2 pixels */
+ int y;
+ uint32_t dc_factor = (1 << 21) / (r * r); /* multiplier applied (followed by >> 16) to every box sum stored in dc */
+ uint16_t *dc = ctx->buf + 16; /* dc row starts 16 entries in, leaving room for the dc[-r/2 .. -1] entries written below */
+ uint16_t *buf = ctx->buf + bstride + 32; /* rows of sums start after the dc row */
+ int thresh = ctx->thresh;
+
+ memset(dc, 0, (bstride + 16) * sizeof(*buf)); /* clears from dc up to the start of buf; for y == 0 the buf1 argument below (buf - bstride) points into this cleared area */
+ for (y = 0; y < r; y++) /* prime r scratch rows; call y reads the source rows starting at 2 * y */
+ ctx->blur_line(dc, buf + y * bstride, buf + (y - 1) * bstride, src + 2 * y * src_linesize, src_linesize, width / 2);
+ for (;;) { /* each pass refreshes dc at most once and emits two output rows; entered with y == r */
+ if (y < height - r) { /* refresh dc only while source row y + r lies inside the plane */
+ int mod = ((y + r) / 2) % r; /* scratch row (out of r, reused cyclically) that receives the new sums */
+ uint16_t *buf0 = buf + mod * bstride;
+ uint16_t *buf1 = buf + (mod ? mod - 1 : r - 1) * bstride; /* the row before buf0, wrapping from 0 to r - 1 */
+ int x, v;
+ ctx->blur_line(dc, buf0, buf1, src + (y + r) * src_linesize, src_linesize, width / 2);
+ for (x = v = 0; x < r; x++) /* v = sum of the first r dc entries */
+ v += dc[x];
+ for (; x < width / 2; x++) { /* slide the r-entry window along the row */
+ v += dc[x] - dc[x-r];
+ dc[x-r] = v * dc_factor >> 16; /* store the scaled sum in place, r entries behind the read position */
+ }
+ for (; x < (width + r + 1) / 2; x++) /* right edge: repeat the last window sum */
+ dc[x-r] = v * dc_factor >> 16;
+ for (x = -r / 2; x < 0; x++) /* left edge: copy dc[0] into the r / 2 entries before the row start */
+ dc[x] = dc[0];
+ }
+ if (y == r) { /* first pass only: emit rows 0 .. r - 1 with the first dc row; the loop leaves y == r again */
+ for (y = 0; y < r; y++)
+ ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
+ }
+ ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]); /* first of the two rows sharing this dc row */
+ if (++y >= height) break;
+ ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]); /* second row sharing this dc row */
+ if (++y >= height) break;
+ }
+}
+
+static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
+{ /* Parse the optional "strength:radius" argument string, store the derived settings in the GradFunContext and pick the line functions. Always returns 0. */
+ GradFunContext *gf = ctx->priv;
+ float thresh = 1.2; /* default strength */
+ int radius = 16; /* default radius */
+ av_unused int cpu_flags = av_get_cpu_flags();
+
+ if (args)
+ sscanf(args, "%f:%d", &thresh, &radius); /* NOTE(review): result ignored, so fields that fail to parse silently keep their defaults */
+
+ thresh = av_clipf(thresh, 0.51, 255);
+ gf->thresh = (1 << 15) / thresh; /* stored as a reciprocal factor, truncated to int */
+ gf->radius = av_clip((radius + 1) & ~1, 4, 32); /* round up to an even value, then clamp to 4..32 */
+
+ gf->blur_line = ff_gradfun_blur_line_c; /* portable defaults, replaced below when the CPU flags allow */
+ gf->filter_line = ff_gradfun_filter_line_c;
+
+ if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX2) /* NOTE(review): build guard is HAVE_MMX while the run-time flag is MMX2 - confirm this pairing is intended */
+ gf->filter_line = ff_gradfun_filter_line_mmx2;
+ if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3) /* checked after MMX2, so SSSE3 wins when both are available */
+ gf->filter_line = ff_gradfun_filter_line_ssse3;
+ if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2) /* NOTE(review): build guard is HAVE_SSE while the run-time flag is SSE2 - confirm this pairing is intended */
+ gf->blur_line = ff_gradfun_blur_line_sse2;
+
+ av_log(ctx, AV_LOG_INFO, "threshold:%.2f radius:%d\n", thresh, gf->radius); /* logs the clipped strength, not gf->thresh */
+
+ return 0;
+}
+
+static av_cold void uninit(AVFilterContext *ctx)
+{ /* Release the scratch buffer allocated in config_input(). */
+ GradFunContext *gf = ctx->priv;
+ av_freep(&gf->buf);
+}
+
+static int query_formats(AVFilterContext *ctx)
+{ /* Register the pixel formats listed below as the formats this filter accepts. Always returns 0. */
+ static const enum PixelFormat pix_fmts[] = {
+ PIX_FMT_YUV410P, PIX_FMT_YUV420P,
+ PIX_FMT_GRAY8, PIX_FMT_NV12, /* NOTE(review): NV12/NV21 presumably keep both chroma components interleaved in one plane, while end_frame() handles every plane after the first as chroma_w x chroma_h - confirm these two formats are processed correctly */
+ PIX_FMT_NV21, PIX_FMT_YUV444P,
+ PIX_FMT_YUV422P, PIX_FMT_YUV411P,
+ PIX_FMT_NONE /* list terminator */
+ };
+
+ avfilter_set_common_formats(ctx, avfilter_make_format_list(pix_fmts));
+
+ return 0;
+}
+
+static int config_input(AVFilterLink *inlink)
+{ /* Allocate the scratch buffer for the input size and derive the chroma plane dimensions and radius. Returns 0, or AVERROR(ENOMEM) if the allocation fails. */
+ GradFunContext *gf = inlink->dst->priv;
+ int hsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_w; /* horizontal chroma shift */
+ int vsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_h; /* vertical chroma shift */
+
+ gf->buf = av_mallocz((FFALIGN(inlink->w, 16) * (gf->radius + 1) / 2 + 32) * sizeof(uint16_t)); /* sized from the luma width and radius; NOTE(review): any previous gf->buf is overwritten without being freed - confirm this callback cannot run twice */
+ if (!gf->buf)
+ return AVERROR(ENOMEM);
+
+ gf->chroma_w = -((-inlink->w) >> hsub); /* w / 2^hsub rounded up */
+ gf->chroma_h = -((-inlink->h) >> vsub); /* h / 2^vsub rounded up */
+ gf->chroma_r = av_clip(((((gf->radius >> hsub) + (gf->radius >> vsub)) / 2 ) + 1) & ~1, 4, 32); /* mean of the two shifted radii, rounded up to even, clamped to 4..32 */
+
+ return 0;
+}
+
+static void start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
+{ /* Choose the output buffer for this frame (a new one, or the input itself) and start the frame on the output link. */
+ AVFilterLink *outlink = inlink->dst->outputs[0];
+ AVFilterBufferRef *outpicref;
+
+ if (inpicref->perms & AV_PERM_PRESERVE) { /* input carries AV_PERM_PRESERVE: write into a separate buffer */
+ outpicref = avfilter_get_video_buffer(outlink, AV_PERM_WRITE, outlink->w, outlink->h); /* NOTE(review): result is used without a NULL check */
+ avfilter_copy_buffer_ref_props(outpicref, inpicref);
+ outpicref->video->w = outlink->w;
+ outpicref->video->h = outlink->h;
+ } else
+ outpicref = inpicref; /* otherwise the input buffer doubles as the output */
+
+ outlink->out_buf = outpicref; /* read back by end_frame() */
+ avfilter_start_frame(outlink, avfilter_ref_buffer(outpicref, ~0));
+}
+
+static void null_draw_slice(AVFilterLink *link, int y, int h, int slice_dir) { } /* intentionally empty: incoming slices are ignored, the frame is processed in end_frame() */
+
+static void end_frame(AVFilterLink *inlink)
+{ /* Filter (or copy) every plane of the current frame into the output buffer, then send the frame downstream as a single slice and drop both buffer references. */
+ GradFunContext *gf = inlink->dst->priv;
+ AVFilterBufferRef *inpic = inlink->cur_buf;
+ AVFilterLink *outlink = inlink->dst->outputs[0];
+ AVFilterBufferRef *outpic = outlink->out_buf; /* set in start_frame() */
+ int p;
+
+ for (p = 0; p < 4 && inpic->data[p]; p++) { /* stop at the first missing plane */
+ int w = inlink->w; /* plane 0 uses the full frame size and radius */
+ int h = inlink->h;
+ int r = gf->radius;
+ if (p) { /* every later plane uses the chroma values from config_input() */
+ w = gf->chroma_w;
+ h = gf->chroma_h;
+ r = gf->chroma_r;
+ }
+
+ if (FFMIN(w, h) > 2 * r) /* filter only planes larger than twice the radius in both dimensions */
+ filter(gf, outpic->data[p], inpic->data[p], w, h, outpic->linesize[p], inpic->linesize[p], r);
+ else if (outpic->data[p] != inpic->data[p]) /* too small to filter: copy unless input and output share the same data */
+ av_image_copy_plane(outpic->data[p], outpic->linesize[p], inpic->data[p], inpic->linesize[p], w, h);
+ }
+
+ avfilter_draw_slice(outlink, 0, inlink->h, 1); /* whole frame as one slice */
+ avfilter_end_frame(outlink);
+ avfilter_unref_buffer(inpic);
+ avfilter_unref_buffer(outpic);
+}
+
+AVFilter avfilter_vf_gradfun = { /* filter definition: one video input pad, one video output pad */
+ .name = "gradfun",
+ .description = NULL_IF_CONFIG_SMALL("Debands video quickly using gradients."),
+ .priv_size = sizeof(GradFunContext), /* size of the private context read through ctx->priv */
+ .init = init,
+ .uninit = uninit,
+ .query_formats = query_formats,
+
+ .inputs = (AVFilterPad[]) {{ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .config_props = config_input,
+ .start_frame = start_frame,
+ .draw_slice = null_draw_slice, /* slices are ignored */
+ .end_frame = end_frame, /* all pixel processing happens here */
+ .min_perms = AV_PERM_READ, },
+ { .name = NULL}}, /* pad list terminator */
+ .outputs = (AVFilterPad[]) {{ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO, },
+ { .name = NULL}}, /* pad list terminator */
+};
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 716048cca2..e98693d654 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1 +1,2 @@
MMX-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o
+MMX-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/gradfun.o
diff --git a/libavfilter/x86/gradfun.c b/libavfilter/x86/gradfun.c
new file mode 100644
index 0000000000..894a44b9ff
--- /dev/null
+++ b/libavfilter/x86/gradfun.c
@@ -0,0 +1,162 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86_cpu.h"
+#include "libavfilter/gradfun.h"
+
+DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; // eight words of 127: operand %6 of the filter_line asm loops
+DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; // eight words of 0x00FF: byte mask used by the blur_line asm loop
+
+void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{ // MMX2 filter_line: 4 pixels per loop iteration; the body is compiled out entirely when HAVE_MMX is 0
+#if HAVE_MMX
+ intptr_t x;
+ if (width & 3) { // the asm loop only handles multiples of 4: run the C version on the leftover pixels first
+ x = width & ~3;
+ ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+ width = x;
+ }
+ x = -width; // negative offset counted up to 0; the pointer operands below are pre-offset to the end of the row
+ __asm__ volatile( // NOTE(review): the loop body runs once before the first test, so this presumably requires width >= 4 - confirm callers guarantee it
+ "movd %4, %%mm5 \n"
+ "pxor %%mm7, %%mm7 \n"
+ "pshufw $0, %%mm5, %%mm5 \n"
+ "movq %6, %%mm6 \n"
+ "movq %5, %%mm4 \n"
+ "1: \n"
+ "movd (%2,%0), %%mm0 \n"
+ "movd (%3,%0), %%mm1 \n"
+ "punpcklbw %%mm7, %%mm0 \n"
+ "punpcklwd %%mm1, %%mm1 \n"
+ "psllw $7, %%mm0 \n"
+ "pxor %%mm2, %%mm2 \n"
+ "psubw %%mm0, %%mm1 \n" // delta = dc - pix
+ "psubw %%mm1, %%mm2 \n"
+ "pmaxsw %%mm1, %%mm2 \n"
+ "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
+ "psubw %%mm6, %%mm2 \n"
+ "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
+ "pmullw %%mm2, %%mm2 \n"
+ "paddw %%mm4, %%mm0 \n" // pix += dither
+ "pmulhw %%mm2, %%mm1 \n"
+ "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
+ "paddw %%mm1, %%mm0 \n" // pix += m
+ "psraw $7, %%mm0 \n"
+ "packuswb %%mm0, %%mm0 \n"
+ "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
+ "add $4, %0 \n"
+ "jl 1b \n"
+ "emms \n"
+ :"+r"(x)
+ :"r"(dst+width), "r"(src+width), "r"(dc+width/2), // %1 dst end, %2 src end, %3 dc end
+ "rm"(thresh), "m"(*dithers), "m"(*pw_7f) // %4 thresh, %5 dither words, %6 constant 127
+ :"memory"
+ );
+#endif
+}
+
+void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{ // SSSE3 filter_line: 8 pixels per loop iteration; the body is compiled out entirely when HAVE_SSSE3 is 0
+#if HAVE_SSSE3
+ intptr_t x;
+ if (width & 7) { // the asm loop only handles multiples of 8: run the C version on the leftover pixels first
+ // could be 10% faster if I somehow eliminated this
+ x = width & ~7;
+ ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+ width = x;
+ }
+ x = -width; // negative offset counted up to 0; the pointer operands below are pre-offset to the end of the row
+ __asm__ volatile( // NOTE(review): the loop body runs once before the first test, so this presumably requires width >= 8 - confirm callers guarantee it
+ "movd %4, %%xmm5 \n"
+ "pxor %%xmm7, %%xmm7 \n"
+ "pshuflw $0,%%xmm5, %%xmm5 \n"
+ "movdqa %6, %%xmm6 \n"
+ "punpcklqdq %%xmm5, %%xmm5 \n"
+ "movdqa %5, %%xmm4 \n"
+ "1: \n"
+ "movq (%2,%0), %%xmm0 \n"
+ "movq (%3,%0), %%xmm1 \n"
+ "punpcklbw %%xmm7, %%xmm0 \n"
+ "punpcklwd %%xmm1, %%xmm1 \n"
+ "psllw $7, %%xmm0 \n"
+ "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix
+ "pabsw %%xmm1, %%xmm2 \n"
+ "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
+ "psubw %%xmm6, %%xmm2 \n"
+ "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
+ "pmullw %%xmm2, %%xmm2 \n"
+ "psllw $1, %%xmm2 \n"
+ "paddw %%xmm4, %%xmm0 \n" // pix += dither
+ "pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
+ "paddw %%xmm1, %%xmm0 \n" // pix += m
+ "psraw $7, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
+ "add $8, %0 \n"
+ "jl 1b \n"
+ :"+&r"(x)
+ :"r"(dst+width), "r"(src+width), "r"(dc+width/2), // %1 dst end, %2 src end, %3 dc end
+ "rm"(thresh), "m"(*dithers), "m"(*pw_7f) // %4 thresh, %5 dither words (loaded with movdqa), %6 constant 127
+ :"memory"
+ );
+#endif // HAVE_SSSE3
+}
+
+void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
+{ // SSE2 blur_line: 16 bytes (8 uint16_t outputs) per loop iteration; the body is compiled out entirely when HAVE_SSE is 0
+#if HAVE_SSE
+#define BLURV(load)\
+ intptr_t x = -2*width;\
+ __asm__ volatile(\
+ "movdqa %6, %%xmm7 \n"\
+ "1: \n"\
+ load" (%4,%0), %%xmm0 \n"\
+ load" (%5,%0), %%xmm1 \n"\
+ "movdqa %%xmm0, %%xmm2 \n"\
+ "movdqa %%xmm1, %%xmm3 \n"\
+ "psrlw $8, %%xmm0 \n"\
+ "psrlw $8, %%xmm1 \n"\
+ "pand %%xmm7, %%xmm2 \n"\
+ "pand %%xmm7, %%xmm3 \n"\
+ "paddw %%xmm1, %%xmm0 \n"\
+ "paddw %%xmm3, %%xmm2 \n"\
+ "paddw %%xmm2, %%xmm0 \n"\
+ "paddw (%2,%0), %%xmm0 \n"\
+ "movdqa (%1,%0), %%xmm1 \n"\
+ "movdqa %%xmm0, (%1,%0) \n"\
+ "psubw %%xmm1, %%xmm0 \n"\
+ "movdqa %%xmm0, (%3,%0) \n"\
+ "add $16, %0 \n"\
+ "jl 1b \n"\
+ :"+&r"(x)\
+ :"r"(buf+width),\
+ "r"(buf1+width),\
+ "r"(dc+width),\
+ "r"(src+width*2),\
+ "r"(src+width*2+src_linesize),\
+ "m"(*pw_ff)\
+ :"memory"\
+ );
+ if (((intptr_t) src | src_linesize) & 15) { // source address or stride not a multiple of 16: read the two source rows with movdqu
+ BLURV("movdqu");
+ } else { // both 16-byte aligned: movdqa can be used for the source rows as well
+ BLURV("movdqa");
+ } // buf, buf1 and dc are accessed with movdqa / a paddw memory operand in both variants
+#endif // HAVE_SSE
+}