GitList

Browse code

Add gradfun filter, ported from MPlayer.

Patch by Nolan L nol888 <=> gmail >=< com.

See thread:
Subject: [FFmpeg-devel] [PATCH] Port gradfun to libavfilter (GCI)
Date: Mon, 29 Nov 2010 07:18:14 -0500

Originally committed as revision 25942 to svn://svn.ffmpeg.org/ffmpeg/trunk

Nolan L authored on 2010/12/13 02:59:10
Showing 9 changed files

Changelog index 83c2e48..dcefc99 100644
doc/filters.texi index c460aa9..be3f8e1 100644
libavfilter/Makefile index aece3ab..1466645 100644
libavfilter/allfilters.c index 8ce4f1b..7f7e46d 100644
libavfilter/avfilter.h index 54cd905..a4bc978 100644
libavfilter/gradfun.h index 0000000..3dacbcb
libavfilter/vf_gradfun.c index 0000000..1cbf8d8
libavfilter/x86/Makefile index 716048c..e98693d 100644
libavfilter/x86/gradfun.c index 0000000..894a44b

Changelog

History View file @ d5f187f

@@ -64,6 +64,7 @@ version <next>:
                      - hqdn3d filter added
                      - RTP depacketization of QCELP
                      - FLAC parser added
                     +- gradfun filter added
                      version 0.6:

doc/filters.texi

History View file @ d5f187f

@@ -425,6 +425,35 @@ frei0r=perspective:0.2/0.2:0.8/0.2
                      For more information see:
                      @url{http://piksel.org/frei0r}
                     +@section gradfun
+                    +
                     +Fix the banding artifacts that are sometimes introduced into nearly flat
                     +regions by truncation to 8-bit color depth.
                     +Interpolate the gradients that should go where the bands are, and
                     +dither them.
+                    +
                     +The filter takes two optional parameters, separated by ':':
                     +@var{strength}:@var{radius}
+                    +
                     +@var{strength} is the maximum amount by which the filter will change
                     +any one pixel. Also the threshold for detecting nearly flat
                     +regions. Acceptable values range from .51 to 255, default value is
                     +1.2, out-of-range values will be clipped to the valid range.
+                    +
                     +@var{radius} is the neighborhood to fit the gradient to. A larger
                     +radius makes for smoother gradients, but also prevents the filter from
                     +modifying the pixels near detailed regions. Acceptable values are
                     +4-32, default value is 16, out-of-range values will be clipped to the
                     +valid range.
+                    +
                     +@example
                     +# default parameters
                     +gradfun=1.2:16
+                    +
                     +# omitting radius
                     +gradfun=1.2
                     +@end example
+                    +
                      @section hflip
                      Flip the input video horizontally.

libavfilter/Makefile

History View file @ d5f187f

@@ -26,6 +26,7 @@ OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
                      OBJS-$(CONFIG_FIFO_FILTER)                   += vf_fifo.o
                      OBJS-$(CONFIG_FORMAT_FILTER)                 += vf_format.o
                      OBJS-$(CONFIG_FREI0R_FILTER)                 += vf_frei0r.o
                     +OBJS-$(CONFIG_GRADFUN_FILTER)                += vf_gradfun.o
                      OBJS-$(CONFIG_HFLIP_FILTER)                  += vf_hflip.o
                      OBJS-$(CONFIG_HQDN3D_FILTER)                 += vf_hqdn3d.o
                      OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o

libavfilter/allfilters.c

History View file @ d5f187f

@@ -47,6 +47,7 @@ void avfilter_register_all(void)
                          REGISTER_FILTER (FIFO,        fifo,        vf);
                          REGISTER_FILTER (FORMAT,      format,      vf);
                          REGISTER_FILTER (FREI0R,      frei0r,      vf);
                     +    REGISTER_FILTER (GRADFUN,     gradfun,     vf);
                          REGISTER_FILTER (HFLIP,       hflip,       vf);
                          REGISTER_FILTER (HQDN3D,      hqdn3d,      vf);
                          REGISTER_FILTER (NOFORMAT,    noformat,    vf);

libavfilter/avfilter.h

History View file @ d5f187f

@@ -27,8 +27,8 @@
                      #include "libavcore/samplefmt.h"
                      #define LIBAVFILTER_VERSION_MAJOR  1
                     -#define LIBAVFILTER_VERSION_MINOR 68
                     -#define LIBAVFILTER_VERSION_MICRO  1
                     +#define LIBAVFILTER_VERSION_MINOR 69
                     +#define LIBAVFILTER_VERSION_MICRO  0
                      #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
                                                                     LIBAVFILTER_VERSION_MINOR, \

libavfilter/gradfun.h

History View file @ d5f187f

                     new file mode 100644
@@ -0,0 +1,48 @@
                     +/*
                     + * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
                     + * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
                     + *
                     + * This file is part of FFmpeg.
                     + *
                     + * FFmpeg is free software; you can redistribute it and/or
                     + * modify it under the terms of the GNU Lesser General Public
                     + * License as published by the Free Software Foundation; either
                     + * version 2.1 of the License, or (at your option) any later version.
                     + *
                     + * FFmpeg is distributed in the hope that it will be useful,
                     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     + * Lesser General Public License for more details.
                     + *
                     + * You should have received a copy of the GNU Lesser General Public
                     + * License along with FFmpeg; if not, write to the Free Software
                     + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                     + */
+                    +
                     +#ifndef AVFILTER_GRADFUN_H
                     +#define AVFILTER_GRADFUN_H
+                    +
                     +#include "avfilter.h"
+                    +
                     +/// Holds instance-specific information for gradfun.
                     +typedef struct {
                     +    int thresh;    ///< fixed-point threshold for gradient algorithm; init() in vf_gradfun.c stores (1 << 15) / strength
                     +    int radius;    ///< blur radius used for the first plane
                     +    int chroma_w;  ///< width of the chroma planes
                     +    int chroma_h;  ///< height of the chroma planes
                     +    int chroma_r;  ///< blur radius for the chroma planes
                     +    uint16_t *buf; ///< scratch buffer holding image data for the blur algorithm; passed into filter via the context.
                     +    /// DSP functions.
                     +    void (*filter_line) (uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); ///< writes one output row from src and the blurred row dc
                     +    void (*blur_line) (uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width); ///< accumulates two source rows into buf and writes dc
                     +} GradFunContext;
+                    +
                     +void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
                     +void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+                    +
                     +void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
                     +void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+                    +
                     +void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+                    +
                     +#endif /* AVFILTER_GRADFUN_H */

libavfilter/vf_gradfun.c

History View file @ d5f187f

                     new file mode 100644
@@ -0,0 +1,253 @@
                     +/*
                     + * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
                     + * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
                     + *
                     + * This file is part of FFmpeg.
                     + *
                     + * FFmpeg is free software; you can redistribute it and/or
                     + * modify it under the terms of the GNU Lesser General Public
                     + * License as published by the Free Software Foundation; either
                     + * version 2.1 of the License, or (at your option) any later version.
                     + *
                     + * FFmpeg is distributed in the hope that it will be useful,
                     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     + * Lesser General Public License for more details.
                     + *
                     + * You should have received a copy of the GNU Lesser General Public
                     + * License along with FFmpeg; if not, write to the Free Software
                     + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                     + */
+                    +
                     +/**
                     + * @file
                     + * gradfun debanding filter, ported from MPlayer
                     + * libmpcodecs/vf_gradfun.c
                     + *
                     + * Apply a boxblur debanding algorithm (based on the gradfun2db
                     + * Avisynth filter by prunedtree).
                     + * For each pixel, if it's within threshold of the blurred value, make it closer.
                     + * So now we have a smoothed and higher bitdepth version of all the shallow
                     + * gradients, while leaving detailed areas untouched.
                     + * Dither it back to 8bit.
                     + */
+                    +
                     +#include "libavcore/imgutils.h"
                     +#include "libavutil/cpu.h"
                     +#include "libavutil/pixdesc.h"
                     +#include "avfilter.h"
                     +#include "gradfun.h"
+                    +
                     +DECLARE_ALIGNED(16, static const uint16_t, dither)[8][8] = { /* dither offsets added to the scaled pixel: filter() picks the row with y & 7, ff_gradfun_filter_line_c() the column with x & 7 */
                     +    {0x00,0x60,0x18,0x78,0x06,0x66,0x1E,0x7E},
                     +    {0x40,0x20,0x58,0x38,0x46,0x26,0x5E,0x3E},
                     +    {0x10,0x70,0x08,0x68,0x16,0x76,0x0E,0x6E},
                     +    {0x50,0x30,0x48,0x28,0x56,0x36,0x4E,0x2E},
                     +    {0x04,0x64,0x1C,0x7C,0x02,0x62,0x1A,0x7A},
                     +    {0x44,0x24,0x5C,0x3C,0x42,0x22,0x5A,0x3A},
                     +    {0x14,0x74,0x0C,0x6C,0x12,0x72,0x0A,0x6A},
                     +    {0x54,0x34,0x4C,0x2C,0x52,0x32,0x4A,0x2A},
                     +}; /* every entry is below 0x80, i.e. below one output step after the >> 7 in the line filter */
+                    +
                     +void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
                     +{ /* Writes width pixels to dst: each src pixel is moved toward the blurred value in dc, a dither offset is added, and the result is reduced back to 8 bits. */
                     +    int x;
                     +    for (x = 0; x < width; x++, dc += x & 1) { /* dc advances only when x becomes odd, so one dc entry serves two pixels */
                     +        int pix = src[x] << 7; /* 8-bit sample scaled by 128 (7 fractional bits) */
                     +        int delta = dc[0] - pix; /* distance from the blurred value */
                     +        int m = abs(delta) * thresh >> 16; /* distance scaled by the fixed-point threshold */
                     +        m = FFMAX(0, 127 - m); /* weight: 127 for identical values, 0 once the scaled distance reaches 127 */
                     +        m = m * m * delta >> 14; /* correction = delta * weight^2 / 2^14 */
                     +        pix += m + dithers[x & 7]; /* dither column repeats every 8 pixels */
                     +        dst[x] = av_clip_uint8(pix >> 7); /* drop the 7 fractional bits and clip to the 8-bit range */
                     +    }
                     +}
+                    +
                     +void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
                     +{ /* For each of width columns: adds a 2x2 block of src pixels to buf1[x], stores that sum in buf[x], and writes the difference from buf[x]'s previous content to dc[x]. */
                     +    int x, v, old;
                     +    for (x = 0; x < width; x++) {
                     +        v = buf1[x] + src[2 * x] + src[2 * x + 1] + src[2 * x + src_linesize] + src[2 * x + 1 + src_linesize]; /* two pixels from this row and two from the row src_linesize bytes further */
                     +        old = buf[x]; /* value this slot held before being overwritten */
                     +        buf[x] = v;
                     +        dc[x] = v - old; /* stored as uint16_t, so the difference wraps modulo 2^16 */
                     +    }
                     +}
+                    +
                     +static void filter(GradFunContext *ctx, uint8_t *dst, uint8_t *src, int width, int height, int dst_linesize, int src_linesize, int r)
                     +{ /* Debands one plane of width x height pixels with blur radius r, using ctx->buf as scratch space for the dc row and the per-row column sums. */
                     +    int bstride = FFALIGN(width, 16) / 2; /* length of one scratch row, in uint16_t entries */
                     +    int y;
                     +    uint32_t dc_factor = (1 << 21) / (r * r); /* fixed-point scale applied to the windowed sums below */
                     +    uint16_t *dc = ctx->buf + 16; /* 16 entries of headroom so dc[-r/2..-1] stays inside the allocation */
                     +    uint16_t *buf = ctx->buf + bstride + 32; /* rows of column sums start right after the dc area */
                     +    int thresh = ctx->thresh;
+                    +
                     +    memset(dc, 0, (bstride + 16) * sizeof(*buf)); /* clears everything from dc up to the start of buf */
                     +    for (y = 0; y < r; y++) /* prime r scratch rows; for y == 0 the "previous" row lies bstride entries before buf, inside the area just zeroed */
                     +        ctx->blur_line(dc, buf + y * bstride, buf + (y - 1) * bstride, src + 2 * y * src_linesize, src_linesize, width / 2);
                     +    for (;;) { /* each pass refreshes dc (while source rows remain) and emits two output rows */
                     +        if (y < height - r) {
                     +            int mod = ((y + r) / 2) % r; /* the r scratch rows are reused cyclically */
                     +            uint16_t *buf0 = buf + mod * bstride;
                     +            uint16_t *buf1 = buf + (mod ? mod - 1 : r - 1) * bstride; /* row written on the previous pass, wrapping around */
                     +            int x, v;
                     +            ctx->blur_line(dc, buf0, buf1, src + (y + r) * src_linesize, src_linesize, width / 2);
                     +            for (x = v = 0; x < r; x++) /* sum of the first r dc entries */
                     +                v += dc[x];
                     +            for (; x < width / 2; x++) { /* slide the r-entry window, writing results in place r entries behind */
                     +                v += dc[x] - dc[x-r];
                     +                dc[x-r] = v * dc_factor >> 16;
                     +            }
                     +            for (; x < (width + r + 1) / 2; x++) /* right edge: repeat the last window value */
                     +                dc[x-r] = v * dc_factor >> 16;
                     +            for (x = -r / 2; x < 0; x++) /* left edge: replicate dc[0] into the entries before dc */
                     +                dc[x] = dc[0];
                     +        }
                     +        if (y == r) { /* first pass only: rows 0..r-1 are all filtered against this first dc row */
                     +            for (y = 0; y < r; y++)
                     +                ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
                     +        }
                     +        ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
                     +        if (++y >= height) break;
                     +        ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]); /* second row sharing the same dc */
                     +        if (++y >= height) break;
                     +    }
                     +}
+                    +
                     +static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
                     +{ /* Reads "strength:radius" from args, clips both values, and selects the C or x86 line functions. Always returns 0. */
                     +    GradFunContext *gf = ctx->priv;
                     +    float thresh = 1.2; /* default strength */
                     +    int radius = 16; /* default radius */
                     +    av_unused int cpu_flags = av_get_cpu_flags();
+                    +
                     +    if (args)
                     +        sscanf(args, "%f:%d", &thresh, &radius); /* return value is not checked; fields that fail to parse keep the defaults above */
+                    +
                     +    thresh = av_clipf(thresh, 0.51, 255);
                     +    gf->thresh = (1 << 15) / thresh; /* stored inverted: a larger strength gives a smaller fixed-point thresh */
                     +    gf->radius = av_clip((radius + 1) & ~1, 4, 32); /* round up to an even value, then clip to 4..32 */
+                    +
                     +    gf->blur_line = ff_gradfun_blur_line_c; /* C versions are the fallback */
                     +    gf->filter_line = ff_gradfun_filter_line_c;
+                    +
                     +    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX2) /* NOTE(review): build-time guard is HAVE_MMX but the run-time flag is MMX2 - confirm intended */
                     +        gf->filter_line = ff_gradfun_filter_line_mmx2;
                     +    if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3) /* checked after MMX2, so SSSE3 wins when both are available */
                     +        gf->filter_line = ff_gradfun_filter_line_ssse3;
                     +    if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2) /* NOTE(review): build-time guard is HAVE_SSE but the run-time flag is SSE2 - confirm intended */
                     +        gf->blur_line = ff_gradfun_blur_line_sse2;
+                    +
                     +    av_log(ctx, AV_LOG_INFO, "threshold:%.2f radius:%d\n", thresh, gf->radius); /* logs the clipped values actually in use */
+                    +
                     +    return 0;
                     +}
+                    +
                     +static av_cold void uninit(AVFilterContext *ctx)
                     +{ /* Releases the scratch buffer that config_input() stored in the context. */
                     +    GradFunContext *gf = ctx->priv;
                     +    av_freep(&gf->buf); /* takes the address of the pointer; presumably also resets it to NULL - confirm against libavutil */
                     +}
+                    +
                     +static int query_formats(AVFilterContext *ctx)
                     +{ /* Registers the list of pixel formats this filter accepts on its links. Always returns 0. */
                     +    static const enum PixelFormat pix_fmts[] = {
                     +        PIX_FMT_YUV410P,            PIX_FMT_YUV420P,
                     +        PIX_FMT_GRAY8,              PIX_FMT_NV12, /* NOTE(review): NV12/NV21 are semi-planar (interleaved chroma in plane 1), while end_frame() sizes every plane after the first with chroma_w - confirm these two are handled correctly */
                     +        PIX_FMT_NV21,               PIX_FMT_YUV444P,
                     +        PIX_FMT_YUV422P,            PIX_FMT_YUV411P,
                     +        PIX_FMT_NONE /* list terminator */
                     +    };
+                    +
                     +    avfilter_set_common_formats(ctx, avfilter_make_format_list(pix_fmts));
+                    +
                     +    return 0;
                     +}
+                    +
                     +static int config_input(AVFilterLink *inlink)
                     +{ /* Allocates the scratch buffer for the link's width and derives the chroma plane size and radius. Returns 0, or AVERROR(ENOMEM) if the allocation fails. */
                     +    GradFunContext *gf = inlink->dst->priv;
                     +    int hsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_w; /* horizontal chroma shift for this pixel format */
                     +    int vsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_h; /* vertical chroma shift for this pixel format */
+                    +
                     +    gf->buf = av_mallocz((FFALIGN(inlink->w, 16) * (gf->radius + 1) / 2 + 32) * sizeof(uint16_t)); /* NOTE(review): any previous gf->buf is overwritten without being freed - confirm this cannot run twice per instance */
                     +    if (!gf->buf)
                     +        return AVERROR(ENOMEM);
+                    +
                     +    gf->chroma_w = -((-inlink->w) >> hsub); /* negate-shift-negate rounds up: ceil(w / 2^hsub) */
                     +    gf->chroma_h = -((-inlink->h) >> vsub); /* ceil(h / 2^vsub) */
                     +    gf->chroma_r = av_clip(((((gf->radius >> hsub) + (gf->radius >> vsub)) / 2 ) + 1) & ~1, 4, 32); /* mean of the two subsampled radii, rounded up to even, clipped to 4..32 */
+                    +
                     +    return 0;
                     +}
+                    +
                     +static void start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
                     +{ /* Chooses the output picture (a new buffer, or the input itself for in-place work), stores it on the output link, and forwards start_frame downstream. */
                     +    AVFilterLink *outlink = inlink->dst->outputs[0];
                     +    AVFilterBufferRef *outpicref;
+                    +
                     +    if (inpicref->perms & AV_PERM_PRESERVE) { /* input is flagged PRESERVE, so write into a separate buffer */
                     +        outpicref = avfilter_get_video_buffer(outlink, AV_PERM_WRITE, outlink->w, outlink->h); /* NOTE(review): result is used below without a NULL check */
                     +        avfilter_copy_buffer_ref_props(outpicref, inpicref);
                     +        outpicref->video->w = outlink->w;
                     +        outpicref->video->h = outlink->h;
                     +    } else
                     +        outpicref = inpicref; /* filter in place */
+                    +
                     +    outlink->out_buf = outpicref; /* end_frame() reads it back from here */
                     +    avfilter_start_frame(outlink, avfilter_ref_buffer(outpicref, ~0)); /* downstream gets its own reference, with an all-ones permission mask */
                     +}
+                    +
                     +static void null_draw_slice(AVFilterLink *link, int y, int h, int slice_dir) { } /* intentionally empty: end_frame() filters the whole picture and emits it as one slice */
+                    +
                     +static void end_frame(AVFilterLink *inlink)
                     +{ /* Filters (or copies) every plane of the current picture into the output picture, sends it downstream as a single slice, then drops both references. */
                     +    GradFunContext *gf = inlink->dst->priv;
                     +    AVFilterBufferRef *inpic = inlink->cur_buf;
                     +    AVFilterLink *outlink = inlink->dst->outputs[0];
                     +    AVFilterBufferRef *outpic = outlink->out_buf; /* set by start_frame() */
                     +    int p;
+                    +
                     +    for (p = 0; p < 4 && inpic->data[p]; p++) { /* one pass per plane, stopping at the first NULL data pointer */
                     +        int w = inlink->w;
                     +        int h = inlink->h;
                     +        int r = gf->radius;
                     +        if (p) { /* every plane after the first uses the chroma size and radius */
                     +            w = gf->chroma_w;
                     +            h = gf->chroma_h;
                     +            r = gf->chroma_r;
                     +        }
+                    +
                     +        if (FFMIN(w, h) > 2 * r) /* only filter planes larger than 2*r in both dimensions */
                     +            filter(gf, outpic->data[p], inpic->data[p], w, h, outpic->linesize[p], inpic->linesize[p], r);
                     +        else if (outpic->data[p] != inpic->data[p]) /* too small to filter: copy unchanged unless working in place */
                     +            av_image_copy_plane(outpic->data[p], outpic->linesize[p], inpic->data[p], inpic->linesize[p], w, h);
                     +    }
+                    +
                     +    avfilter_draw_slice(outlink, 0, inlink->h, 1); /* whole picture as one slice */
                     +    avfilter_end_frame(outlink);
                     +    avfilter_unref_buffer(inpic);
                     +    avfilter_unref_buffer(outpic);
                     +}
+                    +
                     +AVFilter avfilter_vf_gradfun = { /* filter definition: one video input pad and one video output pad */
                     +    .name          = "gradfun",
                     +    .description   = NULL_IF_CONFIG_SMALL("Debands video quickly using gradients."),
                     +    .priv_size     = sizeof(GradFunContext), /* size of the per-instance context accessed through ctx->priv */
                     +    .init          = init,
                     +    .uninit        = uninit,
                     +    .query_formats = query_formats,
+                    +
                     +    .inputs    = (AVFilterPad[]) {{ .name             = "default",
                     +                                    .type             = AVMEDIA_TYPE_VIDEO,
                     +                                    .config_props     = config_input,
                     +                                    .start_frame      = start_frame,
                     +                                    .draw_slice       = null_draw_slice, /* incoming slices are ignored */
                     +                                    .end_frame        = end_frame, /* all filtering happens here */
                     +                                    .min_perms        = AV_PERM_READ, },
                     +                                  { .name = NULL}}, /* pad list terminator */
                     +    .outputs   = (AVFilterPad[]) {{ .name             = "default",
                     +                                    .type             = AVMEDIA_TYPE_VIDEO, },
                     +                                  { .name = NULL}}, /* pad list terminator */
                     +};

libavfilter/x86/Makefile

History View file @ d5f187f

@@ -1 +1,2 @@
                      MMX-OBJS-$(CONFIG_YADIF_FILTER)              += x86/yadif.o
                     +MMX-OBJS-$(CONFIG_GRADFUN_FILTER)            += x86/gradfun.o

libavfilter/x86/gradfun.c

History View file @ d5f187f

                     new file mode 100644
@@ -0,0 +1,162 @@
                     +/*
                     + * This file is part of FFmpeg.
                     + *
                     + * FFmpeg is free software; you can redistribute it and/or modify
                     + * it under the terms of the GNU General Public License as published by
                     + * the Free Software Foundation; either version 2 of the License, or
                     + * (at your option) any later version.
                     + *
                     + * FFmpeg is distributed in the hope that it will be useful,
                     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + * GNU General Public License for more details.
                     + *
                     + * You should have received a copy of the GNU General Public License along
                     + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
                     + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
                     + */
+                    +
                     +#include "libavutil/cpu.h"
                     +#include "libavutil/x86_cpu.h"
                     +#include "libavfilter/gradfun.h"
+                    +
                     +DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; /* 127 in each 16-bit word; subtracted from m in the filter_line kernels */
                     +DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; /* 0x00FF in each 16-bit word; used as the pand mask in the blur_line kernel */
+                    +
                     +void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
                     +{ /* Inline-asm version of ff_gradfun_filter_line_c() handling 4 pixels per loop pass; the body is compiled out when HAVE_MMX is 0. */
                     +#if HAVE_MMX
                     +    intptr_t x;
                     +    if (width & 3) { /* width not a multiple of 4: the C version handles the trailing pixels first */
                     +        x = width & ~3;
                     +        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); /* dithers is passed unshifted, so the tail starts again at dithers[0] */
                     +        width = x;
                     +    }
                     +    x = -width; /* negative index counting up to 0; the pointers below are pre-advanced by width */
                     +    __asm__ volatile(
                     +        "movd          %4, %%mm5 \n"
                     +        "pxor       %%mm7, %%mm7 \n"
                     +        "pshufw $0, %%mm5, %%mm5 \n" // thresh replicated into all 4 words
                     +        "movq          %6, %%mm6 \n" // pw_7f
                     +        "movq          %5, %%mm4 \n" // first 4 dither words, reused for every group
                     +        "1: \n"
                     +        "movd     (%2,%0), %%mm0 \n"
                     +        "movd     (%3,%0), %%mm1 \n"
                     +        "punpcklbw  %%mm7, %%mm0 \n"
                     +        "punpcklwd  %%mm1, %%mm1 \n"
                     +        "psllw         $7, %%mm0 \n"
                     +        "pxor       %%mm2, %%mm2 \n"
                     +        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
                     +        "psubw      %%mm1, %%mm2 \n"
                     +        "pmaxsw     %%mm1, %%mm2 \n"
                     +        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
                     +        "psubw      %%mm6, %%mm2 \n"
                     +        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
                     +        "pmullw     %%mm2, %%mm2 \n"
                     +        "paddw      %%mm4, %%mm0 \n" // pix += dither
                     +        "pmulhw     %%mm2, %%mm1 \n"
                     +        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
                     +        "paddw      %%mm1, %%mm0 \n" // pix += m
                     +        "psraw         $7, %%mm0 \n"
                     +        "packuswb   %%mm0, %%mm0 \n"
                     +        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
                     +        "add           $4, %0 \n"
                     +        "jl 1b \n" // loop while x is still negative
                     +        "emms \n"
                     +        :"+r"(x)
                     +        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
                     +         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
                     +        :"memory"
                     +    );
                     +#endif
                     +}
+                    +
                     +void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
                     +{ /* Inline-asm version of ff_gradfun_filter_line_c() handling 8 pixels per loop pass; the body is compiled out when HAVE_SSSE3 is 0. */
                     +#if HAVE_SSSE3
                     +    intptr_t x;
                     +    if (width & 7) { /* width not a multiple of 8: the C version handles the trailing pixels first */
                     +        // could be 10% faster if I somehow eliminated this
                     +        x = width & ~7;
                     +        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
                     +        width = x;
                     +    }
                     +    x = -width; /* negative index counting up to 0; the pointers below are pre-advanced by width */
                     +    __asm__ volatile(
                     +        "movd           %4, %%xmm5 \n"
                     +        "pxor       %%xmm7, %%xmm7 \n"
                     +        "pshuflw $0,%%xmm5, %%xmm5 \n"
                     +        "movdqa         %6, %%xmm6 \n" // pw_7f
                     +        "punpcklqdq %%xmm5, %%xmm5 \n" // thresh now in all 8 words
                     +        "movdqa         %5, %%xmm4 \n" // all 8 dither words of this row
                     +        "1: \n"
                     +        "movq      (%2,%0), %%xmm0 \n"
                     +        "movq      (%3,%0), %%xmm1 \n"
                     +        "punpcklbw  %%xmm7, %%xmm0 \n"
                     +        "punpcklwd  %%xmm1, %%xmm1 \n"
                     +        "psllw          $7, %%xmm0 \n"
                     +        "psubw      %%xmm0, %%xmm1 \n" // delta = dc - pix
                     +        "pabsw      %%xmm1, %%xmm2 \n"
                     +        "pmulhuw    %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
                     +        "psubw      %%xmm6, %%xmm2 \n"
                     +        "pminsw     %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
                     +        "pmullw     %%xmm2, %%xmm2 \n"
                     +        "psllw          $1, %%xmm2 \n"
                     +        "paddw      %%xmm4, %%xmm0 \n" // pix += dither
                     +        "pmulhrsw   %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
                     +        "paddw      %%xmm1, %%xmm0 \n" // pix += m
                     +        "psraw          $7, %%xmm0 \n"
                     +        "packuswb   %%xmm0, %%xmm0 \n"
                     +        "movq       %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
                     +        "add            $8, %0 \n"
                     +        "jl 1b \n" // loop while x is still negative
                     +        :"+&r"(x)
                     +        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
                     +         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
                     +        :"memory"
                     +    );
                     +#endif // HAVE_SSSE3
                     +}
+                    +
                     +void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
                     +{ /* Inline-asm version of ff_gradfun_blur_line_c(): the loop index runs in bytes from -2*width up to 0, 16 bytes per pass; the body is compiled out when HAVE_SSE is 0. */
                     +#if HAVE_SSE
                     +#define BLURV(load)\
                     +    intptr_t x = -2*width;\
                     +    __asm__ volatile(\
                     +        "movdqa %6, %%xmm7 \n"\
                     +        "1: \n"\
                     +        load"   (%4,%0), %%xmm0 \n"\
                     +        load"   (%5,%0), %%xmm1 \n"\
                     +        "movdqa  %%xmm0, %%xmm2 \n"\
                     +        "movdqa  %%xmm1, %%xmm3 \n"\
                     +        "psrlw       $8, %%xmm0 \n"\
                     +        "psrlw       $8, %%xmm1 \n"\
                     +        "pand    %%xmm7, %%xmm2 \n"\
                     +        "pand    %%xmm7, %%xmm3 \n"\
                     +        "paddw   %%xmm1, %%xmm0 \n"\
                     +        "paddw   %%xmm3, %%xmm2 \n"\
                     +        "paddw   %%xmm2, %%xmm0 \n"\
                     +        "paddw  (%2,%0), %%xmm0 \n"\
                     +        "movdqa (%1,%0), %%xmm1 \n"\
                     +        "movdqa  %%xmm0, (%1,%0) \n"\
                     +        "psubw   %%xmm1, %%xmm0 \n"\
                     +        "movdqa  %%xmm0, (%3,%0) \n"\
                     +        "add        $16, %0 \n"\
                     +        "jl 1b \n"\
                     +        :"+&r"(x)\
                     +        :"r"(buf+width),\
                     +         "r"(buf1+width),\
                     +         "r"(dc+width),\
                     +         "r"(src+width*2),\
                     +         "r"(src+width*2+src_linesize),\
                     +         "m"(*pw_ff)\
                     +        :"memory"\
                     +    );
                     +    if (((intptr_t) src | src_linesize) & 15) { /* src or its stride is not a multiple of 16: load the source rows with movdqu */
                     +        BLURV("movdqu");
                     +    } else { /* both are multiples of 16: load the source rows with movdqa */
                     +        BLURV("movdqa");
                     +    }
                     +#endif // HAVE_SSE
                     +}