Browse code

Add gradfun filter, ported from MPlayer.

Patch by Nolan L nol888 <=> gmail >=< com.

See thread:
Subject: [FFmpeg-devel] [PATCH] Port gradfun to libavfilter (GCI)
Date: Mon, 29 Nov 2010 07:18:14 -0500

Originally committed as revision 25942 to svn://svn.ffmpeg.org/ffmpeg/trunk

Nolan L authored on 2010/12/13 02:59:10
Showing 9 changed files
... ...
@@ -64,6 +64,7 @@ version <next>:
64 64
 - hqdn3d filter added
65 65
 - RTP depacketization of QCELP
66 66
 - FLAC parser added
67
+- gradfun filter added
67 68
 
68 69
 
69 70
 version 0.6:
... ...
@@ -425,6 +425,35 @@ frei0r=perspective:0.2/0.2:0.8/0.2
425 425
 For more information see:
426 426
 @url{http://piksel.org/frei0r}
427 427
 
428
+@section gradfun
429
+
430
+Fix the banding artifacts that are sometimes introduced into nearly flat
431
+regions by truncation to 8bit colordepth.
432
+Interpolate the gradients that should go where the bands are, and
433
+dither them.
434
+
435
+The filter takes two optional parameters, separated by ':':
436
+@var{strength}:@var{radius}
437
+
438
+@var{strength} is the maximum amount by which the filter will change
439
+any one pixel. Also the threshold for detecting nearly flat
440
+regions. Acceptable values range from .51 to 255, default value is
441
+1.2, out-of-range values will be clipped to the valid range.
442
+
443
+@var{radius} is the neighborhood to fit the gradient to. A larger
444
+radius makes for smoother gradients, but also prevents the filter from
445
+modifying the pixels near detailed regions. Acceptable values are
446
+4-32, default value is 16, out-of-range values will be clipped to the
447
+valid range.
448
+
449
+@example
450
+# default parameters
451
+gradfun=1.2:16
452
+
453
+# omitting radius
454
+gradfun=1.2
455
+@end example
456
+
428 457
 @section hflip
429 458
 
430 459
 Flip the input video horizontally.
... ...
@@ -26,6 +26,7 @@ OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
26 26
 OBJS-$(CONFIG_FIFO_FILTER)                   += vf_fifo.o
27 27
 OBJS-$(CONFIG_FORMAT_FILTER)                 += vf_format.o
28 28
 OBJS-$(CONFIG_FREI0R_FILTER)                 += vf_frei0r.o
29
+OBJS-$(CONFIG_GRADFUN_FILTER)                += vf_gradfun.o
29 30
 OBJS-$(CONFIG_HFLIP_FILTER)                  += vf_hflip.o
30 31
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += vf_hqdn3d.o
31 32
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
... ...
@@ -47,6 +47,7 @@ void avfilter_register_all(void)
47 47
     REGISTER_FILTER (FIFO,        fifo,        vf);
48 48
     REGISTER_FILTER (FORMAT,      format,      vf);
49 49
     REGISTER_FILTER (FREI0R,      frei0r,      vf);
50
+    REGISTER_FILTER (GRADFUN,     gradfun,     vf);
50 51
     REGISTER_FILTER (HFLIP,       hflip,       vf);
51 52
     REGISTER_FILTER (HQDN3D,      hqdn3d,      vf);
52 53
     REGISTER_FILTER (NOFORMAT,    noformat,    vf);
... ...
@@ -27,8 +27,8 @@
27 27
 #include "libavcore/samplefmt.h"
28 28
 
29 29
 #define LIBAVFILTER_VERSION_MAJOR  1
30
-#define LIBAVFILTER_VERSION_MINOR 68
31
-#define LIBAVFILTER_VERSION_MICRO  1
30
+#define LIBAVFILTER_VERSION_MINOR 69
31
+#define LIBAVFILTER_VERSION_MICRO  0
32 32
 
33 33
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
34 34
                                                LIBAVFILTER_VERSION_MINOR, \
35 35
new file mode 100644
... ...
@@ -0,0 +1,48 @@
0
+/*
1
+ * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
2
+ * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
3
+ *
4
+ * This file is part of FFmpeg.
5
+ *
6
+ * FFmpeg is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * FFmpeg is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with FFmpeg; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#ifndef AVFILTER_GRADFUN_H
22
+#define AVFILTER_GRADFUN_H
23
+
24
+#include "avfilter.h"
25
+
26
+/// Holds instance-specific information for gradfun.
27
+typedef struct {
28
+    int thresh;    ///< threshold for gradient algorithm, stored as (1 << 15) / strength
29
+    int radius;    ///< blur radius, rounded up to an even value and clipped to 4..32
30
+    int chroma_w;  ///< width of the chroma planes
31
+    int chroma_h;  ///< height of the chroma planes
32
+    int chroma_r;  ///< blur radius for the chroma planes
33
+    uint16_t *buf; ///< holds image data for blur algorithm passed into filter.
34
+    /// DSP functions.
35
+    void (*filter_line) (uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
36
+    void (*blur_line) (uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
37
+} GradFunContext;
38
+
39
+void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); // plain C implementation of filter_line
40
+void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width); // plain C implementation of blur_line
41
+
42
+void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); // x86 inline-asm filter_line, 4 pixels per iteration
43
+void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers); // x86 inline-asm filter_line, 8 pixels per iteration
44
+
45
+void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width); // x86 inline-asm blur_line
46
+
47
+#endif /* AVFILTER_GRADFUN_H */
0 48
new file mode 100644
... ...
@@ -0,0 +1,253 @@
0
+/*
1
+ * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
2
+ * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
3
+ *
4
+ * This file is part of FFmpeg.
5
+ *
6
+ * FFmpeg is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * FFmpeg is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with FFmpeg; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+/**
22
+ * @file
23
+ * gradfun debanding filter, ported from MPlayer
24
+ * libmpcodecs/vf_gradfun.c
25
+ *
26
+ * Apply a boxblur debanding algorithm (based on the gradfun2db
27
+ * Avisynth filter by prunedtree).
28
+ * For each pixel, if it's within threshold of the blurred value, make it closer.
29
+ * So now we have a smoothed and higher bitdepth version of all the shallow
30
+ * gradients, while leaving detailed areas untouched.
31
+ * Dither it back to 8bit.
32
+ */
33
+
34
+#include "libavcore/imgutils.h"
35
+#include "libavutil/cpu.h"
36
+#include "libavutil/pixdesc.h"
37
+#include "avfilter.h"
38
+#include "gradfun.h"
39
+
40
+DECLARE_ALIGNED(16, static const uint16_t, dither)[8][8] = { // 8x8 dither offsets: filter() picks the row with y & 7, the C line filter picks the column with x & 7
41
+    {0x00,0x60,0x18,0x78,0x06,0x66,0x1E,0x7E},
42
+    {0x40,0x20,0x58,0x38,0x46,0x26,0x5E,0x3E},
43
+    {0x10,0x70,0x08,0x68,0x16,0x76,0x0E,0x6E},
44
+    {0x50,0x30,0x48,0x28,0x56,0x36,0x4E,0x2E},
45
+    {0x04,0x64,0x1C,0x7C,0x02,0x62,0x1A,0x7A},
46
+    {0x44,0x24,0x5C,0x3C,0x42,0x22,0x5A,0x3A},
47
+    {0x14,0x74,0x0C,0x6C,0x12,0x72,0x0A,0x6A},
48
+    {0x54,0x34,0x4C,0x2C,0x52,0x32,0x4A,0x2A},
49
+};
50
+
51
/**
 * Deband one line in plain C.
 *
 * Each source pixel is scaled up by 7 bits and compared with the blurred
 * value taken from dc. The smaller the difference (relative to thresh), the
 * larger the share of that difference that is added back to the pixel; once
 * the scaled difference reaches 127 the pixel is left unchanged. A dither
 * offset is added before the result is shifted back down and clipped to
 * 8 bits.
 *
 * @param dst     output line, width bytes
 * @param src     input line, width bytes
 * @param dc      blurred reference values; entry (x + 1) / 2 is used for pixel x
 * @param width   number of pixels to process
 * @param thresh  threshold factor (init() stores (1 << 15) / strength)
 * @param dithers eight dither offsets, indexed with x & 7
 */
void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
{
    int i;

    for (i = 0; i < width; i++) {
        /* dc index sequence is 0, 1, 1, 2, 2, 3, ... */
        const int blurred = dc[(i + 1) >> 1];
        int value  = src[i] << 7;
        int diff   = blurred - value;
        int weight = abs(diff) * thresh >> 16;

        weight  = FFMAX(0, 127 - weight);
        weight  = weight * weight * diff >> 14;
        value  += weight + dithers[i & 7];
        dst[i]  = av_clip_uint8(value >> 7);
    }
}
64
+
65
/**
 * Advance the vertical running sum by one row of 2x2 pixel sums, in plain C.
 *
 * For every output column the four source pixels of a 2x2 cell (two from the
 * line at src, two from the line at src + src_linesize) are added to the
 * matching entry of buf1. The result replaces the entry in buf, and the
 * difference between the new and the previous buf entry is written to dc.
 *
 * @param dc           receives new sum minus the value previously held in buf
 * @param buf          row to overwrite with the new sums
 * @param buf1         row the new sums are accumulated on top of
 * @param src          first of the two source lines
 * @param src_linesize byte offset from the first to the second source line
 * @param width        number of 2x2 cells to process
 */
void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
{
    int i;

    for (i = 0; i < width; i++) {
        const uint8_t *top    = src + 2 * i;
        const uint8_t *bottom = top + src_linesize;
        int sum  = buf1[i] + top[0] + top[1] + bottom[0] + bottom[1];
        int prev = buf[i];

        buf[i] = sum;
        dc[i]  = sum - prev;
    }
}
75
+
76
+static void filter(GradFunContext *ctx, uint8_t *dst, uint8_t *src, int width, int height, int dst_linesize, int src_linesize, int r)
77
+{ // Deband one plane (width x height bytes) from src into dst, using a box blur of radius r as the reference.
78
+    int bstride = FFALIGN(width, 16) / 2; // distance, in uint16_t elements, between rows kept in buf
79
+    int y;
80
+    uint32_t dc_factor = (1 << 21) / (r * r); // scale applied (followed by >> 16) to every horizontal window sum
81
+    uint16_t *dc = ctx->buf + 16; // current blurred row; the leading elements leave room for the dc[x] writes at x < 0
82
+    uint16_t *buf = ctx->buf + bstride + 32; // row storage passed to blur_line as buf / buf1
83
+    int thresh = ctx->thresh;
84
+
85
+    memset(dc, 0, (bstride + 16) * sizeof(*buf));
86
+    for (y = 0; y < r; y++) // prime r rows of buf; call y starts at source line 2 * y
87
+        ctx->blur_line(dc, buf + y * bstride, buf + (y - 1) * bstride, src + 2 * y * src_linesize, src_linesize, width / 2);
88
+    for (;;) {
89
+        if (y < height - r) { // refresh dc starting from source line y + r
90
+            int mod = ((y + r) / 2) % r; // buf is used as a ring of r rows; this is the slot to overwrite
91
+            uint16_t *buf0 = buf + mod * bstride;
92
+            uint16_t *buf1 = buf + (mod ? mod - 1 : r - 1) * bstride; // the slot preceding buf0 in the ring
93
+            int x, v;
94
+            ctx->blur_line(dc, buf0, buf1, src + (y + r) * src_linesize, src_linesize, width / 2);
95
+            for (x = v = 0; x < r; x++) // sum of the first r entries
96
+                v += dc[x];
97
+            for (; x < width / 2; x++) { // slide the r-wide window, storing results in place r entries behind x
98
+                v += dc[x] - dc[x-r];
99
+                dc[x-r] = v * dc_factor >> 16;
100
+            }
101
+            for (; x < (width + r + 1) / 2; x++) // repeat the last window sum for the remaining right-hand entries
102
+                dc[x-r] = v * dc_factor >> 16;
103
+            for (x = -r / 2; x < 0; x++) // copy dc[0] into the entries below index 0, read via dc - r / 2
104
+                dc[x] = dc[0];
105
+        }
106
+        if (y == r) { // true right after priming: emit output lines 0 .. r-1 from the first blurred row
107
+            for (y = 0; y < r; y++)
108
+                ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
109
+        }
110
+        ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]); // two output lines share each dc row
111
+        if (++y >= height) break;
112
+        ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
113
+        if (++y >= height) break;
114
+    }
115
+}
116
+
117
+static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
118
+{
119
+    GradFunContext *gf = ctx->priv;
120
+    float thresh = 1.2;
121
+    int radius = 16;
122
+    av_unused int cpu_flags = av_get_cpu_flags();
123
+
124
+    if (args)
125
+        sscanf(args, "%f:%d", &thresh, &radius);
126
+
127
+    thresh = av_clipf(thresh, 0.51, 255);
128
+    gf->thresh = (1 << 15) / thresh;
129
+    gf->radius = av_clip((radius + 1) & ~1, 4, 32);
130
+
131
+    gf->blur_line = ff_gradfun_blur_line_c;
132
+    gf->filter_line = ff_gradfun_filter_line_c;
133
+
134
+    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX2)
135
+        gf->filter_line = ff_gradfun_filter_line_mmx2;
136
+    if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3)
137
+        gf->filter_line = ff_gradfun_filter_line_ssse3;
138
+    if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2)
139
+        gf->blur_line = ff_gradfun_blur_line_sse2;
140
+
141
+    av_log(ctx, AV_LOG_INFO, "threshold:%.2f radius:%d\n", thresh, gf->radius);
142
+
143
+    return 0;
144
+}
145
+
146
+static av_cold void uninit(AVFilterContext *ctx)
147
+{
148
+    GradFunContext *gf = ctx->priv;
149
+    av_freep(&gf->buf);
150
+}
151
+
152
+static int query_formats(AVFilterContext *ctx)
153
+{
154
+    static const enum PixelFormat pix_fmts[] = {
155
+        PIX_FMT_YUV410P,            PIX_FMT_YUV420P,
156
+        PIX_FMT_GRAY8,              PIX_FMT_NV12,
157
+        PIX_FMT_NV21,               PIX_FMT_YUV444P,
158
+        PIX_FMT_YUV422P,            PIX_FMT_YUV411P,
159
+        PIX_FMT_NONE
160
+    };
161
+
162
+    avfilter_set_common_formats(ctx, avfilter_make_format_list(pix_fmts));
163
+
164
+    return 0;
165
+}
166
+
167
+static int config_input(AVFilterLink *inlink)
168
+{
169
+    GradFunContext *gf = inlink->dst->priv;
170
+    int hsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_w;
171
+    int vsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_h;
172
+
173
+    gf->buf = av_mallocz((FFALIGN(inlink->w, 16) * (gf->radius + 1) / 2 + 32) * sizeof(uint16_t));
174
+    if (!gf->buf)
175
+        return AVERROR(ENOMEM);
176
+
177
+    gf->chroma_w = -((-inlink->w) >> hsub);
178
+    gf->chroma_h = -((-inlink->h) >> vsub);
179
+    gf->chroma_r = av_clip(((((gf->radius >> hsub) + (gf->radius >> vsub)) / 2 ) + 1) & ~1, 4, 32);
180
+
181
+    return 0;
182
+}
183
+
184
+static void start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
185
+{
186
+    AVFilterLink *outlink = inlink->dst->outputs[0];
187
+    AVFilterBufferRef *outpicref;
188
+
189
+    if (inpicref->perms & AV_PERM_PRESERVE) {
190
+        outpicref = avfilter_get_video_buffer(outlink, AV_PERM_WRITE, outlink->w, outlink->h);
191
+        avfilter_copy_buffer_ref_props(outpicref, inpicref);
192
+        outpicref->video->w = outlink->w;
193
+        outpicref->video->h = outlink->h;
194
+    } else
195
+        outpicref = inpicref;
196
+
197
+    outlink->out_buf = outpicref;
198
+    avfilter_start_frame(outlink, avfilter_ref_buffer(outpicref, ~0));
199
+}
200
+
201
+static void null_draw_slice(AVFilterLink *link, int y, int h, int slice_dir) { /* incoming slices are ignored; end_frame() filters the planes and sends one full-height slice */ }
202
+
203
+static void end_frame(AVFilterLink *inlink)
204
+{
205
+    GradFunContext *gf = inlink->dst->priv;
206
+    AVFilterBufferRef *inpic = inlink->cur_buf;
207
+    AVFilterLink *outlink = inlink->dst->outputs[0];
208
+    AVFilterBufferRef *outpic = outlink->out_buf;
209
+    int p;
210
+
211
+    for (p = 0; p < 4 && inpic->data[p]; p++) {
212
+        int w = inlink->w;
213
+        int h = inlink->h;
214
+        int r = gf->radius;
215
+        if (p) {
216
+            w = gf->chroma_w;
217
+            h = gf->chroma_h;
218
+            r = gf->chroma_r;
219
+        }
220
+
221
+        if (FFMIN(w, h) > 2 * r)
222
+            filter(gf, outpic->data[p], inpic->data[p], w, h, outpic->linesize[p], inpic->linesize[p], r);
223
+        else if (outpic->data[p] != inpic->data[p])
224
+            av_image_copy_plane(outpic->data[p], outpic->linesize[p], inpic->data[p], inpic->linesize[p], w, h);
225
+    }
226
+
227
+    avfilter_draw_slice(outlink, 0, inlink->h, 1);
228
+    avfilter_end_frame(outlink);
229
+    avfilter_unref_buffer(inpic);
230
+    avfilter_unref_buffer(outpic);
231
+}
232
+
233
+AVFilter avfilter_vf_gradfun = { // filter definition: one video input pad, one video output pad
234
+    .name          = "gradfun",
235
+    .description   = NULL_IF_CONFIG_SMALL("Debands video quickly using gradients."),
236
+    .priv_size     = sizeof(GradFunContext),
237
+    .init          = init,
238
+    .uninit        = uninit,
239
+    .query_formats = query_formats,
240
+
241
+    .inputs    = (AVFilterPad[]) {{ .name             = "default",
242
+                                    .type             = AVMEDIA_TYPE_VIDEO,
243
+                                    .config_props     = config_input,
244
+                                    .start_frame      = start_frame,
245
+                                    .draw_slice       = null_draw_slice, // slices are ignored on input
246
+                                    .end_frame        = end_frame, // all filtering happens here
247
+                                    .min_perms        = AV_PERM_READ, },
248
+                                  { .name = NULL}},
249
+    .outputs   = (AVFilterPad[]) {{ .name             = "default",
250
+                                    .type             = AVMEDIA_TYPE_VIDEO, },
251
+                                  { .name = NULL}},
252
+};
... ...
@@ -1 +1,2 @@
1 1
 MMX-OBJS-$(CONFIG_YADIF_FILTER)              += x86/yadif.o
2
+MMX-OBJS-$(CONFIG_GRADFUN_FILTER)            += x86/gradfun.o
2 3
new file mode 100644
... ...
@@ -0,0 +1,162 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or modify
4
+ * it under the terms of the GNU General Public License as published by
5
+ * the Free Software Foundation; either version 2 of the License, or
6
+ * (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+ * GNU General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU General Public License along
14
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
15
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16
+ */
17
+
18
+#include "libavutil/cpu.h"
19
+#include "libavutil/x86_cpu.h"
20
+#include "libavfilter/gradfun.h"
21
+
22
+DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; // packed words of 127; operand %6 of the filter_line asm
23
+DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; // packed words of 0x00FF; low-byte mask, operand %6 of the blur_line asm
24
+
25
+void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
26
+{ // Inline-asm filter_line using MMX registers; handles 4 pixels per loop iteration. Empty when HAVE_MMX is 0.
27
+#if HAVE_MMX
28
+    intptr_t x;
29
+    if (width & 3) { // width is not a multiple of 4: the C routine handles the trailing pixels
30
+        x = width & ~3;
31
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
32
+        width = x;
33
+    }
34
+    x = -width; // index runs from -width up to 0; the pointer operands below are pre-advanced by width
35
+    __asm__ volatile(
36
+        "movd          %4, %%mm5 \n" // mm5 = thresh
37
+        "pxor       %%mm7, %%mm7 \n" // mm7 = 0
38
+        "pshufw $0, %%mm5, %%mm5 \n" // copy thresh into all four words
39
+        "movq          %6, %%mm6 \n" // mm6 = pw_7f
40
+        "movq          %5, %%mm4 \n" // mm4 = first four dither words
41
+        "1: \n"
42
+        "movd     (%2,%0), %%mm0 \n" // load 4 source bytes
43
+        "movd     (%3,%0), %%mm1 \n" // load 2 dc words
44
+        "punpcklbw  %%mm7, %%mm0 \n" // widen source bytes to words
45
+        "punpcklwd  %%mm1, %%mm1 \n" // duplicate each dc word so it covers two pixels
46
+        "psllw         $7, %%mm0 \n"
47
+        "pxor       %%mm2, %%mm2 \n"
48
+        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
49
+        "psubw      %%mm1, %%mm2 \n"
50
+        "pmaxsw     %%mm1, %%mm2 \n"
51
+        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
52
+        "psubw      %%mm6, %%mm2 \n"
53
+        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
54
+        "pmullw     %%mm2, %%mm2 \n"
55
+        "paddw      %%mm4, %%mm0 \n" // pix += dither
56
+        "pmulhw     %%mm2, %%mm1 \n"
57
+        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
58
+        "paddw      %%mm1, %%mm0 \n" // pix += m
59
+        "psraw         $7, %%mm0 \n"
60
+        "packuswb   %%mm0, %%mm0 \n"
61
+        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
62
+        "add           $4, %0 \n"
63
+        "jl 1b \n"
64
+        "emms \n"
65
+        :"+r"(x)
66
+        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
67
+         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
68
+        :"memory"
69
+    );
70
+#endif
71
+}
72
+
73
+void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
74
+{ // Inline-asm filter_line using XMM registers; handles 8 pixels per loop iteration. Empty when HAVE_SSSE3 is 0.
75
+#if HAVE_SSSE3
76
+    intptr_t x;
77
+    if (width & 7) { // width is not a multiple of 8: the C routine handles the trailing pixels
78
+        // could be 10% faster if I somehow eliminated this
79
+        x = width & ~7;
80
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
81
+        width = x;
82
+    }
83
+    x = -width; // index runs from -width up to 0; the pointer operands below are pre-advanced by width
84
+    __asm__ volatile(
85
+        "movd           %4, %%xmm5 \n" // xmm5 = thresh
86
+        "pxor       %%xmm7, %%xmm7 \n" // xmm7 = 0
87
+        "pshuflw $0,%%xmm5, %%xmm5 \n" // copy thresh into the four low words
88
+        "movdqa         %6, %%xmm6 \n" // xmm6 = pw_7f
89
+        "punpcklqdq %%xmm5, %%xmm5 \n" // ...and into the four high words
90
+        "movdqa         %5, %%xmm4 \n" // xmm4 = the eight dither words (aligned load)
91
+        "1: \n"
92
+        "movq      (%2,%0), %%xmm0 \n" // load 8 source bytes
93
+        "movq      (%3,%0), %%xmm1 \n" // load 4 dc words
94
+        "punpcklbw  %%xmm7, %%xmm0 \n" // widen source bytes to words
95
+        "punpcklwd  %%xmm1, %%xmm1 \n" // duplicate each dc word so it covers two pixels
96
+        "psllw          $7, %%xmm0 \n"
97
+        "psubw      %%xmm0, %%xmm1 \n" // delta = dc - pix
98
+        "pabsw      %%xmm1, %%xmm2 \n"
99
+        "pmulhuw    %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
100
+        "psubw      %%xmm6, %%xmm2 \n"
101
+        "pminsw     %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
102
+        "pmullw     %%xmm2, %%xmm2 \n"
103
+        "psllw          $1, %%xmm2 \n"
104
+        "paddw      %%xmm4, %%xmm0 \n" // pix += dither
105
+        "pmulhrsw   %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
106
+        "paddw      %%xmm1, %%xmm0 \n" // pix += m
107
+        "psraw          $7, %%xmm0 \n"
108
+        "packuswb   %%xmm0, %%xmm0 \n"
109
+        "movq       %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
110
+        "add            $8, %0 \n"
111
+        "jl 1b \n"
112
+        :"+&r"(x)
113
+        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
114
+         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
115
+        :"memory"
116
+    );
117
+#endif // HAVE_SSSE3
118
+}
119
+
120
+void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
121
+{ // Inline-asm blur_line using XMM registers; produces 8 output words per loop iteration. Empty when HAVE_SSE is 0.
122
+#if HAVE_SSE
123
+#define BLURV(load)\
124
+    intptr_t x = -2*width; /* byte index from -2*width up to 0; pointer operands are pre-advanced */\
125
+    __asm__ volatile(\
126
+        "movdqa %6, %%xmm7 \n" /* xmm7 = pw_ff */\
127
+        "1: \n"\
128
+        load"   (%4,%0), %%xmm0 \n" /* 16 bytes of the line at src */\
129
+        load"   (%5,%0), %%xmm1 \n" /* 16 bytes of the line at src + src_linesize */\
130
+        "movdqa  %%xmm0, %%xmm2 \n"\
131
+        "movdqa  %%xmm1, %%xmm3 \n"\
132
+        "psrlw       $8, %%xmm0 \n" /* high byte of each word */\
133
+        "psrlw       $8, %%xmm1 \n"\
134
+        "pand    %%xmm7, %%xmm2 \n" /* low byte of each word */\
135
+        "pand    %%xmm7, %%xmm3 \n"\
136
+        "paddw   %%xmm1, %%xmm0 \n"\
137
+        "paddw   %%xmm3, %%xmm2 \n"\
138
+        "paddw   %%xmm2, %%xmm0 \n" /* sum of the four bytes of each 2x2 cell */\
139
+        "paddw  (%2,%0), %%xmm0 \n" /* += buf1 */\
140
+        "movdqa (%1,%0), %%xmm1 \n" /* previous buf contents */\
141
+        "movdqa  %%xmm0, (%1,%0) \n" /* buf = new sum */\
142
+        "psubw   %%xmm1, %%xmm0 \n"\
143
+        "movdqa  %%xmm0, (%3,%0) \n" /* dc = new sum - previous buf */\
144
+        "add        $16, %0 \n"\
145
+        "jl 1b \n"\
146
+        :"+&r"(x)\
147
+        :"r"(buf+width),\
148
+         "r"(buf1+width),\
149
+         "r"(dc+width),\
150
+         "r"(src+width*2),\
151
+         "r"(src+width*2+src_linesize),\
152
+         "m"(*pw_ff)\
153
+        :"memory"\
154
+    );
155
+    if (((intptr_t) src | src_linesize) & 15) { // src or its stride is not 16-byte aligned: use unaligned source loads
156
+        BLURV("movdqu");
157
+    } else {
158
+        BLURV("movdqa");
159
+    }
160
+#endif // HAVE_SSE
161
+}