Browse code

yadif: restore speed of the C filtering code

Always use the special edge filter for the first and last 3 columns, and only for those columns.

Changes made in 64ed397 slowed the filter to just under 3/4 of its previous
speed. This commit restores the speed while maintaining identical output.

For reference, on my Athlon64:
1733222 decicycles in old (before 64ed397)
2358563 decicycles in new (after 64ed397)
1727558 decicycles with this commit

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

James Darnley authored on 2013/03/10 23:08:50
Showing 3 changed files
... ...
@@ -32,14 +32,17 @@
32 32
 #include <assert.h>
33 33
 
34 34
 #define CHECK(j)\
35
-    {   int score = FFABS(cur[mrefs + off_left + (j)] - cur[prefs + off_left - (j)])\
35
+    {   int score = FFABS(cur[mrefs - 1 + (j)] - cur[prefs - 1 - (j)])\
36 36
                   + FFABS(cur[mrefs  +(j)] - cur[prefs  -(j)])\
37
-                  + FFABS(cur[mrefs + off_right + (j)] - cur[prefs + off_right - (j)]);\
37
+                  + FFABS(cur[mrefs + 1 + (j)] - cur[prefs + 1 - (j)]);\
38 38
         if (score < spatial_score) {\
39 39
             spatial_score= score;\
40 40
             spatial_pred= (cur[mrefs  +(j)] + cur[prefs  -(j)])>>1;\
41 41
 
42
-#define FILTER(start, end) \
42
+/* The is_not_edge argument here controls when the code will enter a branch
43
+ * which reads up to and including x-3 and x+3. */
44
+
45
+#define FILTER(start, end, is_not_edge) \
43 46
     for (x = start;  x < end; x++) { \
44 47
         int c = cur[mrefs]; \
45 48
         int d = (prev2[0] + next2[0])>>1; \
... ...
@@ -49,12 +52,10 @@
49 49
         int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1; \
50 50
         int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
51 51
         int spatial_pred = (c+e) >> 1; \
52
-        int off_right = (x < w - 1) ? 1 : -1;\
53
-        int off_left  = x ? -1 : 1;\
54
-        int spatial_score = FFABS(cur[mrefs + off_left]  - cur[prefs + off_left]) + FFABS(c-e) \
55
-                          + FFABS(cur[mrefs + off_right] - cur[prefs + off_right]) - 1; \
56 52
  \
57
-        if (x > 2 && x < w - 3) {\
53
+        if (is_not_edge) {\
54
+            int spatial_score = FFABS(cur[mrefs - 1] - cur[prefs - 1]) + FFABS(c-e) \
55
+                              + FFABS(cur[mrefs + 1] - cur[prefs + 1]) - 1; \
58 56
             CHECK(-1) CHECK(-2) }} }} \
59 57
             CHECK( 1) CHECK( 2) }} }} \
60 58
         }\
... ...
@@ -95,12 +96,15 @@ static void filter_line_c(void *dst1,
95 95
     uint8_t *prev2 = parity ? prev : cur ;
96 96
     uint8_t *next2 = parity ? cur  : next;
97 97
 
98
-    FILTER(0, w)
98
+    /* The function is called with the pointers already pointing to data[3] and
99
+     * with 6 subtracted from the width.  This allows the FILTER macro to be
100
+     * called so that it processes all the pixels normally.  A constant value of
101
+     * true for is_not_edge lets the compiler ignore the if statement. */
102
+    FILTER(0, w, 1)
99 103
 }
100 104
 
101 105
 static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
102
-                         int w, int prefs, int mrefs, int parity, int mode,
103
-                         int l_edge)
106
+                         int w, int prefs, int mrefs, int parity, int mode)
104 107
 {
105 108
     uint8_t *dst  = dst1;
106 109
     uint8_t *prev = prev1;
... ...
@@ -110,7 +114,9 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
110 110
     uint8_t *prev2 = parity ? prev : cur ;
111 111
     uint8_t *next2 = parity ? cur  : next;
112 112
 
113
-    FILTER(0, l_edge)
113
+    /* Only edge pixels need to be processed here.  A constant value of false
114
+     * for is_not_edge should let the compiler ignore the whole branch. */
115
+    FILTER(0, 3, 0)
114 116
 
115 117
     dst  = (uint8_t*)dst1  + w - 3;
116 118
     prev = (uint8_t*)prev1 + w - 3;
... ...
@@ -119,7 +125,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
119 119
     prev2 = (uint8_t*)(parity ? prev : cur);
120 120
     next2 = (uint8_t*)(parity ? cur  : next);
121 121
 
122
-    FILTER(w - 3, w)
122
+    FILTER(w - 3, w, 0)
123 123
 }
124 124
 
125 125
 
... ...
@@ -138,12 +144,11 @@ static void filter_line_c_16bit(void *dst1,
138 138
     mrefs /= 2;
139 139
     prefs /= 2;
140 140
 
141
-    FILTER(0, w)
141
+    FILTER(0, w, 1)
142 142
 }
143 143
 
144 144
 static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
145
-                               int w, int prefs, int mrefs, int parity, int mode,
146
-                               int l_edge)
145
+                               int w, int prefs, int mrefs, int parity, int mode)
147 146
 {
148 147
     uint16_t *dst  = dst1;
149 148
     uint16_t *prev = prev1;
... ...
@@ -153,7 +158,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
153 153
     uint16_t *prev2 = parity ? prev : cur ;
154 154
     uint16_t *next2 = parity ? cur  : next;
155 155
 
156
-    FILTER(0, l_edge)
156
+    FILTER(0, 3, 0)
157 157
 
158 158
     dst   = (uint16_t*)dst1  + w - 3;
159 159
     prev  = (uint16_t*)prev1 + w - 3;
... ...
@@ -162,7 +167,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
162 162
     prev2 = (uint16_t*)(parity ? prev : cur);
163 163
     next2 = (uint16_t*)(parity ? cur  : next);
164 164
 
165
-    FILTER(w - 3, w)
165
+    FILTER(w - 3, w, 0)
166 166
 }
167 167
 
168 168
 static void filter(AVFilterContext *ctx, AVFrame *dstpic,
... ...
@@ -176,7 +181,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
176 176
         int h = dstpic->height;
177 177
         int refs = yadif->cur->linesize[i];
178 178
         int df = (yadif->csp->comp[i].depth_minus1 + 8) / 8;
179
-        int l_edge, l_edge_pix;
179
+        int pix_3 = 3 * df;
180 180
 
181 181
         if (i == 1 || i == 2) {
182 182
         /* Why is this not part of the per-plane description thing? */
... ...
@@ -187,8 +192,6 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
187 187
         /* filtering reads 3 pixels to the left/right; to avoid invalid reads,
188 188
          * we need to call the c variant which avoids this for border pixels
189 189
          */
190
-        l_edge     = yadif->req_align;
191
-        l_edge_pix = l_edge / df;
192 190
 
193 191
         for (y = 0; y < h; y++) {
194 192
             if ((y ^ parity) & 1) {
... ...
@@ -197,22 +200,15 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
197 197
                 uint8_t *next = &yadif->next->data[i][y * refs];
198 198
                 uint8_t *dst  = &dstpic->data[i][y * dstpic->linesize[i]];
199 199
                 int     mode  = y == 1 || y + 2 == h ? 2 : yadif->mode;
200
-                if (yadif->req_align) {
201
-                    yadif->filter_line(dst + l_edge, prev + l_edge, cur + l_edge,
202
-                                       next + l_edge, w - l_edge_pix - 3,
203
-                                       y + 1 < h ? refs : -refs,
204
-                                       y ? -refs : refs,
205
-                                       parity ^ tff, mode);
206
-                    yadif->filter_edges(dst, prev, cur, next, w,
207
-                                         y + 1 < h ? refs : -refs,
208
-                                         y ? -refs : refs,
209
-                                         parity ^ tff, mode, l_edge_pix);
210
-                } else {
211
-                    yadif->filter_line(dst, prev, cur, next + l_edge, w,
212
-                                       y + 1 < h ? refs : -refs,
213
-                                       y ? -refs : refs,
214
-                                       parity ^ tff, mode);
215
-                }
200
+                yadif->filter_line(dst + pix_3, prev + pix_3, cur + pix_3,
201
+                                   next + pix_3, w - 6,
202
+                                   y + 1 < h ? refs : -refs,
203
+                                   y ? -refs : refs,
204
+                                   parity ^ tff, mode);
205
+                yadif->filter_edges(dst, prev, cur, next, w,
206
+                                    y + 1 < h ? refs : -refs,
207
+                                    y ? -refs : refs,
208
+                                    parity ^ tff, mode);
216 209
             } else {
217 210
                 memcpy(&dstpic->data[i][y * dstpic->linesize[i]],
218 211
                        &yadif->cur->data[i][y * refs], w * df);
... ...
@@ -42,18 +42,12 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
42 42
 
43 43
 #if HAVE_YASM
44 44
 #if ARCH_X86_32
45
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
45
+    if (EXTERNAL_MMXEXT(cpu_flags))
46 46
         yadif->filter_line = ff_yadif_filter_line_mmxext;
47
-        yadif->req_align   = 8;
48
-    }
49 47
 #endif /* ARCH_X86_32 */
50
-    if (EXTERNAL_SSE2(cpu_flags)) {
48
+    if (EXTERNAL_SSE2(cpu_flags))
51 49
         yadif->filter_line = ff_yadif_filter_line_sse2;
52
-        yadif->req_align   = 16;
53
-    }
54
-    if (EXTERNAL_SSSE3(cpu_flags)) {
50
+    if (EXTERNAL_SSSE3(cpu_flags))
55 51
         yadif->filter_line = ff_yadif_filter_line_ssse3;
56
-        yadif->req_align   = 16;
57
-    }
58 52
 #endif /* HAVE_YASM */
59 53
 }
... ...
@@ -57,13 +57,11 @@ typedef struct YADIFContext {
57 57
     /**
58 58
      * Required alignment for filter_line
59 59
      */
60
-    int req_align;
61 60
     void (*filter_line)(void *dst,
62 61
                         void *prev, void *cur, void *next,
63 62
                         int w, int prefs, int mrefs, int parity, int mode);
64 63
     void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
65
-                         int w, int prefs, int mrefs, int parity, int mode,
66
-                         int l_edge);
64
+                         int w, int prefs, int mrefs, int parity, int mode);
67 65
 
68 66
     const AVPixFmtDescriptor *csp;
69 67
     int eof;