Always use the special filter for the first and last 3 columns (only).
Changes made in 64ed397 slowed the filter to just under 3/4 of its
previous speed. This commit restores the speed while maintaining identical output.
For reference, on my Athlon64:
1733222 decicycles in old (before 64ed397)
2358563 decicycles in new (after 64ed397)
1727558 decicycles in this (with this commit)
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
... | ... |
@@ -32,14 +32,17 @@ |
32 | 32 |
#include <assert.h> |
33 | 33 |
|
34 | 34 |
#define CHECK(j)\ |
35 |
- { int score = FFABS(cur[mrefs + off_left + (j)] - cur[prefs + off_left - (j)])\ |
|
35 |
+ { int score = FFABS(cur[mrefs - 1 + (j)] - cur[prefs - 1 - (j)])\ |
|
36 | 36 |
+ FFABS(cur[mrefs +(j)] - cur[prefs -(j)])\ |
37 |
- + FFABS(cur[mrefs + off_right + (j)] - cur[prefs + off_right - (j)]);\ |
|
37 |
+ + FFABS(cur[mrefs + 1 + (j)] - cur[prefs + 1 - (j)]);\ |
|
38 | 38 |
if (score < spatial_score) {\ |
39 | 39 |
spatial_score= score;\ |
40 | 40 |
spatial_pred= (cur[mrefs +(j)] + cur[prefs -(j)])>>1;\ |
41 | 41 |
|
42 |
-#define FILTER(start, end) \ |
|
42 |
+/* The is_not_edge argument here controls whether the code will enter a branch |
|
43 |
+ * which reads up to and including x-3 and x+3. */ |
|
44 |
+ |
|
45 |
+#define FILTER(start, end, is_not_edge) \ |
|
43 | 46 |
for (x = start; x < end; x++) { \ |
44 | 47 |
int c = cur[mrefs]; \ |
45 | 48 |
int d = (prev2[0] + next2[0])>>1; \ |
... | ... |
@@ -49,12 +52,10 @@ |
49 | 49 |
int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1; \ |
50 | 50 |
int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ |
51 | 51 |
int spatial_pred = (c+e) >> 1; \ |
52 |
- int off_right = (x < w - 1) ? 1 : -1;\ |
|
53 |
- int off_left = x ? -1 : 1;\ |
|
54 |
- int spatial_score = FFABS(cur[mrefs + off_left] - cur[prefs + off_left]) + FFABS(c-e) \ |
|
55 |
- + FFABS(cur[mrefs + off_right] - cur[prefs + off_right]) - 1; \ |
|
56 | 52 |
\ |
57 |
- if (x > 2 && x < w - 3) {\ |
|
53 |
+ if (is_not_edge) {\ |
|
54 |
+ int spatial_score = FFABS(cur[mrefs - 1] - cur[prefs - 1]) + FFABS(c-e) \ |
|
55 |
+ + FFABS(cur[mrefs + 1] - cur[prefs + 1]) - 1; \ |
|
58 | 56 |
CHECK(-1) CHECK(-2) }} }} \ |
59 | 57 |
CHECK( 1) CHECK( 2) }} }} \ |
60 | 58 |
}\ |
... | ... |
@@ -95,12 +96,15 @@ static void filter_line_c(void *dst1, |
95 | 95 |
uint8_t *prev2 = parity ? prev : cur ; |
96 | 96 |
uint8_t *next2 = parity ? cur : next; |
97 | 97 |
|
98 |
- FILTER(0, w) |
|
98 |
+ /* The function is called with the pointers already pointing to data[3] and |
|
99 |
+ * with 6 subtracted from the width. This allows the FILTER macro to be |
|
100 |
+ * called so that it processes all the pixels normally. A constant value of |
|
101 |
+ * true for is_not_edge lets the compiler ignore the if statement. */ |
|
102 |
+ FILTER(0, w, 1) |
|
99 | 103 |
} |
100 | 104 |
|
101 | 105 |
static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1, |
102 |
- int w, int prefs, int mrefs, int parity, int mode, |
|
103 |
- int l_edge) |
|
106 |
+ int w, int prefs, int mrefs, int parity, int mode) |
|
104 | 107 |
{ |
105 | 108 |
uint8_t *dst = dst1; |
106 | 109 |
uint8_t *prev = prev1; |
... | ... |
@@ -110,7 +114,9 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1, |
110 | 110 |
uint8_t *prev2 = parity ? prev : cur ; |
111 | 111 |
uint8_t *next2 = parity ? cur : next; |
112 | 112 |
|
113 |
- FILTER(0, l_edge) |
|
113 |
+ /* Only edge pixels need to be processed here. A constant value of false |
|
114 |
+ * for is_not_edge should let the compiler ignore the whole branch. */ |
|
115 |
+ FILTER(0, 3, 0) |
|
114 | 116 |
|
115 | 117 |
dst = (uint8_t*)dst1 + w - 3; |
116 | 118 |
prev = (uint8_t*)prev1 + w - 3; |
... | ... |
@@ -119,7 +125,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1, |
119 | 119 |
prev2 = (uint8_t*)(parity ? prev : cur); |
120 | 120 |
next2 = (uint8_t*)(parity ? cur : next); |
121 | 121 |
|
122 |
- FILTER(w - 3, w) |
|
122 |
+ FILTER(w - 3, w, 0) |
|
123 | 123 |
} |
124 | 124 |
|
125 | 125 |
|
... | ... |
@@ -138,12 +144,11 @@ static void filter_line_c_16bit(void *dst1, |
138 | 138 |
mrefs /= 2; |
139 | 139 |
prefs /= 2; |
140 | 140 |
|
141 |
- FILTER(0, w) |
|
141 |
+ FILTER(0, w, 1) |
|
142 | 142 |
} |
143 | 143 |
|
144 | 144 |
static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1, |
145 |
- int w, int prefs, int mrefs, int parity, int mode, |
|
146 |
- int l_edge) |
|
145 |
+ int w, int prefs, int mrefs, int parity, int mode) |
|
147 | 146 |
{ |
148 | 147 |
uint16_t *dst = dst1; |
149 | 148 |
uint16_t *prev = prev1; |
... | ... |
@@ -153,7 +158,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1, |
153 | 153 |
uint16_t *prev2 = parity ? prev : cur ; |
154 | 154 |
uint16_t *next2 = parity ? cur : next; |
155 | 155 |
|
156 |
- FILTER(0, l_edge) |
|
156 |
+ FILTER(0, 3, 0) |
|
157 | 157 |
|
158 | 158 |
dst = (uint16_t*)dst1 + w - 3; |
159 | 159 |
prev = (uint16_t*)prev1 + w - 3; |
... | ... |
@@ -162,7 +167,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1, |
162 | 162 |
prev2 = (uint16_t*)(parity ? prev : cur); |
163 | 163 |
next2 = (uint16_t*)(parity ? cur : next); |
164 | 164 |
|
165 |
- FILTER(w - 3, w) |
|
165 |
+ FILTER(w - 3, w, 0) |
|
166 | 166 |
} |
167 | 167 |
|
168 | 168 |
static void filter(AVFilterContext *ctx, AVFrame *dstpic, |
... | ... |
@@ -176,7 +181,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, |
176 | 176 |
int h = dstpic->height; |
177 | 177 |
int refs = yadif->cur->linesize[i]; |
178 | 178 |
int df = (yadif->csp->comp[i].depth_minus1 + 8) / 8; |
179 |
- int l_edge, l_edge_pix; |
|
179 |
+ int pix_3 = 3 * df; |
|
180 | 180 |
|
181 | 181 |
if (i == 1 || i == 2) { |
182 | 182 |
/* Why is this not part of the per-plane description thing? */ |
... | ... |
@@ -187,8 +192,6 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, |
187 | 187 |
/* filtering reads 3 pixels to the left/right; to avoid invalid reads, |
188 | 188 |
* we need to call the c variant which avoids this for border pixels |
189 | 189 |
*/ |
190 |
- l_edge = yadif->req_align; |
|
191 |
- l_edge_pix = l_edge / df; |
|
192 | 190 |
|
193 | 191 |
for (y = 0; y < h; y++) { |
194 | 192 |
if ((y ^ parity) & 1) { |
... | ... |
@@ -197,22 +200,15 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, |
197 | 197 |
uint8_t *next = &yadif->next->data[i][y * refs]; |
198 | 198 |
uint8_t *dst = &dstpic->data[i][y * dstpic->linesize[i]]; |
199 | 199 |
int mode = y == 1 || y + 2 == h ? 2 : yadif->mode; |
200 |
- if (yadif->req_align) { |
|
201 |
- yadif->filter_line(dst + l_edge, prev + l_edge, cur + l_edge, |
|
202 |
- next + l_edge, w - l_edge_pix - 3, |
|
203 |
- y + 1 < h ? refs : -refs, |
|
204 |
- y ? -refs : refs, |
|
205 |
- parity ^ tff, mode); |
|
206 |
- yadif->filter_edges(dst, prev, cur, next, w, |
|
207 |
- y + 1 < h ? refs : -refs, |
|
208 |
- y ? -refs : refs, |
|
209 |
- parity ^ tff, mode, l_edge_pix); |
|
210 |
- } else { |
|
211 |
- yadif->filter_line(dst, prev, cur, next + l_edge, w, |
|
212 |
- y + 1 < h ? refs : -refs, |
|
213 |
- y ? -refs : refs, |
|
214 |
- parity ^ tff, mode); |
|
215 |
- } |
|
200 |
+ yadif->filter_line(dst + pix_3, prev + pix_3, cur + pix_3, |
|
201 |
+ next + pix_3, w - 6, |
|
202 |
+ y + 1 < h ? refs : -refs, |
|
203 |
+ y ? -refs : refs, |
|
204 |
+ parity ^ tff, mode); |
|
205 |
+ yadif->filter_edges(dst, prev, cur, next, w, |
|
206 |
+ y + 1 < h ? refs : -refs, |
|
207 |
+ y ? -refs : refs, |
|
208 |
+ parity ^ tff, mode); |
|
216 | 209 |
} else { |
217 | 210 |
memcpy(&dstpic->data[i][y * dstpic->linesize[i]], |
218 | 211 |
&yadif->cur->data[i][y * refs], w * df); |
... | ... |
@@ -42,18 +42,12 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif) |
42 | 42 |
|
43 | 43 |
#if HAVE_YASM |
44 | 44 |
#if ARCH_X86_32 |
45 |
- if (EXTERNAL_MMXEXT(cpu_flags)) { |
|
45 |
+ if (EXTERNAL_MMXEXT(cpu_flags)) |
|
46 | 46 |
yadif->filter_line = ff_yadif_filter_line_mmxext; |
47 |
- yadif->req_align = 8; |
|
48 |
- } |
|
49 | 47 |
#endif /* ARCH_X86_32 */ |
50 |
- if (EXTERNAL_SSE2(cpu_flags)) { |
|
48 |
+ if (EXTERNAL_SSE2(cpu_flags)) |
|
51 | 49 |
yadif->filter_line = ff_yadif_filter_line_sse2; |
52 |
- yadif->req_align = 16; |
|
53 |
- } |
|
54 |
- if (EXTERNAL_SSSE3(cpu_flags)) { |
|
50 |
+ if (EXTERNAL_SSSE3(cpu_flags)) |
|
55 | 51 |
yadif->filter_line = ff_yadif_filter_line_ssse3; |
56 |
- yadif->req_align = 16; |
|
57 |
- } |
|
58 | 52 |
#endif /* HAVE_YASM */ |
59 | 53 |
} |
... | ... |
@@ -57,13 +57,11 @@ typedef struct YADIFContext { |
57 | 57 |
/** |
58 | 58 |
* Filter one line, excluding the first and last 3 pixels (see filter_edges) |
59 | 59 |
*/ |
60 |
- int req_align; |
|
61 | 60 |
void (*filter_line)(void *dst, |
62 | 61 |
void *prev, void *cur, void *next, |
63 | 62 |
int w, int prefs, int mrefs, int parity, int mode); |
64 | 63 |
void (*filter_edges)(void *dst, void *prev, void *cur, void *next, |
65 |
- int w, int prefs, int mrefs, int parity, int mode, |
|
66 |
- int l_edge); |
|
64 |
+ int w, int prefs, int mrefs, int parity, int mode); |
|
67 | 65 |
|
68 | 66 |
const AVPixFmtDescriptor *csp; |
69 | 67 |
int eof; |