Browse code

cosmetics: Fix indentation to be 4 spaces and consistently place {}.

Originally committed as revision 12552 to svn://svn.ffmpeg.org/ffmpeg/trunk

Diego Biurrun authored on 2008/03/23 00:46:34
Showing 5 changed files
... ...
@@ -116,65 +116,65 @@ DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
116 116
 
117 117
 static struct PPFilter filters[]=
118 118
 {
119
-        {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
120
-        {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
121
-/*      {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
122
-        {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
123
-        {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
124
-        {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
125
-        {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
126
-        {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
127
-        {"dr", "dering",                1, 5, 6, DERING},
128
-        {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
129
-        {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
130
-        {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
131
-        {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
132
-        {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
133
-        {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
134
-        {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
135
-        {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
136
-        {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
137
-        {NULL, NULL,0,0,0,0} //End Marker
119
+    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
120
+    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
121
+/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
122
+    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
123
+    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
124
+    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
125
+    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
126
+    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
127
+    {"dr", "dering",                1, 5, 6, DERING},
128
+    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
129
+    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
130
+    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
131
+    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
132
+    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
133
+    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
134
+    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
135
+    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
136
+    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
137
+    {NULL, NULL,0,0,0,0} //End Marker
138 138
 };
139 139
 
140 140
 static const char *replaceTable[]=
141 141
 {
142
-        "default",      "hb:a,vb:a,dr:a",
143
-        "de",           "hb:a,vb:a,dr:a",
144
-        "fast",         "h1:a,v1:a,dr:a",
145
-        "fa",           "h1:a,v1:a,dr:a",
146
-        "ac",           "ha:a:128:7,va:a,dr:a",
147
-        NULL //End Marker
142
+    "default",      "hb:a,vb:a,dr:a",
143
+    "de",           "hb:a,vb:a,dr:a",
144
+    "fast",         "h1:a,v1:a,dr:a",
145
+    "fa",           "h1:a,v1:a,dr:a",
146
+    "ac",           "ha:a:128:7,va:a,dr:a",
147
+    NULL //End Marker
148 148
 };
149 149
 
150 150
 
151 151
 #if defined(ARCH_X86)
152 152
 static inline void prefetchnta(void *p)
153 153
 {
154
-        asm volatile(   "prefetchnta (%0)\n\t"
155
-                : : "r" (p)
156
-        );
154
+    asm volatile(   "prefetchnta (%0)\n\t"
155
+        : : "r" (p)
156
+    );
157 157
 }
158 158
 
159 159
 static inline void prefetcht0(void *p)
160 160
 {
161
-        asm volatile(   "prefetcht0 (%0)\n\t"
162
-                : : "r" (p)
163
-        );
161
+    asm volatile(   "prefetcht0 (%0)\n\t"
162
+        : : "r" (p)
163
+    );
164 164
 }
165 165
 
166 166
 static inline void prefetcht1(void *p)
167 167
 {
168
-        asm volatile(   "prefetcht1 (%0)\n\t"
169
-                : : "r" (p)
170
-        );
168
+    asm volatile(   "prefetcht1 (%0)\n\t"
169
+        : : "r" (p)
170
+    );
171 171
 }
172 172
 
173 173
 static inline void prefetcht2(void *p)
174 174
 {
175
-        asm volatile(   "prefetcht2 (%0)\n\t"
176
-                : : "r" (p)
177
-        );
175
+    asm volatile(   "prefetcht2 (%0)\n\t"
176
+        : : "r" (p)
177
+    );
178 178
 }
179 179
 #endif
180 180
 
... ...
@@ -185,171 +185,167 @@ static inline void prefetcht2(void *p)
185 185
  */
186 186
 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
187 187
 {
188
-        int numEq= 0;
189
-        int y;
190
-        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
191
-        const int dcThreshold= dcOffset*2 + 1;
192
-
193
-        for(y=0; y<BLOCK_SIZE; y++)
194
-        {
195
-                if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
196
-                if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
197
-                if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
198
-                if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
199
-                if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
200
-                if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
201
-                if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
202
-                src+= stride;
203
-        }
204
-        return numEq > c->ppMode.flatnessThreshold;
188
+    int numEq= 0;
189
+    int y;
190
+    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
191
+    const int dcThreshold= dcOffset*2 + 1;
192
+
193
+    for(y=0; y<BLOCK_SIZE; y++){
194
+        if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
195
+        if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
196
+        if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
197
+        if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
198
+        if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
199
+        if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
200
+        if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
201
+        src+= stride;
202
+    }
203
+    return numEq > c->ppMode.flatnessThreshold;
205 204
 }
206 205
 
207 206
 /**
208 207
  * Check if the middle 8x8 Block in the given 8x16 block is flat
209 208
  */
210
-static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
211
-        int numEq= 0;
212
-        int y;
213
-        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
214
-        const int dcThreshold= dcOffset*2 + 1;
215
-
216
-        src+= stride*4; // src points to begin of the 8x8 Block
217
-        for(y=0; y<BLOCK_SIZE-1; y++)
218
-        {
219
-                if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
220
-                if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
221
-                if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
222
-                if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
223
-                if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
224
-                if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
225
-                if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
226
-                if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
227
-                src+= stride;
228
-        }
229
-        return numEq > c->ppMode.flatnessThreshold;
209
+static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
210
+{
211
+    int numEq= 0;
212
+    int y;
213
+    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
214
+    const int dcThreshold= dcOffset*2 + 1;
215
+
216
+    src+= stride*4; // src points to begin of the 8x8 Block
217
+    for(y=0; y<BLOCK_SIZE-1; y++){
218
+        if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
219
+        if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
220
+        if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
221
+        if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
222
+        if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
223
+        if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
224
+        if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
225
+        if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
226
+        src+= stride;
227
+    }
228
+    return numEq > c->ppMode.flatnessThreshold;
230 229
 }
231 230
 
232 231
 static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
233 232
 {
234
-        int i;
233
+    int i;
235 234
 #if 1
236
-        for(i=0; i<2; i++){
237
-                if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
238
-                src += stride;
239
-                if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
240
-                src += stride;
241
-                if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
242
-                src += stride;
243
-                if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
244
-                src += stride;
245
-        }
235
+    for(i=0; i<2; i++){
236
+        if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
237
+        src += stride;
238
+        if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
239
+        src += stride;
240
+        if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
241
+        src += stride;
242
+        if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
243
+        src += stride;
244
+    }
246 245
 #else
247
-        for(i=0; i<8; i++){
248
-                if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0;
249
-                src += stride;
250
-        }
246
+    for(i=0; i<8; i++){
247
+        if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0;
248
+        src += stride;
249
+    }
251 250
 #endif
252
-        return 1;
251
+    return 1;
253 252
 }
254 253
 
255 254
 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
256 255
 {
257 256
 #if 1
258 257
 #if 1
259
-        int x;
260
-        src+= stride*4;
261
-        for(x=0; x<BLOCK_SIZE; x+=4)
262
-        {
263
-                if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
264
-                if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
265
-                if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
266
-                if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
267
-        }
258
+    int x;
259
+    src+= stride*4;
260
+    for(x=0; x<BLOCK_SIZE; x+=4){
261
+        if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
262
+        if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
263
+        if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
264
+        if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
265
+    }
268 266
 #else
269
-        int x;
270
-        src+= stride*3;
271
-        for(x=0; x<BLOCK_SIZE; x++)
272
-        {
273
-                if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
274
-        }
267
+    int x;
268
+    src+= stride*3;
269
+    for(x=0; x<BLOCK_SIZE; x++){
270
+        if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
271
+    }
275 272
 #endif
276
-        return 1;
273
+    return 1;
277 274
 #else
278
-        int x;
279
-        src+= stride*4;
280
-        for(x=0; x<BLOCK_SIZE; x++)
281
-        {
282
-                int min=255;
283
-                int max=0;
284
-                int y;
285
-                for(y=0; y<8; y++){
286
-                        int v= src[x + y*stride];
287
-                        if(v>max) max=v;
288
-                        if(v<min) min=v;
289
-                }
290
-                if(max-min > 2*QP) return 0;
275
+    int x;
276
+    src+= stride*4;
277
+    for(x=0; x<BLOCK_SIZE; x++){
278
+        int min=255;
279
+        int max=0;
280
+        int y;
281
+        for(y=0; y<8; y++){
282
+            int v= src[x + y*stride];
283
+            if(v>max) max=v;
284
+            if(v<min) min=v;
291 285
         }
292
-        return 1;
286
+        if(max-min > 2*QP) return 0;
287
+    }
288
+    return 1;
293 289
 #endif
294 290
 }
295 291
 
296
-static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
297
-        if( isHorizDC_C(src, stride, c) ){
298
-                if( isHorizMinMaxOk_C(src, stride, c->QP) )
299
-                        return 1;
300
-                else
301
-                        return 0;
302
-        }else{
303
-                return 2;
304
-        }
292
+static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
293
+{
294
+    if( isHorizDC_C(src, stride, c) ){
295
+        if( isHorizMinMaxOk_C(src, stride, c->QP) )
296
+            return 1;
297
+        else
298
+            return 0;
299
+    }else{
300
+        return 2;
301
+    }
305 302
 }
306 303
 
307
-static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
308
-        if( isVertDC_C(src, stride, c) ){
309
-                if( isVertMinMaxOk_C(src, stride, c->QP) )
310
-                        return 1;
311
-                else
312
-                        return 0;
313
-        }else{
314
-                return 2;
315
-        }
304
+static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
305
+{
306
+    if( isVertDC_C(src, stride, c) ){
307
+        if( isVertMinMaxOk_C(src, stride, c->QP) )
308
+            return 1;
309
+        else
310
+            return 0;
311
+    }else{
312
+        return 2;
313
+    }
316 314
 }
317 315
 
318 316
 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
319 317
 {
320
-        int y;
321
-        for(y=0; y<BLOCK_SIZE; y++)
322
-        {
323
-                const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
324
-
325
-                if(FFABS(middleEnergy) < 8*c->QP)
326
-                {
327
-                        const int q=(dst[3] - dst[4])/2;
328
-                        const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
329
-                        const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
330
-
331
-                        int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
332
-                        d= FFMAX(d, 0);
333
-
334
-                        d= (5*d + 32) >> 6;
335
-                        d*= FFSIGN(-middleEnergy);
336
-
337
-                        if(q>0)
338
-                        {
339
-                                d= d<0 ? 0 : d;
340
-                                d= d>q ? q : d;
341
-                        }
342
-                        else
343
-                        {
344
-                                d= d>0 ? 0 : d;
345
-                                d= d<q ? q : d;
346
-                        }
347
-
348
-                        dst[3]-= d;
349
-                        dst[4]+= d;
350
-                }
351
-                dst+= stride;
318
+    int y;
319
+    for(y=0; y<BLOCK_SIZE; y++){
320
+        const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
321
+
322
+        if(FFABS(middleEnergy) < 8*c->QP){
323
+            const int q=(dst[3] - dst[4])/2;
324
+            const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
325
+            const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
326
+
327
+            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
328
+            d= FFMAX(d, 0);
329
+
330
+            d= (5*d + 32) >> 6;
331
+            d*= FFSIGN(-middleEnergy);
332
+
333
+            if(q>0)
334
+            {
335
+                d= d<0 ? 0 : d;
336
+                d= d>q ? q : d;
337
+            }
338
+            else
339
+            {
340
+                d= d>0 ? 0 : d;
341
+                d= d<q ? q : d;
342
+            }
343
+
344
+            dst[3]-= d;
345
+            dst[4]+= d;
352 346
         }
347
+        dst+= stride;
348
+    }
353 349
 }
354 350
 
355 351
 /**
... ...
@@ -358,35 +354,34 @@ static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
358 358
  */
359 359
 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
360 360
 {
361
-        int y;
362
-        for(y=0; y<BLOCK_SIZE; y++)
363
-        {
364
-                const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
365
-                const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
366
-
367
-                int sums[10];
368
-                sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
369
-                sums[1] = sums[0] - first  + dst[3];
370
-                sums[2] = sums[1] - first  + dst[4];
371
-                sums[3] = sums[2] - first  + dst[5];
372
-                sums[4] = sums[3] - first  + dst[6];
373
-                sums[5] = sums[4] - dst[0] + dst[7];
374
-                sums[6] = sums[5] - dst[1] + last;
375
-                sums[7] = sums[6] - dst[2] + last;
376
-                sums[8] = sums[7] - dst[3] + last;
377
-                sums[9] = sums[8] - dst[4] + last;
378
-
379
-                dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
380
-                dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
381
-                dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
382
-                dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
383
-                dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
384
-                dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
385
-                dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
386
-                dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
387
-
388
-                dst+= stride;
389
-        }
361
+    int y;
362
+    for(y=0; y<BLOCK_SIZE; y++){
363
+        const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
364
+        const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
365
+
366
+        int sums[10];
367
+        sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
368
+        sums[1] = sums[0] - first  + dst[3];
369
+        sums[2] = sums[1] - first  + dst[4];
370
+        sums[3] = sums[2] - first  + dst[5];
371
+        sums[4] = sums[3] - first  + dst[6];
372
+        sums[5] = sums[4] - dst[0] + dst[7];
373
+        sums[6] = sums[5] - dst[1] + last;
374
+        sums[7] = sums[6] - dst[2] + last;
375
+        sums[8] = sums[7] - dst[3] + last;
376
+        sums[9] = sums[8] - dst[4] + last;
377
+
378
+        dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
379
+        dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
380
+        dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
381
+        dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
382
+        dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
383
+        dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
384
+        dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
385
+        dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
386
+
387
+        dst+= stride;
388
+    }
390 389
 }
391 390
 
392 391
 /**
... ...
@@ -399,161 +394,154 @@ static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
399 399
  */
400 400
 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
401 401
 {
402
-        int y;
403
-        static uint64_t *lut= NULL;
404
-        if(lut==NULL)
402
+    int y;
403
+    static uint64_t *lut= NULL;
404
+    if(lut==NULL)
405
+    {
406
+        int i;
407
+        lut = av_malloc(256*8);
408
+        for(i=0; i<256; i++)
405 409
         {
406
-                int i;
407
-                lut = av_malloc(256*8);
408
-                for(i=0; i<256; i++)
409
-                {
410
-                        int v= i < 128 ? 2*i : 2*(i-256);
410
+            int v= i < 128 ? 2*i : 2*(i-256);
411 411
 /*
412 412
 //Simulate 112242211 9-Tap filter
413
-                        uint64_t a= (v/16) & 0xFF;
414
-                        uint64_t b= (v/8) & 0xFF;
415
-                        uint64_t c= (v/4) & 0xFF;
416
-                        uint64_t d= (3*v/8) & 0xFF;
413
+            uint64_t a= (v/16)  & 0xFF;
414
+            uint64_t b= (v/8)   & 0xFF;
415
+            uint64_t c= (v/4)   & 0xFF;
416
+            uint64_t d= (3*v/8) & 0xFF;
417 417
 */
418 418
 //Simulate piecewise linear interpolation
419
-                        uint64_t a= (v/16) & 0xFF;
420
-                        uint64_t b= (v*3/16) & 0xFF;
421
-                        uint64_t c= (v*5/16) & 0xFF;
422
-                        uint64_t d= (7*v/16) & 0xFF;
423
-                        uint64_t A= (0x100 - a)&0xFF;
424
-                        uint64_t B= (0x100 - b)&0xFF;
425
-                        uint64_t C= (0x100 - c)&0xFF;
426
-                        uint64_t D= (0x100 - c)&0xFF;
427
-
428
-                        lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
429
-                                (D<<24) | (C<<16) | (B<<8) | (A);
430
-                        //lut[i] = (v<<32) | (v<<24);
431
-                }
419
+            uint64_t a= (v/16)   & 0xFF;
420
+            uint64_t b= (v*3/16) & 0xFF;
421
+            uint64_t c= (v*5/16) & 0xFF;
422
+            uint64_t d= (7*v/16) & 0xFF;
423
+            uint64_t A= (0x100 - a)&0xFF;
424
+            uint64_t B= (0x100 - b)&0xFF;
425
+            uint64_t C= (0x100 - c)&0xFF;
426
+            uint64_t D= (0x100 - c)&0xFF;
427
+
428
+            lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
429
+                       (D<<24) | (C<<16) | (B<<8)  | (A);
430
+            //lut[i] = (v<<32) | (v<<24);
432 431
         }
432
+    }
433 433
 
434
-        for(y=0; y<BLOCK_SIZE; y++)
435
-        {
436
-                int a= src[1] - src[2];
437
-                int b= src[3] - src[4];
438
-                int c= src[5] - src[6];
439
-
440
-                int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
434
+    for(y=0; y<BLOCK_SIZE; y++){
435
+        int a= src[1] - src[2];
436
+        int b= src[3] - src[4];
437
+        int c= src[5] - src[6];
441 438
 
442
-                if(d < QP)
443
-                {
444
-                        int v = d * FFSIGN(-b);
439
+        int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
445 440
 
446
-                        src[1] +=v/8;
447
-                        src[2] +=v/4;
448
-                        src[3] +=3*v/8;
449
-                        src[4] -=3*v/8;
450
-                        src[5] -=v/4;
451
-                        src[6] -=v/8;
441
+        if(d < QP){
442
+            int v = d * FFSIGN(-b);
452 443
 
453
-                }
454
-                src+=stride;
444
+            src[1] +=v/8;
445
+            src[2] +=v/4;
446
+            src[3] +=3*v/8;
447
+            src[4] -=3*v/8;
448
+            src[5] -=v/4;
449
+            src[6] -=v/8;
455 450
         }
451
+        src+=stride;
452
+    }
456 453
 }
457 454
 
458 455
 /**
459 456
  * accurate deblock filter
460 457
  */
461 458
 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
462
-        int y;
463
-        const int QP= c->QP;
464
-        const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
465
-        const int dcThreshold= dcOffset*2 + 1;
459
+    int y;
460
+    const int QP= c->QP;
461
+    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
462
+    const int dcThreshold= dcOffset*2 + 1;
466 463
 //START_TIMER
467
-        src+= step*4; // src points to begin of the 8x8 Block
468
-        for(y=0; y<8; y++){
469
-                int numEq= 0;
470
-
471
-                if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
472
-                if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
473
-                if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
474
-                if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
475
-                if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
476
-                if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
477
-                if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
478
-                if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
479
-                if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
480
-                if(numEq > c->ppMode.flatnessThreshold){
481
-                        int min, max, x;
482
-
483
-                        if(src[0] > src[step]){
484
-                            max= src[0];
485
-                            min= src[step];
486
-                        }else{
487
-                            max= src[step];
488
-                            min= src[0];
489
-                        }
490
-                        for(x=2; x<8; x+=2){
491
-                                if(src[x*step] > src[(x+1)*step]){
492
-                                        if(src[x    *step] > max) max= src[ x   *step];
493
-                                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
494
-                                }else{
495
-                                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
496
-                                        if(src[ x   *step] < min) min= src[ x   *step];
497
-                                }
498
-                        }
499
-                        if(max-min < 2*QP){
500
-                                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
501
-                                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
502
-
503
-                                int sums[10];
504
-                                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
505
-                                sums[1] = sums[0] - first       + src[3*step];
506
-                                sums[2] = sums[1] - first       + src[4*step];
507
-                                sums[3] = sums[2] - first       + src[5*step];
508
-                                sums[4] = sums[3] - first       + src[6*step];
509
-                                sums[5] = sums[4] - src[0*step] + src[7*step];
510
-                                sums[6] = sums[5] - src[1*step] + last;
511
-                                sums[7] = sums[6] - src[2*step] + last;
512
-                                sums[8] = sums[7] - src[3*step] + last;
513
-                                sums[9] = sums[8] - src[4*step] + last;
514
-
515
-                                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
516
-                                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
517
-                                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
518
-                                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
519
-                                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
520
-                                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
521
-                                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
522
-                                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
523
-                        }
464
+    src+= step*4; // src points to begin of the 8x8 Block
465
+    for(y=0; y<8; y++){
466
+        int numEq= 0;
467
+
468
+        if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
469
+        if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
470
+        if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
471
+        if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
472
+        if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
473
+        if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
474
+        if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
475
+        if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
476
+        if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
477
+        if(numEq > c->ppMode.flatnessThreshold){
478
+            int min, max, x;
479
+
480
+            if(src[0] > src[step]){
481
+                max= src[0];
482
+                min= src[step];
483
+            }else{
484
+                max= src[step];
485
+                min= src[0];
486
+            }
487
+            for(x=2; x<8; x+=2){
488
+                if(src[x*step] > src[(x+1)*step]){
489
+                        if(src[x    *step] > max) max= src[ x   *step];
490
+                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
524 491
                 }else{
525
-                        const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
526
-
527
-                        if(FFABS(middleEnergy) < 8*QP)
528
-                        {
529
-                                const int q=(src[3*step] - src[4*step])/2;
530
-                                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
531
-                                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
532
-
533
-                                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
534
-                                d= FFMAX(d, 0);
535
-
536
-                                d= (5*d + 32) >> 6;
537
-                                d*= FFSIGN(-middleEnergy);
538
-
539
-                                if(q>0)
540
-                                {
541
-                                        d= d<0 ? 0 : d;
542
-                                        d= d>q ? q : d;
543
-                                }
544
-                                else
545
-                                {
546
-                                        d= d>0 ? 0 : d;
547
-                                        d= d<q ? q : d;
548
-                                }
549
-
550
-                                src[3*step]-= d;
551
-                                src[4*step]+= d;
552
-                        }
492
+                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
493
+                        if(src[ x   *step] < min) min= src[ x   *step];
494
+                }
495
+            }
496
+            if(max-min < 2*QP){
497
+                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
498
+                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
499
+
500
+                int sums[10];
501
+                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
502
+                sums[1] = sums[0] - first       + src[3*step];
503
+                sums[2] = sums[1] - first       + src[4*step];
504
+                sums[3] = sums[2] - first       + src[5*step];
505
+                sums[4] = sums[3] - first       + src[6*step];
506
+                sums[5] = sums[4] - src[0*step] + src[7*step];
507
+                sums[6] = sums[5] - src[1*step] + last;
508
+                sums[7] = sums[6] - src[2*step] + last;
509
+                sums[8] = sums[7] - src[3*step] + last;
510
+                sums[9] = sums[8] - src[4*step] + last;
511
+
512
+                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
513
+                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
514
+                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
515
+                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
516
+                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
517
+                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
518
+                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
519
+                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
520
+            }
521
+        }else{
522
+            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
523
+
524
+            if(FFABS(middleEnergy) < 8*QP){
525
+                const int q=(src[3*step] - src[4*step])/2;
526
+                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
527
+                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
528
+
529
+                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
530
+                d= FFMAX(d, 0);
531
+
532
+                d= (5*d + 32) >> 6;
533
+                d*= FFSIGN(-middleEnergy);
534
+
535
+                if(q>0){
536
+                    d= d<0 ? 0 : d;
537
+                    d= d>q ? q : d;
538
+                }else{
539
+                    d= d>0 ? 0 : d;
540
+                    d= d<q ? q : d;
553 541
                 }
554 542
 
555
-                src += stride;
543
+                src[3*step]-= d;
544
+                src[4*step]+= d;
545
+            }
556 546
         }
547
+
548
+        src += stride;
549
+    }
557 550
 /*if(step==16){
558 551
     STOP_TIMER("step16")
559 552
 }else{
... ...
@@ -642,43 +630,43 @@ static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride,
642 642
 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
643 643
         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
644 644
 {
645
-        PPContext *c= (PPContext *)vc;
646
-        PPMode *ppMode= (PPMode *)vm;
647
-        c->ppMode= *ppMode; //FIXME
645
+    PPContext *c= (PPContext *)vc;
646
+    PPMode *ppMode= (PPMode *)vm;
647
+    c->ppMode= *ppMode; //FIXME
648 648
 
649
-        // Using ifs here as they are faster than function pointers although the
650
-        // difference would not be measurable here but it is much better because
651
-        // someone might exchange the CPU without restarting MPlayer ;)
649
+    // Using ifs here as they are faster than function pointers although the
650
+    // difference would not be measurable here but it is much better because
651
+    // someone might exchange the CPU without restarting MPlayer ;)
652 652
 #ifdef RUNTIME_CPUDETECT
653 653
 #if defined(ARCH_X86)
654
-        // ordered per speed fastest first
655
-        if(c->cpuCaps & PP_CPU_CAPS_MMX2)
656
-                postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
657
-        else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
658
-                postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
659
-        else if(c->cpuCaps & PP_CPU_CAPS_MMX)
660
-                postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
661
-        else
662
-                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
654
+    // ordered per speed fastest first
655
+    if(c->cpuCaps & PP_CPU_CAPS_MMX2)
656
+        postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
657
+    else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
658
+        postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
659
+    else if(c->cpuCaps & PP_CPU_CAPS_MMX)
660
+        postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
661
+    else
662
+        postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
663 663
 #else
664 664
 #ifdef HAVE_ALTIVEC
665
-        if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
666
-                postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
667
-        else
665
+    if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
666
+            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
667
+    else
668 668
 #endif
669
-                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
669
+            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
670 670
 #endif
671 671
 #else //RUNTIME_CPUDETECT
672 672
 #ifdef HAVE_MMX2
673
-                postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
673
+            postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
674 674
 #elif defined (HAVE_3DNOW)
675
-                postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
675
+            postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
676 676
 #elif defined (HAVE_MMX)
677
-                postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
677
+            postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
678 678
 #elif defined (HAVE_ALTIVEC)
679
-                postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
679
+            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
680 680
 #else
681
-                postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
681
+            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
682 682
 #endif
683 683
 #endif //!RUNTIME_CPUDETECT
684 684
 }
... ...
@@ -738,194 +726,177 @@ const char pp_help[] =
738 738
 
739 739
 pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality)
740 740
 {
741
-        char temp[GET_MODE_BUFFER_SIZE];
742
-        char *p= temp;
743
-        static const char filterDelimiters[] = ",/";
744
-        static const char optionDelimiters[] = ":";
745
-        struct PPMode *ppMode;
746
-        char *filterToken;
747
-
748
-        ppMode= av_malloc(sizeof(PPMode));
749
-
750
-        ppMode->lumMode= 0;
751
-        ppMode->chromMode= 0;
752
-        ppMode->maxTmpNoise[0]= 700;
753
-        ppMode->maxTmpNoise[1]= 1500;
754
-        ppMode->maxTmpNoise[2]= 3000;
755
-        ppMode->maxAllowedY= 234;
756
-        ppMode->minAllowedY= 16;
757
-        ppMode->baseDcDiff= 256/8;
758
-        ppMode->flatnessThreshold= 56-16-1;
759
-        ppMode->maxClippedThreshold= 0.01;
760
-        ppMode->error=0;
761
-
762
-        strncpy(temp, name, GET_MODE_BUFFER_SIZE);
763
-
764
-        av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
765
-
766
-        for(;;){
767
-                char *filterName;
768
-                int q= 1000000; //PP_QUALITY_MAX;
769
-                int chrom=-1;
770
-                int luma=-1;
771
-                char *option;
772
-                char *options[OPTIONS_ARRAY_SIZE];
773
-                int i;
774
-                int filterNameOk=0;
775
-                int numOfUnknownOptions=0;
776
-                int enable=1; //does the user want us to enable or disable the filter
777
-
778
-                filterToken= strtok(p, filterDelimiters);
779
-                if(filterToken == NULL) break;
780
-                p+= strlen(filterToken) + 1; // p points to next filterToken
781
-                filterName= strtok(filterToken, optionDelimiters);
782
-                av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
783
-
784
-                if(*filterName == '-')
785
-                {
786
-                        enable=0;
787
-                        filterName++;
788
-                }
741
+    char temp[GET_MODE_BUFFER_SIZE];
742
+    char *p= temp;
743
+    static const char filterDelimiters[] = ",/";
744
+    static const char optionDelimiters[] = ":";
745
+    struct PPMode *ppMode;
746
+    char *filterToken;
747
+
748
+    ppMode= av_malloc(sizeof(PPMode));
749
+
750
+    ppMode->lumMode= 0;
751
+    ppMode->chromMode= 0;
752
+    ppMode->maxTmpNoise[0]= 700;
753
+    ppMode->maxTmpNoise[1]= 1500;
754
+    ppMode->maxTmpNoise[2]= 3000;
755
+    ppMode->maxAllowedY= 234;
756
+    ppMode->minAllowedY= 16;
757
+    ppMode->baseDcDiff= 256/8;
758
+    ppMode->flatnessThreshold= 56-16-1;
759
+    ppMode->maxClippedThreshold= 0.01;
760
+    ppMode->error=0;
761
+
762
+    strncpy(temp, name, GET_MODE_BUFFER_SIZE);
763
+
764
+    av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
765
+
766
+    for(;;){
767
+        char *filterName;
768
+        int q= 1000000; //PP_QUALITY_MAX;
769
+        int chrom=-1;
770
+        int luma=-1;
771
+        char *option;
772
+        char *options[OPTIONS_ARRAY_SIZE];
773
+        int i;
774
+        int filterNameOk=0;
775
+        int numOfUnknownOptions=0;
776
+        int enable=1; //does the user want us to enable or disable the filter
777
+
778
+        filterToken= strtok(p, filterDelimiters);
779
+        if(filterToken == NULL) break;
780
+        p+= strlen(filterToken) + 1; // p points to next filterToken
781
+        filterName= strtok(filterToken, optionDelimiters);
782
+        av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
783
+
784
+        if(*filterName == '-'){
785
+            enable=0;
786
+            filterName++;
787
+        }
789 788
 
790
-                for(;;){ //for all options
791
-                        option= strtok(NULL, optionDelimiters);
792
-                        if(option == NULL) break;
793
-
794
-                        av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
795
-                        if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
796
-                        else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
797
-                        else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
798
-                        else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
799
-                        else
800
-                        {
801
-                                options[numOfUnknownOptions] = option;
802
-                                numOfUnknownOptions++;
803
-                        }
804
-                        if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
789
+        for(;;){ //for all options
790
+            option= strtok(NULL, optionDelimiters);
791
+            if(option == NULL) break;
792
+
793
+            av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
794
+            if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
795
+            else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
796
+            else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
797
+            else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
798
+            else{
799
+                options[numOfUnknownOptions] = option;
800
+                numOfUnknownOptions++;
801
+            }
802
+            if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
803
+        }
804
+        options[numOfUnknownOptions] = NULL;
805
+
806
+        /* replace stuff from the replace Table */
807
+        for(i=0; replaceTable[2*i]!=NULL; i++){
808
+            if(!strcmp(replaceTable[2*i], filterName)){
809
+                int newlen= strlen(replaceTable[2*i + 1]);
810
+                int plen;
811
+                int spaceLeft;
812
+
813
+                if(p==NULL) p= temp, *p=0;      //last filter
814
+                else p--, *p=',';               //not last filter
815
+
816
+                plen= strlen(p);
817
+                spaceLeft= p - temp + plen;
818
+                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE){
819
+                    ppMode->error++;
820
+                    break;
805 821
                 }
806
-                options[numOfUnknownOptions] = NULL;
822
+                memmove(p + newlen, p, plen+1);
823
+                memcpy(p, replaceTable[2*i + 1], newlen);
824
+                filterNameOk=1;
825
+            }
826
+        }
807 827
 
808
-                /* replace stuff from the replace Table */
809
-                for(i=0; replaceTable[2*i]!=NULL; i++)
810
-                {
811
-                        if(!strcmp(replaceTable[2*i], filterName))
812
-                        {
813
-                                int newlen= strlen(replaceTable[2*i + 1]);
814
-                                int plen;
815
-                                int spaceLeft;
816
-
817
-                                if(p==NULL) p= temp, *p=0;      //last filter
818
-                                else p--, *p=',';               //not last filter
819
-
820
-                                plen= strlen(p);
821
-                                spaceLeft= p - temp + plen;
822
-                                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
823
-                                {
824
-                                        ppMode->error++;
825
-                                        break;
826
-                                }
827
-                                memmove(p + newlen, p, plen+1);
828
-                                memcpy(p, replaceTable[2*i + 1], newlen);
829
-                                filterNameOk=1;
828
+        for(i=0; filters[i].shortName!=NULL; i++){
829
+            if(   !strcmp(filters[i].longName, filterName)
830
+               || !strcmp(filters[i].shortName, filterName)){
831
+                ppMode->lumMode &= ~filters[i].mask;
832
+                ppMode->chromMode &= ~filters[i].mask;
833
+
834
+                filterNameOk=1;
835
+                if(!enable) break; // user wants to disable it
836
+
837
+                if(q >= filters[i].minLumQuality && luma)
838
+                    ppMode->lumMode|= filters[i].mask;
839
+                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
840
+                    if(q >= filters[i].minChromQuality)
841
+                            ppMode->chromMode|= filters[i].mask;
842
+
843
+                if(filters[i].mask == LEVEL_FIX){
844
+                    int o;
845
+                    ppMode->minAllowedY= 16;
846
+                    ppMode->maxAllowedY= 234;
847
+                    for(o=0; options[o]!=NULL; o++){
848
+                        if(  !strcmp(options[o],"fullyrange")
849
+                           ||!strcmp(options[o],"f")){
850
+                            ppMode->minAllowedY= 0;
851
+                            ppMode->maxAllowedY= 255;
852
+                            numOfUnknownOptions--;
830 853
                         }
854
+                    }
831 855
                 }
832
-
833
-                for(i=0; filters[i].shortName!=NULL; i++)
856
+                else if(filters[i].mask == TEMP_NOISE_FILTER)
834 857
                 {
835
-                        if(   !strcmp(filters[i].longName, filterName)
836
-                           || !strcmp(filters[i].shortName, filterName))
837
-                        {
838
-                                ppMode->lumMode &= ~filters[i].mask;
839
-                                ppMode->chromMode &= ~filters[i].mask;
840
-
841
-                                filterNameOk=1;
842
-                                if(!enable) break; // user wants to disable it
843
-
844
-                                if(q >= filters[i].minLumQuality && luma)
845
-                                        ppMode->lumMode|= filters[i].mask;
846
-                                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
847
-                                        if(q >= filters[i].minChromQuality)
848
-                                                ppMode->chromMode|= filters[i].mask;
849
-
850
-                                if(filters[i].mask == LEVEL_FIX)
851
-                                {
852
-                                        int o;
853
-                                        ppMode->minAllowedY= 16;
854
-                                        ppMode->maxAllowedY= 234;
855
-                                        for(o=0; options[o]!=NULL; o++)
856
-                                        {
857
-                                                if(  !strcmp(options[o],"fullyrange")
858
-                                                   ||!strcmp(options[o],"f"))
859
-                                                {
860
-                                                        ppMode->minAllowedY= 0;
861
-                                                        ppMode->maxAllowedY= 255;
862
-                                                        numOfUnknownOptions--;
863
-                                                }
864
-                                        }
865
-                                }
866
-                                else if(filters[i].mask == TEMP_NOISE_FILTER)
867
-                                {
868
-                                        int o;
869
-                                        int numOfNoises=0;
870
-
871
-                                        for(o=0; options[o]!=NULL; o++)
872
-                                        {
873
-                                                char *tail;
874
-                                                ppMode->maxTmpNoise[numOfNoises]=
875
-                                                        strtol(options[o], &tail, 0);
876
-                                                if(tail!=options[o])
877
-                                                {
878
-                                                        numOfNoises++;
879
-                                                        numOfUnknownOptions--;
880
-                                                        if(numOfNoises >= 3) break;
881
-                                                }
882
-                                        }
883
-                                }
884
-                                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
885
-                                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
886
-                                {
887
-                                        int o;
888
-
889
-                                        for(o=0; options[o]!=NULL && o<2; o++)
890
-                                        {
891
-                                                char *tail;
892
-                                                int val= strtol(options[o], &tail, 0);
893
-                                                if(tail==options[o]) break;
894
-
895
-                                                numOfUnknownOptions--;
896
-                                                if(o==0) ppMode->baseDcDiff= val;
897
-                                                else ppMode->flatnessThreshold= val;
898
-                                        }
899
-                                }
900
-                                else if(filters[i].mask == FORCE_QUANT)
901
-                                {
902
-                                        int o;
903
-                                        ppMode->forcedQuant= 15;
904
-
905
-                                        for(o=0; options[o]!=NULL && o<1; o++)
906
-                                        {
907
-                                                char *tail;
908
-                                                int val= strtol(options[o], &tail, 0);
909
-                                                if(tail==options[o]) break;
910
-
911
-                                                numOfUnknownOptions--;
912
-                                                ppMode->forcedQuant= val;
913
-                                        }
914
-                                }
858
+                    int o;
859
+                    int numOfNoises=0;
860
+
861
+                    for(o=0; options[o]!=NULL; o++){
862
+                        char *tail;
863
+                        ppMode->maxTmpNoise[numOfNoises]=
864
+                            strtol(options[o], &tail, 0);
865
+                        if(tail!=options[o]){
866
+                            numOfNoises++;
867
+                            numOfUnknownOptions--;
868
+                            if(numOfNoises >= 3) break;
915 869
                         }
870
+                    }
916 871
                 }
917
-                if(!filterNameOk) ppMode->error++;
918
-                ppMode->error += numOfUnknownOptions;
919
-        }
920
-
921
-        av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
922
-        if(ppMode->error)
923
-        {
924
-                av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
925
-                av_free(ppMode);
926
-                return NULL;
872
+                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
873
+                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
874
+                    int o;
875
+
876
+                    for(o=0; options[o]!=NULL && o<2; o++){
877
+                        char *tail;
878
+                        int val= strtol(options[o], &tail, 0);
879
+                        if(tail==options[o]) break;
880
+
881
+                        numOfUnknownOptions--;
882
+                        if(o==0) ppMode->baseDcDiff= val;
883
+                        else ppMode->flatnessThreshold= val;
884
+                    }
885
+                }
886
+                else if(filters[i].mask == FORCE_QUANT){
887
+                    int o;
888
+                    ppMode->forcedQuant= 15;
889
+
890
+                    for(o=0; options[o]!=NULL && o<1; o++){
891
+                        char *tail;
892
+                        int val= strtol(options[o], &tail, 0);
893
+                        if(tail==options[o]) break;
894
+
895
+                        numOfUnknownOptions--;
896
+                        ppMode->forcedQuant= val;
897
+                    }
898
+                }
899
+            }
927 900
         }
928
-        return ppMode;
901
+        if(!filterNameOk) ppMode->error++;
902
+        ppMode->error += numOfUnknownOptions;
903
+    }
904
+
905
+    av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
906
+    if(ppMode->error){
907
+        av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
908
+        av_free(ppMode);
909
+        return NULL;
910
+    }
911
+    return ppMode;
929 912
 }
930 913
 
931 914
 void pp_free_mode(pp_mode_t *mode){
... ...
@@ -933,36 +904,35 @@ void pp_free_mode(pp_mode_t *mode){
933 933
 }
934 934
 
935 935
 static void reallocAlign(void **p, int alignment, int size){
936
-        av_free(*p);
937
-        *p= av_mallocz(size);
936
+    av_free(*p);
937
+    *p= av_mallocz(size);
938 938
 }
939 939
 
940 940
 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
941
-        int mbWidth = (width+15)>>4;
942
-        int mbHeight= (height+15)>>4;
943
-        int i;
944
-
945
-        c->stride= stride;
946
-        c->qpStride= qpStride;
947
-
948
-        reallocAlign((void **)&c->tempDst, 8, stride*24);
949
-        reallocAlign((void **)&c->tempSrc, 8, stride*24);
950
-        reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
951
-        reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
952
-        for(i=0; i<256; i++)
953
-                c->yHistogram[i]= width*height/64*15/256;
954
-
955
-        for(i=0; i<3; i++)
956
-        {
957
-                //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
958
-                reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
959
-                reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
960
-        }
961
-
962
-        reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
963
-        reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
964
-        reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
965
-        reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
941
+    int mbWidth = (width+15)>>4;
942
+    int mbHeight= (height+15)>>4;
943
+    int i;
944
+
945
+    c->stride= stride;
946
+    c->qpStride= qpStride;
947
+
948
+    reallocAlign((void **)&c->tempDst, 8, stride*24);
949
+    reallocAlign((void **)&c->tempSrc, 8, stride*24);
950
+    reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
951
+    reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
952
+    for(i=0; i<256; i++)
953
+            c->yHistogram[i]= width*height/64*15/256;
954
+
955
+    for(i=0; i<3; i++){
956
+        //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
957
+        reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
958
+        reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
959
+    }
960
+
961
+    reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
962
+    reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
963
+    reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
964
+    reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
966 965
 }
967 966
 
968 967
 static const char * context_to_name(void * ptr) {
... ...
@@ -972,153 +942,146 @@ static const char * context_to_name(void * ptr) {
972 972
 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
973 973
 
974 974
 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
975
-        PPContext *c= av_malloc(sizeof(PPContext));
976
-        int stride= (width+15)&(~15);    //assumed / will realloc if needed
977
-        int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
978
-
979
-        memset(c, 0, sizeof(PPContext));
980
-        c->av_class = &av_codec_context_class;
981
-        c->cpuCaps= cpuCaps;
982
-        if(cpuCaps&PP_FORMAT){
983
-                c->hChromaSubSample= cpuCaps&0x3;
984
-                c->vChromaSubSample= (cpuCaps>>4)&0x3;
985
-        }else{
986
-                c->hChromaSubSample= 1;
987
-                c->vChromaSubSample= 1;
988
-        }
989
-
990
-        reallocBuffers(c, width, height, stride, qpStride);
991
-
992
-        c->frameNum=-1;
993
-
994
-        return c;
975
+    PPContext *c= av_malloc(sizeof(PPContext));
976
+    int stride= (width+15)&(~15);    //assumed / will realloc if needed
977
+    int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
978
+
979
+    memset(c, 0, sizeof(PPContext));
980
+    c->av_class = &av_codec_context_class;
981
+    c->cpuCaps= cpuCaps;
982
+    if(cpuCaps&PP_FORMAT){
983
+        c->hChromaSubSample= cpuCaps&0x3;
984
+        c->vChromaSubSample= (cpuCaps>>4)&0x3;
985
+    }else{
986
+        c->hChromaSubSample= 1;
987
+        c->vChromaSubSample= 1;
988
+    }
989
+
990
+    reallocBuffers(c, width, height, stride, qpStride);
991
+
992
+    c->frameNum=-1;
993
+
994
+    return c;
995 995
 }
996 996
 
997 997
 void pp_free_context(void *vc){
998
-        PPContext *c = (PPContext*)vc;
999
-        int i;
998
+    PPContext *c = (PPContext*)vc;
999
+    int i;
1000 1000
 
1001
-        for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1002
-        for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1001
+    for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1002
+    for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1003 1003
 
1004
-        av_free(c->tempBlocks);
1005
-        av_free(c->yHistogram);
1006
-        av_free(c->tempDst);
1007
-        av_free(c->tempSrc);
1008
-        av_free(c->deintTemp);
1009
-        av_free(c->stdQPTable);
1010
-        av_free(c->nonBQPTable);
1011
-        av_free(c->forcedQPTable);
1004
+    av_free(c->tempBlocks);
1005
+    av_free(c->yHistogram);
1006
+    av_free(c->tempDst);
1007
+    av_free(c->tempSrc);
1008
+    av_free(c->deintTemp);
1009
+    av_free(c->stdQPTable);
1010
+    av_free(c->nonBQPTable);
1011
+    av_free(c->forcedQPTable);
1012 1012
 
1013
-        memset(c, 0, sizeof(PPContext));
1013
+    memset(c, 0, sizeof(PPContext));
1014 1014
 
1015
-        av_free(c);
1015
+    av_free(c);
1016 1016
 }
1017 1017
 
1018 1018
 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
1019
-                 uint8_t * dst[3], const int dstStride[3],
1020
-                 int width, int height,
1021
-                 const QP_STORE_T *QP_store,  int QPStride,
1022
-                 pp_mode_t *vm,  void *vc, int pict_type)
1019
+                     uint8_t * dst[3], const int dstStride[3],
1020
+                     int width, int height,
1021
+                     const QP_STORE_T *QP_store,  int QPStride,
1022
+                     pp_mode_t *vm,  void *vc, int pict_type)
1023 1023
 {
1024
-        int mbWidth = (width+15)>>4;
1025
-        int mbHeight= (height+15)>>4;
1026
-        PPMode *mode = (PPMode*)vm;
1027
-        PPContext *c = (PPContext*)vc;
1028
-        int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
1029
-        int absQPStride = FFABS(QPStride);
1030
-
1031
-        // c->stride and c->qpStride are always positive
1032
-        if(c->stride < minStride || c->qpStride < absQPStride)
1033
-                reallocBuffers(c, width, height,
1034
-                                FFMAX(minStride, c->stride),
1035
-                                FFMAX(c->qpStride, absQPStride));
1036
-
1037
-        if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1038
-        {
1039
-                int i;
1040
-                QP_store= c->forcedQPTable;
1041
-                absQPStride = QPStride = 0;
1042
-                if(mode->lumMode & FORCE_QUANT)
1043
-                        for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
1044
-                else
1045
-                        for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1046
-        }
1024
+    int mbWidth = (width+15)>>4;
1025
+    int mbHeight= (height+15)>>4;
1026
+    PPMode *mode = (PPMode*)vm;
1027
+    PPContext *c = (PPContext*)vc;
1028
+    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
1029
+    int absQPStride = FFABS(QPStride);
1030
+
1031
+    // c->stride and c->qpStride are always positive
1032
+    if(c->stride < minStride || c->qpStride < absQPStride)
1033
+        reallocBuffers(c, width, height,
1034
+                       FFMAX(minStride, c->stride),
1035
+                       FFMAX(c->qpStride, absQPStride));
1036
+
1037
+    if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
1038
+        int i;
1039
+        QP_store= c->forcedQPTable;
1040
+        absQPStride = QPStride = 0;
1041
+        if(mode->lumMode & FORCE_QUANT)
1042
+            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
1043
+        else
1044
+            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1045
+    }
1047 1046
 
1048
-        if(pict_type & PP_PICT_TYPE_QP2){
1049
-                int i;
1050
-                const int count= mbHeight * absQPStride;
1051
-                for(i=0; i<(count>>2); i++){
1052
-                        ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1053
-                }
1054
-                for(i<<=2; i<count; i++){
1055
-                        c->stdQPTable[i] = QP_store[i]>>1;
1056
-                }
1057
-                QP_store= c->stdQPTable;
1058
-                QPStride= absQPStride;
1047
+    if(pict_type & PP_PICT_TYPE_QP2){
1048
+        int i;
1049
+        const int count= mbHeight * absQPStride;
1050
+        for(i=0; i<(count>>2); i++){
1051
+            ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1059 1052
         }
1060
-
1061
-if(0){
1062
-int x,y;
1063
-for(y=0; y<mbHeight; y++){
1064
-        for(x=0; x<mbWidth; x++){
1053
+        for(i<<=2; i<count; i++){
1054
+            c->stdQPTable[i] = QP_store[i]>>1;
1055
+        }
1056
+        QP_store= c->stdQPTable;
1057
+        QPStride= absQPStride;
1058
+    }
1059
+
1060
+    if(0){
1061
+        int x,y;
1062
+        for(y=0; y<mbHeight; y++){
1063
+            for(x=0; x<mbWidth; x++){
1065 1064
                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1065
+            }
1066
+            av_log(c, AV_LOG_INFO, "\n");
1066 1067
         }
1067 1068
         av_log(c, AV_LOG_INFO, "\n");
1068
-}
1069
-        av_log(c, AV_LOG_INFO, "\n");
1070
-}
1071
-
1072
-        if((pict_type&7)!=3)
1073
-        {
1074
-                if (QPStride >= 0) {
1075
-                        int i;
1076
-                        const int count= mbHeight * QPStride;
1077
-                        for(i=0; i<(count>>2); i++){
1078
-                                ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1079
-                        }
1080
-                        for(i<<=2; i<count; i++){
1081
-                                c->nonBQPTable[i] = QP_store[i] & 0x3F;
1082
-                        }
1083
-                } else {
1084
-                        int i,j;
1085
-                        for(i=0; i<mbHeight; i++) {
1086
-                                    for(j=0; j<absQPStride; j++) {
1087
-                                        c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1088
-                                }
1089
-                        }
1069
+    }
1070
+
1071
+    if((pict_type&7)!=3){
1072
+        if (QPStride >= 0){
1073
+            int i;
1074
+            const int count= mbHeight * QPStride;
1075
+            for(i=0; i<(count>>2); i++){
1076
+                ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1077
+            }
1078
+            for(i<<=2; i<count; i++){
1079
+                c->nonBQPTable[i] = QP_store[i] & 0x3F;
1080
+            }
1081
+        } else {
1082
+            int i,j;
1083
+            for(i=0; i<mbHeight; i++) {
1084
+                for(j=0; j<absQPStride; j++) {
1085
+                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1090 1086
                 }
1087
+            }
1091 1088
         }
1089
+    }
1092 1090
 
1093
-        av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1094
-               mode->lumMode, mode->chromMode);
1091
+    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1092
+           mode->lumMode, mode->chromMode);
1095 1093
 
1096
-        postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1094
+    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1097 1095
                 width, height, QP_store, QPStride, 0, mode, c);
1098 1096
 
1099
-        width  = (width )>>c->hChromaSubSample;
1100
-        height = (height)>>c->vChromaSubSample;
1101
-
1102
-        if(mode->chromMode)
1103
-        {
1104
-                postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1105
-                        width, height, QP_store, QPStride, 1, mode, c);
1106
-                postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1107
-                        width, height, QP_store, QPStride, 2, mode, c);
1108
-        }
1109
-        else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1110
-        {
1111
-                linecpy(dst[1], src[1], height, srcStride[1]);
1112
-                linecpy(dst[2], src[2], height, srcStride[2]);
1113
-        }
1114
-        else
1115
-        {
1116
-                int y;
1117
-                for(y=0; y<height; y++)
1118
-                {
1119
-                        memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1120
-                        memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1121
-                }
1097
+    width  = (width )>>c->hChromaSubSample;
1098
+    height = (height)>>c->vChromaSubSample;
1099
+
1100
+    if(mode->chromMode){
1101
+        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1102
+                    width, height, QP_store, QPStride, 1, mode, c);
1103
+        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1104
+                    width, height, QP_store, QPStride, 2, mode, c);
1105
+    }
1106
+    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1107
+        linecpy(dst[1], src[1], height, srcStride[1]);
1108
+        linecpy(dst[2], src[2], height, srcStride[2]);
1109
+    }else{
1110
+        int y;
1111
+        for(y=0; y<height; y++){
1112
+            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1113
+            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1122 1114
         }
1115
+    }
1123 1116
 }
1124 1117
 
... ...
@@ -59,10 +59,10 @@ extern const char pp_help[]; ///< a simple help text
59 59
 #endif
60 60
 
61 61
 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
62
-                 uint8_t * dst[3], const int dstStride[3],
63
-                 int horizontalSize, int verticalSize,
64
-                 const QP_STORE_T *QP_store,  int QP_stride,
65
-                 pp_mode_t *mode, pp_context_t *ppContext, int pict_type);
62
+                     uint8_t * dst[3], const int dstStride[3],
63
+                     int horizontalSize, int verticalSize,
64
+                     const QP_STORE_T *QP_store,  int QP_stride,
65
+                     pp_mode_t *mode, pp_context_t *ppContext, int pict_type);
66 66
 
67 67
 
68 68
 /**
... ...
@@ -23,353 +23,352 @@
23 23
 #include "avutil.h"
24 24
 
25 25
 #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
26
-  do {                                                                  \
27
-    __typeof__(src_a) tempA1, tempB1, tempC1, tempD1;                   \
28
-    __typeof__(src_a) tempE1, tempF1, tempG1, tempH1;                   \
29
-    __typeof__(src_a) tempA2, tempB2, tempC2, tempD2;                   \
30
-    __typeof__(src_a) tempE2, tempF2, tempG2, tempH2;                   \
31
-    tempA1 = vec_mergeh (src_a, src_e);                                 \
32
-    tempB1 = vec_mergel (src_a, src_e);                                 \
33
-    tempC1 = vec_mergeh (src_b, src_f);                                 \
34
-    tempD1 = vec_mergel (src_b, src_f);                                 \
35
-    tempE1 = vec_mergeh (src_c, src_g);                                 \
36
-    tempF1 = vec_mergel (src_c, src_g);                                 \
37
-    tempG1 = vec_mergeh (src_d, src_h);                                 \
38
-    tempH1 = vec_mergel (src_d, src_h);                                 \
39
-    tempA2 = vec_mergeh (tempA1, tempE1);                               \
40
-    tempB2 = vec_mergel (tempA1, tempE1);                               \
41
-    tempC2 = vec_mergeh (tempB1, tempF1);                               \
42
-    tempD2 = vec_mergel (tempB1, tempF1);                               \
43
-    tempE2 = vec_mergeh (tempC1, tempG1);                               \
44
-    tempF2 = vec_mergel (tempC1, tempG1);                               \
45
-    tempG2 = vec_mergeh (tempD1, tempH1);                               \
46
-    tempH2 = vec_mergel (tempD1, tempH1);                               \
47
-    src_a = vec_mergeh (tempA2, tempE2);                                \
48
-    src_b = vec_mergel (tempA2, tempE2);                                \
49
-    src_c = vec_mergeh (tempB2, tempF2);                                \
50
-    src_d = vec_mergel (tempB2, tempF2);                                \
51
-    src_e = vec_mergeh (tempC2, tempG2);                                \
52
-    src_f = vec_mergel (tempC2, tempG2);                                \
53
-    src_g = vec_mergeh (tempD2, tempH2);                                \
54
-    src_h = vec_mergel (tempD2, tempH2);                                \
55
-  } while (0)
26
+    do {                                                          \
27
+        __typeof__(src_a) tempA1, tempB1, tempC1, tempD1;         \
28
+        __typeof__(src_a) tempE1, tempF1, tempG1, tempH1;         \
29
+        __typeof__(src_a) tempA2, tempB2, tempC2, tempD2;         \
30
+        __typeof__(src_a) tempE2, tempF2, tempG2, tempH2;         \
31
+        tempA1 = vec_mergeh (src_a, src_e);                       \
32
+        tempB1 = vec_mergel (src_a, src_e);                       \
33
+        tempC1 = vec_mergeh (src_b, src_f);                       \
34
+        tempD1 = vec_mergel (src_b, src_f);                       \
35
+        tempE1 = vec_mergeh (src_c, src_g);                       \
36
+        tempF1 = vec_mergel (src_c, src_g);                       \
37
+        tempG1 = vec_mergeh (src_d, src_h);                       \
38
+        tempH1 = vec_mergel (src_d, src_h);                       \
39
+        tempA2 = vec_mergeh (tempA1, tempE1);                     \
40
+        tempB2 = vec_mergel (tempA1, tempE1);                     \
41
+        tempC2 = vec_mergeh (tempB1, tempF1);                     \
42
+        tempD2 = vec_mergel (tempB1, tempF1);                     \
43
+        tempE2 = vec_mergeh (tempC1, tempG1);                     \
44
+        tempF2 = vec_mergel (tempC1, tempG1);                     \
45
+        tempG2 = vec_mergeh (tempD1, tempH1);                     \
46
+        tempH2 = vec_mergel (tempD1, tempH1);                     \
47
+        src_a = vec_mergeh (tempA2, tempE2);                      \
48
+        src_b = vec_mergel (tempA2, tempE2);                      \
49
+        src_c = vec_mergeh (tempB2, tempF2);                      \
50
+        src_d = vec_mergel (tempB2, tempF2);                      \
51
+        src_e = vec_mergeh (tempC2, tempG2);                      \
52
+        src_f = vec_mergel (tempC2, tempG2);                      \
53
+        src_g = vec_mergeh (tempD2, tempH2);                      \
54
+        src_h = vec_mergel (tempD2, tempH2);                      \
55
+    } while (0)
56 56
 
57 57
 
58 58
 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
59
-  /*
59
+    /*
60 60
     this code makes no assumption on src or stride.
61 61
     One could remove the recomputation of the perm
62 62
     vector by assuming (stride % 16) == 0, unfortunately
63 63
     this is not always true.
64
-  */
65
-  DECLARE_ALIGNED(16, short, data[8]);
66
-  int numEq;
67
-  uint8_t *src2 = src;
68
-  vector signed short v_dcOffset;
69
-  vector signed short v2QP;
70
-  vector unsigned short v4QP;
71
-  vector unsigned short v_dcThreshold;
72
-  const int properStride = (stride % 16);
73
-  const int srcAlign = ((unsigned long)src2 % 16);
74
-  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
75
-  const vector signed int zero = vec_splat_s32(0);
76
-  const vector signed short mask = vec_splat_s16(1);
77
-  vector signed int v_numEq = vec_splat_s32(0);
78
-
79
-  data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
80
-  data[1] = data[0] * 2 + 1;
81
-  data[2] = c->QP * 2;
82
-  data[3] = c->QP * 4;
83
-  vector signed short v_data = vec_ld(0, data);
84
-  v_dcOffset = vec_splat(v_data, 0);
85
-  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
86
-  v2QP = vec_splat(v_data, 2);
87
-  v4QP = (vector unsigned short)vec_splat(v_data, 3);
88
-
89
-  src2 += stride * 4;
90
-
91
-  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
64
+    */
65
+    DECLARE_ALIGNED(16, short, data[8]);
66
+    int numEq;
67
+    uint8_t *src2 = src;
68
+    vector signed short v_dcOffset;
69
+    vector signed short v2QP;
70
+    vector unsigned short v4QP;
71
+    vector unsigned short v_dcThreshold;
72
+    const int properStride = (stride % 16);
73
+    const int srcAlign = ((unsigned long)src2 % 16);
74
+    const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
75
+    const vector signed int zero = vec_splat_s32(0);
76
+    const vector signed short mask = vec_splat_s16(1);
77
+    vector signed int v_numEq = vec_splat_s32(0);
78
+
79
+    data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
80
+    data[1] = data[0] * 2 + 1;
81
+    data[2] = c->QP * 2;
82
+    data[3] = c->QP * 4;
83
+    vector signed short v_data = vec_ld(0, data);
84
+    v_dcOffset = vec_splat(v_data, 0);
85
+    v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
86
+    v2QP = vec_splat(v_data, 2);
87
+    v4QP = (vector unsigned short)vec_splat(v_data, 3);
88
+
89
+    src2 += stride * 4;
90
+
91
+    vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
92 92
 
93 93
 #define LOAD_LINE(i)                                                    \
94
-  register int j##i = i * stride;                                       \
95
-  vector unsigned char perm##i = vec_lvsl(j##i, src2);                  \
96
-  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);           \
97
-  vector unsigned char v_srcA2##i;                                      \
98
-  if (two_vectors)                                                      \
99
-    v_srcA2##i = vec_ld(j##i + 16, src2);                               \
100
-  const vector unsigned char v_srcA##i =                                \
101
-    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                          \
102
-  v_srcAss##i =                                                         \
103
-    (vector signed short)vec_mergeh((vector signed char)zero,           \
104
-                                    (vector signed char)v_srcA##i)
94
+    register int j##i = i * stride;                                     \
95
+    vector unsigned char perm##i = vec_lvsl(j##i, src2);                \
96
+    const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);         \
97
+    vector unsigned char v_srcA2##i;                                    \
98
+    if (two_vectors)                                                    \
99
+        v_srcA2##i = vec_ld(j##i + 16, src2);                           \
100
+    const vector unsigned char v_srcA##i =                              \
101
+        vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                      \
102
+    v_srcAss##i =                                                       \
103
+        (vector signed short)vec_mergeh((vector signed char)zero,       \
104
+                                        (vector signed char)v_srcA##i)
105 105
 
106 106
 #define LOAD_LINE_ALIGNED(i)                                            \
107
-  register int j##i = i * stride;                                       \
108
-  const vector unsigned char v_srcA##i = vec_ld(j##i, src2);            \
109
-  v_srcAss##i =                                                         \
110
-    (vector signed short)vec_mergeh((vector signed char)zero,           \
111
-                                    (vector signed char)v_srcA##i)
107
+    register int j##i = i * stride;                                     \
108
+    const vector unsigned char v_srcA##i = vec_ld(j##i, src2);          \
109
+    v_srcAss##i =                                                       \
110
+        (vector signed short)vec_mergeh((vector signed char)zero,       \
111
+                                        (vector signed char)v_srcA##i)
112 112
 
113 113
     /* Special-casing the aligned case is worthwhile, as all calls from
114 114
      * the (transposed) horizontable deblocks will be aligned, in addition
115 115
      * to the naturally aligned vertical deblocks. */
116 116
     if (properStride && srcAlign) {
117
-      LOAD_LINE_ALIGNED(0);
118
-      LOAD_LINE_ALIGNED(1);
119
-      LOAD_LINE_ALIGNED(2);
120
-      LOAD_LINE_ALIGNED(3);
121
-      LOAD_LINE_ALIGNED(4);
122
-      LOAD_LINE_ALIGNED(5);
123
-      LOAD_LINE_ALIGNED(6);
124
-      LOAD_LINE_ALIGNED(7);
117
+        LOAD_LINE_ALIGNED(0);
118
+        LOAD_LINE_ALIGNED(1);
119
+        LOAD_LINE_ALIGNED(2);
120
+        LOAD_LINE_ALIGNED(3);
121
+        LOAD_LINE_ALIGNED(4);
122
+        LOAD_LINE_ALIGNED(5);
123
+        LOAD_LINE_ALIGNED(6);
124
+        LOAD_LINE_ALIGNED(7);
125 125
     } else {
126
-      LOAD_LINE(0);
127
-      LOAD_LINE(1);
128
-      LOAD_LINE(2);
129
-      LOAD_LINE(3);
130
-      LOAD_LINE(4);
131
-      LOAD_LINE(5);
132
-      LOAD_LINE(6);
133
-      LOAD_LINE(7);
126
+        LOAD_LINE(0);
127
+        LOAD_LINE(1);
128
+        LOAD_LINE(2);
129
+        LOAD_LINE(3);
130
+        LOAD_LINE(4);
131
+        LOAD_LINE(5);
132
+        LOAD_LINE(6);
133
+        LOAD_LINE(7);
134 134
     }
135 135
 #undef LOAD_LINE
136 136
 #undef LOAD_LINE_ALIGNED
137 137
 
138 138
 #define ITER(i, j)                                                      \
139
-  const vector signed short v_diff##i =                                 \
140
-    vec_sub(v_srcAss##i, v_srcAss##j);                                  \
141
-  const vector signed short v_sum##i =                                  \
142
-    vec_add(v_diff##i, v_dcOffset);                                     \
143
-  const vector signed short v_comp##i =                                 \
144
-    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,     \
145
-                                   v_dcThreshold);                      \
146
-  const vector signed short v_part##i = vec_and(mask, v_comp##i);       \
147
-  v_numEq = vec_sum4s(v_part##i, v_numEq);
148
-
149
-  ITER(0, 1);
150
-  ITER(1, 2);
151
-  ITER(2, 3);
152
-  ITER(3, 4);
153
-  ITER(4, 5);
154
-  ITER(5, 6);
155
-  ITER(6, 7);
139
+    const vector signed short v_diff##i =                               \
140
+        vec_sub(v_srcAss##i, v_srcAss##j);                              \
141
+    const vector signed short v_sum##i =                                \
142
+        vec_add(v_diff##i, v_dcOffset);                                 \
143
+    const vector signed short v_comp##i =                               \
144
+        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
145
+                                       v_dcThreshold);                  \
146
+    const vector signed short v_part##i = vec_and(mask, v_comp##i);     \
147
+    v_numEq = vec_sum4s(v_part##i, v_numEq);
148
+
149
+    ITER(0, 1);
150
+    ITER(1, 2);
151
+    ITER(2, 3);
152
+    ITER(3, 4);
153
+    ITER(4, 5);
154
+    ITER(5, 6);
155
+    ITER(6, 7);
156 156
 #undef ITER
157 157
 
158
-  v_numEq = vec_sums(v_numEq, zero);
159
-
160
-  v_numEq = vec_splat(v_numEq, 3);
161
-  vec_ste(v_numEq, 0, &numEq);
162
-
163
-  if (numEq > c->ppMode.flatnessThreshold)
164
-    {
165
-      const vector unsigned char mmoP1 = (const vector unsigned char)
166
-        AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
167
-            0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
168
-      const vector unsigned char mmoP2 = (const vector unsigned char)
169
-        AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
170
-            0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
171
-      const vector unsigned char mmoP = (const vector unsigned char)
172
-        vec_lvsl(8, (unsigned char*)0);
173
-
174
-      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
175
-      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
176
-      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
177
-      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
178
-      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
179
-      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
180
-      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
181
-      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
182
-
183
-      if (vec_any_gt(mmoSum, v4QP))
184
-        return 0;
185
-      else
186
-        return 1;
158
+    v_numEq = vec_sums(v_numEq, zero);
159
+
160
+    v_numEq = vec_splat(v_numEq, 3);
161
+    vec_ste(v_numEq, 0, &numEq);
162
+
163
+    if (numEq > c->ppMode.flatnessThreshold){
164
+        const vector unsigned char mmoP1 = (const vector unsigned char)
165
+            AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
166
+                0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
167
+        const vector unsigned char mmoP2 = (const vector unsigned char)
168
+            AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
169
+                0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
170
+        const vector unsigned char mmoP = (const vector unsigned char)
171
+            vec_lvsl(8, (unsigned char*)0);
172
+
173
+        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
174
+        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
175
+        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
176
+        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
177
+        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
178
+        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
179
+        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
180
+        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
181
+
182
+        if (vec_any_gt(mmoSum, v4QP))
183
+            return 0;
184
+        else
185
+            return 1;
187 186
     }
188
-  else return 2;
187
+    else return 2;
189 188
 }
190 189
 
191 190
 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
192
-  /*
191
+    /*
193 192
     this code makes no assumption on src or stride.
194 193
     One could remove the recomputation of the perm
195 194
     vector by assuming (stride % 16) == 0, unfortunately
196 195
     this is not always true. Quite a lot of load/stores
197 196
     can be removed by assuming proper alignment of
198 197
     src & stride :-(
199
-  */
200
-  uint8_t *src2 = src;
201
-  const vector signed int zero = vec_splat_s32(0);
202
-  const int properStride = (stride % 16);
203
-  const int srcAlign = ((unsigned long)src2 % 16);
204
-  DECLARE_ALIGNED(16, short, qp[8]);
205
-  qp[0] = c->QP;
206
-  vector signed short vqp = vec_ld(0, qp);
207
-  vqp = vec_splat(vqp, 0);
208
-
209
-  src2 += stride*3;
210
-
211
-  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
212
-  vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
213
-  vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
214
-  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
198
+    */
199
+    uint8_t *src2 = src;
200
+    const vector signed int zero = vec_splat_s32(0);
201
+    const int properStride = (stride % 16);
202
+    const int srcAlign = ((unsigned long)src2 % 16);
203
+    DECLARE_ALIGNED(16, short, qp[8]);
204
+    qp[0] = c->QP;
205
+    vector signed short vqp = vec_ld(0, qp);
206
+    vqp = vec_splat(vqp, 0);
207
+
208
+    src2 += stride*3;
209
+
210
+    vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
211
+    vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
212
+    vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
213
+    vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
215 214
 
216 215
 #define LOAD_LINE(i)                                                    \
217
-  const vector unsigned char perml##i =                                 \
218
-    vec_lvsl(i * stride, src2);                                         \
219
-  vbA##i = vec_ld(i * stride, src2);                                    \
220
-  vbB##i = vec_ld(i * stride + 16, src2);                               \
221
-  vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                          \
222
-  vb##i =                                                               \
223
-    (vector signed short)vec_mergeh((vector unsigned char)zero,         \
224
-                                    (vector unsigned char)vbT##i)
216
+    const vector unsigned char perml##i =                               \
217
+        vec_lvsl(i * stride, src2);                                     \
218
+    vbA##i = vec_ld(i * stride, src2);                                  \
219
+    vbB##i = vec_ld(i * stride + 16, src2);                             \
220
+    vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                        \
221
+    vb##i =                                                             \
222
+        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
223
+                                        (vector unsigned char)vbT##i)
225 224
 
226 225
 #define LOAD_LINE_ALIGNED(i)                                            \
227
-  register int j##i = i * stride;                                       \
228
-  vbT##i = vec_ld(j##i, src2);                                          \
229
-  vb##i =                                                               \
230
-    (vector signed short)vec_mergeh((vector signed char)zero,           \
231
-                                    (vector signed char)vbT##i)
232
-
233
-    /* Special-casing the aligned case is worthwhile, as all calls from
234
-     * the (transposed) horizontable deblocks will be aligned, in addition
235
-     * to the naturally aligned vertical deblocks. */
236
-    if (properStride && srcAlign) {
237
-      LOAD_LINE_ALIGNED(0);
238
-      LOAD_LINE_ALIGNED(1);
239
-      LOAD_LINE_ALIGNED(2);
240
-      LOAD_LINE_ALIGNED(3);
241
-      LOAD_LINE_ALIGNED(4);
242
-      LOAD_LINE_ALIGNED(5);
243
-      LOAD_LINE_ALIGNED(6);
244
-      LOAD_LINE_ALIGNED(7);
245
-      LOAD_LINE_ALIGNED(8);
246
-      LOAD_LINE_ALIGNED(9);
247
-    } else {
248
-      LOAD_LINE(0);
249
-      LOAD_LINE(1);
250
-      LOAD_LINE(2);
251
-      LOAD_LINE(3);
252
-      LOAD_LINE(4);
253
-      LOAD_LINE(5);
254
-      LOAD_LINE(6);
255
-      LOAD_LINE(7);
256
-      LOAD_LINE(8);
257
-      LOAD_LINE(9);
258
-    }
226
+    register int j##i = i * stride;                                     \
227
+    vbT##i = vec_ld(j##i, src2);                                        \
228
+    vb##i =                                                             \
229
+        (vector signed short)vec_mergeh((vector signed char)zero,       \
230
+                                        (vector signed char)vbT##i)
231
+
232
+      /* Special-casing the aligned case is worthwhile, as all calls from
233
+       * the (transposed) horizontable deblocks will be aligned, in addition
234
+       * to the naturally aligned vertical deblocks. */
235
+      if (properStride && srcAlign) {
236
+          LOAD_LINE_ALIGNED(0);
237
+          LOAD_LINE_ALIGNED(1);
238
+          LOAD_LINE_ALIGNED(2);
239
+          LOAD_LINE_ALIGNED(3);
240
+          LOAD_LINE_ALIGNED(4);
241
+          LOAD_LINE_ALIGNED(5);
242
+          LOAD_LINE_ALIGNED(6);
243
+          LOAD_LINE_ALIGNED(7);
244
+          LOAD_LINE_ALIGNED(8);
245
+          LOAD_LINE_ALIGNED(9);
246
+      } else {
247
+          LOAD_LINE(0);
248
+          LOAD_LINE(1);
249
+          LOAD_LINE(2);
250
+          LOAD_LINE(3);
251
+          LOAD_LINE(4);
252
+          LOAD_LINE(5);
253
+          LOAD_LINE(6);
254
+          LOAD_LINE(7);
255
+          LOAD_LINE(8);
256
+          LOAD_LINE(9);
257
+      }
259 258
 #undef LOAD_LINE
260 259
 #undef LOAD_LINE_ALIGNED
261 260
 
262
-  const vector unsigned short v_2 = vec_splat_u16(2);
263
-  const vector unsigned short v_4 = vec_splat_u16(4);
261
+    const vector unsigned short v_2 = vec_splat_u16(2);
262
+    const vector unsigned short v_4 = vec_splat_u16(4);
264 263
 
265
-  const vector signed short v_diff01 = vec_sub(vb0, vb1);
266
-  const vector unsigned short v_cmp01 =
267
-    (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
268
-  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
269
-  const vector signed short v_diff89 = vec_sub(vb8, vb9);
270
-  const vector unsigned short v_cmp89 =
271
-    (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
272
-  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
264
+    const vector signed short v_diff01 = vec_sub(vb0, vb1);
265
+    const vector unsigned short v_cmp01 =
266
+        (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
267
+    const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
268
+    const vector signed short v_diff89 = vec_sub(vb8, vb9);
269
+    const vector unsigned short v_cmp89 =
270
+        (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
271
+    const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
273 272
 
274
-  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
275
-  const vector signed short temp02 = vec_add(vb2, vb3);
276
-  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
277
-  const vector signed short v_sumsB0 = vec_add(temp02, temp03);
273
+    const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
274
+    const vector signed short temp02 = vec_add(vb2, vb3);
275
+    const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
276
+    const vector signed short v_sumsB0 = vec_add(temp02, temp03);
278 277
 
279
-  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
280
-  const vector signed short v_sumsB1 = vec_add(temp11, vb4);
278
+    const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
279
+    const vector signed short v_sumsB1 = vec_add(temp11, vb4);
281 280
 
282
-  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
283
-  const vector signed short v_sumsB2 = vec_add(temp21, vb5);
281
+    const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
282
+    const vector signed short v_sumsB2 = vec_add(temp21, vb5);
284 283
 
285
-  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
286
-  const vector signed short v_sumsB3 = vec_add(temp31, vb6);
284
+    const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
285
+    const vector signed short v_sumsB3 = vec_add(temp31, vb6);
287 286
 
288
-  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
289
-  const vector signed short v_sumsB4 = vec_add(temp41, vb7);
287
+    const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
288
+    const vector signed short v_sumsB4 = vec_add(temp41, vb7);
290 289
 
291
-  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
292
-  const vector signed short v_sumsB5 = vec_add(temp51, vb8);
290
+    const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
291
+    const vector signed short v_sumsB5 = vec_add(temp51, vb8);
293 292
 
294
-  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
295
-  const vector signed short v_sumsB6 = vec_add(temp61, v_last);
293
+    const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
294
+    const vector signed short v_sumsB6 = vec_add(temp61, v_last);
296 295
 
297
-  const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
298
-  const vector signed short v_sumsB7 = vec_add(temp71, v_last);
296
+    const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
297
+    const vector signed short v_sumsB7 = vec_add(temp71, v_last);
299 298
 
300
-  const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
301
-  const vector signed short v_sumsB8 = vec_add(temp81, v_last);
299
+    const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
300
+    const vector signed short v_sumsB8 = vec_add(temp81, v_last);
302 301
 
303
-  const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
304
-  const vector signed short v_sumsB9 = vec_add(temp91, v_last);
302
+    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
303
+    const vector signed short v_sumsB9 = vec_add(temp91, v_last);
305 304
 
306 305
 #define COMPUTE_VR(i, j, k)                                             \
307
-  const vector signed short temps1##i =                                 \
308
-    vec_add(v_sumsB##i, v_sumsB##k);                                    \
309
-  const vector signed short temps2##i =                                 \
310
-    vec_mladd(vb##j, (vector signed short)v_2, temps1##i);              \
311
-  const vector signed short  vr##j = vec_sra(temps2##i, v_4)
312
-
313
-  COMPUTE_VR(0, 1, 2);
314
-  COMPUTE_VR(1, 2, 3);
315
-  COMPUTE_VR(2, 3, 4);
316
-  COMPUTE_VR(3, 4, 5);
317
-  COMPUTE_VR(4, 5, 6);
318
-  COMPUTE_VR(5, 6, 7);
319
-  COMPUTE_VR(6, 7, 8);
320
-  COMPUTE_VR(7, 8, 9);
321
-
322
-  const vector signed char neg1 = vec_splat_s8(-1);
323
-  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
324
-                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
306
+    const vector signed short temps1##i =                               \
307
+        vec_add(v_sumsB##i, v_sumsB##k);                                \
308
+    const vector signed short temps2##i =                               \
309
+        vec_mladd(vb##j, (vector signed short)v_2, temps1##i);          \
310
+    const vector signed short  vr##j = vec_sra(temps2##i, v_4)
311
+
312
+    COMPUTE_VR(0, 1, 2);
313
+    COMPUTE_VR(1, 2, 3);
314
+    COMPUTE_VR(2, 3, 4);
315
+    COMPUTE_VR(3, 4, 5);
316
+    COMPUTE_VR(4, 5, 6);
317
+    COMPUTE_VR(5, 6, 7);
318
+    COMPUTE_VR(6, 7, 8);
319
+    COMPUTE_VR(7, 8, 9);
320
+
321
+    const vector signed char neg1 = vec_splat_s8(-1);
322
+    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
323
+                                                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
325 324
 
326 325
 #define PACK_AND_STORE(i)                                       \
327
-  const vector unsigned char perms##i =                         \
328
-    vec_lvsr(i * stride, src2);                                 \
329
-  const vector unsigned char vf##i =                            \
330
-    vec_packsu(vr##i, (vector signed short)zero);               \
331
-  const vector unsigned char vg##i =                            \
332
-    vec_perm(vf##i, vbT##i, permHH);                            \
333
-  const vector unsigned char mask##i =                          \
334
-    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
335
-  const vector unsigned char vg2##i =                           \
336
-    vec_perm(vg##i, vg##i, perms##i);                           \
337
-  const vector unsigned char svA##i =                           \
338
-    vec_sel(vbA##i, vg2##i, mask##i);                           \
339
-  const vector unsigned char svB##i =                           \
340
-    vec_sel(vg2##i, vbB##i, mask##i);                           \
341
-  vec_st(svA##i, i * stride, src2);                             \
342
-  vec_st(svB##i, i * stride + 16, src2)
326
+    const vector unsigned char perms##i =                       \
327
+        vec_lvsr(i * stride, src2);                             \
328
+    const vector unsigned char vf##i =                          \
329
+        vec_packsu(vr##i, (vector signed short)zero);           \
330
+    const vector unsigned char vg##i =                          \
331
+        vec_perm(vf##i, vbT##i, permHH);                        \
332
+    const vector unsigned char mask##i =                        \
333
+        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
334
+    const vector unsigned char vg2##i =                         \
335
+        vec_perm(vg##i, vg##i, perms##i);                       \
336
+    const vector unsigned char svA##i =                         \
337
+        vec_sel(vbA##i, vg2##i, mask##i);                       \
338
+    const vector unsigned char svB##i =                         \
339
+        vec_sel(vg2##i, vbB##i, mask##i);                       \
340
+    vec_st(svA##i, i * stride, src2);                           \
341
+    vec_st(svB##i, i * stride + 16, src2)
343 342
 
344 343
 #define PACK_AND_STORE_ALIGNED(i)                               \
345
-  const vector unsigned char vf##i =                            \
346
-    vec_packsu(vr##i, (vector signed short)zero);               \
347
-  const vector unsigned char vg##i =                            \
348
-    vec_perm(vf##i, vbT##i, permHH);                            \
349
-  vec_st(vg##i, i * stride, src2)
350
-
351
-  /* Special-casing the aligned case is worthwhile, as all calls from
352
-   * the (transposed) horizontable deblocks will be aligned, in addition
353
-   * to the naturally aligned vertical deblocks. */
354
-  if (properStride && srcAlign) {
355
-    PACK_AND_STORE_ALIGNED(1);
356
-    PACK_AND_STORE_ALIGNED(2);
357
-    PACK_AND_STORE_ALIGNED(3);
358
-    PACK_AND_STORE_ALIGNED(4);
359
-    PACK_AND_STORE_ALIGNED(5);
360
-    PACK_AND_STORE_ALIGNED(6);
361
-    PACK_AND_STORE_ALIGNED(7);
362
-    PACK_AND_STORE_ALIGNED(8);
363
-  } else {
364
-    PACK_AND_STORE(1);
365
-    PACK_AND_STORE(2);
366
-    PACK_AND_STORE(3);
367
-    PACK_AND_STORE(4);
368
-    PACK_AND_STORE(5);
369
-    PACK_AND_STORE(6);
370
-    PACK_AND_STORE(7);
371
-    PACK_AND_STORE(8);
372
-  }
344
+    const vector unsigned char vf##i =                          \
345
+        vec_packsu(vr##i, (vector signed short)zero);           \
346
+    const vector unsigned char vg##i =                          \
347
+        vec_perm(vf##i, vbT##i, permHH);                        \
348
+    vec_st(vg##i, i * stride, src2)
349
+
350
+    /* Special-casing the aligned case is worthwhile, as all calls from
351
+     * the (transposed) horizontal deblocks will be aligned, in addition
352
+     * to the naturally aligned vertical deblocks. */
353
+    if (properStride && srcAlign) {
354
+        PACK_AND_STORE_ALIGNED(1);
355
+        PACK_AND_STORE_ALIGNED(2);
356
+        PACK_AND_STORE_ALIGNED(3);
357
+        PACK_AND_STORE_ALIGNED(4);
358
+        PACK_AND_STORE_ALIGNED(5);
359
+        PACK_AND_STORE_ALIGNED(6);
360
+        PACK_AND_STORE_ALIGNED(7);
361
+        PACK_AND_STORE_ALIGNED(8);
362
+    } else {
363
+        PACK_AND_STORE(1);
364
+        PACK_AND_STORE(2);
365
+        PACK_AND_STORE(3);
366
+        PACK_AND_STORE(4);
367
+        PACK_AND_STORE(5);
368
+        PACK_AND_STORE(6);
369
+        PACK_AND_STORE(7);
370
+        PACK_AND_STORE(8);
371
+    }
373 372
 #undef PACK_AND_STORE
374 373
 #undef PACK_AND_STORE_ALIGNED
375 374
 }
... ...
@@ -377,190 +376,190 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
377 377
 
378 378
 
379 379
 static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
380
-  /*
380
+    /*
381 381
     this code makes no assumption on src or stride.
382 382
     One could remove the recomputation of the perm
383 383
     vector by assuming (stride % 16) == 0, unfortunately
384 384
     this is not always true. Quite a lot of load/stores
385 385
     can be removed by assuming proper alignment of
386 386
     src & stride :-(
387
-  */
388
-  uint8_t *src2 = src;
389
-  const vector signed int zero = vec_splat_s32(0);
390
-  DECLARE_ALIGNED(16, short, qp[8]);
391
-  qp[0] = 8*c->QP;
392
-  vector signed short vqp = vec_ld(0, qp);
393
-  vqp = vec_splat(vqp, 0);
387
+    */
388
+    uint8_t *src2 = src;
389
+    const vector signed int zero = vec_splat_s32(0);
390
+    DECLARE_ALIGNED(16, short, qp[8]);
391
+    qp[0] = 8*c->QP;
392
+    vector signed short vqp = vec_ld(0, qp);
393
+    vqp = vec_splat(vqp, 0);
394 394
 
395 395
 #define LOAD_LINE(i)                                                    \
396
-  const vector unsigned char perm##i =                                  \
397
-    vec_lvsl(i * stride, src2);                                         \
398
-  const vector unsigned char vbA##i =                                   \
399
-    vec_ld(i * stride, src2);                                           \
400
-  const vector unsigned char vbB##i =                                   \
401
-    vec_ld(i * stride + 16, src2);                                      \
402
-  const vector unsigned char vbT##i =                                   \
403
-    vec_perm(vbA##i, vbB##i, perm##i);                                  \
404
-  const vector signed short vb##i =                                     \
405
-    (vector signed short)vec_mergeh((vector unsigned char)zero,         \
406
-                                    (vector unsigned char)vbT##i)
407
-
408
-  src2 += stride*3;
409
-
410
-  LOAD_LINE(1);
411
-  LOAD_LINE(2);
412
-  LOAD_LINE(3);
413
-  LOAD_LINE(4);
414
-  LOAD_LINE(5);
415
-  LOAD_LINE(6);
416
-  LOAD_LINE(7);
417
-  LOAD_LINE(8);
396
+    const vector unsigned char perm##i =                                \
397
+        vec_lvsl(i * stride, src2);                                     \
398
+    const vector unsigned char vbA##i =                                 \
399
+        vec_ld(i * stride, src2);                                       \
400
+    const vector unsigned char vbB##i =                                 \
401
+        vec_ld(i * stride + 16, src2);                                  \
402
+    const vector unsigned char vbT##i =                                 \
403
+        vec_perm(vbA##i, vbB##i, perm##i);                              \
404
+    const vector signed short vb##i =                                   \
405
+        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
406
+                                        (vector unsigned char)vbT##i)
407
+
408
+    src2 += stride*3;
409
+
410
+     LOAD_LINE(1);
411
+     LOAD_LINE(2);
412
+     LOAD_LINE(3);
413
+     LOAD_LINE(4);
414
+     LOAD_LINE(5);
415
+     LOAD_LINE(6);
416
+     LOAD_LINE(7);
417
+     LOAD_LINE(8);
418 418
 #undef LOAD_LINE
419 419
 
420
-  const vector signed short v_1 = vec_splat_s16(1);
421
-  const vector signed short v_2 = vec_splat_s16(2);
422
-  const vector signed short v_5 = vec_splat_s16(5);
423
-  const vector signed short v_32 = vec_sl(v_1,
424
-                                          (vector unsigned short)v_5);
425
-  /* middle energy */
426
-  const vector signed short l3minusl6 = vec_sub(vb3, vb6);
427
-  const vector signed short l5minusl4 = vec_sub(vb5, vb4);
428
-  const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
429
-  const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
430
-  const vector signed short absmE = vec_abs(mE);
431
-  /* left & right energy */
432
-  const vector signed short l1minusl4 = vec_sub(vb1, vb4);
433
-  const vector signed short l3minusl2 = vec_sub(vb3, vb2);
434
-  const vector signed short l5minusl8 = vec_sub(vb5, vb8);
435
-  const vector signed short l7minusl6 = vec_sub(vb7, vb6);
436
-  const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
437
-  const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
438
-  const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
439
-  const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
440
-  /* d */
441
-  const vector signed short ddiff = vec_sub(absmE,
442
-                                            vec_min(vec_abs(lE),
443
-                                                    vec_abs(rE)));
444
-  const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
445
-  const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
446
-  const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
447
-  const vector signed short minusd = vec_sub((vector signed short)zero, d);
448
-  const vector signed short finald = vec_sel(minusd,
449
-                                             d,
450
-                                             vec_cmpgt(vec_sub((vector signed short)zero, mE),
451
-                                                       (vector signed short)zero));
452
-  /* q */
453
-  const vector signed short qtimes2 = vec_sub(vb4, vb5);
454
-  /* for a shift right to behave like /2, we need to add one
455
-     to all negative integer */
456
-  const vector signed short rounddown = vec_sel((vector signed short)zero,
457
-                                                v_1,
458
-                                                vec_cmplt(qtimes2, (vector signed short)zero));
459
-  const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
460
-  /* clamp */
461
-  const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
462
-  const vector signed short dclamp_P = vec_min(dclamp_P1, q);
463
-  const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
464
-  const vector signed short dclamp_N = vec_max(dclamp_N1, q);
465
-
466
-  const vector signed short dclampedfinal = vec_sel(dclamp_N,
467
-                                                    dclamp_P,
468
-                                                    vec_cmpgt(q, (vector signed short)zero));
469
-  const vector signed short dornotd = vec_sel((vector signed short)zero,
470
-                                              dclampedfinal,
471
-                                              vec_cmplt(absmE, vqp));
472
-  /* add/subtract to l4 and l5 */
473
-  const vector signed short vb4minusd = vec_sub(vb4, dornotd);
474
-  const vector signed short vb5plusd = vec_add(vb5, dornotd);
475
-  /* finally, stores */
476
-  const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
477
-  const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);
478
-
479
-  const vector signed char neg1 = vec_splat_s8(-1);
480
-  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
481
-                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
420
+     const vector signed short v_1 = vec_splat_s16(1);
421
+     const vector signed short v_2 = vec_splat_s16(2);
422
+     const vector signed short v_5 = vec_splat_s16(5);
423
+     const vector signed short v_32 = vec_sl(v_1,
424
+                                             (vector unsigned short)v_5);
425
+     /* middle energy */
426
+     const vector signed short l3minusl6 = vec_sub(vb3, vb6);
427
+     const vector signed short l5minusl4 = vec_sub(vb5, vb4);
428
+     const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
429
+     const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
430
+     const vector signed short absmE = vec_abs(mE);
431
+     /* left & right energy */
432
+     const vector signed short l1minusl4 = vec_sub(vb1, vb4);
433
+     const vector signed short l3minusl2 = vec_sub(vb3, vb2);
434
+     const vector signed short l5minusl8 = vec_sub(vb5, vb8);
435
+     const vector signed short l7minusl6 = vec_sub(vb7, vb6);
436
+     const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
437
+     const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
438
+     const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
439
+     const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
440
+     /* d */
441
+     const vector signed short ddiff = vec_sub(absmE,
442
+                                               vec_min(vec_abs(lE),
443
+                                                       vec_abs(rE)));
444
+     const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
445
+     const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
446
+     const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
447
+     const vector signed short minusd = vec_sub((vector signed short)zero, d);
448
+     const vector signed short finald = vec_sel(minusd,
449
+                                                d,
450
+                                                vec_cmpgt(vec_sub((vector signed short)zero, mE),
451
+                                                          (vector signed short)zero));
452
+     /* q */
453
+     const vector signed short qtimes2 = vec_sub(vb4, vb5);
454
+     /* for a shift right to behave like /2, we need to add one
455
+        to all negative integers */
456
+     const vector signed short rounddown = vec_sel((vector signed short)zero,
457
+                                                   v_1,
458
+                                                   vec_cmplt(qtimes2, (vector signed short)zero));
459
+     const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
460
+     /* clamp */
461
+     const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
462
+     const vector signed short dclamp_P = vec_min(dclamp_P1, q);
463
+     const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
464
+     const vector signed short dclamp_N = vec_max(dclamp_N1, q);
465
+
466
+     const vector signed short dclampedfinal = vec_sel(dclamp_N,
467
+                                                       dclamp_P,
468
+                                                       vec_cmpgt(q, (vector signed short)zero));
469
+     const vector signed short dornotd = vec_sel((vector signed short)zero,
470
+                                                 dclampedfinal,
471
+                                                 vec_cmplt(absmE, vqp));
472
+     /* add/subtract to l4 and l5 */
473
+     const vector signed short vb4minusd = vec_sub(vb4, dornotd);
474
+     const vector signed short vb5plusd  = vec_add(vb5, dornotd);
475
+     /* finally, stores */
476
+     const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
477
+     const vector unsigned char st5 = vec_packsu(vb5plusd,  (vector signed short)zero);
478
+
479
+     const vector signed char neg1 = vec_splat_s8(-1);
480
+     const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
481
+                                                                         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
482 482
 
483 483
 #define STORE(i)                                                \
484
-  const vector unsigned char perms##i =                         \
485
-    vec_lvsr(i * stride, src2);                                 \
486
-  const vector unsigned char vg##i =                            \
487
-    vec_perm(st##i, vbT##i, permHH);                            \
488
-  const vector unsigned char mask##i =                          \
489
-    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
490
-  const vector unsigned char vg2##i =                           \
491
-    vec_perm(vg##i, vg##i, perms##i);                           \
492
-  const vector unsigned char svA##i =                           \
493
-    vec_sel(vbA##i, vg2##i, mask##i);                           \
494
-  const vector unsigned char svB##i =                           \
495
-    vec_sel(vg2##i, vbB##i, mask##i);                           \
496
-  vec_st(svA##i, i * stride, src2);                             \
497
-  vec_st(svB##i, i * stride + 16, src2)
498
-
499
-  STORE(4);
500
-  STORE(5);
484
+     const vector unsigned char perms##i =                      \
485
+         vec_lvsr(i * stride, src2);                            \
486
+     const vector unsigned char vg##i =                         \
487
+         vec_perm(st##i, vbT##i, permHH);                       \
488
+     const vector unsigned char mask##i =                       \
489
+         vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
490
+     const vector unsigned char vg2##i =                        \
491
+         vec_perm(vg##i, vg##i, perms##i);                      \
492
+     const vector unsigned char svA##i =                        \
493
+         vec_sel(vbA##i, vg2##i, mask##i);                      \
494
+     const vector unsigned char svB##i =                        \
495
+         vec_sel(vg2##i, vbB##i, mask##i);                      \
496
+     vec_st(svA##i, i * stride, src2);                          \
497
+     vec_st(svB##i, i * stride + 16, src2)
498
+
499
+     STORE(4);
500
+     STORE(5);
501 501
 }
502 502
 
503 503
 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
504
-  /*
504
+    /*
505 505
     this code makes no assumption on src or stride.
506 506
     One could remove the recomputation of the perm
507 507
     vector by assuming (stride % 16) == 0, unfortunately
508 508
     this is not always true. Quite a lot of load/stores
509 509
     can be removed by assuming proper alignment of
510 510
     src & stride :-(
511
-  */
512
-  uint8_t *srcCopy = src;
513
-  DECLARE_ALIGNED(16, uint8_t, dt[16]);
514
-  const vector signed int zero = vec_splat_s32(0);
515
-  vector unsigned char v_dt;
516
-  dt[0] = deringThreshold;
517
-  v_dt = vec_splat(vec_ld(0, dt), 0);
511
+    */
512
+    uint8_t *srcCopy = src;
513
+    DECLARE_ALIGNED(16, uint8_t, dt[16]);
514
+    const vector signed int zero = vec_splat_s32(0);
515
+    vector unsigned char v_dt;
516
+    dt[0] = deringThreshold;
517
+    v_dt = vec_splat(vec_ld(0, dt), 0);
518 518
 
519 519
 #define LOAD_LINE(i)                                                    \
520
-  const vector unsigned char perm##i =                                  \
521
-    vec_lvsl(i * stride, srcCopy);                                      \
522
-  vector unsigned char sA##i = vec_ld(i * stride, srcCopy);             \
523
-  vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);        \
524
-  vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
525
-
526
-  LOAD_LINE(0);
527
-  LOAD_LINE(1);
528
-  LOAD_LINE(2);
529
-  LOAD_LINE(3);
530
-  LOAD_LINE(4);
531
-  LOAD_LINE(5);
532
-  LOAD_LINE(6);
533
-  LOAD_LINE(7);
534
-  LOAD_LINE(8);
535
-  LOAD_LINE(9);
520
+    const vector unsigned char perm##i =                               \
521
+        vec_lvsl(i * stride, srcCopy);                                 \
522
+    vector unsigned char sA##i = vec_ld(i * stride, srcCopy);          \
523
+    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);     \
524
+    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
525
+
526
+    LOAD_LINE(0);
527
+    LOAD_LINE(1);
528
+    LOAD_LINE(2);
529
+    LOAD_LINE(3);
530
+    LOAD_LINE(4);
531
+    LOAD_LINE(5);
532
+    LOAD_LINE(6);
533
+    LOAD_LINE(7);
534
+    LOAD_LINE(8);
535
+    LOAD_LINE(9);
536 536
 #undef LOAD_LINE
537 537
 
538
-  vector unsigned char v_avg;
539
-  {
538
+    vector unsigned char v_avg;
539
+    {
540 540
     const vector unsigned char trunc_perm = (vector unsigned char)
541
-      AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
542
-          0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
541
+        AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
542
+            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
543 543
     const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
544 544
     const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
545 545
     const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
546 546
     const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
547 547
 
548 548
 #define EXTRACT(op) do {                                                \
549
-      const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
550
-      const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
551
-      const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
552
-      const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
553
-      const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \
554
-      const vector unsigned char s##op##_9 = vec_##op(s##op##_8h, s##op##_8l); \
555
-      const vector unsigned char s##op##_9h = vec_mergeh(s##op##_9, s##op##_9); \
556
-      const vector unsigned char s##op##_9l = vec_mergel(s##op##_9, s##op##_9); \
557
-      const vector unsigned char s##op##_10 = vec_##op(s##op##_9h, s##op##_9l); \
558
-      const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
559
-      const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
560
-      const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \
561
-      const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
562
-      const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
563
-      v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)
549
+    const vector unsigned char s##op##_1   = vec_##op(trunc_src12, trunc_src34); \
550
+    const vector unsigned char s##op##_2   = vec_##op(trunc_src56, trunc_src78); \
551
+    const vector unsigned char s##op##_6   = vec_##op(s##op##_1, s##op##_2);     \
552
+    const vector unsigned char s##op##_8h  = vec_mergeh(s##op##_6, s##op##_6);   \
553
+    const vector unsigned char s##op##_8l  = vec_mergel(s##op##_6, s##op##_6);   \
554
+    const vector unsigned char s##op##_9   = vec_##op(s##op##_8h, s##op##_8l);   \
555
+    const vector unsigned char s##op##_9h  = vec_mergeh(s##op##_9, s##op##_9);   \
556
+    const vector unsigned char s##op##_9l  = vec_mergel(s##op##_9, s##op##_9);   \
557
+    const vector unsigned char s##op##_10  = vec_##op(s##op##_9h, s##op##_9l);   \
558
+    const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
559
+    const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
560
+    const vector unsigned char s##op##_11  = vec_##op(s##op##_10h, s##op##_10l); \
561
+    const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
562
+    const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
563
+    v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)
564 564
 
565 565
     vector unsigned char v_min;
566 566
     vector unsigned char v_max;
... ...
@@ -569,19 +568,19 @@ static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
569 569
 #undef EXTRACT
570 570
 
571 571
     if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
572
-      return;
572
+        return;
573 573
 
574 574
     v_avg = vec_avg(v_min, v_max);
575
-  }
575
+    }
576 576
 
577
-  DECLARE_ALIGNED(16, signed int, S[8]);
578
-  {
577
+    DECLARE_ALIGNED(16, signed int, S[8]);
578
+    {
579 579
     const vector unsigned short mask1 = (vector unsigned short)
580
-      AVV(0x0001, 0x0002, 0x0004, 0x0008,
581
-          0x0010, 0x0020, 0x0040, 0x0080);
580
+        AVV(0x0001, 0x0002, 0x0004, 0x0008,
581
+            0x0010, 0x0020, 0x0040, 0x0080);
582 582
     const vector unsigned short mask2 = (vector unsigned short)
583
-      AVV(0x0100, 0x0200, 0x0000, 0x0000,
584
-          0x0000, 0x0000, 0x0000, 0x0000);
583
+        AVV(0x0100, 0x0200, 0x0000, 0x0000,
584
+            0x0000, 0x0000, 0x0000, 0x0000);
585 585
 
586 586
     const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
587 587
     const vector unsigned int vuint32_1 = vec_splat_u32(1);
... ...
@@ -589,19 +588,19 @@ static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
589 589
 #define COMPARE(i)                                                      \
590 590
     vector signed int sum##i;                                           \
591 591
     do {                                                                \
592
-      const vector unsigned char cmp##i =                               \
593
-        (vector unsigned char)vec_cmpgt(src##i, v_avg);                 \
594
-      const vector unsigned short cmpHi##i =                            \
595
-        (vector unsigned short)vec_mergeh(cmp##i, cmp##i);              \
596
-      const vector unsigned short cmpLi##i =                            \
597
-        (vector unsigned short)vec_mergel(cmp##i, cmp##i);              \
598
-      const vector signed short cmpHf##i =                              \
599
-        (vector signed short)vec_and(cmpHi##i, mask1);                  \
600
-      const vector signed short cmpLf##i =                              \
601
-        (vector signed short)vec_and(cmpLi##i, mask2);                  \
602
-      const vector signed int sump##i = vec_sum4s(cmpHf##i, zero);      \
603
-      const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i);   \
604
-      sum##i  = vec_sums(sumq##i, zero); } while (0)
592
+        const vector unsigned char cmp##i =                             \
593
+            (vector unsigned char)vec_cmpgt(src##i, v_avg);             \
594
+        const vector unsigned short cmpHi##i =                          \
595
+            (vector unsigned short)vec_mergeh(cmp##i, cmp##i);          \
596
+        const vector unsigned short cmpLi##i =                          \
597
+            (vector unsigned short)vec_mergel(cmp##i, cmp##i);          \
598
+        const vector signed short cmpHf##i =                            \
599
+            (vector signed short)vec_and(cmpHi##i, mask1);              \
600
+        const vector signed short cmpLf##i =                            \
601
+            (vector signed short)vec_and(cmpLi##i, mask2);              \
602
+        const vector signed int sump##i = vec_sum4s(cmpHf##i, zero);    \
603
+        const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \
604
+        sum##i  = vec_sums(sumq##i, zero); } while (0)
605 605
 
606 606
     COMPARE(0);
607 607
     COMPARE(1);
... ...
@@ -618,178 +617,178 @@ static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
618 618
     vector signed int sumA2;
619 619
     vector signed int sumB2;
620 620
     {
621
-      const vector signed int sump02 = vec_mergel(sum0, sum2);
622
-      const vector signed int sump13 = vec_mergel(sum1, sum3);
623
-      const vector signed int sumA = vec_mergel(sump02, sump13);
624
-
625
-      const vector signed int sump46 = vec_mergel(sum4, sum6);
626
-      const vector signed int sump57 = vec_mergel(sum5, sum7);
627
-      const vector signed int sumB = vec_mergel(sump46, sump57);
628
-
629
-      const vector signed int sump8A = vec_mergel(sum8, zero);
630
-      const vector signed int sump9B = vec_mergel(sum9, zero);
631
-      const vector signed int sumC = vec_mergel(sump8A, sump9B);
632
-
633
-      const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
634
-      const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
635
-      const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
636
-      const vector signed int t2A = vec_or(sumA, tA);
637
-      const vector signed int t2B = vec_or(sumB, tB);
638
-      const vector signed int t2C = vec_or(sumC, tC);
639
-      const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
640
-                                            vec_sl(t2A, vuint32_1));
641
-      const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
642
-                                            vec_sl(t2B, vuint32_1));
643
-      const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
644
-                                            vec_sl(t2C, vuint32_1));
645
-      const vector signed int yA = vec_and(t2A, t3A);
646
-      const vector signed int yB = vec_and(t2B, t3B);
647
-      const vector signed int yC = vec_and(t2C, t3C);
648
-
649
-      const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
650
-      const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
651
-      const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
652
-      const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
653
-      const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
654
-      const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
655
-      const vector signed int sumAp = vec_and(yA,
656
-                                              vec_and(sumAd4,sumAd8));
657
-      const vector signed int sumBp = vec_and(yB,
658
-                                              vec_and(sumBd4,sumBd8));
659
-      sumA2 = vec_or(sumAp,
660
-                     vec_sra(sumAp,
661
-                             vuint32_16));
662
-      sumB2  = vec_or(sumBp,
663
-                      vec_sra(sumBp,
664
-                              vuint32_16));
621
+    const vector signed int sump02 = vec_mergel(sum0, sum2);
622
+    const vector signed int sump13 = vec_mergel(sum1, sum3);
623
+    const vector signed int sumA = vec_mergel(sump02, sump13);
624
+
625
+    const vector signed int sump46 = vec_mergel(sum4, sum6);
626
+    const vector signed int sump57 = vec_mergel(sum5, sum7);
627
+    const vector signed int sumB = vec_mergel(sump46, sump57);
628
+
629
+    const vector signed int sump8A = vec_mergel(sum8, zero);
630
+    const vector signed int sump9B = vec_mergel(sum9, zero);
631
+    const vector signed int sumC = vec_mergel(sump8A, sump9B);
632
+
633
+    const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
634
+    const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
635
+    const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
636
+    const vector signed int t2A = vec_or(sumA, tA);
637
+    const vector signed int t2B = vec_or(sumB, tB);
638
+    const vector signed int t2C = vec_or(sumC, tC);
639
+    const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
640
+                                          vec_sl(t2A, vuint32_1));
641
+    const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
642
+                                          vec_sl(t2B, vuint32_1));
643
+    const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
644
+                                          vec_sl(t2C, vuint32_1));
645
+    const vector signed int yA = vec_and(t2A, t3A);
646
+    const vector signed int yB = vec_and(t2B, t3B);
647
+    const vector signed int yC = vec_and(t2C, t3C);
648
+
649
+    const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
650
+    const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
651
+    const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
652
+    const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
653
+    const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
654
+    const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
655
+    const vector signed int sumAp = vec_and(yA,
656
+                                            vec_and(sumAd4,sumAd8));
657
+    const vector signed int sumBp = vec_and(yB,
658
+                                            vec_and(sumBd4,sumBd8));
659
+    sumA2 = vec_or(sumAp,
660
+                   vec_sra(sumAp,
661
+                           vuint32_16));
662
+    sumB2  = vec_or(sumBp,
663
+                    vec_sra(sumBp,
664
+                            vuint32_16));
665 665
     }
666 666
     vec_st(sumA2, 0, S);
667 667
     vec_st(sumB2, 16, S);
668
-  }
669
-
670
-  /* I'm not sure the following is actually faster
671
-     than straight, unvectorized C code :-( */
672
-
673
-  DECLARE_ALIGNED(16, int, tQP2[4]);
674
-  tQP2[0]= c->QP/2 + 1;
675
-  vector signed int vQP2 = vec_ld(0, tQP2);
676
-  vQP2 = vec_splat(vQP2, 0);
677
-  const vector signed int vsint32_8 = vec_splat_s32(8);
678
-  const vector unsigned int vuint32_4 = vec_splat_u32(4);
679
-
680
-  const vector unsigned char permA1 = (vector unsigned char)
681
-    AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
682
-        0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
683
-  const vector unsigned char permA2 = (vector unsigned char)
684
-    AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
685
-        0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
686
-  const vector unsigned char permA1inc = (vector unsigned char)
687
-    AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
688
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
689
-  const vector unsigned char permA2inc = (vector unsigned char)
690
-    AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
691
-        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
692
-  const vector unsigned char magic = (vector unsigned char)
693
-    AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
694
-        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
695
-  const vector unsigned char extractPerm = (vector unsigned char)
696
-    AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
697
-        0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01);
698
-  const vector unsigned char extractPermInc = (vector unsigned char)
699
-    AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
700
-        0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01);
701
-  const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
702
-  const vector unsigned char tenRight = (vector unsigned char)
703
-    AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
704
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
705
-  const vector unsigned char eightLeft = (vector unsigned char)
706
-    AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
707
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08);
668
+    }
669
+
670
+    /* I'm not sure the following is actually faster
671
+       than straight, unvectorized C code :-( */
672
+
673
+    DECLARE_ALIGNED(16, int, tQP2[4]);
674
+    tQP2[0]= c->QP/2 + 1;
675
+    vector signed int vQP2 = vec_ld(0, tQP2);
676
+    vQP2 = vec_splat(vQP2, 0);
677
+    const vector signed int vsint32_8 = vec_splat_s32(8);
678
+    const vector unsigned int vuint32_4 = vec_splat_u32(4);
679
+
680
+    const vector unsigned char permA1 = (vector unsigned char)
681
+        AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
682
+            0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
683
+    const vector unsigned char permA2 = (vector unsigned char)
684
+        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
685
+            0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
686
+    const vector unsigned char permA1inc = (vector unsigned char)
687
+        AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
688
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
689
+    const vector unsigned char permA2inc = (vector unsigned char)
690
+        AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
691
+            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
692
+    const vector unsigned char magic = (vector unsigned char)
693
+        AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
694
+            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
695
+    const vector unsigned char extractPerm = (vector unsigned char)
696
+        AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
697
+            0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01);
698
+    const vector unsigned char extractPermInc = (vector unsigned char)
699
+        AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
700
+            0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01);
701
+    const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
702
+    const vector unsigned char tenRight = (vector unsigned char)
703
+        AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
704
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
705
+    const vector unsigned char eightLeft = (vector unsigned char)
706
+        AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
707
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08);
708 708
 
709 709
 
710 710
 #define F_INIT(i)                                       \
711
-  vector unsigned char tenRightM##i = tenRight;         \
712
-  vector unsigned char permA1M##i = permA1;             \
713
-  vector unsigned char permA2M##i = permA2;             \
714
-  vector unsigned char extractPermM##i = extractPerm
711
+    vector unsigned char tenRightM##i = tenRight;       \
712
+    vector unsigned char permA1M##i = permA1;           \
713
+    vector unsigned char permA2M##i = permA2;           \
714
+    vector unsigned char extractPermM##i = extractPerm
715 715
 
716 716
 #define F2(i, j, k, l)                                                  \
717
-  if (S[i] & (1 << (l+1))) {                                            \
718
-    const vector unsigned char a_##j##_A##l =                           \
719
-      vec_perm(src##i, src##j, permA1M##i);                             \
720
-    const vector unsigned char a_##j##_B##l =                           \
721
-      vec_perm(a_##j##_A##l, src##k, permA2M##i);                       \
722
-    const vector signed int a_##j##_sump##l =                           \
723
-      (vector signed int)vec_msum(a_##j##_B##l, magic,                  \
724
-                                  (vector unsigned int)zero);           \
725
-    vector signed int F_##j##_##l =                                     \
726
-      vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4);          \
727
-    F_##j##_##l = vec_splat(F_##j##_##l, 3);                            \
728
-    const vector signed int p_##j##_##l =                               \
729
-      (vector signed int)vec_perm(src##j,                               \
730
-                                  (vector unsigned char)zero,           \
731
-                                  extractPermM##i);                     \
732
-    const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2);\
733
-    const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2);\
734
-    vector signed int newpm_##j##_##l;                                  \
735
-    if (vec_all_lt(sum_##j##_##l, F_##j##_##l))                         \
736
-      newpm_##j##_##l = sum_##j##_##l;                                  \
737
-    else if (vec_all_gt(diff_##j##_##l, F_##j##_##l))                   \
738
-      newpm_##j##_##l = diff_##j##_##l;                                 \
739
-    else newpm_##j##_##l = F_##j##_##l;                                 \
740
-    const vector unsigned char newpm2_##j##_##l =                       \
741
-      vec_splat((vector unsigned char)newpm_##j##_##l, 15);             \
742
-    const vector unsigned char mask##j##l = vec_add(identity,           \
743
-                                                    tenRightM##i);      \
744
-    src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l);            \
745
-  }                                                                     \
746
-  permA1M##i = vec_add(permA1M##i, permA1inc);                          \
747
-  permA2M##i = vec_add(permA2M##i, permA2inc);                          \
748
-  tenRightM##i = vec_sro(tenRightM##i, eightLeft);                      \
749
-  extractPermM##i = vec_add(extractPermM##i, extractPermInc)
717
+    if (S[i] & (1 << (l+1))) {                                          \
718
+        const vector unsigned char a_##j##_A##l =                       \
719
+            vec_perm(src##i, src##j, permA1M##i);                       \
720
+        const vector unsigned char a_##j##_B##l =                       \
721
+            vec_perm(a_##j##_A##l, src##k, permA2M##i);                 \
722
+        const vector signed int a_##j##_sump##l =                       \
723
+            (vector signed int)vec_msum(a_##j##_B##l, magic,            \
724
+                                        (vector unsigned int)zero);     \
725
+        vector signed int F_##j##_##l =                                 \
726
+            vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4);    \
727
+        F_##j##_##l = vec_splat(F_##j##_##l, 3);                        \
728
+        const vector signed int p_##j##_##l =                           \
729
+            (vector signed int)vec_perm(src##j,                         \
730
+                                        (vector unsigned char)zero,     \
731
+                                        extractPermM##i);               \
732
+        const vector signed int sum_##j##_##l  = vec_add( p_##j##_##l, vQP2);\
733
+        const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2);\
734
+        vector signed int newpm_##j##_##l;                              \
735
+        if (vec_all_lt(sum_##j##_##l, F_##j##_##l))                     \
736
+            newpm_##j##_##l = sum_##j##_##l;                            \
737
+        else if (vec_all_gt(diff_##j##_##l, F_##j##_##l))               \
738
+            newpm_##j##_##l = diff_##j##_##l;                           \
739
+        else newpm_##j##_##l = F_##j##_##l;                             \
740
+        const vector unsigned char newpm2_##j##_##l =                   \
741
+            vec_splat((vector unsigned char)newpm_##j##_##l, 15);       \
742
+        const vector unsigned char mask##j##l = vec_add(identity,       \
743
+                                                        tenRightM##i);  \
744
+        src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l);        \
745
+    }                                                                   \
746
+    permA1M##i = vec_add(permA1M##i, permA1inc);                        \
747
+    permA2M##i = vec_add(permA2M##i, permA2inc);                        \
748
+    tenRightM##i = vec_sro(tenRightM##i, eightLeft);                    \
749
+    extractPermM##i = vec_add(extractPermM##i, extractPermInc)
750 750
 
751 751
 #define ITER(i, j, k)                           \
752
-  F_INIT(i);                                    \
753
-  F2(i, j, k, 0);                               \
754
-  F2(i, j, k, 1);                               \
755
-  F2(i, j, k, 2);                               \
756
-  F2(i, j, k, 3);                               \
757
-  F2(i, j, k, 4);                               \
758
-  F2(i, j, k, 5);                               \
759
-  F2(i, j, k, 6);                               \
760
-  F2(i, j, k, 7)
761
-
762
-  ITER(0, 1, 2);
763
-  ITER(1, 2, 3);
764
-  ITER(2, 3, 4);
765
-  ITER(3, 4, 5);
766
-  ITER(4, 5, 6);
767
-  ITER(5, 6, 7);
768
-  ITER(6, 7, 8);
769
-  ITER(7, 8, 9);
770
-
771
-  const vector signed char neg1 = vec_splat_s8(-1);
752
+    F_INIT(i);                                  \
753
+    F2(i, j, k, 0);                             \
754
+    F2(i, j, k, 1);                             \
755
+    F2(i, j, k, 2);                             \
756
+    F2(i, j, k, 3);                             \
757
+    F2(i, j, k, 4);                             \
758
+    F2(i, j, k, 5);                             \
759
+    F2(i, j, k, 6);                             \
760
+    F2(i, j, k, 7)
761
+
762
+    ITER(0, 1, 2);
763
+    ITER(1, 2, 3);
764
+    ITER(2, 3, 4);
765
+    ITER(3, 4, 5);
766
+    ITER(4, 5, 6);
767
+    ITER(5, 6, 7);
768
+    ITER(6, 7, 8);
769
+    ITER(7, 8, 9);
770
+
771
+    const vector signed char neg1 = vec_splat_s8(-1);
772 772
 
773 773
 #define STORE_LINE(i)                                   \
774
-  const vector unsigned char permST##i =                \
775
-    vec_lvsr(i * stride, srcCopy);                      \
776
-  const vector unsigned char maskST##i =                \
777
-    vec_perm((vector unsigned char)zero,                \
778
-             (vector unsigned char)neg1, permST##i);    \
779
-  src##i = vec_perm(src##i ,src##i, permST##i);         \
780
-  sA##i= vec_sel(sA##i, src##i, maskST##i);             \
781
-  sB##i= vec_sel(src##i, sB##i, maskST##i);             \
782
-  vec_st(sA##i, i * stride, srcCopy);                   \
783
-  vec_st(sB##i, i * stride + 16, srcCopy)
784
-
785
-  STORE_LINE(1);
786
-  STORE_LINE(2);
787
-  STORE_LINE(3);
788
-  STORE_LINE(4);
789
-  STORE_LINE(5);
790
-  STORE_LINE(6);
791
-  STORE_LINE(7);
792
-  STORE_LINE(8);
774
+    const vector unsigned char permST##i =              \
775
+        vec_lvsr(i * stride, srcCopy);                  \
776
+    const vector unsigned char maskST##i =              \
777
+        vec_perm((vector unsigned char)zero,            \
778
+                 (vector unsigned char)neg1, permST##i);\
779
+    src##i = vec_perm(src##i ,src##i, permST##i);       \
780
+    sA##i= vec_sel(sA##i, src##i, maskST##i);           \
781
+    sB##i= vec_sel(src##i, sB##i, maskST##i);           \
782
+    vec_st(sA##i, i * stride, srcCopy);                 \
783
+    vec_st(sB##i, i * stride + 16, srcCopy)
784
+
785
+    STORE_LINE(1);
786
+    STORE_LINE(2);
787
+    STORE_LINE(3);
788
+    STORE_LINE(4);
789
+    STORE_LINE(5);
790
+    STORE_LINE(6);
791
+    STORE_LINE(7);
792
+    STORE_LINE(8);
793 793
 
794 794
 #undef STORE_LINE
795 795
 #undef ITER
... ...
@@ -801,386 +800,386 @@ static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
801 801
 #define do_a_deblock_altivec(a...) do_a_deblock_C(a)
802 802
 
803 803
 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
804
-                                    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
804
+                                            uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
805 805
 {
806
-  const vector signed int zero = vec_splat_s32(0);
807
-  const vector signed short vsint16_1 = vec_splat_s16(1);
808
-  vector signed int v_dp = zero;
809
-  vector signed int v_sysdp = zero;
810
-  int d, sysd, i;
806
+    const vector signed int zero = vec_splat_s32(0);
807
+    const vector signed short vsint16_1 = vec_splat_s16(1);
808
+    vector signed int v_dp = zero;
809
+    vector signed int v_sysdp = zero;
810
+    int d, sysd, i;
811 811
 
812
-  tempBluredPast[127]= maxNoise[0];
813
-  tempBluredPast[128]= maxNoise[1];
814
-  tempBluredPast[129]= maxNoise[2];
812
+    tempBluredPast[127]= maxNoise[0];
813
+    tempBluredPast[128]= maxNoise[1];
814
+    tempBluredPast[129]= maxNoise[2];
815 815
 
816 816
 #define LOAD_LINE(src, i)                                               \
817
-  register int j##src##i = i * stride;                                  \
818
-  vector unsigned char perm##src##i = vec_lvsl(j##src##i, src);         \
819
-  const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src);   \
820
-  const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
821
-  const vector unsigned char v_##src##A##i =                            \
822
-    vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i);             \
823
-  vector signed short v_##src##Ass##i =                                 \
824
-    (vector signed short)vec_mergeh((vector signed char)zero,           \
825
-                                    (vector signed char)v_##src##A##i)
826
-
827
-  LOAD_LINE(src, 0);
828
-  LOAD_LINE(src, 1);
829
-  LOAD_LINE(src, 2);
830
-  LOAD_LINE(src, 3);
831
-  LOAD_LINE(src, 4);
832
-  LOAD_LINE(src, 5);
833
-  LOAD_LINE(src, 6);
834
-  LOAD_LINE(src, 7);
835
-
836
-  LOAD_LINE(tempBlured, 0);
837
-  LOAD_LINE(tempBlured, 1);
838
-  LOAD_LINE(tempBlured, 2);
839
-  LOAD_LINE(tempBlured, 3);
840
-  LOAD_LINE(tempBlured, 4);
841
-  LOAD_LINE(tempBlured, 5);
842
-  LOAD_LINE(tempBlured, 6);
843
-  LOAD_LINE(tempBlured, 7);
817
+    register int j##src##i = i * stride;                                \
818
+    vector unsigned char perm##src##i = vec_lvsl(j##src##i, src);       \
819
+    const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
820
+    const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
821
+    const vector unsigned char v_##src##A##i =                          \
822
+        vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i);         \
823
+    vector signed short v_##src##Ass##i =                               \
824
+        (vector signed short)vec_mergeh((vector signed char)zero,       \
825
+                                        (vector signed char)v_##src##A##i)
826
+
827
+    LOAD_LINE(src, 0);
828
+    LOAD_LINE(src, 1);
829
+    LOAD_LINE(src, 2);
830
+    LOAD_LINE(src, 3);
831
+    LOAD_LINE(src, 4);
832
+    LOAD_LINE(src, 5);
833
+    LOAD_LINE(src, 6);
834
+    LOAD_LINE(src, 7);
835
+
836
+    LOAD_LINE(tempBlured, 0);
837
+    LOAD_LINE(tempBlured, 1);
838
+    LOAD_LINE(tempBlured, 2);
839
+    LOAD_LINE(tempBlured, 3);
840
+    LOAD_LINE(tempBlured, 4);
841
+    LOAD_LINE(tempBlured, 5);
842
+    LOAD_LINE(tempBlured, 6);
843
+    LOAD_LINE(tempBlured, 7);
844 844
 #undef LOAD_LINE
845 845
 
846 846
 #define ACCUMULATE_DIFFS(i)                                     \
847
-  vector signed short v_d##i = vec_sub(v_tempBluredAss##i,      \
848
-                                       v_srcAss##i);            \
849
-  v_dp = vec_msums(v_d##i, v_d##i, v_dp);                       \
850
-  v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)
851
-
852
-  ACCUMULATE_DIFFS(0);
853
-  ACCUMULATE_DIFFS(1);
854
-  ACCUMULATE_DIFFS(2);
855
-  ACCUMULATE_DIFFS(3);
856
-  ACCUMULATE_DIFFS(4);
857
-  ACCUMULATE_DIFFS(5);
858
-  ACCUMULATE_DIFFS(6);
859
-  ACCUMULATE_DIFFS(7);
847
+    vector signed short v_d##i = vec_sub(v_tempBluredAss##i,    \
848
+                                         v_srcAss##i);          \
849
+    v_dp = vec_msums(v_d##i, v_d##i, v_dp);                     \
850
+    v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)
851
+
852
+    ACCUMULATE_DIFFS(0);
853
+    ACCUMULATE_DIFFS(1);
854
+    ACCUMULATE_DIFFS(2);
855
+    ACCUMULATE_DIFFS(3);
856
+    ACCUMULATE_DIFFS(4);
857
+    ACCUMULATE_DIFFS(5);
858
+    ACCUMULATE_DIFFS(6);
859
+    ACCUMULATE_DIFFS(7);
860 860
 #undef ACCUMULATE_DIFFS
861 861
 
862
-  v_dp = vec_sums(v_dp, zero);
863
-  v_sysdp = vec_sums(v_sysdp, zero);
862
+    v_dp = vec_sums(v_dp, zero);
863
+    v_sysdp = vec_sums(v_sysdp, zero);
864 864
 
865
-  v_dp = vec_splat(v_dp, 3);
866
-  v_sysdp = vec_splat(v_sysdp, 3);
865
+    v_dp = vec_splat(v_dp, 3);
866
+    v_sysdp = vec_splat(v_sysdp, 3);
867 867
 
868
-  vec_ste(v_dp, 0, &d);
869
-  vec_ste(v_sysdp, 0, &sysd);
868
+    vec_ste(v_dp, 0, &d);
869
+    vec_ste(v_sysdp, 0, &sysd);
870 870
 
871
-  i = d;
872
-  d = (4*d
873
-       +(*(tempBluredPast-256))
874
-       +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
875
-       +(*(tempBluredPast+256))
876
-       +4)>>3;
871
+    i = d;
872
+    d = (4*d
873
+         +(*(tempBluredPast-256))
874
+         +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
875
+         +(*(tempBluredPast+256))
876
+         +4)>>3;
877 877
 
878
-  *tempBluredPast=i;
878
+    *tempBluredPast=i;
879 879
 
880
-  if (d > maxNoise[1]) {
881
-    if (d < maxNoise[2]) {
880
+    if (d > maxNoise[1]) {
881
+        if (d < maxNoise[2]) {
882 882
 #define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i);
883 883
 
884
-      OP(0);
885
-      OP(1);
886
-      OP(2);
887
-      OP(3);
888
-      OP(4);
889
-      OP(5);
890
-      OP(6);
891
-      OP(7);
884
+            OP(0);
885
+            OP(1);
886
+            OP(2);
887
+            OP(3);
888
+            OP(4);
889
+            OP(5);
890
+            OP(6);
891
+            OP(7);
892 892
 #undef OP
893
-    } else {
893
+        } else {
894 894
 #define OP(i) v_tempBluredAss##i = v_srcAss##i;
895 895
 
896
-      OP(0);
897
-      OP(1);
898
-      OP(2);
899
-      OP(3);
900
-      OP(4);
901
-      OP(5);
902
-      OP(6);
903
-      OP(7);
896
+            OP(0);
897
+            OP(1);
898
+            OP(2);
899
+            OP(3);
900
+            OP(4);
901
+            OP(5);
902
+            OP(6);
903
+            OP(7);
904 904
 #undef OP
905
-    }
906
-  } else {
907
-    if (d < maxNoise[0]) {
908
-      const vector signed short vsint16_7 = vec_splat_s16(7);
909
-      const vector signed short vsint16_4 = vec_splat_s16(4);
910
-      const vector unsigned short vuint16_3 = vec_splat_u16(3);
905
+        }
906
+    } else {
907
+        if (d < maxNoise[0]) {
908
+            const vector signed short vsint16_7 = vec_splat_s16(7);
909
+            const vector signed short vsint16_4 = vec_splat_s16(4);
910
+            const vector unsigned short vuint16_3 = vec_splat_u16(3);
911 911
 
912 912
 #define OP(i)                                                   \
913
-      const vector signed short v_temp##i =                     \
914
-        vec_mladd(v_tempBluredAss##i,                           \
915
-                  vsint16_7, v_srcAss##i);                      \
916
-      const vector signed short v_temp2##i =                    \
917
-        vec_add(v_temp##i, vsint16_4);                          \
918
-      v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)
919
-
920
-      OP(0);
921
-      OP(1);
922
-      OP(2);
923
-      OP(3);
924
-      OP(4);
925
-      OP(5);
926
-      OP(6);
927
-      OP(7);
913
+            const vector signed short v_temp##i =               \
914
+                vec_mladd(v_tempBluredAss##i,                   \
915
+                          vsint16_7, v_srcAss##i);              \
916
+            const vector signed short v_temp2##i =              \
917
+                vec_add(v_temp##i, vsint16_4);                  \
918
+            v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)
919
+
920
+            OP(0);
921
+            OP(1);
922
+            OP(2);
923
+            OP(3);
924
+            OP(4);
925
+            OP(5);
926
+            OP(6);
927
+            OP(7);
928 928
 #undef OP
929
-    } else {
930
-      const vector signed short vsint16_3 = vec_splat_s16(3);
931
-      const vector signed short vsint16_2 = vec_splat_s16(2);
929
+        } else {
930
+            const vector signed short vsint16_3 = vec_splat_s16(3);
931
+            const vector signed short vsint16_2 = vec_splat_s16(2);
932 932
 
933 933
 #define OP(i)                                                   \
934
-      const vector signed short v_temp##i =                     \
935
-        vec_mladd(v_tempBluredAss##i,                           \
936
-                  vsint16_3, v_srcAss##i);                      \
937
-      const vector signed short v_temp2##i =                    \
938
-        vec_add(v_temp##i, vsint16_2);                          \
939
-      v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)
940
-
941
-      OP(0);
942
-      OP(1);
943
-      OP(2);
944
-      OP(3);
945
-      OP(4);
946
-      OP(5);
947
-      OP(6);
948
-      OP(7);
934
+            const vector signed short v_temp##i =               \
935
+                vec_mladd(v_tempBluredAss##i,                   \
936
+                          vsint16_3, v_srcAss##i);              \
937
+            const vector signed short v_temp2##i =              \
938
+                vec_add(v_temp##i, vsint16_2);                  \
939
+            v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)
940
+
941
+            OP(0);
942
+            OP(1);
943
+            OP(2);
944
+            OP(3);
945
+            OP(4);
946
+            OP(5);
947
+            OP(6);
948
+            OP(7);
949 949
 #undef OP
950
+        }
950 951
     }
951
-  }
952 952
 
953
-  const vector signed char neg1 = vec_splat_s8(-1);
954
-  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
955
-                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
953
+    const vector signed char neg1 = vec_splat_s8(-1);
954
+    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
955
+                                                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
956 956
 
957 957
 #define PACK_AND_STORE(src, i)                                  \
958
-  const vector unsigned char perms##src##i =                    \
959
-    vec_lvsr(i * stride, src);                                  \
960
-  const vector unsigned char vf##src##i =                       \
961
-    vec_packsu(v_tempBluredAss##i, (vector signed short)zero);  \
962
-  const vector unsigned char vg##src##i =                       \
963
-    vec_perm(vf##src##i, v_##src##A##i, permHH);                \
964
-  const vector unsigned char mask##src##i =                     \
965
-    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
966
-  const vector unsigned char vg2##src##i =                      \
967
-    vec_perm(vg##src##i, vg##src##i, perms##src##i);            \
968
-  const vector unsigned char svA##src##i =                      \
969
-    vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i);         \
970
-  const vector unsigned char svB##src##i =                      \
971
-    vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i);         \
972
-  vec_st(svA##src##i, i * stride, src);                         \
973
-  vec_st(svB##src##i, i * stride + 16, src)
974
-
975
-  PACK_AND_STORE(src, 0);
976
-  PACK_AND_STORE(src, 1);
977
-  PACK_AND_STORE(src, 2);
978
-  PACK_AND_STORE(src, 3);
979
-  PACK_AND_STORE(src, 4);
980
-  PACK_AND_STORE(src, 5);
981
-  PACK_AND_STORE(src, 6);
982
-  PACK_AND_STORE(src, 7);
983
-  PACK_AND_STORE(tempBlured, 0);
984
-  PACK_AND_STORE(tempBlured, 1);
985
-  PACK_AND_STORE(tempBlured, 2);
986
-  PACK_AND_STORE(tempBlured, 3);
987
-  PACK_AND_STORE(tempBlured, 4);
988
-  PACK_AND_STORE(tempBlured, 5);
989
-  PACK_AND_STORE(tempBlured, 6);
990
-  PACK_AND_STORE(tempBlured, 7);
958
+    const vector unsigned char perms##src##i =                  \
959
+        vec_lvsr(i * stride, src);                              \
960
+    const vector unsigned char vf##src##i =                     \
961
+        vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \
962
+    const vector unsigned char vg##src##i =                     \
963
+        vec_perm(vf##src##i, v_##src##A##i, permHH);            \
964
+    const vector unsigned char mask##src##i =                   \
965
+        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
966
+    const vector unsigned char vg2##src##i =                    \
967
+        vec_perm(vg##src##i, vg##src##i, perms##src##i);        \
968
+    const vector unsigned char svA##src##i =                    \
969
+        vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i);     \
970
+    const vector unsigned char svB##src##i =                    \
971
+        vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i);     \
972
+    vec_st(svA##src##i, i * stride, src);                       \
973
+    vec_st(svB##src##i, i * stride + 16, src)
974
+
975
+    PACK_AND_STORE(src, 0);
976
+    PACK_AND_STORE(src, 1);
977
+    PACK_AND_STORE(src, 2);
978
+    PACK_AND_STORE(src, 3);
979
+    PACK_AND_STORE(src, 4);
980
+    PACK_AND_STORE(src, 5);
981
+    PACK_AND_STORE(src, 6);
982
+    PACK_AND_STORE(src, 7);
983
+    PACK_AND_STORE(tempBlured, 0);
984
+    PACK_AND_STORE(tempBlured, 1);
985
+    PACK_AND_STORE(tempBlured, 2);
986
+    PACK_AND_STORE(tempBlured, 3);
987
+    PACK_AND_STORE(tempBlured, 4);
988
+    PACK_AND_STORE(tempBlured, 5);
989
+    PACK_AND_STORE(tempBlured, 6);
990
+    PACK_AND_STORE(tempBlured, 7);
991 991
 #undef PACK_AND_STORE
992 992
 }
993 993
 
994 994
 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
995
-  const vector unsigned char zero = vec_splat_u8(0);
995
+    const vector unsigned char zero = vec_splat_u8(0);
996 996
 
997 997
 #define LOAD_DOUBLE_LINE(i, j)                                          \
998
-  vector unsigned char perm1##i = vec_lvsl(i * stride, src);            \
999
-  vector unsigned char perm2##i = vec_lvsl(j * stride, src);            \
1000
-  vector unsigned char srcA##i = vec_ld(i * stride, src);               \
1001
-  vector unsigned char srcB##i = vec_ld(i * stride + 16, src);          \
1002
-  vector unsigned char srcC##i = vec_ld(j * stride, src);               \
1003
-  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src);           \
1004
-  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i);   \
1005
-  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1006
-
1007
-  LOAD_DOUBLE_LINE(0, 1);
1008
-  LOAD_DOUBLE_LINE(2, 3);
1009
-  LOAD_DOUBLE_LINE(4, 5);
1010
-  LOAD_DOUBLE_LINE(6, 7);
998
+    vector unsigned char perm1##i = vec_lvsl(i * stride, src);          \
999
+    vector unsigned char perm2##i = vec_lvsl(j * stride, src);          \
1000
+    vector unsigned char srcA##i = vec_ld(i * stride, src);             \
1001
+    vector unsigned char srcB##i = vec_ld(i * stride + 16, src);        \
1002
+    vector unsigned char srcC##i = vec_ld(j * stride, src);             \
1003
+    vector unsigned char srcD##i = vec_ld(j * stride+ 16, src);         \
1004
+    vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
1005
+    vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1006
+
1007
+    LOAD_DOUBLE_LINE(0, 1);
1008
+    LOAD_DOUBLE_LINE(2, 3);
1009
+    LOAD_DOUBLE_LINE(4, 5);
1010
+    LOAD_DOUBLE_LINE(6, 7);
1011 1011
 #undef LOAD_DOUBLE_LINE
1012 1012
 
1013
-  vector unsigned char tempA = vec_mergeh(src0, zero);
1014
-  vector unsigned char tempB = vec_mergel(src0, zero);
1015
-  vector unsigned char tempC = vec_mergeh(src1, zero);
1016
-  vector unsigned char tempD = vec_mergel(src1, zero);
1017
-  vector unsigned char tempE = vec_mergeh(src2, zero);
1018
-  vector unsigned char tempF = vec_mergel(src2, zero);
1019
-  vector unsigned char tempG = vec_mergeh(src3, zero);
1020
-  vector unsigned char tempH = vec_mergel(src3, zero);
1021
-  vector unsigned char tempI = vec_mergeh(src4, zero);
1022
-  vector unsigned char tempJ = vec_mergel(src4, zero);
1023
-  vector unsigned char tempK = vec_mergeh(src5, zero);
1024
-  vector unsigned char tempL = vec_mergel(src5, zero);
1025
-  vector unsigned char tempM = vec_mergeh(src6, zero);
1026
-  vector unsigned char tempN = vec_mergel(src6, zero);
1027
-  vector unsigned char tempO = vec_mergeh(src7, zero);
1028
-  vector unsigned char tempP = vec_mergel(src7, zero);
1029
-
1030
-  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1031
-  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1032
-  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
1033
-  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
1034
-  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1035
-  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1036
-  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
1037
-  vector unsigned char temp7 = vec_mergel(tempD, tempL);
1038
-  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1039
-  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1040
-  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
1041
-  vector unsigned char temp11 = vec_mergel(tempF, tempN);
1042
-  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1043
-  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1044
-  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
1045
-  vector unsigned char temp15 = vec_mergel(tempH, tempP);
1046
-
1047
-  tempA = vec_mergeh(temp0, temp8);
1048
-  tempB = vec_mergel(temp0, temp8);
1049
-  tempC = vec_mergeh(temp1, temp9);
1050
-  tempD = vec_mergel(temp1, temp9);
1051
-  tempE = vec_mergeh(temp2, temp10);
1052
-  tempF = vec_mergel(temp2, temp10);
1053
-  tempG = vec_mergeh(temp3, temp11);
1054
-  tempH = vec_mergel(temp3, temp11);
1055
-  tempI = vec_mergeh(temp4, temp12);
1056
-  tempJ = vec_mergel(temp4, temp12);
1057
-  tempK = vec_mergeh(temp5, temp13);
1058
-  tempL = vec_mergel(temp5, temp13);
1059
-  tempM = vec_mergeh(temp6, temp14);
1060
-  tempN = vec_mergel(temp6, temp14);
1061
-  tempO = vec_mergeh(temp7, temp15);
1062
-  tempP = vec_mergel(temp7, temp15);
1063
-
1064
-  temp0 = vec_mergeh(tempA, tempI);
1065
-  temp1 = vec_mergel(tempA, tempI);
1066
-  temp2 = vec_mergeh(tempB, tempJ);
1067
-  temp3 = vec_mergel(tempB, tempJ);
1068
-  temp4 = vec_mergeh(tempC, tempK);
1069
-  temp5 = vec_mergel(tempC, tempK);
1070
-  temp6 = vec_mergeh(tempD, tempL);
1071
-  temp7 = vec_mergel(tempD, tempL);
1072
-  temp8 = vec_mergeh(tempE, tempM);
1073
-  temp9 = vec_mergel(tempE, tempM);
1074
-  temp10 = vec_mergeh(tempF, tempN);
1075
-  temp11 = vec_mergel(tempF, tempN);
1076
-  temp12 = vec_mergeh(tempG, tempO);
1077
-  temp13 = vec_mergel(tempG, tempO);
1078
-  temp14 = vec_mergeh(tempH, tempP);
1079
-  temp15 = vec_mergel(tempH, tempP);
1080
-
1081
-  vec_st(temp0, 0, dst);
1082
-  vec_st(temp1, 16, dst);
1083
-  vec_st(temp2, 32, dst);
1084
-  vec_st(temp3, 48, dst);
1085
-  vec_st(temp4, 64, dst);
1086
-  vec_st(temp5, 80, dst);
1087
-  vec_st(temp6, 96, dst);
1088
-  vec_st(temp7, 112, dst);
1089
-  vec_st(temp8, 128, dst);
1090
-  vec_st(temp9, 144, dst);
1091
-  vec_st(temp10, 160, dst);
1092
-  vec_st(temp11, 176, dst);
1093
-  vec_st(temp12, 192, dst);
1094
-  vec_st(temp13, 208, dst);
1095
-  vec_st(temp14, 224, dst);
1096
-  vec_st(temp15, 240, dst);
1013
+    vector unsigned char tempA = vec_mergeh(src0, zero);
1014
+    vector unsigned char tempB = vec_mergel(src0, zero);
1015
+    vector unsigned char tempC = vec_mergeh(src1, zero);
1016
+    vector unsigned char tempD = vec_mergel(src1, zero);
1017
+    vector unsigned char tempE = vec_mergeh(src2, zero);
1018
+    vector unsigned char tempF = vec_mergel(src2, zero);
1019
+    vector unsigned char tempG = vec_mergeh(src3, zero);
1020
+    vector unsigned char tempH = vec_mergel(src3, zero);
1021
+    vector unsigned char tempI = vec_mergeh(src4, zero);
1022
+    vector unsigned char tempJ = vec_mergel(src4, zero);
1023
+    vector unsigned char tempK = vec_mergeh(src5, zero);
1024
+    vector unsigned char tempL = vec_mergel(src5, zero);
1025
+    vector unsigned char tempM = vec_mergeh(src6, zero);
1026
+    vector unsigned char tempN = vec_mergel(src6, zero);
1027
+    vector unsigned char tempO = vec_mergeh(src7, zero);
1028
+    vector unsigned char tempP = vec_mergel(src7, zero);
1029
+
1030
+    vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1031
+    vector unsigned char temp1 = vec_mergel(tempA, tempI);
1032
+    vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
1033
+    vector unsigned char temp3 = vec_mergel(tempB, tempJ);
1034
+    vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1035
+    vector unsigned char temp5 = vec_mergel(tempC, tempK);
1036
+    vector unsigned char temp6 = vec_mergeh(tempD, tempL);
1037
+    vector unsigned char temp7 = vec_mergel(tempD, tempL);
1038
+    vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1039
+    vector unsigned char temp9 = vec_mergel(tempE, tempM);
1040
+    vector unsigned char temp10 = vec_mergeh(tempF, tempN);
1041
+    vector unsigned char temp11 = vec_mergel(tempF, tempN);
1042
+    vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1043
+    vector unsigned char temp13 = vec_mergel(tempG, tempO);
1044
+    vector unsigned char temp14 = vec_mergeh(tempH, tempP);
1045
+    vector unsigned char temp15 = vec_mergel(tempH, tempP);
1046
+
1047
+    tempA = vec_mergeh(temp0, temp8);
1048
+    tempB = vec_mergel(temp0, temp8);
1049
+    tempC = vec_mergeh(temp1, temp9);
1050
+    tempD = vec_mergel(temp1, temp9);
1051
+    tempE = vec_mergeh(temp2, temp10);
1052
+    tempF = vec_mergel(temp2, temp10);
1053
+    tempG = vec_mergeh(temp3, temp11);
1054
+    tempH = vec_mergel(temp3, temp11);
1055
+    tempI = vec_mergeh(temp4, temp12);
1056
+    tempJ = vec_mergel(temp4, temp12);
1057
+    tempK = vec_mergeh(temp5, temp13);
1058
+    tempL = vec_mergel(temp5, temp13);
1059
+    tempM = vec_mergeh(temp6, temp14);
1060
+    tempN = vec_mergel(temp6, temp14);
1061
+    tempO = vec_mergeh(temp7, temp15);
1062
+    tempP = vec_mergel(temp7, temp15);
1063
+
1064
+    temp0 = vec_mergeh(tempA, tempI);
1065
+    temp1 = vec_mergel(tempA, tempI);
1066
+    temp2 = vec_mergeh(tempB, tempJ);
1067
+    temp3 = vec_mergel(tempB, tempJ);
1068
+    temp4 = vec_mergeh(tempC, tempK);
1069
+    temp5 = vec_mergel(tempC, tempK);
1070
+    temp6 = vec_mergeh(tempD, tempL);
1071
+    temp7 = vec_mergel(tempD, tempL);
1072
+    temp8 = vec_mergeh(tempE, tempM);
1073
+    temp9 = vec_mergel(tempE, tempM);
1074
+    temp10 = vec_mergeh(tempF, tempN);
1075
+    temp11 = vec_mergel(tempF, tempN);
1076
+    temp12 = vec_mergeh(tempG, tempO);
1077
+    temp13 = vec_mergel(tempG, tempO);
1078
+    temp14 = vec_mergeh(tempH, tempP);
1079
+    temp15 = vec_mergel(tempH, tempP);
1080
+
1081
+    vec_st(temp0, 0, dst);
1082
+    vec_st(temp1, 16, dst);
1083
+    vec_st(temp2, 32, dst);
1084
+    vec_st(temp3, 48, dst);
1085
+    vec_st(temp4, 64, dst);
1086
+    vec_st(temp5, 80, dst);
1087
+    vec_st(temp6, 96, dst);
1088
+    vec_st(temp7, 112, dst);
1089
+    vec_st(temp8, 128, dst);
1090
+    vec_st(temp9, 144, dst);
1091
+    vec_st(temp10, 160, dst);
1092
+    vec_st(temp11, 176, dst);
1093
+    vec_st(temp12, 192, dst);
1094
+    vec_st(temp13, 208, dst);
1095
+    vec_st(temp14, 224, dst);
1096
+    vec_st(temp15, 240, dst);
1097 1097
 }
1098 1098
 
1099 1099
 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1100
-  const vector unsigned char zero = vec_splat_u8(0);
1100
+    const vector unsigned char zero = vec_splat_u8(0);
1101 1101
 
1102 1102
 #define LOAD_DOUBLE_LINE(i, j)                                  \
1103
-  vector unsigned char src##i = vec_ld(i * 16, src);            \
1104
-  vector unsigned char src##j = vec_ld(j * 16, src)
1105
-
1106
-  LOAD_DOUBLE_LINE(0, 1);
1107
-  LOAD_DOUBLE_LINE(2, 3);
1108
-  LOAD_DOUBLE_LINE(4, 5);
1109
-  LOAD_DOUBLE_LINE(6, 7);
1110
-  LOAD_DOUBLE_LINE(8, 9);
1111
-  LOAD_DOUBLE_LINE(10, 11);
1112
-  LOAD_DOUBLE_LINE(12, 13);
1113
-  LOAD_DOUBLE_LINE(14, 15);
1103
+    vector unsigned char src##i = vec_ld(i * 16, src);            \
1104
+    vector unsigned char src##j = vec_ld(j * 16, src)
1105
+
1106
+    LOAD_DOUBLE_LINE(0, 1);
1107
+    LOAD_DOUBLE_LINE(2, 3);
1108
+    LOAD_DOUBLE_LINE(4, 5);
1109
+    LOAD_DOUBLE_LINE(6, 7);
1110
+    LOAD_DOUBLE_LINE(8, 9);
1111
+    LOAD_DOUBLE_LINE(10, 11);
1112
+    LOAD_DOUBLE_LINE(12, 13);
1113
+    LOAD_DOUBLE_LINE(14, 15);
1114 1114
 #undef LOAD_DOUBLE_LINE
1115 1115
 
1116
-  vector unsigned char tempA = vec_mergeh(src0, src8);
1117
-  vector unsigned char tempB;
1118
-  vector unsigned char tempC = vec_mergeh(src1, src9);
1119
-  vector unsigned char tempD;
1120
-  vector unsigned char tempE = vec_mergeh(src2, src10);
1121
-  vector unsigned char tempG = vec_mergeh(src3, src11);
1122
-  vector unsigned char tempI = vec_mergeh(src4, src12);
1123
-  vector unsigned char tempJ;
1124
-  vector unsigned char tempK = vec_mergeh(src5, src13);
1125
-  vector unsigned char tempL;
1126
-  vector unsigned char tempM = vec_mergeh(src6, src14);
1127
-  vector unsigned char tempO = vec_mergeh(src7, src15);
1128
-
1129
-  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1130
-  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1131
-  vector unsigned char temp2;
1132
-  vector unsigned char temp3;
1133
-  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1134
-  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1135
-  vector unsigned char temp6;
1136
-  vector unsigned char temp7;
1137
-  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1138
-  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1139
-  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1140
-  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1141
-
1142
-  tempA = vec_mergeh(temp0, temp8);
1143
-  tempB = vec_mergel(temp0, temp8);
1144
-  tempC = vec_mergeh(temp1, temp9);
1145
-  tempD = vec_mergel(temp1, temp9);
1146
-  tempI = vec_mergeh(temp4, temp12);
1147
-  tempJ = vec_mergel(temp4, temp12);
1148
-  tempK = vec_mergeh(temp5, temp13);
1149
-  tempL = vec_mergel(temp5, temp13);
1150
-
1151
-  temp0 = vec_mergeh(tempA, tempI);
1152
-  temp1 = vec_mergel(tempA, tempI);
1153
-  temp2 = vec_mergeh(tempB, tempJ);
1154
-  temp3 = vec_mergel(tempB, tempJ);
1155
-  temp4 = vec_mergeh(tempC, tempK);
1156
-  temp5 = vec_mergel(tempC, tempK);
1157
-  temp6 = vec_mergeh(tempD, tempL);
1158
-  temp7 = vec_mergel(tempD, tempL);
1159
-
1160
-
1161
-  const vector signed char neg1 = vec_splat_s8(-1);
1116
+    vector unsigned char tempA = vec_mergeh(src0, src8);
1117
+    vector unsigned char tempB;
1118
+    vector unsigned char tempC = vec_mergeh(src1, src9);
1119
+    vector unsigned char tempD;
1120
+    vector unsigned char tempE = vec_mergeh(src2, src10);
1121
+    vector unsigned char tempG = vec_mergeh(src3, src11);
1122
+    vector unsigned char tempI = vec_mergeh(src4, src12);
1123
+    vector unsigned char tempJ;
1124
+    vector unsigned char tempK = vec_mergeh(src5, src13);
1125
+    vector unsigned char tempL;
1126
+    vector unsigned char tempM = vec_mergeh(src6, src14);
1127
+    vector unsigned char tempO = vec_mergeh(src7, src15);
1128
+
1129
+    vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1130
+    vector unsigned char temp1 = vec_mergel(tempA, tempI);
1131
+    vector unsigned char temp2;
1132
+    vector unsigned char temp3;
1133
+    vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1134
+    vector unsigned char temp5 = vec_mergel(tempC, tempK);
1135
+    vector unsigned char temp6;
1136
+    vector unsigned char temp7;
1137
+    vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1138
+    vector unsigned char temp9 = vec_mergel(tempE, tempM);
1139
+    vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1140
+    vector unsigned char temp13 = vec_mergel(tempG, tempO);
1141
+
1142
+    tempA = vec_mergeh(temp0, temp8);
1143
+    tempB = vec_mergel(temp0, temp8);
1144
+    tempC = vec_mergeh(temp1, temp9);
1145
+    tempD = vec_mergel(temp1, temp9);
1146
+    tempI = vec_mergeh(temp4, temp12);
1147
+    tempJ = vec_mergel(temp4, temp12);
1148
+    tempK = vec_mergeh(temp5, temp13);
1149
+    tempL = vec_mergel(temp5, temp13);
1150
+
1151
+    temp0 = vec_mergeh(tempA, tempI);
1152
+    temp1 = vec_mergel(tempA, tempI);
1153
+    temp2 = vec_mergeh(tempB, tempJ);
1154
+    temp3 = vec_mergel(tempB, tempJ);
1155
+    temp4 = vec_mergeh(tempC, tempK);
1156
+    temp5 = vec_mergel(tempC, tempK);
1157
+    temp6 = vec_mergeh(tempD, tempL);
1158
+    temp7 = vec_mergel(tempD, tempL);
1159
+
1160
+
1161
+    const vector signed char neg1 = vec_splat_s8(-1);
1162 1162
 #define STORE_DOUBLE_LINE(i, j)                                         \
1163
-  vector unsigned char dstA##i = vec_ld(i * stride, dst);               \
1164
-  vector unsigned char dstB##i = vec_ld(i * stride + 16, dst);          \
1165
-  vector unsigned char dstA##j = vec_ld(j * stride, dst);               \
1166
-  vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst);           \
1167
-  vector unsigned char align##i = vec_lvsr(i * stride, dst);            \
1168
-  vector unsigned char align##j = vec_lvsr(j * stride, dst);            \
1169
-  vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \
1170
-  vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \
1171
-  vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);  \
1172
-  vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);  \
1173
-  vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i);   \
1174
-  vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i);   \
1175
-  vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j);   \
1176
-  vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j);   \
1177
-  vec_st(dstAF##i, i * stride, dst);                                    \
1178
-  vec_st(dstBF##i, i * stride + 16, dst);                               \
1179
-  vec_st(dstAF##j, j * stride, dst);                                    \
1180
-  vec_st(dstBF##j, j * stride + 16, dst)
1181
-
1182
-  STORE_DOUBLE_LINE(0,1);
1183
-  STORE_DOUBLE_LINE(2,3);
1184
-  STORE_DOUBLE_LINE(4,5);
1185
-  STORE_DOUBLE_LINE(6,7);
1163
+    vector unsigned char dstA##i = vec_ld(i * stride, dst);             \
1164
+    vector unsigned char dstB##i = vec_ld(i * stride + 16, dst);        \
1165
+    vector unsigned char dstA##j = vec_ld(j * stride, dst);             \
1166
+    vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst);         \
1167
+    vector unsigned char align##i = vec_lvsr(i * stride, dst);          \
1168
+    vector unsigned char align##j = vec_lvsr(j * stride, dst);          \
1169
+    vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \
1170
+    vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \
1171
+    vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);\
1172
+    vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);\
1173
+    vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \
1174
+    vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \
1175
+    vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \
1176
+    vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \
1177
+    vec_st(dstAF##i, i * stride, dst);                                  \
1178
+    vec_st(dstBF##i, i * stride + 16, dst);                             \
1179
+    vec_st(dstAF##j, j * stride, dst);                                  \
1180
+    vec_st(dstBF##j, j * stride + 16, dst)
1181
+
1182
+    STORE_DOUBLE_LINE(0,1);
1183
+    STORE_DOUBLE_LINE(2,3);
1184
+    STORE_DOUBLE_LINE(4,5);
1185
+    STORE_DOUBLE_LINE(6,7);
1186 1186
 }
... ...
@@ -55,13 +55,13 @@
55 55
 #define FULL_Y_RANGE    0x8000                  // 32768
56 56
 
57 57
 //Deinterlacing Filters
58
-#define        LINEAR_IPOL_DEINT_FILTER         0x10000 // 65536
59
-#define        LINEAR_BLEND_DEINT_FILTER        0x20000 // 131072
60
-#define        CUBIC_BLEND_DEINT_FILTER         0x8000  // (not implemented yet)
61
-#define        CUBIC_IPOL_DEINT_FILTER          0x40000 // 262144
62
-#define        MEDIAN_DEINT_FILTER              0x80000 // 524288
63
-#define        FFMPEG_DEINT_FILTER              0x400000
64
-#define        LOWPASS5_DEINT_FILTER            0x800000
58
+#define LINEAR_IPOL_DEINT_FILTER        0x10000 // 65536
59
+#define LINEAR_BLEND_DEINT_FILTER       0x20000 // 131072
60
+#define CUBIC_BLEND_DEINT_FILTER        0x8000  // (not implemented yet)
61
+#define CUBIC_IPOL_DEINT_FILTER         0x40000 // 262144
62
+#define MEDIAN_DEINT_FILTER             0x80000 // 524288
63
+#define FFMPEG_DEINT_FILTER             0x400000
64
+#define LOWPASS5_DEINT_FILTER           0x800000
65 65
 
66 66
 #define TEMP_NOISE_FILTER               0x100000
67 67
 #define FORCE_QUANT                     0x200000
... ...
@@ -73,106 +73,105 @@
73 73
 //#define COMPILE_TIME_MODE 0x77
74 74
 
75 75
 static inline int CLIP(int a){
76
-        if(a&256) return ((a)>>31)^(-1);
77
-        else      return a;
76
+    if(a&256) return ((a)>>31)^(-1);
77
+    else      return a;
78 78
 }
79 79
 /**
80 80
  * Postprocessing filter.
81 81
  */
82 82
 struct PPFilter{
83
-        const char *shortName;
84
-        const char *longName;
85
-        int chromDefault;       ///< is chrominance filtering on by default if this filter is manually activated
86
-        int minLumQuality;      ///< minimum quality to turn luminance filtering on
87
-        int minChromQuality;    ///< minimum quality to turn chrominance filtering on
88
-        int mask;               ///< Bitmask to turn this filter on
83
+    const char *shortName;
84
+    const char *longName;
85
+    int chromDefault;       ///< is chrominance filtering on by default if this filter is manually activated
86
+    int minLumQuality;      ///< minimum quality to turn luminance filtering on
87
+    int minChromQuality;    ///< minimum quality to turn chrominance filtering on
88
+    int mask;               ///< Bitmask to turn this filter on
89 89
 };
90 90
 
91 91
 /**
92 92
  * Postprocessing mode.
93 93
  */
94 94
 typedef struct PPMode{
95
-        int lumMode;                    ///< acivates filters for luminance
96
-        int chromMode;                  ///< acivates filters for chrominance
97
-        int error;                      ///< non zero on error
95
+    int lumMode;                    ///< acivates filters for luminance
96
+    int chromMode;                  ///< acivates filters for chrominance
97
+    int error;                      ///< non zero on error
98 98
 
99
-        int minAllowedY;                ///< for brigtness correction
100
-        int maxAllowedY;                ///< for brihtness correction
101
-        float maxClippedThreshold;      ///< amount of "black" u r willing to loose to get a brightness corrected picture
99
+    int minAllowedY;                ///< for brigtness correction
100
+    int maxAllowedY;                ///< for brihtness correction
101
+    float maxClippedThreshold;      ///< amount of "black" u r willing to loose to get a brightness corrected picture
102 102
 
103
-        int maxTmpNoise[3];             ///< for Temporal Noise Reducing filter (Maximal sum of abs differences)
103
+    int maxTmpNoise[3];             ///< for Temporal Noise Reducing filter (Maximal sum of abs differences)
104 104
 
105
-        int baseDcDiff;
106
-        int flatnessThreshold;
105
+    int baseDcDiff;
106
+    int flatnessThreshold;
107 107
 
108
-        int forcedQuant;                ///< quantizer if FORCE_QUANT is used
108
+    int forcedQuant;                ///< quantizer if FORCE_QUANT is used
109 109
 } PPMode;
110 110
 
111 111
 /**
112 112
  * postprocess context.
113 113
  */
114 114
 typedef struct PPContext{
115
-        /**
116
-         * info on struct for av_log
117
-         */
118
-        const AVClass *av_class;
115
+    /**
116
+     * info on struct for av_log
117
+     */
118
+    const AVClass *av_class;
119 119
 
120
-        uint8_t *tempBlocks; ///<used for the horizontal code
120
+    uint8_t *tempBlocks; ///<used for the horizontal code
121 121
 
122
-        /**
123
-         * luma histogram.
124
-         * we need 64bit here otherwise we'll going to have a problem
125
-         * after watching a black picture for 5 hours
126
-         */
127
-        uint64_t *yHistogram;
122
+    /**
123
+     * luma histogram.
124
+     * we need 64bit here otherwise we'll going to have a problem
125
+     * after watching a black picture for 5 hours
126
+     */
127
+    uint64_t *yHistogram;
128 128
 
129
-        DECLARE_ALIGNED(8, uint64_t, packedYOffset);
130
-        DECLARE_ALIGNED(8, uint64_t, packedYScale);
129
+    DECLARE_ALIGNED(8, uint64_t, packedYOffset);
130
+    DECLARE_ALIGNED(8, uint64_t, packedYScale);
131 131
 
132
-        /** Temporal noise reducing buffers */
133
-        uint8_t *tempBlured[3];
134
-        int32_t *tempBluredPast[3];
132
+    /** Temporal noise reducing buffers */
133
+    uint8_t *tempBlured[3];
134
+    int32_t *tempBluredPast[3];
135 135
 
136
-        /** Temporary buffers for handling the last row(s) */
137
-        uint8_t *tempDst;
138
-        uint8_t *tempSrc;
136
+    /** Temporary buffers for handling the last row(s) */
137
+    uint8_t *tempDst;
138
+    uint8_t *tempSrc;
139 139
 
140
-        uint8_t *deintTemp;
140
+    uint8_t *deintTemp;
141 141
 
142
-        DECLARE_ALIGNED(8, uint64_t, pQPb);
143
-        DECLARE_ALIGNED(8, uint64_t, pQPb2);
142
+    DECLARE_ALIGNED(8, uint64_t, pQPb);
143
+    DECLARE_ALIGNED(8, uint64_t, pQPb2);
144 144
 
145
-        DECLARE_ALIGNED(8, uint64_t, mmxDcOffset[64]);
146
-        DECLARE_ALIGNED(8, uint64_t, mmxDcThreshold[64]);
145
+    DECLARE_ALIGNED(8, uint64_t, mmxDcOffset[64]);
146
+    DECLARE_ALIGNED(8, uint64_t, mmxDcThreshold[64]);
147 147
 
148
-        QP_STORE_T *stdQPTable;       ///< used to fix MPEG2 style qscale
149
-        QP_STORE_T *nonBQPTable;
150
-        QP_STORE_T *forcedQPTable;
148
+    QP_STORE_T *stdQPTable;       ///< used to fix MPEG2 style qscale
149
+    QP_STORE_T *nonBQPTable;
150
+    QP_STORE_T *forcedQPTable;
151 151
 
152
-        int QP;
153
-        int nonBQP;
152
+    int QP;
153
+    int nonBQP;
154 154
 
155
-        int frameNum;
155
+    int frameNum;
156 156
 
157
-        int cpuCaps;
157
+    int cpuCaps;
158 158
 
159
-        int qpStride; ///<size of qp buffers (needed to realloc them if needed)
160
-        int stride;   ///<size of some buffers (needed to realloc them if needed)
159
+    int qpStride; ///<size of qp buffers (needed to realloc them if needed)
160
+    int stride;   ///<size of some buffers (needed to realloc them if needed)
161 161
 
162
-        int hChromaSubSample;
163
-        int vChromaSubSample;
162
+    int hChromaSubSample;
163
+    int vChromaSubSample;
164 164
 
165
-        PPMode ppMode;
165
+    PPMode ppMode;
166 166
 } PPContext;
167 167
 
168 168
 
169
-static inline void linecpy(void *dest, const void *src, int lines, int stride)
170
-{
171
-        if (stride > 0) {
172
-                memcpy(dest, src, lines*stride);
173
-        } else {
174
-                memcpy((uint8_t*)dest+(lines-1)*stride, (const uint8_t*)src+(lines-1)*stride, -lines*stride);
175
-        }
169
+static inline void linecpy(void *dest, const void *src, int lines, int stride) {
170
+    if (stride > 0) {
171
+        memcpy(dest, src, lines*stride);
172
+    } else {
173
+        memcpy((uint8_t*)dest+(lines-1)*stride, (const uint8_t*)src+(lines-1)*stride, -lines*stride);
174
+    }
176 175
 }
177 176
 
178 177
 #endif /* FFMPEG_POSTPROCESS_INTERNAL_H */
... ...
@@ -42,17 +42,17 @@
42 42
 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
43 43
 #elif defined (HAVE_MMX)
44 44
 #define PMINUB(b,a,t) \
45
-        "movq " #a ", " #t " \n\t"\
46
-        "psubusb " #b ", " #t " \n\t"\
47
-        "psubb " #t ", " #a " \n\t"
45
+    "movq " #a ", " #t " \n\t"\
46
+    "psubusb " #b ", " #t " \n\t"\
47
+    "psubb " #t ", " #a " \n\t"
48 48
 #endif
49 49
 
50 50
 #ifdef HAVE_MMX2
51 51
 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
52 52
 #elif defined (HAVE_MMX)
53 53
 #define PMAXUB(a,b) \
54
-        "psubusb " #a ", " #b " \n\t"\
55
-        "paddb " #a ", " #b " \n\t"
54
+    "psubusb " #a ", " #b " \n\t"\
55
+    "paddb " #a ", " #b " \n\t"
56 56
 #endif
57 57
 
58 58
 //FIXME? |255-0| = 1 (should not be a problem ...)
... ...
@@ -61,114 +61,114 @@
61 61
  * Check if the middle 8x8 Block in the given 8x16 block is flat
62 62
  */
63 63
 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
64
-        int numEq= 0, dcOk;
65
-        src+= stride*4; // src points to begin of the 8x8 Block
66
-asm volatile(
67
-                "movq %0, %%mm7                         \n\t"
68
-                "movq %1, %%mm6                         \n\t"
69
-                : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
70
-                );
71
-
72
-asm volatile(
73
-                "lea (%2, %3), %%"REG_a"                \n\t"
64
+    int numEq= 0, dcOk;
65
+    src+= stride*4; // src points to begin of the 8x8 Block
66
+    asm volatile(
67
+        "movq %0, %%mm7                         \n\t"
68
+        "movq %1, %%mm6                         \n\t"
69
+        : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
70
+        );
71
+
72
+    asm volatile(
73
+        "lea (%2, %3), %%"REG_a"                \n\t"
74 74
 //      0       1       2       3       4       5       6       7       8       9
75 75
 //      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2
76 76
 
77
-                "movq (%2), %%mm0                       \n\t"
78
-                "movq (%%"REG_a"), %%mm1                \n\t"
79
-                "movq %%mm0, %%mm3                      \n\t"
80
-                "movq %%mm0, %%mm4                      \n\t"
81
-                PMAXUB(%%mm1, %%mm4)
82
-                PMINUB(%%mm1, %%mm3, %%mm5)
83
-                "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
84
-                "paddb %%mm7, %%mm0                     \n\t"
85
-                "pcmpgtb %%mm6, %%mm0                   \n\t"
86
-
87
-                "movq (%%"REG_a",%3), %%mm2             \n\t"
88
-                PMAXUB(%%mm2, %%mm4)
89
-                PMINUB(%%mm2, %%mm3, %%mm5)
90
-                "psubb %%mm2, %%mm1                     \n\t"
91
-                "paddb %%mm7, %%mm1                     \n\t"
92
-                "pcmpgtb %%mm6, %%mm1                   \n\t"
93
-                "paddb %%mm1, %%mm0                     \n\t"
94
-
95
-                "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
96
-                PMAXUB(%%mm1, %%mm4)
97
-                PMINUB(%%mm1, %%mm3, %%mm5)
98
-                "psubb %%mm1, %%mm2                     \n\t"
99
-                "paddb %%mm7, %%mm2                     \n\t"
100
-                "pcmpgtb %%mm6, %%mm2                   \n\t"
101
-                "paddb %%mm2, %%mm0                     \n\t"
102
-
103
-                "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
104
-
105
-                "movq (%2, %3, 4), %%mm2                \n\t"
106
-                PMAXUB(%%mm2, %%mm4)
107
-                PMINUB(%%mm2, %%mm3, %%mm5)
108
-                "psubb %%mm2, %%mm1                     \n\t"
109
-                "paddb %%mm7, %%mm1                     \n\t"
110
-                "pcmpgtb %%mm6, %%mm1                   \n\t"
111
-                "paddb %%mm1, %%mm0                     \n\t"
112
-
113
-                "movq (%%"REG_a"), %%mm1                \n\t"
114
-                PMAXUB(%%mm1, %%mm4)
115
-                PMINUB(%%mm1, %%mm3, %%mm5)
116
-                "psubb %%mm1, %%mm2                     \n\t"
117
-                "paddb %%mm7, %%mm2                     \n\t"
118
-                "pcmpgtb %%mm6, %%mm2                   \n\t"
119
-                "paddb %%mm2, %%mm0                     \n\t"
120
-
121
-                "movq (%%"REG_a", %3), %%mm2            \n\t"
122
-                PMAXUB(%%mm2, %%mm4)
123
-                PMINUB(%%mm2, %%mm3, %%mm5)
124
-                "psubb %%mm2, %%mm1                     \n\t"
125
-                "paddb %%mm7, %%mm1                     \n\t"
126
-                "pcmpgtb %%mm6, %%mm1                   \n\t"
127
-                "paddb %%mm1, %%mm0                     \n\t"
128
-
129
-                "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
130
-                PMAXUB(%%mm1, %%mm4)
131
-                PMINUB(%%mm1, %%mm3, %%mm5)
132
-                "psubb %%mm1, %%mm2                     \n\t"
133
-                "paddb %%mm7, %%mm2                     \n\t"
134
-                "pcmpgtb %%mm6, %%mm2                   \n\t"
135
-                "paddb %%mm2, %%mm0                     \n\t"
136
-                "psubusb %%mm3, %%mm4                   \n\t"
137
-
138
-                "                                       \n\t"
77
+        "movq (%2), %%mm0                       \n\t"
78
+        "movq (%%"REG_a"), %%mm1                \n\t"
79
+        "movq %%mm0, %%mm3                      \n\t"
80
+        "movq %%mm0, %%mm4                      \n\t"
81
+        PMAXUB(%%mm1, %%mm4)
82
+        PMINUB(%%mm1, %%mm3, %%mm5)
83
+        "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
84
+        "paddb %%mm7, %%mm0                     \n\t"
85
+        "pcmpgtb %%mm6, %%mm0                   \n\t"
86
+
87
+        "movq (%%"REG_a",%3), %%mm2             \n\t"
88
+        PMAXUB(%%mm2, %%mm4)
89
+        PMINUB(%%mm2, %%mm3, %%mm5)
90
+        "psubb %%mm2, %%mm1                     \n\t"
91
+        "paddb %%mm7, %%mm1                     \n\t"
92
+        "pcmpgtb %%mm6, %%mm1                   \n\t"
93
+        "paddb %%mm1, %%mm0                     \n\t"
94
+
95
+        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
96
+        PMAXUB(%%mm1, %%mm4)
97
+        PMINUB(%%mm1, %%mm3, %%mm5)
98
+        "psubb %%mm1, %%mm2                     \n\t"
99
+        "paddb %%mm7, %%mm2                     \n\t"
100
+        "pcmpgtb %%mm6, %%mm2                   \n\t"
101
+        "paddb %%mm2, %%mm0                     \n\t"
102
+
103
+        "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
104
+
105
+        "movq (%2, %3, 4), %%mm2                \n\t"
106
+        PMAXUB(%%mm2, %%mm4)
107
+        PMINUB(%%mm2, %%mm3, %%mm5)
108
+        "psubb %%mm2, %%mm1                     \n\t"
109
+        "paddb %%mm7, %%mm1                     \n\t"
110
+        "pcmpgtb %%mm6, %%mm1                   \n\t"
111
+        "paddb %%mm1, %%mm0                     \n\t"
112
+
113
+        "movq (%%"REG_a"), %%mm1                \n\t"
114
+        PMAXUB(%%mm1, %%mm4)
115
+        PMINUB(%%mm1, %%mm3, %%mm5)
116
+        "psubb %%mm1, %%mm2                     \n\t"
117
+        "paddb %%mm7, %%mm2                     \n\t"
118
+        "pcmpgtb %%mm6, %%mm2                   \n\t"
119
+        "paddb %%mm2, %%mm0                     \n\t"
120
+
121
+        "movq (%%"REG_a", %3), %%mm2            \n\t"
122
+        PMAXUB(%%mm2, %%mm4)
123
+        PMINUB(%%mm2, %%mm3, %%mm5)
124
+        "psubb %%mm2, %%mm1                     \n\t"
125
+        "paddb %%mm7, %%mm1                     \n\t"
126
+        "pcmpgtb %%mm6, %%mm1                   \n\t"
127
+        "paddb %%mm1, %%mm0                     \n\t"
128
+
129
+        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
130
+        PMAXUB(%%mm1, %%mm4)
131
+        PMINUB(%%mm1, %%mm3, %%mm5)
132
+        "psubb %%mm1, %%mm2                     \n\t"
133
+        "paddb %%mm7, %%mm2                     \n\t"
134
+        "pcmpgtb %%mm6, %%mm2                   \n\t"
135
+        "paddb %%mm2, %%mm0                     \n\t"
136
+        "psubusb %%mm3, %%mm4                   \n\t"
137
+
138
+        "                                       \n\t"
139 139
 #ifdef HAVE_MMX2
140
-                "pxor %%mm7, %%mm7                      \n\t"
141
-                "psadbw %%mm7, %%mm0                    \n\t"
140
+        "pxor %%mm7, %%mm7                      \n\t"
141
+        "psadbw %%mm7, %%mm0                    \n\t"
142 142
 #else
143
-                "movq %%mm0, %%mm1                      \n\t"
144
-                "psrlw $8, %%mm0                        \n\t"
145
-                "paddb %%mm1, %%mm0                     \n\t"
146
-                "movq %%mm0, %%mm1                      \n\t"
147
-                "psrlq $16, %%mm0                       \n\t"
148
-                "paddb %%mm1, %%mm0                     \n\t"
149
-                "movq %%mm0, %%mm1                      \n\t"
150
-                "psrlq $32, %%mm0                       \n\t"
151
-                "paddb %%mm1, %%mm0                     \n\t"
143
+        "movq %%mm0, %%mm1                      \n\t"
144
+        "psrlw $8, %%mm0                        \n\t"
145
+        "paddb %%mm1, %%mm0                     \n\t"
146
+        "movq %%mm0, %%mm1                      \n\t"
147
+        "psrlq $16, %%mm0                       \n\t"
148
+        "paddb %%mm1, %%mm0                     \n\t"
149
+        "movq %%mm0, %%mm1                      \n\t"
150
+        "psrlq $32, %%mm0                       \n\t"
151
+        "paddb %%mm1, %%mm0                     \n\t"
152 152
 #endif
153
-                "movq %4, %%mm7                         \n\t" // QP,..., QP
154
-                "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
155
-                "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
156
-                "packssdw %%mm4, %%mm4                  \n\t"
157
-                "movd %%mm0, %0                         \n\t"
158
-                "movd %%mm4, %1                         \n\t"
159
-
160
-                : "=r" (numEq), "=r" (dcOk)
161
-                : "r" (src), "r" ((long)stride), "m" (c->pQPb)
162
-                : "%"REG_a
163
-                );
164
-
165
-        numEq= (-numEq) &0xFF;
166
-        if(numEq > c->ppMode.flatnessThreshold){
167
-            if(dcOk) return 0;
168
-            else     return 1;
169
-        }else{
170
-            return 2;
171
-        }
153
+        "movq %4, %%mm7                         \n\t" // QP,..., QP
154
+        "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
155
+        "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
156
+        "packssdw %%mm4, %%mm4                  \n\t"
157
+        "movd %%mm0, %0                         \n\t"
158
+        "movd %%mm4, %1                         \n\t"
159
+
160
+        : "=r" (numEq), "=r" (dcOk)
161
+        : "r" (src), "r" ((long)stride), "m" (c->pQPb)
162
+        : "%"REG_a
163
+        );
164
+
165
+    numEq= (-numEq) &0xFF;
166
+    if(numEq > c->ppMode.flatnessThreshold){
167
+        if(dcOk) return 0;
168
+        else     return 1;
169
+    }else{
170
+        return 2;
171
+    }
172 172
 }
173 173
 #endif //HAVE_MMX
174 174
 
... ...
@@ -180,172 +180,171 @@ asm volatile(
180 180
 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
181 181
 {
182 182
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
183
-        src+= stride*3;
184
-        asm volatile(        //"movv %0 %1 %2\n\t"
185
-                "movq %2, %%mm0                         \n\t"  // QP,..., QP
186
-                "pxor %%mm4, %%mm4                      \n\t"
187
-
188
-                "movq (%0), %%mm6                       \n\t"
189
-                "movq (%0, %1), %%mm5                   \n\t"
190
-                "movq %%mm5, %%mm1                      \n\t"
191
-                "movq %%mm6, %%mm2                      \n\t"
192
-                "psubusb %%mm6, %%mm5                   \n\t"
193
-                "psubusb %%mm1, %%mm2                   \n\t"
194
-                "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
195
-                "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
196
-                "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
197
-
198
-                "pand %%mm2, %%mm6                      \n\t"
199
-                "pandn %%mm1, %%mm2                     \n\t"
200
-                "por %%mm2, %%mm6                       \n\t"// First Line to Filter
201
-
202
-                "movq (%0, %1, 8), %%mm5                \n\t"
203
-                "lea (%0, %1, 4), %%"REG_a"             \n\t"
204
-                "lea (%0, %1, 8), %%"REG_c"             \n\t"
205
-                "sub %1, %%"REG_c"                      \n\t"
206
-                "add %1, %0                             \n\t" // %0 points to line 1 not 0
207
-                "movq (%0, %1, 8), %%mm7                \n\t"
208
-                "movq %%mm5, %%mm1                      \n\t"
209
-                "movq %%mm7, %%mm2                      \n\t"
210
-                "psubusb %%mm7, %%mm5                   \n\t"
211
-                "psubusb %%mm1, %%mm2                   \n\t"
212
-                "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
213
-                "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
214
-                "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
215
-
216
-                "pand %%mm2, %%mm7                      \n\t"
217
-                "pandn %%mm1, %%mm2                     \n\t"
218
-                "por %%mm2, %%mm7                       \n\t" // First Line to Filter
219
-
220
-
221
-                //      1       2       3       4       5       6       7       8
222
-                //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
223
-                // 6 4 2 2 1 1
224
-                // 6 4 4 2
225
-                // 6 8 2
226
-
227
-                "movq (%0, %1), %%mm0                   \n\t" //  1
228
-                "movq %%mm0, %%mm1                      \n\t" //  1
229
-                PAVGB(%%mm6, %%mm0)                           //1 1        /2
230
-                PAVGB(%%mm6, %%mm0)                           //3 1        /4
231
-
232
-                "movq (%0, %1, 4), %%mm2                \n\t" //     1
233
-                "movq %%mm2, %%mm5                      \n\t" //     1
234
-                PAVGB((%%REGa), %%mm2)                        //    11        /2
235
-                PAVGB((%0, %1, 2), %%mm2)                     //   211        /4
236
-                "movq %%mm2, %%mm3                      \n\t" //   211        /4
237
-                "movq (%0), %%mm4                       \n\t" // 1
238
-                PAVGB(%%mm4, %%mm3)                           // 4 211        /8
239
-                PAVGB(%%mm0, %%mm3)                           //642211        /16
240
-                "movq %%mm3, (%0)                       \n\t" // X
241
-                // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
242
-                "movq %%mm1, %%mm0                      \n\t" //  1
243
-                PAVGB(%%mm6, %%mm0)                           //1 1        /2
244
-                "movq %%mm4, %%mm3                      \n\t" // 1
245
-                PAVGB((%0,%1,2), %%mm3)                       // 1 1        /2
246
-                PAVGB((%%REGa,%1,2), %%mm5)                   //     11        /2
247
-                PAVGB((%%REGa), %%mm5)                        //    211 /4
248
-                PAVGB(%%mm5, %%mm3)                           // 2 2211 /8
249
-                PAVGB(%%mm0, %%mm3)                           //4242211 /16
250
-                "movq %%mm3, (%0,%1)                    \n\t" //  X
251
-                // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
252
-                PAVGB(%%mm4, %%mm6)                                   //11        /2
253
-                "movq (%%"REG_c"), %%mm0                \n\t" //       1
254
-                PAVGB((%%REGa, %1, 2), %%mm0)                 //      11/2
255
-                "movq %%mm0, %%mm3                      \n\t" //      11/2
256
-                PAVGB(%%mm1, %%mm0)                           //  2   11/4
257
-                PAVGB(%%mm6, %%mm0)                           //222   11/8
258
-                PAVGB(%%mm2, %%mm0)                           //22242211/16
259
-                "movq (%0, %1, 2), %%mm2                \n\t" //   1
260
-                "movq %%mm0, (%0, %1, 2)                \n\t" //   X
261
-                // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
262
-                "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
263
-                PAVGB((%%REGc), %%mm0)                        //       11        /2
264
-                PAVGB(%%mm0, %%mm6)                           //11     11        /4
265
-                PAVGB(%%mm1, %%mm4)                           // 11                /2
266
-                PAVGB(%%mm2, %%mm1)                           //  11                /2
267
-                PAVGB(%%mm1, %%mm6)                           //1122   11        /8
268
-                PAVGB(%%mm5, %%mm6)                           //112242211        /16
269
-                "movq (%%"REG_a"), %%mm5                \n\t" //    1
270
-                "movq %%mm6, (%%"REG_a")                \n\t" //    X
271
-                // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
272
-                "movq (%%"REG_a", %1, 4), %%mm6         \n\t" //        1
273
-                PAVGB(%%mm7, %%mm6)                           //        11        /2
274
-                PAVGB(%%mm4, %%mm6)                           // 11     11        /4
275
-                PAVGB(%%mm3, %%mm6)                           // 11   2211        /8
276
-                PAVGB(%%mm5, %%mm2)                           //   11                /2
277
-                "movq (%0, %1, 4), %%mm4                \n\t" //     1
278
-                PAVGB(%%mm4, %%mm2)                           //   112                /4
279
-                PAVGB(%%mm2, %%mm6)                           // 112242211        /16
280
-                "movq %%mm6, (%0, %1, 4)                \n\t" //     X
281
-                // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
282
-                PAVGB(%%mm7, %%mm1)                           //  11     2        /4
283
-                PAVGB(%%mm4, %%mm5)                           //    11                /2
284
-                PAVGB(%%mm5, %%mm0)                           //    11 11        /4
285
-                "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //      1
286
-                PAVGB(%%mm6, %%mm1)                           //  11  4  2        /8
287
-                PAVGB(%%mm0, %%mm1)                           //  11224222        /16
288
-                "movq %%mm1, (%%"REG_a", %1, 2)         \n\t" //      X
289
-                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
290
-                PAVGB((%%REGc), %%mm2)                        //   112 4        /8
291
-                "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
292
-                PAVGB(%%mm0, %%mm6)                           //      1 1        /2
293
-                PAVGB(%%mm7, %%mm6)                           //      1 12        /4
294
-                PAVGB(%%mm2, %%mm6)                           //   1122424        /4
295
-                "movq %%mm6, (%%"REG_c")                \n\t" //       X
296
-                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
297
-                PAVGB(%%mm7, %%mm5)                           //    11   2        /4
298
-                PAVGB(%%mm7, %%mm5)                           //    11   6        /8
299
-
300
-                PAVGB(%%mm3, %%mm0)                           //      112        /4
301
-                PAVGB(%%mm0, %%mm5)                           //    112246        /16
302
-                "movq %%mm5, (%%"REG_a", %1, 4)         \n\t" //        X
303
-                "sub %1, %0                             \n\t"
304
-
305
-                :
306
-                : "r" (src), "r" ((long)stride), "m" (c->pQPb)
307
-                : "%"REG_a, "%"REG_c
308
-        );
183
+    src+= stride*3;
184
+    asm volatile(        //"movv %0 %1 %2\n\t"
185
+        "movq %2, %%mm0                         \n\t"  // QP,..., QP
186
+        "pxor %%mm4, %%mm4                      \n\t"
187
+
188
+        "movq (%0), %%mm6                       \n\t"
189
+        "movq (%0, %1), %%mm5                   \n\t"
190
+        "movq %%mm5, %%mm1                      \n\t"
191
+        "movq %%mm6, %%mm2                      \n\t"
192
+        "psubusb %%mm6, %%mm5                   \n\t"
193
+        "psubusb %%mm1, %%mm2                   \n\t"
194
+        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
195
+        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
196
+        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
197
+
198
+        "pand %%mm2, %%mm6                      \n\t"
199
+        "pandn %%mm1, %%mm2                     \n\t"
200
+        "por %%mm2, %%mm6                       \n\t"// First Line to Filter
201
+
202
+        "movq (%0, %1, 8), %%mm5                \n\t"
203
+        "lea (%0, %1, 4), %%"REG_a"             \n\t"
204
+        "lea (%0, %1, 8), %%"REG_c"             \n\t"
205
+        "sub %1, %%"REG_c"                      \n\t"
206
+        "add %1, %0                             \n\t" // %0 points to line 1 not 0
207
+        "movq (%0, %1, 8), %%mm7                \n\t"
208
+        "movq %%mm5, %%mm1                      \n\t"
209
+        "movq %%mm7, %%mm2                      \n\t"
210
+        "psubusb %%mm7, %%mm5                   \n\t"
211
+        "psubusb %%mm1, %%mm2                   \n\t"
212
+        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
213
+        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
214
+        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
215
+
216
+        "pand %%mm2, %%mm7                      \n\t"
217
+        "pandn %%mm1, %%mm2                     \n\t"
218
+        "por %%mm2, %%mm7                       \n\t" // First Line to Filter
219
+
220
+
221
+        //      1       2       3       4       5       6       7       8
222
+        //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
223
+        // 6 4 2 2 1 1
224
+        // 6 4 4 2
225
+        // 6 8 2
226
+
227
+        "movq (%0, %1), %%mm0                   \n\t" //  1
228
+        "movq %%mm0, %%mm1                      \n\t" //  1
229
+        PAVGB(%%mm6, %%mm0)                           //1 1        /2
230
+        PAVGB(%%mm6, %%mm0)                           //3 1        /4
231
+
232
+        "movq (%0, %1, 4), %%mm2                \n\t" //     1
233
+        "movq %%mm2, %%mm5                      \n\t" //     1
234
+        PAVGB((%%REGa), %%mm2)                        //    11        /2
235
+        PAVGB((%0, %1, 2), %%mm2)                     //   211        /4
236
+        "movq %%mm2, %%mm3                      \n\t" //   211        /4
237
+        "movq (%0), %%mm4                       \n\t" // 1
238
+        PAVGB(%%mm4, %%mm3)                           // 4 211        /8
239
+        PAVGB(%%mm0, %%mm3)                           //642211        /16
240
+        "movq %%mm3, (%0)                       \n\t" // X
241
+        // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
242
+        "movq %%mm1, %%mm0                      \n\t" //  1
243
+        PAVGB(%%mm6, %%mm0)                           //1 1        /2
244
+        "movq %%mm4, %%mm3                      \n\t" // 1
245
+        PAVGB((%0,%1,2), %%mm3)                       // 1 1        /2
246
+        PAVGB((%%REGa,%1,2), %%mm5)                   //     11        /2
247
+        PAVGB((%%REGa), %%mm5)                        //    211 /4
248
+        PAVGB(%%mm5, %%mm3)                           // 2 2211 /8
249
+        PAVGB(%%mm0, %%mm3)                           //4242211 /16
250
+        "movq %%mm3, (%0,%1)                    \n\t" //  X
251
+        // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
252
+        PAVGB(%%mm4, %%mm6)                                   //11        /2
253
+        "movq (%%"REG_c"), %%mm0                \n\t" //       1
254
+        PAVGB((%%REGa, %1, 2), %%mm0)                 //      11/2
255
+        "movq %%mm0, %%mm3                      \n\t" //      11/2
256
+        PAVGB(%%mm1, %%mm0)                           //  2   11/4
257
+        PAVGB(%%mm6, %%mm0)                           //222   11/8
258
+        PAVGB(%%mm2, %%mm0)                           //22242211/16
259
+        "movq (%0, %1, 2), %%mm2                \n\t" //   1
260
+        "movq %%mm0, (%0, %1, 2)                \n\t" //   X
261
+        // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
262
+        "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
263
+        PAVGB((%%REGc), %%mm0)                        //       11        /2
264
+        PAVGB(%%mm0, %%mm6)                           //11     11        /4
265
+        PAVGB(%%mm1, %%mm4)                           // 11                /2
266
+        PAVGB(%%mm2, %%mm1)                           //  11                /2
267
+        PAVGB(%%mm1, %%mm6)                           //1122   11        /8
268
+        PAVGB(%%mm5, %%mm6)                           //112242211        /16
269
+        "movq (%%"REG_a"), %%mm5                \n\t" //    1
270
+        "movq %%mm6, (%%"REG_a")                \n\t" //    X
271
+        // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
272
+        "movq (%%"REG_a", %1, 4), %%mm6         \n\t" //        1
273
+        PAVGB(%%mm7, %%mm6)                           //        11        /2
274
+        PAVGB(%%mm4, %%mm6)                           // 11     11        /4
275
+        PAVGB(%%mm3, %%mm6)                           // 11   2211        /8
276
+        PAVGB(%%mm5, %%mm2)                           //   11                /2
277
+        "movq (%0, %1, 4), %%mm4                \n\t" //     1
278
+        PAVGB(%%mm4, %%mm2)                           //   112                /4
279
+        PAVGB(%%mm2, %%mm6)                           // 112242211        /16
280
+        "movq %%mm6, (%0, %1, 4)                \n\t" //     X
281
+        // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
282
+        PAVGB(%%mm7, %%mm1)                           //  11     2        /4
283
+        PAVGB(%%mm4, %%mm5)                           //    11                /2
284
+        PAVGB(%%mm5, %%mm0)                           //    11 11        /4
285
+        "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //      1
286
+        PAVGB(%%mm6, %%mm1)                           //  11  4  2        /8
287
+        PAVGB(%%mm0, %%mm1)                           //  11224222        /16
288
+        "movq %%mm1, (%%"REG_a", %1, 2)         \n\t" //      X
289
+        // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
290
+        PAVGB((%%REGc), %%mm2)                        //   112 4        /8
291
+        "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
292
+        PAVGB(%%mm0, %%mm6)                           //      1 1        /2
293
+        PAVGB(%%mm7, %%mm6)                           //      1 12        /4
294
+        PAVGB(%%mm2, %%mm6)                           //   1122424        /4
295
+        "movq %%mm6, (%%"REG_c")                \n\t" //       X
296
+        // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
297
+        PAVGB(%%mm7, %%mm5)                           //    11   2        /4
298
+        PAVGB(%%mm7, %%mm5)                           //    11   6        /8
299
+
300
+        PAVGB(%%mm3, %%mm0)                           //      112        /4
301
+        PAVGB(%%mm0, %%mm5)                           //    112246        /16
302
+        "movq %%mm5, (%%"REG_a", %1, 4)         \n\t" //        X
303
+        "sub %1, %0                             \n\t"
304
+
305
+        :
306
+        : "r" (src), "r" ((long)stride), "m" (c->pQPb)
307
+        : "%"REG_a, "%"REG_c
308
+    );
309 309
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
310
-        const int l1= stride;
311
-        const int l2= stride + l1;
312
-        const int l3= stride + l2;
313
-        const int l4= stride + l3;
314
-        const int l5= stride + l4;
315
-        const int l6= stride + l5;
316
-        const int l7= stride + l6;
317
-        const int l8= stride + l7;
318
-        const int l9= stride + l8;
319
-        int x;
320
-        src+= stride*3;
321
-        for(x=0; x<BLOCK_SIZE; x++)
322
-        {
323
-                const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
324
-                const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
325
-
326
-                int sums[10];
327
-                sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
328
-                sums[1] = sums[0] - first  + src[l4];
329
-                sums[2] = sums[1] - first  + src[l5];
330
-                sums[3] = sums[2] - first  + src[l6];
331
-                sums[4] = sums[3] - first  + src[l7];
332
-                sums[5] = sums[4] - src[l1] + src[l8];
333
-                sums[6] = sums[5] - src[l2] + last;
334
-                sums[7] = sums[6] - src[l3] + last;
335
-                sums[8] = sums[7] - src[l4] + last;
336
-                sums[9] = sums[8] - src[l5] + last;
337
-
338
-                src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
339
-                src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
340
-                src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
341
-                src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
342
-                src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
343
-                src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
344
-                src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
345
-                src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
346
-
347
-                src++;
348
-        }
310
+    const int l1= stride;
311
+    const int l2= stride + l1;
312
+    const int l3= stride + l2;
313
+    const int l4= stride + l3;
314
+    const int l5= stride + l4;
315
+    const int l6= stride + l5;
316
+    const int l7= stride + l6;
317
+    const int l8= stride + l7;
318
+    const int l9= stride + l8;
319
+    int x;
320
+    src+= stride*3;
321
+    for(x=0; x<BLOCK_SIZE; x++){
322
+        const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
323
+        const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
324
+
325
+        int sums[10];
326
+        sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
327
+        sums[1] = sums[0] - first  + src[l4];
328
+        sums[2] = sums[1] - first  + src[l5];
329
+        sums[3] = sums[2] - first  + src[l6];
330
+        sums[4] = sums[3] - first  + src[l7];
331
+        sums[5] = sums[4] - src[l1] + src[l8];
332
+        sums[6] = sums[5] - src[l2] + last;
333
+        sums[7] = sums[6] - src[l3] + last;
334
+        sums[8] = sums[7] - src[l4] + last;
335
+        sums[9] = sums[8] - src[l5] + last;
336
+
337
+        src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
338
+        src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
339
+        src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
340
+        src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
341
+        src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
342
+        src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
343
+        src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
344
+        src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
345
+
346
+        src++;
347
+    }
349 348
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
350 349
 }
351 350
 #endif //HAVE_ALTIVEC
... ...
@@ -366,92 +365,89 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
366 366
 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
367 367
 {
368 368
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
369
-        src+= stride*3;
369
+    src+= stride*3;
370 370
 // FIXME rounding
371
-        asm volatile(
372
-                "pxor %%mm7, %%mm7                      \n\t" // 0
373
-                "movq "MANGLE(b80)", %%mm6              \n\t" // MIN_SIGNED_BYTE
374
-                "leal (%0, %1), %%"REG_a"               \n\t"
375
-                "leal (%%"REG_a", %1, 4), %%"REG_c"     \n\t"
371
+    asm volatile(
372
+        "pxor %%mm7, %%mm7                      \n\t" // 0
373
+        "movq "MANGLE(b80)", %%mm6              \n\t" // MIN_SIGNED_BYTE
374
+        "leal (%0, %1), %%"REG_a"               \n\t"
375
+        "leal (%%"REG_a", %1, 4), %%"REG_c"     \n\t"
376 376
 //      0       1       2       3       4       5       6       7       8       9
377 377
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
378
-                "movq "MANGLE(pQPb)", %%mm0             \n\t" // QP,..., QP
379
-                "movq %%mm0, %%mm1                      \n\t" // QP,..., QP
380
-                "paddusb "MANGLE(b02)", %%mm0           \n\t"
381
-                "psrlw $2, %%mm0                        \n\t"
382
-                "pand "MANGLE(b3F)", %%mm0              \n\t" // QP/4,..., QP/4
383
-                "paddusb %%mm1, %%mm0                   \n\t" // QP*1.25 ...
384
-                "movq (%0, %1, 4), %%mm2                \n\t" // line 4
385
-                "movq (%%"REG_c"), %%mm3                \n\t" // line 5
386
-                "movq %%mm2, %%mm4                      \n\t" // line 4
387
-                "pcmpeqb %%mm5, %%mm5                   \n\t" // -1
388
-                "pxor %%mm2, %%mm5                      \n\t" // -line 4 - 1
389
-                PAVGB(%%mm3, %%mm5)
390
-                "paddb %%mm6, %%mm5                     \n\t" // (l5-l4)/2
391
-                "psubusb %%mm3, %%mm4                   \n\t"
392
-                "psubusb %%mm2, %%mm3                   \n\t"
393
-                "por %%mm3, %%mm4                       \n\t" // |l4 - l5|
394
-                "psubusb %%mm0, %%mm4                   \n\t"
395
-                "pcmpeqb %%mm7, %%mm4                   \n\t"
396
-                "pand %%mm4, %%mm5                      \n\t" // d/2
397
-
398
-//                "paddb %%mm6, %%mm2                     \n\t" // line 4 + 0x80
399
-                "paddb %%mm5, %%mm2                     \n\t"
400
-//                "psubb %%mm6, %%mm2                     \n\t"
401
-                "movq %%mm2, (%0,%1, 4)                 \n\t"
402
-
403
-                "movq (%%"REG_c"), %%mm2                \n\t"
404
-//                "paddb %%mm6, %%mm2                     \n\t" // line 5 + 0x80
405
-                "psubb %%mm5, %%mm2                     \n\t"
406
-//                "psubb %%mm6, %%mm2                     \n\t"
407
-                "movq %%mm2, (%%"REG_c")                \n\t"
408
-
409
-                "paddb %%mm6, %%mm5                     \n\t"
410
-                "psrlw $2, %%mm5                        \n\t"
411
-                "pand "MANGLE(b3F)", %%mm5              \n\t"
412
-                "psubb "MANGLE(b20)", %%mm5             \n\t" // (l5-l4)/8
413
-
414
-                "movq (%%"REG_a", %1, 2), %%mm2         \n\t"
415
-                "paddb %%mm6, %%mm2                     \n\t" // line 3 + 0x80
416
-                "paddsb %%mm5, %%mm2                    \n\t"
417
-                "psubb %%mm6, %%mm2                     \n\t"
418
-                "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
419
-
420
-                "movq (%%"REG_c", %1), %%mm2            \n\t"
421
-                "paddb %%mm6, %%mm2                     \n\t" // line 6 + 0x80
422
-                "psubsb %%mm5, %%mm2                    \n\t"
423
-                "psubb %%mm6, %%mm2                     \n\t"
424
-                "movq %%mm2, (%%"REG_c", %1)            \n\t"
425
-
426
-                :
427
-                : "r" (src), "r" ((long)stride)
428
-                : "%"REG_a, "%"REG_c
429
-        );
378
+        "movq "MANGLE(pQPb)", %%mm0             \n\t" // QP,..., QP
379
+        "movq %%mm0, %%mm1                      \n\t" // QP,..., QP
380
+        "paddusb "MANGLE(b02)", %%mm0           \n\t"
381
+        "psrlw $2, %%mm0                        \n\t"
382
+        "pand "MANGLE(b3F)", %%mm0              \n\t" // QP/4,..., QP/4
383
+        "paddusb %%mm1, %%mm0                   \n\t" // QP*1.25 ...
384
+        "movq (%0, %1, 4), %%mm2                \n\t" // line 4
385
+        "movq (%%"REG_c"), %%mm3                \n\t" // line 5
386
+        "movq %%mm2, %%mm4                      \n\t" // line 4
387
+        "pcmpeqb %%mm5, %%mm5                   \n\t" // -1
388
+        "pxor %%mm2, %%mm5                      \n\t" // -line 4 - 1
389
+        PAVGB(%%mm3, %%mm5)
390
+        "paddb %%mm6, %%mm5                     \n\t" // (l5-l4)/2
391
+        "psubusb %%mm3, %%mm4                   \n\t"
392
+        "psubusb %%mm2, %%mm3                   \n\t"
393
+        "por %%mm3, %%mm4                       \n\t" // |l4 - l5|
394
+        "psubusb %%mm0, %%mm4                   \n\t"
395
+        "pcmpeqb %%mm7, %%mm4                   \n\t"
396
+        "pand %%mm4, %%mm5                      \n\t" // d/2
397
+
398
+//        "paddb %%mm6, %%mm2                     \n\t" // line 4 + 0x80
399
+        "paddb %%mm5, %%mm2                     \n\t"
400
+//        "psubb %%mm6, %%mm2                     \n\t"
401
+        "movq %%mm2, (%0,%1, 4)                 \n\t"
402
+
403
+        "movq (%%"REG_c"), %%mm2                \n\t"
404
+//        "paddb %%mm6, %%mm2                     \n\t" // line 5 + 0x80
405
+        "psubb %%mm5, %%mm2                     \n\t"
406
+//        "psubb %%mm6, %%mm2                     \n\t"
407
+        "movq %%mm2, (%%"REG_c")                \n\t"
408
+
409
+        "paddb %%mm6, %%mm5                     \n\t"
410
+        "psrlw $2, %%mm5                        \n\t"
411
+        "pand "MANGLE(b3F)", %%mm5              \n\t"
412
+        "psubb "MANGLE(b20)", %%mm5             \n\t" // (l5-l4)/8
413
+
414
+        "movq (%%"REG_a", %1, 2), %%mm2         \n\t"
415
+        "paddb %%mm6, %%mm2                     \n\t" // line 3 + 0x80
416
+        "paddsb %%mm5, %%mm2                    \n\t"
417
+        "psubb %%mm6, %%mm2                     \n\t"
418
+        "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
419
+
420
+        "movq (%%"REG_c", %1), %%mm2            \n\t"
421
+        "paddb %%mm6, %%mm2                     \n\t" // line 6 + 0x80
422
+        "psubsb %%mm5, %%mm2                    \n\t"
423
+        "psubb %%mm6, %%mm2                     \n\t"
424
+        "movq %%mm2, (%%"REG_c", %1)            \n\t"
425
+
426
+        :
427
+        : "r" (src), "r" ((long)stride)
428
+        : "%"REG_a, "%"REG_c
429
+    );
430 430
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
431
-         const int l1= stride;
432
-        const int l2= stride + l1;
433
-        const int l3= stride + l2;
434
-        const int l4= stride + l3;
435
-        const int l5= stride + l4;
436
-        const int l6= stride + l5;
437
-//        const int l7= stride + l6;
438
-//        const int l8= stride + l7;
439
-//        const int l9= stride + l8;
440
-        int x;
441
-        const int QP15= QP + (QP>>2);
442
-        src+= stride*3;
443
-        for(x=0; x<BLOCK_SIZE; x++)
444
-        {
445
-                const int v = (src[x+l5] - src[x+l4]);
446
-                if(FFABS(v) < QP15)
447
-                {
448
-                        src[x+l3] +=v>>3;
449
-                        src[x+l4] +=v>>1;
450
-                        src[x+l5] -=v>>1;
451
-                        src[x+l6] -=v>>3;
452
-
453
-                }
431
+    const int l1= stride;
432
+    const int l2= stride + l1;
433
+    const int l3= stride + l2;
434
+    const int l4= stride + l3;
435
+    const int l5= stride + l4;
436
+    const int l6= stride + l5;
437
+//    const int l7= stride + l6;
438
+//    const int l8= stride + l7;
439
+//    const int l9= stride + l8;
440
+    int x;
441
+    const int QP15= QP + (QP>>2);
442
+    src+= stride*3;
443
+    for(x=0; x<BLOCK_SIZE; x++){
444
+        const int v = (src[x+l5] - src[x+l4]);
445
+        if(FFABS(v) < QP15){
446
+            src[x+l3] +=v>>3;
447
+            src[x+l4] +=v>>1;
448
+            src[x+l5] -=v>>1;
449
+            src[x+l6] -=v>>3;
454 450
         }
451
+    }
455 452
 
456 453
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
457 454
 }
... ...
@@ -467,128 +463,125 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
467 467
 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
468 468
 {
469 469
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
470
-        src+= stride*3;
470
+    src+= stride*3;
471 471
 
472
-        asm volatile(
473
-                "pxor %%mm7, %%mm7                      \n\t" // 0
474
-                "lea (%0, %1), %%"REG_a"                \n\t"
475
-                "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
472
+    asm volatile(
473
+        "pxor %%mm7, %%mm7                      \n\t" // 0
474
+        "lea (%0, %1), %%"REG_a"                \n\t"
475
+        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
476 476
 //      0       1       2       3       4       5       6       7       8       9
477 477
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
478
-                "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
479
-                "movq (%0, %1, 4), %%mm1                \n\t" // line 4
480
-                "movq %%mm1, %%mm2                      \n\t" // line 4
481
-                "psubusb %%mm0, %%mm1                   \n\t"
482
-                "psubusb %%mm2, %%mm0                   \n\t"
483
-                "por %%mm1, %%mm0                       \n\t" // |l2 - l3|
484
-                "movq (%%"REG_c"), %%mm3                \n\t" // line 5
485
-                "movq (%%"REG_c", %1), %%mm4            \n\t" // line 6
486
-                "movq %%mm3, %%mm5                      \n\t" // line 5
487
-                "psubusb %%mm4, %%mm3                   \n\t"
488
-                "psubusb %%mm5, %%mm4                   \n\t"
489
-                "por %%mm4, %%mm3                       \n\t" // |l5 - l6|
490
-                PAVGB(%%mm3, %%mm0)                           // (|l2 - l3| + |l5 - l6|)/2
491
-                "movq %%mm2, %%mm1                      \n\t" // line 4
492
-                "psubusb %%mm5, %%mm2                   \n\t"
493
-                "movq %%mm2, %%mm4                      \n\t"
494
-                "pcmpeqb %%mm7, %%mm2                   \n\t" // (l4 - l5) <= 0 ? -1 : 0
495
-                "psubusb %%mm1, %%mm5                   \n\t"
496
-                "por %%mm5, %%mm4                       \n\t" // |l4 - l5|
497
-                "psubusb %%mm0, %%mm4                   \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
498
-                "movq %%mm4, %%mm3                      \n\t" // d
499
-                "movq %2, %%mm0                         \n\t"
500
-                "paddusb %%mm0, %%mm0                   \n\t"
501
-                "psubusb %%mm0, %%mm4                   \n\t"
502
-                "pcmpeqb %%mm7, %%mm4                   \n\t" // d <= QP ? -1 : 0
503
-                "psubusb "MANGLE(b01)", %%mm3           \n\t"
504
-                "pand %%mm4, %%mm3                      \n\t" // d <= QP ? d : 0
505
-
506
-                PAVGB(%%mm7, %%mm3)                           // d/2
507
-                "movq %%mm3, %%mm1                      \n\t" // d/2
508
-                PAVGB(%%mm7, %%mm3)                           // d/4
509
-                PAVGB(%%mm1, %%mm3)                           // 3*d/8
510
-
511
-                "movq (%0, %1, 4), %%mm0                \n\t" // line 4
512
-                "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
513
-                "psubusb %%mm3, %%mm0                   \n\t"
514
-                "pxor %%mm2, %%mm0                      \n\t"
515
-                "movq %%mm0, (%0, %1, 4)                \n\t" // line 4
516
-
517
-                "movq (%%"REG_c"), %%mm0                \n\t" // line 5
518
-                "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
519
-                "paddusb %%mm3, %%mm0                   \n\t"
520
-                "pxor %%mm2, %%mm0                      \n\t"
521
-                "movq %%mm0, (%%"REG_c")                \n\t" // line 5
522
-
523
-                PAVGB(%%mm7, %%mm1)                           // d/4
524
-
525
-                "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
526
-                "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
527
-                "psubusb %%mm1, %%mm0                   \n\t"
528
-                "pxor %%mm2, %%mm0                      \n\t"
529
-                "movq %%mm0, (%%"REG_a", %1, 2)         \n\t" // line 3
530
-
531
-                "movq (%%"REG_c", %1), %%mm0            \n\t" // line 6
532
-                "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
533
-                "paddusb %%mm1, %%mm0                   \n\t"
534
-                "pxor %%mm2, %%mm0                      \n\t"
535
-                "movq %%mm0, (%%"REG_c", %1)            \n\t" // line 6
536
-
537
-                PAVGB(%%mm7, %%mm1)                           // d/8
538
-
539
-                "movq (%%"REG_a", %1), %%mm0            \n\t" // line 2
540
-                "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
541
-                "psubusb %%mm1, %%mm0                   \n\t"
542
-                "pxor %%mm2, %%mm0                      \n\t"
543
-                "movq %%mm0, (%%"REG_a", %1)            \n\t" // line 2
544
-
545
-                "movq (%%"REG_c", %1, 2), %%mm0         \n\t" // line 7
546
-                "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
547
-                "paddusb %%mm1, %%mm0                   \n\t"
548
-                "pxor %%mm2, %%mm0                      \n\t"
549
-                "movq %%mm0, (%%"REG_c", %1, 2)         \n\t" // line 7
550
-
551
-                :
552
-                : "r" (src), "r" ((long)stride), "m" (co->pQPb)
553
-                : "%"REG_a, "%"REG_c
554
-        );
478
+        "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
479
+        "movq (%0, %1, 4), %%mm1                \n\t" // line 4
480
+        "movq %%mm1, %%mm2                      \n\t" // line 4
481
+        "psubusb %%mm0, %%mm1                   \n\t"
482
+        "psubusb %%mm2, %%mm0                   \n\t"
483
+        "por %%mm1, %%mm0                       \n\t" // |l2 - l3|
484
+        "movq (%%"REG_c"), %%mm3                \n\t" // line 5
485
+        "movq (%%"REG_c", %1), %%mm4            \n\t" // line 6
486
+        "movq %%mm3, %%mm5                      \n\t" // line 5
487
+        "psubusb %%mm4, %%mm3                   \n\t"
488
+        "psubusb %%mm5, %%mm4                   \n\t"
489
+        "por %%mm4, %%mm3                       \n\t" // |l5 - l6|
490
+        PAVGB(%%mm3, %%mm0)                           // (|l2 - l3| + |l5 - l6|)/2
491
+        "movq %%mm2, %%mm1                      \n\t" // line 4
492
+        "psubusb %%mm5, %%mm2                   \n\t"
493
+        "movq %%mm2, %%mm4                      \n\t"
494
+        "pcmpeqb %%mm7, %%mm2                   \n\t" // (l4 - l5) <= 0 ? -1 : 0
495
+        "psubusb %%mm1, %%mm5                   \n\t"
496
+        "por %%mm5, %%mm4                       \n\t" // |l4 - l5|
497
+        "psubusb %%mm0, %%mm4                   \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
498
+        "movq %%mm4, %%mm3                      \n\t" // d
499
+        "movq %2, %%mm0                         \n\t"
500
+        "paddusb %%mm0, %%mm0                   \n\t"
501
+        "psubusb %%mm0, %%mm4                   \n\t"
502
+        "pcmpeqb %%mm7, %%mm4                   \n\t" // d <= QP ? -1 : 0
503
+        "psubusb "MANGLE(b01)", %%mm3           \n\t"
504
+        "pand %%mm4, %%mm3                      \n\t" // d <= QP ? d : 0
505
+
506
+        PAVGB(%%mm7, %%mm3)                           // d/2
507
+        "movq %%mm3, %%mm1                      \n\t" // d/2
508
+        PAVGB(%%mm7, %%mm3)                           // d/4
509
+        PAVGB(%%mm1, %%mm3)                           // 3*d/8
510
+
511
+        "movq (%0, %1, 4), %%mm0                \n\t" // line 4
512
+        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
513
+        "psubusb %%mm3, %%mm0                   \n\t"
514
+        "pxor %%mm2, %%mm0                      \n\t"
515
+        "movq %%mm0, (%0, %1, 4)                \n\t" // line 4
516
+
517
+        "movq (%%"REG_c"), %%mm0                \n\t" // line 5
518
+        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
519
+        "paddusb %%mm3, %%mm0                   \n\t"
520
+        "pxor %%mm2, %%mm0                      \n\t"
521
+        "movq %%mm0, (%%"REG_c")                \n\t" // line 5
522
+
523
+        PAVGB(%%mm7, %%mm1)                           // d/4
524
+
525
+        "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
526
+        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
527
+        "psubusb %%mm1, %%mm0                   \n\t"
528
+        "pxor %%mm2, %%mm0                      \n\t"
529
+        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t" // line 3
530
+
531
+        "movq (%%"REG_c", %1), %%mm0            \n\t" // line 6
532
+        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
533
+        "paddusb %%mm1, %%mm0                   \n\t"
534
+        "pxor %%mm2, %%mm0                      \n\t"
535
+        "movq %%mm0, (%%"REG_c", %1)            \n\t" // line 6
536
+
537
+        PAVGB(%%mm7, %%mm1)                           // d/8
538
+
539
+        "movq (%%"REG_a", %1), %%mm0            \n\t" // line 2
540
+        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
541
+        "psubusb %%mm1, %%mm0                   \n\t"
542
+        "pxor %%mm2, %%mm0                      \n\t"
543
+        "movq %%mm0, (%%"REG_a", %1)            \n\t" // line 2
544
+
545
+        "movq (%%"REG_c", %1, 2), %%mm0         \n\t" // line 7
546
+        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
547
+        "paddusb %%mm1, %%mm0                   \n\t"
548
+        "pxor %%mm2, %%mm0                      \n\t"
549
+        "movq %%mm0, (%%"REG_c", %1, 2)         \n\t" // line 7
550
+
551
+        :
552
+        : "r" (src), "r" ((long)stride), "m" (co->pQPb)
553
+        : "%"REG_a, "%"REG_c
554
+    );
555 555
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
556 556
 
557
-        const int l1= stride;
558
-        const int l2= stride + l1;
559
-        const int l3= stride + l2;
560
-        const int l4= stride + l3;
561
-        const int l5= stride + l4;
562
-        const int l6= stride + l5;
563
-        const int l7= stride + l6;
564
-//        const int l8= stride + l7;
565
-//        const int l9= stride + l8;
566
-        int x;
567
-
568
-        src+= stride*3;
569
-        for(x=0; x<BLOCK_SIZE; x++)
570
-        {
571
-                int a= src[l3] - src[l4];
572
-                int b= src[l4] - src[l5];
573
-                int c= src[l5] - src[l6];
574
-
575
-                int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
576
-                d= FFMAX(d, 0);
577
-
578
-                if(d < co->QP*2)
579
-                {
580
-                        int v = d * FFSIGN(-b);
581
-
582
-                        src[l2] +=v>>3;
583
-                        src[l3] +=v>>2;
584
-                        src[l4] +=(3*v)>>3;
585
-                        src[l5] -=(3*v)>>3;
586
-                        src[l6] -=v>>2;
587
-                        src[l7] -=v>>3;
588
-
589
-                }
590
-                src++;
557
+    const int l1= stride;
558
+    const int l2= stride + l1;
559
+    const int l3= stride + l2;
560
+    const int l4= stride + l3;
561
+    const int l5= stride + l4;
562
+    const int l6= stride + l5;
563
+    const int l7= stride + l6;
564
+//    const int l8= stride + l7;
565
+//    const int l9= stride + l8;
566
+    int x;
567
+
568
+    src+= stride*3;
569
+    for(x=0; x<BLOCK_SIZE; x++){
570
+        int a= src[l3] - src[l4];
571
+        int b= src[l4] - src[l5];
572
+        int c= src[l5] - src[l6];
573
+
574
+        int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
575
+        d= FFMAX(d, 0);
576
+
577
+        if(d < co->QP*2){
578
+            int v = d * FFSIGN(-b);
579
+
580
+            src[l2] +=v>>3;
581
+            src[l3] +=v>>2;
582
+            src[l4] +=(3*v)>>3;
583
+            src[l5] -=(3*v)>>3;
584
+            src[l6] -=v>>2;
585
+            src[l7] -=v>>3;
591 586
         }
587
+        src++;
588
+    }
592 589
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
593 590
 }
594 591
 
... ...
@@ -597,569 +590,555 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
597 597
 {
598 598
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
599 599
 /*
600
-        uint8_t tmp[16];
601
-        const int l1= stride;
602
-        const int l2= stride + l1;
603
-        const int l3= stride + l2;
604
-        const int l4= (int)tmp - (int)src - stride*3;
605
-        const int l5= (int)tmp - (int)src - stride*3 + 8;
606
-        const int l6= stride*3 + l3;
607
-        const int l7= stride + l6;
608
-        const int l8= stride + l7;
609
-
610
-        memcpy(tmp, src+stride*7, 8);
611
-        memcpy(tmp+8, src+stride*8, 8);
600
+    uint8_t tmp[16];
601
+    const int l1= stride;
602
+    const int l2= stride + l1;
603
+    const int l3= stride + l2;
604
+    const int l4= (int)tmp - (int)src - stride*3;
605
+    const int l5= (int)tmp - (int)src - stride*3 + 8;
606
+    const int l6= stride*3 + l3;
607
+    const int l7= stride + l6;
608
+    const int l8= stride + l7;
609
+
610
+    memcpy(tmp, src+stride*7, 8);
611
+    memcpy(tmp+8, src+stride*8, 8);
612 612
 */
613
-        src+= stride*4;
614
-        asm volatile(
613
+    src+= stride*4;
614
+    asm volatile(
615 615
 
616 616
 #if 0 //slightly more accurate and slightly slower
617
-                "pxor %%mm7, %%mm7                      \n\t" // 0
618
-                "lea (%0, %1), %%"REG_a"                \n\t"
619
-                "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
617
+        "pxor %%mm7, %%mm7                      \n\t" // 0
618
+        "lea (%0, %1), %%"REG_a"                \n\t"
619
+        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
620 620
 //      0       1       2       3       4       5       6       7
621 621
 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
622 622
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
623 623
 
624 624
 
625
-                "movq (%0, %1, 2), %%mm0                \n\t" // l2
626
-                "movq (%0), %%mm1                       \n\t" // l0
627
-                "movq %%mm0, %%mm2                      \n\t" // l2
628
-                PAVGB(%%mm7, %%mm0)                           // ~l2/2
629
-                PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
630
-                PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8
631
-
632
-                "movq (%%"REG_a"), %%mm1                \n\t" // l1
633
-                "movq (%%"REG_a", %1, 2), %%mm3         \n\t" // l3
634
-                "movq %%mm1, %%mm4                      \n\t" // l1
635
-                PAVGB(%%mm7, %%mm1)                           // ~l1/2
636
-                PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
637
-                PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8
638
-
639
-                "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
640
-                "psubusb %%mm1, %%mm0                   \n\t"
641
-                "psubusb %%mm4, %%mm1                   \n\t"
642
-                "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
625
+        "movq (%0, %1, 2), %%mm0                \n\t" // l2
626
+        "movq (%0), %%mm1                       \n\t" // l0
627
+        "movq %%mm0, %%mm2                      \n\t" // l2
628
+        PAVGB(%%mm7, %%mm0)                           // ~l2/2
629
+        PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
630
+        PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8
631
+
632
+        "movq (%%"REG_a"), %%mm1                \n\t" // l1
633
+        "movq (%%"REG_a", %1, 2), %%mm3         \n\t" // l3
634
+        "movq %%mm1, %%mm4                      \n\t" // l1
635
+        PAVGB(%%mm7, %%mm1)                           // ~l1/2
636
+        PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
637
+        PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8
638
+
639
+        "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
640
+        "psubusb %%mm1, %%mm0                   \n\t"
641
+        "psubusb %%mm4, %%mm1                   \n\t"
642
+        "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
643 643
 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
644 644
 
645
-                "movq (%0, %1, 4), %%mm0                \n\t" // l4
646
-                "movq %%mm0, %%mm4                      \n\t" // l4
647
-                PAVGB(%%mm7, %%mm0)                           // ~l4/2
648
-                PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
649
-                PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8
650
-
651
-                "movq (%%"REG_c"), %%mm2                \n\t" // l5
652
-                "movq %%mm3, %%mm5                      \n\t" // l3
653
-                PAVGB(%%mm7, %%mm3)                           // ~l3/2
654
-                PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
655
-                PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8
656
-
657
-                "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
658
-                "psubusb %%mm3, %%mm0                   \n\t"
659
-                "psubusb %%mm6, %%mm3                   \n\t"
660
-                "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
661
-                "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
645
+        "movq (%0, %1, 4), %%mm0                \n\t" // l4
646
+        "movq %%mm0, %%mm4                      \n\t" // l4
647
+        PAVGB(%%mm7, %%mm0)                           // ~l4/2
648
+        PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
649
+        PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8
650
+
651
+        "movq (%%"REG_c"), %%mm2                \n\t" // l5
652
+        "movq %%mm3, %%mm5                      \n\t" // l3
653
+        PAVGB(%%mm7, %%mm3)                           // ~l3/2
654
+        PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
655
+        PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8
656
+
657
+        "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
658
+        "psubusb %%mm3, %%mm0                   \n\t"
659
+        "psubusb %%mm6, %%mm3                   \n\t"
660
+        "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
661
+        "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
662 662
 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
663 663
 
664
-                "movq (%%"REG_c", %1), %%mm6            \n\t" // l6
665
-                "movq %%mm6, %%mm5                      \n\t" // l6
666
-                PAVGB(%%mm7, %%mm6)                           // ~l6/2
667
-                PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
668
-                PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8
669
-
670
-                "movq (%%"REG_c", %1, 2), %%mm5         \n\t" // l7
671
-                "movq %%mm2, %%mm4                      \n\t" // l5
672
-                PAVGB(%%mm7, %%mm2)                           // ~l5/2
673
-                PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
674
-                PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8
675
-
676
-                "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
677
-                "psubusb %%mm2, %%mm6                   \n\t"
678
-                "psubusb %%mm4, %%mm2                   \n\t"
679
-                "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
664
+        "movq (%%"REG_c", %1), %%mm6            \n\t" // l6
665
+        "movq %%mm6, %%mm5                      \n\t" // l6
666
+        PAVGB(%%mm7, %%mm6)                           // ~l6/2
667
+        PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
668
+        PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8
669
+
670
+        "movq (%%"REG_c", %1, 2), %%mm5         \n\t" // l7
671
+        "movq %%mm2, %%mm4                      \n\t" // l5
672
+        PAVGB(%%mm7, %%mm2)                           // ~l5/2
673
+        PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
674
+        PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8
675
+
676
+        "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
677
+        "psubusb %%mm2, %%mm6                   \n\t"
678
+        "psubusb %%mm4, %%mm2                   \n\t"
679
+        "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
680 680
 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
681 681
 
682 682
 
683
-                PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
684
-                "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
685
-                "paddusb "MANGLE(b01)", %%mm4           \n\t"
686
-                "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
687
-                "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
688
-                "pand %%mm4, %%mm3                      \n\t"
689
-
690
-                "movq %%mm3, %%mm1                      \n\t"
691
-//                "psubusb "MANGLE(b01)", %%mm3           \n\t"
692
-                PAVGB(%%mm7, %%mm3)
693
-                PAVGB(%%mm7, %%mm3)
694
-                "paddusb %%mm1, %%mm3                   \n\t"
695
-//                "paddusb "MANGLE(b01)", %%mm3           \n\t"
696
-
697
-                "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //l3
698
-                "movq (%0, %1, 4), %%mm5                \n\t" //l4
699
-                "movq (%0, %1, 4), %%mm4                \n\t" //l4
700
-                "psubusb %%mm6, %%mm5                   \n\t"
701
-                "psubusb %%mm4, %%mm6                   \n\t"
702
-                "por %%mm6, %%mm5                       \n\t" // |l3-l4|
703
-                "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
704
-                "pxor %%mm6, %%mm0                      \n\t"
705
-                "pand %%mm0, %%mm3                      \n\t"
706
-                PMINUB(%%mm5, %%mm3, %%mm0)
707
-
708
-                "psubusb "MANGLE(b01)", %%mm3           \n\t"
709
-                PAVGB(%%mm7, %%mm3)
710
-
711
-                "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
712
-                "movq (%0, %1, 4), %%mm2                \n\t"
713
-                "pxor %%mm6, %%mm0                      \n\t"
714
-                "pxor %%mm6, %%mm2                      \n\t"
715
-                "psubb %%mm3, %%mm0                     \n\t"
716
-                "paddb %%mm3, %%mm2                     \n\t"
717
-                "pxor %%mm6, %%mm0                      \n\t"
718
-                "pxor %%mm6, %%mm2                      \n\t"
719
-                "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
720
-                "movq %%mm2, (%0, %1, 4)                \n\t"
683
+        PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
684
+        "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
685
+        "paddusb "MANGLE(b01)", %%mm4           \n\t"
686
+        "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
687
+        "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
688
+        "pand %%mm4, %%mm3                      \n\t"
689
+
690
+        "movq %%mm3, %%mm1                      \n\t"
691
+//        "psubusb "MANGLE(b01)", %%mm3           \n\t"
692
+        PAVGB(%%mm7, %%mm3)
693
+        PAVGB(%%mm7, %%mm3)
694
+        "paddusb %%mm1, %%mm3                   \n\t"
695
+//        "paddusb "MANGLE(b01)", %%mm3           \n\t"
696
+
697
+        "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //l3
698
+        "movq (%0, %1, 4), %%mm5                \n\t" //l4
699
+        "movq (%0, %1, 4), %%mm4                \n\t" //l4
700
+        "psubusb %%mm6, %%mm5                   \n\t"
701
+        "psubusb %%mm4, %%mm6                   \n\t"
702
+        "por %%mm6, %%mm5                       \n\t" // |l3-l4|
703
+        "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
704
+        "pxor %%mm6, %%mm0                      \n\t"
705
+        "pand %%mm0, %%mm3                      \n\t"
706
+        PMINUB(%%mm5, %%mm3, %%mm0)
707
+
708
+        "psubusb "MANGLE(b01)", %%mm3           \n\t"
709
+        PAVGB(%%mm7, %%mm3)
710
+
711
+        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
712
+        "movq (%0, %1, 4), %%mm2                \n\t"
713
+        "pxor %%mm6, %%mm0                      \n\t"
714
+        "pxor %%mm6, %%mm2                      \n\t"
715
+        "psubb %%mm3, %%mm0                     \n\t"
716
+        "paddb %%mm3, %%mm2                     \n\t"
717
+        "pxor %%mm6, %%mm0                      \n\t"
718
+        "pxor %%mm6, %%mm2                      \n\t"
719
+        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
720
+        "movq %%mm2, (%0, %1, 4)                \n\t"
721 721
 #endif //0
722 722
 
723
-                "lea (%0, %1), %%"REG_a"                \n\t"
724
-                "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
723
+        "lea (%0, %1), %%"REG_a"                \n\t"
724
+        "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
725 725
 //      0       1       2       3       4       5       6       7
726 726
 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
727 727
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
728 728
 
729 729
 
730
-                "movq (%%"REG_a", %1, 2), %%mm1         \n\t" // l3
731
-                "movq (%0, %1, 4), %%mm0                \n\t" // l4
732
-                "pxor %%mm6, %%mm1                      \n\t" // -l3-1
733
-                PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
730
+        "movq (%%"REG_a", %1, 2), %%mm1         \n\t" // l3
731
+        "movq (%0, %1, 4), %%mm0                \n\t" // l4
732
+        "pxor %%mm6, %%mm1                      \n\t" // -l3-1
733
+        PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
734 734
 // mm1=-l3-1, mm0=128-q
735 735
 
736
-                "movq (%%"REG_a", %1, 4), %%mm2         \n\t" // l5
737
-                "movq (%%"REG_a", %1), %%mm3            \n\t" // l2
738
-                "pxor %%mm6, %%mm2                      \n\t" // -l5-1
739
-                "movq %%mm2, %%mm5                      \n\t" // -l5-1
740
-                "movq "MANGLE(b80)", %%mm4              \n\t" // 128
741
-                "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
742
-                PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
743
-                PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
744
-                PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
745
-                PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
736
+        "movq (%%"REG_a", %1, 4), %%mm2         \n\t" // l5
737
+        "movq (%%"REG_a", %1), %%mm3            \n\t" // l2
738
+        "pxor %%mm6, %%mm2                      \n\t" // -l5-1
739
+        "movq %%mm2, %%mm5                      \n\t" // -l5-1
740
+        "movq "MANGLE(b80)", %%mm4              \n\t" // 128
741
+        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
742
+        PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
743
+        PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
744
+        PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
745
+        PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
746 746
 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
747 747
 
748
-                "movq (%%"REG_a"), %%mm2                \n\t" // l1
749
-                "pxor %%mm6, %%mm2                      \n\t" // -l1-1
750
-                PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
751
-                PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
752
-                "movq "MANGLE(b80)", %%mm3              \n\t" // 128
753
-                PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
754
-                PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
755
-                PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
748
+        "movq (%%"REG_a"), %%mm2                \n\t" // l1
749
+        "pxor %%mm6, %%mm2                      \n\t" // -l1-1
750
+        PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
751
+        PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
752
+        "movq "MANGLE(b80)", %%mm3              \n\t" // 128
753
+        PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
754
+        PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
755
+        PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
756 756
 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
757 757
 
758
-                PAVGB((%%REGc, %1), %%mm5)                    // (l6-l5+256)/2
759
-                "movq (%%"REG_c", %1, 2), %%mm1         \n\t" // l7
760
-                "pxor %%mm6, %%mm1                      \n\t" // -l7-1
761
-                PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
762
-                "movq "MANGLE(b80)", %%mm2              \n\t" // 128
763
-                PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
764
-                PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
765
-                PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
758
+        PAVGB((%%REGc, %1), %%mm5)                    // (l6-l5+256)/2
759
+        "movq (%%"REG_c", %1, 2), %%mm1         \n\t" // l7
760
+        "pxor %%mm6, %%mm1                      \n\t" // -l7-1
761
+        PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
762
+        "movq "MANGLE(b80)", %%mm2              \n\t" // 128
763
+        PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
764
+        PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
765
+        PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
766 766
 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
767 767
 
768
-                "movq "MANGLE(b00)", %%mm1              \n\t" // 0
769
-                "movq "MANGLE(b00)", %%mm5              \n\t" // 0
770
-                "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
771
-                "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
772
-                PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
773
-                 PMAXUB(%%mm5, %%mm3)                         // 128 + |lenergy/16|
774
-                PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16
768
+        "movq "MANGLE(b00)", %%mm1              \n\t" // 0
769
+        "movq "MANGLE(b00)", %%mm5              \n\t" // 0
770
+        "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
771
+        "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
772
+        PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
773
+        PMAXUB(%%mm5, %%mm3)                          // 128 + |lenergy/16|
774
+        PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16
775 775
 
776 776
 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
777 777
 
778
-                "movq "MANGLE(b00)", %%mm7              \n\t" // 0
779
-                "movq %2, %%mm2                         \n\t" // QP
780
-                PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
781
-                "psubb %%mm6, %%mm2                     \n\t"
782
-
783
-                "movq %%mm4, %%mm1                      \n\t"
784
-                "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
785
-                "pxor %%mm1, %%mm4                      \n\t"
786
-                "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
787
-                "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
788
-                "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
778
+        "movq "MANGLE(b00)", %%mm7              \n\t" // 0
779
+        "movq %2, %%mm2                         \n\t" // QP
780
+        PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
781
+        "psubb %%mm6, %%mm2                     \n\t"
782
+
783
+        "movq %%mm4, %%mm1                      \n\t"
784
+        "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
785
+        "pxor %%mm1, %%mm4                      \n\t"
786
+        "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
787
+        "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
788
+        "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
789 789
 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
790 790
 
791
-                "movq %%mm4, %%mm3                      \n\t" // d
792
-                "psubusb "MANGLE(b01)", %%mm4           \n\t"
793
-                PAVGB(%%mm7, %%mm4)                           // d/32
794
-                PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
795
-                "paddb %%mm3, %%mm4                     \n\t" // 5d/64
796
-                "pand %%mm2, %%mm4                      \n\t"
797
-
798
-                "movq "MANGLE(b80)", %%mm5              \n\t" // 128
799
-                "psubb %%mm0, %%mm5                     \n\t" // q
800
-                "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
801
-                "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
802
-                "pxor %%mm7, %%mm5                      \n\t"
803
-
804
-                PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
805
-                "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)
806
-
807
-                "pand %%mm7, %%mm4                      \n\t"
808
-                "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
809
-                "movq (%0, %1, 4), %%mm2                \n\t"
810
-                "pxor %%mm1, %%mm0                      \n\t"
811
-                "pxor %%mm1, %%mm2                      \n\t"
812
-                "paddb %%mm4, %%mm0                     \n\t"
813
-                "psubb %%mm4, %%mm2                     \n\t"
814
-                "pxor %%mm1, %%mm0                      \n\t"
815
-                "pxor %%mm1, %%mm2                      \n\t"
816
-                "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
817
-                "movq %%mm2, (%0, %1, 4)                \n\t"
818
-
819
-                :
820
-                : "r" (src), "r" ((long)stride), "m" (c->pQPb)
821
-                : "%"REG_a, "%"REG_c
822
-        );
791
+        "movq %%mm4, %%mm3                      \n\t" // d
792
+        "psubusb "MANGLE(b01)", %%mm4           \n\t"
793
+        PAVGB(%%mm7, %%mm4)                           // d/32
794
+        PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
795
+        "paddb %%mm3, %%mm4                     \n\t" // 5d/64
796
+        "pand %%mm2, %%mm4                      \n\t"
797
+
798
+        "movq "MANGLE(b80)", %%mm5              \n\t" // 128
799
+        "psubb %%mm0, %%mm5                     \n\t" // q
800
+        "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
801
+        "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
802
+        "pxor %%mm7, %%mm5                      \n\t"
803
+
804
+        PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
805
+        "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)
806
+
807
+        "pand %%mm7, %%mm4                      \n\t"
808
+        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
809
+        "movq (%0, %1, 4), %%mm2                \n\t"
810
+        "pxor %%mm1, %%mm0                      \n\t"
811
+        "pxor %%mm1, %%mm2                      \n\t"
812
+        "paddb %%mm4, %%mm0                     \n\t"
813
+        "psubb %%mm4, %%mm2                     \n\t"
814
+        "pxor %%mm1, %%mm0                      \n\t"
815
+        "pxor %%mm1, %%mm2                      \n\t"
816
+        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
817
+        "movq %%mm2, (%0, %1, 4)                \n\t"
818
+
819
+        :
820
+        : "r" (src), "r" ((long)stride), "m" (c->pQPb)
821
+        : "%"REG_a, "%"REG_c
822
+    );
823 823
 
824 824
 /*
825
-        {
826
-        int x;
827
-        src-= stride;
828
-        for(x=0; x<BLOCK_SIZE; x++)
829
-        {
830
-                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
831
-                if(FFABS(middleEnergy)< 8*QP)
832
-                {
833
-                        const int q=(src[l4] - src[l5])/2;
834
-                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
835
-                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
836
-
837
-                        int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
838
-                        d= FFMAX(d, 0);
839
-
840
-                        d= (5*d + 32) >> 6;
841
-                        d*= FFSIGN(-middleEnergy);
842
-
843
-                        if(q>0)
844
-                        {
845
-                                d= d<0 ? 0 : d;
846
-                                d= d>q ? q : d;
847
-                        }
848
-                        else
849
-                        {
850
-                                d= d>0 ? 0 : d;
851
-                                d= d<q ? q : d;
852
-                        }
853
-
854
-                        src[l4]-= d;
855
-                        src[l5]+= d;
856
-                }
857
-                src++;
825
+    {
826
+    int x;
827
+    src-= stride;
828
+    for(x=0; x<BLOCK_SIZE; x++){
829
+        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
830
+        if(FFABS(middleEnergy)< 8*QP){
831
+            const int q=(src[l4] - src[l5])/2;
832
+            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
833
+            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
834
+
835
+            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
836
+            d= FFMAX(d, 0);
837
+
838
+            d= (5*d + 32) >> 6;
839
+            d*= FFSIGN(-middleEnergy);
840
+
841
+            if(q>0){
842
+                d= d<0 ? 0 : d;
843
+                d= d>q ? q : d;
844
+            }else{
845
+                d= d>0 ? 0 : d;
846
+                d= d<q ? q : d;
847
+            }
848
+
849
+            src[l4]-= d;
850
+            src[l5]+= d;
858 851
         }
859
-src-=8;
860
-        for(x=0; x<8; x++)
861
-        {
862
-                int y;
863
-                for(y=4; y<6; y++)
864
-                {
865
-                        int d= src[x+y*stride] - tmp[x+(y-4)*8];
866
-                        int ad= FFABS(d);
867
-                        static int max=0;
868
-                        static int sum=0;
869
-                        static int num=0;
870
-                        static int bias=0;
871
-
872
-                        if(max<ad) max=ad;
873
-                        sum+= ad>3 ? 1 : 0;
874
-                        if(ad>3)
875
-                        {
876
-                                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
877
-                        }
878
-                        if(y==4) bias+=d;
879
-                        num++;
880
-                        if(num%1000000 == 0)
881
-                        {
882
-                                av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
883
-                        }
884
-                }
852
+        src++;
853
+    }
854
+    src-=8;
855
+    for(x=0; x<8; x++){
856
+        int y;
857
+        for(y=4; y<6; y++){
858
+            int d= src[x+y*stride] - tmp[x+(y-4)*8];
859
+            int ad= FFABS(d);
860
+            static int max=0;
861
+            static int sum=0;
862
+            static int num=0;
863
+            static int bias=0;
864
+
865
+            if(max<ad) max=ad;
866
+            sum+= ad>3 ? 1 : 0;
867
+            if(ad>3){
868
+                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
869
+            }
870
+            if(y==4) bias+=d;
871
+            num++;
872
+            if(num%1000000 == 0){
873
+                av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
874
+            }
885 875
         }
876
+    }
886 877
 }
887 878
 */
888 879
 #elif defined (HAVE_MMX)
889
-        src+= stride*4;
890
-        asm volatile(
891
-                "pxor %%mm7, %%mm7                      \n\t"
892
-                "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
893
-                "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
880
+    src+= stride*4;
881
+    asm volatile(
882
+        "pxor %%mm7, %%mm7                      \n\t"
883
+        "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
884
+        "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
894 885
 //      0       1       2       3       4       5       6       7
895 886
 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 edx+%1  edx+2%1
896 887
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1
897 888
 
898
-                "movq (%0), %%mm0                       \n\t"
899
-                "movq %%mm0, %%mm1                      \n\t"
900
-                "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
901
-                "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
902
-
903
-                "movq (%0, %1), %%mm2                   \n\t"
904
-                "lea (%0, %1, 2), %%"REG_a"             \n\t"
905
-                "movq %%mm2, %%mm3                      \n\t"
906
-                "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
907
-                "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
908
-
909
-                "movq (%%"REG_a"), %%mm4                \n\t"
910
-                "movq %%mm4, %%mm5                      \n\t"
911
-                "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
912
-                "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
913
-
914
-                "paddw %%mm0, %%mm0                     \n\t" // 2L0
915
-                "paddw %%mm1, %%mm1                     \n\t" // 2H0
916
-                "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
917
-                "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
918
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
919
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
920
-
921
-                "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
922
-                "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
923
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
924
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
925
-
926
-                "movq (%%"REG_a", %1), %%mm2            \n\t"
927
-                "movq %%mm2, %%mm3                      \n\t"
928
-                "punpcklbw %%mm7, %%mm2                 \n\t" // L3
929
-                "punpckhbw %%mm7, %%mm3                 \n\t" // H3
930
-
931
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
932
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
933
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
934
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
935
-                "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
936
-                "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
937
-
938
-                "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
939
-                "movq %%mm0, %%mm1                      \n\t"
940
-                "punpcklbw %%mm7, %%mm0                 \n\t" // L4
941
-                "punpckhbw %%mm7, %%mm1                 \n\t" // H4
942
-
943
-                "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
944
-                "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
945
-                "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
946
-                "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
947
-                "paddw %%mm4, %%mm4                     \n\t" // 2L2
948
-                "paddw %%mm5, %%mm5                     \n\t" // 2H2
949
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
950
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
951
-
952
-                "lea (%%"REG_a", %1), %0                \n\t"
953
-                "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
954
-                "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
955
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
956
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
889
+        "movq (%0), %%mm0                       \n\t"
890
+        "movq %%mm0, %%mm1                      \n\t"
891
+        "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
892
+        "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
893
+
894
+        "movq (%0, %1), %%mm2                   \n\t"
895
+        "lea (%0, %1, 2), %%"REG_a"             \n\t"
896
+        "movq %%mm2, %%mm3                      \n\t"
897
+        "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
898
+        "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
899
+
900
+        "movq (%%"REG_a"), %%mm4                \n\t"
901
+        "movq %%mm4, %%mm5                      \n\t"
902
+        "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
903
+        "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
904
+
905
+        "paddw %%mm0, %%mm0                     \n\t" // 2L0
906
+        "paddw %%mm1, %%mm1                     \n\t" // 2H0
907
+        "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
908
+        "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
909
+        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
910
+        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
911
+
912
+        "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
913
+        "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
914
+        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
915
+        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
916
+
917
+        "movq (%%"REG_a", %1), %%mm2            \n\t"
918
+        "movq %%mm2, %%mm3                      \n\t"
919
+        "punpcklbw %%mm7, %%mm2                 \n\t" // L3
920
+        "punpckhbw %%mm7, %%mm3                 \n\t" // H3
921
+
922
+        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
923
+        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
924
+        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
925
+        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
926
+        "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
927
+        "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
928
+
929
+        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
930
+        "movq %%mm0, %%mm1                      \n\t"
931
+        "punpcklbw %%mm7, %%mm0                 \n\t" // L4
932
+        "punpckhbw %%mm7, %%mm1                 \n\t" // H4
933
+
934
+        "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
935
+        "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
936
+        "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
937
+        "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
938
+        "paddw %%mm4, %%mm4                     \n\t" // 2L2
939
+        "paddw %%mm5, %%mm5                     \n\t" // 2H2
940
+        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
941
+        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
942
+
943
+        "lea (%%"REG_a", %1), %0                \n\t"
944
+        "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
945
+        "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
946
+        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
947
+        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
957 948
 //50 opcodes so far
958
-                "movq (%0, %1, 2), %%mm2                \n\t"
959
-                "movq %%mm2, %%mm3                      \n\t"
960
-                "punpcklbw %%mm7, %%mm2                 \n\t" // L5
961
-                "punpckhbw %%mm7, %%mm3                 \n\t" // H5
962
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
963
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
964
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
965
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
966
-
967
-                "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
968
-                "punpcklbw %%mm7, %%mm6                 \n\t" // L6
969
-                "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
970
-                "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
971
-                "punpckhbw %%mm7, %%mm6                 \n\t" // H6
972
-                "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
973
-
974
-                "paddw %%mm0, %%mm0                     \n\t" // 2L4
975
-                "paddw %%mm1, %%mm1                     \n\t" // 2H4
976
-                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
977
-                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
978
-
979
-                "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
980
-                "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
981
-                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
982
-                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
983
-
984
-                "movq (%0, %1, 4), %%mm2                \n\t"
985
-                "movq %%mm2, %%mm3                      \n\t"
986
-                "punpcklbw %%mm7, %%mm2                 \n\t" // L7
987
-                "punpckhbw %%mm7, %%mm3                 \n\t" // H7
988
-
989
-                "paddw %%mm2, %%mm2                     \n\t" // 2L7
990
-                "paddw %%mm3, %%mm3                     \n\t" // 2H7
991
-                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
992
-                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
993
-
994
-                "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
995
-                "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
949
+        "movq (%0, %1, 2), %%mm2                \n\t"
950
+        "movq %%mm2, %%mm3                      \n\t"
951
+        "punpcklbw %%mm7, %%mm2                 \n\t" // L5
952
+        "punpckhbw %%mm7, %%mm3                 \n\t" // H5
953
+        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
954
+        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
955
+        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
956
+        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
957
+
958
+        "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
959
+        "punpcklbw %%mm7, %%mm6                 \n\t" // L6
960
+        "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
961
+        "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
962
+        "punpckhbw %%mm7, %%mm6                 \n\t" // H6
963
+        "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
964
+
965
+        "paddw %%mm0, %%mm0                     \n\t" // 2L4
966
+        "paddw %%mm1, %%mm1                     \n\t" // 2H4
967
+        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
968
+        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
969
+
970
+        "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
971
+        "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
972
+        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
973
+        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
974
+
975
+        "movq (%0, %1, 4), %%mm2                \n\t"
976
+        "movq %%mm2, %%mm3                      \n\t"
977
+        "punpcklbw %%mm7, %%mm2                 \n\t" // L7
978
+        "punpckhbw %%mm7, %%mm3                 \n\t" // H7
979
+
980
+        "paddw %%mm2, %%mm2                     \n\t" // 2L7
981
+        "paddw %%mm3, %%mm3                     \n\t" // 2H7
982
+        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
983
+        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
984
+
985
+        "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
986
+        "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
996 987
 
997 988
 #ifdef HAVE_MMX2
998
-                "movq %%mm7, %%mm6                      \n\t" // 0
999
-                "psubw %%mm0, %%mm6                     \n\t"
1000
-                "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1001
-                "movq %%mm7, %%mm6                      \n\t" // 0
1002
-                "psubw %%mm1, %%mm6                     \n\t"
1003
-                "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1004
-                "movq %%mm7, %%mm6                      \n\t" // 0
1005
-                "psubw %%mm2, %%mm6                     \n\t"
1006
-                "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1007
-                "movq %%mm7, %%mm6                      \n\t" // 0
1008
-                "psubw %%mm3, %%mm6                     \n\t"
1009
-                "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
989
+        "movq %%mm7, %%mm6                      \n\t" // 0
990
+        "psubw %%mm0, %%mm6                     \n\t"
991
+        "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
992
+        "movq %%mm7, %%mm6                      \n\t" // 0
993
+        "psubw %%mm1, %%mm6                     \n\t"
994
+        "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
995
+        "movq %%mm7, %%mm6                      \n\t" // 0
996
+        "psubw %%mm2, %%mm6                     \n\t"
997
+        "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
998
+        "movq %%mm7, %%mm6                      \n\t" // 0
999
+        "psubw %%mm3, %%mm6                     \n\t"
1000
+        "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1010 1001
 #else
1011
-                "movq %%mm7, %%mm6                      \n\t" // 0
1012
-                "pcmpgtw %%mm0, %%mm6                   \n\t"
1013
-                "pxor %%mm6, %%mm0                      \n\t"
1014
-                "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1015
-                "movq %%mm7, %%mm6                      \n\t" // 0
1016
-                "pcmpgtw %%mm1, %%mm6                   \n\t"
1017
-                "pxor %%mm6, %%mm1                      \n\t"
1018
-                "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1019
-                "movq %%mm7, %%mm6                      \n\t" // 0
1020
-                "pcmpgtw %%mm2, %%mm6                   \n\t"
1021
-                "pxor %%mm6, %%mm2                      \n\t"
1022
-                "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1023
-                "movq %%mm7, %%mm6                      \n\t" // 0
1024
-                "pcmpgtw %%mm3, %%mm6                   \n\t"
1025
-                "pxor %%mm6, %%mm3                      \n\t"
1026
-                "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1002
+        "movq %%mm7, %%mm6                      \n\t" // 0
1003
+        "pcmpgtw %%mm0, %%mm6                   \n\t"
1004
+        "pxor %%mm6, %%mm0                      \n\t"
1005
+        "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1006
+        "movq %%mm7, %%mm6                      \n\t" // 0
1007
+        "pcmpgtw %%mm1, %%mm6                   \n\t"
1008
+        "pxor %%mm6, %%mm1                      \n\t"
1009
+        "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1010
+        "movq %%mm7, %%mm6                      \n\t" // 0
1011
+        "pcmpgtw %%mm2, %%mm6                   \n\t"
1012
+        "pxor %%mm6, %%mm2                      \n\t"
1013
+        "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1014
+        "movq %%mm7, %%mm6                      \n\t" // 0
1015
+        "pcmpgtw %%mm3, %%mm6                   \n\t"
1016
+        "pxor %%mm6, %%mm3                      \n\t"
1017
+        "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1027 1018
 #endif
1028 1019
 
1029 1020
 #ifdef HAVE_MMX2
1030
-                "pminsw %%mm2, %%mm0                    \n\t"
1031
-                "pminsw %%mm3, %%mm1                    \n\t"
1021
+        "pminsw %%mm2, %%mm0                    \n\t"
1022
+        "pminsw %%mm3, %%mm1                    \n\t"
1032 1023
 #else
1033
-                "movq %%mm0, %%mm6                      \n\t"
1034
-                "psubusw %%mm2, %%mm6                   \n\t"
1035
-                "psubw %%mm6, %%mm0                     \n\t"
1036
-                "movq %%mm1, %%mm6                      \n\t"
1037
-                "psubusw %%mm3, %%mm6                   \n\t"
1038
-                "psubw %%mm6, %%mm1                     \n\t"
1024
+        "movq %%mm0, %%mm6                      \n\t"
1025
+        "psubusw %%mm2, %%mm6                   \n\t"
1026
+        "psubw %%mm6, %%mm0                     \n\t"
1027
+        "movq %%mm1, %%mm6                      \n\t"
1028
+        "psubusw %%mm3, %%mm6                   \n\t"
1029
+        "psubw %%mm6, %%mm1                     \n\t"
1039 1030
 #endif
1040 1031
 
1041
-                "movd %2, %%mm2                         \n\t" // QP
1042
-                "punpcklbw %%mm7, %%mm2                 \n\t"
1032
+        "movd %2, %%mm2                         \n\t" // QP
1033
+        "punpcklbw %%mm7, %%mm2                 \n\t"
1043 1034
 
1044
-                "movq %%mm7, %%mm6                      \n\t" // 0
1045
-                "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1046
-                "pxor %%mm6, %%mm4                      \n\t"
1047
-                "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1048
-                "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1049
-                "pxor %%mm7, %%mm5                      \n\t"
1050
-                "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1035
+        "movq %%mm7, %%mm6                      \n\t" // 0
1036
+        "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1037
+        "pxor %%mm6, %%mm4                      \n\t"
1038
+        "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1039
+        "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1040
+        "pxor %%mm7, %%mm5                      \n\t"
1041
+        "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1051 1042
 // 100 opcodes
1052
-                "psllw $3, %%mm2                        \n\t" // 8QP
1053
-                "movq %%mm2, %%mm3                      \n\t" // 8QP
1054
-                "pcmpgtw %%mm4, %%mm2                   \n\t"
1055
-                "pcmpgtw %%mm5, %%mm3                   \n\t"
1056
-                "pand %%mm2, %%mm4                      \n\t"
1057
-                "pand %%mm3, %%mm5                      \n\t"
1058
-
1059
-
1060
-                "psubusw %%mm0, %%mm4                   \n\t" // hd
1061
-                "psubusw %%mm1, %%mm5                   \n\t" // ld
1062
-
1063
-
1064
-                "movq "MANGLE(w05)", %%mm2              \n\t" // 5
1065
-                "pmullw %%mm2, %%mm4                    \n\t"
1066
-                "pmullw %%mm2, %%mm5                    \n\t"
1067
-                "movq "MANGLE(w20)", %%mm2              \n\t" // 32
1068
-                "paddw %%mm2, %%mm4                     \n\t"
1069
-                "paddw %%mm2, %%mm5                     \n\t"
1070
-                "psrlw $6, %%mm4                        \n\t"
1071
-                "psrlw $6, %%mm5                        \n\t"
1072
-
1073
-                "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
1074
-                "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
1075
-
1076
-                "pxor %%mm2, %%mm2                      \n\t"
1077
-                "pxor %%mm3, %%mm3                      \n\t"
1078
-
1079
-                "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
1080
-                "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
1081
-                "pxor %%mm2, %%mm0                      \n\t"
1082
-                "pxor %%mm3, %%mm1                      \n\t"
1083
-                "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
1084
-                "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
1085
-                "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
1086
-                "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
1087
-
1088
-                "pxor %%mm6, %%mm2                      \n\t"
1089
-                "pxor %%mm7, %%mm3                      \n\t"
1090
-                "pand %%mm2, %%mm4                      \n\t"
1091
-                "pand %%mm3, %%mm5                      \n\t"
1043
+        "psllw $3, %%mm2                        \n\t" // 8QP
1044
+        "movq %%mm2, %%mm3                      \n\t" // 8QP
1045
+        "pcmpgtw %%mm4, %%mm2                   \n\t"
1046
+        "pcmpgtw %%mm5, %%mm3                   \n\t"
1047
+        "pand %%mm2, %%mm4                      \n\t"
1048
+        "pand %%mm3, %%mm5                      \n\t"
1049
+
1050
+
1051
+        "psubusw %%mm0, %%mm4                   \n\t" // hd
1052
+        "psubusw %%mm1, %%mm5                   \n\t" // ld
1053
+
1054
+
1055
+        "movq "MANGLE(w05)", %%mm2              \n\t" // 5
1056
+        "pmullw %%mm2, %%mm4                    \n\t"
1057
+        "pmullw %%mm2, %%mm5                    \n\t"
1058
+        "movq "MANGLE(w20)", %%mm2              \n\t" // 32
1059
+        "paddw %%mm2, %%mm4                     \n\t"
1060
+        "paddw %%mm2, %%mm5                     \n\t"
1061
+        "psrlw $6, %%mm4                        \n\t"
1062
+        "psrlw $6, %%mm5                        \n\t"
1063
+
1064
+        "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
1065
+        "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
1066
+
1067
+        "pxor %%mm2, %%mm2                      \n\t"
1068
+        "pxor %%mm3, %%mm3                      \n\t"
1069
+
1070
+        "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
1071
+        "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
1072
+        "pxor %%mm2, %%mm0                      \n\t"
1073
+        "pxor %%mm3, %%mm1                      \n\t"
1074
+        "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
1075
+        "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
1076
+        "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
1077
+        "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
1078
+
1079
+        "pxor %%mm6, %%mm2                      \n\t"
1080
+        "pxor %%mm7, %%mm3                      \n\t"
1081
+        "pand %%mm2, %%mm4                      \n\t"
1082
+        "pand %%mm3, %%mm5                      \n\t"
1092 1083
 
1093 1084
 #ifdef HAVE_MMX2
1094
-                "pminsw %%mm0, %%mm4                    \n\t"
1095
-                "pminsw %%mm1, %%mm5                    \n\t"
1085
+        "pminsw %%mm0, %%mm4                    \n\t"
1086
+        "pminsw %%mm1, %%mm5                    \n\t"
1096 1087
 #else
1097
-                "movq %%mm4, %%mm2                      \n\t"
1098
-                "psubusw %%mm0, %%mm2                   \n\t"
1099
-                "psubw %%mm2, %%mm4                     \n\t"
1100
-                "movq %%mm5, %%mm2                      \n\t"
1101
-                "psubusw %%mm1, %%mm2                   \n\t"
1102
-                "psubw %%mm2, %%mm5                     \n\t"
1088
+        "movq %%mm4, %%mm2                      \n\t"
1089
+        "psubusw %%mm0, %%mm2                   \n\t"
1090
+        "psubw %%mm2, %%mm4                     \n\t"
1091
+        "movq %%mm5, %%mm2                      \n\t"
1092
+        "psubusw %%mm1, %%mm2                   \n\t"
1093
+        "psubw %%mm2, %%mm5                     \n\t"
1103 1094
 #endif
1104
-                "pxor %%mm6, %%mm4                      \n\t"
1105
-                "pxor %%mm7, %%mm5                      \n\t"
1106
-                "psubw %%mm6, %%mm4                     \n\t"
1107
-                "psubw %%mm7, %%mm5                     \n\t"
1108
-                "packsswb %%mm5, %%mm4                  \n\t"
1109
-                "movq (%0), %%mm0                       \n\t"
1110
-                "paddb   %%mm4, %%mm0                   \n\t"
1111
-                "movq %%mm0, (%0)                       \n\t"
1112
-                "movq (%0, %1), %%mm0                   \n\t"
1113
-                "psubb %%mm4, %%mm0                     \n\t"
1114
-                "movq %%mm0, (%0, %1)                   \n\t"
1115
-
1116
-                : "+r" (src)
1117
-                : "r" ((long)stride), "m" (c->pQPb)
1118
-                : "%"REG_a, "%"REG_c
1119
-        );
1095
+        "pxor %%mm6, %%mm4                      \n\t"
1096
+        "pxor %%mm7, %%mm5                      \n\t"
1097
+        "psubw %%mm6, %%mm4                     \n\t"
1098
+        "psubw %%mm7, %%mm5                     \n\t"
1099
+        "packsswb %%mm5, %%mm4                  \n\t"
1100
+        "movq (%0), %%mm0                       \n\t"
1101
+        "paddb   %%mm4, %%mm0                   \n\t"
1102
+        "movq %%mm0, (%0)                       \n\t"
1103
+        "movq (%0, %1), %%mm0                   \n\t"
1104
+        "psubb %%mm4, %%mm0                     \n\t"
1105
+        "movq %%mm0, (%0, %1)                   \n\t"
1106
+
1107
+        : "+r" (src)
1108
+        : "r" ((long)stride), "m" (c->pQPb)
1109
+        : "%"REG_a, "%"REG_c
1110
+    );
1120 1111
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1121
-        const int l1= stride;
1122
-        const int l2= stride + l1;
1123
-        const int l3= stride + l2;
1124
-        const int l4= stride + l3;
1125
-        const int l5= stride + l4;
1126
-        const int l6= stride + l5;
1127
-        const int l7= stride + l6;
1128
-        const int l8= stride + l7;
1129
-//        const int l9= stride + l8;
1130
-        int x;
1131
-        src+= stride*3;
1132
-        for(x=0; x<BLOCK_SIZE; x++)
1133
-        {
1134
-                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1135
-                if(FFABS(middleEnergy) < 8*c->QP)
1136
-                {
1137
-                        const int q=(src[l4] - src[l5])/2;
1138
-                        const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1139
-                        const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1140
-
1141
-                        int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1142
-                        d= FFMAX(d, 0);
1143
-
1144
-                        d= (5*d + 32) >> 6;
1145
-                        d*= FFSIGN(-middleEnergy);
1146
-
1147
-                        if(q>0)
1148
-                        {
1149
-                                d= d<0 ? 0 : d;
1150
-                                d= d>q ? q : d;
1151
-                        }
1152
-                        else
1153
-                        {
1154
-                                d= d>0 ? 0 : d;
1155
-                                d= d<q ? q : d;
1156
-                        }
1157
-
1158
-                        src[l4]-= d;
1159
-                        src[l5]+= d;
1160
-                }
1161
-                src++;
1112
+    const int l1= stride;
1113
+    const int l2= stride + l1;
1114
+    const int l3= stride + l2;
1115
+    const int l4= stride + l3;
1116
+    const int l5= stride + l4;
1117
+    const int l6= stride + l5;
1118
+    const int l7= stride + l6;
1119
+    const int l8= stride + l7;
1120
+//    const int l9= stride + l8;
1121
+    int x;
1122
+    src+= stride*3;
1123
+    for(x=0; x<BLOCK_SIZE; x++){
1124
+        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1125
+        if(FFABS(middleEnergy) < 8*c->QP){
1126
+            const int q=(src[l4] - src[l5])/2;
1127
+            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1128
+            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1129
+
1130
+            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1131
+            d= FFMAX(d, 0);
1132
+
1133
+            d= (5*d + 32) >> 6;
1134
+            d*= FFSIGN(-middleEnergy);
1135
+
1136
+            if(q>0){
1137
+                d= d<0 ? 0 : d;
1138
+                d= d>q ? q : d;
1139
+            }else{
1140
+                d= d>0 ? 0 : d;
1141
+                d= d<q ? q : d;
1142
+            }
1143
+
1144
+            src[l4]-= d;
1145
+            src[l5]+= d;
1162 1146
         }
1147
+        src++;
1148
+    }
1163 1149
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1164 1150
 }
1165 1151
 #endif //HAVE_ALTIVEC
... ...
@@ -1168,18 +1147,18 @@ src-=8;
1168 1168
 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1169 1169
 {
1170 1170
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1171
-        asm volatile(
1172
-                "pxor %%mm6, %%mm6                      \n\t"
1173
-                "pcmpeqb %%mm7, %%mm7                   \n\t"
1174
-                "movq %2, %%mm0                         \n\t"
1175
-                "punpcklbw %%mm6, %%mm0                 \n\t"
1176
-                "psrlw $1, %%mm0                        \n\t"
1177
-                "psubw %%mm7, %%mm0                     \n\t"
1178
-                "packuswb %%mm0, %%mm0                  \n\t"
1179
-                "movq %%mm0, %3                         \n\t"
1180
-
1181
-                "lea (%0, %1), %%"REG_a"                \n\t"
1182
-                "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1171
+    asm volatile(
1172
+        "pxor %%mm6, %%mm6                      \n\t"
1173
+        "pcmpeqb %%mm7, %%mm7                   \n\t"
1174
+        "movq %2, %%mm0                         \n\t"
1175
+        "punpcklbw %%mm6, %%mm0                 \n\t"
1176
+        "psrlw $1, %%mm0                        \n\t"
1177
+        "psubw %%mm7, %%mm0                     \n\t"
1178
+        "packuswb %%mm0, %%mm0                  \n\t"
1179
+        "movq %%mm0, %3                         \n\t"
1180
+
1181
+        "lea (%0, %1), %%"REG_a"                \n\t"
1182
+        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1183 1183
 
1184 1184
 //        0        1        2        3        4        5        6        7        8        9
1185 1185
 //        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
... ...
@@ -1187,17 +1166,17 @@ static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1187 1187
 #undef FIND_MIN_MAX
1188 1188
 #ifdef HAVE_MMX2
1189 1189
 #define REAL_FIND_MIN_MAX(addr)\
1190
-                "movq " #addr ", %%mm0                  \n\t"\
1191
-                "pminub %%mm0, %%mm7                    \n\t"\
1192
-                "pmaxub %%mm0, %%mm6                    \n\t"
1190
+        "movq " #addr ", %%mm0                  \n\t"\
1191
+        "pminub %%mm0, %%mm7                    \n\t"\
1192
+        "pmaxub %%mm0, %%mm6                    \n\t"
1193 1193
 #else
1194 1194
 #define REAL_FIND_MIN_MAX(addr)\
1195
-                "movq " #addr ", %%mm0                  \n\t"\
1196
-                "movq %%mm7, %%mm1                      \n\t"\
1197
-                "psubusb %%mm0, %%mm6                   \n\t"\
1198
-                "paddb %%mm0, %%mm6                     \n\t"\
1199
-                "psubusb %%mm0, %%mm1                   \n\t"\
1200
-                "psubb %%mm1, %%mm7                     \n\t"
1195
+        "movq " #addr ", %%mm0                  \n\t"\
1196
+        "movq %%mm7, %%mm1                      \n\t"\
1197
+        "psubusb %%mm0, %%mm6                   \n\t"\
1198
+        "paddb %%mm0, %%mm6                     \n\t"\
1199
+        "psubusb %%mm0, %%mm1                   \n\t"\
1200
+        "psubb %%mm1, %%mm7                     \n\t"
1201 1201
 #endif
1202 1202
 #define FIND_MIN_MAX(addr)  REAL_FIND_MIN_MAX(addr)
1203 1203
 
... ...
@@ -1210,155 +1189,155 @@ FIND_MIN_MAX((%%REGd, %1))
1210 1210
 FIND_MIN_MAX((%%REGd, %1, 2))
1211 1211
 FIND_MIN_MAX((%0, %1, 8))
1212 1212
 
1213
-                "movq %%mm7, %%mm4                      \n\t"
1214
-                "psrlq $8, %%mm7                        \n\t"
1213
+        "movq %%mm7, %%mm4                      \n\t"
1214
+        "psrlq $8, %%mm7                        \n\t"
1215 1215
 #ifdef HAVE_MMX2
1216
-                "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1217
-                "pshufw $0xF9, %%mm7, %%mm4             \n\t"
1218
-                "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1219
-                "pshufw $0xFE, %%mm7, %%mm4             \n\t"
1220
-                "pminub %%mm4, %%mm7                    \n\t"
1216
+        "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1217
+        "pshufw $0xF9, %%mm7, %%mm4             \n\t"
1218
+        "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1219
+        "pshufw $0xFE, %%mm7, %%mm4             \n\t"
1220
+        "pminub %%mm4, %%mm7                    \n\t"
1221 1221
 #else
1222
-                "movq %%mm7, %%mm1                      \n\t"
1223
-                "psubusb %%mm4, %%mm1                   \n\t"
1224
-                "psubb %%mm1, %%mm7                     \n\t"
1225
-                "movq %%mm7, %%mm4                      \n\t"
1226
-                "psrlq $16, %%mm7                       \n\t"
1227
-                "movq %%mm7, %%mm1                      \n\t"
1228
-                "psubusb %%mm4, %%mm1                   \n\t"
1229
-                "psubb %%mm1, %%mm7                     \n\t"
1230
-                "movq %%mm7, %%mm4                      \n\t"
1231
-                "psrlq $32, %%mm7                       \n\t"
1232
-                "movq %%mm7, %%mm1                      \n\t"
1233
-                "psubusb %%mm4, %%mm1                   \n\t"
1234
-                "psubb %%mm1, %%mm7                     \n\t"
1222
+        "movq %%mm7, %%mm1                      \n\t"
1223
+        "psubusb %%mm4, %%mm1                   \n\t"
1224
+        "psubb %%mm1, %%mm7                     \n\t"
1225
+        "movq %%mm7, %%mm4                      \n\t"
1226
+        "psrlq $16, %%mm7                       \n\t"
1227
+        "movq %%mm7, %%mm1                      \n\t"
1228
+        "psubusb %%mm4, %%mm1                   \n\t"
1229
+        "psubb %%mm1, %%mm7                     \n\t"
1230
+        "movq %%mm7, %%mm4                      \n\t"
1231
+        "psrlq $32, %%mm7                       \n\t"
1232
+        "movq %%mm7, %%mm1                      \n\t"
1233
+        "psubusb %%mm4, %%mm1                   \n\t"
1234
+        "psubb %%mm1, %%mm7                     \n\t"
1235 1235
 #endif
1236 1236
 
1237 1237
 
1238
-                "movq %%mm6, %%mm4                      \n\t"
1239
-                "psrlq $8, %%mm6                        \n\t"
1238
+        "movq %%mm6, %%mm4                      \n\t"
1239
+        "psrlq $8, %%mm6                        \n\t"
1240 1240
 #ifdef HAVE_MMX2
1241
-                "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
1242
-                "pshufw $0xF9, %%mm6, %%mm4             \n\t"
1243
-                "pmaxub %%mm4, %%mm6                    \n\t"
1244
-                "pshufw $0xFE, %%mm6, %%mm4             \n\t"
1245
-                "pmaxub %%mm4, %%mm6                    \n\t"
1241
+        "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
1242
+        "pshufw $0xF9, %%mm6, %%mm4             \n\t"
1243
+        "pmaxub %%mm4, %%mm6                    \n\t"
1244
+        "pshufw $0xFE, %%mm6, %%mm4             \n\t"
1245
+        "pmaxub %%mm4, %%mm6                    \n\t"
1246 1246
 #else
1247
-                "psubusb %%mm4, %%mm6                   \n\t"
1248
-                "paddb %%mm4, %%mm6                     \n\t"
1249
-                "movq %%mm6, %%mm4                      \n\t"
1250
-                "psrlq $16, %%mm6                       \n\t"
1251
-                "psubusb %%mm4, %%mm6                   \n\t"
1252
-                "paddb %%mm4, %%mm6                     \n\t"
1253
-                "movq %%mm6, %%mm4                      \n\t"
1254
-                "psrlq $32, %%mm6                       \n\t"
1255
-                "psubusb %%mm4, %%mm6                   \n\t"
1256
-                "paddb %%mm4, %%mm6                     \n\t"
1247
+        "psubusb %%mm4, %%mm6                   \n\t"
1248
+        "paddb %%mm4, %%mm6                     \n\t"
1249
+        "movq %%mm6, %%mm4                      \n\t"
1250
+        "psrlq $16, %%mm6                       \n\t"
1251
+        "psubusb %%mm4, %%mm6                   \n\t"
1252
+        "paddb %%mm4, %%mm6                     \n\t"
1253
+        "movq %%mm6, %%mm4                      \n\t"
1254
+        "psrlq $32, %%mm6                       \n\t"
1255
+        "psubusb %%mm4, %%mm6                   \n\t"
1256
+        "paddb %%mm4, %%mm6                     \n\t"
1257 1257
 #endif
1258
-                "movq %%mm6, %%mm0                      \n\t" // max
1259
-                "psubb %%mm7, %%mm6                     \n\t" // max - min
1260
-                "movd %%mm6, %%ecx                      \n\t"
1261
-                "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
1262
-                " jb 1f                                 \n\t"
1263
-                "lea -24(%%"REG_SP"), %%"REG_c"         \n\t"
1264
-                "and "ALIGN_MASK", %%"REG_c"            \n\t"
1265
-                PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
1266
-                "punpcklbw %%mm7, %%mm7                 \n\t"
1267
-                "punpcklbw %%mm7, %%mm7                 \n\t"
1268
-                "punpcklbw %%mm7, %%mm7                 \n\t"
1269
-                "movq %%mm7, (%%"REG_c")                \n\t"
1270
-
1271
-                "movq (%0), %%mm0                       \n\t" // L10
1272
-                "movq %%mm0, %%mm1                      \n\t" // L10
1273
-                "movq %%mm0, %%mm2                      \n\t" // L10
1274
-                "psllq $8, %%mm1                        \n\t"
1275
-                "psrlq $8, %%mm2                        \n\t"
1276
-                "movd -4(%0), %%mm3                     \n\t"
1277
-                "movd 8(%0), %%mm4                      \n\t"
1278
-                "psrlq $24, %%mm3                       \n\t"
1279
-                "psllq $56, %%mm4                       \n\t"
1280
-                "por %%mm3, %%mm1                       \n\t" // L00
1281
-                "por %%mm4, %%mm2                       \n\t" // L20
1282
-                "movq %%mm1, %%mm3                      \n\t" // L00
1283
-                PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
1284
-                PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
1285
-                "psubusb %%mm7, %%mm0                   \n\t"
1286
-                "psubusb %%mm7, %%mm2                   \n\t"
1287
-                "psubusb %%mm7, %%mm3                   \n\t"
1288
-                "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
1289
-                "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
1290
-                "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
1291
-                "paddb %%mm2, %%mm0                     \n\t"
1292
-                "paddb %%mm3, %%mm0                     \n\t"
1293
-
1294
-                "movq (%%"REG_a"), %%mm2                \n\t" // L11
1295
-                "movq %%mm2, %%mm3                      \n\t" // L11
1296
-                "movq %%mm2, %%mm4                      \n\t" // L11
1297
-                "psllq $8, %%mm3                        \n\t"
1298
-                "psrlq $8, %%mm4                        \n\t"
1299
-                "movd -4(%%"REG_a"), %%mm5              \n\t"
1300
-                "movd 8(%%"REG_a"), %%mm6               \n\t"
1301
-                "psrlq $24, %%mm5                       \n\t"
1302
-                "psllq $56, %%mm6                       \n\t"
1303
-                "por %%mm5, %%mm3                       \n\t" // L01
1304
-                "por %%mm6, %%mm4                       \n\t" // L21
1305
-                "movq %%mm3, %%mm5                      \n\t" // L01
1306
-                PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
1307
-                PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
1308
-                "psubusb %%mm7, %%mm2                   \n\t"
1309
-                "psubusb %%mm7, %%mm4                   \n\t"
1310
-                "psubusb %%mm7, %%mm5                   \n\t"
1311
-                "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
1312
-                "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
1313
-                "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
1314
-                "paddb %%mm4, %%mm2                     \n\t"
1315
-                "paddb %%mm5, %%mm2                     \n\t"
1258
+        "movq %%mm6, %%mm0                      \n\t" // max
1259
+        "psubb %%mm7, %%mm6                     \n\t" // max - min
1260
+        "movd %%mm6, %%ecx                      \n\t"
1261
+        "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
1262
+        " jb 1f                                 \n\t"
1263
+        "lea -24(%%"REG_SP"), %%"REG_c"         \n\t"
1264
+        "and "ALIGN_MASK", %%"REG_c"            \n\t"
1265
+        PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
1266
+        "punpcklbw %%mm7, %%mm7                 \n\t"
1267
+        "punpcklbw %%mm7, %%mm7                 \n\t"
1268
+        "punpcklbw %%mm7, %%mm7                 \n\t"
1269
+        "movq %%mm7, (%%"REG_c")                \n\t"
1270
+
1271
+        "movq (%0), %%mm0                       \n\t" // L10
1272
+        "movq %%mm0, %%mm1                      \n\t" // L10
1273
+        "movq %%mm0, %%mm2                      \n\t" // L10
1274
+        "psllq $8, %%mm1                        \n\t"
1275
+        "psrlq $8, %%mm2                        \n\t"
1276
+        "movd -4(%0), %%mm3                     \n\t"
1277
+        "movd 8(%0), %%mm4                      \n\t"
1278
+        "psrlq $24, %%mm3                       \n\t"
1279
+        "psllq $56, %%mm4                       \n\t"
1280
+        "por %%mm3, %%mm1                       \n\t" // L00
1281
+        "por %%mm4, %%mm2                       \n\t" // L20
1282
+        "movq %%mm1, %%mm3                      \n\t" // L00
1283
+        PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
1284
+        PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
1285
+        "psubusb %%mm7, %%mm0                   \n\t"
1286
+        "psubusb %%mm7, %%mm2                   \n\t"
1287
+        "psubusb %%mm7, %%mm3                   \n\t"
1288
+        "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
1289
+        "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
1290
+        "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
1291
+        "paddb %%mm2, %%mm0                     \n\t"
1292
+        "paddb %%mm3, %%mm0                     \n\t"
1293
+
1294
+        "movq (%%"REG_a"), %%mm2                \n\t" // L11
1295
+        "movq %%mm2, %%mm3                      \n\t" // L11
1296
+        "movq %%mm2, %%mm4                      \n\t" // L11
1297
+        "psllq $8, %%mm3                        \n\t"
1298
+        "psrlq $8, %%mm4                        \n\t"
1299
+        "movd -4(%%"REG_a"), %%mm5              \n\t"
1300
+        "movd 8(%%"REG_a"), %%mm6               \n\t"
1301
+        "psrlq $24, %%mm5                       \n\t"
1302
+        "psllq $56, %%mm6                       \n\t"
1303
+        "por %%mm5, %%mm3                       \n\t" // L01
1304
+        "por %%mm6, %%mm4                       \n\t" // L21
1305
+        "movq %%mm3, %%mm5                      \n\t" // L01
1306
+        PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
1307
+        PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
1308
+        "psubusb %%mm7, %%mm2                   \n\t"
1309
+        "psubusb %%mm7, %%mm4                   \n\t"
1310
+        "psubusb %%mm7, %%mm5                   \n\t"
1311
+        "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
1312
+        "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
1313
+        "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
1314
+        "paddb %%mm4, %%mm2                     \n\t"
1315
+        "paddb %%mm5, %%mm2                     \n\t"
1316 1316
 // 0, 2, 3, 1
1317 1317
 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1318
-                "movq " #src ", " #sx "                 \n\t" /* src[0] */\
1319
-                "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
1320
-                "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
1321
-                "psllq $8, " #lx "                      \n\t"\
1322
-                "psrlq $8, " #t0 "                      \n\t"\
1323
-                "movd -4" #src ", " #t1 "               \n\t"\
1324
-                "psrlq $24, " #t1 "                     \n\t"\
1325
-                "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
1326
-                "movd 8" #src ", " #t1 "                \n\t"\
1327
-                "psllq $56, " #t1 "                     \n\t"\
1328
-                "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
1329
-                "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
1330
-                PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
1331
-                PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
1332
-                PAVGB(lx, pplx)                                     \
1333
-                "movq " #lx ", 8(%%"REG_c")             \n\t"\
1334
-                "movq (%%"REG_c"), " #lx "              \n\t"\
1335
-                "psubusb " #lx ", " #t1 "               \n\t"\
1336
-                "psubusb " #lx ", " #t0 "               \n\t"\
1337
-                "psubusb " #lx ", " #sx "               \n\t"\
1338
-                "movq "MANGLE(b00)", " #lx "            \n\t"\
1339
-                "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
1340
-                "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
1341
-                "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
1342
-                "paddb " #t1 ", " #t0 "                 \n\t"\
1343
-                "paddb " #t0 ", " #sx "                 \n\t"\
1318
+        "movq " #src ", " #sx "                 \n\t" /* src[0] */\
1319
+        "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
1320
+        "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
1321
+        "psllq $8, " #lx "                      \n\t"\
1322
+        "psrlq $8, " #t0 "                      \n\t"\
1323
+        "movd -4" #src ", " #t1 "               \n\t"\
1324
+        "psrlq $24, " #t1 "                     \n\t"\
1325
+        "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
1326
+        "movd 8" #src ", " #t1 "                \n\t"\
1327
+        "psllq $56, " #t1 "                     \n\t"\
1328
+        "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
1329
+        "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
1330
+        PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
1331
+        PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
1332
+        PAVGB(lx, pplx)                                     \
1333
+        "movq " #lx ", 8(%%"REG_c")             \n\t"\
1334
+        "movq (%%"REG_c"), " #lx "              \n\t"\
1335
+        "psubusb " #lx ", " #t1 "               \n\t"\
1336
+        "psubusb " #lx ", " #t0 "               \n\t"\
1337
+        "psubusb " #lx ", " #sx "               \n\t"\
1338
+        "movq "MANGLE(b00)", " #lx "            \n\t"\
1339
+        "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
1340
+        "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
1341
+        "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
1342
+        "paddb " #t1 ", " #t0 "                 \n\t"\
1343
+        "paddb " #t0 ", " #sx "                 \n\t"\
1344 1344
 \
1345
-                PAVGB(plx, pplx)                              /* filtered */\
1346
-                "movq " #dst ", " #t0 "                 \n\t" /* dst */\
1347
-                "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
1348
-                "psubusb %3, " #t0 "                    \n\t"\
1349
-                "paddusb %3, " #t1 "                    \n\t"\
1350
-                PMAXUB(t0, pplx)\
1351
-                PMINUB(t1, pplx, t0)\
1352
-                "paddb " #sx ", " #ppsx "               \n\t"\
1353
-                "paddb " #psx ", " #ppsx "              \n\t"\
1354
-                "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
1355
-                "pand "MANGLE(b08)", " #ppsx "          \n\t"\
1356
-                "pcmpeqb " #lx ", " #ppsx "             \n\t"\
1357
-                "pand " #ppsx ", " #pplx "              \n\t"\
1358
-                "pandn " #dst ", " #ppsx "              \n\t"\
1359
-                "por " #pplx ", " #ppsx "               \n\t"\
1360
-                "movq " #ppsx ", " #dst "               \n\t"\
1361
-                "movq 8(%%"REG_c"), " #lx "             \n\t"
1345
+        PAVGB(plx, pplx)                              /* filtered */\
1346
+        "movq " #dst ", " #t0 "                 \n\t" /* dst */\
1347
+        "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
1348
+        "psubusb %3, " #t0 "                    \n\t"\
1349
+        "paddusb %3, " #t1 "                    \n\t"\
1350
+        PMAXUB(t0, pplx)\
1351
+        PMINUB(t1, pplx, t0)\
1352
+        "paddb " #sx ", " #ppsx "               \n\t"\
1353
+        "paddb " #psx ", " #ppsx "              \n\t"\
1354
+        "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
1355
+        "pand "MANGLE(b08)", " #ppsx "          \n\t"\
1356
+        "pcmpeqb " #lx ", " #ppsx "             \n\t"\
1357
+        "pand " #ppsx ", " #pplx "              \n\t"\
1358
+        "pandn " #dst ", " #ppsx "              \n\t"\
1359
+        "por " #pplx ", " #ppsx "               \n\t"\
1360
+        "movq " #ppsx ", " #dst "               \n\t"\
1361
+        "movq 8(%%"REG_c"), " #lx "             \n\t"
1362 1362
 
1363 1363
 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1364 1364
    REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
... ...
@@ -1387,139 +1366,126 @@ DERING_CORE((%%REGd, %1)   ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,
1387 1387
 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8)    ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1388 1388
 DERING_CORE((%0, %1, 8)    ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1389 1389
 
1390
-                "1:                        \n\t"
1391
-                : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
1392
-                : "%"REG_a, "%"REG_d, "%"REG_c
1393
-        );
1390
+        "1:                        \n\t"
1391
+        : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
1392
+        : "%"REG_a, "%"REG_d, "%"REG_c
1393
+    );
1394 1394
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1395
-        int y;
1396
-        int min=255;
1397
-        int max=0;
1398
-        int avg;
1399
-        uint8_t *p;
1400
-        int s[10];
1401
-        const int QP2= c->QP/2 + 1;
1402
-
1403
-        for(y=1; y<9; y++)
1404
-        {
1405
-                int x;
1406
-                p= src + stride*y;
1407
-                for(x=1; x<9; x++)
1408
-                {
1409
-                        p++;
1410
-                        if(*p > max) max= *p;
1411
-                        if(*p < min) min= *p;
1412
-                }
1413
-        }
1414
-        avg= (min + max + 1)>>1;
1415
-
1416
-        if(max - min <deringThreshold) return;
1417
-
1418
-        for(y=0; y<10; y++)
1419
-        {
1420
-                int t = 0;
1421
-
1422
-                if(src[stride*y + 0] > avg) t+= 1;
1423
-                if(src[stride*y + 1] > avg) t+= 2;
1424
-                if(src[stride*y + 2] > avg) t+= 4;
1425
-                if(src[stride*y + 3] > avg) t+= 8;
1426
-                if(src[stride*y + 4] > avg) t+= 16;
1427
-                if(src[stride*y + 5] > avg) t+= 32;
1428
-                if(src[stride*y + 6] > avg) t+= 64;
1429
-                if(src[stride*y + 7] > avg) t+= 128;
1430
-                if(src[stride*y + 8] > avg) t+= 256;
1431
-                if(src[stride*y + 9] > avg) t+= 512;
1432
-
1433
-                t |= (~t)<<16;
1434
-                t &= (t<<1) & (t>>1);
1435
-                s[y] = t;
1436
-        }
1437
-
1438
-        for(y=1; y<9; y++)
1439
-        {
1440
-                int t = s[y-1] & s[y] & s[y+1];
1441
-                t|= t>>16;
1442
-                s[y-1]= t;
1395
+    int y;
1396
+    int min=255;
1397
+    int max=0;
1398
+    int avg;
1399
+    uint8_t *p;
1400
+    int s[10];
1401
+    const int QP2= c->QP/2 + 1;
1402
+
1403
+    for(y=1; y<9; y++){
1404
+        int x;
1405
+        p= src + stride*y;
1406
+        for(x=1; x<9; x++){
1407
+            p++;
1408
+            if(*p > max) max= *p;
1409
+            if(*p < min) min= *p;
1443 1410
         }
1411
+    }
1412
+    avg= (min + max + 1)>>1;
1413
+
1414
+    if(max - min <deringThreshold) return;
1415
+
1416
+    for(y=0; y<10; y++){
1417
+        int t = 0;
1418
+
1419
+        if(src[stride*y + 0] > avg) t+= 1;
1420
+        if(src[stride*y + 1] > avg) t+= 2;
1421
+        if(src[stride*y + 2] > avg) t+= 4;
1422
+        if(src[stride*y + 3] > avg) t+= 8;
1423
+        if(src[stride*y + 4] > avg) t+= 16;
1424
+        if(src[stride*y + 5] > avg) t+= 32;
1425
+        if(src[stride*y + 6] > avg) t+= 64;
1426
+        if(src[stride*y + 7] > avg) t+= 128;
1427
+        if(src[stride*y + 8] > avg) t+= 256;
1428
+        if(src[stride*y + 9] > avg) t+= 512;
1429
+
1430
+        t |= (~t)<<16;
1431
+        t &= (t<<1) & (t>>1);
1432
+        s[y] = t;
1433
+    }
1434
+
1435
+    for(y=1; y<9; y++){
1436
+        int t = s[y-1] & s[y] & s[y+1];
1437
+        t|= t>>16;
1438
+        s[y-1]= t;
1439
+    }
1440
+
1441
+    for(y=1; y<9; y++){
1442
+        int x;
1443
+        int t = s[y-1];
1444 1444
 
1445
-        for(y=1; y<9; y++)
1446
-        {
1447
-                int x;
1448
-                int t = s[y-1];
1449
-
1450
-                p= src + stride*y;
1451
-                for(x=1; x<9; x++)
1452
-                {
1453
-                        p++;
1454
-                        if(t & (1<<x))
1455
-                        {
1456
-                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1457
-                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1458
-                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1459
-                                f= (f + 8)>>4;
1445
+        p= src + stride*y;
1446
+        for(x=1; x<9; x++){
1447
+            p++;
1448
+            if(t & (1<<x)){
1449
+                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1450
+                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1451
+                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1452
+                f= (f + 8)>>4;
1460 1453
 
1461 1454
 #ifdef DEBUG_DERING_THRESHOLD
1462
-                                asm volatile("emms\n\t":);
1463
-                                {
1464
-                                static long long numPixels=0;
1465
-                                if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1466
-//                                if((max-min)<20 || (max-min)*QP<200)
1467
-//                                if((max-min)*QP < 500)
1468
-//                                if(max-min<QP/2)
1469
-                                if(max-min < 20)
1470
-                                {
1471
-                                        static int numSkiped=0;
1472
-                                        static int errorSum=0;
1473
-                                        static int worstQP=0;
1474
-                                        static int worstRange=0;
1475
-                                        static int worstDiff=0;
1476
-                                        int diff= (f - *p);
1477
-                                        int absDiff= FFABS(diff);
1478
-                                        int error= diff*diff;
1479
-
1480
-                                        if(x==1 || x==8 || y==1 || y==8) continue;
1481
-
1482
-                                        numSkiped++;
1483
-                                        if(absDiff > worstDiff)
1484
-                                        {
1485
-                                                worstDiff= absDiff;
1486
-                                                worstQP= QP;
1487
-                                                worstRange= max-min;
1488
-                                        }
1489
-                                        errorSum+= error;
1490
-
1491
-                                        if(1024LL*1024LL*1024LL % numSkiped == 0)
1492
-                                        {
1493
-                                                av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1494
-                                                        "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1495
-                                                        (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1496
-                                                        worstDiff, (float)numSkiped/numPixels);
1497
-                                        }
1498
-                                }
1499
-                                }
1500
-#endif
1501
-                                if     (*p + QP2 < f) *p= *p + QP2;
1502
-                                else if(*p - QP2 > f) *p= *p - QP2;
1503
-                                else *p=f;
1455
+                    asm volatile("emms\n\t":);
1456
+                    {
1457
+                    static long long numPixels=0;
1458
+                    if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1459
+//                    if((max-min)<20 || (max-min)*QP<200)
1460
+//                    if((max-min)*QP < 500)
1461
+//                    if(max-min<QP/2)
1462
+                    if(max-min < 20){
1463
+                        static int numSkiped=0;
1464
+                        static int errorSum=0;
1465
+                        static int worstQP=0;
1466
+                        static int worstRange=0;
1467
+                        static int worstDiff=0;
1468
+                        int diff= (f - *p);
1469
+                        int absDiff= FFABS(diff);
1470
+                        int error= diff*diff;
1471
+
1472
+                        if(x==1 || x==8 || y==1 || y==8) continue;
1473
+
1474
+                        numSkiped++;
1475
+                        if(absDiff > worstDiff){
1476
+                            worstDiff= absDiff;
1477
+                            worstQP= QP;
1478
+                            worstRange= max-min;
1504 1479
                         }
1505
-                }
1480
+                        errorSum+= error;
1481
+
1482
+                        if(1024LL*1024LL*1024LL % numSkiped == 0){
1483
+                            av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1484
+                                   "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1485
+                                   (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1486
+                                   worstDiff, (float)numSkiped/numPixels);
1487
+                        }
1488
+                    }
1489
+                    }
1490
+#endif
1491
+                    if     (*p + QP2 < f) *p= *p + QP2;
1492
+                    else if(*p - QP2 > f) *p= *p - QP2;
1493
+                    else *p=f;
1494
+            }
1506 1495
         }
1496
+    }
1507 1497
 #ifdef DEBUG_DERING_THRESHOLD
1508
-        if(max-min < 20)
1509
-        {
1510
-                for(y=1; y<9; y++)
1511
-                {
1512
-                        int x;
1513
-                        int t = 0;
1514
-                        p= src + stride*y;
1515
-                        for(x=1; x<9; x++)
1516
-                        {
1517
-                                p++;
1518
-                                *p = FFMIN(*p + 20, 255);
1519
-                        }
1520
-                }
1521
-//                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1498
+    if(max-min < 20){
1499
+        for(y=1; y<9; y++){
1500
+            int x;
1501
+            int t = 0;
1502
+            p= src + stride*y;
1503
+            for(x=1; x<9; x++){
1504
+                p++;
1505
+                *p = FFMIN(*p + 20, 255);
1506
+            }
1522 1507
         }
1508
+//        src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1509
+    }
1523 1510
 #endif
1524 1511
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1525 1512
 }
... ...
@@ -1534,46 +1500,46 @@ DERING_CORE((%0, %1, 8)    ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,
1534 1534
 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1535 1535
 {
1536 1536
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1537
-        src+= 4*stride;
1538
-        asm volatile(
1539
-                "lea (%0, %1), %%"REG_a"                \n\t"
1540
-                "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
1537
+    src+= 4*stride;
1538
+    asm volatile(
1539
+        "lea (%0, %1), %%"REG_a"                \n\t"
1540
+        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
1541 1541
 //      0       1       2       3       4       5       6       7       8       9
1542 1542
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
1543 1543
 
1544
-                "movq (%0), %%mm0                       \n\t"
1545
-                "movq (%%"REG_a", %1), %%mm1            \n\t"
1546
-                PAVGB(%%mm1, %%mm0)
1547
-                "movq %%mm0, (%%"REG_a")                \n\t"
1548
-                "movq (%0, %1, 4), %%mm0                \n\t"
1549
-                PAVGB(%%mm0, %%mm1)
1550
-                "movq %%mm1, (%%"REG_a", %1, 2)         \n\t"
1551
-                "movq (%%"REG_c", %1), %%mm1            \n\t"
1552
-                PAVGB(%%mm1, %%mm0)
1553
-                "movq %%mm0, (%%"REG_c")                \n\t"
1554
-                "movq (%0, %1, 8), %%mm0                \n\t"
1555
-                PAVGB(%%mm0, %%mm1)
1556
-                "movq %%mm1, (%%"REG_c", %1, 2)         \n\t"
1557
-
1558
-                : : "r" (src), "r" ((long)stride)
1559
-                : "%"REG_a, "%"REG_c
1560
-        );
1544
+        "movq (%0), %%mm0                       \n\t"
1545
+        "movq (%%"REG_a", %1), %%mm1            \n\t"
1546
+        PAVGB(%%mm1, %%mm0)
1547
+        "movq %%mm0, (%%"REG_a")                \n\t"
1548
+        "movq (%0, %1, 4), %%mm0                \n\t"
1549
+        PAVGB(%%mm0, %%mm1)
1550
+        "movq %%mm1, (%%"REG_a", %1, 2)         \n\t"
1551
+        "movq (%%"REG_c", %1), %%mm1            \n\t"
1552
+        PAVGB(%%mm1, %%mm0)
1553
+        "movq %%mm0, (%%"REG_c")                \n\t"
1554
+        "movq (%0, %1, 8), %%mm0                \n\t"
1555
+        PAVGB(%%mm0, %%mm1)
1556
+        "movq %%mm1, (%%"REG_c", %1, 2)         \n\t"
1557
+
1558
+        : : "r" (src), "r" ((long)stride)
1559
+        : "%"REG_a, "%"REG_c
1560
+    );
1561 1561
 #else
1562
-        int a, b, x;
1563
-        src+= 4*stride;
1564
-
1565
-        for(x=0; x<2; x++){
1566
-                a= *(uint32_t*)&src[stride*0];
1567
-                b= *(uint32_t*)&src[stride*2];
1568
-                *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1569
-                a= *(uint32_t*)&src[stride*4];
1570
-                *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1571
-                b= *(uint32_t*)&src[stride*6];
1572
-                *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1573
-                a= *(uint32_t*)&src[stride*8];
1574
-                *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1575
-                src += 4;
1576
-        }
1562
+    int a, b, x;
1563
+    src+= 4*stride;
1564
+
1565
+    for(x=0; x<2; x++){
1566
+        a= *(uint32_t*)&src[stride*0];
1567
+        b= *(uint32_t*)&src[stride*2];
1568
+        *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1569
+        a= *(uint32_t*)&src[stride*4];
1570
+        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1571
+        b= *(uint32_t*)&src[stride*6];
1572
+        *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1573
+        a= *(uint32_t*)&src[stride*8];
1574
+        *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1575
+        src += 4;
1576
+    }
1577 1577
 #endif
1578 1578
 }
1579 1579
 
... ...
@@ -1587,37 +1553,37 @@ static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int strid
1587 1587
 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1588 1588
 {
1589 1589
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1590
-        src+= stride*3;
1591
-        asm volatile(
1592
-                "lea (%0, %1), %%"REG_a"                \n\t"
1593
-                "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1594
-                "lea (%%"REG_d", %1, 4), %%"REG_c"      \n\t"
1595
-                "add %1, %%"REG_c"                      \n\t"
1596
-                "pxor %%mm7, %%mm7                      \n\t"
1590
+    src+= stride*3;
1591
+    asm volatile(
1592
+        "lea (%0, %1), %%"REG_a"                \n\t"
1593
+        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1594
+        "lea (%%"REG_d", %1, 4), %%"REG_c"      \n\t"
1595
+        "add %1, %%"REG_c"                      \n\t"
1596
+        "pxor %%mm7, %%mm7                      \n\t"
1597 1597
 //      0       1       2       3       4       5       6       7       8       9       10
1598 1598
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1599 1599
 
1600 1600
 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1601
-                "movq " #a ", %%mm0                     \n\t"\
1602
-                "movq " #b ", %%mm1                     \n\t"\
1603
-                "movq " #d ", %%mm2                     \n\t"\
1604
-                "movq " #e ", %%mm3                     \n\t"\
1605
-                PAVGB(%%mm2, %%mm1)                             /* (b+d) /2 */\
1606
-                PAVGB(%%mm3, %%mm0)                             /* a(a+e) /2 */\
1607
-                "movq %%mm0, %%mm2                      \n\t"\
1608
-                "punpcklbw %%mm7, %%mm0                 \n\t"\
1609
-                "punpckhbw %%mm7, %%mm2                 \n\t"\
1610
-                "movq %%mm1, %%mm3                      \n\t"\
1611
-                "punpcklbw %%mm7, %%mm1                 \n\t"\
1612
-                "punpckhbw %%mm7, %%mm3                 \n\t"\
1613
-                "psubw %%mm1, %%mm0                     \n\t"   /* L(a+e - (b+d))/2 */\
1614
-                "psubw %%mm3, %%mm2                     \n\t"   /* H(a+e - (b+d))/2 */\
1615
-                "psraw $3, %%mm0                        \n\t"   /* L(a+e - (b+d))/16 */\
1616
-                "psraw $3, %%mm2                        \n\t"   /* H(a+e - (b+d))/16 */\
1617
-                "psubw %%mm0, %%mm1                     \n\t"   /* L(9b + 9d - a - e)/16 */\
1618
-                "psubw %%mm2, %%mm3                     \n\t"   /* H(9b + 9d - a - e)/16 */\
1619
-                "packuswb %%mm3, %%mm1                  \n\t"\
1620
-                "movq %%mm1, " #c "                     \n\t"
1601
+        "movq " #a ", %%mm0                     \n\t"\
1602
+        "movq " #b ", %%mm1                     \n\t"\
1603
+        "movq " #d ", %%mm2                     \n\t"\
1604
+        "movq " #e ", %%mm3                     \n\t"\
1605
+        PAVGB(%%mm2, %%mm1)                             /* (b+d) /2 */\
1606
+        PAVGB(%%mm3, %%mm0)                             /* a(a+e) /2 */\
1607
+        "movq %%mm0, %%mm2                      \n\t"\
1608
+        "punpcklbw %%mm7, %%mm0                 \n\t"\
1609
+        "punpckhbw %%mm7, %%mm2                 \n\t"\
1610
+        "movq %%mm1, %%mm3                      \n\t"\
1611
+        "punpcklbw %%mm7, %%mm1                 \n\t"\
1612
+        "punpckhbw %%mm7, %%mm3                 \n\t"\
1613
+        "psubw %%mm1, %%mm0                     \n\t"   /* L(a+e - (b+d))/2 */\
1614
+        "psubw %%mm3, %%mm2                     \n\t"   /* H(a+e - (b+d))/2 */\
1615
+        "psraw $3, %%mm0                        \n\t"   /* L(a+e - (b+d))/16 */\
1616
+        "psraw $3, %%mm2                        \n\t"   /* H(a+e - (b+d))/16 */\
1617
+        "psubw %%mm0, %%mm1                     \n\t"   /* L(9b + 9d - a - e)/16 */\
1618
+        "psubw %%mm2, %%mm3                     \n\t"   /* H(9b + 9d - a - e)/16 */\
1619
+        "packuswb %%mm3, %%mm1                  \n\t"\
1620
+        "movq %%mm1, " #c "                     \n\t"
1621 1621
 #define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e)
1622 1622
 
1623 1623
 DEINT_CUBIC((%0)        , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
... ...
@@ -1625,20 +1591,19 @@ DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%0, %1,
1625 1625
 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
1626 1626
 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc)    , (%%REGc, %1, 2))
1627 1627
 
1628
-                : : "r" (src), "r" ((long)stride)
1629
-                : "%"REG_a, "%"REG_d, "%"REG_c
1630
-        );
1628
+        : : "r" (src), "r" ((long)stride)
1629
+        : "%"REG_a, "%"REG_d, "%"REG_c
1630
+    );
1631 1631
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1632
-        int x;
1633
-        src+= stride*3;
1634
-        for(x=0; x<8; x++)
1635
-        {
1636
-                src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1637
-                src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1638
-                src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1639
-                src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1640
-                src++;
1641
-        }
1632
+    int x;
1633
+    src+= stride*3;
1634
+    for(x=0; x<8; x++){
1635
+        src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1636
+        src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1637
+        src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1638
+        src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1639
+        src++;
1640
+    }
1642 1641
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1643 1642
 }
1644 1643
 
... ...
@@ -1652,42 +1617,42 @@ DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc)    , (%%REGc,
1652 1652
 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1653 1653
 {
1654 1654
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1655
-        src+= stride*4;
1656
-        asm volatile(
1657
-                "lea (%0, %1), %%"REG_a"                \n\t"
1658
-                "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1659
-                "pxor %%mm7, %%mm7                      \n\t"
1660
-                "movq (%2), %%mm0                       \n\t"
1655
+    src+= stride*4;
1656
+    asm volatile(
1657
+        "lea (%0, %1), %%"REG_a"                \n\t"
1658
+        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1659
+        "pxor %%mm7, %%mm7                      \n\t"
1660
+        "movq (%2), %%mm0                       \n\t"
1661 1661
 //      0       1       2       3       4       5       6       7       8       9       10
1662 1662
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1663 1663
 
1664 1664
 #define REAL_DEINT_FF(a,b,c,d)\
1665
-                "movq " #a ", %%mm1                     \n\t"\
1666
-                "movq " #b ", %%mm2                     \n\t"\
1667
-                "movq " #c ", %%mm3                     \n\t"\
1668
-                "movq " #d ", %%mm4                     \n\t"\
1669
-                PAVGB(%%mm3, %%mm1)                          \
1670
-                PAVGB(%%mm4, %%mm0)                          \
1671
-                "movq %%mm0, %%mm3                      \n\t"\
1672
-                "punpcklbw %%mm7, %%mm0                 \n\t"\
1673
-                "punpckhbw %%mm7, %%mm3                 \n\t"\
1674
-                "movq %%mm1, %%mm4                      \n\t"\
1675
-                "punpcklbw %%mm7, %%mm1                 \n\t"\
1676
-                "punpckhbw %%mm7, %%mm4                 \n\t"\
1677
-                "psllw $2, %%mm1                        \n\t"\
1678
-                "psllw $2, %%mm4                        \n\t"\
1679
-                "psubw %%mm0, %%mm1                     \n\t"\
1680
-                "psubw %%mm3, %%mm4                     \n\t"\
1681
-                "movq %%mm2, %%mm5                      \n\t"\
1682
-                "movq %%mm2, %%mm0                      \n\t"\
1683
-                "punpcklbw %%mm7, %%mm2                 \n\t"\
1684
-                "punpckhbw %%mm7, %%mm5                 \n\t"\
1685
-                "paddw %%mm2, %%mm1                     \n\t"\
1686
-                "paddw %%mm5, %%mm4                     \n\t"\
1687
-                "psraw $2, %%mm1                        \n\t"\
1688
-                "psraw $2, %%mm4                        \n\t"\
1689
-                "packuswb %%mm4, %%mm1                  \n\t"\
1690
-                "movq %%mm1, " #b "                     \n\t"\
1665
+        "movq " #a ", %%mm1                     \n\t"\
1666
+        "movq " #b ", %%mm2                     \n\t"\
1667
+        "movq " #c ", %%mm3                     \n\t"\
1668
+        "movq " #d ", %%mm4                     \n\t"\
1669
+        PAVGB(%%mm3, %%mm1)                          \
1670
+        PAVGB(%%mm4, %%mm0)                          \
1671
+        "movq %%mm0, %%mm3                      \n\t"\
1672
+        "punpcklbw %%mm7, %%mm0                 \n\t"\
1673
+        "punpckhbw %%mm7, %%mm3                 \n\t"\
1674
+        "movq %%mm1, %%mm4                      \n\t"\
1675
+        "punpcklbw %%mm7, %%mm1                 \n\t"\
1676
+        "punpckhbw %%mm7, %%mm4                 \n\t"\
1677
+        "psllw $2, %%mm1                        \n\t"\
1678
+        "psllw $2, %%mm4                        \n\t"\
1679
+        "psubw %%mm0, %%mm1                     \n\t"\
1680
+        "psubw %%mm3, %%mm4                     \n\t"\
1681
+        "movq %%mm2, %%mm5                      \n\t"\
1682
+        "movq %%mm2, %%mm0                      \n\t"\
1683
+        "punpcklbw %%mm7, %%mm2                 \n\t"\
1684
+        "punpckhbw %%mm7, %%mm5                 \n\t"\
1685
+        "paddw %%mm2, %%mm1                     \n\t"\
1686
+        "paddw %%mm5, %%mm4                     \n\t"\
1687
+        "psraw $2, %%mm1                        \n\t"\
1688
+        "psraw $2, %%mm4                        \n\t"\
1689
+        "packuswb %%mm4, %%mm1                  \n\t"\
1690
+        "movq %%mm1, " #b "                     \n\t"\
1691 1691
 
1692 1692
 #define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d)
1693 1693
 
... ...
@@ -1696,29 +1661,28 @@ DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd)       )
1696 1696
 DEINT_FF((%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%%REGd, %1, 2))
1697 1697
 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1698 1698
 
1699
-                "movq %%mm0, (%2)                       \n\t"
1700
-                : : "r" (src), "r" ((long)stride), "r"(tmp)
1701
-                : "%"REG_a, "%"REG_d
1702
-        );
1699
+        "movq %%mm0, (%2)                       \n\t"
1700
+        : : "r" (src), "r" ((long)stride), "r"(tmp)
1701
+        : "%"REG_a, "%"REG_d
1702
+    );
1703 1703
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1704
-        int x;
1705
-        src+= stride*4;
1706
-        for(x=0; x<8; x++)
1707
-        {
1708
-                int t1= tmp[x];
1709
-                int t2= src[stride*1];
1710
-
1711
-                src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1712
-                t1= src[stride*4];
1713
-                src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1714
-                t2= src[stride*6];
1715
-                src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1716
-                t1= src[stride*8];
1717
-                src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1718
-                tmp[x]= t1;
1719
-
1720
-                src++;
1721
-        }
1704
+    int x;
1705
+    src+= stride*4;
1706
+    for(x=0; x<8; x++){
1707
+        int t1= tmp[x];
1708
+        int t2= src[stride*1];
1709
+
1710
+        src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1711
+        t1= src[stride*4];
1712
+        src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1713
+        t2= src[stride*6];
1714
+        src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1715
+        t1= src[stride*8];
1716
+        src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1717
+        tmp[x]= t1;
1718
+
1719
+        src++;
1720
+    }
1722 1721
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1723 1722
 }
1724 1723
 
... ...
@@ -1732,48 +1696,48 @@ DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1732 1732
 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1733 1733
 {
1734 1734
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1735
-        src+= stride*4;
1736
-        asm volatile(
1737
-                "lea (%0, %1), %%"REG_a"                \n\t"
1738
-                "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1739
-                "pxor %%mm7, %%mm7                      \n\t"
1740
-                "movq (%2), %%mm0                       \n\t"
1741
-                "movq (%3), %%mm1                       \n\t"
1735
+    src+= stride*4;
1736
+    asm volatile(
1737
+        "lea (%0, %1), %%"REG_a"                \n\t"
1738
+        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1739
+        "pxor %%mm7, %%mm7                      \n\t"
1740
+        "movq (%2), %%mm0                       \n\t"
1741
+        "movq (%3), %%mm1                       \n\t"
1742 1742
 //      0       1       2       3       4       5       6       7       8       9       10
1743 1743
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1744 1744
 
1745 1745
 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1746
-                "movq " #a ", %%mm2                     \n\t"\
1747
-                "movq " #b ", %%mm3                     \n\t"\
1748
-                "movq " #c ", %%mm4                     \n\t"\
1749
-                PAVGB(t2, %%mm3)                             \
1750
-                PAVGB(t1, %%mm4)                             \
1751
-                "movq %%mm2, %%mm5                      \n\t"\
1752
-                "movq %%mm2, " #t1 "                    \n\t"\
1753
-                "punpcklbw %%mm7, %%mm2                 \n\t"\
1754
-                "punpckhbw %%mm7, %%mm5                 \n\t"\
1755
-                "movq %%mm2, %%mm6                      \n\t"\
1756
-                "paddw %%mm2, %%mm2                     \n\t"\
1757
-                "paddw %%mm6, %%mm2                     \n\t"\
1758
-                "movq %%mm5, %%mm6                      \n\t"\
1759
-                "paddw %%mm5, %%mm5                     \n\t"\
1760
-                "paddw %%mm6, %%mm5                     \n\t"\
1761
-                "movq %%mm3, %%mm6                      \n\t"\
1762
-                "punpcklbw %%mm7, %%mm3                 \n\t"\
1763
-                "punpckhbw %%mm7, %%mm6                 \n\t"\
1764
-                "paddw %%mm3, %%mm3                     \n\t"\
1765
-                "paddw %%mm6, %%mm6                     \n\t"\
1766
-                "paddw %%mm3, %%mm2                     \n\t"\
1767
-                "paddw %%mm6, %%mm5                     \n\t"\
1768
-                "movq %%mm4, %%mm6                      \n\t"\
1769
-                "punpcklbw %%mm7, %%mm4                 \n\t"\
1770
-                "punpckhbw %%mm7, %%mm6                 \n\t"\
1771
-                "psubw %%mm4, %%mm2                     \n\t"\
1772
-                "psubw %%mm6, %%mm5                     \n\t"\
1773
-                "psraw $2, %%mm2                        \n\t"\
1774
-                "psraw $2, %%mm5                        \n\t"\
1775
-                "packuswb %%mm5, %%mm2                  \n\t"\
1776
-                "movq %%mm2, " #a "                     \n\t"\
1746
+        "movq " #a ", %%mm2                     \n\t"\
1747
+        "movq " #b ", %%mm3                     \n\t"\
1748
+        "movq " #c ", %%mm4                     \n\t"\
1749
+        PAVGB(t2, %%mm3)                             \
1750
+        PAVGB(t1, %%mm4)                             \
1751
+        "movq %%mm2, %%mm5                      \n\t"\
1752
+        "movq %%mm2, " #t1 "                    \n\t"\
1753
+        "punpcklbw %%mm7, %%mm2                 \n\t"\
1754
+        "punpckhbw %%mm7, %%mm5                 \n\t"\
1755
+        "movq %%mm2, %%mm6                      \n\t"\
1756
+        "paddw %%mm2, %%mm2                     \n\t"\
1757
+        "paddw %%mm6, %%mm2                     \n\t"\
1758
+        "movq %%mm5, %%mm6                      \n\t"\
1759
+        "paddw %%mm5, %%mm5                     \n\t"\
1760
+        "paddw %%mm6, %%mm5                     \n\t"\
1761
+        "movq %%mm3, %%mm6                      \n\t"\
1762
+        "punpcklbw %%mm7, %%mm3                 \n\t"\
1763
+        "punpckhbw %%mm7, %%mm6                 \n\t"\
1764
+        "paddw %%mm3, %%mm3                     \n\t"\
1765
+        "paddw %%mm6, %%mm6                     \n\t"\
1766
+        "paddw %%mm3, %%mm2                     \n\t"\
1767
+        "paddw %%mm6, %%mm5                     \n\t"\
1768
+        "movq %%mm4, %%mm6                      \n\t"\
1769
+        "punpcklbw %%mm7, %%mm4                 \n\t"\
1770
+        "punpckhbw %%mm7, %%mm6                 \n\t"\
1771
+        "psubw %%mm4, %%mm2                     \n\t"\
1772
+        "psubw %%mm6, %%mm5                     \n\t"\
1773
+        "psraw $2, %%mm2                        \n\t"\
1774
+        "psraw $2, %%mm5                        \n\t"\
1775
+        "packuswb %%mm5, %%mm2                  \n\t"\
1776
+        "movq %%mm2, " #a "                     \n\t"\
1777 1777
 
1778 1778
 #define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c)
1779 1779
 
... ...
@@ -1786,41 +1750,40 @@ DEINT_L5(%%mm1, %%mm0, (%%REGd)       , (%%REGd, %1)   , (%%REGd, %1, 2))
1786 1786
 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1)   , (%%REGd, %1, 2), (%0, %1, 8)   )
1787 1787
 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))
1788 1788
 
1789
-                "movq %%mm0, (%2)                       \n\t"
1790
-                "movq %%mm1, (%3)                       \n\t"
1791
-                : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
1792
-                : "%"REG_a, "%"REG_d
1793
-        );
1789
+        "movq %%mm0, (%2)                       \n\t"
1790
+        "movq %%mm1, (%3)                       \n\t"
1791
+        : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
1792
+        : "%"REG_a, "%"REG_d
1793
+    );
1794 1794
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1795
-        int x;
1796
-        src+= stride*4;
1797
-        for(x=0; x<8; x++)
1798
-        {
1799
-                int t1= tmp[x];
1800
-                int t2= tmp2[x];
1801
-                int t3= src[0];
1802
-
1803
-                src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1804
-                t1= src[stride*1];
1805
-                src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1806
-                t2= src[stride*2];
1807
-                src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1808
-                t3= src[stride*3];
1809
-                src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1810
-                t1= src[stride*4];
1811
-                src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1812
-                t2= src[stride*5];
1813
-                src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1814
-                t3= src[stride*6];
1815
-                src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1816
-                t1= src[stride*7];
1817
-                src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1818
-
1819
-                tmp[x]= t3;
1820
-                tmp2[x]= t1;
1821
-
1822
-                src++;
1823
-        }
1795
+    int x;
1796
+    src+= stride*4;
1797
+    for(x=0; x<8; x++){
1798
+        int t1= tmp[x];
1799
+        int t2= tmp2[x];
1800
+        int t3= src[0];
1801
+
1802
+        src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1803
+        t1= src[stride*1];
1804
+        src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1805
+        t2= src[stride*2];
1806
+        src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1807
+        t3= src[stride*3];
1808
+        src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1809
+        t1= src[stride*4];
1810
+        src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1811
+        t2= src[stride*5];
1812
+        src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1813
+        t3= src[stride*6];
1814
+        src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1815
+        t1= src[stride*7];
1816
+        src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1817
+
1818
+        tmp[x]= t3;
1819
+        tmp2[x]= t1;
1820
+
1821
+        src++;
1822
+    }
1824 1823
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1825 1824
 }
1826 1825
 
... ...
@@ -1834,95 +1797,95 @@ DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))
1834 1834
 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1835 1835
 {
1836 1836
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1837
-        src+= 4*stride;
1838
-        asm volatile(
1839
-                "lea (%0, %1), %%"REG_a"                \n\t"
1840
-                "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1837
+    src+= 4*stride;
1838
+    asm volatile(
1839
+        "lea (%0, %1), %%"REG_a"                \n\t"
1840
+        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1841 1841
 //      0       1       2       3       4       5       6       7       8       9
1842 1842
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1843 1843
 
1844
-                "movq (%2), %%mm0                       \n\t" // L0
1845
-                "movq (%%"REG_a"), %%mm1                \n\t" // L2
1846
-                PAVGB(%%mm1, %%mm0)                           // L0+L2
1847
-                "movq (%0), %%mm2                       \n\t" // L1
1848
-                PAVGB(%%mm2, %%mm0)
1849
-                "movq %%mm0, (%0)                       \n\t"
1850
-                "movq (%%"REG_a", %1), %%mm0            \n\t" // L3
1851
-                PAVGB(%%mm0, %%mm2)                           // L1+L3
1852
-                PAVGB(%%mm1, %%mm2)                           // 2L2 + L1 + L3
1853
-                "movq %%mm2, (%%"REG_a")                \n\t"
1854
-                "movq (%%"REG_a", %1, 2), %%mm2         \n\t" // L4
1855
-                PAVGB(%%mm2, %%mm1)                           // L2+L4
1856
-                PAVGB(%%mm0, %%mm1)                           // 2L3 + L2 + L4
1857
-                "movq %%mm1, (%%"REG_a", %1)            \n\t"
1858
-                "movq (%0, %1, 4), %%mm1                \n\t" // L5
1859
-                PAVGB(%%mm1, %%mm0)                           // L3+L5
1860
-                PAVGB(%%mm2, %%mm0)                           // 2L4 + L3 + L5
1861
-                "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
1862
-                "movq (%%"REG_d"), %%mm0                \n\t" // L6
1863
-                PAVGB(%%mm0, %%mm2)                           // L4+L6
1864
-                PAVGB(%%mm1, %%mm2)                           // 2L5 + L4 + L6
1865
-                "movq %%mm2, (%0, %1, 4)                \n\t"
1866
-                "movq (%%"REG_d", %1), %%mm2            \n\t" // L7
1867
-                PAVGB(%%mm2, %%mm1)                           // L5+L7
1868
-                PAVGB(%%mm0, %%mm1)                           // 2L6 + L5 + L7
1869
-                "movq %%mm1, (%%"REG_d")                \n\t"
1870
-                "movq (%%"REG_d", %1, 2), %%mm1         \n\t" // L8
1871
-                PAVGB(%%mm1, %%mm0)                           // L6+L8
1872
-                PAVGB(%%mm2, %%mm0)                           // 2L7 + L6 + L8
1873
-                "movq %%mm0, (%%"REG_d", %1)            \n\t"
1874
-                "movq (%0, %1, 8), %%mm0                \n\t" // L9
1875
-                PAVGB(%%mm0, %%mm2)                           // L7+L9
1876
-                PAVGB(%%mm1, %%mm2)                           // 2L8 + L7 + L9
1877
-                "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
1878
-                "movq %%mm1, (%2)                       \n\t"
1879
-
1880
-                : : "r" (src), "r" ((long)stride), "r" (tmp)
1881
-                : "%"REG_a, "%"REG_d
1882
-        );
1844
+        "movq (%2), %%mm0                       \n\t" // L0
1845
+        "movq (%%"REG_a"), %%mm1                \n\t" // L2
1846
+        PAVGB(%%mm1, %%mm0)                           // L0+L2
1847
+        "movq (%0), %%mm2                       \n\t" // L1
1848
+        PAVGB(%%mm2, %%mm0)
1849
+        "movq %%mm0, (%0)                       \n\t"
1850
+        "movq (%%"REG_a", %1), %%mm0            \n\t" // L3
1851
+        PAVGB(%%mm0, %%mm2)                           // L1+L3
1852
+        PAVGB(%%mm1, %%mm2)                           // 2L2 + L1 + L3
1853
+        "movq %%mm2, (%%"REG_a")                \n\t"
1854
+        "movq (%%"REG_a", %1, 2), %%mm2         \n\t" // L4
1855
+        PAVGB(%%mm2, %%mm1)                           // L2+L4
1856
+        PAVGB(%%mm0, %%mm1)                           // 2L3 + L2 + L4
1857
+        "movq %%mm1, (%%"REG_a", %1)            \n\t"
1858
+        "movq (%0, %1, 4), %%mm1                \n\t" // L5
1859
+        PAVGB(%%mm1, %%mm0)                           // L3+L5
1860
+        PAVGB(%%mm2, %%mm0)                           // 2L4 + L3 + L5
1861
+        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
1862
+        "movq (%%"REG_d"), %%mm0                \n\t" // L6
1863
+        PAVGB(%%mm0, %%mm2)                           // L4+L6
1864
+        PAVGB(%%mm1, %%mm2)                           // 2L5 + L4 + L6
1865
+        "movq %%mm2, (%0, %1, 4)                \n\t"
1866
+        "movq (%%"REG_d", %1), %%mm2            \n\t" // L7
1867
+        PAVGB(%%mm2, %%mm1)                           // L5+L7
1868
+        PAVGB(%%mm0, %%mm1)                           // 2L6 + L5 + L7
1869
+        "movq %%mm1, (%%"REG_d")                \n\t"
1870
+        "movq (%%"REG_d", %1, 2), %%mm1         \n\t" // L8
1871
+        PAVGB(%%mm1, %%mm0)                           // L6+L8
1872
+        PAVGB(%%mm2, %%mm0)                           // 2L7 + L6 + L8
1873
+        "movq %%mm0, (%%"REG_d", %1)            \n\t"
1874
+        "movq (%0, %1, 8), %%mm0                \n\t" // L9
1875
+        PAVGB(%%mm0, %%mm2)                           // L7+L9
1876
+        PAVGB(%%mm1, %%mm2)                           // 2L8 + L7 + L9
1877
+        "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
1878
+        "movq %%mm1, (%2)                       \n\t"
1879
+
1880
+        : : "r" (src), "r" ((long)stride), "r" (tmp)
1881
+        : "%"REG_a, "%"REG_d
1882
+    );
1883 1883
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1884
-        int a, b, c, x;
1885
-        src+= 4*stride;
1886
-
1887
-        for(x=0; x<2; x++){
1888
-                a= *(uint32_t*)&tmp[stride*0];
1889
-                b= *(uint32_t*)&src[stride*0];
1890
-                c= *(uint32_t*)&src[stride*1];
1891
-                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1892
-                *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1893
-
1894
-                a= *(uint32_t*)&src[stride*2];
1895
-                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1896
-                *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1897
-
1898
-                b= *(uint32_t*)&src[stride*3];
1899
-                c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1900
-                *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1901
-
1902
-                c= *(uint32_t*)&src[stride*4];
1903
-                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1904
-                *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1905
-
1906
-                a= *(uint32_t*)&src[stride*5];
1907
-                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1908
-                *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1909
-
1910
-                b= *(uint32_t*)&src[stride*6];
1911
-                c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1912
-                *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1913
-
1914
-                c= *(uint32_t*)&src[stride*7];
1915
-                a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1916
-                *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1917
-
1918
-                a= *(uint32_t*)&src[stride*8];
1919
-                b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1920
-                *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1921
-
1922
-                *(uint32_t*)&tmp[stride*0]= c;
1923
-                src += 4;
1924
-                tmp += 4;
1925
-        }
1884
+    int a, b, c, x;
1885
+    src+= 4*stride;
1886
+
1887
+    for(x=0; x<2; x++){
1888
+        a= *(uint32_t*)&tmp[stride*0];
1889
+        b= *(uint32_t*)&src[stride*0];
1890
+        c= *(uint32_t*)&src[stride*1];
1891
+        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1892
+        *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1893
+
1894
+        a= *(uint32_t*)&src[stride*2];
1895
+        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1896
+        *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1897
+
1898
+        b= *(uint32_t*)&src[stride*3];
1899
+        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1900
+        *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1901
+
1902
+        c= *(uint32_t*)&src[stride*4];
1903
+        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1904
+        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1905
+
1906
+        a= *(uint32_t*)&src[stride*5];
1907
+        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1908
+        *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1909
+
1910
+        b= *(uint32_t*)&src[stride*6];
1911
+        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1912
+        *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1913
+
1914
+        c= *(uint32_t*)&src[stride*7];
1915
+        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1916
+        *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1917
+
1918
+        a= *(uint32_t*)&src[stride*8];
1919
+        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1920
+        *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1921
+
1922
+        *(uint32_t*)&tmp[stride*0]= c;
1923
+        src += 4;
1924
+        tmp += 4;
1925
+    }
1926 1926
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1927 1927
 }
1928 1928
 
... ...
@@ -1935,87 +1898,87 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
1935 1935
 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1936 1936
 {
1937 1937
 #ifdef HAVE_MMX
1938
-        src+= 4*stride;
1938
+    src+= 4*stride;
1939 1939
 #ifdef HAVE_MMX2
1940
-        asm volatile(
1941
-                "lea (%0, %1), %%"REG_a"                \n\t"
1942
-                "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1940
+    asm volatile(
1941
+        "lea (%0, %1), %%"REG_a"                \n\t"
1942
+        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1943 1943
 //      0       1       2       3       4       5       6       7       8       9
1944 1944
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1945 1945
 
1946
-                "movq (%0), %%mm0                       \n\t" //
1947
-                "movq (%%"REG_a", %1), %%mm2            \n\t" //
1948
-                "movq (%%"REG_a"), %%mm1                \n\t" //
1949
-                "movq %%mm0, %%mm3                      \n\t"
1950
-                "pmaxub %%mm1, %%mm0                    \n\t" //
1951
-                "pminub %%mm3, %%mm1                    \n\t" //
1952
-                "pmaxub %%mm2, %%mm1                    \n\t" //
1953
-                "pminub %%mm1, %%mm0                    \n\t"
1954
-                "movq %%mm0, (%%"REG_a")                \n\t"
1955
-
1956
-                "movq (%0, %1, 4), %%mm0                \n\t" //
1957
-                "movq (%%"REG_a", %1, 2), %%mm1         \n\t" //
1958
-                "movq %%mm2, %%mm3                      \n\t"
1959
-                "pmaxub %%mm1, %%mm2                    \n\t" //
1960
-                "pminub %%mm3, %%mm1                    \n\t" //
1961
-                "pmaxub %%mm0, %%mm1                    \n\t" //
1962
-                "pminub %%mm1, %%mm2                    \n\t"
1963
-                "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
1964
-
1965
-                "movq (%%"REG_d"), %%mm2                \n\t" //
1966
-                "movq (%%"REG_d", %1), %%mm1            \n\t" //
1967
-                "movq %%mm2, %%mm3                      \n\t"
1968
-                "pmaxub %%mm0, %%mm2                    \n\t" //
1969
-                "pminub %%mm3, %%mm0                    \n\t" //
1970
-                "pmaxub %%mm1, %%mm0                    \n\t" //
1971
-                "pminub %%mm0, %%mm2                    \n\t"
1972
-                "movq %%mm2, (%%"REG_d")                \n\t"
1973
-
1974
-                "movq (%%"REG_d", %1, 2), %%mm2         \n\t" //
1975
-                "movq (%0, %1, 8), %%mm0                \n\t" //
1976
-                "movq %%mm2, %%mm3                      \n\t"
1977
-                "pmaxub %%mm0, %%mm2                    \n\t" //
1978
-                "pminub %%mm3, %%mm0                    \n\t" //
1979
-                "pmaxub %%mm1, %%mm0                    \n\t" //
1980
-                "pminub %%mm0, %%mm2                    \n\t"
1981
-                "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
1982
-
1983
-
1984
-                : : "r" (src), "r" ((long)stride)
1985
-                : "%"REG_a, "%"REG_d
1986
-        );
1946
+        "movq (%0), %%mm0                       \n\t" //
1947
+        "movq (%%"REG_a", %1), %%mm2            \n\t" //
1948
+        "movq (%%"REG_a"), %%mm1                \n\t" //
1949
+        "movq %%mm0, %%mm3                      \n\t"
1950
+        "pmaxub %%mm1, %%mm0                    \n\t" //
1951
+        "pminub %%mm3, %%mm1                    \n\t" //
1952
+        "pmaxub %%mm2, %%mm1                    \n\t" //
1953
+        "pminub %%mm1, %%mm0                    \n\t"
1954
+        "movq %%mm0, (%%"REG_a")                \n\t"
1955
+
1956
+        "movq (%0, %1, 4), %%mm0                \n\t" //
1957
+        "movq (%%"REG_a", %1, 2), %%mm1         \n\t" //
1958
+        "movq %%mm2, %%mm3                      \n\t"
1959
+        "pmaxub %%mm1, %%mm2                    \n\t" //
1960
+        "pminub %%mm3, %%mm1                    \n\t" //
1961
+        "pmaxub %%mm0, %%mm1                    \n\t" //
1962
+        "pminub %%mm1, %%mm2                    \n\t"
1963
+        "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
1964
+
1965
+        "movq (%%"REG_d"), %%mm2                \n\t" //
1966
+        "movq (%%"REG_d", %1), %%mm1            \n\t" //
1967
+        "movq %%mm2, %%mm3                      \n\t"
1968
+        "pmaxub %%mm0, %%mm2                    \n\t" //
1969
+        "pminub %%mm3, %%mm0                    \n\t" //
1970
+        "pmaxub %%mm1, %%mm0                    \n\t" //
1971
+        "pminub %%mm0, %%mm2                    \n\t"
1972
+        "movq %%mm2, (%%"REG_d")                \n\t"
1973
+
1974
+        "movq (%%"REG_d", %1, 2), %%mm2         \n\t" //
1975
+        "movq (%0, %1, 8), %%mm0                \n\t" //
1976
+        "movq %%mm2, %%mm3                      \n\t"
1977
+        "pmaxub %%mm0, %%mm2                    \n\t" //
1978
+        "pminub %%mm3, %%mm0                    \n\t" //
1979
+        "pmaxub %%mm1, %%mm0                    \n\t" //
1980
+        "pminub %%mm0, %%mm2                    \n\t"
1981
+        "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
1982
+
1983
+
1984
+        : : "r" (src), "r" ((long)stride)
1985
+        : "%"REG_a, "%"REG_d
1986
+    );
1987 1987
 
1988 1988
 #else // MMX without MMX2
1989
-        asm volatile(
1990
-                "lea (%0, %1), %%"REG_a"                \n\t"
1991
-                "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1989
+    asm volatile(
1990
+        "lea (%0, %1), %%"REG_a"                \n\t"
1991
+        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1992 1992
 //      0       1       2       3       4       5       6       7       8       9
1993 1993
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1994
-                "pxor %%mm7, %%mm7                      \n\t"
1994
+        "pxor %%mm7, %%mm7                      \n\t"
1995 1995
 
1996 1996
 #define REAL_MEDIAN(a,b,c)\
1997
-                "movq " #a ", %%mm0                     \n\t"\
1998
-                "movq " #b ", %%mm2                     \n\t"\
1999
-                "movq " #c ", %%mm1                     \n\t"\
2000
-                "movq %%mm0, %%mm3                      \n\t"\
2001
-                "movq %%mm1, %%mm4                      \n\t"\
2002
-                "movq %%mm2, %%mm5                      \n\t"\
2003
-                "psubusb %%mm1, %%mm3                   \n\t"\
2004
-                "psubusb %%mm2, %%mm4                   \n\t"\
2005
-                "psubusb %%mm0, %%mm5                   \n\t"\
2006
-                "pcmpeqb %%mm7, %%mm3                   \n\t"\
2007
-                "pcmpeqb %%mm7, %%mm4                   \n\t"\
2008
-                "pcmpeqb %%mm7, %%mm5                   \n\t"\
2009
-                "movq %%mm3, %%mm6                      \n\t"\
2010
-                "pxor %%mm4, %%mm3                      \n\t"\
2011
-                "pxor %%mm5, %%mm4                      \n\t"\
2012
-                "pxor %%mm6, %%mm5                      \n\t"\
2013
-                "por %%mm3, %%mm1                       \n\t"\
2014
-                "por %%mm4, %%mm2                       \n\t"\
2015
-                "por %%mm5, %%mm0                       \n\t"\
2016
-                "pand %%mm2, %%mm0                      \n\t"\
2017
-                "pand %%mm1, %%mm0                      \n\t"\
2018
-                "movq %%mm0, " #b "                     \n\t"
1997
+        "movq " #a ", %%mm0                     \n\t"\
1998
+        "movq " #b ", %%mm2                     \n\t"\
1999
+        "movq " #c ", %%mm1                     \n\t"\
2000
+        "movq %%mm0, %%mm3                      \n\t"\
2001
+        "movq %%mm1, %%mm4                      \n\t"\
2002
+        "movq %%mm2, %%mm5                      \n\t"\
2003
+        "psubusb %%mm1, %%mm3                   \n\t"\
2004
+        "psubusb %%mm2, %%mm4                   \n\t"\
2005
+        "psubusb %%mm0, %%mm5                   \n\t"\
2006
+        "pcmpeqb %%mm7, %%mm3                   \n\t"\
2007
+        "pcmpeqb %%mm7, %%mm4                   \n\t"\
2008
+        "pcmpeqb %%mm7, %%mm5                   \n\t"\
2009
+        "movq %%mm3, %%mm6                      \n\t"\
2010
+        "pxor %%mm4, %%mm3                      \n\t"\
2011
+        "pxor %%mm5, %%mm4                      \n\t"\
2012
+        "pxor %%mm6, %%mm5                      \n\t"\
2013
+        "por %%mm3, %%mm1                       \n\t"\
2014
+        "por %%mm4, %%mm2                       \n\t"\
2015
+        "por %%mm5, %%mm0                       \n\t"\
2016
+        "pand %%mm2, %%mm0                      \n\t"\
2017
+        "pand %%mm1, %%mm0                      \n\t"\
2018
+        "movq %%mm0, " #b "                     \n\t"
2019 2019
 #define MEDIAN(a,b,c)  REAL_MEDIAN(a,b,c)
2020 2020
 
2021 2021
 MEDIAN((%0)        , (%%REGa)       , (%%REGa, %1))
... ...
@@ -2023,31 +1986,29 @@ MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
2023 2023
 MEDIAN((%0, %1, 4) , (%%REGd)       , (%%REGd, %1))
2024 2024
 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
2025 2025
 
2026
-                : : "r" (src), "r" ((long)stride)
2027
-                : "%"REG_a, "%"REG_d
2028
-        );
2026
+        : : "r" (src), "r" ((long)stride)
2027
+        : "%"REG_a, "%"REG_d
2028
+    );
2029 2029
 #endif //HAVE_MMX2
2030 2030
 #else //HAVE_MMX
2031
-        int x, y;
2032
-        src+= 4*stride;
2033
-        // FIXME - there should be a way to do a few columns in parallel like w/mmx
2034
-        for(x=0; x<8; x++)
2035
-        {
2036
-                uint8_t *colsrc = src;
2037
-                for (y=0; y<4; y++)
2038
-                {
2039
-                        int a, b, c, d, e, f;
2040
-                        a = colsrc[0       ];
2041
-                        b = colsrc[stride  ];
2042
-                        c = colsrc[stride*2];
2043
-                        d = (a-b)>>31;
2044
-                        e = (b-c)>>31;
2045
-                        f = (c-a)>>31;
2046
-                        colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2047
-                        colsrc += stride*2;
2048
-                }
2049
-                src++;
2031
+    int x, y;
2032
+    src+= 4*stride;
2033
+    // FIXME - there should be a way to do a few columns in parallel like w/mmx
2034
+    for(x=0; x<8; x++){
2035
+        uint8_t *colsrc = src;
2036
+        for (y=0; y<4; y++){
2037
+            int a, b, c, d, e, f;
2038
+            a = colsrc[0       ];
2039
+            b = colsrc[stride  ];
2040
+            c = colsrc[stride*2];
2041
+            d = (a-b)>>31;
2042
+            e = (b-c)>>31;
2043
+            f = (c-a)>>31;
2044
+            colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2045
+            colsrc += stride*2;
2050 2046
         }
2047
+        src++;
2048
+    }
2051 2049
 #endif //HAVE_MMX
2052 2050
 }
2053 2051
 
... ...
@@ -2057,84 +2018,84 @@ MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
2057 2057
  */
2058 2058
 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2059 2059
 {
2060
-        asm(
2061
-                "lea (%0, %1), %%"REG_a"                \n\t"
2060
+    asm(
2061
+        "lea (%0, %1), %%"REG_a"                \n\t"
2062 2062
 //      0       1       2       3       4       5       6       7       8       9
2063 2063
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
2064
-                "movq (%0), %%mm0                       \n\t" // 12345678
2065
-                "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2066
-                "movq %%mm0, %%mm2                      \n\t" // 12345678
2067
-                "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2068
-                "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2069
-
2070
-                "movq (%%"REG_a", %1), %%mm1            \n\t"
2071
-                "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2072
-                "movq %%mm1, %%mm4                      \n\t"
2073
-                "punpcklbw %%mm3, %%mm1                 \n\t"
2074
-                "punpckhbw %%mm3, %%mm4                 \n\t"
2075
-
2076
-                "movq %%mm0, %%mm3                      \n\t"
2077
-                "punpcklwd %%mm1, %%mm0                 \n\t"
2078
-                "punpckhwd %%mm1, %%mm3                 \n\t"
2079
-                "movq %%mm2, %%mm1                      \n\t"
2080
-                "punpcklwd %%mm4, %%mm2                 \n\t"
2081
-                "punpckhwd %%mm4, %%mm1                 \n\t"
2082
-
2083
-                "movd %%mm0, 128(%2)                    \n\t"
2084
-                "psrlq $32, %%mm0                       \n\t"
2085
-                "movd %%mm0, 144(%2)                    \n\t"
2086
-                "movd %%mm3, 160(%2)                    \n\t"
2087
-                "psrlq $32, %%mm3                       \n\t"
2088
-                "movd %%mm3, 176(%2)                    \n\t"
2089
-                "movd %%mm3, 48(%3)                     \n\t"
2090
-                "movd %%mm2, 192(%2)                    \n\t"
2091
-                "movd %%mm2, 64(%3)                     \n\t"
2092
-                "psrlq $32, %%mm2                       \n\t"
2093
-                "movd %%mm2, 80(%3)                     \n\t"
2094
-                "movd %%mm1, 96(%3)                     \n\t"
2095
-                "psrlq $32, %%mm1                       \n\t"
2096
-                "movd %%mm1, 112(%3)                    \n\t"
2097
-
2098
-                "lea (%%"REG_a", %1, 4), %%"REG_a"      \n\t"
2099
-
2100
-                "movq (%0, %1, 4), %%mm0                \n\t" // 12345678
2101
-                "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2102
-                "movq %%mm0, %%mm2                      \n\t" // 12345678
2103
-                "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2104
-                "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2105
-
2106
-                "movq (%%"REG_a", %1), %%mm1            \n\t"
2107
-                "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2108
-                "movq %%mm1, %%mm4                      \n\t"
2109
-                "punpcklbw %%mm3, %%mm1                 \n\t"
2110
-                "punpckhbw %%mm3, %%mm4                 \n\t"
2111
-
2112
-                "movq %%mm0, %%mm3                      \n\t"
2113
-                "punpcklwd %%mm1, %%mm0                 \n\t"
2114
-                "punpckhwd %%mm1, %%mm3                 \n\t"
2115
-                "movq %%mm2, %%mm1                      \n\t"
2116
-                "punpcklwd %%mm4, %%mm2                 \n\t"
2117
-                "punpckhwd %%mm4, %%mm1                 \n\t"
2118
-
2119
-                "movd %%mm0, 132(%2)                    \n\t"
2120
-                "psrlq $32, %%mm0                       \n\t"
2121
-                "movd %%mm0, 148(%2)                    \n\t"
2122
-                "movd %%mm3, 164(%2)                    \n\t"
2123
-                "psrlq $32, %%mm3                       \n\t"
2124
-                "movd %%mm3, 180(%2)                    \n\t"
2125
-                "movd %%mm3, 52(%3)                     \n\t"
2126
-                "movd %%mm2, 196(%2)                    \n\t"
2127
-                "movd %%mm2, 68(%3)                     \n\t"
2128
-                "psrlq $32, %%mm2                       \n\t"
2129
-                "movd %%mm2, 84(%3)                     \n\t"
2130
-                "movd %%mm1, 100(%3)                    \n\t"
2131
-                "psrlq $32, %%mm1                       \n\t"
2132
-                "movd %%mm1, 116(%3)                    \n\t"
2064
+        "movq (%0), %%mm0                       \n\t" // 12345678
2065
+        "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2066
+        "movq %%mm0, %%mm2                      \n\t" // 12345678
2067
+        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2068
+        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2069
+
2070
+        "movq (%%"REG_a", %1), %%mm1            \n\t"
2071
+        "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2072
+        "movq %%mm1, %%mm4                      \n\t"
2073
+        "punpcklbw %%mm3, %%mm1                 \n\t"
2074
+        "punpckhbw %%mm3, %%mm4                 \n\t"
2075
+
2076
+        "movq %%mm0, %%mm3                      \n\t"
2077
+        "punpcklwd %%mm1, %%mm0                 \n\t"
2078
+        "punpckhwd %%mm1, %%mm3                 \n\t"
2079
+        "movq %%mm2, %%mm1                      \n\t"
2080
+        "punpcklwd %%mm4, %%mm2                 \n\t"
2081
+        "punpckhwd %%mm4, %%mm1                 \n\t"
2082
+
2083
+        "movd %%mm0, 128(%2)                    \n\t"
2084
+        "psrlq $32, %%mm0                       \n\t"
2085
+        "movd %%mm0, 144(%2)                    \n\t"
2086
+        "movd %%mm3, 160(%2)                    \n\t"
2087
+        "psrlq $32, %%mm3                       \n\t"
2088
+        "movd %%mm3, 176(%2)                    \n\t"
2089
+        "movd %%mm3, 48(%3)                     \n\t"
2090
+        "movd %%mm2, 192(%2)                    \n\t"
2091
+        "movd %%mm2, 64(%3)                     \n\t"
2092
+        "psrlq $32, %%mm2                       \n\t"
2093
+        "movd %%mm2, 80(%3)                     \n\t"
2094
+        "movd %%mm1, 96(%3)                     \n\t"
2095
+        "psrlq $32, %%mm1                       \n\t"
2096
+        "movd %%mm1, 112(%3)                    \n\t"
2097
+
2098
+        "lea (%%"REG_a", %1, 4), %%"REG_a"      \n\t"
2099
+
2100
+        "movq (%0, %1, 4), %%mm0                \n\t" // 12345678
2101
+        "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2102
+        "movq %%mm0, %%mm2                      \n\t" // 12345678
2103
+        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2104
+        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2105
+
2106
+        "movq (%%"REG_a", %1), %%mm1            \n\t"
2107
+        "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2108
+        "movq %%mm1, %%mm4                      \n\t"
2109
+        "punpcklbw %%mm3, %%mm1                 \n\t"
2110
+        "punpckhbw %%mm3, %%mm4                 \n\t"
2111
+
2112
+        "movq %%mm0, %%mm3                      \n\t"
2113
+        "punpcklwd %%mm1, %%mm0                 \n\t"
2114
+        "punpckhwd %%mm1, %%mm3                 \n\t"
2115
+        "movq %%mm2, %%mm1                      \n\t"
2116
+        "punpcklwd %%mm4, %%mm2                 \n\t"
2117
+        "punpckhwd %%mm4, %%mm1                 \n\t"
2118
+
2119
+        "movd %%mm0, 132(%2)                    \n\t"
2120
+        "psrlq $32, %%mm0                       \n\t"
2121
+        "movd %%mm0, 148(%2)                    \n\t"
2122
+        "movd %%mm3, 164(%2)                    \n\t"
2123
+        "psrlq $32, %%mm3                       \n\t"
2124
+        "movd %%mm3, 180(%2)                    \n\t"
2125
+        "movd %%mm3, 52(%3)                     \n\t"
2126
+        "movd %%mm2, 196(%2)                    \n\t"
2127
+        "movd %%mm2, 68(%3)                     \n\t"
2128
+        "psrlq $32, %%mm2                       \n\t"
2129
+        "movd %%mm2, 84(%3)                     \n\t"
2130
+        "movd %%mm1, 100(%3)                    \n\t"
2131
+        "psrlq $32, %%mm1                       \n\t"
2132
+        "movd %%mm1, 116(%3)                    \n\t"
2133 2133
 
2134 2134
 
2135 2135
         :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
2136 2136
         : "%"REG_a
2137
-        );
2137
+    );
2138 2138
 }
2139 2139
 
2140 2140
 /**
... ...
@@ -2142,79 +2103,79 @@ static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src
2142 2142
  */
2143 2143
 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2144 2144
 {
2145
-        asm(
2146
-                "lea (%0, %1), %%"REG_a"                \n\t"
2147
-                "lea (%%"REG_a",%1,4), %%"REG_d"        \n\t"
2145
+    asm(
2146
+        "lea (%0, %1), %%"REG_a"                \n\t"
2147
+        "lea (%%"REG_a",%1,4), %%"REG_d"        \n\t"
2148 2148
 //      0       1       2       3       4       5       6       7       8       9
2149 2149
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
2150
-                "movq (%2), %%mm0                       \n\t" // 12345678
2151
-                "movq 16(%2), %%mm1                     \n\t" // abcdefgh
2152
-                "movq %%mm0, %%mm2                      \n\t" // 12345678
2153
-                "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2154
-                "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2155
-
2156
-                "movq 32(%2), %%mm1                     \n\t"
2157
-                "movq 48(%2), %%mm3                     \n\t"
2158
-                "movq %%mm1, %%mm4                      \n\t"
2159
-                "punpcklbw %%mm3, %%mm1                 \n\t"
2160
-                "punpckhbw %%mm3, %%mm4                 \n\t"
2161
-
2162
-                "movq %%mm0, %%mm3                      \n\t"
2163
-                "punpcklwd %%mm1, %%mm0                 \n\t"
2164
-                "punpckhwd %%mm1, %%mm3                 \n\t"
2165
-                "movq %%mm2, %%mm1                      \n\t"
2166
-                "punpcklwd %%mm4, %%mm2                 \n\t"
2167
-                "punpckhwd %%mm4, %%mm1                 \n\t"
2168
-
2169
-                "movd %%mm0, (%0)                       \n\t"
2170
-                "psrlq $32, %%mm0                       \n\t"
2171
-                "movd %%mm0, (%%"REG_a")                \n\t"
2172
-                "movd %%mm3, (%%"REG_a", %1)            \n\t"
2173
-                "psrlq $32, %%mm3                       \n\t"
2174
-                "movd %%mm3, (%%"REG_a", %1, 2)         \n\t"
2175
-                "movd %%mm2, (%0, %1, 4)                \n\t"
2176
-                "psrlq $32, %%mm2                       \n\t"
2177
-                "movd %%mm2, (%%"REG_d")                \n\t"
2178
-                "movd %%mm1, (%%"REG_d", %1)            \n\t"
2179
-                "psrlq $32, %%mm1                       \n\t"
2180
-                "movd %%mm1, (%%"REG_d", %1, 2)         \n\t"
2181
-
2182
-
2183
-                "movq 64(%2), %%mm0                     \n\t" // 12345678
2184
-                "movq 80(%2), %%mm1                     \n\t" // abcdefgh
2185
-                "movq %%mm0, %%mm2                      \n\t" // 12345678
2186
-                "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2187
-                "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2188
-
2189
-                "movq 96(%2), %%mm1                     \n\t"
2190
-                "movq 112(%2), %%mm3                    \n\t"
2191
-                "movq %%mm1, %%mm4                      \n\t"
2192
-                "punpcklbw %%mm3, %%mm1                 \n\t"
2193
-                "punpckhbw %%mm3, %%mm4                 \n\t"
2194
-
2195
-                "movq %%mm0, %%mm3                      \n\t"
2196
-                "punpcklwd %%mm1, %%mm0                 \n\t"
2197
-                "punpckhwd %%mm1, %%mm3                 \n\t"
2198
-                "movq %%mm2, %%mm1                      \n\t"
2199
-                "punpcklwd %%mm4, %%mm2                 \n\t"
2200
-                "punpckhwd %%mm4, %%mm1                 \n\t"
2201
-
2202
-                "movd %%mm0, 4(%0)                      \n\t"
2203
-                "psrlq $32, %%mm0                       \n\t"
2204
-                "movd %%mm0, 4(%%"REG_a")               \n\t"
2205
-                "movd %%mm3, 4(%%"REG_a", %1)           \n\t"
2206
-                "psrlq $32, %%mm3                       \n\t"
2207
-                "movd %%mm3, 4(%%"REG_a", %1, 2)        \n\t"
2208
-                "movd %%mm2, 4(%0, %1, 4)               \n\t"
2209
-                "psrlq $32, %%mm2                       \n\t"
2210
-                "movd %%mm2, 4(%%"REG_d")               \n\t"
2211
-                "movd %%mm1, 4(%%"REG_d", %1)           \n\t"
2212
-                "psrlq $32, %%mm1                       \n\t"
2213
-                "movd %%mm1, 4(%%"REG_d", %1, 2)        \n\t"
2150
+        "movq (%2), %%mm0                       \n\t" // 12345678
2151
+        "movq 16(%2), %%mm1                     \n\t" // abcdefgh
2152
+        "movq %%mm0, %%mm2                      \n\t" // 12345678
2153
+        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2154
+        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2155
+
2156
+        "movq 32(%2), %%mm1                     \n\t"
2157
+        "movq 48(%2), %%mm3                     \n\t"
2158
+        "movq %%mm1, %%mm4                      \n\t"
2159
+        "punpcklbw %%mm3, %%mm1                 \n\t"
2160
+        "punpckhbw %%mm3, %%mm4                 \n\t"
2161
+
2162
+        "movq %%mm0, %%mm3                      \n\t"
2163
+        "punpcklwd %%mm1, %%mm0                 \n\t"
2164
+        "punpckhwd %%mm1, %%mm3                 \n\t"
2165
+        "movq %%mm2, %%mm1                      \n\t"
2166
+        "punpcklwd %%mm4, %%mm2                 \n\t"
2167
+        "punpckhwd %%mm4, %%mm1                 \n\t"
2168
+
2169
+        "movd %%mm0, (%0)                       \n\t"
2170
+        "psrlq $32, %%mm0                       \n\t"
2171
+        "movd %%mm0, (%%"REG_a")                \n\t"
2172
+        "movd %%mm3, (%%"REG_a", %1)            \n\t"
2173
+        "psrlq $32, %%mm3                       \n\t"
2174
+        "movd %%mm3, (%%"REG_a", %1, 2)         \n\t"
2175
+        "movd %%mm2, (%0, %1, 4)                \n\t"
2176
+        "psrlq $32, %%mm2                       \n\t"
2177
+        "movd %%mm2, (%%"REG_d")                \n\t"
2178
+        "movd %%mm1, (%%"REG_d", %1)            \n\t"
2179
+        "psrlq $32, %%mm1                       \n\t"
2180
+        "movd %%mm1, (%%"REG_d", %1, 2)         \n\t"
2181
+
2182
+
2183
+        "movq 64(%2), %%mm0                     \n\t" // 12345678
2184
+        "movq 80(%2), %%mm1                     \n\t" // abcdefgh
2185
+        "movq %%mm0, %%mm2                      \n\t" // 12345678
2186
+        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2187
+        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2188
+
2189
+        "movq 96(%2), %%mm1                     \n\t"
2190
+        "movq 112(%2), %%mm3                    \n\t"
2191
+        "movq %%mm1, %%mm4                      \n\t"
2192
+        "punpcklbw %%mm3, %%mm1                 \n\t"
2193
+        "punpckhbw %%mm3, %%mm4                 \n\t"
2194
+
2195
+        "movq %%mm0, %%mm3                      \n\t"
2196
+        "punpcklwd %%mm1, %%mm0                 \n\t"
2197
+        "punpckhwd %%mm1, %%mm3                 \n\t"
2198
+        "movq %%mm2, %%mm1                      \n\t"
2199
+        "punpcklwd %%mm4, %%mm2                 \n\t"
2200
+        "punpckhwd %%mm4, %%mm1                 \n\t"
2201
+
2202
+        "movd %%mm0, 4(%0)                      \n\t"
2203
+        "psrlq $32, %%mm0                       \n\t"
2204
+        "movd %%mm0, 4(%%"REG_a")               \n\t"
2205
+        "movd %%mm3, 4(%%"REG_a", %1)           \n\t"
2206
+        "psrlq $32, %%mm3                       \n\t"
2207
+        "movd %%mm3, 4(%%"REG_a", %1, 2)        \n\t"
2208
+        "movd %%mm2, 4(%0, %1, 4)               \n\t"
2209
+        "psrlq $32, %%mm2                       \n\t"
2210
+        "movd %%mm2, 4(%%"REG_d")               \n\t"
2211
+        "movd %%mm1, 4(%%"REG_d", %1)           \n\t"
2212
+        "psrlq $32, %%mm1                       \n\t"
2213
+        "movd %%mm1, 4(%%"REG_d", %1, 2)        \n\t"
2214 2214
 
2215 2215
         :: "r" (dst), "r" ((long)dstStride), "r" (src)
2216 2216
         : "%"REG_a, "%"REG_d
2217
-        );
2217
+    );
2218 2218
 }
2219 2219
 #endif //HAVE_MMX
2220 2220
 //static long test=0;
... ...
@@ -2223,83 +2184,83 @@ static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2223 2223
 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2224 2224
                                     uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2225 2225
 {
2226
-        // to save a register (FIXME do this outside of the loops)
2227
-        tempBluredPast[127]= maxNoise[0];
2228
-        tempBluredPast[128]= maxNoise[1];
2229
-        tempBluredPast[129]= maxNoise[2];
2226
+    // to save a register (FIXME do this outside of the loops)
2227
+    tempBluredPast[127]= maxNoise[0];
2228
+    tempBluredPast[128]= maxNoise[1];
2229
+    tempBluredPast[129]= maxNoise[2];
2230 2230
 
2231 2231
 #define FAST_L2_DIFF
2232 2232
 //#define L1_DIFF //u should change the thresholds too if u try that one
2233 2233
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2234
-        asm volatile(
2235
-                "lea (%2, %2, 2), %%"REG_a"             \n\t" // 3*stride
2236
-                "lea (%2, %2, 4), %%"REG_d"             \n\t" // 5*stride
2237
-                "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2234
+    asm volatile(
2235
+        "lea (%2, %2, 2), %%"REG_a"             \n\t" // 3*stride
2236
+        "lea (%2, %2, 4), %%"REG_d"             \n\t" // 5*stride
2237
+        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2238 2238
 //      0       1       2       3       4       5       6       7       8       9
2239 2239
 //      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2
2240 2240
 //FIXME reorder?
2241 2241
 #ifdef L1_DIFF //needs mmx2
2242
-                "movq (%0), %%mm0                       \n\t" // L0
2243
-                "psadbw (%1), %%mm0                     \n\t" // |L0-R0|
2244
-                "movq (%0, %2), %%mm1                   \n\t" // L1
2245
-                "psadbw (%1, %2), %%mm1                 \n\t" // |L1-R1|
2246
-                "movq (%0, %2, 2), %%mm2                \n\t" // L2
2247
-                "psadbw (%1, %2, 2), %%mm2              \n\t" // |L2-R2|
2248
-                "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2249
-                "psadbw (%1, %%"REG_a"), %%mm3          \n\t" // |L3-R3|
2250
-
2251
-                "movq (%0, %2, 4), %%mm4                \n\t" // L4
2252
-                "paddw %%mm1, %%mm0                     \n\t"
2253
-                "psadbw (%1, %2, 4), %%mm4              \n\t" // |L4-R4|
2254
-                "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2255
-                "paddw %%mm2, %%mm0                     \n\t"
2256
-                "psadbw (%1, %%"REG_d"), %%mm5          \n\t" // |L5-R5|
2257
-                "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2258
-                "paddw %%mm3, %%mm0                     \n\t"
2259
-                "psadbw (%1, %%"REG_a", 2), %%mm6       \n\t" // |L6-R6|
2260
-                "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2261
-                "paddw %%mm4, %%mm0                     \n\t"
2262
-                "psadbw (%1, %%"REG_c"), %%mm7          \n\t" // |L7-R7|
2263
-                "paddw %%mm5, %%mm6                     \n\t"
2264
-                "paddw %%mm7, %%mm6                     \n\t"
2265
-                "paddw %%mm6, %%mm0                     \n\t"
2242
+        "movq (%0), %%mm0                       \n\t" // L0
2243
+        "psadbw (%1), %%mm0                     \n\t" // |L0-R0|
2244
+        "movq (%0, %2), %%mm1                   \n\t" // L1
2245
+        "psadbw (%1, %2), %%mm1                 \n\t" // |L1-R1|
2246
+        "movq (%0, %2, 2), %%mm2                \n\t" // L2
2247
+        "psadbw (%1, %2, 2), %%mm2              \n\t" // |L2-R2|
2248
+        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2249
+        "psadbw (%1, %%"REG_a"), %%mm3          \n\t" // |L3-R3|
2250
+
2251
+        "movq (%0, %2, 4), %%mm4                \n\t" // L4
2252
+        "paddw %%mm1, %%mm0                     \n\t"
2253
+        "psadbw (%1, %2, 4), %%mm4              \n\t" // |L4-R4|
2254
+        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2255
+        "paddw %%mm2, %%mm0                     \n\t"
2256
+        "psadbw (%1, %%"REG_d"), %%mm5          \n\t" // |L5-R5|
2257
+        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2258
+        "paddw %%mm3, %%mm0                     \n\t"
2259
+        "psadbw (%1, %%"REG_a", 2), %%mm6       \n\t" // |L6-R6|
2260
+        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2261
+        "paddw %%mm4, %%mm0                     \n\t"
2262
+        "psadbw (%1, %%"REG_c"), %%mm7          \n\t" // |L7-R7|
2263
+        "paddw %%mm5, %%mm6                     \n\t"
2264
+        "paddw %%mm7, %%mm6                     \n\t"
2265
+        "paddw %%mm6, %%mm0                     \n\t"
2266 2266
 #else //L1_DIFF
2267 2267
 #if defined (FAST_L2_DIFF)
2268
-                "pcmpeqb %%mm7, %%mm7                   \n\t"
2269
-                "movq "MANGLE(b80)", %%mm6              \n\t"
2270
-                "pxor %%mm0, %%mm0                      \n\t"
2268
+        "pcmpeqb %%mm7, %%mm7                   \n\t"
2269
+        "movq "MANGLE(b80)", %%mm6              \n\t"
2270
+        "pxor %%mm0, %%mm0                      \n\t"
2271 2271
 #define REAL_L2_DIFF_CORE(a, b)\
2272
-                "movq " #a ", %%mm5                     \n\t"\
2273
-                "movq " #b ", %%mm2                     \n\t"\
2274
-                "pxor %%mm7, %%mm2                      \n\t"\
2275
-                PAVGB(%%mm2, %%mm5)\
2276
-                "paddb %%mm6, %%mm5                     \n\t"\
2277
-                "movq %%mm5, %%mm2                      \n\t"\
2278
-                "psllw $8, %%mm5                        \n\t"\
2279
-                "pmaddwd %%mm5, %%mm5                   \n\t"\
2280
-                "pmaddwd %%mm2, %%mm2                   \n\t"\
2281
-                "paddd %%mm2, %%mm5                     \n\t"\
2282
-                "psrld $14, %%mm5                       \n\t"\
2283
-                "paddd %%mm5, %%mm0                     \n\t"
2272
+        "movq " #a ", %%mm5                     \n\t"\
2273
+        "movq " #b ", %%mm2                     \n\t"\
2274
+        "pxor %%mm7, %%mm2                      \n\t"\
2275
+        PAVGB(%%mm2, %%mm5)\
2276
+        "paddb %%mm6, %%mm5                     \n\t"\
2277
+        "movq %%mm5, %%mm2                      \n\t"\
2278
+        "psllw $8, %%mm5                        \n\t"\
2279
+        "pmaddwd %%mm5, %%mm5                   \n\t"\
2280
+        "pmaddwd %%mm2, %%mm2                   \n\t"\
2281
+        "paddd %%mm2, %%mm5                     \n\t"\
2282
+        "psrld $14, %%mm5                       \n\t"\
2283
+        "paddd %%mm5, %%mm0                     \n\t"
2284 2284
 
2285 2285
 #else //defined (FAST_L2_DIFF)
2286
-                "pxor %%mm7, %%mm7                      \n\t"
2287
-                "pxor %%mm0, %%mm0                      \n\t"
2286
+        "pxor %%mm7, %%mm7                      \n\t"
2287
+        "pxor %%mm0, %%mm0                      \n\t"
2288 2288
 #define REAL_L2_DIFF_CORE(a, b)\
2289
-                "movq " #a ", %%mm5                     \n\t"\
2290
-                "movq " #b ", %%mm2                     \n\t"\
2291
-                "movq %%mm5, %%mm1                      \n\t"\
2292
-                "movq %%mm2, %%mm3                      \n\t"\
2293
-                "punpcklbw %%mm7, %%mm5                 \n\t"\
2294
-                "punpckhbw %%mm7, %%mm1                 \n\t"\
2295
-                "punpcklbw %%mm7, %%mm2                 \n\t"\
2296
-                "punpckhbw %%mm7, %%mm3                 \n\t"\
2297
-                "psubw %%mm2, %%mm5                     \n\t"\
2298
-                "psubw %%mm3, %%mm1                     \n\t"\
2299
-                "pmaddwd %%mm5, %%mm5                   \n\t"\
2300
-                "pmaddwd %%mm1, %%mm1                   \n\t"\
2301
-                "paddd %%mm1, %%mm5                     \n\t"\
2302
-                "paddd %%mm5, %%mm0                     \n\t"
2289
+        "movq " #a ", %%mm5                     \n\t"\
2290
+        "movq " #b ", %%mm2                     \n\t"\
2291
+        "movq %%mm5, %%mm1                      \n\t"\
2292
+        "movq %%mm2, %%mm3                      \n\t"\
2293
+        "punpcklbw %%mm7, %%mm5                 \n\t"\
2294
+        "punpckhbw %%mm7, %%mm1                 \n\t"\
2295
+        "punpcklbw %%mm7, %%mm2                 \n\t"\
2296
+        "punpckhbw %%mm7, %%mm3                 \n\t"\
2297
+        "psubw %%mm2, %%mm5                     \n\t"\
2298
+        "psubw %%mm3, %%mm1                     \n\t"\
2299
+        "pmaddwd %%mm5, %%mm5                   \n\t"\
2300
+        "pmaddwd %%mm1, %%mm1                   \n\t"\
2301
+        "paddd %%mm1, %%mm5                     \n\t"\
2302
+        "paddd %%mm5, %%mm0                     \n\t"
2303 2303
 
2304 2304
 #endif //defined (FAST_L2_DIFF)
2305 2305
 
... ...
@@ -2316,239 +2277,237 @@ L2_DIFF_CORE((%0, %%REGc)  , (%1, %%REGc))
2316 2316
 
2317 2317
 #endif //L1_DIFF
2318 2318
 
2319
-                "movq %%mm0, %%mm4                      \n\t"
2320
-                "psrlq $32, %%mm0                       \n\t"
2321
-                "paddd %%mm0, %%mm4                     \n\t"
2322
-                "movd %%mm4, %%ecx                      \n\t"
2323
-                "shll $2, %%ecx                         \n\t"
2324
-                "mov %3, %%"REG_d"                      \n\t"
2325
-                "addl -4(%%"REG_d"), %%ecx              \n\t"
2326
-                "addl 4(%%"REG_d"), %%ecx               \n\t"
2327
-                "addl -1024(%%"REG_d"), %%ecx           \n\t"
2328
-                "addl $4, %%ecx                         \n\t"
2329
-                "addl 1024(%%"REG_d"), %%ecx            \n\t"
2330
-                "shrl $3, %%ecx                         \n\t"
2331
-                "movl %%ecx, (%%"REG_d")                \n\t"
2332
-
2333
-//                "mov %3, %%"REG_c"                      \n\t"
2334
-//                "mov %%"REG_c", test                    \n\t"
2335
-//                "jmp 4f                                 \n\t"
2336
-                "cmpl 512(%%"REG_d"), %%ecx             \n\t"
2337
-                " jb 2f                                 \n\t"
2338
-                "cmpl 516(%%"REG_d"), %%ecx             \n\t"
2339
-                " jb 1f                                 \n\t"
2340
-
2341
-                "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2342
-                "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2343
-                "movq (%0), %%mm0                       \n\t" // L0
2344
-                "movq (%0, %2), %%mm1                   \n\t" // L1
2345
-                "movq (%0, %2, 2), %%mm2                \n\t" // L2
2346
-                "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2347
-                "movq (%0, %2, 4), %%mm4                \n\t" // L4
2348
-                "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2349
-                "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2350
-                "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2351
-                "movq %%mm0, (%1)                       \n\t" // L0
2352
-                "movq %%mm1, (%1, %2)                   \n\t" // L1
2353
-                "movq %%mm2, (%1, %2, 2)                \n\t" // L2
2354
-                "movq %%mm3, (%1, %%"REG_a")            \n\t" // L3
2355
-                "movq %%mm4, (%1, %2, 4)                \n\t" // L4
2356
-                "movq %%mm5, (%1, %%"REG_d")            \n\t" // L5
2357
-                "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // L6
2358
-                "movq %%mm7, (%1, %%"REG_c")            \n\t" // L7
2359
-                "jmp 4f                                 \n\t"
2360
-
2361
-                "1:                                     \n\t"
2362
-                "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2363
-                "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2364
-                "movq (%0), %%mm0                       \n\t" // L0
2365
-                PAVGB((%1), %%mm0)                            // L0
2366
-                "movq (%0, %2), %%mm1                   \n\t" // L1
2367
-                PAVGB((%1, %2), %%mm1)                        // L1
2368
-                "movq (%0, %2, 2), %%mm2                \n\t" // L2
2369
-                PAVGB((%1, %2, 2), %%mm2)                     // L2
2370
-                "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2371
-                PAVGB((%1, %%REGa), %%mm3)                    // L3
2372
-                "movq (%0, %2, 4), %%mm4                \n\t" // L4
2373
-                PAVGB((%1, %2, 4), %%mm4)                     // L4
2374
-                "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2375
-                PAVGB((%1, %%REGd), %%mm5)                    // L5
2376
-                "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2377
-                PAVGB((%1, %%REGa, 2), %%mm6)                 // L6
2378
-                "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2379
-                PAVGB((%1, %%REGc), %%mm7)                    // L7
2380
-                "movq %%mm0, (%1)                       \n\t" // R0
2381
-                "movq %%mm1, (%1, %2)                   \n\t" // R1
2382
-                "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2383
-                "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2384
-                "movq %%mm4, (%1, %2, 4)                \n\t" // R4
2385
-                "movq %%mm5, (%1, %%"REG_d")            \n\t" // R5
2386
-                "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // R6
2387
-                "movq %%mm7, (%1, %%"REG_c")            \n\t" // R7
2388
-                "movq %%mm0, (%0)                       \n\t" // L0
2389
-                "movq %%mm1, (%0, %2)                   \n\t" // L1
2390
-                "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2391
-                "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2392
-                "movq %%mm4, (%0, %2, 4)                \n\t" // L4
2393
-                "movq %%mm5, (%0, %%"REG_d")            \n\t" // L5
2394
-                "movq %%mm6, (%0, %%"REG_a", 2)         \n\t" // L6
2395
-                "movq %%mm7, (%0, %%"REG_c")            \n\t" // L7
2396
-                "jmp 4f                                 \n\t"
2397
-
2398
-                "2:                                     \n\t"
2399
-                "cmpl 508(%%"REG_d"), %%ecx             \n\t"
2400
-                " jb 3f                                 \n\t"
2401
-
2402
-                "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2403
-                "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2404
-                "movq (%0), %%mm0                       \n\t" // L0
2405
-                "movq (%0, %2), %%mm1                   \n\t" // L1
2406
-                "movq (%0, %2, 2), %%mm2                \n\t" // L2
2407
-                "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2408
-                "movq (%1), %%mm4                       \n\t" // R0
2409
-                "movq (%1, %2), %%mm5                   \n\t" // R1
2410
-                "movq (%1, %2, 2), %%mm6                \n\t" // R2
2411
-                "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
2412
-                PAVGB(%%mm4, %%mm0)
2413
-                PAVGB(%%mm5, %%mm1)
2414
-                PAVGB(%%mm6, %%mm2)
2415
-                PAVGB(%%mm7, %%mm3)
2416
-                PAVGB(%%mm4, %%mm0)
2417
-                PAVGB(%%mm5, %%mm1)
2418
-                PAVGB(%%mm6, %%mm2)
2419
-                PAVGB(%%mm7, %%mm3)
2420
-                "movq %%mm0, (%1)                       \n\t" // R0
2421
-                "movq %%mm1, (%1, %2)                   \n\t" // R1
2422
-                "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2423
-                "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2424
-                "movq %%mm0, (%0)                       \n\t" // L0
2425
-                "movq %%mm1, (%0, %2)                   \n\t" // L1
2426
-                "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2427
-                "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2428
-
2429
-                "movq (%0, %2, 4), %%mm0                \n\t" // L4
2430
-                "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
2431
-                "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
2432
-                "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
2433
-                "movq (%1, %2, 4), %%mm4                \n\t" // R4
2434
-                "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
2435
-                "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
2436
-                "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
2437
-                PAVGB(%%mm4, %%mm0)
2438
-                PAVGB(%%mm5, %%mm1)
2439
-                PAVGB(%%mm6, %%mm2)
2440
-                PAVGB(%%mm7, %%mm3)
2441
-                PAVGB(%%mm4, %%mm0)
2442
-                PAVGB(%%mm5, %%mm1)
2443
-                PAVGB(%%mm6, %%mm2)
2444
-                PAVGB(%%mm7, %%mm3)
2445
-                "movq %%mm0, (%1, %2, 4)                \n\t" // R4
2446
-                "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
2447
-                "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
2448
-                "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
2449
-                "movq %%mm0, (%0, %2, 4)                \n\t" // L4
2450
-                "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
2451
-                "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
2452
-                "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
2453
-                "jmp 4f                                 \n\t"
2454
-
2455
-                "3:                                     \n\t"
2456
-                "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2457
-                "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2458
-                "movq (%0), %%mm0                       \n\t" // L0
2459
-                "movq (%0, %2), %%mm1                   \n\t" // L1
2460
-                "movq (%0, %2, 2), %%mm2                \n\t" // L2
2461
-                "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2462
-                "movq (%1), %%mm4                       \n\t" // R0
2463
-                "movq (%1, %2), %%mm5                   \n\t" // R1
2464
-                "movq (%1, %2, 2), %%mm6                \n\t" // R2
2465
-                "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
2466
-                PAVGB(%%mm4, %%mm0)
2467
-                PAVGB(%%mm5, %%mm1)
2468
-                PAVGB(%%mm6, %%mm2)
2469
-                PAVGB(%%mm7, %%mm3)
2470
-                PAVGB(%%mm4, %%mm0)
2471
-                PAVGB(%%mm5, %%mm1)
2472
-                PAVGB(%%mm6, %%mm2)
2473
-                PAVGB(%%mm7, %%mm3)
2474
-                PAVGB(%%mm4, %%mm0)
2475
-                PAVGB(%%mm5, %%mm1)
2476
-                PAVGB(%%mm6, %%mm2)
2477
-                PAVGB(%%mm7, %%mm3)
2478
-                "movq %%mm0, (%1)                       \n\t" // R0
2479
-                "movq %%mm1, (%1, %2)                   \n\t" // R1
2480
-                "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2481
-                "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2482
-                "movq %%mm0, (%0)                       \n\t" // L0
2483
-                "movq %%mm1, (%0, %2)                   \n\t" // L1
2484
-                "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2485
-                "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2486
-
2487
-                "movq (%0, %2, 4), %%mm0                \n\t" // L4
2488
-                "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
2489
-                "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
2490
-                "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
2491
-                "movq (%1, %2, 4), %%mm4                \n\t" // R4
2492
-                "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
2493
-                "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
2494
-                "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
2495
-                PAVGB(%%mm4, %%mm0)
2496
-                PAVGB(%%mm5, %%mm1)
2497
-                PAVGB(%%mm6, %%mm2)
2498
-                PAVGB(%%mm7, %%mm3)
2499
-                PAVGB(%%mm4, %%mm0)
2500
-                PAVGB(%%mm5, %%mm1)
2501
-                PAVGB(%%mm6, %%mm2)
2502
-                PAVGB(%%mm7, %%mm3)
2503
-                PAVGB(%%mm4, %%mm0)
2504
-                PAVGB(%%mm5, %%mm1)
2505
-                PAVGB(%%mm6, %%mm2)
2506
-                PAVGB(%%mm7, %%mm3)
2507
-                "movq %%mm0, (%1, %2, 4)                \n\t" // R4
2508
-                "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
2509
-                "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
2510
-                "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
2511
-                "movq %%mm0, (%0, %2, 4)                \n\t" // L4
2512
-                "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
2513
-                "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
2514
-                "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
2515
-
2516
-                "4:                                     \n\t"
2517
-
2518
-                :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
2519
-                : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2520
-                );
2319
+        "movq %%mm0, %%mm4                      \n\t"
2320
+        "psrlq $32, %%mm0                       \n\t"
2321
+        "paddd %%mm0, %%mm4                     \n\t"
2322
+        "movd %%mm4, %%ecx                      \n\t"
2323
+        "shll $2, %%ecx                         \n\t"
2324
+        "mov %3, %%"REG_d"                      \n\t"
2325
+        "addl -4(%%"REG_d"), %%ecx              \n\t"
2326
+        "addl 4(%%"REG_d"), %%ecx               \n\t"
2327
+        "addl -1024(%%"REG_d"), %%ecx           \n\t"
2328
+        "addl $4, %%ecx                         \n\t"
2329
+        "addl 1024(%%"REG_d"), %%ecx            \n\t"
2330
+        "shrl $3, %%ecx                         \n\t"
2331
+        "movl %%ecx, (%%"REG_d")                \n\t"
2332
+
2333
+//        "mov %3, %%"REG_c"                      \n\t"
2334
+//        "mov %%"REG_c", test                    \n\t"
2335
+//        "jmp 4f                                 \n\t"
2336
+        "cmpl 512(%%"REG_d"), %%ecx             \n\t"
2337
+        " jb 2f                                 \n\t"
2338
+        "cmpl 516(%%"REG_d"), %%ecx             \n\t"
2339
+        " jb 1f                                 \n\t"
2340
+
2341
+        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2342
+        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2343
+        "movq (%0), %%mm0                       \n\t" // L0
2344
+        "movq (%0, %2), %%mm1                   \n\t" // L1
2345
+        "movq (%0, %2, 2), %%mm2                \n\t" // L2
2346
+        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2347
+        "movq (%0, %2, 4), %%mm4                \n\t" // L4
2348
+        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2349
+        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2350
+        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2351
+        "movq %%mm0, (%1)                       \n\t" // L0
2352
+        "movq %%mm1, (%1, %2)                   \n\t" // L1
2353
+        "movq %%mm2, (%1, %2, 2)                \n\t" // L2
2354
+        "movq %%mm3, (%1, %%"REG_a")            \n\t" // L3
2355
+        "movq %%mm4, (%1, %2, 4)                \n\t" // L4
2356
+        "movq %%mm5, (%1, %%"REG_d")            \n\t" // L5
2357
+        "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // L6
2358
+        "movq %%mm7, (%1, %%"REG_c")            \n\t" // L7
2359
+        "jmp 4f                                 \n\t"
2360
+
2361
+        "1:                                     \n\t"
2362
+        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2363
+        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2364
+        "movq (%0), %%mm0                       \n\t" // L0
2365
+        PAVGB((%1), %%mm0)                            // L0
2366
+        "movq (%0, %2), %%mm1                   \n\t" // L1
2367
+        PAVGB((%1, %2), %%mm1)                        // L1
2368
+        "movq (%0, %2, 2), %%mm2                \n\t" // L2
2369
+        PAVGB((%1, %2, 2), %%mm2)                     // L2
2370
+        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2371
+        PAVGB((%1, %%REGa), %%mm3)                    // L3
2372
+        "movq (%0, %2, 4), %%mm4                \n\t" // L4
2373
+        PAVGB((%1, %2, 4), %%mm4)                     // L4
2374
+        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2375
+        PAVGB((%1, %%REGd), %%mm5)                    // L5
2376
+        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2377
+        PAVGB((%1, %%REGa, 2), %%mm6)                 // L6
2378
+        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2379
+        PAVGB((%1, %%REGc), %%mm7)                    // L7
2380
+        "movq %%mm0, (%1)                       \n\t" // R0
2381
+        "movq %%mm1, (%1, %2)                   \n\t" // R1
2382
+        "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2383
+        "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2384
+        "movq %%mm4, (%1, %2, 4)                \n\t" // R4
2385
+        "movq %%mm5, (%1, %%"REG_d")            \n\t" // R5
2386
+        "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // R6
2387
+        "movq %%mm7, (%1, %%"REG_c")            \n\t" // R7
2388
+        "movq %%mm0, (%0)                       \n\t" // L0
2389
+        "movq %%mm1, (%0, %2)                   \n\t" // L1
2390
+        "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2391
+        "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2392
+        "movq %%mm4, (%0, %2, 4)                \n\t" // L4
2393
+        "movq %%mm5, (%0, %%"REG_d")            \n\t" // L5
2394
+        "movq %%mm6, (%0, %%"REG_a", 2)         \n\t" // L6
2395
+        "movq %%mm7, (%0, %%"REG_c")            \n\t" // L7
2396
+        "jmp 4f                                 \n\t"
2397
+
2398
+        "2:                                     \n\t"
2399
+        "cmpl 508(%%"REG_d"), %%ecx             \n\t"
2400
+        " jb 3f                                 \n\t"
2401
+
2402
+        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2403
+        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2404
+        "movq (%0), %%mm0                       \n\t" // L0
2405
+        "movq (%0, %2), %%mm1                   \n\t" // L1
2406
+        "movq (%0, %2, 2), %%mm2                \n\t" // L2
2407
+        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2408
+        "movq (%1), %%mm4                       \n\t" // R0
2409
+        "movq (%1, %2), %%mm5                   \n\t" // R1
2410
+        "movq (%1, %2, 2), %%mm6                \n\t" // R2
2411
+        "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
2412
+        PAVGB(%%mm4, %%mm0)
2413
+        PAVGB(%%mm5, %%mm1)
2414
+        PAVGB(%%mm6, %%mm2)
2415
+        PAVGB(%%mm7, %%mm3)
2416
+        PAVGB(%%mm4, %%mm0)
2417
+        PAVGB(%%mm5, %%mm1)
2418
+        PAVGB(%%mm6, %%mm2)
2419
+        PAVGB(%%mm7, %%mm3)
2420
+        "movq %%mm0, (%1)                       \n\t" // R0
2421
+        "movq %%mm1, (%1, %2)                   \n\t" // R1
2422
+        "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2423
+        "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2424
+        "movq %%mm0, (%0)                       \n\t" // L0
2425
+        "movq %%mm1, (%0, %2)                   \n\t" // L1
2426
+        "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2427
+        "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2428
+
2429
+        "movq (%0, %2, 4), %%mm0                \n\t" // L4
2430
+        "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
2431
+        "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
2432
+        "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
2433
+        "movq (%1, %2, 4), %%mm4                \n\t" // R4
2434
+        "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
2435
+        "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
2436
+        "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
2437
+        PAVGB(%%mm4, %%mm0)
2438
+        PAVGB(%%mm5, %%mm1)
2439
+        PAVGB(%%mm6, %%mm2)
2440
+        PAVGB(%%mm7, %%mm3)
2441
+        PAVGB(%%mm4, %%mm0)
2442
+        PAVGB(%%mm5, %%mm1)
2443
+        PAVGB(%%mm6, %%mm2)
2444
+        PAVGB(%%mm7, %%mm3)
2445
+        "movq %%mm0, (%1, %2, 4)                \n\t" // R4
2446
+        "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
2447
+        "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
2448
+        "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
2449
+        "movq %%mm0, (%0, %2, 4)                \n\t" // L4
2450
+        "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
2451
+        "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
2452
+        "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
2453
+        "jmp 4f                                 \n\t"
2454
+
2455
+        "3:                                     \n\t"
2456
+        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2457
+        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2458
+        "movq (%0), %%mm0                       \n\t" // L0
2459
+        "movq (%0, %2), %%mm1                   \n\t" // L1
2460
+        "movq (%0, %2, 2), %%mm2                \n\t" // L2
2461
+        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2462
+        "movq (%1), %%mm4                       \n\t" // R0
2463
+        "movq (%1, %2), %%mm5                   \n\t" // R1
2464
+        "movq (%1, %2, 2), %%mm6                \n\t" // R2
2465
+        "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
2466
+        PAVGB(%%mm4, %%mm0)
2467
+        PAVGB(%%mm5, %%mm1)
2468
+        PAVGB(%%mm6, %%mm2)
2469
+        PAVGB(%%mm7, %%mm3)
2470
+        PAVGB(%%mm4, %%mm0)
2471
+        PAVGB(%%mm5, %%mm1)
2472
+        PAVGB(%%mm6, %%mm2)
2473
+        PAVGB(%%mm7, %%mm3)
2474
+        PAVGB(%%mm4, %%mm0)
2475
+        PAVGB(%%mm5, %%mm1)
2476
+        PAVGB(%%mm6, %%mm2)
2477
+        PAVGB(%%mm7, %%mm3)
2478
+        "movq %%mm0, (%1)                       \n\t" // R0
2479
+        "movq %%mm1, (%1, %2)                   \n\t" // R1
2480
+        "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2481
+        "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2482
+        "movq %%mm0, (%0)                       \n\t" // L0
2483
+        "movq %%mm1, (%0, %2)                   \n\t" // L1
2484
+        "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2485
+        "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2486
+
2487
+        "movq (%0, %2, 4), %%mm0                \n\t" // L4
2488
+        "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
2489
+        "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
2490
+        "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
2491
+        "movq (%1, %2, 4), %%mm4                \n\t" // R4
2492
+        "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
2493
+        "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
2494
+        "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
2495
+        PAVGB(%%mm4, %%mm0)
2496
+        PAVGB(%%mm5, %%mm1)
2497
+        PAVGB(%%mm6, %%mm2)
2498
+        PAVGB(%%mm7, %%mm3)
2499
+        PAVGB(%%mm4, %%mm0)
2500
+        PAVGB(%%mm5, %%mm1)
2501
+        PAVGB(%%mm6, %%mm2)
2502
+        PAVGB(%%mm7, %%mm3)
2503
+        PAVGB(%%mm4, %%mm0)
2504
+        PAVGB(%%mm5, %%mm1)
2505
+        PAVGB(%%mm6, %%mm2)
2506
+        PAVGB(%%mm7, %%mm3)
2507
+        "movq %%mm0, (%1, %2, 4)                \n\t" // R4
2508
+        "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
2509
+        "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
2510
+        "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
2511
+        "movq %%mm0, (%0, %2, 4)                \n\t" // L4
2512
+        "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
2513
+        "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
2514
+        "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
2515
+
2516
+        "4:                                     \n\t"
2517
+
2518
+        :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
2519
+        : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2520
+    );
2521 2521
 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2522 2522
 {
2523
-        int y;
2524
-        int d=0;
2525
-//        int sysd=0;
2526
-        int i;
2523
+    int y;
2524
+    int d=0;
2525
+//    int sysd=0;
2526
+    int i;
2527 2527
 
2528
-        for(y=0; y<8; y++)
2529
-        {
2530
-                int x;
2531
-                for(x=0; x<8; x++)
2532
-                {
2533
-                        int ref= tempBlured[ x + y*stride ];
2534
-                        int cur= src[ x + y*stride ];
2535
-                        int d1=ref - cur;
2536
-//                        if(x==0 || x==7) d1+= d1>>1;
2537
-//                        if(y==0 || y==7) d1+= d1>>1;
2538
-//                        d+= FFABS(d1);
2539
-                        d+= d1*d1;
2540
-//                        sysd+= d1;
2541
-                }
2528
+    for(y=0; y<8; y++){
2529
+        int x;
2530
+        for(x=0; x<8; x++){
2531
+            int ref= tempBlured[ x + y*stride ];
2532
+            int cur= src[ x + y*stride ];
2533
+            int d1=ref - cur;
2534
+//            if(x==0 || x==7) d1+= d1>>1;
2535
+//            if(y==0 || y==7) d1+= d1>>1;
2536
+//            d+= FFABS(d1);
2537
+            d+= d1*d1;
2538
+//            sysd+= d1;
2542 2539
         }
2543
-        i=d;
2544
-        d=         (
2545
-                4*d
2546
-                +(*(tempBluredPast-256))
2547
-                +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2548
-                +(*(tempBluredPast+256))
2549
-                +4)>>3;
2550
-        *tempBluredPast=i;
2551
-//        ((*tempBluredPast)*3 + d + 2)>>2;
2540
+    }
2541
+    i=d;
2542
+    d=  (
2543
+        4*d
2544
+        +(*(tempBluredPast-256))
2545
+        +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2546
+        +(*(tempBluredPast+256))
2547
+        +4)>>3;
2548
+    *tempBluredPast=i;
2549
+//    ((*tempBluredPast)*3 + d + 2)>>2;
2552 2550
 
2553 2551
 /*
2554 2552
 Switch between
... ...
@@ -2557,68 +2516,51 @@ Switch between
2557 2557
 64 48 36 27 20 15 11 (33) (approx)
2558 2558
 64 56 49 43 37 33 29 (200) (approx)
2559 2559
 */
2560
-        if(d > maxNoise[1])
2561
-        {
2562
-                if(d < maxNoise[2])
2563
-                {
2564
-                        for(y=0; y<8; y++)
2565
-                        {
2566
-                                int x;
2567
-                                for(x=0; x<8; x++)
2568
-                                {
2569
-                                        int ref= tempBlured[ x + y*stride ];
2570
-                                        int cur= src[ x + y*stride ];
2571
-                                        tempBlured[ x + y*stride ]=
2572
-                                        src[ x + y*stride ]=
2573
-                                                (ref + cur + 1)>>1;
2574
-                                }
2575
-                        }
2560
+    if(d > maxNoise[1]){
2561
+        if(d < maxNoise[2]){
2562
+            for(y=0; y<8; y++){
2563
+                int x;
2564
+                for(x=0; x<8; x++){
2565
+                    int ref= tempBlured[ x + y*stride ];
2566
+                    int cur= src[ x + y*stride ];
2567
+                    tempBlured[ x + y*stride ]=
2568
+                    src[ x + y*stride ]=
2569
+                        (ref + cur + 1)>>1;
2576 2570
                 }
2577
-                else
2578
-                {
2579
-                        for(y=0; y<8; y++)
2580
-                        {
2581
-                                int x;
2582
-                                for(x=0; x<8; x++)
2583
-                                {
2584
-                                        tempBlured[ x + y*stride ]= src[ x + y*stride ];
2585
-                                }
2586
-                        }
2571
+            }
2572
+        }else{
2573
+            for(y=0; y<8; y++){
2574
+                int x;
2575
+                for(x=0; x<8; x++){
2576
+                    tempBlured[ x + y*stride ]= src[ x + y*stride ];
2587 2577
                 }
2578
+            }
2588 2579
         }
2589
-        else
2590
-        {
2591
-                if(d < maxNoise[0])
2592
-                {
2593
-                        for(y=0; y<8; y++)
2594
-                        {
2595
-                                int x;
2596
-                                for(x=0; x<8; x++)
2597
-                                {
2598
-                                        int ref= tempBlured[ x + y*stride ];
2599
-                                        int cur= src[ x + y*stride ];
2600
-                                        tempBlured[ x + y*stride ]=
2601
-                                        src[ x + y*stride ]=
2602
-                                                (ref*7 + cur + 4)>>3;
2603
-                                }
2604
-                        }
2580
+    }else{
2581
+        if(d < maxNoise[0]){
2582
+            for(y=0; y<8; y++){
2583
+                int x;
2584
+                for(x=0; x<8; x++){
2585
+                    int ref= tempBlured[ x + y*stride ];
2586
+                    int cur= src[ x + y*stride ];
2587
+                    tempBlured[ x + y*stride ]=
2588
+                    src[ x + y*stride ]=
2589
+                        (ref*7 + cur + 4)>>3;
2605 2590
                 }
2606
-                else
2607
-                {
2608
-                        for(y=0; y<8; y++)
2609
-                        {
2610
-                                int x;
2611
-                                for(x=0; x<8; x++)
2612
-                                {
2613
-                                        int ref= tempBlured[ x + y*stride ];
2614
-                                        int cur= src[ x + y*stride ];
2615
-                                        tempBlured[ x + y*stride ]=
2616
-                                        src[ x + y*stride ]=
2617
-                                                (ref*3 + cur + 2)>>2;
2618
-                                }
2619
-                        }
2591
+            }
2592
+        }else{
2593
+            for(y=0; y<8; y++){
2594
+                int x;
2595
+                for(x=0; x<8; x++){
2596
+                    int ref= tempBlured[ x + y*stride ];
2597
+                    int cur= src[ x + y*stride ];
2598
+                    tempBlured[ x + y*stride ]=
2599
+                    src[ x + y*stride ]=
2600
+                        (ref*3 + cur + 2)>>2;
2620 2601
                 }
2602
+            }
2621 2603
         }
2604
+    }
2622 2605
 }
2623 2606
 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2624 2607
 }
... ...
@@ -2629,531 +2571,531 @@ Switch between
2629 2629
  * accurate deblock filter
2630 2630
  */
2631 2631
 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2632
-        int64_t dc_mask, eq_mask, both_masks;
2633
-        int64_t sums[10*8*2];
2634
-        src+= step*3; // src points to begin of the 8x8 Block
2632
+    int64_t dc_mask, eq_mask, both_masks;
2633
+    int64_t sums[10*8*2];
2634
+    src+= step*3; // src points to begin of the 8x8 Block
2635 2635
 //START_TIMER
2636
-asm volatile(
2637
-                "movq %0, %%mm7                         \n\t"
2638
-                "movq %1, %%mm6                         \n\t"
2639
-                : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
2640
-                );
2641
-
2642
-asm volatile(
2643
-                "lea (%2, %3), %%"REG_a"                \n\t"
2636
+    asm volatile(
2637
+        "movq %0, %%mm7                         \n\t"
2638
+        "movq %1, %%mm6                         \n\t"
2639
+        : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
2640
+        );
2641
+
2642
+    asm volatile(
2643
+        "lea (%2, %3), %%"REG_a"                \n\t"
2644 2644
 //      0       1       2       3       4       5       6       7       8       9
2645 2645
 //      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2
2646 2646
 
2647
-                "movq (%2), %%mm0                       \n\t"
2648
-                "movq (%%"REG_a"), %%mm1                \n\t"
2649
-                "movq %%mm1, %%mm3                      \n\t"
2650
-                "movq %%mm1, %%mm4                      \n\t"
2651
-                "psubb %%mm1, %%mm0                     \n\t" // mm0 = differnece
2652
-                "paddb %%mm7, %%mm0                     \n\t"
2653
-                "pcmpgtb %%mm6, %%mm0                   \n\t"
2654
-
2655
-                "movq (%%"REG_a",%3), %%mm2             \n\t"
2656
-                PMAXUB(%%mm2, %%mm4)
2657
-                PMINUB(%%mm2, %%mm3, %%mm5)
2658
-                "psubb %%mm2, %%mm1                     \n\t"
2659
-                "paddb %%mm7, %%mm1                     \n\t"
2660
-                "pcmpgtb %%mm6, %%mm1                   \n\t"
2661
-                "paddb %%mm1, %%mm0                     \n\t"
2662
-
2663
-                "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
2664
-                PMAXUB(%%mm1, %%mm4)
2665
-                PMINUB(%%mm1, %%mm3, %%mm5)
2666
-                "psubb %%mm1, %%mm2                     \n\t"
2667
-                "paddb %%mm7, %%mm2                     \n\t"
2668
-                "pcmpgtb %%mm6, %%mm2                   \n\t"
2669
-                "paddb %%mm2, %%mm0                     \n\t"
2670
-
2671
-                "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
2672
-
2673
-                "movq (%2, %3, 4), %%mm2                \n\t"
2674
-                PMAXUB(%%mm2, %%mm4)
2675
-                PMINUB(%%mm2, %%mm3, %%mm5)
2676
-                "psubb %%mm2, %%mm1                     \n\t"
2677
-                "paddb %%mm7, %%mm1                     \n\t"
2678
-                "pcmpgtb %%mm6, %%mm1                   \n\t"
2679
-                "paddb %%mm1, %%mm0                     \n\t"
2680
-
2681
-                "movq (%%"REG_a"), %%mm1                \n\t"
2682
-                PMAXUB(%%mm1, %%mm4)
2683
-                PMINUB(%%mm1, %%mm3, %%mm5)
2684
-                "psubb %%mm1, %%mm2                     \n\t"
2685
-                "paddb %%mm7, %%mm2                     \n\t"
2686
-                "pcmpgtb %%mm6, %%mm2                   \n\t"
2687
-                "paddb %%mm2, %%mm0                     \n\t"
2688
-
2689
-                "movq (%%"REG_a", %3), %%mm2            \n\t"
2690
-                PMAXUB(%%mm2, %%mm4)
2691
-                PMINUB(%%mm2, %%mm3, %%mm5)
2692
-                "psubb %%mm2, %%mm1                     \n\t"
2693
-                "paddb %%mm7, %%mm1                     \n\t"
2694
-                "pcmpgtb %%mm6, %%mm1                   \n\t"
2695
-                "paddb %%mm1, %%mm0                     \n\t"
2696
-
2697
-                "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
2698
-                PMAXUB(%%mm1, %%mm4)
2699
-                PMINUB(%%mm1, %%mm3, %%mm5)
2700
-                "psubb %%mm1, %%mm2                     \n\t"
2701
-                "paddb %%mm7, %%mm2                     \n\t"
2702
-                "pcmpgtb %%mm6, %%mm2                   \n\t"
2703
-                "paddb %%mm2, %%mm0                     \n\t"
2704
-
2705
-                "movq (%2, %3, 8), %%mm2                \n\t"
2706
-                PMAXUB(%%mm2, %%mm4)
2707
-                PMINUB(%%mm2, %%mm3, %%mm5)
2708
-                "psubb %%mm2, %%mm1                     \n\t"
2709
-                "paddb %%mm7, %%mm1                     \n\t"
2710
-                "pcmpgtb %%mm6, %%mm1                   \n\t"
2711
-                "paddb %%mm1, %%mm0                     \n\t"
2712
-
2713
-                "movq (%%"REG_a", %3, 4), %%mm1         \n\t"
2714
-                "psubb %%mm1, %%mm2                     \n\t"
2715
-                "paddb %%mm7, %%mm2                     \n\t"
2716
-                "pcmpgtb %%mm6, %%mm2                   \n\t"
2717
-                "paddb %%mm2, %%mm0                     \n\t"
2718
-                "psubusb %%mm3, %%mm4                   \n\t"
2719
-
2720
-                "pxor %%mm6, %%mm6                      \n\t"
2721
-                "movq %4, %%mm7                         \n\t" // QP,..., QP
2722
-                "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
2723
-                "psubusb %%mm4, %%mm7                   \n\t" // Diff >=2QP -> 0
2724
-                "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
2725
-                "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
2726
-                "movq %%mm7, %1                         \n\t"
2727
-
2728
-                "movq %5, %%mm7                         \n\t"
2729
-                "punpcklbw %%mm7, %%mm7                 \n\t"
2730
-                "punpcklbw %%mm7, %%mm7                 \n\t"
2731
-                "punpcklbw %%mm7, %%mm7                 \n\t"
2732
-                "psubb %%mm0, %%mm6                     \n\t"
2733
-                "pcmpgtb %%mm7, %%mm6                   \n\t"
2734
-                "movq %%mm6, %0                         \n\t"
2735
-
2736
-                : "=m" (eq_mask), "=m" (dc_mask)
2737
-                : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2738
-                : "%"REG_a
2739
-                );
2740
-
2741
-        both_masks = dc_mask & eq_mask;
2742
-
2743
-        if(both_masks){
2744
-                long offset= -8*step;
2745
-                int64_t *temp_sums= sums;
2746
-
2747
-                asm volatile(
2748
-                "movq %2, %%mm0                         \n\t"  // QP,..., QP
2749
-                "pxor %%mm4, %%mm4                      \n\t"
2750
-
2751
-                "movq (%0), %%mm6                       \n\t"
2752
-                "movq (%0, %1), %%mm5                   \n\t"
2753
-                "movq %%mm5, %%mm1                      \n\t"
2754
-                "movq %%mm6, %%mm2                      \n\t"
2755
-                "psubusb %%mm6, %%mm5                   \n\t"
2756
-                "psubusb %%mm1, %%mm2                   \n\t"
2757
-                "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
2758
-                "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
2759
-                "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
2760
-
2761
-                "pxor %%mm6, %%mm1                      \n\t"
2762
-                "pand %%mm0, %%mm1                      \n\t"
2763
-                "pxor %%mm1, %%mm6                      \n\t"
2764
-                // 0:QP  6:First
2765
-
2766
-                "movq (%0, %1, 8), %%mm5                \n\t"
2767
-                "add %1, %0                             \n\t" // %0 points to line 1 not 0
2768
-                "movq (%0, %1, 8), %%mm7                \n\t"
2769
-                "movq %%mm5, %%mm1                      \n\t"
2770
-                "movq %%mm7, %%mm2                      \n\t"
2771
-                "psubusb %%mm7, %%mm5                   \n\t"
2772
-                "psubusb %%mm1, %%mm2                   \n\t"
2773
-                "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
2774
-                "movq %2, %%mm0                         \n\t"  // QP,..., QP
2775
-                "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
2776
-                "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
2777
-
2778
-                "pxor %%mm7, %%mm1                      \n\t"
2779
-                "pand %%mm0, %%mm1                      \n\t"
2780
-                "pxor %%mm1, %%mm7                      \n\t"
2781
-
2782
-                "movq %%mm6, %%mm5                      \n\t"
2783
-                "punpckhbw %%mm4, %%mm6                 \n\t"
2784
-                "punpcklbw %%mm4, %%mm5                 \n\t"
2785
-                // 4:0 5/6:First 7:Last
2786
-
2787
-                "movq %%mm5, %%mm0                      \n\t"
2788
-                "movq %%mm6, %%mm1                      \n\t"
2789
-                "psllw $2, %%mm0                        \n\t"
2790
-                "psllw $2, %%mm1                        \n\t"
2791
-                "paddw "MANGLE(w04)", %%mm0             \n\t"
2792
-                "paddw "MANGLE(w04)", %%mm1             \n\t"
2647
+        "movq (%2), %%mm0                       \n\t"
2648
+        "movq (%%"REG_a"), %%mm1                \n\t"
2649
+        "movq %%mm1, %%mm3                      \n\t"
2650
+        "movq %%mm1, %%mm4                      \n\t"
2651
+        "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
2652
+        "paddb %%mm7, %%mm0                     \n\t"
2653
+        "pcmpgtb %%mm6, %%mm0                   \n\t"
2654
+
2655
+        "movq (%%"REG_a",%3), %%mm2             \n\t"
2656
+        PMAXUB(%%mm2, %%mm4)
2657
+        PMINUB(%%mm2, %%mm3, %%mm5)
2658
+        "psubb %%mm2, %%mm1                     \n\t"
2659
+        "paddb %%mm7, %%mm1                     \n\t"
2660
+        "pcmpgtb %%mm6, %%mm1                   \n\t"
2661
+        "paddb %%mm1, %%mm0                     \n\t"
2662
+
2663
+        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
2664
+        PMAXUB(%%mm1, %%mm4)
2665
+        PMINUB(%%mm1, %%mm3, %%mm5)
2666
+        "psubb %%mm1, %%mm2                     \n\t"
2667
+        "paddb %%mm7, %%mm2                     \n\t"
2668
+        "pcmpgtb %%mm6, %%mm2                   \n\t"
2669
+        "paddb %%mm2, %%mm0                     \n\t"
2670
+
2671
+        "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
2672
+
2673
+        "movq (%2, %3, 4), %%mm2                \n\t"
2674
+        PMAXUB(%%mm2, %%mm4)
2675
+        PMINUB(%%mm2, %%mm3, %%mm5)
2676
+        "psubb %%mm2, %%mm1                     \n\t"
2677
+        "paddb %%mm7, %%mm1                     \n\t"
2678
+        "pcmpgtb %%mm6, %%mm1                   \n\t"
2679
+        "paddb %%mm1, %%mm0                     \n\t"
2680
+
2681
+        "movq (%%"REG_a"), %%mm1                \n\t"
2682
+        PMAXUB(%%mm1, %%mm4)
2683
+        PMINUB(%%mm1, %%mm3, %%mm5)
2684
+        "psubb %%mm1, %%mm2                     \n\t"
2685
+        "paddb %%mm7, %%mm2                     \n\t"
2686
+        "pcmpgtb %%mm6, %%mm2                   \n\t"
2687
+        "paddb %%mm2, %%mm0                     \n\t"
2688
+
2689
+        "movq (%%"REG_a", %3), %%mm2            \n\t"
2690
+        PMAXUB(%%mm2, %%mm4)
2691
+        PMINUB(%%mm2, %%mm3, %%mm5)
2692
+        "psubb %%mm2, %%mm1                     \n\t"
2693
+        "paddb %%mm7, %%mm1                     \n\t"
2694
+        "pcmpgtb %%mm6, %%mm1                   \n\t"
2695
+        "paddb %%mm1, %%mm0                     \n\t"
2696
+
2697
+        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
2698
+        PMAXUB(%%mm1, %%mm4)
2699
+        PMINUB(%%mm1, %%mm3, %%mm5)
2700
+        "psubb %%mm1, %%mm2                     \n\t"
2701
+        "paddb %%mm7, %%mm2                     \n\t"
2702
+        "pcmpgtb %%mm6, %%mm2                   \n\t"
2703
+        "paddb %%mm2, %%mm0                     \n\t"
2704
+
2705
+        "movq (%2, %3, 8), %%mm2                \n\t"
2706
+        PMAXUB(%%mm2, %%mm4)
2707
+        PMINUB(%%mm2, %%mm3, %%mm5)
2708
+        "psubb %%mm2, %%mm1                     \n\t"
2709
+        "paddb %%mm7, %%mm1                     \n\t"
2710
+        "pcmpgtb %%mm6, %%mm1                   \n\t"
2711
+        "paddb %%mm1, %%mm0                     \n\t"
2712
+
2713
+        "movq (%%"REG_a", %3, 4), %%mm1         \n\t"
2714
+        "psubb %%mm1, %%mm2                     \n\t"
2715
+        "paddb %%mm7, %%mm2                     \n\t"
2716
+        "pcmpgtb %%mm6, %%mm2                   \n\t"
2717
+        "paddb %%mm2, %%mm0                     \n\t"
2718
+        "psubusb %%mm3, %%mm4                   \n\t"
2719
+
2720
+        "pxor %%mm6, %%mm6                      \n\t"
2721
+        "movq %4, %%mm7                         \n\t" // QP,..., QP
2722
+        "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
2723
+        "psubusb %%mm4, %%mm7                   \n\t" // Diff >=2QP -> 0
2724
+        "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
2725
+        "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> FF
2726
+        "movq %%mm7, %1                         \n\t"
2727
+
2728
+        "movq %5, %%mm7                         \n\t"
2729
+        "punpcklbw %%mm7, %%mm7                 \n\t"
2730
+        "punpcklbw %%mm7, %%mm7                 \n\t"
2731
+        "punpcklbw %%mm7, %%mm7                 \n\t"
2732
+        "psubb %%mm0, %%mm6                     \n\t"
2733
+        "pcmpgtb %%mm7, %%mm6                   \n\t"
2734
+        "movq %%mm6, %0                         \n\t"
2735
+
2736
+        : "=m" (eq_mask), "=m" (dc_mask)
2737
+        : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2738
+        : "%"REG_a
2739
+    );
2740
+
2741
+    both_masks = dc_mask & eq_mask;
2742
+
2743
+    if(both_masks){
2744
+        long offset= -8*step;
2745
+        int64_t *temp_sums= sums;
2746
+
2747
+        asm volatile(
2748
+            "movq %2, %%mm0                         \n\t"  // QP,..., QP
2749
+            "pxor %%mm4, %%mm4                      \n\t"
2750
+
2751
+            "movq (%0), %%mm6                       \n\t"
2752
+            "movq (%0, %1), %%mm5                   \n\t"
2753
+            "movq %%mm5, %%mm1                      \n\t"
2754
+            "movq %%mm6, %%mm2                      \n\t"
2755
+            "psubusb %%mm6, %%mm5                   \n\t"
2756
+            "psubusb %%mm1, %%mm2                   \n\t"
2757
+            "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
2758
+            "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
2759
+            "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
2760
+
2761
+            "pxor %%mm6, %%mm1                      \n\t"
2762
+            "pand %%mm0, %%mm1                      \n\t"
2763
+            "pxor %%mm1, %%mm6                      \n\t"
2764
+            // 0:QP  6:First
2765
+
2766
+            "movq (%0, %1, 8), %%mm5                \n\t"
2767
+            "add %1, %0                             \n\t" // %0 points to line 1 not 0
2768
+            "movq (%0, %1, 8), %%mm7                \n\t"
2769
+            "movq %%mm5, %%mm1                      \n\t"
2770
+            "movq %%mm7, %%mm2                      \n\t"
2771
+            "psubusb %%mm7, %%mm5                   \n\t"
2772
+            "psubusb %%mm1, %%mm2                   \n\t"
2773
+            "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
2774
+            "movq %2, %%mm0                         \n\t"  // QP,..., QP
2775
+            "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
2776
+            "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
2777
+
2778
+            "pxor %%mm7, %%mm1                      \n\t"
2779
+            "pand %%mm0, %%mm1                      \n\t"
2780
+            "pxor %%mm1, %%mm7                      \n\t"
2781
+
2782
+            "movq %%mm6, %%mm5                      \n\t"
2783
+            "punpckhbw %%mm4, %%mm6                 \n\t"
2784
+            "punpcklbw %%mm4, %%mm5                 \n\t"
2785
+            // 4:0 5/6:First 7:Last
2786
+
2787
+            "movq %%mm5, %%mm0                      \n\t"
2788
+            "movq %%mm6, %%mm1                      \n\t"
2789
+            "psllw $2, %%mm0                        \n\t"
2790
+            "psllw $2, %%mm1                        \n\t"
2791
+            "paddw "MANGLE(w04)", %%mm0             \n\t"
2792
+            "paddw "MANGLE(w04)", %%mm1             \n\t"
2793 2793
 
2794 2794
 #define NEXT\
2795
-                "movq (%0), %%mm2                       \n\t"\
2796
-                "movq (%0), %%mm3                       \n\t"\
2797
-                "add %1, %0                             \n\t"\
2798
-                "punpcklbw %%mm4, %%mm2                 \n\t"\
2799
-                "punpckhbw %%mm4, %%mm3                 \n\t"\
2800
-                "paddw %%mm2, %%mm0                     \n\t"\
2801
-                "paddw %%mm3, %%mm1                     \n\t"
2795
+            "movq (%0), %%mm2                       \n\t"\
2796
+            "movq (%0), %%mm3                       \n\t"\
2797
+            "add %1, %0                             \n\t"\
2798
+            "punpcklbw %%mm4, %%mm2                 \n\t"\
2799
+            "punpckhbw %%mm4, %%mm3                 \n\t"\
2800
+            "paddw %%mm2, %%mm0                     \n\t"\
2801
+            "paddw %%mm3, %%mm1                     \n\t"
2802 2802
 
2803 2803
 #define PREV\
2804
-                "movq (%0), %%mm2                       \n\t"\
2805
-                "movq (%0), %%mm3                       \n\t"\
2806
-                "add %1, %0                             \n\t"\
2807
-                "punpcklbw %%mm4, %%mm2                 \n\t"\
2808
-                "punpckhbw %%mm4, %%mm3                 \n\t"\
2809
-                "psubw %%mm2, %%mm0                     \n\t"\
2810
-                "psubw %%mm3, %%mm1                     \n\t"
2811
-
2812
-
2813
-                NEXT //0
2814
-                NEXT //1
2815
-                NEXT //2
2816
-                "movq %%mm0, (%3)                       \n\t"
2817
-                "movq %%mm1, 8(%3)                      \n\t"
2818
-
2819
-                NEXT //3
2820
-                "psubw %%mm5, %%mm0                     \n\t"
2821
-                "psubw %%mm6, %%mm1                     \n\t"
2822
-                "movq %%mm0, 16(%3)                     \n\t"
2823
-                "movq %%mm1, 24(%3)                     \n\t"
2824
-
2825
-                NEXT //4
2826
-                "psubw %%mm5, %%mm0                     \n\t"
2827
-                "psubw %%mm6, %%mm1                     \n\t"
2828
-                "movq %%mm0, 32(%3)                     \n\t"
2829
-                "movq %%mm1, 40(%3)                     \n\t"
2830
-
2831
-                NEXT //5
2832
-                "psubw %%mm5, %%mm0                     \n\t"
2833
-                "psubw %%mm6, %%mm1                     \n\t"
2834
-                "movq %%mm0, 48(%3)                     \n\t"
2835
-                "movq %%mm1, 56(%3)                     \n\t"
2836
-
2837
-                NEXT //6
2838
-                "psubw %%mm5, %%mm0                     \n\t"
2839
-                "psubw %%mm6, %%mm1                     \n\t"
2840
-                "movq %%mm0, 64(%3)                     \n\t"
2841
-                "movq %%mm1, 72(%3)                     \n\t"
2842
-
2843
-                "movq %%mm7, %%mm6                      \n\t"
2844
-                "punpckhbw %%mm4, %%mm7                 \n\t"
2845
-                "punpcklbw %%mm4, %%mm6                 \n\t"
2846
-
2847
-                NEXT //7
2848
-                "mov %4, %0                             \n\t"
2849
-                "add %1, %0                             \n\t"
2850
-                PREV //0
2851
-                "movq %%mm0, 80(%3)                     \n\t"
2852
-                "movq %%mm1, 88(%3)                     \n\t"
2853
-
2854
-                PREV //1
2855
-                "paddw %%mm6, %%mm0                     \n\t"
2856
-                "paddw %%mm7, %%mm1                     \n\t"
2857
-                "movq %%mm0, 96(%3)                     \n\t"
2858
-                "movq %%mm1, 104(%3)                    \n\t"
2859
-
2860
-                PREV //2
2861
-                "paddw %%mm6, %%mm0                     \n\t"
2862
-                "paddw %%mm7, %%mm1                     \n\t"
2863
-                "movq %%mm0, 112(%3)                    \n\t"
2864
-                "movq %%mm1, 120(%3)                    \n\t"
2865
-
2866
-                PREV //3
2867
-                "paddw %%mm6, %%mm0                     \n\t"
2868
-                "paddw %%mm7, %%mm1                     \n\t"
2869
-                "movq %%mm0, 128(%3)                    \n\t"
2870
-                "movq %%mm1, 136(%3)                    \n\t"
2871
-
2872
-                PREV //4
2873
-                "paddw %%mm6, %%mm0                     \n\t"
2874
-                "paddw %%mm7, %%mm1                     \n\t"
2875
-                "movq %%mm0, 144(%3)                    \n\t"
2876
-                "movq %%mm1, 152(%3)                    \n\t"
2877
-
2878
-                "mov %4, %0                             \n\t" //FIXME
2879
-
2880
-                : "+&r"(src)
2881
-                : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
2882
-                );
2883
-
2884
-                src+= step; // src points to begin of the 8x8 Block
2885
-
2886
-                asm volatile(
2887
-                "movq %4, %%mm6                         \n\t"
2888
-                "pcmpeqb %%mm5, %%mm5                   \n\t"
2889
-                "pxor %%mm6, %%mm5                      \n\t"
2890
-                "pxor %%mm7, %%mm7                      \n\t"
2891
-
2892
-                "1:                                     \n\t"
2893
-                "movq (%1), %%mm0                       \n\t"
2894
-                "movq 8(%1), %%mm1                      \n\t"
2895
-                "paddw 32(%1), %%mm0                    \n\t"
2896
-                "paddw 40(%1), %%mm1                    \n\t"
2897
-                "movq (%0, %3), %%mm2                   \n\t"
2898
-                "movq %%mm2, %%mm3                      \n\t"
2899
-                "movq %%mm2, %%mm4                      \n\t"
2900
-                "punpcklbw %%mm7, %%mm2                 \n\t"
2901
-                "punpckhbw %%mm7, %%mm3                 \n\t"
2902
-                "paddw %%mm2, %%mm0                     \n\t"
2903
-                "paddw %%mm3, %%mm1                     \n\t"
2904
-                "paddw %%mm2, %%mm0                     \n\t"
2905
-                "paddw %%mm3, %%mm1                     \n\t"
2906
-                "psrlw $4, %%mm0                        \n\t"
2907
-                "psrlw $4, %%mm1                        \n\t"
2908
-                "packuswb %%mm1, %%mm0                  \n\t"
2909
-                "pand %%mm6, %%mm0                      \n\t"
2910
-                "pand %%mm5, %%mm4                      \n\t"
2911
-                "por %%mm4, %%mm0                       \n\t"
2912
-                "movq %%mm0, (%0, %3)                   \n\t"
2913
-                "add $16, %1                            \n\t"
2914
-                "add %2, %0                             \n\t"
2915
-                " js 1b                                 \n\t"
2916
-
2917
-                : "+r"(offset), "+r"(temp_sums)
2918
-                : "r" ((long)step), "r"(src - offset), "m"(both_masks)
2919
-                );
2920
-        }else
2921
-                src+= step; // src points to begin of the 8x8 Block
2922
-
2923
-        if(eq_mask != -1LL){
2924
-                uint8_t *temp_src= src;
2925
-                asm volatile(
2926
-                "pxor %%mm7, %%mm7                      \n\t"
2927
-                "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
2928
-                "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
2804
+            "movq (%0), %%mm2                       \n\t"\
2805
+            "movq (%0), %%mm3                       \n\t"\
2806
+            "add %1, %0                             \n\t"\
2807
+            "punpcklbw %%mm4, %%mm2                 \n\t"\
2808
+            "punpckhbw %%mm4, %%mm3                 \n\t"\
2809
+            "psubw %%mm2, %%mm0                     \n\t"\
2810
+            "psubw %%mm3, %%mm1                     \n\t"
2811
+
2812
+
2813
+            NEXT //0
2814
+            NEXT //1
2815
+            NEXT //2
2816
+            "movq %%mm0, (%3)                       \n\t"
2817
+            "movq %%mm1, 8(%3)                      \n\t"
2818
+
2819
+            NEXT //3
2820
+            "psubw %%mm5, %%mm0                     \n\t"
2821
+            "psubw %%mm6, %%mm1                     \n\t"
2822
+            "movq %%mm0, 16(%3)                     \n\t"
2823
+            "movq %%mm1, 24(%3)                     \n\t"
2824
+
2825
+            NEXT //4
2826
+            "psubw %%mm5, %%mm0                     \n\t"
2827
+            "psubw %%mm6, %%mm1                     \n\t"
2828
+            "movq %%mm0, 32(%3)                     \n\t"
2829
+            "movq %%mm1, 40(%3)                     \n\t"
2830
+
2831
+            NEXT //5
2832
+            "psubw %%mm5, %%mm0                     \n\t"
2833
+            "psubw %%mm6, %%mm1                     \n\t"
2834
+            "movq %%mm0, 48(%3)                     \n\t"
2835
+            "movq %%mm1, 56(%3)                     \n\t"
2836
+
2837
+            NEXT //6
2838
+            "psubw %%mm5, %%mm0                     \n\t"
2839
+            "psubw %%mm6, %%mm1                     \n\t"
2840
+            "movq %%mm0, 64(%3)                     \n\t"
2841
+            "movq %%mm1, 72(%3)                     \n\t"
2842
+
2843
+            "movq %%mm7, %%mm6                      \n\t"
2844
+            "punpckhbw %%mm4, %%mm7                 \n\t"
2845
+            "punpcklbw %%mm4, %%mm6                 \n\t"
2846
+
2847
+            NEXT //7
2848
+            "mov %4, %0                             \n\t"
2849
+            "add %1, %0                             \n\t"
2850
+            PREV //0
2851
+            "movq %%mm0, 80(%3)                     \n\t"
2852
+            "movq %%mm1, 88(%3)                     \n\t"
2853
+
2854
+            PREV //1
2855
+            "paddw %%mm6, %%mm0                     \n\t"
2856
+            "paddw %%mm7, %%mm1                     \n\t"
2857
+            "movq %%mm0, 96(%3)                     \n\t"
2858
+            "movq %%mm1, 104(%3)                    \n\t"
2859
+
2860
+            PREV //2
2861
+            "paddw %%mm6, %%mm0                     \n\t"
2862
+            "paddw %%mm7, %%mm1                     \n\t"
2863
+            "movq %%mm0, 112(%3)                    \n\t"
2864
+            "movq %%mm1, 120(%3)                    \n\t"
2865
+
2866
+            PREV //3
2867
+            "paddw %%mm6, %%mm0                     \n\t"
2868
+            "paddw %%mm7, %%mm1                     \n\t"
2869
+            "movq %%mm0, 128(%3)                    \n\t"
2870
+            "movq %%mm1, 136(%3)                    \n\t"
2871
+
2872
+            PREV //4
2873
+            "paddw %%mm6, %%mm0                     \n\t"
2874
+            "paddw %%mm7, %%mm1                     \n\t"
2875
+            "movq %%mm0, 144(%3)                    \n\t"
2876
+            "movq %%mm1, 152(%3)                    \n\t"
2877
+
2878
+            "mov %4, %0                             \n\t" //FIXME
2879
+
2880
+            : "+&r"(src)
2881
+            : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
2882
+        );
2883
+
2884
+        src+= step; // src points to begin of the 8x8 Block
2885
+
2886
+        asm volatile(
2887
+            "movq %4, %%mm6                         \n\t"
2888
+            "pcmpeqb %%mm5, %%mm5                   \n\t"
2889
+            "pxor %%mm6, %%mm5                      \n\t"
2890
+            "pxor %%mm7, %%mm7                      \n\t"
2891
+
2892
+            "1:                                     \n\t"
2893
+            "movq (%1), %%mm0                       \n\t"
2894
+            "movq 8(%1), %%mm1                      \n\t"
2895
+            "paddw 32(%1), %%mm0                    \n\t"
2896
+            "paddw 40(%1), %%mm1                    \n\t"
2897
+            "movq (%0, %3), %%mm2                   \n\t"
2898
+            "movq %%mm2, %%mm3                      \n\t"
2899
+            "movq %%mm2, %%mm4                      \n\t"
2900
+            "punpcklbw %%mm7, %%mm2                 \n\t"
2901
+            "punpckhbw %%mm7, %%mm3                 \n\t"
2902
+            "paddw %%mm2, %%mm0                     \n\t"
2903
+            "paddw %%mm3, %%mm1                     \n\t"
2904
+            "paddw %%mm2, %%mm0                     \n\t"
2905
+            "paddw %%mm3, %%mm1                     \n\t"
2906
+            "psrlw $4, %%mm0                        \n\t"
2907
+            "psrlw $4, %%mm1                        \n\t"
2908
+            "packuswb %%mm1, %%mm0                  \n\t"
2909
+            "pand %%mm6, %%mm0                      \n\t"
2910
+            "pand %%mm5, %%mm4                      \n\t"
2911
+            "por %%mm4, %%mm0                       \n\t"
2912
+            "movq %%mm0, (%0, %3)                   \n\t"
2913
+            "add $16, %1                            \n\t"
2914
+            "add %2, %0                             \n\t"
2915
+            " js 1b                                 \n\t"
2916
+
2917
+            : "+r"(offset), "+r"(temp_sums)
2918
+            : "r" ((long)step), "r"(src - offset), "m"(both_masks)
2919
+        );
2920
+    }else
2921
+        src+= step; // src points to begin of the 8x8 Block
2922
+
2923
+    if(eq_mask != -1LL){
2924
+        uint8_t *temp_src= src;
2925
+        asm volatile(
2926
+            "pxor %%mm7, %%mm7                      \n\t"
2927
+            "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
2928
+            "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
2929 2929
 //      0       1       2       3       4       5       6       7       8       9
2930 2930
 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
2931 2931
 
2932
-                "movq (%0), %%mm0                       \n\t"
2933
-                "movq %%mm0, %%mm1                      \n\t"
2934
-                "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
2935
-                "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
2936
-
2937
-                "movq (%0, %1), %%mm2                   \n\t"
2938
-                "lea (%0, %1, 2), %%"REG_a"             \n\t"
2939
-                "movq %%mm2, %%mm3                      \n\t"
2940
-                "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
2941
-                "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
2942
-
2943
-                "movq (%%"REG_a"), %%mm4                \n\t"
2944
-                "movq %%mm4, %%mm5                      \n\t"
2945
-                "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
2946
-                "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
2947
-
2948
-                "paddw %%mm0, %%mm0                     \n\t" // 2L0
2949
-                "paddw %%mm1, %%mm1                     \n\t" // 2H0
2950
-                "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
2951
-                "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
2952
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
2953
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
2954
-
2955
-                "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
2956
-                "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
2957
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
2958
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
2959
-
2960
-                "movq (%%"REG_a", %1), %%mm2            \n\t"
2961
-                "movq %%mm2, %%mm3                      \n\t"
2962
-                "punpcklbw %%mm7, %%mm2                 \n\t" // L3
2963
-                "punpckhbw %%mm7, %%mm3                 \n\t" // H3
2964
-
2965
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
2966
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
2967
-                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2968
-                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2969
-                "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2970
-                "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2971
-
2972
-                "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
2973
-                "movq %%mm0, %%mm1                      \n\t"
2974
-                "punpcklbw %%mm7, %%mm0                 \n\t" // L4
2975
-                "punpckhbw %%mm7, %%mm1                 \n\t" // H4
2976
-
2977
-                "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
2978
-                "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
2979
-                "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
2980
-                "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
2981
-                "paddw %%mm4, %%mm4                     \n\t" // 2L2
2982
-                "paddw %%mm5, %%mm5                     \n\t" // 2H2
2983
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
2984
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
2985
-
2986
-                "lea (%%"REG_a", %1), %0                \n\t"
2987
-                "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
2988
-                "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
2989
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
2990
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
2932
+            "movq (%0), %%mm0                       \n\t"
2933
+            "movq %%mm0, %%mm1                      \n\t"
2934
+            "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
2935
+            "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
2936
+
2937
+            "movq (%0, %1), %%mm2                   \n\t"
2938
+            "lea (%0, %1, 2), %%"REG_a"             \n\t"
2939
+            "movq %%mm2, %%mm3                      \n\t"
2940
+            "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
2941
+            "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
2942
+
2943
+            "movq (%%"REG_a"), %%mm4                \n\t"
2944
+            "movq %%mm4, %%mm5                      \n\t"
2945
+            "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
2946
+            "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
2947
+
2948
+            "paddw %%mm0, %%mm0                     \n\t" // 2L0
2949
+            "paddw %%mm1, %%mm1                     \n\t" // 2H0
2950
+            "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
2951
+            "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
2952
+            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
2953
+            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
2954
+
2955
+            "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
2956
+            "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
2957
+            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
2958
+            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
2959
+
2960
+            "movq (%%"REG_a", %1), %%mm2            \n\t"
2961
+            "movq %%mm2, %%mm3                      \n\t"
2962
+            "punpcklbw %%mm7, %%mm2                 \n\t" // L3
2963
+            "punpckhbw %%mm7, %%mm3                 \n\t" // H3
2964
+
2965
+            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
2966
+            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
2967
+            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2968
+            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2969
+            "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2970
+            "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2971
+
2972
+            "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
2973
+            "movq %%mm0, %%mm1                      \n\t"
2974
+            "punpcklbw %%mm7, %%mm0                 \n\t" // L4
2975
+            "punpckhbw %%mm7, %%mm1                 \n\t" // H4
2976
+
2977
+            "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
2978
+            "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
2979
+            "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
2980
+            "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
2981
+            "paddw %%mm4, %%mm4                     \n\t" // 2L2
2982
+            "paddw %%mm5, %%mm5                     \n\t" // 2H2
2983
+            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
2984
+            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
2985
+
2986
+            "lea (%%"REG_a", %1), %0                \n\t"
2987
+            "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
2988
+            "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
2989
+            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
2990
+            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
2991 2991
 //50 opcodes so far
2992
-                "movq (%0, %1, 2), %%mm2                \n\t"
2993
-                "movq %%mm2, %%mm3                      \n\t"
2994
-                "punpcklbw %%mm7, %%mm2                 \n\t" // L5
2995
-                "punpckhbw %%mm7, %%mm3                 \n\t" // H5
2996
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
2997
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
2998
-                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2999
-                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
3000
-
3001
-                "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
3002
-                "punpcklbw %%mm7, %%mm6                 \n\t" // L6
3003
-                "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
3004
-                "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
3005
-                "punpckhbw %%mm7, %%mm6                 \n\t" // H6
3006
-                "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
3007
-
3008
-                "paddw %%mm0, %%mm0                     \n\t" // 2L4
3009
-                "paddw %%mm1, %%mm1                     \n\t" // 2H4
3010
-                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
3011
-                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
3012
-
3013
-                "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
3014
-                "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
3015
-                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
3016
-                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
3017
-
3018
-                "movq (%0, %1, 4), %%mm2                \n\t"
3019
-                "movq %%mm2, %%mm3                      \n\t"
3020
-                "punpcklbw %%mm7, %%mm2                 \n\t" // L7
3021
-                "punpckhbw %%mm7, %%mm3                 \n\t" // H7
3022
-
3023
-                "paddw %%mm2, %%mm2                     \n\t" // 2L7
3024
-                "paddw %%mm3, %%mm3                     \n\t" // 2H7
3025
-                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3026
-                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3027
-
3028
-                "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3029
-                "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2992
+            "movq (%0, %1, 2), %%mm2                \n\t"
2993
+            "movq %%mm2, %%mm3                      \n\t"
2994
+            "punpcklbw %%mm7, %%mm2                 \n\t" // L5
2995
+            "punpckhbw %%mm7, %%mm3                 \n\t" // H5
2996
+            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
2997
+            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
2998
+            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2999
+            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
3000
+
3001
+            "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
3002
+            "punpcklbw %%mm7, %%mm6                 \n\t" // L6
3003
+            "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
3004
+            "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
3005
+            "punpckhbw %%mm7, %%mm6                 \n\t" // H6
3006
+            "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
3007
+
3008
+            "paddw %%mm0, %%mm0                     \n\t" // 2L4
3009
+            "paddw %%mm1, %%mm1                     \n\t" // 2H4
3010
+            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
3011
+            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
3012
+
3013
+            "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
3014
+            "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
3015
+            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
3016
+            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
3017
+
3018
+            "movq (%0, %1, 4), %%mm2                \n\t"
3019
+            "movq %%mm2, %%mm3                      \n\t"
3020
+            "punpcklbw %%mm7, %%mm2                 \n\t" // L7
3021
+            "punpckhbw %%mm7, %%mm3                 \n\t" // H7
3022
+
3023
+            "paddw %%mm2, %%mm2                     \n\t" // 2L7
3024
+            "paddw %%mm3, %%mm3                     \n\t" // 2H7
3025
+            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3026
+            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3027
+
3028
+            "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3029
+            "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3030 3030
 
3031 3031
 #ifdef HAVE_MMX2
3032
-                "movq %%mm7, %%mm6                      \n\t" // 0
3033
-                "psubw %%mm0, %%mm6                     \n\t"
3034
-                "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3035
-                "movq %%mm7, %%mm6                      \n\t" // 0
3036
-                "psubw %%mm1, %%mm6                     \n\t"
3037
-                "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3038
-                "movq %%mm7, %%mm6                      \n\t" // 0
3039
-                "psubw %%mm2, %%mm6                     \n\t"
3040
-                "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3041
-                "movq %%mm7, %%mm6                      \n\t" // 0
3042
-                "psubw %%mm3, %%mm6                     \n\t"
3043
-                "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3032
+            "movq %%mm7, %%mm6                      \n\t" // 0
3033
+            "psubw %%mm0, %%mm6                     \n\t"
3034
+            "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3035
+            "movq %%mm7, %%mm6                      \n\t" // 0
3036
+            "psubw %%mm1, %%mm6                     \n\t"
3037
+            "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3038
+            "movq %%mm7, %%mm6                      \n\t" // 0
3039
+            "psubw %%mm2, %%mm6                     \n\t"
3040
+            "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3041
+            "movq %%mm7, %%mm6                      \n\t" // 0
3042
+            "psubw %%mm3, %%mm6                     \n\t"
3043
+            "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3044 3044
 #else
3045
-                "movq %%mm7, %%mm6                      \n\t" // 0
3046
-                "pcmpgtw %%mm0, %%mm6                   \n\t"
3047
-                "pxor %%mm6, %%mm0                      \n\t"
3048
-                "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3049
-                "movq %%mm7, %%mm6                      \n\t" // 0
3050
-                "pcmpgtw %%mm1, %%mm6                   \n\t"
3051
-                "pxor %%mm6, %%mm1                      \n\t"
3052
-                "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3053
-                "movq %%mm7, %%mm6                      \n\t" // 0
3054
-                "pcmpgtw %%mm2, %%mm6                   \n\t"
3055
-                "pxor %%mm6, %%mm2                      \n\t"
3056
-                "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3057
-                "movq %%mm7, %%mm6                      \n\t" // 0
3058
-                "pcmpgtw %%mm3, %%mm6                   \n\t"
3059
-                "pxor %%mm6, %%mm3                      \n\t"
3060
-                "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3045
+            "movq %%mm7, %%mm6                      \n\t" // 0
3046
+            "pcmpgtw %%mm0, %%mm6                   \n\t"
3047
+            "pxor %%mm6, %%mm0                      \n\t"
3048
+            "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3049
+            "movq %%mm7, %%mm6                      \n\t" // 0
3050
+            "pcmpgtw %%mm1, %%mm6                   \n\t"
3051
+            "pxor %%mm6, %%mm1                      \n\t"
3052
+            "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3053
+            "movq %%mm7, %%mm6                      \n\t" // 0
3054
+            "pcmpgtw %%mm2, %%mm6                   \n\t"
3055
+            "pxor %%mm6, %%mm2                      \n\t"
3056
+            "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3057
+            "movq %%mm7, %%mm6                      \n\t" // 0
3058
+            "pcmpgtw %%mm3, %%mm6                   \n\t"
3059
+            "pxor %%mm6, %%mm3                      \n\t"
3060
+            "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3061 3061
 #endif
3062 3062
 
3063 3063
 #ifdef HAVE_MMX2
3064
-                "pminsw %%mm2, %%mm0                    \n\t"
3065
-                "pminsw %%mm3, %%mm1                    \n\t"
3064
+            "pminsw %%mm2, %%mm0                    \n\t"
3065
+            "pminsw %%mm3, %%mm1                    \n\t"
3066 3066
 #else
3067
-                "movq %%mm0, %%mm6                      \n\t"
3068
-                "psubusw %%mm2, %%mm6                   \n\t"
3069
-                "psubw %%mm6, %%mm0                     \n\t"
3070
-                "movq %%mm1, %%mm6                      \n\t"
3071
-                "psubusw %%mm3, %%mm6                   \n\t"
3072
-                "psubw %%mm6, %%mm1                     \n\t"
3067
+            "movq %%mm0, %%mm6                      \n\t"
3068
+            "psubusw %%mm2, %%mm6                   \n\t"
3069
+            "psubw %%mm6, %%mm0                     \n\t"
3070
+            "movq %%mm1, %%mm6                      \n\t"
3071
+            "psubusw %%mm3, %%mm6                   \n\t"
3072
+            "psubw %%mm6, %%mm1                     \n\t"
3073 3073
 #endif
3074 3074
 
3075
-                "movd %2, %%mm2                         \n\t" // QP
3076
-                "punpcklbw %%mm7, %%mm2                 \n\t"
3075
+            "movd %2, %%mm2                         \n\t" // QP
3076
+            "punpcklbw %%mm7, %%mm2                 \n\t"
3077 3077
 
3078
-                "movq %%mm7, %%mm6                      \n\t" // 0
3079
-                "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3080
-                "pxor %%mm6, %%mm4                      \n\t"
3081
-                "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3082
-                "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3083
-                "pxor %%mm7, %%mm5                      \n\t"
3084
-                "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3078
+            "movq %%mm7, %%mm6                      \n\t" // 0
3079
+            "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3080
+            "pxor %%mm6, %%mm4                      \n\t"
3081
+            "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3082
+            "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3083
+            "pxor %%mm7, %%mm5                      \n\t"
3084
+            "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3085 3085
 // 100 opcodes
3086
-                "psllw $3, %%mm2                        \n\t" // 8QP
3087
-                "movq %%mm2, %%mm3                      \n\t" // 8QP
3088
-                "pcmpgtw %%mm4, %%mm2                   \n\t"
3089
-                "pcmpgtw %%mm5, %%mm3                   \n\t"
3090
-                "pand %%mm2, %%mm4                      \n\t"
3091
-                "pand %%mm3, %%mm5                      \n\t"
3092
-
3093
-
3094
-                "psubusw %%mm0, %%mm4                   \n\t" // hd
3095
-                "psubusw %%mm1, %%mm5                   \n\t" // ld
3096
-
3097
-
3098
-                "movq "MANGLE(w05)", %%mm2              \n\t" // 5
3099
-                "pmullw %%mm2, %%mm4                    \n\t"
3100
-                "pmullw %%mm2, %%mm5                    \n\t"
3101
-                "movq "MANGLE(w20)", %%mm2              \n\t" // 32
3102
-                "paddw %%mm2, %%mm4                     \n\t"
3103
-                "paddw %%mm2, %%mm5                     \n\t"
3104
-                "psrlw $6, %%mm4                        \n\t"
3105
-                "psrlw $6, %%mm5                        \n\t"
3106
-
3107
-                "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
3108
-                "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
3109
-
3110
-                "pxor %%mm2, %%mm2                      \n\t"
3111
-                "pxor %%mm3, %%mm3                      \n\t"
3112
-
3113
-                "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
3114
-                "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
3115
-                "pxor %%mm2, %%mm0                      \n\t"
3116
-                "pxor %%mm3, %%mm1                      \n\t"
3117
-                "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
3118
-                "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
3119
-                "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
3120
-                "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
3121
-
3122
-                "pxor %%mm6, %%mm2                      \n\t"
3123
-                "pxor %%mm7, %%mm3                      \n\t"
3124
-                "pand %%mm2, %%mm4                      \n\t"
3125
-                "pand %%mm3, %%mm5                      \n\t"
3086
+            "psllw $3, %%mm2                        \n\t" // 8QP
3087
+            "movq %%mm2, %%mm3                      \n\t" // 8QP
3088
+            "pcmpgtw %%mm4, %%mm2                   \n\t"
3089
+            "pcmpgtw %%mm5, %%mm3                   \n\t"
3090
+            "pand %%mm2, %%mm4                      \n\t"
3091
+            "pand %%mm3, %%mm5                      \n\t"
3092
+
3093
+
3094
+            "psubusw %%mm0, %%mm4                   \n\t" // hd
3095
+            "psubusw %%mm1, %%mm5                   \n\t" // ld
3096
+
3097
+
3098
+            "movq "MANGLE(w05)", %%mm2              \n\t" // 5
3099
+            "pmullw %%mm2, %%mm4                    \n\t"
3100
+            "pmullw %%mm2, %%mm5                    \n\t"
3101
+            "movq "MANGLE(w20)", %%mm2              \n\t" // 32
3102
+            "paddw %%mm2, %%mm4                     \n\t"
3103
+            "paddw %%mm2, %%mm5                     \n\t"
3104
+            "psrlw $6, %%mm4                        \n\t"
3105
+            "psrlw $6, %%mm5                        \n\t"
3106
+
3107
+            "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
3108
+            "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
3109
+
3110
+            "pxor %%mm2, %%mm2                      \n\t"
3111
+            "pxor %%mm3, %%mm3                      \n\t"
3112
+
3113
+            "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
3114
+            "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
3115
+            "pxor %%mm2, %%mm0                      \n\t"
3116
+            "pxor %%mm3, %%mm1                      \n\t"
3117
+            "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
3118
+            "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
3119
+            "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
3120
+            "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
3121
+
3122
+            "pxor %%mm6, %%mm2                      \n\t"
3123
+            "pxor %%mm7, %%mm3                      \n\t"
3124
+            "pand %%mm2, %%mm4                      \n\t"
3125
+            "pand %%mm3, %%mm5                      \n\t"
3126 3126
 
3127 3127
 #ifdef HAVE_MMX2
3128
-                "pminsw %%mm0, %%mm4                    \n\t"
3129
-                "pminsw %%mm1, %%mm5                    \n\t"
3128
+            "pminsw %%mm0, %%mm4                    \n\t"
3129
+            "pminsw %%mm1, %%mm5                    \n\t"
3130 3130
 #else
3131
-                "movq %%mm4, %%mm2                      \n\t"
3132
-                "psubusw %%mm0, %%mm2                   \n\t"
3133
-                "psubw %%mm2, %%mm4                     \n\t"
3134
-                "movq %%mm5, %%mm2                      \n\t"
3135
-                "psubusw %%mm1, %%mm2                   \n\t"
3136
-                "psubw %%mm2, %%mm5                     \n\t"
3131
+            "movq %%mm4, %%mm2                      \n\t"
3132
+            "psubusw %%mm0, %%mm2                   \n\t"
3133
+            "psubw %%mm2, %%mm4                     \n\t"
3134
+            "movq %%mm5, %%mm2                      \n\t"
3135
+            "psubusw %%mm1, %%mm2                   \n\t"
3136
+            "psubw %%mm2, %%mm5                     \n\t"
3137 3137
 #endif
3138
-                "pxor %%mm6, %%mm4                      \n\t"
3139
-                "pxor %%mm7, %%mm5                      \n\t"
3140
-                "psubw %%mm6, %%mm4                     \n\t"
3141
-                "psubw %%mm7, %%mm5                     \n\t"
3142
-                "packsswb %%mm5, %%mm4                  \n\t"
3143
-                "movq %3, %%mm1                         \n\t"
3144
-                "pandn %%mm4, %%mm1                     \n\t"
3145
-                "movq (%0), %%mm0                       \n\t"
3146
-                "paddb   %%mm1, %%mm0                   \n\t"
3147
-                "movq %%mm0, (%0)                       \n\t"
3148
-                "movq (%0, %1), %%mm0                   \n\t"
3149
-                "psubb %%mm1, %%mm0                     \n\t"
3150
-                "movq %%mm0, (%0, %1)                   \n\t"
3151
-
3152
-                : "+r" (temp_src)
3153
-                : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
3154
-                : "%"REG_a, "%"REG_c
3155
-                );
3156
-        }
3138
+            "pxor %%mm6, %%mm4                      \n\t"
3139
+            "pxor %%mm7, %%mm5                      \n\t"
3140
+            "psubw %%mm6, %%mm4                     \n\t"
3141
+            "psubw %%mm7, %%mm5                     \n\t"
3142
+            "packsswb %%mm5, %%mm4                  \n\t"
3143
+            "movq %3, %%mm1                         \n\t"
3144
+            "pandn %%mm4, %%mm1                     \n\t"
3145
+            "movq (%0), %%mm0                       \n\t"
3146
+            "paddb   %%mm1, %%mm0                   \n\t"
3147
+            "movq %%mm0, (%0)                       \n\t"
3148
+            "movq (%0, %1), %%mm0                   \n\t"
3149
+            "psubb %%mm1, %%mm0                     \n\t"
3150
+            "movq %%mm0, (%0, %1)                   \n\t"
3151
+
3152
+            : "+r" (temp_src)
3153
+            : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
3154
+            : "%"REG_a, "%"REG_c
3155
+        );
3156
+    }
3157 3157
 /*if(step==16){
3158 3158
     STOP_TIMER("step16")
3159 3159
 }else{
... ...
@@ -3163,7 +3105,7 @@ asm volatile(
3163 3163
 #endif //HAVE_MMX
3164 3164
 
3165 3165
 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3166
-        const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
3166
+                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
3167 3167
 
3168 3168
 /**
3169 3169
  * Copies a block from src to dst and fixes the blacklevel.
... ...
@@ -3172,69 +3114,68 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
3172 3172
 #undef SCALED_CPY
3173 3173
 
3174 3174
 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3175
-        int levelFix, int64_t *packedOffsetAndScale)
3175
+                                     int levelFix, int64_t *packedOffsetAndScale)
3176 3176
 {
3177 3177
 #ifndef HAVE_MMX
3178
-        int i;
3178
+    int i;
3179 3179
 #endif
3180
-        if(levelFix)
3181
-        {
3180
+    if(levelFix){
3182 3181
 #ifdef HAVE_MMX
3183
-                asm volatile(
3184
-                        "movq (%%"REG_a"), %%mm2        \n\t" // packedYOffset
3185
-                        "movq 8(%%"REG_a"), %%mm3       \n\t" // packedYScale
3186
-                        "lea (%2,%4), %%"REG_a"         \n\t"
3187
-                        "lea (%3,%5), %%"REG_d"         \n\t"
3188
-                        "pxor %%mm4, %%mm4              \n\t"
3182
+    asm volatile(
3183
+        "movq (%%"REG_a"), %%mm2        \n\t" // packedYOffset
3184
+        "movq 8(%%"REG_a"), %%mm3       \n\t" // packedYScale
3185
+        "lea (%2,%4), %%"REG_a"         \n\t"
3186
+        "lea (%3,%5), %%"REG_d"         \n\t"
3187
+        "pxor %%mm4, %%mm4              \n\t"
3189 3188
 #ifdef HAVE_MMX2
3190 3189
 #define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                                \
3191
-                        "movq " #src1 ", %%mm0          \n\t"\
3192
-                        "movq " #src1 ", %%mm5          \n\t"\
3193
-                        "movq " #src2 ", %%mm1          \n\t"\
3194
-                        "movq " #src2 ", %%mm6          \n\t"\
3195
-                        "punpcklbw %%mm0, %%mm0         \n\t"\
3196
-                        "punpckhbw %%mm5, %%mm5         \n\t"\
3197
-                        "punpcklbw %%mm1, %%mm1         \n\t"\
3198
-                        "punpckhbw %%mm6, %%mm6         \n\t"\
3199
-                        "pmulhuw %%mm3, %%mm0           \n\t"\
3200
-                        "pmulhuw %%mm3, %%mm5           \n\t"\
3201
-                        "pmulhuw %%mm3, %%mm1           \n\t"\
3202
-                        "pmulhuw %%mm3, %%mm6           \n\t"\
3203
-                        "psubw %%mm2, %%mm0             \n\t"\
3204
-                        "psubw %%mm2, %%mm5             \n\t"\
3205
-                        "psubw %%mm2, %%mm1             \n\t"\
3206
-                        "psubw %%mm2, %%mm6             \n\t"\
3207
-                        "packuswb %%mm5, %%mm0          \n\t"\
3208
-                        "packuswb %%mm6, %%mm1          \n\t"\
3209
-                        "movq %%mm0, " #dst1 "          \n\t"\
3210
-                        "movq %%mm1, " #dst2 "          \n\t"\
3190
+        "movq " #src1 ", %%mm0          \n\t"\
3191
+        "movq " #src1 ", %%mm5          \n\t"\
3192
+        "movq " #src2 ", %%mm1          \n\t"\
3193
+        "movq " #src2 ", %%mm6          \n\t"\
3194
+        "punpcklbw %%mm0, %%mm0         \n\t"\
3195
+        "punpckhbw %%mm5, %%mm5         \n\t"\
3196
+        "punpcklbw %%mm1, %%mm1         \n\t"\
3197
+        "punpckhbw %%mm6, %%mm6         \n\t"\
3198
+        "pmulhuw %%mm3, %%mm0           \n\t"\
3199
+        "pmulhuw %%mm3, %%mm5           \n\t"\
3200
+        "pmulhuw %%mm3, %%mm1           \n\t"\
3201
+        "pmulhuw %%mm3, %%mm6           \n\t"\
3202
+        "psubw %%mm2, %%mm0             \n\t"\
3203
+        "psubw %%mm2, %%mm5             \n\t"\
3204
+        "psubw %%mm2, %%mm1             \n\t"\
3205
+        "psubw %%mm2, %%mm6             \n\t"\
3206
+        "packuswb %%mm5, %%mm0          \n\t"\
3207
+        "packuswb %%mm6, %%mm1          \n\t"\
3208
+        "movq %%mm0, " #dst1 "          \n\t"\
3209
+        "movq %%mm1, " #dst2 "          \n\t"\
3211 3210
 
3212 3211
 #else //HAVE_MMX2
3213 3212
 #define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                        \
3214
-                        "movq " #src1 ", %%mm0          \n\t"\
3215
-                        "movq " #src1 ", %%mm5          \n\t"\
3216
-                        "punpcklbw %%mm4, %%mm0         \n\t"\
3217
-                        "punpckhbw %%mm4, %%mm5         \n\t"\
3218
-                        "psubw %%mm2, %%mm0             \n\t"\
3219
-                        "psubw %%mm2, %%mm5             \n\t"\
3220
-                        "movq " #src2 ", %%mm1          \n\t"\
3221
-                        "psllw $6, %%mm0                \n\t"\
3222
-                        "psllw $6, %%mm5                \n\t"\
3223
-                        "pmulhw %%mm3, %%mm0            \n\t"\
3224
-                        "movq " #src2 ", %%mm6          \n\t"\
3225
-                        "pmulhw %%mm3, %%mm5            \n\t"\
3226
-                        "punpcklbw %%mm4, %%mm1         \n\t"\
3227
-                        "punpckhbw %%mm4, %%mm6         \n\t"\
3228
-                        "psubw %%mm2, %%mm1             \n\t"\
3229
-                        "psubw %%mm2, %%mm6             \n\t"\
3230
-                        "psllw $6, %%mm1                \n\t"\
3231
-                        "psllw $6, %%mm6                \n\t"\
3232
-                        "pmulhw %%mm3, %%mm1            \n\t"\
3233
-                        "pmulhw %%mm3, %%mm6            \n\t"\
3234
-                        "packuswb %%mm5, %%mm0          \n\t"\
3235
-                        "packuswb %%mm6, %%mm1          \n\t"\
3236
-                        "movq %%mm0, " #dst1 "          \n\t"\
3237
-                        "movq %%mm1, " #dst2 "          \n\t"\
3213
+        "movq " #src1 ", %%mm0          \n\t"\
3214
+        "movq " #src1 ", %%mm5          \n\t"\
3215
+        "punpcklbw %%mm4, %%mm0         \n\t"\
3216
+        "punpckhbw %%mm4, %%mm5         \n\t"\
3217
+        "psubw %%mm2, %%mm0             \n\t"\
3218
+        "psubw %%mm2, %%mm5             \n\t"\
3219
+        "movq " #src2 ", %%mm1          \n\t"\
3220
+        "psllw $6, %%mm0                \n\t"\
3221
+        "psllw $6, %%mm5                \n\t"\
3222
+        "pmulhw %%mm3, %%mm0            \n\t"\
3223
+        "movq " #src2 ", %%mm6          \n\t"\
3224
+        "pmulhw %%mm3, %%mm5            \n\t"\
3225
+        "punpcklbw %%mm4, %%mm1         \n\t"\
3226
+        "punpckhbw %%mm4, %%mm6         \n\t"\
3227
+        "psubw %%mm2, %%mm1             \n\t"\
3228
+        "psubw %%mm2, %%mm6             \n\t"\
3229
+        "psllw $6, %%mm1                \n\t"\
3230
+        "psllw $6, %%mm6                \n\t"\
3231
+        "pmulhw %%mm3, %%mm1            \n\t"\
3232
+        "pmulhw %%mm3, %%mm6            \n\t"\
3233
+        "packuswb %%mm5, %%mm0          \n\t"\
3234
+        "packuswb %%mm6, %%mm1          \n\t"\
3235
+        "movq %%mm0, " #dst1 "          \n\t"\
3236
+        "movq %%mm1, " #dst2 "          \n\t"\
3238 3237
 
3239 3238
 #endif //HAVE_MMX2
3240 3239
 #define SCALED_CPY(src1, src2, dst1, dst2)\
... ...
@@ -3243,37 +3184,35 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t
3243 3243
 SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
3244 3244
 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
3245 3245
 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
3246
-                        "lea (%%"REG_a",%4,4), %%"REG_a"        \n\t"
3247
-                        "lea (%%"REG_d",%5,4), %%"REG_d"        \n\t"
3246
+        "lea (%%"REG_a",%4,4), %%"REG_a"        \n\t"
3247
+        "lea (%%"REG_d",%5,4), %%"REG_d"        \n\t"
3248 3248
 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
3249 3249
 
3250 3250
 
3251
-                        : "=&a" (packedOffsetAndScale)
3252
-                        : "0" (packedOffsetAndScale),
3253
-                        "r"(src),
3254
-                        "r"(dst),
3255
-                        "r" ((long)srcStride),
3256
-                        "r" ((long)dstStride)
3257
-                        : "%"REG_d
3258
-                                        );
3251
+        : "=&a" (packedOffsetAndScale)
3252
+        : "0" (packedOffsetAndScale),
3253
+        "r"(src),
3254
+        "r"(dst),
3255
+        "r" ((long)srcStride),
3256
+        "r" ((long)dstStride)
3257
+        : "%"REG_d
3258
+    );
3259 3259
 #else //HAVE_MMX
3260
-        for(i=0; i<8; i++)
3261
-                memcpy( &(dst[dstStride*i]),
3262
-                        &(src[srcStride*i]), BLOCK_SIZE);
3260
+    for(i=0; i<8; i++)
3261
+        memcpy( &(dst[dstStride*i]),
3262
+                &(src[srcStride*i]), BLOCK_SIZE);
3263 3263
 #endif //HAVE_MMX
3264
-        }
3265
-        else
3266
-        {
3264
+    }else{
3267 3265
 #ifdef HAVE_MMX
3268
-        asm volatile(
3269
-                "lea (%0,%2), %%"REG_a"                 \n\t"
3270
-                "lea (%1,%3), %%"REG_d"                 \n\t"
3266
+    asm volatile(
3267
+        "lea (%0,%2), %%"REG_a"                 \n\t"
3268
+        "lea (%1,%3), %%"REG_d"                 \n\t"
3271 3269
 
3272 3270
 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2)                              \
3273
-                "movq " #src1 ", %%mm0          \n\t"\
3274
-                "movq " #src2 ", %%mm1          \n\t"\
3275
-                "movq %%mm0, " #dst1 "          \n\t"\
3276
-                "movq %%mm1, " #dst2 "          \n\t"\
3271
+        "movq " #src1 ", %%mm0          \n\t"\
3272
+        "movq " #src2 ", %%mm1          \n\t"\
3273
+        "movq %%mm0, " #dst1 "          \n\t"\
3274
+        "movq %%mm1, " #dst2 "          \n\t"\
3277 3275
 
3278 3276
 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3279 3277
    REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
... ...
@@ -3281,22 +3220,22 @@ SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
3281 3281
 SIMPLE_CPY((%0)       , (%0, %2)       , (%1)       , (%1, %3))
3282 3282
 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
3283 3283
 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
3284
-                "lea (%%"REG_a",%2,4), %%"REG_a"        \n\t"
3285
-                "lea (%%"REG_d",%3,4), %%"REG_d"        \n\t"
3284
+        "lea (%%"REG_a",%2,4), %%"REG_a"        \n\t"
3285
+        "lea (%%"REG_d",%3,4), %%"REG_d"        \n\t"
3286 3286
 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
3287 3287
 
3288
-                : : "r" (src),
3289
-                "r" (dst),
3290
-                "r" ((long)srcStride),
3291
-                "r" ((long)dstStride)
3292
-                : "%"REG_a, "%"REG_d
3293
-        );
3288
+        : : "r" (src),
3289
+        "r" (dst),
3290
+        "r" ((long)srcStride),
3291
+        "r" ((long)dstStride)
3292
+        : "%"REG_a, "%"REG_d
3293
+    );
3294 3294
 #else //HAVE_MMX
3295
-        for(i=0; i<8; i++)
3296
-                memcpy( &(dst[dstStride*i]),
3297
-                        &(src[srcStride*i]), BLOCK_SIZE);
3295
+    for(i=0; i<8; i++)
3296
+        memcpy( &(dst[dstStride*i]),
3297
+                &(src[srcStride*i]), BLOCK_SIZE);
3298 3298
 #endif //HAVE_MMX
3299
-        }
3299
+    }
3300 3300
 }
3301 3301
 
3302 3302
 /**
... ...
@@ -3305,23 +3244,22 @@ SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
3305 3305
 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3306 3306
 {
3307 3307
 #ifdef HAVE_MMX
3308
-        asm volatile(
3309
-                "movq (%0), %%mm0               \n\t"
3310
-                "add %1, %0                     \n\t"
3311
-                "movq %%mm0, (%0)               \n\t"
3312
-                "movq %%mm0, (%0, %1)           \n\t"
3313
-                "movq %%mm0, (%0, %1, 2)        \n\t"
3314
-                : "+r" (src)
3315
-                : "r" ((long)-stride)
3316
-        );
3308
+    asm volatile(
3309
+        "movq (%0), %%mm0               \n\t"
3310
+        "add %1, %0                     \n\t"
3311
+        "movq %%mm0, (%0)               \n\t"
3312
+        "movq %%mm0, (%0, %1)           \n\t"
3313
+        "movq %%mm0, (%0, %1, 2)        \n\t"
3314
+        : "+r" (src)
3315
+        : "r" ((long)-stride)
3316
+    );
3317 3317
 #else
3318
-        int i;
3319
-        uint8_t *p=src;
3320
-        for(i=0; i<3; i++)
3321
-        {
3322
-                p-= stride;
3323
-                memcpy(p, src, 8);
3324
-        }
3318
+    int i;
3319
+    uint8_t *p=src;
3320
+    for(i=0; i<3; i++){
3321
+        p-= stride;
3322
+        memcpy(p, src, 8);
3323
+    }
3325 3324
 #endif
3326 3325
 }
3327 3326
 
... ...
@@ -3329,498 +3267,467 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride)
3329 3329
  * Filters array of bytes (Y or U or V values)
3330 3330
  */
3331 3331
 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3332
-        const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
3332
+                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
3333 3333
 {
3334
-        DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3335
-        int x,y;
3334
+    DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3335
+    int x,y;
3336 3336
 #ifdef COMPILE_TIME_MODE
3337
-        const int mode= COMPILE_TIME_MODE;
3337
+    const int mode= COMPILE_TIME_MODE;
3338 3338
 #else
3339
-        const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3339
+    const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3340 3340
 #endif
3341
-        int black=0, white=255; // blackest black and whitest white in the picture
3342
-        int QPCorrecture= 256*256;
3341
+    int black=0, white=255; // blackest black and whitest white in the picture
3342
+    int QPCorrecture= 256*256;
3343 3343
 
3344
-        int copyAhead;
3344
+    int copyAhead;
3345 3345
 #ifdef HAVE_MMX
3346
-        int i;
3346
+    int i;
3347 3347
 #endif
3348 3348
 
3349
-        const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3350
-        const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3349
+    const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3350
+    const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3351 3351
 
3352
-        //FIXME remove
3353
-        uint64_t * const yHistogram= c.yHistogram;
3354
-        uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3355
-        uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
3356
-        //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3352
+    //FIXME remove
3353
+    uint64_t * const yHistogram= c.yHistogram;
3354
+    uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3355
+    uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
3356
+    //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3357 3357
 
3358 3358
 #ifdef HAVE_MMX
3359
-        for(i=0; i<57; i++){
3360
-                int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3361
-                int threshold= offset*2 + 1;
3362
-                c.mmxDcOffset[i]= 0x7F - offset;
3363
-                c.mmxDcThreshold[i]= 0x7F - threshold;
3364
-                c.mmxDcOffset[i]*= 0x0101010101010101LL;
3365
-                c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3366
-        }
3359
+    for(i=0; i<57; i++){
3360
+        int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3361
+        int threshold= offset*2 + 1;
3362
+        c.mmxDcOffset[i]= 0x7F - offset;
3363
+        c.mmxDcThreshold[i]= 0x7F - threshold;
3364
+        c.mmxDcOffset[i]*= 0x0101010101010101LL;
3365
+        c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3366
+    }
3367 3367
 #endif
3368 3368
 
3369
-        if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3370
-        else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
3371
-                || (mode & FFMPEG_DEINT_FILTER)
3372
-                || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3373
-        else if(   (mode & V_DEBLOCK)
3374
-                || (mode & LINEAR_IPOL_DEINT_FILTER)
3375
-                || (mode & MEDIAN_DEINT_FILTER)
3376
-                || (mode & V_A_DEBLOCK)) copyAhead=13;
3377
-        else if(mode & V_X1_FILTER) copyAhead=11;
3378
-//        else if(mode & V_RK1_FILTER) copyAhead=10;
3379
-        else if(mode & DERING) copyAhead=9;
3380
-        else copyAhead=8;
3381
-
3382
-        copyAhead-= 8;
3383
-
3384
-        if(!isColor)
3385
-        {
3386
-                uint64_t sum= 0;
3387
-                int i;
3388
-                uint64_t maxClipped;
3389
-                uint64_t clipped;
3390
-                double scale;
3369
+    if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3370
+    else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
3371
+            || (mode & FFMPEG_DEINT_FILTER)
3372
+            || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3373
+    else if(   (mode & V_DEBLOCK)
3374
+            || (mode & LINEAR_IPOL_DEINT_FILTER)
3375
+            || (mode & MEDIAN_DEINT_FILTER)
3376
+            || (mode & V_A_DEBLOCK)) copyAhead=13;
3377
+    else if(mode & V_X1_FILTER) copyAhead=11;
3378
+//    else if(mode & V_RK1_FILTER) copyAhead=10;
3379
+    else if(mode & DERING) copyAhead=9;
3380
+    else copyAhead=8;
3381
+
3382
+    copyAhead-= 8;
3383
+
3384
+    if(!isColor){
3385
+        uint64_t sum= 0;
3386
+        int i;
3387
+        uint64_t maxClipped;
3388
+        uint64_t clipped;
3389
+        double scale;
3391 3390
 
3392
-                c.frameNum++;
3393
-                // first frame is fscked so we ignore it
3394
-                if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
3391
+        c.frameNum++;
3392
+        // first frame is fscked so we ignore it
3393
+        if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
3395 3394
 
3396
-                for(i=0; i<256; i++)
3397
-                {
3398
-                        sum+= yHistogram[i];
3399
-                }
3395
+        for(i=0; i<256; i++){
3396
+            sum+= yHistogram[i];
3397
+        }
3400 3398
 
3401
-                /* We always get a completely black picture first. */
3402
-                maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3399
+        /* We always get a completely black picture first. */
3400
+        maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3403 3401
 
3404
-                clipped= sum;
3405
-                for(black=255; black>0; black--)
3406
-                {
3407
-                        if(clipped < maxClipped) break;
3408
-                        clipped-= yHistogram[black];
3409
-                }
3402
+        clipped= sum;
3403
+        for(black=255; black>0; black--){
3404
+            if(clipped < maxClipped) break;
3405
+            clipped-= yHistogram[black];
3406
+        }
3410 3407
 
3411
-                clipped= sum;
3412
-                for(white=0; white<256; white++)
3413
-                {
3414
-                        if(clipped < maxClipped) break;
3415
-                        clipped-= yHistogram[white];
3416
-                }
3408
+        clipped= sum;
3409
+        for(white=0; white<256; white++){
3410
+            if(clipped < maxClipped) break;
3411
+            clipped-= yHistogram[white];
3412
+        }
3417 3413
 
3418
-                scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3414
+        scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3419 3415
 
3420 3416
 #ifdef HAVE_MMX2
3421
-                c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3422
-                c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3417
+        c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3418
+        c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3423 3419
 #else
3424
-                c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3425
-                c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3420
+        c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3421
+        c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3426 3422
 #endif
3427 3423
 
3428
-                c.packedYOffset|= c.packedYOffset<<32;
3429
-                c.packedYOffset|= c.packedYOffset<<16;
3424
+        c.packedYOffset|= c.packedYOffset<<32;
3425
+        c.packedYOffset|= c.packedYOffset<<16;
3430 3426
 
3431
-                c.packedYScale|= c.packedYScale<<32;
3432
-                c.packedYScale|= c.packedYScale<<16;
3427
+        c.packedYScale|= c.packedYScale<<32;
3428
+        c.packedYScale|= c.packedYScale<<16;
3433 3429
 
3434
-                if(mode & LEVEL_FIX)        QPCorrecture= (int)(scale*256*256 + 0.5);
3435
-                else                        QPCorrecture= 256*256;
3436
-        }
3437
-        else
3438
-        {
3439
-                c.packedYScale= 0x0100010001000100LL;
3440
-                c.packedYOffset= 0;
3441
-                QPCorrecture= 256*256;
3442
-        }
3430
+        if(mode & LEVEL_FIX)        QPCorrecture= (int)(scale*256*256 + 0.5);
3431
+        else                        QPCorrecture= 256*256;
3432
+    }else{
3433
+        c.packedYScale= 0x0100010001000100LL;
3434
+        c.packedYOffset= 0;
3435
+        QPCorrecture= 256*256;
3436
+    }
3443 3437
 
3444
-        /* copy & deinterlace first row of blocks */
3445
-        y=-BLOCK_SIZE;
3446
-        {
3447
-                const uint8_t *srcBlock= &(src[y*srcStride]);
3448
-                uint8_t *dstBlock= tempDst + dstStride;
3438
+    /* copy & deinterlace first row of blocks */
3439
+    y=-BLOCK_SIZE;
3440
+    {
3441
+        const uint8_t *srcBlock= &(src[y*srcStride]);
3442
+        uint8_t *dstBlock= tempDst + dstStride;
3449 3443
 
3450
-                // From this point on it is guaranteed that we can read and write 16 lines downward
3451
-                // finish 1 block before the next otherwise we might have a problem
3452
-                // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3453
-                for(x=0; x<width; x+=BLOCK_SIZE)
3454
-                {
3444
+        // From this point on it is guaranteed that we can read and write 16 lines downward
3445
+        // finish 1 block before the next otherwise we might have a problem
3446
+        // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3447
+        for(x=0; x<width; x+=BLOCK_SIZE){
3455 3448
 
3456 3449
 #ifdef HAVE_MMX2
3457 3450
 /*
3458
-                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3459
-                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3460
-                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3461
-                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3451
+            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3452
+            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3453
+            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3454
+            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3462 3455
 */
3463 3456
 
3464
-                        asm(
3465
-                                "mov %4, %%"REG_a"              \n\t"
3466
-                                "shr $2, %%"REG_a"              \n\t"
3467
-                                "and $6, %%"REG_a"              \n\t"
3468
-                                "add %5, %%"REG_a"              \n\t"
3469
-                                "mov %%"REG_a", %%"REG_d"       \n\t"
3470
-                                "imul %1, %%"REG_a"             \n\t"
3471
-                                "imul %3, %%"REG_d"             \n\t"
3472
-                                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3473
-                                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3474
-                                "add %1, %%"REG_a"              \n\t"
3475
-                                "add %3, %%"REG_d"              \n\t"
3476
-                                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3477
-                                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3478
-                        :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3479
-                        "g" ((long)x), "g" ((long)copyAhead)
3480
-                        : "%"REG_a, "%"REG_d
3481
-                        );
3457
+            asm(
3458
+                "mov %4, %%"REG_a"              \n\t"
3459
+                "shr $2, %%"REG_a"              \n\t"
3460
+                "and $6, %%"REG_a"              \n\t"
3461
+                "add %5, %%"REG_a"              \n\t"
3462
+                "mov %%"REG_a", %%"REG_d"       \n\t"
3463
+                "imul %1, %%"REG_a"             \n\t"
3464
+                "imul %3, %%"REG_d"             \n\t"
3465
+                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3466
+                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3467
+                "add %1, %%"REG_a"              \n\t"
3468
+                "add %3, %%"REG_d"              \n\t"
3469
+                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3470
+                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3471
+                :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3472
+                "g" ((long)x), "g" ((long)copyAhead)
3473
+                : "%"REG_a, "%"REG_d
3474
+            );
3482 3475
 
3483 3476
 #elif defined(HAVE_3DNOW)
3484 3477
 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3485
-/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3486
-                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3487
-                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3488
-                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3478
+/*          prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3479
+            prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3480
+            prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3481
+            prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3489 3482
 */
3490 3483
 #endif
3491 3484
 
3492
-                        RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3493
-                                srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3494
-
3495
-                        RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3496
-
3497
-                        if(mode & LINEAR_IPOL_DEINT_FILTER)
3498
-                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3499
-                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
3500
-                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3501
-                        else if(mode & MEDIAN_DEINT_FILTER)
3502
-                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
3503
-                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
3504
-                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3505
-                        else if(mode & FFMPEG_DEINT_FILTER)
3506
-                                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3507
-                        else if(mode & LOWPASS5_DEINT_FILTER)
3508
-                                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3509
-/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
3510
-                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3485
+            RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3486
+                              srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3487
+
3488
+            RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3489
+
3490
+            if(mode & LINEAR_IPOL_DEINT_FILTER)
3491
+                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3492
+            else if(mode & LINEAR_BLEND_DEINT_FILTER)
3493
+                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3494
+            else if(mode & MEDIAN_DEINT_FILTER)
3495
+                RENAME(deInterlaceMedian)(dstBlock, dstStride);
3496
+            else if(mode & CUBIC_IPOL_DEINT_FILTER)
3497
+                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3498
+            else if(mode & FFMPEG_DEINT_FILTER)
3499
+                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3500
+            else if(mode & LOWPASS5_DEINT_FILTER)
3501
+                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3502
+/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
3503
+                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3511 3504
 */
3512
-                        dstBlock+=8;
3513
-                        srcBlock+=8;
3514
-                }
3515
-                if(width==FFABS(dstStride))
3516
-                        linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3517
-                else
3518
-                {
3519
-                        int i;
3520
-                        for(i=0; i<copyAhead; i++)
3521
-                        {
3522
-                                memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3523
-                        }
3524
-                }
3505
+            dstBlock+=8;
3506
+            srcBlock+=8;
3507
+        }
3508
+        if(width==FFABS(dstStride))
3509
+            linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3510
+        else{
3511
+            int i;
3512
+            for(i=0; i<copyAhead; i++){
3513
+                memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3514
+            }
3525 3515
         }
3516
+    }
3526 3517
 
3527
-        for(y=0; y<height; y+=BLOCK_SIZE)
3528
-        {
3529
-                //1% speedup if these are here instead of the inner loop
3530
-                const uint8_t *srcBlock= &(src[y*srcStride]);
3531
-                uint8_t *dstBlock= &(dst[y*dstStride]);
3518
+    for(y=0; y<height; y+=BLOCK_SIZE){
3519
+        //1% speedup if these are here instead of the inner loop
3520
+        const uint8_t *srcBlock= &(src[y*srcStride]);
3521
+        uint8_t *dstBlock= &(dst[y*dstStride]);
3532 3522
 #ifdef HAVE_MMX
3533
-                uint8_t *tempBlock1= c.tempBlocks;
3534
-                uint8_t *tempBlock2= c.tempBlocks + 8;
3523
+        uint8_t *tempBlock1= c.tempBlocks;
3524
+        uint8_t *tempBlock2= c.tempBlocks + 8;
3535 3525
 #endif
3536
-                const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3537
-                int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3538
-                int QP=0;
3539
-                /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3540
-                   if not than use a temporary buffer */
3541
-                if(y+15 >= height)
3542
-                {
3543
-                        int i;
3544
-                        /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3545
-                           blockcopy to dst later */
3546
-                        linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3547
-                                FFMAX(height-y-copyAhead, 0), srcStride);
3548
-
3549
-                        /* duplicate last line of src to fill the void upto line (copyAhead+7) */
3550
-                        for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3551
-                                memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3552
-
3553
-                        /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3554
-                        linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3555
-
3556
-                        /* duplicate last line of dst to fill the void upto line (copyAhead) */
3557
-                        for(i=height-y+1; i<=copyAhead; i++)
3558
-                                memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3559
-
3560
-                        dstBlock= tempDst + dstStride;
3561
-                        srcBlock= tempSrc;
3562
-                }
3526
+        const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3527
+        int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3528
+        int QP=0;
3529
+        /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3530
+           if not than use a temporary buffer */
3531
+        if(y+15 >= height){
3532
+            int i;
3533
+            /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3534
+               blockcopy to dst later */
3535
+            linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3536
+                    FFMAX(height-y-copyAhead, 0), srcStride);
3537
+
3538
+            /* duplicate last line of src to fill the void upto line (copyAhead+7) */
3539
+            for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3540
+                    memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3541
+
3542
+            /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3543
+            linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3544
+
3545
+            /* duplicate last line of dst to fill the void upto line (copyAhead) */
3546
+            for(i=height-y+1; i<=copyAhead; i++)
3547
+                    memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3548
+
3549
+            dstBlock= tempDst + dstStride;
3550
+            srcBlock= tempSrc;
3551
+        }
3563 3552
 
3564
-                // From this point on it is guaranteed that we can read and write 16 lines downward
3565
-                // finish 1 block before the next otherwise we might have a problem
3566
-                // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3567
-                for(x=0; x<width; x+=BLOCK_SIZE)
3568
-                {
3569
-                        const int stride= dstStride;
3553
+        // From this point on it is guaranteed that we can read and write 16 lines downward
3554
+        // finish 1 block before the next otherwise we might have a problem
3555
+        // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3556
+        for(x=0; x<width; x+=BLOCK_SIZE){
3557
+            const int stride= dstStride;
3570 3558
 #ifdef HAVE_MMX
3571
-                        uint8_t *tmpXchg;
3559
+            uint8_t *tmpXchg;
3572 3560
 #endif
3573
-                        if(isColor)
3574
-                        {
3575
-                                QP= QPptr[x>>qpHShift];
3576
-                                c.nonBQP= nonBQPptr[x>>qpHShift];
3577
-                        }
3578
-                        else
3579
-                        {
3580
-                                QP= QPptr[x>>4];
3581
-                                QP= (QP* QPCorrecture + 256*128)>>16;
3582
-                                c.nonBQP= nonBQPptr[x>>4];
3583
-                                c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3584
-                                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3585
-                        }
3586
-                        c.QP= QP;
3561
+            if(isColor){
3562
+                QP= QPptr[x>>qpHShift];
3563
+                c.nonBQP= nonBQPptr[x>>qpHShift];
3564
+            }else{
3565
+                QP= QPptr[x>>4];
3566
+                QP= (QP* QPCorrecture + 256*128)>>16;
3567
+                c.nonBQP= nonBQPptr[x>>4];
3568
+                c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3569
+                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3570
+            }
3571
+            c.QP= QP;
3587 3572
 #ifdef HAVE_MMX
3588
-                        asm volatile(
3589
-                                "movd %1, %%mm7         \n\t"
3590
-                                "packuswb %%mm7, %%mm7  \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3591
-                                "packuswb %%mm7, %%mm7  \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3592
-                                "packuswb %%mm7, %%mm7  \n\t" // QP,..., QP
3593
-                                "movq %%mm7, %0         \n\t"
3594
-                                : "=m" (c.pQPb)
3595
-                                : "r" (QP)
3596
-                        );
3573
+            asm volatile(
3574
+                "movd %1, %%mm7         \n\t"
3575
+                "packuswb %%mm7, %%mm7  \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3576
+                "packuswb %%mm7, %%mm7  \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3577
+                "packuswb %%mm7, %%mm7  \n\t" // QP,..., QP
3578
+                "movq %%mm7, %0         \n\t"
3579
+                : "=m" (c.pQPb)
3580
+                : "r" (QP)
3581
+            );
3597 3582
 #endif
3598 3583
 
3599 3584
 
3600 3585
 #ifdef HAVE_MMX2
3601 3586
 /*
3602
-                        prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3603
-                        prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3604
-                        prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3605
-                        prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3587
+            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3588
+            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3589
+            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3590
+            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3606 3591
 */
3607 3592
 
3608
-                        asm(
3609
-                                "mov %4, %%"REG_a"              \n\t"
3610
-                                "shr $2, %%"REG_a"              \n\t"
3611
-                                "and $6, %%"REG_a"              \n\t"
3612
-                                "add %5, %%"REG_a"              \n\t"
3613
-                                "mov %%"REG_a", %%"REG_d"       \n\t"
3614
-                                "imul %1, %%"REG_a"             \n\t"
3615
-                                "imul %3, %%"REG_d"             \n\t"
3616
-                                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3617
-                                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3618
-                                "add %1, %%"REG_a"              \n\t"
3619
-                                "add %3, %%"REG_d"              \n\t"
3620
-                                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3621
-                                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3622
-                        :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3623
-                         "g" ((long)x), "g" ((long)copyAhead)
3624
-                        : "%"REG_a, "%"REG_d
3625
-                        );
3593
+            asm(
3594
+                "mov %4, %%"REG_a"              \n\t"
3595
+                "shr $2, %%"REG_a"              \n\t"
3596
+                "and $6, %%"REG_a"              \n\t"
3597
+                "add %5, %%"REG_a"              \n\t"
3598
+                "mov %%"REG_a", %%"REG_d"       \n\t"
3599
+                "imul %1, %%"REG_a"             \n\t"
3600
+                "imul %3, %%"REG_d"             \n\t"
3601
+                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3602
+                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3603
+                "add %1, %%"REG_a"              \n\t"
3604
+                "add %3, %%"REG_d"              \n\t"
3605
+                "prefetchnta 32(%%"REG_a", %0)  \n\t"
3606
+                "prefetcht0 32(%%"REG_d", %2)   \n\t"
3607
+                :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3608
+                "g" ((long)x), "g" ((long)copyAhead)
3609
+                : "%"REG_a, "%"REG_d
3610
+            );
3626 3611
 
3627 3612
 #elif defined(HAVE_3DNOW)
3628 3613
 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3629
-/*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3630
-                        prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3631
-                        prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3632
-                        prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3614
+/*          prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3615
+            prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3616
+            prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3617
+            prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3633 3618
 */
3634 3619
 #endif
3635 3620
 
3636
-                        RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3637
-                                srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3638
-
3639
-                        if(mode & LINEAR_IPOL_DEINT_FILTER)
3640
-                                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3641
-                        else if(mode & LINEAR_BLEND_DEINT_FILTER)
3642
-                                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3643
-                        else if(mode & MEDIAN_DEINT_FILTER)
3644
-                                RENAME(deInterlaceMedian)(dstBlock, dstStride);
3645
-                        else if(mode & CUBIC_IPOL_DEINT_FILTER)
3646
-                                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3647
-                        else if(mode & FFMPEG_DEINT_FILTER)
3648
-                                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3649
-                        else if(mode & LOWPASS5_DEINT_FILTER)
3650
-                                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3651
-/*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
3652
-                                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3621
+            RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3622
+                              srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3623
+
3624
+            if(mode & LINEAR_IPOL_DEINT_FILTER)
3625
+                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3626
+            else if(mode & LINEAR_BLEND_DEINT_FILTER)
3627
+                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3628
+            else if(mode & MEDIAN_DEINT_FILTER)
3629
+                RENAME(deInterlaceMedian)(dstBlock, dstStride);
3630
+            else if(mode & CUBIC_IPOL_DEINT_FILTER)
3631
+                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3632
+            else if(mode & FFMPEG_DEINT_FILTER)
3633
+                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3634
+            else if(mode & LOWPASS5_DEINT_FILTER)
3635
+                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3636
+/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
3637
+                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3653 3638
 */
3654 3639
 
3655
-                        /* only deblock if we have 2 blocks */
3656
-                        if(y + 8 < height)
3657
-                        {
3658
-                                if(mode & V_X1_FILTER)
3659
-                                        RENAME(vertX1Filter)(dstBlock, stride, &c);
3660
-                                else if(mode & V_DEBLOCK)
3661
-                                {
3662
-                                        const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3663
-
3664
-                                        if(t==1)
3665
-                                                RENAME(doVertLowPass)(dstBlock, stride, &c);
3666
-                                        else if(t==2)
3667
-                                                RENAME(doVertDefFilter)(dstBlock, stride, &c);
3668
-                                }else if(mode & V_A_DEBLOCK){
3669
-                                        RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3670
-                                }
3671
-                        }
3640
+            /* only deblock if we have 2 blocks */
3641
+            if(y + 8 < height){
3642
+                if(mode & V_X1_FILTER)
3643
+                    RENAME(vertX1Filter)(dstBlock, stride, &c);
3644
+                else if(mode & V_DEBLOCK){
3645
+                    const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3646
+
3647
+                    if(t==1)
3648
+                        RENAME(doVertLowPass)(dstBlock, stride, &c);
3649
+                    else if(t==2)
3650
+                        RENAME(doVertDefFilter)(dstBlock, stride, &c);
3651
+                }else if(mode & V_A_DEBLOCK){
3652
+                    RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3653
+                }
3654
+            }
3672 3655
 
3673 3656
 #ifdef HAVE_MMX
3674
-                        RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3657
+            RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3675 3658
 #endif
3676
-                        /* check if we have a previous block to deblock it with dstBlock */
3677
-                        if(x - 8 >= 0)
3678
-                        {
3659
+            /* check if we have a previous block to deblock it with dstBlock */
3660
+            if(x - 8 >= 0){
3679 3661
 #ifdef HAVE_MMX
3680
-                                if(mode & H_X1_FILTER)
3681
-                                        RENAME(vertX1Filter)(tempBlock1, 16, &c);
3682
-                                else if(mode & H_DEBLOCK)
3683
-                                {
3662
+                if(mode & H_X1_FILTER)
3663
+                        RENAME(vertX1Filter)(tempBlock1, 16, &c);
3664
+                else if(mode & H_DEBLOCK){
3684 3665
 //START_TIMER
3685
-                                        const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3666
+                    const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3686 3667
 //STOP_TIMER("dc & minmax")
3687
-                                        if(t==1)
3688
-                                                RENAME(doVertLowPass)(tempBlock1, 16, &c);
3689
-                                        else if(t==2)
3690
-                                                RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3691
-                                }else if(mode & H_A_DEBLOCK){
3692
-                                        RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3693
-                                }
3668
+                    if(t==1)
3669
+                        RENAME(doVertLowPass)(tempBlock1, 16, &c);
3670
+                    else if(t==2)
3671
+                        RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3672
+                }else if(mode & H_A_DEBLOCK){
3673
+                        RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3674
+                }
3694 3675
 
3695
-                                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3676
+                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3696 3677
 
3697 3678
 #else
3698
-                                if(mode & H_X1_FILTER)
3699
-                                        horizX1Filter(dstBlock-4, stride, QP);
3700
-                                else if(mode & H_DEBLOCK)
3701
-                                {
3679
+                if(mode & H_X1_FILTER)
3680
+                    horizX1Filter(dstBlock-4, stride, QP);
3681
+                else if(mode & H_DEBLOCK){
3702 3682
 #ifdef HAVE_ALTIVEC
3703
-                                        DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
3704
-                                        transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3705
-
3706
-                                        const int t=vertClassify_altivec(tempBlock-48, 16, &c);
3707
-                                        if(t==1) {
3708
-                                                doVertLowPass_altivec(tempBlock-48, 16, &c);
3709
-                                                transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3710
-                                        }
3711
-                                        else if(t==2) {
3712
-                                                doVertDefFilter_altivec(tempBlock-48, 16, &c);
3713
-                                                transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3714
-                                        }
3683
+                    DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
3684
+                    transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3685
+
3686
+                    const int t=vertClassify_altivec(tempBlock-48, 16, &c);
3687
+                    if(t==1) {
3688
+                        doVertLowPass_altivec(tempBlock-48, 16, &c);
3689
+                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3690
+                    }
3691
+                    else if(t==2) {
3692
+                        doVertDefFilter_altivec(tempBlock-48, 16, &c);
3693
+                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3694
+                    }
3715 3695
 #else
3716
-                                        const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3696
+                    const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3717 3697
 
3718
-                                        if(t==1)
3719
-                                                RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3720
-                                        else if(t==2)
3721
-                                                RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3698
+                    if(t==1)
3699
+                        RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3700
+                    else if(t==2)
3701
+                        RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3722 3702
 #endif
3723
-                                }else if(mode & H_A_DEBLOCK){
3724
-                                        RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3725
-                                }
3703
+                }else if(mode & H_A_DEBLOCK){
3704
+                    RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3705
+                }
3726 3706
 #endif //HAVE_MMX
3727
-                                if(mode & DERING)
3728
-                                {
3729
-                                //FIXME filter first line
3730
-                                        if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3731
-                                }
3732
-
3733
-                                if(mode & TEMP_NOISE_FILTER)
3734
-                                {
3735
-                                        RENAME(tempNoiseReducer)(dstBlock-8, stride,
3736
-                                                c.tempBlured[isColor] + y*dstStride + x,
3737
-                                                c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3738
-                                                c.ppMode.maxTmpNoise);
3739
-                                }
3740
-                        }
3707
+                if(mode & DERING){
3708
+                //FIXME filter first line
3709
+                    if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3710
+                }
3741 3711
 
3742
-                        dstBlock+=8;
3743
-                        srcBlock+=8;
3712
+                if(mode & TEMP_NOISE_FILTER)
3713
+                {
3714
+                    RENAME(tempNoiseReducer)(dstBlock-8, stride,
3715
+                            c.tempBlured[isColor] + y*dstStride + x,
3716
+                            c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3717
+                            c.ppMode.maxTmpNoise);
3718
+                }
3719
+            }
3720
+
3721
+            dstBlock+=8;
3722
+            srcBlock+=8;
3744 3723
 
3745 3724
 #ifdef HAVE_MMX
3746
-                        tmpXchg= tempBlock1;
3747
-                        tempBlock1= tempBlock2;
3748
-                        tempBlock2 = tmpXchg;
3725
+            tmpXchg= tempBlock1;
3726
+            tempBlock1= tempBlock2;
3727
+            tempBlock2 = tmpXchg;
3749 3728
 #endif
3750
-                }
3729
+        }
3751 3730
 
3752
-                if(mode & DERING)
3753
-                {
3754
-                                if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3755
-                }
3731
+        if(mode & DERING){
3732
+            if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3733
+        }
3756 3734
 
3757
-                if((mode & TEMP_NOISE_FILTER))
3758
-                {
3759
-                        RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3760
-                                c.tempBlured[isColor] + y*dstStride + x,
3761
-                                c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3762
-                                c.ppMode.maxTmpNoise);
3763
-                }
3735
+        if((mode & TEMP_NOISE_FILTER)){
3736
+            RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3737
+                    c.tempBlured[isColor] + y*dstStride + x,
3738
+                    c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3739
+                    c.ppMode.maxTmpNoise);
3740
+        }
3764 3741
 
3765
-                /* did we use a tmp buffer for the last lines*/
3766
-                if(y+15 >= height)
3767
-                {
3768
-                        uint8_t *dstBlock= &(dst[y*dstStride]);
3769
-                        if(width==FFABS(dstStride))
3770
-                                linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3771
-                        else
3772
-                        {
3773
-                                int i;
3774
-                                for(i=0; i<height-y; i++)
3775
-                                {
3776
-                                        memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3777
-                                }
3778
-                        }
3742
+        /* did we use a tmp buffer for the last lines*/
3743
+        if(y+15 >= height){
3744
+            uint8_t *dstBlock= &(dst[y*dstStride]);
3745
+            if(width==FFABS(dstStride))
3746
+                linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3747
+            else{
3748
+                int i;
3749
+                for(i=0; i<height-y; i++){
3750
+                    memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3779 3751
                 }
3780
-/*
3781
-                for(x=0; x<width; x+=32)
3782
-                {
3783
-                        volatile int i;
3784
-                        i+=        + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3785
-                                + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3786
-                                + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3787
-//                                + dstBlock[x +13*dstStride]
3788
-//                                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3789
-                }*/
3752
+            }
3790 3753
         }
3754
+/*
3755
+        for(x=0; x<width; x+=32){
3756
+            volatile int i;
3757
+            i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3758
+                + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3759
+                + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3760
+                + dstBlock[x +13*dstStride]
3761
+                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3762
+        }*/
3763
+    }
3791 3764
 #ifdef HAVE_3DNOW
3792
-        asm volatile("femms");
3765
+    asm volatile("femms");
3793 3766
 #elif defined (HAVE_MMX)
3794
-        asm volatile("emms");
3767
+    asm volatile("emms");
3795 3768
 #endif
3796 3769
 
3797 3770
 #ifdef DEBUG_BRIGHTNESS
3798
-        if(!isColor)
3799
-        {
3800
-                int max=1;
3801
-                int i;
3802
-                for(i=0; i<256; i++)
3803
-                        if(yHistogram[i] > max) max=yHistogram[i];
3804
-
3805
-                for(i=1; i<256; i++)
3806
-                {
3807
-                        int x;
3808
-                        int start=yHistogram[i-1]/(max/256+1);
3809
-                        int end=yHistogram[i]/(max/256+1);
3810
-                        int inc= end > start ? 1 : -1;
3811
-                        for(x=start; x!=end+inc; x+=inc)
3812
-                                dst[ i*dstStride + x]+=128;
3813
-                }
3814
-
3815
-                for(i=0; i<100; i+=2)
3816
-                {
3817
-                        dst[ (white)*dstStride + i]+=128;
3818
-                        dst[ (black)*dstStride + i]+=128;
3819
-                }
3771
+    if(!isColor){
3772
+        int max=1;
3773
+        int i;
3774
+        for(i=0; i<256; i++)
3775
+            if(yHistogram[i] > max) max=yHistogram[i];
3776
+
3777
+        for(i=1; i<256; i++){
3778
+            int x;
3779
+            int start=yHistogram[i-1]/(max/256+1);
3780
+            int end=yHistogram[i]/(max/256+1);
3781
+            int inc= end > start ? 1 : -1;
3782
+            for(x=start; x!=end+inc; x+=inc)
3783
+                dst[ i*dstStride + x]+=128;
3784
+        }
3820 3785
 
3786
+        for(i=0; i<100; i+=2){
3787
+            dst[ (white)*dstStride + i]+=128;
3788
+            dst[ (black)*dstStride + i]+=128;
3821 3789
         }
3790
+    }
3822 3791
 #endif
3823 3792
 
3824
-        *c2= c; //copy local context back
3793
+    *c2= c; //copy local context back
3825 3794
 
3826 3795
 }