Browse code

SSE optimized 32-point DCT

Originally committed as revision 24077 to svn://svn.ffmpeg.org/ffmpeg/trunk

Vitor Sessak authored on 2010/07/07 01:58:54
Showing 5 changed files
... ...
@@ -30,6 +30,7 @@
30 30
 #include <math.h>
31 31
 #include "libavutil/mathematics.h"
32 32
 #include "fft.h"
33
+#include "x86/fft.h"
33 34
 
34 35
 #define DCT32_FLOAT
35 36
 #include "dct32.c"
... ...
@@ -213,6 +214,7 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
213 213
         s->dct_calc = dct32_func;
214 214
 
215 215
     s->dct32 = dct32;
216
+    if (HAVE_MMX)     ff_dct_init_mmx(s);
216 217
 
217 218
     return 0;
218 219
 }
... ...
@@ -112,6 +112,7 @@ void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
112 112
 void ff_fft_init_altivec(FFTContext *s);
113 113
 void ff_fft_init_mmx(FFTContext *s);
114 114
 void ff_fft_init_arm(FFTContext *s);
115
+void ff_dct_init_mmx(DCTContext *s);
115 116
 
116 117
 /**
117 118
  * Do the permutation needed BEFORE calling ff_fft_calc().
... ...
@@ -42,3 +42,11 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
42 42
     }
43 43
 #endif
44 44
 }
45
+
46
+av_cold void ff_dct_init_mmx(DCTContext *s) /* Overrides s->dct32 with the SSE routine when the checks below pass; otherwise leaves s untouched. */
47
+{
48
+    int has_vectors = mm_support(); /* bitmask; tested against FF_MM_SSE below */
49
+    if (has_vectors & FF_MM_SSE && HAVE_SSE) /* parses as (has_vectors & FF_MM_SSE) && HAVE_SSE; HAVE_SSE is presumably a build-time config macro - confirm */
50
+        s->dct32 = ff_dct32_float_sse;
51
+}
52
+
... ...
@@ -32,5 +32,6 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
32 32
 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
33 33
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
34 34
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
35
+void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
35 36
 
36 37
 #endif
... ...
@@ -20,6 +20,7 @@
20 20
  */
21 21
 
22 22
 #include "libavutil/x86_cpu.h"
23
+#include "libavutil/common.h"
23 24
 #include "libavcodec/dsputil.h"
24 25
 #include "fft.h"
25 26
 
... ...
@@ -201,3 +202,268 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
201 201
     );
202 202
 }
203 203
 
204
+DECLARE_ALIGNED(16, static const float, b1)[] = { /* 36 butterfly multipliers, loaded 4 at a time by ff_dct32_float_sse via operand %2 */
205
+     0.500603,  0.505471,  0.515447,  0.531043, /* byte offset   0: pass 1 */
206
+     0.553104,  0.582935,  0.622504,  0.674808, /* byte offset  16: pass 1 */
207
+    -1.169440, -0.972568, -0.839350, -0.744536, /* byte offset  32: pass 1 */
208
+   -10.190008, -3.407609, -2.057781, -1.484165, /* byte offset  48: pass 1 */
209
+     0.502419,  0.522499,  0.566944,  0.646822, /* byte offset  64: pass 2 */
210
+     0.788155,  1.060678,  1.722447,  5.101149, /* byte offset  80: pass 2 */
211
+     0.509796,  0.601345,  0.899976,  2.562916, /* byte offset  96: pass 3 */
212
+     1.000000,  1.000000,  1.306563,  0.541196, /* byte offset 112: pass 4 */
213
+     1.000000,  0.707107,  1.000000, -0.707107  /* byte offset 128: pass 5 */
214
+};
215
+
216
+DECLARE_ALIGNED(16, static const int32_t, smask)[4] = { /* XOR mask with only the IEEE sign bit set in lanes 2 and 3; loaded into xmm3 in pass 4 of ff_dct32_float_sse */
217
+    0, 0, 0x80000000, 0x80000000 /* NOTE(review): 0x80000000 does not fit in int32_t, so this relies on implementation-defined conversion */
218
+};
219
+
220
+/* butterfly operator: a = (a - b) * c, b = a + b (old values); a, b, tmp are xmm register names, c is a full asm operand (register or memory), tmp is clobbered */
221
+#define BUTTERFLY(a,b,c,tmp)                            \
222
+    "movaps  %%" #a    ", %%" #tmp  "             \n\t" \
223
+    "subps   %%" #b    ", %%" #a    "             \n\t" \
224
+    "addps   %%" #tmp  ", %%" #b    "             \n\t" \
225
+    "mulps     " #c    ", %%" #a    "             \n\t"
226
+
227
+/* Same as BUTTERFLY when vectors a and b overlap: val = (shuffle(val, shuf) + (val ^ mask)) * cos, all within one register; tmp is clobbered */
228
+#define BUTTERFLY0(val, mask, cos, tmp, shuf)                            \
229
+    "movaps  %%" #val  ", %%" #tmp  "             \n\t"                  \
230
+    "shufps    " #shuf ", %%" #val  ",%%" #val "  \n\t"                  \
231
+    "xorps   %%" #mask ", %%" #tmp  "             \n\t" /* flip signs */ \
232
+    "addps   %%" #tmp  ", %%" #val  "             \n\t"                  \
233
+    "mulps   %%" #cos  ", %%" #val  "             \n\t"
234
+
235
+#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b) /* shuffle 0x1b: reverse the four lanes (3,2,1,0) */
236
+#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1) /* shuffle 0xb1: swap lanes within each pair (1,0,3,2) */
237
+
238
+void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) /* SSE 32-point DCT: reads 128 bytes from in and writes 128 bytes to out as 32-bit floats; movaps requires both pointers to be 16-byte aligned; out also serves as scratch between passes */
239
+{
240
+    int32_t tmp1 = 0; /* integer scratch register (%0), used in pass 6 to copy 32-bit values within out */
241
+    __asm__ volatile(
242
+        /* pass 1: butterfly each 16-byte group of in with its lane-reversed mirror group (shufps $0x1b), scaled by b1 bytes 0..63 */
243
+
244
+        "movaps    (%4), %%xmm0           \n\t"
245
+        "movaps 112(%4), %%xmm1           \n\t"
246
+        "shufps   $0x1b, %%xmm1, %%xmm1   \n\t"
247
+        BUTTERFLY(xmm0, xmm1, (%2), xmm3)
248
+
249
+        "movaps  64(%4), %%xmm7           \n\t"
250
+        "movaps  48(%4), %%xmm4           \n\t"
251
+        "shufps   $0x1b, %%xmm4, %%xmm4   \n\t"
252
+        BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)
253
+
254
+
255
+        /* pass 2: xmm2 holds the b1 row at byte 64; results are parked in out (bytes 48 and 0) to free registers */
256
+        "movaps  64(%2), %%xmm2           \n\t"
257
+        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
258
+        "movaps  %%xmm1, 48(%1)           \n\t"
259
+        "movaps  %%xmm4, (%1)             \n\t"
260
+
261
+        /* pass 1 (remaining two groups) */
262
+        "movaps  16(%4), %%xmm1           \n\t"
263
+        "movaps  96(%4), %%xmm6           \n\t"
264
+        "shufps   $0x1b, %%xmm6, %%xmm6   \n\t"
265
+        BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)
266
+
267
+        "movaps  80(%4), %%xmm4           \n\t"
268
+        "movaps  32(%4), %%xmm5           \n\t"
269
+        "shufps   $0x1b, %%xmm5, %%xmm5   \n\t"
270
+        BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)
271
+
272
+        /* pass 2 (continued): first butterfly still uses the row at byte 64, then xmm2 is reloaded with the row at byte 80 */
273
+        BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)
274
+
275
+        "movaps  80(%2), %%xmm2           \n\t"
276
+        BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)
277
+
278
+        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
279
+
280
+        /* pass 3: xmm2 = b1 row at byte 96; the second operand of each butterfly is lane-reversed first */
281
+        "movaps  96(%2), %%xmm2           \n\t"
282
+        "shufps   $0x1b, %%xmm1, %%xmm1   \n\t"
283
+        BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
284
+        "movaps  %%xmm0, 112(%1)          \n\t"
285
+        "movaps  %%xmm1,  96(%1)          \n\t"
286
+
287
+        "movaps   0(%1), %%xmm0           \n\t"
288
+        "shufps   $0x1b, %%xmm5, %%xmm5   \n\t"
289
+        BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)
290
+
291
+        "movaps  48(%1), %%xmm1           \n\t"
292
+        "shufps   $0x1b, %%xmm6, %%xmm6   \n\t"
293
+        BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
294
+        "movaps  %%xmm1,  48(%1)          \n\t"
295
+
296
+        "shufps   $0x1b, %%xmm4, %%xmm4   \n\t"
297
+        BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)
298
+
299
+        /* pass 4: in-register butterflies; xmm3 = smask (sign flip for lanes 2,3), xmm2 = b1 row at byte 112 */
300
+        "movaps    (%3), %%xmm3           \n\t"
301
+        "movaps 112(%2), %%xmm2           \n\t"
302
+
303
+        BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)
304
+
305
+        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
306
+        "movaps  %%xmm0, 16(%1)           \n\t"
307
+
308
+        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
309
+        "movaps  %%xmm6, 32(%1)           \n\t"
310
+
311
+        "movaps  48(%1), %%xmm0           \n\t"
312
+        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
313
+        "movaps  %%xmm0, 48(%1)           \n\t"
314
+
315
+        BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)
316
+
317
+        BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)
318
+
319
+        "movaps  96(%1), %%xmm6           \n\t"
320
+        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
321
+
322
+        "movaps 112(%1), %%xmm0           \n\t"
323
+        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
324
+
325
+        /* pass 5: xmm2 = b1 row at byte 128; shufps $0xCC rearranges the mask so the sign bit is in lanes 1 and 3 */
326
+        "movaps 128(%2), %%xmm2           \n\t"
327
+        "shufps   $0xCC, %%xmm3,%%xmm3    \n\t"
328
+
329
+        BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
330
+        "movaps  %%xmm5, (%1)             \n\t"
331
+
332
+        "movaps  16(%1), %%xmm1           \n\t"
333
+        BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
334
+        "movaps  %%xmm1, 16(%1)           \n\t"
335
+
336
+        BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
337
+        "movaps  %%xmm4, 64(%1)           \n\t"
338
+
339
+        BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
340
+        "movaps  %%xmm7, 80(%1)           \n\t"
341
+
342
+        "movaps  32(%1), %%xmm5           \n\t"
343
+        BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
344
+        "movaps  %%xmm5, 32(%1)           \n\t"
345
+
346
+        "movaps  48(%1), %%xmm4           \n\t"
347
+        BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
348
+        "movaps  %%xmm4, 48(%1)           \n\t"
349
+
350
+        BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
351
+        "movaps  %%xmm6, 96(%1)           \n\t"
352
+
353
+        BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
354
+        "movaps  %%xmm0, 112(%1)          \n\t"
355
+
356
+
357
+        /* pass 6, no SIMD: scalar addss/movss done in place on out; movl through %0 moves values that are copied without arithmetic. Loads and stores alias the same buffer, so instruction order matters. */
358
+        "movss    56(%1),  %%xmm3           \n\t"
359
+        "movl      4(%1),      %0           \n\t"
360
+        "addss    60(%1),  %%xmm3           \n\t"
361
+        "movss    72(%1),  %%xmm7           \n\t"
362
+        "addss    %%xmm3,  %%xmm4           \n\t"
363
+        "movss    52(%1),  %%xmm2           \n\t"
364
+        "addss    %%xmm3,  %%xmm2           \n\t"
365
+        "movss    24(%1),  %%xmm3           \n\t"
366
+        "addss    28(%1),  %%xmm3           \n\t"
367
+        "addss    76(%1),  %%xmm7           \n\t"
368
+        "addss    %%xmm3,  %%xmm1           \n\t"
369
+        "addss    %%xmm4,  %%xmm5           \n\t"
370
+        "movss    %%xmm1,  16(%1)           \n\t"
371
+        "movss    20(%1),  %%xmm1           \n\t"
372
+        "addss    %%xmm3,  %%xmm1           \n\t"
373
+        "movss    40(%1),  %%xmm3           \n\t"
374
+        "movss    %%xmm1,  48(%1)           \n\t"
375
+        "addss    44(%1),  %%xmm3           \n\t"
376
+        "movss    20(%1),  %%xmm1           \n\t"
377
+        "addss    %%xmm3,  %%xmm4           \n\t"
378
+        "addss    %%xmm2,  %%xmm3           \n\t"
379
+        "addss    28(%1),  %%xmm1           \n\t"
380
+        "movss    %%xmm3,  40(%1)           \n\t"
381
+        "addss    36(%1),  %%xmm2           \n\t"
382
+        "movss     8(%1),  %%xmm3           \n\t"
383
+        "movss    %%xmm2,  56(%1)           \n\t"
384
+        "addss    12(%1),  %%xmm3           \n\t"
385
+        "movss    %%xmm5,   8(%1)           \n\t"
386
+        "movss    %%xmm3,  32(%1)           \n\t"
387
+        "movss    52(%1),  %%xmm2           \n\t"
388
+        "movss    80(%1),  %%xmm3           \n\t"
389
+        "movss   120(%1),  %%xmm5           \n\t"
390
+        "movss    %%xmm1,  80(%1)           \n\t"
391
+        "movss    %%xmm4,  24(%1)           \n\t"
392
+        "addss   124(%1),  %%xmm5           \n\t"
393
+        "movss    64(%1),  %%xmm1           \n\t"
394
+        "addss    60(%1),  %%xmm2           \n\t"
395
+        "addss    %%xmm5,  %%xmm0           \n\t"
396
+        "addss   116(%1),  %%xmm5           \n\t"
397
+        "movl         %0,  64(%1)           \n\t"
398
+        "addss    %%xmm0,  %%xmm6           \n\t"
399
+        "addss    %%xmm6,  %%xmm1           \n\t"
400
+        "movl     12(%1),      %0           \n\t"
401
+        "movss    %%xmm1,   4(%1)           \n\t"
402
+        "movss    88(%1),  %%xmm1           \n\t"
403
+        "movl         %0,  96(%1)           \n\t"
404
+        "addss    92(%1),  %%xmm1           \n\t"
405
+        "movss   104(%1),  %%xmm4           \n\t"
406
+        "movl     28(%1),      %0           \n\t"
407
+        "addss   108(%1),  %%xmm4           \n\t"
408
+        "addss    %%xmm4,  %%xmm0           \n\t"
409
+        "addss    %%xmm1,  %%xmm3           \n\t"
410
+        "addss    84(%1),  %%xmm1           \n\t"
411
+        "addss    %%xmm5,  %%xmm4           \n\t"
412
+        "addss    %%xmm3,  %%xmm6           \n\t"
413
+        "addss    %%xmm0,  %%xmm3           \n\t"
414
+        "addss    %%xmm7,  %%xmm0           \n\t"
415
+        "addss   100(%1),  %%xmm5           \n\t"
416
+        "addss    %%xmm4,  %%xmm7           \n\t"
417
+        "movl         %0, 112(%1)           \n\t"
418
+        "movss    %%xmm0,  28(%1)           \n\t"
419
+        "movss    36(%1),  %%xmm0           \n\t"
420
+        "movss    %%xmm7,  36(%1)           \n\t"
421
+        "addss    %%xmm1,  %%xmm4           \n\t"
422
+        "movss   116(%1),  %%xmm7           \n\t"
423
+        "addss    %%xmm2,  %%xmm0           \n\t"
424
+        "addss   124(%1),  %%xmm7           \n\t"
425
+        "movss    %%xmm0,  72(%1)           \n\t"
426
+        "movss    44(%1),  %%xmm0           \n\t"
427
+        "movss    %%xmm6,  12(%1)           \n\t"
428
+        "movss    %%xmm3,  20(%1)           \n\t"
429
+        "addss    %%xmm0,  %%xmm2           \n\t"
430
+        "movss    %%xmm4,  44(%1)           \n\t"
431
+        "movss    %%xmm2,  88(%1)           \n\t"
432
+        "addss    60(%1),  %%xmm0           \n\t"
433
+        "movl     60(%1),      %0           \n\t"
434
+        "movl         %0, 120(%1)           \n\t"
435
+        "movss    %%xmm0, 104(%1)           \n\t"
436
+        "addss    %%xmm5,  %%xmm1           \n\t"
437
+        "addss    68(%1),  %%xmm5           \n\t"
438
+        "movss    %%xmm1,  52(%1)           \n\t"
439
+        "movss    %%xmm5,  60(%1)           \n\t"
440
+        "movss    68(%1),  %%xmm1           \n\t"
441
+        "movss   100(%1),  %%xmm5           \n\t"
442
+        "addss    %%xmm7,  %%xmm5           \n\t"
443
+        "addss   108(%1),  %%xmm7           \n\t"
444
+        "addss    %%xmm5,  %%xmm1           \n\t"
445
+        "movss    84(%1),  %%xmm2           \n\t"
446
+        "addss    92(%1),  %%xmm2           \n\t"
447
+        "addss    %%xmm2,  %%xmm5           \n\t"
448
+        "movss    %%xmm1,  68(%1)           \n\t"
449
+        "addss    %%xmm7,  %%xmm2           \n\t"
450
+        "movss    76(%1),  %%xmm1           \n\t"
451
+        "movss    %%xmm2,  84(%1)           \n\t"
452
+        "movss    %%xmm5,  76(%1)           \n\t"
453
+        "movss   108(%1),  %%xmm2           \n\t"
454
+        "addss    %%xmm1,  %%xmm7           \n\t"
455
+        "addss   124(%1),  %%xmm2           \n\t"
456
+        "addss    %%xmm2,  %%xmm1           \n\t"
457
+        "addss    92(%1),  %%xmm2           \n\t"
458
+        "movss    %%xmm1, 100(%1)           \n\t"
459
+        "movss    %%xmm2, 108(%1)           \n\t"
460
+        "movss    92(%1),  %%xmm2           \n\t"
461
+        "movss    %%xmm7,  92(%1)           \n\t"
462
+        "addss   124(%1),  %%xmm2           \n\t"
463
+        "movss    %%xmm2, 116(%1)           \n\t"
464
+        :"+&r"(tmp1) /* %0: read-write, early-clobber integer scratch */
465
+        :"r"(out), "r"(b1), "r"(smask), "r"(in) /* %1 = out, %2 = b1, %3 = smask, %4 = in */
466
+        :"memory" /* NOTE(review): xmm0-xmm7 are all modified above but none is listed as a clobber - confirm this is safe for every supported compiler/ABI */
467
+        );
468
+}