Browse code

Convert the vector_fmul family of functions to YASM and add AVX versions

Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>

Kieran Kunhya authored on 2012/04/21 04:49:30
Showing 10 changed files
... ...
@@ -267,8 +267,8 @@ static const int8_t sbr_offset[6][16] = {
267 267
 };
268 268
 
269 269
 ///< window coefficients for analysis/synthesis QMF banks
270
-static DECLARE_ALIGNED(16, float, sbr_qmf_window_ds)[320];
271
-static DECLARE_ALIGNED(16, float, sbr_qmf_window_us)[640] = {
270
+static DECLARE_ALIGNED(32, float, sbr_qmf_window_ds)[320];
271
+static DECLARE_ALIGNED(32, float, sbr_qmf_window_us)[640] = {
272 272
      0.0000000000, -0.0005525286, -0.0005617692, -0.0004947518,
273 273
     -0.0004875227, -0.0004893791, -0.0005040714, -0.0005226564,
274 274
     -0.0005466565, -0.0005677802, -0.0005870930, -0.0006132747,
... ...
@@ -33,8 +33,8 @@
33 33
 
34 34
 #include <stdint.h>
35 35
 
36
-DECLARE_ALIGNED(16, float,  ff_aac_kbd_long_1024)[1024];
37
-DECLARE_ALIGNED(16, float,  ff_aac_kbd_short_128)[128];
36
+DECLARE_ALIGNED(32, float,  ff_aac_kbd_long_1024)[1024];
37
+DECLARE_ALIGNED(32, float,  ff_aac_kbd_short_128)[128];
38 38
 
39 39
 const uint8_t ff_aac_num_swb_1024[] = {
40 40
     41, 41, 47, 49, 49, 51, 47, 47, 43, 43, 43, 40, 40
... ...
@@ -44,8 +44,8 @@
44 44
 /* @name window coefficients
45 45
  * @{
46 46
  */
47
-DECLARE_ALIGNED(16, extern float,  ff_aac_kbd_long_1024)[1024];
48
-DECLARE_ALIGNED(16, extern float,  ff_aac_kbd_short_128)[128];
47
+DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_long_1024)[1024];
48
+DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_short_128)[128];
49 49
 // @}
50 50
 
51 51
 /* @name number of scalefactor window bands for long and short transform windows respectively
... ...
@@ -398,7 +398,7 @@ typedef struct DSPContext {
398 398
     /* assume len is a multiple of 4, and arrays are 16-byte aligned */
399 399
     void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
400 400
     void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
401
-    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
401
+    /* assume len is a multiple of 16, and arrays are 32-byte aligned */
402 402
     void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len);
403 403
     void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
404 404
     /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
... ...
@@ -38,8 +38,8 @@
38 38
 typedef struct {
39 39
     AVFrame frame;
40 40
     DSPContext dsp;
41
-    DECLARE_ALIGNED(16, float,   sp_lpc)[FFALIGN(36, 8)];   ///< LPC coefficients for speech data (spec: A)
42
-    DECLARE_ALIGNED(16, float, gain_lpc)[FFALIGN(10, 8)];   ///< LPC coefficients for gain        (spec: GB)
41
+    DECLARE_ALIGNED(32, float,   sp_lpc)[FFALIGN(36, 16)];   ///< LPC coefficients for speech data (spec: A)
42
+    DECLARE_ALIGNED(32, float, gain_lpc)[FFALIGN(10, 16)];   ///< LPC coefficients for gain        (spec: GB)
43 43
 
44 44
     /** speech data history                                      (spec: SB).
45 45
      *  Its first 70 coefficients are updated only at backward filtering.
... ...
@@ -133,11 +133,11 @@ static void do_hybrid_window(RA288Context *ractx,
133 133
     int i;
134 134
     float buffer1[MAX_BACKWARD_FILTER_ORDER + 1];
135 135
     float buffer2[MAX_BACKWARD_FILTER_ORDER + 1];
136
-    LOCAL_ALIGNED_16(float, work, [FFALIGN(MAX_BACKWARD_FILTER_ORDER +
137
-                                           MAX_BACKWARD_FILTER_LEN   +
138
-                                           MAX_BACKWARD_FILTER_NONREC, 8)]);
136
+    LOCAL_ALIGNED(32, float, work, [FFALIGN(MAX_BACKWARD_FILTER_ORDER +
137
+                                            MAX_BACKWARD_FILTER_LEN   +
138
+                                            MAX_BACKWARD_FILTER_NONREC, 16)]);
139 139
 
140
-    ractx->dsp.vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 8));
140
+    ractx->dsp.vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));
141 141
 
142 142
     convolve(buffer1, work + order    , n      , order);
143 143
     convolve(buffer2, work + order + n, non_rec, order);
... ...
@@ -164,7 +164,7 @@ static void backward_filter(RA288Context *ractx,
164 164
     do_hybrid_window(ractx, order, n, non_rec, temp, hist, rec, window);
165 165
 
166 166
     if (!compute_lpc_coefs(temp, order, lpc, 0, 1, 1))
167
-        ractx->dsp.vector_fmul(lpc, lpc, tab, FFALIGN(order, 8));
167
+        ractx->dsp.vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));
168 168
 
169 169
     memmove(hist, hist + n, move_size*sizeof(*hist));
170 170
 }
... ...
@@ -97,7 +97,7 @@ static const int16_t codetable[128][5]={
97 97
     {  3746,  -606,    53,  -269, -3301}, {   606,  2018, -1316,  4064,   398}
98 98
 };
99 99
 
100
-DECLARE_ALIGNED(16, static const float, syn_window)[FFALIGN(111, 8)]={
100
+DECLARE_ALIGNED(32, static const float, syn_window)[FFALIGN(111, 16)]={
101 101
   0.576690972, 0.580838025, 0.585013986, 0.589219987, 0.59345597,  0.597723007,
102 102
   0.602020264, 0.606384277, 0.610748291, 0.615142822, 0.619598389, 0.624084473,
103 103
   0.628570557, 0.633117676, 0.637695313, 0.642272949, 0.646911621, 0.651580811,
... ...
@@ -119,7 +119,7 @@ DECLARE_ALIGNED(16, static const float, syn_window)[FFALIGN(111, 8)]={
119 119
   0.142852783, 0.0954284668,0.0477600098
120 120
 };
121 121
 
122
-DECLARE_ALIGNED(16, static const float, gain_window)[FFALIGN(38, 8)]={
122
+DECLARE_ALIGNED(32, static const float, gain_window)[FFALIGN(38, 16)]={
123 123
   0.505699992, 0.524200022, 0.54339999,  0.563300014, 0.583953857, 0.60534668,
124 124
   0.627502441, 0.650482178, 0.674316406, 0.699005127, 0.724578857, 0.75112915,
125 125
   0.778625488, 0.807128906, 0.836669922, 0.86730957,  0.899078369, 0.932006836,
... ...
@@ -130,7 +130,7 @@ DECLARE_ALIGNED(16, static const float, gain_window)[FFALIGN(38, 8)]={
130 130
 };
131 131
 
132 132
 /** synthesis bandwidth broadening table */
133
-DECLARE_ALIGNED(16, static const float, syn_bw_tab)[FFALIGN(36, 8)] = {
133
+DECLARE_ALIGNED(32, static const float, syn_bw_tab)[FFALIGN(36, 16)] = {
134 134
   0.98828125,  0.976699829, 0.965254128, 0.953942537, 0.942763507, 0.931715488,
135 135
   0.920796931, 0.910006344, 0.899342179, 0.888803005, 0.878387332, 0.868093729,
136 136
   0.857920766, 0.847867012, 0.837931097, 0.828111589, 0.818407178, 0.808816493,
... ...
@@ -140,7 +140,7 @@ DECLARE_ALIGNED(16, static const float, syn_bw_tab)[FFALIGN(36, 8)] = {
140 140
 };
141 141
 
142 142
 /** gain bandwidth broadening table */
143
-DECLARE_ALIGNED(16, static const float, gain_bw_tab)[FFALIGN(10, 8)] = {
143
+DECLARE_ALIGNED(32, static const float, gain_bw_tab)[FFALIGN(10, 16)] = {
144 144
   0.90625,     0.821289063, 0.74432373,  0.674499512, 0.61126709,
145 145
   0.553955078, 0.50201416,  0.454956055, 0.41229248,  0.373657227
146 146
 };
... ...
@@ -78,8 +78,8 @@ typedef struct {
78 78
      * @name State variables
79 79
      * @{
80 80
      */
81
-    DECLARE_ALIGNED(16, float, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
82
-    DECLARE_ALIGNED(16, float, analysis_filterbank_samples) [1312];
81
+    DECLARE_ALIGNED(32, float, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
82
+    DECLARE_ALIGNED(32, float, analysis_filterbank_samples) [1312];
83 83
     int                synthesis_filterbank_samples_offset;
84 84
     ///l_APrev and l_A
85 85
     int                e_a[2];
... ...
@@ -31,7 +31,7 @@
31 31
 #endif
32 32
 
33 33
 #define SINETABLE(size) \
34
-    SINETABLE_CONST DECLARE_ALIGNED(16, float, ff_sine_##size)[size]
34
+    SINETABLE_CONST DECLARE_ALIGNED(32, float, ff_sine_##size)[size]
35 35
 
36 36
 /**
37 37
  * Generate a sine window.
... ...
@@ -2348,135 +2348,6 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
2348 2348
     }
2349 2349
 }
2350 2350
 
2351
-static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1,
2352
-                              int len)
2353
-{
2354
-    x86_reg i = (len - 4) * 4;
2355
-    __asm__ volatile (
2356
-        "1:                             \n\t"
2357
-        "movq    (%2, %0), %%mm0        \n\t"
2358
-        "movq   8(%2, %0), %%mm1        \n\t"
2359
-        "pfmul   (%3, %0), %%mm0        \n\t"
2360
-        "pfmul  8(%3, %0), %%mm1        \n\t"
2361
-        "movq       %%mm0,  (%1, %0)    \n\t"
2362
-        "movq       %%mm1, 8(%1, %0)    \n\t"
2363
-        "sub          $16, %0           \n\t"
2364
-        "jge           1b               \n\t"
2365
-        "femms                          \n\t"
2366
-        : "+r"(i)
2367
-        : "r"(dst), "r"(src0), "r"(src1)
2368
-        : "memory"
2369
-    );
2370
-}
2371
-
2372
-static void vector_fmul_sse(float *dst, const float *src0, const float *src1,
2373
-                            int len)
2374
-{
2375
-    x86_reg i = (len - 8) * 4;
2376
-    __asm__ volatile (
2377
-        "1:                             \n\t"
2378
-        "movaps    (%2, %0), %%xmm0     \n\t"
2379
-        "movaps  16(%2, %0), %%xmm1     \n\t"
2380
-        "mulps     (%3, %0), %%xmm0     \n\t"
2381
-        "mulps   16(%3, %0), %%xmm1     \n\t"
2382
-        "movaps      %%xmm0,   (%1, %0) \n\t"
2383
-        "movaps      %%xmm1, 16(%1, %0) \n\t"
2384
-        "sub            $32, %0         \n\t"
2385
-        "jge             1b             \n\t"
2386
-        : "+r"(i)
2387
-        : "r"(dst), "r"(src0), "r"(src1)
2388
-        : "memory"
2389
-    );
2390
-}
2391
-
2392
-static void vector_fmul_reverse_3dnow2(float *dst, const float *src0,
2393
-                                       const float *src1, int len)
2394
-{
2395
-    x86_reg i = len * 4 - 16;
2396
-    __asm__ volatile (
2397
-        "1:                             \n\t"
2398
-        "pswapd     8(%1), %%mm0        \n\t"
2399
-        "pswapd      (%1), %%mm1        \n\t"
2400
-        "pfmul   (%3, %0), %%mm0        \n\t"
2401
-        "pfmul  8(%3, %0), %%mm1        \n\t"
2402
-        "movq       %%mm0,  (%2, %0)    \n\t"
2403
-        "movq       %%mm1, 8(%2, %0)    \n\t"
2404
-        "add          $16, %1           \n\t"
2405
-        "sub          $16, %0           \n\t"
2406
-        "jge           1b               \n\t"
2407
-        : "+r"(i), "+r"(src1)
2408
-        : "r"(dst), "r"(src0)
2409
-    );
2410
-    __asm__ volatile ("femms");
2411
-}
2412
-
2413
-static void vector_fmul_reverse_sse(float *dst, const float *src0,
2414
-                                    const float *src1, int len)
2415
-{
2416
-    x86_reg i = len * 4 - 32;
2417
-    __asm__ volatile (
2418
-        "1:                                 \n\t"
2419
-        "movaps         16(%1), %%xmm0      \n\t"
2420
-        "movaps           (%1), %%xmm1      \n\t"
2421
-        "shufps  $0x1b, %%xmm0, %%xmm0      \n\t"
2422
-        "shufps  $0x1b, %%xmm1, %%xmm1      \n\t"
2423
-        "mulps        (%3, %0), %%xmm0      \n\t"
2424
-        "mulps      16(%3, %0), %%xmm1      \n\t"
2425
-        "movaps         %%xmm0,   (%2, %0)  \n\t"
2426
-        "movaps         %%xmm1, 16(%2, %0)  \n\t"
2427
-        "add               $32, %1          \n\t"
2428
-        "sub               $32, %0          \n\t"
2429
-        "jge                1b              \n\t"
2430
-        : "+r"(i), "+r"(src1)
2431
-        : "r"(dst), "r"(src0)
2432
-    );
2433
-}
2434
-
2435
-static void vector_fmul_add_3dnow(float *dst, const float *src0,
2436
-                                  const float *src1, const float *src2, int len)
2437
-{
2438
-    x86_reg i = (len - 4) * 4;
2439
-    __asm__ volatile (
2440
-        "1:                             \n\t"
2441
-        "movq   (%2, %0), %%mm0         \n\t"
2442
-        "movq  8(%2, %0), %%mm1         \n\t"
2443
-        "pfmul  (%3, %0), %%mm0         \n\t"
2444
-        "pfmul 8(%3, %0), %%mm1         \n\t"
2445
-        "pfadd  (%4, %0), %%mm0         \n\t"
2446
-        "pfadd 8(%4, %0), %%mm1         \n\t"
2447
-        "movq      %%mm0,  (%1, %0)     \n\t"
2448
-        "movq      %%mm1, 8(%1, %0)     \n\t"
2449
-        "sub         $16, %0            \n\t"
2450
-        "jge          1b                \n\t"
2451
-        : "+r"(i)
2452
-        : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2453
-        : "memory"
2454
-    );
2455
-    __asm__ volatile ("femms");
2456
-}
2457
-
2458
-static void vector_fmul_add_sse(float *dst, const float *src0,
2459
-                                const float *src1, const float *src2, int len)
2460
-{
2461
-    x86_reg i = (len - 8) * 4;
2462
-    __asm__ volatile (
2463
-        "1:                             \n\t"
2464
-        "movaps   (%2, %0), %%xmm0      \n\t"
2465
-        "movaps 16(%2, %0), %%xmm1      \n\t"
2466
-        "mulps    (%3, %0), %%xmm0      \n\t"
2467
-        "mulps  16(%3, %0), %%xmm1      \n\t"
2468
-        "addps    (%4, %0), %%xmm0      \n\t"
2469
-        "addps  16(%4, %0), %%xmm1      \n\t"
2470
-        "movaps     %%xmm0,   (%1, %0)  \n\t"
2471
-        "movaps     %%xmm1, 16(%1, %0)  \n\t"
2472
-        "sub           $32, %0          \n\t"
2473
-        "jge            1b              \n\t"
2474
-        : "+r"(i)
2475
-        : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2476
-        : "memory"
2477
-    );
2478
-}
2479
-
2480 2351
 #if HAVE_6REGS
2481 2352
 static void vector_fmul_window_3dnow2(float *dst, const float *src0,
2482 2353
                                       const float *src1, const float *win,
... ...
@@ -2631,6 +2502,21 @@ int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2631 2631
 
2632 2632
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2633 2633
 
2634
+void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
2635
+                        int len);
2636
+void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
2637
+                        int len);
2638
+
2639
+void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
2640
+                                const float *src1, int len);
2641
+void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
2642
+                                const float *src1, int len);
2643
+
2644
+void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2645
+                            const float *src2, int len);
2646
+void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
2647
+                            const float *src2, int len);
2648
+
2634 2649
 void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
2635 2650
                                    int32_t min, int32_t max, unsigned int len);
2636 2651
 void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
... ...
@@ -2918,8 +2804,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2918 2918
 #endif
2919 2919
 
2920 2920
     c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2921
-    c->vector_fmul             = vector_fmul_3dnow;
2922
-    c->vector_fmul_add         = vector_fmul_add_3dnow;
2923 2921
 
2924 2922
 #if HAVE_7REGS
2925 2923
     c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
... ...
@@ -2929,7 +2813,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2929 2929
 static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
2930 2930
                                 int mm_flags)
2931 2931
 {
2932
-    c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2933 2932
 #if HAVE_6REGS
2934 2933
     c->vector_fmul_window  = vector_fmul_window_3dnow2;
2935 2934
 #endif
... ...
@@ -2949,11 +2832,11 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2949 2949
 
2950 2950
     c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2951 2951
     c->ac3_downmix             = ac3_downmix_sse;
2952
-    c->vector_fmul             = vector_fmul_sse;
2953
-    c->vector_fmul_reverse     = vector_fmul_reverse_sse;
2954
-
2955
-    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2956
-        c->vector_fmul_add = vector_fmul_add_sse;
2952
+#if HAVE_YASM
2953
+    c->vector_fmul         = ff_vector_fmul_sse;
2954
+    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
2955
+    c->vector_fmul_add     = ff_vector_fmul_add_sse;
2956
+#endif
2957 2957
 
2958 2958
 #if HAVE_6REGS
2959 2959
     c->vector_fmul_window = vector_fmul_window_sse;
... ...
@@ -3112,6 +2995,9 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
3112 3112
         }
3113 3113
     }
3114 3114
     c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
3115
+    c->vector_fmul = ff_vector_fmul_avx;
3116
+    c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
3117
+    c->vector_fmul_add = ff_vector_fmul_add_avx;
3115 3118
 #endif
3116 3119
 }
3117 3120
 
... ...
@@ -1130,6 +1130,111 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
1130 1130
 %endif
1131 1131
 
1132 1132
 ;-----------------------------------------------------------------------------
1133
+; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
1134
+;-----------------------------------------------------------------------------
1135
+%macro VECTOR_FMUL 0
1136
+cglobal vector_fmul, 4,4,2, dst, src0, src1, len
1137
+    lea       lenq, [lend*4 - 2*mmsize]
1138
+ALIGN 16
1139
+.loop
1140
+    mova      m0,   [src0q + lenq]
1141
+    mova      m1,   [src0q + lenq + mmsize]
1142
+    mulps     m0, m0, [src1q + lenq]
1143
+    mulps     m1, m1, [src1q + lenq + mmsize]
1144
+    mova      [dstq + lenq], m0
1145
+    mova      [dstq + lenq + mmsize], m1
1146
+
1147
+    sub       lenq, 2*mmsize
1148
+    jge       .loop
1149
+%if mmsize == 32
1150
+    vzeroupper
1151
+    RET
1152
+%else
1153
+    REP_RET
1154
+%endif
1155
+%endmacro
1156
+
1157
+INIT_XMM sse
1158
+VECTOR_FMUL
1159
+INIT_YMM avx
1160
+VECTOR_FMUL
1161
+
1162
+;-----------------------------------------------------------------------------
1163
+; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
1164
+;                          int len)
1165
+;-----------------------------------------------------------------------------
1166
+%macro VECTOR_FMUL_REVERSE 0
1167
+cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
1168
+    lea       lenq, [lend*4 - 2*mmsize]
1169
+ALIGN 16
1170
+.loop
1171
+%if cpuflag(avx)
1172
+    vmovaps     xmm0, [src1q + 16]
1173
+    vinsertf128 m0, m0, [src1q], 1
1174
+    vshufps     m0, m0, m0, q0123
1175
+    vmovaps     xmm1, [src1q + mmsize + 16]
1176
+    vinsertf128 m1, m1, [src1q + mmsize], 1
1177
+    vshufps     m1, m1, m1, q0123
1178
+%else
1179
+    mova    m0, [src1q]
1180
+    mova    m1, [src1q + mmsize]
1181
+    shufps  m0, m0, q0123
1182
+    shufps  m1, m1, q0123
1183
+%endif
1184
+    mulps   m0, m0, [src0q + lenq + mmsize]
1185
+    mulps   m1, m1, [src0q + lenq]
1186
+    mova    [dstq + lenq + mmsize], m0
1187
+    mova    [dstq + lenq], m1
1188
+    add     src1q, 2*mmsize
1189
+    sub     lenq,  2*mmsize
1190
+    jge     .loop
1191
+%if mmsize == 32
1192
+    vzeroupper
1193
+    RET
1194
+%else
1195
+    REP_RET
1196
+%endif
1197
+%endmacro
1198
+
1199
+INIT_XMM sse
1200
+VECTOR_FMUL_REVERSE
1201
+INIT_YMM avx
1202
+VECTOR_FMUL_REVERSE
1203
+
1204
+;-----------------------------------------------------------------------------
1205
+; void vector_fmul_add(float *dst, const float *src0, const float *src1,
1206
+;                 const float *src2, int len)
1207
+;-----------------------------------------------------------------------------
1208
+%macro VECTOR_FMUL_ADD 0
1209
+cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
1210
+    lea       lenq, [lend*4 - 2*mmsize]
1211
+ALIGN 16
1212
+.loop
1213
+    mova    m0,   [src0q + lenq]
1214
+    mova    m1,   [src0q + lenq + mmsize]
1215
+    mulps   m0, m0, [src1q + lenq]
1216
+    mulps   m1, m1, [src1q + lenq + mmsize]
1217
+    addps   m0, m0, [src2q + lenq]
1218
+    addps   m1, m1, [src2q + lenq + mmsize]
1219
+    mova    [dstq + lenq], m0
1220
+    mova    [dstq + lenq + mmsize], m1
1221
+
1222
+    sub     lenq,   2*mmsize
1223
+    jge     .loop
1224
+%if mmsize == 32
1225
+    vzeroupper
1226
+    RET
1227
+%else
1228
+    REP_RET
1229
+%endif
1230
+%endmacro
1231
+
1232
+INIT_XMM sse
1233
+VECTOR_FMUL_ADD
1234
+INIT_YMM avx
1235
+VECTOR_FMUL_ADD
1236
+
1237
+;-----------------------------------------------------------------------------
1133 1238
 ; void ff_butterflies_float_interleave(float *dst, const float *src0,
1134 1239
 ;                                      const float *src1, int len);
1135 1240
 ;-----------------------------------------------------------------------------