Browse code

Merge commit 'b57e38f52cc3f31a27105c28887d57cd6812c3eb'

* commit 'b57e38f52cc3f31a27105c28887d57cd6812c3eb':
ac3dsp: x86: Replace inline asm for in-decoder downmixing with standalone asm

Merged-by: Clément Bœsch <u@pkh.me>

Clément Bœsch authored on 2017/03/22 20:44:49
Showing 6 changed files
... ...
@@ -1430,19 +1430,19 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
1430 1430
             ac3_downmix_c_fixed16(s->outptr, s->downmix_coeffs,
1431 1431
                               s->out_channels, s->fbw_channels, 256);
1432 1432
 #else
1433
-            s->ac3dsp.downmix(s->outptr, s->downmix_coeffs,
1433
+            ff_ac3dsp_downmix(&s->ac3dsp, s->outptr, s->downmix_coeffs,
1434 1434
                               s->out_channels, s->fbw_channels, 256);
1435 1435
 #endif
1436 1436
         }
1437 1437
     } else {
1438 1438
         if (downmix_output) {
1439
-            s->ac3dsp.AC3_RENAME(downmix)(s->xcfptr + 1, s->downmix_coeffs,
1439
+            AC3_RENAME(ff_ac3dsp_downmix)(&s->ac3dsp, s->xcfptr + 1, s->downmix_coeffs,
1440 1440
                                           s->out_channels, s->fbw_channels, 256);
1441 1441
         }
1442 1442
 
1443 1443
         if (downmix_output && !s->downmixed) {
1444 1444
             s->downmixed = 1;
1445
-            s->ac3dsp.AC3_RENAME(downmix)(s->dlyptr, s->downmix_coeffs,
1445
+            AC3_RENAME(ff_ac3dsp_downmix)(&s->ac3dsp, s->dlyptr, s->downmix_coeffs,
1446 1446
                                           s->out_channels, s->fbw_channels, 128);
1447 1447
         }
1448 1448
 
... ...
@@ -213,49 +213,53 @@ static void ac3_sum_square_butterfly_float_c(float sum[4],
213 213
     }
214 214
 }
215 215
 
216
-static void ac3_downmix_c(float **samples, float **matrix,
217
-                          int out_ch, int in_ch, int len)
216
+static void ac3_downmix_5_to_2_symmetric_c(float **samples, float **matrix,
217
+                                           int len)
218 218
 {
219
-    int **matrix_cmp = (int **)matrix;
220
-    int i, j;
219
+    int i;
221 220
     float v0, v1;
221
+    float front_mix    = matrix[0][0];
222
+    float center_mix   = matrix[0][1];
223
+    float surround_mix = matrix[0][3];
222 224
 
223
-    if (in_ch == 5 && out_ch == 2 &&
224
-        !(matrix_cmp[1][0] | matrix_cmp[0][2]   |
225
-          matrix_cmp[1][3] | matrix_cmp[0][4]   |
226
-         (matrix_cmp[0][1] ^ matrix_cmp[1][1]) |
227
-         (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) {
228
-        float front_mix    = matrix[0][0];
229
-        float center_mix   = matrix[0][1];
230
-        float surround_mix = matrix[0][3];
225
+    for (i = 0; i < len; i++) {
226
+        v0 = samples[0][i] * front_mix  +
227
+             samples[1][i] * center_mix +
228
+             samples[3][i] * surround_mix;
231 229
 
232
-        for (i = 0; i < len; i++) {
233
-            v0 = samples[0][i] * front_mix  +
234
-                 samples[1][i] * center_mix +
235
-                 samples[3][i] * surround_mix;
230
+        v1 = samples[1][i] * center_mix +
231
+             samples[2][i] * front_mix  +
232
+             samples[4][i] * surround_mix;
236 233
 
237
-            v1 = samples[1][i] * center_mix +
238
-                 samples[2][i] * front_mix  +
239
-                 samples[4][i] * surround_mix;
234
+        samples[0][i] = v0;
235
+        samples[1][i] = v1;
236
+    }
237
+}
240 238
 
241
-            samples[0][i] = v0;
242
-            samples[1][i] = v1;
243
-        }
244
-    } else if (in_ch == 5 && out_ch == 1 &&
245
-               matrix_cmp[0][0] == matrix_cmp[0][2] &&
246
-               matrix_cmp[0][3] == matrix_cmp[0][4]) {
247
-        float front_mix    = matrix[0][0];
248
-        float center_mix   = matrix[0][1];
249
-        float surround_mix = matrix[0][3];
239
+static void ac3_downmix_5_to_1_symmetric_c(float **samples, float **matrix,
240
+                                           int len)
241
+{
242
+    int i;
243
+    float front_mix    = matrix[0][0];
244
+    float center_mix   = matrix[0][1];
245
+    float surround_mix = matrix[0][3];
250 246
 
251
-        for (i = 0; i < len; i++) {
252
-            samples[0][i] = samples[0][i] * front_mix    +
253
-                            samples[1][i] * center_mix   +
254
-                            samples[2][i] * front_mix    +
255
-                            samples[3][i] * surround_mix +
256
-                            samples[4][i] * surround_mix;
257
-        }
258
-    } else if (out_ch == 2) {
247
+    for (i = 0; i < len; i++) {
248
+        samples[0][i] = samples[0][i] * front_mix    +
249
+                        samples[1][i] * center_mix   +
250
+                        samples[2][i] * front_mix    +
251
+                        samples[3][i] * surround_mix +
252
+                        samples[4][i] * surround_mix;
253
+    }
254
+}
255
+
256
+static void ac3_downmix_c(float **samples, float **matrix,
257
+                          int out_ch, int in_ch, int len)
258
+{
259
+    int i, j;
260
+    float v0, v1;
261
+
262
+    if (out_ch == 2) {
259 263
         for (i = 0; i < len; i++) {
260 264
             v0 = v1 = 0.0f;
261 265
             for (j = 0; j < in_ch; j++) {
... ...
@@ -300,6 +304,15 @@ static void ac3_downmix_c_fixed(int32_t **samples, int16_t **matrix,
300 300
     }
301 301
 }
302 302
 
303
+void ff_ac3dsp_downmix_fixed(AC3DSPContext *c, int32_t **samples, int16_t **matrix,
304
+                             int out_ch, int in_ch, int len)
305
+{
306
+    if (c->downmix_fixed)
307
+        c->downmix_fixed(samples, matrix, len);
308
+    else
309
+        ac3_downmix_c_fixed(samples, matrix, out_ch, in_ch, len);
310
+}
311
+
303 312
 static void apply_window_int16_c(int16_t *output, const int16_t *input,
304 313
                                  const int16_t *window, unsigned int len)
305 314
 {
... ...
@@ -313,6 +326,38 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
313 313
     }
314 314
 }
315 315
 
316
+void ff_ac3dsp_downmix(AC3DSPContext *c, float **samples, float **matrix,
317
+                       int out_ch, int in_ch, int len)
318
+{
319
+    if (c->in_channels != in_ch || c->out_channels != out_ch) {
320
+        int **matrix_cmp = (int **)matrix;
321
+
322
+        c->in_channels  = in_ch;
323
+        c->out_channels = out_ch;
324
+        c->downmix      = NULL;
325
+
326
+        if (in_ch == 5 && out_ch == 2 &&
327
+            !(matrix_cmp[1][0] | matrix_cmp[0][2]   |
328
+              matrix_cmp[1][3] | matrix_cmp[0][4]   |
329
+             (matrix_cmp[0][1] ^ matrix_cmp[1][1]) |
330
+             (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) {
331
+            c->downmix = ac3_downmix_5_to_2_symmetric_c;
332
+        } else if (in_ch == 5 && out_ch == 1 &&
333
+                   matrix_cmp[0][0] == matrix_cmp[0][2] &&
334
+                   matrix_cmp[0][3] == matrix_cmp[0][4]) {
335
+            c->downmix = ac3_downmix_5_to_1_symmetric_c;
336
+        }
337
+
338
+        if (ARCH_X86)
339
+            ff_ac3dsp_set_downmix_x86(c);
340
+    }
341
+
342
+    if (c->downmix)
343
+        c->downmix(samples, matrix, len);
344
+    else
345
+        ac3_downmix_c(samples, matrix, out_ch, in_ch, len);
346
+}
347
+
316 348
 av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
317 349
 {
318 350
     c->ac3_exponent_min = ac3_exponent_min_c;
... ...
@@ -326,8 +371,10 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
326 326
     c->extract_exponents = ac3_extract_exponents_c;
327 327
     c->sum_square_butterfly_int32 = ac3_sum_square_butterfly_int32_c;
328 328
     c->sum_square_butterfly_float = ac3_sum_square_butterfly_float_c;
329
-    c->downmix = ac3_downmix_c;
330
-    c->downmix_fixed = ac3_downmix_c_fixed;
329
+    c->in_channels           = 0;
330
+    c->out_channels          = 0;
331
+    c->downmix               = NULL;
332
+    c->downmix_fixed         = NULL;
331 333
     c->apply_window_int16 = apply_window_int16_c;
332 334
 
333 335
     if (ARCH_ARM)
... ...
@@ -132,11 +132,10 @@ typedef struct AC3DSPContext {
132 132
     void (*sum_square_butterfly_float)(float sum[4], const float *coef0,
133 133
                                        const float *coef1, int len);
134 134
 
135
-    void (*downmix)(float **samples, float **matrix, int out_ch,
136
-                    int in_ch, int len);
137
-
138
-    void (*downmix_fixed)(int32_t **samples, int16_t **matrix, int out_ch,
139
-                          int in_ch, int len);
135
+    int out_channels;
136
+    int in_channels;
137
+    void (*downmix)(float **samples, float **matrix, int len);
138
+    void (*downmix_fixed)(int32_t **samples, int16_t **matrix, int len);
140 139
 
141 140
     /**
142 141
      * Apply symmetric window in 16-bit fixed-point.
... ...
@@ -158,4 +157,11 @@ void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact);
158 158
 void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
159 159
 void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact);
160 160
 
161
+void ff_ac3dsp_downmix(AC3DSPContext *c, float **samples, float **matrix,
162
+                       int out_ch, int in_ch, int len);
163
+void ff_ac3dsp_downmix_fixed(AC3DSPContext *c, int32_t **samples, int16_t **matrix,
164
+                             int out_ch, int in_ch, int len);
165
+
166
+void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c);
167
+
161 168
 #endif /* AVCODEC_AC3DSP_H */
... ...
@@ -87,7 +87,8 @@ MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
87 87
 MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
88 88
 
89 89
 # subsystems
90
-YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
90
+YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o                  \
91
+                                          x86/ac3dsp_downmix.o
91 92
 YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
92 93
 YASM-OBJS-$(CONFIG_BLOCKDSP)           += x86/blockdsp.o
93 94
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
94 95
new file mode 100644
... ...
@@ -0,0 +1,187 @@
0
+;*****************************************************************************
1
+;* x86-optimized AC-3 downmixing
2
+;* Copyright (c) 2012 Justin Ruggles
3
+;*
4
+;* This file is part of FFmpeg.
5
+;*
6
+;* FFmpeg is free software; you can redistribute it and/or
7
+;* modify it under the terms of the GNU Lesser General Public
8
+;* License as published by the Free Software Foundation; either
9
+;* version 2.1 of the License, or (at your option) any later version.
10
+;*
11
+;* FFmpeg is distributed in the hope that it will be useful,
12
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+;* Lesser General Public License for more details.
15
+;*
16
+;* You should have received a copy of the GNU Lesser General Public
17
+;* License along with FFmpeg; if not, write to the Free Software
18
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+;******************************************************************************
20
+
21
+;******************************************************************************
22
+;* This is based on the channel mixing asm in libavresample, but it is
23
+;* simplified to support only float coefficients and only 3 to 6 input channels.
24
+;******************************************************************************
25
+
26
+%include "libavutil/x86/x86util.asm"
27
+
28
+SECTION .text
29
+
30
+;-----------------------------------------------------------------------------
31
+; functions to downmix 3 to 6 input channels to mono or stereo
32
+; void ff_ac3_downmix_*(float **samples, float **matrix, int len);
33
+;-----------------------------------------------------------------------------
34
+
35
+%macro AC3_DOWNMIX 2 ; %1 = in channels, %2 = out channels
36
+; define some names to make the code clearer
37
+%assign  in_channels %1
38
+%assign out_channels %2
39
+%assign stereo out_channels - 1
40
+
41
+; determine how many matrix elements must go on the stack vs. mmregs
42
+%assign matrix_elements in_channels * out_channels
43
+%if stereo
44
+    %assign needed_mmregs 4
45
+%else
46
+    %assign needed_mmregs 3
47
+%endif
48
+%assign matrix_elements_mm num_mmregs - needed_mmregs
49
+%if matrix_elements < matrix_elements_mm
50
+    %assign matrix_elements_mm matrix_elements
51
+%endif
52
+%assign total_mmregs needed_mmregs+matrix_elements_mm
53
+%if matrix_elements_mm < matrix_elements
54
+    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
55
+%else
56
+    %assign matrix_elements_stack 0
57
+%endif
58
+
59
+cglobal ac3_downmix_%1_to_%2, 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, len, src2, src3, src4, src5
60
+
61
+; load matrix pointers
62
+%define matrix0q r1q
63
+%define matrix1q r3q
64
+%if stereo
65
+    mov      matrix1q, [matrix0q+gprsize]
66
+%endif
67
+    mov      matrix0q, [matrix0q]
68
+
69
+; define matrix coeff names
70
+%assign %%i 0
71
+%assign %%j needed_mmregs
72
+%rep in_channels
73
+    %if %%i >= matrix_elements_mm
74
+        CAT_XDEFINE mx_stack_0_, %%i, 1
75
+        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
76
+    %else
77
+        CAT_XDEFINE mx_stack_0_, %%i, 0
78
+        CAT_XDEFINE mx_0_, %%i, m %+ %%j
79
+        %assign %%j %%j+1
80
+    %endif
81
+    %assign %%i %%i+1
82
+%endrep
83
+%if stereo
84
+%assign %%i 0
85
+%rep in_channels
86
+    %if in_channels + %%i >= matrix_elements_mm
87
+        CAT_XDEFINE mx_stack_1_, %%i, 1
88
+        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
89
+    %else
90
+        CAT_XDEFINE mx_stack_1_, %%i, 0
91
+        CAT_XDEFINE mx_1_, %%i, m %+ %%j
92
+        %assign %%j %%j+1
93
+    %endif
94
+    %assign %%i %%i+1
95
+%endrep
96
+%endif
97
+
98
+; load/splat matrix coeffs
99
+%assign %%i 0
100
+%rep in_channels
101
+    %if mx_stack_0_ %+ %%i
102
+        VBROADCASTSS m0, [matrix0q+4*%%i]
103
+        mova  mx_0_ %+ %%i, m0
104
+    %else
105
+        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
106
+    %endif
107
+    %if stereo
108
+    %if mx_stack_1_ %+ %%i
109
+        VBROADCASTSS m0, [matrix1q+4*%%i]
110
+        mova  mx_1_ %+ %%i, m0
111
+    %else
112
+        VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
113
+    %endif
114
+    %endif
115
+    %assign %%i %%i+1
116
+%endrep
117
+
118
+    lea          lenq, [4*r2d]
119
+    ; load channel pointers to registers
120
+%assign %%i 1
121
+%rep (in_channels - 1)
122
+    mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
123
+    add         src %+ %%i %+ q, lenq
124
+    %assign %%i %%i+1
125
+%endrep
126
+    mov         src0q, [src0q]
127
+    add         src0q, lenq
128
+    neg          lenq
129
+.loop:
130
+    %if stereo || mx_stack_0_0
131
+    mova           m0, [src0q+lenq]
132
+    %endif
133
+    %if stereo
134
+    mulps          m1, m0, mx_1_0
135
+    %endif
136
+    %if stereo || mx_stack_0_0
137
+    mulps          m0, m0, mx_0_0
138
+    %else
139
+    mulps          m0, mx_0_0, [src0q+lenq]
140
+    %endif
141
+%assign %%i 1
142
+%rep (in_channels - 1)
143
+    %define src_ptr src %+ %%i %+ q
144
+    ; avoid extra load for mono if matrix is in a mm register
145
+    %if stereo || mx_stack_0_ %+ %%i
146
+    mova           m2, [src_ptr+lenq]
147
+    %endif
148
+    %if stereo
149
+    FMULADD_PS     m1, m2, mx_1_ %+ %%i, m1, m3
150
+    %endif
151
+    %if stereo || mx_stack_0_ %+ %%i
152
+    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m2
153
+    %else
154
+    FMULADD_PS     m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
155
+    %endif
156
+    %assign %%i %%i+1
157
+%endrep
158
+    mova [src0q+lenq], m0
159
+    %if stereo
160
+    mova [src1q+lenq], m1
161
+    %endif
162
+
163
+    add          lenq, mmsize
164
+    jl .loop
165
+    RET
166
+%endmacro
167
+
168
+%macro AC3_DOWNMIX_FUNCS 0
169
+%assign %%i 3
170
+%rep 4
171
+    INIT_XMM sse
172
+    AC3_DOWNMIX %%i, 1
173
+    AC3_DOWNMIX %%i, 2
174
+    INIT_YMM avx
175
+    AC3_DOWNMIX %%i, 1
176
+    AC3_DOWNMIX %%i, 2
177
+    %if HAVE_FMA3_EXTERNAL
178
+    INIT_YMM fma3
179
+    AC3_DOWNMIX %%i, 1
180
+    AC3_DOWNMIX %%i, 2
181
+    %endif
182
+    %assign %%i %%i+1
183
+%endrep
184
+%endmacro
185
+
186
+AC3_DOWNMIX_FUNCS
... ...
@@ -63,140 +63,6 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
63 63
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
64 64
                                       const int16_t *window, unsigned int len);
65 65
 
66
-#if ARCH_X86_32 && defined(__INTEL_COMPILER)
67
-#       undef HAVE_7REGS
68
-#       define HAVE_7REGS 0
69
-#endif
70
-
71
-#if HAVE_SSE_INLINE && HAVE_7REGS
72
-
73
-#define IF1(x) x
74
-#define IF0(x)
75
-
76
-#define MIX5(mono, stereo)                                      \
77
-    __asm__ volatile (                                          \
78
-        "movss           0(%1), %%xmm5          \n"             \
79
-        "movss           4(%1), %%xmm6          \n"             \
80
-        "movss          12(%1), %%xmm7          \n"             \
81
-        "shufps     $0, %%xmm5, %%xmm5          \n"             \
82
-        "shufps     $0, %%xmm6, %%xmm6          \n"             \
83
-        "shufps     $0, %%xmm7, %%xmm7          \n"             \
84
-        "1:                                     \n"             \
85
-        "movaps       (%0, %2), %%xmm0          \n"             \
86
-        "movaps       (%0, %3), %%xmm1          \n"             \
87
-        "movaps       (%0, %4), %%xmm2          \n"             \
88
-        "movaps       (%0, %5), %%xmm3          \n"             \
89
-        "movaps       (%0, %6), %%xmm4          \n"             \
90
-        "mulps          %%xmm5, %%xmm0          \n"             \
91
-        "mulps          %%xmm6, %%xmm1          \n"             \
92
-        "mulps          %%xmm5, %%xmm2          \n"             \
93
-        "mulps          %%xmm7, %%xmm3          \n"             \
94
-        "mulps          %%xmm7, %%xmm4          \n"             \
95
- stereo("addps          %%xmm1, %%xmm0          \n")            \
96
-        "addps          %%xmm1, %%xmm2          \n"             \
97
-        "addps          %%xmm3, %%xmm0          \n"             \
98
-        "addps          %%xmm4, %%xmm2          \n"             \
99
-   mono("addps          %%xmm2, %%xmm0          \n")            \
100
-        "movaps         %%xmm0, (%0, %2)        \n"             \
101
- stereo("movaps         %%xmm2, (%0, %3)        \n")            \
102
-        "add               $16, %0              \n"             \
103
-        "jl                 1b                  \n"             \
104
-        : "+&r"(i)                                              \
105
-        : "r"(matrix[0]),                                          \
106
-          "r"(samples[0] + len),                                \
107
-          "r"(samples[1] + len),                                \
108
-          "r"(samples[2] + len),                                \
109
-          "r"(samples[3] + len),                                \
110
-          "r"(samples[4] + len)                                 \
111
-        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
112
-                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)      \
113
-         "memory"                                               \
114
-    );
115
-
116
-#define MIX_MISC(stereo)                                        \
117
-    __asm__ volatile (                                          \
118
-        "mov              %5, %2            \n"                 \
119
-        "1:                                 \n"                 \
120
-        "mov -%c7(%6, %2, %c8), %3          \n"                 \
121
-        "movaps     (%3, %0), %%xmm0        \n"                 \
122
- stereo("movaps       %%xmm0, %%xmm1        \n")                \
123
-        "mulps        %%xmm4, %%xmm0        \n"                 \
124
- stereo("mulps        %%xmm5, %%xmm1        \n")                \
125
-        "2:                                 \n"                 \
126
-        "mov   (%6, %2, %c8), %1            \n"                 \
127
-        "movaps     (%1, %0), %%xmm2        \n"                 \
128
- stereo("movaps       %%xmm2, %%xmm3        \n")                \
129
-        "mulps   (%4, %2, 8), %%xmm2        \n"                 \
130
- stereo("mulps 16(%4, %2, 8), %%xmm3        \n")                \
131
-        "addps        %%xmm2, %%xmm0        \n"                 \
132
- stereo("addps        %%xmm3, %%xmm1        \n")                \
133
-        "add              $4, %2            \n"                 \
134
-        "jl               2b                \n"                 \
135
-        "mov              %5, %2            \n"                 \
136
- stereo("mov   (%6, %2, %c8), %1            \n")                \
137
-        "movaps       %%xmm0, (%3, %0)      \n"                 \
138
- stereo("movaps       %%xmm1, (%1, %0)      \n")                \
139
-        "add             $16, %0            \n"                 \
140
-        "jl               1b                \n"                 \
141
-        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
142
-        : "r"(matrix_simd + in_ch),                             \
143
-          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
144
-          "r"(samp + in_ch),                                    \
145
-          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
146
-        : "memory"                                              \
147
-    );
148
-
149
-static void ac3_downmix_sse(float **samples, float **matrix,
150
-                            int out_ch, int in_ch, int len)
151
-{
152
-    int **matrix_cmp = (int **)matrix;
153
-    intptr_t i, j, k, m;
154
-
155
-    i = -len * sizeof(float);
156
-    if (in_ch == 5 && out_ch == 2 &&
157
-        !(matrix_cmp[1][0] | matrix_cmp[0][2]   |
158
-          matrix_cmp[1][3] | matrix_cmp[0][4]   |
159
-          (matrix_cmp[0][1] ^ matrix_cmp[1][1]) |
160
-          (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) {
161
-        MIX5(IF0, IF1);
162
-    } else if (in_ch == 5 && out_ch == 1 &&
163
-               matrix_cmp[0][0] == matrix_cmp[0][2] &&
164
-               matrix_cmp[0][3] == matrix_cmp[0][4]) {
165
-        MIX5(IF1, IF0);
166
-    } else {
167
-        LOCAL_ALIGNED(16, float, matrix_simd, [AC3_MAX_CHANNELS], [2][4]);
168
-        float *samp[AC3_MAX_CHANNELS];
169
-
170
-        for (j = 0; j < in_ch; j++)
171
-            samp[j] = samples[j] + len;
172
-
173
-        j = 2 * in_ch * sizeof(float);
174
-        k =     in_ch * sizeof(float);
175
-        __asm__ volatile (
176
-            "1:                                 \n"
177
-            "sub             $4, %1             \n"
178
-            "sub             $8, %0             \n"
179
-            "movss     (%3, %1), %%xmm4         \n"
180
-            "movss     (%4, %1), %%xmm5         \n"
181
-            "shufps          $0, %%xmm4, %%xmm4 \n"
182
-            "shufps          $0, %%xmm5, %%xmm5 \n"
183
-            "movaps      %%xmm4,   (%2, %0, 4)  \n"
184
-            "movaps      %%xmm5, 16(%2, %0, 4)  \n"
185
-            "jg              1b                 \n"
186
-            : "+&r"(j), "+&r"(k)
187
-            : "r"(matrix_simd), "r"(matrix[0]), "r"(matrix[1])
188
-            : "memory"
189
-        );
190
-        if (out_ch == 2) {
191
-            MIX_MISC(IF1);
192
-        } else {
193
-            MIX_MISC(IF0);
194
-        }
195
-    }
196
-}
197
-
198
-#endif /* HAVE_SSE_INLINE && HAVE_7REGS */
199
-
200 66
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
201 67
 {
202 68
     int cpu_flags = av_get_cpu_flags();
... ...
@@ -252,10 +118,47 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
252 252
             c->apply_window_int16 = ff_apply_window_int16_ssse3;
253 253
         }
254 254
     }
255
+}
256
+
257
+#define DOWNMIX_FUNC_OPT(ch, opt)                                       \
258
+void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples,            \
259
+                                            float **matrix, int len);   \
260
+void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples,            \
261
+                                            float **matrix, int len);
262
+
263
+#define DOWNMIX_FUNCS(opt)   \
264
+    DOWNMIX_FUNC_OPT(3, opt) \
265
+    DOWNMIX_FUNC_OPT(4, opt) \
266
+    DOWNMIX_FUNC_OPT(5, opt) \
267
+    DOWNMIX_FUNC_OPT(6, opt)
268
+
269
+DOWNMIX_FUNCS(sse)
270
+DOWNMIX_FUNCS(avx)
271
+DOWNMIX_FUNCS(fma3)
272
+
273
+void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
274
+{
275
+    int cpu_flags = av_get_cpu_flags();
276
+
277
+#define SET_DOWNMIX(ch, suf, SUF)                                       \
278
+    if (ch == c->in_channels) {                                         \
279
+        if (EXTERNAL_ ## SUF (cpu_flags)) {                             \
280
+            if (c->out_channels == 1)                                   \
281
+                c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf;    \
282
+            else                                                        \
283
+                c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf;    \
284
+        }                                                               \
285
+    }
286
+
287
+#define SET_DOWNMIX_ALL(suf, SUF)                   \
288
+    SET_DOWNMIX(3, suf, SUF)                        \
289
+    SET_DOWNMIX(4, suf, SUF)                        \
290
+    SET_DOWNMIX(5, suf, SUF)                        \
291
+    SET_DOWNMIX(6, suf, SUF)
255 292
 
256
-#if HAVE_SSE_INLINE && HAVE_7REGS
257
-    if (INLINE_SSE(cpu_flags)) {
258
-        c->downmix = ac3_downmix_sse;
293
+    SET_DOWNMIX_ALL(sse,  SSE)
294
+    if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
295
+        SET_DOWNMIX_ALL(avx,  AVX)
296
+        SET_DOWNMIX_ALL(fma3, FMA3)
259 297
     }
260
-#endif
261 298
 }