Browse code

ac3dsp: x86: Replace inline asm for in-decoder downmixing with standalone asm

Adds a wrapper function for downmixing which detects channel count changes
and updates the selected downmix function accordingly.

Simplification and porting to current x86inc infrastructure by Diego Biurrun.

Signed-off-by: Diego Biurrun <diego@biurrun.de>

Justin Ruggles authored on 2015/10/28 23:38:22
Showing 6 changed files
... ...
@@ -1328,19 +1328,19 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
1328 1328
         do_imdct(s, s->channels);
1329 1329
 
1330 1330
         if (downmix_output) {
1331
-            s->ac3dsp.downmix(s->outptr, s->downmix_coeffs,
1331
+            ff_ac3dsp_downmix(&s->ac3dsp, s->outptr, s->downmix_coeffs,
1332 1332
                               s->out_channels, s->fbw_channels, 256);
1333 1333
         }
1334 1334
     } else {
1335 1335
         if (downmix_output) {
1336
-            s->ac3dsp.downmix(s->xcfptr + 1, s->downmix_coeffs,
1336
+            ff_ac3dsp_downmix(&s->ac3dsp, s->xcfptr + 1, s->downmix_coeffs,
1337 1337
                               s->out_channels, s->fbw_channels, 256);
1338 1338
         }
1339 1339
 
1340 1340
         if (downmix_output && !s->downmixed) {
1341 1341
             s->downmixed = 1;
1342
-            s->ac3dsp.downmix(s->dlyptr, s->downmix_coeffs, s->out_channels,
1343
-                              s->fbw_channels, 128);
1342
+            ff_ac3dsp_downmix(&s->ac3dsp, s->dlyptr, s->downmix_coeffs,
1343
+                              s->out_channels, s->fbw_channels, 128);
1344 1344
         }
1345 1345
 
1346 1346
         do_imdct(s, s->out_channels);
... ...
@@ -171,49 +171,53 @@ static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs)
171 171
     }
172 172
 }
173 173
 
174
-static void ac3_downmix_c(float **samples, float **matrix,
175
-                          int out_ch, int in_ch, int len)
174
+static void ac3_downmix_5_to_2_symmetric_c(float **samples, float **matrix,
175
+                                           int len)
176 176
 {
177
-    int **matrix_cmp = (int **)matrix;
178
-    int i, j;
177
+    int i;
179 178
     float v0, v1;
179
+    float front_mix    = matrix[0][0];
180
+    float center_mix   = matrix[0][1];
181
+    float surround_mix = matrix[0][3];
180 182
 
181
-    if (in_ch == 5 && out_ch == 2 &&
182
-        !(matrix_cmp[1][0] | matrix_cmp[0][2]   |
183
-          matrix_cmp[1][3] | matrix_cmp[0][4]   |
184
-         (matrix_cmp[0][1] ^ matrix_cmp[1][1]) |
185
-         (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) {
186
-        float front_mix    = matrix[0][0];
187
-        float center_mix   = matrix[0][1];
188
-        float surround_mix = matrix[0][3];
183
+    for (i = 0; i < len; i++) {
184
+        v0 = samples[0][i] * front_mix  +
185
+             samples[1][i] * center_mix +
186
+             samples[3][i] * surround_mix;
189 187
 
190
-        for (i = 0; i < len; i++) {
191
-            v0 = samples[0][i] * front_mix  +
192
-                 samples[1][i] * center_mix +
193
-                 samples[3][i] * surround_mix;
188
+        v1 = samples[1][i] * center_mix +
189
+             samples[2][i] * front_mix  +
190
+             samples[4][i] * surround_mix;
194 191
 
195
-            v1 = samples[1][i] * center_mix +
196
-                 samples[2][i] * front_mix  +
197
-                 samples[4][i] * surround_mix;
192
+        samples[0][i] = v0;
193
+        samples[1][i] = v1;
194
+    }
195
+}
198 196
 
199
-            samples[0][i] = v0;
200
-            samples[1][i] = v1;
201
-        }
202
-    } else if (in_ch == 5 && out_ch == 1 &&
203
-               matrix_cmp[0][0] == matrix_cmp[0][2] &&
204
-               matrix_cmp[0][3] == matrix_cmp[0][4]) {
205
-        float front_mix    = matrix[0][0];
206
-        float center_mix   = matrix[0][1];
207
-        float surround_mix = matrix[0][3];
197
+static void ac3_downmix_5_to_1_symmetric_c(float **samples, float **matrix,
198
+                                           int len)
199
+{
200
+    int i;
201
+    float front_mix    = matrix[0][0];
202
+    float center_mix   = matrix[0][1];
203
+    float surround_mix = matrix[0][3];
208 204
 
209
-        for (i = 0; i < len; i++) {
210
-            samples[0][i] = samples[0][i] * front_mix    +
211
-                            samples[1][i] * center_mix   +
212
-                            samples[2][i] * front_mix    +
213
-                            samples[3][i] * surround_mix +
214
-                            samples[4][i] * surround_mix;
215
-        }
216
-    } else if (out_ch == 2) {
205
+    for (i = 0; i < len; i++) {
206
+        samples[0][i] = samples[0][i] * front_mix    +
207
+                        samples[1][i] * center_mix   +
208
+                        samples[2][i] * front_mix    +
209
+                        samples[3][i] * surround_mix +
210
+                        samples[4][i] * surround_mix;
211
+    }
212
+}
213
+
214
+static void ac3_downmix_c(float **samples, float **matrix,
215
+                          int out_ch, int in_ch, int len)
216
+{
217
+    int i, j;
218
+    float v0, v1;
219
+
220
+    if (out_ch == 2) {
217 221
         for (i = 0; i < len; i++) {
218 222
             v0 = v1 = 0.0f;
219 223
             for (j = 0; j < in_ch; j++) {
... ...
@@ -246,6 +250,38 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
246 246
     }
247 247
 }
248 248
 
249
+void ff_ac3dsp_downmix(AC3DSPContext *c, float **samples, float **matrix,
250
+                       int out_ch, int in_ch, int len)
251
+{
252
+    if (c->in_channels != in_ch || c->out_channels != out_ch) {
253
+        int **matrix_cmp = (int **)matrix;
254
+
255
+        c->in_channels  = in_ch;
256
+        c->out_channels = out_ch;
257
+        c->downmix      = NULL;
258
+
259
+        if (in_ch == 5 && out_ch == 2 &&
260
+            !(matrix_cmp[1][0] | matrix_cmp[0][2]   |
261
+              matrix_cmp[1][3] | matrix_cmp[0][4]   |
262
+             (matrix_cmp[0][1] ^ matrix_cmp[1][1]) |
263
+             (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) {
264
+            c->downmix = ac3_downmix_5_to_2_symmetric_c;
265
+        } else if (in_ch == 5 && out_ch == 1 &&
266
+                   matrix_cmp[0][0] == matrix_cmp[0][2] &&
267
+                   matrix_cmp[0][3] == matrix_cmp[0][4]) {
268
+            c->downmix = ac3_downmix_5_to_1_symmetric_c;
269
+        }
270
+
271
+        if (ARCH_X86)
272
+            ff_ac3dsp_set_downmix_x86(c);
273
+    }
274
+
275
+    if (c->downmix)
276
+        c->downmix(samples, matrix, len);
277
+    else
278
+        ac3_downmix_c(samples, matrix, out_ch, in_ch, len);
279
+}
280
+
249 281
 av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
250 282
 {
251 283
     c->ac3_exponent_min = ac3_exponent_min_c;
... ...
@@ -257,7 +293,9 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
257 257
     c->update_bap_counts = ac3_update_bap_counts_c;
258 258
     c->compute_mantissa_size = ac3_compute_mantissa_size_c;
259 259
     c->extract_exponents = ac3_extract_exponents_c;
260
-    c->downmix = ac3_downmix_c;
260
+    c->in_channels           = 0;
261
+    c->out_channels          = 0;
262
+    c->downmix               = NULL;
261 263
     c->apply_window_int16 = apply_window_int16_c;
262 264
 
263 265
     if (ARCH_ARM)
... ...
@@ -126,8 +126,9 @@ typedef struct AC3DSPContext {
126 126
 
127 127
     void (*extract_exponents)(uint8_t *exp, int32_t *coef, int nb_coefs);
128 128
 
129
-    void (*downmix)(float **samples, float **matrix, int out_ch,
130
-                    int in_ch, int len);
129
+    int out_channels;
130
+    int in_channels;
131
+    void (*downmix)(float **samples, float **matrix, int len);
131 132
 
132 133
     /**
133 134
      * Apply symmetric window in 16-bit fixed-point.
... ...
@@ -148,4 +149,8 @@ void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
148 148
 void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact);
149 149
 void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
150 150
 
151
+void ff_ac3dsp_downmix(AC3DSPContext *c, float **samples, float **matrix,
152
+                       int out_ch, int in_ch, int len);
153
+void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c);
154
+
151 155
 #endif /* AVCODEC_AC3DSP_H */
... ...
@@ -71,7 +71,8 @@ MMX-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct_mmx.o            \
71 71
                                           x86/xvididct_sse2.o
72 72
 
73 73
 # subsystems
74
-YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
74
+YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o                  \
75
+                                          x86/ac3dsp_downmix.o
75 76
 YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
76 77
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
77 78
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
78 79
new file mode 100644
... ...
@@ -0,0 +1,187 @@
0
+;*****************************************************************************
1
+;* x86-optimized AC-3 downmixing
2
+;* Copyright (c) 2012 Justin Ruggles
3
+;*
4
+;* This file is part of Libav.
5
+;*
6
+;* Libav is free software; you can redistribute it and/or
7
+;* modify it under the terms of the GNU Lesser General Public
8
+;* License as published by the Free Software Foundation; either
9
+;* version 2.1 of the License, or (at your option) any later version.
10
+;*
11
+;* Libav is distributed in the hope that it will be useful,
12
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+;* Lesser General Public License for more details.
15
+;*
16
+;* You should have received a copy of the GNU Lesser General Public
17
+;* License along with Libav; if not, write to the Free Software
18
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+;******************************************************************************
20
+
21
+;******************************************************************************
22
+;* This is based on the channel mixing asm in libavresample, but it is
23
+;* simplified to handle only float coefficients and 3 to 6 input channels.
24
+;******************************************************************************
25
+
26
+%include "libavutil/x86/x86util.asm"
27
+
28
+SECTION .text
29
+
30
+;-----------------------------------------------------------------------------
31
+; functions to downmix 3 to 6 input channels to mono or stereo
32
+; void ff_ac3_downmix_*(float **samples, float **matrix, int len);
33
+;-----------------------------------------------------------------------------
34
+
35
+%macro AC3_DOWNMIX 2 ; %1 = in channels, %2 = out channels
36
+; define some names to make the code clearer
37
+%assign  in_channels %1
38
+%assign out_channels %2
39
+%assign stereo out_channels - 1
40
+
41
+; determine how many matrix elements must go on the stack vs. mmregs
42
+%assign matrix_elements in_channels * out_channels
43
+%if stereo
44
+    %assign needed_mmregs 4
45
+%else
46
+    %assign needed_mmregs 3
47
+%endif
48
+%assign matrix_elements_mm num_mmregs - needed_mmregs
49
+%if matrix_elements < matrix_elements_mm
50
+    %assign matrix_elements_mm matrix_elements
51
+%endif
52
+%assign total_mmregs needed_mmregs+matrix_elements_mm
53
+%if matrix_elements_mm < matrix_elements
54
+    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
55
+%else
56
+    %assign matrix_elements_stack 0
57
+%endif
58
+
59
+cglobal ac3_downmix_%1_to_%2, 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, len, src2, src3, src4, src5
60
+
61
+; load matrix pointers
62
+%define matrix0q r1q
63
+%define matrix1q r3q
64
+%if stereo
65
+    mov      matrix1q, [matrix0q+gprsize]
66
+%endif
67
+    mov      matrix0q, [matrix0q]
68
+
69
+; define matrix coeff names
70
+%assign %%i 0
71
+%assign %%j needed_mmregs
72
+%rep in_channels
73
+    %if %%i >= matrix_elements_mm
74
+        CAT_XDEFINE mx_stack_0_, %%i, 1
75
+        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
76
+    %else
77
+        CAT_XDEFINE mx_stack_0_, %%i, 0
78
+        CAT_XDEFINE mx_0_, %%i, m %+ %%j
79
+        %assign %%j %%j+1
80
+    %endif
81
+    %assign %%i %%i+1
82
+%endrep
83
+%if stereo
84
+%assign %%i 0
85
+%rep in_channels
86
+    %if in_channels + %%i >= matrix_elements_mm
87
+        CAT_XDEFINE mx_stack_1_, %%i, 1
88
+        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
89
+    %else
90
+        CAT_XDEFINE mx_stack_1_, %%i, 0
91
+        CAT_XDEFINE mx_1_, %%i, m %+ %%j
92
+        %assign %%j %%j+1
93
+    %endif
94
+    %assign %%i %%i+1
95
+%endrep
96
+%endif
97
+
98
+; load/splat matrix coeffs
99
+%assign %%i 0
100
+%rep in_channels
101
+    %if mx_stack_0_ %+ %%i
102
+        VBROADCASTSS m0, [matrix0q+4*%%i]
103
+        mova  mx_0_ %+ %%i, m0
104
+    %else
105
+        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
106
+    %endif
107
+    %if stereo
108
+    %if mx_stack_1_ %+ %%i
109
+        VBROADCASTSS m0, [matrix1q+4*%%i]
110
+        mova  mx_1_ %+ %%i, m0
111
+    %else
112
+        VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
113
+    %endif
114
+    %endif
115
+    %assign %%i %%i+1
116
+%endrep
117
+
118
+    lea          lenq, [4*r2d]
119
+    ; load channel pointers to registers
120
+%assign %%i 1
121
+%rep (in_channels - 1)
122
+    mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
123
+    add         src %+ %%i %+ q, lenq
124
+    %assign %%i %%i+1
125
+%endrep
126
+    mov         src0q, [src0q]
127
+    add         src0q, lenq
128
+    neg          lenq
129
+.loop:
130
+    %if stereo || mx_stack_0_0
131
+    mova           m0, [src0q+lenq]
132
+    %endif
133
+    %if stereo
134
+    mulps          m1, m0, mx_1_0
135
+    %endif
136
+    %if stereo || mx_stack_0_0
137
+    mulps          m0, m0, mx_0_0
138
+    %else
139
+    mulps          m0, mx_0_0, [src0q+lenq]
140
+    %endif
141
+%assign %%i 1
142
+%rep (in_channels - 1)
143
+    %define src_ptr src %+ %%i %+ q
144
+    ; avoid extra load for mono if matrix is in a mm register
145
+    %if stereo || mx_stack_0_ %+ %%i
146
+    mova           m2, [src_ptr+lenq]
147
+    %endif
148
+    %if stereo
149
+    FMULADD_PS     m1, m2, mx_1_ %+ %%i, m1, m3
150
+    %endif
151
+    %if stereo || mx_stack_0_ %+ %%i
152
+    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m2
153
+    %else
154
+    FMULADD_PS     m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
155
+    %endif
156
+    %assign %%i %%i+1
157
+%endrep
158
+    mova [src0q+lenq], m0
159
+    %if stereo
160
+    mova [src1q+lenq], m1
161
+    %endif
162
+
163
+    add          lenq, mmsize
164
+    jl .loop
165
+    RET
166
+%endmacro
167
+
168
+%macro AC3_DOWNMIX_FUNCS 0
169
+%assign %%i 3
170
+%rep 4
171
+    INIT_XMM sse
172
+    AC3_DOWNMIX %%i, 1
173
+    AC3_DOWNMIX %%i, 2
174
+    INIT_YMM avx
175
+    AC3_DOWNMIX %%i, 1
176
+    AC3_DOWNMIX %%i, 2
177
+    %if HAVE_FMA3_EXTERNAL
178
+    INIT_YMM fma3
179
+    AC3_DOWNMIX %%i, 1
180
+    AC3_DOWNMIX %%i, 2
181
+    %endif
182
+    %assign %%i %%i+1
183
+%endrep
184
+%endmacro
185
+
186
+AC3_DOWNMIX_FUNCS
... ...
@@ -63,135 +63,6 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
63 63
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
64 64
                                       const int16_t *window, unsigned int len);
65 65
 
66
-#if HAVE_SSE_INLINE && HAVE_7REGS
67
-
68
-#define IF1(x) x
69
-#define IF0(x)
70
-
71
-#define MIX5(mono, stereo)                                      \
72
-    __asm__ volatile (                                          \
73
-        "movss           0(%1), %%xmm5          \n"             \
74
-        "movss           4(%1), %%xmm6          \n"             \
75
-        "movss          12(%1), %%xmm7          \n"             \
76
-        "shufps     $0, %%xmm5, %%xmm5          \n"             \
77
-        "shufps     $0, %%xmm6, %%xmm6          \n"             \
78
-        "shufps     $0, %%xmm7, %%xmm7          \n"             \
79
-        "1:                                     \n"             \
80
-        "movaps       (%0, %2), %%xmm0          \n"             \
81
-        "movaps       (%0, %3), %%xmm1          \n"             \
82
-        "movaps       (%0, %4), %%xmm2          \n"             \
83
-        "movaps       (%0, %5), %%xmm3          \n"             \
84
-        "movaps       (%0, %6), %%xmm4          \n"             \
85
-        "mulps          %%xmm5, %%xmm0          \n"             \
86
-        "mulps          %%xmm6, %%xmm1          \n"             \
87
-        "mulps          %%xmm5, %%xmm2          \n"             \
88
-        "mulps          %%xmm7, %%xmm3          \n"             \
89
-        "mulps          %%xmm7, %%xmm4          \n"             \
90
- stereo("addps          %%xmm1, %%xmm0          \n")            \
91
-        "addps          %%xmm1, %%xmm2          \n"             \
92
-        "addps          %%xmm3, %%xmm0          \n"             \
93
-        "addps          %%xmm4, %%xmm2          \n"             \
94
-   mono("addps          %%xmm2, %%xmm0          \n")            \
95
-        "movaps         %%xmm0, (%0, %2)        \n"             \
96
- stereo("movaps         %%xmm2, (%0, %3)        \n")            \
97
-        "add               $16, %0              \n"             \
98
-        "jl                 1b                  \n"             \
99
-        : "+&r"(i)                                              \
100
-        : "r"(matrix[0]),                                          \
101
-          "r"(samples[0] + len),                                \
102
-          "r"(samples[1] + len),                                \
103
-          "r"(samples[2] + len),                                \
104
-          "r"(samples[3] + len),                                \
105
-          "r"(samples[4] + len)                                 \
106
-        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
107
-                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)      \
108
-         "memory"                                               \
109
-    );
110
-
111
-#define MIX_MISC(stereo)                                        \
112
-    __asm__ volatile (                                          \
113
-        "mov              %5, %2            \n"                 \
114
-        "1:                                 \n"                 \
115
-        "mov -%c7(%6, %2, %c8), %3          \n"                 \
116
-        "movaps     (%3, %0), %%xmm0        \n"                 \
117
- stereo("movaps       %%xmm0, %%xmm1        \n")                \
118
-        "mulps        %%xmm4, %%xmm0        \n"                 \
119
- stereo("mulps        %%xmm5, %%xmm1        \n")                \
120
-        "2:                                 \n"                 \
121
-        "mov   (%6, %2, %c8), %1            \n"                 \
122
-        "movaps     (%1, %0), %%xmm2        \n"                 \
123
- stereo("movaps       %%xmm2, %%xmm3        \n")                \
124
-        "mulps   (%4, %2, 8), %%xmm2        \n"                 \
125
- stereo("mulps 16(%4, %2, 8), %%xmm3        \n")                \
126
-        "addps        %%xmm2, %%xmm0        \n"                 \
127
- stereo("addps        %%xmm3, %%xmm1        \n")                \
128
-        "add              $4, %2            \n"                 \
129
-        "jl               2b                \n"                 \
130
-        "mov              %5, %2            \n"                 \
131
- stereo("mov   (%6, %2, %c8), %1            \n")                \
132
-        "movaps       %%xmm0, (%3, %0)      \n"                 \
133
- stereo("movaps       %%xmm1, (%1, %0)      \n")                \
134
-        "add             $16, %0            \n"                 \
135
-        "jl               1b                \n"                 \
136
-        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
137
-        : "r"(matrix_simd + in_ch),                             \
138
-          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
139
-          "r"(samp + in_ch),                                    \
140
-          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
141
-        : "memory"                                              \
142
-    );
143
-
144
-static void ac3_downmix_sse(float **samples, float **matrix,
145
-                            int out_ch, int in_ch, int len)
146
-{
147
-    int **matrix_cmp = (int **)matrix;
148
-    intptr_t i, j, k, m;
149
-
150
-    i = -len * sizeof(float);
151
-    if (in_ch == 5 && out_ch == 2 &&
152
-        !(matrix_cmp[1][0] | matrix_cmp[0][2]   |
153
-          matrix_cmp[1][3] | matrix_cmp[0][4]   |
154
-          (matrix_cmp[0][1] ^ matrix_cmp[1][1]) |
155
-          (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) {
156
-        MIX5(IF0, IF1);
157
-    } else if (in_ch == 5 && out_ch == 1 &&
158
-               matrix_cmp[0][0] == matrix_cmp[0][2] &&
159
-               matrix_cmp[0][3] == matrix_cmp[0][4]) {
160
-        MIX5(IF1, IF0);
161
-    } else {
162
-        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
163
-        float *samp[AC3_MAX_CHANNELS];
164
-
165
-        for (j = 0; j < in_ch; j++)
166
-            samp[j] = samples[j] + len;
167
-
168
-        j = 2 * in_ch * sizeof(float);
169
-        k =     in_ch * sizeof(float);
170
-        __asm__ volatile (
171
-            "1:                                 \n"
172
-            "sub             $4, %1             \n"
173
-            "sub             $8, %0             \n"
174
-            "movss     (%3, %1), %%xmm4         \n"
175
-            "movss     (%4, %1), %%xmm5         \n"
176
-            "shufps          $0, %%xmm4, %%xmm4 \n"
177
-            "shufps          $0, %%xmm5, %%xmm5 \n"
178
-            "movaps      %%xmm4,   (%2, %0, 4)  \n"
179
-            "movaps      %%xmm5, 16(%2, %0, 4)  \n"
180
-            "jg              1b                 \n"
181
-            : "+&r"(j), "+&r"(k)
182
-            : "r"(matrix_simd), "r"(matrix[0]), "r"(matrix[1])
183
-            : "memory"
184
-        );
185
-        if (out_ch == 2) {
186
-            MIX_MISC(IF1);
187
-        } else {
188
-            MIX_MISC(IF0);
189
-        }
190
-    }
191
-}
192
-
193
-#endif /* HAVE_SSE_INLINE && HAVE_7REGS */
194
-
195 66
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
196 67
 {
197 68
     int cpu_flags = av_get_cpu_flags();
... ...
@@ -247,10 +118,47 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
247 247
             c->apply_window_int16 = ff_apply_window_int16_ssse3;
248 248
         }
249 249
     }
250
+}
251
+
252
+#define DOWNMIX_FUNC_OPT(ch, opt)                                       \
253
+void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples,            \
254
+                                            float **matrix, int len);   \
255
+void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples,            \
256
+                                            float **matrix, int len);
257
+
258
+#define DOWNMIX_FUNCS(opt)   \
259
+    DOWNMIX_FUNC_OPT(3, opt) \
260
+    DOWNMIX_FUNC_OPT(4, opt) \
261
+    DOWNMIX_FUNC_OPT(5, opt) \
262
+    DOWNMIX_FUNC_OPT(6, opt)
263
+
264
+DOWNMIX_FUNCS(sse)
265
+DOWNMIX_FUNCS(avx)
266
+DOWNMIX_FUNCS(fma3)
267
+
268
+void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
269
+{
270
+    int cpu_flags = av_get_cpu_flags();
271
+
272
+#define SET_DOWNMIX(ch, suf, SUF)                                       \
273
+    if (ch == c->in_channels) {                                         \
274
+        if (EXTERNAL_ ## SUF (cpu_flags)) {                             \
275
+            if (c->out_channels == 1)                                   \
276
+                c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf;    \
277
+            else                                                        \
278
+                c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf;    \
279
+        }                                                               \
280
+    }
281
+
282
+#define SET_DOWNMIX_ALL(suf, SUF)                   \
283
+    SET_DOWNMIX(3, suf, SUF)                        \
284
+    SET_DOWNMIX(4, suf, SUF)                        \
285
+    SET_DOWNMIX(5, suf, SUF)                        \
286
+    SET_DOWNMIX(6, suf, SUF)
250 287
 
251
-#if HAVE_SSE_INLINE && HAVE_7REGS
252
-    if (INLINE_SSE(cpu_flags)) {
253
-        c->downmix = ac3_downmix_sse;
288
+    SET_DOWNMIX_ALL(sse,  SSE)
289
+    if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
290
+        SET_DOWNMIX_ALL(avx,  AVX)
291
+        SET_DOWNMIX_ALL(fma3, FMA3)
254 292
     }
255
-#endif
256 293
 }