* commit 'b57e38f52cc3f31a27105c28887d57cd6812c3eb':
ac3dsp: x86: Replace inline asm for in-decoder downmixing with standalone asm
Merged-by: Clément Bœsch <u@pkh.me>
... | ... |
@@ -1430,19 +1430,19 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) |
1430 | 1430 |
ac3_downmix_c_fixed16(s->outptr, s->downmix_coeffs, |
1431 | 1431 |
s->out_channels, s->fbw_channels, 256); |
1432 | 1432 |
#else |
1433 |
- s->ac3dsp.downmix(s->outptr, s->downmix_coeffs, |
|
1433 |
+ ff_ac3dsp_downmix(&s->ac3dsp, s->outptr, s->downmix_coeffs, |
|
1434 | 1434 |
s->out_channels, s->fbw_channels, 256); |
1435 | 1435 |
#endif |
1436 | 1436 |
} |
1437 | 1437 |
} else { |
1438 | 1438 |
if (downmix_output) { |
1439 |
- s->ac3dsp.AC3_RENAME(downmix)(s->xcfptr + 1, s->downmix_coeffs, |
|
1439 |
+ AC3_RENAME(ff_ac3dsp_downmix)(&s->ac3dsp, s->xcfptr + 1, s->downmix_coeffs, |
|
1440 | 1440 |
s->out_channels, s->fbw_channels, 256); |
1441 | 1441 |
} |
1442 | 1442 |
|
1443 | 1443 |
if (downmix_output && !s->downmixed) { |
1444 | 1444 |
s->downmixed = 1; |
1445 |
- s->ac3dsp.AC3_RENAME(downmix)(s->dlyptr, s->downmix_coeffs, |
|
1445 |
+ AC3_RENAME(ff_ac3dsp_downmix)(&s->ac3dsp, s->dlyptr, s->downmix_coeffs, |
|
1446 | 1446 |
s->out_channels, s->fbw_channels, 128); |
1447 | 1447 |
} |
1448 | 1448 |
|
... | ... |
@@ -213,49 +213,53 @@ static void ac3_sum_square_butterfly_float_c(float sum[4], |
213 | 213 |
} |
214 | 214 |
} |
215 | 215 |
|
216 |
-static void ac3_downmix_c(float **samples, float **matrix, |
|
217 |
- int out_ch, int in_ch, int len) |
|
216 |
+static void ac3_downmix_5_to_2_symmetric_c(float **samples, float **matrix, |
|
217 |
+ int len) |
|
218 | 218 |
{ |
219 |
- int **matrix_cmp = (int **)matrix; |
|
220 |
- int i, j; |
|
219 |
+ int i; |
|
221 | 220 |
float v0, v1; |
221 |
+ float front_mix = matrix[0][0]; |
|
222 |
+ float center_mix = matrix[0][1]; |
|
223 |
+ float surround_mix = matrix[0][3]; |
|
222 | 224 |
|
223 |
- if (in_ch == 5 && out_ch == 2 && |
|
224 |
- !(matrix_cmp[1][0] | matrix_cmp[0][2] | |
|
225 |
- matrix_cmp[1][3] | matrix_cmp[0][4] | |
|
226 |
- (matrix_cmp[0][1] ^ matrix_cmp[1][1]) | |
|
227 |
- (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) { |
|
228 |
- float front_mix = matrix[0][0]; |
|
229 |
- float center_mix = matrix[0][1]; |
|
230 |
- float surround_mix = matrix[0][3]; |
|
225 |
+ for (i = 0; i < len; i++) { |
|
226 |
+ v0 = samples[0][i] * front_mix + |
|
227 |
+ samples[1][i] * center_mix + |
|
228 |
+ samples[3][i] * surround_mix; |
|
231 | 229 |
|
232 |
- for (i = 0; i < len; i++) { |
|
233 |
- v0 = samples[0][i] * front_mix + |
|
234 |
- samples[1][i] * center_mix + |
|
235 |
- samples[3][i] * surround_mix; |
|
230 |
+ v1 = samples[1][i] * center_mix + |
|
231 |
+ samples[2][i] * front_mix + |
|
232 |
+ samples[4][i] * surround_mix; |
|
236 | 233 |
|
237 |
- v1 = samples[1][i] * center_mix + |
|
238 |
- samples[2][i] * front_mix + |
|
239 |
- samples[4][i] * surround_mix; |
|
234 |
+ samples[0][i] = v0; |
|
235 |
+ samples[1][i] = v1; |
|
236 |
+ } |
|
237 |
+} |
|
240 | 238 |
|
241 |
- samples[0][i] = v0; |
|
242 |
- samples[1][i] = v1; |
|
243 |
- } |
|
244 |
- } else if (in_ch == 5 && out_ch == 1 && |
|
245 |
- matrix_cmp[0][0] == matrix_cmp[0][2] && |
|
246 |
- matrix_cmp[0][3] == matrix_cmp[0][4]) { |
|
247 |
- float front_mix = matrix[0][0]; |
|
248 |
- float center_mix = matrix[0][1]; |
|
249 |
- float surround_mix = matrix[0][3]; |
|
239 |
+static void ac3_downmix_5_to_1_symmetric_c(float **samples, float **matrix, |
|
240 |
+ int len) |
|
241 |
+{ |
|
242 |
+ int i; |
|
243 |
+ float front_mix = matrix[0][0]; |
|
244 |
+ float center_mix = matrix[0][1]; |
|
245 |
+ float surround_mix = matrix[0][3]; |
|
250 | 246 |
|
251 |
- for (i = 0; i < len; i++) { |
|
252 |
- samples[0][i] = samples[0][i] * front_mix + |
|
253 |
- samples[1][i] * center_mix + |
|
254 |
- samples[2][i] * front_mix + |
|
255 |
- samples[3][i] * surround_mix + |
|
256 |
- samples[4][i] * surround_mix; |
|
257 |
- } |
|
258 |
- } else if (out_ch == 2) { |
|
247 |
+ for (i = 0; i < len; i++) { |
|
248 |
+ samples[0][i] = samples[0][i] * front_mix + |
|
249 |
+ samples[1][i] * center_mix + |
|
250 |
+ samples[2][i] * front_mix + |
|
251 |
+ samples[3][i] * surround_mix + |
|
252 |
+ samples[4][i] * surround_mix; |
|
253 |
+ } |
|
254 |
+} |
|
255 |
+ |
|
256 |
+static void ac3_downmix_c(float **samples, float **matrix, |
|
257 |
+ int out_ch, int in_ch, int len) |
|
258 |
+{ |
|
259 |
+ int i, j; |
|
260 |
+ float v0, v1; |
|
261 |
+ |
|
262 |
+ if (out_ch == 2) { |
|
259 | 263 |
for (i = 0; i < len; i++) { |
260 | 264 |
v0 = v1 = 0.0f; |
261 | 265 |
for (j = 0; j < in_ch; j++) { |
... | ... |
@@ -300,6 +304,15 @@ static void ac3_downmix_c_fixed(int32_t **samples, int16_t **matrix, |
300 | 300 |
} |
301 | 301 |
} |
302 | 302 |
|
303 |
+void ff_ac3dsp_downmix_fixed(AC3DSPContext *c, int32_t **samples, int16_t **matrix, |
|
304 |
+ int out_ch, int in_ch, int len) |
|
305 |
+{ |
|
306 |
+ if (c->downmix_fixed) |
|
307 |
+ c->downmix_fixed(samples, matrix, len); |
|
308 |
+ else |
|
309 |
+ ac3_downmix_c_fixed(samples, matrix, out_ch, in_ch, len); |
|
310 |
+} |
|
311 |
+ |
|
303 | 312 |
static void apply_window_int16_c(int16_t *output, const int16_t *input, |
304 | 313 |
const int16_t *window, unsigned int len) |
305 | 314 |
{ |
... | ... |
@@ -313,6 +326,38 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input, |
313 | 313 |
} |
314 | 314 |
} |
315 | 315 |
|
316 |
+void ff_ac3dsp_downmix(AC3DSPContext *c, float **samples, float **matrix, |
|
317 |
+ int out_ch, int in_ch, int len) |
|
318 |
+{ |
|
319 |
+ if (c->in_channels != in_ch || c->out_channels != out_ch) { |
|
320 |
+ int **matrix_cmp = (int **)matrix; |
|
321 |
+ |
|
322 |
+ c->in_channels = in_ch; |
|
323 |
+ c->out_channels = out_ch; |
|
324 |
+ c->downmix = NULL; |
|
325 |
+ |
|
326 |
+ if (in_ch == 5 && out_ch == 2 && |
|
327 |
+ !(matrix_cmp[1][0] | matrix_cmp[0][2] | |
|
328 |
+ matrix_cmp[1][3] | matrix_cmp[0][4] | |
|
329 |
+ (matrix_cmp[0][1] ^ matrix_cmp[1][1]) | |
|
330 |
+ (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) { |
|
331 |
+ c->downmix = ac3_downmix_5_to_2_symmetric_c; |
|
332 |
+ } else if (in_ch == 5 && out_ch == 1 && |
|
333 |
+ matrix_cmp[0][0] == matrix_cmp[0][2] && |
|
334 |
+ matrix_cmp[0][3] == matrix_cmp[0][4]) { |
|
335 |
+ c->downmix = ac3_downmix_5_to_1_symmetric_c; |
|
336 |
+ } |
|
337 |
+ |
|
338 |
+ if (ARCH_X86) |
|
339 |
+ ff_ac3dsp_set_downmix_x86(c); |
|
340 |
+ } |
|
341 |
+ |
|
342 |
+ if (c->downmix) |
|
343 |
+ c->downmix(samples, matrix, len); |
|
344 |
+ else |
|
345 |
+ ac3_downmix_c(samples, matrix, out_ch, in_ch, len); |
|
346 |
+} |
|
347 |
+ |
|
316 | 348 |
av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact) |
317 | 349 |
{ |
318 | 350 |
c->ac3_exponent_min = ac3_exponent_min_c; |
... | ... |
@@ -326,8 +371,10 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact) |
326 | 326 |
c->extract_exponents = ac3_extract_exponents_c; |
327 | 327 |
c->sum_square_butterfly_int32 = ac3_sum_square_butterfly_int32_c; |
328 | 328 |
c->sum_square_butterfly_float = ac3_sum_square_butterfly_float_c; |
329 |
- c->downmix = ac3_downmix_c; |
|
330 |
- c->downmix_fixed = ac3_downmix_c_fixed; |
|
329 |
+ c->in_channels = 0; |
|
330 |
+ c->out_channels = 0; |
|
331 |
+ c->downmix = NULL; |
|
332 |
+ c->downmix_fixed = NULL; |
|
331 | 333 |
c->apply_window_int16 = apply_window_int16_c; |
332 | 334 |
|
333 | 335 |
if (ARCH_ARM) |
... | ... |
@@ -132,11 +132,10 @@ typedef struct AC3DSPContext { |
132 | 132 |
void (*sum_square_butterfly_float)(float sum[4], const float *coef0, |
133 | 133 |
const float *coef1, int len); |
134 | 134 |
|
135 |
- void (*downmix)(float **samples, float **matrix, int out_ch, |
|
136 |
- int in_ch, int len); |
|
137 |
- |
|
138 |
- void (*downmix_fixed)(int32_t **samples, int16_t **matrix, int out_ch, |
|
139 |
- int in_ch, int len); |
|
135 |
+ int out_channels; |
|
136 |
+ int in_channels; |
|
137 |
+ void (*downmix)(float **samples, float **matrix, int len); |
|
138 |
+ void (*downmix_fixed)(int32_t **samples, int16_t **matrix, int len); |
|
140 | 139 |
|
141 | 140 |
/** |
142 | 141 |
* Apply symmetric window in 16-bit fixed-point. |
... | ... |
@@ -158,4 +157,11 @@ void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact); |
158 | 158 |
void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact); |
159 | 159 |
void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact); |
160 | 160 |
|
161 |
+void ff_ac3dsp_downmix(AC3DSPContext *c, float **samples, float **matrix, |
|
162 |
+ int out_ch, int in_ch, int len); |
|
163 |
+void ff_ac3dsp_downmix_fixed(AC3DSPContext *c, int32_t **samples, int16_t **matrix, |
|
164 |
+ int out_ch, int in_ch, int len); |
|
165 |
+ |
|
166 |
+void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c); |
|
167 |
+ |
|
161 | 168 |
#endif /* AVCODEC_AC3DSP_H */ |
... | ... |
@@ -87,7 +87,8 @@ MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o |
87 | 87 |
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o |
88 | 88 |
|
89 | 89 |
# subsystems |
90 |
-YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o |
|
90 |
+YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \ |
|
91 |
+ x86/ac3dsp_downmix.o |
|
91 | 92 |
YASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o |
92 | 93 |
YASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o |
93 | 94 |
YASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o |
94 | 95 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,187 @@ |
0 |
+;***************************************************************************** |
|
1 |
+;* x86-optimized AC-3 downmixing |
|
2 |
+;* Copyright (c) 2012 Justin Ruggles |
|
3 |
+;* |
|
4 |
+;* This file is part of FFmpeg. |
|
5 |
+;* |
|
6 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
7 |
+;* modify it under the terms of the GNU Lesser General Public |
|
8 |
+;* License as published by the Free Software Foundation; either |
|
9 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
10 |
+;* |
|
11 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
12 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
14 |
+;* Lesser General Public License for more details. |
|
15 |
+;* |
|
16 |
+;* You should have received a copy of the GNU Lesser General Public |
|
17 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
18 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
19 |
+;****************************************************************************** |
|
20 |
+ |
|
21 |
+;****************************************************************************** |
|
22 |
+;* This is based on the channel mixing asm in libavresample, but it is |
|
23 |
+;* simplified for only float coefficients and only 3 to 6 channels. |
|
24 |
+;****************************************************************************** |
|
25 |
+ |
|
26 |
+%include "libavutil/x86/x86util.asm" |
|
27 |
+ |
|
28 |
+SECTION .text |
|
29 |
+ |
|
30 |
+;----------------------------------------------------------------------------- |
|
31 |
+; functions to downmix 3 to 6 input channels to mono or stereo |
|
32 |
+; void ff_ac3_downmix_*(float **samples, float **matrix, int len); |
|
33 |
+;----------------------------------------------------------------------------- |
|
34 |
+ |
|
35 |
+%macro AC3_DOWNMIX 2 ; %1 = in channels, %2 = out channels |
|
36 |
+; define some names to make the code clearer |
|
37 |
+%assign in_channels %1 |
|
38 |
+%assign out_channels %2 |
|
39 |
+%assign stereo out_channels - 1 |
|
40 |
+ |
|
41 |
+; determine how many matrix elements must go on the stack vs. mmregs |
|
42 |
+%assign matrix_elements in_channels * out_channels |
|
43 |
+%if stereo |
|
44 |
+ %assign needed_mmregs 4 |
|
45 |
+%else |
|
46 |
+ %assign needed_mmregs 3 |
|
47 |
+%endif |
|
48 |
+%assign matrix_elements_mm num_mmregs - needed_mmregs |
|
49 |
+%if matrix_elements < matrix_elements_mm |
|
50 |
+ %assign matrix_elements_mm matrix_elements |
|
51 |
+%endif |
|
52 |
+%assign total_mmregs needed_mmregs+matrix_elements_mm |
|
53 |
+%if matrix_elements_mm < matrix_elements |
|
54 |
+ %assign matrix_elements_stack matrix_elements - matrix_elements_mm |
|
55 |
+%else |
|
56 |
+ %assign matrix_elements_stack 0 |
|
57 |
+%endif |
|
58 |
+ |
|
59 |
+cglobal ac3_downmix_%1_to_%2, 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, len, src2, src3, src4, src5 |
|
60 |
+ |
|
61 |
+; load matrix pointers |
|
62 |
+%define matrix0q r1q |
|
63 |
+%define matrix1q r3q |
|
64 |
+%if stereo |
|
65 |
+ mov matrix1q, [matrix0q+gprsize] |
|
66 |
+%endif |
|
67 |
+ mov matrix0q, [matrix0q] |
|
68 |
+ |
|
69 |
+; define matrix coeff names |
|
70 |
+%assign %%i 0 |
|
71 |
+%assign %%j needed_mmregs |
|
72 |
+%rep in_channels |
|
73 |
+ %if %%i >= matrix_elements_mm |
|
74 |
+ CAT_XDEFINE mx_stack_0_, %%i, 1 |
|
75 |
+ CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize] |
|
76 |
+ %else |
|
77 |
+ CAT_XDEFINE mx_stack_0_, %%i, 0 |
|
78 |
+ CAT_XDEFINE mx_0_, %%i, m %+ %%j |
|
79 |
+ %assign %%j %%j+1 |
|
80 |
+ %endif |
|
81 |
+ %assign %%i %%i+1 |
|
82 |
+%endrep |
|
83 |
+%if stereo |
|
84 |
+%assign %%i 0 |
|
85 |
+%rep in_channels |
|
86 |
+ %if in_channels + %%i >= matrix_elements_mm |
|
87 |
+ CAT_XDEFINE mx_stack_1_, %%i, 1 |
|
88 |
+ CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize] |
|
89 |
+ %else |
|
90 |
+ CAT_XDEFINE mx_stack_1_, %%i, 0 |
|
91 |
+ CAT_XDEFINE mx_1_, %%i, m %+ %%j |
|
92 |
+ %assign %%j %%j+1 |
|
93 |
+ %endif |
|
94 |
+ %assign %%i %%i+1 |
|
95 |
+%endrep |
|
96 |
+%endif |
|
97 |
+ |
|
98 |
+; load/splat matrix coeffs |
|
99 |
+%assign %%i 0 |
|
100 |
+%rep in_channels |
|
101 |
+ %if mx_stack_0_ %+ %%i |
|
102 |
+ VBROADCASTSS m0, [matrix0q+4*%%i] |
|
103 |
+ mova mx_0_ %+ %%i, m0 |
|
104 |
+ %else |
|
105 |
+ VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i] |
|
106 |
+ %endif |
|
107 |
+ %if stereo |
|
108 |
+ %if mx_stack_1_ %+ %%i |
|
109 |
+ VBROADCASTSS m0, [matrix1q+4*%%i] |
|
110 |
+ mova mx_1_ %+ %%i, m0 |
|
111 |
+ %else |
|
112 |
+ VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i] |
|
113 |
+ %endif |
|
114 |
+ %endif |
|
115 |
+ %assign %%i %%i+1 |
|
116 |
+%endrep |
|
117 |
+ |
|
118 |
+ lea lenq, [4*r2d] |
|
119 |
+ ; load channel pointers to registers |
|
120 |
+%assign %%i 1 |
|
121 |
+%rep (in_channels - 1) |
|
122 |
+ mov src %+ %%i %+ q, [src0q+%%i*gprsize] |
|
123 |
+ add src %+ %%i %+ q, lenq |
|
124 |
+ %assign %%i %%i+1 |
|
125 |
+%endrep |
|
126 |
+ mov src0q, [src0q] |
|
127 |
+ add src0q, lenq |
|
128 |
+ neg lenq |
|
129 |
+.loop: |
|
130 |
+ %if stereo || mx_stack_0_0 |
|
131 |
+ mova m0, [src0q+lenq] |
|
132 |
+ %endif |
|
133 |
+ %if stereo |
|
134 |
+ mulps m1, m0, mx_1_0 |
|
135 |
+ %endif |
|
136 |
+ %if stereo || mx_stack_0_0 |
|
137 |
+ mulps m0, m0, mx_0_0 |
|
138 |
+ %else |
|
139 |
+ mulps m0, mx_0_0, [src0q+lenq] |
|
140 |
+ %endif |
|
141 |
+%assign %%i 1 |
|
142 |
+%rep (in_channels - 1) |
|
143 |
+ %define src_ptr src %+ %%i %+ q |
|
144 |
+ ; avoid extra load for mono if matrix is in a mm register |
|
145 |
+ %if stereo || mx_stack_0_ %+ %%i |
|
146 |
+ mova m2, [src_ptr+lenq] |
|
147 |
+ %endif |
|
148 |
+ %if stereo |
|
149 |
+ FMULADD_PS m1, m2, mx_1_ %+ %%i, m1, m3 |
|
150 |
+ %endif |
|
151 |
+ %if stereo || mx_stack_0_ %+ %%i |
|
152 |
+ FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m2 |
|
153 |
+ %else |
|
154 |
+ FMULADD_PS m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1 |
|
155 |
+ %endif |
|
156 |
+ %assign %%i %%i+1 |
|
157 |
+%endrep |
|
158 |
+ mova [src0q+lenq], m0 |
|
159 |
+ %if stereo |
|
160 |
+ mova [src1q+lenq], m1 |
|
161 |
+ %endif |
|
162 |
+ |
|
163 |
+ add lenq, mmsize |
|
164 |
+ jl .loop |
|
165 |
+ RET |
|
166 |
+%endmacro |
|
167 |
+ |
|
168 |
+%macro AC3_DOWNMIX_FUNCS 0 |
|
169 |
+%assign %%i 3 |
|
170 |
+%rep 4 |
|
171 |
+ INIT_XMM sse |
|
172 |
+ AC3_DOWNMIX %%i, 1 |
|
173 |
+ AC3_DOWNMIX %%i, 2 |
|
174 |
+ INIT_YMM avx |
|
175 |
+ AC3_DOWNMIX %%i, 1 |
|
176 |
+ AC3_DOWNMIX %%i, 2 |
|
177 |
+ %if HAVE_FMA3_EXTERNAL |
|
178 |
+ INIT_YMM fma3 |
|
179 |
+ AC3_DOWNMIX %%i, 1 |
|
180 |
+ AC3_DOWNMIX %%i, 2 |
|
181 |
+ %endif |
|
182 |
+ %assign %%i %%i+1 |
|
183 |
+%endrep |
|
184 |
+%endmacro |
|
185 |
+ |
|
186 |
+AC3_DOWNMIX_FUNCS |
... | ... |
@@ -63,140 +63,6 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input, |
63 | 63 |
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, |
64 | 64 |
const int16_t *window, unsigned int len); |
65 | 65 |
|
66 |
-#if ARCH_X86_32 && defined(__INTEL_COMPILER) |
|
67 |
-# undef HAVE_7REGS |
|
68 |
-# define HAVE_7REGS 0 |
|
69 |
-#endif |
|
70 |
- |
|
71 |
-#if HAVE_SSE_INLINE && HAVE_7REGS |
|
72 |
- |
|
73 |
-#define IF1(x) x |
|
74 |
-#define IF0(x) |
|
75 |
- |
|
76 |
-#define MIX5(mono, stereo) \ |
|
77 |
- __asm__ volatile ( \ |
|
78 |
- "movss 0(%1), %%xmm5 \n" \ |
|
79 |
- "movss 4(%1), %%xmm6 \n" \ |
|
80 |
- "movss 12(%1), %%xmm7 \n" \ |
|
81 |
- "shufps $0, %%xmm5, %%xmm5 \n" \ |
|
82 |
- "shufps $0, %%xmm6, %%xmm6 \n" \ |
|
83 |
- "shufps $0, %%xmm7, %%xmm7 \n" \ |
|
84 |
- "1: \n" \ |
|
85 |
- "movaps (%0, %2), %%xmm0 \n" \ |
|
86 |
- "movaps (%0, %3), %%xmm1 \n" \ |
|
87 |
- "movaps (%0, %4), %%xmm2 \n" \ |
|
88 |
- "movaps (%0, %5), %%xmm3 \n" \ |
|
89 |
- "movaps (%0, %6), %%xmm4 \n" \ |
|
90 |
- "mulps %%xmm5, %%xmm0 \n" \ |
|
91 |
- "mulps %%xmm6, %%xmm1 \n" \ |
|
92 |
- "mulps %%xmm5, %%xmm2 \n" \ |
|
93 |
- "mulps %%xmm7, %%xmm3 \n" \ |
|
94 |
- "mulps %%xmm7, %%xmm4 \n" \ |
|
95 |
- stereo("addps %%xmm1, %%xmm0 \n") \ |
|
96 |
- "addps %%xmm1, %%xmm2 \n" \ |
|
97 |
- "addps %%xmm3, %%xmm0 \n" \ |
|
98 |
- "addps %%xmm4, %%xmm2 \n" \ |
|
99 |
- mono("addps %%xmm2, %%xmm0 \n") \ |
|
100 |
- "movaps %%xmm0, (%0, %2) \n" \ |
|
101 |
- stereo("movaps %%xmm2, (%0, %3) \n") \ |
|
102 |
- "add $16, %0 \n" \ |
|
103 |
- "jl 1b \n" \ |
|
104 |
- : "+&r"(i) \ |
|
105 |
- : "r"(matrix[0]), \ |
|
106 |
- "r"(samples[0] + len), \ |
|
107 |
- "r"(samples[1] + len), \ |
|
108 |
- "r"(samples[2] + len), \ |
|
109 |
- "r"(samples[3] + len), \ |
|
110 |
- "r"(samples[4] + len) \ |
|
111 |
- : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ |
|
112 |
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \ |
|
113 |
- "memory" \ |
|
114 |
- ); |
|
115 |
- |
|
116 |
-#define MIX_MISC(stereo) \ |
|
117 |
- __asm__ volatile ( \ |
|
118 |
- "mov %5, %2 \n" \ |
|
119 |
- "1: \n" \ |
|
120 |
- "mov -%c7(%6, %2, %c8), %3 \n" \ |
|
121 |
- "movaps (%3, %0), %%xmm0 \n" \ |
|
122 |
- stereo("movaps %%xmm0, %%xmm1 \n") \ |
|
123 |
- "mulps %%xmm4, %%xmm0 \n" \ |
|
124 |
- stereo("mulps %%xmm5, %%xmm1 \n") \ |
|
125 |
- "2: \n" \ |
|
126 |
- "mov (%6, %2, %c8), %1 \n" \ |
|
127 |
- "movaps (%1, %0), %%xmm2 \n" \ |
|
128 |
- stereo("movaps %%xmm2, %%xmm3 \n") \ |
|
129 |
- "mulps (%4, %2, 8), %%xmm2 \n" \ |
|
130 |
- stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \ |
|
131 |
- "addps %%xmm2, %%xmm0 \n" \ |
|
132 |
- stereo("addps %%xmm3, %%xmm1 \n") \ |
|
133 |
- "add $4, %2 \n" \ |
|
134 |
- "jl 2b \n" \ |
|
135 |
- "mov %5, %2 \n" \ |
|
136 |
- stereo("mov (%6, %2, %c8), %1 \n") \ |
|
137 |
- "movaps %%xmm0, (%3, %0) \n" \ |
|
138 |
- stereo("movaps %%xmm1, (%1, %0) \n") \ |
|
139 |
- "add $16, %0 \n" \ |
|
140 |
- "jl 1b \n" \ |
|
141 |
- : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \ |
|
142 |
- : "r"(matrix_simd + in_ch), \ |
|
143 |
- "g"((intptr_t) - 4 * (in_ch - 1)), \ |
|
144 |
- "r"(samp + in_ch), \ |
|
145 |
- "i"(sizeof(float *)), "i"(sizeof(float *)/4) \ |
|
146 |
- : "memory" \ |
|
147 |
- ); |
|
148 |
- |
|
149 |
-static void ac3_downmix_sse(float **samples, float **matrix, |
|
150 |
- int out_ch, int in_ch, int len) |
|
151 |
-{ |
|
152 |
- int **matrix_cmp = (int **)matrix; |
|
153 |
- intptr_t i, j, k, m; |
|
154 |
- |
|
155 |
- i = -len * sizeof(float); |
|
156 |
- if (in_ch == 5 && out_ch == 2 && |
|
157 |
- !(matrix_cmp[1][0] | matrix_cmp[0][2] | |
|
158 |
- matrix_cmp[1][3] | matrix_cmp[0][4] | |
|
159 |
- (matrix_cmp[0][1] ^ matrix_cmp[1][1]) | |
|
160 |
- (matrix_cmp[0][0] ^ matrix_cmp[1][2]))) { |
|
161 |
- MIX5(IF0, IF1); |
|
162 |
- } else if (in_ch == 5 && out_ch == 1 && |
|
163 |
- matrix_cmp[0][0] == matrix_cmp[0][2] && |
|
164 |
- matrix_cmp[0][3] == matrix_cmp[0][4]) { |
|
165 |
- MIX5(IF1, IF0); |
|
166 |
- } else { |
|
167 |
- LOCAL_ALIGNED(16, float, matrix_simd, [AC3_MAX_CHANNELS], [2][4]); |
|
168 |
- float *samp[AC3_MAX_CHANNELS]; |
|
169 |
- |
|
170 |
- for (j = 0; j < in_ch; j++) |
|
171 |
- samp[j] = samples[j] + len; |
|
172 |
- |
|
173 |
- j = 2 * in_ch * sizeof(float); |
|
174 |
- k = in_ch * sizeof(float); |
|
175 |
- __asm__ volatile ( |
|
176 |
- "1: \n" |
|
177 |
- "sub $4, %1 \n" |
|
178 |
- "sub $8, %0 \n" |
|
179 |
- "movss (%3, %1), %%xmm4 \n" |
|
180 |
- "movss (%4, %1), %%xmm5 \n" |
|
181 |
- "shufps $0, %%xmm4, %%xmm4 \n" |
|
182 |
- "shufps $0, %%xmm5, %%xmm5 \n" |
|
183 |
- "movaps %%xmm4, (%2, %0, 4) \n" |
|
184 |
- "movaps %%xmm5, 16(%2, %0, 4) \n" |
|
185 |
- "jg 1b \n" |
|
186 |
- : "+&r"(j), "+&r"(k) |
|
187 |
- : "r"(matrix_simd), "r"(matrix[0]), "r"(matrix[1]) |
|
188 |
- : "memory" |
|
189 |
- ); |
|
190 |
- if (out_ch == 2) { |
|
191 |
- MIX_MISC(IF1); |
|
192 |
- } else { |
|
193 |
- MIX_MISC(IF0); |
|
194 |
- } |
|
195 |
- } |
|
196 |
-} |
|
197 |
- |
|
198 |
-#endif /* HAVE_SSE_INLINE && HAVE_7REGS */ |
|
199 |
- |
|
200 | 66 |
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
201 | 67 |
{ |
202 | 68 |
int cpu_flags = av_get_cpu_flags(); |
... | ... |
@@ -252,10 +118,47 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
252 | 252 |
c->apply_window_int16 = ff_apply_window_int16_ssse3; |
253 | 253 |
} |
254 | 254 |
} |
255 |
+} |
|
256 |
+ |
|
257 |
+#define DOWNMIX_FUNC_OPT(ch, opt) \ |
|
258 |
+void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples, \ |
|
259 |
+ float **matrix, int len); \ |
|
260 |
+void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples, \ |
|
261 |
+ float **matrix, int len); |
|
262 |
+ |
|
263 |
+#define DOWNMIX_FUNCS(opt) \ |
|
264 |
+ DOWNMIX_FUNC_OPT(3, opt) \ |
|
265 |
+ DOWNMIX_FUNC_OPT(4, opt) \ |
|
266 |
+ DOWNMIX_FUNC_OPT(5, opt) \ |
|
267 |
+ DOWNMIX_FUNC_OPT(6, opt) |
|
268 |
+ |
|
269 |
+DOWNMIX_FUNCS(sse) |
|
270 |
+DOWNMIX_FUNCS(avx) |
|
271 |
+DOWNMIX_FUNCS(fma3) |
|
272 |
+ |
|
273 |
+void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c) |
|
274 |
+{ |
|
275 |
+ int cpu_flags = av_get_cpu_flags(); |
|
276 |
+ |
|
277 |
+#define SET_DOWNMIX(ch, suf, SUF) \ |
|
278 |
+ if (ch == c->in_channels) { \ |
|
279 |
+ if (EXTERNAL_ ## SUF (cpu_flags)) { \ |
|
280 |
+ if (c->out_channels == 1) \ |
|
281 |
+ c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf; \ |
|
282 |
+ else \ |
|
283 |
+ c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf; \ |
|
284 |
+ } \ |
|
285 |
+ } |
|
286 |
+ |
|
287 |
+#define SET_DOWNMIX_ALL(suf, SUF) \ |
|
288 |
+ SET_DOWNMIX(3, suf, SUF) \ |
|
289 |
+ SET_DOWNMIX(4, suf, SUF) \ |
|
290 |
+ SET_DOWNMIX(5, suf, SUF) \ |
|
291 |
+ SET_DOWNMIX(6, suf, SUF) |
|
255 | 292 |
|
256 |
-#if HAVE_SSE_INLINE && HAVE_7REGS |
|
257 |
- if (INLINE_SSE(cpu_flags)) { |
|
258 |
- c->downmix = ac3_downmix_sse; |
|
293 |
+ SET_DOWNMIX_ALL(sse, SSE) |
|
294 |
+ if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) { |
|
295 |
+ SET_DOWNMIX_ALL(avx, AVX) |
|
296 |
+ SET_DOWNMIX_ALL(fma3, FMA3) |
|
259 | 297 |
} |
260 |
-#endif |
|
261 | 298 |
} |