... | ... |
@@ -160,6 +160,8 @@ static av_cold void ac3_tables_init(void) |
160 | 160 |
static av_cold int ac3_decode_init(AVCodecContext *avctx) |
161 | 161 |
{ |
162 | 162 |
AC3DecodeContext *s = avctx->priv_data; |
163 |
+ int i; |
|
164 |
+ |
|
163 | 165 |
s->avctx = avctx; |
164 | 166 |
|
165 | 167 |
ff_ac3_common_init(); |
... | ... |
@@ -185,6 +187,12 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx) |
185 | 185 |
avcodec_get_frame_defaults(&s->frame); |
186 | 186 |
avctx->coded_frame = &s->frame; |
187 | 187 |
|
188 |
+ for (i = 0; i < AC3_MAX_CHANNELS; i++) { |
|
189 |
+ s->outptr[i] = s->output[i]; |
|
190 |
+ s->xcfptr[i] = s->transform_coeffs[i]; |
|
191 |
+ s->dlyptr[i] = s->delay[i]; |
|
192 |
+ } |
|
193 |
+ |
|
188 | 194 |
return 0; |
189 | 195 |
} |
190 | 196 |
|
... | ... |
@@ -1231,18 +1239,18 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) |
1231 | 1231 |
do_imdct(s, s->channels); |
1232 | 1232 |
|
1233 | 1233 |
if (downmix_output) { |
1234 |
- s->ac3dsp.downmix(s->output, s->downmix_coeffs, |
|
1234 |
+ s->ac3dsp.downmix(s->outptr, s->downmix_coeffs, |
|
1235 | 1235 |
s->out_channels, s->fbw_channels, 256); |
1236 | 1236 |
} |
1237 | 1237 |
} else { |
1238 | 1238 |
if (downmix_output) { |
1239 |
- s->ac3dsp.downmix(s->transform_coeffs + 1, s->downmix_coeffs, |
|
1239 |
+ s->ac3dsp.downmix(s->xcfptr + 1, s->downmix_coeffs, |
|
1240 | 1240 |
s->out_channels, s->fbw_channels, 256); |
1241 | 1241 |
} |
1242 | 1242 |
|
1243 | 1243 |
if (downmix_output && !s->downmixed) { |
1244 | 1244 |
s->downmixed = 1; |
1245 |
- s->ac3dsp.downmix(s->delay, s->downmix_coeffs, s->out_channels, |
|
1245 |
+ s->ac3dsp.downmix(s->dlyptr, s->downmix_coeffs, s->out_channels, |
|
1246 | 1246 |
s->fbw_channels, 128); |
1247 | 1247 |
} |
1248 | 1248 |
|
... | ... |
@@ -197,6 +197,10 @@ typedef struct AC3DecodeContext { |
197 | 197 |
FmtConvertContext fmt_conv; ///< optimized conversion functions |
198 | 198 |
///@} |
199 | 199 |
|
200 |
+ float *outptr[AC3_MAX_CHANNELS]; |
|
201 |
+ float *xcfptr[AC3_MAX_CHANNELS]; |
|
202 |
+ float *dlyptr[AC3_MAX_CHANNELS]; |
|
203 |
+ |
|
200 | 204 |
///@name Aligned arrays |
201 | 205 |
DECLARE_ALIGNED(16, int, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< fixed-point transform coefficients |
202 | 206 |
DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< transform coefficients |
... | ... |
@@ -171,7 +171,7 @@ static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs) |
171 | 171 |
} |
172 | 172 |
} |
173 | 173 |
|
174 |
-static void ac3_downmix_c(float (*samples)[256], float (*matrix)[2], |
|
174 |
+static void ac3_downmix_c(float **samples, float (*matrix)[2], |
|
175 | 175 |
int out_ch, int in_ch, int len) |
176 | 176 |
{ |
177 | 177 |
int i, j; |
... | ... |
@@ -126,7 +126,7 @@ typedef struct AC3DSPContext { |
126 | 126 |
|
127 | 127 |
void (*extract_exponents)(uint8_t *exp, int32_t *coef, int nb_coefs); |
128 | 128 |
|
129 |
- void (*downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, |
|
129 |
+ void (*downmix)(float **samples, float (*matrix)[2], int out_ch, |
|
130 | 130 |
int in_ch, int len); |
131 | 131 |
} AC3DSPContext; |
132 | 132 |
|
... | ... |
@@ -51,25 +51,25 @@ extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_c |
51 | 51 |
extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); |
52 | 52 |
extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); |
53 | 53 |
|
54 |
-#if HAVE_SSE_INLINE |
|
54 |
+#if HAVE_SSE_INLINE && HAVE_7REGS |
|
55 | 55 |
|
56 | 56 |
#define IF1(x) x |
57 | 57 |
#define IF0(x) |
58 | 58 |
|
59 | 59 |
#define MIX5(mono, stereo) \ |
60 | 60 |
__asm__ volatile ( \ |
61 |
- "movss 0(%2), %%xmm5 \n" \ |
|
62 |
- "movss 8(%2), %%xmm6 \n" \ |
|
63 |
- "movss 24(%2), %%xmm7 \n" \ |
|
61 |
+ "movss 0(%1), %%xmm5 \n" \ |
|
62 |
+ "movss 8(%1), %%xmm6 \n" \ |
|
63 |
+ "movss 24(%1), %%xmm7 \n" \ |
|
64 | 64 |
"shufps $0, %%xmm5, %%xmm5 \n" \ |
65 | 65 |
"shufps $0, %%xmm6, %%xmm6 \n" \ |
66 | 66 |
"shufps $0, %%xmm7, %%xmm7 \n" \ |
67 | 67 |
"1: \n" \ |
68 |
- "movaps (%0, %1), %%xmm0 \n" \ |
|
69 |
- "movaps 0x400(%0, %1), %%xmm1 \n" \ |
|
70 |
- "movaps 0x800(%0, %1), %%xmm2 \n" \ |
|
71 |
- "movaps 0xc00(%0, %1), %%xmm3 \n" \ |
|
72 |
- "movaps 0x1000(%0, %1), %%xmm4 \n" \ |
|
68 |
+ "movaps (%0, %2), %%xmm0 \n" \ |
|
69 |
+ "movaps (%0, %3), %%xmm1 \n" \ |
|
70 |
+ "movaps (%0, %4), %%xmm2 \n" \ |
|
71 |
+ "movaps (%0, %5), %%xmm3 \n" \ |
|
72 |
+ "movaps (%0, %6), %%xmm4 \n" \ |
|
73 | 73 |
"mulps %%xmm5, %%xmm0 \n" \ |
74 | 74 |
"mulps %%xmm6, %%xmm1 \n" \ |
75 | 75 |
"mulps %%xmm5, %%xmm2 \n" \ |
... | ... |
@@ -80,12 +80,17 @@ extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_c |
80 | 80 |
"addps %%xmm3, %%xmm0 \n" \ |
81 | 81 |
"addps %%xmm4, %%xmm2 \n" \ |
82 | 82 |
mono("addps %%xmm2, %%xmm0 \n") \ |
83 |
- "movaps %%xmm0, (%0, %1) \n" \ |
|
84 |
- stereo("movaps %%xmm2, 0x400(%0, %1) \n") \ |
|
83 |
+ "movaps %%xmm0, (%0, %2) \n" \ |
|
84 |
+ stereo("movaps %%xmm2, (%0, %3) \n") \ |
|
85 | 85 |
"add $16, %0 \n" \ |
86 | 86 |
"jl 1b \n" \ |
87 | 87 |
: "+&r"(i) \ |
88 |
- : "r"(samples[0] + len), "r"(matrix) \ |
|
88 |
+ : "r"(matrix), \ |
|
89 |
+ "r"(samples[0] + len), \ |
|
90 |
+ "r"(samples[1] + len), \ |
|
91 |
+ "r"(samples[2] + len), \ |
|
92 |
+ "r"(samples[3] + len), \ |
|
93 |
+ "r"(samples[4] + len) \ |
|
89 | 94 |
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ |
90 | 95 |
"%xmm4", "%xmm5", "%xmm6", "%xmm7",) \ |
91 | 96 |
"memory" \ |
... | ... |
@@ -93,38 +98,42 @@ extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_c |
93 | 93 |
|
94 | 94 |
#define MIX_MISC(stereo) \ |
95 | 95 |
__asm__ volatile ( \ |
96 |
+ "mov %5, %2 \n" \ |
|
96 | 97 |
"1: \n" \ |
98 |
+ "mov -%c7(%6, %2, %c8), %3 \n" \ |
|
97 | 99 |
"movaps (%3, %0), %%xmm0 \n" \ |
98 | 100 |
stereo("movaps %%xmm0, %%xmm1 \n") \ |
99 | 101 |
"mulps %%xmm4, %%xmm0 \n" \ |
100 | 102 |
stereo("mulps %%xmm5, %%xmm1 \n") \ |
101 |
- "lea 1024(%3, %0), %1 \n" \ |
|
102 |
- "mov %5, %2 \n" \ |
|
103 | 103 |
"2: \n" \ |
104 |
- "movaps (%1), %%xmm2 \n" \ |
|
104 |
+ "mov (%6, %2, %c8), %1 \n" \ |
|
105 |
+ "movaps (%1, %0), %%xmm2 \n" \ |
|
105 | 106 |
stereo("movaps %%xmm2, %%xmm3 \n") \ |
106 |
- "mulps (%4, %2), %%xmm2 \n" \ |
|
107 |
- stereo("mulps 16(%4, %2), %%xmm3 \n") \ |
|
107 |
+ "mulps (%4, %2, 8), %%xmm2 \n" \ |
|
108 |
+ stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \ |
|
108 | 109 |
"addps %%xmm2, %%xmm0 \n" \ |
109 | 110 |
stereo("addps %%xmm3, %%xmm1 \n") \ |
110 |
- "add $1024, %1 \n" \ |
|
111 |
- "add $32, %2 \n" \ |
|
111 |
+ "add $4, %2 \n" \ |
|
112 | 112 |
"jl 2b \n" \ |
113 |
- "movaps %%xmm0, (%3, %0) \n" \ |
|
114 |
- stereo("movaps %%xmm1, 1024(%3, %0) \n") \ |
|
113 |
+ "mov %5, %2 \n" \ |
|
114 |
+ stereo("mov (%6, %2, %c8), %1 \n") \ |
|
115 |
+ "movaps %%xmm0, (%3, %0) \n" \ |
|
116 |
+ stereo("movaps %%xmm1, (%1, %0) \n") \ |
|
115 | 117 |
"add $16, %0 \n" \ |
116 | 118 |
"jl 1b \n" \ |
117 |
- : "+&r"(i), "=&r"(j), "=&r"(k) \ |
|
118 |
- : "r"(samples[0] + len), "r"(matrix_simd + in_ch), \ |
|
119 |
- "g"((intptr_t) - 32 * (in_ch - 1)) \ |
|
119 |
+ : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \ |
|
120 |
+ : "r"(matrix_simd + in_ch), \ |
|
121 |
+ "g"((intptr_t) - 4 * (in_ch - 1)), \ |
|
122 |
+ "r"(samp + in_ch), \ |
|
123 |
+ "i"(sizeof(float *)), "i"(sizeof(float *)/4) \ |
|
120 | 124 |
: "memory" \ |
121 | 125 |
); |
122 | 126 |
|
123 |
-static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], |
|
127 |
+static void ac3_downmix_sse(float **samples, float (*matrix)[2], |
|
124 | 128 |
int out_ch, int in_ch, int len) |
125 | 129 |
{ |
126 | 130 |
int (*matrix_cmp)[2] = (int(*)[2])matrix; |
127 |
- intptr_t i, j, k; |
|
131 |
+ intptr_t i, j, k, m; |
|
128 | 132 |
|
129 | 133 |
i = -len * sizeof(float); |
130 | 134 |
if (in_ch == 5 && out_ch == 2 && |
... | ... |
@@ -139,6 +148,11 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], |
139 | 139 |
MIX5(IF1, IF0); |
140 | 140 |
} else { |
141 | 141 |
DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; |
142 |
+ float *samp[AC3_MAX_CHANNELS]; |
|
143 |
+ |
|
144 |
+ for (j = 0; j < in_ch; j++) |
|
145 |
+ samp[j] = samples[j] + len; |
|
146 |
+ |
|
142 | 147 |
j = 2 * in_ch * sizeof(float); |
143 | 148 |
__asm__ volatile ( |
144 | 149 |
"1: \n" |
... | ... |
@@ -162,7 +176,7 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], |
162 | 162 |
} |
163 | 163 |
} |
164 | 164 |
|
165 |
-#endif /* HAVE_SSE_INLINE */ |
|
165 |
+#endif /* HAVE_SSE_INLINE && HAVE_7REGS */ |
|
166 | 166 |
|
167 | 167 |
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
168 | 168 |
{ |
... | ... |
@@ -205,7 +219,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
205 | 205 |
} |
206 | 206 |
} |
207 | 207 |
|
208 |
-#if HAVE_SSE_INLINE |
|
208 |
+#if HAVE_SSE_INLINE && HAVE_7REGS |
|
209 | 209 |
if (INLINE_SSE(mm_flags)) { |
210 | 210 |
c->downmix = ac3_downmix_sse; |
211 | 211 |
} |