Signed-off-by: Bojan Zivkovic <bojan@mips.com>
Reviewed-by: Nedeljko Babic <Nedeljko.Babic@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
... | ... |
@@ -766,6 +766,9 @@ static av_cold int aac_encode_init(AVCodecContext *avctx) |
766 | 766 |
s->psypp = ff_psy_preprocess_init(avctx); |
767 | 767 |
s->coder = &ff_aac_coders[s->options.aac_coder]; |
768 | 768 |
|
769 |
+ if (HAVE_MIPSDSPR1) |
|
770 |
+ ff_aac_coder_init_mips(s); |
|
771 |
+ |
|
769 | 772 |
s->lambda = avctx->global_quality ? avctx->global_quality : 120; |
770 | 773 |
|
771 | 774 |
ff_aac_tableinit(); |
20 | 21 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,2498 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2012 |
|
2 |
+ * MIPS Technologies, Inc., California. |
|
3 |
+ * |
|
4 |
+ * Redistribution and use in source and binary forms, with or without |
|
5 |
+ * modification, are permitted provided that the following conditions |
|
6 |
+ * are met: |
|
7 |
+ * 1. Redistributions of source code must retain the above copyright |
|
8 |
+ * notice, this list of conditions and the following disclaimer. |
|
9 |
+ * 2. Redistributions in binary form must reproduce the above copyright |
|
10 |
+ * notice, this list of conditions and the following disclaimer in the |
|
11 |
+ * documentation and/or other materials provided with the distribution. |
|
12 |
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its |
|
13 |
+ * contributors may be used to endorse or promote products derived from |
|
14 |
+ * this software without specific prior written permission. |
|
15 |
+ * |
|
16 |
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND |
|
17 |
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
18 |
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
19 |
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE |
|
20 |
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
21 |
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
|
22 |
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
|
23 |
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
24 |
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
25 |
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
26 |
+ * SUCH DAMAGE. |
|
27 |
+ * |
|
28 |
+ * Author: Stanislav Ocovaj (socovaj@mips.com) |
|
29 |
+ * Szabolcs Pal (sabolc@mips.com) |
|
30 |
+ * |
|
31 |
+ * AAC coefficients encoder optimized for MIPS floating-point architecture |
|
32 |
+ * |
|
33 |
+ * This file is part of FFmpeg. |
|
34 |
+ * |
|
35 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
36 |
+ * modify it under the terms of the GNU Lesser General Public |
|
37 |
+ * License as published by the Free Software Foundation; either |
|
38 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
39 |
+ * |
|
40 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
41 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
42 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
43 |
+ * Lesser General Public License for more details. |
|
44 |
+ * |
|
45 |
+ * You should have received a copy of the GNU Lesser General Public |
|
46 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
47 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
48 |
+ */ |
|
49 |
+ |
|
50 |
+/** |
|
51 |
+ * @file |
|
52 |
+ * Reference: libavcodec/aaccoder.c |
|
53 |
+ */ |
|
54 |
+ |
|
55 |
+#include "libavutil/libm.h" |
|
56 |
+ |
|
57 |
+#include <float.h> |
|
58 |
+#include "libavutil/mathematics.h" |
|
59 |
+#include "libavcodec/avcodec.h" |
|
60 |
+#include "libavcodec/put_bits.h" |
|
61 |
+#include "libavcodec/aac.h" |
|
62 |
+#include "libavcodec/aacenc.h" |
|
63 |
+#include "libavcodec/aactab.h" |
|
64 |
+ |
|
65 |
+#if HAVE_INLINE_ASM |
|
66 |
/**
 * Node of the codebook-selection search path; mirrors the structure of
 * the same name in the reference implementation (libavcodec/aaccoder.c).
 */
typedef struct BandCodingPath {
    int prev_idx;   /* index of the previous node on the best path */
    float cost;     /* accumulated coding cost up to this node */
    int run;        /* length of the current run of identical codebooks */
} BandCodingPath;
|
71 |
+ |
|
72 |
/* Bits needed to code a codebook run value for long windows
 * (index = run length - 1). */
static const uint8_t run_value_bits_long[64] = {
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
};

/* Bits needed to code a codebook run value for short windows. */
static const uint8_t run_value_bits_short[16] = {
    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
};

/* Index 0: long windows, index 1: short windows. */
static const uint8_t *run_value_bits[2] = {
    run_value_bits_long, run_value_bits_short
};
|
86 |
+ |
|
87 |
/* Number of sign bits required for each index of the unsigned quad
 * codebooks: the index encodes four base-3 digits (values 0..2) and the
 * table entry is the count of non-zero digits, i.e. how many coefficients
 * need an explicit sign bit. */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4
};

/* Sign-bit counts for the unsigned pair codebook with max value 7:
 * index encodes two base-8 digits; entry = number of non-zero digits. */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
};

/* Sign-bit counts for the unsigned pair codebook with max value 12:
 * index encodes two base-13 digits; entry = number of non-zero digits. */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};

/* Sign-bit counts for the escape codebook (values 0..16, two base-17
 * digits per index); entry = number of non-zero digits. */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
|
145 |
+ |
|
146 |
+static void abs_pow34_v(float *out, const float *in, const int size) { |
|
147 |
+#ifndef USE_REALLY_FULL_SEARCH |
|
148 |
+ int i; |
|
149 |
+ float a, b, c, d; |
|
150 |
+ float ax, bx, cx, dx; |
|
151 |
+ |
|
152 |
+ for (i = 0; i < size; i += 4) { |
|
153 |
+ a = fabsf(in[i ]); |
|
154 |
+ b = fabsf(in[i+1]); |
|
155 |
+ c = fabsf(in[i+2]); |
|
156 |
+ d = fabsf(in[i+3]); |
|
157 |
+ |
|
158 |
+ ax = sqrtf(a); |
|
159 |
+ bx = sqrtf(b); |
|
160 |
+ cx = sqrtf(c); |
|
161 |
+ dx = sqrtf(d); |
|
162 |
+ |
|
163 |
+ a = a * ax; |
|
164 |
+ b = b * bx; |
|
165 |
+ c = c * cx; |
|
166 |
+ d = d * dx; |
|
167 |
+ |
|
168 |
+ out[i ] = sqrtf(a); |
|
169 |
+ out[i+1] = sqrtf(b); |
|
170 |
+ out[i+2] = sqrtf(c); |
|
171 |
+ out[i+3] = sqrtf(d); |
|
172 |
+ } |
|
173 |
+#endif /* USE_REALLY_FULL_SEARCH */ |
|
174 |
+} |
|
175 |
+ |
|
176 |
/**
 * Return the largest value over a window group: 'group_len' windows of
 * 'swb_size' coefficients each, consecutive windows being 128 floats
 * apart in 'scaled'.
 */
static float find_max_val(int group_len, int swb_size, const float *scaled)
{
    float peak = 0.0f;
    int win, k;

    for (win = 0; win < group_len; win++) {
        const float *band = scaled + win * 128;
        for (k = 0; k < swb_size; k++)
            peak = (peak > band[k]) ? peak : band[k];
    }
    return peak;
}
|
186 |
+ |
|
187 |
+static int find_min_book(float maxval, int sf) { |
|
188 |
+ float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512]; |
|
189 |
+ float Q34 = sqrtf(Q * sqrtf(Q)); |
|
190 |
+ int qmaxval, cb; |
|
191 |
+ qmaxval = maxval * Q34 + 0.4054f; |
|
192 |
+ if (qmaxval == 0) cb = 0; |
|
193 |
+ else if (qmaxval == 1) cb = 1; |
|
194 |
+ else if (qmaxval == 2) cb = 3; |
|
195 |
+ else if (qmaxval <= 4) cb = 5; |
|
196 |
+ else if (qmaxval <= 7) cb = 7; |
|
197 |
+ else if (qmaxval <= 12) cb = 9; |
|
198 |
+ else cb = 11; |
|
199 |
+ return cb; |
|
200 |
+} |
|
201 |
+ |
|
202 |
+/** |
|
203 |
+ * Functions developed from template function and optimized for quantizing and encoding band |
|
204 |
+ */ |
|
205 |
/**
 * Quantize and encode one band with a signed quad codebook (four signed
 * values per codeword, each in -1..1).
 *
 * The inline assembly clamps each quantized magnitude to 0/1 (slt
 * against $zero) and negates it when the matching input coefficient is
 * negative, testing the IEEE-754 sign bit directly (srl by 31 + movn).
 *
 * NOTE(review): lambda, uplim and bits are unused in this variant; the
 * caller-provided 'scaled' pointer is overwritten with s->scoefs below.
 */
static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits)
{
    /* quantizer step for this scalefactor index, from the 3/4-power table */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx;
        /* NOTE(review): reads float bits through an int pointer to reach
         * the sign bit; breaks strict aliasing and assumes 32-bit float. */
        int *in_int = (int *)&in[i];

        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            /* qcN = (qcN > 0): clamp magnitudes to 0/1 */
            "slt    %[qc1], $zero, %[qc1]               \n\t"
            "slt    %[qc2], $zero, %[qc2]               \n\t"
            "slt    %[qc3], $zero, %[qc3]               \n\t"
            "slt    %[qc4], $zero, %[qc4]               \n\t"
            /* tN = sign bit of input coefficient N */
            "lw     $t0, 0(%[in_int])                   \n\t"
            "lw     $t1, 4(%[in_int])                   \n\t"
            "lw     $t2, 8(%[in_int])                   \n\t"
            "lw     $t3, 12(%[in_int])                  \n\t"
            "srl    $t0, $t0, 31                        \n\t"
            "srl    $t1, $t1, 31                        \n\t"
            "srl    $t2, $t2, 31                        \n\t"
            "srl    $t3, $t3, 31                        \n\t"
            /* negate qcN when the input was negative */
            "subu   $t4, $zero, %[qc1]                  \n\t"
            "subu   $t5, $zero, %[qc2]                  \n\t"
            "subu   $t6, $zero, %[qc3]                  \n\t"
            "subu   $t7, $zero, %[qc4]                  \n\t"
            "movn   %[qc1], $t4, $t0                    \n\t"
            "movn   %[qc2], $t5, $t1                    \n\t"
            "movn   %[qc3], $t6, $t2                    \n\t"
            "movn   %[qc4], $t7, $t3                    \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* pack the four signed digits (-1/0/1) base-3, biased by +40,
         * into the codebook index */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;

        put_bits(pb, p_bits[curidx], p_codes[curidx]);
    }
}
|
276 |
+ |
|
277 |
/**
 * Quantize and encode one band with an unsigned quad codebook
 * (magnitudes 0..2, signs coded separately).
 *
 * The assembly clamps each quantized magnitude to at most 2, then
 * gathers the sign bits of the non-zero coefficients MSB-first into
 * 'sign' (shift-left + or, committed with movn only when qcN != 0) and
 * counts the non-zero coefficients in 'count'.
 *
 * NOTE(review): lambda, uplim and bits are unused in this variant; the
 * caller-provided 'scaled' pointer is overwritten with s->scoefs below.
 */
static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign, count;
        /* NOTE(review): aliasing float as int to read sign bits */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            /* clamp magnitudes to 2 */
            "ori    $t4, $zero, 2                       \n\t"
            "ori    %[sign], $zero, 0                   \n\t"
            "slt    $t0, $t4, %[qc1]                    \n\t"
            "slt    $t1, $t4, %[qc2]                    \n\t"
            "slt    $t2, $t4, %[qc3]                    \n\t"
            "slt    $t3, $t4, %[qc4]                    \n\t"
            "movn   %[qc1], $t4, $t0                    \n\t"
            "movn   %[qc2], $t4, $t1                    \n\t"
            "movn   %[qc3], $t4, $t2                    \n\t"
            "movn   %[qc4], $t4, $t3                    \n\t"
            /* accumulate sign bits (non-zero coefficients only) and
             * count the non-zero coefficients */
            "lw     $t0, 0(%[in_int])                   \n\t"
            "lw     $t1, 4(%[in_int])                   \n\t"
            "lw     $t2, 8(%[in_int])                   \n\t"
            "lw     $t3, 12(%[in_int])                  \n\t"
            "slt    $t0, $t0, $zero                     \n\t"
            "movn   %[sign], $t0, %[qc1]                \n\t"
            "slt    $t1, $t1, $zero                     \n\t"
            "slt    $t2, $t2, $zero                     \n\t"
            "slt    $t3, $t3, $zero                     \n\t"
            "sll    $t0, %[sign], 1                     \n\t"
            "or     $t0, $t0, $t1                       \n\t"
            "movn   %[sign], $t0, %[qc2]                \n\t"
            "slt    $t4, $zero, %[qc1]                  \n\t"
            "slt    $t1, $zero, %[qc2]                  \n\t"
            "slt    %[count], $zero, %[qc3]             \n\t"
            "sll    $t0, %[sign], 1                     \n\t"
            "or     $t0, $t0, $t2                       \n\t"
            "movn   %[sign], $t0, %[qc3]                \n\t"
            "slt    $t2, $zero, %[qc4]                  \n\t"
            "addu   %[count], %[count], $t4             \n\t"
            "addu   %[count], %[count], $t1             \n\t"
            "sll    $t0, %[sign], 1                     \n\t"
            "or     $t0, $t0, $t3                       \n\t"
            "movn   %[sign], $t0, %[qc4]                \n\t"
            "addu   %[count], %[count], $t2             \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign]"=&r"(sign), [count]"=&r"(count)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* base-3 packing of the four unsigned digits */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;

        /* codeword followed by 'count' sign bits */
        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
        v_bits  = p_bits[curidx] + count;
        put_bits(pb, v_bits, v_codes);
    }
}
|
366 |
+ |
|
367 |
/**
 * Quantize and encode one band with a signed pair codebook
 * (two signed values per codeword, each in -4..4).
 *
 * The assembly clamps each quantized magnitude to at most 4 and applies
 * the sign of the matching input coefficient (IEEE-754 sign bit via
 * srl 31 + movn).  Two pair codewords are then concatenated and written
 * with a single put_bits() call.
 *
 * NOTE(review): lambda, uplim and bits are unused in this variant; the
 * caller-provided 'scaled' pointer is overwritten with s->scoefs below.
 */
static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        /* NOTE(review): aliasing float as int to read sign bits */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            /* clamp magnitudes to 4 */
            "ori    $t4, $zero, 4                       \n\t"
            "slt    $t0, $t4, %[qc1]                    \n\t"
            "slt    $t1, $t4, %[qc2]                    \n\t"
            "slt    $t2, $t4, %[qc3]                    \n\t"
            "slt    $t3, $t4, %[qc4]                    \n\t"
            "movn   %[qc1], $t4, $t0                    \n\t"
            "movn   %[qc2], $t4, $t1                    \n\t"
            "movn   %[qc3], $t4, $t2                    \n\t"
            "movn   %[qc4], $t4, $t3                    \n\t"
            /* negate qcN when the input coefficient is negative */
            "lw     $t0, 0(%[in_int])                   \n\t"
            "lw     $t1, 4(%[in_int])                   \n\t"
            "lw     $t2, 8(%[in_int])                   \n\t"
            "lw     $t3, 12(%[in_int])                  \n\t"
            "srl    $t0, $t0, 31                        \n\t"
            "srl    $t1, $t1, 31                        \n\t"
            "srl    $t2, $t2, 31                        \n\t"
            "srl    $t3, $t3, 31                        \n\t"
            "subu   $t4, $zero, %[qc1]                  \n\t"
            "subu   $t5, $zero, %[qc2]                  \n\t"
            "subu   $t6, $zero, %[qc3]                  \n\t"
            "subu   $t7, $zero, %[qc4]                  \n\t"
            "movn   %[qc1], $t4, $t0                    \n\t"
            "movn   %[qc2], $t5, $t1                    \n\t"
            "movn   %[qc3], $t6, $t2                    \n\t"
            "movn   %[qc4], $t7, $t3                    \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* two pair indices, base-9 digits biased by +40 */
        curidx = 9 * qc1;
        curidx += qc2 + 40;

        curidx2 = 9 * qc3;
        curidx2 += qc4 + 40;

        /* concatenate both codewords into one put_bits() call */
        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
        v_bits  = p_bits[curidx] + p_bits[curidx2];
        put_bits(pb, v_bits, v_codes);
    }
}
|
444 |
+ |
|
445 |
/**
 * Quantize and encode one band with the unsigned pair codebook whose
 * maximum magnitude is 7 (signs coded separately).
 *
 * The assembly clamps each quantized magnitude to at most 7, builds the
 * per-pair sign fields sign1/sign2 (MSB-first, only for non-zero
 * coefficients) and counts the non-zero coefficients of each pair in
 * count1/count2.
 *
 * NOTE(review): lambda, uplim and bits are unused in this variant; the
 * caller-provided 'scaled' pointer is overwritten with s->scoefs below.
 */
static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
                                                      int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign1, count1, sign2, count2;
        /* NOTE(review): aliasing float as int to read sign bits */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            /* clamp magnitudes to 7 */
            "ori    $t4, $zero, 7                       \n\t"
            "ori    %[sign1], $zero, 0                  \n\t"
            "ori    %[sign2], $zero, 0                  \n\t"
            "slt    $t0, $t4, %[qc1]                    \n\t"
            "slt    $t1, $t4, %[qc2]                    \n\t"
            "slt    $t2, $t4, %[qc3]                    \n\t"
            "slt    $t3, $t4, %[qc4]                    \n\t"
            "movn   %[qc1], $t4, $t0                    \n\t"
            "movn   %[qc2], $t4, $t1                    \n\t"
            "movn   %[qc3], $t4, $t2                    \n\t"
            "movn   %[qc4], $t4, $t3                    \n\t"
            /* per-pair sign fields and non-zero counts */
            "lw     $t0, 0(%[in_int])                   \n\t"
            "lw     $t1, 4(%[in_int])                   \n\t"
            "lw     $t2, 8(%[in_int])                   \n\t"
            "lw     $t3, 12(%[in_int])                  \n\t"
            "slt    $t0, $t0, $zero                     \n\t"
            "movn   %[sign1], $t0, %[qc1]               \n\t"
            "slt    $t2, $t2, $zero                     \n\t"
            "movn   %[sign2], $t2, %[qc3]               \n\t"
            "slt    $t1, $t1, $zero                     \n\t"
            "sll    $t0, %[sign1], 1                    \n\t"
            "or     $t0, $t0, $t1                       \n\t"
            "movn   %[sign1], $t0, %[qc2]               \n\t"
            "slt    $t3, $t3, $zero                     \n\t"
            "sll    $t0, %[sign2], 1                    \n\t"
            "or     $t0, $t0, $t3                       \n\t"
            "movn   %[sign2], $t0, %[qc4]               \n\t"
            "slt    %[count1], $zero, %[qc1]            \n\t"
            "slt    $t1, $zero, %[qc2]                  \n\t"
            "slt    %[count2], $zero, %[qc3]            \n\t"
            "slt    $t2, $zero, %[qc4]                  \n\t"
            "addu   %[count1], %[count1], $t1           \n\t"
            "addu   %[count2], %[count2], $t2           \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* first pair: base-8 index, codeword plus sign bits */
        curidx = 8 * qc1;
        curidx += qc2;

        v_codes = (p_codes[curidx] << count1) | sign1;
        v_bits  = p_bits[curidx] + count1;
        put_bits(pb, v_bits, v_codes);

        /* second pair */
        curidx = 8 * qc3;
        curidx += qc4;

        v_codes = (p_codes[curidx] << count2) | sign2;
        v_bits  = p_bits[curidx] + count2;
        put_bits(pb, v_bits, v_codes);
    }
}
|
535 |
+ |
|
536 |
/**
 * Quantize and encode one band with the unsigned pair codebook whose
 * maximum magnitude is 12 (signs coded separately).
 *
 * Identical structure to the UPAIR7 variant: magnitudes are clamped to
 * at most 12, sign1/sign2 collect the sign bits of the non-zero
 * coefficients of each pair (MSB-first) and count1/count2 hold the
 * number of sign bits per pair.  Pair indices use base 13.
 *
 * NOTE(review): lambda, uplim and bits are unused in this variant; the
 * caller-provided 'scaled' pointer is overwritten with s->scoefs below.
 */
static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                                       PutBitContext *pb, const float *in,
                                                       const float *scaled, int size, int scale_idx,
                                                       int cb, const float lambda, const float uplim,
                                                       int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign1, count1, sign2, count2;
        /* NOTE(review): aliasing float as int to read sign bits */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            /* clamp magnitudes to 12 */
            "ori    $t4, $zero, 12                      \n\t"
            "ori    %[sign1], $zero, 0                  \n\t"
            "ori    %[sign2], $zero, 0                  \n\t"
            "slt    $t0, $t4, %[qc1]                    \n\t"
            "slt    $t1, $t4, %[qc2]                    \n\t"
            "slt    $t2, $t4, %[qc3]                    \n\t"
            "slt    $t3, $t4, %[qc4]                    \n\t"
            "movn   %[qc1], $t4, $t0                    \n\t"
            "movn   %[qc2], $t4, $t1                    \n\t"
            "movn   %[qc3], $t4, $t2                    \n\t"
            "movn   %[qc4], $t4, $t3                    \n\t"
            /* per-pair sign fields and non-zero counts */
            "lw     $t0, 0(%[in_int])                   \n\t"
            "lw     $t1, 4(%[in_int])                   \n\t"
            "lw     $t2, 8(%[in_int])                   \n\t"
            "lw     $t3, 12(%[in_int])                  \n\t"
            "slt    $t0, $t0, $zero                     \n\t"
            "movn   %[sign1], $t0, %[qc1]               \n\t"
            "slt    $t2, $t2, $zero                     \n\t"
            "movn   %[sign2], $t2, %[qc3]               \n\t"
            "slt    $t1, $t1, $zero                     \n\t"
            "sll    $t0, %[sign1], 1                    \n\t"
            "or     $t0, $t0, $t1                       \n\t"
            "movn   %[sign1], $t0, %[qc2]               \n\t"
            "slt    $t3, $t3, $zero                     \n\t"
            "sll    $t0, %[sign2], 1                    \n\t"
            "or     $t0, $t0, $t3                       \n\t"
            "movn   %[sign2], $t0, %[qc4]               \n\t"
            "slt    %[count1], $zero, %[qc1]            \n\t"
            "slt    $t1, $zero, %[qc2]                  \n\t"
            "slt    %[count2], $zero, %[qc3]            \n\t"
            "slt    $t2, $zero, %[qc4]                  \n\t"
            "addu   %[count1], %[count1], $t1           \n\t"
            "addu   %[count2], %[count2], $t2           \n\t"

            ".set pop                                   \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* first pair: base-13 index, codeword plus sign bits */
        curidx = 13 * qc1;
        curidx += qc2;

        v_codes = (p_codes[curidx] << count1) | sign1;
        v_bits  = p_bits[curidx] + count1;
        put_bits(pb, v_bits, v_codes);

        /* second pair */
        curidx = 13 * qc3;
        curidx += qc4;

        v_codes = (p_codes[curidx] << count2) | sign2;
        v_bits  = p_bits[curidx] + count2;
        put_bits(pb, v_bits, v_codes);
    }
}
|
626 |
+ |
|
627 |
/**
 * Quantize and encode one band with the escape codebook (pair values
 * 0..16, where 16 marks an escape-coded magnitude).
 *
 * Two paths: for cb < 11 the plain unsigned-pair path is used (base-17
 * indices, clamped to 16, with separate sign bits).  Otherwise, for the
 * true escape codebook, the pre-clamp quantized magnitudes c1..c4 are
 * kept (saturated by the DSP-ASE "shll_s.w" by 18 followed by "srl" 18,
 * i.e. limited to 14 bits) and an escape sequence is appended whenever
 * the codebook vector entry equals 64.0f (the ESC marker in
 * ff_aac_codebook_vectors).
 *
 * NOTE(review): the second path uses a MIPS DSP instruction
 * (shll_s.w) — presumably gated by the HAVE_MIPSDSPR1 check at init;
 * confirm against the build configuration.
 * NOTE(review): lambda, uplim and bits are unused in this variant; the
 * caller-provided 'scaled' pointer is overwritten with s->scoefs below.
 */
static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                                                   PutBitContext *pb, const float *in,
                                                   const float *scaled, int size, int scale_idx,
                                                   int cb, const float lambda, const float uplim,
                                                   int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
    float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1];

    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;

    if (cb < 11) {
        /* no escape values possible: same scheme as the UPAIR variants,
         * with base-17 pair indices */
        for (i = 0; i < size; i += 4) {
            int curidx, curidx2, sign1, count1, sign2, count2;
            /* NOTE(review): aliasing float as int to read sign bits */
            int *in_int = (int *)&in[i];
            uint8_t v_bits;
            unsigned int v_codes;

            qc1 = scaled[i  ] * Q34 + 0.4054f;
            qc2 = scaled[i+1] * Q34 + 0.4054f;
            qc3 = scaled[i+2] * Q34 + 0.4054f;
            qc4 = scaled[i+3] * Q34 + 0.4054f;

            __asm__ volatile (
                ".set push                                  \n\t"
                ".set noreorder                             \n\t"

                /* clamp magnitudes to 16 */
                "ori    $t4, $zero, 16                      \n\t"
                "ori    %[sign1], $zero, 0                  \n\t"
                "ori    %[sign2], $zero, 0                  \n\t"
                "slt    $t0, $t4, %[qc1]                    \n\t"
                "slt    $t1, $t4, %[qc2]                    \n\t"
                "slt    $t2, $t4, %[qc3]                    \n\t"
                "slt    $t3, $t4, %[qc4]                    \n\t"
                "movn   %[qc1], $t4, $t0                    \n\t"
                "movn   %[qc2], $t4, $t1                    \n\t"
                "movn   %[qc3], $t4, $t2                    \n\t"
                "movn   %[qc4], $t4, $t3                    \n\t"
                /* per-pair sign fields and non-zero counts */
                "lw     $t0, 0(%[in_int])                   \n\t"
                "lw     $t1, 4(%[in_int])                   \n\t"
                "lw     $t2, 8(%[in_int])                   \n\t"
                "lw     $t3, 12(%[in_int])                  \n\t"
                "slt    $t0, $t0, $zero                     \n\t"
                "movn   %[sign1], $t0, %[qc1]               \n\t"
                "slt    $t2, $t2, $zero                     \n\t"
                "movn   %[sign2], $t2, %[qc3]               \n\t"
                "slt    $t1, $t1, $zero                     \n\t"
                "sll    $t0, %[sign1], 1                    \n\t"
                "or     $t0, $t0, $t1                       \n\t"
                "movn   %[sign1], $t0, %[qc2]               \n\t"
                "slt    $t3, $t3, $zero                     \n\t"
                "sll    $t0, %[sign2], 1                    \n\t"
                "or     $t0, $t0, $t3                       \n\t"
                "movn   %[sign2], $t0, %[qc4]               \n\t"
                "slt    %[count1], $zero, %[qc1]            \n\t"
                "slt    $t1, $zero, %[qc2]                  \n\t"
                "slt    %[count2], $zero, %[qc3]            \n\t"
                "slt    $t2, $zero, %[qc4]                  \n\t"
                "addu   %[count1], %[count1], $t1           \n\t"
                "addu   %[count2], %[count2], $t2           \n\t"

                ".set pop                                   \n\t"

                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
                  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
                : [in_int]"r"(in_int)
                : "t0", "t1", "t2", "t3", "t4",
                  "memory"
            );

            curidx = 17 * qc1;
            curidx += qc2;
            curidx2 = 17 * qc3;
            curidx2 += qc4;

            v_codes = (p_codes[curidx] << count1) | sign1;
            v_bits  = p_bits[curidx] + count1;
            put_bits(pb, v_bits, v_codes);

            v_codes = (p_codes[curidx2] << count2) | sign2;
            v_bits  = p_bits[curidx2] + count2;
            put_bits(pb, v_bits, v_codes);
        }
    } else {
        /* escape codebook: additionally keep the unclamped magnitudes
         * c1..c4 so escape-coded values can be written */
        for (i = 0; i < size; i += 4) {
            int curidx, curidx2, sign1, count1, sign2, count2;
            /* NOTE(review): aliasing float as int to read sign bits */
            int *in_int = (int *)&in[i];
            uint8_t v_bits;
            unsigned int v_codes;
            int c1, c2, c3, c4;

            qc1 = scaled[i  ] * Q34 + 0.4054f;
            qc2 = scaled[i+1] * Q34 + 0.4054f;
            qc3 = scaled[i+2] * Q34 + 0.4054f;
            qc4 = scaled[i+3] * Q34 + 0.4054f;

            __asm__ volatile (
                ".set push                                  \n\t"
                ".set noreorder                             \n\t"

                "ori    $t4, $zero, 16                      \n\t"
                "ori    %[sign1], $zero, 0                  \n\t"
                "ori    %[sign2], $zero, 0                  \n\t"
                /* cN = qcN saturated to 14 bits (saturating left shift
                 * by 18, then logical right shift by 18) */
                "shll_s.w %[c1], %[qc1], 18                 \n\t"
                "shll_s.w %[c2], %[qc2], 18                 \n\t"
                "shll_s.w %[c3], %[qc3], 18                 \n\t"
                "shll_s.w %[c4], %[qc4], 18                 \n\t"
                "srl    %[c1], %[c1], 18                    \n\t"
                "srl    %[c2], %[c2], 18                    \n\t"
                "srl    %[c3], %[c3], 18                    \n\t"
                "srl    %[c4], %[c4], 18                    \n\t"
                /* clamp the codebook digits to 16 (= ESC) */
                "slt    $t0, $t4, %[qc1]                    \n\t"
                "slt    $t1, $t4, %[qc2]                    \n\t"
                "slt    $t2, $t4, %[qc3]                    \n\t"
                "slt    $t3, $t4, %[qc4]                    \n\t"
                "movn   %[qc1], $t4, $t0                    \n\t"
                "movn   %[qc2], $t4, $t1                    \n\t"
                "movn   %[qc3], $t4, $t2                    \n\t"
                "movn   %[qc4], $t4, $t3                    \n\t"
                /* per-pair sign fields and non-zero counts */
                "lw     $t0, 0(%[in_int])                   \n\t"
                "lw     $t1, 4(%[in_int])                   \n\t"
                "lw     $t2, 8(%[in_int])                   \n\t"
                "lw     $t3, 12(%[in_int])                  \n\t"
                "slt    $t0, $t0, $zero                     \n\t"
                "movn   %[sign1], $t0, %[qc1]               \n\t"
                "slt    $t2, $t2, $zero                     \n\t"
                "movn   %[sign2], $t2, %[qc3]               \n\t"
                "slt    $t1, $t1, $zero                     \n\t"
                "sll    $t0, %[sign1], 1                    \n\t"
                "or     $t0, $t0, $t1                       \n\t"
                "movn   %[sign1], $t0, %[qc2]               \n\t"
                "slt    $t3, $t3, $zero                     \n\t"
                "sll    $t0, %[sign2], 1                    \n\t"
                "or     $t0, $t0, $t3                       \n\t"
                "movn   %[sign2], $t0, %[qc4]               \n\t"
                "slt    %[count1], $zero, %[qc1]            \n\t"
                "slt    $t1, $zero, %[qc2]                  \n\t"
                "slt    %[count2], $zero, %[qc3]            \n\t"
                "slt    $t2, $zero, %[qc4]                  \n\t"
                "addu   %[count1], %[count1], $t1           \n\t"
                "addu   %[count2], %[count2], $t2           \n\t"

                ".set pop                                   \n\t"

                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
                  [c1]"=&r"(c1), [c2]"=&r"(c2),
                  [c3]"=&r"(c3), [c4]"=&r"(c4)
                : [in_int]"r"(in_int)
                : "t0", "t1", "t2", "t3", "t4",
                  "memory"
            );

            curidx = 17 * qc1;
            curidx += qc2;

            curidx2 = 17 * qc3;
            curidx2 += qc4;

            v_codes = (p_codes[curidx] << count1) | sign1;
            v_bits  = p_bits[curidx] + count1;
            put_bits(pb, v_bits, v_codes);

            /* a 64.0f entry in the codebook vector marks an ESC value:
             * append the escape-coded magnitude (length prefix of
             * len-3 ones and a zero, then the low 'len' bits of cN) */
            if (p_vectors[curidx*2 ] == 64.0f) {
                int len = av_log2(c1);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
            if (p_vectors[curidx*2+1] == 64.0f) {
                int len = av_log2(c2);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }

            v_codes = (p_codes[curidx2] << count2) | sign2;
            v_bits  = p_bits[curidx2] + count2;
            put_bits(pb, v_bits, v_codes);

            if (p_vectors[curidx2*2 ] == 64.0f) {
                int len = av_log2(c3);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
            if (p_vectors[curidx2*2+1] == 64.0f) {
                int len = av_log2(c4);
                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
                put_bits(pb, len * 2 - 3, v_codes);
            }
        }
    }
}
|
828 |
+ |
|
829 |
/**
 * Dispatch table for the codebook-specific quantize+encode implementations,
 * indexed by codebook number (cb). Each codebook class (signed quad, unsigned
 * quad, signed pair, unsigned pair with max 7 / max 12, escape) appears twice
 * because adjacent codebook numbers share the same class. Entry 0 (the zero
 * codebook) is NULL: nothing is quantized or written for all-zero bands, so
 * it must never be called through this table.
 */
static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
                                                         PutBitContext *pb, const float *in,
                                                         const float *scaled, int size, int scale_idx,
                                                         int cb, const float lambda, const float uplim,
                                                         int *bits) = {
    NULL,
    quantize_and_encode_band_cost_SQUAD_mips,
    quantize_and_encode_band_cost_SQUAD_mips,
    quantize_and_encode_band_cost_UQUAD_mips,
    quantize_and_encode_band_cost_UQUAD_mips,
    quantize_and_encode_band_cost_SPAIR_mips,
    quantize_and_encode_band_cost_SPAIR_mips,
    quantize_and_encode_band_cost_UPAIR7_mips,
    quantize_and_encode_band_cost_UPAIR7_mips,
    quantize_and_encode_band_cost_UPAIR12_mips,
    quantize_and_encode_band_cost_UPAIR12_mips,
    quantize_and_encode_band_cost_ESC_mips,
};
|
847 |
+ |
|
848 |
/* Select the implementation matching codebook cb and forward all arguments.
 * cb must be a valid non-zero codebook index (entry 0 of the table is NULL). */
#define quantize_and_encode_band_cost(                  \
                            s, pb, in, scaled, size, scale_idx, cb, \
                            lambda, uplim, bits)        \
    quantize_and_encode_band_cost_arr[cb](              \
                            s, pb, in, scaled, size, scale_idx, cb, \
                            lambda, uplim, bits)
|
854 |
+ |
|
855 |
/**
 * Quantize and encode one band, writing the codes into pb.
 * Convenience wrapper around quantize_and_encode_band_cost() for callers
 * that only want the encoded output: no scaled spectrum (NULL), no upper
 * limit (INFINITY), and no bit-count report (NULL).
 */
static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
                                          const float *in, int size, int scale_idx,
                                          int cb, const float lambda)
{
    quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
                                  INFINITY, NULL);
}
|
862 |
+ |
|
863 |
+/** |
|
864 |
+ * Functions developed from template function and optimized for getting the number of bits |
|
865 |
+ */ |
|
866 |
/**
 * Bit count for the zero codebook: nothing is coded, so the cost is 0 bits.
 * The signature matches the other get_band_numbits_* functions so it can sit
 * in the shared dispatch table; all parameters are ignored.
 */
static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
                                        int *bits)
{
    return 0;
}
|
874 |
+ |
|
875 |
/**
 * Count the bits needed to code one band with a signed-quad codebook
 * (4 coefficients per codeword, each quantized value in [-1, 1]).
 * Only the Huffman code lengths are summed; nothing is written to pb.
 *
 * NOTE(review): reading the float spectrum through an int* (in_int) to grab
 * the IEEE-754 sign bit relies on FFmpeg's usual type-punning conventions —
 * confirm against the rest of the file's build flags.
 */
static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
                                         int *bits)
{
    /* quantization gain from the precomputed pow34 scale-factor table */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx;
        int *in_int = (int *)&in[i];  /* raw bits of in[i..i+3], for the sign */

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* Clamp each magnitude to at most 1 (slt against zero gives 0/1),
         * then restore the sign: srl 31 extracts the float sign bit and
         * movn conditionally replaces qc with its negation. */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "slt        %[qc1], $zero,  %[qc1]      \n\t"
            "slt        %[qc2], $zero,  %[qc2]      \n\t"
            "slt        %[qc3], $zero,  %[qc3]      \n\t"
            "slt        %[qc4], $zero,  %[qc4]      \n\t"
            "lw         $t0,    0(%[in_int])        \n\t"
            "lw         $t1,    4(%[in_int])        \n\t"
            "lw         $t2,    8(%[in_int])        \n\t"
            "lw         $t3,    12(%[in_int])       \n\t"
            "srl        $t0,    $t0,    31          \n\t"
            "srl        $t1,    $t1,    31          \n\t"
            "srl        $t2,    $t2,    31          \n\t"
            "srl        $t3,    $t3,    31          \n\t"
            "subu       $t4,    $zero,  %[qc1]      \n\t"
            "subu       $t5,    $zero,  %[qc2]      \n\t"
            "subu       $t6,    $zero,  %[qc3]      \n\t"
            "subu       $t7,    $zero,  %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t5,    $t1         \n\t"
            "movn       %[qc3], $t6,    $t2         \n\t"
            "movn       %[qc4], $t7,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* base-3 index of the (qc1..qc4) quadruple; +40 recenters the
         * signed values into the codebook table */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;

        curbits += p_bits[curidx];
    }
    return curbits;
}
|
945 |
+ |
|
946 |
/**
 * Count the bits needed to code one band with an unsigned-quad codebook
 * (4 coefficients per codeword, magnitudes clamped to [0, 2], signs coded
 * separately). Only bit counts are accumulated; nothing is written to pb.
 */
static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
                                         int *bits)
{
    /* quantization gain from the precomputed pow34 scale-factor table */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int curbits = 0;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx;

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* clamp each quantized value to the codebook maximum of 2
         * (slt sets a flag when qc > 2, movn then substitutes 2) */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori        $t4,    $zero,  2           \n\t"
            "slt        $t0,    $t4,    %[qc1]      \n\t"
            "slt        $t1,    $t4,    %[qc2]      \n\t"
            "slt        $t2,    $t4,    %[qc3]      \n\t"
            "slt        $t3,    $t4,    %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t4,    $t1         \n\t"
            "movn       %[qc3], $t4,    $t2         \n\t"
            "movn       %[qc4], $t4,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            :
            : "t0", "t1", "t2", "t3", "t4"
        );

        /* base-3 index of the unsigned quadruple (no recentering offset) */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;

        curbits += p_bits[curidx];
        /* one extra sign bit per nonzero coefficient, precomputed per index */
        curbits += uquad_sign_bits[curidx];
    }
    return curbits;
}
|
1002 |
+ |
|
1003 |
/**
 * Count the bits needed to code one band with a signed-pair codebook
 * (2 coefficients per codeword, each quantized value in [-4, 4]); four
 * coefficients are handled per loop iteration as two pairs.
 * Only bit counts are accumulated; nothing is written to pb.
 */
static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
                                         int *bits)
{
    /* quantization gain from the precomputed pow34 scale-factor table */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        int *in_int = (int *)&in[i];  /* raw bits of in[i..i+3], for the sign */

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* Clamp each magnitude to the codebook maximum of 4, then restore
         * the sign of the corresponding input sample (sign bit via srl 31,
         * conditional negation via movn). */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori        $t4,    $zero,  4           \n\t"
            "slt        $t0,    $t4,    %[qc1]      \n\t"
            "slt        $t1,    $t4,    %[qc2]      \n\t"
            "slt        $t2,    $t4,    %[qc3]      \n\t"
            "slt        $t3,    $t4,    %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t4,    $t1         \n\t"
            "movn       %[qc3], $t4,    $t2         \n\t"
            "movn       %[qc4], $t4,    $t3         \n\t"
            "lw         $t0,    0(%[in_int])        \n\t"
            "lw         $t1,    4(%[in_int])        \n\t"
            "lw         $t2,    8(%[in_int])        \n\t"
            "lw         $t3,    12(%[in_int])       \n\t"
            "srl        $t0,    $t0,    31          \n\t"
            "srl        $t1,    $t1,    31          \n\t"
            "srl        $t2,    $t2,    31          \n\t"
            "srl        $t3,    $t3,    31          \n\t"
            "subu       $t4,    $zero,  %[qc1]      \n\t"
            "subu       $t5,    $zero,  %[qc2]      \n\t"
            "subu       $t6,    $zero,  %[qc3]      \n\t"
            "subu       $t7,    $zero,  %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t5,    $t1         \n\t"
            "movn       %[qc3], $t6,    $t2         \n\t"
            "movn       %[qc4], $t7,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* 9x9 pair index; +40 recenters the signed values into the table */
        curidx = 9 * qc1;
        curidx += qc2 + 40;

        curidx2 = 9 * qc3;
        curidx2 += qc4 + 40;

        curbits += p_bits[curidx] + p_bits[curidx2];
    }
    return curbits;
}
|
1075 |
+ |
|
1076 |
/**
 * Count the bits needed to code one band with the unsigned-pair codebook
 * whose magnitudes are clamped to [0, 7]; signs are coded separately via a
 * precomputed sign-bit table. Nothing is written to pb.
 */
static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
                                          PutBitContext *pb, const float *in,
                                          const float *scaled, int size, int scale_idx,
                                          int cb, const float lambda, const float uplim,
                                          int *bits)
{
    /* quantization gain from the precomputed pow34 scale-factor table */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* clamp each quantized magnitude to the codebook maximum of 7 */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori        $t4,    $zero,  7           \n\t"
            "slt        $t0,    $t4,    %[qc1]      \n\t"
            "slt        $t1,    $t4,    %[qc2]      \n\t"
            "slt        $t2,    $t4,    %[qc3]      \n\t"
            "slt        $t3,    $t4,    %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t4,    $t1         \n\t"
            "movn       %[qc3], $t4,    $t2         \n\t"
            "movn       %[qc4], $t4,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            :
            : "t0", "t1", "t2", "t3", "t4"
        );

        /* 8x8 pair index (values 0..7 per coefficient) */
        curidx = 8 * qc1;
        curidx += qc2;

        curidx2 = 8 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx] +
                   upair7_sign_bits[curidx] +
                   p_bits[curidx2] +
                   upair7_sign_bits[curidx2];
    }
    return curbits;
}
|
1132 |
+ |
|
1133 |
/**
 * Count the bits needed to code one band with the unsigned-pair codebook
 * whose magnitudes are clamped to [0, 12]; signs are coded separately via a
 * precomputed sign-bit table. Nothing is written to pb.
 */
static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
                                           PutBitContext *pb, const float *in,
                                           const float *scaled, int size, int scale_idx,
                                           int cb, const float lambda, const float uplim,
                                           int *bits)
{
    /* quantization gain from the precomputed pow34 scale-factor table */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* clamp each quantized magnitude to the codebook maximum of 12 */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori        $t4,    $zero,  12          \n\t"
            "slt        $t0,    $t4,    %[qc1]      \n\t"
            "slt        $t1,    $t4,    %[qc2]      \n\t"
            "slt        $t2,    $t4,    %[qc3]      \n\t"
            "slt        $t3,    $t4,    %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t4,    $t1         \n\t"
            "movn       %[qc3], $t4,    $t2         \n\t"
            "movn       %[qc4], $t4,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            :
            : "t0", "t1", "t2", "t3", "t4"
        );

        /* 13x13 pair index (values 0..12 per coefficient) */
        curidx = 13 * qc1;
        curidx += qc2;

        curidx2 = 13 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx] +
                   p_bits[curidx2] +
                   upair12_sign_bits[curidx] +
                   upair12_sign_bits[curidx2];
    }
    return curbits;
}
|
1189 |
+ |
|
1190 |
/**
 * Count the bits needed to code one band with the escape codebook:
 * magnitudes above 15 are coded with the escape codeword (index 16) plus an
 * explicit escape sequence whose length is computed here. Uses MIPS DSP ASE
 * (shll_s.w) in addition to the base ISA. Nothing is written to pb.
 */
static float get_band_numbits_ESC_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
                                       int *bits)
{
    /* quantization gain from the precomputed pow34 scale-factor table */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];

    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        int cond0, cond1, cond2, cond3; /* nonzero iff qcN needs an escape */
        int c1, c2, c3, c4;             /* escape-sequence length per coeff */

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* For each coefficient:
         *   - shll_s.w/srl saturate the magnitude into a 14-bit range
         *     before computing its bit length;
         *   - condN = (qcN > 15): such values are replaced by the escape
         *     codeword index 16 (movn);
         *   - cN = 2 * av_log2(magnitude) - 3 (via clz), i.e. the escape
         *     prefix+suffix length, masked to 0 when no escape is needed. */
        __asm__ volatile (
            ".set push                                      \n\t"
            ".set noreorder                                 \n\t"

            "ori        $t4,      $zero,    15              \n\t"
            "ori        $t5,      $zero,    16              \n\t"
            "shll_s.w   %[c1],    %[qc1],   18              \n\t"
            "shll_s.w   %[c2],    %[qc2],   18              \n\t"
            "shll_s.w   %[c3],    %[qc3],   18              \n\t"
            "shll_s.w   %[c4],    %[qc4],   18              \n\t"
            "srl        %[c1],    %[c1],    18              \n\t"
            "srl        %[c2],    %[c2],    18              \n\t"
            "srl        %[c3],    %[c3],    18              \n\t"
            "srl        %[c4],    %[c4],    18              \n\t"
            "slt        %[cond0], $t4,      %[qc1]          \n\t"
            "slt        %[cond1], $t4,      %[qc2]          \n\t"
            "slt        %[cond2], $t4,      %[qc3]          \n\t"
            "slt        %[cond3], $t4,      %[qc4]          \n\t"
            "movn       %[qc1],   $t5,      %[cond0]        \n\t"
            "movn       %[qc2],   $t5,      %[cond1]        \n\t"
            "movn       %[qc3],   $t5,      %[cond2]        \n\t"
            "movn       %[qc4],   $t5,      %[cond3]        \n\t"
            "ori        $t5,      $zero,    31              \n\t"
            "clz        %[c1],    %[c1]                     \n\t"
            "clz        %[c2],    %[c2]                     \n\t"
            "clz        %[c3],    %[c3]                     \n\t"
            "clz        %[c4],    %[c4]                     \n\t"
            "subu       %[c1],    $t5,      %[c1]           \n\t"
            "subu       %[c2],    $t5,      %[c2]           \n\t"
            "subu       %[c3],    $t5,      %[c3]           \n\t"
            "subu       %[c4],    $t5,      %[c4]           \n\t"
            "sll        %[c1],    %[c1],    1               \n\t"
            "sll        %[c2],    %[c2],    1               \n\t"
            "sll        %[c3],    %[c3],    1               \n\t"
            "sll        %[c4],    %[c4],    1               \n\t"
            "addiu      %[c1],    %[c1],    -3              \n\t"
            "addiu      %[c2],    %[c2],    -3              \n\t"
            "addiu      %[c3],    %[c3],    -3              \n\t"
            "addiu      %[c4],    %[c4],    -3              \n\t"
            "subu       %[cond0], $zero,    %[cond0]        \n\t"
            "subu       %[cond1], $zero,    %[cond1]        \n\t"
            "subu       %[cond2], $zero,    %[cond2]        \n\t"
            "subu       %[cond3], $zero,    %[cond3]        \n\t"
            "and        %[c1],    %[c1],    %[cond0]        \n\t"
            "and        %[c2],    %[c2],    %[cond1]        \n\t"
            "and        %[c3],    %[c3],    %[cond2]        \n\t"
            "and        %[c4],    %[c4],    %[cond3]        \n\t"

            ".set pop                                       \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
              [c1]"=&r"(c1), [c2]"=&r"(c2),
              [c3]"=&r"(c3), [c4]"=&r"(c4)
            :
            : "t4", "t5"
        );

        /* 17x17 pair index (values 0..16 per coefficient) */
        curidx = 17 * qc1;
        curidx += qc2;

        curidx2 = 17 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += esc_sign_bits[curidx];
        curbits += p_bits[curidx2];
        curbits += esc_sign_bits[curidx2];

        /* add the escape-sequence lengths (zero when no escape) */
        curbits += c1;
        curbits += c2;
        curbits += c3;
        curbits += c4;
    }
    return curbits;
}
|
1291 |
+ |
|
1292 |
/**
 * Dispatch table for the codebook-specific bit-count functions, indexed by
 * codebook number (cb). Codebook 0 maps to the ZERO implementation (0 bits);
 * each other codebook class appears twice because adjacent codebook numbers
 * share the same class.
 */
static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
                                             PutBitContext *pb, const float *in,
                                             const float *scaled, int size, int scale_idx,
                                             int cb, const float lambda, const float uplim,
                                             int *bits) = {
    get_band_numbits_ZERO_mips,
    get_band_numbits_SQUAD_mips,
    get_band_numbits_SQUAD_mips,
    get_band_numbits_UQUAD_mips,
    get_band_numbits_UQUAD_mips,
    get_band_numbits_SPAIR_mips,
    get_band_numbits_SPAIR_mips,
    get_band_numbits_UPAIR7_mips,
    get_band_numbits_UPAIR7_mips,
    get_band_numbits_UPAIR12_mips,
    get_band_numbits_UPAIR12_mips,
    get_band_numbits_ESC_mips,
};
|
1310 |
+ |
|
1311 |
/* Select the bit-count implementation matching codebook cb and forward all
 * arguments. Valid for every codebook index including 0. */
#define get_band_numbits(                               \
                            s, pb, in, scaled, size, scale_idx, cb, \
                            lambda, uplim, bits)        \
    get_band_numbits_arr[cb](                           \
                            s, pb, in, scaled, size, scale_idx, cb, \
                            lambda, uplim, bits)
|
1317 |
+ |
|
1318 |
/**
 * Return the number of bits needed to code a band (as counted by the
 * codebook-specific get_band_numbits_* function). Thin wrapper used where a
 * quantize_band_cost-style signature (no PutBitContext) is expected.
 */
static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits)
{
    return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
}
|
1325 |
+ |
|
1326 |
+/** |
|
1327 |
+ * Functions developed from template function and optimized for getting the band cost |
|
1328 |
+ */ |
|
1329 |
+#if HAVE_MIPSFPU |
|
1330 |
+static float get_band_cost_ZERO_mips(struct AACEncContext *s, |
|
1331 |
+ PutBitContext *pb, const float *in, |
|
1332 |
+ const float *scaled, int size, int scale_idx, |
|
1333 |
+ int cb, const float lambda, const float uplim, |
|
1334 |
+ int *bits) |
|
1335 |
+{ |
|
1336 |
+ int i; |
|
1337 |
+ float cost = 0; |
|
1338 |
+ |
|
1339 |
+ for (i = 0; i < size; i += 4) { |
|
1340 |
+ cost += in[i ] * in[i ]; |
|
1341 |
+ cost += in[i+1] * in[i+1]; |
|
1342 |
+ cost += in[i+2] * in[i+2]; |
|
1343 |
+ cost += in[i+3] * in[i+3]; |
|
1344 |
+ } |
|
1345 |
+ if (bits) |
|
1346 |
+ *bits = 0; |
|
1347 |
+ return cost * lambda; |
|
1348 |
+} |
|
1349 |
+ |
|
1350 |
/**
 * Rate-distortion cost of coding a band with a signed-quad codebook:
 * quantizes 4 coefficients at a time, accumulates the squared quantization
 * error against the dequantized codebook vector, and adds the Huffman bit
 * count. Returns cost*lambda + bits; reports the bit count via *bits.
 * Requires MIPS FPU (nmsub.s) — compiled under HAVE_MIPSFPU.
 */
static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
                                      int *bits)
{
    /* forward (Q34) and inverse (IQ) quantization gains from the tables */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec;
        int curidx;
        int *in_int = (int *)&in[i];      /* raw bits, for the sign */
        float *in_pos = (float *)&in[i];  /* same samples, as floats */
        float di0, di1, di2, di3;

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* clamp magnitudes to [0,1] (slt) and restore the input sign
         * (srl 31 extracts the sign bit, movn conditionally negates) */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "slt        %[qc1], $zero,  %[qc1]      \n\t"
            "slt        %[qc2], $zero,  %[qc2]      \n\t"
            "slt        %[qc3], $zero,  %[qc3]      \n\t"
            "slt        %[qc4], $zero,  %[qc4]      \n\t"
            "lw         $t0,    0(%[in_int])        \n\t"
            "lw         $t1,    4(%[in_int])        \n\t"
            "lw         $t2,    8(%[in_int])        \n\t"
            "lw         $t3,    12(%[in_int])       \n\t"
            "srl        $t0,    $t0,    31          \n\t"
            "srl        $t1,    $t1,    31          \n\t"
            "srl        $t2,    $t2,    31          \n\t"
            "srl        $t3,    $t3,    31          \n\t"
            "subu       $t4,    $zero,  %[qc1]      \n\t"
            "subu       $t5,    $zero,  %[qc2]      \n\t"
            "subu       $t6,    $zero,  %[qc3]      \n\t"
            "subu       $t7,    $zero,  %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t5,    $t1         \n\t"
            "movn       %[qc3], $t6,    $t2         \n\t"
            "movn       %[qc4], $t7,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* base-3 index of the quadruple; +40 recenters the signed values */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;

        curbits += p_bits[curidx];
        vec = &p_codes[curidx*4];  /* 4-element dequantization vector */

        /* diN = in[N] - vec[N]*IQ: nmsub.s fd,fr,fs,ft computes fr - fs*ft,
         * i.e. the error between the input and the dequantized value */
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            "lwc1       $f0,    0(%[in_pos])            \n\t"
            "lwc1       $f1,    0(%[vec])               \n\t"
            "lwc1       $f2,    4(%[in_pos])            \n\t"
            "lwc1       $f3,    4(%[vec])               \n\t"
            "lwc1       $f4,    8(%[in_pos])            \n\t"
            "lwc1       $f5,    8(%[vec])               \n\t"
            "lwc1       $f6,    12(%[in_pos])           \n\t"
            "lwc1       $f7,    12(%[vec])              \n\t"
            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"

            ".set pop                                   \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "$f4", "$f5", "$f6", "$f7",
              "memory"
        );

        cost += di0 * di0 + di1 * di1
                + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
|
1461 |
+ |
|
1462 |
/**
 * Rate-distortion cost of coding a band with an unsigned-quad codebook:
 * quantizes 4 coefficients at a time (magnitudes clamped to [0,2]),
 * accumulates the squared error of |in| against the dequantized codebook
 * vector, and adds Huffman plus per-coefficient sign bits.
 * Returns cost*lambda + bits; reports the bit count via *bits.
 * Requires MIPS FPU (abs.s / nmsub.s) — compiled under HAVE_MIPSFPU.
 */
static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
                                      int *bits)
{
    /* forward (Q34) and inverse (IQ) quantization gains from the tables */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int curbits = 0;
    int qc1, qc2, qc3, qc4;

    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec;
        int curidx;
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* clamp each quantized magnitude to the codebook maximum of 2 */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori        $t4,    $zero,  2           \n\t"
            "slt        $t0,    $t4,    %[qc1]      \n\t"
            "slt        $t1,    $t4,    %[qc2]      \n\t"
            "slt        $t2,    $t4,    %[qc3]      \n\t"
            "slt        $t3,    $t4,    %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t4,    $t1         \n\t"
            "movn       %[qc3], $t4,    $t2         \n\t"
            "movn       %[qc4], $t4,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            :
            : "t0", "t1", "t2", "t3", "t4"
        );

        /* base-3 index of the unsigned quadruple */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;

        curbits += p_bits[curidx];
        curbits += uquad_sign_bits[curidx];  /* one sign bit per nonzero coeff */
        vec = &p_codes[curidx*4];            /* 4-element dequantization vector */

        /* diN = |in[N]| - vec[N]*IQ: abs.s takes the magnitude, then
         * nmsub.s fd,fr,fs,ft computes fr - fs*ft (error vs dequantized) */
        __asm__ volatile (
            ".set push                                          \n\t"
            ".set noreorder                                     \n\t"

            "lwc1       %[di0], 0(%[in_pos])                    \n\t"
            "lwc1       %[di1], 4(%[in_pos])                    \n\t"
            "lwc1       %[di2], 8(%[in_pos])                    \n\t"
            "lwc1       %[di3], 12(%[in_pos])                   \n\t"
            "abs.s      %[di0], %[di0]                          \n\t"
            "abs.s      %[di1], %[di1]                          \n\t"
            "abs.s      %[di2], %[di2]                          \n\t"
            "abs.s      %[di3], %[di3]                          \n\t"
            "lwc1       $f0,    0(%[vec])                       \n\t"
            "lwc1       $f1,    4(%[vec])                       \n\t"
            "lwc1       $f2,    8(%[vec])                       \n\t"
            "lwc1       $f3,    12(%[vec])                      \n\t"
            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]           \n\t"
            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]           \n\t"
            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]           \n\t"
            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]           \n\t"

            ".set pop                                           \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "memory"
        );

        cost += di0 * di0 + di1 * di1
                + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
|
1562 |
+ |
|
1563 |
/**
 * Rate-distortion cost of coding a band with a signed-pair codebook:
 * quantizes 4 coefficients per iteration as two signed pairs (values in
 * [-4, 4]), accumulates the squared error against the two dequantized
 * codebook pairs, and adds the Huffman bit counts.
 * Returns cost*lambda + bits; reports the bit count via *bits.
 * Requires MIPS FPU (nmsub.s) — compiled under HAVE_MIPSFPU.
 */
static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
                                      int *bits)
{
    /* forward (Q34) and inverse (IQ) quantization gains from the tables */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        int *in_int = (int *)&in[i];      /* raw bits, for the sign */
        float *in_pos = (float *)&in[i];  /* same samples, as floats */
        float di0, di1, di2, di3;

        /* quantize magnitudes; 0.4054f is the AAC quantizer rounding offset */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* clamp magnitudes to [0,4] and restore the input sign
         * (srl 31 extracts the sign bit, movn conditionally negates) */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori        $t4,    $zero,  4           \n\t"
            "slt        $t0,    $t4,    %[qc1]      \n\t"
            "slt        $t1,    $t4,    %[qc2]      \n\t"
            "slt        $t2,    $t4,    %[qc3]      \n\t"
            "slt        $t3,    $t4,    %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t4,    $t1         \n\t"
            "movn       %[qc3], $t4,    $t2         \n\t"
            "movn       %[qc4], $t4,    $t3         \n\t"
            "lw         $t0,    0(%[in_int])        \n\t"
            "lw         $t1,    4(%[in_int])        \n\t"
            "lw         $t2,    8(%[in_int])        \n\t"
            "lw         $t3,    12(%[in_int])       \n\t"
            "srl        $t0,    $t0,    31          \n\t"
            "srl        $t1,    $t1,    31          \n\t"
            "srl        $t2,    $t2,    31          \n\t"
            "srl        $t3,    $t3,    31          \n\t"
            "subu       $t4,    $zero,  %[qc1]      \n\t"
            "subu       $t5,    $zero,  %[qc2]      \n\t"
            "subu       $t6,    $zero,  %[qc3]      \n\t"
            "subu       $t7,    $zero,  %[qc4]      \n\t"
            "movn       %[qc1], $t4,    $t0         \n\t"
            "movn       %[qc2], $t5,    $t1         \n\t"
            "movn       %[qc3], $t6,    $t2         \n\t"
            "movn       %[qc4], $t7,    $t3         \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* 9x9 pair indices; +40 recenters the signed values */
        curidx = 9 * qc1;
        curidx += qc2 + 40;

        curidx2 = 9 * qc3;
        curidx2 += qc4 + 40;

        curbits += p_bits[curidx];
        curbits += p_bits[curidx2];

        vec  = &p_codes[curidx*2];   /* 2-element dequantization vectors */
        vec2 = &p_codes[curidx2*2];

        /* diN = in[N] - vec[N]*IQ via nmsub.s (fr - fs*ft): pair 1 uses
         * vec, pair 2 uses vec2 */
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"

            "lwc1       $f0,    0(%[in_pos])            \n\t"
            "lwc1       $f1,    0(%[vec])               \n\t"
            "lwc1       $f2,    4(%[in_pos])            \n\t"
            "lwc1       $f3,    4(%[vec])               \n\t"
            "lwc1       $f4,    8(%[in_pos])            \n\t"
            "lwc1       $f5,    0(%[vec2])              \n\t"
            "lwc1       $f6,    12(%[in_pos])           \n\t"
            "lwc1       $f7,    4(%[vec2])              \n\t"
            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"

            ".set pop                                   \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [vec2]"r"(vec2), [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "$f4", "$f5", "$f6", "$f7",
              "memory"
        );

        cost += di0 * di0 + di1 * di1
                + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
|
1679 |
+ |
|
1680 |
/**
 * Rate-distortion cost of one scalefactor band coded with an unsigned-pair
 * codebook of maximum magnitude 7 (sign bits sent separately).
 *
 * Coefficients are processed four at a time.  The first asm block clamps the
 * quantized magnitudes to 7 with slt/movn conditional moves and also packs
 * per-pair sign words and nonzero counts (sign1/count1, sign2/count2); those
 * sign/count results are not consumed in this cost-only path — sign bits are
 * charged from the precomputed upair7_sign_bits[] table instead (presumably
 * kept in step with the encoding variant of this routine).
 *
 * s, pb and uplim are unused; the signature matches the function-pointer
 * type required by get_band_cost_arr[].
 *
 * @param bits if non-NULL, receives the total number of bits needed
 * @return distortion weighted by lambda plus the rate: cost * lambda + curbits
 */
static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
                                       int *bits)
{
    /* Q34: scale applied to the precomputed |coef|^(3/4) values;
     * IQ: inverse quantizer step used to reconstruct values. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    /* Huffman bit lengths and dequantized codeword values for codebook cb. */
    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2, sign1, count1, sign2, count2;
        /* Alias the input floats as ints so the asm can inspect their IEEE
         * sign bits with integer loads/compares. */
        int *in_int = (int *)&in[i];
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;

        /* Quantize with AAC's 0.4054 rounding bias before truncation. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* Clamp qc1..qc4 to 7 (branch-free via slt + movn), then derive the
         * packed sign words and nonzero counts for each pair. */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    $t4,      $zero,  7             \n\t"
            "ori    %[sign1], $zero,  0             \n\t"
            "ori    %[sign2], $zero,  0             \n\t"
            "slt    $t0,      $t4,    %[qc1]        \n\t"
            "slt    $t1,      $t4,    %[qc2]        \n\t"
            "slt    $t2,      $t4,    %[qc3]        \n\t"
            "slt    $t3,      $t4,    %[qc4]        \n\t"
            "movn   %[qc1],   $t4,    $t0           \n\t"
            "movn   %[qc2],   $t4,    $t1           \n\t"
            "movn   %[qc3],   $t4,    $t2           \n\t"
            "movn   %[qc4],   $t4,    $t3           \n\t"
            "lw     $t0,      0(%[in_int])          \n\t"
            "lw     $t1,      4(%[in_int])          \n\t"
            "lw     $t2,      8(%[in_int])          \n\t"
            "lw     $t3,      12(%[in_int])         \n\t"
            "slt    $t0,      $t0,    $zero         \n\t"
            "movn   %[sign1], $t0,    %[qc1]        \n\t"
            "slt    $t2,      $t2,    $zero         \n\t"
            "movn   %[sign2], $t2,    %[qc3]        \n\t"
            "slt    $t1,      $t1,    $zero         \n\t"
            "sll    $t0,      %[sign1], 1           \n\t"
            "or     $t0,      $t0,    $t1           \n\t"
            "movn   %[sign1], $t0,    %[qc2]        \n\t"
            "slt    $t3,      $t3,    $zero         \n\t"
            "sll    $t0,      %[sign2], 1           \n\t"
            "or     $t0,      $t0,    $t3           \n\t"
            "movn   %[sign2], $t0,    %[qc4]        \n\t"
            "slt    %[count1], $zero, %[qc1]        \n\t"
            "slt    $t1,      $zero,  %[qc2]        \n\t"
            "slt    %[count2], $zero, %[qc3]        \n\t"
            "slt    $t2,      $zero,  %[qc4]        \n\t"
            "addu   %[count1], %[count1], $t1       \n\t"
            "addu   %[count2], %[count2], $t2       \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* Codebook index for each pair: 8 * first + second (8 magnitudes). */
        curidx = 8 * qc1;
        curidx += qc2;

        curidx2 = 8 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += upair7_sign_bits[curidx];
        vec = &p_codes[curidx*2];

        curbits += p_bits[curidx2];
        curbits += upair7_sign_bits[curidx2];
        vec2 = &p_codes[curidx2*2];

        /* Per-coefficient quantization error: di = |in| - codeword * IQ,
         * computed with nmsub.s (fused fr - fs*ft). */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "lwc1   %[di0], 0(%[in_pos])            \n\t"
            "lwc1   %[di1], 4(%[in_pos])            \n\t"
            "lwc1   %[di2], 8(%[in_pos])            \n\t"
            "lwc1   %[di3], 12(%[in_pos])           \n\t"
            "abs.s  %[di0], %[di0]                  \n\t"
            "abs.s  %[di1], %[di1]                  \n\t"
            "abs.s  %[di2], %[di2]                  \n\t"
            "abs.s  %[di3], %[di3]                  \n\t"
            "lwc1   $f0,    0(%[vec])               \n\t"
            "lwc1   $f1,    4(%[vec])               \n\t"
            "lwc1   $f2,    0(%[vec2])              \n\t"
            "lwc1   $f3,    4(%[vec2])              \n\t"
            "nmsub.s %[di0], %[di0], $f0, %[IQ]     \n\t"
            "nmsub.s %[di1], %[di1], $f1, %[IQ]     \n\t"
            "nmsub.s %[di2], %[di2], $f2, %[IQ]     \n\t"
            "nmsub.s %[di3], %[di3], $f3, %[IQ]     \n\t"

            ".set pop                               \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [vec2]"r"(vec2), [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "memory"
        );

        /* Accumulate squared error for the distortion term. */
        cost += di0 * di0 + di1 * di1
                + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
|
1810 |
+ |
|
1811 |
/**
 * Rate-distortion cost of one scalefactor band coded with an unsigned-pair
 * codebook of maximum magnitude 12 (sign bits sent separately).
 *
 * Same structure as get_band_cost_UPAIR7_mips, but magnitudes are clamped to
 * 12 and pair indices use a stride of 13 (13 possible magnitudes per value).
 * The sign1/count1, sign2/count2 words produced by the first asm block are
 * not consumed in this cost-only path; sign bits come from the precomputed
 * upair12_sign_bits[] table.
 *
 * s, pb and uplim are unused; the signature matches the function-pointer
 * type required by get_band_cost_arr[].
 *
 * @param bits if non-NULL, receives the total number of bits needed
 * @return distortion weighted by lambda plus the rate: cost * lambda + curbits
 */
static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
                                        int *bits)
{
    /* Q34: scale for the |coef|^(3/4) values; IQ: inverse quantizer step. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float *)ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        int sign1, count1, sign2, count2;
        /* Alias the floats as ints so the asm can read their sign bits. */
        int *in_int = (int *)&in[i];
        float *in_pos = (float *)&in[i];
        float di0, di1, di2, di3;

        /* Quantize with AAC's 0.4054 rounding bias before truncation. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        /* Clamp qc1..qc4 to 12 (slt + movn), then pack per-pair sign words
         * and nonzero counts. */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    $t4,      $zero,  12            \n\t"
            "ori    %[sign1], $zero,  0             \n\t"
            "ori    %[sign2], $zero,  0             \n\t"
            "slt    $t0,      $t4,    %[qc1]        \n\t"
            "slt    $t1,      $t4,    %[qc2]        \n\t"
            "slt    $t2,      $t4,    %[qc3]        \n\t"
            "slt    $t3,      $t4,    %[qc4]        \n\t"
            "movn   %[qc1],   $t4,    $t0           \n\t"
            "movn   %[qc2],   $t4,    $t1           \n\t"
            "movn   %[qc3],   $t4,    $t2           \n\t"
            "movn   %[qc4],   $t4,    $t3           \n\t"
            "lw     $t0,      0(%[in_int])          \n\t"
            "lw     $t1,      4(%[in_int])          \n\t"
            "lw     $t2,      8(%[in_int])          \n\t"
            "lw     $t3,      12(%[in_int])         \n\t"
            "slt    $t0,      $t0,    $zero         \n\t"
            "movn   %[sign1], $t0,    %[qc1]        \n\t"
            "slt    $t2,      $t2,    $zero         \n\t"
            "movn   %[sign2], $t2,    %[qc3]        \n\t"
            "slt    $t1,      $t1,    $zero         \n\t"
            "sll    $t0,      %[sign1], 1           \n\t"
            "or     $t0,      $t0,    $t1           \n\t"
            "movn   %[sign1], $t0,    %[qc2]        \n\t"
            "slt    $t3,      $t3,    $zero         \n\t"
            "sll    $t0,      %[sign2], 1           \n\t"
            "or     $t0,      $t0,    $t3           \n\t"
            "movn   %[sign2], $t0,    %[qc4]        \n\t"
            "slt    %[count1], $zero, %[qc1]        \n\t"
            "slt    $t1,      $zero,  %[qc2]        \n\t"
            "slt    %[count2], $zero, %[qc3]        \n\t"
            "slt    $t2,      $zero,  %[qc4]        \n\t"
            "addu   %[count1], %[count1], $t1       \n\t"
            "addu   %[count2], %[count2], $t2       \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* Pair index: 13 * first + second (13 possible magnitudes 0..12). */
        curidx = 13 * qc1;
        curidx += qc2;

        curidx2 = 13 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += p_bits[curidx2];
        curbits += upair12_sign_bits[curidx];
        curbits += upair12_sign_bits[curidx2];
        vec = &p_codes[curidx*2];
        vec2 = &p_codes[curidx2*2];

        /* di = |in| - codeword * IQ per coefficient, via fused nmsub.s. */
        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "lwc1   %[di0], 0(%[in_pos])            \n\t"
            "lwc1   %[di1], 4(%[in_pos])            \n\t"
            "lwc1   %[di2], 8(%[in_pos])            \n\t"
            "lwc1   %[di3], 12(%[in_pos])           \n\t"
            "abs.s  %[di0], %[di0]                  \n\t"
            "abs.s  %[di1], %[di1]                  \n\t"
            "abs.s  %[di2], %[di2]                  \n\t"
            "abs.s  %[di3], %[di3]                  \n\t"
            "lwc1   $f0,    0(%[vec])               \n\t"
            "lwc1   $f1,    4(%[vec])               \n\t"
            "lwc1   $f2,    0(%[vec2])              \n\t"
            "lwc1   $f3,    4(%[vec2])              \n\t"
            "nmsub.s %[di0], %[di0], $f0, %[IQ]     \n\t"
            "nmsub.s %[di1], %[di1], $f1, %[IQ]     \n\t"
            "nmsub.s %[di2], %[di2], $f2, %[IQ]     \n\t"
            "nmsub.s %[di3], %[di3], $f3, %[IQ]     \n\t"

            ".set pop                               \n\t"

            : [di0]"=&f"(di0), [di1]"=&f"(di1),
              [di2]"=&f"(di2), [di3]"=&f"(di3)
            : [in_pos]"r"(in_pos), [vec]"r"(vec),
              [vec2]"r"(vec2), [IQ]"f"(IQ)
            : "$f0", "$f1", "$f2", "$f3",
              "memory"
        );

        /* Accumulate squared error for the distortion term. */
        cost += di0 * di0 + di1 * di1
                + di2 * di2 + di3 * di3;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
|
1941 |
+ |
|
1942 |
/**
 * Rate-distortion cost of one scalefactor band coded with the escape
 * codebook: magnitudes above 15 are coded as the escape value 16 plus an
 * explicitly transmitted magnitude.
 *
 * The asm block (needs the MIPS DSP ASE for shll_s.w) does two things per
 * group of four coefficients:
 *  - c1..c4: magnitudes saturated to 13 bits (shll_s.w by 18 saturates the
 *    32-bit value, srl by 18 brings it back, clamping to 8191) — used for
 *    the escape magnitude bits and reconstruction;
 *  - cond0..cond3: flags set when qc > 15, in which case qc is replaced by
 *    the escape symbol 16 (movn).
 *
 * s, pb and uplim are unused; the signature matches the function-pointer
 * type required by get_band_cost_arr[].
 *
 * @param bits if non-NULL, receives the total number of bits needed
 * @return distortion weighted by lambda plus the rate: cost * lambda + curbits
 */
static float get_band_cost_ESC_mips(struct AACEncContext *s,
                                    PutBitContext *pb, const float *in,
                                    const float *scaled, int size, int scale_idx,
                                    int cb, const float lambda, const float uplim,
                                    int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    /* Largest dequantized value an escape sequence can represent. */
    const float CLIPPED_ESCAPE = 165140.0f * IQ;
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
    float   *p_codes = (float* )ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        float t1, t2, t3, t4;
        float di1, di2, di3, di4;
        int cond0, cond1, cond2, cond3;
        int c1, c2, c3, c4;

        /* Quantize with AAC's 0.4054 rounding bias before truncation. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            "ori    $t4,     $zero,  15             \n\t"
            "ori    $t5,     $zero,  16             \n\t"
            /* Saturating shift up then logical shift down: c = min(qc, 8191). */
            "shll_s.w %[c1], %[qc1], 18             \n\t"
            "shll_s.w %[c2], %[qc2], 18             \n\t"
            "shll_s.w %[c3], %[qc3], 18             \n\t"
            "shll_s.w %[c4], %[qc4], 18             \n\t"
            "srl    %[c1],   %[c1],  18             \n\t"
            "srl    %[c2],   %[c2],  18             \n\t"
            "srl    %[c3],   %[c3],  18             \n\t"
            "srl    %[c4],   %[c4],  18             \n\t"
            /* cond = (qc > 15); escape values all map to codebook entry 16. */
            "slt    %[cond0], $t4,   %[qc1]         \n\t"
            "slt    %[cond1], $t4,   %[qc2]         \n\t"
            "slt    %[cond2], $t4,   %[qc3]         \n\t"
            "slt    %[cond3], $t4,   %[qc4]         \n\t"
            "movn   %[qc1],  $t5,    %[cond0]       \n\t"
            "movn   %[qc2],  $t5,    %[cond1]       \n\t"
            "movn   %[qc3],  $t5,    %[cond2]       \n\t"
            "movn   %[qc4],  $t5,    %[cond3]       \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
              [c1]"=&r"(c1), [c2]"=&r"(c2),
              [c3]"=&r"(c3), [c4]"=&r"(c4)
            :
            : "t4", "t5"
        );

        /* Pair index: 17 * first + second (magnitudes 0..16 incl. escape). */
        curidx = 17 * qc1;
        curidx += qc2;

        curidx2 = 17 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += esc_sign_bits[curidx];
        vec = &p_codes[curidx*2];

        curbits += p_bits[curidx2];
        curbits += esc_sign_bits[curidx2];
        vec2 = &p_codes[curidx2*2];

        /* Escape-sequence length (av_log2(c)*2 - 3 bits); -cond is an
         * all-ones mask when qc exceeded 15, so these bits are charged only
         * for escaped values. */
        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
        curbits += (av_log2(c4) * 2 - 3) & (-cond3);

        t1 = fabsf(in[i  ]);
        t2 = fabsf(in[i+1]);
        t3 = fabsf(in[i+2]);
        t4 = fabsf(in[i+3]);

        /* Distortion: escaped values dequantize as c^(4/3) * IQ
         * (c * cbrtf(c) * IQ), clipped at CLIPPED_ESCAPE; non-escaped
         * values dequantize from the codebook vector. */
        if (cond0) {
            if (t1 >= CLIPPED_ESCAPE) {
                di1 = t1 - CLIPPED_ESCAPE;
            } else {
                di1 = t1 - c1 * cbrtf(c1) * IQ;
            }
        } else
            di1 = t1 - vec[0] * IQ;

        if (cond1) {
            if (t2 >= CLIPPED_ESCAPE) {
                di2 = t2 - CLIPPED_ESCAPE;
            } else {
                di2 = t2 - c2 * cbrtf(c2) * IQ;
            }
        } else
            di2 = t2 - vec[1] * IQ;

        if (cond2) {
            if (t3 >= CLIPPED_ESCAPE) {
                di3 = t3 - CLIPPED_ESCAPE;
            } else {
                di3 = t3 - c3 * cbrtf(c3) * IQ;
            }
        } else
            di3 = t3 - vec2[0] * IQ;

        if (cond3) {
            if (t4 >= CLIPPED_ESCAPE) {
                di4 = t4 - CLIPPED_ESCAPE;
            } else {
                di4 = t4 - c4 * cbrtf(c4) * IQ;
            }
        } else
            di4 = t4 - vec2[1]*IQ;

        cost += di1 * di1 + di2 * di2
                + di3 * di3 + di4 * di4;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
|
2075 |
+ |
|
2076 |
/* Cost-function dispatch table, indexed by codebook number.  Codebooks that
 * share a layout share an implementation (e.g. 1/2 are both signed quads). */
static float (*const get_band_cost_arr[])(struct AACEncContext *s,
                                          PutBitContext *pb, const float *in,
                                          const float *scaled, int size, int scale_idx,
                                          int cb, const float lambda, const float uplim,
                                          int *bits) = {
    get_band_cost_ZERO_mips,    /* 0: zeroed band */
    get_band_cost_SQUAD_mips,   /* 1: signed quad */
    get_band_cost_SQUAD_mips,   /* 2: signed quad */
    get_band_cost_UQUAD_mips,   /* 3: unsigned quad */
    get_band_cost_UQUAD_mips,   /* 4: unsigned quad */
    get_band_cost_SPAIR_mips,   /* 5: signed pair */
    get_band_cost_SPAIR_mips,   /* 6: signed pair */
    get_band_cost_UPAIR7_mips,  /* 7: unsigned pair, max 7 */
    get_band_cost_UPAIR7_mips,  /* 8: unsigned pair, max 7 */
    get_band_cost_UPAIR12_mips, /* 9: unsigned pair, max 12 */
    get_band_cost_UPAIR12_mips, /* 10: unsigned pair, max 12 */
    get_band_cost_ESC_mips,     /* 11: escape codebook */
};
|
2094 |
+ |
|
2095 |
/* Dispatch to the codebook-specific band-cost function above; cb selects
 * the implementation, all arguments are forwarded unchanged. */
#define get_band_cost(                                  \
                s, pb, in, scaled, size, scale_idx, cb, \
                lambda, uplim, bits)                    \
    get_band_cost_arr[cb](                              \
                s, pb, in, scaled, size, scale_idx, cb, \
                lambda, uplim, bits)
|
2101 |
+ |
|
2102 |
/**
 * Convenience wrapper around get_band_cost() for cost-only evaluation:
 * no PutBitContext is supplied (NULL), so nothing is written to the
 * bitstream — only the rate-distortion cost is computed.
 */
static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
                                int *bits)
{
    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
}
|
2109 |
+ |
|
2110 |
/**
 * Two-loop scalefactor search: the inner loop adjusts all scalefactors by a
 * shrinking step (qstep) until the total bit count meets the per-channel bit
 * budget; the outer loop then lowers scalefactors on bands whose distortion
 * exceeds the psychoacoustic threshold and repeats (up to 10 iterations).
 *
 * Results are written into sce->sf_idx[], sce->zeroes[] and sce->band_type[].
 */
static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
                                               AACEncContext *s,
                                               SingleChannelElement *sce,
                                               const float lambda)
{
    int start = 0, i, w, w2, g;
    /* Target bits for this channel in this frame, hard-capped at 5800. */
    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
    float dists[128] = { 0 }, uplims[128];
    float maxvals[128];
    int fflag, minscaler;
    int its  = 0;
    int allz = 0;
    float minthr = INFINITY;

    destbits = FFMIN(destbits, 5800);
    /* Pass 1: per band, sum the psy thresholds across the window group,
     * zero out bands below threshold, and track the smallest threshold. */
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
        for (g = 0; g < sce->ics.num_swb; g++) {
            int nz = 0;
            float uplim = 0.0f;
            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                uplim += band->threshold;
                if (band->energy <= band->threshold || band->threshold == 0.0f) {
                    sce->zeroes[(w+w2)*16+g] = 1;
                    continue;
                }
                nz = 1;
            }
            uplims[w*16+g] = uplim *512;
            sce->zeroes[w*16+g] = !nz;
            if (nz)
                minthr = FFMIN(minthr, uplim);
            allz |= nz;
        }
    }
    /* Pass 2: initial scalefactor guesses from each band's threshold
     * relative to the global minimum (capped 59 steps above SCALE_ONE_POS). */
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
        for (g = 0; g < sce->ics.num_swb; g++) {
            if (sce->zeroes[w*16+g]) {
                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
                continue;
            }
            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
        }
    }

    if (!allz)
        return;
    abs_pow34_v(s->scoefs, sce->coeffs, 1024);

    /* Precompute per-band maxima of |coef|^(3/4) for codebook selection. */
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
        start = w*128;
        for (g = 0; g < sce->ics.num_swb; g++) {
            const float *scaled = s->scoefs + start;
            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
            start += sce->ics.swb_sizes[g];
        }
    }

    do {
        int tbits, qstep;
        minscaler = sce->sf_idx[0];
        /* First outer iteration searches coarsely (step 32), later ones
         * refine with step 1. */
        qstep = its ? 1 : 32;
        do {
            int prev = -1;
            tbits = 0;
            fflag = 0;

            if (qstep > 1) {
                /* Coarse pass: only the bit count matters, so use the
                 * cheaper bits-only cost function. */
                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
                    start = w*128;
                    for (g = 0; g < sce->ics.num_swb; g++) {
                        const float *coefs  = sce->coeffs + start;
                        const float *scaled = s->scoefs + start;
                        int bits = 0;
                        int cb;

                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
                            start += sce->ics.swb_sizes[g];
                            continue;
                        }
                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                            int b;
                            bits += quantize_band_cost_bits(s, coefs + w2*128,
                                                            scaled + w2*128,
                                                            sce->ics.swb_sizes[g],
                                                            sce->sf_idx[w*16+g],
                                                            cb,
                                                            1.0f,
                                                            INFINITY,
                                                            &b);
                        }
                        if (prev != -1) {
                            /* Add the differential scalefactor coding cost. */
                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
                        }
                        tbits += bits;
                        start += sce->ics.swb_sizes[g];
                        prev   = sce->sf_idx[w*16+g];
                    }
                }
            }
            else {
                /* Fine pass: also record per-band distortion for the outer
                 * loop's threshold check (dists[] excludes the rate part). */
                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
                    start = w*128;
                    for (g = 0; g < sce->ics.num_swb; g++) {
                        const float *coefs  = sce->coeffs + start;
                        const float *scaled = s->scoefs + start;
                        int bits = 0;
                        int cb;
                        float dist = 0.0f;

                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
                            start += sce->ics.swb_sizes[g];
                            continue;
                        }
                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                            int b;
                            dist += quantize_band_cost(s, coefs + w2*128,
                                                       scaled + w2*128,
                                                       sce->ics.swb_sizes[g],
                                                       sce->sf_idx[w*16+g],
                                                       cb,
                                                       1.0f,
                                                       INFINITY,
                                                       &b);
                            bits += b;
                        }
                        /* Cost returned is dist*lambda + bits with lambda=1;
                         * subtract bits to keep the pure distortion. */
                        dists[w*16+g] = dist - bits;
                        if (prev != -1) {
                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
                        }
                        tbits += bits;
                        start += sce->ics.swb_sizes[g];
                        prev   = sce->sf_idx[w*16+g];
                    }
                }
            }
            /* Binary-search style adjustment: raise all scalefactors when
             * over budget (fewer bits), lower them when under. */
            if (tbits > destbits) {
                for (i = 0; i < 128; i++)
                    if (sce->sf_idx[i] < 218 - qstep)
                        sce->sf_idx[i] += qstep;
            } else {
                for (i = 0; i < 128; i++)
                    if (sce->sf_idx[i] > 60 - qstep)
                        sce->sf_idx[i] -= qstep;
            }
            qstep >>= 1;
            /* Keep stepping by 1 while still noticeably over budget. */
            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
                qstep = 1;
        } while (qstep);

        fflag = 0;
        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
        /* Outer loop: lower scalefactors on bands whose distortion exceeds
         * the psy threshold (by 2 if that keeps a codebook available), then
         * clamp everything into the legal differential range. */
        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
            for (g = 0; g < sce->ics.num_swb; g++) {
                int prevsc = sce->sf_idx[w*16+g];
                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
                        sce->sf_idx[w*16+g]--;
                    else
                        sce->sf_idx[w*16+g]-=2;
                }
                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
                if (sce->sf_idx[w*16+g] != prevsc)
                    fflag = 1;
                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
            }
        }
        its++;
    } while (fflag && its < 10);
}
|
2285 |
+ |
|
2286 |
/**
 * Mid/side stereo decision: for every band active in both channels, compute
 * the mid (M = (L+R)/2) and side (S = M - R = (L-R)/2) signals and compare
 * the quantization cost of coding L/R against coding M/S.  The cheaper
 * option is recorded in cpe->ms_mask[].  Requires a common window between
 * the two channels.
 */
static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe,
                               const float lambda)
{
    int start = 0, i, w, w2, g;
    float M[128], S[128];
    /* Scratch |coef|^(3/4) buffers for left, right, mid and side. */
    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
    SingleChannelElement *sce0 = &cpe->ch[0];
    SingleChannelElement *sce1 = &cpe->ch[1];
    if (!cpe->common_window)
        return;
    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
        for (g = 0; g < sce0->ics.num_swb; g++) {
            /* Only consider bands that are non-zero in both channels. */
            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
                float dist1 = 0.0f, dist2 = 0.0f;
                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
                    float minthr = FFMIN(band0->threshold, band1->threshold);
                    float maxthr = FFMAX(band0->threshold, band1->threshold);
                    /* Build M and S, manually unrolled four at a time. */
                    for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
                        M[i  ] = (sce0->coeffs[start+w2*128+i  ]
                                + sce1->coeffs[start+w2*128+i  ]) * 0.5;
                        M[i+1] = (sce0->coeffs[start+w2*128+i+1]
                                + sce1->coeffs[start+w2*128+i+1]) * 0.5;
                        M[i+2] = (sce0->coeffs[start+w2*128+i+2]
                                + sce1->coeffs[start+w2*128+i+2]) * 0.5;
                        M[i+3] = (sce0->coeffs[start+w2*128+i+3]
                                + sce1->coeffs[start+w2*128+i+3]) * 0.5;

                        S[i  ] = M[i  ]
                               - sce1->coeffs[start+w2*128+i  ];
                        S[i+1] = M[i+1]
                               - sce1->coeffs[start+w2*128+i+1];
                        S[i+2] = M[i+2]
                               - sce1->coeffs[start+w2*128+i+2];
                        S[i+3] = M[i+3]
                               - sce1->coeffs[start+w2*128+i+3];
                    }
                    abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
                    abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
                    /* dist1: cost of plain L/R coding. */
                    dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
                                                L34,
                                                sce0->ics.swb_sizes[g],
                                                sce0->sf_idx[(w+w2)*16+g],
                                                sce0->band_type[(w+w2)*16+g],
                                                lambda / band0->threshold, INFINITY, NULL);
                    dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
                                                R34,
                                                sce1->ics.swb_sizes[g],
                                                sce1->sf_idx[(w+w2)*16+g],
                                                sce1->band_type[(w+w2)*16+g],
                                                lambda / band1->threshold, INFINITY, NULL);
                    /* dist2: cost of M/S coding — M judged against the laxer
                     * threshold, S against the stricter one. */
                    dist2 += quantize_band_cost(s, M,
                                                M34,
                                                sce0->ics.swb_sizes[g],
                                                sce0->sf_idx[(w+w2)*16+g],
                                                sce0->band_type[(w+w2)*16+g],
                                                lambda / maxthr, INFINITY, NULL);
                    dist2 += quantize_band_cost(s, S,
                                                S34,
                                                sce1->ics.swb_sizes[g],
                                                sce1->sf_idx[(w+w2)*16+g],
                                                sce1->band_type[(w+w2)*16+g],
                                                lambda / minthr, INFINITY, NULL);
                }
                cpe->ms_mask[w*16+g] = dist2 < dist1;
            }
            start += sce0->ics.swb_sizes[g];
        }
    }
}
|
2359 |
+#endif /*HAVE_MIPSFPU */ |
|
2360 |
+ |
|
2361 |
/**
 * Choose the section codebooks for one window group with a trellis (Viterbi)
 * search minimizing the total bit count, then write the section data
 * (codebook + run lengths) to the bitstream.
 *
 * path[swb][cb] holds the cheapest way to code bands 0..swb-1 such that the
 * current run uses codebook cb; each step either extends the current run or
 * starts a new section from the cheapest predecessor.  After the forward
 * pass, the best final state is back-tracked and emitted.
 */
static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
                                       int win, int group_len, const float lambda)
{
    BandCodingPath path[120][12];
    int w, swb, cb, start, size;
    int i, j;
    const int max_sfb  = sce->ics.max_sfb;
    /* Run-length field width: 5 bits for long windows, 3 for short. */
    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
    const int run_esc  = (1 << run_bits) - 1;
    int idx, ppos, count;
    int stackrun[120], stackcb[120], stack_len;
    float next_minbits = INFINITY;
    int next_mincb = 0;

    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
    start = win*128;
    /* Initial state: every codebook starts a section (run_bits + 4-bit cb). */
    for (cb = 0; cb < 12; cb++) {
        path[0][cb].cost     = run_bits+4;
        path[0][cb].prev_idx = -1;
        path[0][cb].run      = 0;
    }
    for (swb = 0; swb < max_sfb; swb++) {
        size = sce->ics.swb_sizes[swb];
        if (sce->zeroes[win*16 + swb]) {
            /* Zeroed band: only codebook 0 is viable; all other states are
             * poisoned with a huge cost (61450). */
            float cost_stay_here = path[swb][0].cost;
            float cost_get_here  = next_minbits + run_bits + 4;
            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
                cost_stay_here += run_bits;
            if (cost_get_here < cost_stay_here) {
                path[swb+1][0].prev_idx = next_mincb;
                path[swb+1][0].cost     = cost_get_here;
                path[swb+1][0].run      = 1;
            } else {
                path[swb+1][0].prev_idx = 0;
                path[swb+1][0].cost     = cost_stay_here;
                path[swb+1][0].run      = path[swb][0].run + 1;
            }
            next_minbits = path[swb+1][0].cost;
            next_mincb = 0;
            for (cb = 1; cb < 12; cb++) {
                path[swb+1][cb].cost     = 61450;
                path[swb+1][cb].prev_idx = -1;
                path[swb+1][cb].run      = 0;
            }
        } else {
            float minbits = next_minbits;
            int mincb = next_mincb;
            /* Codebooks below the band's minimum (startcb) cannot represent
             * its values and are poisoned. */
            int startcb = sce->band_type[win*16+swb];
            next_minbits = INFINITY;
            next_mincb = 0;
            for (cb = 0; cb < startcb; cb++) {
                path[swb+1][cb].cost     = 61450;
                path[swb+1][cb].prev_idx = -1;
                path[swb+1][cb].run      = 0;
            }
            for (cb = startcb; cb < 12; cb++) {
                float cost_stay_here, cost_get_here;
                float bits = 0.0f;
                for (w = 0; w < group_len; w++) {
                    bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
                                                    s->scoefs + start + w*128, size,
                                                    sce->sf_idx[(win+w)*16+swb], cb,
                                                    0, INFINITY, NULL);
                }
                cost_stay_here = path[swb][cb].cost + bits;
                cost_get_here  = minbits            + bits + run_bits + 4;
                /* Crossing a run-length escape boundary costs extra bits. */
                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
                    cost_stay_here += run_bits;
                if (cost_get_here < cost_stay_here) {
                    path[swb+1][cb].prev_idx = mincb;
                    path[swb+1][cb].cost     = cost_get_here;
                    path[swb+1][cb].run      = 1;
                } else {
                    path[swb+1][cb].prev_idx = cb;
                    path[swb+1][cb].cost     = cost_stay_here;
                    path[swb+1][cb].run      = path[swb][cb].run + 1;
                }
                if (path[swb+1][cb].cost < next_minbits) {
                    next_minbits = path[swb+1][cb].cost;
                    next_mincb = cb;
                }
            }
        }
        start += sce->ics.swb_sizes[swb];
    }

    /* Back-track from the cheapest final state, collecting (run, cb) pairs. */
    stack_len = 0;
    idx       = 0;
    for (cb = 1; cb < 12; cb++)
        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
            idx = cb;
    ppos = max_sfb;
    while (ppos > 0) {
        av_assert1(idx >= 0);
        cb = idx;
        stackrun[stack_len] = path[ppos][cb].run;
        stackcb [stack_len] = cb;
        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
        ppos -= path[ppos][cb].run;
        stack_len++;
    }

    /* Emit the sections in forward order, splitting run lengths that exceed
     * the escape value, and update zeroes[]/band_type[] to the decision. */
    start = 0;
    for (i = stack_len - 1; i >= 0; i--) {
        put_bits(&s->pb, 4, stackcb[i]);
        count = stackrun[i];
        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
        for (j = 0; j < count; j++) {
            sce->band_type[win*16 + start] = stackcb[i];
            start++;
        }
        while (count >= run_esc) {
            put_bits(&s->pb, run_bits, run_esc);
            count -= run_esc;
        }
        put_bits(&s->pb, run_bits, count);
    }
}
|
2481 |
+#endif /* HAVE_INLINE_ASM */ |
|
2482 |
+ |
|
2483 |
+void ff_aac_coder_init_mips(AACEncContext *c) { |
|
2484 |
+#if HAVE_INLINE_ASM |
|
2485 |
+ AACCoefficientsEncoder *e = c->coder; |
|
2486 |
+ int option = c->options.aac_coder; |
|
2487 |
+ |
|
2488 |
+ if (option == 2) { |
|
2489 |
+ e->quantize_and_encode_band = quantize_and_encode_band_mips; |
|
2490 |
+ e->encode_window_bands_info = codebook_trellis_rate_mips; |
|
2491 |
+#if HAVE_MIPSFPU |
|
2492 |
+ e->search_for_quantizers = search_for_quantizers_twoloop_mips; |
|
2493 |
+ e->search_for_ms = search_for_ms_mips; |
|
2494 |
+#endif /* HAVE_MIPSFPU */ |
|
2495 |
+ } |
|
2496 |
+#endif /* HAVE_INLINE_ASM */ |
|
2497 |
+} |