Browse code

mips: Optimization of AAC coefficients encoder functions

Signed-off-by: Bojan Zivkovic <bojan@mips.com>
Reviewed-by: Nedeljko Babic <Nedeljko.Babic@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

Bojan Zivkovic authored on 2013/03/06 22:55:05
Showing 5 changed files
... ...
@@ -48,6 +48,7 @@ Files that have MIPS copyright notice in them:
48 48
       float_dsp_mips.c
49 49
       libm_mips.h
50 50
 * libavcodec/mips/
51
+      aaccoder_mips.c
51 52
       ac3dsp_mips.c
52 53
       acelp_filters_mips.c
53 54
       acelp_vectors_mips.c
... ...
@@ -766,6 +766,9 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
766 766
     s->psypp = ff_psy_preprocess_init(avctx);
767 767
     s->coder = &ff_aac_coders[s->options.aac_coder];
768 768
 
769
+    if (HAVE_MIPSDSPR1)
770
+        ff_aac_coder_init_mips(s);
771
+
769 772
     s->lambda = avctx->global_quality ? avctx->global_quality : 120;
770 773
 
771 774
     ff_aac_tableinit();
... ...
@@ -85,4 +85,6 @@ typedef struct AACEncContext {
85 85
 
86 86
 extern float ff_aac_pow34sf_tab[428];
87 87
 
88
+void ff_aac_coder_init_mips(AACEncContext *c);
89
+
88 90
 #endif /* AVCODEC_AACENC_H */
... ...
@@ -17,3 +17,4 @@ OBJS-$(CONFIG_AAC_DECODER)                += mips/aacdec_mips.o            \
17 17
                                              mips/aacsbr_mips.o            \
18 18
                                              mips/sbrdsp_mips.o            \
19 19
                                              mips/aacpsdsp_mips.o
20
+MIPSDSPR1-OBJS-$(CONFIG_AAC_ENCODER)      += mips/aaccoder_mips.o
20 21
new file mode 100644
... ...
@@ -0,0 +1,2498 @@
0
+/*
1
+ * Copyright (c) 2012
2
+ *      MIPS Technologies, Inc., California.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions
6
+ * are met:
7
+ * 1. Redistributions of source code must retain the above copyright
8
+ *    notice, this list of conditions and the following disclaimer.
9
+ * 2. Redistributions in binary form must reproduce the above copyright
10
+ *    notice, this list of conditions and the following disclaimer in the
11
+ *    documentation and/or other materials provided with the distribution.
12
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
13
+ *    contributors may be used to endorse or promote products derived from
14
+ *    this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
20
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
+ * SUCH DAMAGE.
27
+ *
28
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
29
+ *          Szabolcs Pal     (sabolc@mips.com)
30
+ *
31
+ * AAC coefficients encoder optimized for MIPS floating-point architecture
32
+ *
33
+ * This file is part of FFmpeg.
34
+ *
35
+ * FFmpeg is free software; you can redistribute it and/or
36
+ * modify it under the terms of the GNU Lesser General Public
37
+ * License as published by the Free Software Foundation; either
38
+ * version 2.1 of the License, or (at your option) any later version.
39
+ *
40
+ * FFmpeg is distributed in the hope that it will be useful,
41
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
42
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
43
+ * Lesser General Public License for more details.
44
+ *
45
+ * You should have received a copy of the GNU Lesser General Public
46
+ * License along with FFmpeg; if not, write to the Free Software
47
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
48
+ */
49
+
50
+/**
51
+ * @file
52
+ * Reference: libavcodec/aaccoder.c
53
+ */
54
+
55
+#include "libavutil/libm.h"
56
+
57
+#include <float.h>
58
+#include "libavutil/mathematics.h"
59
+#include "libavcodec/avcodec.h"
60
+#include "libavcodec/put_bits.h"
61
+#include "libavcodec/aac.h"
62
+#include "libavcodec/aacenc.h"
63
+#include "libavcodec/aactab.h"
64
+
65
+#if HAVE_INLINE_ASM
66
/**
 * Record describing one step of a band-coding path.
 *
 * NOTE(review): the field meanings below are inferred from the names only;
 * confirm against the reference implementation (libavcodec/aaccoder.c).
 */
typedef struct BandCodingPath {
    int   prev_idx; /* presumably the index of the preceding path entry */
    float cost;     /* presumably the accumulated cost of the path      */
    int   run;      /* presumably the run length associated with it     */
} BandCodingPath;
71
+
72
/*
 * Bit-count lookup table with 64 entries: entries 0..30 are 5,
 * entries 31..62 are 10 and entry 63 is 15.
 * NOTE(review): presumably the number of bits needed to code a run value
 * for long windows -- confirm against libavcodec/aaccoder.c.
 */
static const uint8_t run_value_bits_long[64] = {
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
};

/*
 * Bit-count lookup table with 16 entries: entries 0..6 are 3,
 * entries 7..14 are 6 and entry 15 is 9.
 * NOTE(review): presumably the short-window counterpart of the table above.
 */
static const uint8_t run_value_bits_short[16] = {
    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
};

/*
 * Selector for the two tables above: index 0 is the "long" table,
 * index 1 the "short" one.  The pointer array itself is const as well,
 * so the whole table is read-only and cannot be re-pointed at run time.
 */
static const uint8_t * const run_value_bits[2] = {
    run_value_bits_long, run_value_bits_short
};
86
+
87
/*
 * Number of non-zero components of a 4-vector with components in 0..2.
 * The entry for (a, b, c, d) sits at index ((a * 3 + b) * 3 + c) * 3 + d;
 * each row below covers one (a, b) pair.
 */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2, /* a = 0, b = 0 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 0, b = 1 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 0, b = 2 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 1, b = 0 */
    2, 3, 3, 3, 4, 4, 3, 4, 4, /* a = 1, b = 1 */
    2, 3, 3, 3, 4, 4, 3, 4, 4, /* a = 1, b = 2 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 2, b = 0 */
    2, 3, 3, 3, 4, 4, 3, 4, 4, /* a = 2, b = 1 */
    2, 3, 3, 3, 4, 4, 3, 4, 4  /* a = 2, b = 2 */
};

/*
 * Number of non-zero components of a pair (a, b) with components in 0..7,
 * stored at index 8 * a + b (one row per value of a).
 */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1, /* a = 0 */
    1, 2, 2, 2, 2, 2, 2, 2, /* a = 1 */
    1, 2, 2, 2, 2, 2, 2, 2, /* a = 2 */
    1, 2, 2, 2, 2, 2, 2, 2, /* a = 3 */
    1, 2, 2, 2, 2, 2, 2, 2, /* a = 4 */
    1, 2, 2, 2, 2, 2, 2, 2, /* a = 5 */
    1, 2, 2, 2, 2, 2, 2, 2, /* a = 6 */
    1, 2, 2, 2, 2, 2, 2, 2, /* a = 7 */
};

/*
 * Number of non-zero components of a pair (a, b) with components in 0..12,
 * stored at index 13 * a + b (one row per value of a).
 */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* a =  0 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  1 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  2 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  3 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  4 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  5 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  6 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  7 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  8 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  9 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 10 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 11 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  /* a = 12 */
};

/*
 * Number of non-zero components of a pair (a, b) with components in 0..16,
 * stored at index 17 * a + b (one row per value of a).
 */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* a =  0 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  1 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  2 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  3 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  4 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  5 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  6 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  7 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  8 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a =  9 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 10 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 11 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 12 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 13 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 14 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* a = 15 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  /* a = 16 */
};
145
+
146
+static void abs_pow34_v(float *out, const float *in, const int size) {
147
+#ifndef USE_REALLY_FULL_SEARCH
148
+    int i;
149
+    float a, b, c, d;
150
+    float ax, bx, cx, dx;
151
+
152
+    for (i = 0; i < size; i += 4) {
153
+        a = fabsf(in[i  ]);
154
+        b = fabsf(in[i+1]);
155
+        c = fabsf(in[i+2]);
156
+        d = fabsf(in[i+3]);
157
+
158
+        ax = sqrtf(a);
159
+        bx = sqrtf(b);
160
+        cx = sqrtf(c);
161
+        dx = sqrtf(d);
162
+
163
+        a = a * ax;
164
+        b = b * bx;
165
+        c = c * cx;
166
+        d = d * dx;
167
+
168
+        out[i  ] = sqrtf(a);
169
+        out[i+1] = sqrtf(b);
170
+        out[i+2] = sqrtf(c);
171
+        out[i+3] = sqrtf(d);
172
+    }
173
+#endif /* USE_REALLY_FULL_SEARCH */
174
+}
175
+
176
/**
 * Return the largest value found in the first swb_size entries of each of
 * group_len consecutive windows of scaled.
 *
 * Windows are 128 floats apart.  The running maximum starts at 0.0f, so the
 * result is never below zero for ordinary (non-NaN) input.
 *
 * @param group_len number of windows to scan
 * @param swb_size  number of entries examined per window
 * @param scaled    input values, window w starting at scaled[w * 128]
 * @return the maximum encountered, or 0.0f if nothing larger was seen
 */
static float find_max_val(int group_len, int swb_size, const float *scaled) {
    float peak = 0.0f;
    int win, k, base;

    for (win = 0; win < group_len; win++) {
        base = win * 128;
        for (k = 0; k < swb_size; k++) {
            const float v = scaled[base + k];

            /* keep peak only while it is strictly greater than v */
            if (!(peak > v))
                peak = v;
        }
    }
    return peak;
}
186
+
187
+static int find_min_book(float maxval, int sf) {
188
+    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
189
+    float Q34 = sqrtf(Q * sqrtf(Q));
190
+    int qmaxval, cb;
191
+    qmaxval = maxval * Q34 + 0.4054f;
192
+    if      (qmaxval ==  0) cb = 0;
193
+    else if (qmaxval ==  1) cb = 1;
194
+    else if (qmaxval ==  2) cb = 3;
195
+    else if (qmaxval <=  4) cb = 5;
196
+    else if (qmaxval <=  7) cb = 7;
197
+    else if (qmaxval <= 12) cb = 9;
198
+    else                    cb = 11;
199
+    return cb;
200
+}
201
+
202
+/**
203
+ * Functions derived from the generic template function, each one specialized and optimized for quantizing and encoding a band with one codebook type
204
+ */
205
+static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
206
+                                                     PutBitContext *pb, const float *in,
207
+                                                     const float *scaled, int size, int scale_idx,
208
+                                                     int cb, const float lambda, const float uplim,
209
+                                                     int *bits)
210
+{
211
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
212
+    int i;
213
+    int qc1, qc2, qc3, qc4;
214
+
215
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
216
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
217
+
218
+    abs_pow34_v(s->scoefs, in, size);
219
+    scaled = s->scoefs;
220
+    for (i = 0; i < size; i += 4) {
221
+        int curidx;
222
+        int *in_int = (int *)&in[i];
223
+
224
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
225
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
226
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
227
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
228
+
229
+        __asm__ volatile (
230
+            ".set push                      \n\t"
231
+            ".set noreorder                 \n\t"
232
+
233
+            "slt    %[qc1], $zero,  %[qc1]  \n\t"
234
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
235
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
236
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
237
+            "lw     $t0,    0(%[in_int])    \n\t"
238
+            "lw     $t1,    4(%[in_int])    \n\t"
239
+            "lw     $t2,    8(%[in_int])    \n\t"
240
+            "lw     $t3,    12(%[in_int])   \n\t"
241
+            "srl    $t0,    $t0,    31      \n\t"
242
+            "srl    $t1,    $t1,    31      \n\t"
243
+            "srl    $t2,    $t2,    31      \n\t"
244
+            "srl    $t3,    $t3,    31      \n\t"
245
+            "subu   $t4,    $zero,  %[qc1]  \n\t"
246
+            "subu   $t5,    $zero,  %[qc2]  \n\t"
247
+            "subu   $t6,    $zero,  %[qc3]  \n\t"
248
+            "subu   $t7,    $zero,  %[qc4]  \n\t"
249
+            "movn   %[qc1], $t4,    $t0     \n\t"
250
+            "movn   %[qc2], $t5,    $t1     \n\t"
251
+            "movn   %[qc3], $t6,    $t2     \n\t"
252
+            "movn   %[qc4], $t7,    $t3     \n\t"
253
+
254
+            ".set pop                       \n\t"
255
+
256
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
257
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
258
+            : [in_int]"r"(in_int)
259
+            : "t0", "t1", "t2", "t3",
260
+              "t4", "t5", "t6", "t7",
261
+              "memory"
262
+        );
263
+
264
+        curidx = qc1;
265
+        curidx *= 3;
266
+        curidx += qc2;
267
+        curidx *= 3;
268
+        curidx += qc3;
269
+        curidx *= 3;
270
+        curidx += qc4;
271
+        curidx += 40;
272
+
273
+        put_bits(pb, p_bits[curidx], p_codes[curidx]);
274
+    }
275
+}
276
+
277
+static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
278
+                                                     PutBitContext *pb, const float *in,
279
+                                                     const float *scaled, int size, int scale_idx,
280
+                                                     int cb, const float lambda, const float uplim,
281
+                                                     int *bits)
282
+{
283
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
284
+    int i;
285
+    int qc1, qc2, qc3, qc4;
286
+
287
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
288
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
289
+
290
+    abs_pow34_v(s->scoefs, in, size);
291
+    scaled = s->scoefs;
292
+    for (i = 0; i < size; i += 4) {
293
+        int curidx, sign, count;
294
+        int *in_int = (int *)&in[i];
295
+        uint8_t v_bits;
296
+        unsigned int v_codes;
297
+
298
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
299
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
300
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
301
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
302
+
303
+        __asm__ volatile (
304
+            ".set push                              \n\t"
305
+            ".set noreorder                         \n\t"
306
+
307
+            "ori    $t4,        $zero,      2       \n\t"
308
+            "ori    %[sign],    $zero,      0       \n\t"
309
+            "slt    $t0,        $t4,        %[qc1]  \n\t"
310
+            "slt    $t1,        $t4,        %[qc2]  \n\t"
311
+            "slt    $t2,        $t4,        %[qc3]  \n\t"
312
+            "slt    $t3,        $t4,        %[qc4]  \n\t"
313
+            "movn   %[qc1],     $t4,        $t0     \n\t"
314
+            "movn   %[qc2],     $t4,        $t1     \n\t"
315
+            "movn   %[qc3],     $t4,        $t2     \n\t"
316
+            "movn   %[qc4],     $t4,        $t3     \n\t"
317
+            "lw     $t0,        0(%[in_int])        \n\t"
318
+            "lw     $t1,        4(%[in_int])        \n\t"
319
+            "lw     $t2,        8(%[in_int])        \n\t"
320
+            "lw     $t3,        12(%[in_int])       \n\t"
321
+            "slt    $t0,        $t0,        $zero   \n\t"
322
+            "movn   %[sign],    $t0,        %[qc1]  \n\t"
323
+            "slt    $t1,        $t1,        $zero   \n\t"
324
+            "slt    $t2,        $t2,        $zero   \n\t"
325
+            "slt    $t3,        $t3,        $zero   \n\t"
326
+            "sll    $t0,        %[sign],    1       \n\t"
327
+            "or     $t0,        $t0,        $t1     \n\t"
328
+            "movn   %[sign],    $t0,        %[qc2]  \n\t"
329
+            "slt    $t4,        $zero,      %[qc1]  \n\t"
330
+            "slt    $t1,        $zero,      %[qc2]  \n\t"
331
+            "slt    %[count],   $zero,      %[qc3]  \n\t"
332
+            "sll    $t0,        %[sign],    1       \n\t"
333
+            "or     $t0,        $t0,        $t2     \n\t"
334
+            "movn   %[sign],    $t0,        %[qc3]  \n\t"
335
+            "slt    $t2,        $zero,      %[qc4]  \n\t"
336
+            "addu   %[count],   %[count],   $t4     \n\t"
337
+            "addu   %[count],   %[count],   $t1     \n\t"
338
+            "sll    $t0,        %[sign],    1       \n\t"
339
+            "or     $t0,        $t0,        $t3     \n\t"
340
+            "movn   %[sign],    $t0,        %[qc4]  \n\t"
341
+            "addu   %[count],   %[count],   $t2     \n\t"
342
+
343
+            ".set pop                               \n\t"
344
+
345
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
346
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
347
+              [sign]"=&r"(sign), [count]"=&r"(count)
348
+            : [in_int]"r"(in_int)
349
+            : "t0", "t1", "t2", "t3", "t4",
350
+              "memory"
351
+        );
352
+
353
+        curidx = qc1;
354
+        curidx *= 3;
355
+        curidx += qc2;
356
+        curidx *= 3;
357
+        curidx += qc3;
358
+        curidx *= 3;
359
+        curidx += qc4;
360
+
361
+        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
362
+        v_bits  = p_bits[curidx] + count;
363
+        put_bits(pb, v_bits, v_codes);
364
+    }
365
+}
366
+
367
+static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
368
+                                                     PutBitContext *pb, const float *in,
369
+                                                     const float *scaled, int size, int scale_idx,
370
+                                                     int cb, const float lambda, const float uplim,
371
+                                                     int *bits)
372
+{
373
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
374
+    int i;
375
+    int qc1, qc2, qc3, qc4;
376
+
377
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
378
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
379
+
380
+    abs_pow34_v(s->scoefs, in, size);
381
+    scaled = s->scoefs;
382
+    for (i = 0; i < size; i += 4) {
383
+        int curidx, curidx2;
384
+        int *in_int = (int *)&in[i];
385
+        uint8_t v_bits;
386
+        unsigned int v_codes;
387
+
388
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
389
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
390
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
391
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
392
+
393
+        __asm__ volatile (
394
+            ".set push                      \n\t"
395
+            ".set noreorder                 \n\t"
396
+
397
+            "ori    $t4,    $zero,  4       \n\t"
398
+            "slt    $t0,    $t4,    %[qc1]  \n\t"
399
+            "slt    $t1,    $t4,    %[qc2]  \n\t"
400
+            "slt    $t2,    $t4,    %[qc3]  \n\t"
401
+            "slt    $t3,    $t4,    %[qc4]  \n\t"
402
+            "movn   %[qc1], $t4,    $t0     \n\t"
403
+            "movn   %[qc2], $t4,    $t1     \n\t"
404
+            "movn   %[qc3], $t4,    $t2     \n\t"
405
+            "movn   %[qc4], $t4,    $t3     \n\t"
406
+            "lw     $t0,    0(%[in_int])    \n\t"
407
+            "lw     $t1,    4(%[in_int])    \n\t"
408
+            "lw     $t2,    8(%[in_int])    \n\t"
409
+            "lw     $t3,    12(%[in_int])   \n\t"
410
+            "srl    $t0,    $t0,    31      \n\t"
411
+            "srl    $t1,    $t1,    31      \n\t"
412
+            "srl    $t2,    $t2,    31      \n\t"
413
+            "srl    $t3,    $t3,    31      \n\t"
414
+            "subu   $t4,    $zero,  %[qc1]  \n\t"
415
+            "subu   $t5,    $zero,  %[qc2]  \n\t"
416
+            "subu   $t6,    $zero,  %[qc3]  \n\t"
417
+            "subu   $t7,    $zero,  %[qc4]  \n\t"
418
+            "movn   %[qc1], $t4,    $t0     \n\t"
419
+            "movn   %[qc2], $t5,    $t1     \n\t"
420
+            "movn   %[qc3], $t6,    $t2     \n\t"
421
+            "movn   %[qc4], $t7,    $t3     \n\t"
422
+
423
+            ".set pop                       \n\t"
424
+
425
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
426
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
427
+            : [in_int]"r"(in_int)
428
+            : "t0", "t1", "t2", "t3",
429
+              "t4", "t5", "t6", "t7",
430
+              "memory"
431
+        );
432
+
433
+        curidx = 9 * qc1;
434
+        curidx += qc2 + 40;
435
+
436
+        curidx2 = 9 * qc3;
437
+        curidx2 += qc4 + 40;
438
+
439
+        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
440
+        v_bits  = p_bits[curidx] + p_bits[curidx2];
441
+        put_bits(pb, v_bits, v_codes);
442
+    }
443
+}
444
+
445
+static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
446
+                                                      PutBitContext *pb, const float *in,
447
+                                                      const float *scaled, int size, int scale_idx,
448
+                                                      int cb, const float lambda, const float uplim,
449
+                                                      int *bits)
450
+{
451
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
452
+    int i;
453
+    int qc1, qc2, qc3, qc4;
454
+
455
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
456
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
457
+
458
+    abs_pow34_v(s->scoefs, in, size);
459
+    scaled = s->scoefs;
460
+    for (i = 0; i < size; i += 4) {
461
+        int curidx, sign1, count1, sign2, count2;
462
+        int *in_int = (int *)&in[i];
463
+        uint8_t v_bits;
464
+        unsigned int v_codes;
465
+
466
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
467
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
468
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
469
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
470
+
471
+        __asm__ volatile (
472
+            ".set push                              \n\t"
473
+            ".set noreorder                         \n\t"
474
+
475
+            "ori    $t4,        $zero,      7       \n\t"
476
+            "ori    %[sign1],   $zero,      0       \n\t"
477
+            "ori    %[sign2],   $zero,      0       \n\t"
478
+            "slt    $t0,        $t4,        %[qc1]  \n\t"
479
+            "slt    $t1,        $t4,        %[qc2]  \n\t"
480
+            "slt    $t2,        $t4,        %[qc3]  \n\t"
481
+            "slt    $t3,        $t4,        %[qc4]  \n\t"
482
+            "movn   %[qc1],     $t4,        $t0     \n\t"
483
+            "movn   %[qc2],     $t4,        $t1     \n\t"
484
+            "movn   %[qc3],     $t4,        $t2     \n\t"
485
+            "movn   %[qc4],     $t4,        $t3     \n\t"
486
+            "lw     $t0,        0(%[in_int])        \n\t"
487
+            "lw     $t1,        4(%[in_int])        \n\t"
488
+            "lw     $t2,        8(%[in_int])        \n\t"
489
+            "lw     $t3,        12(%[in_int])       \n\t"
490
+            "slt    $t0,        $t0,        $zero   \n\t"
491
+            "movn   %[sign1],   $t0,        %[qc1]  \n\t"
492
+            "slt    $t2,        $t2,        $zero   \n\t"
493
+            "movn   %[sign2],   $t2,        %[qc3]  \n\t"
494
+            "slt    $t1,        $t1,        $zero   \n\t"
495
+            "sll    $t0,        %[sign1],   1       \n\t"
496
+            "or     $t0,        $t0,        $t1     \n\t"
497
+            "movn   %[sign1],   $t0,        %[qc2]  \n\t"
498
+            "slt    $t3,        $t3,        $zero   \n\t"
499
+            "sll    $t0,        %[sign2],   1       \n\t"
500
+            "or     $t0,        $t0,        $t3     \n\t"
501
+            "movn   %[sign2],   $t0,        %[qc4]  \n\t"
502
+            "slt    %[count1],  $zero,      %[qc1]  \n\t"
503
+            "slt    $t1,        $zero,      %[qc2]  \n\t"
504
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
505
+            "slt    $t2,        $zero,      %[qc4]  \n\t"
506
+            "addu   %[count1],  %[count1],  $t1     \n\t"
507
+            "addu   %[count2],  %[count2],  $t2     \n\t"
508
+
509
+            ".set pop                               \n\t"
510
+
511
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
512
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
513
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
514
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
515
+            : [in_int]"r"(in_int)
516
+            : "t0", "t1", "t2", "t3", "t4",
517
+              "memory"
518
+        );
519
+
520
+        curidx  = 8 * qc1;
521
+        curidx += qc2;
522
+
523
+        v_codes = (p_codes[curidx] << count1) | sign1;
524
+        v_bits  = p_bits[curidx] + count1;
525
+        put_bits(pb, v_bits, v_codes);
526
+
527
+        curidx  = 8 * qc3;
528
+        curidx += qc4;
529
+
530
+        v_codes = (p_codes[curidx] << count2) | sign2;
531
+        v_bits  = p_bits[curidx] + count2;
532
+        put_bits(pb, v_bits, v_codes);
533
+    }
534
+}
535
+
536
+static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
537
+                                                       PutBitContext *pb, const float *in,
538
+                                                       const float *scaled, int size, int scale_idx,
539
+                                                       int cb, const float lambda, const float uplim,
540
+                                                       int *bits)
541
+{
542
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
543
+    int i;
544
+    int qc1, qc2, qc3, qc4;
545
+
546
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
547
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
548
+
549
+    abs_pow34_v(s->scoefs, in, size);
550
+    scaled = s->scoefs;
551
+    for (i = 0; i < size; i += 4) {
552
+        int curidx, sign1, count1, sign2, count2;
553
+        int *in_int = (int *)&in[i];
554
+        uint8_t v_bits;
555
+        unsigned int v_codes;
556
+
557
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
558
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
559
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
560
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
561
+
562
+        __asm__ volatile (
563
+            ".set push                              \n\t"
564
+            ".set noreorder                         \n\t"
565
+
566
+            "ori    $t4,        $zero,      12      \n\t"
567
+            "ori    %[sign1],   $zero,      0       \n\t"
568
+            "ori    %[sign2],   $zero,      0       \n\t"
569
+            "slt    $t0,        $t4,        %[qc1]  \n\t"
570
+            "slt    $t1,        $t4,        %[qc2]  \n\t"
571
+            "slt    $t2,        $t4,        %[qc3]  \n\t"
572
+            "slt    $t3,        $t4,        %[qc4]  \n\t"
573
+            "movn   %[qc1],     $t4,        $t0     \n\t"
574
+            "movn   %[qc2],     $t4,        $t1     \n\t"
575
+            "movn   %[qc3],     $t4,        $t2     \n\t"
576
+            "movn   %[qc4],     $t4,        $t3     \n\t"
577
+            "lw     $t0,        0(%[in_int])        \n\t"
578
+            "lw     $t1,        4(%[in_int])        \n\t"
579
+            "lw     $t2,        8(%[in_int])        \n\t"
580
+            "lw     $t3,        12(%[in_int])       \n\t"
581
+            "slt    $t0,        $t0,        $zero   \n\t"
582
+            "movn   %[sign1],   $t0,        %[qc1]  \n\t"
583
+            "slt    $t2,        $t2,        $zero   \n\t"
584
+            "movn   %[sign2],   $t2,        %[qc3]  \n\t"
585
+            "slt    $t1,        $t1,        $zero   \n\t"
586
+            "sll    $t0,        %[sign1],   1       \n\t"
587
+            "or     $t0,        $t0,        $t1     \n\t"
588
+            "movn   %[sign1],   $t0,        %[qc2]  \n\t"
589
+            "slt    $t3,        $t3,        $zero   \n\t"
590
+            "sll    $t0,        %[sign2],   1       \n\t"
591
+            "or     $t0,        $t0,        $t3     \n\t"
592
+            "movn   %[sign2],   $t0,        %[qc4]  \n\t"
593
+            "slt    %[count1],  $zero,      %[qc1]  \n\t"
594
+            "slt    $t1,        $zero,      %[qc2]  \n\t"
595
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
596
+            "slt    $t2,        $zero,      %[qc4]  \n\t"
597
+            "addu   %[count1],  %[count1],  $t1     \n\t"
598
+            "addu   %[count2],  %[count2],  $t2     \n\t"
599
+
600
+            ".set pop                               \n\t"
601
+
602
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
603
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
604
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
605
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
606
+            : [in_int]"r"(in_int)
607
+            : "t0", "t1", "t2", "t3", "t4",
608
+              "memory"
609
+        );
610
+
611
+        curidx  = 13 * qc1;
612
+        curidx += qc2;
613
+
614
+        v_codes = (p_codes[curidx] << count1) | sign1;
615
+        v_bits  = p_bits[curidx] + count1;
616
+        put_bits(pb, v_bits, v_codes);
617
+
618
+        curidx  = 13 * qc3;
619
+        curidx += qc4;
620
+
621
+        v_codes = (p_codes[curidx] << count2) | sign2;
622
+        v_bits  = p_bits[curidx] + count2;
623
+        put_bits(pb, v_bits, v_codes);
624
+    }
625
+}
626
+
627
+static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
628
+                                                   PutBitContext *pb, const float *in,
629
+                                                   const float *scaled, int size, int scale_idx,
630
+                                                   int cb, const float lambda, const float uplim,
631
+                                                   int *bits)
632
+{
633
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
634
+    int i;
635
+    int qc1, qc2, qc3, qc4;
636
+
637
+    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
638
+    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
639
+    float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
640
+
641
+    abs_pow34_v(s->scoefs, in, size);
642
+    scaled = s->scoefs;
643
+
644
+    if (cb < 11) {
645
+        for (i = 0; i < size; i += 4) {
646
+            int curidx, curidx2, sign1, count1, sign2, count2;
647
+            int *in_int = (int *)&in[i];
648
+            uint8_t v_bits;
649
+            unsigned int v_codes;
650
+
651
+            qc1 = scaled[i  ] * Q34 + 0.4054f;
652
+            qc2 = scaled[i+1] * Q34 + 0.4054f;
653
+            qc3 = scaled[i+2] * Q34 + 0.4054f;
654
+            qc4 = scaled[i+3] * Q34 + 0.4054f;
655
+
656
+            __asm__ volatile (
657
+                ".set push                                  \n\t"
658
+                ".set noreorder                             \n\t"
659
+
660
+                "ori        $t4,        $zero,      16      \n\t"
661
+                "ori        %[sign1],   $zero,      0       \n\t"
662
+                "ori        %[sign2],   $zero,      0       \n\t"
663
+                "slt        $t0,        $t4,        %[qc1]  \n\t"
664
+                "slt        $t1,        $t4,        %[qc2]  \n\t"
665
+                "slt        $t2,        $t4,        %[qc3]  \n\t"
666
+                "slt        $t3,        $t4,        %[qc4]  \n\t"
667
+                "movn       %[qc1],     $t4,        $t0     \n\t"
668
+                "movn       %[qc2],     $t4,        $t1     \n\t"
669
+                "movn       %[qc3],     $t4,        $t2     \n\t"
670
+                "movn       %[qc4],     $t4,        $t3     \n\t"
671
+                "lw         $t0,        0(%[in_int])        \n\t"
672
+                "lw         $t1,        4(%[in_int])        \n\t"
673
+                "lw         $t2,        8(%[in_int])        \n\t"
674
+                "lw         $t3,        12(%[in_int])       \n\t"
675
+                "slt        $t0,        $t0,        $zero   \n\t"
676
+                "movn       %[sign1],   $t0,        %[qc1]  \n\t"
677
+                "slt        $t2,        $t2,        $zero   \n\t"
678
+                "movn       %[sign2],   $t2,        %[qc3]  \n\t"
679
+                "slt        $t1,        $t1,        $zero   \n\t"
680
+                "sll        $t0,        %[sign1],   1       \n\t"
681
+                "or         $t0,        $t0,        $t1     \n\t"
682
+                "movn       %[sign1],   $t0,        %[qc2]  \n\t"
683
+                "slt        $t3,        $t3,        $zero   \n\t"
684
+                "sll        $t0,        %[sign2],   1       \n\t"
685
+                "or         $t0,        $t0,        $t3     \n\t"
686
+                "movn       %[sign2],   $t0,        %[qc4]  \n\t"
687
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
688
+                "slt        $t1,        $zero,      %[qc2]  \n\t"
689
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
690
+                "slt        $t2,        $zero,      %[qc4]  \n\t"
691
+                "addu       %[count1],  %[count1],  $t1     \n\t"
692
+                "addu       %[count2],  %[count2],  $t2     \n\t"
693
+
694
+                ".set pop                                   \n\t"
695
+
696
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
697
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
698
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
699
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
700
+                : [in_int]"r"(in_int)
701
+                : "t0", "t1", "t2", "t3", "t4",
702
+                  "memory"
703
+            );
704
+
705
+            curidx = 17 * qc1;
706
+            curidx += qc2;
707
+            curidx2 = 17 * qc3;
708
+            curidx2 += qc4;
709
+
710
+            v_codes = (p_codes[curidx] << count1) | sign1;
711
+            v_bits  = p_bits[curidx] + count1;
712
+            put_bits(pb, v_bits, v_codes);
713
+
714
+            v_codes = (p_codes[curidx2] << count2) | sign2;
715
+            v_bits  = p_bits[curidx2] + count2;
716
+            put_bits(pb, v_bits, v_codes);
717
+        }
718
+    } else {
719
+        for (i = 0; i < size; i += 4) {
720
+            int curidx, curidx2, sign1, count1, sign2, count2;
721
+            int *in_int = (int *)&in[i];
722
+            uint8_t v_bits;
723
+            unsigned int v_codes;
724
+            int c1, c2, c3, c4;
725
+
726
+            qc1 = scaled[i  ] * Q34 + 0.4054f;
727
+            qc2 = scaled[i+1] * Q34 + 0.4054f;
728
+            qc3 = scaled[i+2] * Q34 + 0.4054f;
729
+            qc4 = scaled[i+3] * Q34 + 0.4054f;
730
+
731
+            __asm__ volatile (
732
+                ".set push                                  \n\t"
733
+                ".set noreorder                             \n\t"
734
+
735
+                "ori        $t4,        $zero,      16      \n\t"
736
+                "ori        %[sign1],   $zero,      0       \n\t"
737
+                "ori        %[sign2],   $zero,      0       \n\t"
738
+                "shll_s.w   %[c1],      %[qc1],     18      \n\t"
739
+                "shll_s.w   %[c2],      %[qc2],     18      \n\t"
740
+                "shll_s.w   %[c3],      %[qc3],     18      \n\t"
741
+                "shll_s.w   %[c4],      %[qc4],     18      \n\t"
742
+                "srl        %[c1],      %[c1],      18      \n\t"
743
+                "srl        %[c2],      %[c2],      18      \n\t"
744
+                "srl        %[c3],      %[c3],      18      \n\t"
745
+                "srl        %[c4],      %[c4],      18      \n\t"
746
+                "slt        $t0,        $t4,        %[qc1]  \n\t"
747
+                "slt        $t1,        $t4,        %[qc2]  \n\t"
748
+                "slt        $t2,        $t4,        %[qc3]  \n\t"
749
+                "slt        $t3,        $t4,        %[qc4]  \n\t"
750
+                "movn       %[qc1],     $t4,        $t0     \n\t"
751
+                "movn       %[qc2],     $t4,        $t1     \n\t"
752
+                "movn       %[qc3],     $t4,        $t2     \n\t"
753
+                "movn       %[qc4],     $t4,        $t3     \n\t"
754
+                "lw         $t0,        0(%[in_int])        \n\t"
755
+                "lw         $t1,        4(%[in_int])        \n\t"
756
+                "lw         $t2,        8(%[in_int])        \n\t"
757
+                "lw         $t3,        12(%[in_int])       \n\t"
758
+                "slt        $t0,        $t0,        $zero   \n\t"
759
+                "movn       %[sign1],   $t0,        %[qc1]  \n\t"
760
+                "slt        $t2,        $t2,        $zero   \n\t"
761
+                "movn       %[sign2],   $t2,        %[qc3]  \n\t"
762
+                "slt        $t1,        $t1,        $zero   \n\t"
763
+                "sll        $t0,        %[sign1],   1       \n\t"
764
+                "or         $t0,        $t0,        $t1     \n\t"
765
+                "movn       %[sign1],   $t0,        %[qc2]  \n\t"
766
+                "slt        $t3,        $t3,        $zero   \n\t"
767
+                "sll        $t0,        %[sign2],   1       \n\t"
768
+                "or         $t0,        $t0,        $t3     \n\t"
769
+                "movn       %[sign2],   $t0,        %[qc4]  \n\t"
770
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
771
+                "slt        $t1,        $zero,      %[qc2]  \n\t"
772
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
773
+                "slt        $t2,        $zero,      %[qc4]  \n\t"
774
+                "addu       %[count1],  %[count1],  $t1     \n\t"
775
+                "addu       %[count2],  %[count2],  $t2     \n\t"
776
+
777
+                ".set pop                                   \n\t"
778
+
779
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
780
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
781
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
782
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
783
+                  [c1]"=&r"(c1), [c2]"=&r"(c2),
784
+                  [c3]"=&r"(c3), [c4]"=&r"(c4)
785
+                : [in_int]"r"(in_int)
786
+                : "t0", "t1", "t2", "t3", "t4",
787
+                  "memory"
788
+            );
789
+
790
+            curidx = 17 * qc1;
791
+            curidx += qc2;
792
+
793
+            curidx2 = 17 * qc3;
794
+            curidx2 += qc4;
795
+
796
+            v_codes = (p_codes[curidx] << count1) | sign1;
797
+            v_bits  = p_bits[curidx] + count1;
798
+            put_bits(pb, v_bits, v_codes);
799
+
800
+            if (p_vectors[curidx*2  ] == 64.0f) {
801
+                int len = av_log2(c1);
802
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
803
+                put_bits(pb, len * 2 - 3, v_codes);
804
+            }
805
+            if (p_vectors[curidx*2+1] == 64.0f) {
806
+                int len = av_log2(c2);
807
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
808
+                put_bits(pb, len*2-3, v_codes);
809
+            }
810
+
811
+            v_codes = (p_codes[curidx2] << count2) | sign2;
812
+            v_bits  = p_bits[curidx2] + count2;
813
+            put_bits(pb, v_bits, v_codes);
814
+
815
+            if (p_vectors[curidx2*2  ] == 64.0f) {
816
+                int len = av_log2(c3);
817
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
818
+                put_bits(pb, len* 2 - 3, v_codes);
819
+            }
820
+            if (p_vectors[curidx2*2+1] == 64.0f) {
821
+                int len = av_log2(c4);
822
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
823
+                put_bits(pb, len * 2 - 3, v_codes);
824
+            }
825
+        }
826
+    }
827
+}
828
+
829
+static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
830
+                                                         PutBitContext *pb, const float *in,
831
+                                                         const float *scaled, int size, int scale_idx,
832
+                                                         int cb, const float lambda, const float uplim,
833
+                                                         int *bits) = {
834
+    NULL,
835
+    quantize_and_encode_band_cost_SQUAD_mips,
836
+    quantize_and_encode_band_cost_SQUAD_mips,
837
+    quantize_and_encode_band_cost_UQUAD_mips,
838
+    quantize_and_encode_band_cost_UQUAD_mips,
839
+    quantize_and_encode_band_cost_SPAIR_mips,
840
+    quantize_and_encode_band_cost_SPAIR_mips,
841
+    quantize_and_encode_band_cost_UPAIR7_mips,
842
+    quantize_and_encode_band_cost_UPAIR7_mips,
843
+    quantize_and_encode_band_cost_UPAIR12_mips,
844
+    quantize_and_encode_band_cost_UPAIR12_mips,
845
+    quantize_and_encode_band_cost_ESC_mips,
846
+};
847
+
848
+#define quantize_and_encode_band_cost(                                  \
849
+                                s, pb, in, scaled, size, scale_idx, cb, \
850
+                                lambda, uplim, bits)                    \
851
+    quantize_and_encode_band_cost_arr[cb](                              \
852
+                                s, pb, in, scaled, size, scale_idx, cb, \
853
+                                lambda, uplim, bits)
854
+
855
+static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
856
+                                          const float *in, int size, int scale_idx,
857
+                                          int cb, const float lambda)
858
+{
859
+    quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
860
+                                  INFINITY, NULL);
861
+}
862
+
863
+/**
864
+ * Functions developed from template function and optimized for getting the number of bits
865
+ */
866
+static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
867
+                                        PutBitContext *pb, const float *in,
868
+                                        const float *scaled, int size, int scale_idx,
869
+                                        int cb, const float lambda, const float uplim,
870
+                                        int *bits)
871
+{
872
+    return 0;
873
+}
874
+
875
+static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
876
+                                         PutBitContext *pb, const float *in,
877
+                                         const float *scaled, int size, int scale_idx,
878
+                                         int cb, const float lambda, const float uplim,
879
+                                         int *bits)
880
+{
881
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
882
+    int i;
883
+    int qc1, qc2, qc3, qc4;
884
+    int curbits = 0;
885
+
886
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
887
+
888
+    for (i = 0; i < size; i += 4) {
889
+        int curidx;
890
+        int *in_int = (int *)&in[i];
891
+
892
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
893
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
894
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
895
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
896
+
897
+        __asm__ volatile (
898
+            ".set push                      \n\t"
899
+            ".set noreorder                 \n\t"
900
+
901
+            "slt    %[qc1], $zero,  %[qc1]  \n\t"
902
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
903
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
904
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
905
+            "lw     $t0,    0(%[in_int])    \n\t"
906
+            "lw     $t1,    4(%[in_int])    \n\t"
907
+            "lw     $t2,    8(%[in_int])    \n\t"
908
+            "lw     $t3,    12(%[in_int])   \n\t"
909
+            "srl    $t0,    $t0,    31      \n\t"
910
+            "srl    $t1,    $t1,    31      \n\t"
911
+            "srl    $t2,    $t2,    31      \n\t"
912
+            "srl    $t3,    $t3,    31      \n\t"
913
+            "subu   $t4,    $zero,  %[qc1]  \n\t"
914
+            "subu   $t5,    $zero,  %[qc2]  \n\t"
915
+            "subu   $t6,    $zero,  %[qc3]  \n\t"
916
+            "subu   $t7,    $zero,  %[qc4]  \n\t"
917
+            "movn   %[qc1], $t4,    $t0     \n\t"
918
+            "movn   %[qc2], $t5,    $t1     \n\t"
919
+            "movn   %[qc3], $t6,    $t2     \n\t"
920
+            "movn   %[qc4], $t7,    $t3     \n\t"
921
+
922
+            ".set pop                       \n\t"
923
+
924
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
925
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
926
+            : [in_int]"r"(in_int)
927
+            : "t0", "t1", "t2", "t3",
928
+              "t4", "t5", "t6", "t7",
929
+              "memory"
930
+        );
931
+
932
+        curidx = qc1;
933
+        curidx *= 3;
934
+        curidx += qc2;
935
+        curidx *= 3;
936
+        curidx += qc3;
937
+        curidx *= 3;
938
+        curidx += qc4;
939
+        curidx += 40;
940
+
941
+        curbits += p_bits[curidx];
942
+    }
943
+    return curbits;
944
+}
945
+
946
+static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
947
+                                         PutBitContext *pb, const float *in,
948
+                                         const float *scaled, int size, int scale_idx,
949
+                                         int cb, const float lambda, const float uplim,
950
+                                         int *bits)
951
+{
952
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
953
+    int i;
954
+    int curbits = 0;
955
+    int qc1, qc2, qc3, qc4;
956
+
957
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
958
+
959
+    for (i = 0; i < size; i += 4) {
960
+        int curidx;
961
+
962
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
963
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
964
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
965
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
966
+
967
+        __asm__ volatile (
968
+            ".set push                      \n\t"
969
+            ".set noreorder                 \n\t"
970
+
971
+            "ori    $t4,    $zero,  2       \n\t"
972
+            "slt    $t0,    $t4,    %[qc1]  \n\t"
973
+            "slt    $t1,    $t4,    %[qc2]  \n\t"
974
+            "slt    $t2,    $t4,    %[qc3]  \n\t"
975
+            "slt    $t3,    $t4,    %[qc4]  \n\t"
976
+            "movn   %[qc1], $t4,    $t0     \n\t"
977
+            "movn   %[qc2], $t4,    $t1     \n\t"
978
+            "movn   %[qc3], $t4,    $t2     \n\t"
979
+            "movn   %[qc4], $t4,    $t3     \n\t"
980
+
981
+            ".set pop                       \n\t"
982
+
983
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
984
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
985
+            :
986
+            : "t0", "t1", "t2", "t3", "t4"
987
+        );
988
+
989
+        curidx = qc1;
990
+        curidx *= 3;
991
+        curidx += qc2;
992
+        curidx *= 3;
993
+        curidx += qc3;
994
+        curidx *= 3;
995
+        curidx += qc4;
996
+
997
+        curbits += p_bits[curidx];
998
+        curbits += uquad_sign_bits[curidx];
999
+    }
1000
+    return curbits;
1001
+}
1002
+
1003
+static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1004
+                                         PutBitContext *pb, const float *in,
1005
+                                         const float *scaled, int size, int scale_idx,
1006
+                                         int cb, const float lambda, const float uplim,
1007
+                                         int *bits)
1008
+{
1009
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1010
+    int i;
1011
+    int qc1, qc2, qc3, qc4;
1012
+    int curbits = 0;
1013
+
1014
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1015
+
1016
+    for (i = 0; i < size; i += 4) {
1017
+        int curidx, curidx2;
1018
+        int *in_int = (int *)&in[i];
1019
+
1020
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1021
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1022
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1023
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1024
+
1025
+        __asm__ volatile (
1026
+            ".set push                      \n\t"
1027
+            ".set noreorder                 \n\t"
1028
+
1029
+            "ori    $t4,    $zero,  4       \n\t"
1030
+            "slt    $t0,    $t4,    %[qc1]  \n\t"
1031
+            "slt    $t1,    $t4,    %[qc2]  \n\t"
1032
+            "slt    $t2,    $t4,    %[qc3]  \n\t"
1033
+            "slt    $t3,    $t4,    %[qc4]  \n\t"
1034
+            "movn   %[qc1], $t4,    $t0     \n\t"
1035
+            "movn   %[qc2], $t4,    $t1     \n\t"
1036
+            "movn   %[qc3], $t4,    $t2     \n\t"
1037
+            "movn   %[qc4], $t4,    $t3     \n\t"
1038
+            "lw     $t0,    0(%[in_int])    \n\t"
1039
+            "lw     $t1,    4(%[in_int])    \n\t"
1040
+            "lw     $t2,    8(%[in_int])    \n\t"
1041
+            "lw     $t3,    12(%[in_int])   \n\t"
1042
+            "srl    $t0,    $t0,    31      \n\t"
1043
+            "srl    $t1,    $t1,    31      \n\t"
1044
+            "srl    $t2,    $t2,    31      \n\t"
1045
+            "srl    $t3,    $t3,    31      \n\t"
1046
+            "subu   $t4,    $zero,  %[qc1]  \n\t"
1047
+            "subu   $t5,    $zero,  %[qc2]  \n\t"
1048
+            "subu   $t6,    $zero,  %[qc3]  \n\t"
1049
+            "subu   $t7,    $zero,  %[qc4]  \n\t"
1050
+            "movn   %[qc1], $t4,    $t0     \n\t"
1051
+            "movn   %[qc2], $t5,    $t1     \n\t"
1052
+            "movn   %[qc3], $t6,    $t2     \n\t"
1053
+            "movn   %[qc4], $t7,    $t3     \n\t"
1054
+
1055
+            ".set pop                       \n\t"
1056
+
1057
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1058
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1059
+            : [in_int]"r"(in_int)
1060
+            : "t0", "t1", "t2", "t3",
1061
+              "t4", "t5", "t6", "t7",
1062
+              "memory"
1063
+        );
1064
+
1065
+        curidx  = 9 * qc1;
1066
+        curidx += qc2 + 40;
1067
+
1068
+        curidx2  = 9 * qc3;
1069
+        curidx2 += qc4 + 40;
1070
+
1071
+        curbits += p_bits[curidx] + p_bits[curidx2];
1072
+    }
1073
+    return curbits;
1074
+}
1075
+
1076
+static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1077
+                                          PutBitContext *pb, const float *in,
1078
+                                          const float *scaled, int size, int scale_idx,
1079
+                                          int cb, const float lambda, const float uplim,
1080
+                                          int *bits)
1081
+{
1082
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1083
+    int i;
1084
+    int qc1, qc2, qc3, qc4;
1085
+    int curbits = 0;
1086
+
1087
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1088
+
1089
+    for (i = 0; i < size; i += 4) {
1090
+        int curidx, curidx2;
1091
+
1092
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1093
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1094
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1095
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1096
+
1097
+        __asm__ volatile (
1098
+            ".set push                      \n\t"
1099
+            ".set noreorder                 \n\t"
1100
+
1101
+            "ori    $t4,    $zero,  7       \n\t"
1102
+            "slt    $t0,    $t4,    %[qc1]  \n\t"
1103
+            "slt    $t1,    $t4,    %[qc2]  \n\t"
1104
+            "slt    $t2,    $t4,    %[qc3]  \n\t"
1105
+            "slt    $t3,    $t4,    %[qc4]  \n\t"
1106
+            "movn   %[qc1], $t4,    $t0     \n\t"
1107
+            "movn   %[qc2], $t4,    $t1     \n\t"
1108
+            "movn   %[qc3], $t4,    $t2     \n\t"
1109
+            "movn   %[qc4], $t4,    $t3     \n\t"
1110
+
1111
+            ".set pop                       \n\t"
1112
+
1113
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1114
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1115
+            :
1116
+            : "t0", "t1", "t2", "t3", "t4"
1117
+        );
1118
+
1119
+        curidx  = 8 * qc1;
1120
+        curidx += qc2;
1121
+
1122
+        curidx2  = 8 * qc3;
1123
+        curidx2 += qc4;
1124
+
1125
+        curbits += p_bits[curidx] +
1126
+                   upair7_sign_bits[curidx] +
1127
+                   p_bits[curidx2] +
1128
+                   upair7_sign_bits[curidx2];
1129
+    }
1130
+    return curbits;
1131
+}
1132
+
1133
+static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1134
+                                           PutBitContext *pb, const float *in,
1135
+                                           const float *scaled, int size, int scale_idx,
1136
+                                           int cb, const float lambda, const float uplim,
1137
+                                           int *bits)
1138
+{
1139
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1140
+    int i;
1141
+    int qc1, qc2, qc3, qc4;
1142
+    int curbits = 0;
1143
+
1144
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1145
+
1146
+    for (i = 0; i < size; i += 4) {
1147
+        int curidx, curidx2;
1148
+
1149
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1150
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1151
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1152
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1153
+
1154
+        __asm__ volatile (
1155
+            ".set push                      \n\t"
1156
+            ".set noreorder                 \n\t"
1157
+
1158
+            "ori    $t4,    $zero,  12      \n\t"
1159
+            "slt    $t0,    $t4,    %[qc1]  \n\t"
1160
+            "slt    $t1,    $t4,    %[qc2]  \n\t"
1161
+            "slt    $t2,    $t4,    %[qc3]  \n\t"
1162
+            "slt    $t3,    $t4,    %[qc4]  \n\t"
1163
+            "movn   %[qc1], $t4,    $t0     \n\t"
1164
+            "movn   %[qc2], $t4,    $t1     \n\t"
1165
+            "movn   %[qc3], $t4,    $t2     \n\t"
1166
+            "movn   %[qc4], $t4,    $t3     \n\t"
1167
+
1168
+            ".set pop                       \n\t"
1169
+
1170
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1171
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1172
+            :
1173
+            : "t0", "t1", "t2", "t3", "t4"
1174
+        );
1175
+
1176
+        curidx  = 13 * qc1;
1177
+        curidx += qc2;
1178
+
1179
+        curidx2  = 13 * qc3;
1180
+        curidx2 += qc4;
1181
+
1182
+        curbits += p_bits[curidx] +
1183
+                   p_bits[curidx2] +
1184
+                   upair12_sign_bits[curidx] +
1185
+                   upair12_sign_bits[curidx2];
1186
+    }
1187
+    return curbits;
1188
+}
1189
+
1190
+static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1191
+                                       PutBitContext *pb, const float *in,
1192
+                                       const float *scaled, int size, int scale_idx,
1193
+                                       int cb, const float lambda, const float uplim,
1194
+                                       int *bits)
1195
+{
1196
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1197
+    int i;
1198
+    int qc1, qc2, qc3, qc4;
1199
+    int curbits = 0;
1200
+
1201
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1202
+
1203
+    for (i = 0; i < size; i += 4) {
1204
+        int curidx, curidx2;
1205
+        int cond0, cond1, cond2, cond3;
1206
+        int c1, c2, c3, c4;
1207
+
1208
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1209
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1210
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1211
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1212
+
1213
+        __asm__ volatile (
1214
+            ".set push                                  \n\t"
1215
+            ".set noreorder                             \n\t"
1216
+
1217
+            "ori        $t4,        $zero,  15          \n\t"
1218
+            "ori        $t5,        $zero,  16          \n\t"
1219
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1220
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1221
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1222
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1223
+            "srl        %[c1],      %[c1],  18          \n\t"
1224
+            "srl        %[c2],      %[c2],  18          \n\t"
1225
+            "srl        %[c3],      %[c3],  18          \n\t"
1226
+            "srl        %[c4],      %[c4],  18          \n\t"
1227
+            "slt        %[cond0],   $t4,    %[qc1]      \n\t"
1228
+            "slt        %[cond1],   $t4,    %[qc2]      \n\t"
1229
+            "slt        %[cond2],   $t4,    %[qc3]      \n\t"
1230
+            "slt        %[cond3],   $t4,    %[qc4]      \n\t"
1231
+            "movn       %[qc1],     $t5,    %[cond0]    \n\t"
1232
+            "movn       %[qc2],     $t5,    %[cond1]    \n\t"
1233
+            "movn       %[qc3],     $t5,    %[cond2]    \n\t"
1234
+            "movn       %[qc4],     $t5,    %[cond3]    \n\t"
1235
+            "ori        $t5,        $zero,  31          \n\t"
1236
+            "clz        %[c1],      %[c1]               \n\t"
1237
+            "clz        %[c2],      %[c2]               \n\t"
1238
+            "clz        %[c3],      %[c3]               \n\t"
1239
+            "clz        %[c4],      %[c4]               \n\t"
1240
+            "subu       %[c1],      $t5,    %[c1]       \n\t"
1241
+            "subu       %[c2],      $t5,    %[c2]       \n\t"
1242
+            "subu       %[c3],      $t5,    %[c3]       \n\t"
1243
+            "subu       %[c4],      $t5,    %[c4]       \n\t"
1244
+            "sll        %[c1],      %[c1],  1           \n\t"
1245
+            "sll        %[c2],      %[c2],  1           \n\t"
1246
+            "sll        %[c3],      %[c3],  1           \n\t"
1247
+            "sll        %[c4],      %[c4],  1           \n\t"
1248
+            "addiu      %[c1],      %[c1],  -3          \n\t"
1249
+            "addiu      %[c2],      %[c2],  -3          \n\t"
1250
+            "addiu      %[c3],      %[c3],  -3          \n\t"
1251
+            "addiu      %[c4],      %[c4],  -3          \n\t"
1252
+            "subu       %[cond0],   $zero,  %[cond0]    \n\t"
1253
+            "subu       %[cond1],   $zero,  %[cond1]    \n\t"
1254
+            "subu       %[cond2],   $zero,  %[cond2]    \n\t"
1255
+            "subu       %[cond3],   $zero,  %[cond3]    \n\t"
1256
+            "and        %[c1],      %[c1],  %[cond0]    \n\t"
1257
+            "and        %[c2],      %[c2],  %[cond1]    \n\t"
1258
+            "and        %[c3],      %[c3],  %[cond2]    \n\t"
1259
+            "and        %[c4],      %[c4],  %[cond3]    \n\t"
1260
+
1261
+            ".set pop                                   \n\t"
1262
+
1263
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1264
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1265
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1266
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1267
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
1268
+              [c3]"=&r"(c3), [c4]"=&r"(c4)
1269
+            :
1270
+            : "t4", "t5"
1271
+        );
1272
+
1273
+        curidx = 17 * qc1;
1274
+        curidx += qc2;
1275
+
1276
+        curidx2 = 17 * qc3;
1277
+        curidx2 += qc4;
1278
+
1279
+        curbits += p_bits[curidx];
1280
+        curbits += esc_sign_bits[curidx];
1281
+        curbits += p_bits[curidx2];
1282
+        curbits += esc_sign_bits[curidx2];
1283
+
1284
+        curbits += c1;
1285
+        curbits += c2;
1286
+        curbits += c3;
1287
+        curbits += c4;
1288
+    }
1289
+    return curbits;
1290
+}
1291
+
1292
+static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1293
+                                             PutBitContext *pb, const float *in,
1294
+                                             const float *scaled, int size, int scale_idx,
1295
+                                             int cb, const float lambda, const float uplim,
1296
+                                             int *bits) = {
1297
+    get_band_numbits_ZERO_mips,
1298
+    get_band_numbits_SQUAD_mips,
1299
+    get_band_numbits_SQUAD_mips,
1300
+    get_band_numbits_UQUAD_mips,
1301
+    get_band_numbits_UQUAD_mips,
1302
+    get_band_numbits_SPAIR_mips,
1303
+    get_band_numbits_SPAIR_mips,
1304
+    get_band_numbits_UPAIR7_mips,
1305
+    get_band_numbits_UPAIR7_mips,
1306
+    get_band_numbits_UPAIR12_mips,
1307
+    get_band_numbits_UPAIR12_mips,
1308
+    get_band_numbits_ESC_mips,
1309
+};
1310
+
1311
+#define get_band_numbits(                                  \
1312
+                                s, pb, in, scaled, size, scale_idx, cb, \
1313
+                                lambda, uplim, bits)                    \
1314
+    get_band_numbits_arr[cb](                              \
1315
+                                s, pb, in, scaled, size, scale_idx, cb, \
1316
+                                lambda, uplim, bits)
1317
+
1318
+static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
1319
+                                     const float *scaled, int size, int scale_idx,
1320
+                                     int cb, const float lambda, const float uplim,
1321
+                                     int *bits)
1322
+{
1323
+    return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
1324
+}
1325
+
1326
+/**
1327
+ * Functions developed from template function and optimized for getting the band cost
1328
+ */
1329
+#if HAVE_MIPSFPU
1330
+static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1331
+                                     PutBitContext *pb, const float *in,
1332
+                                     const float *scaled, int size, int scale_idx,
1333
+                                     int cb, const float lambda, const float uplim,
1334
+                                     int *bits)
1335
+{
1336
+    int i;
1337
+    float cost = 0;
1338
+
1339
+    for (i = 0; i < size; i += 4) {
1340
+        cost += in[i  ] * in[i  ];
1341
+        cost += in[i+1] * in[i+1];
1342
+        cost += in[i+2] * in[i+2];
1343
+        cost += in[i+3] * in[i+3];
1344
+    }
1345
+    if (bits)
1346
+        *bits = 0;
1347
+    return cost * lambda;
1348
+}
1349
+
1350
+static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1351
+                                      PutBitContext *pb, const float *in,
1352
+                                      const float *scaled, int size, int scale_idx,
1353
+                                      int cb, const float lambda, const float uplim,
1354
+                                      int *bits)
1355
+{
1356
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1357
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1358
+    int i;
1359
+    float cost = 0;
1360
+    int qc1, qc2, qc3, qc4;
1361
+    int curbits = 0;
1362
+
1363
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1364
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1365
+
1366
+    for (i = 0; i < size; i += 4) {
1367
+        const float *vec;
1368
+        int curidx;
1369
+        int   *in_int = (int   *)&in[i];
1370
+        float *in_pos = (float *)&in[i];
1371
+        float di0, di1, di2, di3;
1372
+
1373
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1374
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1375
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1376
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1377
+
1378
+        __asm__ volatile (
1379
+            ".set push                                  \n\t"
1380
+            ".set noreorder                             \n\t"
1381
+
1382
+            "slt        %[qc1], $zero,  %[qc1]          \n\t"
1383
+            "slt        %[qc2], $zero,  %[qc2]          \n\t"
1384
+            "slt        %[qc3], $zero,  %[qc3]          \n\t"
1385
+            "slt        %[qc4], $zero,  %[qc4]          \n\t"
1386
+            "lw         $t0,    0(%[in_int])            \n\t"
1387
+            "lw         $t1,    4(%[in_int])            \n\t"
1388
+            "lw         $t2,    8(%[in_int])            \n\t"
1389
+            "lw         $t3,    12(%[in_int])           \n\t"
1390
+            "srl        $t0,    $t0,    31              \n\t"
1391
+            "srl        $t1,    $t1,    31              \n\t"
1392
+            "srl        $t2,    $t2,    31              \n\t"
1393
+            "srl        $t3,    $t3,    31              \n\t"
1394
+            "subu       $t4,    $zero,  %[qc1]          \n\t"
1395
+            "subu       $t5,    $zero,  %[qc2]          \n\t"
1396
+            "subu       $t6,    $zero,  %[qc3]          \n\t"
1397
+            "subu       $t7,    $zero,  %[qc4]          \n\t"
1398
+            "movn       %[qc1], $t4,    $t0             \n\t"
1399
+            "movn       %[qc2], $t5,    $t1             \n\t"
1400
+            "movn       %[qc3], $t6,    $t2             \n\t"
1401
+            "movn       %[qc4], $t7,    $t3             \n\t"
1402
+
1403
+            ".set pop                                   \n\t"
1404
+
1405
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1406
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1407
+            : [in_int]"r"(in_int)
1408
+            : "t0", "t1", "t2", "t3",
1409
+              "t4", "t5", "t6", "t7",
1410
+              "memory"
1411
+        );
1412
+
1413
+        curidx = qc1;
1414
+        curidx *= 3;
1415
+        curidx += qc2;
1416
+        curidx *= 3;
1417
+        curidx += qc3;
1418
+        curidx *= 3;
1419
+        curidx += qc4;
1420
+        curidx += 40;
1421
+
1422
+        curbits += p_bits[curidx];
1423
+        vec     = &p_codes[curidx*4];
1424
+
1425
+        __asm__ volatile (
1426
+            ".set push                                  \n\t"
1427
+            ".set noreorder                             \n\t"
1428
+
1429
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
1430
+            "lwc1       $f1,    0(%[vec])               \n\t"
1431
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
1432
+            "lwc1       $f3,    4(%[vec])               \n\t"
1433
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
1434
+            "lwc1       $f5,    8(%[vec])               \n\t"
1435
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
1436
+            "lwc1       $f7,    12(%[vec])              \n\t"
1437
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1438
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1439
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1440
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1441
+
1442
+            ".set pop                                   \n\t"
1443
+
1444
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1445
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
1446
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1447
+              [IQ]"f"(IQ)
1448
+            : "$f0", "$f1", "$f2", "$f3",
1449
+              "$f4", "$f5", "$f6", "$f7",
1450
+              "memory"
1451
+        );
1452
+
1453
+        cost += di0 * di0 + di1 * di1
1454
+                + di2 * di2 + di3 * di3;
1455
+    }
1456
+
1457
+    if (bits)
1458
+        *bits = curbits;
1459
+    return cost * lambda + curbits;
1460
+}
1461
+
1462
+static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1463
+                                      PutBitContext *pb, const float *in,
1464
+                                      const float *scaled, int size, int scale_idx,
1465
+                                      int cb, const float lambda, const float uplim,
1466
+                                      int *bits)
1467
+{
1468
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1469
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1470
+    int i;
1471
+    float cost = 0;
1472
+    int curbits = 0;
1473
+    int qc1, qc2, qc3, qc4;
1474
+
1475
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1476
+    float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
1477
+
1478
+    for (i = 0; i < size; i += 4) {
1479
+        const float *vec;
1480
+        int curidx;
1481
+        float *in_pos = (float *)&in[i];
1482
+        float di0, di1, di2, di3;
1483
+
1484
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1485
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1486
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1487
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1488
+
1489
+        __asm__ volatile (
1490
+            ".set push                                  \n\t"
1491
+            ".set noreorder                             \n\t"
1492
+
1493
+            "ori        $t4,    $zero,  2               \n\t"
1494
+            "slt        $t0,    $t4,    %[qc1]          \n\t"
1495
+            "slt        $t1,    $t4,    %[qc2]          \n\t"
1496
+            "slt        $t2,    $t4,    %[qc3]          \n\t"
1497
+            "slt        $t3,    $t4,    %[qc4]          \n\t"
1498
+            "movn       %[qc1], $t4,    $t0             \n\t"
1499
+            "movn       %[qc2], $t4,    $t1             \n\t"
1500
+            "movn       %[qc3], $t4,    $t2             \n\t"
1501
+            "movn       %[qc4], $t4,    $t3             \n\t"
1502
+
1503
+            ".set pop                                   \n\t"
1504
+
1505
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1506
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1507
+            :
1508
+            : "t0", "t1", "t2", "t3", "t4"
1509
+        );
1510
+
1511
+        curidx = qc1;
1512
+        curidx *= 3;
1513
+        curidx += qc2;
1514
+        curidx *= 3;
1515
+        curidx += qc3;
1516
+        curidx *= 3;
1517
+        curidx += qc4;
1518
+
1519
+        curbits += p_bits[curidx];
1520
+        curbits += uquad_sign_bits[curidx];
1521
+        vec     = &p_codes[curidx*4];
1522
+
1523
+        __asm__ volatile (
1524
+            ".set push                                  \n\t"
1525
+            ".set noreorder                             \n\t"
1526
+
1527
+            "lwc1       %[di0], 0(%[in_pos])            \n\t"
1528
+            "lwc1       %[di1], 4(%[in_pos])            \n\t"
1529
+            "lwc1       %[di2], 8(%[in_pos])            \n\t"
1530
+            "lwc1       %[di3], 12(%[in_pos])           \n\t"
1531
+            "abs.s      %[di0], %[di0]                  \n\t"
1532
+            "abs.s      %[di1], %[di1]                  \n\t"
1533
+            "abs.s      %[di2], %[di2]                  \n\t"
1534
+            "abs.s      %[di3], %[di3]                  \n\t"
1535
+            "lwc1       $f0,    0(%[vec])               \n\t"
1536
+            "lwc1       $f1,    4(%[vec])               \n\t"
1537
+            "lwc1       $f2,    8(%[vec])               \n\t"
1538
+            "lwc1       $f3,    12(%[vec])              \n\t"
1539
+            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
1540
+            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
1541
+            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
1542
+            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
1543
+
1544
+            ".set pop                                   \n\t"
1545
+
1546
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1547
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
1548
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1549
+              [IQ]"f"(IQ)
1550
+            : "$f0", "$f1", "$f2", "$f3",
1551
+              "memory"
1552
+        );
1553
+
1554
+        cost += di0 * di0 + di1 * di1
1555
+                + di2 * di2 + di3 * di3;
1556
+    }
1557
+
1558
+    if (bits)
1559
+        *bits = curbits;
1560
+    return cost * lambda + curbits;
1561
+}
1562
+
1563
+static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1564
+                                      PutBitContext *pb, const float *in,
1565
+                                      const float *scaled, int size, int scale_idx,
1566
+                                      int cb, const float lambda, const float uplim,
1567
+                                      int *bits)
1568
+{
1569
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1570
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1571
+    int i;
1572
+    float cost = 0;
1573
+    int qc1, qc2, qc3, qc4;
1574
+    int curbits = 0;
1575
+
1576
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1577
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1578
+
1579
+    for (i = 0; i < size; i += 4) {
1580
+        const float *vec, *vec2;
1581
+        int curidx, curidx2;
1582
+        int   *in_int = (int   *)&in[i];
1583
+        float *in_pos = (float *)&in[i];
1584
+        float di0, di1, di2, di3;
1585
+
1586
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1587
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1588
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1589
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1590
+
1591
+        __asm__ volatile (
1592
+            ".set push                                  \n\t"
1593
+            ".set noreorder                             \n\t"
1594
+
1595
+            "ori        $t4,    $zero,  4               \n\t"
1596
+            "slt        $t0,    $t4,    %[qc1]          \n\t"
1597
+            "slt        $t1,    $t4,    %[qc2]          \n\t"
1598
+            "slt        $t2,    $t4,    %[qc3]          \n\t"
1599
+            "slt        $t3,    $t4,    %[qc4]          \n\t"
1600
+            "movn       %[qc1], $t4,    $t0             \n\t"
1601
+            "movn       %[qc2], $t4,    $t1             \n\t"
1602
+            "movn       %[qc3], $t4,    $t2             \n\t"
1603
+            "movn       %[qc4], $t4,    $t3             \n\t"
1604
+            "lw         $t0,    0(%[in_int])            \n\t"
1605
+            "lw         $t1,    4(%[in_int])            \n\t"
1606
+            "lw         $t2,    8(%[in_int])            \n\t"
1607
+            "lw         $t3,    12(%[in_int])           \n\t"
1608
+            "srl        $t0,    $t0,    31              \n\t"
1609
+            "srl        $t1,    $t1,    31              \n\t"
1610
+            "srl        $t2,    $t2,    31              \n\t"
1611
+            "srl        $t3,    $t3,    31              \n\t"
1612
+            "subu       $t4,    $zero,  %[qc1]          \n\t"
1613
+            "subu       $t5,    $zero,  %[qc2]          \n\t"
1614
+            "subu       $t6,    $zero,  %[qc3]          \n\t"
1615
+            "subu       $t7,    $zero,  %[qc4]          \n\t"
1616
+            "movn       %[qc1], $t4,    $t0             \n\t"
1617
+            "movn       %[qc2], $t5,    $t1             \n\t"
1618
+            "movn       %[qc3], $t6,    $t2             \n\t"
1619
+            "movn       %[qc4], $t7,    $t3             \n\t"
1620
+
1621
+            ".set pop                                   \n\t"
1622
+
1623
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1624
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1625
+            : [in_int]"r"(in_int)
1626
+            : "t0", "t1", "t2", "t3",
1627
+              "t4", "t5", "t6", "t7",
1628
+              "memory"
1629
+        );
1630
+
1631
+        curidx = 9 * qc1;
1632
+        curidx += qc2 + 40;
1633
+
1634
+        curidx2 = 9 * qc3;
1635
+        curidx2 += qc4 + 40;
1636
+
1637
+        curbits += p_bits[curidx];
1638
+        curbits += p_bits[curidx2];
1639
+
1640
+        vec     = &p_codes[curidx*2];
1641
+        vec2    = &p_codes[curidx2*2];
1642
+
1643
+        __asm__ volatile (
1644
+            ".set push                                  \n\t"
1645
+            ".set noreorder                             \n\t"
1646
+
1647
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
1648
+            "lwc1       $f1,    0(%[vec])               \n\t"
1649
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
1650
+            "lwc1       $f3,    4(%[vec])               \n\t"
1651
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
1652
+            "lwc1       $f5,    0(%[vec2])              \n\t"
1653
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
1654
+            "lwc1       $f7,    4(%[vec2])              \n\t"
1655
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1656
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1657
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1658
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1659
+
1660
+            ".set pop                                   \n\t"
1661
+
1662
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1663
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
1664
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1665
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
1666
+            : "$f0", "$f1", "$f2", "$f3",
1667
+              "$f4", "$f5", "$f6", "$f7",
1668
+              "memory"
1669
+        );
1670
+
1671
+        cost += di0 * di0 + di1 * di1
1672
+                + di2 * di2 + di3 * di3;
1673
+    }
1674
+
1675
+    if (bits)
1676
+        *bits = curbits;
1677
+    return cost * lambda + curbits;
1678
+}
1679
+
1680
+static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1681
+                                       PutBitContext *pb, const float *in,
1682
+                                       const float *scaled, int size, int scale_idx,
1683
+                                       int cb, const float lambda, const float uplim,
1684
+                                       int *bits)
1685
+{
1686
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1687
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1688
+    int i;
1689
+    float cost = 0;
1690
+    int qc1, qc2, qc3, qc4;
1691
+    int curbits = 0;
1692
+
1693
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1694
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1695
+
1696
+    for (i = 0; i < size; i += 4) {
1697
+        const float *vec, *vec2;
1698
+        int curidx, curidx2, sign1, count1, sign2, count2;
1699
+        int   *in_int = (int   *)&in[i];
1700
+        float *in_pos = (float *)&in[i];
1701
+        float di0, di1, di2, di3;
1702
+
1703
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1704
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1705
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1706
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1707
+
1708
+        __asm__ volatile (
1709
+            ".set push                                          \n\t"
1710
+            ".set noreorder                                     \n\t"
1711
+
1712
+            "ori        $t4,        $zero,      7               \n\t"
1713
+            "ori        %[sign1],   $zero,      0               \n\t"
1714
+            "ori        %[sign2],   $zero,      0               \n\t"
1715
+            "slt        $t0,        $t4,        %[qc1]          \n\t"
1716
+            "slt        $t1,        $t4,        %[qc2]          \n\t"
1717
+            "slt        $t2,        $t4,        %[qc3]          \n\t"
1718
+            "slt        $t3,        $t4,        %[qc4]          \n\t"
1719
+            "movn       %[qc1],     $t4,        $t0             \n\t"
1720
+            "movn       %[qc2],     $t4,        $t1             \n\t"
1721
+            "movn       %[qc3],     $t4,        $t2             \n\t"
1722
+            "movn       %[qc4],     $t4,        $t3             \n\t"
1723
+            "lw         $t0,        0(%[in_int])                \n\t"
1724
+            "lw         $t1,        4(%[in_int])                \n\t"
1725
+            "lw         $t2,        8(%[in_int])                \n\t"
1726
+            "lw         $t3,        12(%[in_int])               \n\t"
1727
+            "slt        $t0,        $t0,        $zero           \n\t"
1728
+            "movn       %[sign1],   $t0,        %[qc1]          \n\t"
1729
+            "slt        $t2,        $t2,        $zero           \n\t"
1730
+            "movn       %[sign2],   $t2,        %[qc3]          \n\t"
1731
+            "slt        $t1,        $t1,        $zero           \n\t"
1732
+            "sll        $t0,        %[sign1],   1               \n\t"
1733
+            "or         $t0,        $t0,        $t1             \n\t"
1734
+            "movn       %[sign1],   $t0,        %[qc2]          \n\t"
1735
+            "slt        $t3,        $t3,        $zero           \n\t"
1736
+            "sll        $t0,        %[sign2],   1               \n\t"
1737
+            "or         $t0,        $t0,        $t3             \n\t"
1738
+            "movn       %[sign2],   $t0,        %[qc4]          \n\t"
1739
+            "slt        %[count1],  $zero,      %[qc1]          \n\t"
1740
+            "slt        $t1,        $zero,      %[qc2]          \n\t"
1741
+            "slt        %[count2],  $zero,      %[qc3]          \n\t"
1742
+            "slt        $t2,        $zero,      %[qc4]          \n\t"
1743
+            "addu       %[count1],  %[count1],  $t1             \n\t"
1744
+            "addu       %[count2],  %[count2],  $t2             \n\t"
1745
+
1746
+            ".set pop                                           \n\t"
1747
+
1748
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1749
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1750
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1751
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1752
+            : [in_int]"r"(in_int)
1753
+            : "t0", "t1", "t2", "t3", "t4",
1754
+              "memory"
1755
+        );
1756
+
1757
+        curidx = 8 * qc1;
1758
+        curidx += qc2;
1759
+
1760
+        curidx2 = 8 * qc3;
1761
+        curidx2 += qc4;
1762
+
1763
+        curbits += p_bits[curidx];
1764
+        curbits += upair7_sign_bits[curidx];
1765
+        vec     = &p_codes[curidx*2];
1766
+
1767
+        curbits += p_bits[curidx2];
1768
+        curbits += upair7_sign_bits[curidx2];
1769
+        vec2    = &p_codes[curidx2*2];
1770
+
1771
+        __asm__ volatile (
1772
+            ".set push                                          \n\t"
1773
+            ".set noreorder                                     \n\t"
1774
+
1775
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
1776
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
1777
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
1778
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
1779
+            "abs.s      %[di0],     %[di0]                      \n\t"
1780
+            "abs.s      %[di1],     %[di1]                      \n\t"
1781
+            "abs.s      %[di2],     %[di2]                      \n\t"
1782
+            "abs.s      %[di3],     %[di3]                      \n\t"
1783
+            "lwc1       $f0,        0(%[vec])                   \n\t"
1784
+            "lwc1       $f1,        4(%[vec])                   \n\t"
1785
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
1786
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
1787
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1788
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1789
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1790
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1791
+
1792
+            ".set pop                                           \n\t"
1793
+
1794
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1795
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
1796
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1797
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
1798
+            : "$f0", "$f1", "$f2", "$f3",
1799
+              "memory"
1800
+        );
1801
+
1802
+        cost += di0 * di0 + di1 * di1
1803
+                + di2 * di2 + di3 * di3;
1804
+    }
1805
+
1806
+    if (bits)
1807
+        *bits = curbits;
1808
+    return cost * lambda + curbits;
1809
+}
1810
+
1811
+static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
1812
+                                        PutBitContext *pb, const float *in,
1813
+                                        const float *scaled, int size, int scale_idx,
1814
+                                        int cb, const float lambda, const float uplim,
1815
+                                        int *bits)
1816
+{
1817
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1818
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1819
+    int i;
1820
+    float cost = 0;
1821
+    int qc1, qc2, qc3, qc4;
1822
+    int curbits = 0;
1823
+
1824
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1825
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1826
+
1827
+    for (i = 0; i < size; i += 4) {
1828
+        const float *vec, *vec2;
1829
+        int curidx, curidx2;
1830
+        int sign1, count1, sign2, count2;
1831
+        int   *in_int = (int   *)&in[i];
1832
+        float *in_pos = (float *)&in[i];
1833
+        float di0, di1, di2, di3;
1834
+
1835
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1836
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1837
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1838
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1839
+
1840
+        __asm__ volatile (
1841
+            ".set push                                          \n\t"
1842
+            ".set noreorder                                     \n\t"
1843
+
1844
+            "ori        $t4,        $zero,      12              \n\t"
1845
+            "ori        %[sign1],   $zero,      0               \n\t"
1846
+            "ori        %[sign2],   $zero,      0               \n\t"
1847
+            "slt        $t0,        $t4,        %[qc1]          \n\t"
1848
+            "slt        $t1,        $t4,        %[qc2]          \n\t"
1849
+            "slt        $t2,        $t4,        %[qc3]          \n\t"
1850
+            "slt        $t3,        $t4,        %[qc4]          \n\t"
1851
+            "movn       %[qc1],     $t4,        $t0             \n\t"
1852
+            "movn       %[qc2],     $t4,        $t1             \n\t"
1853
+            "movn       %[qc3],     $t4,        $t2             \n\t"
1854
+            "movn       %[qc4],     $t4,        $t3             \n\t"
1855
+            "lw         $t0,        0(%[in_int])                \n\t"
1856
+            "lw         $t1,        4(%[in_int])                \n\t"
1857
+            "lw         $t2,        8(%[in_int])                \n\t"
1858
+            "lw         $t3,        12(%[in_int])               \n\t"
1859
+            "slt        $t0,        $t0,        $zero           \n\t"
1860
+            "movn       %[sign1],   $t0,        %[qc1]          \n\t"
1861
+            "slt        $t2,        $t2,        $zero           \n\t"
1862
+            "movn       %[sign2],   $t2,        %[qc3]          \n\t"
1863
+            "slt        $t1,        $t1,        $zero           \n\t"
1864
+            "sll        $t0,        %[sign1],   1               \n\t"
1865
+            "or         $t0,        $t0,        $t1             \n\t"
1866
+            "movn       %[sign1],   $t0,        %[qc2]          \n\t"
1867
+            "slt        $t3,        $t3,        $zero           \n\t"
1868
+            "sll        $t0,        %[sign2],   1               \n\t"
1869
+            "or         $t0,        $t0,        $t3             \n\t"
1870
+            "movn       %[sign2],   $t0,        %[qc4]          \n\t"
1871
+            "slt        %[count1],  $zero,      %[qc1]          \n\t"
1872
+            "slt        $t1,        $zero,      %[qc2]          \n\t"
1873
+            "slt        %[count2],  $zero,      %[qc3]          \n\t"
1874
+            "slt        $t2,        $zero,      %[qc4]          \n\t"
1875
+            "addu       %[count1],  %[count1],  $t1             \n\t"
1876
+            "addu       %[count2],  %[count2],  $t2             \n\t"
1877
+
1878
+            ".set pop                                           \n\t"
1879
+
1880
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1881
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1882
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1883
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1884
+            : [in_int]"r"(in_int)
1885
+            : "t0", "t1", "t2", "t3", "t4",
1886
+              "memory"
1887
+        );
1888
+
1889
+        curidx = 13 * qc1;
1890
+        curidx += qc2;
1891
+
1892
+        curidx2 = 13 * qc3;
1893
+        curidx2 += qc4;
1894
+
1895
+        curbits += p_bits[curidx];
1896
+        curbits += p_bits[curidx2];
1897
+        curbits += upair12_sign_bits[curidx];
1898
+        curbits += upair12_sign_bits[curidx2];
1899
+        vec     = &p_codes[curidx*2];
1900
+        vec2    = &p_codes[curidx2*2];
1901
+
1902
+        __asm__ volatile (
1903
+            ".set push                                          \n\t"
1904
+            ".set noreorder                                     \n\t"
1905
+
1906
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
1907
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
1908
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
1909
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
1910
+            "abs.s      %[di0],     %[di0]                      \n\t"
1911
+            "abs.s      %[di1],     %[di1]                      \n\t"
1912
+            "abs.s      %[di2],     %[di2]                      \n\t"
1913
+            "abs.s      %[di3],     %[di3]                      \n\t"
1914
+            "lwc1       $f0,        0(%[vec])                   \n\t"
1915
+            "lwc1       $f1,        4(%[vec])                   \n\t"
1916
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
1917
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
1918
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1919
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1920
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1921
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1922
+
1923
+            ".set pop                                           \n\t"
1924
+
1925
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1926
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
1927
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1928
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
1929
+            : "$f0", "$f1", "$f2", "$f3",
1930
+              "memory"
1931
+        );
1932
+
1933
+        cost += di0 * di0 + di1 * di1
1934
+                + di2 * di2 + di3 * di3;
1935
+    }
1936
+
1937
+    if (bits)
1938
+        *bits = curbits;
1939
+    return cost * lambda + curbits;
1940
+}
1941
+
1942
+static float get_band_cost_ESC_mips(struct AACEncContext *s,
1943
+                                    PutBitContext *pb, const float *in,
1944
+                                    const float *scaled, int size, int scale_idx,
1945
+                                    int cb, const float lambda, const float uplim,
1946
+                                    int *bits)
1947
+{
1948
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1949
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1950
+    const float CLIPPED_ESCAPE = 165140.0f * IQ;
1951
+    int i;
1952
+    float cost = 0;
1953
+    int qc1, qc2, qc3, qc4;
1954
+    int curbits = 0;
1955
+
1956
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1957
+    float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
1958
+
1959
+    for (i = 0; i < size; i += 4) {
1960
+        const float *vec, *vec2;
1961
+        int curidx, curidx2;
1962
+        float t1, t2, t3, t4;
1963
+        float di1, di2, di3, di4;
1964
+        int cond0, cond1, cond2, cond3;
1965
+        int c1, c2, c3, c4;
1966
+
1967
+        qc1 = scaled[i  ] * Q34 + 0.4054f;
1968
+        qc2 = scaled[i+1] * Q34 + 0.4054f;
1969
+        qc3 = scaled[i+2] * Q34 + 0.4054f;
1970
+        qc4 = scaled[i+3] * Q34 + 0.4054f;
1971
+
1972
+        __asm__ volatile (
1973
+            ".set push                                  \n\t"
1974
+            ".set noreorder                             \n\t"
1975
+
1976
+            "ori        $t4,        $zero,  15          \n\t"
1977
+            "ori        $t5,        $zero,  16          \n\t"
1978
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1979
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1980
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1981
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1982
+            "srl        %[c1],      %[c1],  18          \n\t"
1983
+            "srl        %[c2],      %[c2],  18          \n\t"
1984
+            "srl        %[c3],      %[c3],  18          \n\t"
1985
+            "srl        %[c4],      %[c4],  18          \n\t"
1986
+            "slt        %[cond0],   $t4,    %[qc1]      \n\t"
1987
+            "slt        %[cond1],   $t4,    %[qc2]      \n\t"
1988
+            "slt        %[cond2],   $t4,    %[qc3]      \n\t"
1989
+            "slt        %[cond3],   $t4,    %[qc4]      \n\t"
1990
+            "movn       %[qc1],     $t5,    %[cond0]    \n\t"
1991
+            "movn       %[qc2],     $t5,    %[cond1]    \n\t"
1992
+            "movn       %[qc3],     $t5,    %[cond2]    \n\t"
1993
+            "movn       %[qc4],     $t5,    %[cond3]    \n\t"
1994
+
1995
+            ".set pop                                   \n\t"
1996
+
1997
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1998
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1999
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2000
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2001
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
2002
+              [c3]"=&r"(c3), [c4]"=&r"(c4)
2003
+            :
2004
+            : "t4", "t5"
2005
+        );
2006
+
2007
+        curidx = 17 * qc1;
2008
+        curidx += qc2;
2009
+
2010
+        curidx2 = 17 * qc3;
2011
+        curidx2 += qc4;
2012
+
2013
+        curbits += p_bits[curidx];
2014
+        curbits += esc_sign_bits[curidx];
2015
+        vec     = &p_codes[curidx*2];
2016
+
2017
+        curbits += p_bits[curidx2];
2018
+        curbits += esc_sign_bits[curidx2];
2019
+        vec2     = &p_codes[curidx2*2];
2020
+
2021
+        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2022
+        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2023
+        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2024
+        curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2025
+
2026
+        t1 = fabsf(in[i  ]);
2027
+        t2 = fabsf(in[i+1]);
2028
+        t3 = fabsf(in[i+2]);
2029
+        t4 = fabsf(in[i+3]);
2030
+
2031
+        if (cond0) {
2032
+            if (t1 >= CLIPPED_ESCAPE) {
2033
+                di1 = t1 - CLIPPED_ESCAPE;
2034
+            } else {
2035
+                di1 = t1 - c1 * cbrtf(c1) * IQ;
2036
+            }
2037
+        } else
2038
+            di1 = t1 - vec[0] * IQ;
2039
+
2040
+        if (cond1) {
2041
+            if (t2 >= CLIPPED_ESCAPE) {
2042
+                di2 = t2 - CLIPPED_ESCAPE;
2043
+            } else {
2044
+                di2 = t2 - c2 * cbrtf(c2) * IQ;
2045
+            }
2046
+        } else
2047
+            di2 = t2 - vec[1] * IQ;
2048
+
2049
+        if (cond2) {
2050
+            if (t3 >= CLIPPED_ESCAPE) {
2051
+                di3 = t3 - CLIPPED_ESCAPE;
2052
+            } else {
2053
+                di3 = t3 - c3 * cbrtf(c3) * IQ;
2054
+            }
2055
+        } else
2056
+            di3 = t3 - vec2[0] * IQ;
2057
+
2058
+        if (cond3) {
2059
+            if (t4 >= CLIPPED_ESCAPE) {
2060
+                di4 = t4 - CLIPPED_ESCAPE;
2061
+            } else {
2062
+                di4 = t4 - c4 * cbrtf(c4) * IQ;
2063
+            }
2064
+        } else
2065
+            di4 = t4 - vec2[1]*IQ;
2066
+
2067
+        cost += di1 * di1 + di2 * di2
2068
+                + di3 * di3 + di4 * di4;
2069
+    }
2070
+
2071
+    if (bits)
2072
+        *bits = curbits;
2073
+    return cost * lambda + curbits;
2074
+}
2075
+
2076
+/*
+ * Common signature of the per-codebook band cost routines in this file.
+ */
+typedef float (*get_band_cost_mips_fn)(struct AACEncContext *s,
+                                       PutBitContext *pb, const float *in,
+                                       const float *scaled, int size,
+                                       int scale_idx, int cb,
+                                       const float lambda, const float uplim,
+                                       int *bits);
+
+/*
+ * Cost routine for each codebook number; the get_band_cost() macro indexes
+ * this table with its cb argument. Twelve entries, slots 0..11.
+ */
+static const get_band_cost_mips_fn get_band_cost_arr[] = {
+    get_band_cost_ZERO_mips,    /*  0 */
+    get_band_cost_SQUAD_mips,   /*  1 */
+    get_band_cost_SQUAD_mips,   /*  2 */
+    get_band_cost_UQUAD_mips,   /*  3 */
+    get_band_cost_UQUAD_mips,   /*  4 */
+    get_band_cost_SPAIR_mips,   /*  5 */
+    get_band_cost_SPAIR_mips,   /*  6 */
+    get_band_cost_UPAIR7_mips,  /*  7 */
+    get_band_cost_UPAIR7_mips,  /*  8 */
+    get_band_cost_UPAIR12_mips, /*  9 */
+    get_band_cost_UPAIR12_mips, /* 10 */
+    get_band_cost_ESC_mips,     /* 11 */
+};
2094
+
2095
+/*
+ * Forward a band cost request to the routine stored in get_band_cost_arr
+ * for codebook cb. Note that cb is expanded twice (table index and
+ * argument), so it must not have side effects.
+ */
+#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb,               \
+                      lambda, uplim, bits)                                  \
+    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb,           \
+                          lambda, uplim, bits)
2101
+
2102
+/*
+ * Evaluate the cost of coding one band with codebook cb without writing
+ * anything: the cost routine selected from get_band_cost_arr is called with
+ * a NULL PutBitContext. All other arguments are passed through unchanged and
+ * the routine's return value is returned as is.
+ */
+static float quantize_band_cost(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits)
+{
+    float (*const cost_fn)(struct AACEncContext *, PutBitContext *,
+                           const float *, const float *, int, int, int,
+                           const float, const float, int *) = get_band_cost_arr[cb];
+
+    return cost_fn(s, NULL, in, scaled, size, scale_idx, cb,
+                   lambda, uplim, bits);
+}
2109
+/*
+ * Choose scalefactor indices (sce->sf_idx) and codebooks (sce->band_type)
+ * for one channel so that the estimated bit count approaches destbits.
+ *
+ * Steps, as implemented below:
+ *  1. For every band, sum the psychoacoustic thresholds over the window
+ *     group (uplims, scaled by 512) and flag windows whose energy does not
+ *     exceed the threshold, or whose threshold is 0, in sce->zeroes.
+ *  2. Seed sf_idx of each non-zero band from log2f(uplims / minthr).
+ *  3. Repeat up to 10 times: move all scalefactors up or down by qstep
+ *     (32, 16, ... 1 on the first pass, 1 afterwards) depending on whether
+ *     the bit estimate exceeds destbits, then lower the scalefactor of
+ *     bands whose recorded distortion exceeds their limit.
+ *
+ * lambda is not referenced in this function; the cost routines are called
+ * with 1.0f instead.
+ */
2110
+static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
2111
+                                               AACEncContext *s,
2112
+                                               SingleChannelElement *sce,
2113
+                                               const float lambda)
2114
+{
2115
+    int start = 0, i, w, w2, g;
2116
+    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
2117
+    float dists[128] = { 0 }, uplims[128];
2118
+    float maxvals[128];
2119
+    int fflag, minscaler;
2120
+    int its  = 0; /* number of completed outer passes */
2121
+    int allz = 0; /* becomes non-zero once any band is non-zero (allz |= nz) */
2122
+    float minthr = INFINITY;
2123
+    /* destbits: bits for 1024 samples of one channel at the target rate, capped at 5800. */
2124
+    destbits = FFMIN(destbits, 5800);
2125
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2126
+        for (g = 0;  g < sce->ics.num_swb; g++) {
2127
+            int nz = 0;
2128
+            float uplim = 0.0f;
2129
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2130
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
2131
+                uplim += band->threshold;
2132
+                if (band->energy <= band->threshold || band->threshold == 0.0f) {
2133
+                    sce->zeroes[(w+w2)*16+g] = 1;
2134
+                    continue;
2135
+                }
2136
+                nz = 1;
2137
+            }
2138
+            uplims[w*16+g] = uplim *512;
2139
+            sce->zeroes[w*16+g] = !nz; /* group entry is zero only if every window of the group was */
2140
+            if (nz)
2141
+                minthr = FFMIN(minthr, uplim);
2142
+            allz |= nz;
2143
+        }
2144
+    }
2145
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2146
+        for (g = 0;  g < sce->ics.num_swb; g++) {
2147
+            if (sce->zeroes[w*16+g]) {
2148
+                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
2149
+                continue;
2150
+            }
2151
+            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
2152
+        }
2153
+    }
2154
+    /* Nothing left to do when no band is non-zero. */
2155
+    if (!allz)
2156
+        return;
2157
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2158
+    /* Per-band value handed to find_min_book(); presumably the largest scaled coefficient - confirm in find_max_val. */
2159
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2160
+        start = w*128;
2161
+        for (g = 0;  g < sce->ics.num_swb; g++) {
2162
+            const float *scaled = s->scoefs + start;
2163
+            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
2164
+            start += sce->ics.swb_sizes[g];
2165
+        }
2166
+    }
2167
+    /* Outer loop: repeated while a scalefactor changed in the last pass, at most 10 times. */
2168
+    do {
2169
+        int tbits, qstep;
2170
+        minscaler = sce->sf_idx[0];
2171
+        qstep = its ? 1 : 32; /* coarse-to-fine steps on the first pass only */
2172
+        do {
2173
+            int prev = -1; /* sf_idx of the previous coded band, -1 before the first */
2174
+            tbits = 0;
2175
+            fflag = 0;
2176
+            /* qstep > 1: only the bit count is needed; qstep == 1: distortion is recorded too. */
2177
+            if (qstep > 1) {
2178
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2179
+                    start = w*128;
2180
+                    for (g = 0;  g < sce->ics.num_swb; g++) {
2181
+                        const float *coefs = sce->coeffs + start;
2182
+                        const float *scaled = s->scoefs + start;
2183
+                        int bits = 0;
2184
+                        int cb;
2185
+
2186
+                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2187
+                            start += sce->ics.swb_sizes[g];
2188
+                            continue;
2189
+                        }
2190
+                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2191
+                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2192
+                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2193
+                            int b; /* written through &b but not read in this branch */
2194
+                            bits += quantize_band_cost_bits(s, coefs + w2*128,
2195
+                                                            scaled + w2*128,
2196
+                                                            sce->ics.swb_sizes[g],
2197
+                                                            sce->sf_idx[w*16+g],
2198
+                                                            cb,
2199
+                                                            1.0f,
2200
+                                                            INFINITY,
2201
+                                                            &b);
2202
+                        }
2203
+                        if (prev != -1) {
2204
+                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2205
+                        }
2206
+                        tbits += bits;
2207
+                        start += sce->ics.swb_sizes[g];
2208
+                        prev = sce->sf_idx[w*16+g];
2209
+                    }
2210
+                }
2211
+            }
2212
+            else {
2213
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2214
+                    start = w*128;
2215
+                    for (g = 0;  g < sce->ics.num_swb; g++) {
2216
+                        const float *coefs = sce->coeffs + start;
2217
+                        const float *scaled = s->scoefs + start;
2218
+                        int bits = 0;
2219
+                        int cb;
2220
+                        float dist = 0.0f;
2221
+
2222
+                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2223
+                            start += sce->ics.swb_sizes[g];
2224
+                            continue;
2225
+                        }
2226
+                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2227
+                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2228
+                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2229
+                            int b;
2230
+                            dist += quantize_band_cost(s, coefs + w2*128,
2231
+                                                       scaled + w2*128,
2232
+                                                       sce->ics.swb_sizes[g],
2233
+                                                       sce->sf_idx[w*16+g],
2234
+                                                       cb,
2235
+                                                       1.0f,
2236
+                                                       INFINITY,
2237
+                                                       &b);
2238
+                            bits += b;
2239
+                        }
2240
+                        dists[w*16+g] = dist - bits; /* drop the bit count from the summed cost (the ESC routine returns cost * lambda + bits) */
2241
+                        if (prev != -1) {
2242
+                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2243
+                        }
2244
+                        tbits += bits;
2245
+                        start += sce->ics.swb_sizes[g];
2246
+                        prev = sce->sf_idx[w*16+g];
2247
+                    }
2248
+                }
2249
+            }
2250
+            if (tbits > destbits) { /* over budget: raise scalefactor indices */
2251
+                for (i = 0; i < 128; i++)
2252
+                    if (sce->sf_idx[i] < 218 - qstep)
2253
+                        sce->sf_idx[i] += qstep;
2254
+            } else { /* within budget: lower scalefactor indices */
2255
+                for (i = 0; i < 128; i++)
2256
+                    if (sce->sf_idx[i] > 60 - qstep)
2257
+                        sce->sf_idx[i] -= qstep;
2258
+            }
2259
+            qstep >>= 1;
2260
+            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
2261
+                qstep = 1; /* still more than 2% over budget: keep stepping by 1 */
2262
+        } while (qstep);
2263
+        /* Lower the scalefactor of bands whose distortion exceeds the limit, clamp, and pick codebooks. */
2264
+        fflag = 0;
2265
+        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
2266
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2267
+            for (g = 0; g < sce->ics.num_swb; g++) {
2268
+                int prevsc = sce->sf_idx[w*16+g];
2269
+                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
2270
+                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
2271
+                        sce->sf_idx[w*16+g]--;
2272
+                    else
2273
+                        sce->sf_idx[w*16+g]-=2; /* find_min_book returned 0 one step down: go down two */
2274
+                }
2275
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
2276
+                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
2277
+                if (sce->sf_idx[w*16+g] != prevsc)
2278
+                    fflag = 1; /* something changed: run another outer pass */
2279
+                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2280
+            }
2281
+        }
2282
+        its++;
2283
+    } while (fflag && its < 10);
2284
+}
2285
+
2286
+/*
+ * Decide, for every scalefactor band of a channel pair, whether coding the
+ * band as mid/side is cheaper than coding it as left/right, and store the
+ * decision in cpe->ms_mask[w*16+g] (1 when mid/side wins).
+ *
+ * Nothing is done unless cpe->common_window is set. Bands that are flagged
+ * zero in either channel are skipped and their mask entry is left untouched.
+ *
+ * For each window of a group the routine builds
+ *     M = (L + R) * 0.5    and    S = M - R
+ * and compares the summed quantize_band_cost() of L and R (dist1) with that
+ * of M and S (dist2). Left and right are weighted with lambda divided by
+ * their own band threshold, mid with lambda / max(threshold) and side with
+ * lambda / min(threshold).
+ *
+ * s->scoefs is used as scratch space: four consecutive 128-float regions
+ * hold the scaled left, right, mid and side coefficients.
+ *
+ * The inner loop handles four coefficients per iteration, so band sizes are
+ * assumed to be multiples of 4 and no larger than 128 (the size of M and S).
+ */
+static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe,
+                               const float lambda)
+{
+    int start, i, w, w2, g;
+    float M[128], S[128];
+    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    if (!cpe->common_window)
+        return;
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        /*
+         * Coefficients of window k start at k*128. The offset has to be
+         * re-based for every group; letting it accumulate across groups
+         * addresses the wrong window as soon as an earlier group spans
+         * more than one window. This matches the two-loop quantizer search.
+         */
+        start = w*128;
+        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
+                float dist1 = 0.0f, dist2 = 0.0f;
+                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                    float minthr = FFMIN(band0->threshold, band1->threshold);
+                    float maxthr = FFMAX(band0->threshold, band1->threshold);
+                    const float *lcoef = sce0->coeffs + start + w2*128;
+                    const float *rcoef = sce1->coeffs + start + w2*128;
+
+                    /* Build mid and side, four coefficients at a time. */
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
+                        M[i  ] = (lcoef[i  ] + rcoef[i  ]) * 0.5;
+                        M[i+1] = (lcoef[i+1] + rcoef[i+1]) * 0.5;
+                        M[i+2] = (lcoef[i+2] + rcoef[i+2]) * 0.5;
+                        M[i+3] = (lcoef[i+3] + rcoef[i+3]) * 0.5;
+
+                        S[i  ] = M[i  ] - rcoef[i  ];
+                        S[i+1] = M[i+1] - rcoef[i+1];
+                        S[i+2] = M[i+2] - rcoef[i+2];
+                        S[i+3] = M[i+3] - rcoef[i+3];
+                    }
+                    abs_pow34_v(L34, lcoef, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(R34, rcoef, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(M34, M,     sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S,     sce0->ics.swb_sizes[g]);
+
+                    /* dist1: cost of keeping left and right separate. */
+                    dist1 += quantize_band_cost(s, lcoef,
+                                                L34,
+                                                sce0->ics.swb_sizes[g],
+                                                sce0->sf_idx[(w+w2)*16+g],
+                                                sce0->band_type[(w+w2)*16+g],
+                                                lambda / band0->threshold, INFINITY, NULL);
+                    dist1 += quantize_band_cost(s, rcoef,
+                                                R34,
+                                                sce1->ics.swb_sizes[g],
+                                                sce1->sf_idx[(w+w2)*16+g],
+                                                sce1->band_type[(w+w2)*16+g],
+                                                lambda / band1->threshold, INFINITY, NULL);
+                    /* dist2: cost of coding mid and side instead. */
+                    dist2 += quantize_band_cost(s, M,
+                                                M34,
+                                                sce0->ics.swb_sizes[g],
+                                                sce0->sf_idx[(w+w2)*16+g],
+                                                sce0->band_type[(w+w2)*16+g],
+                                                lambda / maxthr, INFINITY, NULL);
+                    dist2 += quantize_band_cost(s, S,
+                                                S34,
+                                                sce1->ics.swb_sizes[g],
+                                                sce1->sf_idx[(w+w2)*16+g],
+                                                sce1->band_type[(w+w2)*16+g],
+                                                lambda / minthr, INFINITY, NULL);
+                }
+                cpe->ms_mask[w*16+g] = dist2 < dist1;
+            }
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+}
2359
+#endif /*HAVE_MIPSFPU */
2360
+/*
+ * Choose a codebook for every scalefactor band of one window group and
+ * write the resulting sections (4-bit codebook number followed by the run
+ * length in run_bits-wide fields) to s->pb. sce->band_type and sce->zeroes
+ * are updated to match the chosen codebooks.
+ *
+ * path[swb][cb] stores the lowest cost found for reaching band position swb
+ * with codebook cb, together with the run length so far and the codebook
+ * used before the current run started.
+ *
+ * win        index of the first window of the group
+ * group_len  number of windows in the group
+ * lambda     not referenced here; the cost routine is called with 0
+ */
2360
+static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
2361
+                                       int win, int group_len, const float lambda)
2362
+{
2363
+    BandCodingPath path[120][12];
2364
+    int w, swb, cb, start, size;
2365
+    int i, j;
2366
+    const int max_sfb  = sce->ics.max_sfb;
2367
+    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
2368
+    const int run_esc  = (1 << run_bits) - 1; /* all-ones run field: more run bits follow */
2369
+    int idx, ppos, count;
2370
+    int stackrun[120], stackcb[120], stack_len;
2371
+    float next_minbits = INFINITY;
2372
+    int next_mincb = 0;
2373
+    /* Start state: every codebook opens a new section (4 codebook bits plus one run field). */
2374
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2375
+    start = win*128;
2376
+    for (cb = 0; cb < 12; cb++) {
2377
+        path[0][cb].cost     = run_bits+4;
2378
+        path[0][cb].prev_idx = -1;
2379
+        path[0][cb].run      = 0;
2380
+    }
2381
+    for (swb = 0; swb < max_sfb; swb++) {
2382
+        size = sce->ics.swb_sizes[swb];
2383
+        if (sce->zeroes[win*16 + swb]) { /* zero band: only codebook 0 is considered */
2384
+            float cost_stay_here = path[swb][0].cost;
2385
+            float cost_get_here  = next_minbits + run_bits + 4;
2386
+            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
2387
+                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
2388
+                cost_stay_here += run_bits; /* extending the run needs one more run field */
2389
+            if (cost_get_here < cost_stay_here) {
2390
+                path[swb+1][0].prev_idx = next_mincb;
2391
+                path[swb+1][0].cost     = cost_get_here;
2392
+                path[swb+1][0].run      = 1;
2393
+            } else {
2394
+                path[swb+1][0].prev_idx = 0;
2395
+                path[swb+1][0].cost     = cost_stay_here;
2396
+                path[swb+1][0].run      = path[swb][0].run + 1;
2397
+            }
2398
+            next_minbits = path[swb+1][0].cost;
2399
+            next_mincb = 0;
2400
+            for (cb = 1; cb < 12; cb++) {
2401
+                path[swb+1][cb].cost = 61450; /* large cost; presumably keeps this state from being chosen */
2402
+                path[swb+1][cb].prev_idx = -1;
2403
+                path[swb+1][cb].run = 0;
2404
+            }
2405
+        } else {
2406
+            float minbits = next_minbits;
2407
+            int mincb = next_mincb;
2408
+            int startcb = sce->band_type[win*16+swb]; /* codebooks below the current band type get the large cost */
2409
+            next_minbits = INFINITY;
2410
+            next_mincb = 0;
2411
+            for (cb = 0; cb < startcb; cb++) {
2412
+                path[swb+1][cb].cost = 61450;
2413
+                path[swb+1][cb].prev_idx = -1;
2414
+                path[swb+1][cb].run = 0;
2415
+            }
2416
+            for (cb = startcb; cb < 12; cb++) {
2417
+                float cost_stay_here, cost_get_here;
2418
+                float bits = 0.0f;
2419
+                for (w = 0; w < group_len; w++) {
2420
+                    bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
2421
+                                                    s->scoefs + start + w*128, size,
2422
+                                                    sce->sf_idx[(win+w)*16+swb], cb,
2423
+                                                    0, INFINITY, NULL);
2424
+                }
2425
+                cost_stay_here = path[swb][cb].cost + bits;
2426
+                cost_get_here  = minbits            + bits + run_bits + 4;
2427
+                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
2428
+                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
2429
+                    cost_stay_here += run_bits;
2430
+                if (cost_get_here < cost_stay_here) {
2431
+                    path[swb+1][cb].prev_idx = mincb;
2432
+                    path[swb+1][cb].cost     = cost_get_here;
2433
+                    path[swb+1][cb].run      = 1;
2434
+                } else {
2435
+                    path[swb+1][cb].prev_idx = cb;
2436
+                    path[swb+1][cb].cost     = cost_stay_here;
2437
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
2438
+                }
2439
+                if (path[swb+1][cb].cost < next_minbits) {
2440
+                    next_minbits = path[swb+1][cb].cost;
2441
+                    next_mincb = cb;
2442
+                }
2443
+            }
2444
+        }
2445
+        start += sce->ics.swb_sizes[swb];
2446
+    }
2447
+    /* Take the cheapest end state and walk the prev_idx links back, one run per step. */
2448
+    stack_len = 0;
2449
+    idx       = 0;
2450
+    for (cb = 1; cb < 12; cb++)
2451
+        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
2452
+            idx = cb;
2453
+    ppos = max_sfb;
2454
+    while (ppos > 0) {
2455
+        av_assert1(idx >= 0);
2456
+        cb = idx;
2457
+        stackrun[stack_len] = path[ppos][cb].run;
2458
+        stackcb [stack_len] = cb;
2459
+        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx; /* codebook in use before this run began */
2460
+        ppos -= path[ppos][cb].run;
2461
+        stack_len++;
2462
+    }
2463
+    /* The stack holds the runs last-to-first; pop it to emit them in band order. */
2464
+    start = 0;
2465
+    for (i = stack_len - 1; i >= 0; i--) {
2466
+        put_bits(&s->pb, 4, stackcb[i]);
2467
+        count = stackrun[i];
2468
+        memset(sce->zeroes + win*16 + start, !stackcb[i], count); /* bands coded with codebook 0 are flagged zero */
2469
+        for (j = 0; j < count; j++) {
2470
+            sce->band_type[win*16 + start] =  stackcb[i];
2471
+            start++;
2472
+        }
2473
+        while (count >= run_esc) { /* long run: emit escape fields until the remainder fits */
2474
+            put_bits(&s->pb, run_bits, run_esc);
2475
+            count -= run_esc;
2476
+        }
2477
+        put_bits(&s->pb, run_bits, count);
2478
+    }
2479
+}
2481
+#endif /* HAVE_INLINE_ASM */
2482
+
2483
+/*
+ * Install the MIPS-optimized routines into the coefficient coder selected
+ * for this encoder context.
+ *
+ * Only coder option 2 is affected (presumably the two-loop coder, judging by
+ * the name of the search routine installed - confirm against the coder
+ * enum). The function pointers are written through c->coder, i.e. into the
+ * coder table entry itself. Without inline assembly support this function
+ * does nothing; the quantizer and M/S searches are replaced only when an
+ * FPU is available.
+ */
+void ff_aac_coder_init_mips(AACEncContext *c) {
+#if HAVE_INLINE_ASM
+    AACCoefficientsEncoder *coder;
+
+    if (c->options.aac_coder != 2)
+        return;
+
+    coder = c->coder;
+    coder->encode_window_bands_info = codebook_trellis_rate_mips;
+    coder->quantize_and_encode_band = quantize_and_encode_band_mips;
+#if HAVE_MIPSFPU
+    coder->search_for_ms            = search_for_ms_mips;
+    coder->search_for_quantizers    = search_for_quantizers_twoloop_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}