Browse code

Optimization of AC3 floating point decoder for MIPS

The FFT in the MIPS implementation works iteratively instead
of recursively calling functions for smaller FFT sizes.
Some DSP and format-conversion utility functions are also optimized.

Signed-off-by: Nedeljko Babic <nbabic@mips.com>
Reviewed-by: Vitor Sessak <vitor1001@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

Nedeljko Babic authored on 2012/09/05 01:43:34
Showing 13 changed files
... ...
@@ -59,6 +59,7 @@ Files that have MIPS copyright notice in them:
59 59
       dsputil_mips.c
60 60
       fft_mips.c
61 61
       fft_table.h
62
+      fft_init_table.c
62 63
       fmtconvert_mips.c
63 64
       mpegaudiodsp_mips_fixed.c
64 65
       mpegaudiodsp_mips_float.c
... ...
@@ -3173,6 +3173,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
3173 3173
     if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
3174 3174
     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
3175 3175
     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
3176
+    if (HAVE_MIPSFPU)    ff_dsputil_init_mips  (c, avctx);
3176 3177
 
3177 3178
     for (i = 0; i < 4; i++) {
3178 3179
         for (j = 0; j < 16; j++) {
... ...
@@ -622,6 +622,7 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
622 622
 void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
623 623
 void ff_dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
624 624
 void ff_dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
625
+void ff_dsputil_init_mips(DSPContext* c, AVCodecContext *avctx);
625 626
 
626 627
 void ff_dsputil_init_dwt(DSPContext *c);
627 628
 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
... ...
@@ -162,6 +162,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
162 162
     if (HAVE_ALTIVEC) ff_fft_init_altivec(s);
163 163
     if (HAVE_MMX)     ff_fft_init_mmx(s);
164 164
     if (CONFIG_MDCT)  s->mdct_calcw = s->mdct_calc;
165
+    if (HAVE_MIPSFPU) ff_fft_init_mips(s);
165 166
 #else
166 167
     if (CONFIG_MDCT)  s->mdct_calcw = ff_mdct_calcw_c;
167 168
     if (ARCH_ARM)     ff_fft_fixed_init_arm(s);
... ...
@@ -137,6 +137,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse);
137 137
 void ff_fft_init_altivec(FFTContext *s);
138 138
 void ff_fft_init_mmx(FFTContext *s);
139 139
 void ff_fft_init_arm(FFTContext *s);
140
+void ff_fft_init_mips(FFTContext *s);
140 141
 #else
141 142
 void ff_fft_fixed_init_arm(FFTContext *s);
142 143
 #endif
... ...
@@ -86,6 +86,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
86 86
     if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
87 87
     if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
88 88
     if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
89
+    if (HAVE_MIPSFPU) ff_fmt_convert_init_mips(c);
89 90
 }
90 91
 
91 92
 /* ffdshow custom code */
... ...
@@ -92,6 +92,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
92 92
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
93 93
 void ff_fmt_convert_init_altivec(FmtConvertContext *c, AVCodecContext *avctx);
94 94
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
95
+void ff_fmt_convert_init_mips(FmtConvertContext *c);
95 96
 
96 97
 /* ffdshow custom code */
97 98
 void float_interleave(float *dst, const float **src, long len, int channels);
... ...
@@ -13,3 +13,7 @@ MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER)      += mips/acelp_filters_mips.o     \
13 13
                                              mips/acelp_vectors_mips.o
14 14
 MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP)       += mips/mpegaudiodsp_mips_float.o
15 15
 MIPSDSPR1-OBJS-$(CONFIG_MPEGAUDIODSP)     += mips/mpegaudiodsp_mips_fixed.o
16
+OBJS-$(CONFIG_FFT)                        += mips/fft_init_table.o
17
+MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
18
+MIPSFPU-OBJS-$(HAVE_INLINE_ASM)           += mips/fmtconvert_mips.o
19
+MIPSFPU-OBJS-$(HAVE_INLINE_ASM)           += mips/dsputil_mips.o
16 20
new file mode 100644
... ...
@@ -0,0 +1,164 @@
0
+/*
1
+ * Copyright (c) 2012
2
+ *      MIPS Technologies, Inc., California.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions
6
+ * are met:
7
+ * 1. Redistributions of source code must retain the above copyright
8
+ *    notice, this list of conditions and the following disclaimer.
9
+ * 2. Redistributions in binary form must reproduce the above copyright
10
+ *    notice, this list of conditions and the following disclaimer in the
11
+ *    documentation and/or other materials provided with the distribution.
12
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
13
+ *    contributors may be used to endorse or promote products derived from
14
+ *    this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
20
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
+ * SUCH DAMAGE.
27
+ *
28
+ * Author:  Zoran Lukic (zoranl@mips.com)
29
+ *
30
+ * This file is part of FFmpeg.
31
+ *
32
+ * FFmpeg is free software; you can redistribute it and/or
33
+ * modify it under the terms of the GNU Lesser General Public
34
+ * License as published by the Free Software Foundation; either
35
+ * version 2.1 of the License, or (at your option) any later version.
36
+ *
37
+ * FFmpeg is distributed in the hope that it will be useful,
38
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
39
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
40
+ * Lesser General Public License for more details.
41
+ *
42
+ * You should have received a copy of the GNU Lesser General Public
43
+ * License along with FFmpeg; if not, write to the Free Software
44
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
45
+ */
46
+#include "config.h"
47
+#include "libavcodec/dsputil.h"
48
+
49
+static void vector_fmul_window_mips(float *dst, const float *src0,
50
+        const float *src1, const float *win, int len)
51
+{
52
+    int i, j;
53
+    /*
54
+     * variables used in inline assembler
55
+     */
56
+    float * dst_i, * dst_j, * dst_i2, * dst_j2;
57
+    float temp, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
58
+
59
+    dst  += len;
60
+    win  += len;
61
+    src0 += len;
62
+
63
+    for (i = -len, j = len - 1; i < 0; i += 8, j -= 8) {
64
+
65
+        dst_i = dst + i;
66
+        dst_j = dst + j;
67
+
68
+        dst_i2 = dst + i + 4;
69
+        dst_j2 = dst + j - 4;
70
+
71
+        __asm__ volatile (
72
+            "mul.s   %[temp],   %[s1],       %[wi]            \n\t"
73
+            "mul.s   %[temp1],  %[s1],       %[wj]            \n\t"
74
+            "mul.s   %[temp2],  %[s11],      %[wi1]           \n\t"
75
+            "mul.s   %[temp3],  %[s11],      %[wj1]           \n\t"
76
+
77
+            "msub.s  %[temp],   %[temp],     %[s0],  %[wj]    \n\t"
78
+            "madd.s  %[temp1],  %[temp1],    %[s0],  %[wi]    \n\t"
79
+            "msub.s  %[temp2],  %[temp2],    %[s01], %[wj1]   \n\t"
80
+            "madd.s  %[temp3],  %[temp3],    %[s01], %[wi1]   \n\t"
81
+
82
+            "swc1    %[temp],   0(%[dst_i])                   \n\t" /* dst[i] = s0*wj - s1*wi; */
83
+            "swc1    %[temp1],  0(%[dst_j])                   \n\t" /* dst[j] = s0*wi + s1*wj; */
84
+            "swc1    %[temp2],  4(%[dst_i])                   \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
85
+            "swc1    %[temp3], -4(%[dst_j])                   \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
86
+
87
+            "mul.s   %[temp4],  %[s12],      %[wi2]           \n\t"
88
+            "mul.s   %[temp5],  %[s12],      %[wj2]           \n\t"
89
+            "mul.s   %[temp6],  %[s13],      %[wi3]           \n\t"
90
+            "mul.s   %[temp7],  %[s13],      %[wj3]           \n\t"
91
+
92
+            "msub.s  %[temp4],  %[temp4],    %[s02], %[wj2]   \n\t"
93
+            "madd.s  %[temp5],  %[temp5],    %[s02], %[wi2]   \n\t"
94
+            "msub.s  %[temp6],  %[temp6],    %[s03], %[wj3]   \n\t"
95
+            "madd.s  %[temp7],  %[temp7],    %[s03], %[wi3]   \n\t"
96
+
97
+            "swc1    %[temp4],  8(%[dst_i])                   \n\t" /* dst[i+2] = s02*wj2 - s12*wi2; */
98
+            "swc1    %[temp5], -8(%[dst_j])                   \n\t" /* dst[j-2] = s02*wi2 + s12*wj2; */
99
+            "swc1    %[temp6],  12(%[dst_i])                  \n\t" /* dst[i+2] = s03*wj3 - s13*wi3; */
100
+            "swc1    %[temp7], -12(%[dst_j])                  \n\t" /* dst[j-3] = s03*wi3 + s13*wj3; */
101
+            : [temp]"=&f"(temp),  [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
102
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
103
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7)
104
+            : [dst_j]"r"(dst_j),     [dst_i]"r" (dst_i),
105
+              [s0] "f"(src0[i]),     [wj] "f"(win[j]),     [s1] "f"(src1[j]),
106
+              [wi] "f"(win[i]),      [s01]"f"(src0[i + 1]),[wj1]"f"(win[j - 1]),
107
+              [s11]"f"(src1[j - 1]), [wi1]"f"(win[i + 1]), [s02]"f"(src0[i + 2]),
108
+              [wj2]"f"(win[j - 2]),  [s12]"f"(src1[j - 2]),[wi2]"f"(win[i + 2]),
109
+              [s03]"f"(src0[i + 3]), [wj3]"f"(win[j - 3]), [s13]"f"(src1[j - 3]),
110
+              [wi3]"f"(win[i + 3])
111
+            : "memory"
112
+        );
113
+
114
+        __asm__ volatile (
115
+            "mul.s  %[temp],   %[s1],       %[wi]            \n\t"
116
+            "mul.s  %[temp1],  %[s1],       %[wj]            \n\t"
117
+            "mul.s  %[temp2],  %[s11],      %[wi1]           \n\t"
118
+            "mul.s  %[temp3],  %[s11],      %[wj1]           \n\t"
119
+
120
+            "msub.s %[temp],   %[temp],     %[s0],  %[wj]    \n\t"
121
+            "madd.s %[temp1],  %[temp1],    %[s0],  %[wi]    \n\t"
122
+            "msub.s %[temp2],  %[temp2],    %[s01], %[wj1]   \n\t"
123
+            "madd.s %[temp3],  %[temp3],    %[s01], %[wi1]   \n\t"
124
+
125
+            "swc1   %[temp],   0(%[dst_i2])                  \n\t" /* dst[i] = s0*wj - s1*wi; */
126
+            "swc1   %[temp1],  0(%[dst_j2])                  \n\t" /* dst[j] = s0*wi + s1*wj; */
127
+            "swc1   %[temp2],  4(%[dst_i2])                  \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
128
+            "swc1   %[temp3], -4(%[dst_j2])                  \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
129
+
130
+            "mul.s  %[temp4],  %[s12],      %[wi2]           \n\t"
131
+            "mul.s  %[temp5],  %[s12],      %[wj2]           \n\t"
132
+            "mul.s  %[temp6],  %[s13],      %[wi3]           \n\t"
133
+            "mul.s  %[temp7],  %[s13],      %[wj3]           \n\t"
134
+
135
+            "msub.s %[temp4],  %[temp4],    %[s02], %[wj2]   \n\t"
136
+            "madd.s %[temp5],  %[temp5],    %[s02], %[wi2]   \n\t"
137
+            "msub.s %[temp6],  %[temp6],    %[s03], %[wj3]   \n\t"
138
+            "madd.s %[temp7],  %[temp7],    %[s03], %[wi3]   \n\t"
139
+
140
+            "swc1   %[temp4],  8(%[dst_i2])                  \n\t" /* dst[i+2] = s02*wj2 - s12*wi2; */
141
+            "swc1   %[temp5], -8(%[dst_j2])                  \n\t" /* dst[j-2] = s02*wi2 + s12*wj2; */
142
+            "swc1   %[temp6],  12(%[dst_i2])                 \n\t" /* dst[i+2] = s03*wj3 - s13*wi3; */
143
+            "swc1   %[temp7], -12(%[dst_j2])                 \n\t" /* dst[j-3] = s03*wi3 + s13*wj3; */
144
+            : [temp]"=&f"(temp),
145
+              [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
146
+              [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
147
+              [temp7]  "=&f" (temp7)
148
+            : [dst_j2]"r"(dst_j2),   [dst_i2]"r"(dst_i2),
149
+              [s0] "f"(src0[i + 4]), [wj] "f"(win[j - 4]), [s1] "f"(src1[j - 4]),
150
+              [wi] "f"(win[i + 4]),  [s01]"f"(src0[i + 5]),[wj1]"f"(win[j - 5]),
151
+              [s11]"f"(src1[j - 5]), [wi1]"f"(win[i + 5]), [s02]"f"(src0[i + 6]),
152
+              [wj2]"f"(win[j - 6]),  [s12]"f"(src1[j - 6]),[wi2]"f"(win[i + 6]),
153
+              [s03]"f"(src0[i + 7]), [wj3]"f"(win[j - 7]), [s13]"f"(src1[j - 7]),
154
+              [wi3]"f"(win[i + 7])
155
+            : "memory"
156
+        );
157
+    }
158
+}
159
+
160
+av_cold void ff_dsputil_init_mips( DSPContext* c, AVCodecContext *avctx )
161
+{
162
+    c->vector_fmul_window = vector_fmul_window_mips;
163
+}
0 164
new file mode 100644
... ...
@@ -0,0 +1,67 @@
0
+/*
1
+ * Copyright (c) 2012
2
+ *      MIPS Technologies, Inc., California.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions
6
+ * are met:
7
+ * 1. Redistributions of source code must retain the above copyright
8
+ *    notice, this list of conditions and the following disclaimer.
9
+ * 2. Redistributions in binary form must reproduce the above copyright
10
+ *    notice, this list of conditions and the following disclaimer in the
11
+ *    documentation and/or other materials provided with the distribution.
12
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
13
+ *    contributors may be used to endorse or promote products derived from
14
+ *    this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
20
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
+ * SUCH DAMAGE.
27
+ *
28
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
29
+ *
30
+ * This file is part of FFmpeg.
31
+ *
32
+ * FFmpeg is free software; you can redistribute it and/or
33
+ * modify it under the terms of the GNU Lesser General Public
34
+ * License as published by the Free Software Foundation; either
35
+ * version 2.1 of the License, or (at your option) any later version.
36
+ *
37
+ * FFmpeg is distributed in the hope that it will be useful,
38
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
39
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
40
+ * Lesser General Public License for more details.
41
+ *
42
+ * You should have received a copy of the GNU Lesser General Public
43
+ * License along with FFmpeg; if not, write to the Free Software
44
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
45
+ */
46
+
47
+/**
48
+ * @file
49
+ * definitions and initialization of LUT table for MIPS FFT
50
+ */
51
+#include "fft_table.h"
52
+
53
+uint16_t fft_offsets_lut[0x2aab];
54
+
55
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index)
56
+{
57
+    if (size < 16) {
58
+        table[*index] = off >> 2;
59
+        (*index)++;
60
+    }
61
+    else {
62
+        ff_fft_lut_init(table, off, size>>1, index);
63
+        ff_fft_lut_init(table, off+(size>>1), size>>2, index);
64
+        ff_fft_lut_init(table, off+3*(size>>2), size>>2, index);
65
+    }
66
+}
0 67
new file mode 100644
... ...
@@ -0,0 +1,530 @@
0
+/*
1
+ * Copyright (c) 2012
2
+ *      MIPS Technologies, Inc., California.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions
6
+ * are met:
7
+ * 1. Redistributions of source code must retain the above copyright
8
+ *    notice, this list of conditions and the following disclaimer.
9
+ * 2. Redistributions in binary form must reproduce the above copyright
10
+ *    notice, this list of conditions and the following disclaimer in the
11
+ *    documentation and/or other materials provided with the distribution.
12
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
13
+ *    contributors may be used to endorse or promote products derived from
14
+ *    this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
20
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
+ * SUCH DAMAGE.
27
+ *
28
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
29
+ * Author:  Zoran Lukic (zoranl@mips.com)
30
+ *
31
+ * Optimized MDCT/IMDCT and FFT transforms
32
+ *
33
+ * This file is part of FFmpeg.
34
+ *
35
+ * FFmpeg is free software; you can redistribute it and/or
36
+ * modify it under the terms of the GNU Lesser General Public
37
+ * License as published by the Free Software Foundation; either
38
+ * version 2.1 of the License, or (at your option) any later version.
39
+ *
40
+ * FFmpeg is distributed in the hope that it will be useful,
41
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
42
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
43
+ * Lesser General Public License for more details.
44
+ *
45
+ * You should have received a copy of the GNU Lesser General Public
46
+ * License along with FFmpeg; if not, write to the Free Software
47
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
48
+ */
49
+#include "config.h"
50
+#include "libavcodec/fft.h"
51
+#include "fft_table.h"
52
+
53
+/**
54
+ * FFT transform
55
+ */
56
+
57
+#if HAVE_INLINE_ASM
58
+static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
59
+{
60
+    int nbits, i, n, num_transforms, offset, step;
61
+    int n4, n2, n34;
62
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
63
+    FFTComplex *tmpz;
64
+    float w_re, w_im;
65
+    float *w_re_ptr, *w_im_ptr;
66
+    const int fft_size = (1 << s->nbits);
67
+    int s_n = s->nbits;
68
+    int tem1, tem2;
69
+    float pom,  pom1,  pom2,  pom3;
70
+    float temp, temp1, temp3, temp4;
71
+    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
72
+    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
73
+
74
+    /**
75
+    *num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
76
+    */
77
+    __asm__ volatile (
78
+        "li   %[tem1], 16                                      \n\t"
79
+        "sub  %[s_n],  %[tem1], %[s_n]                         \n\t"
80
+        "li   %[tem2], 10923                                   \n\t"
81
+        "srav %[tem2], %[tem2], %[s_n]                         \n\t"
82
+        "ori  %[num_t],%[tem2], 1                              \n\t"
83
+        : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
84
+          [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
85
+    );
86
+
87
+
88
+    for (n=0; n<num_transforms; n++) {
89
+        offset = fft_offsets_lut[n] << 2;
90
+        tmpz = z + offset;
91
+
92
+        tmp1 = tmpz[0].re + tmpz[1].re;
93
+        tmp5 = tmpz[2].re + tmpz[3].re;
94
+        tmp2 = tmpz[0].im + tmpz[1].im;
95
+        tmp6 = tmpz[2].im + tmpz[3].im;
96
+        tmp3 = tmpz[0].re - tmpz[1].re;
97
+        tmp8 = tmpz[2].im - tmpz[3].im;
98
+        tmp4 = tmpz[0].im - tmpz[1].im;
99
+        tmp7 = tmpz[2].re - tmpz[3].re;
100
+
101
+        tmpz[0].re = tmp1 + tmp5;
102
+        tmpz[2].re = tmp1 - tmp5;
103
+        tmpz[0].im = tmp2 + tmp6;
104
+        tmpz[2].im = tmp2 - tmp6;
105
+        tmpz[1].re = tmp3 + tmp8;
106
+        tmpz[3].re = tmp3 - tmp8;
107
+        tmpz[1].im = tmp4 - tmp7;
108
+        tmpz[3].im = tmp4 + tmp7;
109
+
110
+    }
111
+
112
+    if (fft_size < 8)
113
+        return;
114
+
115
+    num_transforms = (num_transforms >> 1) | 1;
116
+
117
+    for (n=0; n<num_transforms; n++) {
118
+        offset = fft_offsets_lut[n] << 3;
119
+        tmpz = z + offset;
120
+
121
+        __asm__ volatile (
122
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
123
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
124
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
125
+            "lwc1  %[pom1], 56(%[tmpz])                     \n\t"
126
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
127
+            "lwc1  %[pom2], 44(%[tmpz])                     \n\t"
128
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
129
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
130
+            "add.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re + tmpz[5].re;
131
+            "add.s %[tmp3], %[tmp3],    %[pom1]             \n\t"  // tmp3 = tmpz[6].re + tmpz[7].re;
132
+            "add.s %[tmp2], %[tmp2],    %[pom2]             \n\t"  // tmp2 = tmpz[4].im + tmpz[5].im;
133
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
134
+            "add.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im + tmpz[7].im;
135
+            "add.s %[tmp5], %[tmp1],    %[tmp3]             \n\t"  // tmp5 = tmp1 + tmp3;
136
+            "sub.s %[tmp7], %[tmp1],    %[tmp3]             \n\t"  // tmp7 = tmp1 - tmp3;
137
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
138
+            "lwc1  %[pom1], 44(%[tmpz])                     \n\t"
139
+            "add.s %[tmp6], %[tmp2],    %[tmp4]             \n\t"  // tmp6 = tmp2 + tmp4;
140
+            "sub.s %[tmp8], %[tmp2],    %[tmp4]             \n\t"  // tmp8 = tmp2 - tmp4;
141
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
142
+            "lwc1  %[pom2], 56(%[tmpz])                     \n\t"
143
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
144
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
145
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
146
+            "sub.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re - tmpz[5].re;
147
+            "lwc1  %[pom],  0(%[tmpz])                      \n\t"
148
+            "sub.s %[tmp2], %[tmp2],    %[pom1]             \n\t"  // tmp2 = tmpz[4].im - tmpz[5].im;
149
+            "sub.s %[tmp3], %[tmp3],    %[pom2]             \n\t"  // tmp3 = tmpz[6].re - tmpz[7].re;
150
+            "lwc1  %[pom2], 4(%[tmpz])                      \n\t"
151
+            "sub.s %[pom1], %[pom],     %[tmp5]             \n\t"
152
+            "sub.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im - tmpz[7].im;
153
+            "add.s %[pom3], %[pom],     %[tmp5]             \n\t"
154
+            "sub.s %[pom],  %[pom2],    %[tmp6]             \n\t"
155
+            "add.s %[pom2], %[pom2],    %[tmp6]             \n\t"
156
+            "swc1  %[pom1], 32(%[tmpz])                     \n\t"  // tmpz[4].re = tmpz[0].re - tmp5;
157
+            "swc1  %[pom3], 0(%[tmpz])                      \n\t"  // tmpz[0].re = tmpz[0].re + tmp5;
158
+            "swc1  %[pom],  36(%[tmpz])                     \n\t"  // tmpz[4].im = tmpz[0].im - tmp6;
159
+            "swc1  %[pom2], 4(%[tmpz])                      \n\t"  // tmpz[0].im = tmpz[0].im + tmp6;
160
+            "lwc1  %[pom1], 16(%[tmpz])                     \n\t"
161
+            "lwc1  %[pom3], 20(%[tmpz])                     \n\t"
162
+            "li.s  %[pom],  0.7071067812                    \n\t"  // float pom = 0.7071067812f;
163
+            "add.s %[temp1],%[tmp1],    %[tmp2]             \n\t"
164
+            "sub.s %[temp], %[pom1],    %[tmp8]             \n\t"
165
+            "add.s %[pom2], %[pom3],    %[tmp7]             \n\t"
166
+            "sub.s %[temp3],%[tmp3],    %[tmp4]             \n\t"
167
+            "sub.s %[temp4],%[tmp2],    %[tmp1]             \n\t"
168
+            "swc1  %[temp], 48(%[tmpz])                     \n\t"  // tmpz[6].re = tmpz[2].re - tmp8;
169
+            "swc1  %[pom2], 52(%[tmpz])                     \n\t"  // tmpz[6].im = tmpz[2].im + tmp7;
170
+            "add.s %[pom1], %[pom1],    %[tmp8]             \n\t"
171
+            "sub.s %[pom3], %[pom3],    %[tmp7]             \n\t"
172
+            "add.s %[tmp3], %[tmp3],    %[tmp4]             \n\t"
173
+            "mul.s %[tmp5], %[pom],     %[temp1]            \n\t"  // tmp5 = pom * (tmp1 + tmp2);
174
+            "mul.s %[tmp7], %[pom],     %[temp3]            \n\t"  // tmp7 = pom * (tmp3 - tmp4);
175
+            "mul.s %[tmp6], %[pom],     %[temp4]            \n\t"  // tmp6 = pom * (tmp2 - tmp1);
176
+            "mul.s %[tmp8], %[pom],     %[tmp3]             \n\t"  // tmp8 = pom * (tmp3 + tmp4);
177
+            "swc1  %[pom1], 16(%[tmpz])                     \n\t"  // tmpz[2].re = tmpz[2].re + tmp8;
178
+            "swc1  %[pom3], 20(%[tmpz])                     \n\t"  // tmpz[2].im = tmpz[2].im - tmp7;
179
+            "add.s %[tmp1], %[tmp5],    %[tmp7]             \n\t"  // tmp1 = tmp5 + tmp7;
180
+            "sub.s %[tmp3], %[tmp5],    %[tmp7]             \n\t"  // tmp3 = tmp5 - tmp7;
181
+            "add.s %[tmp2], %[tmp6],    %[tmp8]             \n\t"  // tmp2 = tmp6 + tmp8;
182
+            "sub.s %[tmp4], %[tmp6],    %[tmp8]             \n\t"  // tmp4 = tmp6 - tmp8;
183
+            "lwc1  %[temp], 8(%[tmpz])                      \n\t"
184
+            "lwc1  %[temp1],12(%[tmpz])                     \n\t"
185
+            "lwc1  %[pom],  24(%[tmpz])                     \n\t"
186
+            "lwc1  %[pom2], 28(%[tmpz])                     \n\t"
187
+            "sub.s %[temp4],%[temp],    %[tmp1]             \n\t"
188
+            "sub.s %[temp3],%[temp1],   %[tmp2]             \n\t"
189
+            "add.s %[temp], %[temp],    %[tmp1]             \n\t"
190
+            "add.s %[temp1],%[temp1],   %[tmp2]             \n\t"
191
+            "sub.s %[pom1], %[pom],     %[tmp4]             \n\t"
192
+            "add.s %[pom3], %[pom2],    %[tmp3]             \n\t"
193
+            "add.s %[pom],  %[pom],     %[tmp4]             \n\t"
194
+            "sub.s %[pom2], %[pom2],    %[tmp3]             \n\t"
195
+            "swc1  %[temp4],40(%[tmpz])                     \n\t"  // tmpz[5].re = tmpz[1].re - tmp1;
196
+            "swc1  %[temp3],44(%[tmpz])                     \n\t"  // tmpz[5].im = tmpz[1].im - tmp2;
197
+            "swc1  %[temp], 8(%[tmpz])                      \n\t"  // tmpz[1].re = tmpz[1].re + tmp1;
198
+            "swc1  %[temp1],12(%[tmpz])                     \n\t"  // tmpz[1].im = tmpz[1].im + tmp2;
199
+            "swc1  %[pom1], 56(%[tmpz])                     \n\t"  // tmpz[7].re = tmpz[3].re - tmp4;
200
+            "swc1  %[pom3], 60(%[tmpz])                     \n\t"  // tmpz[7].im = tmpz[3].im + tmp3;
201
+            "swc1  %[pom],  24(%[tmpz])                     \n\t"  // tmpz[3].re = tmpz[3].re + tmp4;
202
+            "swc1  %[pom2], 28(%[tmpz])                     \n\t"  // tmpz[3].im = tmpz[3].im - tmp3;
203
+            : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),   [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
204
+              [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5),  [tmp7]"=&f"(tmp7),
205
+              [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
206
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
207
+            : [tmpz]"r"(tmpz)
208
+            : "memory"
209
+        );
210
+    }
211
+
212
+    step = 1 << (MAX_LOG2_NFFT - 4);
213
+    n4 = 4;
214
+
215
+    for (nbits=4; nbits<=s->nbits; nbits++) {
216
+        /*
217
+        * num_transforms = (num_transforms >> 1) | 1;
218
+        */
219
+        __asm__ volatile (
220
+            "sra %[num_t], %[num_t], 1               \n\t"
221
+            "ori %[num_t], %[num_t], 1               \n\t"
222
+
223
+            : [num_t] "+r" (num_transforms)
224
+        );
225
+        n2  = 2 * n4;
226
+        n34 = 3 * n4;
227
+
228
+        for (n=0; n<num_transforms; n++) {
229
+            offset = fft_offsets_lut[n] << nbits;
230
+            tmpz = z + offset;
231
+
232
+            tmpz_n2  = tmpz +  n2;
233
+            tmpz_n4  = tmpz +  n4;
234
+            tmpz_n34 = tmpz +  n34;
235
+
236
+            __asm__ volatile (
237
+                "lwc1  %[pom1], 0(%[tmpz_n2])            \n\t"
238
+                "lwc1  %[pom],  0(%[tmpz_n34])           \n\t"
239
+                "lwc1  %[pom2], 4(%[tmpz_n2])            \n\t"
240
+                "lwc1  %[pom3], 4(%[tmpz_n34])           \n\t"
241
+                "lwc1  %[temp1],0(%[tmpz])               \n\t"
242
+                "lwc1  %[temp3],4(%[tmpz])               \n\t"
243
+                "add.s %[tmp5], %[pom1],      %[pom]     \n\t"   //  tmp5 = tmpz[ n2].re + tmpz[n34].re;
244
+                "sub.s %[tmp1], %[pom1],      %[pom]     \n\t"   //  tmp1 = tmpz[ n2].re - tmpz[n34].re;
245
+                "add.s %[tmp6], %[pom2],      %[pom3]    \n\t"   //  tmp6 = tmpz[ n2].im + tmpz[n34].im;
246
+                "sub.s %[tmp2], %[pom2],      %[pom3]    \n\t"   //  tmp2 = tmpz[ n2].im - tmpz[n34].im;
247
+                "sub.s %[temp], %[temp1],     %[tmp5]    \n\t"
248
+                "add.s %[temp1],%[temp1],     %[tmp5]    \n\t"
249
+                "sub.s %[temp4],%[temp3],     %[tmp6]    \n\t"
250
+                "add.s %[temp3],%[temp3],     %[tmp6]    \n\t"
251
+                "swc1  %[temp], 0(%[tmpz_n2])            \n\t"   //  tmpz[ n2].re = tmpz[ 0].re - tmp5;
252
+                "swc1  %[temp1],0(%[tmpz])               \n\t"   //  tmpz[  0].re = tmpz[ 0].re + tmp5;
253
+                "lwc1  %[pom1], 0(%[tmpz_n4])            \n\t"
254
+                "swc1  %[temp4],4(%[tmpz_n2])            \n\t"   //  tmpz[ n2].im = tmpz[ 0].im - tmp6;
255
+                "lwc1  %[temp], 4(%[tmpz_n4])            \n\t"
256
+                "swc1  %[temp3],4(%[tmpz])               \n\t"   //  tmpz[  0].im = tmpz[ 0].im + tmp6;
257
+                "sub.s %[pom],  %[pom1],      %[tmp2]    \n\t"
258
+                "add.s %[pom1], %[pom1],      %[tmp2]    \n\t"
259
+                "add.s %[temp1],%[temp],      %[tmp1]    \n\t"
260
+                "sub.s %[temp], %[temp],      %[tmp1]    \n\t"
261
+                "swc1  %[pom],  0(%[tmpz_n34])           \n\t"   //  tmpz[n34].re = tmpz[n4].re - tmp2;
262
+                "swc1  %[pom1], 0(%[tmpz_n4])            \n\t"   //  tmpz[ n4].re = tmpz[n4].re + tmp2;
263
+                "swc1  %[temp1],4(%[tmpz_n34])           \n\t"   //  tmpz[n34].im = tmpz[n4].im + tmp1;
264
+                "swc1  %[temp], 4(%[tmpz_n4])            \n\t"   //  tmpz[ n4].im = tmpz[n4].im - tmp1;
265
+                : [tmp5]"=&f"(tmp5),
266
+                  [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),        [pom1]"=&f"(pom1),        [pom2]"=&f"(pom2),
267
+                  [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6),          [pom3]"=&f"(pom3),
268
+                  [temp]"=&f"(temp), [temp1]"=&f"(temp1),     [temp3]"=&f"(temp3),       [temp4]"=&f"(temp4)
269
+                : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
270
+                : "memory"
271
+            );
272
+
273
+            w_re_ptr = (float*)(ff_cos_65536 + step);
274
+            w_im_ptr = (float*)(ff_cos_65536 + MAX_FFT_SIZE/4 - step);
275
+
276
+            for (i=1; i<n4; i++) {
277
+                w_re = w_re_ptr[0];
278
+                w_im = w_im_ptr[0];
279
+                tmpz_n2_i = tmpz_n2  + i;
280
+                tmpz_n4_i = tmpz_n4  + i;
281
+                tmpz_n34_i= tmpz_n34 + i;
282
+                tmpz_i    = tmpz     + i;
283
+
284
+                __asm__ volatile (
285
+                    "lwc1     %[temp],  0(%[tmpz_n2_i])               \n\t"
286
+                    "lwc1     %[temp1], 4(%[tmpz_n2_i])               \n\t"
287
+                    "lwc1     %[pom],   0(%[tmpz_n34_i])              \n\t"
288
+                    "lwc1     %[pom1],  4(%[tmpz_n34_i])              \n\t"
289
+                    "mul.s    %[temp3], %[w_im],    %[temp]           \n\t"
290
+                    "mul.s    %[temp4], %[w_im],    %[temp1]          \n\t"
291
+                    "mul.s    %[pom2],  %[w_im],    %[pom1]           \n\t"
292
+                    "mul.s    %[pom3],  %[w_im],    %[pom]            \n\t"
293
+                    "msub.s   %[tmp2],  %[temp3],   %[w_re], %[temp1] \n\t"  // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
294
+                    "madd.s   %[tmp1],  %[temp4],   %[w_re], %[temp]  \n\t"  // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
295
+                    "msub.s   %[tmp3],  %[pom2],    %[w_re], %[pom]   \n\t"  // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
296
+                    "madd.s   %[tmp4],  %[pom3],    %[w_re], %[pom1]  \n\t"  // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
297
+                    "lwc1     %[temp],  0(%[tmpz_i])                  \n\t"
298
+                    "lwc1     %[pom],   4(%[tmpz_i])                  \n\t"
299
+                    "add.s    %[tmp5],  %[tmp1],    %[tmp3]           \n\t"  // tmp5 = tmp1 + tmp3;
300
+                    "sub.s    %[tmp1],  %[tmp1],    %[tmp3]           \n\t"  // tmp1 = tmp1 - tmp3;
301
+                    "add.s    %[tmp6],  %[tmp2],    %[tmp4]           \n\t"  // tmp6 = tmp2 + tmp4;
302
+                    "sub.s    %[tmp2],  %[tmp2],    %[tmp4]           \n\t"  // tmp2 = tmp2 - tmp4;
303
+                    "sub.s    %[temp1], %[temp],    %[tmp5]           \n\t"
304
+                    "add.s    %[temp],  %[temp],    %[tmp5]           \n\t"
305
+                    "sub.s    %[pom1],  %[pom],     %[tmp6]           \n\t"
306
+                    "add.s    %[pom],   %[pom],     %[tmp6]           \n\t"
307
+                    "lwc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"
308
+                    "lwc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"
309
+                    "swc1     %[temp1], 0(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].re = tmpz[   i].re - tmp5;
310
+                    "swc1     %[temp],  0(%[tmpz_i])                  \n\t"  // tmpz[    i].re = tmpz[   i].re + tmp5;
311
+                    "swc1     %[pom1],  4(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].im = tmpz[   i].im - tmp6;
312
+                    "swc1     %[pom] ,  4(%[tmpz_i])                  \n\t"  // tmpz[    i].im = tmpz[   i].im + tmp6;
313
+                    "sub.s    %[temp4], %[temp3],   %[tmp2]           \n\t"
314
+                    "add.s    %[pom3],  %[pom2],    %[tmp1]           \n\t"
315
+                    "add.s    %[temp3], %[temp3],   %[tmp2]           \n\t"
316
+                    "sub.s    %[pom2],  %[pom2],    %[tmp1]           \n\t"
317
+                    "swc1     %[temp4], 0(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
318
+                    "swc1     %[pom3],  4(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
319
+                    "swc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
320
+                    "swc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
321
+                    : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
322
+                      [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
323
+                      [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
324
+                      [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
325
+                    : [w_re]"f"(w_re), [w_im]"f"(w_im),
326
+                      [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
327
+                      [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
328
+                    : "memory"
329
+                );
330
+                w_re_ptr += step;
331
+                w_im_ptr -= step;
332
+            }
333
+        }
334
+        step >>= 1;
335
+        n4   <<= 1;
336
+    }
337
+}
338
+
339
+/**
340
+ * MDCT/IMDCT transforms.
341
+ */
342
+
343
+/**
+ * Half inverse MDCT (n/2 outputs from n/2 inputs) in MIPS FPU inline
+ * assembly, mirroring the three stages of the generic ff_imdct_half():
+ *   1. pre rotation: complex multiplies of paired input samples with the
+ *      twiddle factors tcos/tsin, results scattered through revtab[]
+ *   2. FFT of the n/4 complex values via s->fft_calc
+ *   3. post rotation + reordering, two complex values taken from each
+ *      end of the buffer per iteration
+ *
+ * @param s      FFT context; supplies mdct_bits, revtab, tcos, tsin
+ * @param output n/2 = 2^(mdct_bits-1) samples; also serves as the
+ *               FFTComplex work buffer z
+ * @param input  n/2 samples
+ */
+static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
344
+{
345
+    int k, n8, n4, n2, n, j;
346
+    const uint16_t *revtab = s->revtab;
347
+    const FFTSample *tcos = s->tcos;
348
+    const FFTSample *tsin = s->tsin;
349
+    const FFTSample *in1, *in2, *in3, *in4;
350
+    FFTComplex *z = (FFTComplex *)output;
351
+
352
+    int j1;
353
+    const float *tcos1, *tsin1, *tcos2, *tsin2;
354
+    float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
355
+        temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
356
+    FFTComplex *z1, *z2;
357
+
358
+    n = 1 << s->mdct_bits;
359
+    n2 = n >> 1;
360
+    n4 = n >> 2;
361
+    n8 = n >> 3;
362
+
363
+    /* pre rotation */
364
+    in1 = input;
365
+    in2 = input + n2 - 1;
366
+    in3 = input + 2;
367
+    in4 = input + n2 - 3;
368
+
369
+    tcos1 = tcos;
370
+    tsin1 = tsin;
371
+
372
+    /* n4 = 64 or 128 */
373
+    for(k = 0; k < n4; k += 2) {
374
+        j  = revtab[k    ];
375
+        j1 = revtab[k + 1];
376
+
377
+        /* Two complex multiplies per iteration (madd.s/nmsub.s pairs):
+         *   temp9  = in2[0]*tcos1[0] - in1[0]*tsin1[0]   -> z[j ].re
+         *   temp10 = in2[0]*tsin1[0] + in1[0]*tcos1[0]   -> z[j ].im
+         *   temp11 = in4[0]*tcos1[1] - in3[0]*tsin1[1]   -> z[j1].re
+         *   temp12 = in4[0]*tsin1[1] + in3[0]*tcos1[1]   -> z[j1].im
+         * NOTE(review): these strings end in "\t\n" instead of the usual
+         * "\n\t"; it assembles identically, but is inconsistent with the
+         * FFT asm elsewhere in this commit - confirm intentional. */
+        __asm__ volatile (
378
+            "lwc1           %[temp1],       0(%[in2])                           \t\n"
379
+            "lwc1           %[temp2],       0(%[tcos1])                         \t\n"
380
+            "lwc1           %[temp3],       0(%[tsin1])                         \t\n"
381
+            "lwc1           %[temp4],       0(%[in1])                           \t\n"
382
+            "lwc1           %[temp5],       0(%[in4])                           \t\n"
383
+            "mul.s          %[temp9],       %[temp1],   %[temp2]                \t\n"
384
+            "mul.s          %[temp10],      %[temp1],   %[temp3]                \t\n"
385
+            "lwc1           %[temp6],       4(%[tcos1])                         \t\n"
386
+            "lwc1           %[temp7],       4(%[tsin1])                         \t\n"
387
+            "nmsub.s        %[temp9],       %[temp9],   %[temp4],   %[temp3]    \t\n"
388
+            "madd.s         %[temp10],      %[temp10],  %[temp4],   %[temp2]    \t\n"
389
+            "mul.s          %[temp11],      %[temp5],   %[temp6]                \t\n"
390
+            "mul.s          %[temp12],      %[temp5],   %[temp7]                \t\n"
391
+            "lwc1           %[temp8],       0(%[in3])                           \t\n"
392
+            "addiu          %[tcos1],       %[tcos1],   8                       \t\n"
393
+            "addiu          %[tsin1],       %[tsin1],   8                       \t\n"
394
+            "addiu          %[in1],         %[in1],     16                      \t\n"
395
+            "nmsub.s        %[temp11],      %[temp11],  %[temp8],   %[temp7]    \t\n"
396
+            "madd.s         %[temp12],      %[temp12],  %[temp8],   %[temp6]    \t\n"
397
+            "addiu          %[in2],         %[in2],     -16                     \t\n"
398
+            "addiu          %[in3],         %[in3],     16                      \t\n"
399
+            "addiu          %[in4],         %[in4],     -16                     \t\n"
400
+
401
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
402
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
403
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
404
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
405
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
406
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
407
+              [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
408
+              [in1]"+r"(in1), [in2]"+r"(in2),
409
+              [in3]"+r"(in3), [in4]"+r"(in4)
410
+        );
411
+
412
+        z[j ].re = temp9;
413
+        z[j ].im = temp10;
414
+        z[j1].re = temp11;
415
+        z[j1].im = temp12;
416
+    }
417
+
418
+    s->fft_calc(s, z); /* FFT of the n/4 pre-rotated complex values */
419
+
420
+    /* post rotation + reordering */
421
+    /* n8 = 32 or 64 */
422
+    for(k = 0; k < n8; k += 2) {
423
+        tcos1 = &tcos[n8 - k - 2];
424
+        tsin1 = &tsin[n8 - k - 2];
425
+        tcos2 = &tcos[n8 + k];
426
+        tsin2 = &tsin[n8 + k];
427
+        z1 = &z[n8 - k - 2];
428
+        z2 = &z[n8 + k    ];
429
+
430
+        /* Four post-rotation complex multiplies (madd.s/nmsub.s pairs);
+         * the temp9..temp16 results are cross-stored below so the z1/z2
+         * ends of the buffer mirror each other. */
+        __asm__ volatile (
431
+            "lwc1       %[temp1],   12(%[z1])                           \t\n"
432
+            "lwc1       %[temp2],   4(%[tsin1])                         \t\n"
433
+            "lwc1       %[temp3],   4(%[tcos1])                         \t\n"
434
+            "lwc1       %[temp4],   8(%[z1])                            \t\n"
435
+            "lwc1       %[temp5],   4(%[z1])                            \t\n"
436
+            "mul.s      %[temp9],   %[temp1],   %[temp2]                \t\n"
437
+            "mul.s      %[temp10],  %[temp1],   %[temp3]                \t\n"
438
+            "lwc1       %[temp6],   0(%[tsin1])                         \t\n"
439
+            "lwc1       %[temp7],   0(%[tcos1])                         \t\n"
440
+            "nmsub.s    %[temp9],   %[temp9],   %[temp4],   %[temp3]    \t\n"
441
+            "madd.s     %[temp10],  %[temp10],  %[temp4],   %[temp2]    \t\n"
442
+            "mul.s      %[temp11],  %[temp5],   %[temp6]                \t\n"
443
+            "mul.s      %[temp12],  %[temp5],   %[temp7]                \t\n"
444
+            "lwc1       %[temp8],   0(%[z1])                            \t\n"
445
+            "lwc1       %[temp1],   4(%[z2])                            \t\n"
446
+            "lwc1       %[temp2],   0(%[tsin2])                         \t\n"
447
+            "lwc1       %[temp3],   0(%[tcos2])                         \t\n"
448
+            "nmsub.s    %[temp11],  %[temp11],  %[temp8],   %[temp7]    \t\n"
449
+            "madd.s     %[temp12],  %[temp12],  %[temp8],   %[temp6]    \t\n"
450
+            "mul.s      %[temp13],  %[temp1],   %[temp2]                \t\n"
451
+            "mul.s      %[temp14],  %[temp1],   %[temp3]                \t\n"
452
+            "lwc1       %[temp4],   0(%[z2])                            \t\n"
453
+            "lwc1       %[temp5],   12(%[z2])                           \t\n"
454
+            "lwc1       %[temp6],   4(%[tsin2])                         \t\n"
455
+            "lwc1       %[temp7],   4(%[tcos2])                         \t\n"
456
+            "nmsub.s    %[temp13],  %[temp13],  %[temp4],   %[temp3]    \t\n"
457
+            "madd.s     %[temp14],  %[temp14],  %[temp4],   %[temp2]    \t\n"
458
+            "mul.s      %[temp15],  %[temp5],   %[temp6]                \t\n"
459
+            "mul.s      %[temp16],  %[temp5],   %[temp7]                \t\n"
460
+            "lwc1       %[temp8],   8(%[z2])                            \t\n"
461
+            "nmsub.s    %[temp15],  %[temp15],  %[temp8],   %[temp7]    \t\n"
462
+            "madd.s     %[temp16],  %[temp16],  %[temp8],   %[temp6]    \t\n"
463
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
464
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
465
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
466
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
467
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
468
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
469
+              [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
470
+              [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
471
+            : [z1]"r"(z1), [z2]"r"(z2),
472
+              [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
473
+              [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
474
+        );
475
+
476
+        z1[1].re = temp9;
477
+        z1[1].im = temp14;
478
+        z2[0].re = temp13;
479
+        z2[0].im = temp10;
480
+
481
+        z1[0].re = temp11;
482
+        z1[0].im = temp16;
483
+        z2[1].re = temp15;
484
+        z2[1].im = temp12;
485
+    }
486
+}
487
+#endif /* HAVE_INLINE_ASM */
488
+
489
+/**
490
+ * Compute inverse MDCT of size N = 2^nbits
491
+ * @param output N samples
492
+ * @param input N/2 samples
493
+ */
494
+static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
495
+{
496
+    int k;
497
+    int n = 1 << s->mdct_bits;
498
+    int n2 = n >> 1;
499
+    int n4 = n >> 2;
500
+
501
+    ff_imdct_half_mips(s, output+n4, input); /* half transform into the middle half of output[] */
502
+
503
+    /* Expand to n samples: first quarter = negated mirror of the second
+     * quarter, last quarter = plain mirror of the third quarter.
+     * Unrolled x4 - assumes n4 % 4 == 0 (TODO confirm this holds for the
+     * minimum supported mdct_bits). */
+    for(k = 0; k < n4; k+=4) {
504
+        output[k] = -output[n2-k-1];
505
+        output[k+1] = -output[n2-k-2];
506
+        output[k+2] = -output[n2-k-3];
507
+        output[k+3] = -output[n2-k-4];
508
+
509
+        output[n-k-1] = output[n2+k];
510
+        output[n-k-2] = output[n2+k+1];
511
+        output[n-k-3] = output[n2+k+2];
512
+        output[n-k-4] = output[n2+k+3];
513
+    }
514
+}
515
+
516
+/**
+ * MIPS FFT init: fills the 2^16-entry offset LUT consumed by the MIPS
+ * FFT, then installs the MIPS entry points (the FFT itself only when
+ * inline asm is available, the IMDCT ones only when MDCT support is
+ * compiled in).
+ */
+av_cold void ff_fft_init_mips(FFTContext *s)
517
+{
518
+    int n=0;
519
+
520
+    ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
521
+
522
+#if HAVE_INLINE_ASM
523
+    s->fft_calc     = ff_fft_calc_mips;
524
+#endif
525
+#if CONFIG_MDCT
526
+    s->imdct_calc   = ff_imdct_calc_mips;
527
+    s->imdct_half   = ff_imdct_half_mips;
528
+#endif
529
+}
0 530
new file mode 100644
... ...
@@ -0,0 +1,63 @@
0
+/*
1
+ * Copyright (c) 2012
2
+ *      MIPS Technologies, Inc., California.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions
6
+ * are met:
7
+ * 1. Redistributions of source code must retain the above copyright
8
+ *    notice, this list of conditions and the following disclaimer.
9
+ * 2. Redistributions in binary form must reproduce the above copyright
10
+ *    notice, this list of conditions and the following disclaimer in the
11
+ *    documentation and/or other materials provided with the distribution.
12
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
13
+ *    contributors may be used to endorse or promote products derived from
14
+ *    this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
20
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
+ * SUCH DAMAGE.
27
+ *
28
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
29
+ *
30
+ * This file is part of FFmpeg.
31
+ *
32
+ * FFmpeg is free software; you can redistribute it and/or
33
+ * modify it under the terms of the GNU Lesser General Public
34
+ * License as published by the Free Software Foundation; either
35
+ * version 2.1 of the License, or (at your option) any later version.
36
+ *
37
+ * FFmpeg is distributed in the hope that it will be useful,
38
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
39
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
40
+ * Lesser General Public License for more details.
41
+ *
42
+ * You should have received a copy of the GNU Lesser General Public
43
+ * License along with FFmpeg; if not, write to the Free Software
44
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
45
+ */
46
+
47
+/**
48
+ * @file
49
+ * definitions and LUT table for MIPS FFT
50
+ */
51
+#ifndef AVCODEC_MIPS_FFT_TABLE_H
52
+#define AVCODEC_MIPS_FFT_TABLE_H
53
+
54
+#include "libavcodec/fft.h"
55
+
56
+#define MAX_LOG2_NFFT 16 //!< Specifies maximum allowed fft size
57
+#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT)
58
+
59
+extern uint16_t fft_offsets_lut[];
60
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index);
61
+
62
+#endif /* AVCODEC_MIPS_FFT_TABLE_H */
0 63
new file mode 100644
... ...
@@ -0,0 +1,338 @@
0
+/*
1
+ * Format Conversion Utils for MIPS
2
+ *
3
+ * Copyright (c) 2012
4
+ *      MIPS Technologies, Inc., California.
5
+ *
6
+ * Redistribution and use in source and binary forms, with or without
7
+ * modification, are permitted provided that the following conditions
8
+ * are met:
9
+ * 1. Redistributions of source code must retain the above copyright
10
+ *    notice, this list of conditions and the following disclaimer.
11
+ * 2. Redistributions in binary form must reproduce the above copyright
12
+ *    notice, this list of conditions and the following disclaimer in the
13
+ *    documentation and/or other materials provided with the distribution.
14
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
15
+ *    contributors may be used to endorse or promote products derived from
16
+ *    this software without specific prior written permission.
17
+ *
18
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
19
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
22
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28
+ * SUCH DAMAGE.
29
+ *
30
+ * Author:  Zoran Lukic (zoranl@mips.com)
31
+ * Author:  Nedeljko Babic (nbabic@mips.com)
32
+ *
33
+ * This file is part of FFmpeg.
34
+ *
35
+ * FFmpeg is free software; you can redistribute it and/or
36
+ * modify it under the terms of the GNU Lesser General Public
37
+ * License as published by the Free Software Foundation; either
38
+ * version 2.1 of the License, or (at your option) any later version.
39
+ *
40
+ * FFmpeg is distributed in the hope that it will be useful,
41
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
42
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
43
+ * Lesser General Public License for more details.
44
+ *
45
+ * You should have received a copy of the GNU Lesser General Public
46
+ * License along with FFmpeg; if not, write to the Free Software
47
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
48
+ */
49
+#include "config.h"
50
+#include "libavcodec/avcodec.h"
51
+#include "libavcodec/fmtconvert.h"
52
+
53
+#if HAVE_MIPSDSPR1
54
+/**
+ * Convert 'len' float samples to signed 16-bit integers.
+ *
+ * cvt.w.s turns each float into a 32-bit integer; the shll_s.w / srl
+ * pair then saturates that value to the int16 range (shll_s.w is the
+ * DSP-ASE saturating shift - hence the HAVE_MIPSDSPR1 guard around
+ * this code).  The loop only terminates on src == src_end, so len is
+ * assumed to be 0 or a multiple of 8 - confirm callers.
+ */
+static void float_to_int16_mips(int16_t *dst, const float *src, long len)
55
+{
56
+    const float *src_end = src + len;
57
+    int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
58
+    float src0, src1, src2, src3, src4, src5, src6, src7;
59
+
60
+    /*
61
+     * loop is 8 times unrolled in assembler in order to achieve better performance
62
+     */
63
+    __asm__ volatile(
64
+        "beq        %[len],  $zero,   fti16_end%=   \n\t"
65
+        "fti16_lp%=:                                \n\t"
66
+        "lwc1       %[src0], 0(%[src])              \n\t"
67
+        "lwc1       %[src1], 4(%[src])              \n\t"
68
+        "lwc1       %[src2], 8(%[src])              \n\t"
69
+        "lwc1       %[src3], 12(%[src])             \n\t"
70
+        "cvt.w.s    %[src0], %[src0]                \n\t"
71
+        "cvt.w.s    %[src1], %[src1]                \n\t"
72
+        "cvt.w.s    %[src2], %[src2]                \n\t"
73
+        "cvt.w.s    %[src3], %[src3]                \n\t"
74
+        "mfc1       %[ret0], %[src0]                \n\t"
75
+        "mfc1       %[ret1], %[src1]                \n\t"
76
+        "mfc1       %[ret2], %[src2]                \n\t"
77
+        "mfc1       %[ret3], %[src3]                \n\t"
78
+        "lwc1       %[src4], 16(%[src])             \n\t"
79
+        "lwc1       %[src5], 20(%[src])             \n\t"
80
+        "lwc1       %[src6], 24(%[src])             \n\t"
81
+        "lwc1       %[src7], 28(%[src])             \n\t"
82
+        "cvt.w.s    %[src4], %[src4]                \n\t"
83
+        "cvt.w.s    %[src5], %[src5]                \n\t"
84
+        "cvt.w.s    %[src6], %[src6]                \n\t"
85
+        "cvt.w.s    %[src7], %[src7]                \n\t"
86
+        "addiu      %[src],  32                     \n\t"
87
+        "shll_s.w   %[ret0], %[ret0], 16            \n\t"
88
+        "shll_s.w   %[ret1], %[ret1], 16            \n\t"
89
+        "shll_s.w   %[ret2], %[ret2], 16            \n\t"
90
+        "shll_s.w   %[ret3], %[ret3], 16            \n\t"
91
+        "srl        %[ret0], %[ret0], 16            \n\t"
92
+        "srl        %[ret1], %[ret1], 16            \n\t"
93
+        "srl        %[ret2], %[ret2], 16            \n\t"
94
+        "srl        %[ret3], %[ret3], 16            \n\t"
95
+        "sh         %[ret0], 0(%[dst])              \n\t"
96
+        "sh         %[ret1], 2(%[dst])              \n\t"
97
+        "sh         %[ret2], 4(%[dst])              \n\t"
98
+        "sh         %[ret3], 6(%[dst])              \n\t"
99
+        "mfc1       %[ret4], %[src4]                \n\t"
100
+        "mfc1       %[ret5], %[src5]                \n\t"
101
+        "mfc1       %[ret6], %[src6]                \n\t"
102
+        "mfc1       %[ret7], %[src7]                \n\t"
103
+        "shll_s.w   %[ret4], %[ret4], 16            \n\t"
104
+        "shll_s.w   %[ret5], %[ret5], 16            \n\t"
105
+        "shll_s.w   %[ret6], %[ret6], 16            \n\t"
106
+        "shll_s.w   %[ret7], %[ret7], 16            \n\t"
107
+        "srl        %[ret4], %[ret4], 16            \n\t"
108
+        "srl        %[ret5], %[ret5], 16            \n\t"
109
+        "srl        %[ret6], %[ret6], 16            \n\t"
110
+        "srl        %[ret7], %[ret7], 16            \n\t"
111
+        "sh         %[ret4], 8(%[dst])              \n\t"
112
+        "sh         %[ret5], 10(%[dst])             \n\t"
113
+        "sh         %[ret6], 12(%[dst])             \n\t"
114
+        "sh         %[ret7], 14(%[dst])             \n\t"
115
+        "addiu      %[dst],  16                     \n\t"
116
+        "bne        %[src],  %[src_end], fti16_lp%= \n\t"
117
+        "fti16_end%=:                               \n\t"
118
+        : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
119
+          [ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
120
+          [src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
121
+          [src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
122
+          [src]"+r"(src), [dst]"+r"(dst)
123
+        : [src_end]"r"(src_end), [len]"r"(len)
124
+        : "memory"
125
+    );
126
+}
127
+
128
+/**
+ * Convert 'len' floats per channel to int16 and interleave them.
+ *
+ * channels == 2 takes a fast path that interleaves the two planes
+ * sample-by-sample; any other channel count converts one plane at a
+ * time (unrolled x8), stepping the destination by ch2 bytes per frame
+ * (ch2 = channels * sizeof(int16_t) - the addu pointer math is in
+ * bytes).  Saturation to int16 uses the DSP-ASE shll_s.w/srl pair,
+ * as in float_to_int16_mips() above.
+ *
+ * NOTE(review): unlike float_to_int16_mips(), neither path guards
+ * len == 0, and the non-stereo path assumes len % 8 == 0 - confirm
+ * callers never violate this.
+ */
+static void float_to_int16_interleave_mips(int16_t *dst, const float **src, long len,
129
+        int channels)
130
+{
131
+    int   c, ch2 = channels <<1;
132
+    int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
133
+    float src0, src1, src2, src3, src4, src5, src6, src7;
134
+    int16_t *dst_ptr0, *dst_ptr1, *dst_ptr2, *dst_ptr3;
135
+    int16_t *dst_ptr4, *dst_ptr5, *dst_ptr6, *dst_ptr7;
136
+    const float *src_ptr, *src_ptr2, *src_end;
137
+
138
+    if (channels == 2) {
139
+        src_ptr = &src[0][0];
140
+        src_ptr2 = &src[1][0];
141
+        src_end = src_ptr + len;
142
+
143
+        __asm__ volatile (
144
+            "fti16i2_lp%=:                                   \n\t"
145
+            "lwc1       %[src0],    0(%[src_ptr])            \n\t"
146
+            "lwc1       %[src1],    0(%[src_ptr2])           \n\t"
147
+            "addiu      %[src_ptr], 4                        \n\t"
148
+            "cvt.w.s    $f9,        %[src0]                  \n\t"
149
+            "cvt.w.s    $f10,       %[src1]                  \n\t"
150
+            "mfc1       %[ret0],    $f9                      \n\t"
151
+            "mfc1       %[ret1],    $f10                     \n\t"
152
+            "shll_s.w   %[ret0],    %[ret0], 16              \n\t"
153
+            "shll_s.w   %[ret1],    %[ret1], 16              \n\t"
154
+            "addiu      %[src_ptr2], 4                       \n\t"
155
+            "srl        %[ret0],    %[ret0], 16              \n\t"
156
+            "srl        %[ret1],    %[ret1], 16              \n\t"
157
+            "sh         %[ret0],    0(%[dst])                \n\t"
158
+            "sh         %[ret1],    2(%[dst])                \n\t"
159
+            "addiu      %[dst],     4                        \n\t"
160
+            "bne        %[src_ptr], %[src_end], fti16i2_lp%= \n\t"
161
+            : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1),
162
+              [src0]"=&f"(src0), [src1]"=&f"(src1),
163
+              [src_ptr]"+r"(src_ptr), [src_ptr2]"+r"(src_ptr2),
164
+              [dst]"+r"(dst)
165
+            : [src_end]"r"(src_end)
166
+            /* $f9/$f10 are used directly above, so they must be declared
+             * clobbered - otherwise the compiler may keep live values in
+             * them across the asm and have them silently corrupted. */
+            : "$f9", "$f10", "memory"
167
+        );
168
+    } else {
169
+        for (c = 0; c < channels; c++) {
170
+            src_ptr  = &src[c][0];
171
+            dst_ptr0 = &dst[c];
172
+            src_end = src_ptr + len;
173
+            /*
174
+             * loop is 8 times unrolled in assembler in order to achieve better performance
175
+             */
176
+            __asm__ volatile(
177
+                "fti16i_lp%=:                                     \n\t"
178
+                "lwc1       %[src0], 0(%[src_ptr])                \n\t"
179
+                "lwc1       %[src1], 4(%[src_ptr])                \n\t"
180
+                "lwc1       %[src2], 8(%[src_ptr])                \n\t"
181
+                "lwc1       %[src3], 12(%[src_ptr])               \n\t"
182
+                "cvt.w.s    %[src0], %[src0]                      \n\t"
183
+                "cvt.w.s    %[src1], %[src1]                      \n\t"
184
+                "cvt.w.s    %[src2], %[src2]                      \n\t"
185
+                "cvt.w.s    %[src3], %[src3]                      \n\t"
186
+                "mfc1       %[ret0], %[src0]                      \n\t"
187
+                "mfc1       %[ret1], %[src1]                      \n\t"
188
+                "mfc1       %[ret2], %[src2]                      \n\t"
189
+                "mfc1       %[ret3], %[src3]                      \n\t"
190
+                "lwc1       %[src4], 16(%[src_ptr])               \n\t"
191
+                "lwc1       %[src5], 20(%[src_ptr])               \n\t"
192
+                "lwc1       %[src6], 24(%[src_ptr])               \n\t"
193
+                "lwc1       %[src7], 28(%[src_ptr])               \n\t"
194
+                "addu       %[dst_ptr1], %[dst_ptr0], %[ch2]      \n\t"
195
+                "addu       %[dst_ptr2], %[dst_ptr1], %[ch2]      \n\t"
196
+                "addu       %[dst_ptr3], %[dst_ptr2], %[ch2]      \n\t"
197
+                "addu       %[dst_ptr4], %[dst_ptr3], %[ch2]      \n\t"
198
+                "addu       %[dst_ptr5], %[dst_ptr4], %[ch2]      \n\t"
199
+                "addu       %[dst_ptr6], %[dst_ptr5], %[ch2]      \n\t"
200
+                "addu       %[dst_ptr7], %[dst_ptr6], %[ch2]      \n\t"
201
+                "addiu      %[src_ptr],  32                       \n\t"
202
+                "cvt.w.s    %[src4], %[src4]                      \n\t"
203
+                "cvt.w.s    %[src5], %[src5]                      \n\t"
204
+                "cvt.w.s    %[src6], %[src6]                      \n\t"
205
+                "cvt.w.s    %[src7], %[src7]                      \n\t"
206
+                "shll_s.w   %[ret0], %[ret0], 16                  \n\t"
207
+                "shll_s.w   %[ret1], %[ret1], 16                  \n\t"
208
+                "shll_s.w   %[ret2], %[ret2], 16                  \n\t"
209
+                "shll_s.w   %[ret3], %[ret3], 16                  \n\t"
210
+                "srl        %[ret0], %[ret0], 16                  \n\t"
211
+                "srl        %[ret1], %[ret1], 16                  \n\t"
212
+                "srl        %[ret2], %[ret2], 16                  \n\t"
213
+                "srl        %[ret3], %[ret3], 16                  \n\t"
214
+                "sh         %[ret0], 0(%[dst_ptr0])               \n\t"
215
+                "sh         %[ret1], 0(%[dst_ptr1])               \n\t"
216
+                "sh         %[ret2], 0(%[dst_ptr2])               \n\t"
217
+                "sh         %[ret3], 0(%[dst_ptr3])               \n\t"
218
+                "mfc1       %[ret4], %[src4]                      \n\t"
219
+                "mfc1       %[ret5], %[src5]                      \n\t"
220
+                "mfc1       %[ret6], %[src6]                      \n\t"
221
+                "mfc1       %[ret7], %[src7]                      \n\t"
222
+                "shll_s.w   %[ret4], %[ret4], 16                  \n\t"
223
+                "shll_s.w   %[ret5], %[ret5], 16                  \n\t"
224
+                "shll_s.w   %[ret6], %[ret6], 16                  \n\t"
225
+                "shll_s.w   %[ret7], %[ret7], 16                  \n\t"
226
+                "srl        %[ret4], %[ret4], 16                  \n\t"
227
+                "srl        %[ret5], %[ret5], 16                  \n\t"
228
+                "srl        %[ret6], %[ret6], 16                  \n\t"
229
+                "srl        %[ret7], %[ret7], 16                  \n\t"
230
+                "sh         %[ret4], 0(%[dst_ptr4])               \n\t"
231
+                "sh         %[ret5], 0(%[dst_ptr5])               \n\t"
232
+                "sh         %[ret6], 0(%[dst_ptr6])               \n\t"
233
+                "sh         %[ret7], 0(%[dst_ptr7])               \n\t"
234
+                "addu       %[dst_ptr0], %[dst_ptr7], %[ch2]      \n\t"
235
+                "bne        %[src_ptr],  %[src_end],  fti16i_lp%= \n\t"
236
+                : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
237
+                  [ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
238
+                  [src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
239
+                  [src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
240
+                  [dst_ptr1]"=&r"(dst_ptr1), [dst_ptr2]"=&r"(dst_ptr2), [dst_ptr3]"=&r"(dst_ptr3),
241
+                  [dst_ptr4]"=&r"(dst_ptr4), [dst_ptr5]"=&r"(dst_ptr5), [dst_ptr6]"=&r"(dst_ptr6),
242
+                  [dst_ptr7]"=&r"(dst_ptr7), [dst_ptr0]"+r"(dst_ptr0), [src_ptr]"+r"(src_ptr)
243
+                : [ch2]"r"(ch2), [src_end]"r"(src_end)
244
+                : "memory"
245
+            );
246
+        }
247
+    }
248
+}
249
+#endif /* HAVE_MIPSDSPR1 */
250
+
251
+/**
+ * dst[i] = src[i] * mul for i in [0, len): int32 -> float conversion
+ * with a scalar multiply (MIPS FPU version of
+ * FmtConvertContext.int32_to_float_fmul_scalar).
+ * NOTE(review): the assembler loop consumes 8 elements per iteration
+ * and terminates only when src == src_end compares equal, so len must
+ * be a positive multiple of 8 -- confirm all callers guarantee this.
+ */
+static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
252
+        float mul, int len)
253
+{
254
+    /*
255
+     * variables used in inline assembler
256
+     */
257
+    float temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15;
258
+
259
+    int rpom1, rpom2, rpom11, rpom21, rpom12, rpom22, rpom13, rpom23;
260
+    const int *src_end = src + len;
261
+    /*
262
+     * loop is 8 times unrolled in assembler in order to achieve better performance
263
+     */
264
+    __asm__ volatile (
265
+        "i32tf_lp%=:                                    \n\t"
+        /* load 8 int32 words into GPRs, then move them (mtc1) into FPU regs */
266
+        "lw       %[rpom11],     0(%[src])              \n\t"
267
+        "lw       %[rpom21],     4(%[src])              \n\t"
268
+        "lw       %[rpom1],      8(%[src])              \n\t"
269
+        "lw       %[rpom2],      12(%[src])             \n\t"
270
+        "mtc1     %[rpom11],     %[temp1]               \n\t"
271
+        "mtc1     %[rpom21],     %[temp3]               \n\t"
272
+        "mtc1     %[rpom1],      %[temp5]               \n\t"
273
+        "mtc1     %[rpom2],      %[temp7]               \n\t"
274
+
275
+        "lw       %[rpom13],     16(%[src])             \n\t"
276
+        "lw       %[rpom23],     20(%[src])             \n\t"
277
+        "lw       %[rpom12],     24(%[src])             \n\t"
278
+        "lw       %[rpom22],     28(%[src])             \n\t"
279
+        "mtc1     %[rpom13],     %[temp9]               \n\t"
280
+        "mtc1     %[rpom23],     %[temp11]              \n\t"
281
+        "mtc1     %[rpom12],     %[temp13]              \n\t"
282
+        "mtc1     %[rpom22],     %[temp15]              \n\t"
283
+
284
+        "addiu    %[src],        32                     \n\t" /* advance src by 8 ints */
+        /* cvt.s.w: convert each 32-bit int to single-precision float */
285
+        "cvt.s.w  %[temp1],      %[temp1]               \n\t"
286
+        "cvt.s.w  %[temp3],      %[temp3]               \n\t"
287
+        "cvt.s.w  %[temp5],      %[temp5]               \n\t"
288
+        "cvt.s.w  %[temp7],      %[temp7]               \n\t"
289
+
290
+        "cvt.s.w  %[temp9],      %[temp9]               \n\t"
291
+        "cvt.s.w  %[temp11],     %[temp11]              \n\t"
292
+        "cvt.s.w  %[temp13],     %[temp13]              \n\t"
293
+        "cvt.s.w  %[temp15],     %[temp15]              \n\t"
294
+
295
+        "mul.s   %[temp1],       %[temp1],    %[mul]    \n\t"
296
+        "mul.s   %[temp3],       %[temp3],    %[mul]    \n\t"
297
+        "mul.s   %[temp5],       %[temp5],    %[mul]    \n\t"
298
+        "mul.s   %[temp7],       %[temp7],    %[mul]    \n\t"
299
+
300
+        "mul.s   %[temp9],       %[temp9],    %[mul]    \n\t"
301
+        "mul.s   %[temp11],      %[temp11],   %[mul]    \n\t"
302
+        "mul.s   %[temp13],      %[temp13],   %[mul]    \n\t"
303
+        "mul.s   %[temp15],      %[temp15],   %[mul]    \n\t"
304
+
305
+        "swc1    %[temp1],       0(%[dst])              \n\t" /*dst[i] = src[i] * mul;    */
306
+        "swc1    %[temp3],       4(%[dst])              \n\t" /*dst[i+1] = src[i+1] * mul;*/
307
+        "swc1    %[temp5],       8(%[dst])              \n\t" /*dst[i+2] = src[i+2] * mul;*/
308
+        "swc1    %[temp7],       12(%[dst])             \n\t" /*dst[i+3] = src[i+3] * mul;*/
309
+
310
+        "swc1    %[temp9],       16(%[dst])             \n\t" /*dst[i+4] = src[i+4] * mul;*/
311
+        "swc1    %[temp11],      20(%[dst])             \n\t" /*dst[i+5] = src[i+5] * mul;*/
312
+        "swc1    %[temp13],      24(%[dst])             \n\t" /*dst[i+6] = src[i+6] * mul;*/
313
+        "swc1    %[temp15],      28(%[dst])             \n\t" /*dst[i+7] = src[i+7] * mul;*/
314
+        "addiu   %[dst],        32                      \n\t" /* advance dst by 8 floats */
315
+        "bne     %[src],        %[src_end], i32tf_lp%=  \n\t" /* exact-match exit: requires len % 8 == 0 */
316
+        : [temp1]"=&f"(temp1),   [temp11]"=&f"(temp11),
317
+          [temp13]"=&f"(temp13), [temp15]"=&f"(temp15),
318
+          [temp3]"=&f"(temp3),   [temp5]"=&f"(temp5),
319
+          [temp7]"=&f"(temp7),   [temp9]"=&f"(temp9),
320
+          [rpom1]"=&r"(rpom1),   [rpom2]"=&r"(rpom2),
321
+          [rpom11]"=&r"(rpom11), [rpom21]"=&r"(rpom21),
322
+          [rpom12]"=&r"(rpom12), [rpom22]"=&r"(rpom22),
323
+          [rpom13]"=&r"(rpom13), [rpom23]"=&r"(rpom23),
324
+          [dst]"+r"(dst),       [src]"+r"(src)
325
+        : [mul]"f"(mul),        [src_end]"r"(src_end)
326
+        : "memory"
327
+    );
328
+}
329
+
330
+/**
+ * Install the MIPS-optimized routines into the FmtConvertContext
+ * function-pointer table.
+ * The float -> int16 paths rely on DSP-R1 saturating shifts
+ * (shll_s.w, see the assembler above), so they are installed only
+ * when built with HAVE_MIPSDSPR1; int32_to_float_fmul_scalar_mips
+ * needs only the regular FPU and is always installed.
+ */
+av_cold void ff_fmt_convert_init_mips(FmtConvertContext *c)
331
+{
332
+#if HAVE_MIPSDSPR1
333
+    c->float_to_int16_interleave = float_to_int16_interleave_mips;
334
+    c->float_to_int16 = float_to_int16_mips;
335
+#endif
336
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips;
337
+}