Browse code

Merge commit 'f4d5a2cc35fcdf06ec031fabe8b0710e995fe924'

* commit 'f4d5a2cc35fcdf06ec031fabe8b0710e995fe924':
aarch64: NEON float to s16 audio conversion

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2014/04/23 06:32:55
Showing 5 changed files
... ...
@@ -1 +1,5 @@
1
+OBJS                             += aarch64/audio_convert_init.o
2
+
1 3
 OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
4
+
5
+NEON-OBJS                        += aarch64/audio_convert_neon.o
2 6
new file mode 100644
... ...
@@ -0,0 +1,49 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#include <stdint.h>
19
+
20
+#include "config.h"
21
+#include "libavutil/attributes.h"
22
+#include "libavutil/cpu.h"
23
+#include "libavutil/aarch64/cpu.h"
24
+#include "libavutil/samplefmt.h"
25
+#include "libavresample/audio_convert.h"
26
+
27
/*
 * Prototypes of the NEON conversion routines registered below.
 * All three write 16-bit signed samples to dst. The first reads a single
 * float buffer; the other two read an array of per-channel float buffers.
 */
+void ff_conv_flt_to_s16_neon(int16_t *dst, const float *src, int len);
28
+void ff_conv_fltp_to_s16_neon(int16_t *dst, float *const *src,
29
+                              int len, int channels);
30
+void ff_conv_fltp_to_s16_2ch_neon(int16_t *dst, float *const *src,
31
+                                  int len, int channels);
32
+
33
/**
 * Register the AArch64 NEON sample-format conversion routines on ac.
 *
 * Queries the runtime CPU flags and, only when NEON is reported, installs
 * three float -> s16 converters through ff_audio_convert_set_func().
 * Nothing is registered when NEON is unavailable.
 *
 * @param ac conversion context that receives the function pointers
 *
 * NOTE(review): the numeric arguments (0 or 2, 16, 8) are presumably the
 * channel count the routine is restricted to (0 = any), the required
 * pointer alignment and the required sample-count multiple -- confirm
 * against the declaration of ff_audio_convert_set_func().
 */
+av_cold void ff_audio_convert_init_aarch64(AudioConvert *ac)
34
+{
35
+    int cpu_flags = av_get_cpu_flags();
36
+
37
+    if (have_neon(cpu_flags)) {
38
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLT,  /* FLT -> S16 */
39
+                                  0, 16, 8, "NEON",
40
+                                  ff_conv_flt_to_s16_neon);
41
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP,  /* FLTP -> S16, routine specialised for two planes */
42
+                                  2, 16, 8, "NEON",
43
+                                  ff_conv_fltp_to_s16_2ch_neon);
44
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP,  /* FLTP -> S16, routine that takes the channel count at run time */
45
+                                  0, 16, 8, "NEON",
46
+                                  ff_conv_fltp_to_s16_neon);
47
+    }
48
+}
0 49
new file mode 100644
... ...
@@ -0,0 +1,363 @@
0
+/*
1
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
2
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
3
+ *
4
+ * This file is part of FFmpeg.
5
+ *
6
+ * FFmpeg is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * FFmpeg is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with FFmpeg; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#include "config.h"
22
+#include "libavutil/aarch64/asm.S"
23
+
24
// ff_conv_flt_to_s16_neon(int16_t *dst, const float *src, int len)
//   x0 = dst, x1 = src, x2 = len
// Converts floats to 16-bit integers: each float is first converted to a
// signed fixed-point value with 31 fractional bits (fcvtzs #31), then
// narrowed to 16 bits with rounding and saturation (sqrshrn #16).
// The code is software-pipelined: 8 samples are always loaded and converted
// ahead of the stores, so at least 8 input samples are read unconditionally.
// Assumes len is a non-zero multiple of 8 -- TODO confirm with the caller.
// NOTE(review): len is declared `int` in the C prototype but the full 64-bit
// x2 is used below; this relies on the upper 32 bits of x2 being zero --
// confirm against the platform ABI.
+function ff_conv_flt_to_s16_neon, export=1
25
+        subs            x2,  x2,  #8  // len -= 8; Z is set when len was exactly 8
26
+        ld1             {v0.4s}, [x1],  #16  // load 4 floats, post-increment src
27
+        fcvtzs          v4.4s,  v0.4s,  #31  // float -> signed fixed-point, 31 fractional bits
28
+        ld1             {v1.4s}, [x1],  #16
29
+        fcvtzs          v5.4s,  v1.4s,  #31
30
+        b.eq            3f  // only 8 samples in total: narrow and store them
31
+        ands            x12, x2,  #~15  // x12 = remaining count rounded down to a multiple of 16
32
+        b.eq            2f  // fewer than 16 remain: finish with the final batch at 2
33
+1:      subs            x12, x12, #16  // main loop: 16 samples stored per iteration
34
+        sqrshrn         v4.4h,  v4.4s,  #16  // rounding, saturating narrow to 16 bit (low half of v4)
35
+        ld1             {v2.4s}, [x1],  #16
36
+        fcvtzs          v6.4s,  v2.4s,  #31
37
+        sqrshrn2        v4.8h,  v5.4s,  #16  // same narrowing into the high half of v4
38
+        ld1             {v3.4s}, [x1],  #16
39
+        fcvtzs          v7.4s,  v3.4s,  #31
40
+        sqrshrn         v6.4h,  v6.4s,  #16
41
+        st1             {v4.8h}, [x0],  #16  // store 8 converted samples, post-increment dst
42
+        sqrshrn2        v6.8h,  v7.4s,  #16
43
+        ld1             {v0.4s}, [x1],  #16  // preload and convert the first 8 samples of the next batch
44
+        fcvtzs          v4.4s,  v0.4s,  #31
45
+        ld1             {v1.4s}, [x1],  #16
46
+        fcvtzs          v5.4s,  v1.4s,  #31
47
+        st1             {v6.8h}, [x0],  #16
48
+        b.ne            1b  // flags still come from the subs at the top of the loop
49
+        ands            x2,  x2,  #15  // anything left beyond the 8 samples pending in v4/v5?
50
+        b.eq            3f
51
+2:      ld1             {v2.4s}, [x1],  #16  // final 16 samples: 8 pending in v4/v5 plus 8 newly loaded
52
+        sqrshrn         v4.4h,  v4.4s,  #16
53
+        fcvtzs          v6.4s,  v2.4s,  #31
54
+        ld1             {v3.4s}, [x1],  #16
55
+        sqrshrn2        v4.8h,  v5.4s,  #16
56
+        fcvtzs          v7.4s,  v3.4s,  #31
57
+        sqrshrn         v6.4h,  v6.4s,  #16
58
+        st1             {v4.8h}, [x0],  #16
59
+        sqrshrn2        v6.8h,  v7.4s,  #16
60
+        st1             {v6.8h}, [x0]
61
+        ret
62
+3:      sqrshrn         v4.4h,  v4.4s,  #16  // final 8 samples pending in v4/v5
63
+        sqrshrn2        v4.8h,  v5.4s,  #16
64
+        st1             {v4.8h}, [x0]
65
+        ret
66
+endfunc
67
+
68
// ff_conv_fltp_to_s16_2ch_neon(int16_t *dst, float *const *src,
//                              int len, int channels)
//   x0 = dst, x1 = src (array of plane pointers), x2 = len
// Two-plane float to interleaved 16-bit conversion. Only src[0] and src[1]
// are read; the channels argument (w3) is not referenced in this routine.
// Each float is converted to signed fixed-point with 31 fractional bits;
// `sri ..., #16` then places the top 16 bits of the first plane's value in
// the low half-word of each 32-bit lane while keeping the top 16 bits of the
// second plane's value in the high half-word. The 16-bit result is thus the
// upper half of the fixed-point value (no rounding step, unlike the sqrshrn
// used by ff_conv_flt_to_s16_neon).
// 8 frames are always loaded ahead of the stores.
// Assumes len is a non-zero multiple of 8 -- TODO confirm with the caller.
// NOTE(review): len is declared `int` in the C prototype but the full 64-bit
// x2 is used below -- confirm the upper 32 bits are guaranteed to be zero.
+function ff_conv_fltp_to_s16_2ch_neon, export=1
69
+        ldp             x4,  x5,  [x1]  // x4 = src[0], x5 = src[1]
70
+        subs            x2,  x2,  #8  // len -= 8; Z is set when len was exactly 8
71
+        ld1             {v0.4s},  [x4], #16  // preload 8 samples of the first plane
72
+        fcvtzs          v4.4s,  v0.4s,  #31
73
+        ld1             {v1.4s},  [x4], #16
74
+        fcvtzs          v5.4s,  v1.4s,  #31
75
+        ld1             {v2.4s},  [x5], #16  // preload 8 samples of the second plane
76
+        fcvtzs          v6.4s,  v2.4s,  #31
77
+        ld1             {v3.4s},  [x5], #16
78
+        fcvtzs          v7.4s,  v3.4s,  #31
79
+        b.eq            3f  // only 8 frames in total
80
+        ands            x12, x2,  #~15  // x12 = remaining count rounded down to a multiple of 16
81
+        b.eq            2f  // fewer than 16 remain: finish with the final batch at 2
82
+1:      subs            x12, x12, #16  // main loop: 16 frames stored per iteration
83
+        ld1             {v16.4s}, [x4], #16
84
+        fcvtzs          v20.4s, v16.4s, #31
85
+        sri             v6.4s,  v4.4s,  #16  // merge: low half-word from first plane, high half-word kept from second
86
+        ld1             {v17.4s}, [x4], #16
87
+        fcvtzs          v21.4s, v17.4s, #31
88
+        ld1             {v18.4s}, [x5], #16
89
+        fcvtzs          v22.4s, v18.4s, #31
90
+        ld1             {v19.4s}, [x5], #16
91
+        sri             v7.4s,  v5.4s,  #16
92
+        st1             {v6.4s},  [x0], #16  // store 4 interleaved frames
93
+        fcvtzs          v23.4s, v19.4s, #31
94
+        st1             {v7.4s},  [x0], #16
95
+        sri             v22.4s, v20.4s, #16
96
+        ld1             {v0.4s},  [x4], #16  // preload and convert the first 8 frames of the next batch
97
+        sri             v23.4s, v21.4s, #16
98
+        st1             {v22.4s}, [x0], #16
99
+        fcvtzs          v4.4s,  v0.4s,  #31
100
+        ld1             {v1.4s},  [x4], #16
101
+        fcvtzs          v5.4s,  v1.4s,  #31
102
+        ld1             {v2.4s},  [x5], #16
103
+        fcvtzs          v6.4s,  v2.4s,  #31
104
+        ld1             {v3.4s},  [x5], #16
105
+        fcvtzs          v7.4s,  v3.4s,  #31
106
+        st1             {v23.4s}, [x0], #16
107
+        b.ne            1b  // flags still come from the subs at the top of the loop
108
+        ands            x2,  x2,  #15  // anything left beyond the 8 frames pending in v4-v7?
109
+        b.eq            3f
110
+2:      sri             v6.4s,  v4.4s,  #16  // final 16 frames: 8 pending in v4-v7 plus 8 newly loaded
111
+        ld1             {v0.4s},  [x4], #16
112
+        fcvtzs          v0.4s,  v0.4s,  #31
113
+        ld1             {v1.4s},  [x4], #16
114
+        fcvtzs          v1.4s,  v1.4s,  #31
115
+        ld1             {v2.4s},  [x5], #16
116
+        fcvtzs          v2.4s,  v2.4s,  #31
117
+        sri             v7.4s,  v5.4s,  #16
118
+        ld1             {v3.4s},  [x5], #16
119
+        fcvtzs          v3.4s,  v3.4s,  #31
120
+        sri             v2.4s,  v0.4s,  #16
121
+        st1             {v6.4s,v7.4s},  [x0], #32
122
+        sri             v3.4s,  v1.4s,  #16
123
+        st1             {v2.4s,v3.4s},  [x0], #32
124
+        ret
125
+3:      sri             v6.4s,  v4.4s,  #16  // final 8 frames pending in v4-v7
126
+        sri             v7.4s,  v5.4s,  #16
127
+        st1             {v6.4s,v7.4s},  [x0]
128
+        ret
129
+endfunc
130
+
131
// ff_conv_fltp_to_s16_neon(int16_t *dst, float *const *src,
//                          int len, int channels)
//   x0 = dst, x1 = src (array of plane pointers), w2 = len, w3 = channels
// Planar float to interleaved 16-bit conversion for a run-time channel count.
//   - channels == 2 branches to ff_conv_fltp_to_s16_2ch_neon.
//   - channels  < 2 loads src[0] and branches to ff_conv_flt_to_s16_neon.
//   - otherwise the planes are consumed in groups of 4, then 2, then 1.
//     Each group is written into its columns of the interleaved output using
//     a per-frame stride of x12 = 2 * channels bytes; dst (x0) is advanced
//     past the columns already written before the next group starts.
// In the grouped paths each float is converted to signed fixed-point with
// 31 fractional bits and the upper 16 bits of that value are stored.
// Loads run ahead of stores, and branches frequently test flags set several
// instructions earlier (the ld1/st1/fcvtzs/sri/zip instructions in between
// leave the flags untouched), so statement order matters.
// Numeric labels are reused; `Nf`/`Nb` refer to the nearest following or
// preceding definition of label N.
// Assumes len is a non-zero multiple of 8 -- TODO confirm with the caller.
// NOTE(review): channels is declared `int` in the C prototype but
// `lsl x12, x3, #1` reads the full 64-bit x3; this relies on the upper
// 32 bits of x3 being zero -- confirm against the platform ABI.
+function ff_conv_fltp_to_s16_neon, export=1
131
+        cmp             w3,  #2
132
+        b.eq            X(ff_conv_fltp_to_s16_2ch_neon)  // exactly two planes: use the dedicated routine
133
+        b.gt            1f
134
+        ldr             x1,  [x1]  // fewer than two planes: pass src[0] as a plain float buffer
135
+        b               X(ff_conv_flt_to_s16_neon)
136
+1:
137
+        cmp             w3,  #4
138
+        lsl             x12, x3,  #1  // x12 = channels * 2 = bytes per interleaved output frame
139
+        b.lt            4f  // fewer than 4 planes (flags from the cmp above): go to the 2-plane group
140
+
141
+5:      // 4 channels
142
+        ldp             x4, x5, [x1], #16  // fetch the next four plane pointers, advancing src
143
+        ldp             x6, x7, [x1], #16
144
+        mov             w9,  w2  // w9 = frames left for this group
145
+        mov             x8,  x0  // x8 = output cursor for this group
146
+        ld1             {v4.4s},        [x4], #16  // preload 4 samples from each of the four planes
147
+        fcvtzs          v4.4s,  v4.4s,  #31
148
+        ld1             {v5.4s},        [x5], #16
149
+        fcvtzs          v5.4s,  v5.4s,  #31
150
+        ld1             {v6.4s},        [x6], #16
151
+        fcvtzs          v6.4s, v6.4s, #31
152
+        ld1             {v7.4s},        [x7], #16
153
+        fcvtzs          v7.4s, v7.4s, #31
154
+6:
155
+        subs            w9,  w9,  #8  // 8 frames per iteration; Z is tested by the b.eq 7f below
156
+        ld1             {v0.4s},        [x4], #16
157
+        fcvtzs          v0.4s,  v0.4s,  #31
158
+        sri             v5.4s,  v4.4s,  #16  // v5 lanes = 16-bit pairs (plane 0, plane 1)
159
+        ld1             {v1.4s},        [x5], #16
160
+        fcvtzs          v1.4s,  v1.4s,  #31
161
+        sri             v7.4s,  v6.4s,  #16  // v7 lanes = 16-bit pairs (plane 2, plane 3)
162
+        ld1             {v2.4s},        [x6], #16
163
+        fcvtzs          v2.4s,  v2.4s,  #31
164
+        zip1            v16.4s, v5.4s,  v7.4s  // v16 = frames 0 and 1, four 16-bit samples each
165
+        ld1             {v3.4s},        [x7], #16
166
+        fcvtzs          v3.4s,  v3.4s,  #31
167
+        zip2            v17.4s, v5.4s,  v7.4s  // v17 = frames 2 and 3
168
+        st1             {v16.d}[0],     [x8], x12  // store 8 bytes (one frame of this group), step to the next frame
169
+        sri             v1.4s,  v0.4s,  #16
170
+        st1             {v16.d}[1],     [x8], x12
171
+        sri             v3.4s,  v2.4s,  #16
172
+        st1             {v17.d}[0],     [x8], x12
173
+        zip1            v18.4s, v1.4s,  v3.4s
174
+        st1             {v17.d}[1],     [x8], x12
175
+        zip2            v19.4s, v1.4s,  v3.4s
176
+        b.eq            7f  // no frames left after this batch: skip the preload
177
+        ld1             {v4.4s},        [x4], #16  // preload the next batch while storing frames 4-7
178
+        fcvtzs          v4.4s,  v4.4s,  #31
179
+        st1             {v18.d}[0],     [x8], x12
180
+        ld1             {v5.4s},        [x5], #16
181
+        fcvtzs          v5.4s,  v5.4s,  #31
182
+        st1             {v18.d}[1],     [x8], x12
183
+        ld1             {v6.4s},    [x6], #16
184
+        fcvtzs          v6.4s, v6.4s, #31
185
+        st1             {v19.d}[0],     [x8], x12
186
+        ld1             {v7.4s},    [x7], #16
187
+        fcvtzs          v7.4s, v7.4s, #31
188
+        st1             {v19.d}[1],     [x8], x12
189
+        b               6b
190
+7:
191
+        st1             {v18.d}[0],     [x8], x12  // last four frames of this group
192
+        st1             {v18.d}[1],     [x8], x12
193
+        st1             {v19.d}[0],     [x8], x12
194
+        st1             {v19.d}[1],     [x8], x12
195
+        subs            w3,  w3,  #4  // four planes consumed
196
+        b.eq            end
197
+        cmp             w3,  #4
198
+        add             x0,  x0,  #8  // move dst past the four 16-bit columns just written
199
+        b.ge            5b  // at least four planes remain: another 4-plane pass
200
+
201
+4:      // 2 channels
202
+        cmp             w3,  #2
203
+        b.lt            4f  // only one plane remains: go to the 1-plane group
204
+        ldp             x4,  x5,  [x1], #16  // fetch the next two plane pointers, advancing src
205
+        mov             w9,  w2  // w9 = frames left for this group
206
+        mov             x8,  x0  // x8 = output cursor for this group
207
+        tst             w9,  #8  // is bit 3 of len set? tested by the b.eq 6f below
208
+        ld1             {v4.4s},        [x4], #16  // preload 8 samples from each of the two planes
209
+        fcvtzs          v4.4s,  v4.4s,  #31
210
+        ld1             {v5.4s},        [x5], #16
211
+        fcvtzs          v5.4s,  v5.4s,  #31
212
+        ld1             {v6.4s},        [x4], #16
213
+        fcvtzs          v6.4s,  v6.4s,  #31
214
+        ld1             {v7.4s},        [x5], #16
215
+        fcvtzs          v7.4s,  v7.4s,  #31
216
+        b.eq            6f  // bit 3 clear: go straight to the 16-frame loop
217
+        subs            w9,  w9,  #8  // bit 3 set: handle one batch of 8 frames first
218
+        b.eq            7f  // len was exactly 8
219
+        sri             v5.4s,  v4.4s,  #16  // v5 lanes = 16-bit pairs (first plane, second plane)
220
+        ld1             {v4.4s},        [x4], #16
221
+        fcvtzs          v4.4s,  v4.4s,  #31
222
+        st1             {v5.s}[0],      [x8], x12  // store 4 bytes (one frame of this group), step to the next frame
223
+        sri             v7.4s,  v6.4s,  #16
224
+        st1             {v5.s}[1],      [x8], x12
225
+        ld1             {v6.4s},        [x4], #16
226
+        fcvtzs          v6.4s,  v6.4s, #31
227
+        st1             {v5.s}[2],      [x8], x12
228
+        st1             {v5.s}[3],      [x8], x12
229
+        st1             {v7.s}[0],      [x8], x12
230
+        st1             {v7.s}[1],      [x8], x12
231
+        ld1             {v5.4s},        [x5], #16
232
+        fcvtzs          v5.4s,  v5.4s,  #31
233
+        st1             {v7.s}[2],      [x8], x12
234
+        st1             {v7.s}[3],      [x8], x12
235
+        ld1             {v7.4s},        [x5], #16
236
+        fcvtzs          v7.4s,  v7.4s,  #31
237
+6:
238
+        subs            w9,  w9,  #16  // 16 frames per iteration; flags tested by b.eq 6f and b.gt 6b below
239
+        ld1             {v0.4s},        [x4], #16
240
+        sri             v5.4s,  v4.4s,  #16
241
+        fcvtzs          v0.4s,  v0.4s,  #31
242
+        ld1             {v1.4s},        [x5], #16
243
+        sri             v7.4s,  v6.4s,  #16
244
+        st1             {v5.s}[0],      [x8], x12
245
+        st1             {v5.s}[1],      [x8], x12
246
+        fcvtzs          v1.4s,  v1.4s,  #31
247
+        st1             {v5.s}[2],      [x8], x12
248
+        st1             {v5.s}[3],      [x8], x12
249
+        ld1             {v2.4s},        [x4], #16
250
+        st1             {v7.s}[0],      [x8], x12
251
+        fcvtzs          v2.4s,  v2.4s,  #31
252
+        st1             {v7.s}[1],      [x8], x12
253
+        ld1             {v3.4s},        [x5], #16
254
+        st1             {v7.s}[2],      [x8], x12
255
+        fcvtzs          v3.4s,  v3.4s,  #31
256
+        st1             {v7.s}[3],      [x8], x12
257
+        sri             v1.4s,  v0.4s,  #16
258
+        sri             v3.4s,  v2.4s,  #16
259
+        b.eq            6f  // no frames left after this batch: store the second half without preloading
260
+        ld1             {v4.4s},        [x4], #16  // preload the next batch while storing frames 8-15
261
+        st1             {v1.s}[0],      [x8], x12
262
+        fcvtzs          v4.4s,  v4.4s,  #31
263
+        st1             {v1.s}[1],      [x8], x12
264
+        ld1             {v5.4s},        [x5], #16
265
+        st1             {v1.s}[2],      [x8], x12
266
+        fcvtzs          v5.4s,  v5.4s,  #31
267
+        st1             {v1.s}[3],      [x8], x12
268
+        ld1             {v6.4s},        [x4], #16
269
+        st1             {v3.s}[0],      [x8], x12
270
+        fcvtzs          v6.4s,  v6.4s,  #31
271
+        st1             {v3.s}[1],      [x8], x12
272
+        ld1             {v7.4s},        [x5], #16
273
+        st1             {v3.s}[2],      [x8], x12
274
+        fcvtzs          v7.4s,  v7.4s,  #31
275
+        st1             {v3.s}[3],      [x8], x12
276
+        b.gt            6b  // flags still come from the subs at the top of the loop
277
+6:
278
+        st1             {v1.s}[0],      [x8], x12  // last eight frames of this group (pending in v1/v3)
279
+        st1             {v1.s}[1],      [x8], x12
280
+        st1             {v1.s}[2],      [x8], x12
281
+        st1             {v1.s}[3],      [x8], x12
282
+        st1             {v3.s}[0],      [x8], x12
283
+        st1             {v3.s}[1],      [x8], x12
284
+        st1             {v3.s}[2],      [x8], x12
285
+        st1             {v3.s}[3],      [x8], x12
286
+        b               8f
287
+7:
288
+        sri             v5.4s,  v4.4s,  #16  // len == 8: store the eight preloaded frames
289
+        sri             v7.4s,  v6.4s,  #16
290
+        st1             {v5.s}[0],      [x8], x12
291
+        st1             {v5.s}[1],      [x8], x12
292
+        st1             {v5.s}[2],      [x8], x12
293
+        st1             {v5.s}[3],      [x8], x12
294
+        st1             {v7.s}[0],      [x8], x12
295
+        st1             {v7.s}[1],      [x8], x12
296
+        st1             {v7.s}[2],      [x8], x12
297
+        st1             {v7.s}[3],      [x8], x12
298
+8:
299
+        subs            w3,  w3,  #2  // two planes consumed
300
+        add             x0,  x0,  #4  // move dst past the two 16-bit columns just written
301
+        b.eq            end  // flags from the subs above (add does not set flags)
302
+
303
+4:      // 1 channel
304
+        ldr             x4,  [x1]  // x4 = pointer to the last remaining plane
305
+        tst             w2,  #8  // is bit 3 of len set? tested by the b.ne 8f below
306
+        mov             w9,  w2  // w9 = frames left for this plane
307
+        mov             x5,  x0  // x5 = output cursor for this plane
308
+        ld1             {v0.4s},        [x4], #16  // preload 8 samples
309
+        fcvtzs          v0.4s,  v0.4s,  #31
310
+        ld1             {v1.4s},        [x4], #16
311
+        fcvtzs          v1.4s,  v1.4s,  #31
312
+        b.ne            8f  // bit 3 set: handle one batch of 8 frames first
313
+6:
314
+        subs            w9,  w9,  #16  // 16 frames per iteration; flags tested by b.eq 7f and b.gt 6b below
315
+        ld1             {v2.4s},        [x4], #16
316
+        fcvtzs          v2.4s,  v2.4s,  #31
317
+        ld1             {v3.4s},        [x4], #16
318
+        fcvtzs          v3.4s,  v3.4s,  #31
319
+        st1             {v0.h}[1],      [x5], x12  // odd half-word lanes = upper 16 bits of each 32-bit value
320
+        st1             {v0.h}[3],      [x5], x12
321
+        st1             {v0.h}[5],      [x5], x12
322
+        st1             {v0.h}[7],      [x5], x12
323
+        st1             {v1.h}[1],      [x5], x12
324
+        st1             {v1.h}[3],      [x5], x12
325
+        st1             {v1.h}[5],      [x5], x12
326
+        st1             {v1.h}[7],      [x5], x12
327
+        b.eq            7f  // no frames left after this batch: skip the preload
328
+        ld1             {v0.4s},        [x4], #16
329
+        fcvtzs          v0.4s,  v0.4s,  #31
330
+        ld1             {v1.4s},        [x4], #16
331
+        fcvtzs          v1.4s,  v1.4s,  #31
332
+7:
333
+        st1             {v2.h}[1],      [x5], x12
334
+        st1             {v2.h}[3],      [x5], x12
335
+        st1             {v2.h}[5],      [x5], x12
336
+        st1             {v2.h}[7],      [x5], x12
337
+        st1             {v3.h}[1],      [x5], x12
338
+        st1             {v3.h}[3],      [x5], x12
339
+        st1             {v3.h}[5],      [x5], x12
340
+        st1             {v3.h}[7],      [x5], x12
341
+        b.gt            6b  // flags still come from the subs at the top of the loop
342
+        ret
343
+8:
344
+        subs            w9,  w9,  #8  // single batch of 8 frames taken when bit 3 of len is set
345
+        st1             {v0.h}[1],      [x5], x12
346
+        st1             {v0.h}[3],      [x5], x12
347
+        st1             {v0.h}[5],      [x5], x12
348
+        st1             {v0.h}[7],      [x5], x12
349
+        st1             {v1.h}[1],      [x5], x12
350
+        st1             {v1.h}[3],      [x5], x12
351
+        st1             {v1.h}[5],      [x5], x12
352
+        st1             {v1.h}[7],      [x5], x12
353
+        b.eq            end  // len was exactly 8
354
+        ld1             {v0.4s},        [x4], #16  // preload 8 samples and continue with the 16-frame loop
355
+        fcvtzs          v0.4s,  v0.4s,  #31
356
+        ld1             {v1.4s},        [x4], #16
357
+        fcvtzs          v1.4s,  v1.4s,  #31
358
+        b               6b
359
+end:
360
+        ret
361
+endfunc
... ...
@@ -301,6 +301,8 @@ AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr,
301 301
 
302 302
     set_generic_function(ac);
303 303
 
304
+    if (ARCH_AARCH64)
305
+        ff_audio_convert_init_aarch64(ac);
304 306
     if (ARCH_ARM)
305 307
         ff_audio_convert_init_arm(ac);
306 308
     if (ARCH_X86)
... ...
@@ -96,6 +96,7 @@ int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in);
96 96
 
97 97
 /* arch-specific initialization functions */
98 98
 
99
+void ff_audio_convert_init_aarch64(AudioConvert *ac);
99 100
 void ff_audio_convert_init_arm(AudioConvert *ac);
100 101
 void ff_audio_convert_init_x86(AudioConvert *ac);
101 102