58893 decicycles in deemphasis_c, 130548 runs, 524 skips
9475 decicycles in deemphasis_fma3, 130686 runs, 386 skips -> 6.21x speedup
24866 decicycles in postfilter_c, 65386 runs, 150 skips
5268 decicycles in postfilter_fma3, 65505 runs, 31 skips -> 4.72x speedup
Total decoder speedup: ~14%
The deemphasis SIMD is based on the following 4x unrolling of the scalar recurrence y[n] = x[n] + c1*y[n-1]:
const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
float state = coeff;
for (int i = 0; i < len; i += 4) {
y[0] = x[0] + c1*state;
y[1] = x[1] + c2*state + c1*x[0];
y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];
state = y[3];
y += 4;
x += 4;
}
... | ... |
@@ -53,6 +53,7 @@ OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o |
53 | 53 |
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o |
54 | 54 |
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o |
55 | 55 |
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o |
56 |
+OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o |
|
56 | 57 |
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o |
57 | 58 |
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o |
58 | 59 |
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o |
... | ... |
@@ -126,6 +127,7 @@ X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o |
126 | 126 |
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o |
127 | 127 |
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o |
128 | 128 |
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o |
129 |
+X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o |
|
129 | 130 |
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o |
130 | 131 |
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o |
131 | 132 |
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ |
132 | 133 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,114 @@ |
0 |
+;****************************************************************************** |
|
1 |
+;* Opus SIMD functions |
|
2 |
+;* |
|
3 |
+;* This file is part of FFmpeg. |
|
4 |
+;* |
|
5 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
6 |
+;* modify it under the terms of the GNU Lesser General Public |
|
7 |
+;* License as published by the Free Software Foundation; either |
|
8 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
9 |
+;* |
|
10 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+;* Lesser General Public License for more details. |
|
14 |
+;* |
|
15 |
+;* You should have received a copy of the GNU Lesser General Public |
|
16 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
17 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+;****************************************************************************** |
|
19 |
+ |
|
20 |
+%include "libavutil/x86/x86util.asm" |
|
21 |
+ |
|
22 |
+SECTION_RODATA |
|
23 |
+ |
|
24 |
; Single-precision powers of the deemphasis coefficient c = 0x3f599a00 (~0.85):
;   c^1 = 0x3f599a00, c^2 = 0x3f38f671, c^3 = 0x3f1d382a, c^4 = 0x3f05a32f
;
; Each table is one 16-byte vector, lane 0 first.
;   tab_st: weights applied to the broadcast previous output (state)
;   tab_x0: weights applied to the input shifted up by one lane
;   tab_x1: weights applied to the input shifted up by two lanes
;   tab_x2: weights applied to the input shifted up by three lanes
tab_st: dd 0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f
tab_x0: dd 0x00000000, 0x3f599a00, 0x3f599a00, 0x3f599a00
tab_x1: dd 0x00000000, 0x00000000, 0x3f38f671, 0x3f38f671
tab_x2: dd 0x00000000, 0x00000000, 0x00000000, 0x3f1d382a
|
29 |
+ |
|
30 |
+SECTION .text |
|
31 |
+ |
|
32 |
;------------------------------------------------------------------------------
; float ff_opus_deemphasis_fma3(float *out, float *in, float coeff, int len)
;
; Applies the recurrence y[n] = x[n] + c*y[n-1] four samples at a time, where
; the powers of c come from tab_st/tab_x0/tab_x1/tab_x2 and the initial y[-1]
; is the `coeff` argument. The last output sample of the final iteration is
; returned so the caller can pass it back in as the next call's state.
;
; in and out are accessed with movaps, so both must be 16-byte aligned.
; The loop body runs before len is tested and consumes 4 floats per pass;
; presumably callers always pass a positive multiple of 4 - confirm.
;------------------------------------------------------------------------------
INIT_XMM fma3
%if UNIX64
cglobal opus_deemphasis, 3, 3, 8, out, in, len
%else
cglobal opus_deemphasis, 4, 4, 8, out, in, coeff, len
%endif
    ; Broadcast the incoming state into every lane of m0.
%if ARCH_X86_32
    VBROADCASTSS m0, coeffm          ; state is passed on the stack
%elif WIN64
    ; The third argument arrives in xmm2. Broadcast it into m0 rather than
    ; renaming registers with SWAP, so that m0 is still xmm0 (the float
    ; return register) when the function returns.
    shufps       m0, m2, m2, 0
%else
    shufps       m0, m0, 0           ; state already lives in xmm0
%endif

    movaps       m4, [tab_st]
    movaps       m5, [tab_x0]
    movaps       m6, [tab_x1]
    movaps       m7, [tab_x2]

.loop:
    movaps       m1, [inq]           ; x0, x1, x2, x3

    pslldq       m2, m1, 4           ; 0,  x0, x1, x2
    pslldq       m3, m1, 8           ; 0,  0,  x0, x1

    fmaddps      m2, m2, m5, m1      ; x + c1*x[0-2]
    pslldq       m1, 12              ; 0,  0,  0,  x0

    fmaddps      m2, m3, m6, m2      ; x + c1*x[0-2] + c2*x[0-1]
    fmaddps      m1, m1, m7, m2      ; x + c1*x[0-2] + c2*x[0-1] + c3*x[0]
    fmaddps      m0, m0, m4, m1      ; ... + c[1-4]*state

    movaps       [outq], m0
    shufps       m0, m0, q3333       ; new state = last output, all lanes

    add          inq,  mmsize
    add          outq, mmsize
    ; len is a 32-bit int; the upper half of its 64-bit register is not
    ; guaranteed to be zero, so count down in the 32-bit register.
    sub          lend, mmsize >> 2
    jg .loop

%if ARCH_X86_64 == 0
    ; 32-bit ABIs return floats on the x87 stack: bounce through memory.
    movss        r0m, m0
    fld          dword r0m
%endif
    RET
|
78 |
+ |
|
79 |
+ |
|
80 |
;------------------------------------------------------------------------------
; void ff_opus_postfilter_fma3(float *data, int period, float *gains, int len)
;
; In-place 5-tap filter centred `period` samples in the past:
;   data[i] += g0 *  data[i - period]
;            + g1 * (data[i - period - 1] + data[i - period + 1])
;            + g2 * (data[i - period - 2] + data[i - period + 2])
; with g0..g2 read from gains[0..2]. Four samples are produced per pass.
;
; data is stored with movaps and must be 16-byte aligned; the history taps
; are read with movups. The loop body runs before len is tested; presumably
; callers pass a positive multiple of 4 - confirm. period is assumed to be
; non-negative.
;------------------------------------------------------------------------------
INIT_XMM fma3
cglobal opus_postfilter, 4, 4, 8, data, period, gains, len
    VBROADCASTSS m0, [gainsq + 0]    ; g0
    VBROADCASTSS m1, [gainsq + 4]    ; g1
    VBROADCASTSS m2, [gainsq + 8]    ; g2

    ; periodq = -(period*4 + 8): byte offset of data[-period - 2].
    ; period is a 32-bit int, so scale it in the 32-bit register (which
    ; zero-extends into periodq on x86-64) instead of trusting the upper
    ; half of the 64-bit argument register.
    shl          periodd, 2
    add          periodq, 8
    neg          periodq

    movups       m3, [dataq + periodq]        ; x[-period-2 ..]
    mulps        m3, m2                       ; g2 * oldest tap

.loop:
    movups       m4, [dataq + periodq +  4]   ; x[-period-1 ..]
    movups       m5, [dataq + periodq +  8]   ; x[-period   ..]
    movups       m6, [dataq + periodq + 12]   ; x[-period+1 ..]
    movups       m7, [dataq + periodq + 16]   ; x[-period+2 ..]

    fmaddps      m3, m7, m2, m3               ; g2 * (outer taps)
    addps        m6, m4                       ; sum of inner taps

    fmaddps      m5, m5, m0, [dataq]          ; data + g0 * centre tap
    fmaddps      m6, m6, m1, m3               ; g1 * inner + g2 * outer

    addps        m5, m6
    ; The newest tap vector of this pass is the oldest one of the next
    ; pass (data advances by 4 floats), so pre-scale it here.
    mulps        m3, m7, m2

    movaps       [dataq], m5

    add          dataq, mmsize
    ; len is a 32-bit int as well: count down in the 32-bit register.
    sub          lend, mmsize >> 2
    jg .loop

    RET
0 | 114 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,35 @@ |
0 |
+/* |
|
1 |
+ * This file is part of FFmpeg. |
|
2 |
+ * |
|
3 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
4 |
+ * modify it under the terms of the GNU Lesser General Public |
|
5 |
+ * License as published by the Free Software Foundation; either |
|
6 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
11 |
+ * Lesser General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU Lesser General Public |
|
14 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#include "config.h" |
|
19 |
+ |
|
20 |
+#include "libavutil/x86/cpu.h" |
|
21 |
+#include "libavcodec/opusdsp.h" |
|
22 |
+ |
|
23 |
+void ff_opus_postfilter_fma3(float *data, int period, float *gains, int len); |
|
24 |
+float ff_opus_deemphasis_fma3(float *out, float *in, float coeff, int len); |
|
25 |
+ |
|
26 |
+av_cold void ff_opus_dsp_init_x86(OpusDSP *ctx) |
|
27 |
+{ |
|
28 |
+ int cpu_flags = av_get_cpu_flags(); |
|
29 |
+ |
|
30 |
+ if (EXTERNAL_FMA3_FAST(cpu_flags)) { |
|
31 |
+ ctx->postfilter = ff_opus_postfilter_fma3; |
|
32 |
+ ctx->deemphasis = ff_opus_deemphasis_fma3; |
|
33 |
+ } |
|
34 |
+} |