Between 1.5 and 2.5 times faster
Reviewed-by: Michael Niedermayer <michael@niedermayer.cc>
Signed-off-by: James Almer <jamrial@gmail.com>
... | ... |
@@ -936,8 +936,8 @@ static void stereo_processing(PSContext *ps, INTFLOAT (*l)[32][2], INTFLOAT (*r) |
936 | 936 |
H22[0][e+1][b] = h22; |
937 | 937 |
} |
938 | 938 |
for (k = 0; k < NR_BANDS[is34]; k++) { |
939 |
- INTFLOAT h[2][4]; |
|
940 |
- INTFLOAT h_step[2][4]; |
|
939 |
+ LOCAL_ALIGNED_16(INTFLOAT, h, [2], [4]); |
|
940 |
+ LOCAL_ALIGNED_16(INTFLOAT, h_step, [2], [4]); |
|
941 | 941 |
int start = ps->border_position[e]; |
942 | 942 |
int stop = ps->border_position[e+1]; |
943 | 943 |
INTFLOAT width = Q30(1.f) / (stop - start); |
... | ... |
@@ -38,7 +38,8 @@ OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o |
38 | 38 |
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o |
39 | 39 |
|
40 | 40 |
# decoders/encoders |
41 |
-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o |
|
41 |
+OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \ |
|
42 |
+ x86/sbrdsp_init.o |
|
42 | 43 |
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o |
43 | 44 |
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o |
44 | 45 |
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o |
... | ... |
@@ -130,7 +131,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \ |
130 | 130 |
x86/vp8dsp_loopfilter.o |
131 | 131 |
|
132 | 132 |
# decoders/encoders |
133 |
-YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o |
|
133 |
+YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \ |
|
134 |
+ x86/sbrdsp.o |
|
134 | 135 |
YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o |
135 | 136 |
YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o |
136 | 137 |
YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o |
137 | 138 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,215 @@ |
0 |
+;****************************************************************************** |
|
1 |
+;* SIMD optimized MPEG-4 Parametric Stereo decoding functions |
|
2 |
+;* |
|
3 |
+;* Copyright (C) 2015 James Almer |
|
4 |
+;* |
|
5 |
+;* This file is part of FFmpeg. |
|
6 |
+;* |
|
7 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
8 |
+;* modify it under the terms of the GNU Lesser General Public |
|
9 |
+;* License as published by the Free Software Foundation; either |
|
10 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
11 |
+;* |
|
12 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
13 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
+;* Lesser General Public License for more details. |
|
16 |
+;* |
|
17 |
+;* You should have received a copy of the GNU Lesser General Public |
|
18 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
19 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
20 |
+;****************************************************************************** |
|
21 |
+ |
|
22 |
+%include "libavutil/x86/x86util.asm" |
|
23 |
+ |
|
24 |
+SECTION_RODATA |
|
25 |
+ |
|
26 |
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 |
|
27 |
+ |
|
28 |
+SECTION_TEXT |
|
29 |
+ |
|
30 |
+;************************************************************************* |
|
31 |
+;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n); |
|
32 |
+;************************************************************************* |
|
33 |
+; Emits ps_add_squares: each pass squares 2*mmsize bytes of float pairs from src, sums every pair, |
+; and adds the resulting mmsize/4 floats onto dst. %1 is the XMM register count handed to cglobal. |
+%macro PS_ADD_SQUARES 1 |
|
34 |
+cglobal ps_add_squares, 3, 3, %1, dst, src, n |
|
35 |
+.loop: ; n is not tested before the first pass, so the body always runs at least once |
|
36 |
+ movaps m0, [srcq] ; aligned load: first half of this pass's pairs |
|
37 |
+ movaps m1, [srcq+mmsize] ; aligned load: second half |
|
38 |
+ mulps m0, m0 ; square every element |
|
39 |
+ mulps m1, m1 |
|
40 |
+%if cpuflag(sse3) |
|
41 |
+ haddps m0, m1 ; adjacent-lane sums: low half from m0, high half from m1 |
|
42 |
+%else |
|
43 |
+ movaps m3, m0 ; emulate haddps with shuffles; m3/m4 are scratch |
|
44 |
+ movaps m4, m1 |
|
45 |
+ shufps m3, m3, q0301 ; lanes 0 and 2 receive the other element of their pair |
|
46 |
+ shufps m4, m4, q0301 |
|
47 |
+ addps m0, m3 ; lanes 0 and 2 now hold the per-pair sums |
|
48 |
+ addps m1, m4 |
|
49 |
+ shufps m0, m1, q2020 ; gather lanes 0/2 of m0 and lanes 0/2 of m1 into one vector |
|
50 |
+%endif |
|
51 |
+ addps m0, [dstq] ; accumulate onto the existing dst contents (aligned access) |
|
52 |
+ movaps [dstq], m0 |
|
53 |
+ add dstq, mmsize |
|
54 |
+ add srcq, mmsize*2 |
|
55 |
+ sub nd, mmsize/4 ; mmsize/4 output floats produced per pass |
|
56 |
+ jg .loop ; repeat while n > 0 |
|
57 |
+ REP_RET |
|
58 |
+%endmacro |
|
59 |
+ |
|
60 |
+INIT_XMM sse |
|
61 |
+PS_ADD_SQUARES 5 ; shuffle fallback touches m0, m1, m3 and m4, so registers up to m4 must be declared |
|
62 |
+INIT_XMM sse3 |
|
63 |
+PS_ADD_SQUARES 2 ; haddps path only touches m0 and m1 |
|
64 |
+ |
|
65 |
+;******************************************************************* |
|
66 |
+;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2], |
|
67 |
+; float *src1, int n); |
|
68 |
+;******************************************************************* |
|
69 |
+; ps_mul_pair_single: multiplies both floats of every src1 pair by the matching single float from src2 and stores to dst. |
+; cglobal's src1/src2 are the second and third arguments (named src0/src1 in the C prototype comment). |
+INIT_XMM sse |
|
70 |
+cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n |
|
71 |
+ xor r4q, r4q ; r4 = byte offset shared by src1 and dst |
|
72 |
+ |
|
73 |
+.loop: ; n is not tested before the first pass |
|
74 |
+ movu m0, [src1q+r4q] ; src1 is read with unaligned loads |
|
75 |
+ movu m1, [src1q+r4q+mmsize] |
|
76 |
+ mova m2, [src2q] ; aligned load of mmsize/4 scale factors |
|
77 |
+ mova m3, m2 |
|
78 |
+ unpcklps m2, m2 ; [s0 s0 s1 s1] |
|
79 |
+ unpckhps m3, m3 ; [s2 s2 s3 s3] |
|
80 |
+ mulps m0, m2 |
|
81 |
+ mulps m1, m3 |
|
82 |
+ mova [dstq+r4q], m0 ; dst is written with aligned stores |
|
83 |
+ mova [dstq+r4q+mmsize], m1 |
|
84 |
+ add src2q, mmsize |
|
85 |
+ add r4q, mmsize*2 |
|
86 |
+ sub nd, mmsize/4 ; mmsize/4 pairs handled per pass |
|
87 |
+ jg .loop ; repeat while n > 0 |
|
88 |
+ REP_RET |
|
89 |
+ |
|
90 |
+;*********************************************************************** |
|
91 |
+;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], |
|
92 |
+; float h[2][4], float h_step[2][4], |
|
93 |
+; int len); |
|
94 |
+;*********************************************************************** |
|
95 |
+; ps_stereo_interpolate: for each of n float pairs, first steps h by h_step, then writes |
+; l = h0*l + h2*r and r = h1*l + h3*r (old l/r on the right), h0..h3 being the four lanes loaded from h. |
+INIT_XMM sse3 |
|
96 |
+cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n |
|
97 |
+ movaps m0, [hq] ; m0 = first mmsize bytes at h (aligned load) |
|
98 |
+ movaps m1, [h_stepq] ; m1 = per-iteration increment (aligned load) |
|
99 |
+ cmp nd, 0 |
|
100 |
+ jle .ret ; nothing to do for n <= 0 |
|
101 |
+ shl nd, 3 ; n *= 8, the byte size of one float pair |
|
102 |
+ add lq, nq ; point l and r just past the last pair ... |
|
103 |
+ add rq, nq |
|
104 |
+ neg nq ; ... and index with a negative offset that counts up to 0 |
|
105 |
+ |
|
106 |
+align 16 |
|
107 |
+.loop: |
|
108 |
+ addps m0, m1 ; h += h_step, applied before the coefficients are used |
|
109 |
+ movddup m2, [lq+nq] ; [l0 l1 l0 l1] |
|
110 |
+ movddup m3, [rq+nq] ; [r0 r1 r0 r1] |
|
111 |
+ movaps m4, m0 |
|
112 |
+ movaps m5, m0 |
|
113 |
+ unpcklps m4, m4 ; [h0 h0 h1 h1] |
|
114 |
+ unpckhps m5, m5 ; [h2 h2 h3 h3] |
|
115 |
+ mulps m2, m4 |
|
116 |
+ mulps m3, m5 |
|
117 |
+ addps m2, m3 ; low half = new l pair, high half = new r pair |
|
118 |
+ movsd [lq+nq], m2 ; store the low 8 bytes |
|
119 |
+ movhps [rq+nq], m2 ; store the high 8 bytes |
|
120 |
+ add nq, 8 |
|
121 |
+ jl .loop ; until the offset reaches 0 |
|
122 |
+.ret: |
|
123 |
+ REP_RET |
|
124 |
+ |
|
125 |
+;******************************************************************* |
|
126 |
+;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2], |
|
127 |
+; const float (*filter)[8][2], |
|
128 |
+; int stride, int n); |
|
129 |
+;******************************************************************* |
|
130 |
+; One step of the hybrid analysis sum: combines the vector at in+mmsize*%3 with the one at in+mmsize*(5-%3)+8 |
+; (taken in reversed order) and multiplies by filter vector %3. %1/%2 receive the two products; for %3 != 0 they are |
+; added into m0/m3. Clobbers m1, m2, m4; the non-SSE3 path expects m7 to hold sign bits in lanes 1 and 3. |
+%macro PS_HYBRID_ANALYSIS_LOOP 3 |
|
131 |
+ movu %1, [inq+mmsize*%3] ; unaligned load of input vector %3 |
|
132 |
+ movu m1, [inq+mmsize*(5-%3)+8] ; unaligned load of the counterpart vector counted from the other end |
|
133 |
+%if cpuflag(sse3) |
|
134 |
+ pshufd %2, %1, q2301 ; %2 = %1 with the two floats of each pair swapped |
|
135 |
+ pshufd m4, m1, q0123 ; m4 = m1 with all four lanes reversed |
|
136 |
+ pshufd m1, m1, q1032 ; m1 = pairs in reverse order, floats inside a pair kept |
|
137 |
+ pshufd m2, [filterq+nq+mmsize*%3], q2301 ; filter coefficients, swapped within each pair |
|
138 |
+ addsubps %2, m4 ; even lanes subtract, odd lanes add |
|
139 |
+ addsubps %1, m1 |
|
140 |
+%else |
|
141 |
+ mova m2, [filterq+nq+mmsize*%3] |
|
142 |
+ mova %2, %1 |
|
143 |
+ mova m4, m1 |
|
144 |
+ shufps %2, %2, q2301 ; same shuffles as the SSE3 path, done in place |
|
145 |
+ shufps m4, m4, q0123 |
|
146 |
+ shufps m1, m1, q1032 |
|
147 |
+ shufps m2, m2, q2301 |
|
148 |
+ xorps m4, m7 ; negate odd lanes so the subps below adds there (addsubps emulation) |
|
149 |
+ xorps m1, m7 |
|
150 |
+ subps %2, m4 |
|
151 |
+ subps %1, m1 |
|
152 |
+%endif |
|
153 |
+ mulps %2, m2 |
|
154 |
+ mulps %1, m2 |
|
155 |
+%if %3 |
|
156 |
+ addps m3, %2 ; only for %3 != 0; the %3 == 0 invocation passes m0/m3 as %1/%2 and so initialises them |
|
157 |
+ addps m0, %1 |
|
158 |
+%endif |
|
159 |
+%endmacro |
|
160 |
+ |
|
161 |
+; Emits ps_hybrid_analysis: for each of n 64-byte filters, reduces the input at in against that filter to one |
+; float pair, stores it at out, then advances out by stride float pairs. The input pointer is not advanced. |
+%macro PS_HYBRID_ANALYSIS 0 |
|
162 |
+cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n |
|
163 |
+%if cpuflag(sse3) |
|
164 |
+%define MOVH movsd |
|
165 |
+%else |
|
166 |
+%define MOVH movlps |
|
167 |
+%endif |
|
168 |
+ shl strided, 3 ; stride in float pairs -> bytes |
|
169 |
+ shl nd, 6 ; 64 bytes per filter |
|
170 |
+ add filterq, nq ; point past the last filter ... |
|
171 |
+ neg nq ; ... and index with a negative offset that counts up to 0 |
|
172 |
+ mova m7, [ps_p1m1p1m1] ; sign-bit mask for lanes 1 and 3 |
|
173 |
+ |
|
174 |
+align 16 |
|
175 |
+.loop: ; n is not tested before the first pass |
|
176 |
+ PS_HYBRID_ANALYSIS_LOOP m0, m3, 0 ; first step initialises the accumulators m0/m3 |
|
177 |
+ PS_HYBRID_ANALYSIS_LOOP m5, m6, 1 ; later steps are added into m0/m3 |
|
178 |
+ PS_HYBRID_ANALYSIS_LOOP m5, m6, 2 |
|
179 |
+ |
|
180 |
+%if cpuflag(sse3) |
|
181 |
+ pshufd m3, m3, q2301 ; swap within pairs so hsubps yields odd-minus-even for m3 |
|
182 |
+ xorps m0, m7 ; negate odd lanes so hsubps sums m0's pairs |
|
183 |
+ hsubps m3, m0 |
|
184 |
+ pshufd m1, m3, q0020 |
|
185 |
+ pshufd m3, m3, q0031 |
|
186 |
+ addps m1, m3 ; lane 0 = (m3[1]-m3[0])+(m3[3]-m3[2]) of the accumulator, lane 1 = sum of all m0 lanes |
|
187 |
+ movsd m2, [inq+6*8] ; float pair at index 6 of in |
|
188 |
+%else |
|
189 |
+ mova m1, m3 |
|
190 |
+ mova m2, m0 |
|
191 |
+ shufps m1, m1, q2301 |
|
192 |
+ shufps m2, m2, q2301 |
|
193 |
+ subps m1, m3 ; swapped minus original: odd-minus-even in lanes 0 and 2 |
|
194 |
+ addps m2, m0 ; pair sums of m0 |
|
195 |
+ unpcklps m3, m1, m2 |
|
196 |
+ unpckhps m1, m2 |
|
197 |
+ addps m1, m3 ; lanes 0 and 1 match the SSE3 path's result |
|
198 |
+ movu m2, [inq+6*8] ; faster than movlps and no risk of overread |
|
199 |
+%endif |
|
200 |
+ movss m3, [filterq+nq+8*6] ; first float of pair 6 in the current filter |
|
201 |
+ SPLATD m3 ; presumably broadcasts the low float to every lane (x86util helper) |
|
202 |
+ mulps m2, m3 |
|
203 |
+ addps m1, m2 |
|
204 |
+ MOVH [outq], m1 ; store the low 8 bytes, i.e. one float pair |
|
205 |
+ add outq, strideq |
|
206 |
+ add nq, 64 ; next filter |
|
207 |
+ jl .loop ; until the offset reaches 0 |
|
208 |
+ REP_RET |
|
209 |
+%endmacro |
|
210 |
+ |
|
211 |
+INIT_XMM sse |
|
212 |
+PS_HYBRID_ANALYSIS |
|
213 |
+INIT_XMM sse3 |
|
214 |
+PS_HYBRID_ANALYSIS |
0 | 215 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,55 @@ |
0 |
+/* |
|
1 |
+ * SIMD optimized MPEG-4 Parametric Stereo decoding functions |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include "config.h" |
|
21 |
+ |
|
22 |
+#include "libavutil/x86/cpu.h" |
|
23 |
+#include "libavutil/attributes.h" |
|
24 |
+#include "libavcodec/aacpsdsp.h" |
|
25 |
+ |
|
26 |
+void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n); |
|
27 |
+void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n); |
|
28 |
+void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2], |
|
29 |
+ float *src1, int n); |
|
30 |
+void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2], |
|
31 |
+ const float (*filter)[8][2], |
|
32 |
+ int stride, int n); |
|
33 |
+void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2], |
|
34 |
+ const float (*filter)[8][2], |
|
35 |
+ int stride, int n); |
|
36 |
+void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], |
|
37 |
+ float h[2][4], float h_step[2][4], |
|
38 |
+ int len); |
|
39 |
+ |
|
40 |
+/* Point the PSDSPContext function pointers at the x86 assembly routines supported by the running CPU. */ |
+av_cold void ff_psdsp_init_x86(PSDSPContext *s) |
|
41 |
+{ |
|
42 |
+ int cpu_flags = av_get_cpu_flags(); |
|
43 |
+ |
|
44 |
+ if (EXTERNAL_SSE(cpu_flags)) { |
|
45 |
+ s->add_squares = ff_ps_add_squares_sse; |
|
46 |
+ s->mul_pair_single = ff_ps_mul_pair_single_sse; |
|
47 |
+ s->hybrid_analysis = ff_ps_hybrid_analysis_sse; |
|
48 |
+ } |
|
49 |
+ if (EXTERNAL_SSE3(cpu_flags)) { /* checked second, so these replace the SSE pointers set above */ |
|
50 |
+ s->add_squares = ff_ps_add_squares_sse3; |
|
51 |
+ s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3; /* only slot [0] is assigned here */ |
|
52 |
+ s->hybrid_analysis = ff_ps_hybrid_analysis_sse3; |
|
53 |
+ } |
|
54 |
+} |