on x86_64:
time PSNR
plain 3.303 inf
SSE 1.649 107.087535
SSE3 1.632 107.087535
AVX 1.409 106.986771
FMA3 1.265 107.108437
on x86_32 (PSNR compared to x86_64 plain):
time PSNR
plain 7.225 103.951979
SSE 1.827 105.859282
SSE3 1.819 105.859282
AVX 1.533 105.997661
FMA3 1.384 105.885377
No FMA4 test results are available.
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
... | ... |
@@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s) |
320 | 320 |
w *= sign * (1.0 / s->fft_len); |
321 | 321 |
s->coeffs[m].val[x - s->coeffs[m].start] = w; |
322 | 322 |
} |
323 |
+ |
|
324 |
+ if (s->permute_coeffs) |
|
325 |
+ s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len); |
|
323 | 326 |
} |
324 | 327 |
|
325 | 328 |
av_expr_free(expr); |
... | ... |
@@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink) |
1230 | 1230 |
|
1231 | 1231 |
s->cqt_align = 1; |
1232 | 1232 |
s->cqt_calc = cqt_calc; |
1233 |
+ s->permute_coeffs = NULL; |
|
1233 | 1234 |
s->draw_sono = draw_sono; |
1234 | 1235 |
if (s->format == AV_PIX_FMT_RGB24) { |
1235 | 1236 |
s->draw_bar = draw_bar_rgb; |
... | ... |
@@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink) |
1241 | 1241 |
s->update_sono = update_sono_yuv; |
1242 | 1242 |
} |
1243 | 1243 |
|
1244 |
+ if (ARCH_X86) |
|
1245 |
+ ff_showcqt_init_x86(s); |
|
1246 |
+ |
|
1244 | 1247 |
if ((ret = init_cqt(s)) < 0) |
1245 | 1248 |
return ret; |
1246 | 1249 |
|
... | ... |
@@ -74,6 +74,7 @@ typedef struct { |
74 | 74 |
/* callback */ |
75 | 75 |
void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs, |
76 | 76 |
int len, int fft_len); |
77 |
+ void (*permute_coeffs)(float *v, int len); |
|
77 | 78 |
void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h, |
78 | 79 |
const ColorFloat *c, int bar_h); |
79 | 80 |
void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off); |
... | ... |
@@ -112,4 +113,6 @@ typedef struct { |
112 | 112 |
int axis; |
113 | 113 |
} ShowCQTContext; |
114 | 114 |
|
115 |
+void ff_showcqt_init_x86(ShowCQTContext *s); |
|
116 |
+ |
|
115 | 117 |
#endif |
... | ... |
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o |
13 | 13 |
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o |
14 | 14 |
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o |
15 | 15 |
OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o |
16 |
+OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o |
|
16 | 17 |
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o |
17 | 18 |
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o |
18 | 19 |
OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o |
... | ... |
@@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o |
37 | 37 |
ifdef CONFIG_GPL |
38 | 38 |
YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o |
39 | 39 |
endif |
40 |
+YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o |
|
40 | 41 |
YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o |
41 | 42 |
YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o |
42 | 43 |
YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o |
43 | 44 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,206 @@ |
0 |
+;***************************************************************************** |
|
1 |
+;* x86-optimized functions for showcqt filter |
|
2 |
+;* |
|
3 |
+;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com> |
|
4 |
+;* |
|
5 |
+;* This file is part of FFmpeg. |
|
6 |
+;* |
|
7 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
8 |
+;* modify it under the terms of the GNU Lesser General Public |
|
9 |
+;* License as published by the Free Software Foundation; either |
|
10 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
11 |
+;* |
|
12 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
13 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
+;* Lesser General Public License for more details. |
|
16 |
+;* |
|
17 |
+;* You should have received a copy of the GNU Lesser General Public |
|
18 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
19 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
20 |
+;****************************************************************************** |
|
21 |
+ |
|
22 |
+%include "libavutil/x86/x86util.asm" |
|
23 |
+ |
|
24 |
; `pointer` reserves one pointer-sized field: resq on x86-64, resd on 32-bit x86.
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

; Field offsets for one coefficient descriptor addressed through `coeffsq`.
; NOTE(review): this must mirror the C-side Coeffs struct (field order, sizes,
; padding); the C definition is not visible from here - confirm against it.
struc Coeffs
    .val:   pointer 1   ; address of the float coefficient array
    .start: resd 1      ; offset added to x to form the src index
    .len:   resd 1      ; number of coefficients (inner loop bound)
    .sizeof:            ; descriptor size, used as the stride between entries
endstruc
|
36 |
+ |
|
37 |
; Horizontal pairwise add of packed floats:
;   dst = { dst0+dst1, dst2+dst3, src0+src1, src2+src3 }
; With SSE3 this is a single haddps and tmp is left untouched. Without SSE3
; the even lanes (q2020) and odd lanes (q3131) of dst/src are gathered with two
; shuffles and added; tmp is clobbered.
; Caveat for the emulated path: when dst and src are the same register, the
; first shufps overwrites src before the second one reads it, so only the two
; low lanes of the result match haddps. Do not reorder the two shuffles.
%macro EMULATE_HADDPS 3 ; dst, src, tmp
%if cpuflag(sse3)
    haddps  %1, %2
%else
    movaps  %3, %1
    shufps  %1, %2, q2020
    shufps  %3, %2, q3131
    addps   %1, %3
%endif
%endmacro ; EMULATE_HADDPS
|
47 |
+ |
|
48 |
; Multiply-add of packed floats: dst = src1 * src2 + src3.
; Uses fmaddps when the active instruction set has FMA3 or FMA4 (tmp is then
; unused); otherwise falls back to mulps into tmp followed by addps.
; tmp may be the same register as src1, which is how the callers in this file
; invoke it.
%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
%if cpuflag(fma3) || cpuflag(fma4)
    fmaddps %1, %2, %3, %4
%else
    mulps   %5, %2, %3
    addps   %1, %4, %5
%endif
%endmacro ; EMULATE_FMADDPS
|
56 |
+ |
|
57 |
; One step of the coefficient-weighted accumulation for a single descriptor.
;
; With i = x + coeffs.start (descriptor selected by the byte offset %9):
;   "a" accumulators += coeffval * src elements i, i+1, ...   (aligned loads)
;   "b" accumulators += coeffval * src elements ending at index fft_len - i,
;                       consumed in descending index order    (unaligned loads)
; src elements are 8 bytes each; the even floats of an element pair go to the
; *_re accumulators and the odd floats to the *_im accumulators.
;
; For 256-bit registers shufps works per 128-bit lane, so the eight values of a
; vector come out in element order 0 1 4 5 2 3 6 7; the coefficient vector in
; m%8 therefore has to be supplied in that same order. The vperm2f128 pair
; swaps the 128-bit halves so the descending "b" load ends up in the matching
; order.
;
; Clobbers m%5, m%6, m%7 and the i register. The integer sub/neg are interleaved
; with the SIMD work on purpose; keep the instruction order.
%macro CQT_CALC 9
; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
    mov     id, xd
    add     id, [coeffsq + Coeffs.start + %9]
    movaps  m%5, [srcq + 8 * iq]
    movaps  m%7, [srcq + 8 * iq + mmsize]
    shufps  m%6, m%5, m%7, q3131
    shufps  m%5, m%5, m%7, q2020
    sub     id, fft_lend
    ; a_im += m_im * coeffval
    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
    ; together with the sub above: i = fft_len - i
    neg     id
    ; a_re += m_re * coeffval
    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
    ; the two vectors whose last element is src[i]
    movups  m%5, [srcq + 8 * iq - mmsize + 8]
    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
    %if mmsize == 32
    vperm2f128 m%5, m%5, m%5, 1
    vperm2f128 m%7, m%7, m%7, 1
    %endif
    shufps  m%6, m%5, m%7, q1313
    shufps  m%5, m%5, m%7, q0202
    ; b_im += m_im * coeffval
    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
    ; b_re += m_re * coeffval
    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
%endmacro ; CQT_CALC
|
81 |
+ |
|
82 |
; Combine the "a" and "b" accumulators and reduce them horizontally.
; On exit the low 128 bits of m%1 hold, summed over all lanes:
;   { a_re + b_re,  a_im - b_im,  a_im + b_im,  b_re - a_re }
; For 256-bit registers the upper half is folded into the lower half at the
; end. m%2, m%3, m%5 and m%6 are clobbered (m%3 and m%2 double as scratch for
; the emulated horizontal add once their values have been consumed).
%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
    addps   m%5, m%4, m%2
    subps   m%6, m%3, m%1
    addps   m%1, m%3
    subps   m%2, m%4
    EMULATE_HADDPS m%5, m%6, m%3
    EMULATE_HADDPS m%1, m%2, m%3
    EMULATE_HADDPS m%1, m%5, m%2
    %if mmsize == 32
    vextractf128 xmm%2, m%1, 1
    addps   xmm%1, xmm%2
    %endif
%endmacro ; CQT_SEPARATE
|
95 |
+ |
|
96 |
; Emit showcqt_cqt_calc for the instruction set selected by the preceding
; INIT_XMM/INIT_YMM.
;
; For every coefficient descriptor the kernel accumulates coefficient-weighted
; src samples (CQT_CALC), reduces them (CQT_SEPARATE), squares the four sums
; and adds them pairwise, storing two floats (8 bytes) per descriptor to dst.
;
; Requirements visible from the code:
;   - x86-64 handles two descriptors per outer iteration (len -= 2, one 16-byte
;     movaps store), so len must be positive and even and dst 16-byte aligned.
;   - 32-bit x86 handles one descriptor per iteration (len -= 1, 8-byte movlps
;     store); len must be positive.
;   - coefficient arrays are read with movaps in steps of mmsize/4 floats, so
;     they must be mmsize-aligned and readable up to the next multiple of
;     mmsize/4 elements.
%macro DECLARE_CQT_CALC 0
; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
%if ARCH_X86_64
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
    align   16
    .loop_k:
        ; m0-m3: accumulators of descriptor 0, m8-m11: of descriptor 1.
        ; coeffs_len = min(len0, len1): the part both descriptors share.
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0
        movaps  m1, m0
        movaps  m2, m0
        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
        movaps  m3, m0
        movaps  m8, m0
        cmp     coeffs_lend, xd
        movaps  m9, m0
        movaps  m10, m0
        movaps  m11, m0
        cmova   coeffs_lend, xd
        xor     xd, xd
        ; Load both value pointers before branching: .loop_a/.loop_b are also
        ; reached when the shared length is zero but one descriptor is not
        ; empty, and would otherwise read through a stale or uninitialised
        ; pointer. mov leaves the flags alone and only reads the descriptor.
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
        test    coeffs_lend, coeffs_lend
        jz      .check_loop_b
        align   16
        .loop_ab:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            movaps  m7, [coeffs_val2q + 4 * xq]
            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
            add     xd, mmsize/4
            cmp     xd, coeffs_lend
            jb      .loop_ab
        ; remainder of descriptor 1, if it is the longer one
        .check_loop_b:
        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
        jae     .check_loop_a
        align   16
        .loop_b:
            movaps  m7, [coeffs_val2q + 4 * xq]
            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
            jb      .loop_b
        .loop_end:
        CQT_SEPARATE 0, 1, 2, 3, 4, 5
        CQT_SEPARATE 8, 9, 10, 11, 4, 5
        mulps   xmm0, xmm0
        mulps   xmm8, xmm8
        EMULATE_HADDPS xmm0, xmm8, xmm1
        movaps  [dstq], xmm0
        ; lea does not modify flags, so jnz below still tests this sub
        sub     lend, 2
        lea     dstq, [dstq + 16]
        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
        jnz     .loop_k
        REP_RET
        ; remainder of descriptor 0, if it is the longer one
        align   16
        .check_loop_a:
        cmp     xd, [coeffsq + Coeffs.len]
        jae     .loop_end
        align   16
        .loop_a:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len]
            jb      .loop_a
        jmp     .loop_end
%else
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
; fft_len stays in its stack slot; only four arguments are loaded to registers
%define fft_lend r4m
    align   16
    .loop_k:
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0
        movaps  m1, m0
        movaps  m2, m0
        movaps  m3, m0
        ; empty descriptor: store the zeroed accumulator directly
        test    xd, xd
        jz      .store
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        xor     xd, xd
        align   16
        .loop_x:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len]
            jb      .loop_x
        CQT_SEPARATE 0, 1, 2, 3, 4, 5
        mulps   xmm0, xmm0
        ; dst == src here: only the two low lanes are meaningful, and only
        ; those are stored below
        EMULATE_HADDPS xmm0, xmm0, xmm1
        .store:
        movlps  [dstq], xmm0
        ; lea does not modify flags, so jnz below still tests this sub
        sub     lend, 1
        lea     dstq, [dstq + 8]
        lea     coeffsq, [coeffsq + Coeffs.sizeof]
        jnz     .loop_k
        REP_RET
%endif ; ARCH_X86_64
%endmacro ; DECLARE_CQT_CALC
|
195 |
+ |
|
196 |
; Instantiate the kernel once per supported instruction set. The SSE, SSE3 and
; FMA4 variants use 128-bit registers; the AVX and FMA3 variants use 256-bit
; registers.
INIT_XMM sse
DECLARE_CQT_CALC

INIT_XMM sse3
DECLARE_CQT_CALC

INIT_YMM avx
DECLARE_CQT_CALC

INIT_YMM fma3
DECLARE_CQT_CALC

INIT_XMM fma4
DECLARE_CQT_CALC
0 | 206 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,63 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2016 Muhammad Faiz <mfcc64@gmail.com> |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include "libavutil/attributes.h" |
|
21 |
+#include "libavutil/cpu.h" |
|
22 |
+#include "libavutil/x86/cpu.h" |
|
23 |
+#include "libavfilter/avf_showcqt.h" |
|
24 |
+ |
|
25 |
+#define DECLARE_CQT_CALC(type) \ |
|
26 |
+void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \ |
|
27 |
+ const Coeffs *coeffs, int len, int fft_len) |
|
28 |
+ |
|
29 |
+DECLARE_CQT_CALC(sse); |
|
30 |
+DECLARE_CQT_CALC(sse3); |
|
31 |
+DECLARE_CQT_CALC(avx); |
|
32 |
+DECLARE_CQT_CALC(fma3); |
|
33 |
+DECLARE_CQT_CALC(fma4); |
|
34 |
+ |
|
35 |
/* Permutation tag "0": coefficients stay in natural order, no callback. */
#define permute_coeffs_0 NULL

/**
 * Reorder coefficients in place so that every group of eight floats goes from
 * order 0 1 2 3 4 5 6 7 to order 0 1 4 5 2 3 6 7 (elements 2<->4 and 3<->5 are
 * exchanged). This callback is installed together with the 256-bit AVX/FMA3
 * kernels.
 *
 * @param v   coefficient array, modified in place
 * @param len number of floats in v; callers are expected to pass a multiple
 *            of 8
 *
 * A group is only touched when at least six floats remain, because the swaps
 * reach index k+5. A shorter tail is left unchanged instead of being read and
 * written past the end of the array.
 */
static void permute_coeffs_01452367(float *v, int len)
{
    int k;

    for (k = 0; len - k >= 6; k += 8) {
        float *g = v + k;
        const float g2 = g[2];
        const float g3 = g[3];

        g[2] = g[4];
        g[3] = g[5];
        g[4] = g2;
        g[5] = g3;
    }
}
|
45 |
+ |
|
46 |
+av_cold void ff_showcqt_init_x86(ShowCQTContext *s) |
|
47 |
+{ |
|
48 |
+ int cpuflags = av_get_cpu_flags(); |
|
49 |
+ |
|
50 |
+#define SELECT_CQT_CALC(type, TYPE, align, perm) \ |
|
51 |
+if (EXTERNAL_##TYPE(cpuflags)) { \ |
|
52 |
+ s->cqt_calc = ff_showcqt_cqt_calc_##type; \ |
|
53 |
+ s->cqt_align = align; \ |
|
54 |
+ s->permute_coeffs = permute_coeffs_##perm; \ |
|
55 |
+} |
|
56 |
+ |
|
57 |
+ SELECT_CQT_CALC(sse, SSE, 4, 0); |
|
58 |
+ SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0); |
|
59 |
+ SELECT_CQT_CALC(fma4, FMA4, 4, 0); // using xmm |
|
60 |
+ SELECT_CQT_CALC(avx, AVX_FAST, 8, 01452367); |
|
61 |
+ SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367); |
|
62 |
+} |