
avfilter/avf_showcqt: cqt_calc optimization on x86

on x86_64:
        time     PSNR
plain   3.303    inf
SSE     1.649    107.087535
SSE3    1.632    107.087535
AVX     1.409    106.986771
FMA3    1.265    107.108437

on x86_32 (PSNR compared to x86_64 plain):
        time     PSNR
plain   7.225    103.951979
SSE     1.827    105.859282
SSE3    1.819    105.859282
AVX     1.533    105.997661
FMA3    1.384    105.885377

The FMA4 version could not be tested.

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>

Muhammad Faiz authored on 2016/06/04 16:33:05
Showing 5 changed files
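
For context (not part of the patch): the kernels added below are selected at runtime in place of the plain C cqt_calc in libavfilter/avf_showcqt.c. For each output bin, cqt_calc accumulates a coefficient-weighted sum over a run of FFT bins and over the mirrored bins fft_len - i, separates the two channels, and stores their squared magnitudes. A rough sketch of that scalar reference, assuming the FFTComplex { re, im } pair from libavcodec and the Coeffs layout used by the asm below (val pointer, start, len); the in-tree version may differ in detail:

    static void cqt_calc_c(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
                           int len, int fft_len)
    {
        int k, x;
        for (k = 0; k < len; k++) {
            FFTComplex a = { 0, 0 }, b = { 0, 0 };
            for (x = 0; x < coeffs[k].len; x++) {
                float u = coeffs[k].val[x];
                int   i = coeffs[k].start + x;   /* forward bin  */
                int   j = fft_len - i;           /* mirrored bin */
                a.re += u * src[i].re;  a.im += u * src[i].im;
                b.re += u * src[j].re;  b.im += u * src[j].im;
            }
            /* separate the two channels and store squared magnitudes */
            {
                float l_re = a.re + b.re, l_im = a.im - b.im;
                float r_re = b.im + a.im, r_im = b.re - a.re;
                dst[k].re = l_re * l_re + l_im * l_im;
                dst[k].im = r_re * r_re + r_im * r_im;
            }
        }
    }

The asm vectorizes the inner loop, consuming mmsize/4 coefficients per iteration, and the x86_64 path additionally processes two output bins (k and k+1) per pass.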
... ...
@@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
             w *= sign * (1.0 / s->fft_len);
             s->coeffs[m].val[x - s->coeffs[m].start] = w;
         }
+
+        if (s->permute_coeffs)
+            s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
     }
 
     av_expr_free(expr);
... ...
@@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
 
     s->cqt_align = 1;
     s->cqt_calc = cqt_calc;
+    s->permute_coeffs = NULL;
     s->draw_sono = draw_sono;
     if (s->format == AV_PIX_FMT_RGB24) {
         s->draw_bar = draw_bar_rgb;
... ...
@@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
         s->update_sono = update_sono_yuv;
     }
 
+    if (ARCH_X86)
+        ff_showcqt_init_x86(s);
+
     if ((ret = init_cqt(s)) < 0)
         return ret;
 
... ...
@@ -74,6 +74,7 @@ typedef struct {
     /* callback */
     void                (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
                                     int len, int fft_len);
+    void                (*permute_coeffs)(float *v, int len);
     void                (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
                                     const ColorFloat *c, int bar_h);
     void                (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
... ...
@@ -112,4 +113,6 @@ typedef struct {
     int                 axis;
 } ShowCQTContext;
 
+void ff_showcqt_init_x86(ShowCQTContext *s);
+
 #endif
... ...
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
 OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
 OBJS-$(CONFIG_REMOVEGRAIN_FILTER)            += x86/vf_removegrain_init.o
+OBJS-$(CONFIG_SHOWCQT_FILTER)                += x86/avf_showcqt_init.o
 OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
 OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o
 OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
... ...
@@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
 ifdef CONFIG_GPL
 YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER)       += x86/vf_removegrain.o
 endif
+YASM-OBJS-$(CONFIG_SHOWCQT_FILTER)           += x86/avf_showcqt.o
 YASM-OBJS-$(CONFIG_SSIM_FILTER)              += x86/vf_ssim.o
 YASM-OBJS-$(CONFIG_STEREO3D_FILTER)          += x86/vf_stereo3d.o
 YASM-OBJS-$(CONFIG_TBLEND_FILTER)            += x86/vf_blend.o
new file mode 100644
... ...
@@ -0,0 +1,206 @@
+;*****************************************************************************
+;* x86-optimized functions for showcqt filter
+;*
+;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+%define pointer resq
+%else
+%define pointer resd
+%endif
+
+struc Coeffs
+    .val:   pointer 1
+    .start: resd 1
+    .len:   resd 1
+    .sizeof:
+endstruc
+
+%macro EMULATE_HADDPS 3 ; dst, src, tmp
+%if cpuflag(sse3)
+    haddps  %1, %2
+%else
+    movaps  %3, %1
+    shufps  %1, %2, q2020
+    shufps  %3, %2, q3131
+    addps   %1, %3
+%endif
+%endmacro ; EMULATE_HADDPS
+
+%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
+%if cpuflag(fma3) || cpuflag(fma4)
+    fmaddps %1, %2, %3, %4
+%else
+    mulps   %5, %2, %3
+    addps   %1, %4, %5
+%endif
+%endmacro ; EMULATE_FMADDPS
+
+%macro CQT_CALC 9
+; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
+; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
+    mov     id, xd
+    add     id, [coeffsq + Coeffs.start + %9]
+    movaps  m%5, [srcq + 8 * iq]
+    movaps  m%7, [srcq + 8 * iq + mmsize]
+    shufps  m%6, m%5, m%7, q3131
+    shufps  m%5, m%5, m%7, q2020
+    sub     id, fft_lend
+    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
+    neg     id
+    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
+    movups  m%5, [srcq + 8 * iq - mmsize + 8]
+    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
+    %if mmsize == 32
+    vperm2f128 m%5, m%5, m%5, 1
+    vperm2f128 m%7, m%7, m%7, 1
+    %endif
+    shufps  m%6, m%5, m%7, q1313
+    shufps  m%5, m%5, m%7, q0202
+    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
+    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
+%endmacro ; CQT_CALC
+
+%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
+    addps   m%5, m%4, m%2
+    subps   m%6, m%3, m%1
+    addps   m%1, m%3
+    subps   m%2, m%4
+    EMULATE_HADDPS m%5, m%6, m%3
+    EMULATE_HADDPS m%1, m%2, m%3
+    EMULATE_HADDPS m%1, m%5, m%2
+    %if mmsize == 32
+    vextractf128 xmm%2, m%1, 1
+    addps   xmm%1, xmm%2
+    %endif
+%endmacro ; CQT_SEPARATE
+
+%macro DECLARE_CQT_CALC 0
+; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
+%if ARCH_X86_64
+cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
+    align   16
+    .loop_k:
+        mov     xd, [coeffsq + Coeffs.len]
+        xorps   m0, m0
+        movaps  m1, m0
+        movaps  m2, m0
+        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
+        movaps  m3, m0
+        movaps  m8, m0
+        cmp     coeffs_lend, xd
+        movaps  m9, m0
+        movaps  m10, m0
+        movaps  m11, m0
+        cmova   coeffs_lend, xd
+        xor     xd, xd
+        test    coeffs_lend, coeffs_lend
+        jz      .check_loop_b
+        mov     coeffs_valq, [coeffsq + Coeffs.val]
+        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
+        align   16
+        .loop_ab:
+            movaps  m7, [coeffs_valq + 4 * xq]
+            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+            movaps  m7, [coeffs_val2q + 4 * xq]
+            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+            add     xd, mmsize/4
+            cmp     xd, coeffs_lend
+            jb      .loop_ab
+        .check_loop_b:
+        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+        jae     .check_loop_a
+        align   16
+        .loop_b:
+            movaps  m7, [coeffs_val2q + 4 * xq]
+            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+            add     xd, mmsize/4
+            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+            jb      .loop_b
+        .loop_end:
+        CQT_SEPARATE 0, 1, 2, 3, 4, 5
+        CQT_SEPARATE 8, 9, 10, 11, 4, 5
+        mulps   xmm0, xmm0
+        mulps   xmm8, xmm8
+        EMULATE_HADDPS xmm0, xmm8, xmm1
+        movaps  [dstq], xmm0
+        sub     lend, 2
+        lea     dstq, [dstq + 16]
+        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
+        jnz     .loop_k
+        REP_RET
+        align   16
+        .check_loop_a:
+        cmp     xd, [coeffsq + Coeffs.len]
+        jae     .loop_end
+        align   16
+        .loop_a:
+            movaps  m7, [coeffs_valq + 4 * xq]
+            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+            add     xd, mmsize/4
+            cmp     xd, [coeffsq + Coeffs.len]
+            jb      .loop_a
+        jmp     .loop_end
+%else
+cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
+%define fft_lend r4m
+    align   16
+    .loop_k:
+        mov     xd, [coeffsq + Coeffs.len]
+        xorps   m0, m0
+        movaps  m1, m0
+        movaps  m2, m0
+        movaps  m3, m0
+        test    xd, xd
+        jz      .store
+        mov     coeffs_valq, [coeffsq + Coeffs.val]
+        xor     xd, xd
+        align   16
+        .loop_x:
+            movaps  m7, [coeffs_valq + 4 * xq]
+            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+            add     xd, mmsize/4
+            cmp     xd, [coeffsq + Coeffs.len]
+            jb      .loop_x
+        CQT_SEPARATE 0, 1, 2, 3, 4, 5
+        mulps   xmm0, xmm0
+        EMULATE_HADDPS xmm0, xmm0, xmm1
+        .store:
+        movlps  [dstq], xmm0
+        sub     lend, 1
+        lea     dstq, [dstq + 8]
+        lea     coeffsq, [coeffsq + Coeffs.sizeof]
+        jnz     .loop_k
+        REP_RET
+%endif ; ARCH_X86_64
+%endmacro ; DECLARE_CQT_CALC
+
+INIT_XMM sse
+DECLARE_CQT_CALC
+INIT_XMM sse3
+DECLARE_CQT_CALC
+INIT_YMM avx
+DECLARE_CQT_CALC
+INIT_YMM fma3
+DECLARE_CQT_CALC
+INIT_XMM fma4
+DECLARE_CQT_CALC
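
A note on the EMULATE_* helpers above (explanatory, not part of the patch): EMULATE_FMADDPS computes dst = src1 * src2 + src3, fused on FMA3/FMA4 and as a separate multiply and add otherwise, so the fused and unfused builds can differ in the last bits. EMULATE_HADDPS reproduces SSE3's haddps with two shufps and an addps on plain SSE; in C terms, for one pair of 4-float registers it computes:

    /* Reference for the SSE fallback of EMULATE_HADDPS (illustration only):
     * haddps dst, src -> dst = { d0+d1, d2+d3, s0+s1, s2+s3 }. */
    static void haddps_ref(float dst[4], const float src[4])
    {
        float even[4] = { dst[0], dst[2], src[0], src[2] };   /* shufps q2020 */
        float odd[4]  = { dst[1], dst[3], src[1], src[3] };   /* shufps q3131 */
        int i;
        for (i = 0; i < 4; i++)
            dst[i] = even[i] + odd[i];                        /* addps */
    }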
new file mode 100644
... ...
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016 Muhammad Faiz <mfcc64@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/avf_showcqt.h"
+
+#define DECLARE_CQT_CALC(type) \
+void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
+                                const Coeffs *coeffs, int len, int fft_len)
+
+DECLARE_CQT_CALC(sse);
+DECLARE_CQT_CALC(sse3);
+DECLARE_CQT_CALC(avx);
+DECLARE_CQT_CALC(fma3);
+DECLARE_CQT_CALC(fma4);
+
+#define permute_coeffs_0 NULL
+
+static void permute_coeffs_01452367(float *v, int len)
+{
+    int k;
+    for (k = 0; k < len; k += 8) {
+        FFSWAP(float, v[k+2], v[k+4]);
+        FFSWAP(float, v[k+3], v[k+5]);
+    }
+}
+
+av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
+{
+    int cpuflags = av_get_cpu_flags();
+
+#define SELECT_CQT_CALC(type, TYPE, align, perm) \
+if (EXTERNAL_##TYPE(cpuflags)) { \
+    s->cqt_calc = ff_showcqt_cqt_calc_##type; \
+    s->cqt_align = align; \
+    s->permute_coeffs = permute_coeffs_##perm; \
+}
+
+    SELECT_CQT_CALC(sse,  SSE,  4, 0);
+    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
+    SELECT_CQT_CALC(fma4, FMA4, 4, 0); // using xmm
+    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
+    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);
+}
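
Why only the ymm (AVX/FMA3) variants install permute_coeffs: shufps on 256-bit registers shuffles each 128-bit lane independently, so after the de-interleaving loads in CQT_CALC the samples end up in lane-local order 0,1,4,5,2,3,6,7 within every group of 8. permute_coeffs_01452367 applies the same reordering to the coefficient table once, from init_cqt (first hunk above), so the element-wise multiply lines up; the xmm variants keep the natural order and select permute_coeffs_0 (NULL). A small standalone illustration (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        float v[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        float t;
        /* same swaps as permute_coeffs_01452367 for one 8-float group */
        t = v[2]; v[2] = v[4]; v[4] = t;
        t = v[3]; v[3] = v[5]; v[5] = t;
        for (int i = 0; i < 8; i++)
            printf("%g ", v[i]);    /* prints: 0 1 4 5 2 3 6 7 */
        printf("\n");
        return 0;
    }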