
avfilter/avf_showcqt: cqt_calc optimization on x86

on x86_64:
        time     PSNR
plain   3.303    inf
SSE     1.649    107.087535
SSE3    1.632    107.087535
AVX     1.409    106.986771
FMA3    1.265    107.108437

on x86_32 (PSNR compared to x86_64 plain):
        time     PSNR
plain   7.225    103.951979
SSE     1.827    105.859282
SSE3    1.819    105.859282
AVX     1.533    105.997661
FMA3    1.384    105.885377

The FMA4 version could not be tested.

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>

Muhammad Faiz authored on 2016/06/04 16:33:05
Showing 5 changed files
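
For context (not part of the patch): the kernels added below are selected at runtime in place of the plain C cqt_calc in libavfilter/avf_showcqt.c. For each output bin, cqt_calc accumulates a coefficient-weighted sum over a run of FFT bins and over the mirrored bins fft_len - i, separates the two channels, and stores their squared magnitudes. A rough sketch of that scalar reference, assuming the FFTComplex { re, im } pair from libavcodec and the Coeffs layout used by the asm below (val pointer, start, len); the in-tree version may differ in detail:

    static void cqt_calc_c(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
                           int len, int fft_len)
    {
        int k, x;
        for (k = 0; k < len; k++) {
            FFTComplex a = { 0, 0 }, b = { 0, 0 };
            for (x = 0; x < coeffs[k].len; x++) {
                float u = coeffs[k].val[x];
                int   i = coeffs[k].start + x;   /* forward bin  */
                int   j = fft_len - i;           /* mirrored bin */
                a.re += u * src[i].re;  a.im += u * src[i].im;
                b.re += u * src[j].re;  b.im += u * src[j].im;
            }
            /* separate the two channels and store squared magnitudes */
            {
                float l_re = a.re + b.re, l_im = a.im - b.im;
                float r_re = b.im + a.im, r_im = b.re - a.re;
                dst[k].re = l_re * l_re + l_im * l_im;
                dst[k].im = r_re * r_re + r_im * r_im;
            }
        }
    }

The asm vectorizes the inner loop, consuming mmsize/4 coefficients per iteration, and the x86_64 path additionally processes two output bins (k and k+1) per pass.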
... ...
@@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
             w *= sign * (1.0 / s->fft_len);
             s->coeffs[m].val[x - s->coeffs[m].start] = w;
         }
+
+        if (s->permute_coeffs)
+            s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
     }
 
     av_expr_free(expr);
... ...
@@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
 
     s->cqt_align = 1;
     s->cqt_calc = cqt_calc;
+    s->permute_coeffs = NULL;
     s->draw_sono = draw_sono;
     if (s->format == AV_PIX_FMT_RGB24) {
         s->draw_bar = draw_bar_rgb;
... ...
@@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
         s->update_sono = update_sono_yuv;
     }
 
+    if (ARCH_X86)
+        ff_showcqt_init_x86(s);
+
     if ((ret = init_cqt(s)) < 0)
         return ret;
 
... ...
@@ -74,6 +74,7 @@ typedef struct {
     /* callback */
     void                (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
                                     int len, int fft_len);
+    void                (*permute_coeffs)(float *v, int len);
     void                (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
                                     const ColorFloat *c, int bar_h);
     void                (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
... ...
@@ -112,4 +113,6 @@ typedef struct {
     int                 axis;
 } ShowCQTContext;
 
+void ff_showcqt_init_x86(ShowCQTContext *s);
+
 #endif
... ...
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
 OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
 OBJS-$(CONFIG_REMOVEGRAIN_FILTER)            += x86/vf_removegrain_init.o
+OBJS-$(CONFIG_SHOWCQT_FILTER)                += x86/avf_showcqt_init.o
 OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
 OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o
 OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
... ...
@@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
 ifdef CONFIG_GPL
 YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER)       += x86/vf_removegrain.o
 endif
+YASM-OBJS-$(CONFIG_SHOWCQT_FILTER)           += x86/avf_showcqt.o
 YASM-OBJS-$(CONFIG_SSIM_FILTER)              += x86/vf_ssim.o
 YASM-OBJS-$(CONFIG_STEREO3D_FILTER)          += x86/vf_stereo3d.o
 YASM-OBJS-$(CONFIG_TBLEND_FILTER)            += x86/vf_blend.o
new file mode 100644
... ...
@@ -0,0 +1,206 @@
+;*****************************************************************************
+;* x86-optimized functions for showcqt filter
+;*
+;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+%define pointer resq
+%else
+%define pointer resd
+%endif
+
+struc Coeffs
+    .val:   pointer 1
+    .start: resd 1
+    .len:   resd 1
+    .sizeof:
+endstruc
+
+%macro EMULATE_HADDPS 3 ; dst, src, tmp
+%if cpuflag(sse3)
+    haddps  %1, %2
+%else
+    movaps  %3, %1
+    shufps  %1, %2, q2020
+    shufps  %3, %2, q3131
+    addps   %1, %3
+%endif
+%endmacro ; EMULATE_HADDPS
+
+%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
+%if cpuflag(fma3) || cpuflag(fma4)
+    fmaddps %1, %2, %3, %4
+%else
+    mulps   %5, %2, %3
+    addps   %1, %4, %5
+%endif
+%endmacro ; EMULATE_FMADDPS
+
+%macro CQT_CALC 9
+; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
+; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
+    mov     id, xd
+    add     id, [coeffsq + Coeffs.start + %9]
+    movaps  m%5, [srcq + 8 * iq]
+    movaps  m%7, [srcq + 8 * iq + mmsize]
+    shufps  m%6, m%5, m%7, q3131
+    shufps  m%5, m%5, m%7, q2020
+    sub     id, fft_lend
+    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
+    neg     id
+    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
+    movups  m%5, [srcq + 8 * iq - mmsize + 8]
+    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
+    %if mmsize == 32
+    vperm2f128 m%5, m%5, m%5, 1
+    vperm2f128 m%7, m%7, m%7, 1
+    %endif
+    shufps  m%6, m%5, m%7, q1313
+    shufps  m%5, m%5, m%7, q0202
+    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
+    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
+%endmacro ; CQT_CALC
+
+%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
+    addps   m%5, m%4, m%2
+    subps   m%6, m%3, m%1
+    addps   m%1, m%3
+    subps   m%2, m%4
+    EMULATE_HADDPS m%5, m%6, m%3
+    EMULATE_HADDPS m%1, m%2, m%3
+    EMULATE_HADDPS m%1, m%5, m%2
+    %if mmsize == 32
+    vextractf128 xmm%2, m%1, 1
+    addps   xmm%1, xmm%2
+    %endif
+%endmacro ; CQT_SEPARATE
+
+%macro DECLARE_CQT_CALC 0
+; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
+%if ARCH_X86_64
+cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
+    align   16
+    .loop_k:
+        mov     xd, [coeffsq + Coeffs.len]
+        xorps   m0, m0
+        movaps  m1, m0
+        movaps  m2, m0
+        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
+        movaps  m3, m0
+        movaps  m8, m0
+        cmp     coeffs_lend, xd
+        movaps  m9, m0
+        movaps  m10, m0
+        movaps  m11, m0
+        cmova   coeffs_lend, xd
+        xor     xd, xd
+        test    coeffs_lend, coeffs_lend
+        jz      .check_loop_b
+        mov     coeffs_valq, [coeffsq + Coeffs.val]
+        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
+        align   16
+        .loop_ab:
+            movaps  m7, [coeffs_valq + 4 * xq]
+            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+            movaps  m7, [coeffs_val2q + 4 * xq]
+            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+            add     xd, mmsize/4
+            cmp     xd, coeffs_lend
+            jb      .loop_ab
+        .check_loop_b:
+        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+        jae     .check_loop_a
+        align   16
+        .loop_b:
+            movaps  m7, [coeffs_val2q + 4 * xq]
+            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+            add     xd, mmsize/4
+            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+            jb      .loop_b
+        .loop_end:
+        CQT_SEPARATE 0, 1, 2, 3, 4, 5
+        CQT_SEPARATE 8, 9, 10, 11, 4, 5
+        mulps   xmm0, xmm0
+        mulps   xmm8, xmm8
+        EMULATE_HADDPS xmm0, xmm8, xmm1
+        movaps  [dstq], xmm0
+        sub     lend, 2
+        lea     dstq, [dstq + 16]
+        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
+        jnz     .loop_k
+        REP_RET
+        align   16
+        .check_loop_a:
+        cmp     xd, [coeffsq + Coeffs.len]
+        jae     .loop_end
+        align   16
+        .loop_a:
+            movaps  m7, [coeffs_valq + 4 * xq]
+            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+            add     xd, mmsize/4
+            cmp     xd, [coeffsq + Coeffs.len]
+            jb      .loop_a
+        jmp     .loop_end
+%else
+cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
+%define fft_lend r4m
+    align   16
+    .loop_k:
+        mov     xd, [coeffsq + Coeffs.len]
+        xorps   m0, m0
+        movaps  m1, m0
+        movaps  m2, m0
+        movaps  m3, m0
+        test    xd, xd
+        jz      .store
+        mov     coeffs_valq, [coeffsq + Coeffs.val]
+        xor     xd, xd
+        align   16
+        .loop_x:
+            movaps  m7, [coeffs_valq + 4 * xq]
+            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+            add     xd, mmsize/4
+            cmp     xd, [coeffsq + Coeffs.len]
+            jb      .loop_x
+        CQT_SEPARATE 0, 1, 2, 3, 4, 5
+        mulps   xmm0, xmm0
+        EMULATE_HADDPS xmm0, xmm0, xmm1
+        .store:
+        movlps  [dstq], xmm0
+        sub     lend, 1
+        lea     dstq, [dstq + 8]
+        lea     coeffsq, [coeffsq + Coeffs.sizeof]
+        jnz     .loop_k
+        REP_RET
+%endif ; ARCH_X86_64
+%endmacro ; DECLARE_CQT_CALC
+
+INIT_XMM sse
+DECLARE_CQT_CALC
+INIT_XMM sse3
+DECLARE_CQT_CALC
+INIT_YMM avx
+DECLARE_CQT_CALC
+INIT_YMM fma3
+DECLARE_CQT_CALC
+INIT_XMM fma4
+DECLARE_CQT_CALC
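
A note on the EMULATE_* helpers above (explanatory, not part of the patch): EMULATE_FMADDPS computes dst = src1 * src2 + src3, fused on FMA3/FMA4 and as a separate multiply and add otherwise, so the fused and unfused builds can differ in the last bits. EMULATE_HADDPS reproduces SSE3's haddps with two shufps and an addps on plain SSE; in C terms, for one pair of 4-float registers it computes:

    /* Reference for the SSE fallback of EMULATE_HADDPS (illustration only):
     * haddps dst, src -> dst = { d0+d1, d2+d3, s0+s1, s2+s3 }. */
    static void haddps_ref(float dst[4], const float src[4])
    {
        float even[4] = { dst[0], dst[2], src[0], src[2] };   /* shufps q2020 */
        float odd[4]  = { dst[1], dst[3], src[1], src[3] };   /* shufps q3131 */
        int i;
        for (i = 0; i < 4; i++)
            dst[i] = even[i] + odd[i];                        /* addps */
    }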
new file mode 100644
... ...
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016 Muhammad Faiz <mfcc64@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/avf_showcqt.h"
+
+#define DECLARE_CQT_CALC(type) \
+void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
+                                const Coeffs *coeffs, int len, int fft_len)
+
+DECLARE_CQT_CALC(sse);
+DECLARE_CQT_CALC(sse3);
+DECLARE_CQT_CALC(avx);
+DECLARE_CQT_CALC(fma3);
+DECLARE_CQT_CALC(fma4);
+
+#define permute_coeffs_0 NULL
+
+static void permute_coeffs_01452367(float *v, int len)
+{
+    int k;
+    for (k = 0; k < len; k += 8) {
+        FFSWAP(float, v[k+2], v[k+4]);
+        FFSWAP(float, v[k+3], v[k+5]);
+    }
+}
+
+av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
+{
+    int cpuflags = av_get_cpu_flags();
+
+#define SELECT_CQT_CALC(type, TYPE, align, perm) \
+if (EXTERNAL_##TYPE(cpuflags)) { \
+    s->cqt_calc = ff_showcqt_cqt_calc_##type; \
+    s->cqt_align = align; \
+    s->permute_coeffs = permute_coeffs_##perm; \
+}
+
+    SELECT_CQT_CALC(sse,  SSE,  4, 0);
+    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
+    SELECT_CQT_CALC(fma4, FMA4, 4, 0); // using xmm
+    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
+    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);
+}
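
Why only the ymm (AVX/FMA3) variants install permute_coeffs: shufps on 256-bit registers shuffles each 128-bit lane independently, so after the de-interleaving loads in CQT_CALC the samples end up in lane-local order 0,1,4,5,2,3,6,7 within every group of 8. permute_coeffs_01452367 applies the same reordering to the coefficient table once, from init_cqt (first hunk above), so the element-wise multiply lines up; the xmm variants keep the natural order and select permute_coeffs_0 (NULL). A small standalone illustration (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        float v[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        float t;
        /* same swaps as permute_coeffs_01452367 for one 8-float group */
        t = v[2]; v[2] = v[4]; v[4] = t;
        t = v[3]; v[3] = v[5]; v[5] = t;
        for (int i = 0; i < 8; i++)
            printf("%g ", v[i]);    /* prints: 0 1 4 5 2 3 6 7 */
        printf("\n");
        return 0;
    }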