;*****************************************************************************
;* x86-optimized functions for showcqt filter
;*
;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

; Per-bin coefficient entry as passed in from the C code.
struc Coeffs
    .val:   pointer 1   ; pointer to the float coefficient values of this bin
    .start: resd 1      ; index of the first FFT bin the values apply to
    .len:   resd 1      ; number of coefficient values
    .sizeof:
endstruc

%macro CQT_CALC 9
; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
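;
; Accumulates mmsize/4 terms of the two partial sums for one output bin:
;   a_re/a_im += coeffval * src[coeffs->start + x]
;   b_re/b_im += coeffval * src[fft_len - (coeffs->start + x)]
; src holds interleaved (re, im) float pairs; the mirrored half is loaded in
; descending index order (with a lane swap for ymm), which is harmless since
; the accumulators are only ever reduced by summation.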
    mov     id, xd
    add     id, [coeffsq + Coeffs.start + %9]   ; i = coeffs->start + x
    movaps  m%5, [srcq + 8 * iq]
    movaps  m%7, [srcq + 8 * iq + mmsize]
    shufps  m%6, m%5, m%7, q3131                ; m_im = imaginary parts of src[i..]
    shufps  m%5, m%5, m%7, q2020                ; m_re = real parts of src[i..]
    sub     id, fft_lend
    FMULADD_PS m%2, m%6, m%8, m%2, m%6          ; a_im += coeffval * m_im
    neg     id                                  ; i = fft_len - (coeffs->start + x)
    FMULADD_PS m%1, m%5, m%8, m%1, m%5          ; a_re += coeffval * m_re
    movups  m%5, [srcq + 8 * iq - mmsize + 8]   ; mirrored bins, indices running downward
    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
    %if mmsize == 32
    vperm2f128 m%5, m%5, m%5, 1                 ; swap 128-bit lanes of the reversed data
    vperm2f128 m%7, m%7, m%7, 1
    %endif
    shufps  m%6, m%5, m%7, q1313                ; m_im = imaginary parts of the mirrored bins
    shufps  m%5, m%5, m%7, q0202                ; m_re = real parts of the mirrored bins
    FMULADD_PS m%4, m%6, m%8, m%4, m%6          ; b_im += coeffval * m_im
    FMULADD_PS m%3, m%5, m%8, m%3, m%5          ; b_re += coeffval * m_re
%endmacro ; CQT_CALC

%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
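; Combines the a/b accumulators into the two output components (the left and
; right channels in the C code) and reduces them horizontally, leaving
; {l_re, l_im, r_re, r_im} in the low xmm of register %1:
;   l_re = sum(a_re + b_re)    l_im = sum(a_im - b_im)
;   r_re = sum(b_im + a_im)    r_im = sum(b_re - a_re)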
    addps   m%5, m%4, m%2       ; tmp  = b_im + a_im   (r_re)
    subps   m%6, m%3, m%1       ; tmp2 = b_re - a_re   (r_im)
    addps   m%1, m%1, m%3       ; a_re = a_re + b_re   (l_re)
    subps   m%2, m%2, m%4       ; a_im = a_im - b_im   (l_im)
    HADDPS  m%5, m%6, m%3       ; pairwise-add r_re / r_im
    HADDPS  m%1, m%2, m%3       ; pairwise-add l_re / l_im
    HADDPS  m%1, m%5, m%2       ; m%1 = {l_re, l_im, r_re, r_im} per 128-bit lane
    %if mmsize == 32
    vextractf128 xmm%2, m%1, 1
    addps   xmm%1, xmm%2        ; fold the upper lane into the lower
    %endif
%endmacro ; CQT_SEPARATE

%macro DECLARE_CQT_CALC 0
; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
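;
; x86-64: two output bins are processed per iteration; m0-m3 accumulate a/b
; for bin k and m8-m11 for bin k+1.  The range shared by both coefficient
; lists is handled in .loop_ab, whatever remains of the longer list in
; .loop_a/.loop_b, and the results {|l|^2, |r|^2} of both bins are packed
; into a single 16-byte store (dst advances by 16, len decreases by 2).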
%if ARCH_X86_64
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
    align   16
    .loop_k:
        mov     xd, [coeffsq + Coeffs.len]                          ; coeffs[k].len
        xorps   m0, m0, m0                                          ; m0-m3: a/b accumulators for bin k
        movaps  m1, m0
        movaps  m2, m0
        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof] ; coeffs[k+1].len
        movaps  m3, m0
        movaps  m8, m0                                              ; m8-m11: a/b accumulators for bin k+1
        cmp     coeffs_lend, xd
        movaps  m9, m0
        movaps  m10, m0
        movaps  m11, m0
        cmova   coeffs_lend, xd                                     ; coeffs_lend = min(len[k], len[k+1])
        xor     xd, xd
        mov     coeffs_valq, [coeffsq + Coeffs.val]                 ; value pointers for bin k and bin k+1,
        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof] ; loaded up front for the tail loops
        test    coeffs_lend, coeffs_lend
        jz      .check_loop_b                                       ; no shared range: go straight to the tails
        align   16
        .loop_ab:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            movaps  m7, [coeffs_val2q + 4 * xq]
            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
            add     xd, mmsize/4
            cmp     xd, coeffs_lend
            jb      .loop_ab
        .check_loop_b:                                              ; does bin k+1 extend past the shared range?
        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
        jae     .check_loop_a
        align   16
        .loop_b:
            movaps  m7, [coeffs_val2q + 4 * xq]
            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
            jb      .loop_b
        .loop_end:
        CQT_SEPARATE 0, 1, 2, 3, 4, 5                               ; bin k:   xmm0 = {l_re, l_im, r_re, r_im}
        CQT_SEPARATE 8, 9, 10, 11, 4, 5                             ; bin k+1: xmm8 = {l_re, l_im, r_re, r_im}
        mulps   xmm0, xmm0
        mulps   xmm8, xmm8
        HADDPS  xmm0, xmm8, xmm1                                    ; {|l|^2, |r|^2} for bin k, then bin k+1
        movaps  [dstq], xmm0                                        ; store dst[k] and dst[k+1]
        sub     lend, 2
        lea     dstq, [dstq + 16]
        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
        jnz     .loop_k
        REP_RET
        align   16
        .check_loop_a:                                              ; does bin k extend past the shared range?
        cmp     xd, [coeffsq + Coeffs.len]
        jae     .loop_end
        align   16
        .loop_a:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len]
            jb      .loop_a
        jmp     .loop_end
%else
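; x86-32: one output bin per iteration using only m0-m7; the fifth argument
; (fft_len) is read from the stack as r4m.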
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
%define fft_lend r4m
    align   16
    .loop_k:
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0, m0                  ; m0-m3: a/b accumulators for bin k
        movaps  m1, m0
        movaps  m2, m0
        movaps  m3, m0
        test    xd, xd
        jz      .store                      ; no coefficients: store zeros
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        xor     xd, xd
        align   16
        .loop_x:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len]
            jb      .loop_x
        CQT_SEPARATE 0, 1, 2, 3, 4, 5       ; xmm0 = {l_re, l_im, r_re, r_im}
        mulps   xmm0, xmm0
        HADDPS  xmm0, xmm0, xmm1            ; low half = {|l|^2, |r|^2}
        .store:
        movlps  [dstq], xmm0                ; store dst[k]
        sub     lend, 1
        lea     dstq, [dstq + 8]
        lea     coeffsq, [coeffsq + Coeffs.sizeof]
        jnz     .loop_k
        REP_RET
%endif ; ARCH_X86_64
%endmacro ; DECLARE_CQT_CALC

INIT_XMM sse
DECLARE_CQT_CALC
INIT_XMM sse3
DECLARE_CQT_CALC
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECLARE_CQT_CALC
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
DECLARE_CQT_CALC
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
DECLARE_CQT_CALC
%endif