;*****************************************************************************
;* x86-optimized functions for bwdif filter
;*
;* Copyright (C) 2016 Thomas Mundt <loudmax@yahoo.de>
;*
;* Based on yadif simd code
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;*               2013 Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Word coefficient tables consumed by the FILTER macro below.
;  pw_coefhf  : (1016, 5570) pairs, applied with pmaddwd to interleaved words of
;               (sum of the four samples at t0*4 / t1*4 offsets, sum of the two
;               samples at offset 0).
;  pw_coefhf1 : -3801, multiplied (pmullw/pmulhw -> 32 bit) with the sum of the
;               four samples at t0*2 / t1*2 offsets.
;  pw_coefsp  : (5077, -981) pairs, applied with pmaddwd to interleaved words of
;               (cur[t0]+cur[t1], cur[t2]+cur[t3]).
;  pw_splfdif : (-768, 768) delta that FILTER adds to pw_coefsp under a per-pixel
;               mask, turning the pair into (4309, -213).
pw_coefhf:  times 4 dw  1016, 5570
pw_coefhf1: times 8 dw -3801
pw_coefsp:  times 4 dw  5077, -981
pw_splfdif: times 4 dw  -768,  768

SECTION .text

; LOAD8 dst, src
; Load packed bytes from src into the low half of dst and widen them to words
; by interleaving with m7. The caller must keep m7 zeroed for this to be a
; zero-extension (FILTER clears m7 before every iteration).
%macro LOAD8 2
    movh         %1, %2
    punpcklbw    %1, m7
%endmacro

; LOAD12 dst, src
; Load a full register of samples from src with an unaligned move. No widening
; is done here, unlike LOAD8; the samples are used directly as words by FILTER.
%macro LOAD12 2
    movu         %1, %2
%endmacro

; DISP8
; Store the result held in m2 for the 8-bit path: pack the words to bytes with
; unsigned saturation and write the low half of the register to [dstq].
%macro DISP8 0
    packuswb     m2, m2
    movh     [dstq], m2
%endmacro

; DISP12
; Store the result held in m2 for the 16-bit-sample path: clamp each word
; between m7 and m12, then write the whole register to [dstq] unaligned.
; Relies on the caller having m7 = 0 (FILTER re-zeroes it right before the
; DISP call) and m12 = clip_max broadcast to every word (set up in BWDIF).
%macro DISP12 0
    CLIPW        m2, m7, m12
    movu     [dstq], m2
%endmacro

; FILTER label_suffix, ptrA, ptrB, depth, bytes_per_sample
;   %1  suffix appended to the loop label (.loop%1) so the macro can be
;       expanded twice inside one function
;   %2  first base pointer register  (prevq or curq, chosen by PROC)
;   %3  second base pointer register (curq or nextq, chosen by PROC)
;   %4  selects the LOAD%4 / DISP%4 helpers (8 or 12)
;   %5  multiplier applied to the t0..t3 offsets (1 or 2 in this file)
;
; Processes mmsize/2 pixels per iteration until wm drops to zero or below.
; Expects t0/t1 (and t2/t3 on x86-64) to be set up by PROC, and STEP to be
; defined by the caller. On x86-32 m8..m11 are stack slots (see PROC) and the
; code names r4/r5 directly, which matches PROC's DECLARE_REG_TMP 4, 5.
;
; Register roles that live across stages:
;   m7  = zero (needed by LOAD8 and DISP12); temporarily reused as scratch
;   m8  = %2[0] + %3[0]
;   m9  = |%2[0] - %3[0]|
;   m10 = sum of the four samples at +-2*t offsets in %2 and %3
;   m11 = final clamp radius
%macro FILTER 5
    pxor         m7, m7
.loop%1:
    ; --- Stage 1: difference measure ------------------------------------
    ; m0 = cur[t0], m1 = cur[t1]
    LOAD%4       m0, [curq+t0*%5]
    LOAD%4       m1, [curq+t1*%5]
    ; m8 = %2[0] + %3[0], m9 = |%2[0] - %3[0]| (m2 keeps a copy of the latter)
    LOAD%4       m2, [%2]
    LOAD%4       m3, [%3]
    mova         m4, m3
    paddw        m3, m2
    psubw        m2, m4
    ABS1         m2, m4
    mova         m8, m3
    mova         m9, m2
    ; m3 = |prev[t0] - cur[t0]| + |prev[t1] - cur[t1]|
    LOAD%4       m3, [prevq+t0*%5]
    LOAD%4       m4, [prevq+t1*%5]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4
    ; halve both measures and keep the larger one in m2
    psrlw        m2, 1
    psrlw        m3, 1
    pmaxsw       m2, m3
    ; same against next: m3 = (|next[t0] - cur[t0]| + |next[t1] - cur[t1]|) >> 1
    LOAD%4       m3, [nextq+t0*%5]
    LOAD%4       m4, [nextq+t1*%5]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4
    psrlw        m3, 1
    pmaxsw       m2, m3

    ; --- Stage 2: widen the difference using samples at 2*t0 / 2*t1 ------
    ; The doubled offsets are formed with the address scale; the prefs2/mrefs2
    ; arguments declared in BWDIF are not read.
    LOAD%4       m3, [%2+t0*2*%5]
    LOAD%4       m4, [%3+t0*2*%5]
    LOAD%4       m5, [%2+t1*2*%5]
    LOAD%4       m6, [%3+t1*2*%5]
    ; m3 = %2[2*t0] + %3[2*t0], m5 = %2[2*t1] + %3[2*t1], m10 = m3 + m5
    paddw        m3, m4
    paddw        m5, m6
    mova         m6, m3
    paddw        m6, m5
    mova        m10, m6
    ; m3 = (m3 >> 1) - cur[t0], m5 = (m5 >> 1) - cur[t1]
    psrlw        m3, 1
    psrlw        m5, 1
    psubw        m3, m0
    psubw        m5, m1
    ; m3 = min of the two, m5 = max of the two
    mova         m6, m3
    pminsw       m3, m5
    pmaxsw       m5, m6
    ; m6 = (m8 >> 1) - cur[t0], m4 = (m8 >> 1) - cur[t1]
    mova         m4, m8
    psraw        m4, 1
    mova         m6, m4
    psubw        m6, m0
    psubw        m4, m1
    ; m3 = max(m3, m6, m4), m5 = min(m5, m6, m4)
    pmaxsw       m3, m6
    pminsw       m5, m6
    pmaxsw       m3, m4
    pminsw       m5, m4
    ; m6 = max(-m3, m5); m7 is still zero here
    mova         m6, m7
    psubw        m6, m3
    pmaxsw       m6, m5
    ; only let m6 raise the difference in lanes where the difference is > 0
    mova         m3, m2
    pcmpgtw      m3, m7
    pand         m6, m3
    pmaxsw       m2, m6
    mova        m11, m2

    ; --- Stage 3: 32-bit term from the 4*t0 / 4*t1 samples ---------------
    ; m2 = %2[4*t0] + %3[4*t0] + %2[4*t1] + %3[4*t1]
    LOAD%4       m2, [%2+t0*4*%5]
    LOAD%4       m3, [%3+t0*4*%5]
    LOAD%4       m4, [%2+t1*4*%5]
    LOAD%4       m5, [%3+t1*4*%5]
    paddw        m2, m3
    paddw        m4, m5
    paddw        m2, m4
    ; interleave with m8 so pmaddwd yields 1016*m2 + 5570*m8 per dword
    mova         m3, m2
    punpcklwd    m2, m8
    punpckhwd    m3, m8
    pmaddwd      m2, [pw_coefhf]
    pmaddwd      m3, [pw_coefhf]
    ; 32-bit product m10 * -3801: low and high halves of the word multiply
    ; are interleaved back into dwords
    mova         m4, m10
    mova         m6, m4
    pmullw       m4, [pw_coefhf1]
    pmulhw       m6, [pw_coefhf1]
    mova         m5, m4
    punpcklwd    m4, m6
    punpckhwd    m5, m6
    ; m2/m3 (low/high dwords) = (1016*sum4 + 5570*m8 - 3801*m10) >> 2
    paddd        m2, m4
    paddd        m3, m5
    psrad        m2, 2
    psrad        m3, 2

    ; --- Stage 4: term from cur[t0..t3], combine, scale ------------------
    ; m4 = cur[t0] (copy), m0 = cur[t0] + cur[t1]
    mova         m4, m0
    paddw        m0, m1
%if ARCH_X86_64
    LOAD%4       m5, [curq+t2*%5]
    LOAD%4       m6, [curq+t3*%5]
%else
    ; x86-32 has no spare GPRs for t2/t3: borrow r4/r5 (= t0/t1) for the
    ; prefs3/mrefs3 offsets, then restore prefs/mrefs for the next iteration
    mov          r4, prefs3mp
    mov          r5, mrefs3mp
    LOAD%4       m5, [curq+t0*%5]
    LOAD%4       m6, [curq+t1*%5]
    mov          r4, prefsmp
    mov          r5, mrefsmp
%endif
    ; m6 = the two samples just loaded, summed
    paddw        m6, m5
    ; m1 = word mask of lanes where |cur[t1] - cur[t0]| > m9
    psubw        m1, m4
    ABS1         m1, m4
    pcmpgtw      m1, m9
    ; duplicate each mask word so m1/m4 cover the low/high dword lanes
    mova         m4, m1
    punpcklwd    m1, m4
    punpckhwd    m4, m4
    ; stage-3 term is kept only in masked lanes
    pand         m2, m1
    pand         m3, m4
    ; per-lane coefficient pair: pw_coefsp, plus pw_splfdif in masked lanes,
    ; i.e. (5077, -981) unmasked and (4309, -213) masked. m7 is scratch here.
    mova         m5, [pw_splfdif]
    mova         m7, m5
    pand         m5, m1
    pand         m7, m4
    paddw        m5, [pw_coefsp]
    paddw        m7, [pw_coefsp]
    ; pmaddwd over interleaved (m0, m6) words with the selected pair
    mova         m4, m0
    punpcklwd    m0, m6
    punpckhwd    m4, m6
    pmaddwd      m0, m5
    pmaddwd      m4, m7
    ; add both terms, shift right by 13, pack back to signed words in m2
    paddd        m2, m0
    paddd        m3, m4
    psrad        m2, 13
    psrad        m3, 13
    packssdw     m2, m3

    ; --- Stage 5: clamp around (m8 >> 1) by +-m11 and store --------------
    mova         m4, m8
    psraw        m4, 1
    mova         m0, m11
    mova         m3, m4
    psubw        m4, m0
    paddw        m3, m0
    CLIPW        m2, m4, m3
    ; m7 was used as scratch above: restore the zero needed by DISP12 and by
    ; LOAD8 in the next iteration
    pxor         m7, m7
    DISP%4

    ; advance all four pointers by STEP bytes; wm counts down by mmsize/2
    add        dstq, STEP
    add       prevq, STEP
    add        curq, STEP
    add       nextq, STEP
    sub    DWORD wm, mmsize/2
    jg .loop%1
%endmacro

; PROC depth, bytes_per_sample
;   %1  forwarded to FILTER as its LOAD/DISP selector (8 or 12)
;   %2  forwarded to FILTER as its offset multiplier
;
; Function body shared by both entry points: binds the temporary registers to
; the prefs/mrefs offsets, then runs one of two FILTER expansions depending on
; the parity argument.
;   parity != 0 : FILTER reads its paired samples from prevq / curq
;   parity == 0 : FILTER reads its paired samples from curq  / nextq
%macro PROC 2
%if ARCH_X86_64
    ; t0..t3 = sign-extended prefs, mrefs, prefs3, mrefs3
    DECLARE_REG_TMP 5, 6, 7, 8
    movsxd       t0, DWORD prefsm
    movsxd       t1, DWORD mrefsm
    movsxd       t2, DWORD prefs3m
    movsxd       t3, DWORD mrefs3m
%else
    ; only two spare GPRs: t0/t1 = prefs/mrefs; FILTER reloads them when it
    ; needs prefs3/mrefs3. m8..m11 do not exist as registers here, so they
    ; are mapped onto the stack area reserved by cglobal.
    DECLARE_REG_TMP 4, 5
    %define m8  [rsp+ 0]
    %define m9  [rsp+16]
    %define m10 [rsp+32]
    %define m11 [rsp+48]
    mov          t0, prefsmp
    mov          t1, mrefsmp
%endif
    cmp DWORD paritym, 0
    jne .parity1
    FILTER 0, curq, nextq, %1, %2
    jmp .ret
.parity1:
    FILTER 1, prevq, curq, %1, %2
.ret:
    RET
%endmacro

; BWDIF
; Emits the two entry points for the currently selected instruction set:
;   bwdif_filter_line        - samples loaded/stored through LOAD8/DISP8
;   bwdif_filter_line_12bit  - samples loaded/stored through LOAD12/DISP12
; Both take the same named argument list; only the first four arguments are
; loaded into registers by cglobal, the rest are read through their
; <name>m / <name>mp accessors in PROC and FILTER.
%macro BWDIF 0
%if ARCH_X86_64
; x86-64: 9 GPRs (r5..r8 become t0..t3 in PROC), 12 vector regs (m0..m11)
cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%else
; x86-32: 6 GPRs, 8 vector regs, 64 bytes of stack for the m8..m11 slots that
; PROC maps to [rsp+0..48]
cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%endif
    ; LOAD8 reads half a register of bytes per iteration, so pointers advance
    ; by mmsize/2
    %define STEP mmsize/2
    PROC 8, 1

%if ARCH_X86_64
; one more vector reg than above: m12 holds clip_max in every word
cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    movd        m12, DWORD clip_maxm
    SPLATW      m12, m12, 0
%else
; 80 bytes of stack: the four slots used by PROC plus m12 at [rsp+64]
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    %define m12 [rsp+64]
    ; build the broadcast in m0 (free at this point) and spill it to the slot
    movd         m0, DWORD clip_maxm
    SPLATW       m0, m0, 0
    mova        m12, m0
%endif
    ; LOAD12 reads a full register per iteration, so pointers advance by mmsize
    %define STEP mmsize
    PROC 12, 2
%endmacro

; Instantiate both entry points once per instruction set. The MMX-register
; variant is only assembled for 32-bit x86.
INIT_XMM ssse3
BWDIF
INIT_XMM sse2
BWDIF
%if ARCH_X86_32
INIT_MMX mmxext
BWDIF
%endif