;*****************************************************************************
;* x86-optimized functions for overlay filter
;*
;* Copyright (C) 2018 Paul B Mahol
;* Copyright (C) 2018 Henrik Gramner
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Per-lane constants for the overlay_row_* kernels below. Each one fills a
; whole 128-bit register and is loaded with mova before the pixel loop.

; Sixteen bytes of 1: second operand of pmaddubsw in overlay_row_20, which
; turns every pair of adjacent alpha bytes into their 16-bit sum.
pb_1:     times 16 db 1
; Added to the 16-bit blend sum just before the pmulhuw by pw_257.
pw_128:   times  8 dw 128
; XORed with an alpha value in 0..255 to obtain 255 - alpha; overlay_row_22
; also uses its complement (via pandn) to isolate the high byte of each word.
pw_255:   times  8 dw 255
; pmulhuw multiplier: keeps the high word of v * 257, i.e. (v * 257) >> 16.
pw_257:   times  8 dw 257

SECTION .text

;------------------------------------------------------------------------------
; overlay_row_44(d, da, s, a, w)
;
; Blends the bytes at s over the bytes at d, in place, with one alpha byte per
; pixel read from a:
;
;     d[i] = ((s[i]*a[i] + d[i]*(255 - a[i]) + 128) * 257) >> 16
;
; mmsize/2 (8) pixels are handled per iteration. Only whole groups of 8 are
; processed: w is rounded down to a multiple of 8, and a row with w < 8 is
; left untouched.
;
; In:   d, s, a  byte pointers (d is read and written)
;       w        signed 32-bit width, widened with movsxdifnidn
;       da       not referenced by this kernel
; Out:  eax      number of pixels processed (a multiple of 8, possibly 0)
; Tmp:  r (remainder), x (pixel index), m0-m5
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal overlay_row_44, 5, 7, 6, 0, d, da, s, a, w, r, x
    movsxdifnidn wq, wd
    xor          xd, xd                 ; x = 0; stays 0 if the loop is skipped
    cmp          wq, mmsize/2
    jl .done                            ; signed: w <= 0 also lands here
    mov          rq, wq
    and          rq, mmsize/2 - 1       ; r = w mod 8
    sub          wq, rq                 ; w = largest multiple of 8 <= w
    mova         m5, [pw_257]
    mova         m4, [pw_128]
    mova         m3, [pw_255]
    .blend:
        pmovzxbw    m2, [aq+xq]         ; m2 = alpha, 8 x u8 widened to u16
        pmovzxbw    m0, [sq+xq]         ; m0 = s
        pmovzxbw    m1, [dq+xq]         ; m1 = d
        pmullw      m0, m2              ; m0 = s * alpha
        pxor        m2, m3              ; m2 = 255 - alpha
        pmullw      m1, m2              ; m1 = d * (255 - alpha)
        paddw       m0, m1              ; sum <= 255*255, fits in 16 bits
        paddw       m0, m4              ; + 128
        pmulhuw     m0, m5              ; (sum * 257) >> 16
        packuswb    m0, m0              ; back to bytes; low 8 bytes are the result
        movq   [dq+xq], m0
        add         xq, mmsize/2
        cmp         xq, wq
        jl .blend

    .done:
    mov    eax, xd
    RET

;------------------------------------------------------------------------------
; overlay_row_22(d, da, s, a, w)
;
; Same blend as a one-alpha-per-pixel row, except that the alpha plane is read
; at twice the pixel index: output pixel i uses the byte pair
; a0 = a[2*i], a1 = a[2*i + 1], combined as
;
;     alpha = (3*a0 + a1) >> 2
;     d[i]  = ((s[i]*alpha + d[i]*(255 - alpha) + 128) * 257) >> 16
;
; mmsize/2 (8) pixels are handled per iteration. Only the first w - 1 pixels
; are considered, rounded down to a multiple of 8; if w - 1 < 8 nothing is
; written.
;
; In:   d, s, a  byte pointers (d is read and written)
;       w        signed 32-bit width, widened with movsxdifnidn
;       da       not referenced by this kernel
; Out:  eax      number of pixels processed (a multiple of 8, possibly 0)
; Tmp:  r (remainder), x (pixel index), m0-m5
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal overlay_row_22, 5, 7, 6, 0, d, da, s, a, w, r, x
    movsxdifnidn wq, wd
    xor          xd, xd                 ; x = 0; stays 0 if the loop is skipped
    sub          wq, 1                  ; the last pixel is never handled here
    cmp          wq, mmsize/2
    jl .done                            ; signed: w - 1 may be negative
    mov          rq, wq
    and          rq, mmsize/2 - 1       ; r = (w - 1) mod 8
    sub          wq, rq                 ; w = largest multiple of 8 <= w - 1
    mova         m5, [pw_257]
    mova         m4, [pw_128]
    mova         m3, [pw_255]
    .blend:
        movu        m1, [aq+2*xq]       ; 8 words, each a0 | a1 << 8
        pmovzxbw    m0, [sq+xq]         ; m0 = s
        pandn       m2, m3, m1          ; m2 = word & ~255 = a1 << 8
        psllw       m1, 8               ; m1 = a0 << 8
        pavgw       m2, m1              ; m2 = (a0 + a1) << 7
        pavgw       m2, m1              ; m2 = (3*a0 + a1) << 6
        psrlw       m2, 8               ; m2 = alpha = (3*a0 + a1) >> 2
        pmovzxbw    m1, [dq+xq]         ; m1 = d
        pmullw      m0, m2              ; m0 = s * alpha
        pxor        m2, m3              ; m2 = 255 - alpha
        pmullw      m1, m2              ; m1 = d * (255 - alpha)
        paddw       m0, m1              ; sum <= 255*255, fits in 16 bits
        paddw       m0, m4              ; + 128
        pmulhuw     m0, m5              ; (sum * 257) >> 16
        packuswb    m0, m0              ; back to bytes; low 8 bytes are the result
        movq   [dq+xq], m0
        add         xq, mmsize/2
        cmp         xq, wq
        jl .blend

    .done:
    mov    eax, xd
    RET

;------------------------------------------------------------------------------
; overlay_row_20(d, da, s, a, w, r)
;
; Blend with alpha averaged over a 2x2 block. Two alpha rows are read, one at
; a and one at a + r, both at twice the pixel index. For output pixel i:
;
;     alpha = (a[2*i] + a[2*i + 1] + a[r + 2*i] + a[r + 2*i + 1]) >> 2
;     d[i]  = ((s[i]*alpha + d[i]*(255 - alpha) + 128) * 257) >> 16
;
; mmsize/2 (8) pixels are handled per iteration. Only the first w - 1 pixels
; are considered, rounded down to a multiple of 8; if w - 1 < 8 nothing is
; written.
;
; In:   d, s, a  byte pointers (d is read and written)
;       w        signed 32-bit width, widened with movsxdifnidn
;       r        pointer-sized byte offset from a to the second alpha row
;       da       incoming value is discarded; the register is reused for a + r
; Out:  eax      number of pixels processed (a multiple of 8, possibly 0)
; Tmp:  r (reused for the remainder), x (pixel index), m0-m6
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal overlay_row_20, 6, 7, 7, 0, d, da, s, a, w, r, x
    mov         daq, rmp
    add         daq, aq                 ; da = a + r, before r is recycled below
    movsxdifnidn wq, wd
    xor          xd, xd                 ; x = 0; stays 0 if the loop is skipped
    sub          wq, 1                  ; the last pixel is never handled here
    cmp          wq, mmsize/2
    jl .done                            ; signed: w - 1 may be negative
    mov          rq, wq
    and          rq, mmsize/2 - 1       ; r = (w - 1) mod 8
    sub          wq, rq                 ; w = largest multiple of 8 <= w - 1
    mova         m6, [pb_1]
    mova         m5, [pw_257]
    mova         m4, [pw_128]
    mova         m3, [pw_255]
    .blend:
        movu        m2, [aq+2*xq]       ; 16 alpha bytes from the row at a
        movu        m1, [daq+2*xq]      ; 16 alpha bytes from the row at a + r
        pmaddubsw   m2, m6              ; adjacent byte pairs -> 16-bit sums
        pmaddubsw   m1, m6
        paddw       m2, m1              ; sum of the four samples, <= 1020
        psrlw       m2, 2               ; m2 = alpha
        pmovzxbw    m0, [sq+xq]         ; m0 = s
        pmovzxbw    m1, [dq+xq]         ; m1 = d
        pmullw      m0, m2              ; m0 = s * alpha
        pxor        m2, m3              ; m2 = 255 - alpha
        pmullw      m1, m2              ; m1 = d * (255 - alpha)
        paddw       m0, m1              ; sum <= 255*255, fits in 16 bits
        paddw       m0, m4              ; + 128
        pmulhuw     m0, m5              ; (sum * 257) >> 16
        packuswb    m0, m0              ; back to bytes; low 8 bytes are the result
        movq   [dq+xq], m0
        add         xq, mmsize/2
        cmp         xq, wq
        jl .blend

    .done:
    mov    eax, xd
    RET