;*****************************************************************************
;* x86-optimized functions for pullup filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

INIT_MMX mmx
cglobal pullup_filter_diff, 3, 5, 8, first, second, size
    mov        r3, 4
    pxor       m4, m4
    pxor       m7, m7

.loop:
    movq       m0, [firstq]
    movq       m2, [firstq]
    add        firstq, sizeq
    movq       m1, [secondq]
    add        secondq, sizeq
    psubusb    m2, m1
    psubusb    m1, m0
    movq       m0, m2
    movq       m3, m1
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    punpckhbw  m2, m7
    punpckhbw  m3, m7
    paddw      m4, m0
    paddw      m4, m1
    paddw      m4, m2
    paddw      m4, m3

    dec        r3
    jnz .loop

    movq       m3, m4
    punpcklwd  m4, m7
    punpckhwd  m3, m7
    paddd      m3, m4
    movd      eax, m3
    psrlq      m3, 32
    movd      r4d, m3
    add       eax, r4d
    RET

INIT_MMX mmx
cglobal pullup_filter_comb, 3, 5, 8, first, second, size
    mov        r3, 4
    pxor       m6, m6
    pxor       m7, m7
    sub        secondq, sizeq

.loop:
    movq       m0, [firstq]
    movq       m1, [secondq]
    punpcklbw  m0, m7
    movq       m2, [secondq+sizeq]
    punpcklbw  m1, m7
    punpcklbw  m2, m7
    paddw      m0, m0
    paddw      m1, m2
    movq       m2, m0
    psubusw    m0, m1
    psubusw    m1, m2
    paddw      m6, m0
    paddw      m6, m1

    movq       m0, [firstq]
    movq       m1, [secondq]
    punpckhbw  m0, m7
    movq       m2, [secondq+sizeq]
    punpckhbw  m1, m7
    punpckhbw  m2, m7
    paddw      m0, m0
    paddw      m1, m2
    movq       m2, m0
    psubusw    m0, m1
    psubusw    m1, m2
    paddw      m6, m0
    paddw      m6, m1

    movq       m0, [secondq+sizeq]
    movq       m1, [firstq]
    punpcklbw  m0, m7
    movq       m2, [firstq+sizeq]
    punpcklbw  m1, m7
    punpcklbw  m2, m7
    paddw      m0, m0
    paddw      m1, m2
    movq       m2, m0
    psubusw    m0, m1
    psubusw    m1, m2
    paddw      m6, m0
    paddw      m6, m1

    movq       m0, [secondq+sizeq]
    movq       m1, [firstq]
    punpckhbw  m0, m7
    movq       m2, [firstq+sizeq]
    punpckhbw  m1, m7
    punpckhbw  m2, m7
    paddw      m0, m0
    paddw      m1, m2
    movq       m2, m0
    psubusw    m0, m1
    psubusw    m1, m2
    paddw      m6, m0
    paddw      m6, m1

    add        firstq, sizeq
    add        secondq, sizeq
    dec        r3
    jnz .loop

    movq       m5, m6
    punpcklwd  m6, m7
    punpckhwd  m5, m7
    paddd      m5, m6
    movd      eax, m5
    psrlq      m5, 32
    movd      r4d, m5
    add       eax, r4d
    RET

INIT_MMX mmx
cglobal pullup_filter_var, 3, 5, 8, first, second, size
    mov        r3, 3
    pxor       m4, m4
    pxor       m7, m7

.loop:
    movq       m0, [firstq]
    movq       m2, [firstq]
    movq       m1, [firstq+sizeq]
    add        firstq, sizeq
    psubusb    m2, m1
    psubusb    m1, m0
    movq       m0, m2
    movq       m3, m1
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    punpckhbw  m2, m7
    punpckhbw  m3, m7
    paddw      m4, m0
    paddw      m4, m1
    paddw      m4, m2
    paddw      m4, m3

    dec        r3
    jnz .loop

    movq       m3, m4
    punpcklwd  m4, m7
    punpckhwd  m3, m7
    paddd      m3, m4
    movd      eax, m3
    psrlq      m3, 32
    movd      r4d, m3
    add       eax, r4d
    shl       eax, 2
    RET