/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "asm-offsets.h"

/*
 * resample_one fmt, es=2
 *
 * Emits the exported function ff_resample_one_<fmt>_neon, which computes
 * one output sample as the dot product of filter_length source elements
 * with filter_length filter coefficients.
 *
 *   fmt  suffix used in the function name; "dbl" additionally enables the
 *        second accumulator helpers M_MUL2 / M_MLA2 (for every other
 *        format they are defined here as empty macros)
 *   es   log2 of the size in bytes of one source / filter element
 *
 * The caller of this macro must have defined LOAD1, LOAD2, LOAD4, LOAD8,
 * M_MUL, M_MLA and STORE_ONE (plus M_MUL2 / M_MLA2 for dbl) beforehand.
 * All of them are purged again at the end of the expansion.
 *
 * Registers as used by the generated function:
 *   x0  context; FILTER_BANK, FILTER_LENGTH, PHASE_SHIFT and the 32 bit
 *       word following PHASE_SHIFT (phase_mask) are read from it
 *   x1  destination base, handed to STORE_ONE
 *   w2  destination index, sign-extended to 64 bit before use
 *   x3  source base
 *   w4  index: (index >> phase_shift) selects the first source element,
 *       (index & phase_mask) selects the filter inside the filter bank
 *   x5 and above are not read.
 *
 * Clobbers x2-x4, x6-x11, v0-v7, v16-v27 and the condition flags.
 *
 * A filter_length of 0 is not special-cased: execution falls through to
 * label 1 and one tap is processed.
 *
 * NOTE(review): only the low 32 bits of x4 are consumed. This assumes the
 * C prototype declares index as a 32 bit integer, like the dst_index that
 * is widened with sxtw below; confirm against the declaration. Under
 * AAPCS64 the upper half of such an argument register is unspecified, so
 * it must not leak into the address computation. With a 32 bit shift the
 * amount is taken modulo 32; phase_shift is presumably always below 32.
 */
.macro resample_one     fmt, es=2
.ifnc \fmt, dbl
    // Formats other than dbl keep the whole running sum in what M_MUL and
    // M_MLA update, so the second-accumulator helpers expand to nothing.
    .macro  M_MUL2      x:vararg
    .endm
    .macro  M_MLA2      x:vararg
    .endm
.endif
function ff_resample_one_\fmt\()_neon, export=1
        sxtw            x2,  w2                         // widen the 32 bit dst index for addressing
        ldr             x9,  [x0, #FILTER_BANK]
        ldr             w6,  [x0, #FILTER_LENGTH]
        ldp             w7,  w8,  [x0, #PHASE_SHIFT]    // and phase_mask
        lsr             w10, w4,  w7                    // sample_index, zero-extended into x10
        and             x4,  x4,  x8                    // x8 has a zero upper half, so the result does too
        lsl             x11, x6,  #\es          // filter_length * elem_size
        add             x3,  x3,  x10, lsl #\es // src[sample_index]
        madd            x9,  x11, x4,  x9       // filter
        cmp             w6,  #16
        b.lt            5f
8:      // remaining filter_length at least 16
        subs            w6,  w6,  #16                   // flags are consumed by the b.eq below
        LOAD8           v4,  v5,  v6,  v7,  x3
        LOAD8           v16, v17, v18, v19, x9
        M_MUL           v0,  v4,  v16, v1               // first products initialise the accumulators
        M_MUL2          v1,  v6,  v18
7:
        // Second half of a 16 element step. The loads and vector
        // arithmetic in between leave the flags of the last subs intact.
        LOAD8           v20, v21, v22, v23, x3
        M_MLA           v0,  v5,  v17, v1
        M_MLA2          v1,  v7,  v19
        LOAD8           v24, v25, v26, v27, x9
        M_MLA           v0,  v20, v24, v1
        M_MLA2          v1,  v22, v26
        b.eq            6f                              // nothing left after this step
        cmp             w6,  #16
        M_MLA           v0,  v21, v25, v1
        M_MLA2          v1,  v23, v27
        b.lt            4f                              // 1-15 elements left, use the tail code
        subs            w6,  w6,  #16
        LOAD8           v4,  v5,  v6,  v7,  x3
        LOAD8           v16, v17, v18, v19, x9
        M_MLA           v0,  v4,  v16, v1
        M_MLA2          v1,  v6,  v18
        b               7b
6:
        // filter_length was a multiple of 16: finish the step and store
        M_MLA           v0,  v21, v25,  v1
        M_MLA2          v1,  v23, v27
        STORE_ONE       0,   x1,  x2,   v1
        ret
5:
        // fewer than 16 taps in total: start from cleared accumulators
        movi            v0.16b, #0
        movi            v1.16b, #0
4:      // remaining filter_length 1-15
        cmp             w6,  #4
        b.lt            2f
        subs            w6,  w6,  #4
        LOAD4           v4,  v5,  x3
        LOAD4           v6,  v7,  x9
        M_MLA           v0,  v4,  v6,  v1
        M_MLA2          v1,  v5,  v7
        b.eq            0f
        b               4b
2:      // remaining filter_length 1-3
        cmp             w6,  #2
        b.lt            1f
        LOAD2           2,   x3
        LOAD2           3,   x9
        subs            w6,  w6,  #2
        M_MLA           v0,  v2,  v3
        b.eq            0f
1:      // remaining filter_length 1
        LOAD1           6,   x3
        LOAD1           7,   x9
        M_MLA           v0,  v6,  v7
0:
        STORE_ONE       0,   x1,  x2,  v1
        ret
endfunc

// Drop the per-format helpers so the next format can define its own.
.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem LOAD8
.purgem M_MLA
.purgem M_MLA2
.purgem M_MUL
.purgem M_MUL2
.purgem STORE_ONE
.endm


// Helpers for the double precision kernel: elements are 8 bytes, two per
// 128 bit vector. LOAD1, LOAD2 and STORE_ONE take a bare register number,
// the remaining macros take full vN register names. Every load
// post-increments its pointer by the number of bytes read.
.macro  LOAD1           num, ptr
        ldr             d\num, [\ptr], #8
.endm
.macro  LOAD2           num, ptr
        ld1             {v\num\().2d}, [\ptr], #16
.endm
.macro  LOAD4           va, vb, ptr
        ld1             {\va\().2d,\vb\().2d}, [\ptr], #32
.endm
.macro  LOAD8           va, vb, vc, vd, ptr
        ld1             {\va\().2d,\vb\().2d,\vc\().2d,\vd\().2d}, [\ptr], #64
.endm
// acc += lhs * rhs; an optional fourth argument is accepted and ignored
.macro  M_MLA           acc, lhs, rhs, ignored:vararg
        fmla            \acc\().2d, \lhs\().2d, \rhs\().2d
.endm
// same operation, used for the second accumulator
.macro  M_MLA2          acc, lhs, rhs
        fmla            \acc\().2d, \lhs\().2d, \rhs\().2d
.endm
// acc = lhs * rhs; an optional fourth argument is accepted and ignored
.macro  M_MUL           acc, lhs, rhs, ignored:vararg
        fmul            \acc\().2d, \lhs\().2d, \rhs\().2d
.endm
// same operation, used for the second accumulator
.macro  M_MUL2          acc, lhs, rhs
        fmul            \acc\().2d, \lhs\().2d, \rhs\().2d
.endm
// Add both accumulators, reduce the two lanes to one double and store it
// at base + (slot << 3).
.macro  STORE_ONE       num, base, slot, hi
        fadd            v\num\().2d,  v\num\().2d,  \hi\().2d
        faddp           d\num\(),  v\num\().2d
        str             d\num\(),  [\base, \slot, lsl #3]
.endm

// es = 3: eight byte elements
resample_one dbl, 3


// Helpers for the single precision kernel: elements are 4 bytes, four per
// 128 bit vector, so LOAD4 fills one register and LOAD8 fills two. The
// surplus register arguments exist only to keep the argument lists
// identical across formats.
.macro  LOAD1           num, ptr
        ldr             s\num, [\ptr], #4
.endm
.macro  LOAD2           num, ptr
        ld1             {v\num\().2s}, [\ptr], #8
.endm
.macro  LOAD4           va, unused, ptr
        ld1             {\va\().4s}, [\ptr], #16
.endm
.macro  LOAD8           va, vb, unused0, unused1, ptr
        ld1             {\va\().4s,\vb\().4s}, [\ptr], #32
.endm
// acc += lhs * rhs; an optional fourth argument is accepted and ignored
.macro  M_MLA           acc, lhs, rhs, ignored:vararg
        fmla            \acc\().4s, \lhs\().4s, \rhs\().4s
.endm
// acc = lhs * rhs; an optional fourth argument is accepted and ignored
.macro  M_MUL           acc, lhs, rhs, ignored:vararg
        fmul            \acc\().4s, \lhs\().4s, \rhs\().4s
.endm
// Reduce the four lanes of the accumulator to one float with two pairwise
// adds and store it at base + (slot << 2). The fourth argument is unused.
.macro  STORE_ONE       num, base, slot, unused
        faddp           v\num\().4s,  v\num\().4s,  v\num\().4s
        faddp           s\num\(),  v\num\().2s
        str             s\num\(),  [\base, \slot, lsl #2]
.endm

// default es = 2: four byte elements
resample_one flt


// Helpers for the signed 16 bit kernel: elements are 2 bytes and products
// are accumulated as four 32 bit lanes. LOAD4 reads four elements into one
// register, LOAD8 reads eight into two; the surplus register arguments
// only keep the argument lists identical across formats.
.macro  LOAD1           num, ptr
        ldr             h\num, [\ptr], #2
.endm
.macro  LOAD2           num, ptr
        ldr             s\num, [\ptr], #4
.endm
.macro  LOAD4           va, unused, ptr
        ld1             {\va\().4h}, [\ptr], #8
.endm
.macro  LOAD8           va, vb, unused0, unused1, ptr
        ld1             {\va\().4h,\vb\().4h}, [\ptr], #16
.endm
// acc += widening product of lhs and rhs; a fourth argument is ignored
.macro  M_MLA           acc, lhs, rhs, ignored:vararg
        smlal           \acc\().4s, \lhs\().4h, \rhs\().4h
.endm
// acc = widening product of lhs and rhs; a fourth argument is ignored
.macro  M_MUL           acc, lhs, rhs, ignored:vararg
        smull           \acc\().4s, \lhs\().4h, \rhs\().4h
.endm
// Sum the four 32 bit lanes with two pairwise adds, narrow with a rounding,
// saturating right shift by 15 and store the 16 bit result at
// base + (slot << 1). The fourth argument is unused.
.macro  STORE_ONE       num, base, slot, unused
        addp            v\num\().4s,  v\num\().4s,  v\num\().4s
        addp            v\num\().4s,  v\num\().4s,  v\num\().4s
        sqrshrn         v\num\().4h,  v\num\().4s,  #15
        str             h\num\(),  [\base, \slot, lsl #1]
.endm

// es = 1: two byte elements
resample_one s16, 1


// Helpers for the signed 32 bit kernel: elements are 4 bytes and products
// are accumulated as 64 bit lanes. One 4 element input register feeds two
// accumulators: the low half goes to the first, the high half to the
// optional second one. The surplus register arguments of LOAD4 and LOAD8
// only keep the argument lists identical across formats.
.macro  LOAD1           num, ptr
        ldr             s\num, [\ptr], #4
.endm
.macro  LOAD2           num, ptr
        ld1             {v\num\().2s}, [\ptr], #8
.endm
.macro  LOAD4           va, unused, ptr
        ld1             {\va\().4s}, [\ptr], #16
.endm
.macro  LOAD8           va, vb, unused0, unused1, ptr
        ld1             {\va\().4s,\vb\().4s}, [\ptr], #32
.endm
// lo += lhs[0..1] * rhs[0..1]; if a fourth argument is given,
// hi += lhs[2..3] * rhs[2..3] as well
.macro  M_MLA           lo, lhs, rhs, hi:vararg
        smlal           \lo\().2d, \lhs\().2s, \rhs\().2s
.ifnb \hi
        smlal2          \hi\().2d, \lhs\().4s, \rhs\().4s
.endif
.endm
// lo = lhs[0..1] * rhs[0..1]; if a fourth argument is given,
// hi = lhs[2..3] * rhs[2..3] as well
.macro  M_MUL           lo, lhs, rhs, hi:vararg
        smull           \lo\().2d, \lhs\().2s, \rhs\().2s
.ifnb \hi
        smull2          \hi\().2d, \lhs\().4s, \rhs\().4s
.endif
.endm
// Add both accumulators, reduce the two 64 bit lanes to one, narrow with a
// rounding, saturating right shift by 30 and store the 32 bit result at
// base + (slot << 2).
.macro  STORE_ONE       num, base, slot, hi
        add             v\num\().2d,  v\num\().2d,  \hi\().2d
        addp            d\num\(),     v\num\().2d
        sqrshrn         v\num\().2s,  v\num\().2d,  #30
        str             s\num\(),  [\base, \slot, lsl #2]
.endm

// default es = 2: four byte elements
resample_one s32