/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * Register aliases used by the macros and the function in this file.
 *
 * RESULT and BUF are both a1: the buffer pointer arrives in a1 and the
 * return value leaves in a1, so BUF cannot be read any more once RESULT
 * has been written.
 *
 * SIZE     byte count argument, counted down as data is consumed
 * PATTERN  holds 0x80008000 while the word-at-a-time loops run
 * PTR      current read position inside the buffer
 * DAT0-3   data loaded from the buffer
 * TMP0-3   per-word results of the halfword test
 *
 * TMP3 is lr, which is why the function stacks lr and returns by popping pc.
 */
RESULT  .req    a1
BUF     .req    a1
SIZE    .req    a2
PATTERN .req    a3
PTR     .req    a4
DAT0    .req    v1
DAT1    .req    v2
DAT2    .req    v3
DAT3    .req    v4
TMP0    .req    v5
TMP1    .req    v6
TMP2    .req    ip
TMP3    .req    lr

/* How far ahead of PTR the pld in innerloop16 reaches, in units of 32 bytes
 * (the code below treats 32 bytes / 8 words as one cacheline). */
#define PRELOAD_DISTANCE 4

/*
 * innerloop4 - test one word for a candidate halfword.
 *
 * Loads the word at PTR into DAT0 (PTR advances by 4), subtracts 4 from SIZE
 * and leaves (DAT0 - (PATTERN >> 14)) & ~DAT0 & PATTERN in TMP0.
 *
 * Flags on exit:
 *   C  result of the SIZE subtraction (clear once SIZE has gone below zero)
 *   Z  result of the final ands: clear (ne) when TMP0 is non-zero, i.e. the
 *      word contains a match
 *
 * Changes DAT0, TMP0, PTR, SIZE and the flags.
 */
.macro innerloop4
        ldr     DAT0, [PTR], #4
        subs    SIZE, SIZE, #4 @ C flag survives rest of macro
        @ Borrow out of a 16-bit lane that was below 2 sets its top bit...
        sub     TMP0, DAT0, PATTERN, lsr #14
        @ ...unless that top bit was already set in the data itself
        bic     TMP0, TMP0, DAT0
        @ Keep only the top bit of each lane; this sets Z for the caller
        ands    TMP0, TMP0, PATTERN
.endm

/*
 * innerloop16 - test four consecutive words for a candidate halfword.
 *
 * Arguments:
 *   decrement   when not empty, this many bytes are subtracted from SIZE
 *               with the flags updated; when empty SIZE is left alone
 *   do_preload  when not empty, a pld is issued PRELOAD_DISTANCE*32 bytes
 *               beyond the already advanced PTR
 *
 * Loads DAT0-DAT3 from PTR (PTR advances by 16) and applies the same test as
 * innerloop4 to each word, using TMP0-TMP3.
 *
 * Flags on exit:
 *   C  from the SIZE subtraction, when one was requested
 *   Z  set only when none of the four words matched
 *
 * The masking of TMP1-TMP3 is conditional on every earlier word having come
 * out as zero.  After a match only the TMP register of the first matching
 * word and the ones before it (all zero) are meaningful; later TMP registers
 * have not been masked with PATTERN and must not be inspected.
 *
 * Changes DAT0-DAT3, TMP0-TMP3, PTR, the flags, and SIZE when requested.
 */
.macro innerloop16  decrement, do_preload
        ldmia   PTR!, {DAT0,DAT1,DAT2,DAT3}
 .ifnc "\do_preload",""
        pld     [PTR, #PRELOAD_DISTANCE*32]
 .endif
 .ifnc "\decrement",""
        subs    SIZE, SIZE, #\decrement @ C flag survives rest of macro
 .endif
        @ The four per-word tests are interleaved; only the ands/andseq
        @ instructions write the flags
        sub     TMP0, DAT0, PATTERN, lsr #14
        sub     TMP1, DAT1, PATTERN, lsr #14
        bic     TMP0, TMP0, DAT0
        bic     TMP1, TMP1, DAT1
        sub     TMP2, DAT2, PATTERN, lsr #14
        sub     TMP3, DAT3, PATTERN, lsr #14
        ands    TMP0, TMP0, PATTERN
        bic     TMP2, TMP2, DAT2
        @ Each following word is only masked while everything so far was zero,
        @ so a non-zero result leaves Z clear for the rest of the macro
        it      eq
        andseq  TMP1, TMP1, PATTERN
        bic     TMP3, TMP3, DAT3
        itt     eq
        andseq  TMP2, TMP2, PATTERN
        andseq  TMP3, TMP3, PATTERN
.endm

/*
 * int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size)
 *
 * In:    a1 = buf, a2 = size (number of bytes to scan)
 * Out:   a1 = byte offset from buf of the candidate that was found, or the
 *             offset of the end of the scanned data when there is none
 * Saves: v1-v6 and lr are pushed on entry and popped again on exit
 * Uses:  a2-a4, ip and the flags as scratch
 *
 * Buffers shorter than (PRELOAD_DISTANCE+3)*32 - 1 bytes are scanned one byte
 * at a time for a zero byte.  Longer buffers are brought up to word, 4-word
 * and then cacheline alignment and scanned with the innerloop4/innerloop16
 * macros, which flag any aligned halfword whose big-endian value is 0 or 1
 * (the byte pairs 00 00 and 00 01).  The 9x exit paths then turn the flagged
 * halfword into a byte offset, stepping back one byte when the byte just
 * before the halfword is zero as well.
 *
 * The word loops run with the data endianness switched to big-endian
 * (setend be); every exit reached from them switches back with setend le.
 * NOTE(review): this presumes the caller runs little-endian - confirm.
 *
 * A size of 0 returns 0.
 */
function ff_startcode_find_candidate_armv6, export=1
        push    {v1-v6,lr}
        mov     PTR, BUF
        @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
        @ before using code that does preloads
        cmp     SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
        blo     60f

        @ Get to word-alignment, 1 byte at a time
        tst     PTR, #3
        beq     2f
1:      ldrb    DAT0, [PTR], #1
        sub     SIZE, SIZE, #1
        teq     DAT0, #0
        beq     90f
        tst     PTR, #3
        bne     1b
2:      @ Get to 4-word alignment, 1 word at a time
        ldr     PATTERN, =0x80008000
        @ From here on word loads are big-endian, so the first byte in memory
        @ ends up in the top byte of the register
        setend  be
        tst     PTR, #12
        beq     4f
3:      innerloop4
        bne     91f
        tst     PTR, #12
        bne     3b
4:      @ Get to cacheline (8-word) alignment
        tst     PTR, #16
        beq     5f
        innerloop16  16
        bne     93f
5:      @ Check complete cachelines, with preloading
        @ We need to stop when there are still (PRELOAD_DISTANCE+1)
        @ complete cachelines to go
        @ SIZE is biased downwards so that the carry from the subtraction
        @ inside the loop doubles as the loop condition
        sub     SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
6:      innerloop16  , do_preload
        bne     93f
        innerloop16  32
        bne     93f
        bcs     6b
        @ Preload trailing part-cacheline, if any
        tst     SIZE, #31
        beq     7f
        pld     [PTR, #(PRELOAD_DISTANCE+1)*32]
        @ Check remaining data without doing any more preloads. First
        @ do in chunks of 4 words:
        @ (the adds removes the bias again, less the 16 bytes per iteration)
7:      adds    SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
        bmi     9f
8:      innerloop16  16
        bne     93f
        bcs     8b
        @ Then in words:
9:      adds    SIZE, SIZE, #16 - 4
        bmi     11f
10:     innerloop4
        bne     91f
        bcs     10b
11:     setend  le
        @ Check second byte of final halfword
        @ (a lone zero there is not flagged by the halfword test)
        ldrb    DAT0, [PTR, #-1]
        teq     DAT0, #0
        beq     90f
        @ Check any remaining bytes
        @ SIZE is still biased by a multiple of 4 here, so its low two bits
        @ give the number of bytes that are left
        tst     SIZE, #3
        beq     13f
12:     ldrb    DAT0, [PTR], #1
        sub     SIZE, SIZE, #1
        teq     DAT0, #0
        beq     90f
        tst     SIZE, #3
        bne     12b
        @ No candidate found
13:     sub     RESULT, PTR, BUF
        b       99f

60:     @ Small buffer - simply check by looping over bytes
        subs    SIZE, SIZE, #1
        @ Empty buffer: leave through the no-candidate exit so that 0 is
        @ returned. Going straight to 99f would hand back the buf pointer,
        @ because RESULT and BUF are the same register.
        bcc     62f
61:     ldrb    DAT0, [PTR], #1
        subs    SIZE, SIZE, #1
        teq     DAT0, #0
        beq     90f
        bcs     61b
        @ No candidate found
62:     sub     RESULT, PTR, BUF
        b       99f

90:     @ Found a candidate at the preceding byte
        @ Only reached while data endianness is little-endian
        sub     RESULT, PTR, BUF
        sub     RESULT, RESULT, #1
        b       99f

91:     @ Found a candidate somewhere in the preceding 4 bytes
        sub     RESULT, PTR, BUF
        sub     RESULT, RESULT, #4
        @ Repeat the test on the upper halfword alone: mi means the match is
        @ in the first two bytes of the word, pl that it is in the last two
        sub     TMP0, DAT0, #0x20000
        bics    TMP0, TMP0, DAT0
        itt     pl
        ldrbpl  DAT0, [PTR, #-3]
        addpl   RESULT, RESULT, #2
        bpl     92f
        teq     RESULT, #0
        beq     98f @ do not look back a byte if found at first byte in buffer
        ldrb    DAT0, [PTR, #-5]
92:     @ DAT0 = byte just before the matching halfword; if it is zero as
        @ well, report that byte instead
        teq     DAT0, #0
        it      eq
        subeq   RESULT, RESULT, #1
        b       98f

93:     @ Found a candidate somewhere in the preceding 16 bytes
        @ TMP0-TMP2 are looked at in order; the first non-zero one names the
        @ word, and the last word is taken when all three are zero
        sub     RESULT, PTR, BUF
        sub     RESULT, RESULT, #16
        teq     TMP0, #0
        beq     95f @ not in first 4 bytes
        sub     TMP0, DAT0, #0x20000
        bics    TMP0, TMP0, DAT0
        itt     pl
        ldrbpl  DAT0, [PTR, #-15]
        addpl   RESULT, RESULT, #2
        bpl     94f
        teq     RESULT, #0
        beq     98f @ do not look back a byte if found at first byte in buffer
        ldrb    DAT0, [PTR, #-17]
94:     teq     DAT0, #0
        it      eq
        subeq   RESULT, RESULT, #1
        b       98f
95:     add     RESULT, RESULT, #4
        teq     TMP1, #0
        beq     96f @ not in next 4 bytes
        @ From the second word on there is always a byte before the word
        @ inside the block just loaded, so no start-of-buffer check is needed
        sub     TMP1, DAT1, #0x20000
        bics    TMP1, TMP1, DAT1
        itee    mi
        ldrbmi  DAT0, [PTR, #-13]
        ldrbpl  DAT0, [PTR, #-11]
        addpl   RESULT, RESULT, #2
        teq     DAT0, #0
        it      eq
        subeq   RESULT, RESULT, #1
        b       98f
96:     add     RESULT, RESULT, #4
        teq     TMP2, #0
        beq     97f @ not in next 4 bytes
        sub     TMP2, DAT2, #0x20000
        bics    TMP2, TMP2, DAT2
        itee    mi
        ldrbmi  DAT0, [PTR, #-9]
        ldrbpl  DAT0, [PTR, #-7]
        addpl   RESULT, RESULT, #2
        teq     DAT0, #0
        it      eq
        subeq   RESULT, RESULT, #1
        b       98f
97:     @ Must be in the last 4 bytes
        add     RESULT, RESULT, #4
        sub     TMP3, DAT3, #0x20000
        bics    TMP3, TMP3, DAT3
        itee    mi
        ldrbmi  DAT0, [PTR, #-5]
        ldrbpl  DAT0, [PTR, #-3]
        addpl   RESULT, RESULT, #2
        teq     DAT0, #0
        it      eq
        subeq   RESULT, RESULT, #1
        @ drop through to 98f
98:     @ Exit for paths that are still in big-endian data mode
        setend  le
99:     @ Common exit: restore the saved registers and return
        pop     {v1-v6,pc}
endfunc

        @ Drop the register aliases again now that the function is complete
        .unreq  RESULT
        .unreq  BUF
        .unreq  SIZE
        .unreq  PATTERN
        .unreq  PTR
        .unreq  DAT0
        .unreq  DAT1
        .unreq  DAT2
        .unreq  DAT3
        .unreq  TMP0
        .unreq  TMP1
        .unreq  TMP2
        .unreq  TMP3