/*
 * Copyright © 2006-2008, 2013 Siarhei Siamashka
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#ifdef __arm__

.text
.syntax unified
.fpu neon
.arch armv7a
.object_arch armv4
.arm
.altmacro
.p2align 2

/******************************************************************************/

.macro asm_function function_name
    .global \function_name
.func \function_name
\function_name:
.endm

/******************************************************************************/

/*
 * writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src)
 *
 * Copy a chunk of data from a cached scratch buffer (so prefetch is not
 * really needed), to a memory buffer in forward direction. Generated from
 * pixman macro templates.
 */
asm_function writeback_scratch_to_mem_neon
    mov         ip, r1
    cmp         r0, #32
    blt         0f
    tst         ip, #15
    beq         1f
    tst         ip, #1
    beq         2f
    vld1.8      {d0[1]}, [r2]!
    add         ip, ip, #1
    sub         r0, r0, #1
2:  tst         ip, #2
    beq         3f
    vld1.8      {d0[2]}, [r2]!
    vld1.8      {d0[3]}, [r2]!
    add         ip, ip, #2
    sub         r0, r0, #2
3:  tst         ip, #4
    beq         4f
    vld1.8      {d0[4]}, [r2]!
    vld1.8      {d0[5]}, [r2]!
    vld1.8      {d0[6]}, [r2]!
    vld1.8      {d0[7]}, [r2]!
    add         ip, ip, #4
    sub         r0, r0, #4
4:  tst         ip, #8
    beq         5f
    vld1.8      {d1}, [r2]!
    add         ip, ip, #8
    sub         r0, r0, #8
5:  vld1.8      {d2-d3}, [r2]!
    add         ip, ip, #16
    sub         r0, r0, #16
    tst         r1, #1
    beq         6f
    vst1.8      {d0[1]}, [r1]!
6:  tst         r1, #2
    beq         7f
    vst1.8      {d0[2]}, [r1]!
    vst1.8      {d0[3]}, [r1]!
7:  tst         r1, #4
    beq         8f
    vst1.8      {d0[4]}, [r1]!
    vst1.8      {d0[5]}, [r1]!
    vst1.8      {d0[6]}, [r1]!
    vst1.8      {d0[7]}, [r1]!
8:  tst         r1, #8
    beq         9f
    vst1.8      {d1}, [r1, :64]!
9:  vst1.8      {d2-d3}, [r1, :128]!
1:  subs        r0, r0, #32
    blt         10f
    vld1.8      {d0-d3}, [r2]!
    subs        r0, r0, #32
    blt         11f
12: vst1.8      {d0-d3}, [r1, :128]!
    vld1.8      {d0-d3}, [r2]!
    subs        r0, r0, #32
    bge         12b
11: vst1.8      {d0-d3}, [r1, :128]!
10: tst         r0, #31
    beq         13f
    tst         r0, #16
    beq         14f
    vld1.8      {d2-d3}, [r2]!
14: tst         r0, #8
    beq         15f
    vld1.8      {d1}, [r2]!
15: tst         r0, #4
    beq         16f
    vld1.8      {d0[4]}, [r2]!
    vld1.8      {d0[5]}, [r2]!
    vld1.8      {d0[6]}, [r2]!
    vld1.8      {d0[7]}, [r2]!
16: tst         r0, #2
    beq         17f
    vld1.8      {d0[2]}, [r2]!
    vld1.8      {d0[3]}, [r2]!
17: tst         r0, #1
    beq         18f
    vld1.8      {d0[1]}, [r2]!
18: tst         r0, #16
    beq         19f
    vst1.8      {d2-d3}, [r1, :128]!
19: tst         r0, #8
    beq         20f
    vst1.8      {d1}, [r1, :64]!
20: tst         r0, #4
    beq         21f
    vst1.8      {d0[4]}, [r1]!
    vst1.8      {d0[5]}, [r1]!
    vst1.8      {d0[6]}, [r1]!
    vst1.8      {d0[7]}, [r1]!
21: tst         r0, #2
    beq         22f
    vst1.8      {d0[2]}, [r1]!
    vst1.8      {d0[3]}, [r1]!
22: tst         r0, #1
    beq         13f
    vst1.8      {d0[1]}, [r1]!
13: bx          lr
0:  tst         r0, #31
    beq         23f
    tst         r0, #16
    beq         24f
    vld1.8      {d2-d3}, [r2]!
24: tst         r0, #8
    beq         25f
    vld1.8      {d1}, [r2]!
25: tst         r0, #4
    beq         26f
    vld1.8      {d0[4]}, [r2]!
    vld1.8      {d0[5]}, [r2]!
    vld1.8      {d0[6]}, [r2]!
    vld1.8      {d0[7]}, [r2]!
26: tst         r0, #2
    beq         27f
    vld1.8      {d0[2]}, [r2]!
    vld1.8      {d0[3]}, [r2]!
27: tst         r0, #1
    beq         28f
    vld1.8      {d0[1]}, [r2]!
28: tst         r0, #16
    beq         29f
    vst1.8      {d2-d3}, [r1]!
29: tst         r0, #8
    beq         30f
    vst1.8      {d1}, [r1]!
30: tst         r0, #4
    beq         31f
    vst1.8      {d0[4]}, [r1]!
    vst1.8      {d0[5]}, [r1]!
    vst1.8      {d0[6]}, [r1]!
    vst1.8      {d0[7]}, [r1]!
31: tst         r0, #2
    beq         32f
    vst1.8      {d0[2]}, [r1]!
    vst1.8      {d0[3]}, [r1]!
32: tst         r0, #1
    beq         23f
    vst1.8      {d0[1]}, [r1]!
23: bx          lr
.endfunc
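
/*
 * Illustrative only (not part of the original code): a hedged sketch of how
 * the function above might be declared and called from C. The names
 * 'scratch_buf', 'dst' and 'len' are made up for this example; the argument
 * order (numbytes, dst, src) comes from the prototype in the comment above.
 *
 *     void writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src);
 *
 *     // copy 'len' bytes previously staged in a cached scratch buffer
 *     // out to an ordinary memory buffer
 *     writeback_scratch_to_mem_neon(len, dst, scratch_buf);
 */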

/******************************************************************************/

/*
 * Helper macro for the memcpy function. It copies data from the source (r1)
 * to the destination (r0) buffer, fixing up alignment in the process. The
 * destination buffer must already be aligned (4 byte alignment is required).
 * The size of the block to copy is passed in the r2 register.
 */
.macro UNALIGNED_MEMCPY shift
    sub      r1, #(\shift)
    ldr      ip, [r1], #4

    tst      r0, #4
    movne    r3, ip, lsr #(\shift * 8)
    ldrne    ip, [r1], #4
    subne    r2, r2, #4
    orrne    r3, r3, ip, asl #(32 - \shift * 8)
    strne    r3, [r0], #4

    tst      r0, #8
    movne    r3, ip, lsr #(\shift * 8)
    ldmiane  r1!, {r4, ip}
    subne    r2, r2, #8
    orrne    r3, r3, r4, asl #(32 - \shift * 8)
    movne    r4, r4, lsr #(\shift * 8)
    orrne    r4, r4, ip, asl #(32 - \shift * 8)
    stmiane  r0!, {r3-r4}

    cmp      r2, #32
    blt      3f
    pld      [r1, #48]
    stmfd    sp!, {r7, r8, r9, r10, r11}
    add      r3, r1, #128
    bic      r3, r3, #31
    sub      r9, r3, r1
1:  pld      [r1, r9]
    subs     r2, r2, #32
    movge    r3, ip, lsr #(\shift * 8)
    ldmiage  r1!, {r4-r6, r7, r8, r10, r11, ip}
    orrge    r3, r3, r4, asl #(32 - \shift * 8)
    movge    r4, r4, lsr #(\shift * 8)
    orrge    r4, r4, r5, asl #(32 - \shift * 8)
    movge    r5, r5, lsr #(\shift * 8)
    orrge    r5, r5, r6, asl #(32 - \shift * 8)
    movge    r6, r6, lsr #(\shift * 8)
    orrge    r6, r6, r7, asl #(32 - \shift * 8)
    stmiage  r0!, {r3-r6}
    movge    r7, r7, lsr #(\shift * 8)
    orrge    r7, r7, r8, asl #(32 - \shift * 8)
    movge    r8, r8, lsr #(\shift * 8)
    orrge    r8, r8, r10, asl #(32 - \shift * 8)
    movge    r10, r10, lsr #(\shift * 8)
    orrge    r10, r10, r11, asl #(32 - \shift * 8)
    movge    r11, r11, lsr #(\shift * 8)
    orrge    r11, r11, ip, asl #(32 - \shift * 8)
    stmiage  r0!, {r7, r8, r10, r11}
    bgt      1b
2:  ldmfd    sp!, {r7, r8, r9, r10, r11}

3:  /* copy remaining data */
    tst      r2, #16
    movne    r3, ip, lsr #(\shift * 8)
    ldmiane  r1!, {r4-r6, ip}
    orrne    r3, r3, r4, asl #(32 - \shift * 8)
    movne    r4, r4, lsr #(\shift * 8)
    orrne    r4, r4, r5, asl #(32 - \shift * 8)
    movne    r5, r5, lsr #(\shift * 8)
    orrne    r5, r5, r6, asl #(32 - \shift * 8)
    movne    r6, r6, lsr #(\shift * 8)
    orrne    r6, r6, ip, asl #(32 - \shift * 8)
    stmiane  r0!, {r3-r6}

    tst      r2, #8
    movne    r3, ip, lsr #(\shift * 8)
    ldmiane  r1!, {r4, ip}
    orrne    r3, r3, r4, asl #(32 - \shift * 8)
    movne    r4, r4, lsr #(\shift * 8)
    orrne    r4, r4, ip, asl #(32 - \shift * 8)
    stmiane  r0!, {r3-r4}

    tst      r2, #4
    movne    r3, ip, lsr #(\shift * 8)
    ldrne    ip, [r1], #4
    sub      r1, r1, #(4 - \shift)
    orrne    r3, r3, ip, asl #(32 - \shift * 8)
    strne    r3, [r0], #4

    tst      r2, #2
    ldrbne   r3, [r1], #1
    ldrbne   r4, [r1], #1
    ldr      r5, [sp], #4
    strbne   r3, [r0], #1
    strbne   r4, [r0], #1

    tst      r2, #1
    ldrbne   r3, [r1], #1
    ldr      r6, [sp], #4
    strbne   r3, [r0], #1

    ldmfd    sp!, {r0, r4}
    bx       lr
.endm
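
/*
 * Illustrative only (not from the original source): a minimal C sketch of the
 * word merging technique that the mov/orr pairs in UNALIGNED_MEMCPY implement.
 * The source pointer is first rounded down to a word boundary, and each
 * aligned destination word is then assembled from two consecutive aligned
 * source words using the byte misalignment 'shift' (1, 2 or 3) of the
 * original source address. Little-endian byte order is assumed, matching the
 * lsr/asl direction in the assembly; 'prev_word' and 'next_word' are
 * hypothetical names for the two aligned words read from the source.
 *
 *     #include <stdint.h>
 *
 *     static inline uint32_t merge_unaligned_word(uint32_t prev_word,
 *                                                 uint32_t next_word,
 *                                                 unsigned shift)
 *     {
 *         return (prev_word >> (shift * 8)) | (next_word << (32 - shift * 8));
 *     }
 */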

/*
 * Memcpy function with Raspberry Pi specific aligned prefetch, based on
 * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S
 */
asm_function memcpy_armv5te
    cmp      r2, #20
    blt      9f
    /* copy data until destination address is 4 bytes aligned */
    tst      r0, #1
    ldrbne   r3, [r1], #1
    stmfd    sp!, {r0, r4}
    subne    r2, r2, #1
    strbne   r3, [r0], #1
    tst      r0, #2
    ldrbne   r3, [r1], #1
    ldrbne   r4, [r1], #1
    stmfd    sp!, {r5, r6}
    subne    r2, r2, #2
    orrne    r3, r3, r4, asl #8
    strhne   r3, [r0], #2
    /* destination address is 4 bytes aligned */
    /* now we should handle 4 cases of source address alignment */
    tst      r1, #1
    bne      6f
    tst      r1, #2
    bne      7f
    /* both source and destination are 4 bytes aligned */
    stmfd    sp!, {r7, r8, r9, r10, r11}
    tst      r0, #4
    ldrne    r4, [r1], #4
    subne    r2, r2, #4
    strne    r4, [r0], #4
    tst      r0, #8
    ldmiane  r1!, {r3-r4}
    add      r9, r1, #96
    subne    r2, r2, #8
    bic      r9, r9, #31
    stmiane  r0!, {r3-r4}
    sub      r9, r9, r1
1:  subs     r2, r2, #32
    ldmiage  r1!, {r3-r6, r7, r8, r10, r11}
    pld      [r1, r9]
    stmiage  r0!, {r3-r6}
    stmiage  r0!, {r7, r8, r10, r11}
    bgt      1b
2:  ldmfd    sp!, {r7, r8, r9, r10, r11}
    tst      r2, #16
    ldmiane  r1!, {r3-r6}
    stmiane  r0!, {r3-r6}
    tst      r2, #8
    ldmiane  r1!, {r3-r4}
    stmiane  r0!, {r3-r4}
    tst      r2, #4
    ldrne    r3, [r1], #4
    mov      ip, r0
    strne    r3, [ip], #4
    tst      r2, #2
    ldrhne   r3, [r1], #2
    ldmfd    sp!, {r5, r6}
    strhne   r3, [ip], #2
    tst      r2, #1
    ldrbne   r3, [r1], #1
    ldmfd    sp!, {r0, r4}
    strbne   r3, [ip], #1
    bx       lr

6:  tst      r1, #2
    bne      8f
    UNALIGNED_MEMCPY 1
7:  UNALIGNED_MEMCPY 2
8:  UNALIGNED_MEMCPY 3

9:  stmfd    sp!, {r0, r4}
1:  subs     r2, r2, #3
    ldrbge   r3, [r1], #1
    ldrbge   r4, [r1], #1
    ldrbge   ip, [r1], #1
    strbge   r3, [r0], #1
    strbge   r4, [r0], #1
    strbge   ip, [r0], #1
    bge      1b
    adds     r2, r2, #2
    ldrbge   r3, [r1], #1
    mov      ip, r0
    ldr      r0, [sp], #4
    strbge   r3, [ip], #1
    ldrbgt   r3, [r1], #1
    ldr      r4, [sp], #4
    strbgt   r3, [ip], #1
    bx       lr
.endfunc

/******************************************************************************/

/*
 * aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem)
 *
 * Both the 'scratch' and 'fbmem' pointers must be 32 bytes aligned.
 * The value in 'numbytes' is also rounded up to a multiple of 32 bytes.
 * The only purpose of this code is to minimize the penalty incurred by
 * doing uncached reads from memory (for example a framebuffer). We are
 * trying to do the largest possible perfectly aligned reads to fetch
 * the data into a temporary scratch buffer in L1 cache.
 */
asm_function aligned_fetch_fbmem_to_scratch_neon
    SIZE        .req r0
    DST         .req r1
    SRC         .req r2
    subs        SIZE, #128
    blt         1f
0:
    /* aligned load from the source (framebuffer) */
    vld1.64     {q0, q1}, [SRC, :256]!
    vld1.64     {q2, q3}, [SRC, :256]!
    vld1.64     {q8, q9}, [SRC, :256]!
    vld1.64     {q10, q11}, [SRC, :256]!
    /* fetch destination (scratch buffer) into L1 cache */
    ldr         r3, [DST]
    ldr         ip, [DST, #64]
    /* aligned store to the scratch buffer */
    vst1.64     {q0, q1}, [DST, :256]!
    vst1.64     {q2, q3}, [DST, :256]!
    vst1.64     {q8, q9}, [DST, :256]!
    vst1.64     {q10, q11}, [DST, :256]!
    subs        SIZE, SIZE, #128
    bge         0b
1:  tst         SIZE, #64
    beq         1f
    vld1.64     {q0, q1}, [SRC, :256]!
    vld1.64     {q2, q3}, [SRC, :256]!
    ldr         r3, [DST]
    vst1.64     {q0, q1}, [DST, :256]!
    vst1.64     {q2, q3}, [DST, :256]!
1:  tst         SIZE, #32
    beq         1f
    vld1.64     {q0, q1}, [SRC, :256]!
    vst1.64     {q0, q1}, [DST, :256]!
1:  tst         SIZE, #31
    beq         1f
    vld1.64     {q0, q1}, [SRC, :256]!
    vst1.64     {q0, q1}, [DST, :256]!
1:  bx          lr
    .unreq      SIZE
    .unreq      DST
    .unreq      SRC
.endfunc

#endif
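
/*
 * Illustrative only (not part of the original code): a hedged sketch of how a
 * caller might combine the two NEON helpers above, based purely on their
 * comments. aligned_fetch_fbmem_to_scratch_neon() pulls data out of uncached
 * memory (for example a framebuffer) into a small cached scratch buffer using
 * large aligned reads, and writeback_scratch_to_mem_neon() then copies it to
 * the real destination. The names 'fbmem', 'scratch', 'dst' and the CHUNK
 * size are hypothetical; only the 32 byte alignment requirements come from
 * the comments above. Note that the fetch may read up to the next multiple
 * of 32 bytes beyond 'n', so the scratch buffer must be CHUNK bytes long
 * with CHUNK a multiple of 32.
 *
 *     #include <stdint.h>
 *
 *     #define CHUNK 1024
 *
 *     void aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch,
 *                                              void *fbmem);
 *     void writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src);
 *
 *     // 'fbmem' and 'scratch' are assumed to be 32 byte aligned
 *     void copy_from_framebuffer(uint8_t *dst, uint8_t *fbmem,
 *                                uint8_t *scratch, int total)
 *     {
 *         for (int done = 0; done < total; done += CHUNK) {
 *             int n = total - done < CHUNK ? total - done : CHUNK;
 *             aligned_fetch_fbmem_to_scratch_neon(n, scratch, fbmem + done);
 *             writeback_scratch_to_mem_neon(n, dst + done, scratch);
 *         }
 *     }
 */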