Commit ae976fe9 authored by Siarhei Siamashka
Browse files

CPU: add ARM memcpy assembly function

This is my old ARM9E/ARM11 memcpy code from
    https://garage.maemo.org/projects/mplayer/


with some tuning for Raspberry Pi (aligned prefetch added).

Will be used by VFP optimized overlapped blt function.
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
parent 98f1b119
/*
* Copyright © 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
* Copyright © 2006-2008, 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
......@@ -201,6 +201,210 @@ asm_function writeback_scratch_to_mem_neon
/******************************************************************************/
/*
 * Helper macro for the memcpy function. It copies data from the source (r1)
 * to the destination (r0) buffer, fixing the source alignment in the process.
 * The destination buffer should be aligned already (4 bytes alignment is
 * required). The size of the block to copy is in the r2 register.
 *
 * shift - misalignment of the source pointer in bytes; memcpy_armv5te
 *         instantiates the macro with 1, 2 and 3, matching (r1 & 3).
 *
 * Every output word is assembled from two neighbouring aligned source words:
 * (previous >> (shift * 8)) | (next << (32 - shift * 8)). The most recently
 * loaded source word is always kept in ip. The shift directions correspond
 * to a little-endian data layout.
 *
 * Stack layout expected on entry (as pushed by memcpy_armv5te):
 *   [sp]     saved r5, r6
 *   [sp + 8] original r0 (the return value), saved r4
 *
 * The macro does not fall through: it restores r5, r6, r4, reloads the
 * original destination pointer into r0 and returns with "bx lr".
 * r7-r11 are saved and restored around the main loop.
 * Clobbers r1, r2, r3, ip and the flags.
 */
.macro UNALIGNED_MEMCPY shift
    /* step back to the aligned word holding the first source byte, preload it */
    sub     r1, #(\shift)
    ldr     ip, [r1], #4

    /* copy one word if destination bit 2 is set (makes it 8 bytes aligned) */
    tst     r0, #4
    movne   r3, ip, lsr #(\shift * 8)
    ldrne   ip, [r1], #4
    subne   r2, r2, #4
    orrne   r3, r3, ip, asl #(32 - \shift * 8)
    strne   r3, [r0], #4

    /* copy two words if destination bit 3 is set (makes it 16 bytes aligned) */
    tst     r0, #8
    movne   r3, ip, lsr #(\shift * 8)
    ldmiane r1!, {r4, ip}
    subne   r2, r2, #8
    orrne   r3, r3, r4, asl #(32 - \shift * 8)
    movne   r4, r4, lsr #(\shift * 8)
    orrne   r4, r4, ip, asl #(32 - \shift * 8)
    stmiane r0!, {r3-r4}

    /* less than one full 32 bytes block left: go straight to the tail */
    cmp     r2, #32
    blt     3f
    pld     [r1, #48]
    stmfd   sp!, {r7, r8, r9, r10, r11}
    /* r9 = offset from r1 to a 32 bytes aligned address, used for prefetch */
    add     r3, r1, #128
    bic     r3, r3, #31
    sub     r9, r3, r1
1:
    /*
     * Main loop, 32 bytes per iteration. r2 is decremented first and the
     * block is copied only if the result is not negative; the loop repeats
     * while the result is positive. None of the instructions between
     * "subs" and "bgt" modify the flags.
     */
    pld     [r1, r9]
    subs    r2, r2, #32
    movge   r3, ip, lsr #(\shift * 8)
    ldmiage r1!, {r4-r6, r7, r8, r10, r11, ip}
    orrge   r3, r3, r4, asl #(32 - \shift * 8)
    movge   r4, r4, lsr #(\shift * 8)
    orrge   r4, r4, r5, asl #(32 - \shift * 8)
    movge   r5, r5, lsr #(\shift * 8)
    orrge   r5, r5, r6, asl #(32 - \shift * 8)
    movge   r6, r6, lsr #(\shift * 8)
    orrge   r6, r6, r7, asl #(32 - \shift * 8)
    stmiage r0!, {r3-r6}
    movge   r7, r7, lsr #(\shift * 8)
    orrge   r7, r7, r8, asl #(32 - \shift * 8)
    movge   r8, r8, lsr #(\shift * 8)
    orrge   r8, r8, r10, asl #(32 - \shift * 8)
    movge   r10, r10, lsr #(\shift * 8)
    orrge   r10, r10, r11, asl #(32 - \shift * 8)
    movge   r11, r11, lsr #(\shift * 8)
    orrge   r11, r11, ip, asl #(32 - \shift * 8)
    stmiage r0!, {r7, r8, r10, r11}
    bgt     1b
2:
    ldmfd   sp!, {r7, r8, r9, r10, r11}
3:  /* copy remaining data */
    /*
     * Subtracting 32 never changes the low 5 bits of r2, so they still hold
     * the number of bytes left and each bit selects one tail step.
     */

    /* 16 bytes: the whole step is conditional on the same "tst" result */
    tst     r2, #16
    movne   r3, ip, lsr #(\shift * 8)
    ldmiane r1!, {r4-r6, ip}
    orrne   r3, r3, r4, asl #(32 - \shift * 8)
    movne   r4, r4, lsr #(\shift * 8)
    orrne   r4, r4, r5, asl #(32 - \shift * 8)
    movne   r5, r5, lsr #(\shift * 8)
    orrne   r5, r5, r6, asl #(32 - \shift * 8)
    movne   r6, r6, lsr #(\shift * 8)
    orrne   r6, r6, ip, asl #(32 - \shift * 8)
    stmiane r0!, {r3-r6}

    /* 8 bytes */
    tst     r2, #8
    movne   r3, ip, lsr #(\shift * 8)
    ldmiane r1!, {r4, ip}
    orrne   r3, r3, r4, asl #(32 - \shift * 8)
    movne   r4, r4, lsr #(\shift * 8)
    orrne   r4, r4, ip, asl #(32 - \shift * 8)
    stmiane r0!, {r3-r4}

    /*
     * 4 bytes. The unconditional "sub" moves r1 back from the end of the
     * last loaded word to the first source byte which is not copied yet,
     * so that the final bytes can be read with plain byte loads.
     */
    tst     r2, #4
    movne   r3, ip, lsr #(\shift * 8)
    ldrne   ip, [r1], #4
    sub     r1, r1, #(4 - \shift)
    orrne   r3, r3, ip, asl #(32 - \shift * 8)
    strne   r3, [r0], #4

    /* 2 bytes; restoring r5 is interleaved (ldr does not touch the flags) */
    tst     r2, #2
    ldrbne  r3, [r1], #1
    ldrbne  r4, [r1], #1
    ldr     r5, [sp], #4
    strbne  r3, [r0], #1
    strbne  r4, [r0], #1

    /* 1 byte; restoring r6 is interleaved */
    tst     r2, #1
    ldrbne  r3, [r1], #1
    ldr     r6, [sp], #4
    strbne  r3, [r0], #1

    /* reload the original destination pointer as the return value, restore r4 */
    ldmfd   sp!, {r0, r4}
    bx      lr
.endm
/*
 * Memcpy function with Raspberry Pi specific aligned prefetch, based on
 * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S
 *
 * Register usage:
 *   r0 - destination pointer on entry; the same value is returned in r0
 *   r1 - source pointer
 *   r2 - number of bytes to copy
 *
 * r4-r11 are saved on the stack before they are used and restored before
 * returning. r1, r2, r3, ip and the flags are clobbered.
 *
 * NOTE(review): all size checks use signed conditions (blt/bge/bgt), so a
 * size of 0x80000000 or more is treated as negative - confirm that callers
 * never pass such sizes.
 */
asm_function memcpy_armv5te
/* blocks shorter than 20 bytes are handled by the simple byte loop at 9: */
cmp r2, #20
blt 9f
/* copy data until destination address is 4 bytes aligned */
/* one byte if the destination address is odd; the push of the original r0 */
/* (needed for the return value) and r4 is interleaved with the copy */
tst r0, #1
ldrbne r3, [r1], #1
stmfd sp!, {r0, r4}
subne r2, r2, #1
strbne r3, [r0], #1
/* two bytes if destination bit 1 is set: they are loaded separately, */
/* merged as r3 | (r4 << 8) and stored with a single halfword store; */
/* the push of r5, r6 is interleaved */
tst r0, #2
ldrbne r3, [r1], #1
ldrbne r4, [r1], #1
stmfd sp!, {r5, r6}
subne r2, r2, #2
orrne r3, r3, r4, asl #8
strhne r3, [r0], #2
/* destination address is 4 bytes aligned */
/* now we should handle 4 cases of source address alignment */
/* (r1 & 3) == 0 falls through, 1 and 3 are dispatched via 6:, 2 goes to 7: */
tst r1, #1
bne 6f
tst r1, #2
bne 7f
/* both source and destination are 4 bytes aligned */
stmfd sp!, {r7, r8, r9, r10, r11}
/* copy one word if destination bit 2 is set */
tst r0, #4
ldrne r4, [r1], #4
subne r2, r2, #4
strne r4, [r0], #4
/* copy two words if destination bit 3 is set, after this the destination */
/* is 16 bytes aligned; interleaved with it, r9 is set to the offset from */
/* r1 to a 32 bytes aligned address: ((r1 + 96) & ~31) - r1 */
tst r0, #8
ldmiane r1!, {r3-r4}
add r9, r1, #96
subne r2, r2, #8
bic r9, r9, #31
stmiane r0!, {r3-r4}
sub r9, r9, r1
/* main loop, 32 bytes per iteration: r2 is decremented first, the block is */
/* copied only if the result is not negative and the loop repeats while the */
/* result is positive; pld uses the offset prepared in r9 */
1:
subs r2, r2, #32
ldmiage r1!, {r3-r6, r7, r8, r10, r11}
pld [r1, r9]
stmiage r0!, {r3-r6}
stmiage r0!, {r7, r8, r10, r11}
bgt 1b
2:
ldmfd sp!, {r7, r8, r9, r10, r11}
/* subtracting 32 never changes the low 5 bits of r2, so they still hold */
/* the number of bytes left; each bit selects one of the tail steps below */
tst r2, #16
ldmiane r1!, {r3-r6}
stmiane r0!, {r3-r6}
tst r2, #8
ldmiane r1!, {r3-r4}
stmiane r0!, {r3-r4}
/* from here on the write pointer lives in ip, which lets the restore of */
/* r5, r6 and of {original r0, r4} be interleaved with the last stores; */
/* ldr/ldm/mov do not modify the flags set by the preceding tst */
tst r2, #4
ldrne r3, [r1], #4
mov ip, r0
strne r3, [ip], #4
tst r2, #2
ldrhne r3, [r1], #2
ldmfd sp!, {r5, r6}
strhne r3, [ip], #2
tst r2, #1
ldrbne r3, [r1], #1
ldmfd sp!, {r0, r4}
strbne r3, [ip], #1
bx lr
/* source address is odd: (r1 & 3) is either 1 or 3 */
/* each UNALIGNED_MEMCPY expansion ends with its own "bx lr", so there is */
/* no fall through from one expansion into the next one */
6:
tst r1, #2
bne 8f
UNALIGNED_MEMCPY 1
/* (r1 & 3) == 2 */
7:
UNALIGNED_MEMCPY 2
/* (r1 & 3) == 3 */
8:
UNALIGNED_MEMCPY 3
/* small block (less than 20 bytes): plain byte copy, 3 bytes per iteration; */
/* only r0 (for the return value) and r4 have to be saved on this path */
9:
stmfd sp!, {r0, r4}
/* the byte read from the destination into ip is overwritten by the third */
/* source byte before it is used; presumably the read is there only to */
/* touch the destination memory before writing to it - TODO confirm */
1: subs r2, r2, #3
ldrbge ip, [r0]
ldrbge r3, [r1], #1
ldrbge r4, [r1], #1
ldrbge ip, [r1], #1
strbge r3, [r0], #1
strbge r4, [r0], #1
strbge ip, [r0], #1
bge 1b
/* r2 is in the range -3..-1 here; after adding 2, "ge" means that at least */
/* one more byte is left and "gt" means that there are two of them; */
/* the restore of the original r0 and r4 is interleaved with these copies */
adds r2, r2, #2
ldrbge r3, [r1], #1
mov ip, r0
ldr r0, [sp], #4
strbge r3, [ip], #1
ldrbgt r3, [r1], #1
ldr r4, [sp], #4
strbgt r3, [ip], #1
bx lr
.endfunc
/******************************************************************************/
/*
* aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem)
*
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment