CPU: add ARM memcpy assembly function

This is my old ARM9E/ARM11 memcpy code from https://garage.maemo.org/projects/mplayer/ with some tuning for Raspberry Pi (aligned prefetch added). Will be used by VFP optimized overlapped blt function. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>

CPU: add ARM memcpy assembly function
This is my old ARM9E/ARM11 memcpy code from https://garage.maemo.org/projects/mplayer/ with some tuning for Raspberry Pi (aligned prefetch added). Will be used by VFP optimized overlapped blt function. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
ae976fe9 · Siarhei Siamashka · 98f1b119 · ae976fe9
Commit ae976fe9 authored Jun 03, 2013 by Siarhei Siamashka
--- a/src/arm_asm.S
+++ b/src/arm_asm.S
 /*
- * Copyright © 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ * Copyright © 2006-2008, 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -201,6 +201,210 @@ asm_function writeback_scratch_to_mem_neon

 /******************************************************************************/

+/*
+ * Helper macro for the memcpy function. It copies data from the source (r1)
+ * to the destination (r0) buffer, fixing up the source misalignment in the
+ * process. The destination buffer must already be 4 bytes aligned on entry.
+ *
+ * Inputs:
+ *   shift - macro argument: number of bytes r1 is moved back at the start,
+ *           so that all the bulk source loads are word sized
+ *   r0    - destination pointer (4 bytes aligned)
+ *   r1    - source pointer
+ *   r2    - size of the block to copy, in bytes
+ *
+ * Every output word is assembled from two consecutive source words as
+ * (previous >> (shift * 8)) | (next << (32 - shift * 8)), where ip always
+ * carries the most recently loaded source word. This combination matches
+ * little-endian byte order.
+ *
+ * The macro also finishes the enclosing function: it pops r5, r6 and then
+ * {r0, r4} from the stack (so they must have been pushed in the matching
+ * order before the macro is expanded) and returns with "bx lr".
+ * r3 and ip are used as scratch, r7-r11 are saved and restored around the
+ * main loop.
+ */
+.macro  UNALIGNED_MEMCPY shift
+    /* step r1 back by shift bytes and prime ip with the first source word */
+    sub     r1, #(\shift)
+    ldr     ip, [r1], #4
+
+    /* copy one word if the destination is not 8 bytes aligned */
+    tst     r0, #4
+    movne   r3, ip, lsr #(\shift * 8)
+    ldrne   ip, [r1], #4
+    subne   r2, r2, #4
+    orrne   r3, r3, ip, asl #(32 - \shift * 8)
+    strne   r3, [r0], #4
+
+    /* copy two words if the destination is not 16 bytes aligned */
+    tst     r0, #8
+    movne   r3, ip, lsr #(\shift * 8)
+    ldmiane r1!, {r4, ip}
+    subne   r2, r2, #8
+    orrne   r3, r3, r4, asl #(32 - \shift * 8)
+    movne   r4, r4, lsr #(\shift * 8)
+    orrne   r4, r4, ip, asl #(32 - \shift * 8)
+    stmiane r0!, {r3-r4}
+    /* skip the main loop if less than 32 bytes are left */
+    cmp     r2, #32
+    blt     3f
+    pld     [r1, #48]
+    stmfd   sp!, {r7, r8, r9, r10, r11}
+    /*
+     * r9 = offset from r1 to a 32 bytes aligned address which is up to
+     * 128 bytes ahead. r1 advances by 32 bytes per iteration, so the
+     * prefetch address [r1, r9] stays 32 bytes aligned.
+     */
+    add     r3, r1, #128
+    bic     r3, r3, #31
+    sub     r9, r3, r1
+1:
+    /* main loop: 32 bytes per iteration, r2 goes negative on the last pass */
+    pld     [r1, r9]
+    subs    r2, r2, #32
+    movge   r3, ip, lsr #(\shift * 8)
+    ldmiage r1!, {r4-r6, r7, r8, r10, r11, ip}
+    orrge   r3, r3, r4, asl #(32 - \shift * 8)
+    movge   r4, r4, lsr #(\shift * 8)
+    orrge   r4, r4, r5, asl #(32 - \shift * 8)
+    movge   r5, r5, lsr #(\shift * 8)
+    orrge   r5, r5, r6, asl #(32 - \shift * 8)
+    movge   r6, r6, lsr #(\shift * 8)
+    orrge   r6, r6, r7, asl #(32 - \shift * 8)
+    stmiage r0!, {r3-r6}
+    movge   r7, r7, lsr #(\shift * 8)
+    orrge   r7, r7, r8, asl #(32 - \shift * 8)
+    movge   r8, r8, lsr #(\shift * 8)
+    orrge   r8, r8, r10, asl #(32 - \shift * 8)
+    movge   r10, r10, lsr #(\shift * 8)
+    orrge   r10, r10, r11, asl #(32 - \shift * 8)
+    movge   r11, r11, lsr #(\shift * 8)
+    orrge   r11, r11, ip, asl #(32 - \shift * 8)
+    stmiage r0!, {r7, r8, r10, r11}
+    bgt     1b
+2:
+    ldmfd   sp!, {r7, r8, r9, r10, r11}
+3:  /* copy remaining data, the low 5 bits of r2 hold the number of bytes left */
+    /*
+     * All instructions of this group are predicated on "ne": tst does not
+     * update the V flag, so a "ge" condition here would depend on a stale
+     * flag left by the earlier cmp/subs instead of on the tested bit.
+     */
+    tst     r2, #16
+    movne   r3, ip, lsr #(\shift * 8)
+    ldmiane r1!, {r4-r6, ip}
+    orrne   r3, r3, r4, asl #(32 - \shift * 8)
+    movne   r4, r4, lsr #(\shift * 8)
+    orrne   r4, r4, r5, asl #(32 - \shift * 8)
+    movne   r5, r5, lsr #(\shift * 8)
+    orrne   r5, r5, r6, asl #(32 - \shift * 8)
+    movne   r6, r6, lsr #(\shift * 8)
+    orrne   r6, r6, ip, asl #(32 - \shift * 8)
+    stmiane r0!, {r3-r6}
+
+    tst     r2, #8
+    movne   r3, ip, lsr #(\shift * 8)
+    ldmiane r1!, {r4, ip}
+    orrne   r3, r3, r4, asl #(32 - \shift * 8)
+    movne   r4, r4, lsr #(\shift * 8)
+    orrne   r4, r4, ip, asl #(32 - \shift * 8)
+    stmiane r0!, {r3-r4}
+
+    tst     r2, #4
+    movne   r3, ip, lsr #(\shift * 8)
+    ldrne   ip, [r1], #4
+    /* unconditional: move r1 back to the next not yet copied source byte */
+    sub     r1, r1, #(4 - \shift)
+    orrne   r3, r3, ip, asl #(32 - \shift * 8)
+    strne   r3, [r0], #4
+
+    /* the last 0-3 bytes are copied bytewise, r5 and r6 are popped in between */
+    tst     r2, #2
+    ldrbne  r3, [r1], #1
+    ldrbne  r4, [r1], #1
+    ldr     r5, [sp], #4
+    strbne  r3, [r0], #1
+    strbne  r4, [r0], #1
+
+    tst     r2, #1
+    ldrbne  r3, [r1], #1
+    ldr     r6, [sp], #4
+    strbne  r3, [r0], #1
+
+    /* pop the r0 and r4 values saved on the stack and return */
+    ldmfd   sp!, {r0, r4}
+
+    bx      lr
+.endm
+
+/*
+ * Memcpy function with Raspberry Pi specific aligned prefetch, based on
+ * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S
+ *
+ * In:  r0 = destination, r1 = source, r2 = number of bytes to copy
+ * Out: r0 = the destination pointer passed in (saved on the stack on entry
+ *      and restored before returning)
+ *
+ * r3 and ip are used as scratch, r1 and r2 are modified. r4-r11 are saved
+ * on the stack before being used and restored before returning.
+ *
+ * Blocks shorter than 20 bytes go to the bytewise loop at label 9. Otherwise
+ * the destination is aligned to 4 bytes first, and then the code branches
+ * on the low two bits of the source address: the word aligned case is
+ * handled inline, the other three by the UNALIGNED_MEMCPY macro.
+ *
+ * NOTE(review): the size checks use signed conditions (blt/ge/gt), so
+ * a size with bit 31 set is treated as negative - confirm that callers
+ * never pass such sizes.
+ */
+asm_function memcpy_armv5te
+    cmp     r2, #20
+    blt     9f
+    /* copy data until destination address is 4 bytes aligned */
+    /* (the register pushes are interleaved with the conditional copies) */
+    tst     r0, #1
+    ldrbne  r3, [r1], #1
+    stmfd   sp!, {r0, r4}             /* r0 is still the original destination here */
+    subne   r2, r2, #1
+    strbne  r3, [r0], #1
+    tst     r0, #2
+    ldrbne  r3, [r1], #1
+    ldrbne  r4, [r1], #1
+    stmfd   sp!, {r5, r6}
+    subne   r2, r2, #2
+    orrne   r3, r3, r4, asl #8        /* first byte in bits 0-7, second in bits 8-15 */
+    strhne  r3, [r0], #2
+    /* destination address is 4 bytes aligned */
+    /* now we should handle 4 cases of source address alignment */
+    tst     r1, #1
+    bne     6f
+    tst     r1, #2
+    bne     7f
+
+    /* both source and destination are 4 bytes aligned */
+    stmfd   sp!, {r7, r8, r9, r10, r11}
+    /* copy one word if the destination is not 8 bytes aligned */
+    tst     r0, #4
+    ldrne   r4, [r1], #4
+    subne   r2, r2, #4
+    strne   r4, [r0], #4
+    /*
+     * Copy two words if the destination is not 16 bytes aligned. Interleaved
+     * with that, r9 is set to the offset from r1 to a 32 bytes aligned
+     * address which is up to 96 bytes ahead (used for prefetch in the loop).
+     */
+    tst     r0, #8
+    ldmiane r1!, {r3-r4}
+    add     r9, r1, #96
+    subne   r2, r2, #8
+    bic     r9, r9, #31
+    stmiane r0!, {r3-r4}
+    sub     r9, r9, r1
+1:
+    /* main loop: 32 bytes per iteration, r2 goes negative on the last pass */
+    subs    r2, r2, #32
+    ldmiage r1!, {r3-r6, r7, r8, r10, r11}
+    pld     [r1, r9]
+    stmiage r0!, {r3-r6}
+    stmiage r0!, {r7, r8, r10, r11}
+    bgt     1b
+2:
+    ldmfd   sp!, {r7, r8, r9, r10, r11}
+    /* copy remaining data, the low 5 bits of r2 hold the number of bytes left */
+    tst     r2, #16
+    ldmiane r1!, {r3-r6}
+    stmiane r0!, {r3-r6}
+    tst     r2, #8
+    ldmiane r1!, {r3-r4}
+    stmiane r0!, {r3-r4}
+    tst     r2, #4
+    ldrne   r3, [r1], #4
+    mov     ip, r0                    /* ip is the write pointer from here on, r0 gets reloaded below */
+    strne   r3, [ip], #4
+    tst     r2, #2
+    ldrhne  r3, [r1], #2
+    ldmfd   sp!, {r5, r6}
+    strhne  r3, [ip], #2
+    tst     r2, #1
+    ldrbne  r3, [r1], #1
+    ldmfd   sp!, {r0, r4}             /* r0 = original destination (return value) */
+    strbne  r3, [ip], #1
+
+    bx      lr
+
+    /*
+     * Source address is not word aligned. Bit 0 was tested before the branch
+     * to label 6 and bit 1 before the branch to label 7, so:
+     *   label 6 fall through: (r1 & 3) == 1
+     *   label 7:              (r1 & 3) == 2
+     *   label 8:              (r1 & 3) == 3
+     * Each macro expansion ends with "bx lr", so there is no fall through
+     * from one expansion into the next.
+     */
+6:
+    tst    r1, #2
+    bne    8f
+    UNALIGNED_MEMCPY 1
+7:
+    UNALIGNED_MEMCPY 2
+8:
+    UNALIGNED_MEMCPY 3
+9:
+    /* short block (less than 20 bytes): bytewise copy, 3 bytes per iteration */
+    stmfd  sp!, {r0, r4}
+1:  subs   r2, r2, #3
+    /*
+     * The byte loaded from the destination is never used, ip is overwritten
+     * by the third source byte below.
+     * NOTE(review): presumably this read is there to pull the destination
+     * cache line in before the byte stores - confirm.
+     */
+    ldrbge ip, [r0]
+    ldrbge r3, [r1], #1
+    ldrbge r4, [r1], #1
+    ldrbge ip, [r1], #1
+    strbge r3, [r0], #1
+    strbge r4, [r0], #1
+    strbge ip, [r0], #1
+    bge    1b
+    /*
+     * r2 is -3, -2 or -1 here. After adding 2 it is (bytes left - 1), so
+     * "ge" means at least one byte is left and "gt" means two bytes are left.
+     * The flags stay valid across the mov/ldr/strb instructions below.
+     */
+    adds   r2, r2, #2
+    ldrbge r3, [r1], #1
+    mov    ip, r0
+    ldr    r0, [sp], #4               /* r0 = original destination (return value) */
+    strbge r3, [ip], #1
+    ldrbgt r3, [r1], #1
+    ldr    r4, [sp], #4
+    strbgt r3, [ip], #1
+    bx     lr
+.endfunc
+
+/******************************************************************************/
+
 /*
 * aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem)
 *