Use ARM LDM instead of VFP for uncached reads on Marvell PJ4

The Marvell PJ4 core used in the CuBox handles uncached VFP reads from the framebuffer very poorly. Using WMMX or ARM LDM reads is much faster, with LDM instructions having a minor advantage. This improves framebuffer read performance from ~50MB/s to ~100MB/s. WMMX runtime detection and PJ4 core identification are also added as part of this fix. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>

Use ARM LDM instead of VFP for uncached reads on Marvell PJ4
The Marvell PJ4 core used in the CuBox handles uncached VFP reads from the framebuffer very poorly. Using WMMX or ARM LDM reads is much faster, with LDM instructions having a minor advantage. This improves framebuffer read performance from ~50MB/s to ~100MB/s. WMMX runtime detection and PJ4 core identification are also added as part of this fix. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
e9f978f3 · Siarhei Siamashka · 102957f9 · e9f978f3 · e9f978f3 · e9f978f3
Commit e9f978f3 authored Oct 17, 2013 by Siarhei Siamashka
--- a/src/arm_asm.S
+++ b/src/arm_asm.S
@@ -505,4 +505,48 @@ asm_function aligned_fetch_fbmem_to_scratch_vfp
    .unreq      SRC
 .endfunc
+/*
+ * void aligned_fetch_fbmem_to_scratch_arm(int size, void *dst, const void *src)
+ *
+ * Copies data from SRC to DST in 32-byte chunks using only ARM LDM/STM
+ * instructions (eight core registers, r4-r11, per transfer).
+ *
+ * The byte count is effectively rounded up to a multiple of 32: a residue
+ * of 1..31 bytes is handled by transferring one more full 32-byte chunk,
+ * so up to 31 bytes past 'size' may be read from SRC and written to DST.
+ * NOTE(review): presumably both buffers are word aligned and padded so
+ * that this overshoot is harmless - confirm against the callers.
+ */
+asm_function aligned_fetch_fbmem_to_scratch_arm
+    SIZE        .req r0
+    DST         .req r1
+    SRC         .req r2
+    push        {r4-r11, lr}            /* save the copy registers and the return address */
+    subs        SIZE, #128              /* bias the counter by one main loop step */
+    blt         1f                      /* fewer than 128 bytes: skip the main loop */
+0:                                      /* main loop: 4 x 32 = 128 bytes per iteration */
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    subs        SIZE, SIZE, #128
+    bge         0b                      /* at least one more full 128-byte step left */
+1:                                      /* for size >= 0, SIZE is now -128..-1 and its low 7 bits equal the bytes left */
+    tst         SIZE, #64
+    beq         1f
+    ldmia       SRC!, {r4-r11}          /* 64-byte step: 2 x 32 bytes */
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+1:
+    tst         SIZE, #32
+    beq         1f
+    ldmia       SRC!, {r4-r11}          /* 32-byte step */
+    stmia       DST!, {r4-r11}
+1:
+    tst         SIZE, #31               /* any residue of 1..31 bytes? */
+    beq         1f
+    ldmia       SRC!, {r4-r11}          /* yes: transfer one more full 32-byte chunk */
+    stmia       DST!, {r4-r11}
+1:
+    pop         {r4-r11, pc}            /* restore registers and return (saved lr goes to pc) */
+    .unreq      SIZE
+    .unreq      DST
+    .unreq      SRC
+.endfunc
 #endif
--- a/src/cpu_backend.c
+++ b/src/cpu_backend.c
@@ -39,12 +39,7 @@ void memcpy_armv5te(void *dst, const void *src, int size);
 void writeback_scratch_to_mem_neon(int size, void *dst, const void *src);
 void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);
 void aligned_fetch_fbmem_to_scratch_vfp(int size, void *dst, const void *src);
+/* Implemented in arm_asm.S using only ARM LDM/STM instructions */
+void aligned_fetch_fbmem_to_scratch_arm(int size, void *dst, const void *src);
-static always_inline void
-aligned_fetch_fbmem_to_scratch_arm(int size, void *dst, const void *src)
-{
-    memcpy_armv5te(dst, src, size);
-}
 static always_inline void
 writeback_scratch_to_mem_arm(int size, void *dst, const void *src)
@@ -127,6 +122,14 @@ twopass_memmove_vfp(void *dst, const void *src, size_t size)
                    writeback_scratch_to_mem_arm);
 }
+/*
+ * Two-pass memmove variant that uses plain ARM instructions for both
+ * passes: 'aligned_fetch_fbmem_to_scratch_arm' for fetching data and
+ * 'writeback_scratch_to_mem_arm' for writing it back.
+ */
+static void
+twopass_memmove_arm(void *dst, const void *src, size_t size)
+{
+    twopass_memmove(
+        dst,
+        src,
+        size,
+        aligned_fetch_fbmem_to_scratch_arm,
+        writeback_scratch_to_mem_arm);
+}
 static void
 twopass_blt_8bpp(int        width,
                 int        height,
@@ -244,6 +247,27 @@ overlapped_blt_vfp(void     *self,
                          twopass_memmove_vfp);
 }
+/*
+ * Overlapped blit entry point which forwards all of its arguments to
+ * 'overlapped_blt' and selects 'twopass_memmove_arm' as the memmove
+ * implementation. Returns whatever 'overlapped_blt' returns.
+ */
+static int
+overlapped_blt_arm(void *self,
+                   uint32_t *src_bits, uint32_t *dst_bits,
+                   int src_stride, int dst_stride,
+                   int src_bpp, int dst_bpp,
+                   int src_x, int src_y,
+                   int dst_x, int dst_y,
+                   int width, int height)
+{
+    return overlapped_blt(self,
+                          src_bits, dst_bits,
+                          src_stride, dst_stride,
+                          src_bpp, dst_bpp,
+                          src_x, src_y,
+                          dst_x, dst_y,
+                          width, height,
+                          twopass_memmove_arm);
+}
 #endif
 /* An empty, always failing implementation */
@@ -288,6 +312,11 @@ cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer,
        /* NEON works better on Cortex-A8 */
        ctx->blt2d.overlapped_blt = overlapped_blt_neon;
    }
+    else if (ctx->cpuinfo->has_arm_wmmx)
+    {
+        /*
+         * ARM LDM/STM works better than VFP/WMMX on Marvell PJ4. This
+         * branch has to stay chained with 'else' to the preceding check:
+         * a standalone 'if' would leave the following 'else if' VFP branch
+         * attached to this test only, and the VFP selection would then
+         * overwrite the NEON one on CPUs that have NEON but no WMMX.
+         */
+        ctx->blt2d.overlapped_blt = overlapped_blt_arm;
+    }
    else if (ctx->cpuinfo->has_arm_vfp && ctx->cpuinfo->has_arm_edsp) {
        /* VFP works better on Cortex-A9, Cortex-A15 and maybe everything else */
        ctx->blt2d.overlapped_blt = overlapped_blt_vfp;

--- a/src/cpuinfo.c
+++ b/src/cpuinfo.c
@@ -95,6 +95,7 @@ static int parse_proc_cpuinfo(cpuinfo_t *cpuinfo)
            cpuinfo->has_arm_edsp = find_feature(val, "edsp");
            cpuinfo->has_arm_vfp  = find_feature(val, "vfp");
            cpuinfo->has_arm_neon = find_feature(val, "neon");
+            cpuinfo->has_arm_wmmx = find_feature(val, "iwmmxt");
        }
        else if ((val = cpuinfo_match_prefix(buffer, "CPU implementer"))) {
            if (sscanf(val, "%i", &cpuinfo->arm_implementer) != 1) {
@@ -175,6 +176,8 @@ cpuinfo_t *cpuinfo_init()
        cpuinfo->processor_name = strdup("ARM Cortex-A5");
    } else if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xB76) {
        cpuinfo->processor_name = strdup("ARM1176");
+    } else if (cpuinfo->arm_implementer == 0x56 && cpuinfo->arm_part == 0x581) {
+        cpuinfo->processor_name = strdup("Marvell PJ4");
    } else {
        cpuinfo->processor_name = strdup("Unknown");
    }

--- a/src/cpuinfo.h
+++ b/src/cpuinfo.h
@@ -36,6 +36,7 @@ typedef struct {
    int has_arm_edsp;
    int has_arm_vfp;
    int has_arm_neon;
+    int has_arm_wmmx;
    /* The user-friendly CPU description string (usable for logs, etc.) */
    char *processor_name;
 } cpuinfo_t;