Framebuffer readback assembly code for AArch64

On a PINE64 board (ARM Cortex-A53), this provides ~180 MB/s speed for the framebuffer readback. For comparison, the normal memcpy operation in cached buffers runs at around ~1200 MB/s. Such read back speed is actually not very fast and is borderline usable. With a 1920x1080 32bpp screen resolution, this results in something like ~20 FPS scrolling. Benchmark vs. shadow framebuffer (1920x1080 32bpp): == Shadow framebuffer in xf86-video-fbdev == $ wget http://mirror.its.dal.ca/gutenberg/3/2/0/3/32032/32032.txt $ time DISPLAY=:0 xterm +j -maximized -e cat 32032.txt real 0m43.909s user 0m0.820s sys 0m0.300s $ DISPLAY=:0 x11perf -scroll500 -copywinwin500 -copypixwin500 -copywinpix500 15000 trep @ 1.8460 msec ( 542.0/sec): Scroll 500x500 pixels 12000 trep @ 2.2629 msec ( 442.0/sec): Copy 500x500 from window to window 12000 trep @ 2.2096 msec ( 453.0/sec): Copy 500x500 from pixmap to window 14000 trep @ 1.9740 msec ( 507.0/sec): Copy 500x500 from window to pixmap == Direct framebuffer readback in xf86-video-fbturbo == $ wget http://mirror.its.dal.ca/gutenberg/3/2/0/3/32032/32032.txt $ time DISPLAY=:0 xterm +j -maximized -e cat 32032.txt real 2m5.741s user 0m0.390s sys 0m0.190s $ DISPLAY=:0 x11perf -scroll500 -copywinwin500 -copypixwin500 -copywinpix500 4500 trep @ 5.9201 msec ( 169.0/sec): Scroll 500x500 pixels 6000 trep @ 5.9211 msec ( 169.0/sec): Copy 500x500 from window to window 18000 trep @ 1.5341 msec ( 652.0/sec): Copy 500x500 from pixmap to window 4000 trep @ 6.4657 msec ( 155.0/sec): Copy 500x500 from window to pixmap == The direct framebuffer access without the shadow framebuffer layer makes scrolling and moving windows slower. But copying from pixmaps to windows becomes faster. In the real world, copying from offscreen pixmaps to windows is much more important, because it is one of the performance bottlenecks for almost every X11 application. 
Reading back from the framebuffer, on the other hand, is only used for a few very specialized tasks (scrolling/moving windows and making screenshots). On 32-bit ARM systems, the uncached framebuffer readback used to perform better. Even the Cortex-A53 running in 32-bit mode can do framebuffer readback at more than 300 MB/s: https://github.com/ssvb/tinymembench/wiki/PINE64-(Allwinner-A64) Scrolling/moving windows can still be accelerated by the kernel (via DMA, a dedicated 2D accelerator or some other method) and hooked into xf86-video-fbturbo. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>

Framebuffer readback assembly code for AArch64
On a PINE64 board (ARM Cortex-A53), this provides ~180 MB/s speed for the framebuffer readback. For comparison, the normal memcpy operation in cached buffers runs at around ~1200 MB/s. Such read back speed is actually not very fast and is borderline usable. With a 1920x1080 32bpp screen resolution, this results in something like ~20 FPS scrolling. Benchmark vs. shadow framebuffer (1920x1080 32bpp): == Shadow framebuffer in xf86-video-fbdev == $ wget http://mirror.its.dal.ca/gutenberg/3/2/0/3/32032/32032.txt $ time DISPLAY=:0 xterm +j -maximized -e cat 32032.txt real 0m43.909s user 0m0.820s sys 0m0.300s $ DISPLAY=:0 x11perf -scroll500 -copywinwin500 -copypixwin500 -copywinpix500 15000 trep @ 1.8460 msec ( 542.0/sec): Scroll 500x500 pixels 12000 trep @ 2.2629 msec ( 442.0/sec): Copy 500x500 from window to window 12000 trep @ 2.2096 msec ( 453.0/sec): Copy 500x500 from pixmap to window 14000 trep @ 1.9740 msec ( 507.0/sec): Copy 500x500 from window to pixmap == Direct framebuffer readback in xf86-video-fbturbo == $ wget http://mirror.its.dal.ca/gutenberg/3/2/0/3/32032/32032.txt $ time DISPLAY=:0 xterm +j -maximized -e cat 32032.txt real 2m5.741s user 0m0.390s sys 0m0.190s $ DISPLAY=:0 x11perf -scroll500 -copywinwin500 -copypixwin500 -copywinpix500 4500 trep @ 5.9201 msec ( 169.0/sec): Scroll 500x500 pixels 6000 trep @ 5.9211 msec ( 169.0/sec): Copy 500x500 from window to window 18000 trep @ 1.5341 msec ( 652.0/sec): Copy 500x500 from pixmap to window 4000 trep @ 6.4657 msec ( 155.0/sec): Copy 500x500 from window to pixmap == The direct framebuffer access without the shadow framebuffer layer makes scrolling and moving windows slower. But copying from pixmaps to windows becomes faster. In the real world, copying from offscreen pixmaps to windows is much more important, because it is one of the performance bottlenecks for almost every X11 application. 
Reading back from the framebuffer, on the other hand, is only used for a few very specialized tasks (scrolling/moving windows and making screenshots). On 32-bit ARM systems, the uncached framebuffer readback used to perform better. Even the Cortex-A53 running in 32-bit mode can do framebuffer readback at more than 300 MB/s: https://github.com/ssvb/tinymembench/wiki/PINE64-(Allwinner-A64) Scrolling/moving windows can still be accelerated by the kernel (via DMA, a dedicated 2D accelerator or some other method) and hooked into xf86-video-fbturbo. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
a5e698f1 · Siarhei Siamashka · f9a6ed78 · a5e698f1 · a5e698f1 · a5e698f1
Commit a5e698f1 authored Apr 04, 2016 by Siarhei Siamashka
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -32,6 +32,7 @@ fbturbo_drv_la_SOURCES = \
         compat-api.h \
         uthash.h \
         arm_asm.S \
+         aarch64_asm.S \
         cpuinfo.c \
         cpuinfo.h \
         cpu_backend.c \

--- a/src/aarch64_asm.S
+++ b/src/aarch64_asm.S
+/*
+ * Copyright © 2016 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#ifdef __aarch64__
+
+    /* Let the assembler accept the Cortex-A53 instruction set with FP and SIMD enabled */
+    .cpu cortex-a53+fp+simd
+    /* Everything below is emitted into the code section */
+    .text
+    /* Power-of-two form on this target: 2 low zero bits, i.e. 4-byte alignment */
+    .align 2
+
+/******************************************************************************/
+
+/*
+ * asm_function <function_name>
+ *
+ * Opens the definition of a function: declares the symbol global, and on
+ * ELF targets additionally gives it hidden visibility and the %function
+ * symbol type. It then starts a .func scope and places the entry label.
+ *
+ * The macro does not close the .func scope; the code using it has to end
+ * the function body with a matching .endfunc.
+ */
+.macro asm_function function_name
+    .global \function_name
+#ifdef __ELF__
+    .hidden \function_name
+    .type \function_name, %function
+#endif
+.func \function_name
+\function_name:
+.endm
+
+/******************************************************************************/
+
+/*
+ * aligned_fetch_fbmem_to_scratch_neon(SIZE, DST, SRC)
+ *
+ * Copies SIZE bytes from SRC to DST through the q0-q3 SIMD registers, using
+ * paired 128-bit loads and stores. Judging by the name, SRC is framebuffer
+ * memory and DST is a scratch buffer; the C prototype is not part of this
+ * file.
+ *
+ *   w0 (SIZE) - number of bytes to copy
+ *   x1 (DST)  - destination pointer
+ *   x2 (SRC)  - source pointer
+ *
+ * SIZE is deliberately read from w0 rather than x0. The companion write-back
+ * helper on the C side takes its byte count as 'int', and for a 32-bit
+ * argument AAPCS64 leaves bits 63:32 of the register unspecified, so using
+ * x0 in the signed comparisons below could act on stale upper bits.
+ * NOTE(review): confirm that the C declaration of this function really uses
+ * a 32-bit size.
+ *
+ * The copy is done in chunks of 128, 64 and 32 bytes. A remaining tail of
+ * 1..31 bytes is still transferred as one whole 32-byte pair, which means up
+ * to 31 bytes past SIZE are read from SRC and written to DST. Callers are
+ * presumably expected to provide that slack in both buffers - verify against
+ * the caller.
+ *
+ * Clobbers: x0, x1, x2, q0-q3 and the condition flags. Nothing is returned.
+ */
+asm_function aligned_fetch_fbmem_to_scratch_neon
+    SIZE        .req w0
+    DST         .req x1
+    SRC         .req x2
+
+    /* Bias the counter by -128: it stays >= 0 while a full chunk is left */
+    subs        SIZE, SIZE, #128
+    blt         1f
+0:
+    /* Main loop: 128 bytes per iteration, loads issued ahead of the stores */
+    ldp         q0,  q1, [SRC, #(0 * 32)]
+    ldp         q2,  q3, [SRC, #(1 * 32)]
+    stp         q0,  q1, [DST, #(0 * 32)]
+    stp         q2,  q3, [DST, #(1 * 32)]
+    ldp         q0,  q1, [SRC, #(2 * 32)]
+    ldp         q2,  q3, [SRC, #(3 * 32)]
+    add         SRC, SRC, #128
+    stp         q0,  q1, [DST, #(2 * 32)]
+    stp         q2,  q3, [DST, #(3 * 32)]
+    add         DST, DST, #128
+    subs        SIZE, SIZE, #128
+    bge         0b
+1:
+    /*
+     * SIZE is now (remaining - 128). Subtracting 128 does not touch the low
+     * seven bits, so they can be tested directly to size the tail.
+     */
+    tst         SIZE, #64
+    beq         1f
+    ldp         q0,  q1, [SRC, #(0 * 32)]
+    ldp         q2,  q3, [SRC, #(1 * 32)]
+    add         SRC, SRC, #64
+    stp         q0,  q1, [DST, #(0 * 32)]
+    stp         q2,  q3, [DST, #(1 * 32)]
+    add         DST, DST, #64
+1:
+    tst         SIZE, #32
+    beq         1f
+    ldp         q0,  q1, [SRC, #(0 * 32)]
+    add         SRC, SRC, #32
+    stp         q0,  q1, [DST, #(0 * 32)]
+    add         DST, DST, #32
+1:
+    /* Any 1..31 leftover bytes are rounded up to one more 32-byte pair */
+    tst         SIZE, #31
+    beq         1f
+    ldp         q0, q1, [SRC]
+    stp         q0, q1, [DST]
+1:
+    ret
+
+    .unreq      SIZE
+    .unreq      DST
+    .unreq      SRC
+.endfunc
+
+#endif
--- a/src/cpu_backend.c
+++ b/src/cpu_backend.c
@@ -27,7 +27,7 @@
 #include "cpuinfo.h"
 #include "cpu_backend.h"

-#ifdef __arm__
+#if defined(__arm__) || defined(__aarch64__)

 #ifdef __GNUC__
 #define always_inline inline __attribute__((always_inline))
@@ -47,6 +47,16 @@ writeback_scratch_to_mem_arm(int size, void *dst, const void *src)
    memcpy_armv5te(dst, src, size);
 }

+#ifdef __aarch64__
+/*
+ * Write-back step implemented with the C library memcpy().
+ *
+ * Takes its arguments in the same (size, dst, src) order as
+ * writeback_scratch_to_mem_arm() and copies 'size' bytes from 'src'
+ * to 'dst'.
+ */
+static always_inline void
+writeback_scratch_to_mem_memcpy(int size, void *dst, const void *src)
+{
+    /* Same implicit int -> size_t conversion memcpy() would apply itself */
+    const size_t nbytes = size;
+
+    memcpy(dst, src, nbytes);
+}
+
+#define writeback_scratch_to_mem_neon writeback_scratch_to_mem_memcpy
+#endif
+
 #define SCRATCHSIZE 2048

 /*
@@ -114,6 +124,8 @@ twopass_memmove_neon(void *dst, const void *src, size_t size)
                    writeback_scratch_to_mem_neon);
 }

+#ifdef __arm__
+
 static void
 twopass_memmove_vfp(void *dst, const void *src, size_t size)
 {
@@ -130,6 +142,8 @@ twopass_memmove_arm(void *dst, const void *src, size_t size)
                    writeback_scratch_to_mem_arm);
 }

+#endif
+
 static void
 twopass_blt_8bpp(int        width,
                 int        height,
@@ -226,6 +240,8 @@ overlapped_blt_neon(void     *self,
                          twopass_memmove_neon);
 }

+#ifdef __arm__
+
 static int
 overlapped_blt_vfp(void     *self,
                   uint32_t *src_bits,
@@ -270,6 +286,8 @@ overlapped_blt_arm(void     *self,

 #endif

+#endif
+
 /* An empty, always failing implementation */
 static int
 overlapped_blt_noop(void     *self,
@@ -322,6 +340,10 @@ cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer,
    }
 #endif

+#ifdef __aarch64__
+    ctx->blt2d.overlapped_blt = overlapped_blt_neon;
+#endif
+
    return ctx;
 }


--- a/src/cpuinfo.c
+++ b/src/cpuinfo.c
@@ -93,8 +93,10 @@ static int parse_proc_cpuinfo(cpuinfo_t *cpuinfo)
        }
        if ((val = cpuinfo_match_prefix(buffer, "Features"))) {
            cpuinfo->has_arm_edsp = find_feature(val, "edsp");
-            cpuinfo->has_arm_vfp  = find_feature(val, "vfp");
-            cpuinfo->has_arm_neon = find_feature(val, "neon");
+            cpuinfo->has_arm_vfp  = find_feature(val, "vfp") ||
+                                    find_feature(val, "fp");
+            cpuinfo->has_arm_neon = find_feature(val, "neon") ||
+                                    find_feature(val, "asimd");
            cpuinfo->has_arm_wmmx = find_feature(val, "iwmmxt");
        }
        else if ((val = cpuinfo_match_prefix(buffer, "CPU implementer"))) {
@@ -105,7 +107,9 @@ static int parse_proc_cpuinfo(cpuinfo_t *cpuinfo)
            }
        }
        else if ((val = cpuinfo_match_prefix(buffer, "CPU architecture"))) {
-            if (sscanf(val, "%i", &cpuinfo->arm_architecture) != 1) {
+            if (strncmp(val, "AArch64", 7) == 0) {
+                cpuinfo->arm_architecture = 8;
+            } else if (sscanf(val, "%i", &cpuinfo->arm_architecture) != 1) {
                fclose(fd);
                free(buffer);
                return 0;
@@ -158,7 +162,9 @@ cpuinfo_t *cpuinfo_init()
        return cpuinfo;
    }

-    if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xC0F) {
+    if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xD03) {
+        cpuinfo->processor_name = strdup("ARM Cortex-A53");
+    } else if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xC0F) {
        cpuinfo->processor_name = strdup("ARM Cortex-A15");
    } else if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xC09) {
        if (cpuinfo->has_arm_neon)

--- a/src/fbdev.c
+++ b/src/fbdev.c
@@ -884,7 +884,7 @@ FBDevScreenInit(SCREEN_INIT_ARGS_DECL)
 	 */
 	useBackingStore = xf86ReturnOptValBool(fPtr->Options, OPTION_USE_BS,
 	                                       !fPtr->shadowFB);
-#ifndef __arm__
+#if !(defined(__arm__) || defined(__aarch64__))
 	/*
 	 * right now we can only make "smart" decisions on ARM hardware,
 	 * everything else (for example x86) would take a performance hit