Commit a5e698f1 authored by Siarhei Siamashka

Framebuffer readback assembly code for AArch64

On a PINE64 board (ARM Cortex-A53), this provides ~180 MB/s
framebuffer readback speed. For comparison, a normal memcpy
between cached buffers runs at around 1200 MB/s.

Such readback speed is not particularly fast and is only
borderline usable. At a 1920x1080 32bpp screen resolution it
results in something like ~20 FPS scrolling.
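
As a rough sanity check of that figure (assuming that scrolling
has to read back every pixel of the screen once per frame):

     1920 x 1080 x 4 bytes  = ~8.3 MB per full frame
     180 MB/s / 8.3 MB      = ~21 full-frame readbacks per second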

Benchmark vs. shadow framebuffer (1920x1080 32bpp):

  == Shadow framebuffer in xf86-video-fbdev ==

     $ wget http://mirror.its.dal.ca/gutenberg/3/2/0/3/32032/32032.txt
     $ time DISPLAY=:0 xterm +j -maximized -e cat 32032.txt

     real 0m43.909s
     user 0m0.820s
     sys  0m0.300s

     $ DISPLAY=:0 x11perf -scroll500 -copywinwin500 -copypixwin500 -copywinpix500

     15000 trep @   1.8460 msec (   542.0/sec): Scroll 500x500 pixels
     12000 trep @   2.2629 msec (   442.0/sec): Copy 500x500 from window to window
     12000 trep @   2.2096 msec (   453.0/sec): Copy 500x500 from pixmap to window
     14000 trep @   1.9740 msec (   507.0/sec): Copy 500x500 from window to pixmap

  == Direct framebuffer readback in xf86-video-fbturbo ==

     $ wget http://mirror.its.dal.ca/gutenberg/3/2/0/3/32032/32032.txt
     $ time DISPLAY=:0 xterm +j -maximized -e cat 32032.txt

     real 2m5.741s
     user 0m0.390s
     sys  0m0.190s

     $ DISPLAY=:0 x11perf -scroll500 -copywinwin500 -copypixwin500 -copywinpix500

      4500 trep @   5.9201 msec (   169.0/sec): Scroll 500x500 pixels
      6000 trep @   5.9211 msec (   169.0/sec): Copy 500x500 from window to window
     18000 trep @   1.5341 msec (   652.0/sec): Copy 500x500 from pixmap to window
      4000 trep @   6.4657 msec (   155.0/sec): Copy 500x500 from window to pixmap

  ==

Direct framebuffer access without the shadow framebuffer layer
makes scrolling and moving windows slower, but copying from
pixmaps to windows becomes faster. In the real world, copying
from offscreen pixmaps to windows is much more important,
because it is one of the performance bottlenecks for almost
every X11 application, while reading back from the framebuffer
is only needed for a few specialized tasks (scrolling, moving
windows and taking screenshots).

On 32-bit ARM systems, uncached framebuffer readback used to
perform better. Even a Cortex-A53 running in 32-bit mode can
do framebuffer readback at more than 300 MB/s:
    https://github.com/ssvb/tinymembench/wiki/PINE64-(Allwinner-A64)

Scrolling and moving windows can still be accelerated by the
kernel (via DMA, a dedicated 2D accelerator or some other
method) and hooked into xf86-video-fbturbo.
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
parent f9a6ed78
@@ -32,6 +32,7 @@ fbturbo_drv_la_SOURCES = \
 	compat-api.h \
 	uthash.h \
 	arm_asm.S \
+	aarch64_asm.S \
 	cpuinfo.c \
 	cpuinfo.h \
 	cpu_backend.c \
...
/*
* Copyright © 2016 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#ifdef __aarch64__
.cpu cortex-a53+fp+simd
.text
.align 2
/******************************************************************************/
.macro asm_function function_name
    .global \function_name
#ifdef __ELF__
    .hidden \function_name
    .type \function_name, %function
#endif
    .func \function_name
\function_name:
.endm
/******************************************************************************/
asm_function aligned_fetch_fbmem_to_scratch_neon
    SIZE        .req x0     /* number of bytes to fetch              */
    DST         .req x1     /* cached scratch buffer (destination)   */
    SRC         .req x2     /* uncached framebuffer (source)         */

    /* main loop: copy 128 bytes per iteration using q-register pairs */
    subs        SIZE, SIZE, #128
    blt         1f
0:
    ldp         q0, q1, [SRC, #(0 * 32)]
    ldp         q2, q3, [SRC, #(1 * 32)]
    stp         q0, q1, [DST, #(0 * 32)]
    stp         q2, q3, [DST, #(1 * 32)]
    ldp         q0, q1, [SRC, #(2 * 32)]
    ldp         q2, q3, [SRC, #(3 * 32)]
    add         SRC, SRC, #128
    stp         q0, q1, [DST, #(2 * 32)]
    stp         q2, q3, [DST, #(3 * 32)]
    add         DST, DST, #128
    subs        SIZE, SIZE, #128
    bge         0b
1:
    /* copy a remaining 64 byte block, if any */
    tst         SIZE, #64
    beq         1f
    ldp         q0, q1, [SRC, #(0 * 32)]
    ldp         q2, q3, [SRC, #(1 * 32)]
    add         SRC, SRC, #64
    stp         q0, q1, [DST, #(0 * 32)]
    stp         q2, q3, [DST, #(1 * 32)]
    add         DST, DST, #64
1:
    /* copy a remaining 32 byte block, if any */
    tst         SIZE, #32
    beq         1f
    ldp         q0, q1, [SRC, #(0 * 32)]
    add         SRC, SRC, #32
    stp         q0, q1, [DST, #(0 * 32)]
    add         DST, DST, #32
1:
    /* tail (1-31 remaining bytes): copy one full 32 byte block; the
       scratch buffer is expected to tolerate this small over-copy */
    tst         SIZE, #31
    beq         1f
    ldp         q0, q1, [SRC]
    stp         q0, q1, [DST]
1:
    ret
    .unreq      SIZE
    .unreq      DST
    .unreq      SRC
.endfunc
#endif
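
For readers who are not fluent in AArch64 assembly, here is a rough
C model of the fetch routine above (illustration only, not part of
the commit; plain memcpy chunks stand in for the q-register ldp/stp
pairs, and the name aligned_fetch_model is made up for this sketch):

/*
 * Illustrative C model of aligned_fetch_fbmem_to_scratch_neon.
 * Not part of the commit; memcpy stands in for the NEON loads/stores.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
aligned_fetch_model(size_t size, void *dst, const void *src)
{
    uint8_t       *d = dst;
    const uint8_t *s = src;

    while (size >= 128) {            /* unrolled 128-byte main loop      */
        memcpy(d, s, 128);
        d += 128; s += 128; size -= 128;
    }
    if (size & 64) {                 /* optional 64-byte block           */
        memcpy(d, s, 64);
        d += 64; s += 64;
    }
    if (size & 32) {                 /* optional 32-byte block           */
        memcpy(d, s, 32);
        d += 32; s += 32;
    }
    if (size & 31)                   /* 1-31 byte tail: over-copy a full */
        memcpy(d, s, 32);            /* 32-byte chunk into the scratch   */
}

Reading in large aligned bursts matters here because the source is an
uncached framebuffer mapping, where per-access overhead dominates and
ordinary byte- or word-sized reads would be far slower.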
@@ -27,7 +27,7 @@
 #include "cpuinfo.h"
 #include "cpu_backend.h"

-#ifdef __arm__
+#if defined(__arm__) || defined(__aarch64__)

 #ifdef __GNUC__
 #define always_inline inline __attribute__((always_inline))
@@ -47,6 +47,16 @@ writeback_scratch_to_mem_arm(int size, void *dst, const void *src)
     memcpy_armv5te(dst, src, size);
 }

+#ifdef __aarch64__
+static always_inline void
+writeback_scratch_to_mem_memcpy(int size, void *dst, const void *src)
+{
+    memcpy(dst, src, size);
+}
+
+#define writeback_scratch_to_mem_neon writeback_scratch_to_mem_memcpy
+#endif
+
 #define SCRATCHSIZE 2048

 /*
@@ -114,6 +124,8 @@ twopass_memmove_neon(void *dst, const void *src, size_t size)
                     writeback_scratch_to_mem_neon);
 }

+#ifdef __arm__
+
 static void
 twopass_memmove_vfp(void *dst, const void *src, size_t size)
 {
@@ -130,6 +142,8 @@ twopass_memmove_arm(void *dst, const void *src, size_t size)
                     writeback_scratch_to_mem_arm);
 }

+#endif
+
 static void
 twopass_blt_8bpp(int width,
                  int height,
@@ -226,6 +240,8 @@ overlapped_blt_neon(void *self,
                         twopass_memmove_neon);
 }

+#ifdef __arm__
+
 static int
 overlapped_blt_vfp(void *self,
                    uint32_t *src_bits,
@@ -270,6 +286,8 @@ overlapped_blt_arm(void *self,
 #endif

+#endif
+
 /* An empty, always failing implementation */
 static int
 overlapped_blt_noop(void *self,
@@ -322,6 +340,10 @@ cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer,
     }
 #endif

+#ifdef __aarch64__
+    ctx->blt2d.overlapped_blt = overlapped_blt_neon;
+#endif
+
     return ctx;
 }
...
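
To make the control flow concrete, here is a simplified sketch of the
two-pass copy scheme that the changes above plug into. It is not the
actual twopass_memmove code from cpu_backend.c (which also handles
overlapping regions and takes the fetch/writeback helpers as
parameters); the scratch buffer, the function name twopass_copy_sketch
and the assumed prototype of the assembly routine are stand-ins:

/*
 * Simplified sketch of the two-pass copy scheme: stream data from the
 * uncached framebuffer into a small cached scratch buffer with the NEON
 * fetch routine, then write it out with an ordinary cached memcpy.
 */
#include <stdint.h>
#include <string.h>

#define SCRATCHSIZE 2048   /* same constant as in cpu_backend.c */

/* implemented in aarch64_asm.S; may copy up to 31 extra tail bytes */
void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);

static void
twopass_copy_sketch(void *dst, const void *fbmem_src, size_t size)
{
    /* 32-byte aligned scratch, padded so the tail over-copy stays inside */
    static uint8_t scratch[SCRATCHSIZE + 32] __attribute__((aligned(32)));
    uint8_t       *d = dst;
    const uint8_t *s = fbmem_src;

    while (size > 0) {
        size_t chunk = size < SCRATCHSIZE ? size : SCRATCHSIZE;
        /* pass 1: uncached framebuffer -> cached scratch (NEON reads) */
        aligned_fetch_fbmem_to_scratch_neon((int)chunk, scratch, s);
        /* pass 2: cached scratch -> destination (plain memcpy on AArch64) */
        memcpy(d, scratch, chunk);
        s += chunk;
        d += chunk;
        size -= chunk;
    }
}

Splitting the copy like this keeps the expensive uncached reads inside
a routine tuned for them, while the cached half can rely on the normal
memcpy that the commit message benchmarks at roughly 1200 MB/s.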
@@ -93,8 +93,10 @@ static int parse_proc_cpuinfo(cpuinfo_t *cpuinfo)
         }
         if ((val = cpuinfo_match_prefix(buffer, "Features"))) {
             cpuinfo->has_arm_edsp = find_feature(val, "edsp");
-            cpuinfo->has_arm_vfp = find_feature(val, "vfp");
-            cpuinfo->has_arm_neon = find_feature(val, "neon");
+            cpuinfo->has_arm_vfp = find_feature(val, "vfp") ||
+                                   find_feature(val, "fp");
+            cpuinfo->has_arm_neon = find_feature(val, "neon") ||
+                                    find_feature(val, "asimd");
             cpuinfo->has_arm_wmmx = find_feature(val, "iwmmxt");
         }
         else if ((val = cpuinfo_match_prefix(buffer, "CPU implementer"))) {
@@ -105,7 +107,9 @@ static int parse_proc_cpuinfo(cpuinfo_t *cpuinfo)
             }
         }
         else if ((val = cpuinfo_match_prefix(buffer, "CPU architecture"))) {
-            if (sscanf(val, "%i", &cpuinfo->arm_architecture) != 1) {
+            if (strncmp(val, "AArch64", 7) == 0) {
+                cpuinfo->arm_architecture = 8;
+            } else if (sscanf(val, "%i", &cpuinfo->arm_architecture) != 1) {
                 fclose(fd);
                 free(buffer);
                 return 0;
@@ -158,7 +162,9 @@ cpuinfo_t *cpuinfo_init()
         return cpuinfo;
     }

-    if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xC0F) {
+    if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xD03) {
+        cpuinfo->processor_name = strdup("ARM Cortex-A53");
+    } else if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xC0F) {
         cpuinfo->processor_name = strdup("ARM Cortex-A15");
     } else if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xC09) {
         if (cpuinfo->has_arm_neon)
...
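
For reference, the fields that the updated parser looks at typically
appear in /proc/cpuinfo on an AArch64 Cortex-A53 system roughly like
this (illustrative excerpt only; the exact feature flags and the
wording of the architecture field vary between kernel versions):

     Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32
     CPU implementer : 0x41
     CPU architecture: AArch64
     CPU part        : 0xd03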
@@ -884,7 +884,7 @@ FBDevScreenInit(SCREEN_INIT_ARGS_DECL)
      */
     useBackingStore = xf86ReturnOptValBool(fPtr->Options, OPTION_USE_BS,
                                            !fPtr->shadowFB);
-#ifndef __arm__
+#if !(defined(__arm__) || defined(__aarch64__))
     /*
      * right now we can only make "smart" decisions on ARM hardware,
      * everything else (for example x86) would take a performance hit
...