Commit 24d05b1d authored by Siarhei Siamashka
Browse files

CPU: Added ARM NEON optimized CopyWindow/CopyArea implementation



Should be useful for better performance when moving windows
and scrolling on the devices without a dedicated 2D hardware
accelerator (Allwinner A13).
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
parent 000398d1
......@@ -40,6 +40,9 @@ m4_ifndef([XORG_MACROS_VERSION],
XORG_MACROS_VERSION(1.8)
XORG_DEFAULT_OPTIONS
# Needed to compile assembly sources
AM_PROG_AS
# Initialize libtool
AC_DISABLE_STATIC
AC_PROG_LIBTOOL
......
......@@ -31,8 +31,11 @@ sunxifb_drv_ladir = @moduledir@/drivers
sunxifb_drv_la_SOURCES = \
compat-api.h \
uthash.h \
arm_asm.S \
cpuinfo.c \
cpuinfo.h \
cpu_backend.c \
cpu_backend.h \
interfaces.h \
fbdev.c \
fbdev_priv.h \
......
/*
* Copyright © 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#ifdef __arm__
/* All code below goes into the executable text section */
.text
/* Use the unified ARM/Thumb assembly syntax (UAL) */
.syntax unified
/* Allow NEON instructions and ARMv7-A features to be assembled ... */
.fpu neon
.arch armv7a
/*
 * ... but record ARMv4 as the architecture in the object file attributes.
 * Presumably this keeps the object linkable into a driver built for older
 * cores, with the NEON code only being selected after a runtime CPU check.
 */
.object_arch armv4
/* Emit ARM (not Thumb) instruction encodings */
.arm
/* Enable the GNU assembler alternate macro syntax */
.altmacro
/* Align to a 4 byte (2^2) boundary */
.p2align 2
/******************************************************************************/
/*
 * asm_function <function_name>
 *
 * Start the definition of an exported function: make the symbol global,
 * open a '.func' debug scope for it and emit the entry label. Every use
 * has to be paired with a closing '.endfunc' after the function body.
 */
.macro asm_function function_name
.global \function_name
.func \function_name
\function_name:
.endm
/******************************************************************************/
/*
 * writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src)
 *
 * Copy a chunk of data from a cached scratch buffer (so prefetch is not
 * really needed), to a memory buffer in forward direction. Generated from
 * pixman macro templates.
 *
 * Register usage:
 *   r0    - number of bytes still to copy (biased by -32 once the main
 *           loop has been entered, see the tail handling below)
 *   r1    - destination pointer, advanced by every store
 *   r2    - source pointer, advanced by every load
 *   ip    - shadow copy of the destination pointer, advanced while the
 *           head is being loaded so that r1 stays untouched until the
 *           head is stored
 *   d0-d3 - data staging registers
 *
 * Layout of the code:
 *   - numbytes < 32: short path at label 0, no alignment hints are used
 *   - otherwise: an optional head which brings the destination to a
 *     16 byte boundary, a 32 bytes per iteration main loop with aligned
 *     stores, and a tail of up to 31 bytes
 *
 * The same staging slots are used everywhere: d0[1] holds a single byte,
 * d0[2-3] two bytes, d0[4-7] four bytes, d1 eight bytes and d2-d3 sixteen
 * bytes. All loads of a group are issued before the matching stores.
 *
 * Modifies r0, r1, r2, ip, d0-d3 and the condition flags.
 */
asm_function writeback_scratch_to_mem_neon
mov ip, r1
/* fewer than 32 bytes in total: use the short path */
cmp r0, #32
blt 0f
/* destination is already 16 byte aligned: no head needed */
tst ip, #15
beq 1f
/*
 * Head, load phase: fetch 1, 2, 4 and 8 bytes depending on the low bits
 * of the (shadow) destination address, then unconditionally 16 more bytes.
 * This is at most 31 bytes, which is safe because numbytes >= 32 here.
 */
tst ip, #1
beq 2f
vld1.8 {d0[1]}, [r2]!
add ip, ip, #1
sub r0, r0, #1
2: tst ip, #2
beq 3f
vld1.8 {d0[2]}, [r2]!
vld1.8 {d0[3]}, [r2]!
add ip, ip, #2
sub r0, r0, #2
3: tst ip, #4
beq 4f
vld1.8 {d0[4]}, [r2]!
vld1.8 {d0[5]}, [r2]!
vld1.8 {d0[6]}, [r2]!
vld1.8 {d0[7]}, [r2]!
add ip, ip, #4
sub r0, r0, #4
4: tst ip, #8
beq 5f
vld1.8 {d1}, [r2]!
add ip, ip, #8
sub r0, r0, #8
/* the 16 byte part of the head is loaded unconditionally */
5: vld1.8 {d2-d3}, [r2]!
add ip, ip, #16
sub r0, r0, #16
/*
 * Head, store phase: repeat the same tests on the real destination
 * pointer, which advances with every store and therefore takes the same
 * decisions as the load phase did. The 8 and 16 byte stores already
 * carry alignment hints (:64, :128).
 */
tst r1, #1
beq 6f
vst1.8 {d0[1]}, [r1]!
6: tst r1, #2
beq 7f
vst1.8 {d0[2]}, [r1]!
vst1.8 {d0[3]}, [r1]!
7: tst r1, #4
beq 8f
vst1.8 {d0[4]}, [r1]!
vst1.8 {d0[5]}, [r1]!
vst1.8 {d0[6]}, [r1]!
vst1.8 {d0[7]}, [r1]!
8: tst r1, #8
beq 9f
vst1.8 {d1}, [r1, :64]!
9: vst1.8 {d2-d3}, [r1, :128]!
/*
 * Main loop: r1 is 16 byte aligned from here on. The byte counter is
 * biased by -32 so that a negative result means "less than one full
 * 32 byte block left". The loop is software pipelined: the next block
 * is loaded before the counter is checked and the previous one stored.
 */
1: subs r0, r0, #32
blt 10f
vld1.8 {d0-d3}, [r2]!
subs r0, r0, #32
blt 11f
12: vst1.8 {d0-d3}, [r1, :128]!
vld1.8 {d0-d3}, [r2]!
subs r0, r0, #32
bge 12b
/* store the last block which has been loaded by the pipeline */
11: vst1.8 {d0-d3}, [r1, :128]!
/*
 * Tail: r0 is (remaining - 32) here, so it is negative, but its low
 * 5 bits are equal to the number of remaining bytes (0-31). Load
 * 16, 8, 4, 2 and 1 bytes according to these bits.
 */
10: tst r0, #31
beq 13f
tst r0, #16
beq 14f
vld1.8 {d2-d3}, [r2]!
14: tst r0, #8
beq 15f
vld1.8 {d1}, [r2]!
15: tst r0, #4
beq 16f
vld1.8 {d0[4]}, [r2]!
vld1.8 {d0[5]}, [r2]!
vld1.8 {d0[6]}, [r2]!
vld1.8 {d0[7]}, [r2]!
16: tst r0, #2
beq 17f
vld1.8 {d0[2]}, [r2]!
vld1.8 {d0[3]}, [r2]!
17: tst r0, #1
beq 18f
vld1.8 {d0[1]}, [r2]!
/*
 * Tail, store phase: largest pieces first, so the 16 and 8 byte stores
 * can still use the alignment hints.
 */
18: tst r0, #16
beq 19f
vst1.8 {d2-d3}, [r1, :128]!
19: tst r0, #8
beq 20f
vst1.8 {d1}, [r1, :64]!
20: tst r0, #4
beq 21f
vst1.8 {d0[4]}, [r1]!
vst1.8 {d0[5]}, [r1]!
vst1.8 {d0[6]}, [r1]!
vst1.8 {d0[7]}, [r1]!
21: tst r0, #2
beq 22f
vst1.8 {d0[2]}, [r1]!
vst1.8 {d0[3]}, [r1]!
22: tst r0, #1
beq 13f
vst1.8 {d0[1]}, [r1]!
13: bx lr
/*
 * Short path (numbytes < 32): same load/store sequence as the tail, but
 * nothing is known about the destination alignment, so the stores do not
 * use any alignment hints.
 */
0: tst r0, #31
beq 23f
tst r0, #16
beq 24f
vld1.8 {d2-d3}, [r2]!
24: tst r0, #8
beq 25f
vld1.8 {d1}, [r2]!
25: tst r0, #4
beq 26f
vld1.8 {d0[4]}, [r2]!
vld1.8 {d0[5]}, [r2]!
vld1.8 {d0[6]}, [r2]!
vld1.8 {d0[7]}, [r2]!
26: tst r0, #2
beq 27f
vld1.8 {d0[2]}, [r2]!
vld1.8 {d0[3]}, [r2]!
27: tst r0, #1
beq 28f
vld1.8 {d0[1]}, [r2]!
28: tst r0, #16
beq 29f
vst1.8 {d2-d3}, [r1]!
29: tst r0, #8
beq 30f
vst1.8 {d1}, [r1]!
30: tst r0, #4
beq 31f
vst1.8 {d0[4]}, [r1]!
vst1.8 {d0[5]}, [r1]!
vst1.8 {d0[6]}, [r1]!
vst1.8 {d0[7]}, [r1]!
31: tst r0, #2
beq 32f
vst1.8 {d0[2]}, [r1]!
vst1.8 {d0[3]}, [r1]!
32: tst r0, #1
beq 23f
vst1.8 {d0[1]}, [r1]!
23: bx lr
.endfunc
/******************************************************************************/
/*
 * aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem)
 *
 * Both 'scratch' and 'fbmem' pointers must be 32 bytes aligned.
 * The value in 'numbytes' is also rounded up to a multiple of 32 bytes.
 * The only purpose of this code is to attempt minimizing penalty incurred
 * by doing uncached reads from memory (for example framebuffer). We are
 * trying to do the largest possible perfectly aligned reads to fetch
 * data into a temporary scratch buffer in L1 cache.
 *
 * Register usage:
 *   SIZE (r0) - number of bytes left to copy, biased by -128
 *   DST  (r1) - scratch buffer pointer, advanced by every store
 *   SRC  (r2) - source pointer, advanced by every load
 *   r3, ip    - dummy targets for the loads which touch the destination
 *   q0-q3, q8-q11 - data staging registers (q4-q7 are not used,
 *               presumably because d8-d15 are callee-saved in the ARM
 *               procedure call standard)
 */
asm_function aligned_fetch_fbmem_to_scratch_neon
SIZE .req r0
DST .req r1
SRC .req r2
/*
 * Bias the counter by -128: a non-negative value means that at least one
 * more full 128 byte block is available. As only multiples of 128 are
 * ever subtracted, the low 7 bits keep the original 'numbytes % 128'.
 */
subs SIZE, #128
blt 1f
0:
/* aligned load from the source (framebuffer) */
vld1.64 {q0, q1}, [SRC, :256]!
vld1.64 {q2, q3}, [SRC, :256]!
vld1.64 {q8, q9}, [SRC, :256]!
vld1.64 {q10, q11}, [SRC, :256]!
/* fetch destination (scratch buffer) into L1 cache */
ldr r3, [DST]
ldr ip, [DST, #64]
/* aligned store to the scratch buffer */
vst1.64 {q0, q1}, [DST, :256]!
vst1.64 {q2, q3}, [DST, :256]!
vst1.64 {q8, q9}, [DST, :256]!
vst1.64 {q10, q11}, [DST, :256]!
subs SIZE, SIZE, #128
bge 0b
1:
/* remaining 64 byte block, selected by bit 6 of the byte count */
tst SIZE, #64
beq 1f
vld1.64 {q0, q1}, [SRC, :256]!
vld1.64 {q2, q3}, [SRC, :256]!
ldr r3, [DST]
vst1.64 {q0, q1}, [DST, :256]!
vst1.64 {q2, q3}, [DST, :256]!
1:
/* remaining 32 byte block, selected by bit 5 of the byte count */
tst SIZE, #32
beq 1f
vld1.64 {q0, q1}, [SRC, :256]!
vst1.64 {q0, q1}, [DST, :256]!
1:
/*
 * Any leftover of 1-31 bytes is rounded up: one more complete 32 byte
 * block is copied.
 */
tst SIZE, #31
beq 1f
vld1.64 {q0, q1}, [SRC, :256]!
vst1.64 {q0, q1}, [DST, :256]!
1:
bx lr
.unreq SIZE
.unreq DST
.unreq SRC
.endfunc
#endif
/*
* Copyright © 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <string.h>
#include "cpuinfo.h"
#include "cpu_backend.h"
#ifdef __arm__
/*
 * Assembly helpers implemented in arm_asm.S.
 *
 * writeback_scratch_to_mem_neon copies 'size' bytes from 'src' to 'dst'
 * in forward direction, using aligned stores for the bulk of the data.
 */
void writeback_scratch_to_mem_neon(int size, void *dst, const void *src);
/*
 * aligned_fetch_fbmem_to_scratch_neon copies 'size' bytes, rounded up to
 * a multiple of 32, from 'src' to 'dst'. Both pointers must be 32 bytes
 * aligned.
 */
void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);
/* The number of payload bytes moved through the scratch buffer per pass */
#define SCRATCHSIZE 2048
/*
 * This is a function similar to memmove, which tries to minimize uncached read
 * penalty for the source buffer (for example if the source is a framebuffer).
 *
 * The data is moved in two passes, at most SCRATCHSIZE bytes at a time:
 *   1. aligned 32 byte chunks covering the source bytes are fetched into a
 *      scratch buffer on the stack
 *   2. the wanted bytes are written from the scratch buffer to 'dst'
 * Each piece is fetched completely before it is written back, and the pieces
 * are walked forwards when 'src' is above 'dst' and backwards otherwise, so
 * overlapping buffers are handled like memmove does.
 *
 * Note: because this implementation fetches data as 32 byte aligned chunks
 * valgrind is going to scream about read accesses outside the source buffer.
 * (even if an aligned 32 byte chunk contains only a single byte belonging
 * to the source buffer, the whole chunk is going to be read). But a chunk
 * which does not contain any source byte at all is never read, so the reads
 * never touch a memory page which the source buffer does not occupy.
 */
static void
twopass_memmove_neon(void *dst_, const void *src_, size_t size)
{
    /*
     * The fetch may bring in up to 31 leading bytes and is rounded up to
     * a multiple of 32, hence SCRATCHSIZE + 32 bytes of payload. Another
     * 31 bytes are needed to align the buffer itself at 32 bytes.
     */
    uint8_t tmpbuf[SCRATCHSIZE + 32 + 31];
    uint8_t *scratchbuf = (uint8_t *)((uintptr_t)(&tmpbuf[0] + 31) & ~31);
    uint8_t *dst = (uint8_t *)dst_;
    const uint8_t *src = (const uint8_t *)src_;
    /*
     * The distance from the preceding 32 byte boundary to 'src'. It stays
     * the same for all pieces, because 'src' only moves in SCRATCHSIZE
     * steps and SCRATCHSIZE is a multiple of 32.
     */
    uintptr_t alignshift = (uintptr_t)src & 31;

    /*
     * The fetch length is always 'bytes wanted + alignshift': the fetch
     * function rounds it up to a multiple of 32, which yields exactly the
     * aligned chunks which hold at least one wanted byte. Adding a flat
     * 32 bytes instead could read one chunk too many past the end of the
     * source buffer.
     */
    if (src > dst) {
        /* forward copy */
        while (size >= SCRATCHSIZE) {
            aligned_fetch_fbmem_to_scratch_neon(SCRATCHSIZE + alignshift,
                                                scratchbuf, src - alignshift);
            writeback_scratch_to_mem_neon(SCRATCHSIZE, dst,
                                          scratchbuf + alignshift);
            size -= SCRATCHSIZE;
            dst += SCRATCHSIZE;
            src += SCRATCHSIZE;
        }
        if (size > 0) {
            aligned_fetch_fbmem_to_scratch_neon(size + alignshift,
                                                scratchbuf, src - alignshift);
            writeback_scratch_to_mem_neon(size, dst, scratchbuf + alignshift);
        }
    }
    else {
        /* backward copy: the partial piece at the end goes first */
        uintptr_t remainder = size % SCRATCHSIZE;
        dst += size - remainder;
        src += size - remainder;
        size -= remainder;
        if (remainder) {
            aligned_fetch_fbmem_to_scratch_neon(remainder + alignshift,
                                                scratchbuf, src - alignshift);
            writeback_scratch_to_mem_neon(remainder, dst,
                                          scratchbuf + alignshift);
        }
        /* 'size' is a multiple of SCRATCHSIZE now */
        while (size > 0) {
            dst -= SCRATCHSIZE;
            src -= SCRATCHSIZE;
            size -= SCRATCHSIZE;
            aligned_fetch_fbmem_to_scratch_neon(SCRATCHSIZE + alignshift,
                                                scratchbuf, src - alignshift);
            writeback_scratch_to_mem_neon(SCRATCHSIZE, dst,
                                          scratchbuf + alignshift);
        }
    }
}
/*
 * Copy a rectangle of 'width' x 'height' bytes between two buffers, one
 * row at a time with twopass_memmove_neon. The strides are in bytes.
 *
 * If the source rectangle starts before the end of the first destination
 * row and ends after the start of the destination, the rows are processed
 * from the bottom to the top, otherwise from the top to the bottom.
 */
static void
twopass_blt_8bpp_neon(int width,
                      int height,
                      uint8_t *dst_bytes,
                      uintptr_t dst_stride,
                      uint8_t *src_bytes,
                      uintptr_t src_stride)
{
    int row;
    int bottom_up = (src_bytes < dst_bytes + width) &&
                    (src_bytes + src_stride * height > dst_bytes);

    if (bottom_up) {
        /* start at the last row and walk upwards (the strides wrap around) */
        src_bytes += src_stride * height - src_stride;
        dst_bytes += dst_stride * height - dst_stride;
        dst_stride = -dst_stride;
        src_stride = -src_stride;
    }

    for (row = 0; row < height; row++) {
        twopass_memmove_neon(dst_bytes, src_bytes, width);
        dst_bytes += dst_stride;
        src_bytes += src_stride;
    }
}
/*
 * NEON implementation of the 'overlapped_blt' operation.
 *
 * Only handles the case when the source buffer pointer is inside of the
 * uncached area of the backend, both buffers have the same bits per pixel
 * (a multiple of 8) and no stride is negative. The strides are multiplied
 * by 4 to get the distance between rows in bytes.
 *
 * Returns 1 if the copy has been done, 0 if the request is not handled.
 */
static int
overlapped_blt_neon(void *self,
                    uint32_t *src_bits,
                    uint32_t *dst_bits,
                    int src_stride,
                    int dst_stride,
                    int src_bpp,
                    int dst_bpp,
                    int src_x,
                    int src_y,
                    int dst_x,
                    int dst_y,
                    int width,
                    int height)
{
    cpu_backend_t *backend = (cpu_backend_t *)self;
    uint8_t *src_base = (uint8_t *)src_bits;
    uint8_t *dst_base = (uint8_t *)dst_bits;
    uintptr_t bytes_per_pixel, src_pitch, dst_pitch;
    uint8_t *src_start, *dst_start;

    /* the two pass copy only pays off for uncached source reads */
    if (src_base < backend->uncached_area_begin ||
        src_base >= backend->uncached_area_end)
    {
        return 0;
    }

    /* unsupported combination of formats or strides */
    if (src_bpp != dst_bpp || (src_bpp & 7) ||
        src_stride < 0 || dst_stride < 0)
    {
        return 0;
    }

    bytes_per_pixel = (uintptr_t)(src_bpp >> 3);
    src_pitch = (uintptr_t) src_stride * 4;
    dst_pitch = (uintptr_t) dst_stride * 4;

    /* the addresses of the top left corners of both rectangles */
    src_start = src_base + (uintptr_t) src_y * src_pitch +
                           (uintptr_t) src_x * bytes_per_pixel;
    dst_start = dst_base + (uintptr_t) dst_y * dst_pitch +
                           (uintptr_t) dst_x * bytes_per_pixel;

    twopass_blt_8bpp_neon((uintptr_t) width * bytes_per_pixel, height,
                          dst_start, dst_pitch,
                          src_start, src_pitch);
    return 1;
}
#endif
/*
 * Fallback 'overlapped_blt' implementation which does nothing and always
 * reports failure (returns 0). All the arguments are ignored.
 */
static int
overlapped_blt_noop(void *self,
                    uint32_t *src_bits,
                    uint32_t *dst_bits,
                    int src_stride,
                    int dst_stride,
                    int src_bpp,
                    int dst_bpp,
                    int src_x,
                    int src_y,
                    int dst_x,
                    int dst_y,
                    int width,
                    int height)
{
    (void) self;
    (void) src_bits;
    (void) dst_bits;
    (void) src_stride;
    (void) dst_stride;
    (void) src_bpp;
    (void) dst_bpp;
    (void) src_x;
    (void) src_y;
    (void) dst_x;
    (void) dst_y;
    (void) width;
    (void) height;

    return 0;
}
/*
 * Create a CPU backend context.
 *
 * 'uncached_buffer' and 'uncached_buffer_size' describe the memory area
 * where reads are uncached. The 'overlapped_blt' operation of the blt2d
 * interface is set to the NEON implementation if this is an ARM build and
 * the CPU is reported to support NEON, otherwise it is set to the always
 * failing no-op implementation.
 *
 * Returns the new context, which has to be released with
 * cpu_backend_close(), or NULL on failure (the context could not be
 * allocated or the CPU information is not available).
 */
cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer,
                                size_t   uncached_buffer_size)
{
    cpu_backend_t *ctx = calloc(1, sizeof(cpu_backend_t));
    if (!ctx)
        return NULL;

    /*
     * NOTE(review): whether cpuinfo_init() can actually return NULL is not
     * visible here, but cpu_backend_close() treats a NULL 'cpuinfo' as
     * possible, so do not dereference it blindly.
     */
    ctx->cpuinfo = cpuinfo_init();
    if (!ctx->cpuinfo) {
        free(ctx);
        return NULL;
    }

    ctx->uncached_area_begin = uncached_buffer;
    ctx->uncached_area_end   = uncached_buffer + uncached_buffer_size;

    ctx->blt2d.self = ctx;
    ctx->blt2d.overlapped_blt = overlapped_blt_noop;

#ifdef __arm__
    if (ctx->cpuinfo->has_arm_neon) {
        ctx->blt2d.overlapped_blt = overlapped_blt_neon;
    }
#endif
    return ctx;
}
/*
 * Release a context created by cpu_backend_init(), together with the CPU
 * information which it owns. Passing NULL is allowed and does nothing.
 */
void cpu_backend_close(cpu_backend_t *ctx)
{
    if (!ctx)
        return;

    if (ctx->cpuinfo)
        cpuinfo_close(ctx->cpuinfo);
    free(ctx);
}
/*
* Copyright © 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef CPU_BACKEND_H
#define CPU_BACKEND_H
#include <inttypes.h>
#include "cpuinfo.h"
#include "interfaces.h"
/*
* A set of CPU specific optimizations for different operations.
* Supports a single memory area, where reads are uncached and may
* need special treatment.
*/
typedef struct {
/* The information about CPU features (obtained from cpuinfo_init) */
cpuinfo_t *cpuinfo;
/*
 * The range of addresses for uncached area: 'uncached_area_begin' is
 * the first byte of the area, 'uncached_area_end' points one byte past
 * its end.
 */
uint8_t *uncached_area_begin;
uint8_t *uncached_area_end;
/*
 * An accelerated implementation of blt2d_i interface. Its 'self' member
 * is set to point back to this structure by cpu_backend_init.
 */
blt2d_i blt2d;
} cpu_backend_t;
/*
 * Create a CPU backend for the uncached memory area which starts at
 * 'uncached_buffer' and is 'uncached_buffer_size' bytes long.
 * Returns NULL on failure.
 */
cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer, size_t uncached_buffer_size);
/* Release a CPU backend which has been created by cpu_backend_init */
void cpu_backend_close(cpu_backend_t *cpu_backend);
#endif
......@@ -46,6 +46,8 @@
#include "shadow.h"
#include "dgaproc.h"
#include "cpu_backend.h"
#include "sunxi_disp.h"
#include "sunxi_disp_hwcursor.h"
#include "sunxi_x_g2d.h"
......@@ -421,6 +423,7 @@ FBDevPreInit(ScrnInfoPtr pScrn, int flags)
int default_depth, fbbpp;
const char *s;
int type;
cpuinfo_t *cpuinfo;
if (flags & PROBE_DETECT) return FALSE;
......@@ -504,9 +507,18 @@ FBDevPreInit(ScrnInfoPtr pScrn, int flags)
memcpy(fPtr->Options, FBDevOptions, sizeof(FBDevOptions));
xf86ProcessOptions(pScrn->scrnIndex, fPtr->pEnt->device->options, fPtr->Options);
/* use shadow framebuffer by default unless using HW acceleration */
/* check the processor type */
cpuinfo = cpuinfo_init();
xf86DrvMsg(pScrn->scrnIndex, X_INFO, "processor: %s\n",
cpuinfo->processor_name);
/* don't use shadow by default if we have NEON or HW acceleration */
fPtr->shadowFB = !cpuinfo->has_arm_neon &&
!xf86GetOptValString(fPtr->Options, OPTION_ACCELMETHOD);
cpuinfo_close(cpuinfo);
/* but still honour the settings from xorg.conf */
fPtr->shadowFB = xf86ReturnOptValBool(fPtr->Options, OPTION_SHADOW_FB,
!xf86GetOptValString(fPtr->Options, OPTION_ACCELMETHOD));
fPtr->shadowFB);
debug = xf86ReturnOptValBool(fPtr->Options, OPTION_DEBUG, FALSE);
......@@ -687,6 +699,7 @@ FBDevScreenInit(SCREEN_INIT_ARGS_DECL)
int ret, flags;
int type;
char *accelmethod;
cpu_backend_t *cpu_backend;
TRACE_ENTER("FBDevScreenInit");
......@@ -853,6 +866,10 @@ FBDevScreenInit(SCREEN_INIT_ARGS_DECL)
xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
"Render extension initialisation failed\n");
/* initialize the 'CPU' backend */
cpu_backend = cpu_backend_init(fPtr->fbmem, pScrn->videoRam);
fPtr->cpu_backend_private = cpu_backend;
/* try to load G2D kernel module before initializing sunxi-disp */
if (!xf86LoadKernelModule("g2d_23"))
xf86DrvMsg(pScrn->scrnIndex, X_INFO,
......@@ -885,6 +902,12 @@ FBDevScreenInit(SCREEN_INIT_ARGS_DECL)
"no 2D acceleration selected via AccelMethod option\n");
}
if (!fPtr->SunxiG2D_private && cpu_backend->cpuinfo->has_arm_neon) {
if ((fPtr->SunxiG2D_private = SunxiG2D_Init(pScreen, &cpu_backend->blt2d))) {
xf86DrvMsg(pScrn->scrnIndex, X_INFO, "enabled NEON optimizations\n");
}
}
if (fPtr->shadowFB && !FBDevShadowInit(pScreen)) {
xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
"shadow framebuffer initialization failed\n");
......@@ -1052,6 +1075,10 @@ FBDevCloseScreen(CLOSE_SCREEN_ARGS_DECL)
sunxi_disp_close(fPtr->sunxi_disp_private);
fPtr->sunxi_disp_private = NULL;
}
if (fPtr->cpu_backend_private) {
cpu_backend_close(fPtr->cpu_backend_private);
fPtr->cpu_backend_private = NULL;
}
if (fPtr->pDGAMode) {
free(fPtr->pDGAMode);
......
......@@ -48,6 +48,7 @@ typedef struct {
int nDGAMode;
OptionInfoPtr Options;
void *cpu_backend_private;
void *sunxi_disp_private;;
void *SunxiDispHardwareCursor_private;
void *SunxiMaliDRI2_private;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment