Commit b93dab5c authored by Siarhei Siamashka
Browse files

CPU: Added ARM VFP two-pass overlapped blit implementation



Using VFP, we can load up to 128 bytes with a single VLDM instruction.
But before this patch, only a NEON implementation was available. That was
just because it showed better results on Allwinner A10 compared to VFP,
and because this DDX driver used to primarily target sunxi hardware.

But it looks like it makes sense to also target other devices (at least
ODROID-X, which has the same Mali400 GPU and can use the same DRI2
integration for EGL and GLESv2 support). And on other ARM devices,
VFP aligned reads generally work better than NEON. The benchmark
results are listed below:

            1280x720, 32bpp, testing "x11perf -scroll500"

== Exynos 5250, Cortex-A15, Non-cacheable streaming enhancement disabled ==

NEON : 10000 trep @   3.7101 msec (   270.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @   2.6678 msec (   375.0/sec): Scroll 500x500 pixels

== Exynos 5250, Cortex-A15, Non-cacheable streaming enhancement enabled ==

NEON : 15000 trep @   2.2568 msec (   443.0/sec): Scroll 500x500 pixels
VFP  : 15000 trep @   2.3016 msec (   434.0/sec): Scroll 500x500 pixels

== Exynos 4412, Cortex-A9 ==

NEON : 10000 trep @   4.5125 msec (   222.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @   2.7015 msec (   370.0/sec): Scroll 500x500 pixels

== TI DM3730, Cortex-A8 ==

NEON : 15000 trep @   2.2303 msec (   448.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @   3.0670 msec (   326.0/sec): Scroll 500x500 pixels

== Allwinner A10, Cortex-A8 ==

NEON : 10000 trep @   2.5559 msec (   391.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @   3.0580 msec (   327.0/sec): Scroll 500x500 pixels

== Raspberry Pi, BCM2708, ARM1176 ==

VFP  :  3000 trep @   8.7699 msec (   114.0/sec): Scroll 500x500 pixels

The benchmark numbers in this particular test setup roughly represent
memory copy bandwidth measured in MB/s (when doing overlapped blits
inside of a writecombine mapped framebuffer).

-----------------------------------------------------------------------

Note: the use of VFP two-pass overlapped copy instead of ShadowFB is
      still not enabled by default when running on Raspberry Pi
      because the performance results are not so great.
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
parent ae976fe9
......@@ -466,4 +466,43 @@ asm_function aligned_fetch_fbmem_to_scratch_neon
.unreq SRC
.endfunc
/*
 * First pass of the two-pass overlapped blit: fetch SIZE bytes from the
 * framebuffer (SRC) into the scratch buffer (DST) using only VFP registers.
 * A single VLDM can load up to 16 doubleword registers, i.e. 128 bytes.
 *
 * The copy is always done in whole aligned chunks: a final partial chunk
 * of 1-31 bytes is transferred as a full 32 bytes, so up to 31 bytes may
 * be overread from SRC and overwritten in DST. The caller's scratch
 * buffer is sized to tolerate this overfetch — TODO confirm the exact
 * alignment guarantee the caller provides for SRC.
 */
asm_function aligned_fetch_fbmem_to_scratch_vfp
SIZE .req r0
DST .req r1
SRC .req r2
/* d8-d15 are callee-saved per the AAPCS, so preserve them across the copy */
vpush {d8-d15}
subs SIZE, #128
blt 1f
0:
/* main loop: 128 bytes per iteration */
/* aligned load from the source (framebuffer) */
vldm SRC!, {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15}
/* aligned store to the scratch buffer */
vstm DST!, {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15}
subs SIZE, SIZE, #128
bge 0b
1:
/* SIZE is now negative, but its low 7 bits still hold the remainder
 * (subtracting multiples of 128 never changes bits 0-6) */
tst SIZE, #64
beq 1f
/* copy a 64 byte chunk */
vldm SRC!, {d0, d1, d2, d3, d4, d5, d6, d7}
vstm DST!, {d0, d1, d2, d3, d4, d5, d6, d7}
1:
tst SIZE, #32
beq 1f
/* copy a 32 byte chunk */
vldm SRC!, {d0, d1, d2, d3}
vstm DST!, {d0, d1, d2, d3}
1:
tst SIZE, #31
beq 1f
/* copy the remaining 1-31 bytes as one full 32 byte chunk (overfetch) */
vldm SRC!, {d0, d1, d2, d3}
vstm DST!, {d0, d1, d2, d3}
1:
vpop {d8-d15}
bx lr
.unreq SIZE
.unreq DST
.unreq SRC
.endfunc
#endif
......@@ -29,8 +29,28 @@
#ifdef __arm__
#ifdef __GNUC__
#define always_inline inline __attribute__((always_inline))
#else
#define always_inline inline
#endif
void memcpy_armv5te(void *dst, const void *src, int size);
void writeback_scratch_to_mem_neon(int size, void *dst, const void *src);
void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);
void aligned_fetch_fbmem_to_scratch_vfp(int size, void *dst, const void *src);
/*
 * Plain-ARM fallback for the fetch pass of the two-pass overlapped copy:
 * delegates to memcpy_armv5te. Note the argument order (size first) is
 * the same as the NEON/VFP assembly variants, while memcpy_armv5te takes
 * (dst, src, size).
 */
static always_inline void
aligned_fetch_fbmem_to_scratch_arm(int size, void *dst, const void *src)
{
memcpy_armv5te(dst, src, size);
}
/*
 * Plain-ARM writeback pass: copy data from the scratch buffer back to the
 * destination via memcpy_armv5te. Used together with a VFP (or plain ARM)
 * fetch pass when no NEON writeback implementation is selected.
 */
static always_inline void
writeback_scratch_to_mem_arm(int size, void *dst, const void *src)
{
memcpy_armv5te(dst, src, size);
}
#define SCRATCHSIZE 2048
......@@ -43,8 +63,10 @@ void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);
* (even if an aligned 32 byte chunk contains only a single byte belonging
* to the source buffer, the whole chunk is going to be read).
*/
static void
twopass_memmove_neon(void *dst_, const void *src_, size_t size)
static always_inline void
twopass_memmove(void *dst_, const void *src_, size_t size,
void (*aligned_fetch_fbmem_to_scratch)(int, void *, const void *),
void (*writeback_scratch_to_mem)(int, void *, const void *))
{
uint8_t tmpbuf[SCRATCHSIZE + 32 + 31];
uint8_t *scratchbuf = (uint8_t *)((uintptr_t)(&tmpbuf[0] + 31) & ~31);
......@@ -55,17 +77,17 @@ twopass_memmove_neon(void *dst_, const void *src_, size_t size)
if (src > dst) {
while (size >= SCRATCHSIZE) {
aligned_fetch_fbmem_to_scratch_neon(SCRATCHSIZE + extrasize,
aligned_fetch_fbmem_to_scratch(SCRATCHSIZE + extrasize,
scratchbuf, src - alignshift);
writeback_scratch_to_mem_neon(SCRATCHSIZE, dst, scratchbuf + alignshift);
writeback_scratch_to_mem(SCRATCHSIZE, dst, scratchbuf + alignshift);
size -= SCRATCHSIZE;
dst += SCRATCHSIZE;
src += SCRATCHSIZE;
}
if (size > 0) {
aligned_fetch_fbmem_to_scratch_neon(size + extrasize,
aligned_fetch_fbmem_to_scratch(size + extrasize,
scratchbuf, src - alignshift);
writeback_scratch_to_mem_neon(size, dst, scratchbuf + alignshift);
writeback_scratch_to_mem(size, dst, scratchbuf + alignshift);
}
}
else {
......@@ -74,28 +96,45 @@ twopass_memmove_neon(void *dst_, const void *src_, size_t size)
src += size - remainder;
size -= remainder;
if (remainder) {
aligned_fetch_fbmem_to_scratch_neon(remainder + extrasize,
aligned_fetch_fbmem_to_scratch(remainder + extrasize,
scratchbuf, src - alignshift);
writeback_scratch_to_mem_neon(remainder, dst, scratchbuf + alignshift);
writeback_scratch_to_mem(remainder, dst, scratchbuf + alignshift);
}
while (size > 0) {
dst -= SCRATCHSIZE;
src -= SCRATCHSIZE;
size -= SCRATCHSIZE;
aligned_fetch_fbmem_to_scratch_neon(SCRATCHSIZE + extrasize,
aligned_fetch_fbmem_to_scratch(SCRATCHSIZE + extrasize,
scratchbuf, src - alignshift);
writeback_scratch_to_mem_neon(SCRATCHSIZE, dst, scratchbuf + alignshift);
writeback_scratch_to_mem(SCRATCHSIZE, dst, scratchbuf + alignshift);
}
}
}
static void
twopass_blt_8bpp_neon(int width,
twopass_memmove_neon(void *dst, const void *src, size_t size)
{
twopass_memmove(dst, src, size,
aligned_fetch_fbmem_to_scratch_neon,
writeback_scratch_to_mem_neon);
}
/*
 * VFP flavour of the two-pass overlapped memmove: the aligned fetch from
 * the framebuffer is done with VFP registers, while the writeback to the
 * destination falls back to the plain ARM memcpy.
 */
static void
twopass_memmove_vfp(void *dst, const void *src, size_t size)
{
twopass_memmove(dst, src, size,
aligned_fetch_fbmem_to_scratch_vfp,
writeback_scratch_to_mem_arm);
}
static void
twopass_blt_8bpp(int width,
int height,
uint8_t *dst_bytes,
uintptr_t dst_stride,
uint8_t *src_bytes,
uintptr_t src_stride)
uintptr_t src_stride,
void (*twopass_memmove)(void *, const void *, size_t))
{
if (src_bytes < dst_bytes + width &&
src_bytes + src_stride * height > dst_bytes)
......@@ -108,7 +147,7 @@ twopass_blt_8bpp_neon(int width,
{
while (--height >= 0)
{
twopass_memmove_neon(dst_bytes, src_bytes, width);
twopass_memmove(dst_bytes, src_bytes, width);
dst_bytes += dst_stride;
src_bytes += src_stride;
}
......@@ -117,14 +156,14 @@ twopass_blt_8bpp_neon(int width,
}
while (--height >= 0)
{
twopass_memmove_neon(dst_bytes, src_bytes, width);
twopass_memmove(dst_bytes, src_bytes, width);
dst_bytes += dst_stride;
src_bytes += src_stride;
}
}
static int
overlapped_blt_neon(void *self,
static always_inline int
overlapped_blt(void *self,
uint32_t *src_bits,
uint32_t *dst_bits,
int src_stride,
......@@ -136,7 +175,8 @@ overlapped_blt_neon(void *self,
int dst_x,
int dst_y,
int width,
int height)
int height,
void (*twopass_memmove)(void *, const void *, size_t))
{
uint8_t *dst_bytes = (uint8_t *)dst_bits;
uint8_t *src_bytes = (uint8_t *)src_bits;
......@@ -150,17 +190,60 @@ overlapped_blt_neon(void *self,
if (src_bpp != dst_bpp || src_bpp & 7 || src_stride < 0 || dst_stride < 0)
return 0;
twopass_blt_8bpp_neon((uintptr_t) width * bpp,
twopass_blt_8bpp((uintptr_t) width * bpp,
height,
dst_bytes + (uintptr_t) dst_y * dst_stride * 4 +
(uintptr_t) dst_x * bpp,
(uintptr_t) dst_stride * 4,
src_bytes + (uintptr_t) src_y * src_stride * 4 +
(uintptr_t) src_x * bpp,
(uintptr_t) src_stride * 4);
(uintptr_t) src_stride * 4,
twopass_memmove);
return 1;
}
/*
 * blt2d backend entry point: overlapped blit using the NEON two-pass
 * memmove. Thin adapter that plugs twopass_memmove_neon into the generic
 * overlapped_blt implementation; returns whatever overlapped_blt returns
 * (0 on unsupported parameters, 1 on success).
 */
static int
overlapped_blt_neon(void *self,
uint32_t *src_bits,
uint32_t *dst_bits,
int src_stride,
int dst_stride,
int src_bpp,
int dst_bpp,
int src_x,
int src_y,
int dst_x,
int dst_y,
int width,
int height)
{
return overlapped_blt(self, src_bits, dst_bits, src_stride, dst_stride,
src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
width, height,
twopass_memmove_neon);
}
/*
 * blt2d backend entry point: overlapped blit using the VFP two-pass
 * memmove (VFP fetch + plain ARM writeback). Same signature and return
 * convention as overlapped_blt_neon; preferred on cores where VFP aligned
 * reads outperform NEON (e.g. Cortex-A9/A15 per the commit benchmarks).
 */
static int
overlapped_blt_vfp(void *self,
uint32_t *src_bits,
uint32_t *dst_bits,
int src_stride,
int dst_stride,
int src_bpp,
int dst_bpp,
int src_x,
int src_y,
int dst_x,
int dst_y,
int width,
int height)
{
return overlapped_blt(self, src_bits, dst_bits, src_stride, dst_stride,
src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
width, height,
twopass_memmove_vfp);
}
#endif
/* An empty, always failing implementation */
......@@ -198,9 +281,17 @@ cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer,
ctx->cpuinfo = cpuinfo_init();
#ifdef __arm__
if (ctx->cpuinfo->has_arm_neon) {
if (ctx->cpuinfo->has_arm_neon &&
ctx->cpuinfo->arm_implementer == 0x41 &&
ctx->cpuinfo->arm_part == 0xC08)
{
/* NEON works better on Cortex-A8 */
ctx->blt2d.overlapped_blt = overlapped_blt_neon;
}
else if (ctx->cpuinfo->has_arm_vfp && ctx->cpuinfo->has_arm_edsp) {
/* VFP works better on Cortex-A9, Cortex-A15 and maybe everything else */
ctx->blt2d.overlapped_blt = overlapped_blt_vfp;
}
#endif
return ctx;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment