Commit e9f978f3 authored by Siarhei Siamashka
Browse files

Use ARM LDM instead of VFP for uncached reads on Marvell PJ4



The Marvell PJ4 core used in the CuBox handles uncached VFP reads
from the framebuffer very poorly. Using WMMX or ARM LDM reads is much
faster, with LDM instructions having a minor advantage. This change
improves framebuffer read performance from ~50MB/s to ~100MB/s.

WMMX runtime detection and PJ4 core identification are also added
as part of this fix.
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
parent 102957f9
...@@ -505,4 +505,48 @@ asm_function aligned_fetch_fbmem_to_scratch_vfp ...@@ -505,4 +505,48 @@ asm_function aligned_fetch_fbmem_to_scratch_vfp
.unreq SRC .unreq SRC
.endfunc .endfunc
/*
 * aligned_fetch_fbmem_to_scratch_arm
 *
 * Copies data from SRC to DST in 32-byte chunks using only ARM core
 * registers: every chunk is one LDMIA into r4-r11 followed by one STMIA
 * from r4-r11. No VFP/NEON/WMMX registers are used.
 *
 * Register usage:
 *   r0 = SIZE  byte count
 *   r1 = DST   destination pointer, post-incremented by each STMIA
 *   r2 = SRC   source pointer, post-incremented by each LDMIA
 * (presumably these carry the C arguments (int size, void *dst,
 * const void *src) under the ARM procedure call standard -- confirm
 * against the C prototype.)
 *
 * NOTE(review): a size that is not a multiple of 32 is rounded up to the
 * next full 32-byte chunk by the final "tst SIZE, #31" step, so up to 31
 * bytes past the requested size are read and written. Presumably callers
 * guarantee that both buffers tolerate this -- confirm.
 * NOTE(review): the pointers are assumed to be suitably aligned for
 * LDM/STM (suggested by the "aligned_" prefix) -- confirm with callers.
 */
asm_function aligned_fetch_fbmem_to_scratch_arm
SIZE .req r0
DST .req r1
SRC .req r2
/* r4-r11 serve as the 32-byte transfer buffer, so save them along with lr */
push {r4-r11, lr}
/* Pre-bias SIZE by 128; if it goes negative there is no full 128-byte block */
subs SIZE, #128
blt 1f
0:
/* Main loop: 4 x 32 = 128 bytes per iteration */
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
/* Keep looping while at least another 128 bytes remain */
subs SIZE, SIZE, #128
bge 0b
1:
/*
 * SIZE is now negative, but only multiples of 128 have been subtracted
 * from it, so its low 7 bits still equal the number of bytes left to copy.
 * The tail is handled by testing those bits individually.
 */
/* Bit 6 set: copy 64 bytes */
tst SIZE, #64
beq 1f
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
1:
/* Bit 5 set: copy 32 bytes */
tst SIZE, #32
beq 1f
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
1:
/* Any of bits 0-4 set: copy one more full 32-byte chunk (rounds up) */
tst SIZE, #31
beq 1f
ldmia SRC!, {r4-r11}
stmia DST!, {r4-r11}
1:
/* Restore callee-saved registers and return by popping the saved lr into pc */
pop {r4-r11, pc}
.unreq SIZE
.unreq DST
.unreq SRC
.endfunc
#endif #endif
...@@ -39,12 +39,7 @@ void memcpy_armv5te(void *dst, const void *src, int size); ...@@ -39,12 +39,7 @@ void memcpy_armv5te(void *dst, const void *src, int size);
void writeback_scratch_to_mem_neon(int size, void *dst, const void *src); void writeback_scratch_to_mem_neon(int size, void *dst, const void *src);
void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src); void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);
void aligned_fetch_fbmem_to_scratch_vfp(int size, void *dst, const void *src); void aligned_fetch_fbmem_to_scratch_vfp(int size, void *dst, const void *src);
void aligned_fetch_fbmem_to_scratch_arm(int size, void *dst, const void *src);
/*
 * Inline fetch helper with the (size, dst, src) argument order: it simply
 * forwards to memcpy_armv5te(), reordering the arguments into that
 * function's (dst, src, size) order.
 */
static always_inline void
aligned_fetch_fbmem_to_scratch_arm(int size, void *dst, const void *src)
{
    memcpy_armv5te(dst, src, size);
}
static always_inline void static always_inline void
writeback_scratch_to_mem_arm(int size, void *dst, const void *src) writeback_scratch_to_mem_arm(int size, void *dst, const void *src)
...@@ -127,6 +122,14 @@ twopass_memmove_vfp(void *dst, const void *src, size_t size) ...@@ -127,6 +122,14 @@ twopass_memmove_vfp(void *dst, const void *src, size_t size)
writeback_scratch_to_mem_arm); writeback_scratch_to_mem_arm);
} }
/*
 * memmove-style entry point for the ARM LDM/STM code path.
 *
 * Delegates to twopass_memmove(), passing
 * aligned_fetch_fbmem_to_scratch_arm and writeback_scratch_to_mem_arm as
 * its two function arguments (by their names, the fetch and the writeback
 * stages of the copy).
 */
static void twopass_memmove_arm(void *dst, const void *src, size_t size)
{
    twopass_memmove(dst, src, size, aligned_fetch_fbmem_to_scratch_arm,
                    writeback_scratch_to_mem_arm);
}
static void static void
twopass_blt_8bpp(int width, twopass_blt_8bpp(int width,
int height, int height,
...@@ -244,6 +247,27 @@ overlapped_blt_vfp(void *self, ...@@ -244,6 +247,27 @@ overlapped_blt_vfp(void *self,
twopass_memmove_vfp); twopass_memmove_vfp);
} }
/*
 * overlapped_blt implementation for the ARM LDM/STM code path.
 *
 * Forwards every argument unchanged to overlapped_blt() and appends
 * twopass_memmove_arm as the final (memmove routine) argument. The return
 * value is whatever overlapped_blt() returns.
 */
static int overlapped_blt_arm(void *self,
                              uint32_t *src_bits, uint32_t *dst_bits,
                              int src_stride, int dst_stride,
                              int src_bpp, int dst_bpp,
                              int src_x, int src_y,
                              int dst_x, int dst_y,
                              int width, int height)
{
    return overlapped_blt(self,
                          src_bits, dst_bits,
                          src_stride, dst_stride,
                          src_bpp, dst_bpp,
                          src_x, src_y,
                          dst_x, dst_y,
                          width, height,
                          twopass_memmove_arm);
}
#endif #endif
/* An empty, always failing implementation */ /* An empty, always failing implementation */
...@@ -288,6 +312,11 @@ cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer, ...@@ -288,6 +312,11 @@ cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer,
/* NEON works better on Cortex-A8 */ /* NEON works better on Cortex-A8 */
ctx->blt2d.overlapped_blt = overlapped_blt_neon; ctx->blt2d.overlapped_blt = overlapped_blt_neon;
} }
if (ctx->cpuinfo->has_arm_wmmx)
{
/* ARM LDM/STM works better than VFP/WMMX on Marvell PJ4 */
ctx->blt2d.overlapped_blt = overlapped_blt_arm;
}
else if (ctx->cpuinfo->has_arm_vfp && ctx->cpuinfo->has_arm_edsp) { else if (ctx->cpuinfo->has_arm_vfp && ctx->cpuinfo->has_arm_edsp) {
/* VFP works better on Cortex-A9, Cortex-A15 and maybe everything else */ /* VFP works better on Cortex-A9, Cortex-A15 and maybe everything else */
ctx->blt2d.overlapped_blt = overlapped_blt_vfp; ctx->blt2d.overlapped_blt = overlapped_blt_vfp;
......
...@@ -95,6 +95,7 @@ static int parse_proc_cpuinfo(cpuinfo_t *cpuinfo) ...@@ -95,6 +95,7 @@ static int parse_proc_cpuinfo(cpuinfo_t *cpuinfo)
cpuinfo->has_arm_edsp = find_feature(val, "edsp"); cpuinfo->has_arm_edsp = find_feature(val, "edsp");
cpuinfo->has_arm_vfp = find_feature(val, "vfp"); cpuinfo->has_arm_vfp = find_feature(val, "vfp");
cpuinfo->has_arm_neon = find_feature(val, "neon"); cpuinfo->has_arm_neon = find_feature(val, "neon");
cpuinfo->has_arm_wmmx = find_feature(val, "iwmmxt");
} }
else if ((val = cpuinfo_match_prefix(buffer, "CPU implementer"))) { else if ((val = cpuinfo_match_prefix(buffer, "CPU implementer"))) {
if (sscanf(val, "%i", &cpuinfo->arm_implementer) != 1) { if (sscanf(val, "%i", &cpuinfo->arm_implementer) != 1) {
...@@ -175,6 +176,8 @@ cpuinfo_t *cpuinfo_init() ...@@ -175,6 +176,8 @@ cpuinfo_t *cpuinfo_init()
cpuinfo->processor_name = strdup("ARM Cortex-A5"); cpuinfo->processor_name = strdup("ARM Cortex-A5");
} else if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xB76) { } else if (cpuinfo->arm_implementer == 0x41 && cpuinfo->arm_part == 0xB76) {
cpuinfo->processor_name = strdup("ARM1176"); cpuinfo->processor_name = strdup("ARM1176");
} else if (cpuinfo->arm_implementer == 0x56 && cpuinfo->arm_part == 0x581) {
cpuinfo->processor_name = strdup("Marvell PJ4");
} else { } else {
cpuinfo->processor_name = strdup("Unknown"); cpuinfo->processor_name = strdup("Unknown");
} }
......
...@@ -36,6 +36,7 @@ typedef struct { ...@@ -36,6 +36,7 @@ typedef struct {
int has_arm_edsp; int has_arm_edsp;
int has_arm_vfp; int has_arm_vfp;
int has_arm_neon; int has_arm_neon;
int has_arm_wmmx;
/* The user-friendly CPU description string (usable for logs, etc.) */ /* The user-friendly CPU description string (usable for logs, etc.) */
char *processor_name; char *processor_name;
} cpuinfo_t; } cpuinfo_t;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment