CPU: Added ARM VFP two-pass overlapped blit implementation

Using VFP, we can load up to 128 bytes with a single VLDM instruction. Before this patch, however, only a NEON implementation was available, simply because it showed better results than VFP on Allwinner A10, and this DDX driver used to primarily target just sunxi hardware.

But it looks like it makes sense to also target other devices (at least ODROID-X, which has the same Mali400 GPU and can use the same DRI2 integration for EGL and GLESv2 support). And on the other ARM devices, VFP aligned reads generally work better than NEON.

The benchmark results are listed below:

1280x720, 32bpp, testing "x11perf -scroll500"

== Exynos 5250, Cortex-A15, Non-cacheable streaming enhancement disabled ==
NEON : 10000 trep @ 3.7101 msec ( 270.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 2.6678 msec ( 375.0/sec): Scroll 500x500 pixels

== Exynos 5250, Cortex-A15, Non-cacheable streaming enhancement enabled ==
NEON : 15000 trep @ 2.2568 msec ( 443.0/sec): Scroll 500x500 pixels
VFP  : 15000 trep @ 2.3016 msec ( 434.0/sec): Scroll 500x500 pixels

== Exynos 4412, Cortex-A9 ==
NEON : 10000 trep @ 4.5125 msec ( 222.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 2.7015 msec ( 370.0/sec): Scroll 500x500 pixels

== TI DM3730, Cortex-A8 ==
NEON : 15000 trep @ 2.2303 msec ( 448.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 3.0670 msec ( 326.0/sec): Scroll 500x500 pixels

== Allwinner A10, Cortex-A8 ==
NEON : 10000 trep @ 2.5559 msec ( 391.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 3.0580 msec ( 327.0/sec): Scroll 500x500 pixels

== Raspberry Pi, BCM2708, ARM1176 ==
VFP  : 3000 trep @ 8.7699 msec ( 114.0/sec): Scroll 500x500 pixels

The benchmark numbers in this particular test setup roughly represent memory copy bandwidth measured in MB/s (when doing overlapped blits inside of a writecombine mapped framebuffer).

-----------------------------------------------------------------------

Note: the use of the VFP two-pass overlapped copy instead of ShadowFB is still not enabled by default when running on Raspberry Pi, because the performance results are not so great.

Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>

CPU: Added ARM VFP two-pass overlapped blit implementation
Using VFP, we can load up to 128 bytes with a single VLDM instruction. Before this patch, however, only a NEON implementation was available, simply because it showed better results than VFP on Allwinner A10, and this DDX driver used to primarily target just sunxi hardware.

But it looks like it makes sense to also target other devices (at least ODROID-X, which has the same Mali400 GPU and can use the same DRI2 integration for EGL and GLESv2 support). And on the other ARM devices, VFP aligned reads generally work better than NEON.

The benchmark results are listed below:

1280x720, 32bpp, testing "x11perf -scroll500"

== Exynos 5250, Cortex-A15, Non-cacheable streaming enhancement disabled ==
NEON : 10000 trep @ 3.7101 msec ( 270.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 2.6678 msec ( 375.0/sec): Scroll 500x500 pixels

== Exynos 5250, Cortex-A15, Non-cacheable streaming enhancement enabled ==
NEON : 15000 trep @ 2.2568 msec ( 443.0/sec): Scroll 500x500 pixels
VFP  : 15000 trep @ 2.3016 msec ( 434.0/sec): Scroll 500x500 pixels

== Exynos 4412, Cortex-A9 ==
NEON : 10000 trep @ 4.5125 msec ( 222.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 2.7015 msec ( 370.0/sec): Scroll 500x500 pixels

== TI DM3730, Cortex-A8 ==
NEON : 15000 trep @ 2.2303 msec ( 448.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 3.0670 msec ( 326.0/sec): Scroll 500x500 pixels

== Allwinner A10, Cortex-A8 ==
NEON : 10000 trep @ 2.5559 msec ( 391.0/sec): Scroll 500x500 pixels
VFP  : 10000 trep @ 3.0580 msec ( 327.0/sec): Scroll 500x500 pixels

== Raspberry Pi, BCM2708, ARM1176 ==
VFP  : 3000 trep @ 8.7699 msec ( 114.0/sec): Scroll 500x500 pixels

The benchmark numbers in this particular test setup roughly represent memory copy bandwidth measured in MB/s (when doing overlapped blits inside of a writecombine mapped framebuffer).

-----------------------------------------------------------------------

Note: the use of the VFP two-pass overlapped copy instead of ShadowFB is still not enabled by default when running on Raspberry Pi, because the performance results are not so great.

Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
b93dab5c · Siarhei Siamashka · ae976fe9 · b93dab5c · b93dab5c
Commit b93dab5c authored Jun 05, 2013 by Siarhei Siamashka
--- a/src/arm_asm.S
+++ b/src/arm_asm.S
@@ -466,4 +466,43 @@ asm_function aligned_fetch_fbmem_to_scratch_neon
    .unreq      SRC
 .endfunc

+asm_function aligned_fetch_fbmem_to_scratch_vfp
+    SIZE        .req r0    /* number of bytes to copy */
+    DST         .req r1    /* destination pointer (scratch buffer) */
+    SRC         .req r2    /* source pointer (framebuffer) */
+    /* Copies SIZE bytes from SRC to DST in 128/64/32-byte chunks; a leftover of 1..31 bytes is rounded up to one more 32-byte chunk. */
+    vpush       {d8-d15}    /* d8-d15 are overwritten by the 128-byte transfers below; restored by the vpop before returning */
+    subs        SIZE, #128    /* pre-bias the counter: negative means fewer than 128 bytes remain */
+    blt         1f
+0:
+    /* aligned load from the source (framebuffer): 16 d-registers = 128 bytes per iteration */
+    vldm        SRC!, {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15}
+    /* aligned store to the scratch buffer */
+    vstm        DST!, {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15}
+    subs        SIZE, SIZE, #128
+    bge         0b
+1:
+    tst         SIZE, #64    /* only multiples of 128 were subtracted, so bits 0-6 still hold the leftover byte count */
+    beq         1f
+    vldm        SRC!, {d0, d1, d2, d3, d4, d5, d6, d7}
+    vstm        DST!, {d0, d1, d2, d3, d4, d5, d6, d7}
+1:
+    tst         SIZE, #32    /* one 32-byte chunk left? */
+    beq         1f
+    vldm        SRC!, {d0, d1, d2, d3}
+    vstm        DST!, {d0, d1, d2, d3}
+1:
+    tst         SIZE, #31    /* any leftover of 1..31 bytes is copied as one whole 32-byte chunk */
+    beq         1f
+    vldm        SRC!, {d0, d1, d2, d3}
+    vstm        DST!, {d0, d1, d2, d3}
+1:
+    vpop        {d8-d15}
+    bx          lr
+    /* release the register aliases defined at the top of the function */
+    .unreq      SIZE
+    .unreq      DST
+    .unreq      SRC
+.endfunc
+
 #endif
--- a/src/cpu_backend.c
+++ b/src/cpu_backend.c
@@ -29,8 +29,28 @@

 #ifdef __arm__

+#ifdef __GNUC__
+#define always_inline inline __attribute__((always_inline))
+#else
+#define always_inline inline
+#endif
+
+void memcpy_armv5te(void *dst, const void *src, int size);
 void writeback_scratch_to_mem_neon(int size, void *dst, const void *src);
 void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);
+void aligned_fetch_fbmem_to_scratch_vfp(int size, void *dst, const void *src);
+
+static always_inline void
+aligned_fetch_fbmem_to_scratch_arm(int size, void *dst, const void *src)
+{
+    memcpy_armv5te(dst, src, size); /* adapter: maps the (size, dst, src) argument order onto memcpy_armv5te(dst, src, size) */
+}
+
+static always_inline void
+writeback_scratch_to_mem_arm(int size, void *dst, const void *src)
+{
+    memcpy_armv5te(dst, src, size); /* adapter: maps the (size, dst, src) argument order onto memcpy_armv5te(dst, src, size) */
+}

 #define SCRATCHSIZE 2048

@@ -43,8 +63,10 @@ void aligned_fetch_fbmem_to_scratch_neon(int size, void *dst, const void *src);
 * (even if an aligned 32 byte chunk contains only a single byte belonging
 * to the source buffer, the whole chunk is going to be read).
 */
-static void
-twopass_memmove_neon(void *dst_, const void *src_, size_t size)
+static always_inline void
+twopass_memmove(void *dst_, const void *src_, size_t size,
+                void (*aligned_fetch_fbmem_to_scratch)(int, void *, const void *),
+                void (*writeback_scratch_to_mem)(int, void *, const void *))
 {
    uint8_t tmpbuf[SCRATCHSIZE + 32 + 31];
    uint8_t *scratchbuf = (uint8_t *)((uintptr_t)(&tmpbuf[0] + 31) & ~31);
@@ -55,17 +77,17 @@ twopass_memmove_neon(void *dst_, const void *src_, size_t size)

    if (src > dst) {
        while (size >= SCRATCHSIZE) {
-            aligned_fetch_fbmem_to_scratch_neon(SCRATCHSIZE + extrasize,
-                                                scratchbuf, src - alignshift);
-            writeback_scratch_to_mem_neon(SCRATCHSIZE, dst, scratchbuf + alignshift);
+            aligned_fetch_fbmem_to_scratch(SCRATCHSIZE + extrasize,
+                                           scratchbuf, src - alignshift);
+            writeback_scratch_to_mem(SCRATCHSIZE, dst, scratchbuf + alignshift);
            size -= SCRATCHSIZE;
            dst += SCRATCHSIZE;
            src += SCRATCHSIZE;
        }
        if (size > 0) {
-            aligned_fetch_fbmem_to_scratch_neon(size + extrasize,
-                                                scratchbuf, src - alignshift);
-            writeback_scratch_to_mem_neon(size, dst, scratchbuf + alignshift);
+            aligned_fetch_fbmem_to_scratch(size + extrasize,
+                                           scratchbuf, src - alignshift);
+            writeback_scratch_to_mem(size, dst, scratchbuf + alignshift);
        }
    }
    else {
@@ -74,28 +96,45 @@ twopass_memmove_neon(void *dst_, const void *src_, size_t size)
        src += size - remainder;
        size -= remainder;
        if (remainder) {
-            aligned_fetch_fbmem_to_scratch_neon(remainder + extrasize,
-                                                scratchbuf, src - alignshift);
-            writeback_scratch_to_mem_neon(remainder, dst, scratchbuf + alignshift);
+            aligned_fetch_fbmem_to_scratch(remainder + extrasize,
+                                           scratchbuf, src - alignshift);
+            writeback_scratch_to_mem(remainder, dst, scratchbuf + alignshift);
        }
        while (size > 0) {
            dst -= SCRATCHSIZE;
            src -= SCRATCHSIZE;
            size -= SCRATCHSIZE;
-            aligned_fetch_fbmem_to_scratch_neon(SCRATCHSIZE + extrasize,
-                                                scratchbuf, src - alignshift);
-            writeback_scratch_to_mem_neon(SCRATCHSIZE, dst, scratchbuf + alignshift);
+            aligned_fetch_fbmem_to_scratch(SCRATCHSIZE + extrasize,
+                                           scratchbuf, src - alignshift);
+            writeback_scratch_to_mem(SCRATCHSIZE, dst, scratchbuf + alignshift);
        }
    }
 }

 static void
-twopass_blt_8bpp_neon(int        width,
-                      int        height,
-                      uint8_t   *dst_bytes,
-                      uintptr_t  dst_stride,
-                      uint8_t   *src_bytes,
-                      uintptr_t  src_stride)
+twopass_memmove_neon(void *dst, const void *src, size_t size)
+{
+    twopass_memmove(dst, src, size, /* generic two-pass copy, specialized by the two callbacks below */
+                    aligned_fetch_fbmem_to_scratch_neon, /* pass 1: NEON fetch into the scratch buffer */
+                    writeback_scratch_to_mem_neon);      /* pass 2: NEON write-back from the scratch buffer */
+}
+
+static void
+twopass_memmove_vfp(void *dst, const void *src, size_t size)
+{
+    twopass_memmove(dst, src, size, /* generic two-pass copy, specialized by the two callbacks below */
+                    aligned_fetch_fbmem_to_scratch_vfp, /* pass 1: VFP fetch into the scratch buffer */
+                    writeback_scratch_to_mem_arm);      /* pass 2: write-back via the plain ARM copy routine, not VFP */
+}
+
+static void
+twopass_blt_8bpp(int        width,
+                 int        height,
+                 uint8_t   *dst_bytes,
+                 uintptr_t  dst_stride,
+                 uint8_t   *src_bytes,
+                 uintptr_t  src_stride,
+                 void (*twopass_memmove)(void *, const void *, size_t))
 {
    if (src_bytes < dst_bytes + width &&
        src_bytes + src_stride * height > dst_bytes)
@@ -108,7 +147,7 @@ twopass_blt_8bpp_neon(int        width,
        {
            while (--height >= 0)
            {
-                twopass_memmove_neon(dst_bytes, src_bytes, width);
+                twopass_memmove(dst_bytes, src_bytes, width);
                dst_bytes += dst_stride;
                src_bytes += src_stride;
            }
@@ -117,12 +156,52 @@ twopass_blt_8bpp_neon(int        width,
    }
    while (--height >= 0)
    {
-        twopass_memmove_neon(dst_bytes, src_bytes, width);
+        twopass_memmove(dst_bytes, src_bytes, width);
        dst_bytes += dst_stride;
        src_bytes += src_stride;
    }
 }

+static always_inline int /* returns 1 if the blit was performed, 0 if it was declined */
+overlapped_blt(void     *self,       /* cast to cpu_backend_t * below */
+               uint32_t *src_bits,
+               uint32_t *dst_bits,
+               int       src_stride, /* multiplied by 4 below to obtain a byte stride */
+               int       dst_stride, /* multiplied by 4 below to obtain a byte stride */
+               int       src_bpp,    /* bits per pixel; must be a multiple of 8 */
+               int       dst_bpp,    /* must equal src_bpp */
+               int       src_x,
+               int       src_y,
+               int       dst_x,
+               int       dst_y,
+               int       width,
+               int       height,
+               void (*twopass_memmove)(void *, const void *, size_t)) /* copy routine forwarded to twopass_blt_8bpp */
+{
+    uint8_t *dst_bytes = (uint8_t *)dst_bits;
+    uint8_t *src_bytes = (uint8_t *)src_bits;
+    cpu_backend_t *ctx = (cpu_backend_t *)self;
+    int bpp = src_bpp >> 3; /* bytes per pixel */
+    int uncached_source = (src_bytes >= ctx->uncached_area_begin) &&
+                          (src_bytes < ctx->uncached_area_end);
+    if (!uncached_source)
+        return 0; /* source lies outside [uncached_area_begin, uncached_area_end): decline */
+    /* Also decline mismatched depths, depths that are not whole bytes, and negative strides. */
+    if (src_bpp != dst_bpp || src_bpp & 7 || src_stride < 0 || dst_stride < 0)
+        return 0;
+    /* Convert pixel coordinates and strides to byte units and run the byte-oriented blit. */
+    twopass_blt_8bpp((uintptr_t) width * bpp,
+                     height,
+                     dst_bytes + (uintptr_t) dst_y * dst_stride * 4 +
+                                 (uintptr_t) dst_x * bpp,
+                     (uintptr_t) dst_stride * 4,
+                     src_bytes + (uintptr_t) src_y * src_stride * 4 +
+                                 (uintptr_t) src_x * bpp,
+                     (uintptr_t) src_stride * 4,
+                     twopass_memmove);
+    return 1;
+}
+
 static int
 overlapped_blt_neon(void     *self,
                    uint32_t *src_bits,
@@ -138,27 +217,31 @@ overlapped_blt_neon(void     *self,
                    int       width,
                    int       height)
 {
-    uint8_t *dst_bytes = (uint8_t *)dst_bits;
-    uint8_t *src_bytes = (uint8_t *)src_bits;
-    cpu_backend_t *ctx = (cpu_backend_t *)self;
-    int bpp = src_bpp >> 3;
-    int uncached_source = (src_bytes >= ctx->uncached_area_begin) &&
-                          (src_bytes < ctx->uncached_area_end);
-    if (!uncached_source)
-        return 0;
-
-    if (src_bpp != dst_bpp || src_bpp & 7 || src_stride < 0 || dst_stride < 0)
-        return 0;
+    return overlapped_blt(self, src_bits, dst_bits, src_stride, dst_stride,
+                          src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
+                          width, height,
+                          twopass_memmove_neon);
+}

-    twopass_blt_8bpp_neon((uintptr_t) width * bpp,
-                          height,
-                          dst_bytes + (uintptr_t) dst_y * dst_stride * 4 +
-                                      (uintptr_t) dst_x * bpp,
-                          (uintptr_t) dst_stride * 4,
-                          src_bytes + (uintptr_t) src_y * src_stride * 4 +
-                                      (uintptr_t) src_x * bpp,
-                          (uintptr_t) src_stride * 4);
-    return 1;
+static int
+overlapped_blt_vfp(void     *self,
+                   uint32_t *src_bits,
+                   uint32_t *dst_bits,
+                   int       src_stride,
+                   int       dst_stride,
+                   int       src_bpp,
+                   int       dst_bpp,
+                   int       src_x,
+                   int       src_y,
+                   int       dst_x,
+                   int       dst_y,
+                   int       width,
+                   int       height)
+{
+    return overlapped_blt(self, src_bits, dst_bits, src_stride, dst_stride, /* all arguments forwarded unchanged */
+                          src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
+                          width, height,
+                          twopass_memmove_vfp); /* selects the VFP-based copy routine */
 }

 #endif
@@ -198,9 +281,17 @@ cpu_backend_t *cpu_backend_init(uint8_t *uncached_buffer,
    ctx->cpuinfo = cpuinfo_init();

 #ifdef __arm__
-    if (ctx->cpuinfo->has_arm_neon) {
+    if (ctx->cpuinfo->has_arm_neon &&
+        ctx->cpuinfo->arm_implementer == 0x41 &&
+        ctx->cpuinfo->arm_part == 0xC08)
+    {
+        /* NEON works better on Cortex-A8 */
        ctx->blt2d.overlapped_blt = overlapped_blt_neon;
    }
+    else if (ctx->cpuinfo->has_arm_vfp && ctx->cpuinfo->has_arm_edsp) {
+        /* VFP works better on Cortex-A9, Cortex-A15 and maybe everything else */
+        ctx->blt2d.overlapped_blt = overlapped_blt_vfp;
+    }
 #endif

    return ctx;