Commit e3f2b1a9 authored by Alexei Fedorov

plat/arm: Introduce and use libc_asm.mk makefile

Trace analysis of the FVP_Base_AEMv8A 0.0/6063 model
running in AArch32 mode with the build options
listed below:
TRUSTED_BOARD_BOOT=1 GENERATE_COT=1
ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa
ROT_KEY=plat/arm/board/common/rotpk/arm_rotprivk_ecdsa.pem
shows that, when auth_signature() is called,
71.99% of CPU execution time is spent in the memset()
function, which is written in C using single-byte
write operations; see lib/libc/memset.c.
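For reference, that implementation is essentially a byte-at-a-time
loop along the following lines (a simplified sketch of the approach,
not the verbatim lib/libc/memset.c source):

    #include <stddef.h>

    void *memset(void *dst, int val, size_t count)
    {
        unsigned char *ptr = dst;

        /* One single-byte store per iteration: correct, but slow for
         * the large buffers cleared during signature authentication. */
        while (count--) {
            *ptr++ = (unsigned char)val;
        }

        return dst;
    }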
This patch introduces a new libc_asm.mk makefile which
replaces the C memset() implementation with an assembler
version, giving the following results:
- for AArch32, memset() CPU time in the auth_signature()
  call is reduced to 20.56%.
The number of CPU instructions (Inst) executed during the
TF-A boot stage before the start of BL33 in RELEASE builds
is presented in the tables below for the following memset()
variants:
- C TF-A: existing TF-A C code;
- C musl: C version from musl libc, a "lightweight" C
  "implementation of the standard library for Linux-based systems"
  https://git.musl-libc.org/cgit/musl/tree/src/string/memset.c
- Asm Opt: assembler version from the "Arm Optimized Routines" project
  https://github.com/ARM-software/optimized-routines/blob/master/string/arm/memset.S
- Asm Linux: assembler version from the Linux kernel
  https://github.com/torvalds/linux/blob/master/arch/arm/lib/memset.S
- Asm TF-A: assembler version from this patch

AArch32:
+-----------+------+------+--------------+----------+
| Variant   | Set  | Size |     Inst     |  Ratio   |
+-----------+------+------+--------------+----------+
| C TF-A    | T32  | 16   | 2122110003   | 1.000000 |
| C musl    | T32  | 156  | 1643917668   | 0.774662 |
| Asm Opt   | T32  | 84   | 1604810003   | 0.756233 |
| Asm Linux | A32  | 168  | 1566255018   | 0.738065 |
| Asm TF-A  | A32  | 160  | 1525865101   | 0.719032 |
+-----------+------+------+--------------+----------+

AArch64:
+-----------+------+------------+----------+
| Variant   | Size |    Inst    |  Ratio   |
+-----------+------+------------+----------+
| C TF-A    | 28   | 2732497518 | 1.000000 |
| C musl    | 212  | 1802999999 | 0.659836 |
| Asm TF-A  | 140  | 1680260003 | 0.614917 |
+-----------+------+------------+----------+

This patch modifies 'plat/arm/common/arm_common.mk'
by overriding the libc.mk makefile with libc_asm.mk and
does not affect other platforms.

Change-Id: Ie89dd0b74ba1079420733a0d76b7366ad0157c2e
Signed-off-by: Alexei Fedorov <Alexei.Fedorov@arm.com>
parent 29b76f2e
lib/libc/aarch32/memset.S:

/*
* Copyright (c) 2020, Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <asm_macros.S>
.syntax unified
.global memset
/* -----------------------------------------------------------------------
* void *memset(void *dst, int val, size_t count)
*
* Copy the value of 'val' (converted to an unsigned char) into
* each of the first 'count' characters of the object pointed to by 'dst'.
*
* Returns the value of 'dst'.
* -----------------------------------------------------------------------
*/
func memset
mov r12, r0 /* keep r0 */
tst r0, #3
beq aligned /* 4-bytes aligned */
/* Unaligned 'dst' */
unaligned:
subs r2, r2, #1
strbhs r1, [r12], #1
bxls lr /* return if 0 */
tst r12, #3
bne unaligned /* continue while unaligned */
/* 4-bytes aligned */
aligned: bfi r1, r1, #8, #8 /* propagate 'val' */
bfi r1, r1, #16, #16
mov r3, r1
cmp r2, #16
blo less_16 /* < 16 */
push {r4, lr}
mov r4, r1
mov lr, r1
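/* r1, r3, r4 and lr now all hold the replicated value, so each stmia below stores 16 bytes */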
write_32:
subs r2, r2, #32
stmiahs r12!, {r1, r3, r4, lr}
stmiahs r12!, {r1, r3, r4, lr}
bhi write_32 /* write 32 bytes in a loop */
popeq {r4, pc} /* return if 0 */
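/*
 * Fewer than 32 bytes remain: shift the residual count into the
 * C/N/Z flags and finish with conditional stores of 16/8/4/2/1
 * bytes instead of separate compare-and-branch sequences.
 */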
lsls r2, r2, #28 /* C = r2[4]; N = r2[3]; Z = r2[3:0] */
stmiacs r12!, {r1, r3, r4, lr} /* write 16 bytes */
popeq {r4, pc} /* return if 16 */
stmiami r12!, {r1, r3} /* write 8 bytes */
lsls r2, r2, #2 /* C = r2[2]; N = r2[1]; Z = r2[1:0] */
strcs r1, [r12], #4 /* write 4 bytes */
popeq {r4, pc} /* return if 8 or 4 */
strhmi r1, [r12], #2 /* write 2 bytes */
lsls r2, r2, #1 /* N = Z = r2[0] */
strbmi r1, [r12] /* write 1 byte */
pop {r4, pc}
less_16: lsls r2, r2, #29 /* C = r2[3]; N = r2[2]; Z = r2[2:0] */
stmiacs r12!, {r1, r3} /* write 8 bytes */
bxeq lr /* return if 8 */
strmi r1, [r12], #4 /* write 4 bytes */
lsls r2, r2, #2 /* C = r2[1]; N = Z = r2[0] */
strhcs r1, [r12], #2 /* write 2 bytes */
strbmi r1, [r12] /* write 1 byte */
bx lr
endfunc memset
lib/libc/aarch64/memset.S:

/*
* Copyright (c) 2020, Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <asm_macros.S>
.global memset
/* -----------------------------------------------------------------------
* void *memset(void *dst, int val, size_t count)
*
* Copy the value of 'val' (converted to an unsigned char) into
* each of the first 'count' characters of the object pointed to by 'dst'.
*
* Returns the value of 'dst'.
* -----------------------------------------------------------------------
*/
func memset
cbz x2, exit /* exit if 'count' = 0 */
mov x3, x0 /* keep x0 */
tst x0, #7
b.eq aligned /* 8-bytes aligned */
/* Unaligned 'dst' */
unaligned:
strb w1, [x3], #1
subs x2, x2, #1
b.eq exit /* exit if 0 */
tst x3, #7
b.ne unaligned /* continue while unaligned */
/* 8-bytes aligned */
aligned: cbz x1, x1_zero
bfi w1, w1, #8, #8 /* propagate 'val' */
bfi w1, w1, #16, #16
bfi x1, x1, #32, #32
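/* x1 now holds 'val' replicated into all 8 bytes; the bfi sequence above is skipped when 'val' is already zero */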
x1_zero: ands x4, x2, #~0x3f
b.eq less_64
write_64:
.rept 4
stp x1, x1, [x3], #16 /* write 64 bytes in a loop */
.endr
subs x4, x4, #64
b.ne write_64
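/*
 * Handle the tail: each tbz below tests one bit of the remaining
 * count and writes the matching 32/16/8/4/2/1 byte chunk.
 */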
less_64: tbz w2, #5, less_32 /* < 32 bytes */
stp x1, x1, [x3], #16 /* write 32 bytes */
stp x1, x1, [x3], #16
less_32: tbz w2, #4, less_16 /* < 16 bytes */
stp x1, x1, [x3], #16 /* write 16 bytes */
less_16: tbz w2, #3, less_8 /* < 8 bytes */
str x1, [x3], #8 /* write 8 bytes */
less_8: tbz w2, #2, less_4 /* < 4 bytes */
str w1, [x3], #4 /* write 4 bytes */
less_4: tbz w2, #1, less_2 /* < 2 bytes */
strh w1, [x3], #2 /* write 2 bytes */
less_2: tbz w2, #0, exit
strb w1, [x3] /* write 1 byte */
exit: ret
endfunc memset
lib/libc/libc_asm.mk:

#
# Copyright (c) 2020, Arm Limited. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
LIBC_SRCS := $(addprefix lib/libc/, \
abort.c \
assert.c \
exit.c \
memchr.c \
memcmp.c \
memcpy.c \
memmove.c \
memrchr.c \
printf.c \
putchar.c \
puts.c \
snprintf.c \
strchr.c \
strcmp.c \
strlcpy.c \
strlen.c \
strncmp.c \
strnlen.c \
strrchr.c)
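# memset.c is deliberately not listed here; the architecture-specific
# assembler implementation added below is used instead.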
ifeq (${ARCH},aarch64)
LIBC_SRCS += $(addprefix lib/libc/aarch64/, \
memset.S \
setjmp.S)
else
LIBC_SRCS += $(addprefix lib/libc/aarch32/, \
memset.S)
endif
INCLUDES += -Iinclude/lib/libc \
-Iinclude/lib/libc/$(ARCH) \
plat/arm/common/arm_common.mk:

@@ -121,6 +121,12 @@ endif
ENABLE_PSCI_STAT := 1
ENABLE_PMF := 1
# Override the standard libc with optimised libc_asm
OVERRIDE_LIBC := 1
ifeq (${OVERRIDE_LIBC},1)
include lib/libc/libc_asm.mk
endif
# On ARM platforms, separate the code and read-only data sections to allow
# mapping the former as executable and the latter as execute-never.
SEPARATE_CODE_AND_RODATA := 1