/*
 * Copyright © 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#ifdef __arm__

.text
.syntax unified
.fpu neon
.arch armv7a
.object_arch armv4
.arm
.altmacro
.p2align 2

/******************************************************************************/
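
/* Helper macro: declare a global function symbol and emit its entry label */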

.macro asm_function function_name
    .global \function_name
    .func \function_name
\function_name:
.endm

/******************************************************************************/

/*
 * writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src)
 *
 * Copy a chunk of data from a cached scratch buffer (so prefetch is not
 * really needed) to a memory buffer, in the forward direction. Generated
 * from pixman macro templates.
 */
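
/*
 * A minimal usage sketch from C (hypothetical caller; 'framebuffer_ptr' and
 * the scratch buffer below are assumptions for illustration only):
 *
 *     extern void writeback_scratch_to_mem_neon(int numbytes, void *dst,
 *                                               void *src);
 *
 *     static unsigned char scratch[1024];  // cached scratch buffer (source)
 *     void *framebuffer_ptr;               // destination memory buffer
 *     ...
 *     writeback_scratch_to_mem_neon(sizeof(scratch), framebuffer_ptr, scratch);
 */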

asm_function writeback_scratch_to_mem_neon
	mov	ip, r1
	cmp	r0, #32
	blt	0f
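	/* handle the leading bytes until the destination becomes 16 byte aligned */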
	tst	ip, #15
	beq	1f
	tst	ip, #1
	beq	2f
	vld1.8	{d0[1]}, [r2]!
	add	ip, ip, #1
	sub	r0, r0, #1
2:	tst	ip, #2
	beq	3f
	vld1.8	{d0[2]}, [r2]!
	vld1.8	{d0[3]}, [r2]!
	add	ip, ip, #2
	sub	r0, r0, #2
3:	tst	ip, #4
	beq	4f
	vld1.8	{d0[4]}, [r2]!
	vld1.8	{d0[5]}, [r2]!
	vld1.8	{d0[6]}, [r2]!
	vld1.8	{d0[7]}, [r2]!
	add	ip, ip, #4
	sub	r0, r0, #4
4:	tst	ip, #8
	beq	5f
	vld1.8	{d1}, [r2]!
	add	ip, ip, #8
	sub	r0, r0, #8
5:	vld1.8	{d2-d3}, [r2]!
	add	ip, ip, #16
	sub	r0, r0, #16
	tst	r1, #1
	beq	6f
	vst1.8	{d0[1]}, [r1]!
6:	tst	r1, #2
	beq	7f
	vst1.8	{d0[2]}, [r1]!
	vst1.8	{d0[3]}, [r1]!
7:	tst	r1, #4
	beq	8f
	vst1.8	{d0[4]}, [r1]!
	vst1.8	{d0[5]}, [r1]!
	vst1.8	{d0[6]}, [r1]!
	vst1.8	{d0[7]}, [r1]!
8:	tst	r1, #8
	beq	9f
	vst1.8	{d1}, [r1, :64]!
9:	vst1.8	{d2-d3}, [r1, :128]!
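	/* main loop: destination is now 16 byte aligned, copy 32 bytes per iteration */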
1:	subs	r0, r0, #32
	blt	10f
	vld1.8	{d0-d3}, [r2]!
	subs	r0, r0, #32
	blt	11f
12:	vst1.8	{d0-d3}, [r1, :128]!
	vld1.8	{d0-d3}, [r2]!
	subs	r0, r0, #32
	bge	12b
11:	vst1.8	{d0-d3}, [r1, :128]!
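	/* copy the remaining trailing bytes (less than 32) with aligned stores */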
10:	tst	r0, #31
	beq	13f
	tst	r0, #16
	beq	14f
	vld1.8	{d2-d3}, [r2]!
14:	tst	r0, #8
	beq	15f
	vld1.8	{d1}, [r2]!
15:	tst	r0, #4
	beq	16f
	vld1.8	{d0[4]}, [r2]!
	vld1.8	{d0[5]}, [r2]!
	vld1.8	{d0[6]}, [r2]!
	vld1.8	{d0[7]}, [r2]!
16:	tst	r0, #2
	beq	17f
	vld1.8	{d0[2]}, [r2]!
	vld1.8	{d0[3]}, [r2]!
17:	tst	r0, #1
	beq	18f
	vld1.8	{d0[1]}, [r2]!
18:	tst	r0, #16
	beq	19f
	vst1.8	{d2-d3}, [r1, :128]!
19:	tst	r0, #8
	beq	20f
	vst1.8	{d1}, [r1, :64]!
20:	tst	r0, #4
	beq	21f
	vst1.8	{d0[4]}, [r1]!
	vst1.8	{d0[5]}, [r1]!
	vst1.8	{d0[6]}, [r1]!
	vst1.8	{d0[7]}, [r1]!
21:	tst	r0, #2
	beq	22f
	vst1.8	{d0[2]}, [r1]!
	vst1.8	{d0[3]}, [r1]!
22:	tst	r0, #1
	beq	13f
	vst1.8	{d0[1]}, [r1]!
13:	bx	lr
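	/* small copy (less than 32 bytes): stores are done without alignment hints */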
0:	tst	r0, #31
	beq	23f
	tst	r0, #16
	beq	24f
	vld1.8	{d2-d3}, [r2]!
24:	tst	r0, #8
	beq	25f
	vld1.8	{d1}, [r2]!
25:	tst	r0, #4
	beq	26f
	vld1.8	{d0[4]}, [r2]!
	vld1.8	{d0[5]}, [r2]!
	vld1.8	{d0[6]}, [r2]!
	vld1.8	{d0[7]}, [r2]!
26:	tst	r0, #2
	beq	27f
	vld1.8	{d0[2]}, [r2]!
	vld1.8	{d0[3]}, [r2]!
27:	tst	r0, #1
	beq	28f
	vld1.8	{d0[1]}, [r2]!
28:	tst	r0, #16
	beq	29f
	vst1.8	{d2-d3}, [r1]!
29:	tst	r0, #8
	beq	30f
	vst1.8	{d1}, [r1]!
30:	tst	r0, #4
	beq	31f
	vst1.8	{d0[4]}, [r1]!
	vst1.8	{d0[5]}, [r1]!
	vst1.8	{d0[6]}, [r1]!
	vst1.8	{d0[7]}, [r1]!
31:	tst	r0, #2
	beq	32f
	vst1.8	{d0[2]}, [r1]!
	vst1.8	{d0[3]}, [r1]!
32:	tst	r0, #1
	beq	23f
	vst1.8	{d0[1]}, [r1]!
23:	bx	lr
.endfunc

/******************************************************************************/

/*
 * aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem)
 *
 * Both the 'scratch' and 'fbmem' pointers must be 32 byte aligned.
 * The value in 'numbytes' is also rounded up to a multiple of 32 bytes.
 *
 * The only purpose of this code is to try to minimize the penalty incurred
 * by doing uncached reads from memory (for example a framebuffer). We are
 * trying to do the largest possible perfectly aligned reads to fetch the
 * data into a temporary scratch buffer in the L1 cache.
 */
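
/*
 * A minimal usage sketch from C (hypothetical caller; 'fbmem_ptr' and the
 * scratch buffer below are assumptions for illustration only, both 32 byte
 * aligned as required):
 *
 *     extern void aligned_fetch_fbmem_to_scratch_neon(int numbytes,
 *                                                     void *scratch,
 *                                                     void *fbmem);
 *
 *     static unsigned char scratch[1024] __attribute__((aligned(32)));
 *     void *fbmem_ptr;  // pointer into the (uncached) framebuffer
 *     ...
 *     aligned_fetch_fbmem_to_scratch_neon(sizeof(scratch), scratch, fbmem_ptr);
 */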

asm_function aligned_fetch_fbmem_to_scratch_neon
    SIZE        .req r0
    DST         .req r1
    SRC         .req r2

    subs        SIZE, #128
    blt         1f
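    /* main loop: copy 128 bytes per iteration */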
0:
    /* aligned load from the source (framebuffer) */
    vld1.64     {q0, q1}, [SRC, :256]!
    vld1.64     {q2, q3}, [SRC, :256]!
    vld1.64     {q8, q9}, [SRC, :256]!
    vld1.64     {q10, q11}, [SRC, :256]!
    /* fetch destination (scratch buffer) into L1 cache */
    ldr         r3, [DST]
    ldr         ip, [DST, #64]
    /* aligned store to the scratch buffer */
    vst1.64     {q0, q1}, [DST, :256]!
    vst1.64     {q2, q3}, [DST, :256]!
    vst1.64     {q8, q9}, [DST, :256]!
    vst1.64     {q10, q11}, [DST, :256]!
    subs        SIZE, SIZE, #128
    bge         0b
1:
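    /* copy a remaining 64 byte block, if any */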
    tst         SIZE, #64
    beq         1f
    vld1.64     {q0, q1}, [SRC, :256]!
    vld1.64     {q2, q3}, [SRC, :256]!
    ldr         r3, [DST]
    vst1.64     {q0, q1}, [DST, :256]!
    vst1.64     {q2, q3}, [DST, :256]!
1:
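    /* copy a remaining 32 byte block, if any */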
    tst         SIZE, #32
    beq         1f
    vld1.64     {q0, q1}, [SRC, :256]!
    vst1.64     {q0, q1}, [DST, :256]!
1:
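    /* 'numbytes' is rounded up to 32 bytes, so a smaller tail still copies a full block */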
    tst         SIZE, #31
    beq         1f
    vld1.64     {q0, q1}, [SRC, :256]!
    vst1.64     {q0, q1}, [DST, :256]!
1:
    bx          lr

    .unreq      SIZE
    .unreq      DST
    .unreq      SRC
.endfunc

#endif