Boot Linux faster!

Check our new training course
and Creative Commons CC-BY-SA
lecture and lab materials

Bootlin logo

Elixir Cross Referencer

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
/*
 * Copyright (c) 2017 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <kernel_structs.h>
#include <arch/x86/asm.h>
#include <arch/cpu.h>
#include <offsets_short.h>
#include <syscall.h>

/* Exports */
GTEXT(z_x86_syscall_entry_stub)
GTEXT(z_x86_userspace_enter)
GTEXT(z_arch_user_string_nlen)
GTEXT(z_arch_user_string_nlen_fault_start)
GTEXT(z_arch_user_string_nlen_fault_end)
GTEXT(z_arch_user_string_nlen_fixup)

/* Imports */
GDATA(_k_syscall_table)

#ifdef CONFIG_X86_KPTI
/* Switch from the shadow to the kernel page table, switch to the interrupted
 * thread's kernel stack, and copy all context from the trampoline stack.
 *
 * With Kernel Page Table Isolation enabled, ring 3 runs on a shadow page
 * table and interrupts/exceptions initially land on a small trampoline
 * stack; this stub moves execution onto the real kernel mappings and the
 * thread's kernel stack before any sensitive data is placed on a stack.
 *
 * Assumes all registers are callee-saved since this gets called from other
 * ASM code. Assumes a particular stack layout which is correct for
 * _exception_enter and _interrupt_enter when invoked with a call instruction
 * (offsets relative to ESP on entry):
 *
 *  28 SS
 *  24 ESP
 *  20 EFLAGS
 *  16 CS
 *  12 EIP
 *  8  isr_param or exc code
 *  4  isr or exc handler
 *  0  return address
 */
SECTION_FUNC(TEXT, z_x86_trampoline_to_kernel)
	/* Check interrupted code segment to see if we came from ring 3
	 * and hence on the trampoline stack. The low 2 bits of CS hold the
	 * privilege level; nonzero means user mode.
	 */
	testb $3, 16(%esp) /* Offset of CS */
	jz 1f		   /* Came from ring 0: already on a kernel stack */

	/* Stash these regs as we need to use them */
	pushl	%esi
	pushl	%edi

	/* Switch to kernel page table (a CR3 load also flushes non-global
	 * TLB entries)
	 */
	movl	$z_x86_kernel_pdpt, %esi
	movl	%esi, %cr3

	/* Save old trampoline stack pointer in %edi */
	movl	%esp, %edi

	/* %esp = _kernel->current->stack_info.start
	 *
	 * This is the lowest address of the user mode stack, and highest
	 * address of the kernel stack; they are adjacent.
	 * We want to transplant context here.
	 */
	movl	$_kernel, %esi
	movl	_kernel_offset_to_current(%esi), %esi
	movl	_thread_offset_to_stack_start(%esi), %esp

	/* Transplant stack context and restore ESI/EDI. Take care to zero
	 * or put uninteresting values where we stashed ESI/EDI, since the
	 * trampoline page is insecure and there might be a context switch
	 * on the way out instead of returning to the original thread
	 * immediately.
	 *
	 * Offsets below are the entry-time offsets plus 8, accounting for
	 * the two pushes of ESI/EDI above.
	 */
	pushl	36(%edi)	/* SS */
	pushl	32(%edi)	/* ESP */
	pushl	28(%edi)	/* EFLAGS */
	pushl	24(%edi)	/* CS */
	pushl	20(%edi)	/* EIP */
	pushl	16(%edi)	/* error code or isr parameter */
	pushl	12(%edi)	/* exception/irq handler */
	pushl   8(%edi)		/* return address */
	movl	4(%edi), %esi	/* restore ESI */
	movl	$0, 4(%edi)	/* Zero old esi storage area */
	xchgl	%edi, (%edi)	/* Exchange old edi to restore it and put
				   old sp in the storage area */

	/* Trampoline stack should have nothing sensitive in it at this point */
1:
	ret

/* Copy interrupt return stack context to the trampoline stack, switch back
 * to the user page table, and only then 'iret'. We jump to this instead
 * of calling 'iret' if KPTI is turned on.
 *
 * Stack layout is expected to be as follows (offsets from ESP on entry),
 * i.e. an inter-privilege interrupt return frame:
 *
 * 16 SS
 * 12 ESP
 * 8 EFLAGS
 * 4 CS
 * 0 EIP
 *
 * This function is conditionally macroed to KPTI_IRET/KPTI_IRET_USER
 */
SECTION_FUNC(TEXT, z_x86_trampoline_to_user)
	/* Check interrupted code segment to see if we came from ring 3
	 * and hence on the trampoline stack. If returning to ring 0 no
	 * stack or page table switch is needed; 'iret' directly.
	 */
	testb $3, 4(%esp) /* Offset of CS */
	jz 1f

	/* Otherwise, fall through ... */

SECTION_FUNC(TEXT, z_x86_trampoline_to_user_always)
	/* Unconditional variant: migrate the iret frame to the trampoline
	 * stack and switch to the shadow page table before returning to
	 * user mode.
	 */
	/* Stash EDI, need a free register */
	pushl	%edi

	/* Store old stack pointer and switch to trampoline stack */
	movl	%esp, %edi
	movl	$z_trampoline_stack_end, %esp

	/* Lock IRQs until we get out, we don't want anyone else using the
	 * trampoline stack. The interrupt flag is restored when 'iret'
	 * reloads EFLAGS below.
	 */
	cli

	/* Copy context; offsets are the entry-time offsets plus 4,
	 * accounting for the EDI push above
	 */
	pushl	20(%edi)	/* SS */
	pushl	16(%edi)	/* ESP */
	pushl	12(%edi)	/* EFLAGS */
	pushl   8(%edi)		/* CS */
	pushl   4(%edi)		/* EIP */
	xchgl	%edi, (%edi)	/* Exchange old edi to restore it and put
				   trampoline stack address in its old storage
				   area */
	/* Switch to user page table. EAX is borrowed briefly since CR3
	 * can only be loaded from a register; its stashed copy is then
	 * scrubbed from the (insecure) trampoline stack.
	 */
	pushl	%eax
	movl	$z_x86_user_pdpt, %eax
	movl	%eax, %cr3
	popl	%eax
	movl	$0, -4(%esp)	/* Delete stashed EAX data */

	/* Trampoline stack should have nothing sensitive in it at this point */
1:
	iret
#endif /* CONFIG_X86_KPTI */

/* Landing site for syscall SW IRQ. Marshal arguments and call C function for
 * further processing. We're on the kernel stack for the invoking thread,
 * unless KPTI is enabled, in which case we're on the trampoline stack and
 * need to get off it before enabling interrupts.
 *
 * Register contract with the user-mode syscall trigger (as evidenced by
 * the marshalling below): ESI = call ID; EAX, EDX, ECX, EBX, EDI, EBP =
 * arguments 1-6. The handler's return value is left in EAX.
 */
SECTION_FUNC(TEXT, z_x86_syscall_entry_stub)
#ifdef CONFIG_X86_KPTI
	/* Stash these regs as we need to use them (ESI holds the syscall
	 * ID; it is restored before use further down)
	 */
	pushl	%esi
	pushl	%edi

	/* Switch to kernel page table */
	movl	$z_x86_kernel_pdpt, %esi
	movl	%esi, %cr3

	/* Save old trampoline stack pointer in %edi */
	movl	%esp, %edi

	/* %esp = _kernel->current->stack_info.start
	 *
	 * This is the lowest address of the user mode stack, and highest
	 * address of the kernel stack; they are adjacent.
	 * We want to transplant context here.
	 */
	movl	$_kernel, %esi
	movl	_kernel_offset_to_current(%esi), %esi
	movl	_thread_offset_to_stack_start(%esi), %esp

	/* Transplant context according to layout above. Variant of logic
	 * in z_x86_trampoline_to_kernel; a software-interrupt frame has no
	 * handler/parameter/return-address words, only the five iret words.
	 * Offsets are +8 for the two pushes above. */
	pushl	24(%edi)	/* SS */
	pushl	20(%edi)	/* ESP */
	pushl	16(%edi)	/* EFLAGS */
	pushl	12(%edi)	/* CS */
	pushl	8(%edi)		/* EIP */
	movl	4(%edi), %esi	/* restore ESI (syscall ID) */
	movl	$0, 4(%edi)	/* Zero old esi storage area */
	xchgl	%edi, (%edi)	/* Exchange old edi to restore it and put
				   old sp in the storage area */

	/* Trampoline stack should have nothing sensitive in it at this point */
#endif /* CONFIG_X86_KPTI */

	sti			/* re-enable interrupts */
	cld			/* clear direction flag, restored on 'iret' */

	/* call_id is in ESI. bounds-check it, must be less than
	 * K_SYSCALL_LIMIT. Unsigned 'jae' also rejects IDs that would be
	 * negative if interpreted as signed.
	 */
	cmp	$K_SYSCALL_LIMIT, %esi
	jae	_bad_syscall

_id_ok:
#ifdef CONFIG_BOUNDS_CHECK_BYPASS_MITIGATION
	/* Prevent speculation with bogus system call IDs (Spectre V1
	 * style): lfence keeps the table load below from executing
	 * speculatively ahead of the bounds check
	 */
	lfence
#endif
	/* Marshal arguments per calling convention to match what is expected
	 * for _k_syscall_handler_t functions. Under IAMCU the first three
	 * arguments are already in EAX/EDX/ECX per that ABI, so only
	 * args 4-6 and ssf go on the stack.
	 */
	push	%esp		/* ssf */
	push	%ebp		/* arg6 */
	push	%edi		/* arg5 */
	push	%ebx		/* arg4 */
#ifndef CONFIG_X86_IAMCU
	push	%ecx		/* arg3 */
	push	%edx		/* arg2	*/
	push	%eax		/* arg1 */
#endif

	/* from the call ID in ESI, load EBX with the actual function pointer
	 * to call by looking it up in the system call dispatch table.
	 * EDI is zeroed so the scaled-index address is exactly table[ESI].
	 */
	xor	%edi, %edi
	mov	_k_syscall_table(%edi, %esi, 4), %ebx

	/* Run the handler, which is some entry in _k_syscall_table.
	 * NOTE(review): INDIRECT_CALL is a macro from arch headers —
	 * presumably adds speculation hardening when configured; confirm
	 * in arch/x86/asm.h.
	 */
	INDIRECT_CALL(%ebx)

	/* EAX now contains return value. Pop or xor everything else to prevent
	 * information leak from kernel mode.
	 */
#ifndef CONFIG_X86_IAMCU
	pop	%edx		/* old arg1 value, discard it */
	pop	%edx
	pop	%ecx
#endif
	pop	%ebx
	pop	%edi
#ifndef CONFIG_X86_IAMCU
	/* Discard ssf and arg6 */
	add	$8, %esp
#else
	pop	%ecx		/* Clean ECX and get arg6 off the stack */
	pop	%edx		/* Clean EDX and get ssf off the stack */
#endif
	KPTI_IRET_USER		/* iret to user, via trampoline if KPTI */

_bad_syscall:
	/* ESI had a bogus syscall value in it, replace with the bad syscall
	 * handler's ID, and put the bad ID as its first argument.  This
	 * clobbers ESI but the bad syscall handler never returns
	 * anyway, it's going to generate a kernel oops
	 */
	mov	%esi, %eax
	mov	$K_SYSCALL_BAD, %esi
	jmp	_id_ok


/*
 * size_t z_arch_user_string_nlen(const char *s, size_t maxsize, int *err_arg)
 *
 * Measure the length of a possibly-unmapped user-supplied string, capped
 * at maxsize. On success, *err_arg is set to 0 and the length (excluding
 * the NUL terminator) is returned in EAX. The byte read below may page
 * fault; on that path *err_arg receives the initial -1 error value.
 *
 * NOTE(review): the fault_start/fault_end/fixup labels are exported so
 * that the page fault handler (elsewhere in the kernel) can recognize a
 * fault in this range and resume at _fixup — confirm against the x86
 * fault handling code.
 *
 * Standard cdecl frame: s at 8(%ebp), maxsize at 0xc(%ebp),
 * err_arg at 0x10(%ebp).
 */
SECTION_FUNC(TEXT, z_arch_user_string_nlen)
	push	%ebp
	mov	%esp, %ebp

	/* error value, set to -1 initially. This location is -4(%ebp) */
	push	$-1

	/* Do the strlen operation, based on disassembly of minimal libc */
	xor	%eax, %eax		/* EAX = 0, length count */
	mov	0x8(%ebp), %edx		/* EDX base of string */

	/* This code might page fault if the string is unmapped; the labels
	 * bracket exactly the instruction allowed to fault
	 */
strlen_loop:
z_arch_user_string_nlen_fault_start:
	cmpb	$0x0, (%edx, %eax, 1)	/* *(EDX + EAX) == 0? Could fault. */

z_arch_user_string_nlen_fault_end:
	je	strlen_done
	cmp	0xc(%ebp), %eax		/* Max length reached? */
	je	strlen_done
	inc	%eax			/* EAX++ and loop again */
	jmp	strlen_loop

strlen_done:
	/* Set error value to 0 since we succeeded */
	movl	$0, -4(%ebp)

z_arch_user_string_nlen_fixup:
	/* Write error value (0 on success, still -1 if we faulted) to the
	 * err pointer parameter; the pop also rebalances the stack
	 */
	movl	0x10(%ebp), %ecx
	pop	%edx
	movl	%edx, (%ecx)

	pop	%ebp
	ret


/* FUNC_NORETURN void z_x86_userspace_enter(k_thread_entry_t user_entry,
 *					   void *p1, void *p2, void *p3,
 *					   u32_t stack_end,
 *					   u32_t stack_start)
 *
 * A one-way trip to userspace: scrub the user stack, load user-mode data
 * segments, build a fake inter-privilege interrupt return frame, and
 * 'iret' into z_thread_entry() at ring 3. Never returns.
 */
SECTION_FUNC(TEXT, z_x86_userspace_enter)
	pop	%esi	/* Discard return address on stack; we never return */

	/* Fetch parameters on the stack. Under IAMCU the first three
	 * (user_entry, p1, p2) already arrived in EAX/EDX/ECX per that ABI.
	 */
#ifndef CONFIG_X86_IAMCU
	pop	%eax	/* user_entry */
	pop	%edx	/* p1 */
	pop	%ecx	/* p2 */
#endif
	pop	%esi	/* p3 */
	pop	%ebx	/* stack_end (high address) */
	pop	%edi	/* stack_start (low address) */

	/* Move to the kernel stack for this thread, so we can erase the
	 * user stack. The kernel stack is the page immediately before
	 * the user stack.
	 *
	 * For security reasons, we must erase the entire user stack.
	 * We don't know in what previous contexts it was used and do not
	 * want to leak any information.
	 */
	mov	%edi, %esp

	/* Stash some registers we are going to need to erase the user
	 * stack.
	 */
	push	%ecx
	push	%edi
	push	%eax

	/* Compute size of user stack in 4-byte chunks and put in ECX */
	mov	%ebx, %ecx
	sub	%edi, %ecx
	shr	$2, %ecx	/* Divide by 4 */

#ifdef CONFIG_INIT_STACKS
	mov	$0xAAAAAAAA, %eax	/* stack-usage sentinel fill pattern */
#else
	xor	%eax, %eax
#endif
	/* Copy 4 bytes of memory at a time, starting at ES:EDI, with whatever
	 * is in EAX. Repeat this ECX times.  Stack sizes are always at least
	 * 4-byte aligned. (Direction flag cleared so EDI counts upward.)
	 */
	cld
	rep stosl

	/* Restore registers */
	pop	%eax
	pop	%edi
	pop	%ecx

	/* Now set stack pointer to the base of the user stack. Now that this
	 * is set we won't need EBX any more.
	 */
	mov	%ebx, %esp

	/* Set segment registers (except CS and SS which are done in
	 * a special way by 'iret' below)
	 */
	mov	$USER_DATA_SEG, %bx
	mov	%bx, %ds
	mov	%bx, %es

	/* Push arguments to z_thread_entry(), right to left. Under IAMCU
	 * user_entry/p1/p2 are register-passed, so only p3 is pushed.
	 */
	push	%esi	/* p3 */
#ifndef CONFIG_X86_IAMCU
	push	%ecx	/* p2 */
	push	%edx	/* p1 */
	push	%eax	/* user_entry */
#endif
	/* NULL return address; z_thread_entry() must never return */
	push	$0

	/* Save stack pointer at this position, this is where it will be
	 * when we land in z_thread_entry()
	 */
	mov	%esp, %edi

	/* Inter-privilege 'iret' pops all of these. Need to fake an interrupt
	 * return to enter user mode as far calls cannot change privilege
	 * level
	 */
	push	$USER_DATA_SEG	/* SS */
	push	%edi		/* ESP */
	pushfl			/* EFLAGS */
	push	$USER_CODE_SEG	/* CS */
	push	$z_thread_entry	/* EIP */

#ifdef CONFIG_EXECUTION_BENCHMARKING
	/* Record the drop-to-usermode timestamp. Save EAX/EDX around
	 * 'rdtsc', which clobbers both with the 64-bit counter value.
	 */
	push %eax
	push %edx
	rdtsc
	mov %eax,__end_drop_to_usermode_time
	mov %edx,__end_drop_to_usermode_time+4
	pop %edx
	pop %eax
#endif

	/* We will land in z_thread_entry() in user mode after this */
	KPTI_IRET_USER