/* |
|
* Copyright (c) 2011-2014 Wind River Systems, Inc. |
|
* Copyright (c) 2017-2020 Intel Corporation |
|
* |
|
* SPDX-License-Identifier: Apache-2.0 |
|
*/ |
|
|
|
#include <zephyr/kernel.h> |
|
#include <zephyr/arch/x86/mmustructs.h> |
|
#include <zephyr/sys/mem_manage.h> |
|
#include <zephyr/sys/__assert.h> |
|
#include <zephyr/sys/check.h> |
|
#include <zephyr/logging/log.h> |
|
#include <errno.h> |
|
#include <ctype.h> |
|
#include <zephyr/spinlock.h> |
|
#include <kernel_arch_func.h> |
|
#include <x86_mmu.h> |
|
#include <zephyr/init.h> |
|
#include <kernel_internal.h> |
|
#include <mmu.h> |
|
#include <zephyr/drivers/interrupt_controller/loapic.h> |
|
#include <zephyr/arch/x86/memmap.h> |
|
|
|
LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL); |
|
|
|
/* We will use some ignored bits in the PTE to back up permission settings
|
* when the mapping was made. This is used to un-apply memory domain memory |
|
* partitions to page tables when the partitions are removed. |
|
*/ |
|
#define MMU_RW_ORIG MMU_IGNORED0 |
|
#define MMU_US_ORIG MMU_IGNORED1 |
|
#define MMU_XD_ORIG MMU_IGNORED2 |
|
|
|
/* Bits in the PTE that form the set of permission bits, when resetting */ |
|
#define MASK_PERM (MMU_RW | MMU_US | MMU_XD) |
|
|
|
/* When we want to set up a new mapping, discarding any previous state */ |
|
#define MASK_ALL (~((pentry_t)0U)) |
|
|
|
/* Bits to set at mapping time for particular permissions. We set the actual |
|
 * page table bit affecting the policy and also the backup bit.
|
*/ |
|
#define ENTRY_RW (MMU_RW | MMU_RW_ORIG) |
|
#define ENTRY_US (MMU_US | MMU_US_ORIG) |
|
#define ENTRY_XD (MMU_XD | MMU_XD_ORIG) |
|
|
|
/* Bit position which is always zero in a PTE. We'll use the PAT bit. |
|
* This helps disambiguate PTEs that do not have the Present bit set (MMU_P): |
|
* - If the entire entry is zero, it's an un-mapped virtual page |
|
* - If PTE_ZERO is set, we flipped this page due to KPTI |
|
* - Otherwise, this was a page-out |
|
*/ |
|
#define PTE_ZERO MMU_PAT |
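
/* (Clarifying note: the kernel never sets MMU_PAT in the mappings it creates,
 * so any valid present PTE has this bit clear; complementing such a PTE for
 * KPTI therefore sets it, which is exactly what is_flipped_pte() below
 * checks for.)
 */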
|
|
|
/* Protects x86_domain_list and serializes instantiation of intermediate |
|
* paging structures. |
|
*/ |
|
__pinned_bss |
|
static struct k_spinlock x86_mmu_lock; |
|
|
|
#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE) |
|
/* List of all active and initialized memory domains. This is used to make |
|
* sure all memory mappings are the same across all page tables when invoking |
|
* range_map() |
|
*/ |
|
__pinned_bss |
|
static sys_slist_t x86_domain_list; |
|
#endif |
|
|
|
/* |
|
* Definitions for building an ontology of paging levels and capabilities |
|
* at each level |
|
*/ |
|
|
|
/* Data structure describing the characteristics of a particular paging |
|
* level |
|
*/ |
|
struct paging_level { |
|
/* What bits are used to store physical address */ |
|
pentry_t mask; |
|
|
|
/* Number of entries in this paging structure */ |
|
size_t entries; |
|
|
|
/* How many bits to right-shift a virtual address to obtain the |
|
* appropriate entry within this table. |
|
* |
|
* The memory scope of each entry in this table is 1 << shift. |
|
*/ |
|
unsigned int shift; |
|
#ifdef CONFIG_EXCEPTION_DEBUG |
|
/* Name of this level, for debug purposes */ |
|
const char *name; |
|
#endif |
|
}; |
|
|
|
/* Flags for all entries in intermediate paging levels. |
|
* Fortunately, the same bits are set for all intermediate levels for all |
|
* three paging modes. |
|
* |
|
* Obviously P is set. |
|
* |
|
 * We want the RW and US bits always set; actual access control will be
|
* done at the leaf level. |
|
* |
|
* XD (if supported) always 0. Disabling execution done at leaf level. |
|
* |
|
* PCD/PWT always 0. Caching properties again done at leaf level. |
|
*/ |
|
#define INT_FLAGS (MMU_P | MMU_RW | MMU_US) |
|
|
|
/* Paging level ontology for the selected paging mode. |
|
* |
|
* See Figures 4-4, 4-7, 4-11 in the Intel SDM, vol 3A |
|
*/ |
|
__pinned_rodata |
|
static const struct paging_level paging_levels[] = { |
|
#ifdef CONFIG_X86_64 |
|
/* Page Map Level 4 */ |
|
{ |
|
.mask = 0x7FFFFFFFFFFFF000ULL, |
|
.entries = 512U, |
|
.shift = 39U, |
|
#ifdef CONFIG_EXCEPTION_DEBUG |
|
.name = "PML4" |
|
#endif |
|
}, |
|
#endif /* CONFIG_X86_64 */ |
|
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
|
/* Page Directory Pointer Table */ |
|
{ |
|
.mask = 0x7FFFFFFFFFFFF000ULL, |
|
#ifdef CONFIG_X86_64 |
|
.entries = 512U, |
|
#else |
|
/* PAE version */ |
|
.entries = 4U, |
|
#endif |
|
.shift = 30U, |
|
#ifdef CONFIG_EXCEPTION_DEBUG |
|
.name = "PDPT" |
|
#endif |
|
}, |
|
#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */ |
|
/* Page Directory */ |
|
{ |
|
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
|
.mask = 0x7FFFFFFFFFFFF000ULL, |
|
.entries = 512U, |
|
.shift = 21U, |
|
#else |
|
/* 32-bit */ |
|
.mask = 0xFFFFF000U, |
|
.entries = 1024U, |
|
.shift = 22U, |
|
#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */ |
|
#ifdef CONFIG_EXCEPTION_DEBUG |
|
.name = "PD" |
|
#endif |
|
}, |
|
/* Page Table */ |
|
{ |
|
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
|
.mask = 0x07FFFFFFFFFFF000ULL, |
|
.entries = 512U, |
|
.shift = 12U, |
|
#else |
|
/* 32-bit */ |
|
.mask = 0xFFFFF000U, |
|
.entries = 1024U, |
|
.shift = 12U, |
|
#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */ |
|
#ifdef CONFIG_EXCEPTION_DEBUG |
|
.name = "PT" |
|
#endif |
|
} |
|
}; |
|
|
|
#define NUM_LEVELS ARRAY_SIZE(paging_levels) |
|
#define PTE_LEVEL (NUM_LEVELS - 1) |
|
#define PDE_LEVEL (NUM_LEVELS - 2) |
|
|
|
/* |
|
* Macros for reserving space for page tables |
|
* |
|
* We need to reserve a block of memory equal in size to the page tables |
|
* generated by gen_mmu.py so that memory addresses do not shift between |
|
* build phases. These macros ultimately specify INITIAL_PAGETABLE_SIZE. |
|
*/ |
|
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
|
#ifdef CONFIG_X86_64 |
|
#define NUM_PML4_ENTRIES 512U |
|
#define NUM_PDPT_ENTRIES 512U |
|
#else |
|
#define NUM_PDPT_ENTRIES 4U |
|
#endif /* CONFIG_X86_64 */ |
|
#define NUM_PD_ENTRIES 512U |
|
#define NUM_PT_ENTRIES 512U |
|
#else |
|
#define NUM_PD_ENTRIES 1024U |
|
#define NUM_PT_ENTRIES 1024U |
|
#endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */ |
|
|
|
/* Memory range covered by an instance of various table types */ |
|
#define PT_AREA ((uintptr_t)(CONFIG_MMU_PAGE_SIZE * NUM_PT_ENTRIES)) |
|
#define PD_AREA (PT_AREA * NUM_PD_ENTRIES) |
|
#ifdef CONFIG_X86_64 |
|
#define PDPT_AREA (PD_AREA * NUM_PDPT_ENTRIES) |
|
#endif |
|
|
|
#define VM_ADDR CONFIG_KERNEL_VM_BASE |
|
#define VM_SIZE CONFIG_KERNEL_VM_SIZE |
|
|
|
/* Define a range [PT_START, PT_END) which is the memory range |
|
* covered by all the page tables needed for the address space |
|
*/ |
|
#define PT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PT_AREA)) |
|
#define PT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PT_AREA)) |
|
|
|
/* Number of page tables needed to cover address space. Depends on the specific |
|
* bounds, but roughly 1 page table per 2MB of RAM |
|
*/ |
|
#define NUM_PT ((PT_END - PT_START) / PT_AREA) |
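
/* Worked example (illustrative only): with 4 KiB pages and 512-entry page
 * tables (64-bit or PAE), PT_AREA is 512 * 4 KiB = 2 MiB. A 2 MiB-aligned
 * 128 MiB kernel VM region would then need 128 MiB / 2 MiB = 64 page tables,
 * with the ROUND_DOWN/ROUND_UP above adding at most one more table when the
 * bounds are not PT_AREA-aligned.
 */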
|
|
|
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
|
/* Same semantics as above, but for the page directories needed to cover |
|
* system RAM. |
|
*/ |
|
#define PD_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PD_AREA)) |
|
#define PD_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PD_AREA)) |
|
/* Number of page directories needed to cover the address space. Depends on the |
|
* specific bounds, but roughly 1 page directory per 1GB of RAM |
|
*/ |
|
#define NUM_PD ((PD_END - PD_START) / PD_AREA) |
|
#else |
|
/* 32-bit page tables just have one toplevel page directory */ |
|
#define NUM_PD 1 |
|
#endif |
|
|
|
#ifdef CONFIG_X86_64 |
|
/* Same semantics as above, but for the page directory pointer tables needed |
|
* to cover the address space. On 32-bit there is just one 4-entry PDPT. |
|
*/ |
|
#define PDPT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PDPT_AREA)) |
|
#define PDPT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PDPT_AREA)) |
|
/* Number of PDPTs needed to cover the address space. 1 PDPT per 512GB of VM */ |
|
#define NUM_PDPT ((PDPT_END - PDPT_START) / PDPT_AREA) |
|
|
|
/* All pages needed for page tables, using computed values plus one more for |
|
* the top-level PML4 |
|
*/ |
|
#define NUM_TABLE_PAGES (NUM_PT + NUM_PD + NUM_PDPT + 1) |
|
#else /* !CONFIG_X86_64 */ |
|
/* All pages needed for page tables: the page tables themselves plus the
 * top-level page directory (already counted by NUM_PD)
 */
|
#define NUM_TABLE_PAGES (NUM_PT + NUM_PD) |
|
#endif /* CONFIG_X86_64 */ |
|
|
|
#define INITIAL_PTABLE_PAGES \ |
|
(NUM_TABLE_PAGES + CONFIG_X86_EXTRA_PAGE_TABLE_PAGES) |
|
|
|
#ifdef CONFIG_X86_PAE |
|
/* The top-level PDPT is not included above since it is not a full page in size */
|
#define INITIAL_PTABLE_SIZE \ |
|
((INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) + 0x20) |
|
#else |
|
#define INITIAL_PTABLE_SIZE \ |
|
(INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) |
|
#endif |
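
/* Sizing note (illustrative): the extra 0x20 above is the 4-entry PAE PDPT
 * itself (4 entries * sizeof(pentry_t) = 4 * 8 = 32 bytes), which is too
 * small to be counted as one of the page-sized tables in
 * INITIAL_PTABLE_PAGES.
 */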
|
|
|
/* "dummy" pagetables for the first-phase build. The real page tables |
|
 * are produced by gen_mmu.py based on data read from zephyr-prebuilt.elf,
|
* and this dummy array is discarded. |
|
*/ |
|
Z_GENERIC_SECTION(.dummy_pagetables) |
|
static __used char dummy_pagetables[INITIAL_PTABLE_SIZE]; |
|
|
|
/* |
|
* Utility functions |
|
*/ |
|
|
|
/* For a table at a particular level, get the entry index that corresponds to |
|
* the provided virtual address |
|
*/ |
|
__pinned_func |
|
static inline int get_index(void *virt, int level) |
|
{ |
|
return (((uintptr_t)virt >> paging_levels[level].shift) % |
|
paging_levels[level].entries); |
|
} |
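
/* Example (illustrative, 32-bit non-PAE paging): for virt = (void *)0x12345678,
 * get_index(virt, PDE_LEVEL) = 0x12345678 >> 22 = 0x48 (the page directory
 * slot) and get_index(virt, PTE_LEVEL) = (0x12345678 >> 12) % 1024 = 0x345
 * (the page table slot). The 64-bit and PAE levels work the same way through
 * their paging_levels[] shift/entries values.
 */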
|
|
|
__pinned_func |
|
static inline pentry_t *get_entry_ptr(pentry_t *ptables, void *virt, int level) |
|
{ |
|
return &ptables[get_index(virt, level)]; |
|
} |
|
|
|
__pinned_func |
|
static inline pentry_t get_entry(pentry_t *ptables, void *virt, int level) |
|
{ |
|
return ptables[get_index(virt, level)]; |
|
} |
|
|
|
/* Get the physical memory address associated with this table entry */ |
|
__pinned_func |
|
static inline uintptr_t get_entry_phys(pentry_t entry, int level) |
|
{ |
|
return entry & paging_levels[level].mask; |
|
} |
|
|
|
/* Return the virtual address of a linked table stored in the provided entry */ |
|
__pinned_func |
|
static inline pentry_t *next_table(pentry_t entry, int level) |
|
{ |
|
return z_mem_virt_addr(get_entry_phys(entry, level)); |
|
} |
|
|
|
/* Number of table entries at this level */ |
|
__pinned_func |
|
static inline size_t get_num_entries(int level) |
|
{ |
|
return paging_levels[level].entries; |
|
} |
|
|
|
/* 4K for everything except PAE PDPTs */ |
|
__pinned_func |
|
static inline size_t table_size(int level) |
|
{ |
|
return get_num_entries(level) * sizeof(pentry_t); |
|
} |
|
|
|
/* For a table at a particular level, the size of the virtual memory region
 * that an entry within the table covers
|
*/ |
|
__pinned_func |
|
static inline size_t get_entry_scope(int level) |
|
{ |
|
return (1UL << paging_levels[level].shift); |
|
} |
|
|
|
/* For a table at a particular level, the size of the virtual memory region
 * that this entire table covers
|
*/ |
|
__pinned_func |
|
static inline size_t get_table_scope(int level) |
|
{ |
|
return get_entry_scope(level) * get_num_entries(level); |
|
} |
|
|
|
/* Must have checked Present bit first! Non-present entries may have OS data |
|
* stored in any other bits |
|
*/ |
|
__pinned_func |
|
static inline bool is_leaf(int level, pentry_t entry) |
|
{ |
|
if (level == PTE_LEVEL) { |
|
/* Always true for PTE */ |
|
return true; |
|
} |
|
|
|
return ((entry & MMU_PS) != 0U); |
|
} |
|
|
|
/* This does NOT (by design) un-flip KPTI PTEs, it's just the raw PTE value */ |
|
__pinned_func |
|
static inline void pentry_get(int *paging_level, pentry_t *val, |
|
pentry_t *ptables, void *virt) |
|
{ |
|
pentry_t *table = ptables; |
|
|
|
for (int level = 0; level < NUM_LEVELS; level++) { |
|
pentry_t entry = get_entry(table, virt, level); |
|
|
|
if ((entry & MMU_P) == 0 || is_leaf(level, entry)) { |
|
*val = entry; |
|
if (paging_level != NULL) { |
|
*paging_level = level; |
|
} |
|
break; |
|
} else { |
|
table = next_table(entry, level); |
|
} |
|
} |
|
} |
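
/* Hedged usage sketch: callers that just want to inspect a mapping do
 *
 *	pentry_t entry;
 *	int level;
 *
 *	pentry_get(&level, &entry, ptables, addr);
 *	if ((entry & MMU_P) != 0) {
 *		... leaf mapping found at paging_levels[level] ...
 *	}
 *
 * which is how z_x86_dump_mmu_flags() reports the state of a faulting
 * address when CONFIG_EXCEPTION_DEBUG is enabled.
 */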
|
|
|
__pinned_func |
|
static inline void tlb_flush_page(void *addr) |
|
{ |
|
/* Invalidate TLB entries corresponding to the page containing the |
|
* specified address |
|
*/ |
|
char *page = (char *)addr; |
|
|
|
__asm__ ("invlpg %0" :: "m" (*page)); |
|
} |
|
|
|
#ifdef CONFIG_X86_KPTI |
|
__pinned_func |
|
static inline bool is_flipped_pte(pentry_t pte) |
|
{ |
|
return (pte & MMU_P) == 0 && (pte & PTE_ZERO) != 0; |
|
} |
|
#endif |
|
|
|
#if defined(CONFIG_SMP) |
|
__pinned_func |
|
void z_x86_tlb_ipi(const void *arg) |
|
{ |
|
uintptr_t ptables_phys; |
|
|
|
ARG_UNUSED(arg); |
|
|
|
#ifdef CONFIG_X86_KPTI |
|
/* We're always on the kernel's set of page tables in this context |
|
* if KPTI is turned on |
|
*/ |
|
ptables_phys = z_x86_cr3_get(); |
|
__ASSERT(ptables_phys == z_mem_phys_addr(&z_x86_kernel_ptables), ""); |
|
#else |
|
/* We might have been moved to another memory domain, so always invoke |
|
* z_x86_thread_page_tables_get() instead of using current CR3 value. |
|
*/ |
|
ptables_phys = z_mem_phys_addr(z_x86_thread_page_tables_get(_current)); |
|
#endif |
|
/* |
|
* In the future, we can consider making this smarter, such as |
|
* propagating which page tables were modified (in case they are |
|
* not active on this CPU) or an address range to call |
|
* tlb_flush_page() on. |
|
*/ |
|
LOG_DBG("%s on CPU %d\n", __func__, arch_curr_cpu()->id); |
|
|
|
z_x86_cr3_set(ptables_phys); |
|
} |
|
|
|
/* NOTE: This is not synchronous and the actual flush takes place some short |
|
* time after this exits. |
|
*/ |
|
__pinned_func |
|
static inline void tlb_shootdown(void) |
|
{ |
|
z_loapic_ipi(0, LOAPIC_ICR_IPI_OTHERS, CONFIG_TLB_IPI_VECTOR); |
|
} |
|
#endif /* CONFIG_SMP */ |
|
|
|
__pinned_func |
|
static inline void assert_addr_aligned(uintptr_t addr) |
|
{ |
|
#if __ASSERT_ON |
|
__ASSERT((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U, |
|
"unaligned address 0x%" PRIxPTR, addr); |
|
#endif |
|
} |
|
|
|
__pinned_func |
|
static inline bool is_addr_aligned(uintptr_t addr) |
|
{ |
|
if ((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) { |
|
return true; |
|
} else { |
|
return false; |
|
} |
|
} |
|
|
|
__pinned_func |
|
static inline void assert_virt_addr_aligned(void *addr) |
|
{ |
|
assert_addr_aligned((uintptr_t)addr); |
|
} |
|
|
|
__pinned_func |
|
static inline bool is_virt_addr_aligned(void *addr) |
|
{ |
|
return is_addr_aligned((uintptr_t)addr); |
|
} |
|
|
|
__pinned_func |
|
static inline void assert_size_aligned(size_t size) |
|
{ |
|
#if __ASSERT_ON |
|
__ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U, |
|
"unaligned size %zu", size); |
|
#endif |
|
} |
|
|
|
__pinned_func |
|
static inline bool is_size_aligned(size_t size) |
|
{ |
|
if ((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) { |
|
return true; |
|
} else { |
|
return false; |
|
} |
|
} |
|
|
|
__pinned_func |
|
static inline void assert_region_page_aligned(void *addr, size_t size) |
|
{ |
|
assert_virt_addr_aligned(addr); |
|
assert_size_aligned(size); |
|
} |
|
|
|
__pinned_func |
|
static inline bool is_region_page_aligned(void *addr, size_t size) |
|
{ |
|
if (!is_virt_addr_aligned(addr)) { |
|
return false; |
|
} |
|
|
|
return is_size_aligned(size); |
|
} |
|
|
|
/* |
|
* Debug functions. All conditionally compiled with CONFIG_EXCEPTION_DEBUG. |
|
*/ |
|
#ifdef CONFIG_EXCEPTION_DEBUG |
|
|
|
/* Add colors to page table dumps to indicate mapping type */ |
|
#define COLOR_PAGE_TABLES 1 |
|
|
|
#if COLOR_PAGE_TABLES |
|
#define ANSI_DEFAULT "\x1B" "[0m" |
|
#define ANSI_RED "\x1B" "[1;31m" |
|
#define ANSI_GREEN "\x1B" "[1;32m" |
|
#define ANSI_YELLOW "\x1B" "[1;33m" |
|
#define ANSI_BLUE "\x1B" "[1;34m" |
|
#define ANSI_MAGENTA "\x1B" "[1;35m" |
|
#define ANSI_CYAN "\x1B" "[1;36m" |
|
#define ANSI_GREY "\x1B" "[1;90m" |
|
|
|
#define COLOR(x) printk(_CONCAT(ANSI_, x)) |
|
#else |
|
#define COLOR(x) do { } while (false) |
|
#endif |
|
|
|
__pinned_func |
|
static char get_entry_code(pentry_t value) |
|
{ |
|
char ret; |
|
|
|
if (value == 0U) { |
|
/* Unmapped entry */ |
|
ret = '.'; |
|
} else { |
|
if ((value & MMU_RW) != 0U) { |
|
/* Writable page */ |
|
if ((value & MMU_XD) != 0U) { |
|
/* RW */ |
|
ret = 'w'; |
|
} else { |
|
/* RWX */ |
|
ret = 'a'; |
|
} |
|
} else { |
|
if ((value & MMU_XD) != 0U) { |
|
/* R */ |
|
ret = 'r'; |
|
} else { |
|
/* RX */ |
|
ret = 'x'; |
|
} |
|
} |
|
|
|
if ((value & MMU_US) != 0U) { |
|
/* Uppercase indicates user mode access */ |
|
ret = toupper((unsigned char)ret); |
|
} |
|
} |
|
|
|
return ret; |
|
} |
|
|
|
__pinned_func |
|
static void print_entries(pentry_t entries_array[], uint8_t *base, int level, |
|
size_t count) |
|
{ |
|
int column = 0; |
|
|
|
for (int i = 0; i < count; i++) { |
|
pentry_t entry = entries_array[i]; |
|
|
|
uintptr_t phys = get_entry_phys(entry, level); |
|
uintptr_t virt = |
|
(uintptr_t)base + (get_entry_scope(level) * i); |
|
|
|
if ((entry & MMU_P) != 0U) { |
|
if (is_leaf(level, entry)) { |
|
if (phys == virt) { |
|
/* Identity mappings */ |
|
COLOR(YELLOW); |
|
} else if (phys + Z_MEM_VM_OFFSET == virt) { |
|
/* Permanent RAM mappings */ |
|
COLOR(GREEN); |
|
} else { |
|
/* General mapped pages */ |
|
COLOR(CYAN); |
|
} |
|
} else { |
|
/* Intermediate entry */ |
|
COLOR(MAGENTA); |
|
} |
|
} else { |
|
if (is_leaf(level, entry)) { |
|
if (entry == 0U) { |
|
/* Unmapped */ |
|
COLOR(GREY); |
|
#ifdef CONFIG_X86_KPTI |
|
} else if (is_flipped_pte(entry)) { |
|
/* KPTI, un-flip it */ |
|
COLOR(BLUE); |
|
entry = ~entry; |
|
phys = get_entry_phys(entry, level); |
|
if (phys == virt) { |
|
/* Identity mapped */ |
|
COLOR(CYAN); |
|
} else { |
|
/* Non-identity mapped */ |
|
COLOR(BLUE); |
|
} |
|
#endif |
|
} else { |
|
/* Paged out */ |
|
COLOR(RED); |
|
} |
|
} else { |
|
/* Un-mapped intermediate entry */ |
|
COLOR(GREY); |
|
} |
|
} |
|
|
|
printk("%c", get_entry_code(entry)); |
|
|
|
column++; |
|
if (column == 64) { |
|
column = 0; |
|
printk("\n"); |
|
} |
|
} |
|
COLOR(DEFAULT); |
|
|
|
if (column != 0) { |
|
printk("\n"); |
|
} |
|
} |
|
|
|
__pinned_func |
|
static void dump_ptables(pentry_t *table, uint8_t *base, int level) |
|
{ |
|
const struct paging_level *info = &paging_levels[level]; |
|
|
|
#ifdef CONFIG_X86_64 |
|
/* Account for the virtual memory "hole" with sign-extension */ |
|
if (((uintptr_t)base & BITL(47)) != 0) { |
|
base = (uint8_t *)((uintptr_t)base | (0xFFFFULL << 48)); |
|
} |
|
#endif |
|
|
|
printk("%s at %p (0x%" PRIxPTR "): ", info->name, table, |
|
z_mem_phys_addr(table)); |
|
if (level == 0) { |
|
printk("entire address space\n"); |
|
} else { |
|
printk("for %p - %p\n", base, |
|
base + get_table_scope(level) - 1); |
|
} |
|
|
|
print_entries(table, base, level, info->entries); |
|
|
|
/* Check if we're a page table */ |
|
if (level == PTE_LEVEL) { |
|
return; |
|
} |
|
|
|
/* Dump all linked child tables */ |
|
for (int j = 0; j < info->entries; j++) { |
|
pentry_t entry = table[j]; |
|
pentry_t *next; |
|
|
|
if ((entry & MMU_P) == 0U || |
|
(entry & MMU_PS) != 0U) { |
|
/* Not present or big page, skip */ |
|
continue; |
|
} |
|
|
|
next = next_table(entry, level); |
|
dump_ptables(next, base + (j * get_entry_scope(level)), |
|
level + 1); |
|
} |
|
} |
|
|
|
__pinned_func |
|
void z_x86_dump_page_tables(pentry_t *ptables) |
|
{ |
|
dump_ptables(ptables, NULL, 0); |
|
} |
|
|
|
/* Enable to dump out the kernel's page table right before main() starts, |
|
* sometimes useful for deep debugging. May overwhelm twister. |
|
*/ |
|
#define DUMP_PAGE_TABLES 0 |
|
|
|
#if DUMP_PAGE_TABLES |
|
__pinned_func |
|
static int dump_kernel_tables(const struct device *unused) |
|
{ |
|
z_x86_dump_page_tables(z_x86_kernel_ptables); |
|
|
|
return 0; |
|
} |
|
|
|
SYS_INIT(dump_kernel_tables, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT); |
|
#endif |
|
|
|
__pinned_func |
|
static void str_append(char **buf, size_t *size, const char *str) |
|
{ |
|
int ret = snprintk(*buf, *size, "%s", str); |
|
|
|
if (ret >= *size) { |
|
/* Truncated */ |
|
*size = 0U; |
|
} else { |
|
*size -= ret; |
|
*buf += ret; |
|
} |
|
|
|
} |
|
|
|
__pinned_func |
|
static void dump_entry(int level, void *virt, pentry_t entry) |
|
{ |
|
const struct paging_level *info = &paging_levels[level]; |
|
char buf[24] = { 0 }; |
|
char *pos = buf; |
|
size_t sz = sizeof(buf); |
|
uint8_t *virtmap = (uint8_t *)ROUND_DOWN(virt, get_entry_scope(level)); |
|
|
|
#define DUMP_BIT(bit) do { \ |
|
if ((entry & MMU_##bit) != 0U) { \ |
|
str_append(&pos, &sz, #bit " "); \ |
|
} \ |
|
} while (false) |
|
|
|
DUMP_BIT(RW); |
|
DUMP_BIT(US); |
|
DUMP_BIT(PWT); |
|
DUMP_BIT(PCD); |
|
DUMP_BIT(A); |
|
DUMP_BIT(D); |
|
DUMP_BIT(G); |
|
DUMP_BIT(XD); |
|
|
|
LOG_ERR("%sE: %p -> " PRI_ENTRY ": %s", info->name, |
|
virtmap, entry & info->mask, buf); |
|
|
|
#undef DUMP_BIT |
|
} |
|
|
|
__pinned_func |
|
void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables, |
|
void *virt) |
|
{ |
|
pentry_get(paging_level, val, ptables, virt); |
|
} |
|
|
|
/* |
|
* Debug function for dumping out MMU table information to the LOG for a |
|
* specific virtual address, such as when we get an unexpected page fault. |
|
*/ |
|
__pinned_func |
|
void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt) |
|
{ |
|
pentry_t entry = 0; |
|
int level = 0; |
|
|
|
pentry_get(&level, &entry, ptables, virt); |
|
|
|
if ((entry & MMU_P) == 0) { |
|
LOG_ERR("%sE: not present", paging_levels[level].name); |
|
} else { |
|
dump_entry(level, virt, entry); |
|
} |
|
} |
|
#endif /* CONFIG_EXCEPTION_DEBUG */ |
|
|
|
/* Reset permissions on a PTE to original state when the mapping was made */ |
|
__pinned_func |
|
static inline pentry_t reset_pte(pentry_t old_val) |
|
{ |
|
pentry_t new_val; |
|
|
|
/* Clear any existing state in permission bits */ |
|
new_val = old_val & (~K_MEM_PARTITION_PERM_MASK); |
|
|
|
/* Now set permissions based on the stashed original values */ |
|
if ((old_val & MMU_RW_ORIG) != 0) { |
|
new_val |= MMU_RW; |
|
} |
|
if ((old_val & MMU_US_ORIG) != 0) { |
|
new_val |= MMU_US; |
|
} |
|
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
|
if ((old_val & MMU_XD_ORIG) != 0) { |
|
new_val |= MMU_XD; |
|
} |
|
#endif |
|
return new_val; |
|
} |
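
/* Illustrative example: a page originally mapped user-writable carries
 * MMU_RW | MMU_US plus the MMU_RW_ORIG | MMU_US_ORIG backup bits. If a
 * memory domain partition later tightened it to read-only, reset_pte()
 * clears the active permission bits and re-derives MMU_RW | MMU_US from
 * the *_ORIG bits, restoring the permissions used at mapping time.
 */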
|
|
|
/* Wrapper functions for some gross stuff we have to do for Kernel |
|
* page table isolation. If these are User mode page tables, the user bit |
|
* isn't set, and this is not the shared page, all the bits in the PTE |
|
* are flipped. This serves three purposes: |
|
* - The page isn't present, implementing page table isolation |
|
* - Flipping the physical address bits cheaply mitigates L1TF |
|
* - State is preserved; to get original PTE, just complement again |
|
*/ |
|
__pinned_func |
|
static inline pentry_t pte_finalize_value(pentry_t val, bool user_table, |
|
int level) |
|
{ |
|
#ifdef CONFIG_X86_KPTI |
|
static const uintptr_t shared_phys_addr = |
|
Z_MEM_PHYS_ADDR(POINTER_TO_UINT(&z_shared_kernel_page_start)); |
|
|
|
if (user_table && (val & MMU_US) == 0 && (val & MMU_P) != 0 && |
|
get_entry_phys(val, level) != shared_phys_addr) { |
|
val = ~val; |
|
} |
|
#endif |
|
return val; |
|
} |
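
/* Note that the flip is its own inverse: ~~val == val, so a flipped entry
 * can always be recovered by complementing it again, and because a present
 * PTE never has PTE_ZERO (the PAT bit) set, flipped entries are recognizable
 * via is_flipped_pte().
 */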
|
|
|
/* Atomic functions for modifying PTEs. These don't map nicely to Zephyr's |
|
* atomic API since the only types supported are 'int' and 'void *' and |
|
* the size of pentry_t depends on other factors like PAE. |
|
*/ |
|
#ifndef CONFIG_X86_PAE |
|
/* Non-PAE, pentry_t is same size as void ptr so use atomic_ptr_* APIs */ |
|
__pinned_func |
|
static inline pentry_t atomic_pte_get(const pentry_t *target) |
|
{ |
|
return (pentry_t)atomic_ptr_get((atomic_ptr_t *)target); |
|
} |
|
|
|
__pinned_func |
|
static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value, |
|
pentry_t new_value) |
|
{ |
|
return atomic_ptr_cas((atomic_ptr_t *)target, (void *)old_value, |
|
(void *)new_value); |
|
} |
|
#else |
|
/* Atomic builtins for 64-bit values on 32-bit x86 require floating point. |
|
* Don't do this, just lock local interrupts. Needless to say, this |
|
* isn't workable if someone ever adds SMP to the 32-bit x86 port. |
|
*/ |
|
BUILD_ASSERT(!IS_ENABLED(CONFIG_SMP)); |
|
|
|
__pinned_func |
|
static inline pentry_t atomic_pte_get(const pentry_t *target) |
|
{ |
|
return *target; |
|
} |
|
|
|
__pinned_func |
|
static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value, |
|
pentry_t new_value) |
|
{ |
|
bool ret = false; |
|
int key = arch_irq_lock(); |
|
|
|
if (*target == old_value) { |
|
*target = new_value; |
|
ret = true; |
|
} |
|
arch_irq_unlock(key); |
|
|
|
return ret; |
|
} |
|
#endif /* CONFIG_X86_PAE */ |
|
|
|
/* Indicates that the target page tables will be used by user mode threads. |
|
 * This only has implications for CONFIG_X86_KPTI, where page tables exposed
 * to user threads need nearly all pages lacking the US bit to also not
 * be Present.
|
*/ |
|
#define OPTION_USER BIT(0) |
|
|
|
/* Indicates that the operation requires TLBs to be flushed as we are altering |
|
* existing mappings. Not needed for establishing new mappings |
|
*/ |
|
#define OPTION_FLUSH BIT(1) |
|
|
|
/* Indicates that each PTE's permission bits should be restored to their |
|
* original state when the memory was mapped. All other bits in the PTE are |
|
* preserved. |
|
*/ |
|
#define OPTION_RESET BIT(2) |
|
|
|
/* Indicates that the mapping will need to be cleared entirely. This is |
|
* mainly used for unmapping the memory region. |
|
*/ |
|
#define OPTION_CLEAR BIT(3) |
|
|
|
/** |
|
* Atomically update bits in a page table entry |
|
* |
|
* This is atomic with respect to modifications by other CPUs or preempted |
|
* contexts, which can be very important when making decisions based on |
|
* the PTE's prior "dirty" state. |
|
* |
|
* @param pte Pointer to page table entry to update |
|
* @param update_val Updated bits to set/clear in PTE. Ignored with |
|
* OPTION_RESET or OPTION_CLEAR. |
|
* @param update_mask Which bits to modify in the PTE. Ignored with |
|
* OPTION_RESET or OPTION_CLEAR. |
|
* @param options Control flags |
|
* @retval Old PTE value |
|
*/ |
|
__pinned_func |
|
static inline pentry_t pte_atomic_update(pentry_t *pte, pentry_t update_val, |
|
pentry_t update_mask, |
|
uint32_t options) |
|
{ |
|
bool user_table = (options & OPTION_USER) != 0U; |
|
bool reset = (options & OPTION_RESET) != 0U; |
|
bool clear = (options & OPTION_CLEAR) != 0U; |
|
pentry_t old_val, new_val; |
|
|
|
do { |
|
old_val = atomic_pte_get(pte); |
|
|
|
new_val = old_val; |
|
#ifdef CONFIG_X86_KPTI |
|
if (is_flipped_pte(new_val)) { |
|
/* Page was flipped for KPTI. Un-flip it */ |
|
new_val = ~new_val; |
|
} |
|
#endif /* CONFIG_X86_KPTI */ |
|
|
|
if (reset) { |
|
new_val = reset_pte(new_val); |
|
} else if (clear) { |
|
new_val = 0; |
|
} else { |
|
new_val = ((new_val & ~update_mask) | |
|
(update_val & update_mask)); |
|
} |
|
|
|
new_val = pte_finalize_value(new_val, user_table, PTE_LEVEL); |
|
} while (atomic_pte_cas(pte, old_val, new_val) == false); |
|
|
|
#ifdef CONFIG_X86_KPTI |
|
if (is_flipped_pte(old_val)) { |
|
/* Page was flipped for KPTI. Un-flip it */ |
|
old_val = ~old_val; |
|
} |
|
#endif /* CONFIG_X86_KPTI */ |
|
|
|
return old_val; |
|
} |
|
|
|
/** |
|
* Low level page table update function for a virtual page |
|
* |
|
* For the provided set of page tables, update the PTE associated with the |
|
* virtual address to a new value, using the mask to control what bits |
|
* need to be preserved. |
|
* |
|
* It is permitted to set up mappings without the Present bit set, in which |
|
* case all other bits may be used for OS accounting. |
|
* |
|
* This function is atomic with respect to the page table entries being |
|
* modified by another CPU, using atomic operations to update the requested |
|
* bits and return the previous PTE value. |
|
* |
|
* Common mask values: |
|
* MASK_ALL - Update all PTE bits. Existing state totally discarded. |
|
* MASK_PERM - Only update permission bits. All other bits and physical |
|
* mapping preserved. |
|
* |
|
* @param ptables Page tables to modify |
|
* @param virt Virtual page table entry to update |
|
* @param entry_val Value to update in the PTE (ignored if OPTION_RESET or |
|
* OPTION_CLEAR) |
|
* @param [out] old_val_ptr Filled in with previous PTE value. May be NULL. |
|
* @param mask What bits to update in the PTE (ignored if OPTION_RESET or |
|
* OPTION_CLEAR) |
|
* @param options Control options, described above |
|
* |
|
* @retval 0 if successful |
|
* @retval -EFAULT if large page encountered or missing page table level |
|
*/ |
|
__pinned_func |
|
static int page_map_set(pentry_t *ptables, void *virt, pentry_t entry_val, |
|
pentry_t *old_val_ptr, pentry_t mask, uint32_t options) |
|
{ |
|
pentry_t *table = ptables; |
|
bool flush = (options & OPTION_FLUSH) != 0U; |
|
int ret = 0; |
|
|
|
for (int level = 0; level < NUM_LEVELS; level++) { |
|
int index; |
|
pentry_t *entryp; |
|
|
|
index = get_index(virt, level); |
|
entryp = &table[index]; |
|
|
|
/* Check if we're a PTE */ |
|
if (level == PTE_LEVEL) { |
|
pentry_t old_val = pte_atomic_update(entryp, entry_val, |
|
mask, options); |
|
if (old_val_ptr != NULL) { |
|
*old_val_ptr = old_val; |
|
} |
|
break; |
|
} |
|
|
|
/* We bail out early here due to no support for |
|
* splitting existing bigpage mappings. |
|
* If the PS bit is not supported at some level (like |
|
* in a PML4 entry) it is always reserved and must be 0 |
|
*/ |
|
CHECKIF(!((*entryp & MMU_PS) == 0U)) { |
|
/* Cannot continue since we cannot split |
|
* bigpage mappings. |
|
*/ |
|
LOG_ERR("large page encountered"); |
|
ret = -EFAULT; |
|
goto out; |
|
} |
|
|
|
table = next_table(*entryp, level); |
|
|
|
CHECKIF(!(table != NULL)) { |
|
/* Cannot continue since table is NULL, |
|
* and it cannot be dereferenced in next loop |
|
* iteration. |
|
*/ |
|
LOG_ERR("missing page table level %d when trying to map %p", |
|
level + 1, virt); |
|
ret = -EFAULT; |
|
goto out; |
|
} |
|
} |
|
|
|
out: |
|
if (flush) { |
|
tlb_flush_page(virt); |
|
} |
|
|
|
return ret; |
|
} |
|
|
|
/** |
|
* Map a physical region in a specific set of page tables. |
|
* |
|
* See documentation for page_map_set() for additional notes about masks and |
|
* supported options. |
|
* |
|
* It is vital to remember that all virtual-to-physical mappings must be |
|
* the same with respect to supervisor mode regardless of what thread is |
|
* scheduled (and therefore, if multiple sets of page tables exist, which one |
|
* is active). |
|
* |
|
* It is permitted to set up mappings without the Present bit set. |
|
* |
|
* @param ptables Page tables to modify |
|
* @param virt Base page-aligned virtual memory address to map the region. |
|
* @param phys Base page-aligned physical memory address for the region. |
|
* Ignored if OPTION_RESET or OPTION_CLEAR. Also affected by the mask |
|
* parameter. This address is not directly examined, it will simply be |
|
* programmed into the PTE. |
|
* @param size Size of the physical region to map |
|
* @param entry_flags Non-address bits to set in every PTE. Ignored if |
|
* OPTION_RESET. Also affected by the mask parameter. |
|
* @param mask What bits to update in each PTE. Un-set bits will never be |
|
* modified. Ignored if OPTION_RESET or OPTION_CLEAR. |
|
* @param options Control options, described above |
|
* |
|
* @retval 0 if successful |
|
* @retval -EINVAL if invalid parameters are supplied |
|
* @retval -EFAULT if errors encountered when updating page tables |
|
*/ |
|
__pinned_func |
|
static int range_map_ptables(pentry_t *ptables, void *virt, uintptr_t phys, |
|
size_t size, pentry_t entry_flags, pentry_t mask, |
|
uint32_t options) |
|
{ |
|
bool zero_entry = (options & (OPTION_RESET | OPTION_CLEAR)) != 0U; |
|
int ret = 0, ret2; |
|
|
|
CHECKIF(!is_addr_aligned(phys) || !is_size_aligned(size)) { |
|
ret = -EINVAL; |
|
goto out; |
|
} |
|
|
|
CHECKIF(!((entry_flags & paging_levels[0].mask) == 0U)) { |
|
LOG_ERR("entry_flags " PRI_ENTRY " overlaps address area", |
|
entry_flags); |
|
ret = -EINVAL; |
|
goto out; |
|
} |
|
|
|
/* This implementation is stack-efficient but not particularly fast. |
|
* We do a full page table walk for every page we are updating. |
|
* Recursive approaches are possible, but use much more stack space. |
|
*/ |
|
for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) { |
|
uint8_t *dest_virt = (uint8_t *)virt + offset; |
|
pentry_t entry_val; |
|
|
|
if (zero_entry) { |
|
entry_val = 0; |
|
} else { |
|
entry_val = (pentry_t)(phys + offset) | entry_flags; |
|
} |
|
|
|
ret2 = page_map_set(ptables, dest_virt, entry_val, NULL, mask, |
|
options); |
|
ARG_UNUSED(ret2); |
|
CHECKIF(ret2 != 0) { |
|
ret = ret2; |
|
} |
|
} |
|
|
|
out: |
|
return ret; |
|
} |
|
|
|
/** |
|
* Establish or update a memory mapping for all page tables |
|
* |
|
* The physical region noted from phys to phys + size will be mapped to |
|
* an equal sized virtual region starting at virt, with the provided flags. |
|
* The mask value denotes what bits in PTEs will actually be modified. |
|
* |
|
* See range_map_ptables() for additional details. |
|
* |
|
* @param virt Page-aligned starting virtual address |
|
* @param phys Page-aligned starting physical address. Ignored if the mask |
|
* parameter does not enable address bits or OPTION_RESET used. |
|
* This region is not directly examined, it will simply be |
|
* programmed into the page tables. |
|
* @param size Size of the physical region to map |
|
* @param entry_flags Desired state of non-address PTE bits covered by mask, |
|
* ignored if OPTION_RESET |
|
* @param mask What bits in the PTE to actually modify; unset bits will |
|
* be preserved. Ignored if OPTION_RESET. |
|
* @param options Control options. Do not set OPTION_USER here. OPTION_FLUSH |
|
* will trigger a TLB shootdown after all tables are updated. |
|
* |
|
* @retval 0 if successful |
|
* @retval -EINVAL if invalid parameters are supplied |
|
* @retval -EFAULT if errors encountered when updating page tables |
|
*/ |
|
__pinned_func |
|
static int range_map(void *virt, uintptr_t phys, size_t size, |
|
pentry_t entry_flags, pentry_t mask, uint32_t options) |
|
{ |
|
int ret = 0, ret2; |
|
|
|
LOG_DBG("%s: %p -> %p (%zu) flags " PRI_ENTRY " mask " |
|
PRI_ENTRY " opt 0x%x", __func__, (void *)phys, virt, size, |
|
entry_flags, mask, options); |
|
|
|
#ifdef CONFIG_X86_64 |
|
/* There's a gap in the "64-bit" address space, as 4-level paging |
|
* requires bits 48 to 63 to be copies of bit 47. Test this |
|
* by treating as a signed value and shifting. |
|
*/ |
|
__ASSERT(((((intptr_t)virt) << 16) >> 16) == (intptr_t)virt, |
|
"non-canonical virtual address mapping %p (size %zu)", |
|
virt, size); |
|
#endif /* CONFIG_X86_64 */ |
|
|
|
CHECKIF(!((options & OPTION_USER) == 0U)) { |
|
LOG_ERR("invalid option for mapping"); |
|
ret = -EINVAL; |
|
goto out; |
|
} |
|
|
|
/* All virtual-to-physical mappings are the same in all page tables. |
|
* What can differ is only access permissions, defined by the memory |
|
* domain associated with the page tables, and the threads that are |
|
* members of that domain. |
|
* |
|
* Any new mappings need to be applied to all page tables. |
|
*/ |
|
#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE) |
|
sys_snode_t *node; |
|
|
|
SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) { |
|
struct arch_mem_domain *domain = |
|
CONTAINER_OF(node, struct arch_mem_domain, node); |
|
|
|
ret2 = range_map_ptables(domain->ptables, virt, phys, size, |
|
entry_flags, mask, |
|
options | OPTION_USER); |
|
ARG_UNUSED(ret2); |
|
CHECKIF(ret2 != 0) { |
|
ret = ret2; |
|
} |
|
} |
|
#endif /* CONFIG_USERSPACE */ |
|
|
|
ret2 = range_map_ptables(z_x86_kernel_ptables, virt, phys, size, |
|
entry_flags, mask, options); |
|
ARG_UNUSED(ret2); |
|
CHECKIF(ret2 != 0) { |
|
ret = ret2; |
|
} |
|
|
|
out: |
|
#ifdef CONFIG_SMP |
|
if ((options & OPTION_FLUSH) != 0U) { |
|
tlb_shootdown(); |
|
} |
|
#endif /* CONFIG_SMP */ |
|
|
|
return ret; |
|
} |
|
|
|
__pinned_func |
|
static inline int range_map_unlocked(void *virt, uintptr_t phys, size_t size, |
|
pentry_t entry_flags, pentry_t mask, |
|
uint32_t options) |
|
{ |
|
k_spinlock_key_t key; |
|
int ret; |
|
|
|
key = k_spin_lock(&x86_mmu_lock); |
|
ret = range_map(virt, phys, size, entry_flags, mask, options); |
|
k_spin_unlock(&x86_mmu_lock, key); |
|
|
|
return ret; |
|
} |
|
|
|
__pinned_func |
|
static pentry_t flags_to_entry(uint32_t flags) |
|
{ |
|
pentry_t entry_flags = MMU_P; |
|
|
|
/* Translate flags argument into HW-recognized entry flags. |
|
* |
|
* Support for PAT is not implemented yet. Many systems may have |
|
* BIOS-populated MTRR values such that these cache settings are |
|
* redundant. |
|
*/ |
|
switch (flags & K_MEM_CACHE_MASK) { |
|
case K_MEM_CACHE_NONE: |
|
entry_flags |= MMU_PCD; |
|
break; |
|
case K_MEM_CACHE_WT: |
|
entry_flags |= MMU_PWT; |
|
break; |
|
case K_MEM_CACHE_WB: |
|
break; |
|
default: |
|
__ASSERT(false, "bad memory mapping flags 0x%x", flags); |
|
} |
|
|
|
if ((flags & K_MEM_PERM_RW) != 0U) { |
|
entry_flags |= ENTRY_RW; |
|
} |
|
|
|
if ((flags & K_MEM_PERM_USER) != 0U) { |
|
entry_flags |= ENTRY_US; |
|
} |
|
|
|
if ((flags & K_MEM_PERM_EXEC) == 0U) { |
|
entry_flags |= ENTRY_XD; |
|
} |
|
|
|
return entry_flags; |
|
} |
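
/* Example (illustrative): flags = K_MEM_PERM_RW | K_MEM_CACHE_WB produces
 * MMU_P | ENTRY_RW | ENTRY_XD, i.e. a present, writable, non-executable,
 * write-back cached supervisor-only mapping; adding K_MEM_PERM_USER would
 * also set ENTRY_US.
 */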
|
|
|
/* map new region virt..virt+size to phys with provided arch-neutral flags */ |
|
__pinned_func |
|
void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags) |
|
{ |
|
int ret; |
|
|
|
ret = range_map_unlocked(virt, phys, size, flags_to_entry(flags), |
|
MASK_ALL, 0); |
|
__ASSERT_NO_MSG(ret == 0); |
|
ARG_UNUSED(ret); |
|
} |
|
|
|
/* unmap region addr..addr+size, reset entries and flush TLB */ |
|
void arch_mem_unmap(void *addr, size_t size) |
|
{ |
|
int ret; |
|
|
|
ret = range_map_unlocked((void *)addr, 0, size, 0, 0, |
|
OPTION_FLUSH | OPTION_CLEAR); |
|
__ASSERT_NO_MSG(ret == 0); |
|
ARG_UNUSED(ret); |
|
} |
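
/* Hedged usage sketch (not part of this file's API surface): driver or
 * kernel code normally reaches arch_mem_map() through higher-level wrappers
 * such as z_phys_map(), ending up with a page-aligned call like
 *
 *	arch_mem_map(virt, 0xFEC00000UL, 0x1000, K_MEM_PERM_RW | K_MEM_CACHE_NONE);
 *
 * where 0xFEC00000 is used purely as an illustrative MMIO physical address.
 */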
|
|
|
#ifdef Z_VM_KERNEL |
|
__boot_func |
|
static void identity_map_remove(uint32_t level) |
|
{ |
|
size_t size, scope = get_entry_scope(level); |
|
pentry_t *table; |
|
uint32_t cur_level; |
|
uint8_t *pos; |
|
pentry_t entry; |
|
pentry_t *entry_ptr; |
|
|
|
k_mem_region_align((uintptr_t *)&pos, &size, |
|
(uintptr_t)CONFIG_SRAM_BASE_ADDRESS, |
|
(size_t)CONFIG_SRAM_SIZE * 1024U, scope); |
|
|
|
while (size != 0U) { |
|
/* Need to get to the correct table */ |
|
table = z_x86_kernel_ptables; |
|
for (cur_level = 0; cur_level < level; cur_level++) { |
|
entry = get_entry(table, pos, cur_level); |
|
table = next_table(entry, level); |
|
} |
|
|
|
entry_ptr = get_entry_ptr(table, pos, level); |
|
|
|
/* set_pte */ |
|
*entry_ptr = 0; |
|
pos += scope; |
|
size -= scope; |
|
} |
|
} |
|
#endif |
|
|
|
/* Invoked to remove the identity mappings in the page tables; they were only
 * needed to transition the instruction pointer at early boot
|
*/ |
|
__boot_func |
|
void z_x86_mmu_init(void) |
|
{ |
|
#ifdef Z_VM_KERNEL |
|
	/* We booted with the physical address space identity-mapped. Now that
	 * we are executing in the virtual address space, the identity map is
	 * no longer needed, so remove those entries.
	 *
	 * Without PAE, we only need to remove the entries at the PD level.
	 * With PAE, we also need to remove the entry at the PDPT level.
	 */
|
identity_map_remove(PDE_LEVEL); |
|
|
|
#ifdef CONFIG_X86_PAE |
|
identity_map_remove(0); |
|
#endif |
|
#endif |
|
} |
|
|
|
#if CONFIG_X86_STACK_PROTECTION |
|
__pinned_func |
|
void z_x86_set_stack_guard(k_thread_stack_t *stack) |
|
{ |
|
int ret; |
|
|
|
/* Applied to all page tables as this affects supervisor mode. |
|
* XXX: This never gets reset when the thread exits, which can |
|
* cause problems if the memory is later used for something else. |
|
* See #29499 |
|
* |
|
* Guard page is always the first page of the stack object for both |
|
* kernel and thread stacks. |
|
*/ |
|
ret = range_map_unlocked(stack, 0, CONFIG_MMU_PAGE_SIZE, |
|
MMU_P | ENTRY_XD, MASK_PERM, OPTION_FLUSH); |
|
__ASSERT_NO_MSG(ret == 0); |
|
ARG_UNUSED(ret); |
|
} |
|
#endif /* CONFIG_X86_STACK_PROTECTION */ |
|
|
|
#ifdef CONFIG_USERSPACE |
|
__pinned_func |
|
static bool page_validate(pentry_t *ptables, uint8_t *addr, bool write) |
|
{ |
|
pentry_t *table = (pentry_t *)ptables; |
|
|
|
for (int level = 0; level < NUM_LEVELS; level++) { |
|
pentry_t entry = get_entry(table, addr, level); |
|
|
|
if (is_leaf(level, entry)) { |
|
#ifdef CONFIG_X86_KPTI |
|
if (is_flipped_pte(entry)) { |
|
/* We flipped this to prevent user access |
|
* since just clearing US isn't sufficient |
|
*/ |
|
return false; |
|
} |
|
#endif |
|
/* US and RW bits still carry meaning if non-present. |
|
* If the data page is paged out, access bits are |
|
* preserved. If un-mapped, the whole entry is 0. |
|
*/ |
|
if (((entry & MMU_US) == 0U) || |
|
(write && ((entry & MMU_RW) == 0U))) { |
|
return false; |
|
} |
|
} else { |
|
if ((entry & MMU_P) == 0U) { |
|
/* Missing intermediate table, address is |
|
* un-mapped |
|
*/ |
|
return false; |
|
} |
|
table = next_table(entry, level); |
|
} |
|
} |
|
|
|
return true; |
|
} |
|
|
|
__pinned_func |
|
static inline void bcb_fence(void) |
|
{ |
|
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION |
|
__asm__ volatile ("lfence" : : : "memory"); |
|
#endif |
|
} |
|
|
|
__pinned_func |
|
int arch_buffer_validate(void *addr, size_t size, int write) |
|
{ |
|
pentry_t *ptables = z_x86_thread_page_tables_get(_current); |
|
uint8_t *virt; |
|
size_t aligned_size; |
|
int ret = 0; |
|
|
|
/* addr/size arbitrary, fix this up into an aligned region */ |
|
k_mem_region_align((uintptr_t *)&virt, &aligned_size, |
|
(uintptr_t)addr, size, CONFIG_MMU_PAGE_SIZE); |
|
|
|
for (size_t offset = 0; offset < aligned_size; |
|
offset += CONFIG_MMU_PAGE_SIZE) { |
|
if (!page_validate(ptables, virt + offset, write)) { |
|
ret = -1; |
|
break; |
|
} |
|
} |
|
|
|
bcb_fence(); |
|
|
|
return ret; |
|
} |
|
#ifdef CONFIG_X86_COMMON_PAGE_TABLE |
|
/* Very low memory configuration. A single set of page tables is used for |
|
* all threads. This relies on some assumptions: |
|
* |
|
* - No KPTI. If that were supported, we would need both a kernel and user |
|
* set of page tables. |
|
* - No SMP. If that were supported, we would need per-core page tables. |
|
* - Memory domains don't affect supervisor mode. |
|
* - All threads have the same virtual-to-physical mappings. |
|
* - Memory domain APIs can't be called by user mode. |
|
* |
|
* Because there is no SMP, only one set of page tables, and user threads can't |
|
* modify their own memory domains, we don't have to do much when |
|
* arch_mem_domain_* APIs are called. We do use a caching scheme to avoid |
|
* updating page tables if the last user thread scheduled was in the same |
|
* domain. |
|
* |
|
* We don't set CONFIG_ARCH_MEM_DOMAIN_DATA, since we aren't setting |
|
* up any arch-specific memory domain data (per domain page tables.) |
|
* |
|
* This is all nice and simple and saves a lot of memory. The cost is that |
|
 * context switching is not a trivial CR3 update. We have to reset all partitions
|
* for the current domain configuration and then apply all the partitions for |
|
* the incoming thread's domain if they are not the same. We also need to |
|
* update permissions similarly on the thread stack region. |
|
*/ |
|
|
|
__pinned_func |
|
static inline int reset_region(uintptr_t start, size_t size) |
|
{ |
|
return range_map_unlocked((void *)start, 0, size, 0, 0, |
|
OPTION_FLUSH | OPTION_RESET); |
|
} |
|
|
|
__pinned_func |
|
static inline int apply_region(uintptr_t start, size_t size, pentry_t attr) |
|
{ |
|
return range_map_unlocked((void *)start, 0, size, attr, MASK_PERM, |
|
OPTION_FLUSH); |
|
} |
|
|
|
/* Cache of the current memory domain applied to the common page tables and |
|
* the stack buffer region that had User access granted. |
|
*/ |
|
static __pinned_bss struct k_mem_domain *current_domain; |
|
static __pinned_bss uintptr_t current_stack_start; |
|
static __pinned_bss size_t current_stack_size; |
|
|
|
__pinned_func |
|
void z_x86_swap_update_common_page_table(struct k_thread *incoming) |
|
{ |
|
k_spinlock_key_t key; |
|
|
|
if ((incoming->base.user_options & K_USER) == 0) { |
|
/* Incoming thread is not a user thread. Memory domains don't |
|
* affect supervisor threads and we don't need to enable User |
|
* bits for its stack buffer; do nothing. |
|
*/ |
|
return; |
|
} |
|
|
|
	/* Step 1: Make sure the thread stack is set up correctly for the
	 * incoming thread
	 */
|
if (incoming->stack_info.start != current_stack_start || |
|
incoming->stack_info.size != current_stack_size) { |
|
if (current_stack_size != 0U) { |
|
reset_region(current_stack_start, current_stack_size); |
|
} |
|
|
|
/* The incoming thread's stack region needs User permissions */ |
|
apply_region(incoming->stack_info.start, |
|
incoming->stack_info.size, |
|
K_MEM_PARTITION_P_RW_U_RW); |
|
|
|
/* Update cache */ |
|
current_stack_start = incoming->stack_info.start; |
|
current_stack_size = incoming->stack_info.size; |
|
} |
|
|
|
/* Step 2: The page tables always have some memory domain applied to |
|
* them. If the incoming thread's memory domain is different, |
|
* update the page tables |
|
*/ |
|
key = k_spin_lock(&z_mem_domain_lock); |
|
if (incoming->mem_domain_info.mem_domain == current_domain) { |
|
/* The incoming thread's domain is already applied */ |
|
goto out_unlock; |
|
} |
|
|
|
/* Reset the current memory domain regions... */ |
|
if (current_domain != NULL) { |
|
for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) { |
|
struct k_mem_partition *ptn = |
|
¤t_domain->partitions[i]; |
|
|
|
if (ptn->size == 0) { |
|
continue; |
|
} |
|
reset_region(ptn->start, ptn->size); |
|
} |
|
} |
|
|
|
/* ...and apply all the incoming domain's regions */ |
|
for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) { |
|
struct k_mem_partition *ptn = |
|
&incoming->mem_domain_info.mem_domain->partitions[i]; |
|
|
|
if (ptn->size == 0) { |
|
continue; |
|
} |
|
apply_region(ptn->start, ptn->size, ptn->attr); |
|
} |
|
current_domain = incoming->mem_domain_info.mem_domain; |
|
out_unlock: |
|
k_spin_unlock(&z_mem_domain_lock, key); |
|
} |
|
|
|
/* If a partition was added or removed in the cached domain, update the |
|
* page tables. |
|
*/ |
|
__pinned_func |
|
int arch_mem_domain_partition_remove(struct k_mem_domain *domain, |
|
uint32_t partition_id) |
|
{ |
|
struct k_mem_partition *ptn; |
|
|
|
if (domain != current_domain) { |
|
return 0; |
|
} |
|
|
|
ptn = &domain->partitions[partition_id]; |
|
|
|
return reset_region(ptn->start, ptn->size); |
|
} |
|
|
|
__pinned_func |
|
int arch_mem_domain_partition_add(struct k_mem_domain *domain, |
|
uint32_t partition_id) |
|
{ |
|
struct k_mem_partition *ptn; |
|
|
|
if (domain != current_domain) { |
|
return 0; |
|
} |
|
|
|
ptn = &domain->partitions[partition_id]; |
|
|
|
return apply_region(ptn->start, ptn->size, ptn->attr); |
|
} |
|
|
|
/* Rest of the APIs don't need to do anything */ |
|
__pinned_func |
|
int arch_mem_domain_thread_add(struct k_thread *thread) |
|
{ |
|
return 0; |
|
} |
|
|
|
__pinned_func |
|
int arch_mem_domain_thread_remove(struct k_thread *thread) |
|
{ |
|
return 0; |
|
} |
|
#else |
|
/* Memory domains each have a set of page tables assigned to them */ |
|
|
|
/* |
|
* Pool of free memory pages for copying page tables, as needed. |
|
*/ |
|
#define PTABLE_COPY_SIZE (INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) |
|
|
|
static uint8_t __pinned_noinit |
|
page_pool[PTABLE_COPY_SIZE * CONFIG_X86_MAX_ADDITIONAL_MEM_DOMAINS] |
|
__aligned(CONFIG_MMU_PAGE_SIZE); |
|
|
|
__pinned_data |
|
static uint8_t *page_pos = page_pool + sizeof(page_pool); |
|
|
|
/* Return a zeroed and suitably aligned memory page for page table data |
|
* from the global page pool |
|
*/ |
|
__pinned_func |
|
static void *page_pool_get(void) |
|
{ |
|
void *ret; |
|
|
|
if (page_pos == page_pool) { |
|
ret = NULL; |
|
} else { |
|
page_pos -= CONFIG_MMU_PAGE_SIZE; |
|
ret = page_pos; |
|
} |
|
|
|
if (ret != NULL) { |
|
memset(ret, 0, CONFIG_MMU_PAGE_SIZE); |
|
} |
|
|
|
return ret; |
|
} |
|
|
|
/* Debugging function to show how many pages are free in the pool */ |
|
__pinned_func |
|
static inline unsigned int pages_free(void) |
|
{ |
|
return (page_pos - page_pool) / CONFIG_MMU_PAGE_SIZE; |
|
} |
|
|
|
/** |
|
* Duplicate an entire set of page tables |
|
* |
|
* Uses recursion, but depth at any given moment is limited by the number of |
|
* paging levels. |
|
* |
|
* x86_mmu_lock must be held. |
|
* |
|
* @param dst a zeroed out chunk of memory of sufficient size for the indicated |
|
* paging level. |
|
* @param src some paging structure from within the source page tables to copy |
|
* at the indicated paging level |
|
* @param level Current paging level |
|
* @retval 0 Success |
|
* @retval -ENOMEM Insufficient page pool memory |
|
*/ |
|
__pinned_func |
|
static int copy_page_table(pentry_t *dst, pentry_t *src, int level) |
|
{ |
|
if (level == PTE_LEVEL) { |
|
/* Base case: leaf page table */ |
|
for (int i = 0; i < get_num_entries(level); i++) { |
|
dst[i] = pte_finalize_value(reset_pte(src[i]), true, |
|
PTE_LEVEL); |
|
} |
|
} else { |
|
/* Recursive case: allocate sub-structures as needed and |
|
* make recursive calls on them |
|
*/ |
|
for (int i = 0; i < get_num_entries(level); i++) { |
|
pentry_t *child_dst; |
|
int ret; |
|
|
|
if ((src[i] & MMU_P) == 0) { |
|
/* Non-present, skip */ |
|
continue; |
|
} |
|
|
|
if ((level == PDE_LEVEL) && ((src[i] & MMU_PS) != 0)) { |
|
/* large page: no lower level table */ |
|
dst[i] = pte_finalize_value(src[i], true, |
|
PDE_LEVEL); |
|
continue; |
|
} |
|
|
|
__ASSERT((src[i] & MMU_PS) == 0, |
|
"large page encountered"); |
|
|
|
child_dst = page_pool_get(); |
|
if (child_dst == NULL) { |
|
return -ENOMEM; |
|
} |
|
|
|
/* Page table links are by physical address. RAM |
|
* for page tables is identity-mapped, but double- |
|
* cast needed for PAE case where sizeof(void *) and |
|
* sizeof(pentry_t) are not the same. |
|
*/ |
|
dst[i] = ((pentry_t)z_mem_phys_addr(child_dst) | |
|
INT_FLAGS); |
|
|
|
ret = copy_page_table(child_dst, |
|
next_table(src[i], level), |
|
level + 1); |
|
if (ret != 0) { |
|
return ret; |
|
} |
|
} |
|
} |
|
|
|
return 0; |
|
} |
|
|
|
__pinned_func |
|
static int region_map_update(pentry_t *ptables, void *start, |
|
size_t size, pentry_t flags, bool reset) |
|
{ |
|
uint32_t options = OPTION_USER; |
|
int ret; |
|
k_spinlock_key_t key; |
|
|
|
if (reset) { |
|
options |= OPTION_RESET; |
|
} |
|
if (ptables == z_x86_page_tables_get()) { |
|
options |= OPTION_FLUSH; |
|
} |
|
|
|
key = k_spin_lock(&x86_mmu_lock); |
|
ret = range_map_ptables(ptables, start, 0, size, flags, MASK_PERM, |
|
options); |
|
k_spin_unlock(&x86_mmu_lock, key); |
|
|
|
#ifdef CONFIG_SMP |
|
tlb_shootdown(); |
|
#endif |
|
|
|
return ret; |
|
} |
|
|
|
__pinned_func |
|
static inline int reset_region(pentry_t *ptables, void *start, size_t size) |
|
{ |
|
LOG_DBG("%s(%p, %p, %zu)", __func__, ptables, start, size); |
|
return region_map_update(ptables, start, size, 0, true); |
|
} |
|
|
|
__pinned_func |
|
static inline int apply_region(pentry_t *ptables, void *start, |
|
size_t size, pentry_t attr) |
|
{ |
|
LOG_DBG("%s(%p, %p, %zu, " PRI_ENTRY ")", __func__, ptables, start, |
|
size, attr); |
|
return region_map_update(ptables, start, size, attr, false); |
|
} |
|
|
|
__pinned_func |
|
static void set_stack_perms(struct k_thread *thread, pentry_t *ptables) |
|
{ |
|
LOG_DBG("update stack for thread %p's ptables at %p: %p (size %zu)", |
|
thread, ptables, (void *)thread->stack_info.start, |
|
thread->stack_info.size); |
|
apply_region(ptables, (void *)thread->stack_info.start, |
|
thread->stack_info.size, |
|
MMU_P | MMU_XD | MMU_RW | MMU_US); |
|
} |
|
|
|
/* |
|
* Arch interface implementations for memory domains and userspace |
|
*/ |
|
|
|
__boot_func |
|
int arch_mem_domain_init(struct k_mem_domain *domain) |
|
{ |
|
int ret; |
|
k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock); |
|
|
|
LOG_DBG("%s(%p)", __func__, domain); |
|
#if __ASSERT_ON |
|
sys_snode_t *node; |
|
|
|
/* Assert that we have not already initialized this domain */ |
|
SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) { |
|
struct arch_mem_domain *list_domain = |
|
CONTAINER_OF(node, struct arch_mem_domain, node); |
|
|
|
__ASSERT(list_domain != &domain->arch, |
|
"%s(%p) called multiple times", __func__, domain); |
|
} |
|
#endif /* __ASSERT_ON */ |
|
#ifndef CONFIG_X86_KPTI |
|
/* If we're not using KPTI then we can use the build time page tables |
|
* (which are mutable) as the set of page tables for the default |
|
* memory domain, saving us some memory. |
|
* |
|
* We skip adding this domain to x86_domain_list since we already |
|
* update z_x86_kernel_ptables directly in range_map(). |
|
*/ |
|
if (domain == &k_mem_domain_default) { |
|
domain->arch.ptables = z_x86_kernel_ptables; |
|
k_spin_unlock(&x86_mmu_lock, key); |
|
return 0; |
|
} |
|
#endif /* CONFIG_X86_KPTI */ |
|
#ifdef CONFIG_X86_PAE |
|
/* PDPT is stored within the memory domain itself since it is |
|
* much smaller than a full page |
|
*/ |
|
(void)memset(domain->arch.pdpt, 0, sizeof(domain->arch.pdpt)); |
|
domain->arch.ptables = domain->arch.pdpt; |
|
#else |
|
/* Allocate a page-sized top-level structure, either a PD or PML4 */ |
|
domain->arch.ptables = page_pool_get(); |
|
if (domain->arch.ptables == NULL) { |
|
k_spin_unlock(&x86_mmu_lock, key); |
|
return -ENOMEM; |
|
} |
|
#endif /* CONFIG_X86_PAE */ |
|
|
|
LOG_DBG("copy_page_table(%p, %p, 0)", domain->arch.ptables, |
|
z_x86_kernel_ptables); |
|
|
|
/* Make a copy of the boot page tables created by gen_mmu.py */ |
|
ret = copy_page_table(domain->arch.ptables, z_x86_kernel_ptables, 0); |
|
if (ret == 0) { |
|
sys_slist_append(&x86_domain_list, &domain->arch.node); |
|
} |
|
k_spin_unlock(&x86_mmu_lock, key); |
|
|
|
return ret; |
|
} |
|
|
|
int arch_mem_domain_partition_remove(struct k_mem_domain *domain, |
|
uint32_t partition_id) |
|
{ |
|
struct k_mem_partition *partition = &domain->partitions[partition_id]; |
|
|
|
/* Reset the partition's region back to defaults */ |
|
return reset_region(domain->arch.ptables, (void *)partition->start, |
|
partition->size); |
|
} |
|
|
|
/* Called on thread exit or when moving it to a different memory domain */ |
|
int arch_mem_domain_thread_remove(struct k_thread *thread) |
|
{ |
|
struct k_mem_domain *domain = thread->mem_domain_info.mem_domain; |
|
|
|
if ((thread->base.user_options & K_USER) == 0) { |
|
return 0; |
|
} |
|
|
|
if ((thread->base.thread_state & _THREAD_DEAD) == 0) { |
|
/* Thread is migrating to another memory domain and not |
|
* exiting for good; we weren't called from |
|
* z_thread_abort(). Resetting the stack region will |
|
* take place in the forthcoming thread_add() call. |
|
*/ |
|
return 0; |
|
} |
|
|
|
/* Restore permissions on the thread's stack area since it is no |
|
* longer a member of the domain. |
|
*/ |
|
return reset_region(domain->arch.ptables, |
|
(void *)thread->stack_info.start, |
|
thread->stack_info.size); |
|
} |
|
|
|
__pinned_func |
|
int arch_mem_domain_partition_add(struct k_mem_domain *domain, |
|
uint32_t partition_id) |
|
{ |
|
struct k_mem_partition *partition = &domain->partitions[partition_id]; |
|
|
|
/* Update the page tables with the partition info */ |
|
return apply_region(domain->arch.ptables, (void *)partition->start, |
|
partition->size, partition->attr | MMU_P); |
|
} |
|
|
|
/* Invoked from memory domain API calls, as well as during thread creation */ |
|
__pinned_func |
|
int arch_mem_domain_thread_add(struct k_thread *thread) |
|
{ |
|
int ret = 0; |
|
|
|
/* New memory domain we are being added to */ |
|
struct k_mem_domain *domain = thread->mem_domain_info.mem_domain; |
|
/* This is only set for threads that were migrating from some other |
|
	 * memory domain; for new threads this is NULL.
|
* |
|
* Note that NULL check on old_ptables must be done before any |
|
* address translation or else (NULL + offset) != NULL. |
|
*/ |
|
pentry_t *old_ptables = UINT_TO_POINTER(thread->arch.ptables); |
|
bool is_user = (thread->base.user_options & K_USER) != 0; |
|
bool is_migration = (old_ptables != NULL) && is_user; |
|
|
|
/* Allow US access to the thread's stack in its new domain if |
|
* we are migrating. If we are not migrating this is done in |
|
* z_x86_current_stack_perms() |
|
*/ |
|
if (is_migration) { |
|
old_ptables = z_mem_virt_addr(thread->arch.ptables); |
|
set_stack_perms(thread, domain->arch.ptables); |
|
} |
|
|
|
thread->arch.ptables = z_mem_phys_addr(domain->arch.ptables); |
|
LOG_DBG("set thread %p page tables to %p", thread, |
|
(void *)thread->arch.ptables); |
|
|
|
/* Check if we're doing a migration from a different memory domain |
|
* and have to remove permissions from its old domain. |
|
* |
|
* XXX: The checks we have to do here and in |
|
* arch_mem_domain_thread_remove() are clumsy, it may be worth looking |
|
* into adding a specific arch_mem_domain_thread_migrate() API. |
|
* See #29601 |
|
*/ |
|
if (is_migration) { |
|
ret = reset_region(old_ptables, |
|
(void *)thread->stack_info.start, |
|
thread->stack_info.size); |
|
} |
|
|
|
#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE) |
|
/* Need to switch to using these new page tables, in case we drop |
|
* to user mode before we are ever context switched out. |
|
* IPI takes care of this if the thread is currently running on some |
|
* other CPU. |
|
*/ |
|
if (thread == _current && thread->arch.ptables != z_x86_cr3_get()) { |
|
z_x86_cr3_set(thread->arch.ptables); |
|
} |
|
#endif /* CONFIG_X86_KPTI */ |
|
|
|
return ret; |
|
} |
#endif /* !CONFIG_X86_COMMON_PAGE_TABLE */
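
/* Illustrative note (not in the original sources): thread migration between
 * domains is driven by the kernel API, e.g. roughly:
 *
 *	k_mem_domain_add_thread(&new_domain, thread_id);
 *
 * which first calls arch_mem_domain_thread_remove() against the old domain
 * and then arch_mem_domain_thread_add() above, with thread->arch.ptables
 * still referring to the old domain's tables so the stack permissions there
 * can be reset during the migration.
 */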

__pinned_func
int arch_mem_domain_max_partitions_get(void)
{
	return CONFIG_MAX_DOMAIN_PARTITIONS;
}

/* Invoked from z_x86_userspace_enter */
__pinned_func
void z_x86_current_stack_perms(void)
{
	/* Clear any previous context in the stack buffer to prevent
	 * unintentional data leakage.
	 */
	(void)memset((void *)_current->stack_info.start, 0xAA,
		     _current->stack_info.size - _current->stack_info.delta);

	/* Only now is it safe to grant access to the stack buffer since any
	 * previous context has been erased.
	 */
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
	/* Re-run the swap page table update logic since we're entering user
	 * mode. This will grant stack and memory domain access if it wasn't
	 * set already (in which case this returns very quickly).
	 */
	z_x86_swap_update_common_page_table(_current);
#else
	/* Memory domain access is already programmed into the page tables.
	 * Need to enable access to this new user thread's stack buffer in
	 * its domain-specific page tables.
	 */
	set_stack_perms(_current, z_x86_thread_page_tables_get(_current));
#endif
}
#endif /* CONFIG_USERSPACE */

#ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
__boot_func
static void mark_addr_page_reserved(uintptr_t addr, size_t len)
{
	uintptr_t pos = ROUND_DOWN(addr, CONFIG_MMU_PAGE_SIZE);
	uintptr_t end = ROUND_UP(addr + len, CONFIG_MMU_PAGE_SIZE);

	for (; pos < end; pos += CONFIG_MMU_PAGE_SIZE) {
		if (!z_is_page_frame(pos)) {
			continue;
		}

		struct z_page_frame *pf = z_phys_to_page_frame(pos);

		pf->flags |= Z_PAGE_FRAME_RESERVED;
	}
}
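
/* Worked example (illustrative, assuming 4K pages): a firmware region at
 * addr = 0x12345, len = 0x100 is expanded to the page-aligned range
 * [0x12000, 0x13000), so the single page frame at 0x12000 is marked
 * Z_PAGE_FRAME_RESERVED and will never be handed out by the page frame
 * allocator or considered for eviction.
 */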

__boot_func
void arch_reserved_pages_update(void)
{
#ifdef CONFIG_X86_PC_COMPATIBLE
	/*
	 * Ideally we would do an E820 or similar enumeration to specifically
	 * identify all page frames which are reserved by the hardware or
	 * firmware, or use x86_memmap[] with Multiboot if available.
	 *
	 * But regardless, reserve everything in the first megabyte of
	 * physical memory on PC-compatible platforms.
	 */
	mark_addr_page_reserved(0, MB(1));
#endif /* CONFIG_X86_PC_COMPATIBLE */

#ifdef CONFIG_X86_MEMMAP
	for (int i = 0; i < CONFIG_X86_MEMMAP_ENTRIES; i++) {
		struct x86_memmap_entry *entry = &x86_memmap[i];

		switch (entry->type) {
		case X86_MEMMAP_ENTRY_UNUSED:
			__fallthrough;
		case X86_MEMMAP_ENTRY_RAM:
			continue;

		case X86_MEMMAP_ENTRY_ACPI:
			__fallthrough;
		case X86_MEMMAP_ENTRY_NVS:
			__fallthrough;
		case X86_MEMMAP_ENTRY_DEFECTIVE:
			__fallthrough;
		default:
			/* For any of the cases above, fall out of the switch
			 * and mark the entry's pages reserved.
			 */
			break;
		}

		mark_addr_page_reserved(entry->base, entry->length);
	}
#endif /* CONFIG_X86_MEMMAP */
}
#endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */

int arch_page_phys_get(void *virt, uintptr_t *phys)
{
	pentry_t pte = 0;
	int level, ret;

	__ASSERT(POINTER_TO_UINT(virt) % CONFIG_MMU_PAGE_SIZE == 0U,
		 "unaligned address %p to %s", virt, __func__);

	pentry_get(&level, &pte, z_x86_page_tables_get(), virt);

	if ((pte & MMU_P) != 0) {
		if (phys != NULL) {
			*phys = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);
		}
		ret = 0;
	} else {
		/* Not mapped */
		ret = -EFAULT;
	}

	return ret;
}
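
/* Illustrative usage (not part of the original file; "buf_page" and "pa" are
 * placeholder names): translating a page-aligned virtual address back to the
 * physical frame that currently backs it:
 *
 *	uintptr_t pa;
 *
 *	if (arch_page_phys_get(buf_page, &pa) == 0) {
 *		... pa holds the physical address backing buf_page ...
 *	} else {
 *		... -EFAULT: the page is not currently mapped ...
 *	}
 *
 * Note the assertion above: the virtual address must be page-aligned.
 */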

#ifdef CONFIG_DEMAND_PAGING
#define PTE_MASK (paging_levels[PTE_LEVEL].mask)

__pinned_func
void arch_mem_page_out(void *addr, uintptr_t location)
{
	int ret;
	pentry_t mask = PTE_MASK | MMU_P | MMU_A;

	/* The Accessed bit is set to guarantee the entry is not completely
	 * zero when the location value is 0, since an all-zero PTE means
	 * un-mapped.
	 */
	ret = range_map(addr, location, CONFIG_MMU_PAGE_SIZE, MMU_A, mask,
			OPTION_FLUSH);
	__ASSERT_NO_MSG(ret == 0);
	ARG_UNUSED(ret);
}
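
/* Illustrative result (not in the original sources): after paging out the
 * page at "addr" to backing-store location 0x3000, its PTE holds roughly
 * (0x3000 | MMU_A) with MMU_P clear, so a subsequent access faults and the
 * fault handler can recover the backing-store location from the entry.
 */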

__pinned_func
void arch_mem_page_in(void *addr, uintptr_t phys)
{
	int ret;
	pentry_t mask = PTE_MASK | MMU_P | MMU_D | MMU_A;

	ret = range_map(addr, phys, CONFIG_MMU_PAGE_SIZE, MMU_P, mask,
			OPTION_FLUSH);
	__ASSERT_NO_MSG(ret == 0);
	ARG_UNUSED(ret);
}

__pinned_func
void arch_mem_scratch(uintptr_t phys)
{
	page_map_set(z_x86_page_tables_get(), Z_SCRATCH_PAGE,
		     phys | MMU_P | MMU_RW | MMU_XD, NULL, MASK_ALL,
		     OPTION_FLUSH);
}
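
/* Illustrative sketch (not in the original sources; "backing_store_buf" and
 * "old_frame_phys" are placeholder names) of how core demand paging code
 * typically uses the scratch mapping when a frame's contents must be copied
 * out after its original virtual mapping has already been removed:
 *
 *	arch_mem_scratch(old_frame_phys);
 *	(void)memcpy(backing_store_buf, Z_SCRATCH_PAGE, CONFIG_MMU_PAGE_SIZE);
 *
 * i.e. the frame is temporarily mapped RW and execute-disabled at
 * Z_SCRATCH_PAGE so it can still be read.
 */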

__pinned_func
uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
{
	pentry_t all_pte, mask;
	uint32_t options;

	/* What to change, if anything, in the page_map_set() calls */
	if (clear_accessed) {
		mask = MMU_A;
		options = OPTION_FLUSH;
	} else {
		/* In this configuration page_map_set() just queries the
		 * page table and makes no changes
		 */
		mask = 0;
		options = 0U;
	}

	page_map_set(z_x86_kernel_ptables, addr, 0, &all_pte, mask, options);

	/* Un-mapped PTEs are completely zeroed. No need to report anything
	 * else in this case.
	 */
	if (all_pte == 0) {
		return ARCH_DATA_PAGE_NOT_MAPPED;
	}

#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
	/* Don't bother looking at other page tables if non-present, as we
	 * are not required to report accurate accessed/dirty in this case
	 * and all mappings are otherwise the same.
	 */
	if ((all_pte & MMU_P) != 0) {
		sys_snode_t *node;

		/* IRQs are locked, safe to do this */
		SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
			pentry_t cur_pte;
			struct arch_mem_domain *domain =
				CONTAINER_OF(node, struct arch_mem_domain,
					     node);

			page_map_set(domain->ptables, addr, 0, &cur_pte,
				     mask, options | OPTION_USER);

			/* Logical OR of the relevant PTE in all page tables.
			 * addr/location and present state should be identical
			 * among them.
			 */
			all_pte |= cur_pte;
		}
	}
#endif /* CONFIG_USERSPACE && !CONFIG_X86_COMMON_PAGE_TABLE */

	/* NOTE: We are truncating the PTE on PAE systems, whose pentry_t
	 * is larger than a uintptr_t.
	 *
	 * We currently aren't required to report back XD state (bit 63),
	 * and Zephyr just doesn't support large physical memory on 32-bit
	 * systems; PAE was only implemented for XD support.
	 */
	if (phys != NULL) {
		*phys = (uintptr_t)get_entry_phys(all_pte, PTE_LEVEL);
	}

	/* We don't filter out any other bits in the PTE and the kernel
	 * ignores them. For the case of ARCH_DATA_PAGE_NOT_MAPPED,
	 * we use a bit which is never set in a real PTE (the PAT bit) in the
	 * current system.
	 *
	 * The other ARCH_DATA_PAGE_* macros are defined to their
	 * corresponding bits in the PTE.
	 */
	return (uintptr_t)all_pte;
}
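
/* Illustrative usage (not in the original sources), roughly as an eviction
 * algorithm such as NRU might sample and clear the accessed state of a page:
 *
 *	uintptr_t flags = arch_page_info_get(addr, NULL, true);
 *
 *	if ((flags & ARCH_DATA_PAGE_NOT_MAPPED) == 0 &&
 *	    (flags & ARCH_DATA_PAGE_ACCESSED) != 0) {
 *		... page was touched since the last sampling pass ...
 *	}
 */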

__pinned_func
enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
{
	pentry_t pte;
	int level;

	/* TODO: since we only have to query the current set of page tables,
	 * could optimize this with recursive page table mapping
	 */
	pentry_get(&level, &pte, z_x86_page_tables_get(), addr);

	if (pte == 0) {
		/* Not mapped */
		return ARCH_PAGE_LOCATION_BAD;
	}

	__ASSERT(level == PTE_LEVEL, "bigpage found at %p", addr);
	*location = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);

	if ((pte & MMU_P) != 0) {
		return ARCH_PAGE_LOCATION_PAGED_IN;
	} else {
		return ARCH_PAGE_LOCATION_PAGED_OUT;
	}
}

#ifdef CONFIG_X86_KPTI
__pinned_func
bool z_x86_kpti_is_access_ok(void *addr, pentry_t *ptables)
{
	pentry_t pte;
	int level;

	pentry_get(&level, &pte, ptables, addr);

	/* Might as well also check if it's un-mapped; normally we don't
	 * fetch the PTE from the page tables until we are inside
	 * z_page_fault() and call arch_page_fault_status_get()
	 */
	if (level != PTE_LEVEL || pte == 0 || is_flipped_pte(pte)) {
		return false;
	}

	return true;
}
#endif /* CONFIG_X86_KPTI */
#endif /* CONFIG_DEMAND_PAGING */