Commit e76e82d772522b05ed93228478d2a4460754b6a4

Authored by Heiko Carstens
Committed by Martin Schwidefsky
Parent: 51eee033dc

s390/mm: add page table dumper

This is more or less the same as the x86 page table dumper, which was
merged four years ago: 926e5392 "x86: add code to dump the (kernel)
page tables for visual inspection by kernel developers".

We add a file at /sys/kernel/debug/kernel_page_tables for debugging
purposes, which makes it quite easy to see the kernel page table layout
and possible odd mappings:

---[ Identity Mapping ]---
0x0000000000000000-0x0000000000100000        1M PTE RW
---[ Kernel Image Start ]---
0x0000000000100000-0x0000000000800000        7M PMD RO
0x0000000000800000-0x00000000008a9000      676K PTE RO
0x00000000008a9000-0x0000000000900000      348K PTE RW
0x0000000000900000-0x0000000001500000       12M PMD RW
---[ Kernel Image End ]---
0x0000000001500000-0x0000000280000000    10219M PMD RW
0x0000000280000000-0x000003d280000000     3904G PUD I
---[ vmemmap Area ]---
0x000003d280000000-0x000003d288c00000      140M PTE RW
0x000003d288c00000-0x000003d300000000     1908M PMD I
0x000003d300000000-0x000003e000000000       52G PUD I
---[ vmalloc Area ]---
0x000003e000000000-0x000003e000009000       36K PTE RW
0x000003e000009000-0x000003e0000ee000      916K PTE I
0x000003e0000ee000-0x000003e000146000      352K PTE RW
0x000003e000146000-0x000003e000200000      744K PTE I
0x000003e000200000-0x000003e080000000     2046M PMD I
0x000003e080000000-0x0000040000000000      126G PUD I
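
The file can be read like any other debugfs file. For reference, a minimal
userspace reader might look like the sketch below (this assumes debugfs is
mounted at the usual /sys/kernel/debug location; since the file is created
with mode 0400, it needs to run as root):

  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/sys/kernel/debug/kernel_page_tables", "r");
          char line[256];

          if (!f) {
                  perror("fopen");
                  return 1;
          }
          /* each read regenerates the dump via the seq_file walker */
          while (fgets(line, sizeof(line), f))
                  fputs(line, stdout);
          fclose(f);
          return 0;
  }

Of course a plain "cat /sys/kernel/debug/kernel_page_tables" does the
same job.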

This usually only makes sense for kernel developers. The output with
CONFIG_DEBUG_PAGEALLOC enabled is not very helpful because of the huge
number of mapped-out pages; however, for the time being I decided not to
add a !DEBUG_PAGEALLOC dependency.
Maybe it's still helpful for somebody even with that option.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

Showing 3 changed files with 232 additions and 0 deletions

arch/s390/Kconfig.debug
@@ -31,6 +31,18 @@
 
 	  If unsure, or if you run an older (pre 4.4) gcc, say N.
 
+config S390_PTDUMP
+	bool "Export kernel pagetable layout to userspace via debugfs"
+	depends on DEBUG_KERNEL
+	select DEBUG_FS
+	---help---
+	  Say Y here if you want to show the kernel pagetable layout in a
+	  debugfs file. This information is only useful for kernel developers
+	  who are working in architecture specific areas of the kernel.
+	  It is probably not a good idea to enable this feature in a production
+	  kernel.
+	  If in doubt, say "N"
+
 config DEBUG_SET_MODULE_RONX
 	def_bool y
 	depends on MODULES
arch/s390/mm/Makefile
@@ -7,4 +7,5 @@
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_DEBUG_SET_MODULE_RONX)	+= pageattr.o
+obj-$(CONFIG_S390_PTDUMP)	+= dump_pagetables.o
arch/s390/mm/dump_pagetables.c
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+
+static unsigned long max_addr;
+
+struct addr_marker {
+	unsigned long start_address;
+	const char *name;
+};
+
+enum address_markers_idx {
+	IDENTITY_NR = 0,
+	KERNEL_START_NR,
+	KERNEL_END_NR,
+	VMEMMAP_NR,
+	VMALLOC_NR,
+};
+
+static struct addr_marker address_markers[] = {
+	[IDENTITY_NR]	  = {0, "Identity Mapping"},
+	[KERNEL_START_NR] = {(unsigned long)&_stext, "Kernel Image Start"},
+	[KERNEL_END_NR]	  = {(unsigned long)&_end, "Kernel Image End"},
+	[VMEMMAP_NR]	  = {0, "vmemmap Area"},
+	[VMALLOC_NR]	  = {0, "vmalloc Area"},
+	{ -1, NULL }
+};
+
+struct pg_state {
+	int level;
+	unsigned int current_prot;
+	unsigned long start_address;
+	unsigned long current_address;
+	const struct addr_marker *marker;
+};
+
+static void print_prot(struct seq_file *m, unsigned int pr, int level)
+{
+	static const char * const level_name[] =
+		{ "ASCE", "PGD", "PUD", "PMD", "PTE" };
+
+	seq_printf(m, "%s ", level_name[level]);
+	if (pr & _PAGE_INVALID)
+		seq_printf(m, "I\n");
+	else
+		seq_printf(m, "%s\n", pr & _PAGE_RO ? "RO" : "RW");
+}
+
+static void note_page(struct seq_file *m, struct pg_state *st,
+		      unsigned int new_prot, int level)
+{
+	static const char units[] = "KMGTPE";
+	int width = sizeof(unsigned long) * 2;
+	const char *unit = units;
+	unsigned int prot, cur;
+	unsigned long delta;
+
+	/*
+	 * If we have a "break" in the series, we need to flush the state
+	 * that we have now. "break" is either changing perms, levels or
+	 * address space marker.
+	 */
+	prot = new_prot;
+	cur = st->current_prot;
+
+	if (!st->level) {
+		/* First entry */
+		st->current_prot = new_prot;
+		st->level = level;
+		st->marker = address_markers;
+		seq_printf(m, "---[ %s ]---\n", st->marker->name);
+	} else if (prot != cur || level != st->level ||
+		   st->current_address >= st->marker[1].start_address) {
+		/* Print the actual finished series */
+		seq_printf(m, "0x%0*lx-0x%0*lx",
+			   width, st->start_address,
+			   width, st->current_address);
+		delta = (st->current_address - st->start_address) >> 10;
+		while (!(delta & 0x3ff) && unit[1]) {
+			delta >>= 10;
+			unit++;
+		}
+		seq_printf(m, "%9lu%c ", delta, *unit);
+		print_prot(m, st->current_prot, st->level);
+		if (st->current_address >= st->marker[1].start_address) {
+			st->marker++;
+			seq_printf(m, "---[ %s ]---\n", st->marker->name);
+		}
+		st->start_address = st->current_address;
+		st->current_prot = new_prot;
+		st->level = level;
+	}
+}
+
+/*
+ * The actual page table walker functions. In order to keep the implementation
+ * of print_prot() short, we only check and pass _PAGE_INVALID and _PAGE_RO
+ * flags to note_page() if a region, segment or page table entry is invalid or
+ * read-only.
+ * After all it's just a hint that the current level being walked contains an
+ * invalid or read-only entry.
+ */
+static void walk_pte_level(struct seq_file *m, struct pg_state *st,
+			   pmd_t *pmd, unsigned long addr)
+{
+	unsigned int prot;
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) {
+		st->current_address = addr;
+		pte = pte_offset_kernel(pmd, addr);
+		prot = pte_val(*pte) & (_PAGE_RO | _PAGE_INVALID);
+		note_page(m, st, prot, 4);
+		addr += PAGE_SIZE;
+	}
+}
+
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
+			   pud_t *pud, unsigned long addr)
+{
+	unsigned int prot;
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) {
+		st->current_address = addr;
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_none(*pmd)) {
+			if (pmd_large(*pmd)) {
+				prot = pmd_val(*pmd) & _SEGMENT_ENTRY_RO;
+				note_page(m, st, prot, 3);
+			} else
+				walk_pte_level(m, st, pmd, addr);
+		} else
+			note_page(m, st, _PAGE_INVALID, 3);
+		addr += PMD_SIZE;
+	}
+}
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st,
+			   pgd_t *pgd, unsigned long addr)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) {
+		st->current_address = addr;
+		pud = pud_offset(pgd, addr);
+		if (!pud_none(*pud))
+			walk_pmd_level(m, st, pud, addr);
+		else
+			note_page(m, st, _PAGE_INVALID, 2);
+		addr += PUD_SIZE;
+	}
+}
+
+static void walk_pgd_level(struct seq_file *m)
+{
+	unsigned long addr = 0;
+	struct pg_state st;
+	pgd_t *pgd;
+	int i;
+
+	memset(&st, 0, sizeof(st));
+	for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) {
+		st.current_address = addr;
+		pgd = pgd_offset_k(addr);
+		if (!pgd_none(*pgd))
+			walk_pud_level(m, &st, pgd, addr);
+		else
+			note_page(m, &st, _PAGE_INVALID, 1);
+		addr += PGDIR_SIZE;
+	}
+	/* Flush out the last page */
+	st.current_address = max_addr;
+	note_page(m, &st, 0, 0);
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+	walk_pgd_level(m);
+	return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+	.open		= ptdump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int pt_dump_init(void)
+{
+	/*
+	 * Figure out the maximum virtual address being accessible with the
+	 * kernel ASCE. We need this to keep the page table walker functions
+	 * from accessing non-existent entries.
+	 */
+#ifdef CONFIG_64BIT
+	max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
+	max_addr = 1UL << (max_addr * 11 + 31);
+#else
+	max_addr = 1UL << 31;
+#endif
+	address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
+	address_markers[VMALLOC_NR].start_address = VMALLOC_START;
+	debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+	return 0;
+}
+device_initcall(pt_dump_init);
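
As an aside, the max_addr computation in pt_dump_init() works because the
ASCE designation type directly encodes the depth of the page table
hierarchy: a segment table covers 2^31 bytes (2G), and each additional
region table level multiplies the addressable range by 2048, i.e. adds 11
bits. Below is a small standalone sketch of the same arithmetic; the
table-type names for the values 0-3 are assumed from the z/Architecture
ASCE definition, not taken from this patch:

  #include <stdio.h>

  int main(void)
  {
          /* designation type as extracted by
           * (kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2 */
          static const char * const table[] = {
                  "segment table", "region-third table",
                  "region-second table", "region-first table",
          };
          unsigned long type;

          for (type = 0; type < 4; type++) {
                  /* same formula as pt_dump_init(): 31 bits plus 11 per level */
                  unsigned int bits = type * 11 + 31;

                  printf("type %lu (%s): addresses up to 2^%u\n",
                         type, table[type], bits);
          }
          return 0;
  }

(The sketch prints bit counts rather than computing 1UL << bits, since a
shift by 64 is undefined in standard C.)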