Commit e76e82d772522b05ed93228478d2a4460754b6a4
Committed by
Martin Schwidefsky
1 parent
51eee033dc
Exists in
smarc-l5.0.0_1.0.0-ga
and in
5 other branches
s390/mm: add page table dumper
This is more or less the same as the x86 page table dumper which was merged four years ago: 926e5392 "x86: add code to dump the (kernel) page tables for visual inspection by kernel developers".

We add a file at /sys/kernel/debug/kernel_page_tables for debugging purposes so it's quite easy to see the kernel page table layout and possible odd mappings:

---[ Identity Mapping ]---
0x0000000000000000-0x0000000000100000        1M PTE RW
---[ Kernel Image Start ]---
0x0000000000100000-0x0000000000800000        7M PMD RO
0x0000000000800000-0x00000000008a9000      676K PTE RO
0x00000000008a9000-0x0000000000900000      348K PTE RW
0x0000000000900000-0x0000000001500000       12M PMD RW
---[ Kernel Image End ]---
0x0000000001500000-0x0000000280000000    10219M PMD RW
0x0000000280000000-0x000003d280000000     3904G PUD I
---[ vmemmap Area ]---
0x000003d280000000-0x000003d288c00000      140M PTE RW
0x000003d288c00000-0x000003d300000000     1908M PMD I
0x000003d300000000-0x000003e000000000       52G PUD I
---[ vmalloc Area ]---
0x000003e000000000-0x000003e000009000       36K PTE RW
0x000003e000009000-0x000003e0000ee000      916K PTE I
0x000003e0000ee000-0x000003e000146000      352K PTE RW
0x000003e000146000-0x000003e000200000      744K PTE I
0x000003e000200000-0x000003e080000000     2046M PMD I
0x000003e080000000-0x0000040000000000      126G PUD I

This usually only makes sense for kernel developers. The output with CONFIG_DEBUG_PAGEALLOC is not very helpful, because of the huge number of mapped out pages, however I decided for the time being to not add a !DEBUG_PAGEALLOC dependency. Maybe it's helpful for somebody even with that option.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Showing 3 changed files with 232 additions and 0 deletions Side-by-side Diff
arch/s390/Kconfig.debug
... | ... | @@ -31,6 +31,18 @@ |
31 | 31 | |
32 | 32 | If unsure, or if you run an older (pre 4.4) gcc, say N. |
33 | 33 | |
34 | +config S390_PTDUMP | |
35 | + bool "Export kernel pagetable layout to userspace via debugfs" | |
36 | + depends on DEBUG_KERNEL | |
37 | + select DEBUG_FS | |
38 | + ---help--- | |
39 | + Say Y here if you want to show the kernel pagetable layout in a | |
40 | + debugfs file. This information is only useful for kernel developers | |
41 | + who are working in architecture specific areas of the kernel. | |
42 | + It is probably not a good idea to enable this feature in a production | |
43 | + kernel. | |
44 | + If in doubt, say "N" | |
45 | + | |
34 | 46 | config DEBUG_SET_MODULE_RONX |
35 | 47 | def_bool y |
36 | 48 | depends on MODULES |
arch/s390/mm/Makefile
arch/s390/mm/dump_pagetables.c
1 | +#include <linux/seq_file.h> | |
2 | +#include <linux/debugfs.h> | |
3 | +#include <linux/module.h> | |
4 | +#include <linux/mm.h> | |
5 | +#include <asm/sections.h> | |
6 | +#include <asm/pgtable.h> | |
7 | + | |
8 | +static unsigned long max_addr; | |
9 | + | |
10 | +struct addr_marker { | |
11 | + unsigned long start_address; | |
12 | + const char *name; | |
13 | +}; | |
14 | + | |
15 | +enum address_markers_idx { | |
16 | + IDENTITY_NR = 0, | |
17 | + KERNEL_START_NR, | |
18 | + KERNEL_END_NR, | |
19 | + VMEMMAP_NR, | |
20 | + VMALLOC_NR, | |
21 | +}; | |
22 | + | |
23 | +static struct addr_marker address_markers[] = { | |
24 | + [IDENTITY_NR] = {0, "Identity Mapping"}, | |
25 | + [KERNEL_START_NR] = {(unsigned long)&_stext, "Kernel Image Start"}, | |
26 | + [KERNEL_END_NR] = {(unsigned long)&_end, "Kernel Image End"}, | |
27 | + [VMEMMAP_NR] = {0, "vmemmap Area"}, | |
28 | + [VMALLOC_NR] = {0, "vmalloc Area"}, | |
29 | + { -1, NULL } | |
30 | +}; | |
31 | + | |
32 | +struct pg_state { | |
33 | + int level; | |
34 | + unsigned int current_prot; | |
35 | + unsigned long start_address; | |
36 | + unsigned long current_address; | |
37 | + const struct addr_marker *marker; | |
38 | +}; | |
39 | + | |
40 | +static void print_prot(struct seq_file *m, unsigned int pr, int level) | |
41 | +{ | |
42 | + static const char * const level_name[] = | |
43 | + { "ASCE", "PGD", "PUD", "PMD", "PTE" }; | |
44 | + | |
45 | + seq_printf(m, "%s ", level_name[level]); | |
46 | + if (pr & _PAGE_INVALID) | |
47 | + seq_printf(m, "I\n"); | |
48 | + else | |
49 | + seq_printf(m, "%s\n", pr & _PAGE_RO ? "RO" : "RW"); | |
50 | +} | |
51 | + | |
52 | +static void note_page(struct seq_file *m, struct pg_state *st, | |
53 | + unsigned int new_prot, int level) | |
54 | +{ | |
55 | + static const char units[] = "KMGTPE"; | |
56 | + int width = sizeof(unsigned long) * 2; | |
57 | + const char *unit = units; | |
58 | + unsigned int prot, cur; | |
59 | + unsigned long delta; | |
60 | + | |
61 | + /* | |
62 | + * If we have a "break" in the series, we need to flush the state | |
63 | + * that we have now. "break" is either changing perms, levels or | |
64 | + * address space marker. | |
65 | + */ | |
66 | + prot = new_prot; | |
67 | + cur = st->current_prot; | |
68 | + | |
69 | + if (!st->level) { | |
70 | + /* First entry */ | |
71 | + st->current_prot = new_prot; | |
72 | + st->level = level; | |
73 | + st->marker = address_markers; | |
74 | + seq_printf(m, "---[ %s ]---\n", st->marker->name); | |
75 | + } else if (prot != cur || level != st->level || | |
76 | + st->current_address >= st->marker[1].start_address) { | |
77 | + /* Print the actual finished series */ | |
78 | + seq_printf(m, "0x%0*lx-0x%0*lx", | |
79 | + width, st->start_address, | |
80 | + width, st->current_address); | |
81 | + delta = (st->current_address - st->start_address) >> 10; | |
82 | + while (!(delta & 0x3ff) && unit[1]) { | |
83 | + delta >>= 10; | |
84 | + unit++; | |
85 | + } | |
86 | + seq_printf(m, "%9lu%c ", delta, *unit); | |
87 | + print_prot(m, st->current_prot, st->level); | |
88 | + if (st->current_address >= st->marker[1].start_address) { | |
89 | + st->marker++; | |
90 | + seq_printf(m, "---[ %s ]---\n", st->marker->name); | |
91 | + } | |
92 | + st->start_address = st->current_address; | |
93 | + st->current_prot = new_prot; | |
94 | + st->level = level; | |
95 | + } | |
96 | +} | |
97 | + | |
98 | +/* | |
99 | + * The actual page table walker functions. In order to keep the implementation | |
100 | + * of print_prot() short, we only check and pass _PAGE_INVALID and _PAGE_RO | |
101 | + * flags to note_page() if a region, segment or page table entry is invalid or | |
102 | + * read-only. | |
103 | + * After all it's just a hint that the current level being walked contains an | |
104 | + * invalid or read-only entry. | |
105 | + */ | |
106 | +static void walk_pte_level(struct seq_file *m, struct pg_state *st, | |
107 | + pmd_t *pmd, unsigned long addr) | |
108 | +{ | |
109 | + unsigned int prot; | |
110 | + pte_t *pte; | |
111 | + int i; | |
112 | + | |
113 | + for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { | |
114 | + st->current_address = addr; | |
115 | + pte = pte_offset_kernel(pmd, addr); | |
116 | + prot = pte_val(*pte) & (_PAGE_RO | _PAGE_INVALID); | |
117 | + note_page(m, st, prot, 4); | |
118 | + addr += PAGE_SIZE; | |
119 | + } | |
120 | +} | |
121 | + | |
122 | +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, | |
123 | + pud_t *pud, unsigned long addr) | |
124 | +{ | |
125 | + unsigned int prot; | |
126 | + pmd_t *pmd; | |
127 | + int i; | |
128 | + | |
129 | + for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) { | |
130 | + st->current_address = addr; | |
131 | + pmd = pmd_offset(pud, addr); | |
132 | + if (!pmd_none(*pmd)) { | |
133 | + if (pmd_large(*pmd)) { | |
134 | + prot = pmd_val(*pmd) & _SEGMENT_ENTRY_RO; | |
135 | + note_page(m, st, prot, 3); | |
136 | + } else | |
137 | + walk_pte_level(m, st, pmd, addr); | |
138 | + } else | |
139 | + note_page(m, st, _PAGE_INVALID, 3); | |
140 | + addr += PMD_SIZE; | |
141 | + } | |
142 | +} | |
143 | + | |
144 | +static void walk_pud_level(struct seq_file *m, struct pg_state *st, | |
145 | + pgd_t *pgd, unsigned long addr) | |
146 | +{ | |
147 | + pud_t *pud; | |
148 | + int i; | |
149 | + | |
150 | + for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) { | |
151 | + st->current_address = addr; | |
152 | + pud = pud_offset(pgd, addr); | |
153 | + if (!pud_none(*pud)) | |
154 | + walk_pmd_level(m, st, pud, addr); | |
155 | + else | |
156 | + note_page(m, st, _PAGE_INVALID, 2); | |
157 | + addr += PUD_SIZE; | |
158 | + } | |
159 | +} | |
160 | + | |
161 | +static void walk_pgd_level(struct seq_file *m) | |
162 | +{ | |
163 | + unsigned long addr = 0; | |
164 | + struct pg_state st; | |
165 | + pgd_t *pgd; | |
166 | + int i; | |
167 | + | |
168 | + memset(&st, 0, sizeof(st)); | |
169 | + for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { | |
170 | + st.current_address = addr; | |
171 | + pgd = pgd_offset_k(addr); | |
172 | + if (!pgd_none(*pgd)) | |
173 | + walk_pud_level(m, &st, pgd, addr); | |
174 | + else | |
175 | + note_page(m, &st, _PAGE_INVALID, 1); | |
176 | + addr += PGDIR_SIZE; | |
177 | + } | |
178 | + /* Flush out the last page */ | |
179 | + st.current_address = max_addr; | |
180 | + note_page(m, &st, 0, 0); | |
181 | +} | |
182 | + | |
183 | +static int ptdump_show(struct seq_file *m, void *v) | |
184 | +{ | |
185 | + walk_pgd_level(m); | |
186 | + return 0; | |
187 | +} | |
188 | + | |
189 | +static int ptdump_open(struct inode *inode, struct file *filp) | |
190 | +{ | |
191 | + return single_open(filp, ptdump_show, NULL); | |
192 | +} | |
193 | + | |
194 | +static const struct file_operations ptdump_fops = { | |
195 | + .open = ptdump_open, | |
196 | + .read = seq_read, | |
197 | + .llseek = seq_lseek, | |
198 | + .release = single_release, | |
199 | +}; | |
200 | + | |
201 | +static int pt_dump_init(void) | |
202 | +{ | |
203 | + /* | |
204 | + * Figure out the maximum virtual address being accessible with the | |
205 | + * kernel ASCE. We need this to keep the page table walker functions | |
206 | + * from accessing non-existent entries. | |
207 | + */ | |
208 | +#ifdef CONFIG_64BIT | |
209 | + max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; | |
210 | + max_addr = 1UL << (max_addr * 11 + 31); | |
211 | +#else | |
212 | + max_addr = 1UL << 31; | |
213 | +#endif | |
214 | + address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; | |
215 | + address_markers[VMALLOC_NR].start_address = VMALLOC_START; | |
216 | + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); | |
217 | + return 0; | |
218 | +} | |
219 | +device_initcall(pt_dump_init); |