Commit 402b08622d9ac6e32e25289573272e0f21bb58a7
Committed by
Avi Kivity
1 parent
37817f2982
Exists in
master
and in
4 other branches
s390: KVM preparation: provide hook to enable pgstes in user pagetable
The SIE instruction on s390 uses the 2nd half of the page table page to virtualize the storage keys of a guest. This patch offers the s390_enable_sie function, which reorganizes the page tables of a single-threaded process to reserve space in the page table: s390_enable_sie makes sure that the process is single-threaded and then uses dup_mm to create a new mm with reorganized page tables. The old mm is freed, and the process now has a page status extended (pgste) field after every page table. Code that wants to exploit pgstes should SELECT CONFIG_PGSTE. This patch has a small common code hit, namely making dup_mm non-static. Edit (Carsten): I've modified Martin's patch, following Jeremy Fitzhardinge's review feedback. We now have the prototype for dup_mm in include/linux/sched.h. Following Martin's suggestion, s390_enable_sie() now calls task_lock() to prevent a race against ptrace modifying mm_users. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> Signed-off-by: Carsten Otte <cotte@de.ibm.com> Acked-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Avi Kivity <avi@qumranet.com>
Showing 8 changed files with 82 additions and 5 deletions Side-by-side Diff
arch/s390/Kconfig
arch/s390/kernel/setup.c
... | ... | @@ -316,7 +316,11 @@ |
316 | 316 | early_param("ipldelay", early_parse_ipldelay); |
317 | 317 | |
318 | 318 | #ifdef CONFIG_S390_SWITCH_AMODE |
319 | +#ifdef CONFIG_PGSTE | |
320 | +unsigned int switch_amode = 1; | |
321 | +#else | |
319 | 322 | unsigned int switch_amode = 0; |
323 | +#endif | |
320 | 324 | EXPORT_SYMBOL_GPL(switch_amode); |
321 | 325 | |
322 | 326 | static void set_amode_and_uaccess(unsigned long user_amode, |
arch/s390/mm/pgtable.c
... | ... | @@ -30,11 +30,27 @@ |
30 | 30 | #define TABLES_PER_PAGE 4 |
31 | 31 | #define FRAG_MASK 15UL |
32 | 32 | #define SECOND_HALVES 10UL |
33 | + | |
34 | +void clear_table_pgstes(unsigned long *table) | |
35 | +{ | |
36 | + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); | |
37 | + memset(table + 256, 0, PAGE_SIZE/4); | |
38 | + clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); | |
39 | + memset(table + 768, 0, PAGE_SIZE/4); | |
40 | +} | |
41 | + | |
33 | 42 | #else |
34 | 43 | #define ALLOC_ORDER 2 |
35 | 44 | #define TABLES_PER_PAGE 2 |
36 | 45 | #define FRAG_MASK 3UL |
37 | 46 | #define SECOND_HALVES 2UL |
47 | + | |
48 | +void clear_table_pgstes(unsigned long *table) | |
49 | +{ | |
50 | + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); | |
51 | + memset(table + 256, 0, PAGE_SIZE/2); | |
52 | +} | |
53 | + | |
38 | 54 | #endif |
39 | 55 | |
40 | 56 | unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) |
... | ... | @@ -153,7 +169,7 @@ |
153 | 169 | unsigned long *table; |
154 | 170 | unsigned long bits; |
155 | 171 | |
156 | - bits = mm->context.noexec ? 3UL : 1UL; | |
172 | + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; | |
157 | 173 | spin_lock(&mm->page_table_lock); |
158 | 174 | page = NULL; |
159 | 175 | if (!list_empty(&mm->context.pgtable_list)) { |
... | ... | @@ -170,7 +186,10 @@ |
170 | 186 | pgtable_page_ctor(page); |
171 | 187 | page->flags &= ~FRAG_MASK; |
172 | 188 | table = (unsigned long *) page_to_phys(page); |
173 | - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); | |
189 | + if (mm->context.pgstes) | |
190 | + clear_table_pgstes(table); | |
191 | + else | |
192 | + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); | |
174 | 193 | spin_lock(&mm->page_table_lock); |
175 | 194 | list_add(&page->lru, &mm->context.pgtable_list); |
176 | 195 | } |
... | ... | @@ -191,7 +210,7 @@ |
191 | 210 | struct page *page; |
192 | 211 | unsigned long bits; |
193 | 212 | |
194 | - bits = mm->context.noexec ? 3UL : 1UL; | |
213 | + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; | |
195 | 214 | bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); |
196 | 215 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); |
197 | 216 | spin_lock(&mm->page_table_lock); |
... | ... | @@ -228,4 +247,44 @@ |
228 | 247 | mm->context.noexec = 0; |
229 | 248 | update_mm(mm, tsk); |
230 | 249 | } |
250 | + | |
251 | +/* | |
252 | + * switch on pgstes for its userspace process (for kvm) | |
253 | + */ | |
254 | +int s390_enable_sie(void) | |
255 | +{ | |
256 | + struct task_struct *tsk = current; | |
257 | + struct mm_struct *mm; | |
258 | + int rc; | |
259 | + | |
260 | + task_lock(tsk); | |
261 | + | |
262 | + rc = 0; | |
263 | + if (tsk->mm->context.pgstes) | |
264 | + goto unlock; | |
265 | + | |
266 | + rc = -EINVAL; | |
267 | + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || | |
268 | + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) | |
269 | + goto unlock; | |
270 | + | |
271 | + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ | |
272 | + mm = dup_mm(tsk); | |
273 | + tsk->mm->context.pgstes = 0; | |
274 | + | |
275 | + rc = -ENOMEM; | |
276 | + if (!mm) | |
277 | + goto unlock; | |
278 | + mmput(tsk->mm); | |
279 | + tsk->mm = tsk->active_mm = mm; | |
280 | + preempt_disable(); | |
281 | + update_mm(mm, tsk); | |
282 | + cpu_set(smp_processor_id(), mm->cpu_vm_mask); | |
283 | + preempt_enable(); | |
284 | + rc = 0; | |
285 | +unlock: | |
286 | + task_unlock(tsk); | |
287 | + return rc; | |
288 | +} | |
289 | +EXPORT_SYMBOL_GPL(s390_enable_sie); |
include/asm-s390/mmu.h
include/asm-s390/mmu_context.h
... | ... | @@ -20,7 +20,13 @@ |
20 | 20 | #ifdef CONFIG_64BIT |
21 | 21 | mm->context.asce_bits |= _ASCE_TYPE_REGION3; |
22 | 22 | #endif |
23 | - mm->context.noexec = s390_noexec; | |
23 | + if (current->mm->context.pgstes) { | |
24 | + mm->context.noexec = 0; | |
25 | + mm->context.pgstes = 1; | |
26 | + } else { | |
27 | + mm->context.noexec = s390_noexec; | |
28 | + mm->context.pgstes = 0; | |
29 | + } | |
24 | 30 | mm->context.asce_limit = STACK_TOP_MAX; |
25 | 31 | crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); |
26 | 32 | return 0; |
include/asm-s390/pgtable.h
include/linux/sched.h
... | ... | @@ -1798,6 +1798,8 @@ |
1798 | 1798 | extern struct mm_struct *get_task_mm(struct task_struct *task); |
1799 | 1799 | /* Remove the current tasks stale references to the old mm_struct */ |
1800 | 1800 | extern void mm_release(struct task_struct *, struct mm_struct *); |
1801 | +/* Allocate a new mm structure and copy contents from tsk->mm */ | |
1802 | +extern struct mm_struct *dup_mm(struct task_struct *tsk); | |
1801 | 1803 | |
1802 | 1804 | extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); |
1803 | 1805 | extern void flush_thread(void); |
kernel/fork.c
... | ... | @@ -521,7 +521,7 @@ |
521 | 521 | * Allocate a new mm structure and copy contents from the |
522 | 522 | * mm structure of the passed in task structure. |
523 | 523 | */ |
524 | -static struct mm_struct *dup_mm(struct task_struct *tsk) | |
524 | +struct mm_struct *dup_mm(struct task_struct *tsk) | |
525 | 525 | { |
526 | 526 | struct mm_struct *mm, *oldmm = current->mm; |
527 | 527 | int err; |