Commit 402b08622d9ac6e32e25289573272e0f21bb58a7

Authored by Carsten Otte
Committed by Avi Kivity
1 parent 37817f2982

s390: KVM preparation: provide hook to enable pgstes in user pagetable

The SIE instruction on s390 uses the 2nd half of the page table page to
virtualize the storage keys of a guest. This patch offers the s390_enable_sie
function, which reorganizes the page tables of a single-threaded process to
reserve space in the page table:
s390_enable_sie makes sure that the process is single threaded and then uses
dup_mm to create a new mm with reorganized page tables. The old mm is freed
and the process now has a page status extended field after every page table.

Code that wants to exploit pgstes should select CONFIG_PGSTE in its Kconfig entry.

This patch has a small common-code impact, namely making dup_mm non-static.

Edit (Carsten): I've modified Martin's patch, following Jeremy Fitzhardinge's
review feedback. Now we do have the prototype for dup_mm in
include/linux/sched.h. Following Martin's suggestion, s390_enable_sie() does now
call task_lock() to prevent race against ptrace modification of mm_users.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>

Showing 8 changed files with 82 additions and 5 deletions Side-by-side Diff

... ... @@ -62,6 +62,10 @@
62 62 default y
63 63 depends on SMP && PREEMPT
64 64  
  65 +config PGSTE
  66 + bool
  67 + default y if KVM
  68 +
65 69 mainmenu "Linux Kernel Configuration"
66 70  
67 71 config S390
arch/s390/kernel/setup.c
... ... @@ -316,7 +316,11 @@
316 316 early_param("ipldelay", early_parse_ipldelay);
317 317  
318 318 #ifdef CONFIG_S390_SWITCH_AMODE
  319 +#ifdef CONFIG_PGSTE
  320 +unsigned int switch_amode = 1;
  321 +#else
319 322 unsigned int switch_amode = 0;
  323 +#endif
320 324 EXPORT_SYMBOL_GPL(switch_amode);
321 325  
322 326 static void set_amode_and_uaccess(unsigned long user_amode,
arch/s390/mm/pgtable.c
... ... @@ -30,11 +30,27 @@
30 30 #define TABLES_PER_PAGE 4
31 31 #define FRAG_MASK 15UL
32 32 #define SECOND_HALVES 10UL
  33 +
  34 +void clear_table_pgstes(unsigned long *table)
  35 +{
  36 + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
  37 + memset(table + 256, 0, PAGE_SIZE/4);
  38 + clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
  39 + memset(table + 768, 0, PAGE_SIZE/4);
  40 +}
  41 +
33 42 #else
34 43 #define ALLOC_ORDER 2
35 44 #define TABLES_PER_PAGE 2
36 45 #define FRAG_MASK 3UL
37 46 #define SECOND_HALVES 2UL
  47 +
  48 +void clear_table_pgstes(unsigned long *table)
  49 +{
  50 + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
  51 + memset(table + 256, 0, PAGE_SIZE/2);
  52 +}
  53 +
38 54 #endif
39 55  
40 56 unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
... ... @@ -153,7 +169,7 @@
153 169 unsigned long *table;
154 170 unsigned long bits;
155 171  
156   - bits = mm->context.noexec ? 3UL : 1UL;
  172 + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
157 173 spin_lock(&mm->page_table_lock);
158 174 page = NULL;
159 175 if (!list_empty(&mm->context.pgtable_list)) {
... ... @@ -170,7 +186,10 @@
170 186 pgtable_page_ctor(page);
171 187 page->flags &= ~FRAG_MASK;
172 188 table = (unsigned long *) page_to_phys(page);
173   - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
  189 + if (mm->context.pgstes)
  190 + clear_table_pgstes(table);
  191 + else
  192 + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
174 193 spin_lock(&mm->page_table_lock);
175 194 list_add(&page->lru, &mm->context.pgtable_list);
176 195 }
... ... @@ -191,7 +210,7 @@
191 210 struct page *page;
192 211 unsigned long bits;
193 212  
194   - bits = mm->context.noexec ? 3UL : 1UL;
  213 + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
195 214 bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
196 215 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
197 216 spin_lock(&mm->page_table_lock);
... ... @@ -228,4 +247,44 @@
228 247 mm->context.noexec = 0;
229 248 update_mm(mm, tsk);
230 249 }
  250 +
  251 +/*
  252 + * switch on pgstes for its userspace process (for kvm)
  253 + */
  254 +int s390_enable_sie(void)
  255 +{
  256 + struct task_struct *tsk = current;
  257 + struct mm_struct *mm;
  258 + int rc;
  259 +
  260 + task_lock(tsk);
  261 +
  262 + rc = 0;
  263 + if (tsk->mm->context.pgstes)
  264 + goto unlock;
  265 +
  266 + rc = -EINVAL;
  267 + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
  268 + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list)
  269 + goto unlock;
  270 +
  271 + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */
  272 + mm = dup_mm(tsk);
  273 + tsk->mm->context.pgstes = 0;
  274 +
  275 + rc = -ENOMEM;
  276 + if (!mm)
  277 + goto unlock;
  278 + mmput(tsk->mm);
  279 + tsk->mm = tsk->active_mm = mm;
  280 + preempt_disable();
  281 + update_mm(mm, tsk);
  282 + cpu_set(smp_processor_id(), mm->cpu_vm_mask);
  283 + preempt_enable();
  284 + rc = 0;
  285 +unlock:
  286 + task_unlock(tsk);
  287 + return rc;
  288 +}
  289 +EXPORT_SYMBOL_GPL(s390_enable_sie);
include/asm-s390/mmu.h
... ... @@ -7,6 +7,7 @@
7 7 unsigned long asce_bits;
8 8 unsigned long asce_limit;
9 9 int noexec;
  10 + int pgstes;
10 11 } mm_context_t;
11 12  
12 13 #endif
include/asm-s390/mmu_context.h
... ... @@ -20,7 +20,13 @@
20 20 #ifdef CONFIG_64BIT
21 21 mm->context.asce_bits |= _ASCE_TYPE_REGION3;
22 22 #endif
23   - mm->context.noexec = s390_noexec;
  23 + if (current->mm->context.pgstes) {
  24 + mm->context.noexec = 0;
  25 + mm->context.pgstes = 1;
  26 + } else {
  27 + mm->context.noexec = s390_noexec;
  28 + mm->context.pgstes = 0;
  29 + }
24 30 mm->context.asce_limit = STACK_TOP_MAX;
25 31 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
26 32 return 0;
include/asm-s390/pgtable.h
... ... @@ -966,6 +966,7 @@
966 966  
967 967 extern int add_shared_memory(unsigned long start, unsigned long size);
968 968 extern int remove_shared_memory(unsigned long start, unsigned long size);
  969 +extern int s390_enable_sie(void);
969 970  
970 971 /*
971 972 * No page table caches to initialise
include/linux/sched.h
... ... @@ -1798,6 +1798,8 @@
1798 1798 extern struct mm_struct *get_task_mm(struct task_struct *task);
1799 1799 /* Remove the current tasks stale references to the old mm_struct */
1800 1800 extern void mm_release(struct task_struct *, struct mm_struct *);
  1801 +/* Allocate a new mm structure and copy contents from tsk->mm */
  1802 +extern struct mm_struct *dup_mm(struct task_struct *tsk);
1801 1803  
1802 1804 extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
1803 1805 extern void flush_thread(void);
... ... @@ -521,7 +521,7 @@
521 521 * Allocate a new mm structure and copy contents from the
522 522 * mm structure of the passed in task structure.
523 523 */
524   -static struct mm_struct *dup_mm(struct task_struct *tsk)
  524 +struct mm_struct *dup_mm(struct task_struct *tsk)
525 525 {
526 526 struct mm_struct *mm, *oldmm = current->mm;
527 527 int err;