Commit 8bccd85ffbaf8ff1448d1235fa6594e207695531
Committed by: Linus Torvalds
Parent: bb7e7e032d
Exists in: master and 40 other branches
[PATCH] Implement sys_* do_* layering in the memory policy layer.
- Do a separation between do_xxx and sys_xxx functions. sys_xxx functions
  take variable sized bitmaps from user space as arguments. do_xxx functions
  take fixed sized nodemask_t as arguments and may be used from inside the
  kernel. Doing so simplifies the initialization code. There is no
  fs = KERNEL_DS assumption anymore.

- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy (which restricts the nodes to those accessible to
  the task and updates cpusets).

- Add comments explaining the limitations of the bind policy.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
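The resulting layering can be condensed as follows. This is a sketch of the pattern only, with error paths elided; it mirrors the sys_set_mempolicy/do_set_mempolicy pair in the diff below:

	/* Kernel-internal entry point: takes a fixed-size nodemask_t, so
	 * in-kernel callers such as numa_policy_init() need no user-space
	 * bitmap and no set_fs(KERNEL_DS) trickery. */
	long do_set_mempolicy(int mode, nodemask_t *nodes);

	/* User-space entry point: copies a variable-sized bitmap from user
	 * space into a fixed-size nodemask_t, then delegates. */
	asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
					  unsigned long maxnode)
	{
		int err;
		nodemask_t nodes;

		if (mode < 0 || mode > MPOL_MAX)
			return -EINVAL;
		err = get_nodes(&nodes, nmask, maxnode);  /* user -> kernel copy */
		if (err)
			return err;
		return do_set_mempolicy(mode, &nodes);    /* policy logic proper */
	}

sys_mbind/do_mbind and sys_get_mempolicy/do_get_mempolicy follow the same split, with contextualize_policy() applying the cpuset restrictions on the do_* side.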
Showing 1 changed file with 162 additions and 114 deletions (side-by-side diff)
mm/mempolicy.c
... | ... | @@ -2,6 +2,7 @@ |
2 | 2 | * Simple NUMA memory policy for the Linux kernel. |
3 | 3 | * |
4 | 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. |
5 | + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. | |
5 | 6 | * Subject to the GNU Public License, version 2. |
6 | 7 | * |
7 | 8 | * NUMA policy allows the user to give hints in which node(s) memory should |
... | ... | @@ -17,13 +18,19 @@ |
17 | 18 | * offset into the backing object or offset into the mapping |
18 | 19 | * for anonymous memory. For process policy an process counter |
19 | 20 | * is used. |
21 | + * | |
20 | 22 | * bind Only allocate memory on a specific set of nodes, |
21 | 23 | * no fallback. |
24 | + * FIXME: memory is allocated starting with the first node | |
25 | + * to the last. It would be better if bind would truly restrict | |
26 | + * the allocation to memory nodes instead | |
27 | + * | |
22 | 28 | * preferred Try a specific node first before normal fallback. |
23 | 29 | * As a special case node -1 here means do the allocation |
24 | 30 | * on the local CPU. This is normally identical to default, |
25 | 31 | * but useful to set in a VMA when you have a non default |
26 | 32 | * process policy. |
33 | + * | |
27 | 34 | * default Allocate on the local node first, or when on a VMA |
28 | 35 | * use the process policy. This is what Linux always did |
29 | 36 | * in a NUMA aware kernel and still does by, ahem, default. |
... | ... | @@ -113,56 +120,6 @@ |
113 | 120 | } |
114 | 121 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; |
115 | 122 | } |
116 | - | |
117 | -/* Copy a node mask from user space. */ | |
118 | -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | |
119 | - unsigned long maxnode, int mode) | |
120 | -{ | |
121 | - unsigned long k; | |
122 | - unsigned long nlongs; | |
123 | - unsigned long endmask; | |
124 | - | |
125 | - --maxnode; | |
126 | - nodes_clear(*nodes); | |
127 | - if (maxnode == 0 || !nmask) | |
128 | - return 0; | |
129 | - | |
130 | - nlongs = BITS_TO_LONGS(maxnode); | |
131 | - if ((maxnode % BITS_PER_LONG) == 0) | |
132 | - endmask = ~0UL; | |
133 | - else | |
134 | - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | |
135 | - | |
136 | - /* When the user specified more nodes than supported just check | |
137 | - if the non supported part is all zero. */ | |
138 | - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | |
139 | - if (nlongs > PAGE_SIZE/sizeof(long)) | |
140 | - return -EINVAL; | |
141 | - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | |
142 | - unsigned long t; | |
143 | - if (get_user(t, nmask + k)) | |
144 | - return -EFAULT; | |
145 | - if (k == nlongs - 1) { | |
146 | - if (t & endmask) | |
147 | - return -EINVAL; | |
148 | - } else if (t) | |
149 | - return -EINVAL; | |
150 | - } | |
151 | - nlongs = BITS_TO_LONGS(MAX_NUMNODES); | |
152 | - endmask = ~0UL; | |
153 | - } | |
154 | - | |
155 | - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) | |
156 | - return -EFAULT; | |
157 | - nodes_addr(*nodes)[nlongs-1] &= endmask; | |
158 | - /* Update current mems_allowed */ | |
159 | - cpuset_update_current_mems_allowed(); | |
160 | - /* Ignore nodes not set in current->mems_allowed */ | |
161 | - /* AK: shouldn't this error out instead? */ | |
162 | - cpuset_restrict_to_mems_allowed(nodes_addr(*nodes)); | |
163 | - return mpol_check_policy(mode, nodes); | |
164 | -} | |
165 | - | |
166 | 123 | /* Generate a custom zonelist for the BIND policy. */ |
167 | 124 | static struct zonelist *bind_zonelist(nodemask_t *nodes) |
168 | 125 | { |
... | ... | @@ -380,17 +337,25 @@ |
380 | 337 | return err; |
381 | 338 | } |
382 | 339 | |
383 | -/* Change policy for a memory range */ | |
384 | -asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |
385 | - unsigned long mode, | |
386 | - unsigned long __user *nmask, unsigned long maxnode, | |
387 | - unsigned flags) | |
340 | +static int contextualize_policy(int mode, nodemask_t *nodes) | |
388 | 341 | { |
342 | + if (!nodes) | |
343 | + return 0; | |
344 | + | |
345 | + /* Update current mems_allowed */ | |
346 | + cpuset_update_current_mems_allowed(); | |
347 | + /* Ignore nodes not set in current->mems_allowed */ | |
348 | + cpuset_restrict_to_mems_allowed(nodes->bits); | |
349 | + return mpol_check_policy(mode, nodes); | |
350 | +} | |
351 | + | |
352 | +long do_mbind(unsigned long start, unsigned long len, | |
353 | + unsigned long mode, nodemask_t *nmask, unsigned long flags) | |
354 | +{ | |
389 | 355 | struct vm_area_struct *vma; |
390 | 356 | struct mm_struct *mm = current->mm; |
391 | 357 | struct mempolicy *new; |
392 | 358 | unsigned long end; |
393 | - nodemask_t nodes; | |
394 | 359 | int err; |
395 | 360 | |
396 | 361 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) |
... | ... | @@ -405,12 +370,9 @@ |
405 | 370 | return -EINVAL; |
406 | 371 | if (end == start) |
407 | 372 | return 0; |
408 | - | |
409 | - err = get_nodes(&nodes, nmask, maxnode, mode); | |
410 | - if (err) | |
411 | - return err; | |
412 | - | |
413 | - new = mpol_new(mode, &nodes); | |
373 | + if (contextualize_policy(mode, nmask)) | |
374 | + return -EINVAL; | |
375 | + new = mpol_new(mode, nmask); | |
414 | 376 | if (IS_ERR(new)) |
415 | 377 | return PTR_ERR(new); |
416 | 378 | |
... | ... | @@ -418,7 +380,7 @@ |
418 | 380 | mode,nodes_addr(nodes)[0]); |
419 | 381 | |
420 | 382 | down_write(&mm->mmap_sem); |
421 | - vma = check_range(mm, start, end, &nodes, flags); | |
383 | + vma = check_range(mm, start, end, nmask, flags); | |
422 | 384 | err = PTR_ERR(vma); |
423 | 385 | if (!IS_ERR(vma)) |
424 | 386 | err = mbind_range(vma, start, end, new); |
... | ... | @@ -428,19 +390,13 @@ |
428 | 390 | } |
429 | 391 | |
430 | 392 | /* Set the process memory policy */ |
431 | -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |
432 | - unsigned long maxnode) | |
393 | +long do_set_mempolicy(int mode, nodemask_t *nodes) | |
433 | 394 | { |
434 | - int err; | |
435 | 395 | struct mempolicy *new; |
436 | - nodemask_t nodes; | |
437 | 396 | |
438 | - if (mode < 0 || mode > MPOL_MAX) | |
397 | + if (contextualize_policy(mode, nodes)) | |
439 | 398 | return -EINVAL; |
440 | - err = get_nodes(&nodes, nmask, maxnode, mode); | |
441 | - if (err) | |
442 | - return err; | |
443 | - new = mpol_new(mode, &nodes); | |
399 | + new = mpol_new(mode, nodes); | |
444 | 400 | if (IS_ERR(new)) |
445 | 401 | return PTR_ERR(new); |
446 | 402 | mpol_free(current->mempolicy); |
... | ... | @@ -459,7 +415,8 @@ |
459 | 415 | switch (p->policy) { |
460 | 416 | case MPOL_BIND: |
461 | 417 | for (i = 0; p->v.zonelist->zones[i]; i++) |
462 | - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes); | |
418 | + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, | |
419 | + *nodes); | |
463 | 420 | break; |
464 | 421 | case MPOL_DEFAULT: |
465 | 422 | break; |
... | ... | @@ -491,38 +448,17 @@ |
491 | 448 | return err; |
492 | 449 | } |
493 | 450 | |
494 | -/* Copy a kernel node mask to user space */ | |
495 | -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | |
496 | - nodemask_t *nodes) | |
497 | -{ | |
498 | - unsigned long copy = ALIGN(maxnode-1, 64) / 8; | |
499 | - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); | |
500 | - | |
501 | - if (copy > nbytes) { | |
502 | - if (copy > PAGE_SIZE) | |
503 | - return -EINVAL; | |
504 | - if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | |
505 | - return -EFAULT; | |
506 | - copy = nbytes; | |
507 | - } | |
508 | - return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; | |
509 | -} | |
510 | - | |
511 | 451 | /* Retrieve NUMA policy */ |
512 | -asmlinkage long sys_get_mempolicy(int __user *policy, | |
513 | - unsigned long __user *nmask, | |
514 | - unsigned long maxnode, | |
515 | - unsigned long addr, unsigned long flags) | |
452 | +long do_get_mempolicy(int *policy, nodemask_t *nmask, | |
453 | + unsigned long addr, unsigned long flags) | |
516 | 454 | { |
517 | - int err, pval; | |
455 | + int err; | |
518 | 456 | struct mm_struct *mm = current->mm; |
519 | 457 | struct vm_area_struct *vma = NULL; |
520 | 458 | struct mempolicy *pol = current->mempolicy; |
521 | 459 | |
522 | 460 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
523 | 461 | return -EINVAL; |
524 | - if (nmask != NULL && maxnode < MAX_NUMNODES) | |
525 | - return -EINVAL; | |
526 | 462 | if (flags & MPOL_F_ADDR) { |
527 | 463 | down_read(&mm->mmap_sem); |
528 | 464 | vma = find_vma_intersection(mm, addr, addr+1); |
... | ... | @@ -545,31 +481,25 @@ |
545 | 481 | err = lookup_node(mm, addr); |
546 | 482 | if (err < 0) |
547 | 483 | goto out; |
548 | - pval = err; | |
484 | + *policy = err; | |
549 | 485 | } else if (pol == current->mempolicy && |
550 | 486 | pol->policy == MPOL_INTERLEAVE) { |
551 | - pval = current->il_next; | |
487 | + *policy = current->il_next; | |
552 | 488 | } else { |
553 | 489 | err = -EINVAL; |
554 | 490 | goto out; |
555 | 491 | } |
556 | 492 | } else |
557 | - pval = pol->policy; | |
493 | + *policy = pol->policy; | |
558 | 494 | |
559 | 495 | if (vma) { |
560 | 496 | up_read(¤t->mm->mmap_sem); |
561 | 497 | vma = NULL; |
562 | 498 | } |
563 | 499 | |
564 | - if (policy && put_user(pval, policy)) | |
565 | - return -EFAULT; | |
566 | - | |
567 | 500 | err = 0; |
568 | - if (nmask) { | |
569 | - nodemask_t nodes; | |
570 | - get_zonemask(pol, &nodes); | |
571 | - err = copy_nodes_to_user(nmask, maxnode, &nodes); | |
572 | - } | |
501 | + if (nmask) | |
502 | + get_zonemask(pol, nmask); | |
573 | 503 | |
574 | 504 | out: |
575 | 505 | if (vma) |
... | ... | @@ -577,6 +507,126 @@ |
577 | 507 | return err; |
578 | 508 | } |
579 | 509 | |
510 | +/* | |
511 | + * User space interface with variable sized bitmaps for nodelists. | |
512 | + */ | |
513 | + | |
514 | +/* Copy a node mask from user space. */ | |
515 | +static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | |
516 | + unsigned long maxnode) | |
517 | +{ | |
518 | + unsigned long k; | |
519 | + unsigned long nlongs; | |
520 | + unsigned long endmask; | |
521 | + | |
522 | + --maxnode; | |
523 | + nodes_clear(*nodes); | |
524 | + if (maxnode == 0 || !nmask) | |
525 | + return 0; | |
526 | + | |
527 | + nlongs = BITS_TO_LONGS(maxnode); | |
528 | + if ((maxnode % BITS_PER_LONG) == 0) | |
529 | + endmask = ~0UL; | |
530 | + else | |
531 | + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | |
532 | + | |
533 | + /* When the user specified more nodes than supported just check | |
534 | + if the non supported part is all zero. */ | |
535 | + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | |
536 | + if (nlongs > PAGE_SIZE/sizeof(long)) | |
537 | + return -EINVAL; | |
538 | + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | |
539 | + unsigned long t; | |
540 | + if (get_user(t, nmask + k)) | |
541 | + return -EFAULT; | |
542 | + if (k == nlongs - 1) { | |
543 | + if (t & endmask) | |
544 | + return -EINVAL; | |
545 | + } else if (t) | |
546 | + return -EINVAL; | |
547 | + } | |
548 | + nlongs = BITS_TO_LONGS(MAX_NUMNODES); | |
549 | + endmask = ~0UL; | |
550 | + } | |
551 | + | |
552 | + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) | |
553 | + return -EFAULT; | |
554 | + nodes_addr(*nodes)[nlongs-1] &= endmask; | |
555 | + return 0; | |
556 | +} | |
557 | + | |
558 | +/* Copy a kernel node mask to user space */ | |
559 | +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | |
560 | + nodemask_t *nodes) | |
561 | +{ | |
562 | + unsigned long copy = ALIGN(maxnode-1, 64) / 8; | |
563 | + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); | |
564 | + | |
565 | + if (copy > nbytes) { | |
566 | + if (copy > PAGE_SIZE) | |
567 | + return -EINVAL; | |
568 | + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | |
569 | + return -EFAULT; | |
570 | + copy = nbytes; | |
571 | + } | |
572 | + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; | |
573 | +} | |
574 | + | |
575 | +asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |
576 | + unsigned long mode, | |
577 | + unsigned long __user *nmask, unsigned long maxnode, | |
578 | + unsigned flags) | |
579 | +{ | |
580 | + nodemask_t nodes; | |
581 | + int err; | |
582 | + | |
583 | + err = get_nodes(&nodes, nmask, maxnode); | |
584 | + if (err) | |
585 | + return err; | |
586 | + return do_mbind(start, len, mode, &nodes, flags); | |
587 | +} | |
588 | + | |
589 | +/* Set the process memory policy */ | |
590 | +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |
591 | + unsigned long maxnode) | |
592 | +{ | |
593 | + int err; | |
594 | + nodemask_t nodes; | |
595 | + | |
596 | + if (mode < 0 || mode > MPOL_MAX) | |
597 | + return -EINVAL; | |
598 | + err = get_nodes(&nodes, nmask, maxnode); | |
599 | + if (err) | |
600 | + return err; | |
601 | + return do_set_mempolicy(mode, &nodes); | |
602 | +} | |
603 | + | |
604 | +/* Retrieve NUMA policy */ | |
605 | +asmlinkage long sys_get_mempolicy(int __user *policy, | |
606 | + unsigned long __user *nmask, | |
607 | + unsigned long maxnode, | |
608 | + unsigned long addr, unsigned long flags) | |
609 | +{ | |
610 | + int err, pval; | |
611 | + nodemask_t nodes; | |
612 | + | |
613 | + if (nmask != NULL && maxnode < MAX_NUMNODES) | |
614 | + return -EINVAL; | |
615 | + | |
616 | + err = do_get_mempolicy(&pval, &nodes, addr, flags); | |
617 | + | |
618 | + if (err) | |
619 | + return err; | |
620 | + | |
621 | + if (policy && put_user(pval, policy)) | |
622 | + return -EFAULT; | |
623 | + | |
624 | + if (nmask) | |
625 | + err = copy_nodes_to_user(nmask, maxnode, &nodes); | |
626 | + | |
627 | + return err; | |
628 | +} | |
629 | + | |
580 | 630 | #ifdef CONFIG_COMPAT |
581 | 631 | |
582 | 632 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, |
... | ... | @@ -664,7 +714,7 @@ |
664 | 714 | |
665 | 715 | if (vma) { |
666 | 716 | if (vma->vm_ops && vma->vm_ops->get_policy) |
667 | - pol = vma->vm_ops->get_policy(vma, addr); | |
717 | + pol = vma->vm_ops->get_policy(vma, addr); | |
668 | 718 | else if (vma->vm_policy && |
669 | 719 | vma->vm_policy->policy != MPOL_DEFAULT) |
670 | 720 | pol = vma->vm_policy; |
... | ... | @@ -1147,15 +1197,13 @@ |
1147 | 1197 | /* Set interleaving policy for system init. This way not all |
1148 | 1198 | the data structures allocated at system boot end up in node zero. */ |
1149 | 1199 | |
1150 | - if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), | |
1151 | - MAX_NUMNODES) < 0) | |
1200 | + if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) | |
1152 | 1201 | printk("numa_policy_init: interleaving failed\n"); |
1153 | 1202 | } |
1154 | 1203 | |
1155 | -/* Reset policy of current process to default. | |
1156 | - * Assumes fs == KERNEL_DS */ | |
1204 | +/* Reset policy of current process to default */ | |
1157 | 1205 | void numa_default_policy(void) |
1158 | 1206 | { |
1159 | - sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); | |
1207 | + do_set_mempolicy(MPOL_DEFAULT, NULL); | |
1160 | 1208 | } |
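For reference, the user-space side of this ABI (which the patch leaves unchanged) passes a variable-sized bitmap plus its length in bits. Below is a minimal sketch using the libnuma wrapper, assuming a machine with at least two online NUMA nodes; nothing here is taken from the patch itself:

	#include <numaif.h>   /* set_mempolicy(), MPOL_INTERLEAVE; link with -lnuma */

	int main(void)
	{
		/* Interleave this process's allocations across nodes 0 and 1. */
		unsigned long mask = (1UL << 0) | (1UL << 1);

		/* maxnode = 64: the kernel's get_nodes() copies
		 * BITS_TO_LONGS(maxnode - 1) longs, masks the unused tail via
		 * endmask, and rejects bits set beyond MAX_NUMNODES. */
		if (set_mempolicy(MPOL_INTERLEAVE, &mask, 64) != 0)
			return 1;
		return 0;
	}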