Commit 8bccd85ffbaf8ff1448d1235fa6594e207695531

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent bb7e7e032d

[PATCH] Implement sys_* do_* layering in the memory policy layer.

- Do a separation between do_xxx and sys_xxx functions. sys_xxx functions
  take variable-sized bitmaps from user space as arguments. do_xxx functions
  take fixed-size nodemask_t as arguments and may be used from inside the
  kernel. Doing so simplifies the initialization code. There is no
  fs = kernel_ds assumption anymore.

- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy which restricts the nodes to those accessible
  to the task and updates cpusets.

- Add comments explaining limitations of bind policy

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 1 changed file with 162 additions and 114 deletions. Side-by-side Diff

... ... @@ -2,6 +2,7 @@
2 2 * Simple NUMA memory policy for the Linux kernel.
3 3 *
4 4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  5 + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
5 6 * Subject to the GNU Public License, version 2.
6 7 *
7 8 * NUMA policy allows the user to give hints in which node(s) memory should
8 9  
9 10  
... ... @@ -17,13 +18,19 @@
17 18 * offset into the backing object or offset into the mapping
18 19 * for anonymous memory. For process policy a process counter
19 20 * is used.
  21 + *
20 22 * bind Only allocate memory on a specific set of nodes,
21 23 * no fallback.
  24 + * FIXME: memory is allocated starting with the first node
  25 + * to the last. It would be better if bind would truly restrict
  26 + * the allocation to memory nodes instead
  27 + *
22 28 * preferred Try a specific node first before normal fallback.
23 29 * As a special case node -1 here means do the allocation
24 30 * on the local CPU. This is normally identical to default,
25 31 * but useful to set in a VMA when you have a non default
26 32 * process policy.
  33 + *
27 34 * default Allocate on the local node first, or when on a VMA
28 35 * use the process policy. This is what Linux always did
29 36 * in a NUMA aware kernel and still does by, ahem, default.
... ... @@ -113,56 +120,6 @@
113 120 }
114 121 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
115 122 }
116   -
117   -/* Copy a node mask from user space. */
118   -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
119   - unsigned long maxnode, int mode)
120   -{
121   - unsigned long k;
122   - unsigned long nlongs;
123   - unsigned long endmask;
124   -
125   - --maxnode;
126   - nodes_clear(*nodes);
127   - if (maxnode == 0 || !nmask)
128   - return 0;
129   -
130   - nlongs = BITS_TO_LONGS(maxnode);
131   - if ((maxnode % BITS_PER_LONG) == 0)
132   - endmask = ~0UL;
133   - else
134   - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
135   -
136   - /* When the user specified more nodes than supported just check
137   - if the non supported part is all zero. */
138   - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
139   - if (nlongs > PAGE_SIZE/sizeof(long))
140   - return -EINVAL;
141   - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
142   - unsigned long t;
143   - if (get_user(t, nmask + k))
144   - return -EFAULT;
145   - if (k == nlongs - 1) {
146   - if (t & endmask)
147   - return -EINVAL;
148   - } else if (t)
149   - return -EINVAL;
150   - }
151   - nlongs = BITS_TO_LONGS(MAX_NUMNODES);
152   - endmask = ~0UL;
153   - }
154   -
155   - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
156   - return -EFAULT;
157   - nodes_addr(*nodes)[nlongs-1] &= endmask;
158   - /* Update current mems_allowed */
159   - cpuset_update_current_mems_allowed();
160   - /* Ignore nodes not set in current->mems_allowed */
161   - /* AK: shouldn't this error out instead? */
162   - cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
163   - return mpol_check_policy(mode, nodes);
164   -}
165   -
166 123 /* Generate a custom zonelist for the BIND policy. */
167 124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
168 125 {
169 126  
170 127  
... ... @@ -380,17 +337,25 @@
380 337 return err;
381 338 }
382 339  
383   -/* Change policy for a memory range */
384   -asmlinkage long sys_mbind(unsigned long start, unsigned long len,
385   - unsigned long mode,
386   - unsigned long __user *nmask, unsigned long maxnode,
387   - unsigned flags)
  340 +static int contextualize_policy(int mode, nodemask_t *nodes)
388 341 {
  342 + if (!nodes)
  343 + return 0;
  344 +
  345 + /* Update current mems_allowed */
  346 + cpuset_update_current_mems_allowed();
  347 + /* Ignore nodes not set in current->mems_allowed */
  348 + cpuset_restrict_to_mems_allowed(nodes->bits);
  349 + return mpol_check_policy(mode, nodes);
  350 +}
  351 +
  352 +long do_mbind(unsigned long start, unsigned long len,
  353 + unsigned long mode, nodemask_t *nmask, unsigned long flags)
  354 +{
389 355 struct vm_area_struct *vma;
390 356 struct mm_struct *mm = current->mm;
391 357 struct mempolicy *new;
392 358 unsigned long end;
393   - nodemask_t nodes;
394 359 int err;
395 360  
396 361 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
... ... @@ -405,12 +370,9 @@
405 370 return -EINVAL;
406 371 if (end == start)
407 372 return 0;
408   -
409   - err = get_nodes(&nodes, nmask, maxnode, mode);
410   - if (err)
411   - return err;
412   -
413   - new = mpol_new(mode, &nodes);
  373 + if (contextualize_policy(mode, nmask))
  374 + return -EINVAL;
  375 + new = mpol_new(mode, nmask);
414 376 if (IS_ERR(new))
415 377 return PTR_ERR(new);
416 378  
... ... @@ -418,7 +380,7 @@
418 380 mode,nodes_addr(nodes)[0]);
419 381  
420 382 down_write(&mm->mmap_sem);
421   - vma = check_range(mm, start, end, &nodes, flags);
  383 + vma = check_range(mm, start, end, nmask, flags);
422 384 err = PTR_ERR(vma);
423 385 if (!IS_ERR(vma))
424 386 err = mbind_range(vma, start, end, new);
425 387  
426 388  
427 389  
428 390  
... ... @@ -428,19 +390,13 @@
428 390 }
429 391  
430 392 /* Set the process memory policy */
431   -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
432   - unsigned long maxnode)
  393 +long do_set_mempolicy(int mode, nodemask_t *nodes)
433 394 {
434   - int err;
435 395 struct mempolicy *new;
436   - nodemask_t nodes;
437 396  
438   - if (mode < 0 || mode > MPOL_MAX)
  397 + if (contextualize_policy(mode, nodes))
439 398 return -EINVAL;
440   - err = get_nodes(&nodes, nmask, maxnode, mode);
441   - if (err)
442   - return err;
443   - new = mpol_new(mode, &nodes);
  399 + new = mpol_new(mode, nodes);
444 400 if (IS_ERR(new))
445 401 return PTR_ERR(new);
446 402 mpol_free(current->mempolicy);
... ... @@ -459,7 +415,8 @@
459 415 switch (p->policy) {
460 416 case MPOL_BIND:
461 417 for (i = 0; p->v.zonelist->zones[i]; i++)
462   - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
  418 + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
  419 + *nodes);
463 420 break;
464 421 case MPOL_DEFAULT:
465 422 break;
466 423  
467 424  
468 425  
... ... @@ -491,38 +448,17 @@
491 448 return err;
492 449 }
493 450  
494   -/* Copy a kernel node mask to user space */
495   -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
496   - nodemask_t *nodes)
497   -{
498   - unsigned long copy = ALIGN(maxnode-1, 64) / 8;
499   - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
500   -
501   - if (copy > nbytes) {
502   - if (copy > PAGE_SIZE)
503   - return -EINVAL;
504   - if (clear_user((char __user *)mask + nbytes, copy - nbytes))
505   - return -EFAULT;
506   - copy = nbytes;
507   - }
508   - return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
509   -}
510   -
511 451 /* Retrieve NUMA policy */
512   -asmlinkage long sys_get_mempolicy(int __user *policy,
513   - unsigned long __user *nmask,
514   - unsigned long maxnode,
515   - unsigned long addr, unsigned long flags)
  452 +long do_get_mempolicy(int *policy, nodemask_t *nmask,
  453 + unsigned long addr, unsigned long flags)
516 454 {
517   - int err, pval;
  455 + int err;
518 456 struct mm_struct *mm = current->mm;
519 457 struct vm_area_struct *vma = NULL;
520 458 struct mempolicy *pol = current->mempolicy;
521 459  
522 460 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
523 461 return -EINVAL;
524   - if (nmask != NULL && maxnode < MAX_NUMNODES)
525   - return -EINVAL;
526 462 if (flags & MPOL_F_ADDR) {
527 463 down_read(&mm->mmap_sem);
528 464 vma = find_vma_intersection(mm, addr, addr+1);
529 465  
530 466  
531 467  
532 468  
... ... @@ -545,31 +481,25 @@
545 481 err = lookup_node(mm, addr);
546 482 if (err < 0)
547 483 goto out;
548   - pval = err;
  484 + *policy = err;
549 485 } else if (pol == current->mempolicy &&
550 486 pol->policy == MPOL_INTERLEAVE) {
551   - pval = current->il_next;
  487 + *policy = current->il_next;
552 488 } else {
553 489 err = -EINVAL;
554 490 goto out;
555 491 }
556 492 } else
557   - pval = pol->policy;
  493 + *policy = pol->policy;
558 494  
559 495 if (vma) {
560 496 up_read(&current->mm->mmap_sem);
561 497 vma = NULL;
562 498 }
563 499  
564   - if (policy && put_user(pval, policy))
565   - return -EFAULT;
566   -
567 500 err = 0;
568   - if (nmask) {
569   - nodemask_t nodes;
570   - get_zonemask(pol, &nodes);
571   - err = copy_nodes_to_user(nmask, maxnode, &nodes);
572   - }
  501 + if (nmask)
  502 + get_zonemask(pol, nmask);
573 503  
574 504 out:
575 505 if (vma)
... ... @@ -577,6 +507,126 @@
577 507 return err;
578 508 }
579 509  
  510 +/*
  511 + * User space interface with variable sized bitmaps for nodelists.
  512 + */
  513 +
  514 +/* Copy a node mask from user space. */
  515 +static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
  516 + unsigned long maxnode)
  517 +{
  518 + unsigned long k;
  519 + unsigned long nlongs;
  520 + unsigned long endmask;
  521 +
  522 + --maxnode;
  523 + nodes_clear(*nodes);
  524 + if (maxnode == 0 || !nmask)
  525 + return 0;
  526 +
  527 + nlongs = BITS_TO_LONGS(maxnode);
  528 + if ((maxnode % BITS_PER_LONG) == 0)
  529 + endmask = ~0UL;
  530 + else
  531 + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
  532 +
  533 + /* When the user specified more nodes than supported just check
  534 + if the non supported part is all zero. */
  535 + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
  536 + if (nlongs > PAGE_SIZE/sizeof(long))
  537 + return -EINVAL;
  538 + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
  539 + unsigned long t;
  540 + if (get_user(t, nmask + k))
  541 + return -EFAULT;
  542 + if (k == nlongs - 1) {
  543 + if (t & endmask)
  544 + return -EINVAL;
  545 + } else if (t)
  546 + return -EINVAL;
  547 + }
  548 + nlongs = BITS_TO_LONGS(MAX_NUMNODES);
  549 + endmask = ~0UL;
  550 + }
  551 +
  552 + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
  553 + return -EFAULT;
  554 + nodes_addr(*nodes)[nlongs-1] &= endmask;
  555 + return 0;
  556 +}
  557 +
  558 +/* Copy a kernel node mask to user space */
  559 +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
  560 + nodemask_t *nodes)
  561 +{
  562 + unsigned long copy = ALIGN(maxnode-1, 64) / 8;
  563 + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
  564 +
  565 + if (copy > nbytes) {
  566 + if (copy > PAGE_SIZE)
  567 + return -EINVAL;
  568 + if (clear_user((char __user *)mask + nbytes, copy - nbytes))
  569 + return -EFAULT;
  570 + copy = nbytes;
  571 + }
  572 + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
  573 +}
  574 +
  575 +asmlinkage long sys_mbind(unsigned long start, unsigned long len,
  576 + unsigned long mode,
  577 + unsigned long __user *nmask, unsigned long maxnode,
  578 + unsigned flags)
  579 +{
  580 + nodemask_t nodes;
  581 + int err;
  582 +
  583 + err = get_nodes(&nodes, nmask, maxnode);
  584 + if (err)
  585 + return err;
  586 + return do_mbind(start, len, mode, &nodes, flags);
  587 +}
  588 +
  589 +/* Set the process memory policy */
  590 +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
  591 + unsigned long maxnode)
  592 +{
  593 + int err;
  594 + nodemask_t nodes;
  595 +
  596 + if (mode < 0 || mode > MPOL_MAX)
  597 + return -EINVAL;
  598 + err = get_nodes(&nodes, nmask, maxnode);
  599 + if (err)
  600 + return err;
  601 + return do_set_mempolicy(mode, &nodes);
  602 +}
  603 +
  604 +/* Retrieve NUMA policy */
  605 +asmlinkage long sys_get_mempolicy(int __user *policy,
  606 + unsigned long __user *nmask,
  607 + unsigned long maxnode,
  608 + unsigned long addr, unsigned long flags)
  609 +{
  610 + int err, pval;
  611 + nodemask_t nodes;
  612 +
  613 + if (nmask != NULL && maxnode < MAX_NUMNODES)
  614 + return -EINVAL;
  615 +
  616 + err = do_get_mempolicy(&pval, &nodes, addr, flags);
  617 +
  618 + if (err)
  619 + return err;
  620 +
  621 + if (policy && put_user(pval, policy))
  622 + return -EFAULT;
  623 +
  624 + if (nmask)
  625 + err = copy_nodes_to_user(nmask, maxnode, &nodes);
  626 +
  627 + return err;
  628 +}
  629 +
580 630 #ifdef CONFIG_COMPAT
581 631  
582 632 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
... ... @@ -664,7 +714,7 @@
664 714  
665 715 if (vma) {
666 716 if (vma->vm_ops && vma->vm_ops->get_policy)
667   - pol = vma->vm_ops->get_policy(vma, addr);
  717 + pol = vma->vm_ops->get_policy(vma, addr);
668 718 else if (vma->vm_policy &&
669 719 vma->vm_policy->policy != MPOL_DEFAULT)
670 720 pol = vma->vm_policy;
671 721  
672 722  
... ... @@ -1147,15 +1197,13 @@
1147 1197 /* Set interleaving policy for system init. This way not all
1148 1198 the data structures allocated at system boot end up in node zero. */
1149 1199  
1150   - if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1151   - MAX_NUMNODES) < 0)
  1200 + if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1152 1201 printk("numa_policy_init: interleaving failed\n");
1153 1202 }
1154 1203  
1155   -/* Reset policy of current process to default.
1156   - * Assumes fs == KERNEL_DS */
  1204 +/* Reset policy of current process to default */
1157 1205 void numa_default_policy(void)
1158 1206 {
1159   - sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
  1207 + do_set_mempolicy(MPOL_DEFAULT, NULL);
1160 1208 }