Commit 54fcb2303ef40bc9476fc698ad292c569e5da4fb

Authored by Eric W. Biederman
Committed by Greg Kroah-Hartman
1 parent bb4fbf094b

mnt: Make propagate_umount less slow for overlapping mount propagation trees

commit 296990deb389c7da21c78030376ba244dc1badf5 upstream.

Andrei Vagin pointed out that time to executue propagate_umount can go
non-linear (and take a ludicrious amount of time) when the mount
propogation trees of the mounts to be unmunted by a lazy unmount
overlap.

Make the walk of the mount propagation trees nearly linear by
remembering which mounts have already been visited, allowing
subsequent walks to detect when walking a mount propgation tree or a
subtree of a mount propgation tree would be duplicate work and to skip
them entirely.

Walk the list of mounts whose propgatation trees need to be traversed
from the mount highest in the mount tree to mounts lower in the mount
tree so that odds are higher that the code will walk the largest trees
first, allowing later tree walks to be skipped entirely.

Add cleanup_umount_visitation to remover the code's memory of which
mounts have been visited.

Add the functions last_slave and skip_propagation_subtree to allow
skipping appropriate parts of the mount propagation tree without
needing to change the logic of the rest of the code.

A script to generate overlapping mount propagation trees:

$ cat runs.h
set -e
mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
mkdir /mnt/1/1

iteration=10
if [ -n "$1" ] ; then
	iteration=$1
fi

for i in $(seq $iteration); do
	mount --bind /mnt/1/1 /mnt/1/1
done

mount --rbind /mnt/1 /mnt/2

TIMEFORMAT='%Rs'
nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
echo -n "umount -l /mnt/1 -> $nr        "
time umount -l /mnt/1

nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
time umount -l /mnt/2

$ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done

Here are the performance numbers with and without the patch:

     mhash |  8192   |  8192  | 1048576 | 1048576
    mounts | before  | after  |  before | after
    ------------------------------------------------
      1025 |  0.040s | 0.016s |  0.038s | 0.019s
      2049 |  0.094s | 0.017s |  0.080s | 0.018s
      4097 |  0.243s | 0.019s |  0.206s | 0.023s
      8193 |  1.202s | 0.028s |  1.562s | 0.032s
     16385 |  9.635s | 0.036s |  9.952s | 0.041s
     32769 | 60.928s | 0.063s | 44.321s | 0.064s
     65537 |         | 0.097s |         | 0.097s
    131073 |         | 0.233s |         | 0.176s
    262145 |         | 0.653s |         | 0.344s
    524289 |         | 2.305s |         | 0.735s
   1048577 |         | 7.107s |         | 2.603s

Andrei Vagin reports fixing the performance problem is part of the
work to fix CVE-2016-6213.

Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
Reported-by: Andrei Vagin <avagin@openvz.org>
Reviewed-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 62 additions and 1 deletions Side-by-side Diff

... ... @@ -24,6 +24,11 @@
24 24 return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
25 25 }
26 26  
  27 +static inline struct mount *last_slave(struct mount *p)
  28 +{
  29 + return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
  30 +}
  31 +
27 32 static inline struct mount *next_slave(struct mount *p)
28 33 {
29 34 return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
... ... @@ -164,6 +169,19 @@
164 169 }
165 170 }
166 171  
  172 +static struct mount *skip_propagation_subtree(struct mount *m,
  173 + struct mount *origin)
  174 +{
  175 + /*
  176 + * Advance m such that propagation_next will not return
  177 + * the slaves of m.
  178 + */
  179 + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
  180 + m = last_slave(m);
  181 +
  182 + return m;
  183 +}
  184 +
167 185 static struct mount *next_group(struct mount *m, struct mount *origin)
168 186 {
169 187 while (1) {
... ... @@ -507,6 +525,15 @@
507 525 }
508 526 }
509 527  
  528 +static void cleanup_umount_visitations(struct list_head *visited)
  529 +{
  530 + while (!list_empty(visited)) {
  531 + struct mount *mnt =
  532 + list_first_entry(visited, struct mount, mnt_umounting);
  533 + list_del_init(&mnt->mnt_umounting);
  534 + }
  535 +}
  536 +
510 537 /*
511 538 * collect all mounts that receive propagation from the mount in @list,
512 539 * and return these additional mounts in the same list.
513 540  
514 541  
... ... @@ -519,11 +546,23 @@
519 546 struct mount *mnt;
520 547 LIST_HEAD(to_restore);
521 548 LIST_HEAD(to_umount);
  549 + LIST_HEAD(visited);
522 550  
523   - list_for_each_entry(mnt, list, mnt_list) {
  551 + /* Find candidates for unmounting */
  552 + list_for_each_entry_reverse(mnt, list, mnt_list) {
524 553 struct mount *parent = mnt->mnt_parent;
525 554 struct mount *m;
526 555  
  556 + /*
  557 + * If this mount has already been visited it is known that it's
  558 + * entire peer group and all of their slaves in the propagation
  559 + * tree for the mountpoint has already been visited and there is
  560 + * no need to visit them again.
  561 + */
  562 + if (!list_empty(&mnt->mnt_umounting))
  563 + continue;
  564 +
  565 + list_add_tail(&mnt->mnt_umounting, &visited);
527 566 for (m = propagation_next(parent, parent); m;
528 567 m = propagation_next(m, parent)) {
529 568 struct mount *child = __lookup_mnt(&m->mnt,
... ... @@ -531,6 +570,27 @@
531 570 if (!child)
532 571 continue;
533 572  
  573 + if (!list_empty(&child->mnt_umounting)) {
  574 + /*
  575 + * If the child has already been visited it is
  576 + * know that it's entire peer group and all of
  577 + * their slaves in the propgation tree for the
  578 + * mountpoint has already been visited and there
  579 + * is no need to visit this subtree again.
  580 + */
  581 + m = skip_propagation_subtree(m, parent);
  582 + continue;
  583 + } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
  584 + /*
  585 + * We have come accross an partially unmounted
  586 + * mount in list that has not been visited yet.
  587 + * Remember it has been visited and continue
  588 + * about our merry way.
  589 + */
  590 + list_add_tail(&child->mnt_umounting, &visited);
  591 + continue;
  592 + }
  593 +
534 594 /* Check the child and parents while progress is made */
535 595 while (__propagate_umount(child,
536 596 &to_umount, &to_restore)) {
... ... @@ -544,6 +604,7 @@
544 604  
545 605 umount_list(&to_umount, &to_restore);
546 606 restore_mounts(&to_restore);
  607 + cleanup_umount_visitations(&visited);
547 608 list_splice_tail(&to_umount, list);
548 609  
549 610 return 0;