Commit bd8815a6d802fc16a7a106e170593aa05dc17e72

Authored by Tejun Heo
1 parent 95109b627b

cgroup: make css_for_each_descendant() and friends include the origin css in the iteration

Previously, all css descendant iterators didn't include the origin
(root of subtree) css in the iteration.  The reasons were maintaining
consistency with css_for_each_child() and that at the time of
introduction more use cases needed skipping the origin anyway;
however, given that css_is_descendant() considers self to be a
descendant, omitting the origin css has become more confusing and
looking at the accumulated use cases rather clearly indicates that
including origin would result in simpler code overall.
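
Concretely, a pre-order walk now visits the origin first and a
post-order walk visits it last.  A minimal sketch of the new behaviour
(origin_css stands for the subtree root and propagate_state() is a
hypothetical per-css handler; the locking rule is the one documented
for the iterators):

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, origin_css) {
		/* the first @pos is origin_css itself; every other @pos
		 * is visited only after its parent has been visited */
		propagate_state(pos);
	}
	rcu_read_unlock();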

While this is a change which can easily lead to subtle bugs, cgroup
API including the iterators has recently gone through major
restructuring and no out-of-tree changes will be applicable without
adjustments, making this a relatively acceptable opportunity for this
type of change.

The conversions are mostly straight-forward.  If the iteration block
had explicit origin handling before or after, it's moved inside the
iteration.  If not, if (pos == origin) continue; is added.  Some
conversions add extra reference get/put around origin handling by
consolidating origin handling and the rest.  While the extra ref
operations aren't strictly necessary, this shouldn't cause any
noticeable difference.
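
For example, a conversion of the first kind, where explicit origin
handling is folded into the now-inclusive walk, would look roughly as
follows (do_update() is a hypothetical per-css handler and RCU locking
is elided for brevity):

	/* before: the walk skipped the origin, so it was handled separately */
	do_update(origin_css);
	css_for_each_descendant_pre(pos, origin_css)
		do_update(pos);

	/* after: the walk includes origin_css as its first node */
	css_for_each_descendant_pre(pos, origin_css)
		do_update(pos);

A conversion of the second kind keeps the origin handling out of the
loop body by skipping it explicitly:

	css_for_each_descendant_pre(pos, origin_css) {
		if (pos == origin_css)
			continue;
		do_update(pos);
	}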

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Aristeu Rozanski <aris@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>

Showing 9 changed files with 69 additions and 74 deletions

block/blk-cgroup.c
... ... @@ -615,12 +615,10 @@
615 615 struct blkcg_policy *pol = blkcg_policy[pd->plid];
616 616 struct blkcg_gq *pos_blkg;
617 617 struct cgroup_subsys_state *pos_css;
618   - u64 sum;
  618 + u64 sum = 0;
619 619  
620 620 lockdep_assert_held(pd->blkg->q->queue_lock);
621 621  
622   - sum = blkg_stat_read((void *)pd + off);
623   -
624 622 rcu_read_lock();
625 623 blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
626 624 struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
627 625  
... ... @@ -650,12 +648,10 @@
650 648 struct blkcg_policy *pol = blkcg_policy[pd->plid];
651 649 struct blkcg_gq *pos_blkg;
652 650 struct cgroup_subsys_state *pos_css;
653   - struct blkg_rwstat sum;
  651 + struct blkg_rwstat sum = { };
654 652 int i;
655 653  
656 654 lockdep_assert_held(pd->blkg->q->queue_lock);
657 655  
658   - sum = blkg_rwstat_read((void *)pd + off);
659   -
660 656 rcu_read_lock();
661 657 blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
block/blk-cgroup.h
... ... @@ -291,6 +291,7 @@
291 291 * read locked. If called under either blkcg or queue lock, the iteration
292 292 * is guaranteed to include all and only online blkgs. The caller may
293 293 * update @pos_css by calling css_rightmost_descendant() to skip subtree.
  294 + * @p_blkg is included in the iteration and the first node to be visited.
294 295 */
295 296 #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
296 297 css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
... ... @@ -304,7 +305,8 @@
304 305 * @p_blkg: target blkg to walk descendants of
305 306 *
306 307 * Similar to blkg_for_each_descendant_pre() but performs post-order
307   - * traversal instead. Synchronization rules are the same.
  308 + * traversal instead. Synchronization rules are the same. @p_blkg is
  309 + * included in the iteration and the last node to be visited.
308 310 */
309 311 #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
310 312 css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
block/blk-throttle.c
... ... @@ -1379,7 +1379,6 @@
1379 1379 * restrictions in the whole hierarchy and allows them to bypass
1380 1380 * blk-throttle.
1381 1381 */
1382   - tg_update_has_rules(tg);
1383 1382 blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
1384 1383 tg_update_has_rules(blkg_to_tg(blkg));
1385 1384  
... ... @@ -1638,8 +1637,6 @@
1638 1637 */
1639 1638 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
1640 1639 tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
1641   -
1642   - tg_drain_bios(&td_root_tg(td)->service_queue);
1643 1640  
1644 1641 /* finally, transfer bios from top-level tg's into the td */
1645 1642 tg_drain_bios(&td->service_queue);
include/linux/cgroup.h
... ... @@ -798,7 +798,8 @@
798 798 * @pos: the css * to use as the loop cursor
799 799 * @root: css whose descendants to walk
800 800 *
801   - * Walk @root's descendants. Must be called under rcu_read_lock(). A
  801 + * Walk @root's descendants. @root is included in the iteration and the
  802 + * first node to be visited. Must be called under rcu_read_lock(). A
802 803 * descendant css which hasn't finished ->css_online() or already has
803 804 * finished ->css_offline() may show up during traversal and it's each
804 805 * subsystem's responsibility to verify that each @pos is alive.
805 806  
... ... @@ -820,13 +821,12 @@
820 821 *
821 822 * my_update_state(@css)
822 823 * {
823   - * Lock @css;
824   - * Update @css's state;
825   - * Unlock @css;
826   - *
827 824 * css_for_each_descendant_pre(@pos, @css) {
828 825 * Lock @pos;
829   - * Verify @pos is alive and inherit state from @pos's parent;
  826 + * if (@pos == @css)
  827 + * Update @css's state;
  828 + * else
  829 + * Verify @pos is alive and inherit state from its parent;
830 830 * Unlock @pos;
831 831 * }
832 832 * }
... ... @@ -864,8 +864,9 @@
864 864 * @css: css whose descendants to walk
865 865 *
866 866 * Similar to css_for_each_descendant_pre() but performs post-order
867   - * traversal instead. Note that the walk visibility guarantee described in
868   - * pre-order walk doesn't apply the same to post-order walks.
  867 + * traversal instead. @css is included in the iteration and the last
  868 + * node to be visited. Note that the walk visibility guarantee described
  869 + * in pre-order walk doesn't apply the same to post-order walks.
869 870 */
870 871 #define css_for_each_descendant_post(pos, css) \
871 872 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
kernel/cgroup.c
... ... @@ -2868,17 +2868,6 @@
2868 2868  
2869 2869 mutex_unlock(&cgroup_mutex);
2870 2870  
2871   - /* @root always needs to be updated */
2872   - inode = root->dentry->d_inode;
2873   - mutex_lock(&inode->i_mutex);
2874   - mutex_lock(&cgroup_mutex);
2875   - ret = cgroup_addrm_files(root, cfts, is_add);
2876   - mutex_unlock(&cgroup_mutex);
2877   - mutex_unlock(&inode->i_mutex);
2878   -
2879   - if (ret)
2880   - goto out_deact;
2881   -
2882 2871 /* add/rm files for all cgroups created before */
2883 2872 rcu_read_lock();
2884 2873 css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) {
... ... @@ -2907,7 +2896,6 @@
2907 2896 }
2908 2897 rcu_read_unlock();
2909 2898 dput(prev);
2910   -out_deact:
2911 2899 deactivate_super(sb);
2912 2900 return ret;
2913 2901 }
... ... @@ -3099,7 +3087,8 @@
3099 3087 * @root: css whose descendants to walk
3100 3088 *
3101 3089 * To be used by css_for_each_descendant_pre(). Find the next descendant
3102   - * to visit for pre-order traversal of @root's descendants.
  3090 + * to visit for pre-order traversal of @root's descendants. @root is
  3091 + * included in the iteration and the first node to be visited.
3103 3092 *
3104 3093 * While this function requires RCU read locking, it doesn't require the
3105 3094 * whole traversal to be contained in a single RCU critical section. This
3106 3095  
... ... @@ -3114,9 +3103,9 @@
3114 3103  
3115 3104 WARN_ON_ONCE(!rcu_read_lock_held());
3116 3105  
3117   - /* if first iteration, pretend we just visited @root */
  3106 + /* if first iteration, visit @root */
3118 3107 if (!pos)
3119   - pos = root;
  3108 + return root;
3120 3109  
3121 3110 /* visit the first child if exists */
3122 3111 next = css_next_child(NULL, pos);
... ... @@ -3186,7 +3175,8 @@
3186 3175 * @root: css whose descendants to walk
3187 3176 *
3188 3177 * To be used by css_for_each_descendant_post(). Find the next descendant
3189   - * to visit for post-order traversal of @root's descendants.
  3178 + * to visit for post-order traversal of @root's descendants. @root is
  3179 + * included in the iteration and the last node to be visited.
3190 3180 *
3191 3181 * While this function requires RCU read locking, it doesn't require the
3192 3182 * whole traversal to be contained in a single RCU critical section. This
3193 3183  
... ... @@ -3207,14 +3197,17 @@
3207 3197 return next != root ? next : NULL;
3208 3198 }
3209 3199  
  3200 + /* if we visited @root, we're done */
  3201 + if (pos == root)
  3202 + return NULL;
  3203 +
3210 3204 /* if there's an unvisited sibling, visit its leftmost descendant */
3211 3205 next = css_next_child(pos, css_parent(pos));
3212 3206 if (next)
3213 3207 return css_leftmost_descendant(next);
3214 3208  
3215 3209 /* no sibling left, visit parent */
3216   - next = css_parent(pos);
3217   - return next != root ? next : NULL;
  3210 + return css_parent(pos);
3218 3211 }
3219 3212 EXPORT_SYMBOL_GPL(css_next_descendant_post);
3220 3213  
kernel/cgroup_freezer.c
... ... @@ -311,7 +311,6 @@
311 311 /* update states bottom-up */
312 312 css_for_each_descendant_post(pos, css)
313 313 update_if_frozen(pos);
314   - update_if_frozen(css);
315 314  
316 315 rcu_read_unlock();
317 316  
... ... @@ -391,11 +390,6 @@
391 390 {
392 391 struct cgroup_subsys_state *pos;
393 392  
394   - /* update @freezer */
395   - spin_lock_irq(&freezer->lock);
396   - freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
397   - spin_unlock_irq(&freezer->lock);
398   -
399 393 /*
400 394 * Update all its descendants in pre-order traversal. Each
401 395 * descendant will try to inherit its parent's FREEZING state as
402 396  
... ... @@ -406,14 +400,23 @@
406 400 struct freezer *pos_f = css_freezer(pos);
407 401 struct freezer *parent = parent_freezer(pos_f);
408 402  
409   - /*
410   - * Our update to @parent->state is already visible which is
411   - * all we need. No need to lock @parent. For more info on
412   - * synchronization, see freezer_post_create().
413   - */
414 403 spin_lock_irq(&pos_f->lock);
415   - freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
416   - CGROUP_FREEZING_PARENT);
  404 +
  405 + if (pos_f == freezer) {
  406 + freezer_apply_state(pos_f, freeze,
  407 + CGROUP_FREEZING_SELF);
  408 + } else {
  409 + /*
  410 + * Our update to @parent->state is already visible
  411 + * which is all we need. No need to lock @parent.
  412 + * For more info on synchronization, see
  413 + * freezer_post_create().
  414 + */
  415 + freezer_apply_state(pos_f,
  416 + parent->state & CGROUP_FREEZING,
  417 + CGROUP_FREEZING_PARENT);
  418 + }
  419 +
417 420 spin_unlock_irq(&pos_f->lock);
418 421 }
419 422 rcu_read_unlock();
kernel/cpuset.c
... ... @@ -222,7 +222,8 @@
222 222 *
223 223 * Walk @des_cs through the online descendants of @root_cs. Must be used
224 224 * with RCU read locked. The caller may modify @pos_css by calling
225   - * css_rightmost_descendant() to skip subtree.
  225 + * css_rightmost_descendant() to skip subtree. @root_cs is included in the
  226 + * iteration and the first node to be visited.
226 227 */
227 228 #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
228 229 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
... ... @@ -506,6 +507,9 @@
506 507  
507 508 rcu_read_lock();
508 509 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
  510 + if (cp == root_cs)
  511 + continue;
  512 +
509 513 /* skip the whole subtree if @cp doesn't have any CPU */
510 514 if (cpumask_empty(cp->cpus_allowed)) {
511 515 pos_css = css_rightmost_descendant(pos_css);
... ... @@ -613,6 +617,8 @@
613 617  
614 618 rcu_read_lock();
615 619 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
  620 + if (cp == &top_cpuset)
  621 + continue;
616 622 /*
617 623 * Continue traversing beyond @cp iff @cp has some CPUs and
618 624 * isn't load balancing. The former is obvious. The
619 625  
... ... @@ -875,15 +881,17 @@
875 881 struct cpuset *cp;
876 882 struct cgroup_subsys_state *pos_css;
877 883  
878   - if (update_root)
879   - update_tasks_cpumask(root_cs, heap);
880   -
881 884 rcu_read_lock();
882 885 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
883   - /* skip the whole subtree if @cp have some CPU */
884   - if (!cpumask_empty(cp->cpus_allowed)) {
885   - pos_css = css_rightmost_descendant(pos_css);
886   - continue;
  886 + if (cp == root_cs) {
  887 + if (!update_root)
  888 + continue;
  889 + } else {
  890 + /* skip the whole subtree if @cp have some CPU */
  891 + if (!cpumask_empty(cp->cpus_allowed)) {
  892 + pos_css = css_rightmost_descendant(pos_css);
  893 + continue;
  894 + }
887 895 }
888 896 if (!css_tryget(&cp->css))
889 897 continue;
890 898  
... ... @@ -1130,15 +1138,17 @@
1130 1138 struct cpuset *cp;
1131 1139 struct cgroup_subsys_state *pos_css;
1132 1140  
1133   - if (update_root)
1134   - update_tasks_nodemask(root_cs, heap);
1135   -
1136 1141 rcu_read_lock();
1137 1142 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
1138   - /* skip the whole subtree if @cp have some CPU */
1139   - if (!nodes_empty(cp->mems_allowed)) {
1140   - pos_css = css_rightmost_descendant(pos_css);
1141   - continue;
  1143 + if (cp == root_cs) {
  1144 + if (!update_root)
  1145 + continue;
  1146 + } else {
  1147 + /* skip the whole subtree if @cp have some CPU */
  1148 + if (!nodes_empty(cp->mems_allowed)) {
  1149 + pos_css = css_rightmost_descendant(pos_css);
  1150 + continue;
  1151 + }
1142 1152 }
1143 1153 if (!css_tryget(&cp->css))
1144 1154 continue;
... ... @@ -2237,7 +2247,7 @@
2237 2247  
2238 2248 rcu_read_lock();
2239 2249 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2240   - if (!css_tryget(&cs->css))
  2250 + if (cs == &top_cpuset || !css_tryget(&cs->css))
2241 2251 continue;
2242 2252 rcu_read_unlock();
2243 2253  
mm/memcontrol.c
... ... @@ -1079,14 +1079,7 @@
1079 1079 {
1080 1080 struct cgroup_subsys_state *prev_css, *next_css;
1081 1081  
1082   - /*
1083   - * Root is not visited by cgroup iterators so it needs an
1084   - * explicit visit.
1085   - */
1086   - if (!last_visited)
1087   - return root;
1088   -
1089   - prev_css = (last_visited == root) ? NULL : &last_visited->css;
  1082 + prev_css = last_visited ? &last_visited->css : NULL;
1090 1083 skip_node:
1091 1084 next_css = css_next_descendant_pre(prev_css, &root->css);
1092 1085  
security/device_cgroup.c
... ... @@ -456,7 +456,7 @@
456 456 * methods), and online ones are safe to access outside RCU
457 457 * read lock without bumping refcnt.
458 458 */
459   - if (!is_devcg_online(devcg))
  459 + if (pos == &devcg_root->css || !is_devcg_online(devcg))
460 460 continue;
461 461  
462 462 rcu_read_unlock();