Commit ac53db596cc08ecb8040cfb6f71ae40c6f2041c4
Committed by
Ingo Molnar
1 parent
2c13c919d9
Exists in
master
and in
20 other branches
sched: Use a buddy to implement yield_task_fair()
Use the buddy mechanism to implement yield_task_fair. This allows us to skip onto the next highest priority se at every level in the CFS tree, unless doing so would introduce gross unfairness in CPU time distribution. We order the buddy selection in pick_next_entity to check yield first, then last, then next. We need next to be able to override yield, because it is possible for the "next" and "yield" task to be different processen in the same sub-tree of the CFS tree. When they are, we need to go into that sub-tree regardless of the "yield" hint, and pick the correct entity once we get to the right level. Signed-off-by: Rik van Riel <riel@redhat.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Showing 5 changed files with 90 additions and 71 deletions Side-by-side Diff
include/linux/sched.h
kernel/sched.c
... | ... | @@ -324,7 +324,7 @@ |
324 | 324 | * 'curr' points to currently running entity on this cfs_rq. |
325 | 325 | * It is set to NULL otherwise (i.e when none are currently running). |
326 | 326 | */ |
327 | - struct sched_entity *curr, *next, *last; | |
327 | + struct sched_entity *curr, *next, *last, *skip; | |
328 | 328 | |
329 | 329 | unsigned int nr_spread_over; |
330 | 330 |
kernel/sched_debug.c
... | ... | @@ -179,7 +179,7 @@ |
179 | 179 | |
180 | 180 | raw_spin_lock_irqsave(&rq->lock, flags); |
181 | 181 | if (cfs_rq->rb_leftmost) |
182 | - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | |
182 | + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; | |
183 | 183 | last = __pick_last_entity(cfs_rq); |
184 | 184 | if (last) |
185 | 185 | max_vruntime = last->vruntime; |
kernel/sched_fair.c
... | ... | @@ -69,14 +69,6 @@ |
69 | 69 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
70 | 70 | |
71 | 71 | /* |
72 | - * sys_sched_yield() compat mode | |
73 | - * | |
74 | - * This option switches the agressive yield implementation of the | |
75 | - * old scheduler back on. | |
76 | - */ | |
77 | -unsigned int __read_mostly sysctl_sched_compat_yield; | |
78 | - | |
79 | -/* | |
80 | 72 | * SCHED_OTHER wake-up granularity. |
81 | 73 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
82 | 74 | * |
... | ... | @@ -419,7 +411,7 @@ |
419 | 411 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
420 | 412 | } |
421 | 413 | |
422 | -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |
414 | +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) | |
423 | 415 | { |
424 | 416 | struct rb_node *left = cfs_rq->rb_leftmost; |
425 | 417 | |
... | ... | @@ -429,6 +421,17 @@ |
429 | 421 | return rb_entry(left, struct sched_entity, run_node); |
430 | 422 | } |
431 | 423 | |
424 | +static struct sched_entity *__pick_next_entity(struct sched_entity *se) | |
425 | +{ | |
426 | + struct rb_node *next = rb_next(&se->run_node); | |
427 | + | |
428 | + if (!next) | |
429 | + return NULL; | |
430 | + | |
431 | + return rb_entry(next, struct sched_entity, run_node); | |
432 | +} | |
433 | + | |
434 | +#ifdef CONFIG_SCHED_DEBUG | |
432 | 435 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
433 | 436 | { |
434 | 437 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
... | ... | @@ -443,7 +446,6 @@ |
443 | 446 | * Scheduling class statistics methods: |
444 | 447 | */ |
445 | 448 | |
446 | -#ifdef CONFIG_SCHED_DEBUG | |
447 | 449 | int sched_proc_update_handler(struct ctl_table *table, int write, |
448 | 450 | void __user *buffer, size_t *lenp, |
449 | 451 | loff_t *ppos) |
... | ... | @@ -1017,6 +1019,17 @@ |
1017 | 1019 | } |
1018 | 1020 | } |
1019 | 1021 | |
1022 | +static void __clear_buddies_skip(struct sched_entity *se) | |
1023 | +{ | |
1024 | + for_each_sched_entity(se) { | |
1025 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); | |
1026 | + if (cfs_rq->skip == se) | |
1027 | + cfs_rq->skip = NULL; | |
1028 | + else | |
1029 | + break; | |
1030 | + } | |
1031 | +} | |
1032 | + | |
1020 | 1033 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
1021 | 1034 | { |
1022 | 1035 | if (cfs_rq->last == se) |
... | ... | @@ -1024,6 +1037,9 @@ |
1024 | 1037 | |
1025 | 1038 | if (cfs_rq->next == se) |
1026 | 1039 | __clear_buddies_next(se); |
1040 | + | |
1041 | + if (cfs_rq->skip == se) | |
1042 | + __clear_buddies_skip(se); | |
1027 | 1043 | } |
1028 | 1044 | |
1029 | 1045 | static void |
... | ... | @@ -1099,7 +1115,7 @@ |
1099 | 1115 | return; |
1100 | 1116 | |
1101 | 1117 | if (cfs_rq->nr_running > 1) { |
1102 | - struct sched_entity *se = __pick_next_entity(cfs_rq); | |
1118 | + struct sched_entity *se = __pick_first_entity(cfs_rq); | |
1103 | 1119 | s64 delta = curr->vruntime - se->vruntime; |
1104 | 1120 | |
1105 | 1121 | if (delta < 0) |
1106 | 1122 | |
1107 | 1123 | |
... | ... | @@ -1143,13 +1159,27 @@ |
1143 | 1159 | static int |
1144 | 1160 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); |
1145 | 1161 | |
1162 | +/* | |
1163 | + * Pick the next process, keeping these things in mind, in this order: | |
1164 | + * 1) keep things fair between processes/task groups | |
1165 | + * 2) pick the "next" process, since someone really wants that to run | |
1166 | + * 3) pick the "last" process, for cache locality | |
1167 | + * 4) do not run the "skip" process, if something else is available | |
1168 | + */ | |
1146 | 1169 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
1147 | 1170 | { |
1148 | - struct sched_entity *se = __pick_next_entity(cfs_rq); | |
1171 | + struct sched_entity *se = __pick_first_entity(cfs_rq); | |
1149 | 1172 | struct sched_entity *left = se; |
1150 | 1173 | |
1151 | - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | |
1152 | - se = cfs_rq->next; | |
1174 | + /* | |
1175 | + * Avoid running the skip buddy, if running something else can | |
1176 | + * be done without getting too unfair. | |
1177 | + */ | |
1178 | + if (cfs_rq->skip == se) { | |
1179 | + struct sched_entity *second = __pick_next_entity(se); | |
1180 | + if (second && wakeup_preempt_entity(second, left) < 1) | |
1181 | + se = second; | |
1182 | + } | |
1153 | 1183 | |
1154 | 1184 | /* |
1155 | 1185 | * Prefer last buddy, try to return the CPU to a preempted task. |
... | ... | @@ -1157,6 +1187,12 @@ |
1157 | 1187 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) |
1158 | 1188 | se = cfs_rq->last; |
1159 | 1189 | |
1190 | + /* | |
1191 | + * Someone really wants this to run. If it's not unfair, run it. | |
1192 | + */ | |
1193 | + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | |
1194 | + se = cfs_rq->next; | |
1195 | + | |
1160 | 1196 | clear_buddies(cfs_rq, se); |
1161 | 1197 | |
1162 | 1198 | return se; |
... | ... | @@ -1333,52 +1369,6 @@ |
1333 | 1369 | hrtick_update(rq); |
1334 | 1370 | } |
1335 | 1371 | |
1336 | -/* | |
1337 | - * sched_yield() support is very simple - we dequeue and enqueue. | |
1338 | - * | |
1339 | - * If compat_yield is turned on then we requeue to the end of the tree. | |
1340 | - */ | |
1341 | -static void yield_task_fair(struct rq *rq) | |
1342 | -{ | |
1343 | - struct task_struct *curr = rq->curr; | |
1344 | - struct cfs_rq *cfs_rq = task_cfs_rq(curr); | |
1345 | - struct sched_entity *rightmost, *se = &curr->se; | |
1346 | - | |
1347 | - /* | |
1348 | - * Are we the only task in the tree? | |
1349 | - */ | |
1350 | - if (unlikely(rq->nr_running == 1)) | |
1351 | - return; | |
1352 | - | |
1353 | - clear_buddies(cfs_rq, se); | |
1354 | - | |
1355 | - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { | |
1356 | - update_rq_clock(rq); | |
1357 | - /* | |
1358 | - * Update run-time statistics of the 'current'. | |
1359 | - */ | |
1360 | - update_curr(cfs_rq); | |
1361 | - | |
1362 | - return; | |
1363 | - } | |
1364 | - /* | |
1365 | - * Find the rightmost entry in the rbtree: | |
1366 | - */ | |
1367 | - rightmost = __pick_last_entity(cfs_rq); | |
1368 | - /* | |
1369 | - * Already in the rightmost position? | |
1370 | - */ | |
1371 | - if (unlikely(!rightmost || entity_before(rightmost, se))) | |
1372 | - return; | |
1373 | - | |
1374 | - /* | |
1375 | - * Minimally necessary key value to be last in the tree: | |
1376 | - * Upon rescheduling, sched_class::put_prev_task() will place | |
1377 | - * 'current' within the tree based on its new key value. | |
1378 | - */ | |
1379 | - se->vruntime = rightmost->vruntime + 1; | |
1380 | -} | |
1381 | - | |
1382 | 1372 | #ifdef CONFIG_SMP |
1383 | 1373 | |
1384 | 1374 | static void task_waking_fair(struct rq *rq, struct task_struct *p) |
... | ... | @@ -1849,6 +1839,14 @@ |
1849 | 1839 | } |
1850 | 1840 | } |
1851 | 1841 | |
1842 | +static void set_skip_buddy(struct sched_entity *se) | |
1843 | +{ | |
1844 | + if (likely(task_of(se)->policy != SCHED_IDLE)) { | |
1845 | + for_each_sched_entity(se) | |
1846 | + cfs_rq_of(se)->skip = se; | |
1847 | + } | |
1848 | +} | |
1849 | + | |
1852 | 1850 | /* |
1853 | 1851 | * Preempt the current task with a newly woken task if needed: |
1854 | 1852 | */ |
... | ... | @@ -1945,6 +1943,36 @@ |
1945 | 1943 | cfs_rq = cfs_rq_of(se); |
1946 | 1944 | put_prev_entity(cfs_rq, se); |
1947 | 1945 | } |
1946 | +} | |
1947 | + | |
1948 | +/* | |
1949 | + * sched_yield() is very simple | |
1950 | + * | |
1951 | + * The magic of dealing with the ->skip buddy is in pick_next_entity. | |
1952 | + */ | |
1953 | +static void yield_task_fair(struct rq *rq) | |
1954 | +{ | |
1955 | + struct task_struct *curr = rq->curr; | |
1956 | + struct cfs_rq *cfs_rq = task_cfs_rq(curr); | |
1957 | + struct sched_entity *se = &curr->se; | |
1958 | + | |
1959 | + /* | |
1960 | + * Are we the only task in the tree? | |
1961 | + */ | |
1962 | + if (unlikely(rq->nr_running == 1)) | |
1963 | + return; | |
1964 | + | |
1965 | + clear_buddies(cfs_rq, se); | |
1966 | + | |
1967 | + if (curr->policy != SCHED_BATCH) { | |
1968 | + update_rq_clock(rq); | |
1969 | + /* | |
1970 | + * Update run-time statistics of the 'current'. | |
1971 | + */ | |
1972 | + update_curr(cfs_rq); | |
1973 | + } | |
1974 | + | |
1975 | + set_skip_buddy(se); | |
1948 | 1976 | } |
1949 | 1977 | |
1950 | 1978 | #ifdef CONFIG_SMP |
kernel/sysctl.c
... | ... | @@ -360,13 +360,6 @@ |
360 | 360 | .mode = 0644, |
361 | 361 | .proc_handler = sched_rt_handler, |
362 | 362 | }, |
363 | - { | |
364 | - .procname = "sched_compat_yield", | |
365 | - .data = &sysctl_sched_compat_yield, | |
366 | - .maxlen = sizeof(unsigned int), | |
367 | - .mode = 0644, | |
368 | - .proc_handler = proc_dointvec, | |
369 | - }, | |
370 | 363 | #ifdef CONFIG_SCHED_AUTOGROUP |
371 | 364 | { |
372 | 365 | .procname = "sched_autogroup_enabled", |