Commit ac53db596cc08ecb8040cfb6f71ae40c6f2041c4

Authored by Rik van Riel
Committed by Ingo Molnar
1 parent 2c13c919d9

sched: Use a buddy to implement yield_task_fair()

Use the buddy mechanism to implement yield_task_fair.  This
allows us to skip onto the next highest priority se at every
level in the CFS tree, unless doing so would introduce gross
unfairness in CPU time distribution.
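
For reference, yield_task_fair() is the fair-class hook reached when a CFS task
calls sched_yield().  The userspace snippet below only illustrates that entry
point; it is illustrative and not part of this commit:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	int i;

	/*
	 * Each call lands in the fair class's yield hook, which after
	 * this change marks the caller as the cfs_rq's "skip" buddy
	 * instead of moving it within the rbtree.
	 */
	for (i = 0; i < 10; i++) {
		if (sched_yield() != 0)
			perror("sched_yield");
	}
	return 0;
}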

We order the buddy selection in pick_next_entity to check
yield first, then last, then next.  We need next to be able
to override yield, because it is possible for the "next" and
"yield" task to be different processen in the same sub-tree
of the CFS tree.  When they are, we need to go into that
sub-tree regardless of the "yield" hint, and pick the correct
entity once we get to the right level.
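
As a rough userspace model of that ordering (struct entity, too_unfair() and
GRAN below are made-up stand-ins for sched_entity, wakeup_preempt_entity()
and the wakeup granularity; a sketch of the idea, not kernel code):

#include <stddef.h>
#include <stdio.h>

struct entity {
	const char *name;
	long long vruntime;	/* smaller means more entitled to run */
};

#define GRAN 1000000LL		/* stand-in for the wakeup granularity (ns) */

/* Non-zero if preferring 'buddy' over the leftmost entity is too unfair. */
static int too_unfair(const struct entity *buddy, const struct entity *left)
{
	return buddy->vruntime - left->vruntime > GRAN;
}

static const struct entity *
pick(const struct entity *left, const struct entity *second,
     const struct entity *skip, const struct entity *last,
     const struct entity *next)
{
	const struct entity *se = left;	/* 1) fairness baseline: leftmost */

	/* 4) avoid the skip (yield) buddy if a fair alternative exists */
	if (skip == se && second && !too_unfair(second, left))
		se = second;

	/* 3) prefer the last buddy, for cache locality */
	if (last && !too_unfair(last, left))
		se = last;

	/* 2) the next buddy overrides both, someone wants it to run */
	if (next && !too_unfair(next, left))
		se = next;

	return se;
}

int main(void)
{
	struct entity a = { "A (just yielded)", 100 };
	struct entity b = { "B", 200 };

	/*
	 * A called sched_yield(), so it is both leftmost and the skip
	 * buddy; B is within the granularity, so B is picked instead.
	 */
	printf("picked: %s\n", pick(&a, &b, &a, NULL, NULL)->name);
	return 0;
}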

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 5 changed files with 90 additions and 71 deletions

include/linux/sched.h
... ... @@ -1942,8 +1942,6 @@
1942 1942 void __user *buffer, size_t *lenp,
1943 1943 loff_t *ppos);
1944 1944  
1945   -extern unsigned int sysctl_sched_compat_yield;
1946   -
1947 1945 #ifdef CONFIG_SCHED_AUTOGROUP
1948 1946 extern unsigned int sysctl_sched_autogroup_enabled;
1949 1947  
kernel/sched.c
... ... @@ -324,7 +324,7 @@
324 324 * 'curr' points to currently running entity on this cfs_rq.
325 325 * It is set to NULL otherwise (i.e when none are currently running).
326 326 */
327   - struct sched_entity *curr, *next, *last;
  327 + struct sched_entity *curr, *next, *last, *skip;
328 328  
329 329 unsigned int nr_spread_over;
330 330  
kernel/sched_debug.c
... ... @@ -179,7 +179,7 @@
179 179  
180 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 181 if (cfs_rq->rb_leftmost)
182   - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
  182 + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 183 last = __pick_last_entity(cfs_rq);
184 184 if (last)
185 185 max_vruntime = last->vruntime;
kernel/sched_fair.c
... ... @@ -69,14 +69,6 @@
69 69 unsigned int sysctl_sched_child_runs_first __read_mostly;
70 70  
71 71 /*
72   - * sys_sched_yield() compat mode
73   - *
74   - * This option switches the agressive yield implementation of the
75   - * old scheduler back on.
76   - */
77   -unsigned int __read_mostly sysctl_sched_compat_yield;
78   -
79   -/*
80 72 * SCHED_OTHER wake-up granularity.
81 73 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 74 *
... ... @@ -419,7 +411,7 @@
419 411 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420 412 }
421 413  
422   -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
  414 +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423 415 {
424 416 struct rb_node *left = cfs_rq->rb_leftmost;
425 417  
... ... @@ -429,6 +421,17 @@
429 421 return rb_entry(left, struct sched_entity, run_node);
430 422 }
431 423  
  424 +static struct sched_entity *__pick_next_entity(struct sched_entity *se)
  425 +{
  426 + struct rb_node *next = rb_next(&se->run_node);
  427 +
  428 + if (!next)
  429 + return NULL;
  430 +
  431 + return rb_entry(next, struct sched_entity, run_node);
  432 +}
  433 +
  434 +#ifdef CONFIG_SCHED_DEBUG
432 435 static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433 436 {
434 437 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
... ... @@ -443,7 +446,6 @@
443 446 * Scheduling class statistics methods:
444 447 */
445 448  
446   -#ifdef CONFIG_SCHED_DEBUG
447 449 int sched_proc_update_handler(struct ctl_table *table, int write,
448 450 void __user *buffer, size_t *lenp,
449 451 loff_t *ppos)
... ... @@ -1017,6 +1019,17 @@
1017 1019 }
1018 1020 }
1019 1021  
  1022 +static void __clear_buddies_skip(struct sched_entity *se)
  1023 +{
  1024 + for_each_sched_entity(se) {
  1025 + struct cfs_rq *cfs_rq = cfs_rq_of(se);
  1026 + if (cfs_rq->skip == se)
  1027 + cfs_rq->skip = NULL;
  1028 + else
  1029 + break;
  1030 + }
  1031 +}
  1032 +
1020 1033 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1021 1034 {
1022 1035 if (cfs_rq->last == se)
... ... @@ -1024,6 +1037,9 @@
1024 1037  
1025 1038 if (cfs_rq->next == se)
1026 1039 __clear_buddies_next(se);
  1040 +
  1041 + if (cfs_rq->skip == se)
  1042 + __clear_buddies_skip(se);
1027 1043 }
1028 1044  
1029 1045 static void
... ... @@ -1099,7 +1115,7 @@
1099 1115 return;
1100 1116  
1101 1117 if (cfs_rq->nr_running > 1) {
1102   - struct sched_entity *se = __pick_next_entity(cfs_rq);
  1118 + struct sched_entity *se = __pick_first_entity(cfs_rq);
1103 1119 s64 delta = curr->vruntime - se->vruntime;
1104 1120  
1105 1121 if (delta < 0)
... ... @@ -1143,13 +1159,27 @@
1143 1159 static int
1144 1160 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1145 1161  
  1162 +/*
  1163 + * Pick the next process, keeping these things in mind, in this order:
  1164 + * 1) keep things fair between processes/task groups
  1165 + * 2) pick the "next" process, since someone really wants that to run
  1166 + * 3) pick the "last" process, for cache locality
  1167 + * 4) do not run the "skip" process, if something else is available
  1168 + */
1146 1169 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1147 1170 {
1148   - struct sched_entity *se = __pick_next_entity(cfs_rq);
  1171 + struct sched_entity *se = __pick_first_entity(cfs_rq);
1149 1172 struct sched_entity *left = se;
1150 1173  
1151   - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1152   - se = cfs_rq->next;
  1174 + /*
  1175 + * Avoid running the skip buddy, if running something else can
  1176 + * be done without getting too unfair.
  1177 + */
  1178 + if (cfs_rq->skip == se) {
  1179 + struct sched_entity *second = __pick_next_entity(se);
  1180 + if (second && wakeup_preempt_entity(second, left) < 1)
  1181 + se = second;
  1182 + }
1153 1183  
1154 1184 /*
1155 1185 * Prefer last buddy, try to return the CPU to a preempted task.
... ... @@ -1157,6 +1187,12 @@
1157 1187 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1158 1188 se = cfs_rq->last;
1159 1189  
  1190 + /*
  1191 + * Someone really wants this to run. If it's not unfair, run it.
  1192 + */
  1193 + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
  1194 + se = cfs_rq->next;
  1195 +
1160 1196 clear_buddies(cfs_rq, se);
1161 1197  
1162 1198 return se;
... ... @@ -1333,52 +1369,6 @@
1333 1369 hrtick_update(rq);
1334 1370 }
1335 1371  
1336   -/*
1337   - * sched_yield() support is very simple - we dequeue and enqueue.
1338   - *
1339   - * If compat_yield is turned on then we requeue to the end of the tree.
1340   - */
1341   -static void yield_task_fair(struct rq *rq)
1342   -{
1343   - struct task_struct *curr = rq->curr;
1344   - struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1345   - struct sched_entity *rightmost, *se = &curr->se;
1346   -
1347   - /*
1348   - * Are we the only task in the tree?
1349   - */
1350   - if (unlikely(rq->nr_running == 1))
1351   - return;
1352   -
1353   - clear_buddies(cfs_rq, se);
1354   -
1355   - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1356   - update_rq_clock(rq);
1357   - /*
1358   - * Update run-time statistics of the 'current'.
1359   - */
1360   - update_curr(cfs_rq);
1361   -
1362   - return;
1363   - }
1364   - /*
1365   - * Find the rightmost entry in the rbtree:
1366   - */
1367   - rightmost = __pick_last_entity(cfs_rq);
1368   - /*
1369   - * Already in the rightmost position?
1370   - */
1371   - if (unlikely(!rightmost || entity_before(rightmost, se)))
1372   - return;
1373   -
1374   - /*
1375   - * Minimally necessary key value to be last in the tree:
1376   - * Upon rescheduling, sched_class::put_prev_task() will place
1377   - * 'current' within the tree based on its new key value.
1378   - */
1379   - se->vruntime = rightmost->vruntime + 1;
1380   -}
1381   -
1382 1372 #ifdef CONFIG_SMP
1383 1373  
1384 1374 static void task_waking_fair(struct rq *rq, struct task_struct *p)
... ... @@ -1849,6 +1839,14 @@
1849 1839 }
1850 1840 }
1851 1841  
  1842 +static void set_skip_buddy(struct sched_entity *se)
  1843 +{
  1844 + if (likely(task_of(se)->policy != SCHED_IDLE)) {
  1845 + for_each_sched_entity(se)
  1846 + cfs_rq_of(se)->skip = se;
  1847 + }
  1848 +}
  1849 +
1852 1850 /*
1853 1851 * Preempt the current task with a newly woken task if needed:
1854 1852 */
... ... @@ -1945,6 +1943,36 @@
1945 1943 cfs_rq = cfs_rq_of(se);
1946 1944 put_prev_entity(cfs_rq, se);
1947 1945 }
  1946 +}
  1947 +
  1948 +/*
  1949 + * sched_yield() is very simple
  1950 + *
  1951 + * The magic of dealing with the ->skip buddy is in pick_next_entity.
  1952 + */
  1953 +static void yield_task_fair(struct rq *rq)
  1954 +{
  1955 + struct task_struct *curr = rq->curr;
  1956 + struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  1957 + struct sched_entity *se = &curr->se;
  1958 +
  1959 + /*
  1960 + * Are we the only task in the tree?
  1961 + */
  1962 + if (unlikely(rq->nr_running == 1))
  1963 + return;
  1964 +
  1965 + clear_buddies(cfs_rq, se);
  1966 +
  1967 + if (curr->policy != SCHED_BATCH) {
  1968 + update_rq_clock(rq);
  1969 + /*
  1970 + * Update run-time statistics of the 'current'.
  1971 + */
  1972 + update_curr(cfs_rq);
  1973 + }
  1974 +
  1975 + set_skip_buddy(se);
1948 1976 }
1949 1977  
1950 1978 #ifdef CONFIG_SMP
kernel/sysctl.c
... ... @@ -360,13 +360,6 @@
360 360 .mode = 0644,
361 361 .proc_handler = sched_rt_handler,
362 362 },
363   - {
364   - .procname = "sched_compat_yield",
365   - .data = &sysctl_sched_compat_yield,
366   - .maxlen = sizeof(unsigned int),
367   - .mode = 0644,
368   - .proc_handler = proc_dointvec,
369   - },
370 363 #ifdef CONFIG_SCHED_AUTOGROUP
371 364 {
372 365 .procname = "sched_autogroup_enabled",