Commit 1d3504fcf5606579d60b649d19f44b3871c1ddae

Authored by Hidetoshi Seto
Committed by Ingo Molnar
1 parent 4d5f35533f

sched, cpuset: customize sched domains, core

[rebased for sched-devel/latest]

 - Add a new cpuset file holding an integer level:
     sched_relax_domain_level
   (a usage sketch follows below)

 - Modify partition_sched_domains() and build_sched_domains()
   to take an attributes parameter passed in from cpuset.

 - Fill in newidle_idx for node domains; it is currently unused but
   may be required if sched_relax_domain_level becomes higher.

 - The default level can be changed with the boot option
   'relax_domain_level='.
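
An illustrative userspace sketch (not part of this patch; the mount
point /dev/cpuset and the cpuset name "mygroup" are assumptions) that
writes a level into the new file:

	/*
	 * Minimal sketch: set sched_relax_domain_level for an existing
	 * cpuset.  Assumes the cpuset filesystem is mounted at
	 * /dev/cpuset and that a cpuset "mygroup" already exists.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const char *path =
			"/dev/cpuset/mygroup/sched_relax_domain_level";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror("fopen");
			return EXIT_FAILURE;
		}
		/*
		 * Negative values are stored as -1, meaning "use the
		 * system default"; larger values let idle balancing
		 * search wider domains (see enum sched_domain_level).
		 */
		if (fprintf(f, "2\n") < 0)
			perror("fprintf");
		fclose(f);
		return 0;
	}

The same default can be requested system-wide by booting with
'relax_domain_level=2' on the kernel command line.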

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 7 changed files with 161 additions and 11 deletions

include/asm-ia64/topology.h
... ... @@ -93,7 +93,7 @@
93 93 .cache_nice_tries = 2, \
94 94 .busy_idx = 3, \
95 95 .idle_idx = 2, \
96   - .newidle_idx = 0, /* unused */ \
  96 + .newidle_idx = 2, \
97 97 .wake_idx = 1, \
98 98 .forkexec_idx = 1, \
99 99 .flags = SD_LOAD_BALANCE \
include/asm-sh/topology.h
... ... @@ -16,7 +16,7 @@
16 16 .cache_nice_tries = 2, \
17 17 .busy_idx = 3, \
18 18 .idle_idx = 2, \
19   - .newidle_idx = 0, \
  19 + .newidle_idx = 2, \
20 20 .wake_idx = 1, \
21 21 .forkexec_idx = 1, \
22 22 .flags = SD_LOAD_BALANCE \
include/asm-x86/topology.h
... ... @@ -147,7 +147,7 @@
147 147  
148 148 # define SD_CACHE_NICE_TRIES 2
149 149 # define SD_IDLE_IDX 2
150   -# define SD_NEWIDLE_IDX 0
  150 +# define SD_NEWIDLE_IDX 2
151 151 # define SD_FORKEXEC_IDX 1
152 152  
153 153 #endif
include/linux/sched.h
... ... @@ -704,6 +704,7 @@
704 704 #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
705 705 #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
706 706 #define SD_SERIALIZE 1024 /* Only a single load balancing instance */
  707 +#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
707 708  
708 709 #define BALANCE_FOR_MC_POWER \
709 710 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
... ... @@ -733,6 +734,24 @@
733 734 u32 reciprocal_cpu_power;
734 735 };
735 736  
  737 +enum sched_domain_level {
  738 + SD_LV_NONE = 0,
  739 + SD_LV_SIBLING,
  740 + SD_LV_MC,
  741 + SD_LV_CPU,
  742 + SD_LV_NODE,
  743 + SD_LV_ALLNODES,
  744 + SD_LV_MAX
  745 +};
  746 +
  747 +struct sched_domain_attr {
  748 + int relax_domain_level;
  749 +};
  750 +
  751 +#define SD_ATTR_INIT (struct sched_domain_attr) { \
  752 + .relax_domain_level = -1, \
  753 +}
  754 +
736 755 struct sched_domain {
737 756 /* These fields must be setup */
738 757 struct sched_domain *parent; /* top domain must be null terminated */
... ... @@ -750,6 +769,7 @@
750 769 unsigned int wake_idx;
751 770 unsigned int forkexec_idx;
752 771 int flags; /* See SD_* */
  772 + enum sched_domain_level level;
753 773  
754 774 /* Runtime fields. */
755 775 unsigned long last_balance; /* init to jiffies. units in jiffies */
... ... @@ -789,7 +809,8 @@
789 809 #endif
790 810 };
791 811  
792   -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
  812 +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
  813 + struct sched_domain_attr *dattr_new);
793 814 extern int arch_reinit_sched_domains(void);
794 815  
795 816 #endif /* CONFIG_SMP */
kernel/cpuset.c
... ... @@ -98,6 +98,9 @@
98 98 /* partition number for rebuild_sched_domains() */
99 99 int pn;
100 100  
  101 + /* for custom sched domain */
  102 + int relax_domain_level;
  103 +
101 104 /* used for walking a cpuset heirarchy */
102 105 struct list_head stack_list;
103 106 };
... ... @@ -478,6 +481,16 @@
478 481 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
479 482 }
480 483  
  484 +static void
  485 +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  486 +{
  487 + if (!dattr)
  488 + return;
  489 + if (dattr->relax_domain_level < c->relax_domain_level)
  490 + dattr->relax_domain_level = c->relax_domain_level;
  491 + return;
  492 +}
  493 +
481 494 /*
482 495 * rebuild_sched_domains()
483 496 *
484 497  
... ... @@ -553,12 +566,14 @@
553 566 int csn; /* how many cpuset ptrs in csa so far */
554 567 int i, j, k; /* indices for partition finding loops */
555 568 cpumask_t *doms; /* resulting partition; i.e. sched domains */
  569 + struct sched_domain_attr *dattr; /* attributes for custom domains */
556 570 int ndoms; /* number of sched domains in result */
557 571 int nslot; /* next empty doms[] cpumask_t slot */
558 572  
559 573 q = NULL;
560 574 csa = NULL;
561 575 doms = NULL;
  576 + dattr = NULL;
562 577  
563 578 /* Special case for the 99% of systems with one, full, sched domain */
564 579 if (is_sched_load_balance(&top_cpuset)) {
... ... @@ -566,6 +581,11 @@
566 581 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
567 582 if (!doms)
568 583 goto rebuild;
  584 + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
  585 + if (dattr) {
  586 + *dattr = SD_ATTR_INIT;
  587 + update_domain_attr(dattr, &top_cpuset);
  588 + }
569 589 *doms = top_cpuset.cpus_allowed;
570 590 goto rebuild;
571 591 }
... ... @@ -622,6 +642,7 @@
622 642 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
623 643 if (!doms)
624 644 goto rebuild;
  645 + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
625 646  
626 647 for (nslot = 0, i = 0; i < csn; i++) {
627 648 struct cpuset *a = csa[i];
628 649  
... ... @@ -644,12 +665,15 @@
644 665 }
645 666  
646 667 cpus_clear(*dp);
  668 + if (dattr)
  669 + *(dattr + nslot) = SD_ATTR_INIT;
647 670 for (j = i; j < csn; j++) {
648 671 struct cpuset *b = csa[j];
649 672  
650 673 if (apn == b->pn) {
651 674 cpus_or(*dp, *dp, b->cpus_allowed);
652 675 b->pn = -1;
  676 + update_domain_attr(dattr, b);
653 677 }
654 678 }
655 679 nslot++;
... ... @@ -660,7 +684,7 @@
660 684 rebuild:
661 685 /* Have scheduler rebuild sched domains */
662 686 get_online_cpus();
663   - partition_sched_domains(ndoms, doms);
  687 + partition_sched_domains(ndoms, doms, dattr);
664 688 put_online_cpus();
665 689  
666 690 done:
... ... @@ -668,6 +692,7 @@
668 692 kfifo_free(q);
669 693 kfree(csa);
670 694 /* Don't kfree(doms) -- partition_sched_domains() does that. */
  695 + /* Don't kfree(dattr) -- partition_sched_domains() does that. */
671 696 }
672 697  
673 698 static inline int started_after_time(struct task_struct *t1,
... ... @@ -1011,6 +1036,21 @@
1011 1036 return 0;
1012 1037 }
1013 1038  
  1039 +static int update_relax_domain_level(struct cpuset *cs, char *buf)
  1040 +{
  1041 + int val = simple_strtol(buf, NULL, 10);
  1042 +
  1043 + if (val < 0)
  1044 + val = -1;
  1045 +
  1046 + if (val != cs->relax_domain_level) {
  1047 + cs->relax_domain_level = val;
  1048 + rebuild_sched_domains();
  1049 + }
  1050 +
  1051 + return 0;
  1052 +}
  1053 +
1014 1054 /*
1015 1055 * update_flag - read a 0 or a 1 in a file and update associated flag
1016 1056 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
... ... @@ -1202,6 +1242,7 @@
1202 1242 FILE_CPU_EXCLUSIVE,
1203 1243 FILE_MEM_EXCLUSIVE,
1204 1244 FILE_SCHED_LOAD_BALANCE,
  1245 + FILE_SCHED_RELAX_DOMAIN_LEVEL,
1205 1246 FILE_MEMORY_PRESSURE_ENABLED,
1206 1247 FILE_MEMORY_PRESSURE,
1207 1248 FILE_SPREAD_PAGE,
... ... @@ -1256,6 +1297,9 @@
1256 1297 case FILE_SCHED_LOAD_BALANCE:
1257 1298 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
1258 1299 break;
  1300 + case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  1301 + retval = update_relax_domain_level(cs, buffer);
  1302 + break;
1259 1303 case FILE_MEMORY_MIGRATE:
1260 1304 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1261 1305 break;
... ... @@ -1354,6 +1398,9 @@
1354 1398 case FILE_SCHED_LOAD_BALANCE:
1355 1399 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1356 1400 break;
  1401 + case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  1402 + s += sprintf(s, "%d", cs->relax_domain_level);
  1403 + break;
1357 1404 case FILE_MEMORY_MIGRATE:
1358 1405 *s++ = is_memory_migrate(cs) ? '1' : '0';
1359 1406 break;
... ... @@ -1424,6 +1471,13 @@
1424 1471 .private = FILE_SCHED_LOAD_BALANCE,
1425 1472 };
1426 1473  
  1474 +static struct cftype cft_sched_relax_domain_level = {
  1475 + .name = "sched_relax_domain_level",
  1476 + .read = cpuset_common_file_read,
  1477 + .write = cpuset_common_file_write,
  1478 + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
  1479 +};
  1480 +
1427 1481 static struct cftype cft_memory_migrate = {
1428 1482 .name = "memory_migrate",
1429 1483 .read = cpuset_common_file_read,
... ... @@ -1475,6 +1529,9 @@
1475 1529 return err;
1476 1530 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1477 1531 return err;
  1532 + if ((err = cgroup_add_file(cont, ss,
  1533 + &cft_sched_relax_domain_level)) < 0)
  1534 + return err;
1478 1535 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1479 1536 return err;
1480 1537 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
... ... @@ -1559,6 +1616,7 @@
1559 1616 nodes_clear(cs->mems_allowed);
1560 1617 cs->mems_generation = cpuset_mems_generation++;
1561 1618 fmeter_init(&cs->fmeter);
  1619 + cs->relax_domain_level = -1;
1562 1620  
1563 1621 cs->parent = parent;
1564 1622 number_of_cpusets++;
... ... @@ -1631,6 +1689,7 @@
1631 1689 fmeter_init(&top_cpuset.fmeter);
1632 1690 top_cpuset.mems_generation = cpuset_mems_generation++;
1633 1691 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
  1692 + top_cpuset.relax_domain_level = -1;
1634 1693  
1635 1694 err = register_filesystem(&cpuset_fs_type);
1636 1695 if (err < 0)
kernel/sched.c
... ... @@ -6771,6 +6771,7 @@
6771 6771 { \
6772 6772 memset(sd, 0, sizeof(*sd)); \
6773 6773 *sd = SD_##type##_INIT; \
  6774 + sd->level = SD_LV_##type; \
6774 6775 }
6775 6776  
6776 6777 SD_INIT_FUNC(CPU)
6777 6778  
... ... @@ -6819,11 +6820,42 @@
6819 6820 #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
6820 6821 ((unsigned long)(a) + offsetof(struct allmasks, v))
6821 6822  
  6823 +static int default_relax_domain_level = -1;
  6824 +
  6825 +static int __init setup_relax_domain_level(char *str)
  6826 +{
  6827 + default_relax_domain_level = simple_strtoul(str, NULL, 0);
  6828 + return 1;
  6829 +}
  6830 +__setup("relax_domain_level=", setup_relax_domain_level);
  6831 +
  6832 +static void set_domain_attribute(struct sched_domain *sd,
  6833 + struct sched_domain_attr *attr)
  6834 +{
  6835 + int request;
  6836 +
  6837 + if (!attr || attr->relax_domain_level < 0) {
  6838 + if (default_relax_domain_level < 0)
  6839 + return;
  6840 + else
  6841 + request = default_relax_domain_level;
  6842 + } else
  6843 + request = attr->relax_domain_level;
  6844 + if (request < sd->level) {
  6845 + /* turn off idle balance on this domain */
  6846 + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
  6847 + } else {
  6848 + /* turn on idle balance on this domain */
  6849 + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
  6850 + }
  6851 +}
  6852 +
6822 6853 /*
6823 6854 * Build sched domains for a given set of cpus and attach the sched domains
6824 6855 * to the individual cpus
6825 6856 */
6826   -static int build_sched_domains(const cpumask_t *cpu_map)
  6857 +static int __build_sched_domains(const cpumask_t *cpu_map,
  6858 + struct sched_domain_attr *attr)
6827 6859 {
6828 6860 int i;
6829 6861 struct root_domain *rd;
... ... @@ -6887,6 +6919,7 @@
6887 6919 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
6888 6920 sd = &per_cpu(allnodes_domains, i);
6889 6921 SD_INIT(sd, ALLNODES);
  6922 + set_domain_attribute(sd, attr);
6890 6923 sd->span = *cpu_map;
6891 6924 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
6892 6925 p = sd;
... ... @@ -6896,6 +6929,7 @@
6896 6929  
6897 6930 sd = &per_cpu(node_domains, i);
6898 6931 SD_INIT(sd, NODE);
  6932 + set_domain_attribute(sd, attr);
6899 6933 sched_domain_node_span(cpu_to_node(i), &sd->span);
6900 6934 sd->parent = p;
6901 6935 if (p)
... ... @@ -6906,6 +6940,7 @@
6906 6940 p = sd;
6907 6941 sd = &per_cpu(phys_domains, i);
6908 6942 SD_INIT(sd, CPU);
  6943 + set_domain_attribute(sd, attr);
6909 6944 sd->span = *nodemask;
6910 6945 sd->parent = p;
6911 6946 if (p)
... ... @@ -6916,6 +6951,7 @@
6916 6951 p = sd;
6917 6952 sd = &per_cpu(core_domains, i);
6918 6953 SD_INIT(sd, MC);
  6954 + set_domain_attribute(sd, attr);
6919 6955 sd->span = cpu_coregroup_map(i);
6920 6956 cpus_and(sd->span, sd->span, *cpu_map);
6921 6957 sd->parent = p;
... ... @@ -6927,6 +6963,7 @@
6927 6963 p = sd;
6928 6964 sd = &per_cpu(cpu_domains, i);
6929 6965 SD_INIT(sd, SIBLING);
  6966 + set_domain_attribute(sd, attr);
6930 6967 sd->span = per_cpu(cpu_sibling_map, i);
6931 6968 cpus_and(sd->span, sd->span, *cpu_map);
6932 6969 sd->parent = p;
6933 6970  
... ... @@ -7124,8 +7161,15 @@
7124 7161 #endif
7125 7162 }
7126 7163  
  7164 +static int build_sched_domains(const cpumask_t *cpu_map)
  7165 +{
  7166 + return __build_sched_domains(cpu_map, NULL);
  7167 +}
  7168 +
7127 7169 static cpumask_t *doms_cur; /* current sched domains */
7128 7170 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
  7171 +static struct sched_domain_attr *dattr_cur; /* attribues of custom domains
  7172 + in 'doms_cur' */
7129 7173  
7130 7174 /*
7131 7175 * Special case: If a kmalloc of a doms_cur partition (array of
... ... @@ -7153,6 +7197,7 @@
7153 7197 if (!doms_cur)
7154 7198 doms_cur = &fallback_doms;
7155 7199 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
  7200 + dattr_cur = NULL;
7156 7201 err = build_sched_domains(doms_cur);
7157 7202 register_sched_domain_sysctl();
7158 7203  
... ... @@ -7182,6 +7227,22 @@
7182 7227 arch_destroy_sched_domains(cpu_map, &tmpmask);
7183 7228 }
7184 7229  
  7230 +/* handle null as "default" */
  7231 +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  7232 + struct sched_domain_attr *new, int idx_new)
  7233 +{
  7234 + struct sched_domain_attr tmp;
  7235 +
  7236 + /* fast path */
  7237 + if (!new && !cur)
  7238 + return 1;
  7239 +
  7240 + tmp = SD_ATTR_INIT;
  7241 + return !memcmp(cur ? (cur + idx_cur) : &tmp,
  7242 + new ? (new + idx_new) : &tmp,
  7243 + sizeof(struct sched_domain_attr));
  7244 +}
  7245 +
7185 7246 /*
7186 7247 * Partition sched domains as specified by the 'ndoms_new'
7187 7248 * cpumasks in the array doms_new[] of cpumasks. This compares
... ... @@ -7203,7 +7264,8 @@
7203 7264 *
7204 7265 * Call with hotplug lock held
7205 7266 */
7206   -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
  7267 +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
  7268 + struct sched_domain_attr *dattr_new)
7207 7269 {
7208 7270 int i, j;
7209 7271  
7210 7272  
... ... @@ -7216,12 +7278,14 @@
7216 7278 ndoms_new = 1;
7217 7279 doms_new = &fallback_doms;
7218 7280 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
  7281 + dattr_new = NULL;
7219 7282 }
7220 7283  
7221 7284 /* Destroy deleted domains */
7222 7285 for (i = 0; i < ndoms_cur; i++) {
7223 7286 for (j = 0; j < ndoms_new; j++) {
7224   - if (cpus_equal(doms_cur[i], doms_new[j]))
  7287 + if (cpus_equal(doms_cur[i], doms_new[j])
  7288 + && dattrs_equal(dattr_cur, i, dattr_new, j))
7225 7289 goto match1;
7226 7290 }
7227 7291 /* no match - a current sched domain not in new doms_new[] */
7228 7292  
... ... @@ -7233,11 +7297,13 @@
7233 7297 /* Build new domains */
7234 7298 for (i = 0; i < ndoms_new; i++) {
7235 7299 for (j = 0; j < ndoms_cur; j++) {
7236   - if (cpus_equal(doms_new[i], doms_cur[j]))
  7300 + if (cpus_equal(doms_new[i], doms_cur[j])
  7301 + && dattrs_equal(dattr_new, i, dattr_cur, j))
7237 7302 goto match2;
7238 7303 }
7239 7304 /* no match - add a new doms_new */
7240   - build_sched_domains(doms_new + i);
  7305 + __build_sched_domains(doms_new + i,
  7306 + dattr_new ? dattr_new + i : NULL);
7241 7307 match2:
7242 7308 ;
7243 7309 }
7244 7310  
... ... @@ -7245,7 +7311,9 @@
7245 7311 /* Remember the new sched domains */
7246 7312 if (doms_cur != &fallback_doms)
7247 7313 kfree(doms_cur);
  7314 + kfree(dattr_cur); /* kfree(NULL) is safe */
7248 7315 doms_cur = doms_new;
  7316 + dattr_cur = dattr_new;
7249 7317 ndoms_cur = ndoms_new;
7250 7318  
7251 7319 register_sched_domain_sysctl();
kernel/sched_fair.c
... ... @@ -940,7 +940,9 @@
940 940 return cpu;
941 941  
942 942 for_each_domain(cpu, sd) {
943   - if (sd->flags & SD_WAKE_IDLE) {
  943 + if ((sd->flags & SD_WAKE_IDLE)
  944 + || ((sd->flags & SD_WAKE_IDLE_FAR)
  945 + && !task_hot(p, task_rq(p)->clock, sd))) {
944 946 cpus_and(tmp, sd->span, p->cpus_allowed);
945 947 for_each_cpu_mask(i, tmp) {
946 948 if (idle_cpu(i)) {
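
As a closing illustration (not part of the patch), a self-contained
userspace mock that mirrors the "request < sd->level" comparison in
set_domain_attribute(): for a requested relax level, it prints which
domain levels keep newidle/wake-idle balancing enabled.

	/*
	 * Self-contained mock, illustrative only: mirrors the
	 * comparison in set_domain_attribute() to show which domain
	 * levels keep idle balancing for a given requested level.
	 */
	#include <stdio.h>

	enum sched_domain_level {	/* copied from the patch */
		SD_LV_NONE = 0,
		SD_LV_SIBLING,
		SD_LV_MC,
		SD_LV_CPU,
		SD_LV_NODE,
		SD_LV_ALLNODES,
		SD_LV_MAX
	};

	static const char * const names[] = {
		"NONE", "SIBLING", "MC", "CPU", "NODE", "ALLNODES"
	};

	int main(void)
	{
		int request = 3;	/* e.g. echo 3 > sched_relax_domain_level */
		int lv;

		for (lv = SD_LV_SIBLING; lv <= SD_LV_ALLNODES; lv++)
			printf("%-8s: idle balancing %s\n", names[lv],
			       request < lv ? "off" : "on");
		return 0;
	}

With request = 3 this prints "on" for SIBLING, MC and CPU and "off"
for NODE and ALLNODES, i.e. idle balancing is relaxed up to and
including the CPU (node-local) level.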