Commit d55262c4d164759a8debe772da6c9b16059dec47

Authored by Tejun Heo
1 parent 4c16bd327c

workqueue: update sysfs interface to reflect NUMA awareness and a kernel param t…

…o disable NUMA affinity

Unbound workqueues are now NUMA aware.  Let's add some control knobs
and update sysfs interface accordingly.

* Add kernel param workqueue.disable_numa which disables NUMA affinity
  globally.

* Replace sysfs file "pool_id" with "pool_ids" which contains
  node:pool_id pairs.  This change is userland-visible but "pool_id"
  hasn't seen a release yet, so this is okay.

* Add a new sysfs file "numa" which can toggle NUMA affinity on
  individual workqueues.  This is implemented as attrs->no_numa which
  is special in that it isn't part of a pool's attributes.  It only
  affects how apply_workqueue_attrs() picks which pools to use.

After "pool_ids" change, first_pwq() doesn't have any user left.
Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>

Showing 3 changed files with 73 additions and 23 deletions Side-by-side Diff

Documentation/kernel-parameters.txt
... ... @@ -3222,6 +3222,15 @@
3222 3222 or other driver-specific files in the
3223 3223 Documentation/watchdog/ directory.
3224 3224  
  3225 + workqueue.disable_numa
  3226 + By default, all work items queued to unbound
  3227 + workqueues are affine to the NUMA nodes they're
  3228 + issued on, which results in better behavior in
  3229 + general. If NUMA affinity needs to be disabled for
  3230 + whatever reason, this option can be used. Note
  3231 + that this also can be controlled per-workqueue for
  3232 + workqueues visible under /sys/bus/workqueue/.
  3233 +
3225 3234 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
3226 3235 default x2apic cluster mode on platforms
3227 3236 supporting x2apic.
include/linux/workqueue.h
... ... @@ -119,10 +119,15 @@
119 119 /*
120 120 * A struct for workqueue attributes. This can be used to change
121 121 * attributes of an unbound workqueue.
  122 + *
  123 + * Unlike other fields, ->no_numa isn't a property of a worker_pool. It
  124 + * only modifies how apply_workqueue_attrs() select pools and thus doesn't
  125 + * participate in pool hash calculations or equality comparisons.
122 126 */
123 127 struct workqueue_attrs {
124 128 int nice; /* nice level */
125 129 cpumask_var_t cpumask; /* allowed CPUs */
  130 + bool no_numa; /* disable NUMA affinity */
126 131 };
127 132  
128 133 static inline struct delayed_work *to_delayed_work(struct work_struct *work)
... ... @@ -268,6 +268,9 @@
268 268 static cpumask_var_t *wq_numa_possible_cpumask;
269 269 /* possible CPUs of each node */
270 270  
  271 +static bool wq_disable_numa;
  272 +module_param_named(disable_numa, wq_disable_numa, bool, 0444);
  273 +
271 274 static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
272 275  
273 276 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
... ... @@ -517,21 +520,6 @@
517 520 }
518 521  
519 522 /**
520   - * first_pwq - return the first pool_workqueue of the specified workqueue
521   - * @wq: the target workqueue
522   - *
523   - * This must be called either with wq->mutex held or sched RCU read locked.
524   - * If the pwq needs to be used beyond the locking in effect, the caller is
525   - * responsible for guaranteeing that the pwq stays online.
526   - */
527   -static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
528   -{
529   - assert_rcu_or_wq_mutex(wq);
530   - return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue,
531   - pwqs_node);
532   -}
533   -
534   -/**
535 523 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
536 524 * @wq: the target workqueue
537 525 * @node: the node ID
538 526  
539 527  
... ... @@ -3114,16 +3102,21 @@
3114 3102 __ATTR_NULL,
3115 3103 };
3116 3104  
3117   -static ssize_t wq_pool_id_show(struct device *dev,
3118   - struct device_attribute *attr, char *buf)
  3105 +static ssize_t wq_pool_ids_show(struct device *dev,
  3106 + struct device_attribute *attr, char *buf)
3119 3107 {
3120 3108 struct workqueue_struct *wq = dev_to_wq(dev);
3121   - struct worker_pool *pool;
3122   - int written;
  3109 + const char *delim = "";
  3110 + int node, written = 0;
3123 3111  
3124 3112 rcu_read_lock_sched();
3125   - pool = first_pwq(wq)->pool;
3126   - written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id);
  3113 + for_each_node(node) {
  3114 + written += scnprintf(buf + written, PAGE_SIZE - written,
  3115 + "%s%d:%d", delim, node,
  3116 + unbound_pwq_by_node(wq, node)->pool->id);
  3117 + delim = " ";
  3118 + }
  3119 + written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3127 3120 rcu_read_unlock_sched();
3128 3121  
3129 3122 return written;
3130 3123  
3131 3124  
... ... @@ -3212,10 +3205,46 @@
3212 3205 return ret ?: count;
3213 3206 }
3214 3207  
  3208 +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
  3209 + char *buf)
  3210 +{
  3211 + struct workqueue_struct *wq = dev_to_wq(dev);
  3212 + int written;
  3213 +
  3214 + mutex_lock(&wq->mutex);
  3215 + written = scnprintf(buf, PAGE_SIZE, "%d\n",
  3216 + !wq->unbound_attrs->no_numa);
  3217 + mutex_unlock(&wq->mutex);
  3218 +
  3219 + return written;
  3220 +}
  3221 +
  3222 +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
  3223 + const char *buf, size_t count)
  3224 +{
  3225 + struct workqueue_struct *wq = dev_to_wq(dev);
  3226 + struct workqueue_attrs *attrs;
  3227 + int v, ret;
  3228 +
  3229 + attrs = wq_sysfs_prep_attrs(wq);
  3230 + if (!attrs)
  3231 + return -ENOMEM;
  3232 +
  3233 + ret = -EINVAL;
  3234 + if (sscanf(buf, "%d", &v) == 1) {
  3235 + attrs->no_numa = !v;
  3236 + ret = apply_workqueue_attrs(wq, attrs);
  3237 + }
  3238 +
  3239 + free_workqueue_attrs(attrs);
  3240 + return ret ?: count;
  3241 +}
  3242 +
3215 3243 static struct device_attribute wq_sysfs_unbound_attrs[] = {
3216   - __ATTR(pool_id, 0444, wq_pool_id_show, NULL),
  3244 + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3217 3245 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3218 3246 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
  3247 + __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3219 3248 __ATTR_NULL,
3220 3249 };
3221 3250  
... ... @@ -3750,7 +3779,7 @@
3750 3779 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3751 3780 int cpu_going_down, cpumask_t *cpumask)
3752 3781 {
3753   - if (!wq_numa_enabled)
  3782 + if (!wq_numa_enabled || attrs->no_numa)
3754 3783 goto use_dfl;
3755 3784  
3756 3785 /* does @node have any online CPUs @attrs wants? */
... ... @@ -3951,6 +3980,8 @@
3951 3980 cpumask = target_attrs->cpumask;
3952 3981  
3953 3982 mutex_lock(&wq->mutex);
  3983 + if (wq->unbound_attrs->no_numa)
  3984 + goto out_unlock;
3954 3985  
3955 3986 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
3956 3987 pwq = unbound_pwq_by_node(wq, node);
... ... @@ -4762,6 +4793,11 @@
4762 4793  
4763 4794 if (num_possible_nodes() <= 1)
4764 4795 return;
  4796 +
  4797 + if (wq_disable_numa) {
  4798 + pr_info("workqueue: NUMA affinity support disabled\n");
  4799 + return;
  4800 + }
4765 4801  
4766 4802 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
4767 4803 BUG_ON(!wq_update_unbound_numa_attrs_buf);