Commit d55262c4d164759a8debe772da6c9b16059dec47

Authored by Tejun Heo
1 parent 4c16bd327c

workqueue: update sysfs interface to reflect NUMA awareness and a kernel param t…

…o disable NUMA affinity

Unbound workqueues are now NUMA aware.  Let's add some control knobs
and update sysfs interface accordingly.

* Add kernel param workqueue.disable_numa which disables NUMA affinity
  globally.

* Replace sysfs file "pool_id" with "pool_ids" which contains
  node:pool_id pairs.  This change is userland-visible but "pool_id"
  hasn't seen a release yet, so this is okay.

* Add a new sysfs file "numa" which can toggle NUMA affinity on
  individual workqueues.  This is implemented as attrs->no_numa which
  is special in that it isn't part of a pool's attributes.  It only
  affects how apply_workqueue_attrs() picks which pools to use.

After "pool_ids" change, first_pwq() doesn't have any user left.
Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>

Showing 3 changed files with 73 additions and 23 deletions Side-by-side Diff

Documentation/kernel-parameters.txt
... ... @@ -3222,6 +3222,15 @@
3222 3222 or other driver-specific files in the
3223 3223 Documentation/watchdog/ directory.
3224 3224  
  3225 + workqueue.disable_numa
  3226 + By default, all work items queued to unbound
  3227 + workqueues are affine to the NUMA nodes they're
  3228 + issued on, which results in better behavior in
  3229 + general. If NUMA affinity needs to be disabled for
  3230 + whatever reason, this option can be used. Note
  3231 + that this also can be controlled per-workqueue for
  3232 + workqueues visible under /sys/bus/workqueue/.
  3233 +
3225 3234 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
3226 3235 default x2apic cluster mode on platforms
3227 3236 supporting x2apic.
include/linux/workqueue.h
... ... @@ -119,10 +119,15 @@
119 119 /*
120 120 * A struct for workqueue attributes. This can be used to change
121 121 * attributes of an unbound workqueue.
  122 + *
  123 + * Unlike other fields, ->no_numa isn't a property of a worker_pool. It
  124 + * only modifies how apply_workqueue_attrs() select pools and thus doesn't
  125 + * participate in pool hash calculations or equality comparisons.
122 126 */
123 127 struct workqueue_attrs {
124 128 int nice; /* nice level */
125 129 cpumask_var_t cpumask; /* allowed CPUs */
  130 + bool no_numa; /* disable NUMA affinity */
126 131 };
127 132  
128 133 static inline struct delayed_work *to_delayed_work(struct work_struct *work)
... ... @@ -268,6 +268,9 @@
268 268 static cpumask_var_t *wq_numa_possible_cpumask;
269 269 /* possible CPUs of each node */
270 270  
  271 +static bool wq_disable_numa;
  272 +module_param_named(disable_numa, wq_disable_numa, bool, 0444);
  273 +
271 274 static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
272 275  
273 276 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
... ... @@ -517,21 +520,6 @@
517 520 }
518 521  
519 522 /**
520   - * first_pwq - return the first pool_workqueue of the specified workqueue
521   - * @wq: the target workqueue
522   - *
523   - * This must be called either with wq->mutex held or sched RCU read locked.
524   - * If the pwq needs to be used beyond the locking in effect, the caller is
525   - * responsible for guaranteeing that the pwq stays online.
526   - */
527   -static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
528   -{
529   - assert_rcu_or_wq_mutex(wq);
530   - return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue,
531   - pwqs_node);
532   -}
533   -
534   -/**
535 523 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
536 524 * @wq: the target workqueue
537 525 * @node: the node ID
538 526  
539 527  
... ... @@ -3114,16 +3102,21 @@
3114 3102 __ATTR_NULL,
3115 3103 };
3116 3104  
3117   -static ssize_t wq_pool_id_show(struct device *dev,
3118   - struct device_attribute *attr, char *buf)
  3105 +static ssize_t wq_pool_ids_show(struct device *dev,
  3106 + struct device_attribute *attr, char *buf)
3119 3107 {
3120 3108 struct workqueue_struct *wq = dev_to_wq(dev);
3121   - struct worker_pool *pool;
3122   - int written;
  3109 + const char *delim = "";
  3110 + int node, written = 0;
3123 3111  
3124 3112 rcu_read_lock_sched();
3125   - pool = first_pwq(wq)->pool;
3126   - written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id);
  3113 + for_each_node(node) {
  3114 + written += scnprintf(buf + written, PAGE_SIZE - written,
  3115 + "%s%d:%d", delim, node,
  3116 + unbound_pwq_by_node(wq, node)->pool->id);
  3117 + delim = " ";
  3118 + }
  3119 + written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3127 3120 rcu_read_unlock_sched();
3128 3121  
3129 3122 return written;
3130 3123  
3131 3124  
... ... @@ -3212,10 +3205,46 @@
3212 3205 return ret ?: count;
3213 3206 }
3214 3207  
  3208 +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
  3209 + char *buf)
  3210 +{
  3211 + struct workqueue_struct *wq = dev_to_wq(dev);
  3212 + int written;
  3213 +
  3214 + mutex_lock(&wq->mutex);
  3215 + written = scnprintf(buf, PAGE_SIZE, "%d\n",
  3216 + !wq->unbound_attrs->no_numa);
  3217 + mutex_unlock(&wq->mutex);
  3218 +
  3219 + return written;
  3220 +}
  3221 +
  3222 +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
  3223 + const char *buf, size_t count)
  3224 +{
  3225 + struct workqueue_struct *wq = dev_to_wq(dev);
  3226 + struct workqueue_attrs *attrs;
  3227 + int v, ret;
  3228 +
  3229 + attrs = wq_sysfs_prep_attrs(wq);
  3230 + if (!attrs)
  3231 + return -ENOMEM;
  3232 +
  3233 + ret = -EINVAL;
  3234 + if (sscanf(buf, "%d", &v) == 1) {
  3235 + attrs->no_numa = !v;
  3236 + ret = apply_workqueue_attrs(wq, attrs);
  3237 + }
  3238 +
  3239 + free_workqueue_attrs(attrs);
  3240 + return ret ?: count;
  3241 +}
  3242 +
3215 3243 static struct device_attribute wq_sysfs_unbound_attrs[] = {
3216   - __ATTR(pool_id, 0444, wq_pool_id_show, NULL),
  3244 + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3217 3245 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3218 3246 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
  3247 + __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3219 3248 __ATTR_NULL,
3220 3249 };
3221 3250  
... ... @@ -3750,7 +3779,7 @@
3750 3779 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3751 3780 int cpu_going_down, cpumask_t *cpumask)
3752 3781 {
3753   - if (!wq_numa_enabled)
  3782 + if (!wq_numa_enabled || attrs->no_numa)
3754 3783 goto use_dfl;
3755 3784  
3756 3785 /* does @node have any online CPUs @attrs wants? */
... ... @@ -3951,6 +3980,8 @@
3951 3980 cpumask = target_attrs->cpumask;
3952 3981  
3953 3982 mutex_lock(&wq->mutex);
  3983 + if (wq->unbound_attrs->no_numa)
  3984 + goto out_unlock;
3954 3985  
3955 3986 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
3956 3987 pwq = unbound_pwq_by_node(wq, node);
... ... @@ -4762,6 +4793,11 @@
4762 4793  
4763 4794 if (num_possible_nodes() <= 1)
4764 4795 return;
  4796 +
  4797 + if (wq_disable_numa) {
  4798 + pr_info("workqueue: NUMA affinity support disabled\n");
  4799 + return;
  4800 + }
4765 4801  
4766 4802 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
4767 4803 BUG_ON(!wq_update_unbound_numa_attrs_buf);