Commit c6ea41fbbe08f270a8edef99dc369faf809d1bd6

Authored by Mikulas Patocka
Committed by Alasdair G Kergon
1 parent a705a34a56

dm kcopyd: preallocate sub jobs to avoid deadlock

There's a possible theoretical deadlock in dm-kcopyd because multiple
allocations from the same mempool are required to finish a request.
Avoid this by preallocating sub jobs.

There is a mempool of 512 entries. Each request requires up to 9
entries from the mempool (1 master job plus 8 sub jobs). If we have at
least 57 concurrent requests running, the mempool may be exhausted and
mempool allocations may start blocking until another entry is freed to
the mempool. Because the same thread is used to free entries to the
mempool and allocate entries from the mempool, this may result in a
deadlock.

This patch changes it so that one mempool entry contains all 9 "struct
kcopyd_job" required to fulfill the whole request. The allocation is
done only once in dm_kcopyd_copy and no further mempool allocations are
done during request processing.

If dm_kcopyd_copy is not run in the completion thread, this
implementation is deadlock-free.

MIN_JOBS needs reducing accordingly and we've chosen to reduce it
further to 8.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

Showing 1 changed file with 29 additions and 20 deletions Side-by-side Diff

drivers/md/dm-kcopyd.c
... ... @@ -27,6 +27,10 @@
27 27  
28 28 #include "dm.h"
29 29  
  30 +#define SUB_JOB_SIZE 128
  31 +#define SPLIT_COUNT 8
  32 +#define MIN_JOBS 8
  33 +
30 34 /*-----------------------------------------------------------------
31 35 * Each kcopyd client has its own little pool of preallocated
32 36 * pages for kcopyd io.
33 37  
34 38  
... ... @@ -216,16 +220,17 @@
216 220 struct mutex lock;
217 221 atomic_t sub_jobs;
218 222 sector_t progress;
  223 +
  224 + struct kcopyd_job *master_job;
219 225 };
220 226  
221   -/* FIXME: this should scale with the number of pages */
222   -#define MIN_JOBS 512
223   -
224 227 static struct kmem_cache *_job_cache;
225 228  
226 229 int __init dm_kcopyd_init(void)
227 230 {
228   - _job_cache = KMEM_CACHE(kcopyd_job, 0);
  231 + _job_cache = kmem_cache_create("kcopyd_job",
  232 + sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1),
  233 + __alignof__(struct kcopyd_job), 0, NULL);
229 234 if (!_job_cache)
230 235 return -ENOMEM;
231 236  
... ... @@ -299,7 +304,12 @@
299 304  
300 305 if (job->pages)
301 306 kcopyd_put_pages(kc, job->pages);
302   - mempool_free(job, kc->job_pool);
  307 + /*
  308 + * If this is the master job, the sub jobs have already
  309 + * completed so we can free everything.
  310 + */
  311 + if (job->master_job == job)
  312 + mempool_free(job, kc->job_pool);
303 313 fn(read_err, write_err, context);
304 314  
305 315 if (atomic_dec_and_test(&kc->nr_jobs))
306 316  
... ... @@ -460,14 +470,14 @@
460 470 wake(kc);
461 471 }
462 472  
463   -#define SUB_JOB_SIZE 128
464 473 static void segment_complete(int read_err, unsigned long write_err,
465 474 void *context)
466 475 {
467 476 /* FIXME: tidy this function */
468 477 sector_t progress = 0;
469 478 sector_t count = 0;
470   - struct kcopyd_job *job = (struct kcopyd_job *) context;
  479 + struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
  480 + struct kcopyd_job *job = sub_job->master_job;
471 481 struct dm_kcopyd_client *kc = job->kc;
472 482  
473 483 mutex_lock(&job->lock);
... ... @@ -498,8 +508,6 @@
498 508  
499 509 if (count) {
500 510 int i;
501   - struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
502   - GFP_NOIO);
503 511  
504 512 *sub_job = *job;
505 513 sub_job->source.sector += progress;
... ... @@ -511,7 +519,7 @@
511 519 }
512 520  
513 521 sub_job->fn = segment_complete;
514   - sub_job->context = job;
  522 + sub_job->context = sub_job;
515 523 dispatch_job(sub_job);
516 524  
517 525 } else if (atomic_dec_and_test(&job->sub_jobs)) {
518 526  
519 527  
520 528  
... ... @@ -531,19 +539,19 @@
531 539 }
532 540  
533 541 /*
534   - * Create some little jobs that will do the move between
535   - * them.
  542 + * Create some sub jobs to share the work between them.
536 543 */
537   -#define SPLIT_COUNT 8
538   -static void split_job(struct kcopyd_job *job)
  544 +static void split_job(struct kcopyd_job *master_job)
539 545 {
540 546 int i;
541 547  
542   - atomic_inc(&job->kc->nr_jobs);
  548 + atomic_inc(&master_job->kc->nr_jobs);
543 549  
544   - atomic_set(&job->sub_jobs, SPLIT_COUNT);
545   - for (i = 0; i < SPLIT_COUNT; i++)
546   - segment_complete(0, 0u, job);
  550 + atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
  551 + for (i = 0; i < SPLIT_COUNT; i++) {
  552 + master_job[i + 1].master_job = master_job;
  553 + segment_complete(0, 0u, &master_job[i + 1]);
  554 + }
547 555 }
548 556  
549 557 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
... ... @@ -553,7 +561,8 @@
553 561 struct kcopyd_job *job;
554 562  
555 563 /*
556   - * Allocate a new job.
  564 + * Allocate an array of jobs consisting of one master job
  565 + * followed by SPLIT_COUNT sub jobs.
557 566 */
558 567 job = mempool_alloc(kc->job_pool, GFP_NOIO);
559 568  
560 569  
... ... @@ -577,10 +586,10 @@
577 586  
578 587 job->fn = fn;
579 588 job->context = context;
  589 + job->master_job = job;
580 590  
581 591 if (job->source.count <= SUB_JOB_SIZE)
582 592 dispatch_job(job);
583   -
584 593 else {
585 594 mutex_init(&job->lock);
586 595 job->progress = 0;