Commit ccea34b5d0fbab081496d1860f31acee99fa8a6d

Authored by Tejun Heo
1 parent a56dbddf06

percpu: finer grained locking to break deadlock and allow atomic free

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL, and the whole alloc/free
paths were protected by a single mutex.  All percpu allocations have
been made from GFP_KERNEL-safe context and the original allocator was
built on that assumption too.  However, by protecting both the alloc
and free paths with the same mutex, the new allocator creates a
free -> alloc -> GFP_KERNEL dependency which the original allocator
didn't have.  This can lead to deadlock if free is called from FS or
IO paths.  Also, in general, allocators are expected to allow free to
be called from atomic context.
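
For illustration, a minimal sketch of the problem (not code from the
patch; the function names and the kmalloc()/kfree() stand-ins are
invented for the example):

/*
 * Illustrative sketch only -- not part of this patch.  Shows how the
 * old single-mutex scheme creates the free -> alloc -> GFP_KERNEL
 * dependency.  kmalloc()/kfree() stand in for the allocator's real
 * GFP_KERNEL work.
 */
#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_MUTEX(pcpu_mutex);	/* the old "one mutex to rule them all" */

static void *old_style_alloc(size_t size)
{
	void *p;

	mutex_lock(&pcpu_mutex);
	/* GFP_KERNEL may block and enter FS/IO reclaim with the mutex held */
	p = kmalloc(size, GFP_KERNEL);
	mutex_unlock(&pcpu_mutex);
	return p;
}

static void old_style_free(void *p)
{
	/*
	 * If the reclaim triggered by the allocation above ends up here,
	 * the task already holds pcpu_mutex -> deadlock.  It also means
	 * free can never be called from atomic context.
	 */
	mutex_lock(&pcpu_mutex);
	kfree(p);
	mutex_unlock(&pcpu_mutex);
}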

This patch implements finer grained locking to break the deadlock and
allow atomic free.  For details, please read the "Synchronization
rules" comment.
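
Roughly, the resulting locking discipline looks like the sketch below
(simplified and hypothetical; the real implementation is in the diff
that follows):

/*
 * Simplified sketch of the two-lock discipline introduced by this
 * patch; kzalloc() stands in for the actual page population.
 */
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_MUTEX(pcpu_alloc_mutex);	/* alloc/reclaim paths */
static DEFINE_SPINLOCK(pcpu_lock);	/* index data structures */

static void *new_style_alloc(size_t size)
{
	void *area;

	mutex_lock(&pcpu_alloc_mutex);

	spin_lock_irq(&pcpu_lock);
	/* ... search the index and reserve an area ... */
	spin_unlock_irq(&pcpu_lock);

	/* GFP_KERNEL work happens with pcpu_lock dropped, mutex still held */
	area = kzalloc(size, GFP_KERNEL);

	mutex_unlock(&pcpu_alloc_mutex);
	return area;
}

static void new_style_free(void *area)
{
	unsigned long flags;

	/* Only index updates -- safe from atomic context. */
	spin_lock_irqsave(&pcpu_lock, flags);
	kfree(area);	/* stand-in for marking the area free in the index */
	spin_unlock_irqrestore(&pcpu_lock, flags);
}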

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>

Showing 1 changed file with 124 additions and 37 deletions

... ... @@ -62,6 +62,7 @@
62 62 #include <linux/pfn.h>
63 63 #include <linux/rbtree.h>
64 64 #include <linux/slab.h>
  65 +#include <linux/spinlock.h>
65 66 #include <linux/vmalloc.h>
66 67 #include <linux/workqueue.h>
67 68  
68 69  
69 70  
70 71  
... ... @@ -101,20 +102,28 @@
101 102 static int pcpu_reserved_chunk_limit;
102 103  
103 104 /*
104   - * One mutex to rule them all.
  105 + * Synchronization rules.
105 106 *
106   - * The following mutex is grabbed in the outermost public alloc/free
107   - * interface functions and released only when the operation is
108   - * complete. As such, every function in this file other than the
109   - * outermost functions are called under pcpu_mutex.
  107 + * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
  108 + * protects allocation/reclaim paths, chunks and chunk->page arrays.
  109 + * The latter is a spinlock and protects the index data structures -
  110 + * chunk slots, rbtree, chunks and area maps in chunks.
110 111 *
111   - * It can easily be switched to use spinlock such that only the area
112   - * allocation and page population commit are protected with it doing
113   - * actual [de]allocation without holding any lock. However, given
114   - * what this allocator does, I think it's better to let them run
115   - * sequentially.
  112 + * During allocation, pcpu_alloc_mutex is kept locked all the time and
  113 + * pcpu_lock is grabbed and released as necessary. All actual memory
  114 + * allocations are done using GFP_KERNEL with pcpu_lock released.
  115 + *
  116 + * Free path accesses and alters only the index data structures, so it
  117 + * can be safely called from atomic context. When memory needs to be
  118 + * returned to the system, free path schedules reclaim_work which
  119 + * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
  120 + * reclaimed, releases both locks and frees the chunks. Note that it's
  121 + * necessary to grab both locks to remove a chunk from circulation as
  122 + * allocation path might be referencing the chunk with only
  123 + * pcpu_alloc_mutex locked.
116 124 */
117   -static DEFINE_MUTEX(pcpu_mutex);
  125 +static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
  126 +static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
118 127  
119 128 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
120 129 static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
... ... @@ -176,6 +185,9 @@
176 185 * kzalloc() is used; otherwise, vmalloc() is used. The returned
177 186 * memory is always zeroed.
178 187 *
  188 + * CONTEXT:
  189 + * Does GFP_KERNEL allocation.
  190 + *
179 191 * RETURNS:
180 192 * Pointer to the allocated area on success, NULL on failure.
181 193 */
... ... @@ -215,6 +227,9 @@
215 227 * New slot according to the changed state is determined and @chunk is
216 228 * moved to the slot. Note that the reserved chunk is never put on
217 229 * chunk slots.
  230 + *
  231 + * CONTEXT:
  232 + * pcpu_lock.
218 233 */
219 234 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
220 235 {
... ... @@ -260,6 +275,9 @@
260 275 * searches for the chunk with the highest start address which isn't
261 276 * beyond @addr.
262 277 *
  278 + * CONTEXT:
  279 + * pcpu_lock.
  280 + *
263 281 * RETURNS:
264 282 * The address of the found chunk.
265 283 */
... ... @@ -300,6 +318,9 @@
300 318 * @new: chunk to insert
301 319 *
302 320 * Insert @new into address rb tree.
  321 + *
  322 + * CONTEXT:
  323 + * pcpu_lock.
303 324 */
304 325 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
305 326 {
... ... @@ -319,6 +340,10 @@
319 340 * A single allocation can split an area into three areas, so this
320 341 * function makes sure that @chunk->map has at least two extra slots.
321 342 *
  343 + * CONTEXT:
  344 + * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
  345 + * if area map is extended.
  346 + *
322 347 * RETURNS:
323 348 * 0 if noop, 1 if successfully extended, -errno on failure.
324 349 */
325 350  
326 351  
327 352  
... ... @@ -332,14 +357,26 @@
332 357 if (chunk->map_alloc >= chunk->map_used + 2)
333 358 return 0;
334 359  
  360 + spin_unlock_irq(&pcpu_lock);
  361 +
335 362 new_alloc = PCPU_DFL_MAP_ALLOC;
336 363 while (new_alloc < chunk->map_used + 2)
337 364 new_alloc *= 2;
338 365  
339 366 new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
340   - if (!new)
  367 + if (!new) {
  368 + spin_lock_irq(&pcpu_lock);
341 369 return -ENOMEM;
  370 + }
342 371  
  372 + /*
  373 + * Acquire pcpu_lock and switch to new area map. Only free
  374 + * could have happened in between, so map_used couldn't have
  375 + * grown.
  376 + */
  377 + spin_lock_irq(&pcpu_lock);
  378 + BUG_ON(new_alloc < chunk->map_used + 2);
  379 +
343 380 size = chunk->map_alloc * sizeof(chunk->map[0]);
344 381 memcpy(new, chunk->map, size);
345 382  
... ... @@ -371,6 +408,9 @@
371 408 * is inserted after the target block.
372 409 *
373 410 * @chunk->map must have enough free slots to accommodate the split.
  411 + *
  412 + * CONTEXT:
  413 + * pcpu_lock.
374 414 */
375 415 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
376 416 int head, int tail)
... ... @@ -406,6 +446,9 @@
406 446 *
407 447 * @chunk->map must have at least two free slots.
408 448 *
  449 + * CONTEXT:
  450 + * pcpu_lock.
  451 + *
409 452 * RETURNS:
410 453 * Allocated offset in @chunk on success, -1 if no matching area is
411 454 * found.
... ... @@ -495,6 +538,9 @@
495 538 * Free area starting from @freeme to @chunk. Note that this function
496 539 * only modifies the allocation map. It doesn't depopulate or unmap
497 540 * the area.
  541 + *
  542 + * CONTEXT:
  543 + * pcpu_lock.
498 544 */
499 545 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
500 546 {
... ... @@ -580,6 +626,9 @@
580 626 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
581 627 * from @chunk. If @flush is true, vcache is flushed before unmapping
582 628 * and tlb after.
  629 + *
  630 + * CONTEXT:
  631 + * pcpu_alloc_mutex.
583 632 */
584 633 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
585 634 bool flush)
... ... @@ -658,6 +707,9 @@
658 707 *
659 708 * For each cpu, populate and map pages [@page_start,@page_end) into
660 709 * @chunk. The area is cleared on return.
  710 + *
  711 + * CONTEXT:
  712 + * pcpu_alloc_mutex, does GFP_KERNEL allocation.
661 713 */
662 714 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
663 715 {
664 716  
665 717  
... ... @@ -748,15 +800,16 @@
748 800 * @align: alignment of area (max PAGE_SIZE)
749 801 * @reserved: allocate from the reserved chunk if available
750 802 *
751   - * Allocate percpu area of @size bytes aligned at @align. Might
752   - * sleep. Might trigger writeouts.
  803 + * Allocate percpu area of @size bytes aligned at @align.
753 804 *
  805 + * CONTEXT:
  806 + * Does GFP_KERNEL allocation.
  807 + *
754 808 * RETURNS:
755 809 * Percpu pointer to the allocated area on success, NULL on failure.
756 810 */
757 811 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
758 812 {
759   - void *ptr = NULL;
760 813 struct pcpu_chunk *chunk;
761 814 int slot, off;
762 815  
763 816  
764 817  
765 818  
766 819  
... ... @@ -766,27 +819,37 @@
766 819 return NULL;
767 820 }
768 821  
769   - mutex_lock(&pcpu_mutex);
  822 + mutex_lock(&pcpu_alloc_mutex);
  823 + spin_lock_irq(&pcpu_lock);
770 824  
771 825 /* serve reserved allocations from the reserved chunk if available */
772 826 if (reserved && pcpu_reserved_chunk) {
773 827 chunk = pcpu_reserved_chunk;
774 828 if (size > chunk->contig_hint ||
775 829 pcpu_extend_area_map(chunk) < 0)
776   - goto out_unlock;
  830 + goto fail_unlock;
777 831 off = pcpu_alloc_area(chunk, size, align);
778 832 if (off >= 0)
779 833 goto area_found;
780   - goto out_unlock;
  834 + goto fail_unlock;
781 835 }
782 836  
  837 +restart:
783 838 /* search through normal chunks */
784 839 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
785 840 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
786 841 if (size > chunk->contig_hint)
787 842 continue;
788   - if (pcpu_extend_area_map(chunk) < 0)
789   - goto out_unlock;
  843 +
  844 + switch (pcpu_extend_area_map(chunk)) {
  845 + case 0:
  846 + break;
  847 + case 1:
  848 + goto restart; /* pcpu_lock dropped, restart */
  849 + default:
  850 + goto fail_unlock;
  851 + }
  852 +
790 853 off = pcpu_alloc_area(chunk, size, align);
791 854 if (off >= 0)
792 855 goto area_found;
793 856  
794 857  
795 858  
796 859  
797 860  
798 861  
799 862  
... ... @@ -794,27 +857,36 @@
794 857 }
795 858  
796 859 /* hmmm... no space left, create a new chunk */
  860 + spin_unlock_irq(&pcpu_lock);
  861 +
797 862 chunk = alloc_pcpu_chunk();
798 863 if (!chunk)
799   - goto out_unlock;
  864 + goto fail_unlock_mutex;
  865 +
  866 + spin_lock_irq(&pcpu_lock);
800 867 pcpu_chunk_relocate(chunk, -1);
801 868 pcpu_chunk_addr_insert(chunk);
  869 + goto restart;
802 870  
803   - off = pcpu_alloc_area(chunk, size, align);
804   - if (off < 0)
805   - goto out_unlock;
806   -
807 871 area_found:
  872 + spin_unlock_irq(&pcpu_lock);
  873 +
808 874 /* populate, map and clear the area */
809 875 if (pcpu_populate_chunk(chunk, off, size)) {
  876 + spin_lock_irq(&pcpu_lock);
810 877 pcpu_free_area(chunk, off);
811   - goto out_unlock;
  878 + goto fail_unlock;
812 879 }
813 880  
814   - ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
815   -out_unlock:
816   - mutex_unlock(&pcpu_mutex);
817   - return ptr;
  881 + mutex_unlock(&pcpu_alloc_mutex);
  882 +
  883 + return __addr_to_pcpu_ptr(chunk->vm->addr + off);
  884 +
  885 +fail_unlock:
  886 + spin_unlock_irq(&pcpu_lock);
  887 +fail_unlock_mutex:
  888 + mutex_unlock(&pcpu_alloc_mutex);
  889 + return NULL;
818 890 }
819 891  
820 892 /**
... ... @@ -825,6 +897,9 @@
825 897 * Allocate percpu area of @size bytes aligned at @align. Might
826 898 * sleep. Might trigger writeouts.
827 899 *
  900 + * CONTEXT:
  901 + * Does GFP_KERNEL allocation.
  902 + *
828 903 * RETURNS:
829 904 * Percpu pointer to the allocated area on success, NULL on failure.
830 905 */
... ... @@ -843,6 +918,9 @@
843 918 * percpu area if arch has set it up; otherwise, allocation is served
844 919 * from the same dynamic area. Might sleep. Might trigger writeouts.
845 920 *
  921 + * CONTEXT:
  922 + * Does GFP_KERNEL allocation.
  923 + *
846 924 * RETURNS:
847 925 * Percpu pointer to the allocated area on success, NULL on failure.
848 926 */
... ... @@ -856,6 +934,9 @@
856 934 * @work: unused
857 935 *
858 936 * Reclaim all fully free chunks except for the first one.
  937 + *
  938 + * CONTEXT:
  939 + * workqueue context.
859 940 */
860 941 static void pcpu_reclaim(struct work_struct *work)
861 942 {
... ... @@ -863,7 +944,8 @@
863 944 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
864 945 struct pcpu_chunk *chunk, *next;
865 946  
866   - mutex_lock(&pcpu_mutex);
  947 + mutex_lock(&pcpu_alloc_mutex);
  948 + spin_lock_irq(&pcpu_lock);
867 949  
868 950 list_for_each_entry_safe(chunk, next, head, list) {
869 951 WARN_ON(chunk->immutable);
... ... @@ -876,7 +958,8 @@
876 958 list_move(&chunk->list, &todo);
877 959 }
878 960  
879   - mutex_unlock(&pcpu_mutex);
  961 + spin_unlock_irq(&pcpu_lock);
  962 + mutex_unlock(&pcpu_alloc_mutex);
880 963  
881 964 list_for_each_entry_safe(chunk, next, &todo, list) {
882 965 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
883 966  
884 967  
... ... @@ -888,18 +971,22 @@
888 971 * free_percpu - free percpu area
889 972 * @ptr: pointer to area to free
890 973 *
891   - * Free percpu area @ptr. Might sleep.
  974 + * Free percpu area @ptr.
  975 + *
  976 + * CONTEXT:
  977 + * Can be called from atomic context.
892 978 */
893 979 void free_percpu(void *ptr)
894 980 {
895 981 void *addr = __pcpu_ptr_to_addr(ptr);
896 982 struct pcpu_chunk *chunk;
  983 + unsigned long flags;
897 984 int off;
898 985  
899 986 if (!ptr)
900 987 return;
901 988  
902   - mutex_lock(&pcpu_mutex);
  989 + spin_lock_irqsave(&pcpu_lock, flags);
903 990  
904 991 chunk = pcpu_chunk_addr_search(addr);
905 992 off = addr - chunk->vm->addr;
... ... @@ -917,7 +1004,7 @@
917 1004 }
918 1005 }
919 1006  
920   - mutex_unlock(&pcpu_mutex);
  1007 + spin_unlock_irqrestore(&pcpu_lock, flags);
921 1008 }
922 1009 EXPORT_SYMBOL_GPL(free_percpu);
923 1010
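
For reference, the unlink-then-free pattern that pcpu_reclaim() above
relies on, reduced to its shape (a hypothetical sketch -- struct item,
item_pool and item_destroy() are invented stand-ins, not percpu code):

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	struct list_head list;
};

static LIST_HEAD(item_pool);		/* hypothetical index */
static DEFINE_MUTEX(pool_alloc_mutex);
static DEFINE_SPINLOCK(pool_lock);

static void item_destroy(struct item *it)
{
	kfree(it);	/* stand-in for the sleeping depopulate/unmap/free */
}

static void pool_reclaim(void)
{
	LIST_HEAD(todo);
	struct item *it, *next;

	/*
	 * Both locks are needed to take items out of circulation:
	 * the allocation path may walk the index with only the mutex held.
	 */
	mutex_lock(&pool_alloc_mutex);
	spin_lock_irq(&pool_lock);

	list_for_each_entry_safe(it, next, &item_pool, list)
		list_move(&it->list, &todo);

	spin_unlock_irq(&pool_lock);
	mutex_unlock(&pool_alloc_mutex);

	/* The potentially sleeping teardown happens with no locks held. */
	list_for_each_entry_safe(it, next, &todo, list)
		item_destroy(it);
}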