Commit ccea34b5d0fbab081496d1860f31acee99fa8a6d

Authored by Tejun Heo
1 parent a56dbddf06

percpu: finer grained locking to break deadlock and allow atomic free

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL, and the whole alloc/free
paths were protected by a single mutex.  All percpu allocations have
been made from GFP_KERNEL-safe context and the original allocator was
built on that assumption too.  However, by protecting both the alloc
and free paths with the same mutex, the new allocator creates a
free -> alloc -> GFP_KERNEL dependency which the original allocator
didn't have.  This can lead to deadlock if free is called from FS or
IO paths.  Also, in general, allocators are expected to allow free to
be called from atomic context.
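
For illustration, a minimal sketch of the problem (not code from the
patch; the function names and the kmalloc()/kfree() stand-ins are
invented for the example):

/*
 * Illustrative sketch only -- not part of this patch.  Shows how the
 * old single-mutex scheme creates the free -> alloc -> GFP_KERNEL
 * dependency.  kmalloc()/kfree() stand in for the allocator's real
 * GFP_KERNEL work.
 */
#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_MUTEX(pcpu_mutex);	/* the old "one mutex to rule them all" */

static void *old_style_alloc(size_t size)
{
	void *p;

	mutex_lock(&pcpu_mutex);
	/* GFP_KERNEL may block and enter FS/IO reclaim with the mutex held */
	p = kmalloc(size, GFP_KERNEL);
	mutex_unlock(&pcpu_mutex);
	return p;
}

static void old_style_free(void *p)
{
	/*
	 * If the reclaim triggered by the allocation above ends up here,
	 * the task already holds pcpu_mutex -> deadlock.  It also means
	 * free can never be called from atomic context.
	 */
	mutex_lock(&pcpu_mutex);
	kfree(p);
	mutex_unlock(&pcpu_mutex);
}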

This patch implements finer grained locking to break the deadlock and
allow atomic free.  For details, please read the "Synchronization
rules" comment.
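
Roughly, the resulting locking discipline looks like the sketch below
(simplified and hypothetical; the real implementation is in the diff
that follows):

/*
 * Simplified sketch of the two-lock discipline introduced by this
 * patch; kzalloc() stands in for the actual page population.
 */
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_MUTEX(pcpu_alloc_mutex);	/* alloc/reclaim paths */
static DEFINE_SPINLOCK(pcpu_lock);	/* index data structures */

static void *new_style_alloc(size_t size)
{
	void *area;

	mutex_lock(&pcpu_alloc_mutex);

	spin_lock_irq(&pcpu_lock);
	/* ... search the index and reserve an area ... */
	spin_unlock_irq(&pcpu_lock);

	/* GFP_KERNEL work happens with pcpu_lock dropped, mutex still held */
	area = kzalloc(size, GFP_KERNEL);

	mutex_unlock(&pcpu_alloc_mutex);
	return area;
}

static void new_style_free(void *area)
{
	unsigned long flags;

	/* Only index updates -- safe from atomic context. */
	spin_lock_irqsave(&pcpu_lock, flags);
	kfree(area);	/* stand-in for marking the area free in the index */
	spin_unlock_irqrestore(&pcpu_lock, flags);
}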

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>

Showing 1 changed file with 124 additions and 37 deletions

... ... @@ -62,6 +62,7 @@
62 62 #include <linux/pfn.h>
63 63 #include <linux/rbtree.h>
64 64 #include <linux/slab.h>
  65 +#include <linux/spinlock.h>
65 66 #include <linux/vmalloc.h>
66 67 #include <linux/workqueue.h>
67 68  
68 69  
69 70  
70 71  
... ... @@ -101,20 +102,28 @@
101 102 static int pcpu_reserved_chunk_limit;
102 103  
103 104 /*
104   - * One mutex to rule them all.
  105 + * Synchronization rules.
105 106 *
106   - * The following mutex is grabbed in the outermost public alloc/free
107   - * interface functions and released only when the operation is
108   - * complete. As such, every function in this file other than the
109   - * outermost functions are called under pcpu_mutex.
  107 + * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
  108 + * protects allocation/reclaim paths, chunks and chunk->page arrays.
  109 + * The latter is a spinlock and protects the index data structures -
  110 + * chunk slots, rbtree, chunks and area maps in chunks.
110 111 *
111   - * It can easily be switched to use spinlock such that only the area
112   - * allocation and page population commit are protected with it doing
113   - * actual [de]allocation without holding any lock. However, given
114   - * what this allocator does, I think it's better to let them run
115   - * sequentially.
  112 + * During allocation, pcpu_alloc_mutex is kept locked all the time and
  113 + * pcpu_lock is grabbed and released as necessary. All actual memory
  114 + * allocations are done using GFP_KERNEL with pcpu_lock released.
  115 + *
  116 + * Free path accesses and alters only the index data structures, so it
  117 + * can be safely called from atomic context. When memory needs to be
  118 + * returned to the system, free path schedules reclaim_work which
  119 + * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
  120 + * reclaimed, releases both locks and frees the chunks. Note that it's
  121 + * necessary to grab both locks to remove a chunk from circulation as
  122 + * allocation path might be referencing the chunk with only
  123 + * pcpu_alloc_mutex locked.
116 124 */
117   -static DEFINE_MUTEX(pcpu_mutex);
  125 +static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
  126 +static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
118 127  
119 128 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
120 129 static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
... ... @@ -176,6 +185,9 @@
176 185 * kzalloc() is used; otherwise, vmalloc() is used. The returned
177 186 * memory is always zeroed.
178 187 *
  188 + * CONTEXT:
  189 + * Does GFP_KERNEL allocation.
  190 + *
179 191 * RETURNS:
180 192 * Pointer to the allocated area on success, NULL on failure.
181 193 */
... ... @@ -215,6 +227,9 @@
215 227 * New slot according to the changed state is determined and @chunk is
216 228 * moved to the slot. Note that the reserved chunk is never put on
217 229 * chunk slots.
  230 + *
  231 + * CONTEXT:
  232 + * pcpu_lock.
218 233 */
219 234 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
220 235 {
... ... @@ -260,6 +275,9 @@
260 275 * searches for the chunk with the highest start address which isn't
261 276 * beyond @addr.
262 277 *
  278 + * CONTEXT:
  279 + * pcpu_lock.
  280 + *
263 281 * RETURNS:
264 282 * The address of the found chunk.
265 283 */
... ... @@ -300,6 +318,9 @@
300 318 * @new: chunk to insert
301 319 *
302 320 * Insert @new into address rb tree.
  321 + *
  322 + * CONTEXT:
  323 + * pcpu_lock.
303 324 */
304 325 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
305 326 {
... ... @@ -319,6 +340,10 @@
319 340 * A single allocation can split an area into three areas, so this
320 341 * function makes sure that @chunk->map has at least two extra slots.
321 342 *
  343 + * CONTEXT:
  344 + * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
  345 + * if area map is extended.
  346 + *
322 347 * RETURNS:
323 348 * 0 if noop, 1 if successfully extended, -errno on failure.
324 349 */
325 350  
326 351  
327 352  
... ... @@ -332,14 +357,26 @@
332 357 if (chunk->map_alloc >= chunk->map_used + 2)
333 358 return 0;
334 359  
  360 + spin_unlock_irq(&pcpu_lock);
  361 +
335 362 new_alloc = PCPU_DFL_MAP_ALLOC;
336 363 while (new_alloc < chunk->map_used + 2)
337 364 new_alloc *= 2;
338 365  
339 366 new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
340   - if (!new)
  367 + if (!new) {
  368 + spin_lock_irq(&pcpu_lock);
341 369 return -ENOMEM;
  370 + }
342 371  
  372 + /*
  373 + * Acquire pcpu_lock and switch to new area map. Only free
  374 + * could have happened in between, so map_used couldn't have
  375 + * grown.
  376 + */
  377 + spin_lock_irq(&pcpu_lock);
  378 + BUG_ON(new_alloc < chunk->map_used + 2);
  379 +
343 380 size = chunk->map_alloc * sizeof(chunk->map[0]);
344 381 memcpy(new, chunk->map, size);
345 382  
... ... @@ -371,6 +408,9 @@
371 408 * is inserted after the target block.
372 409 *
373 410 * @chunk->map must have enough free slots to accommodate the split.
  411 + *
  412 + * CONTEXT:
  413 + * pcpu_lock.
374 414 */
375 415 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
376 416 int head, int tail)
... ... @@ -406,6 +446,9 @@
406 446 *
407 447 * @chunk->map must have at least two free slots.
408 448 *
  449 + * CONTEXT:
  450 + * pcpu_lock.
  451 + *
409 452 * RETURNS:
410 453 * Allocated offset in @chunk on success, -1 if no matching area is
411 454 * found.
... ... @@ -495,6 +538,9 @@
495 538 * Free area starting from @freeme to @chunk. Note that this function
496 539 * only modifies the allocation map. It doesn't depopulate or unmap
497 540 * the area.
  541 + *
  542 + * CONTEXT:
  543 + * pcpu_lock.
498 544 */
499 545 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
500 546 {
... ... @@ -580,6 +626,9 @@
580 626 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
581 627 * from @chunk. If @flush is true, vcache is flushed before unmapping
582 628 * and tlb after.
  629 + *
  630 + * CONTEXT:
  631 + * pcpu_alloc_mutex.
583 632 */
584 633 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
585 634 bool flush)
... ... @@ -658,6 +707,9 @@
658 707 *
659 708 * For each cpu, populate and map pages [@page_start,@page_end) into
660 709 * @chunk. The area is cleared on return.
  710 + *
  711 + * CONTEXT:
  712 + * pcpu_alloc_mutex, does GFP_KERNEL allocation.
661 713 */
662 714 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
663 715 {
664 716  
665 717  
... ... @@ -748,15 +800,16 @@
748 800 * @align: alignment of area (max PAGE_SIZE)
749 801 * @reserved: allocate from the reserved chunk if available
750 802 *
751   - * Allocate percpu area of @size bytes aligned at @align. Might
752   - * sleep. Might trigger writeouts.
  803 + * Allocate percpu area of @size bytes aligned at @align.
753 804 *
  805 + * CONTEXT:
  806 + * Does GFP_KERNEL allocation.
  807 + *
754 808 * RETURNS:
755 809 * Percpu pointer to the allocated area on success, NULL on failure.
756 810 */
757 811 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
758 812 {
759   - void *ptr = NULL;
760 813 struct pcpu_chunk *chunk;
761 814 int slot, off;
762 815  
763 816  
764 817  
765 818  
766 819  
... ... @@ -766,27 +819,37 @@
766 819 return NULL;
767 820 }
768 821  
769   - mutex_lock(&pcpu_mutex);
  822 + mutex_lock(&pcpu_alloc_mutex);
  823 + spin_lock_irq(&pcpu_lock);
770 824  
771 825 /* serve reserved allocations from the reserved chunk if available */
772 826 if (reserved && pcpu_reserved_chunk) {
773 827 chunk = pcpu_reserved_chunk;
774 828 if (size > chunk->contig_hint ||
775 829 pcpu_extend_area_map(chunk) < 0)
776   - goto out_unlock;
  830 + goto fail_unlock;
777 831 off = pcpu_alloc_area(chunk, size, align);
778 832 if (off >= 0)
779 833 goto area_found;
780   - goto out_unlock;
  834 + goto fail_unlock;
781 835 }
782 836  
  837 +restart:
783 838 /* search through normal chunks */
784 839 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
785 840 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
786 841 if (size > chunk->contig_hint)
787 842 continue;
788   - if (pcpu_extend_area_map(chunk) < 0)
789   - goto out_unlock;
  843 +
  844 + switch (pcpu_extend_area_map(chunk)) {
  845 + case 0:
  846 + break;
  847 + case 1:
  848 + goto restart; /* pcpu_lock dropped, restart */
  849 + default:
  850 + goto fail_unlock;
  851 + }
  852 +
790 853 off = pcpu_alloc_area(chunk, size, align);
791 854 if (off >= 0)
792 855 goto area_found;
793 856  
794 857  
795 858  
796 859  
797 860  
798 861  
799 862  
... ... @@ -794,27 +857,36 @@
794 857 }
795 858  
796 859 /* hmmm... no space left, create a new chunk */
  860 + spin_unlock_irq(&pcpu_lock);
  861 +
797 862 chunk = alloc_pcpu_chunk();
798 863 if (!chunk)
799   - goto out_unlock;
  864 + goto fail_unlock_mutex;
  865 +
  866 + spin_lock_irq(&pcpu_lock);
800 867 pcpu_chunk_relocate(chunk, -1);
801 868 pcpu_chunk_addr_insert(chunk);
  869 + goto restart;
802 870  
803   - off = pcpu_alloc_area(chunk, size, align);
804   - if (off < 0)
805   - goto out_unlock;
806   -
807 871 area_found:
  872 + spin_unlock_irq(&pcpu_lock);
  873 +
808 874 /* populate, map and clear the area */
809 875 if (pcpu_populate_chunk(chunk, off, size)) {
  876 + spin_lock_irq(&pcpu_lock);
810 877 pcpu_free_area(chunk, off);
811   - goto out_unlock;
  878 + goto fail_unlock;
812 879 }
813 880  
814   - ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
815   -out_unlock:
816   - mutex_unlock(&pcpu_mutex);
817   - return ptr;
  881 + mutex_unlock(&pcpu_alloc_mutex);
  882 +
  883 + return __addr_to_pcpu_ptr(chunk->vm->addr + off);
  884 +
  885 +fail_unlock:
  886 + spin_unlock_irq(&pcpu_lock);
  887 +fail_unlock_mutex:
  888 + mutex_unlock(&pcpu_alloc_mutex);
  889 + return NULL;
818 890 }
819 891  
820 892 /**
... ... @@ -825,6 +897,9 @@
825 897 * Allocate percpu area of @size bytes aligned at @align. Might
826 898 * sleep. Might trigger writeouts.
827 899 *
  900 + * CONTEXT:
  901 + * Does GFP_KERNEL allocation.
  902 + *
828 903 * RETURNS:
829 904 * Percpu pointer to the allocated area on success, NULL on failure.
830 905 */
... ... @@ -843,6 +918,9 @@
843 918 * percpu area if arch has set it up; otherwise, allocation is served
844 919 * from the same dynamic area. Might sleep. Might trigger writeouts.
845 920 *
  921 + * CONTEXT:
  922 + * Does GFP_KERNEL allocation.
  923 + *
846 924 * RETURNS:
847 925 * Percpu pointer to the allocated area on success, NULL on failure.
848 926 */
... ... @@ -856,6 +934,9 @@
856 934 * @work: unused
857 935 *
858 936 * Reclaim all fully free chunks except for the first one.
  937 + *
  938 + * CONTEXT:
  939 + * workqueue context.
859 940 */
860 941 static void pcpu_reclaim(struct work_struct *work)
861 942 {
... ... @@ -863,7 +944,8 @@
863 944 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
864 945 struct pcpu_chunk *chunk, *next;
865 946  
866   - mutex_lock(&pcpu_mutex);
  947 + mutex_lock(&pcpu_alloc_mutex);
  948 + spin_lock_irq(&pcpu_lock);
867 949  
868 950 list_for_each_entry_safe(chunk, next, head, list) {
869 951 WARN_ON(chunk->immutable);
... ... @@ -876,7 +958,8 @@
876 958 list_move(&chunk->list, &todo);
877 959 }
878 960  
879   - mutex_unlock(&pcpu_mutex);
  961 + spin_unlock_irq(&pcpu_lock);
  962 + mutex_unlock(&pcpu_alloc_mutex);
880 963  
881 964 list_for_each_entry_safe(chunk, next, &todo, list) {
882 965 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
883 966  
884 967  
... ... @@ -888,18 +971,22 @@
888 971 * free_percpu - free percpu area
889 972 * @ptr: pointer to area to free
890 973 *
891   - * Free percpu area @ptr. Might sleep.
  974 + * Free percpu area @ptr.
  975 + *
  976 + * CONTEXT:
  977 + * Can be called from atomic context.
892 978 */
893 979 void free_percpu(void *ptr)
894 980 {
895 981 void *addr = __pcpu_ptr_to_addr(ptr);
896 982 struct pcpu_chunk *chunk;
  983 + unsigned long flags;
897 984 int off;
898 985  
899 986 if (!ptr)
900 987 return;
901 988  
902   - mutex_lock(&pcpu_mutex);
  989 + spin_lock_irqsave(&pcpu_lock, flags);
903 990  
904 991 chunk = pcpu_chunk_addr_search(addr);
905 992 off = addr - chunk->vm->addr;
... ... @@ -917,7 +1004,7 @@
917 1004 }
918 1005 }
919 1006  
920   - mutex_unlock(&pcpu_mutex);
  1007 + spin_unlock_irqrestore(&pcpu_lock, flags);
921 1008 }
922 1009 EXPORT_SYMBOL_GPL(free_percpu);
923 1010
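
For reference, the unlink-then-free pattern that pcpu_reclaim() above
relies on, reduced to its shape (a hypothetical sketch -- struct item,
item_pool and item_destroy() are invented stand-ins, not percpu code):

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	struct list_head list;
};

static LIST_HEAD(item_pool);		/* hypothetical index */
static DEFINE_MUTEX(pool_alloc_mutex);
static DEFINE_SPINLOCK(pool_lock);

static void item_destroy(struct item *it)
{
	kfree(it);	/* stand-in for the sleeping depopulate/unmap/free */
}

static void pool_reclaim(void)
{
	LIST_HEAD(todo);
	struct item *it, *next;

	/*
	 * Both locks are needed to take items out of circulation:
	 * the allocation path may walk the index with only the mutex held.
	 */
	mutex_lock(&pool_alloc_mutex);
	spin_lock_irq(&pool_lock);

	list_for_each_entry_safe(it, next, &item_pool, list)
		list_move(&it->list, &todo);

	spin_unlock_irq(&pool_lock);
	mutex_unlock(&pool_alloc_mutex);

	/* The potentially sleeping teardown happens with no locks held. */
	list_for_each_entry_safe(it, next, &todo, list)
		item_destroy(it);
}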