Commit ce3141a277ff6cc37e51008b8888dc2cb7456ef1

Authored by Tejun Heo
1 parent c8a51be4ca

percpu: drop pcpu_chunk->page[]

The percpu core doesn't need to track all the allocated pages.  It
only needs to know whether certain pages are populated and a way to
reverse-map an address to its page when freeing.  This patch drops
pcpu_chunk->page[] and uses a populated bitmap plus vmalloc_to_page()
lookup instead.  Using vmalloc_to_page() exclusively is also possible
but complicates first chunk handling, inflates the cache footprint and
prevents non-standard memory allocation for percpu memory.
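
For orientation, the new bookkeeping and the reverse lookup boil down
to the following (condensed from the mm/percpu.c hunks below, not a
complete listing):

    struct pcpu_chunk {
            ...
            bool            immutable;      /* no [de]population allowed */
            unsigned long   populated[];    /* one bit per unit page */
    };

    /* reverse map a unit address to its page, replacing chunk->page[] */
    static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
                                        unsigned int cpu, int page_idx)
    {
            /* must not be used on pre-mapped chunk */
            WARN_ON(chunk->immutable);

            return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
    }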

pcpu_chunk->page[] was used to track each page's allocation and
allowed the asymmetric population which happens on the failure path;
however, with a single bitmap shared by all units, this is no longer
possible.  Bite the bullet and rewrite the (de)population functions so
that things are done in clearly separated steps and asymmetric
population can't happen.  This makes the (de)population process much
more modular and will also ease implementing non-standard memory usage
in the future (e.g. large pages).
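
Concretely, population now works on a temporary pages array and bitmap
in whole-chunk steps, and the chunk's own bitmap is committed only once
every step has succeeded; depopulation mirrors the same sequence in
reverse.  A condensed view of pcpu_populate_chunk() from the diff below
(error unwinding omitted):

    pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);

    pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end)
            pcpu_alloc_pages(chunk, pages, populated, rs, re);      /* 1: alloc  */

    pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end)
            pcpu_map_pages(chunk, pages, populated, rs, re);        /* 2: map    */

    pcpu_post_map_flush(chunk, page_start, page_end);               /* 3: flush  */

    bitmap_copy(chunk->populated, populated, pcpu_unit_pages);      /* 4: commit */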

This makes the @get_page_fn parameter to pcpu_setup_first_chunk()
unnecessary.  The parameter is dropped and all first chunk helpers are
updated accordingly.  Note that despite the volume, most changes to
the first chunk helpers are symbol renames for variables which no
longer need to be referenced outside of their helpers.
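
For reference, the interface change in include/linux/percpu.h amounts
to dropping the callback parameter:

    /* before */
    extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
                                    size_t static_size, size_t reserved_size,
                                    ssize_t dyn_size, size_t unit_size,
                                    void *base_addr);

    /* after */
    extern size_t __init pcpu_setup_first_chunk(size_t static_size,
                                    size_t reserved_size, ssize_t dyn_size,
                                    size_t unit_size, void *base_addr);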

This change reduces memory usage and cache footprint of pcpu_chunk.
Now only #unit_pages bits are necessary per chunk.
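
As a rough, purely illustrative sizing (example numbers, not taken from
this patch): with 4k pages, a 64k unit and 16 possible CPUs on a 64-bit
machine,

    old = 16 /* cpus */ * 16 /* unit_pages */ * sizeof(struct page *); /* 2048 bytes */
    new = BITS_TO_LONGS(16 /* unit_pages */) * sizeof(unsigned long);  /*    8 bytes */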

[ Impact: reduced memory usage and cache footprint for bookkeeping ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>

Showing 3 changed files with 400 additions and 249 deletions

arch/sparc/kernel/smp_64.c
... ... @@ -1415,19 +1415,6 @@
1415 1415 #endif
1416 1416 }
1417 1417  
1418   -static size_t pcpur_size __initdata;
1419   -static void **pcpur_ptrs __initdata;
1420   -
1421   -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
1422   -{
1423   - size_t off = (size_t)pageno << PAGE_SHIFT;
1424   -
1425   - if (off >= pcpur_size)
1426   - return NULL;
1427   -
1428   - return virt_to_page(pcpur_ptrs[cpu] + off);
1429   -}
1430   -
1431 1418 #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
1432 1419  
1433 1420 static void __init pcpu_map_range(unsigned long start, unsigned long end,
... ... @@ -1491,25 +1478,26 @@
1491 1478 size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
1492 1479 static struct vm_struct vm;
1493 1480 unsigned long delta, cpu;
1494   - size_t pcpu_unit_size;
  1481 + size_t size_sum, pcpu_unit_size;
1495 1482 size_t ptrs_size;
  1483 + void **ptrs;
1496 1484  
1497   - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
1498   - PERCPU_DYNAMIC_RESERVE);
1499   - dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
  1485 + size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
  1486 + PERCPU_DYNAMIC_RESERVE);
  1487 + dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
1500 1488  
1501 1489  
1502   - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
1503   - pcpur_ptrs = alloc_bootmem(ptrs_size);
  1490 + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
  1491 + ptrs = alloc_bootmem(ptrs_size);
1504 1492  
1505 1493 for_each_possible_cpu(cpu) {
1506   - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
1507   - PCPU_CHUNK_SIZE);
  1494 + ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
  1495 + PCPU_CHUNK_SIZE);
1508 1496  
1509   - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
1510   - PCPU_CHUNK_SIZE - pcpur_size);
  1497 + free_bootmem(__pa(ptrs[cpu] + size_sum),
  1498 + PCPU_CHUNK_SIZE - size_sum);
1511 1499  
1512   - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
  1500 + memcpy(ptrs[cpu], __per_cpu_load, static_size);
1513 1501 }
1514 1502  
1515 1503 /* allocate address and map */
... ... @@ -1523,14 +1511,14 @@
1523 1511  
1524 1512 start += cpu * PCPU_CHUNK_SIZE;
1525 1513 end = start + PCPU_CHUNK_SIZE;
1526   - pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
  1514 + pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
1527 1515 }
1528 1516  
1529   - pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
  1517 + pcpu_unit_size = pcpu_setup_first_chunk(static_size,
1530 1518 PERCPU_MODULE_RESERVE, dyn_size,
1531 1519 PCPU_CHUNK_SIZE, vm.addr);
1532 1520  
1533   - free_bootmem(__pa(pcpur_ptrs), ptrs_size);
  1521 + free_bootmem(__pa(ptrs), ptrs_size);
1534 1522  
1535 1523 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1536 1524 for_each_possible_cpu(cpu) {
include/linux/percpu.h
... ... @@ -58,13 +58,12 @@
58 58  
59 59 extern void *pcpu_base_addr;
60 60  
61   -typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
62 61 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
63 62 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
64 63 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
65 64 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
66 65  
67   -extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
  66 +extern size_t __init pcpu_setup_first_chunk(
68 67 size_t static_size, size_t reserved_size,
69 68 ssize_t dyn_size, size_t unit_size,
70 69 void *base_addr);
mm/percpu.c
... ... @@ -94,8 +94,7 @@
94 94 int map_alloc; /* # of map entries allocated */
95 95 int *map; /* allocation map */
96 96 bool immutable; /* no [de]population allowed */
97   - struct page **page; /* points to page array */
98   - struct page *page_ar[]; /* #cpus * UNIT_PAGES */
  97 + unsigned long populated[]; /* populated bitmap */
99 98 };
100 99  
101 100 static int pcpu_unit_pages __read_mostly;
... ... @@ -129,9 +128,9 @@
129 128 * Synchronization rules.
130 129 *
131 130 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132   - * protects allocation/reclaim paths, chunks and chunk->page arrays.
133   - * The latter is a spinlock and protects the index data structures -
134   - * chunk slots, chunks and area maps in chunks.
  131 + * protects allocation/reclaim paths, chunks, populated bitmap and
  132 + * vmalloc mapping. The latter is a spinlock and protects the index
  133 + * data structures - chunk slots, chunks and area maps in chunks.
135 134 *
136 135 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 136 * pcpu_lock is grabbed and released as necessary. All actual memory
... ... @@ -188,16 +187,13 @@
188 187 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
189 188 }
190 189  
191   -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
192   - unsigned int cpu, int page_idx)
  190 +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
  191 + unsigned int cpu, int page_idx)
193 192 {
194   - return &chunk->page[pcpu_page_idx(cpu, page_idx)];
195   -}
  193 + /* must not be used on pre-mapped chunk */
  194 + WARN_ON(chunk->immutable);
196 195  
197   -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
198   - int page_idx)
199   -{
200   - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
  196 + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
201 197 }
202 198  
203 199 /* set the pointer to a chunk in a page struct */
... ... @@ -212,6 +208,34 @@
212 208 return (struct pcpu_chunk *)page->index;
213 209 }
214 210  
  211 +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
  212 +{
  213 + *rs = find_next_zero_bit(chunk->populated, end, *rs);
  214 + *re = find_next_bit(chunk->populated, end, *rs + 1);
  215 +}
  216 +
  217 +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
  218 +{
  219 + *rs = find_next_bit(chunk->populated, end, *rs);
  220 + *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
  221 +}
  222 +
  223 +/*
  224 + * (Un)populated page region iterators. Iterate over (un)populated
  225 + * page regions between @start and @end in @chunk. @rs and @re should
  226 + * be integer variables and will be set to start and end page index of
  227 + * the current region.
  228 + */
  229 +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
  230 + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
  231 + (rs) < (re); \
  232 + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
  233 +
  234 +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
  235 + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
  236 + (rs) < (re); \
  237 + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
  238 +
215 239 /**
216 240 * pcpu_mem_alloc - allocate memory
217 241 * @size: bytes to allocate
... ... @@ -545,42 +569,197 @@
545 569 }
546 570  
547 571 /**
548   - * pcpu_unmap - unmap pages out of a pcpu_chunk
  572 + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
549 573 * @chunk: chunk of interest
  574 + * @bitmapp: output parameter for bitmap
  575 + * @may_alloc: may allocate the array
  576 + *
  577 + * Returns pointer to array of pointers to struct page and bitmap,
  578 + * both of which can be indexed with pcpu_page_idx(). The returned
  579 + * array is cleared to zero and *@bitmapp is copied from
  580 + * @chunk->populated. Note that there is only one array and bitmap
  581 + * and access exclusion is the caller's responsibility.
  582 + *
  583 + * CONTEXT:
  584 + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
  585 + * Otherwise, don't care.
  586 + *
  587 + * RETURNS:
  588 + * Pointer to temp pages array on success, NULL on failure.
  589 + */
  590 +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
  591 + unsigned long **bitmapp,
  592 + bool may_alloc)
  593 +{
  594 + static struct page **pages;
  595 + static unsigned long *bitmap;
  596 + size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
  597 + sizeof(pages[0]);
  598 + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
  599 + sizeof(unsigned long);
  600 +
  601 + if (!pages || !bitmap) {
  602 + if (may_alloc && !pages)
  603 + pages = pcpu_mem_alloc(pages_size);
  604 + if (may_alloc && !bitmap)
  605 + bitmap = pcpu_mem_alloc(bitmap_size);
  606 + if (!pages || !bitmap)
  607 + return NULL;
  608 + }
  609 +
  610 + memset(pages, 0, pages_size);
  611 + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
  612 +
  613 + *bitmapp = bitmap;
  614 + return pages;
  615 +}
  616 +
  617 +/**
  618 + * pcpu_free_pages - free pages which were allocated for @chunk
  619 + * @chunk: chunk pages were allocated for
  620 + * @pages: array of pages to be freed, indexed by pcpu_page_idx()
  621 + * @populated: populated bitmap
  622 + * @page_start: page index of the first page to be freed
  623 + * @page_end: page index of the last page to be freed + 1
  624 + *
  625 + * Free pages [@page_start, @page_end) in @pages for all units.
  626 + * The pages were allocated for @chunk.
  627 + */
  628 +static void pcpu_free_pages(struct pcpu_chunk *chunk,
  629 + struct page **pages, unsigned long *populated,
  630 + int page_start, int page_end)
  631 +{
  632 + unsigned int cpu;
  633 + int i;
  634 +
  635 + for_each_possible_cpu(cpu) {
  636 + for (i = page_start; i < page_end; i++) {
  637 + struct page *page = pages[pcpu_page_idx(cpu, i)];
  638 +
  639 + if (page)
  640 + __free_page(page);
  641 + }
  642 + }
  643 +}
  644 +
  645 +/**
  646 + * pcpu_alloc_pages - allocates pages for @chunk
  647 + * @chunk: target chunk
  648 + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
  649 + * @populated: populated bitmap
  650 + * @page_start: page index of the first page to be allocated
  651 + * @page_end: page index of the last page to be allocated + 1
  652 + *
  653 + * Allocate pages [@page_start,@page_end) into @pages for all units.
  654 + * The allocation is for @chunk. Percpu core doesn't care about the
  655 + * content of @pages and will pass it verbatim to pcpu_map_pages().
  656 + */
  657 +static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
  658 + struct page **pages, unsigned long *populated,
  659 + int page_start, int page_end)
  660 +{
  661 + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
  662 + unsigned int cpu;
  663 + int i;
  664 +
  665 + for_each_possible_cpu(cpu) {
  666 + for (i = page_start; i < page_end; i++) {
  667 + struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
  668 +
  669 + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
  670 + if (!*pagep) {
  671 + pcpu_free_pages(chunk, pages, populated,
  672 + page_start, page_end);
  673 + return -ENOMEM;
  674 + }
  675 + }
  676 + }
  677 + return 0;
  678 +}
  679 +
  680 +/**
  681 + * pcpu_pre_unmap_flush - flush cache prior to unmapping
  682 + * @chunk: chunk the regions to be flushed belongs to
  683 + * @page_start: page index of the first page to be flushed
  684 + * @page_end: page index of the last page to be flushed + 1
  685 + *
  686 + * Pages in [@page_start,@page_end) of @chunk are about to be
  687 + * unmapped. Flush cache. As each flushing trial can be very
  688 + * expensive, issue flush on the whole region at once rather than
  689 + * doing it for each cpu. This could be an overkill but is more
  690 + * scalable.
  691 + */
  692 +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
  693 + int page_start, int page_end)
  694 +{
  695 + unsigned int last = num_possible_cpus() - 1;
  696 +
  697 + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
  698 + pcpu_chunk_addr(chunk, last, page_end));
  699 +}
  700 +
  701 +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
  702 +{
  703 + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
  704 +}
  705 +
  706 +/**
  707 + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
  708 + * @chunk: chunk of interest
  709 + * @pages: pages array which can be used to pass information to free
  710 + * @populated: populated bitmap
550 711 * @page_start: page index of the first page to unmap
551 712 * @page_end: page index of the last page to unmap + 1
552   - * @flush_tlb: whether to flush tlb or not
553 713 *
554 714 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
555   - * If @flush is true, vcache is flushed before unmapping and tlb
556   - * after.
  715 + * Corresponding elements in @pages were cleared by the caller and can
  716 + * be used to carry information to pcpu_free_pages() which will be
  717 + * called after all unmaps are finished. The caller should call
  718 + * proper pre/post flush functions.
557 719 */
558   -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
559   - bool flush_tlb)
  720 +static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
  721 + struct page **pages, unsigned long *populated,
  722 + int page_start, int page_end)
560 723 {
561   - unsigned int last = num_possible_cpus() - 1;
562 724 unsigned int cpu;
  725 + int i;
563 726  
564   - /* unmap must not be done on immutable chunk */
565   - WARN_ON(chunk->immutable);
  727 + for_each_possible_cpu(cpu) {
  728 + for (i = page_start; i < page_end; i++) {
  729 + struct page *page;
566 730  
567   - /*
568   - * Each flushing trial can be very expensive, issue flush on
569   - * the whole region at once rather than doing it for each cpu.
570   - * This could be an overkill but is more scalable.
571   - */
572   - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
573   - pcpu_chunk_addr(chunk, last, page_end));
  731 + page = pcpu_chunk_page(chunk, cpu, i);
  732 + WARN_ON(!page);
  733 + pages[pcpu_page_idx(cpu, i)] = page;
  734 + }
  735 + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
  736 + page_end - page_start);
  737 + }
574 738  
575   - for_each_possible_cpu(cpu)
576   - unmap_kernel_range_noflush(
577   - pcpu_chunk_addr(chunk, cpu, page_start),
578   - (page_end - page_start) << PAGE_SHIFT);
  739 + for (i = page_start; i < page_end; i++)
  740 + __clear_bit(i, populated);
  741 +}
579 742  
580   - /* ditto as flush_cache_vunmap() */
581   - if (flush_tlb)
582   - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
583   - pcpu_chunk_addr(chunk, last, page_end));
  743 +/**
  744 + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
  745 + * @chunk: pcpu_chunk the regions to be flushed belong to
  746 + * @page_start: page index of the first page to be flushed
  747 + * @page_end: page index of the last page to be flushed + 1
  748 + *
  749 + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
  750 + * TLB for the regions. This can be skipped if the area is to be
  751 + * returned to vmalloc as vmalloc will handle TLB flushing lazily.
  752 + *
  753 + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
  754 + * for the whole region.
  755 + */
  756 +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
  757 + int page_start, int page_end)
  758 +{
  759 + unsigned int last = num_possible_cpus() - 1;
  760 +
  761 + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
  762 + pcpu_chunk_addr(chunk, last, page_end));
584 763 }
585 764  
586 765 static int __pcpu_map_pages(unsigned long addr, struct page **pages,
... ... @@ -591,35 +770,76 @@
591 770 }
592 771  
593 772 /**
594   - * pcpu_map - map pages into a pcpu_chunk
  773 + * pcpu_map_pages - map pages into a pcpu_chunk
595 774 * @chunk: chunk of interest
  775 + * @pages: pages array containing pages to be mapped
  776 + * @populated: populated bitmap
596 777 * @page_start: page index of the first page to map
597 778 * @page_end: page index of the last page to map + 1
598 779 *
599   - * For each cpu, map pages [@page_start,@page_end) into @chunk.
600   - * vcache is flushed afterwards.
  780 + * For each cpu, map pages [@page_start,@page_end) into @chunk. The
  781 + * caller is responsible for calling pcpu_post_map_flush() after all
  782 + * mappings are complete.
  783 + *
  784 + * This function is responsible for setting corresponding bits in
  785 + * @chunk->populated bitmap and whatever is necessary for reverse
  786 + * lookup (addr -> chunk).
601 787 */
602   -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  788 +static int pcpu_map_pages(struct pcpu_chunk *chunk,
  789 + struct page **pages, unsigned long *populated,
  790 + int page_start, int page_end)
603 791 {
604   - unsigned int last = num_possible_cpus() - 1;
605   - unsigned int cpu;
606   - int err;
  792 + unsigned int cpu, tcpu;
  793 + int i, err;
607 794  
608   - /* map must not be done on immutable chunk */
609   - WARN_ON(chunk->immutable);
610   -
611 795 for_each_possible_cpu(cpu) {
612 796 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
613   - pcpu_chunk_pagep(chunk, cpu, page_start),
  797 + &pages[pcpu_page_idx(cpu, page_start)],
614 798 page_end - page_start);
615 799 if (err < 0)
616   - return err;
  800 + goto err;
617 801 }
618 802  
  803 + /* mapping successful, link chunk and mark populated */
  804 + for (i = page_start; i < page_end; i++) {
  805 + for_each_possible_cpu(cpu)
  806 + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
  807 + chunk);
  808 + __set_bit(i, populated);
  809 + }
  810 +
  811 + return 0;
  812 +
  813 +err:
  814 + for_each_possible_cpu(tcpu) {
  815 + if (tcpu == cpu)
  816 + break;
  817 + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
  818 + page_end - page_start);
  819 + }
  820 + return err;
  821 +}
  822 +
  823 +/**
  824 + * pcpu_post_map_flush - flush cache after mapping
  825 + * @chunk: pcpu_chunk the regions to be flushed belong to
  826 + * @page_start: page index of the first page to be flushed
  827 + * @page_end: page index of the last page to be flushed + 1
  828 + *
  829 + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
  830 + * cache.
  831 + *
  832 + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
  833 + * for the whole region.
  834 + */
  835 +static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
  836 + int page_start, int page_end)
  837 +{
  838 + unsigned int last = num_possible_cpus() - 1;
  839 +
619 840 /* flush at once, please read comments in pcpu_unmap() */
620 841 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
621 842 pcpu_chunk_addr(chunk, last, page_end));
622   - return 0;
623 843 }
624 844  
625 845 /**
... ... @@ -636,39 +856,45 @@
636 856 * CONTEXT:
637 857 * pcpu_alloc_mutex.
638 858 */
639   -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
640   - bool flush)
  859 +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
641 860 {
642 861 int page_start = PFN_DOWN(off);
643 862 int page_end = PFN_UP(off + size);
644   - int unmap_start = -1;
645   - int uninitialized_var(unmap_end);
646   - unsigned int cpu;
647   - int i;
  863 + struct page **pages;
  864 + unsigned long *populated;
  865 + int rs, re;
648 866  
649   - for (i = page_start; i < page_end; i++) {
650   - for_each_possible_cpu(cpu) {
651   - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
  867 + /* quick path, check whether it's empty already */
  868 + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
  869 + if (rs == page_start && re == page_end)
  870 + return;
  871 + break;
  872 + }
652 873  
653   - if (!*pagep)
654   - continue;
  874 + /* immutable chunks can't be depopulated */
  875 + WARN_ON(chunk->immutable);
655 876  
656   - __free_page(*pagep);
  877 + /*
  878 + * If control reaches here, there must have been at least one
  879 + * successful population attempt so the temp pages array must
  880 + * be available now.
  881 + */
  882 + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
  883 + BUG_ON(!pages);
657 884  
658   - /*
659   - * If it's partial depopulation, it might get
660   - * populated or depopulated again. Mark the
661   - * page gone.
662   - */
663   - *pagep = NULL;
  885 + /* unmap and free */
  886 + pcpu_pre_unmap_flush(chunk, page_start, page_end);
664 887  
665   - unmap_start = unmap_start < 0 ? i : unmap_start;
666   - unmap_end = i + 1;
667   - }
668   - }
  888 + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
  889 + pcpu_unmap_pages(chunk, pages, populated, rs, re);
669 890  
670   - if (unmap_start >= 0)
671   - pcpu_unmap(chunk, unmap_start, unmap_end, flush);
  891 + /* no need to flush tlb, vmalloc will handle it lazily */
  892 +
  893 + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
  894 + pcpu_free_pages(chunk, pages, populated, rs, re);
  895 +
  896 + /* commit new bitmap */
  897 + bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
672 898 }
673 899  
674 900 /**
... ... @@ -685,50 +911,61 @@
685 911 */
686 912 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
687 913 {
688   - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
689 914 int page_start = PFN_DOWN(off);
690 915 int page_end = PFN_UP(off + size);
691   - int map_start = -1;
692   - int uninitialized_var(map_end);
  916 + int free_end = page_start, unmap_end = page_start;
  917 + struct page **pages;
  918 + unsigned long *populated;
693 919 unsigned int cpu;
694   - int i;
  920 + int rs, re, rc;
695 921  
696   - for (i = page_start; i < page_end; i++) {
697   - if (pcpu_chunk_page_occupied(chunk, i)) {
698   - if (map_start >= 0) {
699   - if (pcpu_map(chunk, map_start, map_end))
700   - goto err;
701   - map_start = -1;
702   - }
703   - continue;
704   - }
  922 + /* quick path, check whether all pages are already there */
  923 + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
  924 + if (rs == page_start && re == page_end)
  925 + goto clear;
  926 + break;
  927 + }
705 928  
706   - map_start = map_start < 0 ? i : map_start;
707   - map_end = i + 1;
  929 + /* need to allocate and map pages, this chunk can't be immutable */
  930 + WARN_ON(chunk->immutable);
708 931  
709   - for_each_possible_cpu(cpu) {
710   - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
  932 + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
  933 + if (!pages)
  934 + return -ENOMEM;
711 935  
712   - *pagep = alloc_pages_node(cpu_to_node(cpu),
713   - alloc_mask, 0);
714   - if (!*pagep)
715   - goto err;
716   - pcpu_set_page_chunk(*pagep, chunk);
717   - }
  936 + /* alloc and map */
  937 + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
  938 + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
  939 + if (rc)
  940 + goto err_free;
  941 + free_end = re;
718 942 }
719 943  
720   - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
721   - goto err;
  944 + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
  945 + rc = pcpu_map_pages(chunk, pages, populated, rs, re);
  946 + if (rc)
  947 + goto err_unmap;
  948 + unmap_end = re;
  949 + }
  950 + pcpu_post_map_flush(chunk, page_start, page_end);
722 951  
  952 + /* commit new bitmap */
  953 + bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
  954 +clear:
723 955 for_each_possible_cpu(cpu)
724 956 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
725 957 size);
726   -
727 958 return 0;
728   -err:
729   - /* likely under heavy memory pressure, give memory back */
730   - pcpu_depopulate_chunk(chunk, off, size, true);
731   - return -ENOMEM;
  959 +
  960 +err_unmap:
  961 + pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
  962 + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
  963 + pcpu_unmap_pages(chunk, pages, populated, rs, re);
  964 + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
  965 +err_free:
  966 + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
  967 + pcpu_free_pages(chunk, pages, populated, rs, re);
  968 + return rc;
732 969 }
733 970  
734 971 static void free_pcpu_chunk(struct pcpu_chunk *chunk)
... ... @@ -752,7 +989,6 @@
752 989 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
753 990 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
754 991 chunk->map[chunk->map_used++] = pcpu_unit_size;
755   - chunk->page = chunk->page_ar;
756 992  
757 993 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
758 994 if (!chunk->vm) {
... ... @@ -933,7 +1169,7 @@
933 1169 mutex_unlock(&pcpu_alloc_mutex);
934 1170  
935 1171 list_for_each_entry_safe(chunk, next, &todo, list) {
936   - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
  1172 + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
937 1173 free_pcpu_chunk(chunk);
938 1174 }
939 1175 }
... ... @@ -981,7 +1217,6 @@
981 1217  
982 1218 /**
983 1219 * pcpu_setup_first_chunk - initialize the first percpu chunk
984   - * @get_page_fn: callback to fetch page pointer
985 1220 * @static_size: the size of static percpu area in bytes
986 1221 * @reserved_size: the size of reserved percpu area in bytes, 0 for none
987 1222 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
... ... @@ -992,14 +1227,6 @@
992 1227 * percpu area. This function is to be called from arch percpu area
993 1228 * setup path.
994 1229 *
995   - * @get_page_fn() should return pointer to percpu page given cpu
996   - * number and page number. It should at least return enough pages to
997   - * cover the static area. The returned pages for static area should
998   - * have been initialized with valid data. It can also return pages
999   - * after the static area. NULL return indicates end of pages for the
1000   - * cpu. Note that @get_page_fn() must return the same number of pages
1001   - * for all cpus.
1002   - *
1003 1230 * @reserved_size, if non-zero, specifies the amount of bytes to
1004 1231 * reserve after the static area in the first chunk. This reserves
1005 1232 * the first chunk such that it's available only through reserved
... ... @@ -1031,8 +1258,7 @@
1031 1258 * The determined pcpu_unit_size which can be used to initialize
1032 1259 * percpu access.
1033 1260 */
1034   -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1035   - size_t static_size, size_t reserved_size,
  1261 +size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
1036 1262 ssize_t dyn_size, size_t unit_size,
1037 1263 void *base_addr)
1038 1264 {
... ... @@ -1041,8 +1267,7 @@
1041 1267 size_t size_sum = static_size + reserved_size +
1042 1268 (dyn_size >= 0 ? dyn_size : 0);
1043 1269 struct pcpu_chunk *schunk, *dchunk = NULL;
1044   - unsigned int cpu;
1045   - int i, nr_pages;
  1270 + int i;
1046 1271  
1047 1272 /* sanity checks */
1048 1273 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
... ... @@ -1056,8 +1281,8 @@
1056 1281 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1057 1282 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1058 1283 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
1059   - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1060   - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
  1284 + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
  1285 + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1061 1286  
1062 1287 if (dyn_size < 0)
1063 1288 dyn_size = pcpu_unit_size - static_size - reserved_size;
1064 1289  
... ... @@ -1087,8 +1312,8 @@
1087 1312 schunk->vm = &first_vm;
1088 1313 schunk->map = smap;
1089 1314 schunk->map_alloc = ARRAY_SIZE(smap);
1090   - schunk->page = schunk->page_ar;
1091 1315 schunk->immutable = true;
  1316 + bitmap_fill(schunk->populated, pcpu_unit_pages);
1092 1317  
1093 1318 if (reserved_size) {
1094 1319 schunk->free_size = reserved_size;
... ... @@ -1106,38 +1331,19 @@
1106 1331  
1107 1332 /* init dynamic chunk if necessary */
1108 1333 if (dyn_size) {
1109   - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
  1334 + dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1110 1335 INIT_LIST_HEAD(&dchunk->list);
1111 1336 dchunk->vm = &first_vm;
1112 1337 dchunk->map = dmap;
1113 1338 dchunk->map_alloc = ARRAY_SIZE(dmap);
1114   - dchunk->page = schunk->page_ar; /* share page map with schunk */
1115 1339 dchunk->immutable = true;
  1340 + bitmap_fill(dchunk->populated, pcpu_unit_pages);
1116 1341  
1117 1342 dchunk->contig_hint = dchunk->free_size = dyn_size;
1118 1343 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1119 1344 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1120 1345 }
1121 1346  
1122   - /* assign pages */
1123   - nr_pages = -1;
1124   - for_each_possible_cpu(cpu) {
1125   - for (i = 0; i < pcpu_unit_pages; i++) {
1126   - struct page *page = get_page_fn(cpu, i);
1127   -
1128   - if (!page)
1129   - break;
1130   - *pcpu_chunk_pagep(schunk, cpu, i) = page;
1131   - }
1132   -
1133   - BUG_ON(i < PFN_UP(static_size));
1134   -
1135   - if (nr_pages < 0)
1136   - nr_pages = i;
1137   - else
1138   - BUG_ON(nr_pages != i);
1139   - }
1140   -
1141 1347 /* link the first chunk in */
1142 1348 pcpu_first_chunk = dchunk ?: schunk;
1143 1349 pcpu_chunk_relocate(pcpu_first_chunk, -1);
... ... @@ -1160,23 +1366,6 @@
1160 1366 return size_sum;
1161 1367 }
1162 1368  
1163   -/*
1164   - * Embedding first chunk setup helper.
1165   - */
1166   -static void *pcpue_ptr __initdata;
1167   -static size_t pcpue_size __initdata;
1168   -static size_t pcpue_unit_size __initdata;
1169   -
1170   -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1171   -{
1172   - size_t off = (size_t)pageno << PAGE_SHIFT;
1173   -
1174   - if (off >= pcpue_size)
1175   - return NULL;
1176   -
1177   - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
1178   -}
1179   -
1180 1369 /**
1181 1370 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1182 1371 * @static_size: the size of static percpu area in bytes
... ... @@ -1207,18 +1396,19 @@
1207 1396 ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1208 1397 ssize_t dyn_size)
1209 1398 {
1210   - size_t chunk_size;
  1399 + size_t size_sum, unit_size, chunk_size;
  1400 + void *base;
1211 1401 unsigned int cpu;
1212 1402  
1213 1403 /* determine parameters and allocate */
1214   - pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
  1404 + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1215 1405  
1216   - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1217   - chunk_size = pcpue_unit_size * num_possible_cpus();
  1406 + unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
  1407 + chunk_size = unit_size * num_possible_cpus();
1218 1408  
1219   - pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1220   - __pa(MAX_DMA_ADDRESS));
1221   - if (!pcpue_ptr) {
  1409 + base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
  1410 + __pa(MAX_DMA_ADDRESS));
  1411 + if (!base) {
1222 1412 pr_warning("PERCPU: failed to allocate %zu bytes for "
1223 1413 "embedding\n", chunk_size);
1224 1414 return -ENOMEM;
... ... @@ -1226,35 +1416,20 @@
1226 1416  
1227 1417 /* return the leftover and copy */
1228 1418 for_each_possible_cpu(cpu) {
1229   - void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
  1419 + void *ptr = base + cpu * unit_size;
1230 1420  
1231   - free_bootmem(__pa(ptr + pcpue_size),
1232   - pcpue_unit_size - pcpue_size);
  1421 + free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
1233 1422 memcpy(ptr, __per_cpu_load, static_size);
1234 1423 }
1235 1424  
1236 1425 /* we're ready, commit */
1237 1426 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
1238   - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
  1427 + size_sum >> PAGE_SHIFT, base, static_size);
1239 1428  
1240   - return pcpu_setup_first_chunk(pcpue_get_page, static_size,
1241   - reserved_size, dyn_size,
1242   - pcpue_unit_size, pcpue_ptr);
  1429 + return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
  1430 + unit_size, base);
1243 1431 }
1244 1432  
1245   -/*
1246   - * 4k page first chunk setup helper.
1247   - */
1248   -static struct page **pcpu4k_pages __initdata;
1249   -static int pcpu4k_unit_pages __initdata;
1250   -
1251   -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
1252   -{
1253   - if (pageno < pcpu4k_unit_pages)
1254   - return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
1255   - return NULL;
1256   -}
1257   -
1258 1433 /**
1259 1434 * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
1260 1435 * @static_size: the size of static percpu area in bytes
... ... @@ -1279,23 +1454,25 @@
1279 1454 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1280 1455 {
1281 1456 static struct vm_struct vm;
  1457 + int unit_pages;
1282 1458 size_t pages_size;
  1459 + struct page **pages;
1283 1460 unsigned int cpu;
1284 1461 int i, j;
1285 1462 ssize_t ret;
1286 1463  
1287   - pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
1288   - PCPU_MIN_UNIT_SIZE));
  1464 + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
  1465 + PCPU_MIN_UNIT_SIZE));
1289 1466  
1290 1467 /* unaligned allocations can't be freed, round up to page size */
1291   - pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() *
1292   - sizeof(pcpu4k_pages[0]));
1293   - pcpu4k_pages = alloc_bootmem(pages_size);
  1468 + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
  1469 + sizeof(pages[0]));
  1470 + pages = alloc_bootmem(pages_size);
1294 1471  
1295 1472 /* allocate pages */
1296 1473 j = 0;
1297 1474 for_each_possible_cpu(cpu)
1298   - for (i = 0; i < pcpu4k_unit_pages; i++) {
  1475 + for (i = 0; i < unit_pages; i++) {
1299 1476 void *ptr;
1300 1477  
1301 1478 ptr = alloc_fn(cpu, PAGE_SIZE);
... ... @@ -1304,25 +1481,24 @@
1304 1481 "4k page for cpu%u\n", cpu);
1305 1482 goto enomem;
1306 1483 }
1307   - pcpu4k_pages[j++] = virt_to_page(ptr);
  1484 + pages[j++] = virt_to_page(ptr);
1308 1485 }
1309 1486  
1310 1487 /* allocate vm area, map the pages and copy static data */
1311 1488 vm.flags = VM_ALLOC;
1312   - vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT;
  1489 + vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
1313 1490 vm_area_register_early(&vm, PAGE_SIZE);
1314 1491  
1315 1492 for_each_possible_cpu(cpu) {
1316 1493 unsigned long unit_addr = (unsigned long)vm.addr +
1317   - (cpu * pcpu4k_unit_pages << PAGE_SHIFT);
  1494 + (cpu * unit_pages << PAGE_SHIFT);
1318 1495  
1319   - for (i = 0; i < pcpu4k_unit_pages; i++)
  1496 + for (i = 0; i < unit_pages; i++)
1320 1497 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1321 1498  
1322 1499 /* pte already populated, the following shouldn't fail */
1323   - ret = __pcpu_map_pages(unit_addr,
1324   - &pcpu4k_pages[cpu * pcpu4k_unit_pages],
1325   - pcpu4k_unit_pages);
  1500 + ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
  1501 + unit_pages);
1326 1502 if (ret < 0)
1327 1503 panic("failed to map percpu area, err=%zd\n", ret);
... ... @@ -1340,19 +1516,18 @@
1340 1516  
1341 1517 /* we're ready, commit */
1342 1518 pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
1343   - pcpu4k_unit_pages, static_size);
  1519 + unit_pages, static_size);
1344 1520  
1345   - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
1346   - reserved_size, -1,
1347   - pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
  1521 + ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
  1522 + unit_pages << PAGE_SHIFT, vm.addr);
1348 1523 goto out_free_ar;
1349 1524  
1350 1525 enomem:
1351 1526 while (--j >= 0)
1352   - free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
  1527 + free_fn(page_address(pages[j]), PAGE_SIZE);
1353 1528 ret = -ENOMEM;
1354 1529 out_free_ar:
1355   - free_bootmem(__pa(pcpu4k_pages), pages_size);
  1530 + free_bootmem(__pa(pages), pages_size);
1356 1531 return ret;
1357 1532 }
1358 1533  
... ... @@ -1370,16 +1545,6 @@
1370 1545 static struct pcpul_ent *pcpul_map;
1371 1546 static struct vm_struct pcpul_vm;
1372 1547  
1373   -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
1374   -{
1375   - size_t off = (size_t)pageno << PAGE_SHIFT;
1376   -
1377   - if (off >= pcpul_size)
1378   - return NULL;
1379   -
1380   - return virt_to_page(pcpul_map[cpu].ptr + off);
1381   -}
1382   -
1383 1548 /**
1384 1549 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
1385 1550 * @static_size: the size of static percpu area in bytes
... ... @@ -1475,9 +1640,8 @@
1475 1640 pr_info("PERCPU: Remapped at %p with large pages, static data "
1476 1641 "%zu bytes\n", pcpul_vm.addr, static_size);
1477 1642  
1478   - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
1479   - reserved_size, dyn_size, pcpul_unit_size,
1480   - pcpul_vm.addr);
  1643 + ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
  1644 + pcpul_unit_size, pcpul_vm.addr);
1481 1645  
1482 1646 /* sort pcpul_map array for pcpu_lpage_remapped() */
1483 1647 for (i = 0; i < num_possible_cpus() - 1; i++)