Commit 23dcfa61bac244e1200ff9ad19c6e9144dcb6bb5

Authored by Linus Torvalds

Merge branch 'akpm' (Andrew's patch-bomb)

Merge fixes from Andrew Morton.

Random drivers and some VM fixes.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (17 commits)
  mm: compaction: Abort async compaction if locks are contended or taking too long
  mm: have order > 0 compaction start near a pageblock with free pages
  rapidio/tsi721: fix unused variable compiler warning
  rapidio/tsi721: fix inbound doorbell interrupt handling
  drivers/rtc/rtc-rs5c348.c: fix hour decoding in 12-hour mode
  mm: correct page->pfmemalloc to fix deactivate_slab regression
  drivers/rtc/rtc-pcf2123.c: initialize dynamic sysfs attributes
  mm/compaction.c: fix deferring compaction mistake
  drivers/misc/sgi-xp/xpc_uv.c: SGI XPC fails to load when cpu 0 is out of IRQ resources
  string: do not export memweight() to userspace
  hugetlb: update hugetlbpage.txt
  checkpatch: add control statement test to SINGLE_STATEMENT_DO_WHILE_MACRO
  mm: hugetlbfs: correctly populate shared pmd
  cciss: fix incorrect scsi status reporting
  Documentation: update mount option in filesystem/vfat.txt
  mm: change nr_ptes BUG_ON to WARN_ON
  cs5535-clockevt: typo, it's MFGPT, not MFPGT

Showing 16 changed files

Documentation/filesystems/vfat.txt
... ... @@ -137,6 +137,17 @@
137 137 without doing anything or remount the partition in
138 138 read-only mode (default behavior).
139 139  
  140 +discard -- If set, issues discard/TRIM commands to the block
  141 + device when blocks are freed. This is useful for SSD devices
  142 + and sparse/thinly-provisioned LUNs.
  143 +
  144 +nfs -- This option maintains an index (cache) of directory
  145 + inodes by i_logstart which is used by the nfs-related code to
  146 + improve look-ups.
  147 +
  148 + Enable this only if you want to export the FAT filesystem
  149 + over NFS
  150 +
140 151 <bool>: 0,1,yes,no,true,false
141 152  
142 153 TODO
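
Both new options are ordinary FAT mount options. As a rough illustration only (the device path, mount point, and exact option string are assumptions, not taken from the patch), they would be passed through the data argument of mount(2):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* Filesystem-specific options go in the last (data) argument. */
            if (mount("/dev/sdb1", "/mnt/flash", "vfat", 0, "discard,nfs") != 0) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }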
Documentation/vm/hugetlbpage.txt
... ... @@ -299,12 +299,18 @@
299 299 *******************************************************************
300 300  
301 301 /*
302   - * hugepage-shm: see Documentation/vm/hugepage-shm.c
  302 + * map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c
303 303 */
304 304  
305 305 *******************************************************************
306 306  
307 307 /*
308   - * hugepage-mmap: see Documentation/vm/hugepage-mmap.c
  308 + * hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c
  309 + */
  310 +
  311 +*******************************************************************
  312 +
  313 +/*
  314 + * hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
309 315 */
arch/x86/mm/hugetlbpage.c
... ... @@ -56,9 +56,16 @@
56 56 }
57 57  
58 58 /*
59   - * search for a shareable pmd page for hugetlb.
  59 + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  60 + * and returns the corresponding pte. While this is not necessary for the
  61 + * !shared pmd case because we can allocate the pmd later as well, it makes the
  62 + * code much cleaner. pmd allocation is essential for the shared case because
  63 + * pud has to be populated inside the same i_mmap_mutex section - otherwise
  64 + * racing tasks could either miss the sharing (see huge_pte_offset) or select a
  65 + * bad pmd for sharing.
60 66 */
61   -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
  67 +static pte_t *
  68 +huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
62 69 {
63 70 struct vm_area_struct *vma = find_vma(mm, addr);
64 71 struct address_space *mapping = vma->vm_file->f_mapping;
65 72  
... ... @@ -68,9 +75,10 @@
68 75 struct vm_area_struct *svma;
69 76 unsigned long saddr;
70 77 pte_t *spte = NULL;
  78 + pte_t *pte;
71 79  
72 80 if (!vma_shareable(vma, addr))
73   - return;
  81 + return (pte_t *)pmd_alloc(mm, pud, addr);
74 82  
75 83 mutex_lock(&mapping->i_mmap_mutex);
76 84 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
77 85  
... ... @@ -97,7 +105,9 @@
97 105 put_page(virt_to_page(spte));
98 106 spin_unlock(&mm->page_table_lock);
99 107 out:
  108 + pte = (pte_t *)pmd_alloc(mm, pud, addr);
100 109 mutex_unlock(&mapping->i_mmap_mutex);
  110 + return pte;
101 111 }
102 112  
103 113 /*
... ... @@ -142,8 +152,9 @@
142 152 } else {
143 153 BUG_ON(sz != PMD_SIZE);
144 154 if (pud_none(*pud))
145   - huge_pmd_share(mm, addr, pud);
146   - pte = (pte_t *) pmd_alloc(mm, pud, addr);
  155 + pte = huge_pmd_share(mm, addr, pud);
  156 + else
  157 + pte = (pte_t *)pmd_alloc(mm, pud, addr);
147 158 }
148 159 }
149 160 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
drivers/block/cciss_scsi.c
... ... @@ -763,16 +763,7 @@
763 763 {
764 764 case CMD_TARGET_STATUS:
765 765 /* Pass it up to the upper layers... */
766   - if( ei->ScsiStatus)
767   - {
768   -#if 0
769   - printk(KERN_WARNING "cciss: cmd %p "
770   - "has SCSI Status = %x\n",
771   - c, ei->ScsiStatus);
772   -#endif
773   - cmd->result |= (ei->ScsiStatus << 1);
774   - }
775   - else { /* scsi status is zero??? How??? */
  766 + if (!ei->ScsiStatus) {
776 767  
777 768 /* Ordinarily, this case should never happen, but there is a bug
778 769 in some released firmware revisions that allows it to happen
drivers/clocksource/cs5535-clockevt.c
... ... @@ -53,7 +53,7 @@
53 53 #define MFGPT_PERIODIC (MFGPT_HZ / HZ)
54 54  
55 55 /*
56   - * The MFPGT timers on the CS5536 provide us with suitable timers to use
  56 + * The MFGPT timers on the CS5536 provide us with suitable timers to use
57 57 * as clock event sources - not as good as a HPET or APIC, but certainly
58 58 * better than the PIT. This isn't a general purpose MFGPT driver, but
59 59 * a simplified one designed specifically to act as a clock event source.
... ... @@ -144,7 +144,7 @@
144 144  
145 145 timer = cs5535_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
146 146 if (!timer) {
147   - printk(KERN_ERR DRV_NAME ": Could not allocate MFPGT timer\n");
  147 + printk(KERN_ERR DRV_NAME ": Could not allocate MFGPT timer\n");
148 148 return -ENODEV;
149 149 }
150 150 cs5535_event_clock = timer;
drivers/misc/sgi-xp/xpc_uv.c
... ... @@ -18,6 +18,8 @@
18 18 #include <linux/interrupt.h>
19 19 #include <linux/delay.h>
20 20 #include <linux/device.h>
  21 +#include <linux/cpu.h>
  22 +#include <linux/module.h>
21 23 #include <linux/err.h>
22 24 #include <linux/slab.h>
23 25 #include <asm/uv/uv_hub.h>
... ... @@ -59,6 +61,8 @@
59 61 XPC_NOTIFY_MSG_SIZE_UV)
60 62 #define XPC_NOTIFY_IRQ_NAME "xpc_notify"
61 63  
  64 +static int xpc_mq_node = -1;
  65 +
62 66 static struct xpc_gru_mq_uv *xpc_activate_mq_uv;
63 67 static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
64 68  
65 69  
... ... @@ -109,11 +113,8 @@
109 113 #if defined CONFIG_X86_64
110 114 mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset,
111 115 UV_AFFINITY_CPU);
112   - if (mq->irq < 0) {
113   - dev_err(xpc_part, "uv_setup_irq() returned error=%d\n",
114   - -mq->irq);
  116 + if (mq->irq < 0)
115 117 return mq->irq;
116   - }
117 118  
118 119 mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset);
119 120  
... ... @@ -238,8 +239,9 @@
238 239 mq->mmr_blade = uv_cpu_to_blade_id(cpu);
239 240  
240 241 nid = cpu_to_node(cpu);
241   - page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
242   - pg_order);
  242 + page = alloc_pages_exact_node(nid,
  243 + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
  244 + pg_order);
243 245 if (page == NULL) {
244 246 dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d "
245 247 "bytes of memory on nid=%d for GRU mq\n", mq_size, nid);
246 248  
... ... @@ -1731,9 +1733,50 @@
1731 1733 .notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_uv,
1732 1734 };
1733 1735  
  1736 +static int
  1737 +xpc_init_mq_node(int nid)
  1738 +{
  1739 + int cpu;
  1740 +
  1741 + get_online_cpus();
  1742 +
  1743 + for_each_cpu(cpu, cpumask_of_node(nid)) {
  1744 + xpc_activate_mq_uv =
  1745 + xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, nid,
  1746 + XPC_ACTIVATE_IRQ_NAME,
  1747 + xpc_handle_activate_IRQ_uv);
  1748 + if (!IS_ERR(xpc_activate_mq_uv))
  1749 + break;
  1750 + }
  1751 + if (IS_ERR(xpc_activate_mq_uv)) {
  1752 + put_online_cpus();
  1753 + return PTR_ERR(xpc_activate_mq_uv);
  1754 + }
  1755 +
  1756 + for_each_cpu(cpu, cpumask_of_node(nid)) {
  1757 + xpc_notify_mq_uv =
  1758 + xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, nid,
  1759 + XPC_NOTIFY_IRQ_NAME,
  1760 + xpc_handle_notify_IRQ_uv);
  1761 + if (!IS_ERR(xpc_notify_mq_uv))
  1762 + break;
  1763 + }
  1764 + if (IS_ERR(xpc_notify_mq_uv)) {
  1765 + xpc_destroy_gru_mq_uv(xpc_activate_mq_uv);
  1766 + put_online_cpus();
  1767 + return PTR_ERR(xpc_notify_mq_uv);
  1768 + }
  1769 +
  1770 + put_online_cpus();
  1771 + return 0;
  1772 +}
  1773 +
1734 1774 int
1735 1775 xpc_init_uv(void)
1736 1776 {
  1777 + int nid;
  1778 + int ret = 0;
  1779 +
1737 1780 xpc_arch_ops = xpc_arch_ops_uv;
1738 1781  
1739 1782 if (sizeof(struct xpc_notify_mq_msghdr_uv) > XPC_MSG_HDR_MAX_SIZE) {
1740 1783  
1741 1784  
... ... @@ -1742,21 +1785,21 @@
1742 1785 return -E2BIG;
1743 1786 }
1744 1787  
1745   - xpc_activate_mq_uv = xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, 0,
1746   - XPC_ACTIVATE_IRQ_NAME,
1747   - xpc_handle_activate_IRQ_uv);
1748   - if (IS_ERR(xpc_activate_mq_uv))
1749   - return PTR_ERR(xpc_activate_mq_uv);
  1788 + if (xpc_mq_node < 0)
  1789 + for_each_online_node(nid) {
  1790 + ret = xpc_init_mq_node(nid);
1750 1791  
1751   - xpc_notify_mq_uv = xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, 0,
1752   - XPC_NOTIFY_IRQ_NAME,
1753   - xpc_handle_notify_IRQ_uv);
1754   - if (IS_ERR(xpc_notify_mq_uv)) {
1755   - xpc_destroy_gru_mq_uv(xpc_activate_mq_uv);
1756   - return PTR_ERR(xpc_notify_mq_uv);
1757   - }
  1792 + if (!ret)
  1793 + break;
  1794 + }
  1795 + else
  1796 + ret = xpc_init_mq_node(xpc_mq_node);
1758 1797  
1759   - return 0;
  1798 + if (ret < 0)
  1799 + dev_err(xpc_part, "xpc_init_mq_node() returned error=%d\n",
  1800 + -ret);
  1801 +
  1802 + return ret;
1760 1803 }
1761 1804  
1762 1805 void
... ... @@ -1765,4 +1808,7 @@
1765 1808 xpc_destroy_gru_mq_uv(xpc_notify_mq_uv);
1766 1809 xpc_destroy_gru_mq_uv(xpc_activate_mq_uv);
1767 1810 }
  1811 +
  1812 +module_param(xpc_mq_node, int, 0);
  1813 +MODULE_PARM_DESC(xpc_mq_node, "Node number on which to allocate message queues.");
drivers/rapidio/devices/tsi721.c
... ... @@ -435,6 +435,9 @@
435 435 " info %4.4x\n", DBELL_SID(idb.bytes),
436 436 DBELL_TID(idb.bytes), DBELL_INF(idb.bytes));
437 437 }
  438 +
  439 + wr_ptr = ioread32(priv->regs +
  440 + TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE;
438 441 }
439 442  
440 443 iowrite32(rd_ptr & (IDB_QSIZE - 1),
... ... @@ -445,6 +448,10 @@
445 448 regval |= TSI721_SR_CHINT_IDBQRCV;
446 449 iowrite32(regval,
447 450 priv->regs + TSI721_SR_CHINTE(IDB_QUEUE));
  451 +
  452 + wr_ptr = ioread32(priv->regs + TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE;
  453 + if (wr_ptr != rd_ptr)
  454 + schedule_work(&priv->idb_work);
448 455 }
449 456  
450 457 /**
... ... @@ -2212,7 +2219,7 @@
2212 2219 const struct pci_device_id *id)
2213 2220 {
2214 2221 struct tsi721_device *priv;
2215   - int i, cap;
  2222 + int cap;
2216 2223 int err;
2217 2224 u32 regval;
2218 2225  
2219 2226  
... ... @@ -2232,11 +2239,14 @@
2232 2239 priv->pdev = pdev;
2233 2240  
2234 2241 #ifdef DEBUG
  2242 + {
  2243 + int i;
2235 2244 for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
2236 2245 dev_dbg(&pdev->dev, "res[%d] @ 0x%llx (0x%lx, 0x%lx)\n",
2237 2246 i, (unsigned long long)pci_resource_start(pdev, i),
2238 2247 (unsigned long)pci_resource_len(pdev, i),
2239 2248 pci_resource_flags(pdev, i));
  2249 + }
2240 2250 }
2241 2251 #endif
2242 2252 /*
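
The doorbell hunks above close a small race: after the inbound doorbell queue has been drained and the channel interrupt re-enabled, a doorbell arriving in that window would otherwise sit unserviced until the next interrupt. A simplified sketch of the control flow (the helper names are hypothetical stand-ins for the driver's ioread32()/iowrite32() register accessors, not the Tsi721 API):

    static u32 read_wr_ptr(void);            /* hardware write pointer */
    static u32 read_rd_ptr(void);            /* software read pointer */
    static void write_rd_ptr(u32 rd);
    static void handle_doorbell(u32 idx);
    static void reenable_idb_interrupt(void);
    static void reschedule_idb_work(void);

    static void idb_work_handler(void)
    {
            u32 rd = read_rd_ptr();
            u32 wr = read_wr_ptr() % IDB_QSIZE;

            while (rd != wr) {
                    handle_doorbell(rd);
                    rd = (rd + 1) % IDB_QSIZE;
                    /* doorbells may keep arriving while the queue drains */
                    wr = read_wr_ptr() % IDB_QSIZE;
            }
            write_rd_ptr(rd);

            reenable_idb_interrupt();

            /*
             * Re-check after re-enabling: anything that landed between the
             * last poll of the write pointer and the re-enable is handled by
             * rescheduling the work item rather than waiting for the next
             * interrupt.
             */
            if (read_wr_ptr() % IDB_QSIZE != rd)
                    reschedule_idb_work();
    }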
drivers/rtc/rtc-pcf2123.c
... ... @@ -43,6 +43,7 @@
43 43 #include <linux/rtc.h>
44 44 #include <linux/spi/spi.h>
45 45 #include <linux/module.h>
  46 +#include <linux/sysfs.h>
46 47  
47 48 #define DRV_VERSION "0.6"
48 49  
... ... @@ -292,6 +293,7 @@
292 293 pdata->rtc = rtc;
293 294  
294 295 for (i = 0; i < 16; i++) {
  296 + sysfs_attr_init(&pdata->regs[i].attr.attr);
295 297 sprintf(pdata->regs[i].name, "%1x", i);
296 298 pdata->regs[i].attr.attr.mode = S_IRUGO | S_IWUSR;
297 299 pdata->regs[i].attr.attr.name = pdata->regs[i].name;
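
The single added line matters because the pcf2123 driver builds its register attributes in dynamically allocated memory at probe time: with lockdep enabled, every dynamically allocated sysfs attribute must be passed to sysfs_attr_init() before registration so it gets a lock class key. A minimal sketch of the general pattern (the attribute name and show routine are made up for illustration):

    #include <linux/device.h>
    #include <linux/slab.h>
    #include <linux/stat.h>
    #include <linux/sysfs.h>

    static ssize_t example_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
    {
            return sprintf(buf, "%d\n", 42);
    }

    static int add_dynamic_attr(struct device *dev)
    {
            struct device_attribute *attr;

            attr = kzalloc(sizeof(*attr), GFP_KERNEL);
            if (!attr)
                    return -ENOMEM;

            /* Required for dynamically allocated attributes; statically
             * declared ones (DEVICE_ATTR) get this initialisation for free. */
            sysfs_attr_init(&attr->attr);
            attr->attr.name = "example";
            attr->attr.mode = S_IRUGO;
            attr->show = example_show;

            return device_create_file(dev, attr);
    }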
drivers/rtc/rtc-rs5c348.c
... ... @@ -122,9 +122,12 @@
122 122 tm->tm_min = bcd2bin(rxbuf[RS5C348_REG_MINS] & RS5C348_MINS_MASK);
123 123 tm->tm_hour = bcd2bin(rxbuf[RS5C348_REG_HOURS] & RS5C348_HOURS_MASK);
124 124 if (!pdata->rtc_24h) {
125   - tm->tm_hour %= 12;
126   - if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM)
  125 + if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM) {
  126 + tm->tm_hour -= 20;
  127 + tm->tm_hour %= 12;
127 128 tm->tm_hour += 12;
  129 + } else
  130 + tm->tm_hour %= 12;
128 131 }
129 132 tm->tm_wday = bcd2bin(rxbuf[RS5C348_REG_WDAY] & RS5C348_WDAY_MASK);
130 133 tm->tm_mday = bcd2bin(rxbuf[RS5C348_REG_DAY] & RS5C348_DAY_MASK);
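
The hour fix is easiest to see with concrete register values. Assuming, as the patch implies, that the PM flag is bit 5 (0x20) of the hours register and that the hours mask keeps that bit, bcd2bin() on the masked value returns the hour plus 20 for PM times: 11 PM reads back as 31, and the old "tm_hour %= 12; tm_hour += 12" turned that into 19 instead of 23. A small standalone check of the corrected decode:

    #include <assert.h>

    #define BIT_PM     0x20   /* assumed: PM flag in bit 5 of the hours register */
    #define HOURS_MASK 0x3f   /* assumed: hour digits plus the PM flag */

    static int bcd2bin(int val)
    {
            return (val & 0x0f) + (val >> 4) * 10;
    }

    static int decode_12h(int reg)
    {
            int hour = bcd2bin(reg & HOURS_MASK);

            if (reg & BIT_PM) {
                    hour -= 20;     /* strip the PM bit picked up by bcd2bin() */
                    hour %= 12;     /* 12 PM -> 0 */
                    hour += 12;
            } else {
                    hour %= 12;     /* 12 AM -> 0 */
            }
            return hour;
    }

    int main(void)
    {
            assert(decode_12h(0x12) == 0);            /* 12 AM */
            assert(decode_12h(0x01) == 1);            /*  1 AM */
            assert(decode_12h(BIT_PM | 0x12) == 12);  /* 12 PM */
            assert(decode_12h(BIT_PM | 0x11) == 23);  /* 11 PM */
            return 0;
    }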
include/linux/compaction.h
... ... @@ -22,7 +22,7 @@
22 22 extern int fragmentation_index(struct zone *zone, unsigned int order);
23 23 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 24 int order, gfp_t gfp_mask, nodemask_t *mask,
25   - bool sync);
  25 + bool sync, bool *contended);
26 26 extern int compact_pgdat(pg_data_t *pgdat, int order);
27 27 extern unsigned long compaction_suitable(struct zone *zone, int order);
28 28  
... ... @@ -64,7 +64,7 @@
64 64 #else
65 65 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
66 66 int order, gfp_t gfp_mask, nodemask_t *nodemask,
67   - bool sync)
  67 + bool sync, bool *contended)
68 68 {
69 69 return COMPACT_CONTINUE;
70 70 }
include/linux/string.h
... ... @@ -144,9 +144,9 @@
144 144 {
145 145 return strncmp(str, prefix, strlen(prefix)) == 0;
146 146 }
147   -#endif
148 147  
149 148 extern size_t memweight(const void *ptr, size_t bytes);
150 149  
  150 +#endif /* __KERNEL__ */
151 151 #endif /* _LINUX_STRING_H_ */
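
memweight() counts the bits set in an arbitrary region of memory; the hunk above only moves its declaration inside #ifdef __KERNEL__ so the prototype stops leaking into the header exported to userspace. A small in-kernel usage sketch (the bitmap contents are invented for illustration):

    #include <linux/kernel.h>
    #include <linux/string.h>

    static void report_used_slots(void)
    {
            /* hypothetical allocation bitmap: 0xf0 and 0x01 have 5 bits set */
            unsigned char bitmap[2] = { 0xf0, 0x01 };
            size_t used = memweight(bitmap, sizeof(bitmap));

            pr_info("%zu slots in use\n", used);    /* prints 5 */
    }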
mm/compaction.c
... ... @@ -51,6 +51,47 @@
51 51 }
52 52  
53 53 /*
  54 + * Compaction requires the taking of some coarse locks that are potentially
  55 + * very heavily contended. Check if the process needs to be scheduled or
  56 + * if the lock is contended. For async compaction, back out in the event
  57 + * that contention is severe. For sync compaction, schedule.
  58 + *
  59 + * Returns true if the lock is held.
  60 + * Returns false if the lock is released and compaction should abort
  61 + */
  62 +static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
  63 + bool locked, struct compact_control *cc)
  64 +{
  65 + if (need_resched() || spin_is_contended(lock)) {
  66 + if (locked) {
  67 + spin_unlock_irqrestore(lock, *flags);
  68 + locked = false;
  69 + }
  70 +
  71 + /* async aborts if taking too long or contended */
  72 + if (!cc->sync) {
  73 + if (cc->contended)
  74 + *cc->contended = true;
  75 + return false;
  76 + }
  77 +
  78 + cond_resched();
  79 + if (fatal_signal_pending(current))
  80 + return false;
  81 + }
  82 +
  83 + if (!locked)
  84 + spin_lock_irqsave(lock, *flags);
  85 + return true;
  86 +}
  87 +
  88 +static inline bool compact_trylock_irqsave(spinlock_t *lock,
  89 + unsigned long *flags, struct compact_control *cc)
  90 +{
  91 + return compact_checklock_irqsave(lock, flags, false, cc);
  92 +}
  93 +
  94 +/*
54 95 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
55 96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56 97 * pages inside of the pageblock (even though it may still end up isolating
... ... @@ -173,7 +214,7 @@
173 214 }
174 215  
175 216 /* Update the number of anon and file isolated pages in the zone */
176   -static void acct_isolated(struct zone *zone, struct compact_control *cc)
  217 +static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
177 218 {
178 219 struct page *page;
179 220 unsigned int count[2] = { 0, };
... ... @@ -181,8 +222,14 @@
181 222 list_for_each_entry(page, &cc->migratepages, lru)
182 223 count[!!page_is_file_cache(page)]++;
183 224  
184   - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
185   - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
  225 + /* If locked we can use the interrupt unsafe versions */
  226 + if (locked) {
  227 + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
  228 + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
  229 + } else {
  230 + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
  231 + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
  232 + }
186 233 }
187 234  
188 235 /* Similar to reclaim, but different enough that they don't share logic */
... ... @@ -228,6 +275,8 @@
228 275 struct list_head *migratelist = &cc->migratepages;
229 276 isolate_mode_t mode = 0;
230 277 struct lruvec *lruvec;
  278 + unsigned long flags;
  279 + bool locked;
231 280  
232 281 /*
233 282 * Ensure that there are not too many pages isolated from the LRU
234 283  
235 284  
236 285  
237 286  
... ... @@ -247,26 +296,23 @@
247 296  
248 297 /* Time to isolate some pages for migration */
249 298 cond_resched();
250   - spin_lock_irq(&zone->lru_lock);
  299 + spin_lock_irqsave(&zone->lru_lock, flags);
  300 + locked = true;
251 301 for (; low_pfn < end_pfn; low_pfn++) {
252 302 struct page *page;
253   - bool locked = true;
254 303  
255 304 /* give a chance to irqs before checking need_resched() */
256 305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257   - spin_unlock_irq(&zone->lru_lock);
  306 + spin_unlock_irqrestore(&zone->lru_lock, flags);
258 307 locked = false;
259 308 }
260   - if (need_resched() || spin_is_contended(&zone->lru_lock)) {
261   - if (locked)
262   - spin_unlock_irq(&zone->lru_lock);
263   - cond_resched();
264   - spin_lock_irq(&zone->lru_lock);
265   - if (fatal_signal_pending(current))
266   - break;
267   - } else if (!locked)
268   - spin_lock_irq(&zone->lru_lock);
269 309  
  310 + /* Check if it is ok to still hold the lock */
  311 + locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
  312 + locked, cc);
  313 + if (!locked)
  314 + break;
  315 +
270 316 /*
271 317 * migrate_pfn does not necessarily start aligned to a
272 318 * pageblock. Ensure that pfn_valid is called when moving
273 319  
... ... @@ -349,9 +395,10 @@
349 395 }
350 396 }
351 397  
352   - acct_isolated(zone, cc);
  398 + acct_isolated(zone, locked, cc);
353 399  
354   - spin_unlock_irq(&zone->lru_lock);
  400 + if (locked)
  401 + spin_unlock_irqrestore(&zone->lru_lock, flags);
355 402  
356 403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357 404  
... ... @@ -384,6 +431,20 @@
384 431 }
385 432  
386 433 /*
  434 + * Returns the start pfn of the last page block in a zone. This is the starting
  435 + * point for full compaction of a zone. Compaction searches for free pages from
  436 + * the end of each zone, while isolate_freepages_block scans forward inside each
  437 + * page block.
  438 + */
  439 +static unsigned long start_free_pfn(struct zone *zone)
  440 +{
  441 + unsigned long free_pfn;
  442 + free_pfn = zone->zone_start_pfn + zone->spanned_pages;
  443 + free_pfn &= ~(pageblock_nr_pages-1);
  444 + return free_pfn;
  445 +}
  446 +
  447 +/*
387 448 * Based on information in the current compact_control, find blocks
388 449 * suitable for isolating free pages from and then isolate them.
389 450 */
... ... @@ -422,17 +483,6 @@
422 483 pfn -= pageblock_nr_pages) {
423 484 unsigned long isolated;
424 485  
425   - /*
426   - * Skip ahead if another thread is compacting in the area
427   - * simultaneously. If we wrapped around, we can only skip
428   - * ahead if zone->compact_cached_free_pfn also wrapped to
429   - * above our starting point.
430   - */
431   - if (cc->order > 0 && (!cc->wrapped ||
432   - zone->compact_cached_free_pfn >
433   - cc->start_free_pfn))
434   - pfn = min(pfn, zone->compact_cached_free_pfn);
435   -
436 486 if (!pfn_valid(pfn))
437 487 continue;
438 488  
... ... @@ -458,7 +508,16 @@
458 508 * are disabled
459 509 */
460 510 isolated = 0;
461   - spin_lock_irqsave(&zone->lock, flags);
  511 +
  512 + /*
  513 + * The zone lock must be held to isolate freepages. This
  514 + * is unfortunately a very coarse lock and can be
  515 + * heavily contended if there are parallel allocations
  516 + * or parallel compactions. For async compaction do not
  517 + * spin on the lock
  518 + */
  519 + if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
  520 + break;
462 521 if (suitable_migration_target(page)) {
463 522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
464 523 isolated = isolate_freepages_block(pfn, end_pfn,
... ... @@ -474,7 +533,15 @@
474 533 */
475 534 if (isolated) {
476 535 high_pfn = max(high_pfn, pfn);
477   - if (cc->order > 0)
  536 +
  537 + /*
  538 + * If the free scanner has wrapped, update
  539 + * compact_cached_free_pfn to point to the highest
  540 + * pageblock with free pages. This reduces excessive
  541 + * scanning of full pageblocks near the end of the
  542 + * zone
  543 + */
  544 + if (cc->order > 0 && cc->wrapped)
478 545 zone->compact_cached_free_pfn = high_pfn;
479 546 }
480 547 }
... ... @@ -484,6 +551,11 @@
484 551  
485 552 cc->free_pfn = high_pfn;
486 553 cc->nr_freepages = nr_freepages;
  554 +
  555 + /* If compact_cached_free_pfn is reset then set it now */
  556 + if (cc->order > 0 && !cc->wrapped &&
  557 + zone->compact_cached_free_pfn == start_free_pfn(zone))
  558 + zone->compact_cached_free_pfn = high_pfn;
487 559 }
488 560  
489 561 /*
... ... @@ -570,20 +642,6 @@
570 642 return ISOLATE_SUCCESS;
571 643 }
572 644  
573   -/*
574   - * Returns the start pfn of the last page block in a zone. This is the starting
575   - * point for full compaction of a zone. Compaction searches for free pages from
576   - * the end of each zone, while isolate_freepages_block scans forward inside each
577   - * page block.
578   - */
579   -static unsigned long start_free_pfn(struct zone *zone)
580   -{
581   - unsigned long free_pfn;
582   - free_pfn = zone->zone_start_pfn + zone->spanned_pages;
583   - free_pfn &= ~(pageblock_nr_pages-1);
584   - return free_pfn;
585   -}
586   -
587 645 static int compact_finished(struct zone *zone,
588 646 struct compact_control *cc)
589 647 {
... ... @@ -771,7 +829,7 @@
771 829  
772 830 static unsigned long compact_zone_order(struct zone *zone,
773 831 int order, gfp_t gfp_mask,
774   - bool sync)
  832 + bool sync, bool *contended)
775 833 {
776 834 struct compact_control cc = {
777 835 .nr_freepages = 0,
... ... @@ -780,6 +838,7 @@
780 838 .migratetype = allocflags_to_migratetype(gfp_mask),
781 839 .zone = zone,
782 840 .sync = sync,
  841 + .contended = contended,
783 842 };
784 843 INIT_LIST_HEAD(&cc.freepages);
785 844 INIT_LIST_HEAD(&cc.migratepages);
... ... @@ -801,7 +860,7 @@
801 860 */
802 861 unsigned long try_to_compact_pages(struct zonelist *zonelist,
803 862 int order, gfp_t gfp_mask, nodemask_t *nodemask,
804   - bool sync)
  863 + bool sync, bool *contended)
805 864 {
806 865 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
807 866 int may_enter_fs = gfp_mask & __GFP_FS;
... ... @@ -825,7 +884,8 @@
825 884 nodemask) {
826 885 int status;
827 886  
828   - status = compact_zone_order(zone, order, gfp_mask, sync);
  887 + status = compact_zone_order(zone, order, gfp_mask, sync,
  888 + contended);
829 889 rc = max(status, rc);
830 890  
831 891 /* If a normal allocation would succeed, stop compacting */
... ... @@ -861,7 +921,7 @@
861 921 if (cc->order > 0) {
862 922 int ok = zone_watermark_ok(zone, cc->order,
863 923 low_wmark_pages(zone), 0, 0);
864   - if (ok && cc->order > zone->compact_order_failed)
  924 + if (ok && cc->order >= zone->compact_order_failed)
865 925 zone->compact_order_failed = cc->order + 1;
866 926 /* Currently async compaction is never deferred. */
867 927 else if (!ok && cc->sync)
mm/internal.h
... ... @@ -130,6 +130,7 @@
130 130 int order; /* order a direct compactor needs */
131 131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 132 struct zone *zone;
  133 + bool *contended; /* True if a lock was contended */
133 134 };
134 135  
135 136 unsigned long
mm/mmap.c
... ... @@ -2309,7 +2309,7 @@
2309 2309 }
2310 2310 vm_unacct_memory(nr_accounted);
2311 2311  
2312   - BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
  2312 + WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2313 2313 }
2314 2314  
2315 2315 /* Insert vm structure into process list sorted by address
mm/page_alloc.c
... ... @@ -1928,6 +1928,17 @@
1928 1928 zlc_active = 0;
1929 1929 goto zonelist_scan;
1930 1930 }
  1931 +
  1932 + if (page)
  1933 + /*
  1934 + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
  1935 + * necessary to allocate the page. The expectation is
  1936 + * that the caller is taking steps that will free more
  1937 + * memory. The caller should avoid the page being used
  1938 + * for !PFMEMALLOC purposes.
  1939 + */
  1940 + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
  1941 +
1931 1942 return page;
1932 1943 }
1933 1944  
... ... @@ -2091,7 +2102,7 @@
2091 2102 struct zonelist *zonelist, enum zone_type high_zoneidx,
2092 2103 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2093 2104 int migratetype, bool sync_migration,
2094   - bool *deferred_compaction,
  2105 + bool *contended_compaction, bool *deferred_compaction,
2095 2106 unsigned long *did_some_progress)
2096 2107 {
2097 2108 struct page *page;
... ... @@ -2106,7 +2117,8 @@
2106 2117  
2107 2118 current->flags |= PF_MEMALLOC;
2108 2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2109   - nodemask, sync_migration);
  2120 + nodemask, sync_migration,
  2121 + contended_compaction);
2110 2122 current->flags &= ~PF_MEMALLOC;
2111 2123 if (*did_some_progress != COMPACT_SKIPPED) {
2112 2124  
... ... @@ -2152,7 +2164,7 @@
2152 2164 struct zonelist *zonelist, enum zone_type high_zoneidx,
2153 2165 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2154 2166 int migratetype, bool sync_migration,
2155   - bool *deferred_compaction,
  2167 + bool *contended_compaction, bool *deferred_compaction,
2156 2168 unsigned long *did_some_progress)
2157 2169 {
2158 2170 return NULL;
... ... @@ -2325,6 +2337,7 @@
2325 2337 unsigned long did_some_progress;
2326 2338 bool sync_migration = false;
2327 2339 bool deferred_compaction = false;
  2340 + bool contended_compaction = false;
2328 2341  
2329 2342 /*
2330 2343 * In the slowpath, we sanity check order to avoid ever trying to
... ... @@ -2389,14 +2402,6 @@
2389 2402 zonelist, high_zoneidx, nodemask,
2390 2403 preferred_zone, migratetype);
2391 2404 if (page) {
2392   - /*
2393   - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2394   - * necessary to allocate the page. The expectation is
2395   - * that the caller is taking steps that will free more
2396   - * memory. The caller should avoid the page being used
2397   - * for !PFMEMALLOC purposes.
2398   - */
2399   - page->pfmemalloc = true;
2400 2405 goto got_pg;
2401 2406 }
2402 2407 }
... ... @@ -2422,6 +2427,7 @@
2422 2427 nodemask,
2423 2428 alloc_flags, preferred_zone,
2424 2429 migratetype, sync_migration,
  2430 + &contended_compaction,
2425 2431 &deferred_compaction,
2426 2432 &did_some_progress);
2427 2433 if (page)
2428 2434  
... ... @@ -2431,10 +2437,11 @@
2431 2437 /*
2432 2438 * If compaction is deferred for high-order allocations, it is because
2433 2439 * sync compaction recently failed. In this is the case and the caller
2434   - * has requested the system not be heavily disrupted, fail the
2435   - * allocation now instead of entering direct reclaim
  2440 + * requested a movable allocation that does not heavily disrupt the
  2441 + * system then fail the allocation instead of entering direct reclaim.
2436 2442 */
2437   - if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
  2443 + if ((deferred_compaction || contended_compaction) &&
  2444 + (gfp_mask & __GFP_NO_KSWAPD))
2438 2445 goto nopage;
2439 2446  
2440 2447 /* Try direct reclaim and then allocating */
... ... @@ -2505,6 +2512,7 @@
2505 2512 nodemask,
2506 2513 alloc_flags, preferred_zone,
2507 2514 migratetype, sync_migration,
  2515 + &contended_compaction,
2508 2516 &deferred_compaction,
2509 2517 &did_some_progress);
2510 2518 if (page)
... ... @@ -2569,8 +2577,6 @@
2569 2577 page = __alloc_pages_slowpath(gfp_mask, order,
2570 2578 zonelist, high_zoneidx, nodemask,
2571 2579 preferred_zone, migratetype);
2572   - else
2573   - page->pfmemalloc = false;
2574 2580  
2575 2581 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2576 2582  
scripts/checkpatch.pl
... ... @@ -3016,7 +3016,8 @@
3016 3016 $herectx .= raw_line($linenr, $n) . "\n";
3017 3017 }
3018 3018  
3019   - if (($stmts =~ tr/;/;/) == 1) {
  3019 + if (($stmts =~ tr/;/;/) == 1 &&
  3020 + $stmts !~ /^\s*(if|while|for|switch)\b/) {
3020 3021 WARN("SINGLE_STATEMENT_DO_WHILE_MACRO",
3021 3022 "Single statement macros should not use a do {} while (0) loop\n" . "$herectx");
3022 3023 }
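
The added "$stmts !~ /^\s*(if|while|for|switch)\b/" condition stops checkpatch from warning about macros whose single statement is itself a control statement, since the do {} while (0) wrapper is exactly what keeps such a macro safe inside an unbraced if/else. A hypothetical macro of the kind the old test flagged:

    /* One semicolon, so the old single-statement test fired, yet removing the
     * do/while would make "if (a) WARN_IF_NEGATIVE(x); else ..." bind the
     * else to the macro's inner if. (Invented macro, for illustration only.) */
    #define WARN_IF_NEGATIVE(x)                                          \
            do {                                                         \
                    if ((x) < 0)                                         \
                            pr_warn("unexpected negative value: %ld\n",  \
                                    (long)(x));                          \
            } while (0)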