Commit 5d337b9194b1ce3b6fd5f3cb2799455ed2f9a3d1

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 048c27fd72

[PATCH] swap: swap_lock replace list+device

The idea of a swap_device_lock per device, and a swap_list_lock over them all,
is appealing; but in practice almost every holder of swap_device_lock must
already hold swap_list_lock, which defeats the purpose of the split.

The only exceptions have been swap_duplicate, valid_swaphandles and an
untrodden path in try_to_unuse (plus a few places added in this series).
valid_swaphandles doesn't show up high in profiles, but swap_duplicate does
demand attention.  However, with the hold time in get_swap_pages so much
reduced, I've not yet found a load and set of swap device priorities to show
even swap_duplicate benefitting from the split.  Certainly the split is mere
overhead in the common case of a single swap device.

So, replace swap_list_lock and swap_device_lock by spinlock_t swap_lock
(generally we seem to prefer an _ in the name, and not to hide it in a macro).

If someone can show a regression in swap_duplicate, then probably we should
add a hashlock for the swap_map entries alone (shorts not being atomic), so as
to help the case of the single swap device too.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 5 changed files with 66 additions and 95 deletions Side-by-side Diff

Documentation/vm/locking
... ... @@ -83,19 +83,18 @@
83 83 vmtruncate) does not lose sending ipi's to cloned threads that might
84 84 be spawned underneath it and go to user mode to drag in pte's into tlbs.
85 85  
86   -swap_list_lock/swap_device_lock
87   --------------------------------
  86 +swap_lock
  87 +---------
88 88 The swap devices are chained in priority order from the "swap_list" header.
89 89 The "swap_list" is used for the round-robin swaphandle allocation strategy.
90 90 The #free swaphandles is maintained in "nr_swap_pages". These two together
91   -are protected by the swap_list_lock.
  91 +are protected by the swap_lock.
92 92  
93   -The swap_device_lock, which is per swap device, protects the reference
94   -counts on the corresponding swaphandles, maintained in the "swap_map"
95   -array, and the "highest_bit" and "lowest_bit" fields.
  93 +The swap_lock also protects all the device reference counts on the
  94 +corresponding swaphandles, maintained in the "swap_map" array, and the
  95 +"highest_bit" and "lowest_bit" fields.
96 96  
97   -Both of these are spinlocks, and are never acquired from intr level. The
98   -locking hierarchy is swap_list_lock -> swap_device_lock.
  97 +The swap_lock is a spinlock, and is never acquired from intr level.
99 98  
100 99 To prevent races between swap space deletion or async readahead swapins
101 100 deciding whether a swap handle is being used, ie worthy of being read in
include/linux/swap.h
... ... @@ -121,7 +121,7 @@
121 121 */
122 122 struct swap_info_struct {
123 123 unsigned int flags;
124   - spinlock_t sdev_lock;
  124 + int prio; /* swap priority */
125 125 struct file *swap_file;
126 126 struct block_device *bdev;
127 127 struct list_head extent_list;
... ... @@ -135,7 +135,6 @@
135 135 unsigned int pages;
136 136 unsigned int max;
137 137 unsigned int inuse_pages;
138   - int prio; /* swap priority */
139 138 int next; /* next entry on swap list */
140 139 };
141 140  
... ... @@ -221,13 +220,7 @@
221 220 extern int remove_exclusive_swap_page(struct page *);
222 221 struct backing_dev_info;
223 222  
224   -extern struct swap_list_t swap_list;
225   -extern spinlock_t swaplock;
226   -
227   -#define swap_list_lock() spin_lock(&swaplock)
228   -#define swap_list_unlock() spin_unlock(&swaplock)
229   -#define swap_device_lock(p) spin_lock(&p->sdev_lock)
230   -#define swap_device_unlock(p) spin_unlock(&p->sdev_lock)
  223 +extern spinlock_t swap_lock;
231 224  
232 225 /* linux/mm/thrash.c */
233 226 extern struct mm_struct * swap_token_mm;
... ... @@ -54,9 +54,8 @@
54 54 *
55 55 * ->i_mmap_lock (vmtruncate)
56 56 * ->private_lock (__free_pte->__set_page_dirty_buffers)
57   - * ->swap_list_lock
58   - * ->swap_device_lock (exclusive_swap_page, others)
59   - * ->mapping->tree_lock
  57 + * ->swap_lock (exclusive_swap_page, others)
  58 + * ->mapping->tree_lock
60 59 *
61 60 * ->i_sem
62 61 * ->i_mmap_lock (truncate->unmap_mapping_range)
... ... @@ -86,7 +85,7 @@
86 85 * ->page_table_lock (anon_vma_prepare and various)
87 86 *
88 87 * ->page_table_lock
89   - * ->swap_device_lock (try_to_unmap_one)
  88 + * ->swap_lock (try_to_unmap_one)
90 89 * ->private_lock (try_to_unmap_one)
91 90 * ->tree_lock (try_to_unmap_one)
92 91 * ->zone.lru_lock (follow_page->mark_page_accessed)
... ... @@ -34,9 +34,8 @@
34 34 * anon_vma->lock
35 35 * mm->page_table_lock
36 36 * zone->lru_lock (in mark_page_accessed)
37   - * swap_list_lock (in swap_free etc's swap_info_get)
  37 + * swap_lock (in swap_duplicate, swap_info_get)
38 38 * mmlist_lock (in mmput, drain_mmlist and others)
39   - * swap_device_lock (in swap_duplicate, swap_info_get)
40 39 * mapping->private_lock (in __set_page_dirty_buffers)
41 40 * inode_lock (in set_page_dirty's __mark_inode_dirty)
42 41 * sb_lock (within inode_lock in fs/fs-writeback.c)
... ... @@ -31,7 +31,7 @@
31 31 #include <asm/tlbflush.h>
32 32 #include <linux/swapops.h>
33 33  
34   -DEFINE_SPINLOCK(swaplock);
  34 +DEFINE_SPINLOCK(swap_lock);
35 35 unsigned int nr_swapfiles;
36 36 long total_swap_pages;
37 37 static int swap_overflow;
... ... @@ -51,7 +51,7 @@
51 51  
52 52 /*
53 53 * We need this because the bdev->unplug_fn can sleep and we cannot
54   - * hold swap_list_lock while calling the unplug_fn. And swap_list_lock
  54 + * hold swap_lock while calling the unplug_fn. And swap_lock
55 55 * cannot be turned into a semaphore.
56 56 */
57 57 static DECLARE_RWSEM(swap_unplug_sem);
... ... @@ -105,7 +105,7 @@
105 105 si->cluster_nr = SWAPFILE_CLUSTER - 1;
106 106 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
107 107 goto lowest;
108   - swap_device_unlock(si);
  108 + spin_unlock(&swap_lock);
109 109  
110 110 offset = si->lowest_bit;
111 111 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
... ... @@ -115,7 +115,7 @@
115 115 if (si->swap_map[offset])
116 116 last_in_cluster = offset + SWAPFILE_CLUSTER;
117 117 else if (offset == last_in_cluster) {
118   - swap_device_lock(si);
  118 + spin_lock(&swap_lock);
119 119 si->cluster_next = offset-SWAPFILE_CLUSTER-1;
120 120 goto cluster;
121 121 }
... ... @@ -124,7 +124,7 @@
124 124 latency_ration = LATENCY_LIMIT;
125 125 }
126 126 }
127   - swap_device_lock(si);
  127 + spin_lock(&swap_lock);
128 128 goto lowest;
129 129 }
130 130  
131 131  
... ... @@ -153,10 +153,10 @@
153 153 return offset;
154 154 }
155 155  
156   - swap_device_unlock(si);
  156 + spin_unlock(&swap_lock);
157 157 while (++offset <= si->highest_bit) {
158 158 if (!si->swap_map[offset]) {
159   - swap_device_lock(si);
  159 + spin_lock(&swap_lock);
160 160 goto checks;
161 161 }
162 162 if (unlikely(--latency_ration < 0)) {
... ... @@ -164,7 +164,7 @@
164 164 latency_ration = LATENCY_LIMIT;
165 165 }
166 166 }
167   - swap_device_lock(si);
  167 + spin_lock(&swap_lock);
168 168 goto lowest;
169 169  
170 170 no_page:
... ... @@ -179,7 +179,7 @@
179 179 int type, next;
180 180 int wrapped = 0;
181 181  
182   - swap_list_lock();
  182 + spin_lock(&swap_lock);
183 183 if (nr_swap_pages <= 0)
184 184 goto noswap;
185 185 nr_swap_pages--;
186 186  
187 187  
188 188  
... ... @@ -199,19 +199,17 @@
199 199 continue;
200 200  
201 201 swap_list.next = next;
202   - swap_device_lock(si);
203   - swap_list_unlock();
204 202 offset = scan_swap_map(si);
205   - swap_device_unlock(si);
206   - if (offset)
  203 + if (offset) {
  204 + spin_unlock(&swap_lock);
207 205 return swp_entry(type, offset);
208   - swap_list_lock();
  206 + }
209 207 next = swap_list.next;
210 208 }
211 209  
212 210 nr_swap_pages++;
213 211 noswap:
214   - swap_list_unlock();
  212 + spin_unlock(&swap_lock);
215 213 return (swp_entry_t) {0};
216 214 }
217 215  
... ... @@ -233,8 +231,7 @@
233 231 goto bad_offset;
234 232 if (!p->swap_map[offset])
235 233 goto bad_free;
236   - swap_list_lock();
237   - swap_device_lock(p);
  234 + spin_lock(&swap_lock);
238 235 return p;
239 236  
240 237 bad_free:
... ... @@ -252,12 +249,6 @@
252 249 return NULL;
253 250 }
254 251  
255   -static void swap_info_put(struct swap_info_struct * p)
256   -{
257   - swap_device_unlock(p);
258   - swap_list_unlock();
259   -}
260   -
261 252 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
262 253 {
263 254 int count = p->swap_map[offset];
... ... @@ -290,7 +281,7 @@
290 281 p = swap_info_get(entry);
291 282 if (p) {
292 283 swap_entry_free(p, swp_offset(entry));
293   - swap_info_put(p);
  284 + spin_unlock(&swap_lock);
294 285 }
295 286 }
296 287  
... ... @@ -308,7 +299,7 @@
308 299 if (p) {
309 300 /* Subtract the 1 for the swap cache itself */
310 301 count = p->swap_map[swp_offset(entry)] - 1;
311   - swap_info_put(p);
  302 + spin_unlock(&swap_lock);
312 303 }
313 304 return count;
314 305 }
... ... @@ -365,7 +356,7 @@
365 356 }
366 357 write_unlock_irq(&swapper_space.tree_lock);
367 358 }
368   - swap_info_put(p);
  359 + spin_unlock(&swap_lock);
369 360  
370 361 if (retval) {
371 362 swap_free(entry);
... ... @@ -388,7 +379,7 @@
388 379 if (p) {
389 380 if (swap_entry_free(p, swp_offset(entry)) == 1)
390 381 page = find_trylock_page(&swapper_space, entry.val);
391   - swap_info_put(p);
  382 + spin_unlock(&swap_lock);
392 383 }
393 384 if (page) {
394 385 int one_user;
395 386  
... ... @@ -558,10 +549,10 @@
558 549 int count;
559 550  
560 551 /*
561   - * No need for swap_device_lock(si) here: we're just looking
  552 + * No need for swap_lock here: we're just looking
562 553 * for whether an entry is in use, not modifying it; false
563 554 * hits are okay, and sys_swapoff() has already prevented new
564   - * allocations from this area (while holding swap_list_lock()).
  555 + * allocations from this area (while holding swap_lock).
565 556 */
566 557 for (;;) {
567 558 if (++i >= max) {
568 559  
... ... @@ -751,9 +742,9 @@
751 742 * report them; but do report if we reset SWAP_MAP_MAX.
752 743 */
753 744 if (*swap_map == SWAP_MAP_MAX) {
754   - swap_device_lock(si);
  745 + spin_lock(&swap_lock);
755 746 *swap_map = 1;
756   - swap_device_unlock(si);
  747 + spin_unlock(&swap_lock);
757 748 reset_overflow = 1;
758 749 }
759 750  
... ... @@ -817,9 +808,9 @@
817 808 }
818 809  
819 810 /*
820   - * After a successful try_to_unuse, if no swap is now in use, we know we
821   - * can empty the mmlist. swap_list_lock must be held on entry and exit.
822   - * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
  811 + * After a successful try_to_unuse, if no swap is now in use, we know
  812 + * we can empty the mmlist. swap_lock must be held on entry and exit.
  813 + * Note that mmlist_lock nests inside swap_lock, and an mm must be
823 814 * added to the mmlist just after page_duplicate - before would be racy.
824 815 */
825 816 static void drain_mmlist(void)
... ... @@ -1092,7 +1083,7 @@
1092 1083  
1093 1084 mapping = victim->f_mapping;
1094 1085 prev = -1;
1095   - swap_list_lock();
  1086 + spin_lock(&swap_lock);
1096 1087 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1097 1088 p = swap_info + type;
1098 1089 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
1099 1090  
... ... @@ -1103,14 +1094,14 @@
1103 1094 }
1104 1095 if (type < 0) {
1105 1096 err = -EINVAL;
1106   - swap_list_unlock();
  1097 + spin_unlock(&swap_lock);
1107 1098 goto out_dput;
1108 1099 }
1109 1100 if (!security_vm_enough_memory(p->pages))
1110 1101 vm_unacct_memory(p->pages);
1111 1102 else {
1112 1103 err = -ENOMEM;
1113   - swap_list_unlock();
  1104 + spin_unlock(&swap_lock);
1114 1105 goto out_dput;
1115 1106 }
1116 1107 if (prev < 0) {
1117 1108  
... ... @@ -1124,10 +1115,8 @@
1124 1115 }
1125 1116 nr_swap_pages -= p->pages;
1126 1117 total_swap_pages -= p->pages;
1127   - swap_device_lock(p);
1128 1118 p->flags &= ~SWP_WRITEOK;
1129   - swap_device_unlock(p);
1130   - swap_list_unlock();
  1119 + spin_unlock(&swap_lock);
1131 1120  
1132 1121 current->flags |= PF_SWAPOFF;
1133 1122 err = try_to_unuse(type);
... ... @@ -1135,7 +1124,7 @@
1135 1124  
1136 1125 if (err) {
1137 1126 /* re-insert swap space back into swap_list */
1138   - swap_list_lock();
  1127 + spin_lock(&swap_lock);
1139 1128 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
1140 1129 if (p->prio >= swap_info[i].prio)
1141 1130 break;
1142 1131  
... ... @@ -1146,10 +1135,8 @@
1146 1135 swap_info[prev].next = p - swap_info;
1147 1136 nr_swap_pages += p->pages;
1148 1137 total_swap_pages += p->pages;
1149   - swap_device_lock(p);
1150 1138 p->flags |= SWP_WRITEOK;
1151   - swap_device_unlock(p);
1152   - swap_list_unlock();
  1139 + spin_unlock(&swap_lock);
1153 1140 goto out_dput;
1154 1141 }
1155 1142  
1156 1143  
1157 1144  
1158 1145  
1159 1146  
1160 1147  
1161 1148  
... ... @@ -1157,30 +1144,27 @@
1157 1144 down_write(&swap_unplug_sem);
1158 1145 up_write(&swap_unplug_sem);
1159 1146  
  1147 + destroy_swap_extents(p);
  1148 + down(&swapon_sem);
  1149 + spin_lock(&swap_lock);
  1150 + drain_mmlist();
  1151 +
1160 1152 /* wait for anyone still in scan_swap_map */
1161   - swap_device_lock(p);
1162 1153 p->highest_bit = 0; /* cuts scans short */
1163 1154 while (p->flags >= SWP_SCANNING) {
1164   - swap_device_unlock(p);
  1155 + spin_unlock(&swap_lock);
1165 1156 set_current_state(TASK_UNINTERRUPTIBLE);
1166 1157 schedule_timeout(1);
1167   - swap_device_lock(p);
  1158 + spin_lock(&swap_lock);
1168 1159 }
1169   - swap_device_unlock(p);
1170 1160  
1171   - destroy_swap_extents(p);
1172   - down(&swapon_sem);
1173   - swap_list_lock();
1174   - drain_mmlist();
1175   - swap_device_lock(p);
1176 1161 swap_file = p->swap_file;
1177 1162 p->swap_file = NULL;
1178 1163 p->max = 0;
1179 1164 swap_map = p->swap_map;
1180 1165 p->swap_map = NULL;
1181 1166 p->flags = 0;
1182   - swap_device_unlock(p);
1183   - swap_list_unlock();
  1167 + spin_unlock(&swap_lock);
1184 1168 up(&swapon_sem);
1185 1169 vfree(swap_map);
1186 1170 inode = mapping->host;
... ... @@ -1324,7 +1308,7 @@
1324 1308  
1325 1309 if (!capable(CAP_SYS_ADMIN))
1326 1310 return -EPERM;
1327   - swap_list_lock();
  1311 + spin_lock(&swap_lock);
1328 1312 p = swap_info;
1329 1313 for (type = 0 ; type < nr_swapfiles ; type++,p++)
1330 1314 if (!(p->flags & SWP_USED))
... ... @@ -1343,7 +1327,7 @@
1343 1327 * swp_entry_t or the architecture definition of a swap pte.
1344 1328 */
1345 1329 if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
1346   - swap_list_unlock();
  1330 + spin_unlock(&swap_lock);
1347 1331 goto out;
1348 1332 }
1349 1333 if (type >= nr_swapfiles)
... ... @@ -1357,7 +1341,6 @@
1357 1341 p->highest_bit = 0;
1358 1342 p->cluster_nr = 0;
1359 1343 p->inuse_pages = 0;
1360   - spin_lock_init(&p->sdev_lock);
1361 1344 p->next = -1;
1362 1345 if (swap_flags & SWAP_FLAG_PREFER) {
1363 1346 p->prio =
... ... @@ -1365,7 +1348,7 @@
1365 1348 } else {
1366 1349 p->prio = --least_priority;
1367 1350 }
1368   - swap_list_unlock();
  1351 + spin_unlock(&swap_lock);
1369 1352 name = getname(specialfile);
1370 1353 error = PTR_ERR(name);
1371 1354 if (IS_ERR(name)) {
... ... @@ -1542,8 +1525,7 @@
1542 1525 }
1543 1526  
1544 1527 down(&swapon_sem);
1545   - swap_list_lock();
1546   - swap_device_lock(p);
  1528 + spin_lock(&swap_lock);
1547 1529 p->flags = SWP_ACTIVE;
1548 1530 nr_swap_pages += nr_good_pages;
1549 1531 total_swap_pages += nr_good_pages;
... ... @@ -1567,8 +1549,7 @@
1567 1549 } else {
1568 1550 swap_info[prev].next = p - swap_info;
1569 1551 }
1570   - swap_device_unlock(p);
1571   - swap_list_unlock();
  1552 + spin_unlock(&swap_lock);
1572 1553 up(&swapon_sem);
1573 1554 error = 0;
1574 1555 goto out;
1575 1556  
... ... @@ -1579,14 +1560,14 @@
1579 1560 }
1580 1561 destroy_swap_extents(p);
1581 1562 bad_swap_2:
1582   - swap_list_lock();
  1563 + spin_lock(&swap_lock);
1583 1564 swap_map = p->swap_map;
1584 1565 p->swap_file = NULL;
1585 1566 p->swap_map = NULL;
1586 1567 p->flags = 0;
1587 1568 if (!(swap_flags & SWAP_FLAG_PREFER))
1588 1569 ++least_priority;
1589   - swap_list_unlock();
  1570 + spin_unlock(&swap_lock);
1590 1571 vfree(swap_map);
1591 1572 if (swap_file)
1592 1573 filp_close(swap_file, NULL);
... ... @@ -1610,7 +1591,7 @@
1610 1591 unsigned int i;
1611 1592 unsigned long nr_to_be_unused = 0;
1612 1593  
1613   - swap_list_lock();
  1594 + spin_lock(&swap_lock);
1614 1595 for (i = 0; i < nr_swapfiles; i++) {
1615 1596 if (!(swap_info[i].flags & SWP_USED) ||
1616 1597 (swap_info[i].flags & SWP_WRITEOK))
... ... @@ -1619,7 +1600,7 @@
1619 1600 }
1620 1601 val->freeswap = nr_swap_pages + nr_to_be_unused;
1621 1602 val->totalswap = total_swap_pages + nr_to_be_unused;
1622   - swap_list_unlock();
  1603 + spin_unlock(&swap_lock);
1623 1604 }
1624 1605  
1625 1606 /*
... ... @@ -1640,7 +1621,7 @@
1640 1621 p = type + swap_info;
1641 1622 offset = swp_offset(entry);
1642 1623  
1643   - swap_device_lock(p);
  1624 + spin_lock(&swap_lock);
1644 1625 if (offset < p->max && p->swap_map[offset]) {
1645 1626 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
1646 1627 p->swap_map[offset]++;
... ... @@ -1652,7 +1633,7 @@
1652 1633 result = 1;
1653 1634 }
1654 1635 }
1655   - swap_device_unlock(p);
  1636 + spin_unlock(&swap_lock);
1656 1637 out:
1657 1638 return result;
1658 1639  
... ... @@ -1668,7 +1649,7 @@
1668 1649 }
1669 1650  
1670 1651 /*
1671   - * swap_device_lock prevents swap_map being freed. Don't grab an extra
  1652 + * swap_lock prevents swap_map being freed. Don't grab an extra
1672 1653 * reference on the swaphandle, it doesn't matter if it becomes unused.
1673 1654 */
1674 1655 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
... ... @@ -1684,7 +1665,7 @@
1684 1665 toff++, i--;
1685 1666 *offset = toff;
1686 1667  
1687   - swap_device_lock(swapdev);
  1668 + spin_lock(&swap_lock);
1688 1669 do {
1689 1670 /* Don't read-ahead past the end of the swap area */
1690 1671 if (toff >= swapdev->max)
... ... @@ -1697,7 +1678,7 @@
1697 1678 toff++;
1698 1679 ret++;
1699 1680 } while (--i);
1700   - swap_device_unlock(swapdev);
  1681 + spin_unlock(&swap_lock);
1701 1682 return ret;
1702 1683 }