Commit 52b7efdbe5f5696fc80338560a3fc51e0b0a993c

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 7dfad4183b

[PATCH] swap: scan_swap_map drop swap_device_lock

get_swap_page has often shown up on latency traces, doing lengthy scans while
holding two spinlocks.  swap_list_lock is already dropped; now make
scan_swap_map drop swap_device_lock before scanning the swap_map.

While scanning for an empty cluster, don't worry that racing tasks may
allocate what was free and free what was allocated; but when allocating an
entry, check it's still free after retaking the lock.  Avoid dropping the lock
in the expected common path.  No barriers beyond the locks, just let the
cookie crumble; highest_bit limit is volatile, but benign.

Guard against swapoff: scan_swap_map must check SWP_WRITEOK before allocating,
and must raise the SWP_SCANNING reference count while it is running; swapoff
must wait for that count to fall.  For the wait, just use schedule_timeout: we
don't want to burden scan_swap_map itself, and it's very unlikely that anyone
can really still be in scan_swap_map once swapoff gets this far.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 37 additions and 7 deletions Side-by-side Diff

include/linux/swap.h
... ... @@ -107,6 +107,8 @@
107 107 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
108 108 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
109 109 SWP_ACTIVE = (SWP_USED | SWP_WRITEOK),
  110 + /* add others here before... */
  111 + SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
110 112 };
111 113  
112 114 #define SWAP_CLUSTER_MAX 32
mm/swapfile.c
... ... @@ -98,10 +98,12 @@
98 98 * But we do now try to find an empty cluster. -Andrea
99 99 */
100 100  
  101 + si->flags += SWP_SCANNING;
101 102 if (unlikely(!si->cluster_nr)) {
102 103 si->cluster_nr = SWAPFILE_CLUSTER - 1;
103 104 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
104 105 goto lowest;
  106 + swap_device_unlock(si);
105 107  
106 108 offset = si->lowest_bit;
107 109 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
108 110  
... ... @@ -111,10 +113,12 @@
111 113 if (si->swap_map[offset])
112 114 last_in_cluster = offset + SWAPFILE_CLUSTER;
113 115 else if (offset == last_in_cluster) {
  116 + swap_device_lock(si);
114 117 si->cluster_next = offset-SWAPFILE_CLUSTER-1;
115 118 goto cluster;
116 119 }
117 120 }
  121 + swap_device_lock(si);
118 122 goto lowest;
119 123 }
120 124  
121 125  
... ... @@ -123,10 +127,12 @@
123 127 offset = si->cluster_next;
124 128 if (offset > si->highest_bit)
125 129 lowest: offset = si->lowest_bit;
  130 +checks: if (!(si->flags & SWP_WRITEOK))
  131 + goto no_page;
126 132 if (!si->highest_bit)
127 133 goto no_page;
128 134 if (!si->swap_map[offset]) {
129   -got_page: if (offset == si->lowest_bit)
  135 + if (offset == si->lowest_bit)
130 136 si->lowest_bit++;
131 137 if (offset == si->highest_bit)
132 138 si->highest_bit--;
133 139  
134 140  
135 141  
136 142  
... ... @@ -137,16 +143,22 @@
137 143 }
138 144 si->swap_map[offset] = 1;
139 145 si->cluster_next = offset + 1;
  146 + si->flags -= SWP_SCANNING;
140 147 return offset;
141 148 }
142 149  
  150 + swap_device_unlock(si);
143 151 while (++offset <= si->highest_bit) {
144   - if (!si->swap_map[offset])
145   - goto got_page;
  152 + if (!si->swap_map[offset]) {
  153 + swap_device_lock(si);
  154 + goto checks;
  155 + }
146 156 }
  157 + swap_device_lock(si);
147 158 goto lowest;
148 159  
149 160 no_page:
  161 + si->flags -= SWP_SCANNING;
150 162 return 0;
151 163 }
152 164  
... ... @@ -1111,10 +1123,6 @@
1111 1123 err = try_to_unuse(type);
1112 1124 current->flags &= ~PF_SWAPOFF;
1113 1125  
1114   - /* wait for any unplug function to finish */
1115   - down_write(&swap_unplug_sem);
1116   - up_write(&swap_unplug_sem);
1117   -
1118 1126 if (err) {
1119 1127 /* re-insert swap space back into swap_list */
1120 1128 swap_list_lock();
1121 1129  
1122 1130  
... ... @@ -1128,10 +1136,28 @@
1128 1136 swap_info[prev].next = p - swap_info;
1129 1137 nr_swap_pages += p->pages;
1130 1138 total_swap_pages += p->pages;
  1139 + swap_device_lock(p);
1131 1140 p->flags |= SWP_WRITEOK;
  1141 + swap_device_unlock(p);
1132 1142 swap_list_unlock();
1133 1143 goto out_dput;
1134 1144 }
  1145 +
  1146 + /* wait for any unplug function to finish */
  1147 + down_write(&swap_unplug_sem);
  1148 + up_write(&swap_unplug_sem);
  1149 +
  1150 + /* wait for anyone still in scan_swap_map */
  1151 + swap_device_lock(p);
  1152 + p->highest_bit = 0; /* cuts scans short */
  1153 + while (p->flags >= SWP_SCANNING) {
  1154 + swap_device_unlock(p);
  1155 + set_current_state(TASK_UNINTERRUPTIBLE);
  1156 + schedule_timeout(1);
  1157 + swap_device_lock(p);
  1158 + }
  1159 + swap_device_unlock(p);
  1160 +
1135 1161 destroy_swap_extents(p);
1136 1162 down(&swapon_sem);
1137 1163 swap_list_lock();
... ... @@ -1431,6 +1457,8 @@
1431 1457 }
1432 1458  
1433 1459 p->lowest_bit = 1;
  1460 + p->cluster_next = 1;
  1461 +
1434 1462 /*
1435 1463 * Find out how many pages are allowed for a single swap
1436 1464 * device. There are two limiting factors: 1) the number of