Commit: c2a4ffb70eef396148234c1228824008501ab8d2
Committed by: Pablo Neira Ayuso
1 parent: 8f3d0023b9
ipvs: convert lblc scheduler to rcu
The schedule method now needs the _rcu list-traversal primitive for
svc->destinations. The read_lock for sched_lock is removed. Use a dead flag
to prevent new entries from being created while the scheduler is reclaimed.
Use hlist for the hash table.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
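For readers unfamiliar with the pattern, below is a minimal, self-contained sketch of what the patch does, under hypothetical demo_* names (this is not the scheduler's actual code): lockless RCU hlist traversal on the read side, and a dead flag set under the writer lock so a late scheduling call cannot insert into a table that is about to be reclaimed. Readers are assumed to already be inside an rcu_read_lock() section, as IPVS schedulers are.

/*
 * Sketch of the RCU conversion pattern (hypothetical demo_* names).
 * A rwlock stands in for svc->sched_lock on the write side.
 */
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define DEMO_TAB_SIZE 16

struct demo_entry {
        struct hlist_node list;
        u32 key;
        struct rcu_head rcu_head;
};

struct demo_table {
        struct hlist_head bucket[DEMO_TAB_SIZE];
        bool dead;              /* set once the table is being reclaimed */
};

static DEFINE_RWLOCK(demo_lock);

/* Read side: RCU traversal, no read_lock (caller holds rcu_read_lock) */
static struct demo_entry *demo_get(struct demo_table *tbl, u32 key)
{
        struct demo_entry *en;

        hlist_for_each_entry_rcu(en, &tbl->bucket[key % DEMO_TAB_SIZE], list)
                if (en->key == key)
                        return en;
        return NULL;
}

/* Write side: refuse new entries once the table is marked dead */
static bool demo_add(struct demo_table *tbl, struct demo_entry *en)
{
        bool added = false;

        write_lock_bh(&demo_lock);
        if (!tbl->dead) {
                hlist_add_head_rcu(&en->list,
                                   &tbl->bucket[en->key % DEMO_TAB_SIZE]);
                added = true;
        }
        write_unlock_bh(&demo_lock);
        return added;
}

/* Reclaim: mark dead, unlink entries, free them after a grace period */
static void demo_flush(struct demo_table *tbl)
{
        struct demo_entry *en;
        struct hlist_node *next;
        int i;

        write_lock_bh(&demo_lock);
        tbl->dead = true;
        for (i = 0; i < DEMO_TAB_SIZE; i++) {
                hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
                        hlist_del_rcu(&en->list);
                        kfree_rcu(en, rcu_head); /* deferred past grace period */
                }
        }
        write_unlock_bh(&demo_lock);
}

kfree_rcu() defers the actual free until all pre-existing RCU readers have finished, which is why demo_get() can safely walk the bucket without taking demo_lock. The patch below applies exactly this shape to the lblc hash table.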
1 changed file with 55 additions and 41 deletions
net/netfilter/ipvs/ip_vs_lblc.c
@@ -90,11 +90,12 @@
  * IP address and its destination server
  */
 struct ip_vs_lblc_entry {
-        struct list_head list;
+        struct hlist_node list;
         int af;                         /* address family */
         union nf_inet_addr addr;        /* destination IP address */
-        struct ip_vs_dest *dest;        /* real server (cache) */
+        struct ip_vs_dest __rcu *dest;  /* real server (cache) */
         unsigned long lastuse;          /* last used time */
+        struct rcu_head rcu_head;
 };


@@ -102,12 +103,14 @@
  * IPVS lblc hash table
  */
 struct ip_vs_lblc_table {
-        struct list_head bucket[IP_VS_LBLC_TAB_SIZE];   /* hash bucket */
+        struct rcu_head rcu_head;
+        struct hlist_head __rcu bucket[IP_VS_LBLC_TAB_SIZE];   /* hash bucket */
+        struct timer_list periodic_timer;       /* collect stale entries */
         atomic_t entries;               /* number of entries */
         int max_size;                   /* maximum size of entries */
-        struct timer_list periodic_timer;       /* collect stale entries */
         int rover;                      /* rover for expire check */
         int counter;                    /* counter for no expire */
+        bool dead;
 };


@@ -129,13 +132,16 @@

 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
 {
-        list_del(&en->list);
+        struct ip_vs_dest *dest;
+
+        hlist_del_rcu(&en->list);
         /*
          * We don't kfree dest because it is referred either by its service
          * or the trash dest list.
          */
-        atomic_dec(&en->dest->refcnt);
-        kfree(en);
+        dest = rcu_dereference_protected(en->dest, 1);
+        ip_vs_dest_put(dest);
+        kfree_rcu(en, rcu_head);
 }


@@ -165,15 +171,12 @@
 {
         unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);

-        list_add(&en->list, &tbl->bucket[hash]);
+        hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
         atomic_inc(&tbl->entries);
 }


-/*
- * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- * lock
- */
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
 static inline struct ip_vs_lblc_entry *
 ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
                const union nf_inet_addr *addr)
@@ -181,7 +184,7 @@
         unsigned int hash = ip_vs_lblc_hashkey(af, addr);
         struct ip_vs_lblc_entry *en;

-        list_for_each_entry(en, &tbl->bucket[hash], list)
+        hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
                 if (ip_vs_addr_equal(af, &en->addr, addr))
                         return en;

@@ -209,14 +212,20 @@
                 ip_vs_addr_copy(dest->af, &en->addr, daddr);
                 en->lastuse = jiffies;

-                atomic_inc(&dest->refcnt);
-                en->dest = dest;
+                ip_vs_dest_hold(dest);
+                RCU_INIT_POINTER(en->dest, dest);

                 ip_vs_lblc_hash(tbl, en);
-        } else if (en->dest != dest) {
-                atomic_dec(&en->dest->refcnt);
-                atomic_inc(&dest->refcnt);
-                en->dest = dest;
+        } else {
+                struct ip_vs_dest *old_dest;
+
+                old_dest = rcu_dereference_protected(en->dest, 1);
+                if (old_dest != dest) {
+                        ip_vs_dest_put(old_dest);
+                        ip_vs_dest_hold(dest);
+                        /* No ordering constraints for refcnt */
+                        RCU_INIT_POINTER(en->dest, dest);
+                }
         }

         return en;
@@ -226,17 +235,22 @@
 /*
  * Flush all the entries of the specified table.
  */
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
 {
-        struct ip_vs_lblc_entry *en, *nxt;
+        struct ip_vs_lblc_table *tbl = svc->sched_data;
+        struct ip_vs_lblc_entry *en;
+        struct hlist_node *next;
         int i;

+        write_lock_bh(&svc->sched_lock);
+        tbl->dead = 1;
         for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-                list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+                hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
                         ip_vs_lblc_free(en);
                         atomic_dec(&tbl->entries);
                 }
         }
+        write_unlock_bh(&svc->sched_lock);
 }

 static int sysctl_lblc_expiration(struct ip_vs_service *svc)
@@ -252,7 +266,8 @@
 static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
 {
         struct ip_vs_lblc_table *tbl = svc->sched_data;
-        struct ip_vs_lblc_entry *en, *nxt;
+        struct ip_vs_lblc_entry *en;
+        struct hlist_node *next;
         unsigned long now = jiffies;
         int i, j;

@@ -260,7 +275,7 @@
                 j = (j + 1) & IP_VS_LBLC_TAB_MASK;

                 write_lock(&svc->sched_lock);
-                list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+                hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
                         if (time_before(now,
                                         en->lastuse +
                                         sysctl_lblc_expiration(svc)))
@@ -293,7 +308,8 @@
         unsigned long now = jiffies;
         int goal;
         int i, j;
-        struct ip_vs_lblc_entry *en, *nxt;
+        struct ip_vs_lblc_entry *en;
+        struct hlist_node *next;

         if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
                 /* do full expiration check */
@@ -315,7 +331,7 @@
                 j = (j + 1) & IP_VS_LBLC_TAB_MASK;

                 write_lock(&svc->sched_lock);
-                list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+                hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
                         if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
                                 continue;

@@ -354,11 +370,12 @@
          * Initialize the hash buckets
          */
         for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-                INIT_LIST_HEAD(&tbl->bucket[i]);
+                INIT_HLIST_HEAD(&tbl->bucket[i]);
         }
         tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
         tbl->rover = 0;
         tbl->counter = 1;
+        tbl->dead = 0;

         /*
          * Hook periodic timer for garbage collection
@@ -379,10 +396,10 @@
         del_timer_sync(&tbl->periodic_timer);

         /* got to clean up table entries here */
-        ip_vs_lblc_flush(tbl);
+        ip_vs_lblc_flush(svc);

         /* release the table itself */
-        kfree(tbl);
+        kfree_rcu(tbl, rcu_head);
         IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
                   sizeof(*tbl));

@@ -408,7 +425,7 @@
          * The server with weight=0 is quiesced and will not receive any
          * new connection.
          */
-        list_for_each_entry(dest, &svc->destinations, n_list) {
+        list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
                 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
                         continue;
                 if (atomic_read(&dest->weight) > 0) {
@@ -423,7 +440,7 @@
          * Find the destination with the least load.
          */
   nextstage:
-        list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+        list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
                 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
                         continue;

@@ -457,7 +474,7 @@
         if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
                 struct ip_vs_dest *d;

-                list_for_each_entry(d, &svc->destinations, n_list) {
+                list_for_each_entry_rcu(d, &svc->destinations, n_list) {
                         if (atomic_read(&d->activeconns)*2
                             < atomic_read(&d->weight)) {
                                 return 1;
@@ -484,7 +501,6 @@
         IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

         /* First look in our cache */
-        read_lock(&svc->sched_lock);
         en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
         if (en) {
                 /* We only hold a read lock, but this is atomic */

@@ -499,15 +515,12 @@
                  * free up entries from the trash at any time.
                  */

-                if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
-                        dest = en->dest;
+                dest = rcu_dereference(en->dest);
+                if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+                    atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+                        goto out;
         }
-        read_unlock(&svc->sched_lock);

-        /* If the destination has a weight and is not overloaded, use it */
-        if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
-                goto out;
-
         /* No cache entry or it is invalid, time to schedule */
         dest = __ip_vs_lblc_schedule(svc);
         if (!dest) {
@@ -517,7 +530,8 @@

         /* If we fail to create a cache entry, we'll just use the valid dest */
         write_lock(&svc->sched_lock);
-        ip_vs_lblc_new(tbl, &iph.daddr, dest);
+        if (!tbl->dead)
+                ip_vs_lblc_new(tbl, &iph.daddr, dest);
         write_unlock(&svc->sched_lock);

 out: