Commit c2a4ffb70eef396148234c1228824008501ab8d2

Authored by Julian Anastasov
Committed by Pablo Neira Ayuso
1 parent 8f3d0023b9

ipvs: convert lblc scheduler to rcu

The schedule method now needs the _rcu list-traversal
primitive for svc->destinations. The read_lock for sched_lock is
removed. Use a dead flag to prevent new entries from being created
while the scheduler is being reclaimed. Use hlist for the hash table.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
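
The pattern described above, in isolation: readers walk the per-bucket
hlist chains under RCU only, writers still take sched_lock and check a
dead flag so that no new entry is hashed into a table that is about to
be freed, and removed entries are reclaimed with kfree_rcu() after a
grace period. The sketch below is a minimal illustration of that
pattern, not code from ip_vs_lblc.c: the demo_* names are made up, and
initialization of the lock and buckets is omitted.

/* Minimal sketch of the RCU + dead-flag pattern (illustrative names only). */
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

#define DEMO_TAB_SIZE 16

struct demo_entry {
	struct hlist_node	list;
	unsigned long		key;
	struct rcu_head		rcu_head;
};

struct demo_table {
	struct hlist_head	bucket[DEMO_TAB_SIZE];	/* INIT_HLIST_HEAD()'d elsewhere */
	rwlock_t		lock;			/* rwlock_init()'d elsewhere */
	bool			dead;			/* set once, before the table is freed */
};

/* Reader: caller must be in an RCU read-side critical section; no lock taken. */
static struct demo_entry *demo_get(struct demo_table *tbl, unsigned long key)
{
	struct demo_entry *en;

	hlist_for_each_entry_rcu(en, &tbl->bucket[key % DEMO_TAB_SIZE], list)
		if (en->key == key)
			return en;
	return NULL;
}

/* Writer: take the lock and refuse to add once the table is dying. */
static bool demo_add(struct demo_table *tbl, struct demo_entry *en)
{
	bool added = false;

	write_lock_bh(&tbl->lock);
	if (!tbl->dead) {
		hlist_add_head_rcu(&en->list, &tbl->bucket[en->key % DEMO_TAB_SIZE]);
		added = true;
	}
	write_unlock_bh(&tbl->lock);
	return added;
}

/* Teardown: mark the table dead, unlink entries, free them after a grace period. */
static void demo_flush(struct demo_table *tbl)
{
	struct demo_entry *en;
	struct hlist_node *next;
	int i;

	write_lock_bh(&tbl->lock);
	tbl->dead = true;
	for (i = 0; i < DEMO_TAB_SIZE; i++) {
		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
			hlist_del_rcu(&en->list);
			kfree_rcu(en, rcu_head);
		}
	}
	write_unlock_bh(&tbl->lock);
}

In the actual commit the same roles are played by ip_vs_lblc_get(),
ip_vs_lblc_new() and ip_vs_lblc_flush() in the diff below.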

Showing 1 changed file with 55 additions and 41 deletions

net/netfilter/ipvs/ip_vs_lblc.c
... ... @@ -90,11 +90,12 @@
90 90 * IP address and its destination server
91 91 */
92 92 struct ip_vs_lblc_entry {
93   - struct list_head list;
  93 + struct hlist_node list;
94 94 int af; /* address family */
95 95 union nf_inet_addr addr; /* destination IP address */
96   - struct ip_vs_dest *dest; /* real server (cache) */
  96 + struct ip_vs_dest __rcu *dest; /* real server (cache) */
97 97 unsigned long lastuse; /* last used time */
  98 + struct rcu_head rcu_head;
98 99 };
99 100  
100 101  
101 102  
102 103  
... ... @@ -102,12 +103,14 @@
102 103 * IPVS lblc hash table
103 104 */
104 105 struct ip_vs_lblc_table {
105   - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
  106 + struct rcu_head rcu_head;
  107 + struct hlist_head __rcu bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
  108 + struct timer_list periodic_timer; /* collect stale entries */
106 109 atomic_t entries; /* number of entries */
107 110 int max_size; /* maximum size of entries */
108   - struct timer_list periodic_timer; /* collect stale entries */
109 111 int rover; /* rover for expire check */
110 112 int counter; /* counter for no expire */
  113 + bool dead;
111 114 };
112 115  
113 116  
114 117  
... ... @@ -129,13 +132,16 @@
129 132  
130 133 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
131 134 {
132   - list_del(&en->list);
  135 + struct ip_vs_dest *dest;
  136 +
  137 + hlist_del_rcu(&en->list);
133 138 /*
134 139 * We don't kfree dest because it is referred either by its service
135 140 * or the trash dest list.
136 141 */
137   - atomic_dec(&en->dest->refcnt);
138   - kfree(en);
  142 + dest = rcu_dereference_protected(en->dest, 1);
  143 + ip_vs_dest_put(dest);
  144 + kfree_rcu(en, rcu_head);
139 145 }
140 146  
141 147  
142 148  
... ... @@ -165,15 +171,12 @@
165 171 {
166 172 unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
167 173  
168   - list_add(&en->list, &tbl->bucket[hash]);
  174 + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
169 175 atomic_inc(&tbl->entries);
170 176 }
171 177  
172 178  
173   -/*
174   - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
175   - * lock
176   - */
  179 +/* Get ip_vs_lblc_entry associated with supplied parameters. */
177 180 static inline struct ip_vs_lblc_entry *
178 181 ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
179 182 const union nf_inet_addr *addr)
... ... @@ -181,7 +184,7 @@
181 184 unsigned int hash = ip_vs_lblc_hashkey(af, addr);
182 185 struct ip_vs_lblc_entry *en;
183 186  
184   - list_for_each_entry(en, &tbl->bucket[hash], list)
  187 + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
185 188 if (ip_vs_addr_equal(af, &en->addr, addr))
186 189 return en;
187 190  
188 191  
... ... @@ -209,14 +212,20 @@
209 212 ip_vs_addr_copy(dest->af, &en->addr, daddr);
210 213 en->lastuse = jiffies;
211 214  
212   - atomic_inc(&dest->refcnt);
213   - en->dest = dest;
  215 + ip_vs_dest_hold(dest);
  216 + RCU_INIT_POINTER(en->dest, dest);
214 217  
215 218 ip_vs_lblc_hash(tbl, en);
216   - } else if (en->dest != dest) {
217   - atomic_dec(&en->dest->refcnt);
218   - atomic_inc(&dest->refcnt);
219   - en->dest = dest;
  219 + } else {
  220 + struct ip_vs_dest *old_dest;
  221 +
  222 + old_dest = rcu_dereference_protected(en->dest, 1);
  223 + if (old_dest != dest) {
  224 + ip_vs_dest_put(old_dest);
  225 + ip_vs_dest_hold(dest);
  226 + /* No ordering constraints for refcnt */
  227 + RCU_INIT_POINTER(en->dest, dest);
  228 + }
220 229 }
221 230  
222 231 return en;
223 232  
224 233  
225 234  
226 235  
... ... @@ -226,17 +235,22 @@
226 235 /*
227 236 * Flush all the entries of the specified table.
228 237 */
229   -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
  238 +static void ip_vs_lblc_flush(struct ip_vs_service *svc)
230 239 {
231   - struct ip_vs_lblc_entry *en, *nxt;
  240 + struct ip_vs_lblc_table *tbl = svc->sched_data;
  241 + struct ip_vs_lblc_entry *en;
  242 + struct hlist_node *next;
232 243 int i;
233 244  
  245 + write_lock_bh(&svc->sched_lock);
  246 + tbl->dead = 1;
234 247 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
235   - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
  248 + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
236 249 ip_vs_lblc_free(en);
237 250 atomic_dec(&tbl->entries);
238 251 }
239 252 }
  253 + write_unlock_bh(&svc->sched_lock);
240 254 }
241 255  
242 256 static int sysctl_lblc_expiration(struct ip_vs_service *svc)
... ... @@ -252,7 +266,8 @@
252 266 static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
253 267 {
254 268 struct ip_vs_lblc_table *tbl = svc->sched_data;
255   - struct ip_vs_lblc_entry *en, *nxt;
  269 + struct ip_vs_lblc_entry *en;
  270 + struct hlist_node *next;
256 271 unsigned long now = jiffies;
257 272 int i, j;
258 273  
... ... @@ -260,7 +275,7 @@
260 275 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
261 276  
262 277 write_lock(&svc->sched_lock);
263   - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
  278 + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
264 279 if (time_before(now,
265 280 en->lastuse +
266 281 sysctl_lblc_expiration(svc)))
... ... @@ -293,7 +308,8 @@
293 308 unsigned long now = jiffies;
294 309 int goal;
295 310 int i, j;
296   - struct ip_vs_lblc_entry *en, *nxt;
  311 + struct ip_vs_lblc_entry *en;
  312 + struct hlist_node *next;
297 313  
298 314 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
299 315 /* do full expiration check */
... ... @@ -315,7 +331,7 @@
315 331 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
316 332  
317 333 write_lock(&svc->sched_lock);
318   - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
  334 + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
319 335 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
320 336 continue;
321 337  
322 338  
... ... @@ -354,11 +370,12 @@
354 370 * Initialize the hash buckets
355 371 */
356 372 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
357   - INIT_LIST_HEAD(&tbl->bucket[i]);
  373 + INIT_HLIST_HEAD(&tbl->bucket[i]);
358 374 }
359 375 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
360 376 tbl->rover = 0;
361 377 tbl->counter = 1;
  378 + tbl->dead = 0;
362 379  
363 380 /*
364 381 * Hook periodic timer for garbage collection
365 382  
... ... @@ -379,10 +396,10 @@
379 396 del_timer_sync(&tbl->periodic_timer);
380 397  
381 398 /* got to clean up table entries here */
382   - ip_vs_lblc_flush(tbl);
  399 + ip_vs_lblc_flush(svc);
383 400  
384 401 /* release the table itself */
385   - kfree(tbl);
  402 + kfree_rcu(tbl, rcu_head);
386 403 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
387 404 sizeof(*tbl));
388 405  
... ... @@ -408,7 +425,7 @@
408 425 * The server with weight=0 is quiesced and will not receive any
409 426 * new connection.
410 427 */
411   - list_for_each_entry(dest, &svc->destinations, n_list) {
  428 + list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
412 429 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
413 430 continue;
414 431 if (atomic_read(&dest->weight) > 0) {
... ... @@ -423,7 +440,7 @@
423 440 * Find the destination with the least load.
424 441 */
425 442 nextstage:
426   - list_for_each_entry_continue(dest, &svc->destinations, n_list) {
  443 + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
427 444 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
428 445 continue;
429 446  
... ... @@ -457,7 +474,7 @@
457 474 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
458 475 struct ip_vs_dest *d;
459 476  
460   - list_for_each_entry(d, &svc->destinations, n_list) {
  477 + list_for_each_entry_rcu(d, &svc->destinations, n_list) {
461 478 if (atomic_read(&d->activeconns)*2
462 479 < atomic_read(&d->weight)) {
463 480 return 1;
... ... @@ -484,7 +501,6 @@
484 501 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
485 502  
486 503 /* First look in our cache */
487   - read_lock(&svc->sched_lock);
488 504 en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
489 505 if (en) {
490 506 /* We only hold a read lock, but this is atomic */
491 507  
492 508  
... ... @@ -499,15 +515,12 @@
499 515 * free up entries from the trash at any time.
500 516 */
501 517  
502   - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
503   - dest = en->dest;
  518 + dest = rcu_dereference(en->dest);
  519 + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
  520 + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
  521 + goto out;
504 522 }
505   - read_unlock(&svc->sched_lock);
506 523  
507   - /* If the destination has a weight and is not overloaded, use it */
508   - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
509   - goto out;
510   -
511 524 /* No cache entry or it is invalid, time to schedule */
512 525 dest = __ip_vs_lblc_schedule(svc);
513 526 if (!dest) {
... ... @@ -517,7 +530,8 @@
517 530  
518 531 /* If we fail to create a cache entry, we'll just use the valid dest */
519 532 write_lock(&svc->sched_lock);
520   - ip_vs_lblc_new(tbl, &iph.daddr, dest);
  533 + if (!tbl->dead)
  534 + ip_vs_lblc_new(tbl, &iph.daddr, dest);
521 535 write_unlock(&svc->sched_lock);
522 536  
523 537 out: