Commit c2a4ffb70eef396148234c1228824008501ab8d2

Authored by Julian Anastasov
Committed by Pablo Neira Ayuso
1 parent 8f3d0023b9

ipvs: convert lblc scheduler to rcu

The schedule method now needs the _rcu list-traversal
primitives for svc->destinations. The read_lock on sched_lock is
removed. Use a dead flag to prevent new entries from being created
while the scheduler is being reclaimed. Use hlist for the hash table.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
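
A condensed sketch of the pattern the patch moves to (not part of the commit; the *_example names are hypothetical stand-ins): readers walk the hash buckets with the hlist _rcu primitives and take no read_lock, while the writer, still serialized by svc->sched_lock, checks a dead flag so nothing is hashed into a table that has already been flushed for destruction.

    #include <linux/rculist.h>
    #include <linux/types.h>

    struct entry_example {                 /* stands in for ip_vs_lblc_entry */
    	struct hlist_node list;
    	u32 key;
    };

    struct tbl_example {                   /* stands in for ip_vs_lblc_table */
    	struct hlist_head bucket[16];
    	bool dead;                     /* set under the write lock at flush time */
    };

    /* Read side: runs under rcu_read_lock(), no sched_lock taken. */
    static struct entry_example *lookup_example(struct tbl_example *tbl,
    					    unsigned int hash, u32 key)
    {
    	struct entry_example *en;

    	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
    		if (en->key == key)
    			return en;
    	return NULL;
    }

    /* Write side: caller holds the write lock (sched_lock in the patch). */
    static void insert_example(struct tbl_example *tbl, struct entry_example *en,
    			   unsigned int hash)
    {
    	if (tbl->dead)                 /* table already flushed, don't repopulate */
    		return;
    	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
    }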

Showing 1 changed file with 55 additions and 41 deletions

net/netfilter/ipvs/ip_vs_lblc.c
1 /* 1 /*
2 * IPVS: Locality-Based Least-Connection scheduling module 2 * IPVS: Locality-Based Least-Connection scheduling module
3 * 3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org> 4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License 7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 * 10 *
11 * Changes: 11 * Changes:
12 * Martin Hamilton : fixed the terrible locking bugs 12 * Martin Hamilton : fixed the terrible locking bugs
13 * *lock(tbl->lock) ==> *lock(&tbl->lock) 13 * *lock(tbl->lock) ==> *lock(&tbl->lock)
14 * Wensong Zhang : fixed the uninitialized tbl->lock bug 14 * Wensong Zhang : fixed the uninitialized tbl->lock bug
15 * Wensong Zhang : added doing full expiration check to 15 * Wensong Zhang : added doing full expiration check to
16 * collect stale entries of 24+ hours when 16 * collect stale entries of 24+ hours when
17 * no partial expire check in a half hour 17 * no partial expire check in a half hour
18 * Julian Anastasov : replaced del_timer call with del_timer_sync 18 * Julian Anastasov : replaced del_timer call with del_timer_sync
19 * to avoid the possible race between timer 19 * to avoid the possible race between timer
20 * handler and del_timer thread in SMP 20 * handler and del_timer thread in SMP
21 * 21 *
22 */ 22 */
23 23
24 /* 24 /*
25 * The lblc algorithm is as follows (pseudo code): 25 * The lblc algorithm is as follows (pseudo code):
26 * 26 *
27 * if cachenode[dest_ip] is null then 27 * if cachenode[dest_ip] is null then
28 * n, cachenode[dest_ip] <- {weighted least-conn node}; 28 * n, cachenode[dest_ip] <- {weighted least-conn node};
29 * else 29 * else
30 * n <- cachenode[dest_ip]; 30 * n <- cachenode[dest_ip];
31 * if (n is dead) OR 31 * if (n is dead) OR
32 * (n.conns>n.weight AND 32 * (n.conns>n.weight AND
33 * there is a node m with m.conns<m.weight/2) then 33 * there is a node m with m.conns<m.weight/2) then
34 * n, cachenode[dest_ip] <- {weighted least-conn node}; 34 * n, cachenode[dest_ip] <- {weighted least-conn node};
35 * 35 *
36 * return n; 36 * return n;
37 * 37 *
38 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing 38 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
39 * me to write this module. 39 * me to write this module.
40 */ 40 */
41 41
42 #define KMSG_COMPONENT "IPVS" 42 #define KMSG_COMPONENT "IPVS"
43 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 43 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
44 44
45 #include <linux/ip.h> 45 #include <linux/ip.h>
46 #include <linux/slab.h> 46 #include <linux/slab.h>
47 #include <linux/module.h> 47 #include <linux/module.h>
48 #include <linux/kernel.h> 48 #include <linux/kernel.h>
49 #include <linux/skbuff.h> 49 #include <linux/skbuff.h>
50 #include <linux/jiffies.h> 50 #include <linux/jiffies.h>
51 51
52 /* for sysctl */ 52 /* for sysctl */
53 #include <linux/fs.h> 53 #include <linux/fs.h>
54 #include <linux/sysctl.h> 54 #include <linux/sysctl.h>
55 55
56 #include <net/ip_vs.h> 56 #include <net/ip_vs.h>
57 57
58 58
59 /* 59 /*
60 * It is for garbage collection of stale IPVS lblc entries, 60 * It is for garbage collection of stale IPVS lblc entries,
61 * when the table is full. 61 * when the table is full.
62 */ 62 */
63 #define CHECK_EXPIRE_INTERVAL (60*HZ) 63 #define CHECK_EXPIRE_INTERVAL (60*HZ)
64 #define ENTRY_TIMEOUT (6*60*HZ) 64 #define ENTRY_TIMEOUT (6*60*HZ)
65 65
66 #define DEFAULT_EXPIRATION (24*60*60*HZ) 66 #define DEFAULT_EXPIRATION (24*60*60*HZ)
67 67
68 /* 68 /*
69 * It is for full expiration check. 69 * It is for full expiration check.
70 * When there is no partial expiration check (garbage collection) 70 * When there is no partial expiration check (garbage collection)
71 * in a half hour, do a full expiration check to collect stale 71 * in a half hour, do a full expiration check to collect stale
72 * entries that haven't been touched for a day. 72 * entries that haven't been touched for a day.
73 */ 73 */
74 #define COUNT_FOR_FULL_EXPIRATION 30 74 #define COUNT_FOR_FULL_EXPIRATION 30
75 75
76 76
77 /* 77 /*
78 * for IPVS lblc entry hash table 78 * for IPVS lblc entry hash table
79 */ 79 */
80 #ifndef CONFIG_IP_VS_LBLC_TAB_BITS 80 #ifndef CONFIG_IP_VS_LBLC_TAB_BITS
81 #define CONFIG_IP_VS_LBLC_TAB_BITS 10 81 #define CONFIG_IP_VS_LBLC_TAB_BITS 10
82 #endif 82 #endif
83 #define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS 83 #define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
84 #define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) 84 #define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
85 #define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) 85 #define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
86 86
87 87
88 /* 88 /*
89 * IPVS lblc entry represents an association between destination 89 * IPVS lblc entry represents an association between destination
90 * IP address and its destination server 90 * IP address and its destination server
91 */ 91 */
92 struct ip_vs_lblc_entry { 92 struct ip_vs_lblc_entry {
93 struct list_head list; 93 struct hlist_node list;
94 int af; /* address family */ 94 int af; /* address family */
95 union nf_inet_addr addr; /* destination IP address */ 95 union nf_inet_addr addr; /* destination IP address */
96 struct ip_vs_dest *dest; /* real server (cache) */ 96 struct ip_vs_dest __rcu *dest; /* real server (cache) */
97 unsigned long lastuse; /* last used time */ 97 unsigned long lastuse; /* last used time */
98 struct rcu_head rcu_head;
98 }; 99 };
99 100
100 101
101 /* 102 /*
102 * IPVS lblc hash table 103 * IPVS lblc hash table
103 */ 104 */
104 struct ip_vs_lblc_table { 105 struct ip_vs_lblc_table {
105 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ 106 struct rcu_head rcu_head;
107 struct hlist_head __rcu bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
108 struct timer_list periodic_timer; /* collect stale entries */
106 atomic_t entries; /* number of entries */ 109 atomic_t entries; /* number of entries */
107 int max_size; /* maximum size of entries */ 110 int max_size; /* maximum size of entries */
108 struct timer_list periodic_timer; /* collect stale entries */
109 int rover; /* rover for expire check */ 111 int rover; /* rover for expire check */
110 int counter; /* counter for no expire */ 112 int counter; /* counter for no expire */
113 bool dead;
111 }; 114 };
112 115
113 116
114 /* 117 /*
115 * IPVS LBLC sysctl table 118 * IPVS LBLC sysctl table
116 */ 119 */
117 #ifdef CONFIG_SYSCTL 120 #ifdef CONFIG_SYSCTL
118 static ctl_table vs_vars_table[] = { 121 static ctl_table vs_vars_table[] = {
119 { 122 {
120 .procname = "lblc_expiration", 123 .procname = "lblc_expiration",
121 .data = NULL, 124 .data = NULL,
122 .maxlen = sizeof(int), 125 .maxlen = sizeof(int),
123 .mode = 0644, 126 .mode = 0644,
124 .proc_handler = proc_dointvec_jiffies, 127 .proc_handler = proc_dointvec_jiffies,
125 }, 128 },
126 { } 129 { }
127 }; 130 };
128 #endif 131 #endif
129 132
130 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 133 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
131 { 134 {
132 list_del(&en->list); 135 struct ip_vs_dest *dest;
136
137 hlist_del_rcu(&en->list);
133 /* 138 /*
134 * We don't kfree dest because it is referred either by its service 139 * We don't kfree dest because it is referred either by its service
135 * or the trash dest list. 140 * or the trash dest list.
136 */ 141 */
137 atomic_dec(&en->dest->refcnt); 142 dest = rcu_dereference_protected(en->dest, 1);
138 kfree(en); 143 ip_vs_dest_put(dest);
144 kfree_rcu(en, rcu_head);
139 } 145 }
140 146
141 147
142 /* 148 /*
143 * Returns hash value for IPVS LBLC entry 149 * Returns hash value for IPVS LBLC entry
144 */ 150 */
145 static inline unsigned int 151 static inline unsigned int
146 ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) 152 ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
147 { 153 {
148 __be32 addr_fold = addr->ip; 154 __be32 addr_fold = addr->ip;
149 155
150 #ifdef CONFIG_IP_VS_IPV6 156 #ifdef CONFIG_IP_VS_IPV6
151 if (af == AF_INET6) 157 if (af == AF_INET6)
152 addr_fold = addr->ip6[0]^addr->ip6[1]^ 158 addr_fold = addr->ip6[0]^addr->ip6[1]^
153 addr->ip6[2]^addr->ip6[3]; 159 addr->ip6[2]^addr->ip6[3];
154 #endif 160 #endif
155 return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; 161 return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
156 } 162 }
157 163
158 164
159 /* 165 /*
160 * Hash an entry in the ip_vs_lblc_table. 166 * Hash an entry in the ip_vs_lblc_table.
161 * returns bool success. 167 * returns bool success.
162 */ 168 */
163 static void 169 static void
164 ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) 170 ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
165 { 171 {
166 unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); 172 unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
167 173
168 list_add(&en->list, &tbl->bucket[hash]); 174 hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
169 atomic_inc(&tbl->entries); 175 atomic_inc(&tbl->entries);
170 } 176 }
171 177
172 178
173 /* 179 /* Get ip_vs_lblc_entry associated with supplied parameters. */
174 * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
175 * lock
176 */
177 static inline struct ip_vs_lblc_entry * 180 static inline struct ip_vs_lblc_entry *
178 ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, 181 ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
179 const union nf_inet_addr *addr) 182 const union nf_inet_addr *addr)
180 { 183 {
181 unsigned int hash = ip_vs_lblc_hashkey(af, addr); 184 unsigned int hash = ip_vs_lblc_hashkey(af, addr);
182 struct ip_vs_lblc_entry *en; 185 struct ip_vs_lblc_entry *en;
183 186
184 list_for_each_entry(en, &tbl->bucket[hash], list) 187 hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
185 if (ip_vs_addr_equal(af, &en->addr, addr)) 188 if (ip_vs_addr_equal(af, &en->addr, addr))
186 return en; 189 return en;
187 190
188 return NULL; 191 return NULL;
189 } 192 }
190 193
191 194
192 /* 195 /*
193 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP 196 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
194 * address to a server. Called under write lock. 197 * address to a server. Called under write lock.
195 */ 198 */
196 static inline struct ip_vs_lblc_entry * 199 static inline struct ip_vs_lblc_entry *
197 ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, 200 ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
198 struct ip_vs_dest *dest) 201 struct ip_vs_dest *dest)
199 { 202 {
200 struct ip_vs_lblc_entry *en; 203 struct ip_vs_lblc_entry *en;
201 204
202 en = ip_vs_lblc_get(dest->af, tbl, daddr); 205 en = ip_vs_lblc_get(dest->af, tbl, daddr);
203 if (!en) { 206 if (!en) {
204 en = kmalloc(sizeof(*en), GFP_ATOMIC); 207 en = kmalloc(sizeof(*en), GFP_ATOMIC);
205 if (!en) 208 if (!en)
206 return NULL; 209 return NULL;
207 210
208 en->af = dest->af; 211 en->af = dest->af;
209 ip_vs_addr_copy(dest->af, &en->addr, daddr); 212 ip_vs_addr_copy(dest->af, &en->addr, daddr);
210 en->lastuse = jiffies; 213 en->lastuse = jiffies;
211 214
212 atomic_inc(&dest->refcnt); 215 ip_vs_dest_hold(dest);
213 en->dest = dest; 216 RCU_INIT_POINTER(en->dest, dest);
214 217
215 ip_vs_lblc_hash(tbl, en); 218 ip_vs_lblc_hash(tbl, en);
216 } else if (en->dest != dest) { 219 } else {
217 atomic_dec(&en->dest->refcnt); 220 struct ip_vs_dest *old_dest;
218 atomic_inc(&dest->refcnt); 221
219 en->dest = dest; 222 old_dest = rcu_dereference_protected(en->dest, 1);
223 if (old_dest != dest) {
224 ip_vs_dest_put(old_dest);
225 ip_vs_dest_hold(dest);
226 /* No ordering constraints for refcnt */
227 RCU_INIT_POINTER(en->dest, dest);
228 }
220 } 229 }
221 230
222 return en; 231 return en;
223 } 232 }
224 233
225 234
226 /* 235 /*
227 * Flush all the entries of the specified table. 236 * Flush all the entries of the specified table.
228 */ 237 */
229 static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) 238 static void ip_vs_lblc_flush(struct ip_vs_service *svc)
230 { 239 {
231 struct ip_vs_lblc_entry *en, *nxt; 240 struct ip_vs_lblc_table *tbl = svc->sched_data;
241 struct ip_vs_lblc_entry *en;
242 struct hlist_node *next;
232 int i; 243 int i;
233 244
245 write_lock_bh(&svc->sched_lock);
246 tbl->dead = 1;
234 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 247 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
235 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 248 hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
236 ip_vs_lblc_free(en); 249 ip_vs_lblc_free(en);
237 atomic_dec(&tbl->entries); 250 atomic_dec(&tbl->entries);
238 } 251 }
239 } 252 }
253 write_unlock_bh(&svc->sched_lock);
240 } 254 }
241 255
242 static int sysctl_lblc_expiration(struct ip_vs_service *svc) 256 static int sysctl_lblc_expiration(struct ip_vs_service *svc)
243 { 257 {
244 #ifdef CONFIG_SYSCTL 258 #ifdef CONFIG_SYSCTL
245 struct netns_ipvs *ipvs = net_ipvs(svc->net); 259 struct netns_ipvs *ipvs = net_ipvs(svc->net);
246 return ipvs->sysctl_lblc_expiration; 260 return ipvs->sysctl_lblc_expiration;
247 #else 261 #else
248 return DEFAULT_EXPIRATION; 262 return DEFAULT_EXPIRATION;
249 #endif 263 #endif
250 } 264 }
251 265
252 static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) 266 static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
253 { 267 {
254 struct ip_vs_lblc_table *tbl = svc->sched_data; 268 struct ip_vs_lblc_table *tbl = svc->sched_data;
255 struct ip_vs_lblc_entry *en, *nxt; 269 struct ip_vs_lblc_entry *en;
270 struct hlist_node *next;
256 unsigned long now = jiffies; 271 unsigned long now = jiffies;
257 int i, j; 272 int i, j;
258 273
259 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 274 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
260 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 275 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
261 276
262 write_lock(&svc->sched_lock); 277 write_lock(&svc->sched_lock);
263 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 278 hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
264 if (time_before(now, 279 if (time_before(now,
265 en->lastuse + 280 en->lastuse +
266 sysctl_lblc_expiration(svc))) 281 sysctl_lblc_expiration(svc)))
267 continue; 282 continue;
268 283
269 ip_vs_lblc_free(en); 284 ip_vs_lblc_free(en);
270 atomic_dec(&tbl->entries); 285 atomic_dec(&tbl->entries);
271 } 286 }
272 write_unlock(&svc->sched_lock); 287 write_unlock(&svc->sched_lock);
273 } 288 }
274 tbl->rover = j; 289 tbl->rover = j;
275 } 290 }
276 291
277 292
278 /* 293 /*
279 * Periodical timer handler for IPVS lblc table 294 * Periodical timer handler for IPVS lblc table
280 * It is used to collect stale entries when the number of entries 295 * It is used to collect stale entries when the number of entries
281 * exceeds the maximum size of the table. 296 * exceeds the maximum size of the table.
282 * 297 *
283 * Fixme: we probably need more complicated algorithm to collect 298 * Fixme: we probably need more complicated algorithm to collect
284 * entries that have not been used for a long time even 299 * entries that have not been used for a long time even
285 * if the number of entries doesn't exceed the maximum size 300 * if the number of entries doesn't exceed the maximum size
286 * of the table. 301 * of the table.
287 * The full expiration check is for this purpose now. 302 * The full expiration check is for this purpose now.
288 */ 303 */
289 static void ip_vs_lblc_check_expire(unsigned long data) 304 static void ip_vs_lblc_check_expire(unsigned long data)
290 { 305 {
291 struct ip_vs_service *svc = (struct ip_vs_service *) data; 306 struct ip_vs_service *svc = (struct ip_vs_service *) data;
292 struct ip_vs_lblc_table *tbl = svc->sched_data; 307 struct ip_vs_lblc_table *tbl = svc->sched_data;
293 unsigned long now = jiffies; 308 unsigned long now = jiffies;
294 int goal; 309 int goal;
295 int i, j; 310 int i, j;
296 struct ip_vs_lblc_entry *en, *nxt; 311 struct ip_vs_lblc_entry *en;
312 struct hlist_node *next;
297 313
298 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 314 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
299 /* do full expiration check */ 315 /* do full expiration check */
300 ip_vs_lblc_full_check(svc); 316 ip_vs_lblc_full_check(svc);
301 tbl->counter = 1; 317 tbl->counter = 1;
302 goto out; 318 goto out;
303 } 319 }
304 320
305 if (atomic_read(&tbl->entries) <= tbl->max_size) { 321 if (atomic_read(&tbl->entries) <= tbl->max_size) {
306 tbl->counter++; 322 tbl->counter++;
307 goto out; 323 goto out;
308 } 324 }
309 325
310 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; 326 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
311 if (goal > tbl->max_size/2) 327 if (goal > tbl->max_size/2)
312 goal = tbl->max_size/2; 328 goal = tbl->max_size/2;
313 329
314 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 330 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
315 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 331 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
316 332
317 write_lock(&svc->sched_lock); 333 write_lock(&svc->sched_lock);
318 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 334 hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
319 if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) 335 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
320 continue; 336 continue;
321 337
322 ip_vs_lblc_free(en); 338 ip_vs_lblc_free(en);
323 atomic_dec(&tbl->entries); 339 atomic_dec(&tbl->entries);
324 goal--; 340 goal--;
325 } 341 }
326 write_unlock(&svc->sched_lock); 342 write_unlock(&svc->sched_lock);
327 if (goal <= 0) 343 if (goal <= 0)
328 break; 344 break;
329 } 345 }
330 tbl->rover = j; 346 tbl->rover = j;
331 347
332 out: 348 out:
333 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); 349 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
334 } 350 }
335 351
336 352
337 static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) 353 static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
338 { 354 {
339 int i; 355 int i;
340 struct ip_vs_lblc_table *tbl; 356 struct ip_vs_lblc_table *tbl;
341 357
342 /* 358 /*
343 * Allocate the ip_vs_lblc_table for this service 359 * Allocate the ip_vs_lblc_table for this service
344 */ 360 */
345 tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); 361 tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
346 if (tbl == NULL) 362 if (tbl == NULL)
347 return -ENOMEM; 363 return -ENOMEM;
348 364
349 svc->sched_data = tbl; 365 svc->sched_data = tbl;
350 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " 366 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
351 "current service\n", sizeof(*tbl)); 367 "current service\n", sizeof(*tbl));
352 368
353 /* 369 /*
354 * Initialize the hash buckets 370 * Initialize the hash buckets
355 */ 371 */
356 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 372 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
357 INIT_LIST_HEAD(&tbl->bucket[i]); 373 INIT_HLIST_HEAD(&tbl->bucket[i]);
358 } 374 }
359 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; 375 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
360 tbl->rover = 0; 376 tbl->rover = 0;
361 tbl->counter = 1; 377 tbl->counter = 1;
378 tbl->dead = 0;
362 379
363 /* 380 /*
364 * Hook periodic timer for garbage collection 381 * Hook periodic timer for garbage collection
365 */ 382 */
366 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, 383 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
367 (unsigned long)svc); 384 (unsigned long)svc);
368 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); 385 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
369 386
370 return 0; 387 return 0;
371 } 388 }
372 389
373 390
374 static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) 391 static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
375 { 392 {
376 struct ip_vs_lblc_table *tbl = svc->sched_data; 393 struct ip_vs_lblc_table *tbl = svc->sched_data;
377 394
378 /* remove periodic timer */ 395 /* remove periodic timer */
379 del_timer_sync(&tbl->periodic_timer); 396 del_timer_sync(&tbl->periodic_timer);
380 397
381 /* got to clean up table entries here */ 398 /* got to clean up table entries here */
382 ip_vs_lblc_flush(tbl); 399 ip_vs_lblc_flush(svc);
383 400
384 /* release the table itself */ 401 /* release the table itself */
385 kfree(tbl); 402 kfree_rcu(tbl, rcu_head);
386 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", 403 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
387 sizeof(*tbl)); 404 sizeof(*tbl));
388 405
389 return 0; 406 return 0;
390 } 407 }
391 408
392 409
393 static inline struct ip_vs_dest * 410 static inline struct ip_vs_dest *
394 __ip_vs_lblc_schedule(struct ip_vs_service *svc) 411 __ip_vs_lblc_schedule(struct ip_vs_service *svc)
395 { 412 {
396 struct ip_vs_dest *dest, *least; 413 struct ip_vs_dest *dest, *least;
397 int loh, doh; 414 int loh, doh;
398 415
399 /* 416 /*
400 * We use the following formula to estimate the load: 417 * We use the following formula to estimate the load:
401 * (dest overhead) / dest->weight 418 * (dest overhead) / dest->weight
402 * 419 *
403 * Remember -- no floats in kernel mode!!! 420 * Remember -- no floats in kernel mode!!!
404 * The comparison of h1*w2 > h2*w1 is equivalent to that of 421 * The comparison of h1*w2 > h2*w1 is equivalent to that of
405 * h1/w1 > h2/w2 422 * h1/w1 > h2/w2
406 * if every weight is larger than zero. 423 * if every weight is larger than zero.
407 * 424 *
408 * The server with weight=0 is quiesced and will not receive any 425 * The server with weight=0 is quiesced and will not receive any
409 * new connection. 426 * new connection.
410 */ 427 */
411 list_for_each_entry(dest, &svc->destinations, n_list) { 428 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
412 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 429 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
413 continue; 430 continue;
414 if (atomic_read(&dest->weight) > 0) { 431 if (atomic_read(&dest->weight) > 0) {
415 least = dest; 432 least = dest;
416 loh = ip_vs_dest_conn_overhead(least); 433 loh = ip_vs_dest_conn_overhead(least);
417 goto nextstage; 434 goto nextstage;
418 } 435 }
419 } 436 }
420 return NULL; 437 return NULL;
421 438
422 /* 439 /*
423 * Find the destination with the least load. 440 * Find the destination with the least load.
424 */ 441 */
425 nextstage: 442 nextstage:
426 list_for_each_entry_continue(dest, &svc->destinations, n_list) { 443 list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
427 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 444 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
428 continue; 445 continue;
429 446
430 doh = ip_vs_dest_conn_overhead(dest); 447 doh = ip_vs_dest_conn_overhead(dest);
431 if (loh * atomic_read(&dest->weight) > 448 if (loh * atomic_read(&dest->weight) >
432 doh * atomic_read(&least->weight)) { 449 doh * atomic_read(&least->weight)) {
433 least = dest; 450 least = dest;
434 loh = doh; 451 loh = doh;
435 } 452 }
436 } 453 }
437 454
438 IP_VS_DBG_BUF(6, "LBLC: server %s:%d " 455 IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
439 "activeconns %d refcnt %d weight %d overhead %d\n", 456 "activeconns %d refcnt %d weight %d overhead %d\n",
440 IP_VS_DBG_ADDR(least->af, &least->addr), 457 IP_VS_DBG_ADDR(least->af, &least->addr),
441 ntohs(least->port), 458 ntohs(least->port),
442 atomic_read(&least->activeconns), 459 atomic_read(&least->activeconns),
443 atomic_read(&least->refcnt), 460 atomic_read(&least->refcnt),
444 atomic_read(&least->weight), loh); 461 atomic_read(&least->weight), loh);
445 462
446 return least; 463 return least;
447 } 464 }
448 465
449 466
450 /* 467 /*
451 * If this destination server is overloaded and there is a less loaded 468 * If this destination server is overloaded and there is a less loaded
452 * server, then return true. 469 * server, then return true.
453 */ 470 */
454 static inline int 471 static inline int
455 is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) 472 is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
456 { 473 {
457 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { 474 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
458 struct ip_vs_dest *d; 475 struct ip_vs_dest *d;
459 476
460 list_for_each_entry(d, &svc->destinations, n_list) { 477 list_for_each_entry_rcu(d, &svc->destinations, n_list) {
461 if (atomic_read(&d->activeconns)*2 478 if (atomic_read(&d->activeconns)*2
462 < atomic_read(&d->weight)) { 479 < atomic_read(&d->weight)) {
463 return 1; 480 return 1;
464 } 481 }
465 } 482 }
466 } 483 }
467 return 0; 484 return 0;
468 } 485 }
469 486
470 487
471 /* 488 /*
472 * Locality-Based (weighted) Least-Connection scheduling 489 * Locality-Based (weighted) Least-Connection scheduling
473 */ 490 */
474 static struct ip_vs_dest * 491 static struct ip_vs_dest *
475 ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 492 ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
476 { 493 {
477 struct ip_vs_lblc_table *tbl = svc->sched_data; 494 struct ip_vs_lblc_table *tbl = svc->sched_data;
478 struct ip_vs_iphdr iph; 495 struct ip_vs_iphdr iph;
479 struct ip_vs_dest *dest = NULL; 496 struct ip_vs_dest *dest = NULL;
480 struct ip_vs_lblc_entry *en; 497 struct ip_vs_lblc_entry *en;
481 498
482 ip_vs_fill_iph_addr_only(svc->af, skb, &iph); 499 ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
483 500
484 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); 501 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
485 502
486 /* First look in our cache */ 503 /* First look in our cache */
487 read_lock(&svc->sched_lock);
488 en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); 504 en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
489 if (en) { 505 if (en) {
490 /* We only hold a read lock, but this is atomic */ 506 /* We only hold a read lock, but this is atomic */
491 en->lastuse = jiffies; 507 en->lastuse = jiffies;
492 508
493 /* 509 /*
494 * If the destination is not available, i.e. it's in the trash, 510 * If the destination is not available, i.e. it's in the trash,
495 * we must ignore it, as it may be removed from under our feet, 511 * we must ignore it, as it may be removed from under our feet,
496 * if someone drops our reference count. Our caller only makes 512 * if someone drops our reference count. Our caller only makes
497 * sure that destinations, that are not in the trash, are not 513 * sure that destinations, that are not in the trash, are not
498 * moved to the trash, while we are scheduling. But anyone can 514 * moved to the trash, while we are scheduling. But anyone can
499 * free up entries from the trash at any time. 515 * free up entries from the trash at any time.
500 */ 516 */
501 517
502 if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) 518 dest = rcu_dereference(en->dest);
503 dest = en->dest; 519 if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
520 atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
521 goto out;
504 } 522 }
505 read_unlock(&svc->sched_lock);
506 523
507 /* If the destination has a weight and is not overloaded, use it */
508 if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
509 goto out;
510
511 /* No cache entry or it is invalid, time to schedule */ 524 /* No cache entry or it is invalid, time to schedule */
512 dest = __ip_vs_lblc_schedule(svc); 525 dest = __ip_vs_lblc_schedule(svc);
513 if (!dest) { 526 if (!dest) {
514 ip_vs_scheduler_err(svc, "no destination available"); 527 ip_vs_scheduler_err(svc, "no destination available");
515 return NULL; 528 return NULL;
516 } 529 }
517 530
518 /* If we fail to create a cache entry, we'll just use the valid dest */ 531 /* If we fail to create a cache entry, we'll just use the valid dest */
519 write_lock(&svc->sched_lock); 532 write_lock(&svc->sched_lock);
520 ip_vs_lblc_new(tbl, &iph.daddr, dest); 533 if (!tbl->dead)
534 ip_vs_lblc_new(tbl, &iph.daddr, dest);
521 write_unlock(&svc->sched_lock); 535 write_unlock(&svc->sched_lock);
522 536
523 out: 537 out:
524 IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", 538 IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
525 IP_VS_DBG_ADDR(svc->af, &iph.daddr), 539 IP_VS_DBG_ADDR(svc->af, &iph.daddr),
526 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); 540 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
527 541
528 return dest; 542 return dest;
529 } 543 }
530 544
531 545
532 /* 546 /*
533 * IPVS LBLC Scheduler structure 547 * IPVS LBLC Scheduler structure
534 */ 548 */
535 static struct ip_vs_scheduler ip_vs_lblc_scheduler = 549 static struct ip_vs_scheduler ip_vs_lblc_scheduler =
536 { 550 {
537 .name = "lblc", 551 .name = "lblc",
538 .refcnt = ATOMIC_INIT(0), 552 .refcnt = ATOMIC_INIT(0),
539 .module = THIS_MODULE, 553 .module = THIS_MODULE,
540 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), 554 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
541 .init_service = ip_vs_lblc_init_svc, 555 .init_service = ip_vs_lblc_init_svc,
542 .done_service = ip_vs_lblc_done_svc, 556 .done_service = ip_vs_lblc_done_svc,
543 .schedule = ip_vs_lblc_schedule, 557 .schedule = ip_vs_lblc_schedule,
544 }; 558 };
545 559
546 /* 560 /*
547 * per netns init. 561 * per netns init.
548 */ 562 */
549 #ifdef CONFIG_SYSCTL 563 #ifdef CONFIG_SYSCTL
550 static int __net_init __ip_vs_lblc_init(struct net *net) 564 static int __net_init __ip_vs_lblc_init(struct net *net)
551 { 565 {
552 struct netns_ipvs *ipvs = net_ipvs(net); 566 struct netns_ipvs *ipvs = net_ipvs(net);
553 567
554 if (!ipvs) 568 if (!ipvs)
555 return -ENOENT; 569 return -ENOENT;
556 570
557 if (!net_eq(net, &init_net)) { 571 if (!net_eq(net, &init_net)) {
558 ipvs->lblc_ctl_table = kmemdup(vs_vars_table, 572 ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
559 sizeof(vs_vars_table), 573 sizeof(vs_vars_table),
560 GFP_KERNEL); 574 GFP_KERNEL);
561 if (ipvs->lblc_ctl_table == NULL) 575 if (ipvs->lblc_ctl_table == NULL)
562 return -ENOMEM; 576 return -ENOMEM;
563 577
564 /* Don't export sysctls to unprivileged users */ 578 /* Don't export sysctls to unprivileged users */
565 if (net->user_ns != &init_user_ns) 579 if (net->user_ns != &init_user_ns)
566 ipvs->lblc_ctl_table[0].procname = NULL; 580 ipvs->lblc_ctl_table[0].procname = NULL;
567 581
568 } else 582 } else
569 ipvs->lblc_ctl_table = vs_vars_table; 583 ipvs->lblc_ctl_table = vs_vars_table;
570 ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; 584 ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
571 ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; 585 ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
572 586
573 ipvs->lblc_ctl_header = 587 ipvs->lblc_ctl_header =
574 register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); 588 register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
575 if (!ipvs->lblc_ctl_header) { 589 if (!ipvs->lblc_ctl_header) {
576 if (!net_eq(net, &init_net)) 590 if (!net_eq(net, &init_net))
577 kfree(ipvs->lblc_ctl_table); 591 kfree(ipvs->lblc_ctl_table);
578 return -ENOMEM; 592 return -ENOMEM;
579 } 593 }
580 594
581 return 0; 595 return 0;
582 } 596 }
583 597
584 static void __net_exit __ip_vs_lblc_exit(struct net *net) 598 static void __net_exit __ip_vs_lblc_exit(struct net *net)
585 { 599 {
586 struct netns_ipvs *ipvs = net_ipvs(net); 600 struct netns_ipvs *ipvs = net_ipvs(net);
587 601
588 unregister_net_sysctl_table(ipvs->lblc_ctl_header); 602 unregister_net_sysctl_table(ipvs->lblc_ctl_header);
589 603
590 if (!net_eq(net, &init_net)) 604 if (!net_eq(net, &init_net))
591 kfree(ipvs->lblc_ctl_table); 605 kfree(ipvs->lblc_ctl_table);
592 } 606 }
593 607
594 #else 608 #else
595 609
596 static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } 610 static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
597 static void __net_exit __ip_vs_lblc_exit(struct net *net) { } 611 static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
598 612
599 #endif 613 #endif
600 614
601 static struct pernet_operations ip_vs_lblc_ops = { 615 static struct pernet_operations ip_vs_lblc_ops = {
602 .init = __ip_vs_lblc_init, 616 .init = __ip_vs_lblc_init,
603 .exit = __ip_vs_lblc_exit, 617 .exit = __ip_vs_lblc_exit,
604 }; 618 };
605 619
606 static int __init ip_vs_lblc_init(void) 620 static int __init ip_vs_lblc_init(void)
607 { 621 {
608 int ret; 622 int ret;
609 623
610 ret = register_pernet_subsys(&ip_vs_lblc_ops); 624 ret = register_pernet_subsys(&ip_vs_lblc_ops);
611 if (ret) 625 if (ret)
612 return ret; 626 return ret;
613 627
614 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); 628 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
615 if (ret) 629 if (ret)
616 unregister_pernet_subsys(&ip_vs_lblc_ops); 630 unregister_pernet_subsys(&ip_vs_lblc_ops);
617 return ret; 631 return ret;
618 } 632 }
619 633
620 static void __exit ip_vs_lblc_cleanup(void) 634 static void __exit ip_vs_lblc_cleanup(void)