Commit 72a3effaf633bcae9034b7e176bdbd78d64a71db
Committed by: David S. Miller
Parent: 3c62f75aac
Exists in: master and 7 other branches
[NET]: Size listen hash tables using backlog hint
We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE = 512 slots)
for each LISTEN socket, regardless of parameters such as the listen backlog.
On x86_64 this means order-1 allocations (which might fail), even for 'small'
sockets expecting few connections. Conversely, a huge server wanting a
backlog of 50000 is slowed down a bit by this fixed limit.

This patch makes the listen hash table size a dynamic parameter, depending on:

- the net.core.somaxconn tunable (default: 128)
- the net.ipv4.tcp_max_syn_backlog tunable (default: 256, 1024 or 128)
- the backlog value given by the user application (2nd parameter of listen())

For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of
kmalloc(). Memory allocation is still limited by the two existing tunables
(somaxconn & tcp_max_syn_backlog), so for standard setups this patch actually
reduces RAM usage.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
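The resulting sizing rule is easy to see in isolation. Below is a minimal
userspace sketch of the clamping and rounding the patch performs; the helper
and the tunable values are stand-ins for the kernel's min_t/max_t/
roundup_pow_of_two and the sysctl defaults quoted above, not live kernel code:

#include <stdio.h>

/* Userspace stand-in for the kernel's roundup_pow_of_two() helper. */
static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int somaxconn = 128;		/* net.core.somaxconn default */
	unsigned int max_syn_backlog = 256;	/* net.ipv4.tcp_max_syn_backlog */
	unsigned int backlogs[] = { 5, 128, 50000 };

	for (int i = 0; i < 3; i++) {
		/* listen() already caps its backlog at somaxconn */
		unsigned int n = backlogs[i] < somaxconn ? backlogs[i] : somaxconn;

		/* the clamping done in reqsk_queue_alloc() below */
		n = n < max_syn_backlog ? n : max_syn_backlog;
		n = n > 8 ? n : 8;
		n = roundup_pow_of_two(n + 1);
		printf("listen backlog %5u -> %4u hash slots\n", backlogs[i], n);
	}
	return 0;
}

With the default tunables the somaxconn cap dominates (5 -> 16 slots,
128 or 50000 -> 256 slots), so a server that really wants 50000 must raise
both sysctls as well as its listen() backlog.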
Showing 9 changed files with 39 additions and 25 deletions
include/net/request_sock.h
@@ -28,8 +28,8 @@
 
 struct request_sock_ops {
 	int		family;
-	kmem_cache_t	*slab;
 	int		obj_size;
+	kmem_cache_t	*slab;
 	int		(*rtx_syn_ack)(struct sock *sk,
 				       struct request_sock *req,
 				       struct dst_entry *dst);
 
@@ -51,13 +51,13 @@
 	u32				rcv_wnd;	/* rcv_wnd offered first time */
 	u32				ts_recent;
 	unsigned long			expires;
-	struct request_sock_ops		*rsk_ops;
+	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
 	u32				secid;
 	u32				peer_secid;
 };
 
-static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
+static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
 {
 	struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
 
@@ -121,7 +121,7 @@
 };
 
 extern int reqsk_queue_alloc(struct request_sock_queue *queue,
-			     const int nr_table_entries);
+			     unsigned int nr_table_entries);
 
 static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
 {
include/net/tcp.h
@@ -138,7 +138,6 @@
 #define MAX_TCP_SYNCNT		127
 
 #define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */
-#define TCP_SYNQ_HSIZE		512	/* Size of SYNACK hash table */
 
 #define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
 #define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
net/core/request_sock.c
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/vmalloc.h>
 
 #include <net/request_sock.h>
 
@@ -29,22 +30,31 @@
  * it is absolutely not enough even at 100conn/sec. 256 cures most
  * of problems. This value is adjusted to 128 for very small machines
  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
+ * Note : Dont forget somaxconn that may limit backlog too.
  */
 int sysctl_max_syn_backlog = 256;
 
 int reqsk_queue_alloc(struct request_sock_queue *queue,
-		      const int nr_table_entries)
+		      unsigned int nr_table_entries)
 {
-	const int lopt_size = sizeof(struct listen_sock) +
-		nr_table_entries * sizeof(struct request_sock *);
-	struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
+	size_t lopt_size = sizeof(struct listen_sock);
+	struct listen_sock *lopt;
 
+	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+	nr_table_entries = max_t(u32, nr_table_entries, 8);
+	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+	lopt_size += nr_table_entries * sizeof(struct request_sock *);
+	if (lopt_size > PAGE_SIZE)
+		lopt = __vmalloc(lopt_size,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
+	else
+		lopt = kzalloc(lopt_size, GFP_KERNEL);
 	if (lopt == NULL)
 		return -ENOMEM;
 
-	for (lopt->max_qlen_log = 6;
-	     (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+	for (lopt->max_qlen_log = 3;
+	     (1 << lopt->max_qlen_log) < nr_table_entries;
 	     lopt->max_qlen_log++);
 
 	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
 
@@ -65,9 +75,11 @@
 {
 	/* make all the listen_opt local to us */
 	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+	size_t lopt_size = sizeof(struct listen_sock) +
+		lopt->nr_table_entries * sizeof(struct request_sock *);
 
 	if (lopt->qlen != 0) {
-		int i;
+		unsigned int i;
 
 		for (i = 0; i < lopt->nr_table_entries; i++) {
 			struct request_sock *req;
@@ -81,7 +93,10 @@
 	}
 
 	BUG_TRAP(lopt->qlen == 0);
-	kfree(lopt);
+	if (lopt_size > PAGE_SIZE)
+		vfree(lopt);
+	else
+		kfree(lopt);
 }
 
 EXPORT_SYMBOL(reqsk_queue_destroy);
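The kmalloc/vmalloc cutoff is worth a back-of-the-envelope check. The sketch
below is illustrative only: LISTEN_SOCK_HEADER is an assumed placeholder for
sizeof(struct listen_sock), whose real value depends on the kernel
configuration, and PAGE_SIZE is hard-coded to the common 4 KiB:

#include <stdio.h>
#include <stddef.h>

#define LISTEN_SOCK_HEADER	64	/* assumed header size, config-dependent */
#define PAGE_SIZE		4096	/* common x86 page size */

int main(void)
{
	for (unsigned int n = 16; n <= 1024; n <<= 1) {
		/* mirrors: lopt_size = header + n * sizeof(struct request_sock *) */
		size_t lopt_size = LISTEN_SOCK_HEADER + n * sizeof(void *);

		printf("%4u entries -> %5zu bytes -> %s\n", n, lopt_size,
		       lopt_size > PAGE_SIZE ? "__vmalloc" : "kzalloc");
	}
	return 0;
}

On 64-bit, the header pushes a 512-entry table just past one page, so the old
fixed table was always an order-1 allocation, while most real listeners now
fit in a single kzalloc'd page. Note also that max_qlen_log is now derived
from the per-socket table size (starting at 3, matching the 8-entry minimum)
instead of from the global sysctl.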
net/dccp/ipv4.c
@@ -1022,7 +1022,7 @@
 	kfree(inet_rsk(req)->opt);
 }
 
-static struct request_sock_ops dccp_request_sock_ops = {
+static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
 	.family		= PF_INET,
 	.obj_size	= sizeof(struct dccp_request_sock),
 	.rtx_syn_ack	= dccp_v4_send_response,
net/dccp/proto.c
@@ -262,12 +262,12 @@
 
 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
 
-static inline int dccp_listen_start(struct sock *sk)
+static inline int dccp_listen_start(struct sock *sk, int backlog)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	dp->dccps_role = DCCP_ROLE_LISTEN;
-	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+	return inet_csk_listen_start(sk, backlog);
 }
 
 int dccp_disconnect(struct sock *sk, int flags)
@@ -788,7 +788,7 @@
 		 * FIXME: here it probably should be sk->sk_prot->listen_start
 		 * see tcp_listen_start
 		 */
-		err = dccp_listen_start(sk);
+		err = dccp_listen_start(sk, backlog);
 		if (err)
 			goto out;
 	}
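After this change the second argument to listen() flows straight into the
hash-table sizing (still clamped by both sysctls). A hedged userspace
illustration of the application-level knob; the port and backlog values are
arbitrary examples:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);	/* arbitrary example port */

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("socket/bind");
		return 1;
	}
	/* A small backlog now yields a small SYN-queue hash table;
	 * before this patch the kernel always allocated 512 slots. */
	if (listen(fd, 16) < 0) {
		perror("listen");
		return 1;
	}
	close(fd);
	return 0;
}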
net/ipv4/af_inet.c
net/ipv4/inet_connection_sock.c
@@ -343,7 +343,7 @@
 EXPORT_SYMBOL_GPL(inet_csk_route_req);
 
 static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
-				 const u32 rnd, const u16 synq_hsize)
+				 const u32 rnd, const u32 synq_hsize)
 {
 	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
 }
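Widening synq_hsize to u32 matters because the function selects a bucket by
masking with synq_hsize - 1, which only behaves like a uniform modulo when
the table size is a power of two, exactly what the roundup_pow_of_two() call
in reqsk_queue_alloc() guarantees. A quick standalone demonstration of that
equivalence (any 32-bit hash would do in place of jhash_2words):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t size = 256;	/* must be a power of two */

	for (uint32_t hash = 0; hash < 1000000; hash += 7919) {
		/* masking and modulo agree only for power-of-two sizes */
		assert((hash & (size - 1)) == (hash % size));
	}
	printf("mask == mod for power-of-two table sizes\n");
	return 0;
}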
net/ipv4/tcp_ipv4.c
@@ -715,7 +715,7 @@
 	return dopt;
 }
 
-struct request_sock_ops tcp_request_sock_ops = {
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		= PF_INET,
 	.obj_size	= sizeof(struct tcp_request_sock),
 	.rtx_syn_ack	= tcp_v4_send_synack,
@@ -1385,7 +1385,7 @@
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
 		struct request_sock *req = cur;
 
-	  	icsk = inet_csk(st->syn_wait_sk);
+		icsk = inet_csk(st->syn_wait_sk);
 		req = req->dl_next;
 		while (1) {
 			while (req) {
@@ -1395,7 +1395,7 @@
 			}
 			req = req->dl_next;
 		}
-		if (++st->sbucket >= TCP_SYNQ_HSIZE)
+		if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
 			break;
 get_req:
 		req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
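With the fixed TCP_SYNQ_HSIZE gone, the /proc iterator must ask the listening
socket itself how many buckets to walk. A toy sketch of that pattern; the
struct names here are hypothetical stand-ins for the kernel's listen_sock and
request_sock, which the real code reaches via icsk_accept_queue.listen_opt:

#include <stdio.h>
#include <stdlib.h>

struct request { struct request *dl_next; };

struct listen_table {
	unsigned int	nr_table_entries;	/* per-socket, no longer a constant */
	struct request	*syn_table[];		/* flexible array of bucket heads */
};

int main(void)
{
	/* build a toy 8-bucket table with one pending request in bucket 3 */
	struct listen_table *lopt =
		calloc(1, sizeof(*lopt) + 8 * sizeof(struct request *));
	struct request req = { .dl_next = NULL };
	unsigned int count = 0;

	if (!lopt)
		return 1;
	lopt->nr_table_entries = 8;
	lopt->syn_table[3] = &req;

	/* the iteration bound now comes from the table, not a #define */
	for (unsigned int b = 0; b < lopt->nr_table_entries; b++)
		for (struct request *r = lopt->syn_table[b]; r; r = r->dl_next)
			count++;

	printf("walked %u pending request(s)\n", count);
	free(lopt);
	return 0;
}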
net/ipv6/tcp_ipv6.c
@@ -526,7 +526,7 @@
 	kfree_skb(inet6_rsk(req)->pktopts);
 }
 
-static struct request_sock_ops tcp6_request_sock_ops = {
+static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		= AF_INET6,
 	.obj_size	= sizeof(struct tcp6_request_sock),
 	.rtx_syn_ack	= tcp_v6_send_synack,