Commit 72a3effaf633bcae9034b7e176bdbd78d64a71db

Authored by Eric Dumazet
Committed by David S. Miller
1 parent 3c62f75aac

[NET]: Size listen hash tables using backlog hint

We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE = 512 slots) for
each LISTEN socket, regardless of parameters such as the listen backlog.

On x86_64, this means order-1 allocations (which might fail), even for 'small'
sockets expecting few connections. Conversely, a huge server wanting a backlog
of 50000 is slowed down a bit by this fixed limit.

This patch makes the sizing of the listen hash table a dynamic parameter,
depending on the following inputs (the resulting computation is sketched below):
- the net.core.somaxconn tunable (default: 128)
- the net.ipv4.tcp_max_syn_backlog tunable (default: 256, 1024 or 128,
  depending on available memory)
- the backlog value given by the user application (2nd parameter of listen())
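
A minimal userspace sketch of that computation (the clamping and rounding
mirror the reqsk_queue_alloc() hunk further down; somaxconn, max_syn_backlog
and effective_synq_size() are illustrative stand-ins, not kernel symbols):

    #include <stdio.h>

    /* Illustrative defaults for the two tunables named above. */
    static unsigned int somaxconn = 128;       /* net.core.somaxconn */
    static unsigned int max_syn_backlog = 256; /* net.ipv4.tcp_max_syn_backlog */

    /* Round n up to the next power of two (identity for powers of two). */
    static unsigned int roundup_pow_of_two(unsigned int n)
    {
        unsigned int p = 1;

        while (p < n)
            p <<= 1;
        return p;
    }

    /* Hash-table slots allocated for a listen(fd, backlog) call. */
    static unsigned int effective_synq_size(unsigned int backlog)
    {
        unsigned int n = backlog;

        if (n > somaxconn)        /* sys_listen() caps the backlog first */
            n = somaxconn;
        if (n > max_syn_backlog)  /* min_t(u32, ..., sysctl_max_syn_backlog) */
            n = max_syn_backlog;
        if (n < 8)                /* max_t(u32, ..., 8) */
            n = 8;
        return roundup_pow_of_two(n + 1);
    }

    int main(void)
    {
        printf("%u\n", effective_synq_size(128));   /* 256, not a fixed 512 */
        printf("%u\n", effective_synq_size(50000)); /* capped by the tunables */
        return 0;
    }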

For large allocations (larger than PAGE_SIZE), we use vmalloc() instead of
kmalloc().
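
The alloc/free pairing looks like this (a sketch with hypothetical
lopt_alloc()/lopt_free() helpers; the patch itself open-codes the same test in
reqsk_queue_alloc() and reqsk_queue_destroy(), shown below):

    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    static void *lopt_alloc(size_t size)
    {
        /* Zeroed memory in both cases; vmalloc() for multi-page tables. */
        if (size > PAGE_SIZE)
            return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                             PAGE_KERNEL);
        return kzalloc(size, GFP_KERNEL);
    }

    static void lopt_free(void *lopt, size_t size)
    {
        /* The destroy path recomputes size to pick the matching routine. */
        if (size > PAGE_SIZE)
            vfree(lopt);
        else
            kfree(lopt);
    }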

We still limit memory allocation with the two existing tunables (somaxconn &
tcp_max_syn_backlog), so for standard setups this patch actually reduces RAM
usage.
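
For instance (illustrative arithmetic, assuming 8-byte pointers on x86_64):
with the default somaxconn of 128, listen(fd, 128) yields
roundup_pow_of_two(128 + 1) = 256 slots, i.e. 2048 bytes of pointers, and the
table plus its struct listen_sock header fits in a single page; the old fixed
512-slot table needed 4096 bytes for the pointers alone, pushing the combined
allocation to order-1.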

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 9 changed files with 39 additions and 25 deletions

include/net/request_sock.h
... ... @@ -28,8 +28,8 @@
28 28  
29 29 struct request_sock_ops {
30 30 int family;
31   - kmem_cache_t *slab;
32 31 int obj_size;
  32 + kmem_cache_t *slab;
33 33 int (*rtx_syn_ack)(struct sock *sk,
34 34 struct request_sock *req,
35 35 struct dst_entry *dst);
36 36  
... ... @@ -51,13 +51,13 @@
51 51 u32 rcv_wnd; /* rcv_wnd offered first time */
52 52 u32 ts_recent;
53 53 unsigned long expires;
54   - struct request_sock_ops *rsk_ops;
  54 + const struct request_sock_ops *rsk_ops;
55 55 struct sock *sk;
56 56 u32 secid;
57 57 u32 peer_secid;
58 58 };
59 59  
60   -static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
  60 +static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
61 61 {
62 62 struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
63 63  
... ... @@ -121,7 +121,7 @@
121 121 };
122 122  
123 123 extern int reqsk_queue_alloc(struct request_sock_queue *queue,
124   - const int nr_table_entries);
  124 + unsigned int nr_table_entries);
125 125  
126 126 static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
127 127 {
include/net/tcp.h
... ... @@ -138,7 +138,6 @@
138 138 #define MAX_TCP_SYNCNT 127
139 139  
140 140 #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
141   -#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
142 141  
143 142 #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
144 143 #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
net/core/request_sock.c
... ... @@ -15,6 +15,7 @@
15 15 #include <linux/random.h>
16 16 #include <linux/slab.h>
17 17 #include <linux/string.h>
  18 +#include <linux/vmalloc.h>
18 19  
19 20 #include <net/request_sock.h>
20 21  
... ... @@ -29,22 +30,31 @@
29 30 * it is absolutely not enough even at 100conn/sec. 256 cures most
30 31 * of problems. This value is adjusted to 128 for very small machines
31 32 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
32   - * Further increasing requires to change hash table size.
  33 + * Note : Dont forget somaxconn that may limit backlog too.
33 34 */
34 35 int sysctl_max_syn_backlog = 256;
35 36  
36 37 int reqsk_queue_alloc(struct request_sock_queue *queue,
37   - const int nr_table_entries)
  38 + unsigned int nr_table_entries)
38 39 {
39   - const int lopt_size = sizeof(struct listen_sock) +
40   - nr_table_entries * sizeof(struct request_sock *);
41   - struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
  40 + size_t lopt_size = sizeof(struct listen_sock);
  41 + struct listen_sock *lopt;
42 42  
  43 + nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
  44 + nr_table_entries = max_t(u32, nr_table_entries, 8);
  45 + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
  46 + lopt_size += nr_table_entries * sizeof(struct request_sock *);
  47 + if (lopt_size > PAGE_SIZE)
  48 + lopt = __vmalloc(lopt_size,
  49 + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
  50 + PAGE_KERNEL);
  51 + else
  52 + lopt = kzalloc(lopt_size, GFP_KERNEL);
43 53 if (lopt == NULL)
44 54 return -ENOMEM;
45 55  
46   - for (lopt->max_qlen_log = 6;
47   - (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
  56 + for (lopt->max_qlen_log = 3;
  57 + (1 << lopt->max_qlen_log) < nr_table_entries;
48 58 lopt->max_qlen_log++);
49 59  
50 60 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
51 61  
... ... @@ -65,9 +75,11 @@
65 75 {
66 76 /* make all the listen_opt local to us */
67 77 struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
  78 + size_t lopt_size = sizeof(struct listen_sock) +
  79 + lopt->nr_table_entries * sizeof(struct request_sock *);
68 80  
69 81 if (lopt->qlen != 0) {
70   - int i;
  82 + unsigned int i;
71 83  
72 84 for (i = 0; i < lopt->nr_table_entries; i++) {
73 85 struct request_sock *req;
... ... @@ -81,7 +93,10 @@
81 93 }
82 94  
83 95 BUG_TRAP(lopt->qlen == 0);
84   - kfree(lopt);
  96 + if (lopt_size > PAGE_SIZE)
  97 + vfree(lopt);
  98 + else
  99 + kfree(lopt);
85 100 }
86 101  
87 102 EXPORT_SYMBOL(reqsk_queue_destroy);
net/dccp/ipv4.c
... ... @@ -1022,7 +1022,7 @@
1022 1022 kfree(inet_rsk(req)->opt);
1023 1023 }
1024 1024  
1025   -static struct request_sock_ops dccp_request_sock_ops = {
  1025 +static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
1026 1026 .family = PF_INET,
1027 1027 .obj_size = sizeof(struct dccp_request_sock),
1028 1028 .rtx_syn_ack = dccp_v4_send_response,
net/dccp/proto.c
... ... @@ -262,12 +262,12 @@
262 262  
263 263 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
264 264  
265   -static inline int dccp_listen_start(struct sock *sk)
  265 +static inline int dccp_listen_start(struct sock *sk, int backlog)
266 266 {
267 267 struct dccp_sock *dp = dccp_sk(sk);
268 268  
269 269 dp->dccps_role = DCCP_ROLE_LISTEN;
270   - return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
  270 + return inet_csk_listen_start(sk, backlog);
271 271 }
272 272  
273 273 int dccp_disconnect(struct sock *sk, int flags)
... ... @@ -788,7 +788,7 @@
788 788 * FIXME: here it probably should be sk->sk_prot->listen_start
789 789 * see tcp_listen_start
790 790 */
791   - err = dccp_listen_start(sk);
  791 + err = dccp_listen_start(sk, backlog);
792 792 if (err)
793 793 goto out;
794 794 }
net/ipv4/af_inet.c
... ... @@ -204,7 +204,7 @@
204 204 * we can only allow the backlog to be adjusted.
205 205 */
206 206 if (old_state != TCP_LISTEN) {
207   - err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
  207 + err = inet_csk_listen_start(sk, backlog);
208 208 if (err)
209 209 goto out;
210 210 }
net/ipv4/inet_connection_sock.c
... ... @@ -343,7 +343,7 @@
343 343 EXPORT_SYMBOL_GPL(inet_csk_route_req);
344 344  
345 345 static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
346   - const u32 rnd, const u16 synq_hsize)
  346 + const u32 rnd, const u32 synq_hsize)
347 347 {
348 348 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
349 349 }
net/ipv4/tcp_ipv4.c
... ... @@ -715,7 +715,7 @@
715 715 return dopt;
716 716 }
717 717  
718   -struct request_sock_ops tcp_request_sock_ops = {
  718 +struct request_sock_ops tcp_request_sock_ops __read_mostly = {
719 719 .family = PF_INET,
720 720 .obj_size = sizeof(struct tcp_request_sock),
721 721 .rtx_syn_ack = tcp_v4_send_synack,
... ... @@ -1385,7 +1385,7 @@
1385 1385 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1386 1386 struct request_sock *req = cur;
1387 1387  
1388   - icsk = inet_csk(st->syn_wait_sk);
  1388 + icsk = inet_csk(st->syn_wait_sk);
1389 1389 req = req->dl_next;
1390 1390 while (1) {
1391 1391 while (req) {
... ... @@ -1395,7 +1395,7 @@
1395 1395 }
1396 1396 req = req->dl_next;
1397 1397 }
1398   - if (++st->sbucket >= TCP_SYNQ_HSIZE)
  1398 + if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1399 1399 break;
1400 1400 get_req:
1401 1401 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
net/ipv6/tcp_ipv6.c
... ... @@ -526,7 +526,7 @@
526 526 kfree_skb(inet6_rsk(req)->pktopts);
527 527 }
528 528  
529   -static struct request_sock_ops tcp6_request_sock_ops = {
  529 +static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
530 530 .family = AF_INET6,
531 531 .obj_size = sizeof(struct tcp6_request_sock),
532 532 .rtx_syn_ack = tcp_v6_send_synack,