Commit f86dcc5aa8c7908f2c287e7a211228df599e3e71

Authored by Eric Dumazet
Committed by David S. Miller
1 parent 8a6dfd43d1

udp: dynamically size hash tables at boot time

UDP_HTABLE_SIZE was initially defined as 128, which is a bit small for
several setups.

4000 active UDP sockets -> 32 sockets per chain on average. An
incoming frame has to look up all sockets in its chain to find the best
match, so long chains hurt latency.

Instead of a fixed-size hash table that can't be perfect for every
need, let the UDP stack choose its table size at boot time, as the TCP
and IP route hash tables do, using the alloc_large_system_hash() helper.

Add an optional boot parameter, uhash_entries=x so that an admin can
force a size between 256 and 65536 if needed, like thash_entries and
rhash_entries.

dmesg logs two new lines :
[    0.647039] UDP hash table entries: 512 (order: 0, 4096 bytes)
[    0.647099] UDP Lite hash table entries: 512 (order: 0, 4096 bytes)

Maximum size on 64-bit arches would be 65536 slots, i.e. 1 MByte with
non-debugging spinlocks.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 6 changed files with 87 additions and 36 deletions Side-by-side Diff

Documentation/kernel-parameters.txt
... ... @@ -2589,6 +2589,9 @@
2589 2589 uart6850= [HW,OSS]
2590 2590 Format: <io>,<irq>
2591 2591  
  2592 + uhash_entries= [KNL,NET]
  2593 + Set number of hash buckets for UDP/UDP-Lite connections
  2594 +
2592 2595 uhci-hcd.ignore_oc=
2593 2596 [USB] Ignore overcurrent events (default N).
2594 2597 Some badly-designed motherboards generate lots of
... ... @@ -45,11 +45,11 @@
45 45 return (struct udphdr *)skb_transport_header(skb);
46 46 }
47 47  
48   -#define UDP_HTABLE_SIZE 128
  48 +#define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256)
49 49  
50   -static inline int udp_hashfn(struct net *net, const unsigned num)
  50 +static inline int udp_hashfn(struct net *net, unsigned num, unsigned mask)
51 51 {
52   - return (num + net_hash_mix(net)) & (UDP_HTABLE_SIZE - 1);
  52 + return (num + net_hash_mix(net)) & mask;
53 53 }
54 54  
55 55 struct udp_sock {
... ... @@ -54,12 +54,19 @@
54 54 struct hlist_nulls_head head;
55 55 spinlock_t lock;
56 56 } __attribute__((aligned(2 * sizeof(long))));
  57 +
57 58 struct udp_table {
58   - struct udp_hslot hash[UDP_HTABLE_SIZE];
  59 + struct udp_hslot *hash;
  60 + unsigned int mask;
  61 + unsigned int log;
59 62 };
60 63 extern struct udp_table udp_table;
61   -extern void udp_table_init(struct udp_table *);
62   -
  64 +extern void udp_table_init(struct udp_table *, const char *);
  65 +static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
  66 + struct net *net, unsigned num)
  67 +{
  68 + return &table->hash[udp_hashfn(net, num, table->mask)];
  69 +}
63 70  
64 71 /* Note: this must match 'valbool' in sock_setsockopt */
65 72 #define UDP_CSUM_NOXMIT 1
... ... @@ -106,7 +106,7 @@
106 106 #include <net/xfrm.h>
107 107 #include "udp_impl.h"
108 108  
109   -struct udp_table udp_table;
  109 +struct udp_table udp_table __read_mostly;
110 110 EXPORT_SYMBOL(udp_table);
111 111  
112 112 int sysctl_udp_mem[3] __read_mostly;
113 113  
... ... @@ -121,14 +121,16 @@
121 121 atomic_t udp_memory_allocated;
122 122 EXPORT_SYMBOL(udp_memory_allocated);
123 123  
124   -#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
  124 +#define MAX_UDP_PORTS 65536
  125 +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
125 126  
126 127 static int udp_lib_lport_inuse(struct net *net, __u16 num,
127 128 const struct udp_hslot *hslot,
128 129 unsigned long *bitmap,
129 130 struct sock *sk,
130 131 int (*saddr_comp)(const struct sock *sk1,
131   - const struct sock *sk2))
  132 + const struct sock *sk2),
  133 + unsigned int log)
132 134 {
133 135 struct sock *sk2;
134 136 struct hlist_nulls_node *node;
... ... @@ -142,8 +144,7 @@
142 144 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
143 145 (*saddr_comp)(sk, sk2)) {
144 146 if (bitmap)
145   - __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
146   - bitmap);
  147 + __set_bit(sk2->sk_hash >> log, bitmap);
147 148 else
148 149 return 1;
149 150 }
150 151  
... ... @@ -180,13 +181,15 @@
180 181 /*
181 182 * force rand to be an odd multiple of UDP_HTABLE_SIZE
182 183 */
183   - rand = (rand | 1) * UDP_HTABLE_SIZE;
184   - for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
185   - hslot = &udptable->hash[udp_hashfn(net, first)];
  184 + rand = (rand | 1) * (udptable->mask + 1);
  185 + for (last = first + udptable->mask + 1;
  186 + first != last;
  187 + first++) {
  188 + hslot = udp_hashslot(udptable, net, first);
186 189 bitmap_zero(bitmap, PORTS_PER_CHAIN);
187 190 spin_lock_bh(&hslot->lock);
188 191 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
189   - saddr_comp);
  192 + saddr_comp, udptable->log);
190 193  
191 194 snum = first;
192 195 /*
... ... @@ -196,7 +199,7 @@
196 199 */
197 200 do {
198 201 if (low <= snum && snum <= high &&
199   - !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
  202 + !test_bit(snum >> udptable->log, bitmap))
200 203 goto found;
201 204 snum += rand;
202 205 } while (snum != first);
203 206  
... ... @@ -204,9 +207,10 @@
204 207 }
205 208 goto fail;
206 209 } else {
207   - hslot = &udptable->hash[udp_hashfn(net, snum)];
  210 + hslot = udp_hashslot(udptable, net, snum);
208 211 spin_lock_bh(&hslot->lock);
209   - if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
  212 + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
  213 + saddr_comp, 0))
210 214 goto fail_unlock;
211 215 }
212 216 found:
... ... @@ -283,7 +287,7 @@
283 287 struct sock *sk, *result;
284 288 struct hlist_nulls_node *node;
285 289 unsigned short hnum = ntohs(dport);
286   - unsigned int hash = udp_hashfn(net, hnum);
  290 + unsigned int hash = udp_hashfn(net, hnum, udptable->mask);
287 291 struct udp_hslot *hslot = &udptable->hash[hash];
288 292 int score, badness;
289 293  
... ... @@ -1013,8 +1017,8 @@
1013 1017 {
1014 1018 if (sk_hashed(sk)) {
1015 1019 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1016   - unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
1017   - struct udp_hslot *hslot = &udptable->hash[hash];
  1020 + struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk),
  1021 + sk->sk_hash);
1018 1022  
1019 1023 spin_lock_bh(&hslot->lock);
1020 1024 if (sk_nulls_del_node_init_rcu(sk)) {
... ... @@ -1169,7 +1173,7 @@
1169 1173 struct udp_table *udptable)
1170 1174 {
1171 1175 struct sock *sk;
1172   - struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
  1176 + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
1173 1177 int dif;
1174 1178  
1175 1179 spin_lock(&hslot->lock);
1176 1180  
... ... @@ -1609,9 +1613,14 @@
1609 1613 struct udp_iter_state *state = seq->private;
1610 1614 struct net *net = seq_file_net(seq);
1611 1615  
1612   - for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
  1616 + for (state->bucket = start; state->bucket <= state->udp_table->mask;
  1617 + ++state->bucket) {
1613 1618 struct hlist_nulls_node *node;
1614 1619 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
  1620 +
  1621 + if (hlist_nulls_empty(&hslot->head))
  1622 + continue;
  1623 +
1615 1624 spin_lock_bh(&hslot->lock);
1616 1625 sk_nulls_for_each(sk, node, &hslot->head) {
1617 1626 if (!net_eq(sock_net(sk), net))
... ... @@ -1636,7 +1645,7 @@
1636 1645 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1637 1646  
1638 1647 if (!sk) {
1639   - if (state->bucket < UDP_HTABLE_SIZE)
  1648 + if (state->bucket <= state->udp_table->mask)
1640 1649 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1641 1650 return udp_get_first(seq, state->bucket + 1);
1642 1651 }
... ... @@ -1656,7 +1665,7 @@
1656 1665 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1657 1666 {
1658 1667 struct udp_iter_state *state = seq->private;
1659   - state->bucket = UDP_HTABLE_SIZE;
  1668 + state->bucket = MAX_UDP_PORTS;
1660 1669  
1661 1670 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
1662 1671 }
... ... @@ -1678,7 +1687,7 @@
1678 1687 {
1679 1688 struct udp_iter_state *state = seq->private;
1680 1689  
1681   - if (state->bucket < UDP_HTABLE_SIZE)
  1690 + if (state->bucket <= state->udp_table->mask)
1682 1691 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1683 1692 }
1684 1693  
... ... @@ -1738,7 +1747,7 @@
1738 1747 __u16 destp = ntohs(inet->dport);
1739 1748 __u16 srcp = ntohs(inet->sport);
1740 1749  
1741   - seq_printf(f, "%4d: %08X:%04X %08X:%04X"
  1750 + seq_printf(f, "%5d: %08X:%04X %08X:%04X"
1742 1751 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
1743 1752 bucket, src, srcp, dest, destp, sp->sk_state,
1744 1753 sk_wmem_alloc_get(sp),
1745 1754  
1746 1755  
... ... @@ -1804,11 +1813,43 @@
1804 1813 }
1805 1814 #endif /* CONFIG_PROC_FS */
1806 1815  
1807   -void __init udp_table_init(struct udp_table *table)
  1816 +static __initdata unsigned long uhash_entries;
  1817 +static int __init set_uhash_entries(char *str)
1808 1818 {
1809   - int i;
  1819 + if (!str)
  1820 + return 0;
  1821 + uhash_entries = simple_strtoul(str, &str, 0);
  1822 + if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
  1823 + uhash_entries = UDP_HTABLE_SIZE_MIN;
  1824 + return 1;
  1825 +}
  1826 +__setup("uhash_entries=", set_uhash_entries);
1810 1827  
1811   - for (i = 0; i < UDP_HTABLE_SIZE; i++) {
  1828 +void __init udp_table_init(struct udp_table *table, const char *name)
  1829 +{
  1830 + unsigned int i;
  1831 +
  1832 + if (!CONFIG_BASE_SMALL)
  1833 + table->hash = alloc_large_system_hash(name,
  1834 + sizeof(struct udp_hslot),
  1835 + uhash_entries,
  1836 + 21, /* one slot per 2 MB */
  1837 + 0,
  1838 + &table->log,
  1839 + &table->mask,
  1840 + 64 * 1024);
  1841 + /*
  1842 + * Make sure hash table has the minimum size
  1843 + */
  1844 + if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
  1845 + table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
  1846 + sizeof(struct udp_hslot), GFP_KERNEL);
  1847 + if (!table->hash)
  1848 + panic(name);
  1849 + table->log = ilog2(UDP_HTABLE_SIZE_MIN);
  1850 + table->mask = UDP_HTABLE_SIZE_MIN - 1;
  1851 + }
  1852 + for (i = 0; i <= table->mask; i++) {
1812 1853 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
1813 1854 spin_lock_init(&table->hash[i].lock);
1814 1855 }
... ... @@ -1818,7 +1859,7 @@
1818 1859 {
1819 1860 unsigned long nr_pages, limit;
1820 1861  
1821   - udp_table_init(&udp_table);
  1862 + udp_table_init(&udp_table, "UDP");
1822 1863 /* Set the pressure threshold up by the same strategy of TCP. It is a
1823 1864 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
1824 1865 * toward zero with the amount of memory, with a floor of 128 pages.
... ... @@ -12,7 +12,7 @@
12 12 */
13 13 #include "udp_impl.h"
14 14  
15   -struct udp_table udplite_table;
  15 +struct udp_table udplite_table __read_mostly;
16 16 EXPORT_SYMBOL(udplite_table);
17 17  
18 18 static int udplite_rcv(struct sk_buff *skb)
... ... @@ -110,7 +110,7 @@
110 110  
111 111 void __init udplite4_register(void)
112 112 {
113   - udp_table_init(&udplite_table);
  113 + udp_table_init(&udplite_table, "UDP-Lite");
114 114 if (proto_register(&udplite_prot, 1))
115 115 goto out_register_err;
116 116  
... ... @@ -132,7 +132,7 @@
132 132 struct sock *sk, *result;
133 133 struct hlist_nulls_node *node;
134 134 unsigned short hnum = ntohs(dport);
135   - unsigned int hash = udp_hashfn(net, hnum);
  135 + unsigned int hash = udp_hashfn(net, hnum, udptable->mask);
136 136 struct udp_hslot *hslot = &udptable->hash[hash];
137 137 int score, badness;
138 138  
... ... @@ -452,7 +452,7 @@
452 452 {
453 453 struct sock *sk, *sk2;
454 454 const struct udphdr *uh = udp_hdr(skb);
455   - struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
  455 + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
456 456 int dif;
457 457  
458 458 spin_lock(&hslot->lock);
... ... @@ -1197,7 +1197,7 @@
1197 1197 destp = ntohs(inet->dport);
1198 1198 srcp = ntohs(inet->sport);
1199 1199 seq_printf(seq,
1200   - "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
  1200 + "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
1201 1201 "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
1202 1202 bucket,
1203 1203 src->s6_addr32[0], src->s6_addr32[1],