Commit f86dcc5aa8c7908f2c287e7a211228df599e3e71
Committed by
David S. Miller
1 parent
8a6dfd43d1
Exists in
master
and in
20 other branches
udp: dynamically size hash tables at boot time
UDP_HTABLE_SIZE was initially defined as 128, which is a bit small for several setups. 4000 active UDP sockets -> 32 sockets per chain on average. An incoming frame has to look up all sockets in a chain to find the best match, so long chains hurt latency. Instead of a fixed-size hash table that can't be perfect for every need, let the UDP stack choose its table size at boot time, as the TCP and IP route hash tables do, using the alloc_large_system_hash() helper. Add an optional boot parameter, uhash_entries=x, so that an admin can force a size between 256 and 65536 if needed, like thash_entries and rhash_entries. dmesg logs two new lines: [ 0.647039] UDP hash table entries: 512 (order: 0, 4096 bytes) [ 0.647099] UDP Lite hash table entries: 512 (order: 0, 4096 bytes) The maximal size on 64-bit arches would be 65536 slots, i.e. 1 MByte for non-debugging spinlocks. Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 6 changed files with 87 additions and 36 deletions Side-by-side Diff
Documentation/kernel-parameters.txt
... | ... | @@ -2589,6 +2589,9 @@ |
2589 | 2589 | uart6850= [HW,OSS] |
2590 | 2590 | Format: <io>,<irq> |
2591 | 2591 | |
2592 | + uhash_entries= [KNL,NET] | |
2593 | + Set number of hash buckets for UDP/UDP-Lite connections | |
2594 | + | |
2592 | 2595 | uhci-hcd.ignore_oc= |
2593 | 2596 | [USB] Ignore overcurrent events (default N). |
2594 | 2597 | Some badly-designed motherboards generate lots of |
include/linux/udp.h
... | ... | @@ -45,11 +45,11 @@ |
45 | 45 | return (struct udphdr *)skb_transport_header(skb); |
46 | 46 | } |
47 | 47 | |
48 | -#define UDP_HTABLE_SIZE 128 | |
48 | +#define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) | |
49 | 49 | |
50 | -static inline int udp_hashfn(struct net *net, const unsigned num) | |
50 | +static inline int udp_hashfn(struct net *net, unsigned num, unsigned mask) | |
51 | 51 | { |
52 | - return (num + net_hash_mix(net)) & (UDP_HTABLE_SIZE - 1); | |
52 | + return (num + net_hash_mix(net)) & mask; | |
53 | 53 | } |
54 | 54 | |
55 | 55 | struct udp_sock { |
include/net/udp.h
... | ... | @@ -54,12 +54,19 @@ |
54 | 54 | struct hlist_nulls_head head; |
55 | 55 | spinlock_t lock; |
56 | 56 | } __attribute__((aligned(2 * sizeof(long)))); |
57 | + | |
57 | 58 | struct udp_table { |
58 | - struct udp_hslot hash[UDP_HTABLE_SIZE]; | |
59 | + struct udp_hslot *hash; | |
60 | + unsigned int mask; | |
61 | + unsigned int log; | |
59 | 62 | }; |
60 | 63 | extern struct udp_table udp_table; |
61 | -extern void udp_table_init(struct udp_table *); | |
62 | - | |
64 | +extern void udp_table_init(struct udp_table *, const char *); | |
65 | +static inline struct udp_hslot *udp_hashslot(struct udp_table *table, | |
66 | + struct net *net, unsigned num) | |
67 | +{ | |
68 | + return &table->hash[udp_hashfn(net, num, table->mask)]; | |
69 | +} | |
63 | 70 | |
64 | 71 | /* Note: this must match 'valbool' in sock_setsockopt */ |
65 | 72 | #define UDP_CSUM_NOXMIT 1 |
net/ipv4/udp.c
... | ... | @@ -106,7 +106,7 @@ |
106 | 106 | #include <net/xfrm.h> |
107 | 107 | #include "udp_impl.h" |
108 | 108 | |
109 | -struct udp_table udp_table; | |
109 | +struct udp_table udp_table __read_mostly; | |
110 | 110 | EXPORT_SYMBOL(udp_table); |
111 | 111 | |
112 | 112 | int sysctl_udp_mem[3] __read_mostly; |
113 | 113 | |
... | ... | @@ -121,14 +121,16 @@ |
121 | 121 | atomic_t udp_memory_allocated; |
122 | 122 | EXPORT_SYMBOL(udp_memory_allocated); |
123 | 123 | |
124 | -#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) | |
124 | +#define MAX_UDP_PORTS 65536 | |
125 | +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) | |
125 | 126 | |
126 | 127 | static int udp_lib_lport_inuse(struct net *net, __u16 num, |
127 | 128 | const struct udp_hslot *hslot, |
128 | 129 | unsigned long *bitmap, |
129 | 130 | struct sock *sk, |
130 | 131 | int (*saddr_comp)(const struct sock *sk1, |
131 | - const struct sock *sk2)) | |
132 | + const struct sock *sk2), | |
133 | + unsigned int log) | |
132 | 134 | { |
133 | 135 | struct sock *sk2; |
134 | 136 | struct hlist_nulls_node *node; |
... | ... | @@ -142,8 +144,7 @@ |
142 | 144 | || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && |
143 | 145 | (*saddr_comp)(sk, sk2)) { |
144 | 146 | if (bitmap) |
145 | - __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, | |
146 | - bitmap); | |
147 | + __set_bit(sk2->sk_hash >> log, bitmap); | |
147 | 148 | else |
148 | 149 | return 1; |
149 | 150 | } |
150 | 151 | |
... | ... | @@ -180,13 +181,15 @@ |
180 | 181 | /* |
181 | 182 | * force rand to be an odd multiple of UDP_HTABLE_SIZE |
182 | 183 | */ |
183 | - rand = (rand | 1) * UDP_HTABLE_SIZE; | |
184 | - for (last = first + UDP_HTABLE_SIZE; first != last; first++) { | |
185 | - hslot = &udptable->hash[udp_hashfn(net, first)]; | |
184 | + rand = (rand | 1) * (udptable->mask + 1); | |
185 | + for (last = first + udptable->mask + 1; | |
186 | + first != last; | |
187 | + first++) { | |
188 | + hslot = udp_hashslot(udptable, net, first); | |
186 | 189 | bitmap_zero(bitmap, PORTS_PER_CHAIN); |
187 | 190 | spin_lock_bh(&hslot->lock); |
188 | 191 | udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, |
189 | - saddr_comp); | |
192 | + saddr_comp, udptable->log); | |
190 | 193 | |
191 | 194 | snum = first; |
192 | 195 | /* |
... | ... | @@ -196,7 +199,7 @@ |
196 | 199 | */ |
197 | 200 | do { |
198 | 201 | if (low <= snum && snum <= high && |
199 | - !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) | |
202 | + !test_bit(snum >> udptable->log, bitmap)) | |
200 | 203 | goto found; |
201 | 204 | snum += rand; |
202 | 205 | } while (snum != first); |
203 | 206 | |
... | ... | @@ -204,9 +207,10 @@ |
204 | 207 | } |
205 | 208 | goto fail; |
206 | 209 | } else { |
207 | - hslot = &udptable->hash[udp_hashfn(net, snum)]; | |
210 | + hslot = udp_hashslot(udptable, net, snum); | |
208 | 211 | spin_lock_bh(&hslot->lock); |
209 | - if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) | |
212 | + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, | |
213 | + saddr_comp, 0)) | |
210 | 214 | goto fail_unlock; |
211 | 215 | } |
212 | 216 | found: |
... | ... | @@ -283,7 +287,7 @@ |
283 | 287 | struct sock *sk, *result; |
284 | 288 | struct hlist_nulls_node *node; |
285 | 289 | unsigned short hnum = ntohs(dport); |
286 | - unsigned int hash = udp_hashfn(net, hnum); | |
290 | + unsigned int hash = udp_hashfn(net, hnum, udptable->mask); | |
287 | 291 | struct udp_hslot *hslot = &udptable->hash[hash]; |
288 | 292 | int score, badness; |
289 | 293 | |
... | ... | @@ -1013,8 +1017,8 @@ |
1013 | 1017 | { |
1014 | 1018 | if (sk_hashed(sk)) { |
1015 | 1019 | struct udp_table *udptable = sk->sk_prot->h.udp_table; |
1016 | - unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); | |
1017 | - struct udp_hslot *hslot = &udptable->hash[hash]; | |
1020 | + struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk), | |
1021 | + sk->sk_hash); | |
1018 | 1022 | |
1019 | 1023 | spin_lock_bh(&hslot->lock); |
1020 | 1024 | if (sk_nulls_del_node_init_rcu(sk)) { |
... | ... | @@ -1169,7 +1173,7 @@ |
1169 | 1173 | struct udp_table *udptable) |
1170 | 1174 | { |
1171 | 1175 | struct sock *sk; |
1172 | - struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; | |
1176 | + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); | |
1173 | 1177 | int dif; |
1174 | 1178 | |
1175 | 1179 | spin_lock(&hslot->lock); |
1176 | 1180 | |
... | ... | @@ -1609,9 +1613,14 @@ |
1609 | 1613 | struct udp_iter_state *state = seq->private; |
1610 | 1614 | struct net *net = seq_file_net(seq); |
1611 | 1615 | |
1612 | - for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { | |
1616 | + for (state->bucket = start; state->bucket <= state->udp_table->mask; | |
1617 | + ++state->bucket) { | |
1613 | 1618 | struct hlist_nulls_node *node; |
1614 | 1619 | struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; |
1620 | + | |
1621 | + if (hlist_nulls_empty(&hslot->head)) | |
1622 | + continue; | |
1623 | + | |
1615 | 1624 | spin_lock_bh(&hslot->lock); |
1616 | 1625 | sk_nulls_for_each(sk, node, &hslot->head) { |
1617 | 1626 | if (!net_eq(sock_net(sk), net)) |
... | ... | @@ -1636,7 +1645,7 @@ |
1636 | 1645 | } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); |
1637 | 1646 | |
1638 | 1647 | if (!sk) { |
1639 | - if (state->bucket < UDP_HTABLE_SIZE) | |
1648 | + if (state->bucket <= state->udp_table->mask) | |
1640 | 1649 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); |
1641 | 1650 | return udp_get_first(seq, state->bucket + 1); |
1642 | 1651 | } |
... | ... | @@ -1656,7 +1665,7 @@ |
1656 | 1665 | static void *udp_seq_start(struct seq_file *seq, loff_t *pos) |
1657 | 1666 | { |
1658 | 1667 | struct udp_iter_state *state = seq->private; |
1659 | - state->bucket = UDP_HTABLE_SIZE; | |
1668 | + state->bucket = MAX_UDP_PORTS; | |
1660 | 1669 | |
1661 | 1670 | return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; |
1662 | 1671 | } |
... | ... | @@ -1678,7 +1687,7 @@ |
1678 | 1687 | { |
1679 | 1688 | struct udp_iter_state *state = seq->private; |
1680 | 1689 | |
1681 | - if (state->bucket < UDP_HTABLE_SIZE) | |
1690 | + if (state->bucket <= state->udp_table->mask) | |
1682 | 1691 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); |
1683 | 1692 | } |
1684 | 1693 | |
... | ... | @@ -1738,7 +1747,7 @@ |
1738 | 1747 | __u16 destp = ntohs(inet->dport); |
1739 | 1748 | __u16 srcp = ntohs(inet->sport); |
1740 | 1749 | |
1741 | - seq_printf(f, "%4d: %08X:%04X %08X:%04X" | |
1750 | + seq_printf(f, "%5d: %08X:%04X %08X:%04X" | |
1742 | 1751 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", |
1743 | 1752 | bucket, src, srcp, dest, destp, sp->sk_state, |
1744 | 1753 | sk_wmem_alloc_get(sp), |
1745 | 1754 | |
1746 | 1755 | |
... | ... | @@ -1804,11 +1813,43 @@ |
1804 | 1813 | } |
1805 | 1814 | #endif /* CONFIG_PROC_FS */ |
1806 | 1815 | |
1807 | -void __init udp_table_init(struct udp_table *table) | |
1816 | +static __initdata unsigned long uhash_entries; | |
1817 | +static int __init set_uhash_entries(char *str) | |
1808 | 1818 | { |
1809 | - int i; | |
1819 | + if (!str) | |
1820 | + return 0; | |
1821 | + uhash_entries = simple_strtoul(str, &str, 0); | |
1822 | + if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) | |
1823 | + uhash_entries = UDP_HTABLE_SIZE_MIN; | |
1824 | + return 1; | |
1825 | +} | |
1826 | +__setup("uhash_entries=", set_uhash_entries); | |
1810 | 1827 | |
1811 | - for (i = 0; i < UDP_HTABLE_SIZE; i++) { | |
1828 | +void __init udp_table_init(struct udp_table *table, const char *name) | |
1829 | +{ | |
1830 | + unsigned int i; | |
1831 | + | |
1832 | + if (!CONFIG_BASE_SMALL) | |
1833 | + table->hash = alloc_large_system_hash(name, | |
1834 | + sizeof(struct udp_hslot), | |
1835 | + uhash_entries, | |
1836 | + 21, /* one slot per 2 MB */ | |
1837 | + 0, | |
1838 | + &table->log, | |
1839 | + &table->mask, | |
1840 | + 64 * 1024); | |
1841 | + /* | |
1842 | + * Make sure hash table has the minimum size | |
1843 | + */ | |
1844 | + if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { | |
1845 | + table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * | |
1846 | + sizeof(struct udp_hslot), GFP_KERNEL); | |
1847 | + if (!table->hash) | |
1848 | + panic(name); | |
1849 | + table->log = ilog2(UDP_HTABLE_SIZE_MIN); | |
1850 | + table->mask = UDP_HTABLE_SIZE_MIN - 1; | |
1851 | + } | |
1852 | + for (i = 0; i <= table->mask; i++) { | |
1812 | 1853 | INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); |
1813 | 1854 | spin_lock_init(&table->hash[i].lock); |
1814 | 1855 | } |
... | ... | @@ -1818,7 +1859,7 @@ |
1818 | 1859 | { |
1819 | 1860 | unsigned long nr_pages, limit; |
1820 | 1861 | |
1821 | - udp_table_init(&udp_table); | |
1862 | + udp_table_init(&udp_table, "UDP"); | |
1822 | 1863 | /* Set the pressure threshold up by the same strategy of TCP. It is a |
1823 | 1864 | * fraction of global memory that is up to 1/2 at 256 MB, decreasing |
1824 | 1865 | * toward zero with the amount of memory, with a floor of 128 pages. |
net/ipv4/udplite.c
... | ... | @@ -12,7 +12,7 @@ |
12 | 12 | */ |
13 | 13 | #include "udp_impl.h" |
14 | 14 | |
15 | -struct udp_table udplite_table; | |
15 | +struct udp_table udplite_table __read_mostly; | |
16 | 16 | EXPORT_SYMBOL(udplite_table); |
17 | 17 | |
18 | 18 | static int udplite_rcv(struct sk_buff *skb) |
... | ... | @@ -110,7 +110,7 @@ |
110 | 110 | |
111 | 111 | void __init udplite4_register(void) |
112 | 112 | { |
113 | - udp_table_init(&udplite_table); | |
113 | + udp_table_init(&udplite_table, "UDP-Lite"); | |
114 | 114 | if (proto_register(&udplite_prot, 1)) |
115 | 115 | goto out_register_err; |
116 | 116 |
net/ipv6/udp.c
... | ... | @@ -132,7 +132,7 @@ |
132 | 132 | struct sock *sk, *result; |
133 | 133 | struct hlist_nulls_node *node; |
134 | 134 | unsigned short hnum = ntohs(dport); |
135 | - unsigned int hash = udp_hashfn(net, hnum); | |
135 | + unsigned int hash = udp_hashfn(net, hnum, udptable->mask); | |
136 | 136 | struct udp_hslot *hslot = &udptable->hash[hash]; |
137 | 137 | int score, badness; |
138 | 138 | |
... | ... | @@ -452,7 +452,7 @@ |
452 | 452 | { |
453 | 453 | struct sock *sk, *sk2; |
454 | 454 | const struct udphdr *uh = udp_hdr(skb); |
455 | - struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; | |
455 | + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); | |
456 | 456 | int dif; |
457 | 457 | |
458 | 458 | spin_lock(&hslot->lock); |
... | ... | @@ -1197,7 +1197,7 @@ |
1197 | 1197 | destp = ntohs(inet->dport); |
1198 | 1198 | srcp = ntohs(inet->sport); |
1199 | 1199 | seq_printf(seq, |
1200 | - "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " | |
1200 | + "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " | |
1201 | 1201 | "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", |
1202 | 1202 | bucket, |
1203 | 1203 | src->s6_addr32[0], src->s6_addr32[1], |