Commit 53083773dcbd3c80477e2ace143e361e1e806745

Authored by Pavel Emelyanov
Committed by David S. Miller
1 parent e56d8b8a2e

[INET]: Uninline the __inet_inherit_port call.

This de-bloats ~200 bytes when ipv6 and dccp are 'y'.

Besides, this will ease compilation issues for patches
I'm working on to make inet hash tables more scalable
wrt net namespaces.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 2 changed files with 17 additions and 13 deletions
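The change itself is mechanical: the body of __inet_inherit_port() moves verbatim from the header into net/ipv4/inet_hashtables.c, leaving only a declaration behind. A condensed sketch of the pattern (abridged from the diff below, not a verbatim excerpt):

/* Before: in include/net/inet_hashtables.h, each caller in tcp, dccp and
 * their ipv6 variants expanded this body at its call site. */
static inline void __inet_inherit_port(struct sock *sk, struct sock *child)
{
        /* ... body unchanged, shown in full in the hunks below ... */
}

/* After: the header keeps only the declaration ... */
extern void __inet_inherit_port(struct sock *sk, struct sock *child);

/* ... and net/ipv4/inet_hashtables.c carries the single out-of-line copy,
 * exported so modular users such as dccp can still link against it. */
void __inet_inherit_port(struct sock *sk, struct sock *child)
{
        /* ... same body ... */
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

Collapsing up to four inlined copies (tcp/dccp times ipv4/ipv6) into one definition is where the ~200 bytes of text come from.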

include/net/inet_hashtables.h
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Authors: Lotsa people, from code originally in tcp 6 * Authors: Lotsa people, from code originally in tcp
7 * 7 *
8 * This program is free software; you can redistribute it and/or 8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License 9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version. 11 * 2 of the License, or (at your option) any later version.
12 */ 12 */
13 13
14 #ifndef _INET_HASHTABLES_H 14 #ifndef _INET_HASHTABLES_H
15 #define _INET_HASHTABLES_H 15 #define _INET_HASHTABLES_H
16 16
17 17
18 #include <linux/interrupt.h> 18 #include <linux/interrupt.h>
19 #include <linux/ipv6.h> 19 #include <linux/ipv6.h>
20 #include <linux/list.h> 20 #include <linux/list.h>
21 #include <linux/slab.h> 21 #include <linux/slab.h>
22 #include <linux/socket.h> 22 #include <linux/socket.h>
23 #include <linux/spinlock.h> 23 #include <linux/spinlock.h>
24 #include <linux/types.h> 24 #include <linux/types.h>
25 #include <linux/wait.h> 25 #include <linux/wait.h>
26 #include <linux/vmalloc.h> 26 #include <linux/vmalloc.h>
27 27
28 #include <net/inet_connection_sock.h> 28 #include <net/inet_connection_sock.h>
29 #include <net/inet_sock.h> 29 #include <net/inet_sock.h>
30 #include <net/sock.h> 30 #include <net/sock.h>
31 #include <net/tcp_states.h> 31 #include <net/tcp_states.h>
32 32
33 #include <asm/atomic.h> 33 #include <asm/atomic.h>
34 #include <asm/byteorder.h> 34 #include <asm/byteorder.h>
35 35
36 /* This is for all connections with a full identity, no wildcards. 36 /* This is for all connections with a full identity, no wildcards.
37 * One chain is dedicated to TIME_WAIT sockets. 37 * One chain is dedicated to TIME_WAIT sockets.
38 * I'll experiment with dynamic table growth later. 38 * I'll experiment with dynamic table growth later.
39 */ 39 */
40 struct inet_ehash_bucket { 40 struct inet_ehash_bucket {
41 struct hlist_head chain; 41 struct hlist_head chain;
42 struct hlist_head twchain; 42 struct hlist_head twchain;
43 }; 43 };
44 44
45 /* There are a few simple rules, which allow for local port reuse by 45 /* There are a few simple rules, which allow for local port reuse by
46 * an application. In essence: 46 * an application. In essence:
47 * 47 *
48 * 1) Sockets bound to different interfaces may share a local port. 48 * 1) Sockets bound to different interfaces may share a local port.
49 * Failing that, goto test 2. 49 * Failing that, goto test 2.
50 * 2) If all sockets have sk->sk_reuse set, and none of them are in 50 * 2) If all sockets have sk->sk_reuse set, and none of them are in
51 * TCP_LISTEN state, the port may be shared. 51 * TCP_LISTEN state, the port may be shared.
52 * Failing that, goto test 3. 52 * Failing that, goto test 3.
53 * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local 53 * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
54 * address, and none of them are the same, the port may be 54 * address, and none of them are the same, the port may be
55 * shared. 55 * shared.
56 * Failing this, the port cannot be shared. 56 * Failing this, the port cannot be shared.
57 * 57 *
58 * The interesting point, is test #2. This is what an FTP server does 58 * The interesting point, is test #2. This is what an FTP server does
59 * all day. To optimize this case we use a specific flag bit defined 59 * all day. To optimize this case we use a specific flag bit defined
60 * below. As we add sockets to a bind bucket list, we perform a 60 * below. As we add sockets to a bind bucket list, we perform a
61 * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) 61 * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
62 * As long as all sockets added to a bind bucket pass this test, 62 * As long as all sockets added to a bind bucket pass this test,
63 * the flag bit will be set. 63 * the flag bit will be set.
64 * The resulting situation is that tcp_v[46]_verify_bind() can just check 64 * The resulting situation is that tcp_v[46]_verify_bind() can just check
65 * for this flag bit, if it is set and the socket trying to bind has 65 * for this flag bit, if it is set and the socket trying to bind has
66 * sk->sk_reuse set, we don't even have to walk the owners list at all, 66 * sk->sk_reuse set, we don't even have to walk the owners list at all,
67 * we return that it is ok to bind this socket to the requested local port. 67 * we return that it is ok to bind this socket to the requested local port.
68 * 68 *
69 * Sounds like a lot of work, but it is worth it. In a more naive 69 * Sounds like a lot of work, but it is worth it. In a more naive
70 * implementation (ie. current FreeBSD etc.) the entire list of ports 70 * implementation (ie. current FreeBSD etc.) the entire list of ports
71 * must be walked for each data port opened by an ftp server. Needless 71 * must be walked for each data port opened by an ftp server. Needless
72 * to say, this does not scale at all. With a couple thousand FTP 72 * to say, this does not scale at all. With a couple thousand FTP
73 * users logged onto your box, isn't it nice to know that new data 73 * users logged onto your box, isn't it nice to know that new data
74 * ports are created in O(1) time? I thought so. ;-) -DaveM 74 * ports are created in O(1) time? I thought so. ;-) -DaveM
75 */ 75 */
76 struct inet_bind_bucket { 76 struct inet_bind_bucket {
77 struct net *ib_net; 77 struct net *ib_net;
78 unsigned short port; 78 unsigned short port;
79 signed short fastreuse; 79 signed short fastreuse;
80 struct hlist_node node; 80 struct hlist_node node;
81 struct hlist_head owners; 81 struct hlist_head owners;
82 }; 82 };
83 83
84 #define inet_bind_bucket_for_each(tb, node, head) \ 84 #define inet_bind_bucket_for_each(tb, node, head) \
85 hlist_for_each_entry(tb, node, head, node) 85 hlist_for_each_entry(tb, node, head, node)
86 86
87 struct inet_bind_hashbucket { 87 struct inet_bind_hashbucket {
88 spinlock_t lock; 88 spinlock_t lock;
89 struct hlist_head chain; 89 struct hlist_head chain;
90 }; 90 };
91 91
92 /* This is for listening sockets, thus all sockets which possess wildcards. */ 92 /* This is for listening sockets, thus all sockets which possess wildcards. */
93 #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ 93 #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
94 94
95 struct inet_hashinfo { 95 struct inet_hashinfo {
96 /* This is for sockets with full identity only. Sockets here will 96 /* This is for sockets with full identity only. Sockets here will
97 * always be without wildcards and will have the following invariant: 97 * always be without wildcards and will have the following invariant:
98 * 98 *
99 * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE 99 * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
100 * 100 *
101 * TIME_WAIT sockets use a separate chain (twchain). 101 * TIME_WAIT sockets use a separate chain (twchain).
102 */ 102 */
103 struct inet_ehash_bucket *ehash; 103 struct inet_ehash_bucket *ehash;
104 rwlock_t *ehash_locks; 104 rwlock_t *ehash_locks;
105 unsigned int ehash_size; 105 unsigned int ehash_size;
106 unsigned int ehash_locks_mask; 106 unsigned int ehash_locks_mask;
107 107
108 /* Ok, let's try this, I give up, we do need a local binding 108 /* Ok, let's try this, I give up, we do need a local binding
109 * TCP hash as well as the others for fast bind/connect. 109 * TCP hash as well as the others for fast bind/connect.
110 */ 110 */
111 struct inet_bind_hashbucket *bhash; 111 struct inet_bind_hashbucket *bhash;
112 112
113 unsigned int bhash_size; 113 unsigned int bhash_size;
114 /* Note : 4 bytes padding on 64 bit arches */ 114 /* Note : 4 bytes padding on 64 bit arches */
115 115
116 /* All sockets in TCP_LISTEN state will be in here. This is the only 116 /* All sockets in TCP_LISTEN state will be in here. This is the only
117 * table where wildcard'd TCP sockets can exist. Hash function here 117 * table where wildcard'd TCP sockets can exist. Hash function here
118 * is just local port number. 118 * is just local port number.
119 */ 119 */
120 struct hlist_head listening_hash[INET_LHTABLE_SIZE]; 120 struct hlist_head listening_hash[INET_LHTABLE_SIZE];
121 121
122 /* All the above members are written once at bootup and 122 /* All the above members are written once at bootup and
123 * never written again _or_ are predominantly read-access. 123 * never written again _or_ are predominantly read-access.
124 * 124 *
125 * Now align to a new cache line as all the following members 125 * Now align to a new cache line as all the following members
126 * are often dirty. 126 * are often dirty.
127 */ 127 */
128 rwlock_t lhash_lock ____cacheline_aligned; 128 rwlock_t lhash_lock ____cacheline_aligned;
129 atomic_t lhash_users; 129 atomic_t lhash_users;
130 wait_queue_head_t lhash_wait; 130 wait_queue_head_t lhash_wait;
131 struct kmem_cache *bind_bucket_cachep; 131 struct kmem_cache *bind_bucket_cachep;
132 }; 132 };
133 133
134 static inline struct inet_ehash_bucket *inet_ehash_bucket( 134 static inline struct inet_ehash_bucket *inet_ehash_bucket(
135 struct inet_hashinfo *hashinfo, 135 struct inet_hashinfo *hashinfo,
136 unsigned int hash) 136 unsigned int hash)
137 { 137 {
138 return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; 138 return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
139 } 139 }
140 140
141 static inline rwlock_t *inet_ehash_lockp( 141 static inline rwlock_t *inet_ehash_lockp(
142 struct inet_hashinfo *hashinfo, 142 struct inet_hashinfo *hashinfo,
143 unsigned int hash) 143 unsigned int hash)
144 { 144 {
145 return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; 145 return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
146 } 146 }
147 147
148 static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 148 static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
149 { 149 {
150 unsigned int i, size = 256; 150 unsigned int i, size = 256;
151 #if defined(CONFIG_PROVE_LOCKING) 151 #if defined(CONFIG_PROVE_LOCKING)
152 unsigned int nr_pcpus = 2; 152 unsigned int nr_pcpus = 2;
153 #else 153 #else
154 unsigned int nr_pcpus = num_possible_cpus(); 154 unsigned int nr_pcpus = num_possible_cpus();
155 #endif 155 #endif
156 if (nr_pcpus >= 4) 156 if (nr_pcpus >= 4)
157 size = 512; 157 size = 512;
158 if (nr_pcpus >= 8) 158 if (nr_pcpus >= 8)
159 size = 1024; 159 size = 1024;
160 if (nr_pcpus >= 16) 160 if (nr_pcpus >= 16)
161 size = 2048; 161 size = 2048;
162 if (nr_pcpus >= 32) 162 if (nr_pcpus >= 32)
163 size = 4096; 163 size = 4096;
164 if (sizeof(rwlock_t) != 0) { 164 if (sizeof(rwlock_t) != 0) {
165 #ifdef CONFIG_NUMA 165 #ifdef CONFIG_NUMA
166 if (size * sizeof(rwlock_t) > PAGE_SIZE) 166 if (size * sizeof(rwlock_t) > PAGE_SIZE)
167 hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t)); 167 hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t));
168 else 168 else
169 #endif 169 #endif
170 hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t), 170 hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t),
171 GFP_KERNEL); 171 GFP_KERNEL);
172 if (!hashinfo->ehash_locks) 172 if (!hashinfo->ehash_locks)
173 return ENOMEM; 173 return ENOMEM;
174 for (i = 0; i < size; i++) 174 for (i = 0; i < size; i++)
175 rwlock_init(&hashinfo->ehash_locks[i]); 175 rwlock_init(&hashinfo->ehash_locks[i]);
176 } 176 }
177 hashinfo->ehash_locks_mask = size - 1; 177 hashinfo->ehash_locks_mask = size - 1;
178 return 0; 178 return 0;
179 } 179 }
180 180
181 static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) 181 static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
182 { 182 {
183 if (hashinfo->ehash_locks) { 183 if (hashinfo->ehash_locks) {
184 #ifdef CONFIG_NUMA 184 #ifdef CONFIG_NUMA
185 unsigned int size = (hashinfo->ehash_locks_mask + 1) * 185 unsigned int size = (hashinfo->ehash_locks_mask + 1) *
186 sizeof(rwlock_t); 186 sizeof(rwlock_t);
187 if (size > PAGE_SIZE) 187 if (size > PAGE_SIZE)
188 vfree(hashinfo->ehash_locks); 188 vfree(hashinfo->ehash_locks);
189 else 189 else
190 #endif 190 #endif
191 kfree(hashinfo->ehash_locks); 191 kfree(hashinfo->ehash_locks);
192 hashinfo->ehash_locks = NULL; 192 hashinfo->ehash_locks = NULL;
193 } 193 }
194 } 194 }
195 195
196 extern struct inet_bind_bucket * 196 extern struct inet_bind_bucket *
197 inet_bind_bucket_create(struct kmem_cache *cachep, 197 inet_bind_bucket_create(struct kmem_cache *cachep,
198 struct net *net, 198 struct net *net,
199 struct inet_bind_hashbucket *head, 199 struct inet_bind_hashbucket *head,
200 const unsigned short snum); 200 const unsigned short snum);
201 extern void inet_bind_bucket_destroy(struct kmem_cache *cachep, 201 extern void inet_bind_bucket_destroy(struct kmem_cache *cachep,
202 struct inet_bind_bucket *tb); 202 struct inet_bind_bucket *tb);
203 203
204 static inline int inet_bhashfn(const __u16 lport, const int bhash_size) 204 static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
205 { 205 {
206 return lport & (bhash_size - 1); 206 return lport & (bhash_size - 1);
207 } 207 }
208 208
209 extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 209 extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
210 const unsigned short snum); 210 const unsigned short snum);
211 211
212 /* These can have wildcards, don't try too hard. */ 212 /* These can have wildcards, don't try too hard. */
213 static inline int inet_lhashfn(const unsigned short num) 213 static inline int inet_lhashfn(const unsigned short num)
214 { 214 {
215 return num & (INET_LHTABLE_SIZE - 1); 215 return num & (INET_LHTABLE_SIZE - 1);
216 } 216 }
217 217
218 static inline int inet_sk_listen_hashfn(const struct sock *sk) 218 static inline int inet_sk_listen_hashfn(const struct sock *sk)
219 { 219 {
220 return inet_lhashfn(inet_sk(sk)->num); 220 return inet_lhashfn(inet_sk(sk)->num);
221 } 221 }
222 222
223 /* Caller must disable local BH processing. */ 223 /* Caller must disable local BH processing. */
224 static inline void __inet_inherit_port(struct sock *sk, struct sock *child) 224 extern void __inet_inherit_port(struct sock *sk, struct sock *child);
225 {
226 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
227 const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size);
228 struct inet_bind_hashbucket *head = &table->bhash[bhash];
229 struct inet_bind_bucket *tb;
230
231 spin_lock(&head->lock);
232 tb = inet_csk(sk)->icsk_bind_hash;
233 sk_add_bind_node(child, &tb->owners);
234 inet_csk(child)->icsk_bind_hash = tb;
235 spin_unlock(&head->lock);
236 }
237 225
238 extern void inet_put_port(struct sock *sk); 226 extern void inet_put_port(struct sock *sk);
239 227
240 extern void inet_listen_wlock(struct inet_hashinfo *hashinfo); 228 extern void inet_listen_wlock(struct inet_hashinfo *hashinfo);
241 229
242 /* 230 /*
243 * - We may sleep inside this lock. 231 * - We may sleep inside this lock.
244 * - If sleeping is not required (or called from BH), 232 * - If sleeping is not required (or called from BH),
245 * use plain read_(un)lock(&inet_hashinfo.lhash_lock). 233 * use plain read_(un)lock(&inet_hashinfo.lhash_lock).
246 */ 234 */
247 static inline void inet_listen_lock(struct inet_hashinfo *hashinfo) 235 static inline void inet_listen_lock(struct inet_hashinfo *hashinfo)
248 { 236 {
249 /* read_lock synchronizes to candidates to writers */ 237 /* read_lock synchronizes to candidates to writers */
250 read_lock(&hashinfo->lhash_lock); 238 read_lock(&hashinfo->lhash_lock);
251 atomic_inc(&hashinfo->lhash_users); 239 atomic_inc(&hashinfo->lhash_users);
252 read_unlock(&hashinfo->lhash_lock); 240 read_unlock(&hashinfo->lhash_lock);
253 } 241 }
254 242
255 static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo) 243 static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo)
256 { 244 {
257 if (atomic_dec_and_test(&hashinfo->lhash_users)) 245 if (atomic_dec_and_test(&hashinfo->lhash_users))
258 wake_up(&hashinfo->lhash_wait); 246 wake_up(&hashinfo->lhash_wait);
259 } 247 }
260 248
261 extern void __inet_hash_nolisten(struct sock *sk); 249 extern void __inet_hash_nolisten(struct sock *sk);
262 extern void inet_hash(struct sock *sk); 250 extern void inet_hash(struct sock *sk);
263 extern void inet_unhash(struct sock *sk); 251 extern void inet_unhash(struct sock *sk);
264 252
265 extern struct sock *__inet_lookup_listener(struct net *net, 253 extern struct sock *__inet_lookup_listener(struct net *net,
266 struct inet_hashinfo *hashinfo, 254 struct inet_hashinfo *hashinfo,
267 const __be32 daddr, 255 const __be32 daddr,
268 const unsigned short hnum, 256 const unsigned short hnum,
269 const int dif); 257 const int dif);
270 258
271 static inline struct sock *inet_lookup_listener(struct net *net, 259 static inline struct sock *inet_lookup_listener(struct net *net,
272 struct inet_hashinfo *hashinfo, 260 struct inet_hashinfo *hashinfo,
273 __be32 daddr, __be16 dport, int dif) 261 __be32 daddr, __be16 dport, int dif)
274 { 262 {
275 return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif); 263 return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
276 } 264 }
277 265
278 /* Socket demux engine toys. */ 266 /* Socket demux engine toys. */
279 /* What happens here is ugly; there's a pair of adjacent fields in 267 /* What happens here is ugly; there's a pair of adjacent fields in
280 struct inet_sock; __be16 dport followed by __u16 num. We want to 268 struct inet_sock; __be16 dport followed by __u16 num. We want to
281 search by pair, so we combine the keys into a single 32bit value 269 search by pair, so we combine the keys into a single 32bit value
282 and compare with 32bit value read from &...->dport. Let's at least 270 and compare with 32bit value read from &...->dport. Let's at least
283 make sure that it's not mixed with anything else... 271 make sure that it's not mixed with anything else...
284 On 64bit targets we combine comparisons with pair of adjacent __be32 272 On 64bit targets we combine comparisons with pair of adjacent __be32
285 fields in the same way. 273 fields in the same way.
286 */ 274 */
287 typedef __u32 __bitwise __portpair; 275 typedef __u32 __bitwise __portpair;
288 #ifdef __BIG_ENDIAN 276 #ifdef __BIG_ENDIAN
289 #define INET_COMBINED_PORTS(__sport, __dport) \ 277 #define INET_COMBINED_PORTS(__sport, __dport) \
290 ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) 278 ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
291 #else /* __LITTLE_ENDIAN */ 279 #else /* __LITTLE_ENDIAN */
292 #define INET_COMBINED_PORTS(__sport, __dport) \ 280 #define INET_COMBINED_PORTS(__sport, __dport) \
293 ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) 281 ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
294 #endif 282 #endif
295 283
296 #if (BITS_PER_LONG == 64) 284 #if (BITS_PER_LONG == 64)
297 typedef __u64 __bitwise __addrpair; 285 typedef __u64 __bitwise __addrpair;
298 #ifdef __BIG_ENDIAN 286 #ifdef __BIG_ENDIAN
299 #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ 287 #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
300 const __addrpair __name = (__force __addrpair) ( \ 288 const __addrpair __name = (__force __addrpair) ( \
301 (((__force __u64)(__be32)(__saddr)) << 32) | \ 289 (((__force __u64)(__be32)(__saddr)) << 32) | \
302 ((__force __u64)(__be32)(__daddr))); 290 ((__force __u64)(__be32)(__daddr)));
303 #else /* __LITTLE_ENDIAN */ 291 #else /* __LITTLE_ENDIAN */
304 #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ 292 #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
305 const __addrpair __name = (__force __addrpair) ( \ 293 const __addrpair __name = (__force __addrpair) ( \
306 (((__force __u64)(__be32)(__daddr)) << 32) | \ 294 (((__force __u64)(__be32)(__daddr)) << 32) | \
307 ((__force __u64)(__be32)(__saddr))); 295 ((__force __u64)(__be32)(__saddr)));
308 #endif /* __BIG_ENDIAN */ 296 #endif /* __BIG_ENDIAN */
309 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ 297 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
310 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ 298 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \
311 ((*((__addrpair *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ 299 ((*((__addrpair *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \
312 ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ 300 ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \
313 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) 301 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
314 #define INET_TW_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ 302 #define INET_TW_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
315 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ 303 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \
316 ((*((__addrpair *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ 304 ((*((__addrpair *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \
317 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ 305 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \
318 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) 306 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
319 #else /* 32-bit arch */ 307 #else /* 32-bit arch */
320 #define INET_ADDR_COOKIE(__name, __saddr, __daddr) 308 #define INET_ADDR_COOKIE(__name, __saddr, __daddr)
321 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ 309 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif) \
322 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ 310 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \
323 (inet_sk(__sk)->daddr == (__saddr)) && \ 311 (inet_sk(__sk)->daddr == (__saddr)) && \
324 (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ 312 (inet_sk(__sk)->rcv_saddr == (__daddr)) && \
325 ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ 313 ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \
326 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) 314 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
327 #define INET_TW_MATCH(__sk, __net, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ 315 #define INET_TW_MATCH(__sk, __net, __hash,__cookie, __saddr, __daddr, __ports, __dif) \
328 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ 316 (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \
329 (inet_twsk(__sk)->tw_daddr == (__saddr)) && \ 317 (inet_twsk(__sk)->tw_daddr == (__saddr)) && \
330 (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ 318 (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \
331 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ 319 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \
332 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) 320 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
333 #endif /* 64-bit arch */ 321 #endif /* 64-bit arch */
334 322
335 /* 323 /*
336 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need 324 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
337 * not check it for lookups anymore, thanks Alexey. -DaveM 325 * not check it for lookups anymore, thanks Alexey. -DaveM
338 * 326 *
339 * Local BH must be disabled here. 327 * Local BH must be disabled here.
340 */ 328 */
341 extern struct sock * __inet_lookup_established(struct net *net, 329 extern struct sock * __inet_lookup_established(struct net *net,
342 struct inet_hashinfo *hashinfo, 330 struct inet_hashinfo *hashinfo,
343 const __be32 saddr, const __be16 sport, 331 const __be32 saddr, const __be16 sport,
344 const __be32 daddr, const u16 hnum, const int dif); 332 const __be32 daddr, const u16 hnum, const int dif);
345 333
346 static inline struct sock * 334 static inline struct sock *
347 inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, 335 inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
348 const __be32 saddr, const __be16 sport, 336 const __be32 saddr, const __be16 sport,
349 const __be32 daddr, const __be16 dport, 337 const __be32 daddr, const __be16 dport,
350 const int dif) 338 const int dif)
351 { 339 {
352 return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, 340 return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
353 ntohs(dport), dif); 341 ntohs(dport), dif);
354 } 342 }
355 343
356 static inline struct sock *__inet_lookup(struct net *net, 344 static inline struct sock *__inet_lookup(struct net *net,
357 struct inet_hashinfo *hashinfo, 345 struct inet_hashinfo *hashinfo,
358 const __be32 saddr, const __be16 sport, 346 const __be32 saddr, const __be16 sport,
359 const __be32 daddr, const __be16 dport, 347 const __be32 daddr, const __be16 dport,
360 const int dif) 348 const int dif)
361 { 349 {
362 u16 hnum = ntohs(dport); 350 u16 hnum = ntohs(dport);
363 struct sock *sk = __inet_lookup_established(net, hashinfo, 351 struct sock *sk = __inet_lookup_established(net, hashinfo,
364 saddr, sport, daddr, hnum, dif); 352 saddr, sport, daddr, hnum, dif);
365 353
366 return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif); 354 return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
367 } 355 }
368 356
369 static inline struct sock *inet_lookup(struct net *net, 357 static inline struct sock *inet_lookup(struct net *net,
370 struct inet_hashinfo *hashinfo, 358 struct inet_hashinfo *hashinfo,
371 const __be32 saddr, const __be16 sport, 359 const __be32 saddr, const __be16 sport,
372 const __be32 daddr, const __be16 dport, 360 const __be32 daddr, const __be16 dport,
373 const int dif) 361 const int dif)
374 { 362 {
375 struct sock *sk; 363 struct sock *sk;
376 364
377 local_bh_disable(); 365 local_bh_disable();
378 sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif); 366 sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif);
379 local_bh_enable(); 367 local_bh_enable();
380 368
381 return sk; 369 return sk;
382 } 370 }
383 371
384 extern int __inet_hash_connect(struct inet_timewait_death_row *death_row, 372 extern int __inet_hash_connect(struct inet_timewait_death_row *death_row,
385 struct sock *sk, u32 port_offset, 373 struct sock *sk, u32 port_offset,
386 int (*check_established)(struct inet_timewait_death_row *, 374 int (*check_established)(struct inet_timewait_death_row *,
387 struct sock *, __u16, struct inet_timewait_sock **), 375 struct sock *, __u16, struct inet_timewait_sock **),
388 void (*hash)(struct sock *sk)); 376 void (*hash)(struct sock *sk));
389 extern int inet_hash_connect(struct inet_timewait_death_row *death_row, 377 extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
390 struct sock *sk); 378 struct sock *sk);
391 #endif /* _INET_HASHTABLES_H */ 379 #endif /* _INET_HASHTABLES_H */
392 380
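Aside: the inet_ehash_locks_alloc() helper in this header sizes the lock table by possible CPU count, always to a power of two, so that inet_ehash_lockp() can pick a lock with a simple mask instead of a modulo. A self-contained sketch of just the sizing ladder (assumption: plain user-space C rewrite for illustration, not kernel code):

#include <stdio.h>

/* Mirror of the sizing ladder in inet_ehash_locks_alloc(): 256 locks by
 * default, growing with the possible CPU count, capped at 4096. */
static unsigned int ehash_locks_size(unsigned int nr_pcpus)
{
        unsigned int size = 256;

        if (nr_pcpus >= 4)
                size = 512;
        if (nr_pcpus >= 8)
                size = 1024;
        if (nr_pcpus >= 16)
                size = 2048;
        if (nr_pcpus >= 32)
                size = 4096;
        return size;
}

int main(void)
{
        unsigned int cpus[] = { 1, 2, 4, 8, 16, 32, 64 };
        unsigned int i;

        for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
                printf("%2u cpus -> %4u locks, mask 0x%03x\n", cpus[i],
                       ehash_locks_size(cpus[i]),
                       ehash_locks_size(cpus[i]) - 1);
        return 0;
}

Because size - 1 is stored as ehash_locks_mask, the power-of-two sizing is what lets the per-bucket lock lookup stay a single AND.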
net/ipv4/inet_hashtables.c
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Generic INET transport hashtables 6 * Generic INET transport hashtables
7 * 7 *
8 * Authors: Lotsa people, from code originally in tcp 8 * Authors: Lotsa people, from code originally in tcp
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version 12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version. 13 * 2 of the License, or (at your option) any later version.
14 */ 14 */
15 15
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/random.h> 17 #include <linux/random.h>
18 #include <linux/sched.h> 18 #include <linux/sched.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include <linux/wait.h> 20 #include <linux/wait.h>
21 21
22 #include <net/inet_connection_sock.h> 22 #include <net/inet_connection_sock.h>
23 #include <net/inet_hashtables.h> 23 #include <net/inet_hashtables.h>
24 #include <net/ip.h> 24 #include <net/ip.h>
25 25
26 /* 26 /*
27 * Allocate and initialize a new local port bind bucket. 27 * Allocate and initialize a new local port bind bucket.
28 * The bindhash mutex for snum's hash chain must be held here. 28 * The bindhash mutex for snum's hash chain must be held here.
29 */ 29 */
30 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 30 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
31 struct net *net, 31 struct net *net,
32 struct inet_bind_hashbucket *head, 32 struct inet_bind_hashbucket *head,
33 const unsigned short snum) 33 const unsigned short snum)
34 { 34 {
35 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 35 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
36 36
37 if (tb != NULL) { 37 if (tb != NULL) {
38 tb->ib_net = hold_net(net); 38 tb->ib_net = hold_net(net);
39 tb->port = snum; 39 tb->port = snum;
40 tb->fastreuse = 0; 40 tb->fastreuse = 0;
41 INIT_HLIST_HEAD(&tb->owners); 41 INIT_HLIST_HEAD(&tb->owners);
42 hlist_add_head(&tb->node, &head->chain); 42 hlist_add_head(&tb->node, &head->chain);
43 } 43 }
44 return tb; 44 return tb;
45 } 45 }
46 46
47 /* 47 /*
48 * Caller must hold hashbucket lock for this tb with local BH disabled 48 * Caller must hold hashbucket lock for this tb with local BH disabled
49 */ 49 */
50 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) 50 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
51 { 51 {
52 if (hlist_empty(&tb->owners)) { 52 if (hlist_empty(&tb->owners)) {
53 __hlist_del(&tb->node); 53 __hlist_del(&tb->node);
54 release_net(tb->ib_net); 54 release_net(tb->ib_net);
55 kmem_cache_free(cachep, tb); 55 kmem_cache_free(cachep, tb);
56 } 56 }
57 } 57 }
58 58
59 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 59 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
60 const unsigned short snum) 60 const unsigned short snum)
61 { 61 {
62 inet_sk(sk)->num = snum; 62 inet_sk(sk)->num = snum;
63 sk_add_bind_node(sk, &tb->owners); 63 sk_add_bind_node(sk, &tb->owners);
64 inet_csk(sk)->icsk_bind_hash = tb; 64 inet_csk(sk)->icsk_bind_hash = tb;
65 } 65 }
66 66
67 /* 67 /*
68 * Get rid of any references to a local port held by the given sock. 68 * Get rid of any references to a local port held by the given sock.
69 */ 69 */
70 static void __inet_put_port(struct sock *sk) 70 static void __inet_put_port(struct sock *sk)
71 { 71 {
72 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 72 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
73 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); 73 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
74 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 74 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
75 struct inet_bind_bucket *tb; 75 struct inet_bind_bucket *tb;
76 76
77 spin_lock(&head->lock); 77 spin_lock(&head->lock);
78 tb = inet_csk(sk)->icsk_bind_hash; 78 tb = inet_csk(sk)->icsk_bind_hash;
79 __sk_del_bind_node(sk); 79 __sk_del_bind_node(sk);
80 inet_csk(sk)->icsk_bind_hash = NULL; 80 inet_csk(sk)->icsk_bind_hash = NULL;
81 inet_sk(sk)->num = 0; 81 inet_sk(sk)->num = 0;
82 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 82 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
83 spin_unlock(&head->lock); 83 spin_unlock(&head->lock);
84 } 84 }
85 85
86 void inet_put_port(struct sock *sk) 86 void inet_put_port(struct sock *sk)
87 { 87 {
88 local_bh_disable(); 88 local_bh_disable();
89 __inet_put_port(sk); 89 __inet_put_port(sk);
90 local_bh_enable(); 90 local_bh_enable();
91 } 91 }
92 92
93 EXPORT_SYMBOL(inet_put_port); 93 EXPORT_SYMBOL(inet_put_port);
94 94
95 void __inet_inherit_port(struct sock *sk, struct sock *child)
96 {
97 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
98 const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size);
99 struct inet_bind_hashbucket *head = &table->bhash[bhash];
100 struct inet_bind_bucket *tb;
101
102 spin_lock(&head->lock);
103 tb = inet_csk(sk)->icsk_bind_hash;
104 sk_add_bind_node(child, &tb->owners);
105 inet_csk(child)->icsk_bind_hash = tb;
106 spin_unlock(&head->lock);
107 }
108
109 EXPORT_SYMBOL_GPL(__inet_inherit_port);
110
95 /* 111 /*
96 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. 112 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
97 * Look, when several writers sleep and reader wakes them up, all but one 113 * Look, when several writers sleep and reader wakes them up, all but one
98 * immediately hit write lock and grab all the cpus. Exclusive sleep solves 114 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
99 * this, _but_ remember, it adds useless work on UP machines (wake up each 115 * this, _but_ remember, it adds useless work on UP machines (wake up each
100 * exclusive lock release). It should be ifdefed really. 116 * exclusive lock release). It should be ifdefed really.
101 */ 117 */
102 void inet_listen_wlock(struct inet_hashinfo *hashinfo) 118 void inet_listen_wlock(struct inet_hashinfo *hashinfo)
103 __acquires(hashinfo->lhash_lock) 119 __acquires(hashinfo->lhash_lock)
104 { 120 {
105 write_lock(&hashinfo->lhash_lock); 121 write_lock(&hashinfo->lhash_lock);
106 122
107 if (atomic_read(&hashinfo->lhash_users)) { 123 if (atomic_read(&hashinfo->lhash_users)) {
108 DEFINE_WAIT(wait); 124 DEFINE_WAIT(wait);
109 125
110 for (;;) { 126 for (;;) {
111 prepare_to_wait_exclusive(&hashinfo->lhash_wait, 127 prepare_to_wait_exclusive(&hashinfo->lhash_wait,
112 &wait, TASK_UNINTERRUPTIBLE); 128 &wait, TASK_UNINTERRUPTIBLE);
113 if (!atomic_read(&hashinfo->lhash_users)) 129 if (!atomic_read(&hashinfo->lhash_users))
114 break; 130 break;
115 write_unlock_bh(&hashinfo->lhash_lock); 131 write_unlock_bh(&hashinfo->lhash_lock);
116 schedule(); 132 schedule();
117 write_lock_bh(&hashinfo->lhash_lock); 133 write_lock_bh(&hashinfo->lhash_lock);
118 } 134 }
119 135
120 finish_wait(&hashinfo->lhash_wait, &wait); 136 finish_wait(&hashinfo->lhash_wait, &wait);
121 } 137 }
122 } 138 }
123 139
124 /* 140 /*
125 * Don't inline this cruft. Here are some nice properties to exploit here. The 141 * Don't inline this cruft. Here are some nice properties to exploit here. The
126 * BSD API does not allow a listening sock to specify the remote port nor the 142 * BSD API does not allow a listening sock to specify the remote port nor the
127 * remote address for the connection. So always assume those are both 143 * remote address for the connection. So always assume those are both
128 * wildcarded during the search since they can never be otherwise. 144 * wildcarded during the search since they can never be otherwise.
129 */ 145 */
130 static struct sock *inet_lookup_listener_slow(struct net *net, 146 static struct sock *inet_lookup_listener_slow(struct net *net,
131 const struct hlist_head *head, 147 const struct hlist_head *head,
132 const __be32 daddr, 148 const __be32 daddr,
133 const unsigned short hnum, 149 const unsigned short hnum,
134 const int dif) 150 const int dif)
135 { 151 {
136 struct sock *result = NULL, *sk; 152 struct sock *result = NULL, *sk;
137 const struct hlist_node *node; 153 const struct hlist_node *node;
138 int hiscore = -1; 154 int hiscore = -1;
139 155
140 sk_for_each(sk, node, head) { 156 sk_for_each(sk, node, head) {
141 const struct inet_sock *inet = inet_sk(sk); 157 const struct inet_sock *inet = inet_sk(sk);
142 158
143 if (net_eq(sock_net(sk), net) && inet->num == hnum && 159 if (net_eq(sock_net(sk), net) && inet->num == hnum &&
144 !ipv6_only_sock(sk)) { 160 !ipv6_only_sock(sk)) {
145 const __be32 rcv_saddr = inet->rcv_saddr; 161 const __be32 rcv_saddr = inet->rcv_saddr;
146 int score = sk->sk_family == PF_INET ? 1 : 0; 162 int score = sk->sk_family == PF_INET ? 1 : 0;
147 163
148 if (rcv_saddr) { 164 if (rcv_saddr) {
149 if (rcv_saddr != daddr) 165 if (rcv_saddr != daddr)
150 continue; 166 continue;
151 score += 2; 167 score += 2;
152 } 168 }
153 if (sk->sk_bound_dev_if) { 169 if (sk->sk_bound_dev_if) {
154 if (sk->sk_bound_dev_if != dif) 170 if (sk->sk_bound_dev_if != dif)
155 continue; 171 continue;
156 score += 2; 172 score += 2;
157 } 173 }
158 if (score == 5) 174 if (score == 5)
159 return sk; 175 return sk;
160 if (score > hiscore) { 176 if (score > hiscore) {
161 hiscore = score; 177 hiscore = score;
162 result = sk; 178 result = sk;
163 } 179 }
164 } 180 }
165 } 181 }
166 return result; 182 return result;
167 } 183 }
168 184
169 /* Optimize the common listener case. */ 185 /* Optimize the common listener case. */
170 struct sock *__inet_lookup_listener(struct net *net, 186 struct sock *__inet_lookup_listener(struct net *net,
171 struct inet_hashinfo *hashinfo, 187 struct inet_hashinfo *hashinfo,
172 const __be32 daddr, const unsigned short hnum, 188 const __be32 daddr, const unsigned short hnum,
173 const int dif) 189 const int dif)
174 { 190 {
175 struct sock *sk = NULL; 191 struct sock *sk = NULL;
176 const struct hlist_head *head; 192 const struct hlist_head *head;
177 193
178 read_lock(&hashinfo->lhash_lock); 194 read_lock(&hashinfo->lhash_lock);
179 head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; 195 head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
180 if (!hlist_empty(head)) { 196 if (!hlist_empty(head)) {
181 const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); 197 const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
182 198
183 if (inet->num == hnum && !sk->sk_node.next && 199 if (inet->num == hnum && !sk->sk_node.next &&
184 (!inet->rcv_saddr || inet->rcv_saddr == daddr) && 200 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
185 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && 201 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
186 !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) 202 !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
187 goto sherry_cache; 203 goto sherry_cache;
188 sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif); 204 sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
189 } 205 }
190 if (sk) { 206 if (sk) {
191 sherry_cache: 207 sherry_cache:
192 sock_hold(sk); 208 sock_hold(sk);
193 } 209 }
194 read_unlock(&hashinfo->lhash_lock); 210 read_unlock(&hashinfo->lhash_lock);
195 return sk; 211 return sk;
196 } 212 }
197 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 213 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
198 214
199 struct sock * __inet_lookup_established(struct net *net, 215 struct sock * __inet_lookup_established(struct net *net,
200 struct inet_hashinfo *hashinfo, 216 struct inet_hashinfo *hashinfo,
201 const __be32 saddr, const __be16 sport, 217 const __be32 saddr, const __be16 sport,
202 const __be32 daddr, const u16 hnum, 218 const __be32 daddr, const u16 hnum,
203 const int dif) 219 const int dif)
204 { 220 {
205 INET_ADDR_COOKIE(acookie, saddr, daddr) 221 INET_ADDR_COOKIE(acookie, saddr, daddr)
206 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 222 const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
207 struct sock *sk; 223 struct sock *sk;
208 const struct hlist_node *node; 224 const struct hlist_node *node;
209 /* Optimize here for direct hit, only listening connections can 225 /* Optimize here for direct hit, only listening connections can
210 * have wildcards anyways. 226 * have wildcards anyways.
211 */ 227 */
212 unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); 228 unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
213 struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); 229 struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
214 rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); 230 rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
215 231
216 prefetch(head->chain.first); 232 prefetch(head->chain.first);
217 read_lock(lock); 233 read_lock(lock);
218 sk_for_each(sk, node, &head->chain) { 234 sk_for_each(sk, node, &head->chain) {
219 if (INET_MATCH(sk, net, hash, acookie, 235 if (INET_MATCH(sk, net, hash, acookie,
220 saddr, daddr, ports, dif)) 236 saddr, daddr, ports, dif))
221 goto hit; /* You sunk my battleship! */ 237 goto hit; /* You sunk my battleship! */
222 } 238 }
223 239
224 /* Must check for a TIME_WAIT'er before going to listener hash. */ 240 /* Must check for a TIME_WAIT'er before going to listener hash. */
225 sk_for_each(sk, node, &head->twchain) { 241 sk_for_each(sk, node, &head->twchain) {
226 if (INET_TW_MATCH(sk, net, hash, acookie, 242 if (INET_TW_MATCH(sk, net, hash, acookie,
227 saddr, daddr, ports, dif)) 243 saddr, daddr, ports, dif))
228 goto hit; 244 goto hit;
229 } 245 }
230 sk = NULL; 246 sk = NULL;
231 out: 247 out:
232 read_unlock(lock); 248 read_unlock(lock);
233 return sk; 249 return sk;
234 hit: 250 hit:
235 sock_hold(sk); 251 sock_hold(sk);
236 goto out; 252 goto out;
237 } 253 }
238 EXPORT_SYMBOL_GPL(__inet_lookup_established); 254 EXPORT_SYMBOL_GPL(__inet_lookup_established);
239 255
240 /* called with local bh disabled */ 256 /* called with local bh disabled */
241 static int __inet_check_established(struct inet_timewait_death_row *death_row, 257 static int __inet_check_established(struct inet_timewait_death_row *death_row,
242 struct sock *sk, __u16 lport, 258 struct sock *sk, __u16 lport,
243 struct inet_timewait_sock **twp) 259 struct inet_timewait_sock **twp)
244 { 260 {
245 struct inet_hashinfo *hinfo = death_row->hashinfo; 261 struct inet_hashinfo *hinfo = death_row->hashinfo;
246 struct inet_sock *inet = inet_sk(sk); 262 struct inet_sock *inet = inet_sk(sk);
247 __be32 daddr = inet->rcv_saddr; 263 __be32 daddr = inet->rcv_saddr;
248 __be32 saddr = inet->daddr; 264 __be32 saddr = inet->daddr;
249 int dif = sk->sk_bound_dev_if; 265 int dif = sk->sk_bound_dev_if;
250 INET_ADDR_COOKIE(acookie, saddr, daddr) 266 INET_ADDR_COOKIE(acookie, saddr, daddr)
251 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); 267 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
252 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); 268 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
253 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 269 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
254 rwlock_t *lock = inet_ehash_lockp(hinfo, hash); 270 rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
255 struct sock *sk2; 271 struct sock *sk2;
256 const struct hlist_node *node; 272 const struct hlist_node *node;
257 struct inet_timewait_sock *tw; 273 struct inet_timewait_sock *tw;
258 struct net *net = sock_net(sk); 274 struct net *net = sock_net(sk);
259 275
260 prefetch(head->chain.first); 276 prefetch(head->chain.first);
261 write_lock(lock); 277 write_lock(lock);
262 278
263 /* Check TIME-WAIT sockets first. */ 279 /* Check TIME-WAIT sockets first. */
264 sk_for_each(sk2, node, &head->twchain) { 280 sk_for_each(sk2, node, &head->twchain) {
265 tw = inet_twsk(sk2); 281 tw = inet_twsk(sk2);
266 282
267 if (INET_TW_MATCH(sk2, net, hash, acookie, 283 if (INET_TW_MATCH(sk2, net, hash, acookie,
268 saddr, daddr, ports, dif)) { 284 saddr, daddr, ports, dif)) {
269 if (twsk_unique(sk, sk2, twp)) 285 if (twsk_unique(sk, sk2, twp))
270 goto unique; 286 goto unique;
271 else 287 else
272 goto not_unique; 288 goto not_unique;
273 } 289 }
274 } 290 }
275 tw = NULL; 291 tw = NULL;
276 292
277 /* And established part... */ 293 /* And established part... */
278 sk_for_each(sk2, node, &head->chain) { 294 sk_for_each(sk2, node, &head->chain) {
279 if (INET_MATCH(sk2, net, hash, acookie, 295 if (INET_MATCH(sk2, net, hash, acookie,
280 saddr, daddr, ports, dif)) 296 saddr, daddr, ports, dif))
281 goto not_unique; 297 goto not_unique;
282 } 298 }
283 299
284 unique: 300 unique:
285 /* Must record num and sport now. Otherwise we will see 301 /* Must record num and sport now. Otherwise we will see
286 * in hash table socket with a funny identity. */ 302 * in hash table socket with a funny identity. */
287 inet->num = lport; 303 inet->num = lport;
288 inet->sport = htons(lport); 304 inet->sport = htons(lport);
289 sk->sk_hash = hash; 305 sk->sk_hash = hash;
290 BUG_TRAP(sk_unhashed(sk)); 306 BUG_TRAP(sk_unhashed(sk));
291 __sk_add_node(sk, &head->chain); 307 __sk_add_node(sk, &head->chain);
292 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 308 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
293 write_unlock(lock); 309 write_unlock(lock);
294 310
295 if (twp) { 311 if (twp) {
296 *twp = tw; 312 *twp = tw;
297 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 313 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
298 } else if (tw) { 314 } else if (tw) {
299 /* Silly. Should hash-dance instead... */ 315 /* Silly. Should hash-dance instead... */
300 inet_twsk_deschedule(tw, death_row); 316 inet_twsk_deschedule(tw, death_row);
301 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 317 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
302 318
303 inet_twsk_put(tw); 319 inet_twsk_put(tw);
304 } 320 }
305 321
306 return 0; 322 return 0;
307 323
308 not_unique: 324 not_unique:
309 write_unlock(lock); 325 write_unlock(lock);
310 return -EADDRNOTAVAIL; 326 return -EADDRNOTAVAIL;
311 } 327 }
312 328
313 static inline u32 inet_sk_port_offset(const struct sock *sk) 329 static inline u32 inet_sk_port_offset(const struct sock *sk)
314 { 330 {
315 const struct inet_sock *inet = inet_sk(sk); 331 const struct inet_sock *inet = inet_sk(sk);
316 return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, 332 return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
317 inet->dport); 333 inet->dport);
318 } 334 }
319 335
320 void __inet_hash_nolisten(struct sock *sk) 336 void __inet_hash_nolisten(struct sock *sk)
321 { 337 {
322 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 338 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
323 struct hlist_head *list; 339 struct hlist_head *list;
324 rwlock_t *lock; 340 rwlock_t *lock;
325 struct inet_ehash_bucket *head; 341 struct inet_ehash_bucket *head;
326 342
327 BUG_TRAP(sk_unhashed(sk)); 343 BUG_TRAP(sk_unhashed(sk));
328 344
329 sk->sk_hash = inet_sk_ehashfn(sk); 345 sk->sk_hash = inet_sk_ehashfn(sk);
330 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 346 head = inet_ehash_bucket(hashinfo, sk->sk_hash);
331 list = &head->chain; 347 list = &head->chain;
332 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 348 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
333 349
334 write_lock(lock); 350 write_lock(lock);
335 __sk_add_node(sk, list); 351 __sk_add_node(sk, list);
336 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 352 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
337 write_unlock(lock); 353 write_unlock(lock);
338 } 354 }
339 EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 355 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
340 356
341 static void __inet_hash(struct sock *sk) 357 static void __inet_hash(struct sock *sk)
342 { 358 {
343 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 359 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
344 struct hlist_head *list; 360 struct hlist_head *list;
345 rwlock_t *lock; 361 rwlock_t *lock;
346 362
347 if (sk->sk_state != TCP_LISTEN) { 363 if (sk->sk_state != TCP_LISTEN) {
348 __inet_hash_nolisten(sk); 364 __inet_hash_nolisten(sk);
349 return; 365 return;
350 } 366 }
351 367
352 BUG_TRAP(sk_unhashed(sk)); 368 BUG_TRAP(sk_unhashed(sk));
353 list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 369 list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
354 lock = &hashinfo->lhash_lock; 370 lock = &hashinfo->lhash_lock;
355 371
356 inet_listen_wlock(hashinfo); 372 inet_listen_wlock(hashinfo);
357 __sk_add_node(sk, list); 373 __sk_add_node(sk, list);
358 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 374 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
359 write_unlock(lock); 375 write_unlock(lock);
360 wake_up(&hashinfo->lhash_wait); 376 wake_up(&hashinfo->lhash_wait);
361 } 377 }
362 378
363 void inet_hash(struct sock *sk) 379 void inet_hash(struct sock *sk)
364 { 380 {
365 if (sk->sk_state != TCP_CLOSE) { 381 if (sk->sk_state != TCP_CLOSE) {
366 local_bh_disable(); 382 local_bh_disable();
367 __inet_hash(sk); 383 __inet_hash(sk);
368 local_bh_enable(); 384 local_bh_enable();
369 } 385 }
370 } 386 }
371 EXPORT_SYMBOL_GPL(inet_hash); 387 EXPORT_SYMBOL_GPL(inet_hash);
372 388
373 void inet_unhash(struct sock *sk) 389 void inet_unhash(struct sock *sk)
374 { 390 {
375 rwlock_t *lock; 391 rwlock_t *lock;
376 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 392 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
377 393
378 if (sk_unhashed(sk)) 394 if (sk_unhashed(sk))
379 goto out; 395 goto out;
380 396
381 if (sk->sk_state == TCP_LISTEN) { 397 if (sk->sk_state == TCP_LISTEN) {
382 local_bh_disable(); 398 local_bh_disable();
383 inet_listen_wlock(hashinfo); 399 inet_listen_wlock(hashinfo);
384 lock = &hashinfo->lhash_lock; 400 lock = &hashinfo->lhash_lock;
385 } else { 401 } else {
386 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 402 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
387 write_lock_bh(lock); 403 write_lock_bh(lock);
388 } 404 }
389 405
390 if (__sk_del_node_init(sk)) 406 if (__sk_del_node_init(sk))
391 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 407 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
392 write_unlock_bh(lock); 408 write_unlock_bh(lock);
393 out: 409 out:
394 if (sk->sk_state == TCP_LISTEN) 410 if (sk->sk_state == TCP_LISTEN)
395 wake_up(&hashinfo->lhash_wait); 411 wake_up(&hashinfo->lhash_wait);
396 } 412 }
397 EXPORT_SYMBOL_GPL(inet_unhash); 413 EXPORT_SYMBOL_GPL(inet_unhash);
398 414
399 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 415 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
400 struct sock *sk, u32 port_offset, 416 struct sock *sk, u32 port_offset,
401 int (*check_established)(struct inet_timewait_death_row *, 417 int (*check_established)(struct inet_timewait_death_row *,
402 struct sock *, __u16, struct inet_timewait_sock **), 418 struct sock *, __u16, struct inet_timewait_sock **),
403 void (*hash)(struct sock *sk)) 419 void (*hash)(struct sock *sk))
404 { 420 {
405 struct inet_hashinfo *hinfo = death_row->hashinfo; 421 struct inet_hashinfo *hinfo = death_row->hashinfo;
406 const unsigned short snum = inet_sk(sk)->num; 422 const unsigned short snum = inet_sk(sk)->num;
407 struct inet_bind_hashbucket *head; 423 struct inet_bind_hashbucket *head;
408 struct inet_bind_bucket *tb; 424 struct inet_bind_bucket *tb;
409 int ret; 425 int ret;
410 struct net *net = sock_net(sk); 426 struct net *net = sock_net(sk);
411 427
412 if (!snum) { 428 if (!snum) {
413 int i, remaining, low, high, port; 429 int i, remaining, low, high, port;
414 static u32 hint; 430 static u32 hint;
415 u32 offset = hint + port_offset; 431 u32 offset = hint + port_offset;
416 struct hlist_node *node; 432 struct hlist_node *node;
417 struct inet_timewait_sock *tw = NULL; 433 struct inet_timewait_sock *tw = NULL;
418 434
419 inet_get_local_port_range(&low, &high); 435 inet_get_local_port_range(&low, &high);
420 remaining = (high - low) + 1; 436 remaining = (high - low) + 1;
421 437
422 local_bh_disable(); 438 local_bh_disable();
423 for (i = 1; i <= remaining; i++) { 439 for (i = 1; i <= remaining; i++) {
424 port = low + (i + offset) % remaining; 440 port = low + (i + offset) % remaining;
425 head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; 441 head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
426 spin_lock(&head->lock); 442 spin_lock(&head->lock);
427 443
428 /* Does not bother with rcv_saddr checks, 444 /* Does not bother with rcv_saddr checks,
429 * because the established check is already 445 * because the established check is already
430 * unique enough. 446 * unique enough.
431 */ 447 */
432 inet_bind_bucket_for_each(tb, node, &head->chain) { 448 inet_bind_bucket_for_each(tb, node, &head->chain) {
433 if (tb->ib_net == net && tb->port == port) { 449 if (tb->ib_net == net && tb->port == port) {
434 BUG_TRAP(!hlist_empty(&tb->owners)); 450 BUG_TRAP(!hlist_empty(&tb->owners));
435 if (tb->fastreuse >= 0) 451 if (tb->fastreuse >= 0)
436 goto next_port; 452 goto next_port;
437 if (!check_established(death_row, sk, 453 if (!check_established(death_row, sk,
438 port, &tw)) 454 port, &tw))
439 goto ok; 455 goto ok;
440 goto next_port; 456 goto next_port;
441 } 457 }
442 } 458 }
443 459
444 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 460 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
445 net, head, port); 461 net, head, port);
446 if (!tb) { 462 if (!tb) {
447 spin_unlock(&head->lock); 463 spin_unlock(&head->lock);
448 break; 464 break;
449 } 465 }
450 tb->fastreuse = -1; 466 tb->fastreuse = -1;
451 goto ok; 467 goto ok;
452 468
453 next_port: 469 next_port:
454 spin_unlock(&head->lock); 470 spin_unlock(&head->lock);
455 } 471 }
456 local_bh_enable(); 472 local_bh_enable();
457 473
458 return -EADDRNOTAVAIL; 474 return -EADDRNOTAVAIL;
459 475
460 ok: 476 ok:
461 hint += i; 477 hint += i;
462 478
463 /* Head lock still held and bh's disabled */ 479 /* Head lock still held and bh's disabled */
464 inet_bind_hash(sk, tb, port); 480 inet_bind_hash(sk, tb, port);
465 if (sk_unhashed(sk)) { 481 if (sk_unhashed(sk)) {
466 inet_sk(sk)->sport = htons(port); 482 inet_sk(sk)->sport = htons(port);
467 hash(sk); 483 hash(sk);
468 } 484 }
469 spin_unlock(&head->lock); 485 spin_unlock(&head->lock);
470 486
471 if (tw) { 487 if (tw) {
472 inet_twsk_deschedule(tw, death_row); 488 inet_twsk_deschedule(tw, death_row);
473 inet_twsk_put(tw); 489 inet_twsk_put(tw);
474 } 490 }
475 491
476 ret = 0; 492 ret = 0;
477 goto out; 493 goto out;
478 } 494 }
479 495
480 head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; 496 head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
481 tb = inet_csk(sk)->icsk_bind_hash; 497 tb = inet_csk(sk)->icsk_bind_hash;
482 spin_lock_bh(&head->lock); 498 spin_lock_bh(&head->lock);
483 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 499 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
484 hash(sk); 500 hash(sk);
485 spin_unlock_bh(&head->lock); 501 spin_unlock_bh(&head->lock);
486 return 0; 502 return 0;
487 } else { 503 } else {
488 spin_unlock(&head->lock); 504 spin_unlock(&head->lock);
489 /* No definite answer... Walk to established hash table */ 505 /* No definite answer... Walk to established hash table */
490 ret = check_established(death_row, sk, snum, NULL); 506 ret = check_established(death_row, sk, snum, NULL);
491 out: 507 out:
492 local_bh_enable(); 508 local_bh_enable();
493 return ret; 509 return ret;
494 } 510 }
495 } 511 }
496 512
497 /* 513 /*
498 * Bind a port for a connect operation and hash it. 514 * Bind a port for a connect operation and hash it.
499 */ 515 */
500 int inet_hash_connect(struct inet_timewait_death_row *death_row, 516 int inet_hash_connect(struct inet_timewait_death_row *death_row,
501 struct sock *sk) 517 struct sock *sk)
502 { 518 {
503 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), 519 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
504 __inet_check_established, __inet_hash_nolisten); 520 __inet_check_established, __inet_hash_nolisten);
505 } 521 }
506 522
507 EXPORT_SYMBOL_GPL(inet_hash_connect); 523 EXPORT_SYMBOL_GPL(inet_hash_connect);
508 524
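Note the locking contract that moves along with the function: the header comment "Caller must disable local BH processing." still applies, because the now out-of-line __inet_inherit_port() takes the bind-hash bucket lock with plain spin_lock(), not the _bh variant. A minimal sketch of a conforming caller (assumption: illustrative only, not an excerpt from a real call site; in practice the child-socket setup paths already run with BHs disabled):

/* child is a freshly created socket that must share sk's local port */
local_bh_disable();
__inet_inherit_port(sk, child);  /* child joins sk's bind bucket owners */
local_bh_enable();

The EXPORT_SYMBOL_GPL() added alongside the definition is what keeps a modular DCCP build linking against the symbol now that it is no longer expanded inline.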