Commit 5a5f3a8db9d70c90e9d55b46e02b2d8deb1c2c2e

Authored by Jianjun Kong
Committed by David S. Miller
1 parent d9319100c1

net: clean up net/ipv4/ipip.c raw.c tcp.c tcp_minisocks.c tcp_yeah.c xfrm4_policy.c

Signed-off-by: Jianjun Kong <jianjun@zeuux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 6 changed files with 12 additions and 12 deletions
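The changes visible in this section are whitespace-only coding-style fixes: a space is added before the '*' in pointer casts. As an illustrative sketch only (a standalone program, not taken from the patch; the struct below is a stand-in, not the kernel's <linux/ip.h> definition), the convention being applied looks like this:

	/* Illustrative only: pointer-cast spacing in kernel coding style. */
	#include <stdio.h>

	struct iphdr { unsigned char version_ihl; };	/* stand-in type, not the kernel struct */

	int main(void)
	{
		unsigned char data[1] = { 0x45 };

		/* old form removed by the cleanup:  (struct iphdr*)data  */
		/* new form applied by the cleanup:  (struct iphdr *)data */
		const struct iphdr *iph = (const struct iphdr *)data;

		printf("version/ihl byte: 0x%02x\n", iph->version_ihl);
		return 0;
	}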

--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -281,7 +281,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	   8 bytes of packet payload. It means, that precise relaying of
 	   ICMP in the real Internet is absolutely infeasible.
 	 */
-	struct iphdr *iph = (struct iphdr*)skb->data;
+	struct iphdr *iph = (struct iphdr *)skb->data;
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
 	struct ip_tunnel *t;
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * RAW - implementation of IP "raw" sockets. 6 * RAW - implementation of IP "raw" sockets.
7 * 7 *
8 * Authors: Ross Biro 8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * 10 *
11 * Fixes: 11 * Fixes:
12 * Alan Cox : verify_area() fixed up 12 * Alan Cox : verify_area() fixed up
13 * Alan Cox : ICMP error handling 13 * Alan Cox : ICMP error handling
14 * Alan Cox : EMSGSIZE if you send too big a packet 14 * Alan Cox : EMSGSIZE if you send too big a packet
15 * Alan Cox : Now uses generic datagrams and shared 15 * Alan Cox : Now uses generic datagrams and shared
16 * skbuff library. No more peek crashes, 16 * skbuff library. No more peek crashes,
17 * no more backlogs 17 * no more backlogs
18 * Alan Cox : Checks sk->broadcast. 18 * Alan Cox : Checks sk->broadcast.
19 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram 19 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
20 * Alan Cox : Raw passes ip options too 20 * Alan Cox : Raw passes ip options too
21 * Alan Cox : Setsocketopt added 21 * Alan Cox : Setsocketopt added
22 * Alan Cox : Fixed error return for broadcasts 22 * Alan Cox : Fixed error return for broadcasts
23 * Alan Cox : Removed wake_up calls 23 * Alan Cox : Removed wake_up calls
24 * Alan Cox : Use ttl/tos 24 * Alan Cox : Use ttl/tos
25 * Alan Cox : Cleaned up old debugging 25 * Alan Cox : Cleaned up old debugging
26 * Alan Cox : Use new kernel side addresses 26 * Alan Cox : Use new kernel side addresses
27 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. 27 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
28 * Alan Cox : BSD style RAW socket demultiplexing. 28 * Alan Cox : BSD style RAW socket demultiplexing.
29 * Alan Cox : Beginnings of mrouted support. 29 * Alan Cox : Beginnings of mrouted support.
30 * Alan Cox : Added IP_HDRINCL option. 30 * Alan Cox : Added IP_HDRINCL option.
31 * Alan Cox : Skip broadcast check if BSDism set. 31 * Alan Cox : Skip broadcast check if BSDism set.
32 * David S. Miller : New socket lookup architecture. 32 * David S. Miller : New socket lookup architecture.
33 * 33 *
34 * This program is free software; you can redistribute it and/or 34 * This program is free software; you can redistribute it and/or
35 * modify it under the terms of the GNU General Public License 35 * modify it under the terms of the GNU General Public License
36 * as published by the Free Software Foundation; either version 36 * as published by the Free Software Foundation; either version
37 * 2 of the License, or (at your option) any later version. 37 * 2 of the License, or (at your option) any later version.
38 */ 38 */
39 39
40 #include <linux/types.h> 40 #include <linux/types.h>
41 #include <asm/atomic.h> 41 #include <asm/atomic.h>
42 #include <asm/byteorder.h> 42 #include <asm/byteorder.h>
43 #include <asm/current.h> 43 #include <asm/current.h>
44 #include <asm/uaccess.h> 44 #include <asm/uaccess.h>
45 #include <asm/ioctls.h> 45 #include <asm/ioctls.h>
46 #include <linux/stddef.h> 46 #include <linux/stddef.h>
47 #include <linux/slab.h> 47 #include <linux/slab.h>
48 #include <linux/errno.h> 48 #include <linux/errno.h>
49 #include <linux/aio.h> 49 #include <linux/aio.h>
50 #include <linux/kernel.h> 50 #include <linux/kernel.h>
51 #include <linux/spinlock.h> 51 #include <linux/spinlock.h>
52 #include <linux/sockios.h> 52 #include <linux/sockios.h>
53 #include <linux/socket.h> 53 #include <linux/socket.h>
54 #include <linux/in.h> 54 #include <linux/in.h>
55 #include <linux/mroute.h> 55 #include <linux/mroute.h>
56 #include <linux/netdevice.h> 56 #include <linux/netdevice.h>
57 #include <linux/in_route.h> 57 #include <linux/in_route.h>
58 #include <linux/route.h> 58 #include <linux/route.h>
59 #include <linux/skbuff.h> 59 #include <linux/skbuff.h>
60 #include <net/net_namespace.h> 60 #include <net/net_namespace.h>
61 #include <net/dst.h> 61 #include <net/dst.h>
62 #include <net/sock.h> 62 #include <net/sock.h>
63 #include <linux/gfp.h> 63 #include <linux/gfp.h>
64 #include <linux/ip.h> 64 #include <linux/ip.h>
65 #include <linux/net.h> 65 #include <linux/net.h>
66 #include <net/ip.h> 66 #include <net/ip.h>
67 #include <net/icmp.h> 67 #include <net/icmp.h>
68 #include <net/udp.h> 68 #include <net/udp.h>
69 #include <net/raw.h> 69 #include <net/raw.h>
70 #include <net/snmp.h> 70 #include <net/snmp.h>
71 #include <net/tcp_states.h> 71 #include <net/tcp_states.h>
72 #include <net/inet_common.h> 72 #include <net/inet_common.h>
73 #include <net/checksum.h> 73 #include <net/checksum.h>
74 #include <net/xfrm.h> 74 #include <net/xfrm.h>
75 #include <linux/rtnetlink.h> 75 #include <linux/rtnetlink.h>
76 #include <linux/proc_fs.h> 76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h> 77 #include <linux/seq_file.h>
78 #include <linux/netfilter.h> 78 #include <linux/netfilter.h>
79 #include <linux/netfilter_ipv4.h> 79 #include <linux/netfilter_ipv4.h>
80 80
81 static struct raw_hashinfo raw_v4_hashinfo = { 81 static struct raw_hashinfo raw_v4_hashinfo = {
82 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), 82 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
83 }; 83 };
84 84
85 void raw_hash_sk(struct sock *sk) 85 void raw_hash_sk(struct sock *sk)
86 { 86 {
87 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; 87 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
88 struct hlist_head *head; 88 struct hlist_head *head;
89 89
90 head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; 90 head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];
91 91
92 write_lock_bh(&h->lock); 92 write_lock_bh(&h->lock);
93 sk_add_node(sk, head); 93 sk_add_node(sk, head);
94 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 94 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
95 write_unlock_bh(&h->lock); 95 write_unlock_bh(&h->lock);
96 } 96 }
97 EXPORT_SYMBOL_GPL(raw_hash_sk); 97 EXPORT_SYMBOL_GPL(raw_hash_sk);
98 98
99 void raw_unhash_sk(struct sock *sk) 99 void raw_unhash_sk(struct sock *sk)
100 { 100 {
101 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; 101 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
102 102
103 write_lock_bh(&h->lock); 103 write_lock_bh(&h->lock);
104 if (sk_del_node_init(sk)) 104 if (sk_del_node_init(sk))
105 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 105 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
106 write_unlock_bh(&h->lock); 106 write_unlock_bh(&h->lock);
107 } 107 }
108 EXPORT_SYMBOL_GPL(raw_unhash_sk); 108 EXPORT_SYMBOL_GPL(raw_unhash_sk);
109 109
110 static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, 110 static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
111 unsigned short num, __be32 raddr, __be32 laddr, int dif) 111 unsigned short num, __be32 raddr, __be32 laddr, int dif)
112 { 112 {
113 struct hlist_node *node; 113 struct hlist_node *node;
114 114
115 sk_for_each_from(sk, node) { 115 sk_for_each_from(sk, node) {
116 struct inet_sock *inet = inet_sk(sk); 116 struct inet_sock *inet = inet_sk(sk);
117 117
118 if (net_eq(sock_net(sk), net) && inet->num == num && 118 if (net_eq(sock_net(sk), net) && inet->num == num &&
119 !(inet->daddr && inet->daddr != raddr) && 119 !(inet->daddr && inet->daddr != raddr) &&
120 !(inet->rcv_saddr && inet->rcv_saddr != laddr) && 120 !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
121 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) 121 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
122 goto found; /* gotcha */ 122 goto found; /* gotcha */
123 } 123 }
124 sk = NULL; 124 sk = NULL;
125 found: 125 found:
126 return sk; 126 return sk;
127 } 127 }
128 128
129 /* 129 /*
130 * 0 - deliver 130 * 0 - deliver
131 * 1 - block 131 * 1 - block
132 */ 132 */
133 static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) 133 static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
134 { 134 {
135 int type; 135 int type;
136 136
137 if (!pskb_may_pull(skb, sizeof(struct icmphdr))) 137 if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
138 return 1; 138 return 1;
139 139
140 type = icmp_hdr(skb)->type; 140 type = icmp_hdr(skb)->type;
141 if (type < 32) { 141 if (type < 32) {
142 __u32 data = raw_sk(sk)->filter.data; 142 __u32 data = raw_sk(sk)->filter.data;
143 143
144 return ((1 << type) & data) != 0; 144 return ((1 << type) & data) != 0;
145 } 145 }
146 146
147 /* Do not block unknown ICMP types */ 147 /* Do not block unknown ICMP types */
148 return 0; 148 return 0;
149 } 149 }
150 150
151 /* IP input processing comes here for RAW socket delivery. 151 /* IP input processing comes here for RAW socket delivery.
152 * Caller owns SKB, so we must make clones. 152 * Caller owns SKB, so we must make clones.
153 * 153 *
154 * RFC 1122: SHOULD pass TOS value up to the transport layer. 154 * RFC 1122: SHOULD pass TOS value up to the transport layer.
155 * -> It does. And not only TOS, but all IP header. 155 * -> It does. And not only TOS, but all IP header.
156 */ 156 */
157 static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) 157 static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
158 { 158 {
159 struct sock *sk; 159 struct sock *sk;
160 struct hlist_head *head; 160 struct hlist_head *head;
161 int delivered = 0; 161 int delivered = 0;
162 struct net *net; 162 struct net *net;
163 163
164 read_lock(&raw_v4_hashinfo.lock); 164 read_lock(&raw_v4_hashinfo.lock);
165 head = &raw_v4_hashinfo.ht[hash]; 165 head = &raw_v4_hashinfo.ht[hash];
166 if (hlist_empty(head)) 166 if (hlist_empty(head))
167 goto out; 167 goto out;
168 168
169 net = dev_net(skb->dev); 169 net = dev_net(skb->dev);
170 sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, 170 sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
171 iph->saddr, iph->daddr, 171 iph->saddr, iph->daddr,
172 skb->dev->ifindex); 172 skb->dev->ifindex);
173 173
174 while (sk) { 174 while (sk) {
175 delivered = 1; 175 delivered = 1;
176 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { 176 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
177 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); 177 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
178 178
179 /* Not releasing hash table! */ 179 /* Not releasing hash table! */
180 if (clone) 180 if (clone)
181 raw_rcv(sk, clone); 181 raw_rcv(sk, clone);
182 } 182 }
183 sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, 183 sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
184 iph->saddr, iph->daddr, 184 iph->saddr, iph->daddr,
185 skb->dev->ifindex); 185 skb->dev->ifindex);
186 } 186 }
187 out: 187 out:
188 read_unlock(&raw_v4_hashinfo.lock); 188 read_unlock(&raw_v4_hashinfo.lock);
189 return delivered; 189 return delivered;
190 } 190 }
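Because raw_v4_input() walks every matching socket and clones the skb for each one, several raw sockets opened for the same protocol all receive their own copy of an incoming datagram. A minimal illustration (hypothetical helper, CAP_NET_RAW assumed):

	#include <netinet/in.h>
	#include <sys/socket.h>

	int open_two_icmp_listeners(int fds[2])
	{
		fds[0] = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
		fds[1] = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
		if (fds[0] < 0 || fds[1] < 0)
			return -1;

		/* Each incoming ICMP datagram is cloned and queued on both sockets. */
		return 0;
	}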
191 191
192 int raw_local_deliver(struct sk_buff *skb, int protocol) 192 int raw_local_deliver(struct sk_buff *skb, int protocol)
193 { 193 {
194 int hash; 194 int hash;
195 struct sock *raw_sk; 195 struct sock *raw_sk;
196 196
197 hash = protocol & (RAW_HTABLE_SIZE - 1); 197 hash = protocol & (RAW_HTABLE_SIZE - 1);
198 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); 198 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
199 199
200 	/* If there may be a raw socket we must check - if not we 200 	/* If there may be a raw socket we must check - if not we
201 	 * couldn't care less 201 	 * couldn't care less
202 */ 202 */
203 if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) 203 if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
204 raw_sk = NULL; 204 raw_sk = NULL;
205 205
206 return raw_sk != NULL; 206 return raw_sk != NULL;
207 207
208 } 208 }
209 209
210 static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) 210 static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
211 { 211 {
212 struct inet_sock *inet = inet_sk(sk); 212 struct inet_sock *inet = inet_sk(sk);
213 const int type = icmp_hdr(skb)->type; 213 const int type = icmp_hdr(skb)->type;
214 const int code = icmp_hdr(skb)->code; 214 const int code = icmp_hdr(skb)->code;
215 int err = 0; 215 int err = 0;
216 int harderr = 0; 216 int harderr = 0;
217 217
218 /* Report error on raw socket, if: 218 /* Report error on raw socket, if:
219 1. User requested ip_recverr. 219 1. User requested ip_recverr.
220 2. Socket is connected (otherwise the error indication 220 2. Socket is connected (otherwise the error indication
221 	   is useless without ip_recverr and error is hard). 221 	   is useless without ip_recverr and error is hard).
222 */ 222 */
223 if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) 223 if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
224 return; 224 return;
225 225
226 switch (type) { 226 switch (type) {
227 default: 227 default:
228 case ICMP_TIME_EXCEEDED: 228 case ICMP_TIME_EXCEEDED:
229 err = EHOSTUNREACH; 229 err = EHOSTUNREACH;
230 break; 230 break;
231 case ICMP_SOURCE_QUENCH: 231 case ICMP_SOURCE_QUENCH:
232 return; 232 return;
233 case ICMP_PARAMETERPROB: 233 case ICMP_PARAMETERPROB:
234 err = EPROTO; 234 err = EPROTO;
235 harderr = 1; 235 harderr = 1;
236 break; 236 break;
237 case ICMP_DEST_UNREACH: 237 case ICMP_DEST_UNREACH:
238 err = EHOSTUNREACH; 238 err = EHOSTUNREACH;
239 if (code > NR_ICMP_UNREACH) 239 if (code > NR_ICMP_UNREACH)
240 break; 240 break;
241 err = icmp_err_convert[code].errno; 241 err = icmp_err_convert[code].errno;
242 harderr = icmp_err_convert[code].fatal; 242 harderr = icmp_err_convert[code].fatal;
243 if (code == ICMP_FRAG_NEEDED) { 243 if (code == ICMP_FRAG_NEEDED) {
244 harderr = inet->pmtudisc != IP_PMTUDISC_DONT; 244 harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
245 err = EMSGSIZE; 245 err = EMSGSIZE;
246 } 246 }
247 } 247 }
248 248
249 if (inet->recverr) { 249 if (inet->recverr) {
250 struct iphdr *iph = (struct iphdr*)skb->data; 250 struct iphdr *iph = (struct iphdr *)skb->data;
251 u8 *payload = skb->data + (iph->ihl << 2); 251 u8 *payload = skb->data + (iph->ihl << 2);
252 252
253 if (inet->hdrincl) 253 if (inet->hdrincl)
254 payload = skb->data; 254 payload = skb->data;
255 ip_icmp_error(sk, skb, err, 0, info, payload); 255 ip_icmp_error(sk, skb, err, 0, info, payload);
256 } 256 }
257 257
258 if (inet->recverr || harderr) { 258 if (inet->recverr || harderr) {
259 sk->sk_err = err; 259 sk->sk_err = err;
260 sk->sk_error_report(sk); 260 sk->sk_error_report(sk);
261 } 261 }
262 } 262 }
263 263
264 void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) 264 void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
265 { 265 {
266 int hash; 266 int hash;
267 struct sock *raw_sk; 267 struct sock *raw_sk;
268 struct iphdr *iph; 268 struct iphdr *iph;
269 struct net *net; 269 struct net *net;
270 270
271 hash = protocol & (RAW_HTABLE_SIZE - 1); 271 hash = protocol & (RAW_HTABLE_SIZE - 1);
272 272
273 read_lock(&raw_v4_hashinfo.lock); 273 read_lock(&raw_v4_hashinfo.lock);
274 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); 274 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
275 if (raw_sk != NULL) { 275 if (raw_sk != NULL) {
276 iph = (struct iphdr *)skb->data; 276 iph = (struct iphdr *)skb->data;
277 net = dev_net(skb->dev); 277 net = dev_net(skb->dev);
278 278
279 while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, 279 while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
280 iph->daddr, iph->saddr, 280 iph->daddr, iph->saddr,
281 skb->dev->ifindex)) != NULL) { 281 skb->dev->ifindex)) != NULL) {
282 raw_err(raw_sk, skb, info); 282 raw_err(raw_sk, skb, info);
283 raw_sk = sk_next(raw_sk); 283 raw_sk = sk_next(raw_sk);
284 iph = (struct iphdr *)skb->data; 284 iph = (struct iphdr *)skb->data;
285 } 285 }
286 } 286 }
287 read_unlock(&raw_v4_hashinfo.lock); 287 read_unlock(&raw_v4_hashinfo.lock);
288 } 288 }
289 289
290 static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) 290 static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
291 { 291 {
292 /* Charge it to the socket. */ 292 /* Charge it to the socket. */
293 293
294 if (sock_queue_rcv_skb(sk, skb) < 0) { 294 if (sock_queue_rcv_skb(sk, skb) < 0) {
295 atomic_inc(&sk->sk_drops); 295 atomic_inc(&sk->sk_drops);
296 kfree_skb(skb); 296 kfree_skb(skb);
297 return NET_RX_DROP; 297 return NET_RX_DROP;
298 } 298 }
299 299
300 return NET_RX_SUCCESS; 300 return NET_RX_SUCCESS;
301 } 301 }
302 302
303 int raw_rcv(struct sock *sk, struct sk_buff *skb) 303 int raw_rcv(struct sock *sk, struct sk_buff *skb)
304 { 304 {
305 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 305 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
306 atomic_inc(&sk->sk_drops); 306 atomic_inc(&sk->sk_drops);
307 kfree_skb(skb); 307 kfree_skb(skb);
308 return NET_RX_DROP; 308 return NET_RX_DROP;
309 } 309 }
310 nf_reset(skb); 310 nf_reset(skb);
311 311
312 skb_push(skb, skb->data - skb_network_header(skb)); 312 skb_push(skb, skb->data - skb_network_header(skb));
313 313
314 raw_rcv_skb(sk, skb); 314 raw_rcv_skb(sk, skb);
315 return 0; 315 return 0;
316 } 316 }
317 317
318 static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, 318 static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
319 struct rtable *rt, 319 struct rtable *rt,
320 unsigned int flags) 320 unsigned int flags)
321 { 321 {
322 struct inet_sock *inet = inet_sk(sk); 322 struct inet_sock *inet = inet_sk(sk);
323 struct net *net = sock_net(sk); 323 struct net *net = sock_net(sk);
324 struct iphdr *iph; 324 struct iphdr *iph;
325 struct sk_buff *skb; 325 struct sk_buff *skb;
326 unsigned int iphlen; 326 unsigned int iphlen;
327 int err; 327 int err;
328 328
329 if (length > rt->u.dst.dev->mtu) { 329 if (length > rt->u.dst.dev->mtu) {
330 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, 330 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
331 rt->u.dst.dev->mtu); 331 rt->u.dst.dev->mtu);
332 return -EMSGSIZE; 332 return -EMSGSIZE;
333 } 333 }
334 if (flags&MSG_PROBE) 334 if (flags&MSG_PROBE)
335 goto out; 335 goto out;
336 336
337 skb = sock_alloc_send_skb(sk, 337 skb = sock_alloc_send_skb(sk,
338 length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, 338 length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15,
339 flags & MSG_DONTWAIT, &err); 339 flags & MSG_DONTWAIT, &err);
340 if (skb == NULL) 340 if (skb == NULL)
341 goto error; 341 goto error;
342 skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); 342 skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev));
343 343
344 skb->priority = sk->sk_priority; 344 skb->priority = sk->sk_priority;
345 skb->mark = sk->sk_mark; 345 skb->mark = sk->sk_mark;
346 skb->dst = dst_clone(&rt->u.dst); 346 skb->dst = dst_clone(&rt->u.dst);
347 347
348 skb_reset_network_header(skb); 348 skb_reset_network_header(skb);
349 iph = ip_hdr(skb); 349 iph = ip_hdr(skb);
350 skb_put(skb, length); 350 skb_put(skb, length);
351 351
352 skb->ip_summed = CHECKSUM_NONE; 352 skb->ip_summed = CHECKSUM_NONE;
353 353
354 skb->transport_header = skb->network_header; 354 skb->transport_header = skb->network_header;
355 err = memcpy_fromiovecend((void *)iph, from, 0, length); 355 err = memcpy_fromiovecend((void *)iph, from, 0, length);
356 if (err) 356 if (err)
357 goto error_fault; 357 goto error_fault;
358 358
359 /* We don't modify invalid header */ 359 /* We don't modify invalid header */
360 iphlen = iph->ihl * 4; 360 iphlen = iph->ihl * 4;
361 if (iphlen >= sizeof(*iph) && iphlen <= length) { 361 if (iphlen >= sizeof(*iph) && iphlen <= length) {
362 if (!iph->saddr) 362 if (!iph->saddr)
363 iph->saddr = rt->rt_src; 363 iph->saddr = rt->rt_src;
364 iph->check = 0; 364 iph->check = 0;
365 iph->tot_len = htons(length); 365 iph->tot_len = htons(length);
366 if (!iph->id) 366 if (!iph->id)
367 ip_select_ident(iph, &rt->u.dst, NULL); 367 ip_select_ident(iph, &rt->u.dst, NULL);
368 368
369 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 369 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
370 } 370 }
371 if (iph->protocol == IPPROTO_ICMP) 371 if (iph->protocol == IPPROTO_ICMP)
372 icmp_out_count(net, ((struct icmphdr *) 372 icmp_out_count(net, ((struct icmphdr *)
373 skb_transport_header(skb))->type); 373 skb_transport_header(skb))->type);
374 374
375 err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, 375 err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
376 dst_output); 376 dst_output);
377 if (err > 0) 377 if (err > 0)
378 err = inet->recverr ? net_xmit_errno(err) : 0; 378 err = inet->recverr ? net_xmit_errno(err) : 0;
379 if (err) 379 if (err)
380 goto error; 380 goto error;
381 out: 381 out:
382 return 0; 382 return 0;
383 383
384 error_fault: 384 error_fault:
385 err = -EFAULT; 385 err = -EFAULT;
386 kfree_skb(skb); 386 kfree_skb(skb);
387 error: 387 error:
388 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 388 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
389 return err; 389 return err;
390 } 390 }
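raw_send_hdrinc() is the IP_HDRINCL transmit path: the caller supplies the IP header and the kernel only fills in tot_len, the header checksum and, when left at zero, saddr and id. A hedged userspace sketch of relying on that behaviour follows; the helper and its parameters are made up for illustration, fd is assumed to be a SOCK_RAW socket and CAP_NET_RAW is required:

	#include <string.h>
	#include <netinet/in.h>
	#include <netinet/ip.h>
	#include <sys/socket.h>

	int send_hdrincl_probe(int fd, const struct sockaddr_in *dst)
	{
		unsigned char pkt[sizeof(struct iphdr)];
		struct iphdr *iph = (struct iphdr *)pkt;
		int on = 1;

		if (setsockopt(fd, IPPROTO_IP, IP_HDRINCL, &on, sizeof(on)) < 0)
			return -1;

		memset(pkt, 0, sizeof(pkt));
		iph->version  = 4;
		iph->ihl      = 5;
		iph->ttl      = 64;
		iph->protocol = IPPROTO_RAW;
		iph->daddr    = dst->sin_addr.s_addr;
		/* saddr, id, tot_len and check stay zero: the kernel fills them in. */

		return sendto(fd, pkt, sizeof(pkt), 0,
			      (const struct sockaddr *)dst, sizeof(*dst));
	}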
391 391
392 static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) 392 static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
393 { 393 {
394 struct iovec *iov; 394 struct iovec *iov;
395 u8 __user *type = NULL; 395 u8 __user *type = NULL;
396 u8 __user *code = NULL; 396 u8 __user *code = NULL;
397 int probed = 0; 397 int probed = 0;
398 unsigned int i; 398 unsigned int i;
399 399
400 if (!msg->msg_iov) 400 if (!msg->msg_iov)
401 return 0; 401 return 0;
402 402
403 for (i = 0; i < msg->msg_iovlen; i++) { 403 for (i = 0; i < msg->msg_iovlen; i++) {
404 iov = &msg->msg_iov[i]; 404 iov = &msg->msg_iov[i];
405 if (!iov) 405 if (!iov)
406 continue; 406 continue;
407 407
408 switch (fl->proto) { 408 switch (fl->proto) {
409 case IPPROTO_ICMP: 409 case IPPROTO_ICMP:
410 /* check if one-byte field is readable or not. */ 410 /* check if one-byte field is readable or not. */
411 if (iov->iov_base && iov->iov_len < 1) 411 if (iov->iov_base && iov->iov_len < 1)
412 break; 412 break;
413 413
414 if (!type) { 414 if (!type) {
415 type = iov->iov_base; 415 type = iov->iov_base;
416 /* check if code field is readable or not. */ 416 /* check if code field is readable or not. */
417 if (iov->iov_len > 1) 417 if (iov->iov_len > 1)
418 code = type + 1; 418 code = type + 1;
419 } else if (!code) 419 } else if (!code)
420 code = iov->iov_base; 420 code = iov->iov_base;
421 421
422 if (type && code) { 422 if (type && code) {
423 if (get_user(fl->fl_icmp_type, type) || 423 if (get_user(fl->fl_icmp_type, type) ||
424 get_user(fl->fl_icmp_code, code)) 424 get_user(fl->fl_icmp_code, code))
425 return -EFAULT; 425 return -EFAULT;
426 probed = 1; 426 probed = 1;
427 } 427 }
428 break; 428 break;
429 default: 429 default:
430 probed = 1; 430 probed = 1;
431 break; 431 break;
432 } 432 }
433 if (probed) 433 if (probed)
434 break; 434 break;
435 } 435 }
436 return 0; 436 return 0;
437 } 437 }
438 438
439 static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 439 static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
440 size_t len) 440 size_t len)
441 { 441 {
442 struct inet_sock *inet = inet_sk(sk); 442 struct inet_sock *inet = inet_sk(sk);
443 struct ipcm_cookie ipc; 443 struct ipcm_cookie ipc;
444 struct rtable *rt = NULL; 444 struct rtable *rt = NULL;
445 int free = 0; 445 int free = 0;
446 __be32 daddr; 446 __be32 daddr;
447 __be32 saddr; 447 __be32 saddr;
448 u8 tos; 448 u8 tos;
449 int err; 449 int err;
450 450
451 err = -EMSGSIZE; 451 err = -EMSGSIZE;
452 if (len > 0xFFFF) 452 if (len > 0xFFFF)
453 goto out; 453 goto out;
454 454
455 /* 455 /*
456 * Check the flags. 456 * Check the flags.
457 */ 457 */
458 458
459 err = -EOPNOTSUPP; 459 err = -EOPNOTSUPP;
460 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ 460 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */
461 goto out; /* compatibility */ 461 goto out; /* compatibility */
462 462
463 /* 463 /*
464 * Get and verify the address. 464 * Get and verify the address.
465 */ 465 */
466 466
467 if (msg->msg_namelen) { 467 if (msg->msg_namelen) {
468 struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; 468 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
469 err = -EINVAL; 469 err = -EINVAL;
470 if (msg->msg_namelen < sizeof(*usin)) 470 if (msg->msg_namelen < sizeof(*usin))
471 goto out; 471 goto out;
472 if (usin->sin_family != AF_INET) { 472 if (usin->sin_family != AF_INET) {
473 static int complained; 473 static int complained;
474 if (!complained++) 474 if (!complained++)
475 printk(KERN_INFO "%s forgot to set AF_INET in " 475 printk(KERN_INFO "%s forgot to set AF_INET in "
476 "raw sendmsg. Fix it!\n", 476 "raw sendmsg. Fix it!\n",
477 current->comm); 477 current->comm);
478 err = -EAFNOSUPPORT; 478 err = -EAFNOSUPPORT;
479 if (usin->sin_family) 479 if (usin->sin_family)
480 goto out; 480 goto out;
481 } 481 }
482 daddr = usin->sin_addr.s_addr; 482 daddr = usin->sin_addr.s_addr;
483 /* ANK: I did not forget to get protocol from port field. 483 /* ANK: I did not forget to get protocol from port field.
484 		 * I just do not know who uses this weirdness. 484 		 * I just do not know who uses this weirdness.
485 * IP_HDRINCL is much more convenient. 485 * IP_HDRINCL is much more convenient.
486 */ 486 */
487 } else { 487 } else {
488 err = -EDESTADDRREQ; 488 err = -EDESTADDRREQ;
489 if (sk->sk_state != TCP_ESTABLISHED) 489 if (sk->sk_state != TCP_ESTABLISHED)
490 goto out; 490 goto out;
491 daddr = inet->daddr; 491 daddr = inet->daddr;
492 } 492 }
493 493
494 ipc.addr = inet->saddr; 494 ipc.addr = inet->saddr;
495 ipc.opt = NULL; 495 ipc.opt = NULL;
496 ipc.oif = sk->sk_bound_dev_if; 496 ipc.oif = sk->sk_bound_dev_if;
497 497
498 if (msg->msg_controllen) { 498 if (msg->msg_controllen) {
499 err = ip_cmsg_send(sock_net(sk), msg, &ipc); 499 err = ip_cmsg_send(sock_net(sk), msg, &ipc);
500 if (err) 500 if (err)
501 goto out; 501 goto out;
502 if (ipc.opt) 502 if (ipc.opt)
503 free = 1; 503 free = 1;
504 } 504 }
505 505
506 saddr = ipc.addr; 506 saddr = ipc.addr;
507 ipc.addr = daddr; 507 ipc.addr = daddr;
508 508
509 if (!ipc.opt) 509 if (!ipc.opt)
510 ipc.opt = inet->opt; 510 ipc.opt = inet->opt;
511 511
512 if (ipc.opt) { 512 if (ipc.opt) {
513 err = -EINVAL; 513 err = -EINVAL;
514 /* Linux does not mangle headers on raw sockets, 514 /* Linux does not mangle headers on raw sockets,
515 		 * so that IP options + IP_HDRINCL is nonsense. 515 		 * so that IP options + IP_HDRINCL is nonsense.
516 */ 516 */
517 if (inet->hdrincl) 517 if (inet->hdrincl)
518 goto done; 518 goto done;
519 if (ipc.opt->srr) { 519 if (ipc.opt->srr) {
520 if (!daddr) 520 if (!daddr)
521 goto done; 521 goto done;
522 daddr = ipc.opt->faddr; 522 daddr = ipc.opt->faddr;
523 } 523 }
524 } 524 }
525 tos = RT_CONN_FLAGS(sk); 525 tos = RT_CONN_FLAGS(sk);
526 if (msg->msg_flags & MSG_DONTROUTE) 526 if (msg->msg_flags & MSG_DONTROUTE)
527 tos |= RTO_ONLINK; 527 tos |= RTO_ONLINK;
528 528
529 if (ipv4_is_multicast(daddr)) { 529 if (ipv4_is_multicast(daddr)) {
530 if (!ipc.oif) 530 if (!ipc.oif)
531 ipc.oif = inet->mc_index; 531 ipc.oif = inet->mc_index;
532 if (!saddr) 532 if (!saddr)
533 saddr = inet->mc_addr; 533 saddr = inet->mc_addr;
534 } 534 }
535 535
536 { 536 {
537 struct flowi fl = { .oif = ipc.oif, 537 struct flowi fl = { .oif = ipc.oif,
538 .mark = sk->sk_mark, 538 .mark = sk->sk_mark,
539 .nl_u = { .ip4_u = 539 .nl_u = { .ip4_u =
540 { .daddr = daddr, 540 { .daddr = daddr,
541 .saddr = saddr, 541 .saddr = saddr,
542 .tos = tos } }, 542 .tos = tos } },
543 .proto = inet->hdrincl ? IPPROTO_RAW : 543 .proto = inet->hdrincl ? IPPROTO_RAW :
544 sk->sk_protocol, 544 sk->sk_protocol,
545 }; 545 };
546 if (!inet->hdrincl) { 546 if (!inet->hdrincl) {
547 err = raw_probe_proto_opt(&fl, msg); 547 err = raw_probe_proto_opt(&fl, msg);
548 if (err) 548 if (err)
549 goto done; 549 goto done;
550 } 550 }
551 551
552 security_sk_classify_flow(sk, &fl); 552 security_sk_classify_flow(sk, &fl);
553 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); 553 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1);
554 } 554 }
555 if (err) 555 if (err)
556 goto done; 556 goto done;
557 557
558 err = -EACCES; 558 err = -EACCES;
559 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) 559 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
560 goto done; 560 goto done;
561 561
562 if (msg->msg_flags & MSG_CONFIRM) 562 if (msg->msg_flags & MSG_CONFIRM)
563 goto do_confirm; 563 goto do_confirm;
564 back_from_confirm: 564 back_from_confirm:
565 565
566 if (inet->hdrincl) 566 if (inet->hdrincl)
567 err = raw_send_hdrinc(sk, msg->msg_iov, len, 567 err = raw_send_hdrinc(sk, msg->msg_iov, len,
568 rt, msg->msg_flags); 568 rt, msg->msg_flags);
569 569
570 else { 570 else {
571 if (!ipc.addr) 571 if (!ipc.addr)
572 ipc.addr = rt->rt_dst; 572 ipc.addr = rt->rt_dst;
573 lock_sock(sk); 573 lock_sock(sk);
574 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, 574 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
575 &ipc, rt, msg->msg_flags); 575 &ipc, rt, msg->msg_flags);
576 if (err) 576 if (err)
577 ip_flush_pending_frames(sk); 577 ip_flush_pending_frames(sk);
578 else if (!(msg->msg_flags & MSG_MORE)) 578 else if (!(msg->msg_flags & MSG_MORE))
579 err = ip_push_pending_frames(sk); 579 err = ip_push_pending_frames(sk);
580 release_sock(sk); 580 release_sock(sk);
581 } 581 }
582 done: 582 done:
583 if (free) 583 if (free)
584 kfree(ipc.opt); 584 kfree(ipc.opt);
585 ip_rt_put(rt); 585 ip_rt_put(rt);
586 586
587 out: 587 out:
588 if (err < 0) 588 if (err < 0)
589 return err; 589 return err;
590 return len; 590 return len;
591 591
592 do_confirm: 592 do_confirm:
593 dst_confirm(&rt->u.dst); 593 dst_confirm(&rt->u.dst);
594 if (!(msg->msg_flags & MSG_PROBE) || len) 594 if (!(msg->msg_flags & MSG_PROBE) || len)
595 goto back_from_confirm; 595 goto back_from_confirm;
596 err = 0; 596 err = 0;
597 goto done; 597 goto done;
598 } 598 }
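Without IP_HDRINCL, raw_sendmsg() builds the IP header itself and raw_probe_proto_opt() peeks at the first one or two payload bytes to pick up the ICMP type/code for the flow. The raw socket does not checksum the ICMP message for the caller, so userspace has to. An illustrative sketch, with hypothetical names and fd assumed to be socket(AF_INET, SOCK_RAW, IPPROTO_ICMP):

	#include <string.h>
	#include <unistd.h>
	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <netinet/ip_icmp.h>
	#include <sys/socket.h>

	static unsigned short csum16(const void *buf, int len)
	{
		const unsigned short *p = buf;
		unsigned long sum = 0;

		while (len > 1) {
			sum += *p++;
			len -= 2;
		}
		if (len)
			sum += *(const unsigned char *)p;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += sum >> 16;
		return (unsigned short)~sum;
	}

	int send_echo_request(int fd, const struct sockaddr_in *dst)
	{
		struct icmphdr icmp;

		memset(&icmp, 0, sizeof(icmp));
		icmp.type = ICMP_ECHO;		/* first byte, probed for fl_icmp_type */
		icmp.code = 0;			/* second byte, probed for fl_icmp_code */
		icmp.un.echo.id = htons(getpid() & 0xffff);
		icmp.un.echo.sequence = htons(1);
		icmp.checksum = csum16(&icmp, sizeof(icmp));

		return sendto(fd, &icmp, sizeof(icmp), 0,
			      (const struct sockaddr *)dst, sizeof(*dst));
	}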
599 599
600 static void raw_close(struct sock *sk, long timeout) 600 static void raw_close(struct sock *sk, long timeout)
601 { 601 {
602 /* 602 /*
603 	 * Raw sockets may have direct kernel references. Kill them. 603 	 * Raw sockets may have direct kernel references. Kill them.
604 */ 604 */
605 ip_ra_control(sk, 0, NULL); 605 ip_ra_control(sk, 0, NULL);
606 606
607 sk_common_release(sk); 607 sk_common_release(sk);
608 } 608 }
609 609
610 static void raw_destroy(struct sock *sk) 610 static void raw_destroy(struct sock *sk)
611 { 611 {
612 lock_sock(sk); 612 lock_sock(sk);
613 ip_flush_pending_frames(sk); 613 ip_flush_pending_frames(sk);
614 release_sock(sk); 614 release_sock(sk);
615 } 615 }
616 616
617 /* This gets rid of all the nasties in af_inet. -DaveM */ 617 /* This gets rid of all the nasties in af_inet. -DaveM */
618 static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) 618 static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
619 { 619 {
620 struct inet_sock *inet = inet_sk(sk); 620 struct inet_sock *inet = inet_sk(sk);
621 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; 621 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
622 int ret = -EINVAL; 622 int ret = -EINVAL;
623 int chk_addr_ret; 623 int chk_addr_ret;
624 624
625 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) 625 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
626 goto out; 626 goto out;
627 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); 627 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
628 ret = -EADDRNOTAVAIL; 628 ret = -EADDRNOTAVAIL;
629 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && 629 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
630 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) 630 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
631 goto out; 631 goto out;
632 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; 632 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
633 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) 633 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
634 inet->saddr = 0; /* Use device */ 634 inet->saddr = 0; /* Use device */
635 sk_dst_reset(sk); 635 sk_dst_reset(sk);
636 ret = 0; 636 ret = 0;
637 out: return ret; 637 out: return ret;
638 } 638 }
639 639
640 /* 640 /*
641  *	This should be easy: if there is something there 641  *	This should be easy: if there is something there
642  *	we return it; otherwise we block. 642  *	we return it; otherwise we block.
643 */ 643 */
644 644
645 static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 645 static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
646 size_t len, int noblock, int flags, int *addr_len) 646 size_t len, int noblock, int flags, int *addr_len)
647 { 647 {
648 struct inet_sock *inet = inet_sk(sk); 648 struct inet_sock *inet = inet_sk(sk);
649 size_t copied = 0; 649 size_t copied = 0;
650 int err = -EOPNOTSUPP; 650 int err = -EOPNOTSUPP;
651 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 651 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
652 struct sk_buff *skb; 652 struct sk_buff *skb;
653 653
654 if (flags & MSG_OOB) 654 if (flags & MSG_OOB)
655 goto out; 655 goto out;
656 656
657 if (addr_len) 657 if (addr_len)
658 *addr_len = sizeof(*sin); 658 *addr_len = sizeof(*sin);
659 659
660 if (flags & MSG_ERRQUEUE) { 660 if (flags & MSG_ERRQUEUE) {
661 err = ip_recv_error(sk, msg, len); 661 err = ip_recv_error(sk, msg, len);
662 goto out; 662 goto out;
663 } 663 }
664 664
665 skb = skb_recv_datagram(sk, flags, noblock, &err); 665 skb = skb_recv_datagram(sk, flags, noblock, &err);
666 if (!skb) 666 if (!skb)
667 goto out; 667 goto out;
668 668
669 copied = skb->len; 669 copied = skb->len;
670 if (len < copied) { 670 if (len < copied) {
671 msg->msg_flags |= MSG_TRUNC; 671 msg->msg_flags |= MSG_TRUNC;
672 copied = len; 672 copied = len;
673 } 673 }
674 674
675 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 675 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
676 if (err) 676 if (err)
677 goto done; 677 goto done;
678 678
679 sock_recv_timestamp(msg, sk, skb); 679 sock_recv_timestamp(msg, sk, skb);
680 680
681 /* Copy the address. */ 681 /* Copy the address. */
682 if (sin) { 682 if (sin) {
683 sin->sin_family = AF_INET; 683 sin->sin_family = AF_INET;
684 sin->sin_addr.s_addr = ip_hdr(skb)->saddr; 684 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
685 sin->sin_port = 0; 685 sin->sin_port = 0;
686 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); 686 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
687 } 687 }
688 if (inet->cmsg_flags) 688 if (inet->cmsg_flags)
689 ip_cmsg_recv(msg, skb); 689 ip_cmsg_recv(msg, skb);
690 if (flags & MSG_TRUNC) 690 if (flags & MSG_TRUNC)
691 copied = skb->len; 691 copied = skb->len;
692 done: 692 done:
693 skb_free_datagram(sk, skb); 693 skb_free_datagram(sk, skb);
694 out: 694 out:
695 if (err) 695 if (err)
696 return err; 696 return err;
697 return copied; 697 return copied;
698 } 698 }
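As the comment above raw_v4_input() notes, raw IPv4 sockets hand the whole IP header up to the reader, so userspace must skip ihl * 4 bytes to reach the payload. A small sketch with a hypothetical helper:

	#include <sys/types.h>
	#include <netinet/in.h>
	#include <netinet/ip.h>
	#include <sys/socket.h>

	int recv_raw_payload(int fd, unsigned char *buf, size_t len,
			     const unsigned char **payload)
	{
		struct sockaddr_in from;
		socklen_t fromlen = sizeof(from);
		const struct iphdr *iph = (const struct iphdr *)buf;
		ssize_t n = recvfrom(fd, buf, len, 0,
				     (struct sockaddr *)&from, &fromlen);

		if (n < (ssize_t)sizeof(*iph))
			return -1;

		/* The kernel kept the IP header in front of the data; skip it. */
		*payload = buf + iph->ihl * 4;
		return (int)(n - iph->ihl * 4);
	}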
699 699
700 static int raw_init(struct sock *sk) 700 static int raw_init(struct sock *sk)
701 { 701 {
702 struct raw_sock *rp = raw_sk(sk); 702 struct raw_sock *rp = raw_sk(sk);
703 703
704 if (inet_sk(sk)->num == IPPROTO_ICMP) 704 if (inet_sk(sk)->num == IPPROTO_ICMP)
705 memset(&rp->filter, 0, sizeof(rp->filter)); 705 memset(&rp->filter, 0, sizeof(rp->filter));
706 return 0; 706 return 0;
707 } 707 }
708 708
709 static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) 709 static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
710 { 710 {
711 if (optlen > sizeof(struct icmp_filter)) 711 if (optlen > sizeof(struct icmp_filter))
712 optlen = sizeof(struct icmp_filter); 712 optlen = sizeof(struct icmp_filter);
713 if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) 713 if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
714 return -EFAULT; 714 return -EFAULT;
715 return 0; 715 return 0;
716 } 716 }
717 717
718 static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) 718 static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
719 { 719 {
720 int len, ret = -EFAULT; 720 int len, ret = -EFAULT;
721 721
722 if (get_user(len, optlen)) 722 if (get_user(len, optlen))
723 goto out; 723 goto out;
724 ret = -EINVAL; 724 ret = -EINVAL;
725 if (len < 0) 725 if (len < 0)
726 goto out; 726 goto out;
727 if (len > sizeof(struct icmp_filter)) 727 if (len > sizeof(struct icmp_filter))
728 len = sizeof(struct icmp_filter); 728 len = sizeof(struct icmp_filter);
729 ret = -EFAULT; 729 ret = -EFAULT;
730 if (put_user(len, optlen) || 730 if (put_user(len, optlen) ||
731 copy_to_user(optval, &raw_sk(sk)->filter, len)) 731 copy_to_user(optval, &raw_sk(sk)->filter, len))
732 goto out; 732 goto out;
733 ret = 0; 733 ret = 0;
734 out: return ret; 734 out: return ret;
735 } 735 }
736 736
737 static int do_raw_setsockopt(struct sock *sk, int level, int optname, 737 static int do_raw_setsockopt(struct sock *sk, int level, int optname,
738 char __user *optval, int optlen) 738 char __user *optval, int optlen)
739 { 739 {
740 if (optname == ICMP_FILTER) { 740 if (optname == ICMP_FILTER) {
741 if (inet_sk(sk)->num != IPPROTO_ICMP) 741 if (inet_sk(sk)->num != IPPROTO_ICMP)
742 return -EOPNOTSUPP; 742 return -EOPNOTSUPP;
743 else 743 else
744 return raw_seticmpfilter(sk, optval, optlen); 744 return raw_seticmpfilter(sk, optval, optlen);
745 } 745 }
746 return -ENOPROTOOPT; 746 return -ENOPROTOOPT;
747 } 747 }
748 748
749 static int raw_setsockopt(struct sock *sk, int level, int optname, 749 static int raw_setsockopt(struct sock *sk, int level, int optname,
750 char __user *optval, int optlen) 750 char __user *optval, int optlen)
751 { 751 {
752 if (level != SOL_RAW) 752 if (level != SOL_RAW)
753 return ip_setsockopt(sk, level, optname, optval, optlen); 753 return ip_setsockopt(sk, level, optname, optval, optlen);
754 return do_raw_setsockopt(sk, level, optname, optval, optlen); 754 return do_raw_setsockopt(sk, level, optname, optval, optlen);
755 } 755 }
756 756
757 #ifdef CONFIG_COMPAT 757 #ifdef CONFIG_COMPAT
758 static int compat_raw_setsockopt(struct sock *sk, int level, int optname, 758 static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
759 char __user *optval, int optlen) 759 char __user *optval, int optlen)
760 { 760 {
761 if (level != SOL_RAW) 761 if (level != SOL_RAW)
762 return compat_ip_setsockopt(sk, level, optname, optval, optlen); 762 return compat_ip_setsockopt(sk, level, optname, optval, optlen);
763 return do_raw_setsockopt(sk, level, optname, optval, optlen); 763 return do_raw_setsockopt(sk, level, optname, optval, optlen);
764 } 764 }
765 #endif 765 #endif
766 766
767 static int do_raw_getsockopt(struct sock *sk, int level, int optname, 767 static int do_raw_getsockopt(struct sock *sk, int level, int optname,
768 char __user *optval, int __user *optlen) 768 char __user *optval, int __user *optlen)
769 { 769 {
770 if (optname == ICMP_FILTER) { 770 if (optname == ICMP_FILTER) {
771 if (inet_sk(sk)->num != IPPROTO_ICMP) 771 if (inet_sk(sk)->num != IPPROTO_ICMP)
772 return -EOPNOTSUPP; 772 return -EOPNOTSUPP;
773 else 773 else
774 return raw_geticmpfilter(sk, optval, optlen); 774 return raw_geticmpfilter(sk, optval, optlen);
775 } 775 }
776 return -ENOPROTOOPT; 776 return -ENOPROTOOPT;
777 } 777 }
778 778
779 static int raw_getsockopt(struct sock *sk, int level, int optname, 779 static int raw_getsockopt(struct sock *sk, int level, int optname,
780 char __user *optval, int __user *optlen) 780 char __user *optval, int __user *optlen)
781 { 781 {
782 if (level != SOL_RAW) 782 if (level != SOL_RAW)
783 return ip_getsockopt(sk, level, optname, optval, optlen); 783 return ip_getsockopt(sk, level, optname, optval, optlen);
784 return do_raw_getsockopt(sk, level, optname, optval, optlen); 784 return do_raw_getsockopt(sk, level, optname, optval, optlen);
785 } 785 }
786 786
787 #ifdef CONFIG_COMPAT 787 #ifdef CONFIG_COMPAT
788 static int compat_raw_getsockopt(struct sock *sk, int level, int optname, 788 static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
789 char __user *optval, int __user *optlen) 789 char __user *optval, int __user *optlen)
790 { 790 {
791 if (level != SOL_RAW) 791 if (level != SOL_RAW)
792 return compat_ip_getsockopt(sk, level, optname, optval, optlen); 792 return compat_ip_getsockopt(sk, level, optname, optval, optlen);
793 return do_raw_getsockopt(sk, level, optname, optval, optlen); 793 return do_raw_getsockopt(sk, level, optname, optval, optlen);
794 } 794 }
795 #endif 795 #endif
796 796
797 static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) 797 static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
798 { 798 {
799 switch (cmd) { 799 switch (cmd) {
800 case SIOCOUTQ: { 800 case SIOCOUTQ: {
801 int amount = atomic_read(&sk->sk_wmem_alloc); 801 int amount = atomic_read(&sk->sk_wmem_alloc);
802 return put_user(amount, (int __user *)arg); 802 return put_user(amount, (int __user *)arg);
803 } 803 }
804 case SIOCINQ: { 804 case SIOCINQ: {
805 struct sk_buff *skb; 805 struct sk_buff *skb;
806 int amount = 0; 806 int amount = 0;
807 807
808 spin_lock_bh(&sk->sk_receive_queue.lock); 808 spin_lock_bh(&sk->sk_receive_queue.lock);
809 skb = skb_peek(&sk->sk_receive_queue); 809 skb = skb_peek(&sk->sk_receive_queue);
810 if (skb != NULL) 810 if (skb != NULL)
811 amount = skb->len; 811 amount = skb->len;
812 spin_unlock_bh(&sk->sk_receive_queue.lock); 812 spin_unlock_bh(&sk->sk_receive_queue.lock);
813 return put_user(amount, (int __user *)arg); 813 return put_user(amount, (int __user *)arg);
814 } 814 }
815 815
816 default: 816 default:
817 #ifdef CONFIG_IP_MROUTE 817 #ifdef CONFIG_IP_MROUTE
818 return ipmr_ioctl(sk, cmd, (void __user *)arg); 818 return ipmr_ioctl(sk, cmd, (void __user *)arg);
819 #else 819 #else
820 return -ENOIOCTLCMD; 820 return -ENOIOCTLCMD;
821 #endif 821 #endif
822 } 822 }
823 } 823 }
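In raw_ioctl() above, SIOCINQ reports the length of the first datagram waiting in the receive queue and SIOCOUTQ the bytes still allocated to unsent skbs. A minimal sketch of querying the receive side from userspace (hypothetical helper):

	#include <sys/ioctl.h>
	#include <linux/sockios.h>

	int pending_bytes(int fd)
	{
		int inq = 0;

		/* Length of the first datagram waiting in the receive queue. */
		if (ioctl(fd, SIOCINQ, &inq) < 0)
			return -1;
		return inq;
	}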
824 824
825 struct proto raw_prot = { 825 struct proto raw_prot = {
826 .name = "RAW", 826 .name = "RAW",
827 .owner = THIS_MODULE, 827 .owner = THIS_MODULE,
828 .close = raw_close, 828 .close = raw_close,
829 .destroy = raw_destroy, 829 .destroy = raw_destroy,
830 .connect = ip4_datagram_connect, 830 .connect = ip4_datagram_connect,
831 .disconnect = udp_disconnect, 831 .disconnect = udp_disconnect,
832 .ioctl = raw_ioctl, 832 .ioctl = raw_ioctl,
833 .init = raw_init, 833 .init = raw_init,
834 .setsockopt = raw_setsockopt, 834 .setsockopt = raw_setsockopt,
835 .getsockopt = raw_getsockopt, 835 .getsockopt = raw_getsockopt,
836 .sendmsg = raw_sendmsg, 836 .sendmsg = raw_sendmsg,
837 .recvmsg = raw_recvmsg, 837 .recvmsg = raw_recvmsg,
838 .bind = raw_bind, 838 .bind = raw_bind,
839 .backlog_rcv = raw_rcv_skb, 839 .backlog_rcv = raw_rcv_skb,
840 .hash = raw_hash_sk, 840 .hash = raw_hash_sk,
841 .unhash = raw_unhash_sk, 841 .unhash = raw_unhash_sk,
842 .obj_size = sizeof(struct raw_sock), 842 .obj_size = sizeof(struct raw_sock),
843 .h.raw_hash = &raw_v4_hashinfo, 843 .h.raw_hash = &raw_v4_hashinfo,
844 #ifdef CONFIG_COMPAT 844 #ifdef CONFIG_COMPAT
845 .compat_setsockopt = compat_raw_setsockopt, 845 .compat_setsockopt = compat_raw_setsockopt,
846 .compat_getsockopt = compat_raw_getsockopt, 846 .compat_getsockopt = compat_raw_getsockopt,
847 #endif 847 #endif
848 }; 848 };
849 849
850 #ifdef CONFIG_PROC_FS 850 #ifdef CONFIG_PROC_FS
851 static struct sock *raw_get_first(struct seq_file *seq) 851 static struct sock *raw_get_first(struct seq_file *seq)
852 { 852 {
853 struct sock *sk; 853 struct sock *sk;
854 struct raw_iter_state* state = raw_seq_private(seq); 854 struct raw_iter_state *state = raw_seq_private(seq);
855 855
856 for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; 856 for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
857 ++state->bucket) { 857 ++state->bucket) {
858 struct hlist_node *node; 858 struct hlist_node *node;
859 859
860 sk_for_each(sk, node, &state->h->ht[state->bucket]) 860 sk_for_each(sk, node, &state->h->ht[state->bucket])
861 if (sock_net(sk) == seq_file_net(seq)) 861 if (sock_net(sk) == seq_file_net(seq))
862 goto found; 862 goto found;
863 } 863 }
864 sk = NULL; 864 sk = NULL;
865 found: 865 found:
866 return sk; 866 return sk;
867 } 867 }
868 868
869 static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) 869 static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
870 { 870 {
871 struct raw_iter_state* state = raw_seq_private(seq); 871 struct raw_iter_state *state = raw_seq_private(seq);
872 872
873 do { 873 do {
874 sk = sk_next(sk); 874 sk = sk_next(sk);
875 try_again: 875 try_again:
876 ; 876 ;
877 } while (sk && sock_net(sk) != seq_file_net(seq)); 877 } while (sk && sock_net(sk) != seq_file_net(seq));
878 878
879 if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { 879 if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
880 sk = sk_head(&state->h->ht[state->bucket]); 880 sk = sk_head(&state->h->ht[state->bucket]);
881 goto try_again; 881 goto try_again;
882 } 882 }
883 return sk; 883 return sk;
884 } 884 }
885 885
886 static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) 886 static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
887 { 887 {
888 struct sock *sk = raw_get_first(seq); 888 struct sock *sk = raw_get_first(seq);
889 889
890 if (sk) 890 if (sk)
891 while (pos && (sk = raw_get_next(seq, sk)) != NULL) 891 while (pos && (sk = raw_get_next(seq, sk)) != NULL)
892 --pos; 892 --pos;
893 return pos ? NULL : sk; 893 return pos ? NULL : sk;
894 } 894 }
895 895
896 void *raw_seq_start(struct seq_file *seq, loff_t *pos) 896 void *raw_seq_start(struct seq_file *seq, loff_t *pos)
897 { 897 {
898 struct raw_iter_state *state = raw_seq_private(seq); 898 struct raw_iter_state *state = raw_seq_private(seq);
899 899
900 read_lock(&state->h->lock); 900 read_lock(&state->h->lock);
901 return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 901 return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
902 } 902 }
903 EXPORT_SYMBOL_GPL(raw_seq_start); 903 EXPORT_SYMBOL_GPL(raw_seq_start);
904 904
905 void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) 905 void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
906 { 906 {
907 struct sock *sk; 907 struct sock *sk;
908 908
909 if (v == SEQ_START_TOKEN) 909 if (v == SEQ_START_TOKEN)
910 sk = raw_get_first(seq); 910 sk = raw_get_first(seq);
911 else 911 else
912 sk = raw_get_next(seq, v); 912 sk = raw_get_next(seq, v);
913 ++*pos; 913 ++*pos;
914 return sk; 914 return sk;
915 } 915 }
916 EXPORT_SYMBOL_GPL(raw_seq_next); 916 EXPORT_SYMBOL_GPL(raw_seq_next);
917 917
918 void raw_seq_stop(struct seq_file *seq, void *v) 918 void raw_seq_stop(struct seq_file *seq, void *v)
919 { 919 {
920 struct raw_iter_state *state = raw_seq_private(seq); 920 struct raw_iter_state *state = raw_seq_private(seq);
921 921
922 read_unlock(&state->h->lock); 922 read_unlock(&state->h->lock);
923 } 923 }
924 EXPORT_SYMBOL_GPL(raw_seq_stop); 924 EXPORT_SYMBOL_GPL(raw_seq_stop);
925 925
926 static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 926 static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
927 { 927 {
928 struct inet_sock *inet = inet_sk(sp); 928 struct inet_sock *inet = inet_sk(sp);
929 __be32 dest = inet->daddr, 929 __be32 dest = inet->daddr,
930 src = inet->rcv_saddr; 930 src = inet->rcv_saddr;
931 __u16 destp = 0, 931 __u16 destp = 0,
932 srcp = inet->num; 932 srcp = inet->num;
933 933
934 seq_printf(seq, "%4d: %08X:%04X %08X:%04X" 934 seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
935 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", 935 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
936 i, src, srcp, dest, destp, sp->sk_state, 936 i, src, srcp, dest, destp, sp->sk_state,
937 atomic_read(&sp->sk_wmem_alloc), 937 atomic_read(&sp->sk_wmem_alloc),
938 atomic_read(&sp->sk_rmem_alloc), 938 atomic_read(&sp->sk_rmem_alloc),
939 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), 939 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
940 atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); 940 atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
941 } 941 }
942 942
943 static int raw_seq_show(struct seq_file *seq, void *v) 943 static int raw_seq_show(struct seq_file *seq, void *v)
944 { 944 {
945 if (v == SEQ_START_TOKEN) 945 if (v == SEQ_START_TOKEN)
946 seq_printf(seq, " sl local_address rem_address st tx_queue " 946 seq_printf(seq, " sl local_address rem_address st tx_queue "
947 "rx_queue tr tm->when retrnsmt uid timeout " 947 "rx_queue tr tm->when retrnsmt uid timeout "
948 "inode ref pointer drops\n"); 948 "inode ref pointer drops\n");
949 else 949 else
950 raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); 950 raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);
951 return 0; 951 return 0;
952 } 952 }
953 953
954 static const struct seq_operations raw_seq_ops = { 954 static const struct seq_operations raw_seq_ops = {
955 .start = raw_seq_start, 955 .start = raw_seq_start,
956 .next = raw_seq_next, 956 .next = raw_seq_next,
957 .stop = raw_seq_stop, 957 .stop = raw_seq_stop,
958 .show = raw_seq_show, 958 .show = raw_seq_show,
959 }; 959 };
960 960
961 int raw_seq_open(struct inode *ino, struct file *file, 961 int raw_seq_open(struct inode *ino, struct file *file,
962 struct raw_hashinfo *h, const struct seq_operations *ops) 962 struct raw_hashinfo *h, const struct seq_operations *ops)
963 { 963 {
964 int err; 964 int err;
965 struct raw_iter_state *i; 965 struct raw_iter_state *i;
966 966
967 err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state)); 967 err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state));
968 if (err < 0) 968 if (err < 0)
969 return err; 969 return err;
970 970
971 i = raw_seq_private((struct seq_file *)file->private_data); 971 i = raw_seq_private((struct seq_file *)file->private_data);
972 i->h = h; 972 i->h = h;
973 return 0; 973 return 0;
974 } 974 }
975 EXPORT_SYMBOL_GPL(raw_seq_open); 975 EXPORT_SYMBOL_GPL(raw_seq_open);
976 976
977 static int raw_v4_seq_open(struct inode *inode, struct file *file) 977 static int raw_v4_seq_open(struct inode *inode, struct file *file)
978 { 978 {
979 return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops); 979 return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops);
980 } 980 }
981 981
982 static const struct file_operations raw_seq_fops = { 982 static const struct file_operations raw_seq_fops = {
983 .owner = THIS_MODULE, 983 .owner = THIS_MODULE,
984 .open = raw_v4_seq_open, 984 .open = raw_v4_seq_open,
985 .read = seq_read, 985 .read = seq_read,
986 .llseek = seq_lseek, 986 .llseek = seq_lseek,
987 .release = seq_release_net, 987 .release = seq_release_net,
988 }; 988 };
989 989
990 static __net_init int raw_init_net(struct net *net) 990 static __net_init int raw_init_net(struct net *net)
991 { 991 {
992 if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) 992 if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops))
993 return -ENOMEM; 993 return -ENOMEM;
994 994
995 return 0; 995 return 0;
996 } 996 }
997 997
998 static __net_exit void raw_exit_net(struct net *net) 998 static __net_exit void raw_exit_net(struct net *net)
999 { 999 {
1000 proc_net_remove(net, "raw"); 1000 proc_net_remove(net, "raw");
1001 } 1001 }
1002 1002
1003 static __net_initdata struct pernet_operations raw_net_ops = { 1003 static __net_initdata struct pernet_operations raw_net_ops = {
1004 .init = raw_init_net, 1004 .init = raw_init_net,
1005 .exit = raw_exit_net, 1005 .exit = raw_exit_net,
1006 }; 1006 };
1007 1007
1008 int __init raw_proc_init(void) 1008 int __init raw_proc_init(void)
1009 { 1009 {
1010 return register_pernet_subsys(&raw_net_ops); 1010 return register_pernet_subsys(&raw_net_ops);
1011 } 1011 }
1012 1012
1013 void __init raw_proc_exit(void) 1013 void __init raw_proc_exit(void)
1014 { 1014 {
1015 unregister_pernet_subsys(&raw_net_ops); 1015 unregister_pernet_subsys(&raw_net_ops);
1016 } 1016 }
1017 #endif /* CONFIG_PROC_FS */ 1017 #endif /* CONFIG_PROC_FS */
1018 1018
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Implementation of the Transmission Control Protocol(TCP). 6 * Implementation of the Transmission Control Protocol(TCP).
7 * 7 *
8 * Authors: Ross Biro 8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Mark Evans, <evansmp@uhura.aston.ac.uk> 10 * Mark Evans, <evansmp@uhura.aston.ac.uk>
11 * Corey Minyard <wf-rch!minyard@relay.EU.net> 11 * Corey Minyard <wf-rch!minyard@relay.EU.net>
12 * Florian La Roche, <flla@stud.uni-sb.de> 12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 13 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14 * Linus Torvalds, <torvalds@cs.helsinki.fi> 14 * Linus Torvalds, <torvalds@cs.helsinki.fi>
15 * Alan Cox, <gw4pts@gw4pts.ampr.org> 15 * Alan Cox, <gw4pts@gw4pts.ampr.org>
16 * Matthew Dillon, <dillon@apollo.west.oic.com> 16 * Matthew Dillon, <dillon@apollo.west.oic.com>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Jorge Cwik, <jorge@laser.satlink.net> 18 * Jorge Cwik, <jorge@laser.satlink.net>
19 * 19 *
20 * Fixes: 20 * Fixes:
21 * Alan Cox : Numerous verify_area() calls 21 * Alan Cox : Numerous verify_area() calls
22 * Alan Cox : Set the ACK bit on a reset 22 * Alan Cox : Set the ACK bit on a reset
23 * Alan Cox : Stopped it crashing if it closed while 23 * Alan Cox : Stopped it crashing if it closed while
24 * sk->inuse=1 and was trying to connect 24 * sk->inuse=1 and was trying to connect
25 * (tcp_err()). 25 * (tcp_err()).
26 * Alan Cox : All icmp error handling was broken 26 * Alan Cox : All icmp error handling was broken
27 * pointers passed where wrong and the 27 * pointers passed where wrong and the
28 * socket was looked up backwards. Nobody 28 * socket was looked up backwards. Nobody
29 * tested any icmp error code obviously. 29 * tested any icmp error code obviously.
30 * Alan Cox : tcp_err() now handled properly. It 30 * Alan Cox : tcp_err() now handled properly. It
31 * wakes people on errors. poll 31 * wakes people on errors. poll
32 * behaves and the icmp error race 32 * behaves and the icmp error race
33 * has gone by moving it into sock.c 33 * has gone by moving it into sock.c
34 * Alan Cox : tcp_send_reset() fixed to work for 34 * Alan Cox : tcp_send_reset() fixed to work for
35 * everything not just packets for 35 * everything not just packets for
36 * unknown sockets. 36 * unknown sockets.
37 * Alan Cox : tcp option processing. 37 * Alan Cox : tcp option processing.
38 * Alan Cox : Reset tweaked (still not 100%) [Had 38 * Alan Cox : Reset tweaked (still not 100%) [Had
39 * syn rule wrong] 39 * syn rule wrong]
40 * Herp Rosmanith : More reset fixes 40 * Herp Rosmanith : More reset fixes
41 * Alan Cox : No longer acks invalid rst frames. 41 * Alan Cox : No longer acks invalid rst frames.
42 * Acking any kind of RST is right out. 42 * Acking any kind of RST is right out.
43 * Alan Cox : Sets an ignore me flag on an rst 43 * Alan Cox : Sets an ignore me flag on an rst
44 * receive otherwise odd bits of prattle 44 * receive otherwise odd bits of prattle
45 * escape still 45 * escape still
46 * Alan Cox : Fixed another acking RST frame bug. 46 * Alan Cox : Fixed another acking RST frame bug.
47 * Should stop LAN workplace lockups. 47 * Should stop LAN workplace lockups.
48 * Alan Cox : Some tidyups using the new skb list 48 * Alan Cox : Some tidyups using the new skb list
49 * facilities 49 * facilities
50 * Alan Cox : sk->keepopen now seems to work 50 * Alan Cox : sk->keepopen now seems to work
51 * Alan Cox : Pulls options out correctly on accepts 51 * Alan Cox : Pulls options out correctly on accepts
52 * Alan Cox : Fixed assorted sk->rqueue->next errors 52 * Alan Cox : Fixed assorted sk->rqueue->next errors
53 * Alan Cox : PSH doesn't end a TCP read. Switched a 53 * Alan Cox : PSH doesn't end a TCP read. Switched a
54 * bit to skb ops. 54 * bit to skb ops.
55 * Alan Cox : Tidied tcp_data to avoid a potential 55 * Alan Cox : Tidied tcp_data to avoid a potential
56 * nasty. 56 * nasty.
57 * Alan Cox : Added some better commenting, as the 57 * Alan Cox : Added some better commenting, as the
58 * tcp is hard to follow 58 * tcp is hard to follow
59 * Alan Cox : Removed incorrect check for 20 * psh 59 * Alan Cox : Removed incorrect check for 20 * psh
60 * Michael O'Reilly : ack < copied bug fix. 60 * Michael O'Reilly : ack < copied bug fix.
61 * Johannes Stille : Misc tcp fixes (not all in yet). 61 * Johannes Stille : Misc tcp fixes (not all in yet).
62 * Alan Cox : FIN with no memory -> CRASH 62 * Alan Cox : FIN with no memory -> CRASH
63 * Alan Cox : Added socket option proto entries. 63 * Alan Cox : Added socket option proto entries.
64 * Also added awareness of them to accept. 64 * Also added awareness of them to accept.
65 * Alan Cox : Added TCP options (SOL_TCP) 65 * Alan Cox : Added TCP options (SOL_TCP)
66 * Alan Cox : Switched wakeup calls to callbacks, 66 * Alan Cox : Switched wakeup calls to callbacks,
67 * so the kernel can layer network 67 * so the kernel can layer network
68 * sockets. 68 * sockets.
69 * Alan Cox : Use ip_tos/ip_ttl settings. 69 * Alan Cox : Use ip_tos/ip_ttl settings.
70 * Alan Cox : Handle FIN (more) properly (we hope). 70 * Alan Cox : Handle FIN (more) properly (we hope).
71 * Alan Cox : RST frames sent on unsynchronised 71 * Alan Cox : RST frames sent on unsynchronised
72 * state ack error. 72 * state ack error.
73 * Alan Cox : Put in missing check for SYN bit. 73 * Alan Cox : Put in missing check for SYN bit.
74 * Alan Cox : Added tcp_select_window() aka NET2E 74 * Alan Cox : Added tcp_select_window() aka NET2E
75 * window non shrink trick. 75 * window non shrink trick.
76 * Alan Cox : Added a couple of small NET2E timer 76 * Alan Cox : Added a couple of small NET2E timer
77 * fixes 77 * fixes
78 * Charles Hedrick : TCP fixes 78 * Charles Hedrick : TCP fixes
79 * Toomas Tamm : TCP window fixes 79 * Toomas Tamm : TCP window fixes
80 * Alan Cox : Small URG fix to rlogin ^C ack fight 80 * Alan Cox : Small URG fix to rlogin ^C ack fight
81 * Charles Hedrick : Rewrote most of it to actually work 81 * Charles Hedrick : Rewrote most of it to actually work
82 * Linus : Rewrote tcp_read() and URG handling 82 * Linus : Rewrote tcp_read() and URG handling
83 * completely 83 * completely
84 * Gerhard Koerting: Fixed some missing timer handling 84 * Gerhard Koerting: Fixed some missing timer handling
85 * Matthew Dillon : Reworked TCP machine states as per RFC 85 * Matthew Dillon : Reworked TCP machine states as per RFC
86 * Gerhard Koerting: PC/TCP workarounds 86 * Gerhard Koerting: PC/TCP workarounds
87 * Adam Caldwell : Assorted timer/timing errors 87 * Adam Caldwell : Assorted timer/timing errors
88 * Matthew Dillon : Fixed another RST bug 88 * Matthew Dillon : Fixed another RST bug
89 * Alan Cox : Move to kernel side addressing changes. 89 * Alan Cox : Move to kernel side addressing changes.
90 * Alan Cox : Beginning work on TCP fastpathing 90 * Alan Cox : Beginning work on TCP fastpathing
91 * (not yet usable) 91 * (not yet usable)
92 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 92 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
93 * Alan Cox : TCP fast path debugging 93 * Alan Cox : TCP fast path debugging
94 * Alan Cox : Window clamping 94 * Alan Cox : Window clamping
95 * Michael Riepe : Bug in tcp_check() 95 * Michael Riepe : Bug in tcp_check()
96 * Matt Dillon : More TCP improvements and RST bug fixes 96 * Matt Dillon : More TCP improvements and RST bug fixes
97  *		Matt Dillon	:	Yet more small nasties removed from the 97  *		Matt Dillon	:	Yet more small nasties removed from the
98 * TCP code (Be very nice to this man if 98 * TCP code (Be very nice to this man if
99 * tcp finally works 100%) 8) 99 * tcp finally works 100%) 8)
100 * Alan Cox : BSD accept semantics. 100 * Alan Cox : BSD accept semantics.
101 * Alan Cox : Reset on closedown bug. 101 * Alan Cox : Reset on closedown bug.
102 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 102 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
103 * Michael Pall : Handle poll() after URG properly in 103 * Michael Pall : Handle poll() after URG properly in
104 * all cases. 104 * all cases.
105 * Michael Pall : Undo the last fix in tcp_read_urg() 105 * Michael Pall : Undo the last fix in tcp_read_urg()
106 * (multi URG PUSH broke rlogin). 106 * (multi URG PUSH broke rlogin).
107 * Michael Pall : Fix the multi URG PUSH problem in 107 * Michael Pall : Fix the multi URG PUSH problem in
108 * tcp_readable(), poll() after URG 108 * tcp_readable(), poll() after URG
109 * works now. 109 * works now.
110 * Michael Pall : recv(...,MSG_OOB) never blocks in the 110 * Michael Pall : recv(...,MSG_OOB) never blocks in the
111 * BSD api. 111 * BSD api.
112 * Alan Cox : Changed the semantics of sk->socket to 112 * Alan Cox : Changed the semantics of sk->socket to
113 * fix a race and a signal problem with 113 * fix a race and a signal problem with
114 * accept() and async I/O. 114 * accept() and async I/O.
115 * Alan Cox : Relaxed the rules on tcp_sendto(). 115 * Alan Cox : Relaxed the rules on tcp_sendto().
116 * Yury Shevchuk : Really fixed accept() blocking problem. 116 * Yury Shevchuk : Really fixed accept() blocking problem.
117 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 117 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
118 * clients/servers which listen in on 118 * clients/servers which listen in on
119 * fixed ports. 119 * fixed ports.
120 * Alan Cox : Cleaned the above up and shrank it to 120 * Alan Cox : Cleaned the above up and shrank it to
121 * a sensible code size. 121 * a sensible code size.
122 * Alan Cox : Self connect lockup fix. 122 * Alan Cox : Self connect lockup fix.
123 * Alan Cox : No connect to multicast. 123 * Alan Cox : No connect to multicast.
124 * Ross Biro : Close unaccepted children on master 124 * Ross Biro : Close unaccepted children on master
125 * socket close. 125 * socket close.
126 * Alan Cox : Reset tracing code. 126 * Alan Cox : Reset tracing code.
127 * Alan Cox : Spurious resets on shutdown. 127 * Alan Cox : Spurious resets on shutdown.
128 * Alan Cox : Giant 15 minute/60 second timer error 128 * Alan Cox : Giant 15 minute/60 second timer error
129 * Alan Cox : Small whoops in polling before an 129 * Alan Cox : Small whoops in polling before an
130 * accept. 130 * accept.
131 * Alan Cox : Kept the state trace facility since 131 * Alan Cox : Kept the state trace facility since
132 * it's handy for debugging. 132 * it's handy for debugging.
133 * Alan Cox : More reset handler fixes. 133 * Alan Cox : More reset handler fixes.
134 * Alan Cox : Started rewriting the code based on 134 * Alan Cox : Started rewriting the code based on
135 * the RFC's for other useful protocol 135 * the RFC's for other useful protocol
136 * references see: Comer, KA9Q NOS, and 136 * references see: Comer, KA9Q NOS, and
137 * for a reference on the difference 137 * for a reference on the difference
138 * between specifications and how BSD 138 * between specifications and how BSD
139 * works see the 4.4lite source. 139 * works see the 4.4lite source.
140 * A.N.Kuznetsov : Don't time wait on completion of tidy 140 * A.N.Kuznetsov : Don't time wait on completion of tidy
141 * close. 141 * close.
142 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 142 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
143 * Linus Torvalds : Fixed BSD port reuse to work first syn 143 * Linus Torvalds : Fixed BSD port reuse to work first syn
144 * Alan Cox : Reimplemented timers as per the RFC 144 * Alan Cox : Reimplemented timers as per the RFC
145 * and using multiple timers for sanity. 145 * and using multiple timers for sanity.
146 * Alan Cox : Small bug fixes, and a lot of new 146 * Alan Cox : Small bug fixes, and a lot of new
147 * comments. 147 * comments.
148 * Alan Cox : Fixed dual reader crash by locking 148 * Alan Cox : Fixed dual reader crash by locking
149 * the buffers (much like datagram.c) 149 * the buffers (much like datagram.c)
150 * Alan Cox : Fixed stuck sockets in probe. A probe 150 * Alan Cox : Fixed stuck sockets in probe. A probe
151 * now gets fed up of retrying without 151 * now gets fed up of retrying without
152 * (even a no space) answer. 152 * (even a no space) answer.
153 * Alan Cox : Extracted closing code better 153 * Alan Cox : Extracted closing code better
154 * Alan Cox : Fixed the closing state machine to 154 * Alan Cox : Fixed the closing state machine to
155 * resemble the RFC. 155 * resemble the RFC.
156 * Alan Cox : More 'per spec' fixes. 156 * Alan Cox : More 'per spec' fixes.
157 * Jorge Cwik : Even faster checksumming. 157 * Jorge Cwik : Even faster checksumming.
158 * Alan Cox : tcp_data() doesn't ack illegal PSH 158 * Alan Cox : tcp_data() doesn't ack illegal PSH
159 * only frames. At least one pc tcp stack 159 * only frames. At least one pc tcp stack
160 * generates them. 160 * generates them.
161 * Alan Cox : Cache last socket. 161 * Alan Cox : Cache last socket.
162 * Alan Cox : Per route irtt. 162 * Alan Cox : Per route irtt.
163 * Matt Day : poll()->select() match BSD precisely on error 163 * Matt Day : poll()->select() match BSD precisely on error
164 * Alan Cox : New buffers 164 * Alan Cox : New buffers
165 * Marc Tamsky : Various sk->prot->retransmits and 165 * Marc Tamsky : Various sk->prot->retransmits and
166 * sk->retransmits misupdating fixed. 166 * sk->retransmits misupdating fixed.
167 * Fixed tcp_write_timeout: stuck close, 167 * Fixed tcp_write_timeout: stuck close,
168 * and TCP syn retries gets used now. 168 * and TCP syn retries gets used now.
169 * Mark Yarvis : In tcp_read_wakeup(), don't send an 169 * Mark Yarvis : In tcp_read_wakeup(), don't send an
170 * ack if state is TCP_CLOSED. 170 * ack if state is TCP_CLOSED.
171 * Alan Cox : Look up device on a retransmit - routes may 171 * Alan Cox : Look up device on a retransmit - routes may
172 * change. Doesn't yet cope with MSS shrink right 172 * change. Doesn't yet cope with MSS shrink right
173 * but it's a start! 173 * but it's a start!
174 * Marc Tamsky : Closing in closing fixes. 174 * Marc Tamsky : Closing in closing fixes.
175 * Mike Shaver : RFC1122 verifications. 175 * Mike Shaver : RFC1122 verifications.
176 * Alan Cox : rcv_saddr errors. 176 * Alan Cox : rcv_saddr errors.
177 * Alan Cox : Block double connect(). 177 * Alan Cox : Block double connect().
178 * Alan Cox : Small hooks for enSKIP. 178 * Alan Cox : Small hooks for enSKIP.
179 * Alexey Kuznetsov: Path MTU discovery. 179 * Alexey Kuznetsov: Path MTU discovery.
180 * Alan Cox : Support soft errors. 180 * Alan Cox : Support soft errors.
181 * Alan Cox : Fix MTU discovery pathological case 181 * Alan Cox : Fix MTU discovery pathological case
182 * when the remote claims no mtu! 182 * when the remote claims no mtu!
183 * Marc Tamsky : TCP_CLOSE fix. 183 * Marc Tamsky : TCP_CLOSE fix.
184 * Colin (G3TNE) : Send a reset on syn ack replies in 184 * Colin (G3TNE) : Send a reset on syn ack replies in
185 * window but wrong (fixes NT lpd problems) 185 * window but wrong (fixes NT lpd problems)
186 * Pedro Roque : Better TCP window handling, delayed ack. 186 * Pedro Roque : Better TCP window handling, delayed ack.
187 * Joerg Reuter : No modification of locked buffers in 187 * Joerg Reuter : No modification of locked buffers in
188 * tcp_do_retransmit() 188 * tcp_do_retransmit()
189 * Eric Schenk : Changed receiver side silly window 189 * Eric Schenk : Changed receiver side silly window
190 * avoidance algorithm to BSD style 190 * avoidance algorithm to BSD style
191 * algorithm. This doubles throughput 191 * algorithm. This doubles throughput
192 * against machines running Solaris, 192 * against machines running Solaris,
193 * and seems to result in general 193 * and seems to result in general
194 * improvement. 194 * improvement.
195 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD 195 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
196 * Willy Konynenberg : Transparent proxying support. 196 * Willy Konynenberg : Transparent proxying support.
197 * Mike McLagan : Routing by source 197 * Mike McLagan : Routing by source
198 * Keith Owens : Do proper merging with partial SKB's in 198 * Keith Owens : Do proper merging with partial SKB's in
199 * tcp_do_sendmsg to avoid burstiness. 199 * tcp_do_sendmsg to avoid burstiness.
200 * Eric Schenk : Fix fast close down bug with 200 * Eric Schenk : Fix fast close down bug with
201 * shutdown() followed by close(). 201 * shutdown() followed by close().
202 * Andi Kleen : Make poll agree with SIGIO 202 * Andi Kleen : Make poll agree with SIGIO
203 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and 203 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
204 * lingertime == 0 (RFC 793 ABORT Call) 204 * lingertime == 0 (RFC 793 ABORT Call)
205 * Hirokazu Takahashi : Use copy_from_user() instead of 205 * Hirokazu Takahashi : Use copy_from_user() instead of
206 * csum_and_copy_from_user() if possible. 206 * csum_and_copy_from_user() if possible.
207 * 207 *
208 * This program is free software; you can redistribute it and/or 208 * This program is free software; you can redistribute it and/or
209 * modify it under the terms of the GNU General Public License 209 * modify it under the terms of the GNU General Public License
210 * as published by the Free Software Foundation; either version 210 * as published by the Free Software Foundation; either version
211 * 2 of the License, or (at your option) any later version. 211 * 2 of the License, or (at your option) any later version.
212 * 212 *
213 * Description of States: 213 * Description of States:
214 * 214 *
215 * TCP_SYN_SENT sent a connection request, waiting for ack 215 * TCP_SYN_SENT sent a connection request, waiting for ack
216 * 216 *
217 * TCP_SYN_RECV received a connection request, sent ack, 217 * TCP_SYN_RECV received a connection request, sent ack,
218 * waiting for final ack in three-way handshake. 218 * waiting for final ack in three-way handshake.
219 * 219 *
220 * TCP_ESTABLISHED connection established 220 * TCP_ESTABLISHED connection established
221 * 221 *
222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
223 * transmission of remaining buffered data 223 * transmission of remaining buffered data
224 * 224 *
225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
226 * to shutdown 226 * to shutdown
227 * 227 *
228 * TCP_CLOSING both sides have shutdown but we still have 228 * TCP_CLOSING both sides have shutdown but we still have
229 * data we have to finish sending 229 * data we have to finish sending
230 * 230 *
231 * TCP_TIME_WAIT timeout to catch resent junk before entering 231 * TCP_TIME_WAIT timeout to catch resent junk before entering
232 * closed, can only be entered from FIN_WAIT2 232 * closed, can only be entered from FIN_WAIT2
233 * or CLOSING. Required because the other end 233 * or CLOSING. Required because the other end
234 * may not have gotten our last ACK causing it 234 * may not have gotten our last ACK causing it
235 * to retransmit the data packet (which we ignore) 235 * to retransmit the data packet (which we ignore)
236 * 236 *
237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
238 * us to finish writing our data and to shutdown 238 * us to finish writing our data and to shutdown
239 * (we have to close() to move on to LAST_ACK) 239 * (we have to close() to move on to LAST_ACK)
240 * 240 *
241 * TCP_LAST_ACK our side has shutdown after remote has 241 * TCP_LAST_ACK our side has shutdown after remote has
242 * shutdown. There may still be data in our 242 * shutdown. There may still be data in our
243 * buffer that we have to finish sending 243 * buffer that we have to finish sending
244 * 244 *
245 * TCP_CLOSE socket is finished 245 * TCP_CLOSE socket is finished
246 */ 246 */
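/* Editor's note: illustrative sketch, not part of this commit. The states
 * described above are visible to userspace via getsockopt(TCP_INFO); the
 * tcpi_state field carries the corresponding TCP_* state value. The helper
 * name print_tcp_state is made up for illustration.
 */
#if 0 /* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void print_tcp_state(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
		printf("tcpi_state = %u\n", ti.tcpi_state);
}
#endif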
247 247
248 #include <linux/kernel.h> 248 #include <linux/kernel.h>
249 #include <linux/module.h> 249 #include <linux/module.h>
250 #include <linux/types.h> 250 #include <linux/types.h>
251 #include <linux/fcntl.h> 251 #include <linux/fcntl.h>
252 #include <linux/poll.h> 252 #include <linux/poll.h>
253 #include <linux/init.h> 253 #include <linux/init.h>
254 #include <linux/fs.h> 254 #include <linux/fs.h>
255 #include <linux/skbuff.h> 255 #include <linux/skbuff.h>
256 #include <linux/scatterlist.h> 256 #include <linux/scatterlist.h>
257 #include <linux/splice.h> 257 #include <linux/splice.h>
258 #include <linux/net.h> 258 #include <linux/net.h>
259 #include <linux/socket.h> 259 #include <linux/socket.h>
260 #include <linux/random.h> 260 #include <linux/random.h>
261 #include <linux/bootmem.h> 261 #include <linux/bootmem.h>
262 #include <linux/highmem.h> 262 #include <linux/highmem.h>
263 #include <linux/swap.h> 263 #include <linux/swap.h>
264 #include <linux/cache.h> 264 #include <linux/cache.h>
265 #include <linux/err.h> 265 #include <linux/err.h>
266 #include <linux/crypto.h> 266 #include <linux/crypto.h>
267 267
268 #include <net/icmp.h> 268 #include <net/icmp.h>
269 #include <net/tcp.h> 269 #include <net/tcp.h>
270 #include <net/xfrm.h> 270 #include <net/xfrm.h>
271 #include <net/ip.h> 271 #include <net/ip.h>
272 #include <net/netdma.h> 272 #include <net/netdma.h>
273 #include <net/sock.h> 273 #include <net/sock.h>
274 274
275 #include <asm/uaccess.h> 275 #include <asm/uaccess.h>
276 #include <asm/ioctls.h> 276 #include <asm/ioctls.h>
277 277
278 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; 278 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
279 279
280 atomic_t tcp_orphan_count = ATOMIC_INIT(0); 280 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
281 281
282 EXPORT_SYMBOL_GPL(tcp_orphan_count); 282 EXPORT_SYMBOL_GPL(tcp_orphan_count);
283 283
284 int sysctl_tcp_mem[3] __read_mostly; 284 int sysctl_tcp_mem[3] __read_mostly;
285 int sysctl_tcp_wmem[3] __read_mostly; 285 int sysctl_tcp_wmem[3] __read_mostly;
286 int sysctl_tcp_rmem[3] __read_mostly; 286 int sysctl_tcp_rmem[3] __read_mostly;
287 287
288 EXPORT_SYMBOL(sysctl_tcp_mem); 288 EXPORT_SYMBOL(sysctl_tcp_mem);
289 EXPORT_SYMBOL(sysctl_tcp_rmem); 289 EXPORT_SYMBOL(sysctl_tcp_rmem);
290 EXPORT_SYMBOL(sysctl_tcp_wmem); 290 EXPORT_SYMBOL(sysctl_tcp_wmem);
291 291
292 atomic_t tcp_memory_allocated; /* Current allocated memory. */ 292 atomic_t tcp_memory_allocated; /* Current allocated memory. */
293 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ 293 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
294 294
295 EXPORT_SYMBOL(tcp_memory_allocated); 295 EXPORT_SYMBOL(tcp_memory_allocated);
296 EXPORT_SYMBOL(tcp_sockets_allocated); 296 EXPORT_SYMBOL(tcp_sockets_allocated);
297 297
298 /* 298 /*
299 * TCP splice context 299 * TCP splice context
300 */ 300 */
301 struct tcp_splice_state { 301 struct tcp_splice_state {
302 struct pipe_inode_info *pipe; 302 struct pipe_inode_info *pipe;
303 size_t len; 303 size_t len;
304 unsigned int flags; 304 unsigned int flags;
305 }; 305 };
306 306
307 /* 307 /*
308 * Pressure flag: try to collapse. 308 * Pressure flag: try to collapse.
309 * Technical note: it is used by multiple contexts non atomically. 309 * Technical note: it is used by multiple contexts non atomically.
310 * All the __sk_mem_schedule() is of this nature: accounting 310 * All the __sk_mem_schedule() is of this nature: accounting
311 * is strict, actions are advisory and have some latency. 311 * is strict, actions are advisory and have some latency.
312 */ 312 */
313 int tcp_memory_pressure __read_mostly; 313 int tcp_memory_pressure __read_mostly;
314 314
315 EXPORT_SYMBOL(tcp_memory_pressure); 315 EXPORT_SYMBOL(tcp_memory_pressure);
316 316
317 void tcp_enter_memory_pressure(struct sock *sk) 317 void tcp_enter_memory_pressure(struct sock *sk)
318 { 318 {
319 if (!tcp_memory_pressure) { 319 if (!tcp_memory_pressure) {
320 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); 320 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
321 tcp_memory_pressure = 1; 321 tcp_memory_pressure = 1;
322 } 322 }
323 } 323 }
324 324
325 EXPORT_SYMBOL(tcp_enter_memory_pressure); 325 EXPORT_SYMBOL(tcp_enter_memory_pressure);
326 326
327 /* 327 /*
328 * Wait for a TCP event. 328 * Wait for a TCP event.
329 * 329 *
330 * Note that we don't need to lock the socket, as the upper poll layers 330 * Note that we don't need to lock the socket, as the upper poll layers
331 * take care of normal races (between the test and the event) and we don't 331 * take care of normal races (between the test and the event) and we don't
332 * go look at any of the socket buffers directly. 332 * go look at any of the socket buffers directly.
333 */ 333 */
334 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) 334 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
335 { 335 {
336 unsigned int mask; 336 unsigned int mask;
337 struct sock *sk = sock->sk; 337 struct sock *sk = sock->sk;
338 struct tcp_sock *tp = tcp_sk(sk); 338 struct tcp_sock *tp = tcp_sk(sk);
339 339
340 poll_wait(file, sk->sk_sleep, wait); 340 poll_wait(file, sk->sk_sleep, wait);
341 if (sk->sk_state == TCP_LISTEN) 341 if (sk->sk_state == TCP_LISTEN)
342 return inet_csk_listen_poll(sk); 342 return inet_csk_listen_poll(sk);
343 343
344 /* Socket is not locked. We are protected from async events 344 /* Socket is not locked. We are protected from async events
345 * by poll logic and correct handling of state changes 345 * by poll logic and correct handling of state changes
346 * made by other threads is impossible in any case. 346 * made by other threads is impossible in any case.
347 */ 347 */
348 348
349 mask = 0; 349 mask = 0;
350 if (sk->sk_err) 350 if (sk->sk_err)
351 mask = POLLERR; 351 mask = POLLERR;
352 352
353 /* 353 /*
354 * POLLHUP is certainly not done right. But poll() doesn't 354 * POLLHUP is certainly not done right. But poll() doesn't
355 * have a notion of HUP in just one direction, and for a 355 * have a notion of HUP in just one direction, and for a
356 * socket the read side is more interesting. 356 * socket the read side is more interesting.
357 * 357 *
358 * Some poll() documentation says that POLLHUP is incompatible 358 * Some poll() documentation says that POLLHUP is incompatible
359 * with the POLLOUT/POLLWR flags, so somebody should check this 359 * with the POLLOUT/POLLWR flags, so somebody should check this
360 * all. But careful, it tends to be safer to return too many 360 * all. But careful, it tends to be safer to return too many
361 * bits than too few, and you can easily break real applications 361 * bits than too few, and you can easily break real applications
362 * if you don't tell them that something has hung up! 362 * if you don't tell them that something has hung up!
363 * 363 *
364 * Check-me. 364 * Check-me.
365 * 365 *
366 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and 366 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
367 * our fs/select.c). It means that after we received EOF, 367 * our fs/select.c). It means that after we received EOF,
368 * poll always returns immediately, making poll() on write() impossible 368 * poll always returns immediately, making poll() on write() impossible
369 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP 369 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
370 * if and only if shutdown has been made in both directions. 370 * if and only if shutdown has been made in both directions.
371 * Actually, it is interesting to look how Solaris and DUX 371 * Actually, it is interesting to look how Solaris and DUX
372 * solve this dilemma. I would prefer, if POLLHUP were maskable, 372 * solve this dilemma. I would prefer, if POLLHUP were maskable,
373 * then we could set it on SND_SHUTDOWN. BTW examples given 373 * then we could set it on SND_SHUTDOWN. BTW examples given
374 * in Stevens' books assume exactly this behaviour, it explains 374 * in Stevens' books assume exactly this behaviour, it explains
375 * why POLLHUP is incompatible with POLLOUT. --ANK 375 * why POLLHUP is incompatible with POLLOUT. --ANK
376 * 376 *
377 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent 377 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
378 * blocking on fresh not-connected or disconnected socket. --ANK 378 * blocking on fresh not-connected or disconnected socket. --ANK
379 */ 379 */
380 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) 380 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
381 mask |= POLLHUP; 381 mask |= POLLHUP;
382 if (sk->sk_shutdown & RCV_SHUTDOWN) 382 if (sk->sk_shutdown & RCV_SHUTDOWN)
383 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 383 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
384 384
385 /* Connected? */ 385 /* Connected? */
386 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { 386 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
387 int target = sock_rcvlowat(sk, 0, INT_MAX); 387 int target = sock_rcvlowat(sk, 0, INT_MAX);
388 388
389 if (tp->urg_seq == tp->copied_seq && 389 if (tp->urg_seq == tp->copied_seq &&
390 !sock_flag(sk, SOCK_URGINLINE) && 390 !sock_flag(sk, SOCK_URGINLINE) &&
391 tp->urg_data) 391 tp->urg_data)
392 target--; 392 target--;
393 393
394 /* Potential race condition. If read of tp below will 394 /* Potential race condition. If read of tp below will
395 * escape above sk->sk_state, we can be illegally awakened 395 * escape above sk->sk_state, we can be illegally awakened
396 * in SYN_* states. */ 396 * in SYN_* states. */
397 if (tp->rcv_nxt - tp->copied_seq >= target) 397 if (tp->rcv_nxt - tp->copied_seq >= target)
398 mask |= POLLIN | POLLRDNORM; 398 mask |= POLLIN | POLLRDNORM;
399 399
400 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 400 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
401 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { 401 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
402 mask |= POLLOUT | POLLWRNORM; 402 mask |= POLLOUT | POLLWRNORM;
403 } else { /* send SIGIO later */ 403 } else { /* send SIGIO later */
404 set_bit(SOCK_ASYNC_NOSPACE, 404 set_bit(SOCK_ASYNC_NOSPACE,
405 &sk->sk_socket->flags); 405 &sk->sk_socket->flags);
406 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 406 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
407 407
408 /* Race breaker. If space is freed after 408 /* Race breaker. If space is freed after
409 * wspace test but before the flags are set, 409 * wspace test but before the flags are set,
410 * IO signal will be lost. 410 * IO signal will be lost.
411 */ 411 */
412 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 412 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
413 mask |= POLLOUT | POLLWRNORM; 413 mask |= POLLOUT | POLLWRNORM;
414 } 414 }
415 } 415 }
416 416
417 if (tp->urg_data & TCP_URG_VALID) 417 if (tp->urg_data & TCP_URG_VALID)
418 mask |= POLLPRI; 418 mask |= POLLPRI;
419 } 419 }
420 return mask; 420 return mask;
421 } 421 }
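/* Editor's note: illustrative sketch, not part of this commit. It shows how
 * the mask computed by tcp_poll() above looks from userspace: POLLIN once
 * enough data (or a FIN) is queued, POLLOUT while write space is available,
 * POLLHUP only after both directions have shut down. POLLRDHUP needs
 * _GNU_SOURCE and a kernel that supports it.
 */
#if 0 /* userspace example, kept out of the kernel build */
#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>

static void wait_for_tcp_events(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLRDHUP };

	if (poll(&pfd, 1, 5000) > 0) {
		if (pfd.revents & POLLIN)
			printf("data or FIN queued\n");
		if (pfd.revents & POLLOUT)
			printf("write space available\n");
		if (pfd.revents & (POLLRDHUP | POLLHUP))
			printf("peer shut down\n");
	}
}
#endif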
422 422
423 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 423 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
424 { 424 {
425 struct tcp_sock *tp = tcp_sk(sk); 425 struct tcp_sock *tp = tcp_sk(sk);
426 int answ; 426 int answ;
427 427
428 switch (cmd) { 428 switch (cmd) {
429 case SIOCINQ: 429 case SIOCINQ:
430 if (sk->sk_state == TCP_LISTEN) 430 if (sk->sk_state == TCP_LISTEN)
431 return -EINVAL; 431 return -EINVAL;
432 432
433 lock_sock(sk); 433 lock_sock(sk);
434 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 434 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
435 answ = 0; 435 answ = 0;
436 else if (sock_flag(sk, SOCK_URGINLINE) || 436 else if (sock_flag(sk, SOCK_URGINLINE) ||
437 !tp->urg_data || 437 !tp->urg_data ||
438 before(tp->urg_seq, tp->copied_seq) || 438 before(tp->urg_seq, tp->copied_seq) ||
439 !before(tp->urg_seq, tp->rcv_nxt)) { 439 !before(tp->urg_seq, tp->rcv_nxt)) {
440 answ = tp->rcv_nxt - tp->copied_seq; 440 answ = tp->rcv_nxt - tp->copied_seq;
441 441
442 /* Subtract 1, if FIN is in queue. */ 442 /* Subtract 1, if FIN is in queue. */
443 if (answ && !skb_queue_empty(&sk->sk_receive_queue)) 443 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
444 answ -= 444 answ -=
445 tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; 445 tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
446 } else 446 } else
447 answ = tp->urg_seq - tp->copied_seq; 447 answ = tp->urg_seq - tp->copied_seq;
448 release_sock(sk); 448 release_sock(sk);
449 break; 449 break;
450 case SIOCATMARK: 450 case SIOCATMARK:
451 answ = tp->urg_data && tp->urg_seq == tp->copied_seq; 451 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
452 break; 452 break;
453 case SIOCOUTQ: 453 case SIOCOUTQ:
454 if (sk->sk_state == TCP_LISTEN) 454 if (sk->sk_state == TCP_LISTEN)
455 return -EINVAL; 455 return -EINVAL;
456 456
457 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 457 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
458 answ = 0; 458 answ = 0;
459 else 459 else
460 answ = tp->write_seq - tp->snd_una; 460 answ = tp->write_seq - tp->snd_una;
461 break; 461 break;
462 default: 462 default:
463 return -ENOIOCTLCMD; 463 return -ENOIOCTLCMD;
464 } 464 }
465 465
466 return put_user(answ, (int __user *)arg); 466 return put_user(answ, (int __user *)arg);
467 } 467 }
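/* Editor's note: illustrative sketch, not part of this commit. The ioctls
 * handled in tcp_ioctl() above are reachable from userspace: SIOCINQ (aka
 * FIONREAD) reports unread bytes in the receive queue, SIOCOUTQ the bytes
 * not yet acknowledged by the peer.
 */
#if 0 /* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void print_queue_sizes(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("unread: %d bytes, unacked: %d bytes\n", inq, outq);
}
#endif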
468 468
469 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 469 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
470 { 470 {
471 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 471 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
472 tp->pushed_seq = tp->write_seq; 472 tp->pushed_seq = tp->write_seq;
473 } 473 }
474 474
475 static inline int forced_push(struct tcp_sock *tp) 475 static inline int forced_push(struct tcp_sock *tp)
476 { 476 {
477 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 477 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
478 } 478 }
479 479
480 static inline void skb_entail(struct sock *sk, struct sk_buff *skb) 480 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
481 { 481 {
482 struct tcp_sock *tp = tcp_sk(sk); 482 struct tcp_sock *tp = tcp_sk(sk);
483 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 483 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
484 484
485 skb->csum = 0; 485 skb->csum = 0;
486 tcb->seq = tcb->end_seq = tp->write_seq; 486 tcb->seq = tcb->end_seq = tp->write_seq;
487 tcb->flags = TCPCB_FLAG_ACK; 487 tcb->flags = TCPCB_FLAG_ACK;
488 tcb->sacked = 0; 488 tcb->sacked = 0;
489 skb_header_release(skb); 489 skb_header_release(skb);
490 tcp_add_write_queue_tail(sk, skb); 490 tcp_add_write_queue_tail(sk, skb);
491 sk->sk_wmem_queued += skb->truesize; 491 sk->sk_wmem_queued += skb->truesize;
492 sk_mem_charge(sk, skb->truesize); 492 sk_mem_charge(sk, skb->truesize);
493 if (tp->nonagle & TCP_NAGLE_PUSH) 493 if (tp->nonagle & TCP_NAGLE_PUSH)
494 tp->nonagle &= ~TCP_NAGLE_PUSH; 494 tp->nonagle &= ~TCP_NAGLE_PUSH;
495 } 495 }
496 496
497 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, 497 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
498 struct sk_buff *skb) 498 struct sk_buff *skb)
499 { 499 {
500 if (flags & MSG_OOB) 500 if (flags & MSG_OOB)
501 tp->snd_up = tp->write_seq; 501 tp->snd_up = tp->write_seq;
502 } 502 }
503 503
504 static inline void tcp_push(struct sock *sk, int flags, int mss_now, 504 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
505 int nonagle) 505 int nonagle)
506 { 506 {
507 struct tcp_sock *tp = tcp_sk(sk); 507 struct tcp_sock *tp = tcp_sk(sk);
508 508
509 if (tcp_send_head(sk)) { 509 if (tcp_send_head(sk)) {
510 struct sk_buff *skb = tcp_write_queue_tail(sk); 510 struct sk_buff *skb = tcp_write_queue_tail(sk);
511 if (!(flags & MSG_MORE) || forced_push(tp)) 511 if (!(flags & MSG_MORE) || forced_push(tp))
512 tcp_mark_push(tp, skb); 512 tcp_mark_push(tp, skb);
513 tcp_mark_urg(tp, flags, skb); 513 tcp_mark_urg(tp, flags, skb);
514 __tcp_push_pending_frames(sk, mss_now, 514 __tcp_push_pending_frames(sk, mss_now,
515 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); 515 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
516 } 516 }
517 } 517 }
518 518
519 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, 519 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
520 unsigned int offset, size_t len) 520 unsigned int offset, size_t len)
521 { 521 {
522 struct tcp_splice_state *tss = rd_desc->arg.data; 522 struct tcp_splice_state *tss = rd_desc->arg.data;
523 523
524 return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); 524 return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
525 } 525 }
526 526
527 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) 527 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
528 { 528 {
529 /* Store TCP splice context information in read_descriptor_t. */ 529 /* Store TCP splice context information in read_descriptor_t. */
530 read_descriptor_t rd_desc = { 530 read_descriptor_t rd_desc = {
531 .arg.data = tss, 531 .arg.data = tss,
532 }; 532 };
533 533
534 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); 534 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
535 } 535 }
536 536
537 /** 537 /**
538 * tcp_splice_read - splice data from TCP socket to a pipe 538 * tcp_splice_read - splice data from TCP socket to a pipe
539 * @sock: socket to splice from 539 * @sock: socket to splice from
540 * @ppos: position (not valid) 540 * @ppos: position (not valid)
541 * @pipe: pipe to splice to 541 * @pipe: pipe to splice to
542 * @len: number of bytes to splice 542 * @len: number of bytes to splice
543 * @flags: splice modifier flags 543 * @flags: splice modifier flags
544 * 544 *
545 * Description: 545 * Description:
546 * Will read pages from given socket and fill them into a pipe. 546 * Will read pages from given socket and fill them into a pipe.
547 * 547 *
548 **/ 548 **/
549 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, 549 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
550 struct pipe_inode_info *pipe, size_t len, 550 struct pipe_inode_info *pipe, size_t len,
551 unsigned int flags) 551 unsigned int flags)
552 { 552 {
553 struct sock *sk = sock->sk; 553 struct sock *sk = sock->sk;
554 struct tcp_splice_state tss = { 554 struct tcp_splice_state tss = {
555 .pipe = pipe, 555 .pipe = pipe,
556 .len = len, 556 .len = len,
557 .flags = flags, 557 .flags = flags,
558 }; 558 };
559 long timeo; 559 long timeo;
560 ssize_t spliced; 560 ssize_t spliced;
561 int ret; 561 int ret;
562 562
563 /* 563 /*
564 * We can't seek on a socket input 564 * We can't seek on a socket input
565 */ 565 */
566 if (unlikely(*ppos)) 566 if (unlikely(*ppos))
567 return -ESPIPE; 567 return -ESPIPE;
568 568
569 ret = spliced = 0; 569 ret = spliced = 0;
570 570
571 lock_sock(sk); 571 lock_sock(sk);
572 572
573 timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); 573 timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
574 while (tss.len) { 574 while (tss.len) {
575 ret = __tcp_splice_read(sk, &tss); 575 ret = __tcp_splice_read(sk, &tss);
576 if (ret < 0) 576 if (ret < 0)
577 break; 577 break;
578 else if (!ret) { 578 else if (!ret) {
579 if (spliced) 579 if (spliced)
580 break; 580 break;
581 if (flags & SPLICE_F_NONBLOCK) { 581 if (flags & SPLICE_F_NONBLOCK) {
582 ret = -EAGAIN; 582 ret = -EAGAIN;
583 break; 583 break;
584 } 584 }
585 if (sock_flag(sk, SOCK_DONE)) 585 if (sock_flag(sk, SOCK_DONE))
586 break; 586 break;
587 if (sk->sk_err) { 587 if (sk->sk_err) {
588 ret = sock_error(sk); 588 ret = sock_error(sk);
589 break; 589 break;
590 } 590 }
591 if (sk->sk_shutdown & RCV_SHUTDOWN) 591 if (sk->sk_shutdown & RCV_SHUTDOWN)
592 break; 592 break;
593 if (sk->sk_state == TCP_CLOSE) { 593 if (sk->sk_state == TCP_CLOSE) {
594 /* 594 /*
595 * This occurs when user tries to read 595 * This occurs when user tries to read
596 * from a never-connected socket. 596 * from a never-connected socket.
597 */ 597 */
598 if (!sock_flag(sk, SOCK_DONE)) 598 if (!sock_flag(sk, SOCK_DONE))
599 ret = -ENOTCONN; 599 ret = -ENOTCONN;
600 break; 600 break;
601 } 601 }
602 if (!timeo) { 602 if (!timeo) {
603 ret = -EAGAIN; 603 ret = -EAGAIN;
604 break; 604 break;
605 } 605 }
606 sk_wait_data(sk, &timeo); 606 sk_wait_data(sk, &timeo);
607 if (signal_pending(current)) { 607 if (signal_pending(current)) {
608 ret = sock_intr_errno(timeo); 608 ret = sock_intr_errno(timeo);
609 break; 609 break;
610 } 610 }
611 continue; 611 continue;
612 } 612 }
613 tss.len -= ret; 613 tss.len -= ret;
614 spliced += ret; 614 spliced += ret;
615 615
616 release_sock(sk); 616 release_sock(sk);
617 lock_sock(sk); 617 lock_sock(sk);
618 618
619 if (sk->sk_err || sk->sk_state == TCP_CLOSE || 619 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
620 (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || 620 (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
621 signal_pending(current)) 621 signal_pending(current))
622 break; 622 break;
623 } 623 }
624 624
625 release_sock(sk); 625 release_sock(sk);
626 626
627 if (spliced) 627 if (spliced)
628 return spliced; 628 return spliced;
629 629
630 return ret; 630 return ret;
631 } 631 }
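/* Editor's note: illustrative sketch, not part of this commit.
 * tcp_splice_read() above backs the splice(2) syscall when the source fd is
 * a TCP socket; a typical zero-copy relay moves data socket -> pipe ->
 * destination fd. The helper name relay_from_socket is made up.
 */
#if 0 /* userspace example, kept out of the kernel build */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t relay_from_socket(int sockfd, int outfd, size_t len)
{
	int pipefd[2];
	ssize_t n;

	if (pipe(pipefd) < 0)
		return -1;
	n = splice(sockfd, NULL, pipefd[1], NULL, len, SPLICE_F_MOVE);
	if (n > 0)
		n = splice(pipefd[0], NULL, outfd, NULL, n, SPLICE_F_MOVE);
	close(pipefd[0]);
	close(pipefd[1]);
	return n;
}
#endif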
632 632
633 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) 633 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
634 { 634 {
635 struct sk_buff *skb; 635 struct sk_buff *skb;
636 636
637 /* The TCP header must be at least 32-bit aligned. */ 637 /* The TCP header must be at least 32-bit aligned. */
638 size = ALIGN(size, 4); 638 size = ALIGN(size, 4);
639 639
640 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); 640 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
641 if (skb) { 641 if (skb) {
642 if (sk_wmem_schedule(sk, skb->truesize)) { 642 if (sk_wmem_schedule(sk, skb->truesize)) {
643 /* 643 /*
644 * Make sure that we have exactly size bytes 644 * Make sure that we have exactly size bytes
645 * available to the caller, no more, no less. 645 * available to the caller, no more, no less.
646 */ 646 */
647 skb_reserve(skb, skb_tailroom(skb) - size); 647 skb_reserve(skb, skb_tailroom(skb) - size);
648 return skb; 648 return skb;
649 } 649 }
650 __kfree_skb(skb); 650 __kfree_skb(skb);
651 } else { 651 } else {
652 sk->sk_prot->enter_memory_pressure(sk); 652 sk->sk_prot->enter_memory_pressure(sk);
653 sk_stream_moderate_sndbuf(sk); 653 sk_stream_moderate_sndbuf(sk);
654 } 654 }
655 return NULL; 655 return NULL;
656 } 656 }
657 657
658 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, 658 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
659 size_t psize, int flags) 659 size_t psize, int flags)
660 { 660 {
661 struct tcp_sock *tp = tcp_sk(sk); 661 struct tcp_sock *tp = tcp_sk(sk);
662 int mss_now, size_goal; 662 int mss_now, size_goal;
663 int err; 663 int err;
664 ssize_t copied; 664 ssize_t copied;
665 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 665 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
666 666
667 /* Wait for a connection to finish. */ 667 /* Wait for a connection to finish. */
668 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 668 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
669 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 669 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
670 goto out_err; 670 goto out_err;
671 671
672 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 672 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
673 673
674 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 674 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
675 size_goal = tp->xmit_size_goal; 675 size_goal = tp->xmit_size_goal;
676 copied = 0; 676 copied = 0;
677 677
678 err = -EPIPE; 678 err = -EPIPE;
679 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 679 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
680 goto do_error; 680 goto do_error;
681 681
682 while (psize > 0) { 682 while (psize > 0) {
683 struct sk_buff *skb = tcp_write_queue_tail(sk); 683 struct sk_buff *skb = tcp_write_queue_tail(sk);
684 struct page *page = pages[poffset / PAGE_SIZE]; 684 struct page *page = pages[poffset / PAGE_SIZE];
685 int copy, i, can_coalesce; 685 int copy, i, can_coalesce;
686 int offset = poffset % PAGE_SIZE; 686 int offset = poffset % PAGE_SIZE;
687 int size = min_t(size_t, psize, PAGE_SIZE - offset); 687 int size = min_t(size_t, psize, PAGE_SIZE - offset);
688 688
689 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { 689 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
690 new_segment: 690 new_segment:
691 if (!sk_stream_memory_free(sk)) 691 if (!sk_stream_memory_free(sk))
692 goto wait_for_sndbuf; 692 goto wait_for_sndbuf;
693 693
694 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); 694 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
695 if (!skb) 695 if (!skb)
696 goto wait_for_memory; 696 goto wait_for_memory;
697 697
698 skb_entail(sk, skb); 698 skb_entail(sk, skb);
699 copy = size_goal; 699 copy = size_goal;
700 } 700 }
701 701
702 if (copy > size) 702 if (copy > size)
703 copy = size; 703 copy = size;
704 704
705 i = skb_shinfo(skb)->nr_frags; 705 i = skb_shinfo(skb)->nr_frags;
706 can_coalesce = skb_can_coalesce(skb, i, page, offset); 706 can_coalesce = skb_can_coalesce(skb, i, page, offset);
707 if (!can_coalesce && i >= MAX_SKB_FRAGS) { 707 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
708 tcp_mark_push(tp, skb); 708 tcp_mark_push(tp, skb);
709 goto new_segment; 709 goto new_segment;
710 } 710 }
711 if (!sk_wmem_schedule(sk, copy)) 711 if (!sk_wmem_schedule(sk, copy))
712 goto wait_for_memory; 712 goto wait_for_memory;
713 713
714 if (can_coalesce) { 714 if (can_coalesce) {
715 skb_shinfo(skb)->frags[i - 1].size += copy; 715 skb_shinfo(skb)->frags[i - 1].size += copy;
716 } else { 716 } else {
717 get_page(page); 717 get_page(page);
718 skb_fill_page_desc(skb, i, page, offset, copy); 718 skb_fill_page_desc(skb, i, page, offset, copy);
719 } 719 }
720 720
721 skb->len += copy; 721 skb->len += copy;
722 skb->data_len += copy; 722 skb->data_len += copy;
723 skb->truesize += copy; 723 skb->truesize += copy;
724 sk->sk_wmem_queued += copy; 724 sk->sk_wmem_queued += copy;
725 sk_mem_charge(sk, copy); 725 sk_mem_charge(sk, copy);
726 skb->ip_summed = CHECKSUM_PARTIAL; 726 skb->ip_summed = CHECKSUM_PARTIAL;
727 tp->write_seq += copy; 727 tp->write_seq += copy;
728 TCP_SKB_CB(skb)->end_seq += copy; 728 TCP_SKB_CB(skb)->end_seq += copy;
729 skb_shinfo(skb)->gso_segs = 0; 729 skb_shinfo(skb)->gso_segs = 0;
730 730
731 if (!copied) 731 if (!copied)
732 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 732 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
733 733
734 copied += copy; 734 copied += copy;
735 poffset += copy; 735 poffset += copy;
736 if (!(psize -= copy)) 736 if (!(psize -= copy))
737 goto out; 737 goto out;
738 738
739 if (skb->len < size_goal || (flags & MSG_OOB)) 739 if (skb->len < size_goal || (flags & MSG_OOB))
740 continue; 740 continue;
741 741
742 if (forced_push(tp)) { 742 if (forced_push(tp)) {
743 tcp_mark_push(tp, skb); 743 tcp_mark_push(tp, skb);
744 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); 744 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
745 } else if (skb == tcp_send_head(sk)) 745 } else if (skb == tcp_send_head(sk))
746 tcp_push_one(sk, mss_now); 746 tcp_push_one(sk, mss_now);
747 continue; 747 continue;
748 748
749 wait_for_sndbuf: 749 wait_for_sndbuf:
750 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 750 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
751 wait_for_memory: 751 wait_for_memory:
752 if (copied) 752 if (copied)
753 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 753 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
754 754
755 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 755 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
756 goto do_error; 756 goto do_error;
757 757
758 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 758 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
759 size_goal = tp->xmit_size_goal; 759 size_goal = tp->xmit_size_goal;
760 } 760 }
761 761
762 out: 762 out:
763 if (copied) 763 if (copied)
764 tcp_push(sk, flags, mss_now, tp->nonagle); 764 tcp_push(sk, flags, mss_now, tp->nonagle);
765 return copied; 765 return copied;
766 766
767 do_error: 767 do_error:
768 if (copied) 768 if (copied)
769 goto out; 769 goto out;
770 out_err: 770 out_err:
771 return sk_stream_error(sk, flags, err); 771 return sk_stream_error(sk, flags, err);
772 } 772 }
773 773
774 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, 774 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
775 size_t size, int flags) 775 size_t size, int flags)
776 { 776 {
777 ssize_t res; 777 ssize_t res;
778 struct sock *sk = sock->sk; 778 struct sock *sk = sock->sk;
779 779
780 if (!(sk->sk_route_caps & NETIF_F_SG) || 780 if (!(sk->sk_route_caps & NETIF_F_SG) ||
781 !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) 781 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
782 return sock_no_sendpage(sock, page, offset, size, flags); 782 return sock_no_sendpage(sock, page, offset, size, flags);
783 783
784 lock_sock(sk); 784 lock_sock(sk);
785 TCP_CHECK_TIMER(sk); 785 TCP_CHECK_TIMER(sk);
786 res = do_tcp_sendpages(sk, &page, offset, size, flags); 786 res = do_tcp_sendpages(sk, &page, offset, size, flags);
787 TCP_CHECK_TIMER(sk); 787 TCP_CHECK_TIMER(sk);
788 release_sock(sk); 788 release_sock(sk);
789 return res; 789 return res;
790 } 790 }
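/* Editor's note: illustrative sketch, not part of this commit. tcp_sendpage()
 * above is reached from sendfile(2) on a TCP socket when the route supports
 * SG and checksum offload; otherwise it falls back to sock_no_sendpage().
 * The helper name send_file_over_tcp is made up.
 */
#if 0 /* userspace example, kept out of the kernel build */
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static ssize_t send_file_over_tcp(int sockfd, const char *path)
{
	struct stat st;
	off_t off = 0;
	int filefd = open(path, O_RDONLY);
	ssize_t n = -1;

	if (filefd >= 0 && fstat(filefd, &st) == 0)
		n = sendfile(sockfd, filefd, &off, st.st_size);
	if (filefd >= 0)
		close(filefd);
	return n;
}
#endif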
791 791
792 #define TCP_PAGE(sk) (sk->sk_sndmsg_page) 792 #define TCP_PAGE(sk) (sk->sk_sndmsg_page)
793 #define TCP_OFF(sk) (sk->sk_sndmsg_off) 793 #define TCP_OFF(sk) (sk->sk_sndmsg_off)
794 794
795 static inline int select_size(struct sock *sk) 795 static inline int select_size(struct sock *sk)
796 { 796 {
797 struct tcp_sock *tp = tcp_sk(sk); 797 struct tcp_sock *tp = tcp_sk(sk);
798 int tmp = tp->mss_cache; 798 int tmp = tp->mss_cache;
799 799
800 if (sk->sk_route_caps & NETIF_F_SG) { 800 if (sk->sk_route_caps & NETIF_F_SG) {
801 if (sk_can_gso(sk)) 801 if (sk_can_gso(sk))
802 tmp = 0; 802 tmp = 0;
803 else { 803 else {
804 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 804 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
805 805
806 if (tmp >= pgbreak && 806 if (tmp >= pgbreak &&
807 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 807 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
808 tmp = pgbreak; 808 tmp = pgbreak;
809 } 809 }
810 } 810 }
811 811
812 return tmp; 812 return tmp;
813 } 813 }
814 814
815 int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 815 int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
816 size_t size) 816 size_t size)
817 { 817 {
818 struct sock *sk = sock->sk; 818 struct sock *sk = sock->sk;
819 struct iovec *iov; 819 struct iovec *iov;
820 struct tcp_sock *tp = tcp_sk(sk); 820 struct tcp_sock *tp = tcp_sk(sk);
821 struct sk_buff *skb; 821 struct sk_buff *skb;
822 int iovlen, flags; 822 int iovlen, flags;
823 int mss_now, size_goal; 823 int mss_now, size_goal;
824 int err, copied; 824 int err, copied;
825 long timeo; 825 long timeo;
826 826
827 lock_sock(sk); 827 lock_sock(sk);
828 TCP_CHECK_TIMER(sk); 828 TCP_CHECK_TIMER(sk);
829 829
830 flags = msg->msg_flags; 830 flags = msg->msg_flags;
831 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 831 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
832 832
833 /* Wait for a connection to finish. */ 833 /* Wait for a connection to finish. */
834 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 834 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
835 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 835 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
836 goto out_err; 836 goto out_err;
837 837
838 /* This should be in poll */ 838 /* This should be in poll */
839 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 839 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
840 840
841 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 841 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
842 size_goal = tp->xmit_size_goal; 842 size_goal = tp->xmit_size_goal;
843 843
844 /* Ok commence sending. */ 844 /* Ok commence sending. */
845 iovlen = msg->msg_iovlen; 845 iovlen = msg->msg_iovlen;
846 iov = msg->msg_iov; 846 iov = msg->msg_iov;
847 copied = 0; 847 copied = 0;
848 848
849 err = -EPIPE; 849 err = -EPIPE;
850 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 850 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
851 goto do_error; 851 goto do_error;
852 852
853 while (--iovlen >= 0) { 853 while (--iovlen >= 0) {
854 int seglen = iov->iov_len; 854 int seglen = iov->iov_len;
855 unsigned char __user *from = iov->iov_base; 855 unsigned char __user *from = iov->iov_base;
856 856
857 iov++; 857 iov++;
858 858
859 while (seglen > 0) { 859 while (seglen > 0) {
860 int copy; 860 int copy;
861 861
862 skb = tcp_write_queue_tail(sk); 862 skb = tcp_write_queue_tail(sk);
863 863
864 if (!tcp_send_head(sk) || 864 if (!tcp_send_head(sk) ||
865 (copy = size_goal - skb->len) <= 0) { 865 (copy = size_goal - skb->len) <= 0) {
866 866
867 new_segment: 867 new_segment:
868 /* Allocate new segment. If the interface is SG, 868 /* Allocate new segment. If the interface is SG,
869 * allocate skb fitting to single page. 869 * allocate skb fitting to single page.
870 */ 870 */
871 if (!sk_stream_memory_free(sk)) 871 if (!sk_stream_memory_free(sk))
872 goto wait_for_sndbuf; 872 goto wait_for_sndbuf;
873 873
874 skb = sk_stream_alloc_skb(sk, select_size(sk), 874 skb = sk_stream_alloc_skb(sk, select_size(sk),
875 sk->sk_allocation); 875 sk->sk_allocation);
876 if (!skb) 876 if (!skb)
877 goto wait_for_memory; 877 goto wait_for_memory;
878 878
879 /* 879 /*
880 * Check whether we can use HW checksum. 880 * Check whether we can use HW checksum.
881 */ 881 */
882 if (sk->sk_route_caps & NETIF_F_ALL_CSUM) 882 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
883 skb->ip_summed = CHECKSUM_PARTIAL; 883 skb->ip_summed = CHECKSUM_PARTIAL;
884 884
885 skb_entail(sk, skb); 885 skb_entail(sk, skb);
886 copy = size_goal; 886 copy = size_goal;
887 } 887 }
888 888
889 /* Try to append data to the end of skb. */ 889 /* Try to append data to the end of skb. */
890 if (copy > seglen) 890 if (copy > seglen)
891 copy = seglen; 891 copy = seglen;
892 892
893 /* Where to copy to? */ 893 /* Where to copy to? */
894 if (skb_tailroom(skb) > 0) { 894 if (skb_tailroom(skb) > 0) {
895 /* We have some space in skb head. Superb! */ 895 /* We have some space in skb head. Superb! */
896 if (copy > skb_tailroom(skb)) 896 if (copy > skb_tailroom(skb))
897 copy = skb_tailroom(skb); 897 copy = skb_tailroom(skb);
898 if ((err = skb_add_data(skb, from, copy)) != 0) 898 if ((err = skb_add_data(skb, from, copy)) != 0)
899 goto do_fault; 899 goto do_fault;
900 } else { 900 } else {
901 int merge = 0; 901 int merge = 0;
902 int i = skb_shinfo(skb)->nr_frags; 902 int i = skb_shinfo(skb)->nr_frags;
903 struct page *page = TCP_PAGE(sk); 903 struct page *page = TCP_PAGE(sk);
904 int off = TCP_OFF(sk); 904 int off = TCP_OFF(sk);
905 905
906 if (skb_can_coalesce(skb, i, page, off) && 906 if (skb_can_coalesce(skb, i, page, off) &&
907 off != PAGE_SIZE) { 907 off != PAGE_SIZE) {
908 /* We can extend the last page 908 /* We can extend the last page
909 * fragment. */ 909 * fragment. */
910 merge = 1; 910 merge = 1;
911 } else if (i == MAX_SKB_FRAGS || 911 } else if (i == MAX_SKB_FRAGS ||
912 (!i && 912 (!i &&
913 !(sk->sk_route_caps & NETIF_F_SG))) { 913 !(sk->sk_route_caps & NETIF_F_SG))) {
914 /* Need to add new fragment and cannot 914 /* Need to add new fragment and cannot
915 * do this because interface is non-SG, 915 * do this because interface is non-SG,
916 * or because all the page slots are 916 * or because all the page slots are
917 * busy. */ 917 * busy. */
918 tcp_mark_push(tp, skb); 918 tcp_mark_push(tp, skb);
919 goto new_segment; 919 goto new_segment;
920 } else if (page) { 920 } else if (page) {
921 if (off == PAGE_SIZE) { 921 if (off == PAGE_SIZE) {
922 put_page(page); 922 put_page(page);
923 TCP_PAGE(sk) = page = NULL; 923 TCP_PAGE(sk) = page = NULL;
924 off = 0; 924 off = 0;
925 } 925 }
926 } else 926 } else
927 off = 0; 927 off = 0;
928 928
929 if (copy > PAGE_SIZE - off) 929 if (copy > PAGE_SIZE - off)
930 copy = PAGE_SIZE - off; 930 copy = PAGE_SIZE - off;
931 931
932 if (!sk_wmem_schedule(sk, copy)) 932 if (!sk_wmem_schedule(sk, copy))
933 goto wait_for_memory; 933 goto wait_for_memory;
934 934
935 if (!page) { 935 if (!page) {
936 /* Allocate new cache page. */ 936 /* Allocate new cache page. */
937 if (!(page = sk_stream_alloc_page(sk))) 937 if (!(page = sk_stream_alloc_page(sk)))
938 goto wait_for_memory; 938 goto wait_for_memory;
939 } 939 }
940 940
941 /* Time to copy data. We are close to 941 /* Time to copy data. We are close to
942 * the end! */ 942 * the end! */
943 err = skb_copy_to_page(sk, from, skb, page, 943 err = skb_copy_to_page(sk, from, skb, page,
944 off, copy); 944 off, copy);
945 if (err) { 945 if (err) {
946 /* If this page was new, give it to the 946 /* If this page was new, give it to the
947 * socket so it does not get leaked. 947 * socket so it does not get leaked.
948 */ 948 */
949 if (!TCP_PAGE(sk)) { 949 if (!TCP_PAGE(sk)) {
950 TCP_PAGE(sk) = page; 950 TCP_PAGE(sk) = page;
951 TCP_OFF(sk) = 0; 951 TCP_OFF(sk) = 0;
952 } 952 }
953 goto do_error; 953 goto do_error;
954 } 954 }
955 955
956 /* Update the skb. */ 956 /* Update the skb. */
957 if (merge) { 957 if (merge) {
958 skb_shinfo(skb)->frags[i - 1].size += 958 skb_shinfo(skb)->frags[i - 1].size +=
959 copy; 959 copy;
960 } else { 960 } else {
961 skb_fill_page_desc(skb, i, page, off, copy); 961 skb_fill_page_desc(skb, i, page, off, copy);
962 if (TCP_PAGE(sk)) { 962 if (TCP_PAGE(sk)) {
963 get_page(page); 963 get_page(page);
964 } else if (off + copy < PAGE_SIZE) { 964 } else if (off + copy < PAGE_SIZE) {
965 get_page(page); 965 get_page(page);
966 TCP_PAGE(sk) = page; 966 TCP_PAGE(sk) = page;
967 } 967 }
968 } 968 }
969 969
970 TCP_OFF(sk) = off + copy; 970 TCP_OFF(sk) = off + copy;
971 } 971 }
972 972
973 if (!copied) 973 if (!copied)
974 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 974 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
975 975
976 tp->write_seq += copy; 976 tp->write_seq += copy;
977 TCP_SKB_CB(skb)->end_seq += copy; 977 TCP_SKB_CB(skb)->end_seq += copy;
978 skb_shinfo(skb)->gso_segs = 0; 978 skb_shinfo(skb)->gso_segs = 0;
979 979
980 from += copy; 980 from += copy;
981 copied += copy; 981 copied += copy;
982 if ((seglen -= copy) == 0 && iovlen == 0) 982 if ((seglen -= copy) == 0 && iovlen == 0)
983 goto out; 983 goto out;
984 984
985 if (skb->len < size_goal || (flags & MSG_OOB)) 985 if (skb->len < size_goal || (flags & MSG_OOB))
986 continue; 986 continue;
987 987
988 if (forced_push(tp)) { 988 if (forced_push(tp)) {
989 tcp_mark_push(tp, skb); 989 tcp_mark_push(tp, skb);
990 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); 990 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
991 } else if (skb == tcp_send_head(sk)) 991 } else if (skb == tcp_send_head(sk))
992 tcp_push_one(sk, mss_now); 992 tcp_push_one(sk, mss_now);
993 continue; 993 continue;
994 994
995 wait_for_sndbuf: 995 wait_for_sndbuf:
996 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 996 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
997 wait_for_memory: 997 wait_for_memory:
998 if (copied) 998 if (copied)
999 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 999 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1000 1000
1001 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 1001 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1002 goto do_error; 1002 goto do_error;
1003 1003
1004 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 1004 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1005 size_goal = tp->xmit_size_goal; 1005 size_goal = tp->xmit_size_goal;
1006 } 1006 }
1007 } 1007 }
1008 1008
1009 out: 1009 out:
1010 if (copied) 1010 if (copied)
1011 tcp_push(sk, flags, mss_now, tp->nonagle); 1011 tcp_push(sk, flags, mss_now, tp->nonagle);
1012 TCP_CHECK_TIMER(sk); 1012 TCP_CHECK_TIMER(sk);
1013 release_sock(sk); 1013 release_sock(sk);
1014 return copied; 1014 return copied;
1015 1015
1016 do_fault: 1016 do_fault:
1017 if (!skb->len) { 1017 if (!skb->len) {
1018 tcp_unlink_write_queue(skb, sk); 1018 tcp_unlink_write_queue(skb, sk);
1019 /* It is the one place in all of TCP, except connection 1019 /* It is the one place in all of TCP, except connection
1020 * reset, where we can be unlinking the send_head. 1020 * reset, where we can be unlinking the send_head.
1021 */ 1021 */
1022 tcp_check_send_head(sk, skb); 1022 tcp_check_send_head(sk, skb);
1023 sk_wmem_free_skb(sk, skb); 1023 sk_wmem_free_skb(sk, skb);
1024 } 1024 }
1025 1025
1026 do_error: 1026 do_error:
1027 if (copied) 1027 if (copied)
1028 goto out; 1028 goto out;
1029 out_err: 1029 out_err:
1030 err = sk_stream_error(sk, flags, err); 1030 err = sk_stream_error(sk, flags, err);
1031 TCP_CHECK_TIMER(sk); 1031 TCP_CHECK_TIMER(sk);
1032 release_sock(sk); 1032 release_sock(sk);
1033 return err; 1033 return err;
1034 } 1034 }
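/* Editor's note: illustrative sketch, not part of this commit. The MSG_MORE
 * handling in tcp_push()/tcp_sendmsg() above lets userspace keep a segment
 * open across several send() calls and have the final call flush it; the
 * function below is a made-up example of that pattern.
 */
#if 0 /* userspace example, kept out of the kernel build */
#include <sys/socket.h>
#include <string.h>

static void send_header_then_body(int fd, const char *hdr, const char *body)
{
	/* Header stays corked in the write queue until the body follows. */
	send(fd, hdr, strlen(hdr), MSG_MORE);
	send(fd, body, strlen(body), 0);
}
#endif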
1035 1035
1036 /* 1036 /*
1037 * Handle reading urgent data. BSD has very simple semantics for 1037 * Handle reading urgent data. BSD has very simple semantics for
1038 * this, no blocking and very strange errors 8) 1038 * this, no blocking and very strange errors 8)
1039 */ 1039 */
1040 1040
1041 static int tcp_recv_urg(struct sock *sk, long timeo, 1041 static int tcp_recv_urg(struct sock *sk, long timeo,
1042 struct msghdr *msg, int len, int flags, 1042 struct msghdr *msg, int len, int flags,
1043 int *addr_len) 1043 int *addr_len)
1044 { 1044 {
1045 struct tcp_sock *tp = tcp_sk(sk); 1045 struct tcp_sock *tp = tcp_sk(sk);
1046 1046
1047 /* No URG data to read. */ 1047 /* No URG data to read. */
1048 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || 1048 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1049 tp->urg_data == TCP_URG_READ) 1049 tp->urg_data == TCP_URG_READ)
1050 return -EINVAL; /* Yes this is right ! */ 1050 return -EINVAL; /* Yes this is right ! */
1051 1051
1052 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) 1052 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1053 return -ENOTCONN; 1053 return -ENOTCONN;
1054 1054
1055 if (tp->urg_data & TCP_URG_VALID) { 1055 if (tp->urg_data & TCP_URG_VALID) {
1056 int err = 0; 1056 int err = 0;
1057 char c = tp->urg_data; 1057 char c = tp->urg_data;
1058 1058
1059 if (!(flags & MSG_PEEK)) 1059 if (!(flags & MSG_PEEK))
1060 tp->urg_data = TCP_URG_READ; 1060 tp->urg_data = TCP_URG_READ;
1061 1061
1062 /* Read urgent data. */ 1062 /* Read urgent data. */
1063 msg->msg_flags |= MSG_OOB; 1063 msg->msg_flags |= MSG_OOB;
1064 1064
1065 if (len > 0) { 1065 if (len > 0) {
1066 if (!(flags & MSG_TRUNC)) 1066 if (!(flags & MSG_TRUNC))
1067 err = memcpy_toiovec(msg->msg_iov, &c, 1); 1067 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1068 len = 1; 1068 len = 1;
1069 } else 1069 } else
1070 msg->msg_flags |= MSG_TRUNC; 1070 msg->msg_flags |= MSG_TRUNC;
1071 1071
1072 return err ? -EFAULT : len; 1072 return err ? -EFAULT : len;
1073 } 1073 }
1074 1074
1075 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) 1075 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1076 return 0; 1076 return 0;
1077 1077
1078 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and 1078 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1079 * the available implementations agree in this case: 1079 * the available implementations agree in this case:
1080 * this call should never block, independent of the 1080 * this call should never block, independent of the
1081 * blocking state of the socket. 1081 * blocking state of the socket.
1082 * Mike <pall@rz.uni-karlsruhe.de> 1082 * Mike <pall@rz.uni-karlsruhe.de>
1083 */ 1083 */
1084 return -EAGAIN; 1084 return -EAGAIN;
1085 } 1085 }
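/* Editor's note: illustrative sketch, not part of this commit. tcp_recv_urg()
 * above services recv(..., MSG_OOB): at most one urgent byte, never blocking,
 * with EINVAL once the byte was already consumed or SO_OOBINLINE is set.
 */
#if 0 /* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <sys/socket.h>

static void read_urgent_byte(int fd)
{
	char c;

	if (recv(fd, &c, 1, MSG_OOB) == 1)
		printf("urgent byte: 0x%02x\n", (unsigned char)c);
	else
		perror("recv MSG_OOB");	/* EAGAIN / EINVAL per tcp_recv_urg() */
}
#endif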
1086 1086
1087 /* Clean up the receive buffer for full frames taken by the user, 1087 /* Clean up the receive buffer for full frames taken by the user,
1088 * then send an ACK if necessary. COPIED is the number of bytes 1088 * then send an ACK if necessary. COPIED is the number of bytes
1089 * tcp_recvmsg has given to the user so far, it speeds up the 1089 * tcp_recvmsg has given to the user so far, it speeds up the
1090 * calculation of whether or not we must ACK for the sake of 1090 * calculation of whether or not we must ACK for the sake of
1091 * a window update. 1091 * a window update.
1092 */ 1092 */
1093 void tcp_cleanup_rbuf(struct sock *sk, int copied) 1093 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1094 { 1094 {
1095 struct tcp_sock *tp = tcp_sk(sk); 1095 struct tcp_sock *tp = tcp_sk(sk);
1096 int time_to_ack = 0; 1096 int time_to_ack = 0;
1097 1097
1098 #if TCP_DEBUG 1098 #if TCP_DEBUG
1099 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1099 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1100 1100
1101 WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); 1101 WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1102 #endif 1102 #endif
1103 1103
1104 if (inet_csk_ack_scheduled(sk)) { 1104 if (inet_csk_ack_scheduled(sk)) {
1105 const struct inet_connection_sock *icsk = inet_csk(sk); 1105 const struct inet_connection_sock *icsk = inet_csk(sk);
1106 /* Delayed ACKs frequently hit locked sockets during bulk 1106 /* Delayed ACKs frequently hit locked sockets during bulk
1107 * receive. */ 1107 * receive. */
1108 if (icsk->icsk_ack.blocked || 1108 if (icsk->icsk_ack.blocked ||
1109 /* Once-per-two-segments ACK was not sent by tcp_input.c */ 1109 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1110 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || 1110 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1111 /* 1111 /*
1112 * If this read emptied read buffer, we send ACK, if 1112 * If this read emptied read buffer, we send ACK, if
1113 * connection is not bidirectional, user drained 1113 * connection is not bidirectional, user drained
1114 * receive buffer and there was a small segment 1114 * receive buffer and there was a small segment
1115 * in queue. 1115 * in queue.
1116 */ 1116 */
1117 (copied > 0 && 1117 (copied > 0 &&
1118 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || 1118 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1119 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && 1119 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1120 !icsk->icsk_ack.pingpong)) && 1120 !icsk->icsk_ack.pingpong)) &&
1121 !atomic_read(&sk->sk_rmem_alloc))) 1121 !atomic_read(&sk->sk_rmem_alloc)))
1122 time_to_ack = 1; 1122 time_to_ack = 1;
1123 } 1123 }
1124 1124
1125 /* We send an ACK if we can now advertise a non-zero window 1125 /* We send an ACK if we can now advertise a non-zero window
1126 * which has been raised "significantly". 1126 * which has been raised "significantly".
1127 * 1127 *
1128 * Even if window raised up to infinity, do not send window open ACK 1128 * Even if window raised up to infinity, do not send window open ACK
1129 * in states, where we will not receive more. It is useless. 1129 * in states, where we will not receive more. It is useless.
1130 */ 1130 */
1131 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { 1131 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1132 __u32 rcv_window_now = tcp_receive_window(tp); 1132 __u32 rcv_window_now = tcp_receive_window(tp);
1133 1133
1134 /* Optimize, __tcp_select_window() is not cheap. */ 1134 /* Optimize, __tcp_select_window() is not cheap. */
1135 if (2*rcv_window_now <= tp->window_clamp) { 1135 if (2*rcv_window_now <= tp->window_clamp) {
1136 __u32 new_window = __tcp_select_window(sk); 1136 __u32 new_window = __tcp_select_window(sk);
1137 1137
1138 /* Send ACK now, if this read freed lots of space 1138 /* Send ACK now, if this read freed lots of space
1139 * in our buffer. Certainly, new_window is the new window. 1139 * in our buffer. Certainly, new_window is the new window.
1140 * We can advertise it now if it is not less than the current one. 1140 * We can advertise it now if it is not less than the current one.
1141 * "Lots" means "at least twice" here. 1141 * "Lots" means "at least twice" here.
1142 */ 1142 */
1143 if (new_window && new_window >= 2 * rcv_window_now) 1143 if (new_window && new_window >= 2 * rcv_window_now)
1144 time_to_ack = 1; 1144 time_to_ack = 1;
1145 } 1145 }
1146 } 1146 }
1147 if (time_to_ack) 1147 if (time_to_ack)
1148 tcp_send_ack(sk); 1148 tcp_send_ack(sk);
1149 } 1149 }
1150 1150
1151 static void tcp_prequeue_process(struct sock *sk) 1151 static void tcp_prequeue_process(struct sock *sk)
1152 { 1152 {
1153 struct sk_buff *skb; 1153 struct sk_buff *skb;
1154 struct tcp_sock *tp = tcp_sk(sk); 1154 struct tcp_sock *tp = tcp_sk(sk);
1155 1155
1156 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); 1156 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1157 1157
1158 /* RX process wants to run with disabled BHs, though it is not 1158 /* RX process wants to run with disabled BHs, though it is not
1159 * necessary */ 1159 * necessary */
1160 local_bh_disable(); 1160 local_bh_disable();
1161 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 1161 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1162 sk_backlog_rcv(sk, skb); 1162 sk_backlog_rcv(sk, skb);
1163 local_bh_enable(); 1163 local_bh_enable();
1164 1164
1165 /* Clear memory counter. */ 1165 /* Clear memory counter. */
1166 tp->ucopy.memory = 0; 1166 tp->ucopy.memory = 0;
1167 } 1167 }
1168 1168
1169 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 1169 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1170 { 1170 {
1171 struct sk_buff *skb; 1171 struct sk_buff *skb;
1172 u32 offset; 1172 u32 offset;
1173 1173
1174 skb_queue_walk(&sk->sk_receive_queue, skb) { 1174 skb_queue_walk(&sk->sk_receive_queue, skb) {
1175 offset = seq - TCP_SKB_CB(skb)->seq; 1175 offset = seq - TCP_SKB_CB(skb)->seq;
1176 if (tcp_hdr(skb)->syn) 1176 if (tcp_hdr(skb)->syn)
1177 offset--; 1177 offset--;
1178 if (offset < skb->len || tcp_hdr(skb)->fin) { 1178 if (offset < skb->len || tcp_hdr(skb)->fin) {
1179 *off = offset; 1179 *off = offset;
1180 return skb; 1180 return skb;
1181 } 1181 }
1182 } 1182 }
1183 return NULL; 1183 return NULL;
1184 } 1184 }
1185 1185
1186 /* 1186 /*
1187 * This routine provides an alternative to tcp_recvmsg() for routines 1187 * This routine provides an alternative to tcp_recvmsg() for routines
1188 * that would like to handle copying from skbuffs directly in 'sendfile' 1188 * that would like to handle copying from skbuffs directly in 'sendfile'
1189 * fashion. 1189 * fashion.
1190 * Note: 1190 * Note:
1191 * - It is assumed that the socket was locked by the caller. 1191 * - It is assumed that the socket was locked by the caller.
1192 * - The routine does not block. 1192 * - The routine does not block.
1193 * - At present, there is no support for reading OOB data 1193 * - At present, there is no support for reading OOB data
1194 * or for 'peeking' the socket using this routine 1194 * or for 'peeking' the socket using this routine
1195 * (although both would be easy to implement). 1195 * (although both would be easy to implement).
1196 */ 1196 */
1197 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, 1197 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1198 sk_read_actor_t recv_actor) 1198 sk_read_actor_t recv_actor)
1199 { 1199 {
1200 struct sk_buff *skb; 1200 struct sk_buff *skb;
1201 struct tcp_sock *tp = tcp_sk(sk); 1201 struct tcp_sock *tp = tcp_sk(sk);
1202 u32 seq = tp->copied_seq; 1202 u32 seq = tp->copied_seq;
1203 u32 offset; 1203 u32 offset;
1204 int copied = 0; 1204 int copied = 0;
1205 1205
1206 if (sk->sk_state == TCP_LISTEN) 1206 if (sk->sk_state == TCP_LISTEN)
1207 return -ENOTCONN; 1207 return -ENOTCONN;
1208 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { 1208 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1209 if (offset < skb->len) { 1209 if (offset < skb->len) {
1210 int used; 1210 int used;
1211 size_t len; 1211 size_t len;
1212 1212
1213 len = skb->len - offset; 1213 len = skb->len - offset;
1214 /* Stop reading if we hit a patch of urgent data */ 1214 /* Stop reading if we hit a patch of urgent data */
1215 if (tp->urg_data) { 1215 if (tp->urg_data) {
1216 u32 urg_offset = tp->urg_seq - seq; 1216 u32 urg_offset = tp->urg_seq - seq;
1217 if (urg_offset < len) 1217 if (urg_offset < len)
1218 len = urg_offset; 1218 len = urg_offset;
1219 if (!len) 1219 if (!len)
1220 break; 1220 break;
1221 } 1221 }
1222 used = recv_actor(desc, skb, offset, len); 1222 used = recv_actor(desc, skb, offset, len);
1223 if (used < 0) { 1223 if (used < 0) {
1224 if (!copied) 1224 if (!copied)
1225 copied = used; 1225 copied = used;
1226 break; 1226 break;
1227 } else if (used <= len) { 1227 } else if (used <= len) {
1228 seq += used; 1228 seq += used;
1229 copied += used; 1229 copied += used;
1230 offset += used; 1230 offset += used;
1231 } 1231 }
1232 /* 1232 /*
1233 * If recv_actor drops the lock (e.g. TCP splice 1233 * If recv_actor drops the lock (e.g. TCP splice
1234 * receive) the skb pointer might be invalid when 1234 * receive) the skb pointer might be invalid when
1235 * getting here: tcp_collapse might have deleted it 1235 * getting here: tcp_collapse might have deleted it
1236 * while aggregating skbs from the socket queue. 1236 * while aggregating skbs from the socket queue.
1237 */ 1237 */
1238 skb = tcp_recv_skb(sk, seq-1, &offset); 1238 skb = tcp_recv_skb(sk, seq-1, &offset);
1239 if (!skb || (offset+1 != skb->len)) 1239 if (!skb || (offset+1 != skb->len))
1240 break; 1240 break;
1241 } 1241 }
1242 if (tcp_hdr(skb)->fin) { 1242 if (tcp_hdr(skb)->fin) {
1243 sk_eat_skb(sk, skb, 0); 1243 sk_eat_skb(sk, skb, 0);
1244 ++seq; 1244 ++seq;
1245 break; 1245 break;
1246 } 1246 }
1247 sk_eat_skb(sk, skb, 0); 1247 sk_eat_skb(sk, skb, 0);
1248 if (!desc->count) 1248 if (!desc->count)
1249 break; 1249 break;
1250 } 1250 }
1251 tp->copied_seq = seq; 1251 tp->copied_seq = seq;
1252 1252
1253 tcp_rcv_space_adjust(sk); 1253 tcp_rcv_space_adjust(sk);
1254 1254
1255 /* Clean up data we have read: This will do ACK frames. */ 1255 /* Clean up data we have read: This will do ACK frames. */
1256 if (copied > 0) 1256 if (copied > 0)
1257 tcp_cleanup_rbuf(sk, copied); 1257 tcp_cleanup_rbuf(sk, copied);
1258 return copied; 1258 return copied;
1259 } 1259 }
1260 1260
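tcp_read_sock() feeds data to a caller-supplied actor until the actor takes less than it was offered or desc->count runs out; splice and the in-kernel users rely on this to avoid an extra copy. A userspace model of that callback contract is sketched below; the read_descriptor and actor names only mirror the kernel's shape, and none of this is kernel API:

#include <stdio.h>
#include <string.h>

struct read_descriptor { size_t count; char *dst; size_t copied; };

/* The actor may consume less than offered; taking less (or the count
 * reaching zero) tells the driver loop to stop, just as recv_actor
 * return values steer tcp_read_sock(). */
static int actor(struct read_descriptor *desc, const char *buf, size_t len)
{
	size_t take = len < desc->count ? len : desc->count;

	memcpy(desc->dst + desc->copied, buf, take);
	desc->copied += take;
	desc->count -= take;
	return (int)take;
}

int main(void)
{
	char out[64] = { 0 };
	struct read_descriptor desc = { .count = 10, .dst = out };
	const char *chunks[] = { "hello ", "world, ", "goodbye" };

	for (int i = 0; i < 3; i++) {
		int used = actor(&desc, chunks[i], strlen(chunks[i]));

		if (used < (int)strlen(chunks[i]) || !desc.count)
			break;	/* actor is full: stop feeding it */
	}
	printf("copied %zu bytes: \"%s\"\n", desc.copied, out);
	return 0;
}

Running it copies ten bytes across two chunks and stops as soon as the descriptor's count is exhausted, which is the same termination rule tcp_read_sock() applies.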
1261 /* 1261 /*
1262 * This routine copies from a sock struct into the user buffer. 1262 * This routine copies from a sock struct into the user buffer.
1263 * 1263 *
1264 * Technical note: in 2.3 we work on _locked_ socket, so that 1264 * Technical note: in 2.3 we work on _locked_ socket, so that
1265 * tricks with *seq access order and skb->users are not required. 1265 * tricks with *seq access order and skb->users are not required.
1266 * Probably, code can be easily improved even more. 1266 * Probably, code can be easily improved even more.
1267 */ 1267 */
1268 1268
1269 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 1269 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1270 size_t len, int nonblock, int flags, int *addr_len) 1270 size_t len, int nonblock, int flags, int *addr_len)
1271 { 1271 {
1272 struct tcp_sock *tp = tcp_sk(sk); 1272 struct tcp_sock *tp = tcp_sk(sk);
1273 int copied = 0; 1273 int copied = 0;
1274 u32 peek_seq; 1274 u32 peek_seq;
1275 u32 *seq; 1275 u32 *seq;
1276 unsigned long used; 1276 unsigned long used;
1277 int err; 1277 int err;
1278 int target; /* Read at least this many bytes */ 1278 int target; /* Read at least this many bytes */
1279 long timeo; 1279 long timeo;
1280 struct task_struct *user_recv = NULL; 1280 struct task_struct *user_recv = NULL;
1281 int copied_early = 0; 1281 int copied_early = 0;
1282 struct sk_buff *skb; 1282 struct sk_buff *skb;
1283 1283
1284 lock_sock(sk); 1284 lock_sock(sk);
1285 1285
1286 TCP_CHECK_TIMER(sk); 1286 TCP_CHECK_TIMER(sk);
1287 1287
1288 err = -ENOTCONN; 1288 err = -ENOTCONN;
1289 if (sk->sk_state == TCP_LISTEN) 1289 if (sk->sk_state == TCP_LISTEN)
1290 goto out; 1290 goto out;
1291 1291
1292 timeo = sock_rcvtimeo(sk, nonblock); 1292 timeo = sock_rcvtimeo(sk, nonblock);
1293 1293
1294 /* Urgent data needs to be handled specially. */ 1294 /* Urgent data needs to be handled specially. */
1295 if (flags & MSG_OOB) 1295 if (flags & MSG_OOB)
1296 goto recv_urg; 1296 goto recv_urg;
1297 1297
1298 seq = &tp->copied_seq; 1298 seq = &tp->copied_seq;
1299 if (flags & MSG_PEEK) { 1299 if (flags & MSG_PEEK) {
1300 peek_seq = tp->copied_seq; 1300 peek_seq = tp->copied_seq;
1301 seq = &peek_seq; 1301 seq = &peek_seq;
1302 } 1302 }
1303 1303
1304 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1304 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1305 1305
1306 #ifdef CONFIG_NET_DMA 1306 #ifdef CONFIG_NET_DMA
1307 tp->ucopy.dma_chan = NULL; 1307 tp->ucopy.dma_chan = NULL;
1308 preempt_disable(); 1308 preempt_disable();
1309 skb = skb_peek_tail(&sk->sk_receive_queue); 1309 skb = skb_peek_tail(&sk->sk_receive_queue);
1310 { 1310 {
1311 int available = 0; 1311 int available = 0;
1312 1312
1313 if (skb) 1313 if (skb)
1314 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); 1314 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1315 if ((available < target) && 1315 if ((available < target) &&
1316 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 1316 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1317 !sysctl_tcp_low_latency && 1317 !sysctl_tcp_low_latency &&
1318 __get_cpu_var(softnet_data).net_dma) { 1318 __get_cpu_var(softnet_data).net_dma) {
1319 preempt_enable_no_resched(); 1319 preempt_enable_no_resched();
1320 tp->ucopy.pinned_list = 1320 tp->ucopy.pinned_list =
1321 dma_pin_iovec_pages(msg->msg_iov, len); 1321 dma_pin_iovec_pages(msg->msg_iov, len);
1322 } else { 1322 } else {
1323 preempt_enable_no_resched(); 1323 preempt_enable_no_resched();
1324 } 1324 }
1325 } 1325 }
1326 #endif 1326 #endif
1327 1327
1328 do { 1328 do {
1329 u32 offset; 1329 u32 offset;
1330 1330
1331 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ 1331 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1332 if (tp->urg_data && tp->urg_seq == *seq) { 1332 if (tp->urg_data && tp->urg_seq == *seq) {
1333 if (copied) 1333 if (copied)
1334 break; 1334 break;
1335 if (signal_pending(current)) { 1335 if (signal_pending(current)) {
1336 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; 1336 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1337 break; 1337 break;
1338 } 1338 }
1339 } 1339 }
1340 1340
1341 /* Next get a buffer. */ 1341 /* Next get a buffer. */
1342 1342
1343 skb = skb_peek(&sk->sk_receive_queue); 1343 skb = skb_peek(&sk->sk_receive_queue);
1344 do { 1344 do {
1345 if (!skb) 1345 if (!skb)
1346 break; 1346 break;
1347 1347
1348 /* Now that we have two receive queues this 1348 /* Now that we have two receive queues this
1349 * shouldn't happen. 1349 * shouldn't happen.
1350 */ 1350 */
1351 if (before(*seq, TCP_SKB_CB(skb)->seq)) { 1351 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1352 printk(KERN_INFO "recvmsg bug: copied %X " 1352 printk(KERN_INFO "recvmsg bug: copied %X "
1353 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); 1353 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1354 break; 1354 break;
1355 } 1355 }
1356 offset = *seq - TCP_SKB_CB(skb)->seq; 1356 offset = *seq - TCP_SKB_CB(skb)->seq;
1357 if (tcp_hdr(skb)->syn) 1357 if (tcp_hdr(skb)->syn)
1358 offset--; 1358 offset--;
1359 if (offset < skb->len) 1359 if (offset < skb->len)
1360 goto found_ok_skb; 1360 goto found_ok_skb;
1361 if (tcp_hdr(skb)->fin) 1361 if (tcp_hdr(skb)->fin)
1362 goto found_fin_ok; 1362 goto found_fin_ok;
1363 WARN_ON(!(flags & MSG_PEEK)); 1363 WARN_ON(!(flags & MSG_PEEK));
1364 skb = skb->next; 1364 skb = skb->next;
1365 } while (skb != (struct sk_buff *)&sk->sk_receive_queue); 1365 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1366 1366
1367 /* Well, if we have backlog, try to process it now. */ 1367 /* Well, if we have backlog, try to process it now. */
1368 1368
1369 if (copied >= target && !sk->sk_backlog.tail) 1369 if (copied >= target && !sk->sk_backlog.tail)
1370 break; 1370 break;
1371 1371
1372 if (copied) { 1372 if (copied) {
1373 if (sk->sk_err || 1373 if (sk->sk_err ||
1374 sk->sk_state == TCP_CLOSE || 1374 sk->sk_state == TCP_CLOSE ||
1375 (sk->sk_shutdown & RCV_SHUTDOWN) || 1375 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1376 !timeo || 1376 !timeo ||
1377 signal_pending(current) || 1377 signal_pending(current) ||
1378 (flags & MSG_PEEK)) 1378 (flags & MSG_PEEK))
1379 break; 1379 break;
1380 } else { 1380 } else {
1381 if (sock_flag(sk, SOCK_DONE)) 1381 if (sock_flag(sk, SOCK_DONE))
1382 break; 1382 break;
1383 1383
1384 if (sk->sk_err) { 1384 if (sk->sk_err) {
1385 copied = sock_error(sk); 1385 copied = sock_error(sk);
1386 break; 1386 break;
1387 } 1387 }
1388 1388
1389 if (sk->sk_shutdown & RCV_SHUTDOWN) 1389 if (sk->sk_shutdown & RCV_SHUTDOWN)
1390 break; 1390 break;
1391 1391
1392 if (sk->sk_state == TCP_CLOSE) { 1392 if (sk->sk_state == TCP_CLOSE) {
1393 if (!sock_flag(sk, SOCK_DONE)) { 1393 if (!sock_flag(sk, SOCK_DONE)) {
1394 /* This occurs when user tries to read 1394 /* This occurs when user tries to read
1395 * from never connected socket. 1395 * from never connected socket.
1396 */ 1396 */
1397 copied = -ENOTCONN; 1397 copied = -ENOTCONN;
1398 break; 1398 break;
1399 } 1399 }
1400 break; 1400 break;
1401 } 1401 }
1402 1402
1403 if (!timeo) { 1403 if (!timeo) {
1404 copied = -EAGAIN; 1404 copied = -EAGAIN;
1405 break; 1405 break;
1406 } 1406 }
1407 1407
1408 if (signal_pending(current)) { 1408 if (signal_pending(current)) {
1409 copied = sock_intr_errno(timeo); 1409 copied = sock_intr_errno(timeo);
1410 break; 1410 break;
1411 } 1411 }
1412 } 1412 }
1413 1413
1414 tcp_cleanup_rbuf(sk, copied); 1414 tcp_cleanup_rbuf(sk, copied);
1415 1415
1416 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { 1416 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1417 /* Install new reader */ 1417 /* Install new reader */
1418 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { 1418 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1419 user_recv = current; 1419 user_recv = current;
1420 tp->ucopy.task = user_recv; 1420 tp->ucopy.task = user_recv;
1421 tp->ucopy.iov = msg->msg_iov; 1421 tp->ucopy.iov = msg->msg_iov;
1422 } 1422 }
1423 1423
1424 tp->ucopy.len = len; 1424 tp->ucopy.len = len;
1425 1425
1426 WARN_ON(tp->copied_seq != tp->rcv_nxt && 1426 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1427 !(flags & (MSG_PEEK | MSG_TRUNC))); 1427 !(flags & (MSG_PEEK | MSG_TRUNC)));
1428 1428
1429 /* Ugly... If prequeue is not empty, we have to 1429 /* Ugly... If prequeue is not empty, we have to
1430 * process it before releasing socket, otherwise 1430 * process it before releasing socket, otherwise
1431 * order will be broken at second iteration. 1431 * order will be broken at second iteration.
1432 * More elegant solution is required!!! 1432 * More elegant solution is required!!!
1433 * 1433 *
1434 * Look: we have the following (pseudo)queues: 1434 * Look: we have the following (pseudo)queues:
1435 * 1435 *
1436 * 1. packets in flight 1436 * 1. packets in flight
1437 * 2. backlog 1437 * 2. backlog
1438 * 3. prequeue 1438 * 3. prequeue
1439 * 4. receive_queue 1439 * 4. receive_queue
1440 * 1440 *
1441 * Each queue can be processed only if the next ones 1441 * Each queue can be processed only if the next ones
1442 * are empty. At this point we have empty receive_queue. 1442 * are empty. At this point we have empty receive_queue.
1443 * But prequeue _can_ be not empty after 2nd iteration, 1443 * But prequeue _can_ be not empty after 2nd iteration,
1444 * when we jumped to start of loop because backlog 1444 * when we jumped to start of loop because backlog
1445 * processing added something to receive_queue. 1445 * processing added something to receive_queue.
1446 * We cannot release_sock(), because backlog contains 1446 * We cannot release_sock(), because backlog contains
1447 * packets arrived _after_ prequeued ones. 1447 * packets arrived _after_ prequeued ones.
1448 * 1448 *
1449 * Shortly, algorithm is clear --- to process all 1449 * Shortly, algorithm is clear --- to process all
1450 * the queues in order. We could make it more directly, 1450 * the queues in order. We could make it more directly,
1451 * requeueing packets from backlog to prequeue, if 1451 * requeueing packets from backlog to prequeue, if
1452 * is not empty. It is more elegant, but eats cycles, 1452 * is not empty. It is more elegant, but eats cycles,
1453 * unfortunately. 1453 * unfortunately.
1454 */ 1454 */
1455 if (!skb_queue_empty(&tp->ucopy.prequeue)) 1455 if (!skb_queue_empty(&tp->ucopy.prequeue))
1456 goto do_prequeue; 1456 goto do_prequeue;
1457 1457
1458 /* __ Set realtime policy in scheduler __ */ 1458 /* __ Set realtime policy in scheduler __ */
1459 } 1459 }
1460 1460
1461 if (copied >= target) { 1461 if (copied >= target) {
1462 /* Do not sleep, just process backlog. */ 1462 /* Do not sleep, just process backlog. */
1463 release_sock(sk); 1463 release_sock(sk);
1464 lock_sock(sk); 1464 lock_sock(sk);
1465 } else 1465 } else
1466 sk_wait_data(sk, &timeo); 1466 sk_wait_data(sk, &timeo);
1467 1467
1468 #ifdef CONFIG_NET_DMA 1468 #ifdef CONFIG_NET_DMA
1469 tp->ucopy.wakeup = 0; 1469 tp->ucopy.wakeup = 0;
1470 #endif 1470 #endif
1471 1471
1472 if (user_recv) { 1472 if (user_recv) {
1473 int chunk; 1473 int chunk;
1474 1474
1475 /* __ Restore normal policy in scheduler __ */ 1475 /* __ Restore normal policy in scheduler __ */
1476 1476
1477 if ((chunk = len - tp->ucopy.len) != 0) { 1477 if ((chunk = len - tp->ucopy.len) != 0) {
1478 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); 1478 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1479 len -= chunk; 1479 len -= chunk;
1480 copied += chunk; 1480 copied += chunk;
1481 } 1481 }
1482 1482
1483 if (tp->rcv_nxt == tp->copied_seq && 1483 if (tp->rcv_nxt == tp->copied_seq &&
1484 !skb_queue_empty(&tp->ucopy.prequeue)) { 1484 !skb_queue_empty(&tp->ucopy.prequeue)) {
1485 do_prequeue: 1485 do_prequeue:
1486 tcp_prequeue_process(sk); 1486 tcp_prequeue_process(sk);
1487 1487
1488 if ((chunk = len - tp->ucopy.len) != 0) { 1488 if ((chunk = len - tp->ucopy.len) != 0) {
1489 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); 1489 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1490 len -= chunk; 1490 len -= chunk;
1491 copied += chunk; 1491 copied += chunk;
1492 } 1492 }
1493 } 1493 }
1494 } 1494 }
1495 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { 1495 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1496 if (net_ratelimit()) 1496 if (net_ratelimit())
1497 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", 1497 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1498 current->comm, task_pid_nr(current)); 1498 current->comm, task_pid_nr(current));
1499 peek_seq = tp->copied_seq; 1499 peek_seq = tp->copied_seq;
1500 } 1500 }
1501 continue; 1501 continue;
1502 1502
1503 found_ok_skb: 1503 found_ok_skb:
1504 /* Ok so how much can we use? */ 1504 /* Ok so how much can we use? */
1505 used = skb->len - offset; 1505 used = skb->len - offset;
1506 if (len < used) 1506 if (len < used)
1507 used = len; 1507 used = len;
1508 1508
1509 /* Do we have urgent data here? */ 1509 /* Do we have urgent data here? */
1510 if (tp->urg_data) { 1510 if (tp->urg_data) {
1511 u32 urg_offset = tp->urg_seq - *seq; 1511 u32 urg_offset = tp->urg_seq - *seq;
1512 if (urg_offset < used) { 1512 if (urg_offset < used) {
1513 if (!urg_offset) { 1513 if (!urg_offset) {
1514 if (!sock_flag(sk, SOCK_URGINLINE)) { 1514 if (!sock_flag(sk, SOCK_URGINLINE)) {
1515 ++*seq; 1515 ++*seq;
1516 offset++; 1516 offset++;
1517 used--; 1517 used--;
1518 if (!used) 1518 if (!used)
1519 goto skip_copy; 1519 goto skip_copy;
1520 } 1520 }
1521 } else 1521 } else
1522 used = urg_offset; 1522 used = urg_offset;
1523 } 1523 }
1524 } 1524 }
1525 1525
1526 if (!(flags & MSG_TRUNC)) { 1526 if (!(flags & MSG_TRUNC)) {
1527 #ifdef CONFIG_NET_DMA 1527 #ifdef CONFIG_NET_DMA
1528 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1528 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1529 tp->ucopy.dma_chan = get_softnet_dma(); 1529 tp->ucopy.dma_chan = get_softnet_dma();
1530 1530
1531 if (tp->ucopy.dma_chan) { 1531 if (tp->ucopy.dma_chan) {
1532 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( 1532 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1533 tp->ucopy.dma_chan, skb, offset, 1533 tp->ucopy.dma_chan, skb, offset,
1534 msg->msg_iov, used, 1534 msg->msg_iov, used,
1535 tp->ucopy.pinned_list); 1535 tp->ucopy.pinned_list);
1536 1536
1537 if (tp->ucopy.dma_cookie < 0) { 1537 if (tp->ucopy.dma_cookie < 0) {
1538 1538
1539 printk(KERN_ALERT "dma_cookie < 0\n"); 1539 printk(KERN_ALERT "dma_cookie < 0\n");
1540 1540
1541 /* Exception. Bailout! */ 1541 /* Exception. Bailout! */
1542 if (!copied) 1542 if (!copied)
1543 copied = -EFAULT; 1543 copied = -EFAULT;
1544 break; 1544 break;
1545 } 1545 }
1546 if ((offset + used) == skb->len) 1546 if ((offset + used) == skb->len)
1547 copied_early = 1; 1547 copied_early = 1;
1548 1548
1549 } else 1549 } else
1550 #endif 1550 #endif
1551 { 1551 {
1552 err = skb_copy_datagram_iovec(skb, offset, 1552 err = skb_copy_datagram_iovec(skb, offset,
1553 msg->msg_iov, used); 1553 msg->msg_iov, used);
1554 if (err) { 1554 if (err) {
1555 /* Exception. Bailout! */ 1555 /* Exception. Bailout! */
1556 if (!copied) 1556 if (!copied)
1557 copied = -EFAULT; 1557 copied = -EFAULT;
1558 break; 1558 break;
1559 } 1559 }
1560 } 1560 }
1561 } 1561 }
1562 1562
1563 *seq += used; 1563 *seq += used;
1564 copied += used; 1564 copied += used;
1565 len -= used; 1565 len -= used;
1566 1566
1567 tcp_rcv_space_adjust(sk); 1567 tcp_rcv_space_adjust(sk);
1568 1568
1569 skip_copy: 1569 skip_copy:
1570 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { 1570 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1571 tp->urg_data = 0; 1571 tp->urg_data = 0;
1572 tcp_fast_path_check(sk); 1572 tcp_fast_path_check(sk);
1573 } 1573 }
1574 if (used + offset < skb->len) 1574 if (used + offset < skb->len)
1575 continue; 1575 continue;
1576 1576
1577 if (tcp_hdr(skb)->fin) 1577 if (tcp_hdr(skb)->fin)
1578 goto found_fin_ok; 1578 goto found_fin_ok;
1579 if (!(flags & MSG_PEEK)) { 1579 if (!(flags & MSG_PEEK)) {
1580 sk_eat_skb(sk, skb, copied_early); 1580 sk_eat_skb(sk, skb, copied_early);
1581 copied_early = 0; 1581 copied_early = 0;
1582 } 1582 }
1583 continue; 1583 continue;
1584 1584
1585 found_fin_ok: 1585 found_fin_ok:
1586 /* Process the FIN. */ 1586 /* Process the FIN. */
1587 ++*seq; 1587 ++*seq;
1588 if (!(flags & MSG_PEEK)) { 1588 if (!(flags & MSG_PEEK)) {
1589 sk_eat_skb(sk, skb, copied_early); 1589 sk_eat_skb(sk, skb, copied_early);
1590 copied_early = 0; 1590 copied_early = 0;
1591 } 1591 }
1592 break; 1592 break;
1593 } while (len > 0); 1593 } while (len > 0);
1594 1594
1595 if (user_recv) { 1595 if (user_recv) {
1596 if (!skb_queue_empty(&tp->ucopy.prequeue)) { 1596 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1597 int chunk; 1597 int chunk;
1598 1598
1599 tp->ucopy.len = copied > 0 ? len : 0; 1599 tp->ucopy.len = copied > 0 ? len : 0;
1600 1600
1601 tcp_prequeue_process(sk); 1601 tcp_prequeue_process(sk);
1602 1602
1603 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { 1603 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1604 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); 1604 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1605 len -= chunk; 1605 len -= chunk;
1606 copied += chunk; 1606 copied += chunk;
1607 } 1607 }
1608 } 1608 }
1609 1609
1610 tp->ucopy.task = NULL; 1610 tp->ucopy.task = NULL;
1611 tp->ucopy.len = 0; 1611 tp->ucopy.len = 0;
1612 } 1612 }
1613 1613
1614 #ifdef CONFIG_NET_DMA 1614 #ifdef CONFIG_NET_DMA
1615 if (tp->ucopy.dma_chan) { 1615 if (tp->ucopy.dma_chan) {
1616 dma_cookie_t done, used; 1616 dma_cookie_t done, used;
1617 1617
1618 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); 1618 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1619 1619
1620 while (dma_async_memcpy_complete(tp->ucopy.dma_chan, 1620 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1621 tp->ucopy.dma_cookie, &done, 1621 tp->ucopy.dma_cookie, &done,
1622 &used) == DMA_IN_PROGRESS) { 1622 &used) == DMA_IN_PROGRESS) {
1623 /* do partial cleanup of sk_async_wait_queue */ 1623 /* do partial cleanup of sk_async_wait_queue */
1624 while ((skb = skb_peek(&sk->sk_async_wait_queue)) && 1624 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1625 (dma_async_is_complete(skb->dma_cookie, done, 1625 (dma_async_is_complete(skb->dma_cookie, done,
1626 used) == DMA_SUCCESS)) { 1626 used) == DMA_SUCCESS)) {
1627 __skb_dequeue(&sk->sk_async_wait_queue); 1627 __skb_dequeue(&sk->sk_async_wait_queue);
1628 kfree_skb(skb); 1628 kfree_skb(skb);
1629 } 1629 }
1630 } 1630 }
1631 1631
1632 /* Safe to free early-copied skbs now */ 1632 /* Safe to free early-copied skbs now */
1633 __skb_queue_purge(&sk->sk_async_wait_queue); 1633 __skb_queue_purge(&sk->sk_async_wait_queue);
1634 dma_chan_put(tp->ucopy.dma_chan); 1634 dma_chan_put(tp->ucopy.dma_chan);
1635 tp->ucopy.dma_chan = NULL; 1635 tp->ucopy.dma_chan = NULL;
1636 } 1636 }
1637 if (tp->ucopy.pinned_list) { 1637 if (tp->ucopy.pinned_list) {
1638 dma_unpin_iovec_pages(tp->ucopy.pinned_list); 1638 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1639 tp->ucopy.pinned_list = NULL; 1639 tp->ucopy.pinned_list = NULL;
1640 } 1640 }
1641 #endif 1641 #endif
1642 1642
1643 /* According to UNIX98, msg_name/msg_namelen are ignored 1643 /* According to UNIX98, msg_name/msg_namelen are ignored
1644 * on connected socket. I was just happy when found this 8) --ANK 1644 * on connected socket. I was just happy when found this 8) --ANK
1645 */ 1645 */
1646 1646
1647 /* Clean up data we have read: This will do ACK frames. */ 1647 /* Clean up data we have read: This will do ACK frames. */
1648 tcp_cleanup_rbuf(sk, copied); 1648 tcp_cleanup_rbuf(sk, copied);
1649 1649
1650 TCP_CHECK_TIMER(sk); 1650 TCP_CHECK_TIMER(sk);
1651 release_sock(sk); 1651 release_sock(sk);
1652 return copied; 1652 return copied;
1653 1653
1654 out: 1654 out:
1655 TCP_CHECK_TIMER(sk); 1655 TCP_CHECK_TIMER(sk);
1656 release_sock(sk); 1656 release_sock(sk);
1657 return err; 1657 return err;
1658 1658
1659 recv_urg: 1659 recv_urg:
1660 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); 1660 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1661 goto out; 1661 goto out;
1662 } 1662 }
1663 1663
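Most of the branches in tcp_recvmsg() map directly onto recv() flags from userspace: MSG_PEEK reads without advancing copied_seq, SO_RCVLOWAT raises the "target" computed by sock_rcvlowat(), MSG_WAITALL makes the target the full length, and MSG_OOB takes the recv_urg path. A hedged sketch of driving those paths (error handling trimmed, and fd is assumed to be a connected TCP socket created elsewhere):

#include <sys/types.h>
#include <sys/socket.h>
#include <stdio.h>

/* Sketch only: fd must be a connected TCP socket. */
static void read_examples(int fd)
{
	char buf[256];
	int lowat = 128;
	ssize_t n;

	/* Peek: data stays queued, copied_seq is not advanced. */
	n = recv(fd, buf, sizeof(buf), MSG_PEEK);
	printf("peeked %zd bytes\n", n);

	/* Low-water mark: an ordinary read now waits for >= 128 bytes,
	 * the same 'target' sock_rcvlowat() hands to tcp_recvmsg(). */
	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
	n = recv(fd, buf, sizeof(buf), 0);
	printf("read %zd bytes\n", n);

	/* MSG_WAITALL: block until the whole buffer is filled, or an
	 * error/EOF cuts the read short. */
	n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
	printf("waitall read %zd bytes\n", n);

	/* Urgent data takes the recv_urg path in tcp_recvmsg(). */
	n = recv(fd, buf, 1, MSG_OOB);
	printf("oob read %zd bytes\n", n);
}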
1664 void tcp_set_state(struct sock *sk, int state) 1664 void tcp_set_state(struct sock *sk, int state)
1665 { 1665 {
1666 int oldstate = sk->sk_state; 1666 int oldstate = sk->sk_state;
1667 1667
1668 switch (state) { 1668 switch (state) {
1669 case TCP_ESTABLISHED: 1669 case TCP_ESTABLISHED:
1670 if (oldstate != TCP_ESTABLISHED) 1670 if (oldstate != TCP_ESTABLISHED)
1671 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); 1671 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1672 break; 1672 break;
1673 1673
1674 case TCP_CLOSE: 1674 case TCP_CLOSE:
1675 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) 1675 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1676 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); 1676 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1677 1677
1678 sk->sk_prot->unhash(sk); 1678 sk->sk_prot->unhash(sk);
1679 if (inet_csk(sk)->icsk_bind_hash && 1679 if (inet_csk(sk)->icsk_bind_hash &&
1680 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) 1680 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1681 inet_put_port(sk); 1681 inet_put_port(sk);
1682 /* fall through */ 1682 /* fall through */
1683 default: 1683 default:
1684 if (oldstate==TCP_ESTABLISHED) 1684 if (oldstate == TCP_ESTABLISHED)
1685 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); 1685 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1686 } 1686 }
1687 1687
1688 /* Change state AFTER socket is unhashed to avoid closed 1688 /* Change state AFTER socket is unhashed to avoid closed
1689 * socket sitting in hash tables. 1689 * socket sitting in hash tables.
1690 */ 1690 */
1691 sk->sk_state = state; 1691 sk->sk_state = state;
1692 1692
1693 #ifdef STATE_TRACE 1693 #ifdef STATE_TRACE
1694 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); 1694 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1695 #endif 1695 #endif
1696 } 1696 }
1697 EXPORT_SYMBOL_GPL(tcp_set_state); 1697 EXPORT_SYMBOL_GPL(tcp_set_state);
1698 1698
1699 /* 1699 /*
1700 * State processing on a close. This implements the state shift for 1700 * State processing on a close. This implements the state shift for
1701 * sending our FIN frame. Note that we only send a FIN for some 1701 * sending our FIN frame. Note that we only send a FIN for some
1702 * states. A shutdown() may have already sent the FIN, or we may be 1702 * states. A shutdown() may have already sent the FIN, or we may be
1703 * closed. 1703 * closed.
1704 */ 1704 */
1705 1705
1706 static const unsigned char new_state[16] = { 1706 static const unsigned char new_state[16] = {
1707 /* current state: new state: action: */ 1707 /* current state: new state: action: */
1708 /* (Invalid) */ TCP_CLOSE, 1708 /* (Invalid) */ TCP_CLOSE,
1709 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, 1709 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1710 /* TCP_SYN_SENT */ TCP_CLOSE, 1710 /* TCP_SYN_SENT */ TCP_CLOSE,
1711 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, 1711 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1712 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, 1712 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1713 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, 1713 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1714 /* TCP_TIME_WAIT */ TCP_CLOSE, 1714 /* TCP_TIME_WAIT */ TCP_CLOSE,
1715 /* TCP_CLOSE */ TCP_CLOSE, 1715 /* TCP_CLOSE */ TCP_CLOSE,
1716 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, 1716 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1717 /* TCP_LAST_ACK */ TCP_LAST_ACK, 1717 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1718 /* TCP_LISTEN */ TCP_CLOSE, 1718 /* TCP_LISTEN */ TCP_CLOSE,
1719 /* TCP_CLOSING */ TCP_CLOSING, 1719 /* TCP_CLOSING */ TCP_CLOSING,
1720 }; 1720 };
1721 1721
1722 static int tcp_close_state(struct sock *sk) 1722 static int tcp_close_state(struct sock *sk)
1723 { 1723 {
1724 int next = (int)new_state[sk->sk_state]; 1724 int next = (int)new_state[sk->sk_state];
1725 int ns = next & TCP_STATE_MASK; 1725 int ns = next & TCP_STATE_MASK;
1726 1726
1727 tcp_set_state(sk, ns); 1727 tcp_set_state(sk, ns);
1728 1728
1729 return next & TCP_ACTION_FIN; 1729 return next & TCP_ACTION_FIN;
1730 } 1730 }
1731 1731
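Each new_state[] entry packs the next state in its low bits plus an optional TCP_ACTION_FIN flag, and tcp_close_state() splits the two apart. The standalone demonstration below uses the same encode/decode shape with invented constants rather than the kernel's actual values:

#include <stdio.h>

#define STATE_MASK  0x0F	/* illustrative: low bits carry the state */
#define ACTION_FIN  0x80	/* illustrative: flag bit = "send a FIN too" */

enum { ST_ESTABLISHED = 1, ST_FIN_WAIT1 = 4, ST_CLOSE = 7 };

/* Same shape as new_state[] and tcp_close_state(): look up the packed
 * entry, apply the state part, report whether a FIN must be sent. */
static int close_state(int current_state, int *next_state)
{
	static const unsigned char table[] = {
		[ST_ESTABLISHED] = ST_FIN_WAIT1 | ACTION_FIN,
		[ST_CLOSE]       = ST_CLOSE,
	};
	int packed = table[current_state];

	*next_state = packed & STATE_MASK;
	return packed & ACTION_FIN;
}

int main(void)
{
	int next;
	int fin = close_state(ST_ESTABLISHED, &next);

	printf("next state %d, send FIN: %s\n", next, fin ? "yes" : "no");
	return 0;
}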
1732 /* 1732 /*
1733 * Shutdown the sending side of a connection. Much like close except 1733 * Shutdown the sending side of a connection. Much like close except
1734 * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). 1734 * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
1735 */ 1735 */
1736 1736
1737 void tcp_shutdown(struct sock *sk, int how) 1737 void tcp_shutdown(struct sock *sk, int how)
1738 { 1738 {
1739 /* We need to grab some memory, and put together a FIN, 1739 /* We need to grab some memory, and put together a FIN,
1740 * and then put it into the queue to be sent. 1740 * and then put it into the queue to be sent.
1741 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. 1741 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1742 */ 1742 */
1743 if (!(how & SEND_SHUTDOWN)) 1743 if (!(how & SEND_SHUTDOWN))
1744 return; 1744 return;
1745 1745
1746 /* If we've already sent a FIN, or it's a closed state, skip this. */ 1746 /* If we've already sent a FIN, or it's a closed state, skip this. */
1747 if ((1 << sk->sk_state) & 1747 if ((1 << sk->sk_state) &
1748 (TCPF_ESTABLISHED | TCPF_SYN_SENT | 1748 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1749 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { 1749 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1750 /* Clear out any half completed packets. FIN if needed. */ 1750 /* Clear out any half completed packets. FIN if needed. */
1751 if (tcp_close_state(sk)) 1751 if (tcp_close_state(sk))
1752 tcp_send_fin(sk); 1752 tcp_send_fin(sk);
1753 } 1753 }
1754 } 1754 }
1755 1755
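tcp_shutdown() is what a userspace shutdown(fd, SHUT_WR) ultimately reaches: the write side is finished with a FIN while the read side stays open. A short illustration of that half-close pattern (fd is assumed to be a connected TCP socket; error handling is omitted):

#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

/* Half-close: send our FIN but keep reading the peer's reply. */
static void send_request_and_drain(int fd, const char *req)
{
	char buf[512];
	ssize_t n;

	write(fd, req, strlen(req));
	shutdown(fd, SHUT_WR);		/* queues a FIN via tcp_shutdown() */

	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);

	close(fd);
}

This is the classic request/response idiom: the FIN tells the peer the request is complete, and the caller keeps reading until the peer closes its own side.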
1756 void tcp_close(struct sock *sk, long timeout) 1756 void tcp_close(struct sock *sk, long timeout)
1757 { 1757 {
1758 struct sk_buff *skb; 1758 struct sk_buff *skb;
1759 int data_was_unread = 0; 1759 int data_was_unread = 0;
1760 int state; 1760 int state;
1761 1761
1762 lock_sock(sk); 1762 lock_sock(sk);
1763 sk->sk_shutdown = SHUTDOWN_MASK; 1763 sk->sk_shutdown = SHUTDOWN_MASK;
1764 1764
1765 if (sk->sk_state == TCP_LISTEN) { 1765 if (sk->sk_state == TCP_LISTEN) {
1766 tcp_set_state(sk, TCP_CLOSE); 1766 tcp_set_state(sk, TCP_CLOSE);
1767 1767
1768 /* Special case. */ 1768 /* Special case. */
1769 inet_csk_listen_stop(sk); 1769 inet_csk_listen_stop(sk);
1770 1770
1771 goto adjudge_to_death; 1771 goto adjudge_to_death;
1772 } 1772 }
1773 1773
1774 /* We need to flush the recv. buffs. We do this only on the 1774 /* We need to flush the recv. buffs. We do this only on the
1775 * descriptor close, not protocol-sourced closes, because the 1775 * descriptor close, not protocol-sourced closes, because the
1776 * reader process may not have drained the data yet! 1776 * reader process may not have drained the data yet!
1777 */ 1777 */
1778 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { 1778 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1779 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - 1779 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1780 tcp_hdr(skb)->fin; 1780 tcp_hdr(skb)->fin;
1781 data_was_unread += len; 1781 data_was_unread += len;
1782 __kfree_skb(skb); 1782 __kfree_skb(skb);
1783 } 1783 }
1784 1784
1785 sk_mem_reclaim(sk); 1785 sk_mem_reclaim(sk);
1786 1786
1787 /* As outlined in RFC 2525, section 2.17, we send a RST here because 1787 /* As outlined in RFC 2525, section 2.17, we send a RST here because
1788 * data was lost. To witness the awful effects of the old behavior of 1788 * data was lost. To witness the awful effects of the old behavior of
1789 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk 1789 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
1790 * GET in an FTP client, suspend the process, wait for the client to 1790 * GET in an FTP client, suspend the process, wait for the client to
1791 * advertise a zero window, then kill -9 the FTP client, wheee... 1791 * advertise a zero window, then kill -9 the FTP client, wheee...
1792 * Note: timeout is always zero in such a case. 1792 * Note: timeout is always zero in such a case.
1793 */ 1793 */
1794 if (data_was_unread) { 1794 if (data_was_unread) {
1795 /* Unread data was tossed, zap the connection. */ 1795 /* Unread data was tossed, zap the connection. */
1796 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 1796 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
1797 tcp_set_state(sk, TCP_CLOSE); 1797 tcp_set_state(sk, TCP_CLOSE);
1798 tcp_send_active_reset(sk, GFP_KERNEL); 1798 tcp_send_active_reset(sk, GFP_KERNEL);
1799 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { 1799 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1800 /* Check zero linger _after_ checking for unread data. */ 1800 /* Check zero linger _after_ checking for unread data. */
1801 sk->sk_prot->disconnect(sk, 0); 1801 sk->sk_prot->disconnect(sk, 0);
1802 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 1802 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
1803 } else if (tcp_close_state(sk)) { 1803 } else if (tcp_close_state(sk)) {
1804 /* We FIN if the application ate all the data before 1804 /* We FIN if the application ate all the data before
1805 * zapping the connection. 1805 * zapping the connection.
1806 */ 1806 */
1807 1807
1808 /* RED-PEN. Formally speaking, we have broken TCP state 1808 /* RED-PEN. Formally speaking, we have broken TCP state
1809 * machine. State transitions: 1809 * machine. State transitions:
1810 * 1810 *
1811 * TCP_ESTABLISHED -> TCP_FIN_WAIT1 1811 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1812 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) 1812 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1813 * TCP_CLOSE_WAIT -> TCP_LAST_ACK 1813 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1814 * 1814 *
1815 * are legal only when FIN has been sent (i.e. in window), 1815 * are legal only when FIN has been sent (i.e. in window),
1816 * rather than queued out of window. Purists blame. 1816 * rather than queued out of window. Purists blame.
1817 * 1817 *
1818 * F.e. "RFC state" is ESTABLISHED, 1818 * F.e. "RFC state" is ESTABLISHED,
1819 * if Linux state is FIN-WAIT-1, but FIN is still not sent. 1819 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1820 * 1820 *
1821 * The visible deviations are that sometimes 1821 * The visible deviations are that sometimes
1822 * we enter the time-wait state when it is not really required 1822 * we enter the time-wait state when it is not really required
1823 * (harmless), do not send active resets, when they are 1823 * (harmless), do not send active resets, when they are
1824 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when 1824 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1825 * they look as CLOSING or LAST_ACK for Linux) 1825 * they look as CLOSING or LAST_ACK for Linux)
1826 * Probably, I missed some more holelets. 1826 * Probably, I missed some more holelets.
1827 * --ANK 1827 * --ANK
1828 */ 1828 */
1829 tcp_send_fin(sk); 1829 tcp_send_fin(sk);
1830 } 1830 }
1831 1831
1832 sk_stream_wait_close(sk, timeout); 1832 sk_stream_wait_close(sk, timeout);
1833 1833
1834 adjudge_to_death: 1834 adjudge_to_death:
1835 state = sk->sk_state; 1835 state = sk->sk_state;
1836 sock_hold(sk); 1836 sock_hold(sk);
1837 sock_orphan(sk); 1837 sock_orphan(sk);
1838 atomic_inc(sk->sk_prot->orphan_count); 1838 atomic_inc(sk->sk_prot->orphan_count);
1839 1839
1840 /* It is the last release_sock in its life. It will remove backlog. */ 1840 /* It is the last release_sock in its life. It will remove backlog. */
1841 release_sock(sk); 1841 release_sock(sk);
1842 1842
1843 1843
1844 /* Now socket is owned by kernel and we acquire BH lock 1844 /* Now socket is owned by kernel and we acquire BH lock
1845 to finish close. No need to check for user refs. 1845 to finish close. No need to check for user refs.
1846 */ 1846 */
1847 local_bh_disable(); 1847 local_bh_disable();
1848 bh_lock_sock(sk); 1848 bh_lock_sock(sk);
1849 WARN_ON(sock_owned_by_user(sk)); 1849 WARN_ON(sock_owned_by_user(sk));
1850 1850
1851 /* Have we already been destroyed by a softirq or backlog? */ 1851 /* Have we already been destroyed by a softirq or backlog? */
1852 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) 1852 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1853 goto out; 1853 goto out;
1854 1854
1855 /* This is a (useful) BSD violation of the RFC. There is a 1855 /* This is a (useful) BSD violation of the RFC. There is a
1856 * problem with TCP as specified in that the other end could 1856 * problem with TCP as specified in that the other end could
1857 * keep a socket open forever with no application left this end. 1857 * keep a socket open forever with no application left this end.
1858 * We use a 3 minute timeout (about the same as BSD) then kill 1858 * We use a 3 minute timeout (about the same as BSD) then kill
1859 * our end. If they send after that then tough - BUT: long enough 1859 * our end. If they send after that then tough - BUT: long enough
1860 * that we won't make the old 4*rto = almost no time - whoops 1860 * that we won't make the old 4*rto = almost no time - whoops
1861 * reset mistake. 1861 * reset mistake.
1862 * 1862 *
1863 * Nope, it was not mistake. It is really desired behaviour 1863 * Nope, it was not mistake. It is really desired behaviour
1864 * f.e. on http servers, when such sockets are useless, but 1864 * f.e. on http servers, when such sockets are useless, but
1865 * consume significant resources. Let's do it with special 1865 * consume significant resources. Let's do it with special
1866 * linger2 option. --ANK 1866 * linger2 option. --ANK
1867 */ 1867 */
1868 1868
1869 if (sk->sk_state == TCP_FIN_WAIT2) { 1869 if (sk->sk_state == TCP_FIN_WAIT2) {
1870 struct tcp_sock *tp = tcp_sk(sk); 1870 struct tcp_sock *tp = tcp_sk(sk);
1871 if (tp->linger2 < 0) { 1871 if (tp->linger2 < 0) {
1872 tcp_set_state(sk, TCP_CLOSE); 1872 tcp_set_state(sk, TCP_CLOSE);
1873 tcp_send_active_reset(sk, GFP_ATOMIC); 1873 tcp_send_active_reset(sk, GFP_ATOMIC);
1874 NET_INC_STATS_BH(sock_net(sk), 1874 NET_INC_STATS_BH(sock_net(sk),
1875 LINUX_MIB_TCPABORTONLINGER); 1875 LINUX_MIB_TCPABORTONLINGER);
1876 } else { 1876 } else {
1877 const int tmo = tcp_fin_time(sk); 1877 const int tmo = tcp_fin_time(sk);
1878 1878
1879 if (tmo > TCP_TIMEWAIT_LEN) { 1879 if (tmo > TCP_TIMEWAIT_LEN) {
1880 inet_csk_reset_keepalive_timer(sk, 1880 inet_csk_reset_keepalive_timer(sk,
1881 tmo - TCP_TIMEWAIT_LEN); 1881 tmo - TCP_TIMEWAIT_LEN);
1882 } else { 1882 } else {
1883 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 1883 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1884 goto out; 1884 goto out;
1885 } 1885 }
1886 } 1886 }
1887 } 1887 }
1888 if (sk->sk_state != TCP_CLOSE) { 1888 if (sk->sk_state != TCP_CLOSE) {
1889 sk_mem_reclaim(sk); 1889 sk_mem_reclaim(sk);
1890 if (tcp_too_many_orphans(sk, 1890 if (tcp_too_many_orphans(sk,
1891 atomic_read(sk->sk_prot->orphan_count))) { 1891 atomic_read(sk->sk_prot->orphan_count))) {
1892 if (net_ratelimit()) 1892 if (net_ratelimit())
1893 printk(KERN_INFO "TCP: too many orphaned " 1893 printk(KERN_INFO "TCP: too many orphaned "
1894 "sockets\n"); 1894 "sockets\n");
1895 tcp_set_state(sk, TCP_CLOSE); 1895 tcp_set_state(sk, TCP_CLOSE);
1896 tcp_send_active_reset(sk, GFP_ATOMIC); 1896 tcp_send_active_reset(sk, GFP_ATOMIC);
1897 NET_INC_STATS_BH(sock_net(sk), 1897 NET_INC_STATS_BH(sock_net(sk),
1898 LINUX_MIB_TCPABORTONMEMORY); 1898 LINUX_MIB_TCPABORTONMEMORY);
1899 } 1899 }
1900 } 1900 }
1901 1901
1902 if (sk->sk_state == TCP_CLOSE) 1902 if (sk->sk_state == TCP_CLOSE)
1903 inet_csk_destroy_sock(sk); 1903 inet_csk_destroy_sock(sk);
1904 /* Otherwise, socket is reprieved until protocol close. */ 1904 /* Otherwise, socket is reprieved until protocol close. */
1905 1905
1906 out: 1906 out:
1907 bh_unlock_sock(sk); 1907 bh_unlock_sock(sk);
1908 local_bh_enable(); 1908 local_bh_enable();
1909 sock_put(sk); 1909 sock_put(sk);
1910 } 1910 }
1911 1911
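Two of the branches above are reachable directly from userspace: closing with unread data still queued, or closing with SO_LINGER enabled and a zero timeout, makes tcp_close() send a reset instead of a FIN. A small sketch of the zero-linger case:

#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

/* Abortive close: with l_onoff=1 and l_linger=0, tcp_close() takes
 * the "zero linger" branch and the connection ends with a reset
 * rather than the normal FIN handshake. */
static void abortive_close(int fd)
{
	struct linger lg = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
	close(fd);
}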
1912 /* These states need RST on ABORT according to RFC793 */ 1912 /* These states need RST on ABORT according to RFC793 */
1913 1913
1914 static inline int tcp_need_reset(int state) 1914 static inline int tcp_need_reset(int state)
1915 { 1915 {
1916 return (1 << state) & 1916 return (1 << state) &
1917 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | 1917 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1918 TCPF_FIN_WAIT2 | TCPF_SYN_RECV); 1918 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1919 } 1919 }
1920 1920
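tcp_need_reset() is the usual state-set membership idiom: shift 1 by the state number and mask against the set of states that must emit an RST on abort. The same pattern in standalone form, with state numbers invented for the example:

#include <stdio.h>

enum { S_ESTABLISHED = 1, S_FIN_WAIT1 = 4, S_TIME_WAIT = 6, S_CLOSE_WAIT = 8 };

#define F(s)	(1U << (s))

/* States whose abort must emit an RST, per the RFC793-style table. */
static const unsigned int needs_reset_mask =
	F(S_ESTABLISHED) | F(S_FIN_WAIT1) | F(S_CLOSE_WAIT);

static int needs_reset(int state)
{
	return (F(state) & needs_reset_mask) != 0;
}

int main(void)
{
	printf("ESTABLISHED -> %d\n", needs_reset(S_ESTABLISHED));	/* 1 */
	printf("TIME_WAIT   -> %d\n", needs_reset(S_TIME_WAIT));	/* 0 */
	return 0;
}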
1921 int tcp_disconnect(struct sock *sk, int flags) 1921 int tcp_disconnect(struct sock *sk, int flags)
1922 { 1922 {
1923 struct inet_sock *inet = inet_sk(sk); 1923 struct inet_sock *inet = inet_sk(sk);
1924 struct inet_connection_sock *icsk = inet_csk(sk); 1924 struct inet_connection_sock *icsk = inet_csk(sk);
1925 struct tcp_sock *tp = tcp_sk(sk); 1925 struct tcp_sock *tp = tcp_sk(sk);
1926 int err = 0; 1926 int err = 0;
1927 int old_state = sk->sk_state; 1927 int old_state = sk->sk_state;
1928 1928
1929 if (old_state != TCP_CLOSE) 1929 if (old_state != TCP_CLOSE)
1930 tcp_set_state(sk, TCP_CLOSE); 1930 tcp_set_state(sk, TCP_CLOSE);
1931 1931
1932 /* ABORT function of RFC793 */ 1932 /* ABORT function of RFC793 */
1933 if (old_state == TCP_LISTEN) { 1933 if (old_state == TCP_LISTEN) {
1934 inet_csk_listen_stop(sk); 1934 inet_csk_listen_stop(sk);
1935 } else if (tcp_need_reset(old_state) || 1935 } else if (tcp_need_reset(old_state) ||
1936 (tp->snd_nxt != tp->write_seq && 1936 (tp->snd_nxt != tp->write_seq &&
1937 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 1937 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1938 /* The last check adjusts for discrepancy of Linux wrt. RFC 1938 /* The last check adjusts for discrepancy of Linux wrt. RFC
1939 * states 1939 * states
1940 */ 1940 */
1941 tcp_send_active_reset(sk, gfp_any()); 1941 tcp_send_active_reset(sk, gfp_any());
1942 sk->sk_err = ECONNRESET; 1942 sk->sk_err = ECONNRESET;
1943 } else if (old_state == TCP_SYN_SENT) 1943 } else if (old_state == TCP_SYN_SENT)
1944 sk->sk_err = ECONNRESET; 1944 sk->sk_err = ECONNRESET;
1945 1945
1946 tcp_clear_xmit_timers(sk); 1946 tcp_clear_xmit_timers(sk);
1947 __skb_queue_purge(&sk->sk_receive_queue); 1947 __skb_queue_purge(&sk->sk_receive_queue);
1948 tcp_write_queue_purge(sk); 1948 tcp_write_queue_purge(sk);
1949 __skb_queue_purge(&tp->out_of_order_queue); 1949 __skb_queue_purge(&tp->out_of_order_queue);
1950 #ifdef CONFIG_NET_DMA 1950 #ifdef CONFIG_NET_DMA
1951 __skb_queue_purge(&sk->sk_async_wait_queue); 1951 __skb_queue_purge(&sk->sk_async_wait_queue);
1952 #endif 1952 #endif
1953 1953
1954 inet->dport = 0; 1954 inet->dport = 0;
1955 1955
1956 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1956 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1957 inet_reset_saddr(sk); 1957 inet_reset_saddr(sk);
1958 1958
1959 sk->sk_shutdown = 0; 1959 sk->sk_shutdown = 0;
1960 sock_reset_flag(sk, SOCK_DONE); 1960 sock_reset_flag(sk, SOCK_DONE);
1961 tp->srtt = 0; 1961 tp->srtt = 0;
1962 if ((tp->write_seq += tp->max_window + 2) == 0) 1962 if ((tp->write_seq += tp->max_window + 2) == 0)
1963 tp->write_seq = 1; 1963 tp->write_seq = 1;
1964 icsk->icsk_backoff = 0; 1964 icsk->icsk_backoff = 0;
1965 tp->snd_cwnd = 2; 1965 tp->snd_cwnd = 2;
1966 icsk->icsk_probes_out = 0; 1966 icsk->icsk_probes_out = 0;
1967 tp->packets_out = 0; 1967 tp->packets_out = 0;
1968 tp->snd_ssthresh = 0x7fffffff; 1968 tp->snd_ssthresh = 0x7fffffff;
1969 tp->snd_cwnd_cnt = 0; 1969 tp->snd_cwnd_cnt = 0;
1970 tp->bytes_acked = 0; 1970 tp->bytes_acked = 0;
1971 tcp_set_ca_state(sk, TCP_CA_Open); 1971 tcp_set_ca_state(sk, TCP_CA_Open);
1972 tcp_clear_retrans(tp); 1972 tcp_clear_retrans(tp);
1973 inet_csk_delack_init(sk); 1973 inet_csk_delack_init(sk);
1974 tcp_init_send_head(sk); 1974 tcp_init_send_head(sk);
1975 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 1975 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
1976 __sk_dst_reset(sk); 1976 __sk_dst_reset(sk);
1977 1977
1978 WARN_ON(inet->num && !icsk->icsk_bind_hash); 1978 WARN_ON(inet->num && !icsk->icsk_bind_hash);
1979 1979
1980 sk->sk_error_report(sk); 1980 sk->sk_error_report(sk);
1981 return err; 1981 return err;
1982 } 1982 }
1983 1983
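tcp_disconnect() is also reachable from userspace: as far as I can tell, a connect() whose address family is AF_UNSPEC dissolves the association and lands here, leaving the socket reusable in TCP_CLOSE. Treat that as an assumption to verify on your kernel; the snippet below only shows the call shape:

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

/* Assumed behaviour: connect() with AF_UNSPEC disconnects the socket,
 * which for TCP would go through tcp_disconnect(). Verify before
 * relying on it. */
static int tcp_abort_association(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	return connect(fd, &sa, sizeof(sa));
}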
1984 /* 1984 /*
1985 * Socket option code for TCP. 1985 * Socket option code for TCP.
1986 */ 1986 */
1987 static int do_tcp_setsockopt(struct sock *sk, int level, 1987 static int do_tcp_setsockopt(struct sock *sk, int level,
1988 int optname, char __user *optval, int optlen) 1988 int optname, char __user *optval, int optlen)
1989 { 1989 {
1990 struct tcp_sock *tp = tcp_sk(sk); 1990 struct tcp_sock *tp = tcp_sk(sk);
1991 struct inet_connection_sock *icsk = inet_csk(sk); 1991 struct inet_connection_sock *icsk = inet_csk(sk);
1992 int val; 1992 int val;
1993 int err = 0; 1993 int err = 0;
1994 1994
1995 /* This is a string value all the others are int's */ 1995 /* This is a string value all the others are int's */
1996 if (optname == TCP_CONGESTION) { 1996 if (optname == TCP_CONGESTION) {
1997 char name[TCP_CA_NAME_MAX]; 1997 char name[TCP_CA_NAME_MAX];
1998 1998
1999 if (optlen < 1) 1999 if (optlen < 1)
2000 return -EINVAL; 2000 return -EINVAL;
2001 2001
2002 val = strncpy_from_user(name, optval, 2002 val = strncpy_from_user(name, optval,
2003 min(TCP_CA_NAME_MAX-1, optlen)); 2003 min(TCP_CA_NAME_MAX-1, optlen));
2004 if (val < 0) 2004 if (val < 0)
2005 return -EFAULT; 2005 return -EFAULT;
2006 name[val] = 0; 2006 name[val] = 0;
2007 2007
2008 lock_sock(sk); 2008 lock_sock(sk);
2009 err = tcp_set_congestion_control(sk, name); 2009 err = tcp_set_congestion_control(sk, name);
2010 release_sock(sk); 2010 release_sock(sk);
2011 return err; 2011 return err;
2012 } 2012 }
2013 2013
2014 if (optlen < sizeof(int)) 2014 if (optlen < sizeof(int))
2015 return -EINVAL; 2015 return -EINVAL;
2016 2016
2017 if (get_user(val, (int __user *)optval)) 2017 if (get_user(val, (int __user *)optval))
2018 return -EFAULT; 2018 return -EFAULT;
2019 2019
2020 lock_sock(sk); 2020 lock_sock(sk);
2021 2021
2022 switch (optname) { 2022 switch (optname) {
2023 case TCP_MAXSEG: 2023 case TCP_MAXSEG:
2024 /* Values greater than interface MTU won't take effect. However 2024 /* Values greater than interface MTU won't take effect. However
2025 * at the point when this call is done we typically don't yet 2025 * at the point when this call is done we typically don't yet
2026 * know which interface is going to be used */ 2026 * know which interface is going to be used */
2027 if (val < 8 || val > MAX_TCP_WINDOW) { 2027 if (val < 8 || val > MAX_TCP_WINDOW) {
2028 err = -EINVAL; 2028 err = -EINVAL;
2029 break; 2029 break;
2030 } 2030 }
2031 tp->rx_opt.user_mss = val; 2031 tp->rx_opt.user_mss = val;
2032 break; 2032 break;
2033 2033
2034 case TCP_NODELAY: 2034 case TCP_NODELAY:
2035 if (val) { 2035 if (val) {
2036 /* TCP_NODELAY is weaker than TCP_CORK, so that 2036 /* TCP_NODELAY is weaker than TCP_CORK, so that
2037 * this option on corked socket is remembered, but 2037 * this option on corked socket is remembered, but
2038 * it is not activated until cork is cleared. 2038 * it is not activated until cork is cleared.
2039 * 2039 *
2040 * However, when TCP_NODELAY is set we make 2040 * However, when TCP_NODELAY is set we make
2041 * an explicit push, which overrides even TCP_CORK 2041 * an explicit push, which overrides even TCP_CORK
2042 * for currently queued segments. 2042 * for currently queued segments.
2043 */ 2043 */
2044 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; 2044 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2045 tcp_push_pending_frames(sk); 2045 tcp_push_pending_frames(sk);
2046 } else { 2046 } else {
2047 tp->nonagle &= ~TCP_NAGLE_OFF; 2047 tp->nonagle &= ~TCP_NAGLE_OFF;
2048 } 2048 }
2049 break; 2049 break;
2050 2050
2051 case TCP_CORK: 2051 case TCP_CORK:
2052 /* When set indicates to always queue non-full frames. 2052 /* When set indicates to always queue non-full frames.
2053 * Later the user clears this option and we transmit 2053 * Later the user clears this option and we transmit
2054 * any pending partial frames in the queue. This is 2054 * any pending partial frames in the queue. This is
2055 * meant to be used alongside sendfile() to get properly 2055 * meant to be used alongside sendfile() to get properly
2056 * filled frames when the user (for example) must write 2056 * filled frames when the user (for example) must write
2057 * out headers with a write() call first and then use 2057 * out headers with a write() call first and then use
2058 * sendfile to send out the data parts. 2058 * sendfile to send out the data parts.
2059 * 2059 *
2060 * TCP_CORK can be set together with TCP_NODELAY and it is 2060 * TCP_CORK can be set together with TCP_NODELAY and it is
2061 * stronger than TCP_NODELAY. 2061 * stronger than TCP_NODELAY.
2062 */ 2062 */
2063 if (val) { 2063 if (val) {
2064 tp->nonagle |= TCP_NAGLE_CORK; 2064 tp->nonagle |= TCP_NAGLE_CORK;
2065 } else { 2065 } else {
2066 tp->nonagle &= ~TCP_NAGLE_CORK; 2066 tp->nonagle &= ~TCP_NAGLE_CORK;
2067 if (tp->nonagle&TCP_NAGLE_OFF) 2067 if (tp->nonagle&TCP_NAGLE_OFF)
2068 tp->nonagle |= TCP_NAGLE_PUSH; 2068 tp->nonagle |= TCP_NAGLE_PUSH;
2069 tcp_push_pending_frames(sk); 2069 tcp_push_pending_frames(sk);
2070 } 2070 }
2071 break; 2071 break;
2072 2072
2073 case TCP_KEEPIDLE: 2073 case TCP_KEEPIDLE:
2074 if (val < 1 || val > MAX_TCP_KEEPIDLE) 2074 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2075 err = -EINVAL; 2075 err = -EINVAL;
2076 else { 2076 else {
2077 tp->keepalive_time = val * HZ; 2077 tp->keepalive_time = val * HZ;
2078 if (sock_flag(sk, SOCK_KEEPOPEN) && 2078 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2079 !((1 << sk->sk_state) & 2079 !((1 << sk->sk_state) &
2080 (TCPF_CLOSE | TCPF_LISTEN))) { 2080 (TCPF_CLOSE | TCPF_LISTEN))) {
2081 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; 2081 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2082 if (tp->keepalive_time > elapsed) 2082 if (tp->keepalive_time > elapsed)
2083 elapsed = tp->keepalive_time - elapsed; 2083 elapsed = tp->keepalive_time - elapsed;
2084 else 2084 else
2085 elapsed = 0; 2085 elapsed = 0;
2086 inet_csk_reset_keepalive_timer(sk, elapsed); 2086 inet_csk_reset_keepalive_timer(sk, elapsed);
2087 } 2087 }
2088 } 2088 }
2089 break; 2089 break;
2090 case TCP_KEEPINTVL: 2090 case TCP_KEEPINTVL:
2091 if (val < 1 || val > MAX_TCP_KEEPINTVL) 2091 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2092 err = -EINVAL; 2092 err = -EINVAL;
2093 else 2093 else
2094 tp->keepalive_intvl = val * HZ; 2094 tp->keepalive_intvl = val * HZ;
2095 break; 2095 break;
2096 case TCP_KEEPCNT: 2096 case TCP_KEEPCNT:
2097 if (val < 1 || val > MAX_TCP_KEEPCNT) 2097 if (val < 1 || val > MAX_TCP_KEEPCNT)
2098 err = -EINVAL; 2098 err = -EINVAL;
2099 else 2099 else
2100 tp->keepalive_probes = val; 2100 tp->keepalive_probes = val;
2101 break; 2101 break;
2102 case TCP_SYNCNT: 2102 case TCP_SYNCNT:
2103 if (val < 1 || val > MAX_TCP_SYNCNT) 2103 if (val < 1 || val > MAX_TCP_SYNCNT)
2104 err = -EINVAL; 2104 err = -EINVAL;
2105 else 2105 else
2106 icsk->icsk_syn_retries = val; 2106 icsk->icsk_syn_retries = val;
2107 break; 2107 break;
2108 2108
2109 case TCP_LINGER2: 2109 case TCP_LINGER2:
2110 if (val < 0) 2110 if (val < 0)
2111 tp->linger2 = -1; 2111 tp->linger2 = -1;
2112 else if (val > sysctl_tcp_fin_timeout / HZ) 2112 else if (val > sysctl_tcp_fin_timeout / HZ)
2113 tp->linger2 = 0; 2113 tp->linger2 = 0;
2114 else 2114 else
2115 tp->linger2 = val * HZ; 2115 tp->linger2 = val * HZ;
2116 break; 2116 break;
2117 2117
2118 case TCP_DEFER_ACCEPT: 2118 case TCP_DEFER_ACCEPT:
2119 icsk->icsk_accept_queue.rskq_defer_accept = 0; 2119 icsk->icsk_accept_queue.rskq_defer_accept = 0;
2120 if (val > 0) { 2120 if (val > 0) {
2121 /* Translate value in seconds to number of 2121 /* Translate value in seconds to number of
2122 * retransmits */ 2122 * retransmits */
2123 while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && 2123 while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2124 val > ((TCP_TIMEOUT_INIT / HZ) << 2124 val > ((TCP_TIMEOUT_INIT / HZ) <<
2125 icsk->icsk_accept_queue.rskq_defer_accept)) 2125 icsk->icsk_accept_queue.rskq_defer_accept))
2126 icsk->icsk_accept_queue.rskq_defer_accept++; 2126 icsk->icsk_accept_queue.rskq_defer_accept++;
2127 icsk->icsk_accept_queue.rskq_defer_accept++; 2127 icsk->icsk_accept_queue.rskq_defer_accept++;
2128 } 2128 }
2129 break; 2129 break;
2130 2130
2131 case TCP_WINDOW_CLAMP: 2131 case TCP_WINDOW_CLAMP:
2132 if (!val) { 2132 if (!val) {
2133 if (sk->sk_state != TCP_CLOSE) { 2133 if (sk->sk_state != TCP_CLOSE) {
2134 err = -EINVAL; 2134 err = -EINVAL;
2135 break; 2135 break;
2136 } 2136 }
2137 tp->window_clamp = 0; 2137 tp->window_clamp = 0;
2138 } else 2138 } else
2139 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? 2139 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2140 SOCK_MIN_RCVBUF / 2 : val; 2140 SOCK_MIN_RCVBUF / 2 : val;
2141 break; 2141 break;
2142 2142
2143 case TCP_QUICKACK: 2143 case TCP_QUICKACK:
2144 if (!val) { 2144 if (!val) {
2145 icsk->icsk_ack.pingpong = 1; 2145 icsk->icsk_ack.pingpong = 1;
2146 } else { 2146 } else {
2147 icsk->icsk_ack.pingpong = 0; 2147 icsk->icsk_ack.pingpong = 0;
2148 if ((1 << sk->sk_state) & 2148 if ((1 << sk->sk_state) &
2149 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && 2149 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2150 inet_csk_ack_scheduled(sk)) { 2150 inet_csk_ack_scheduled(sk)) {
2151 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; 2151 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2152 tcp_cleanup_rbuf(sk, 1); 2152 tcp_cleanup_rbuf(sk, 1);
2153 if (!(val & 1)) 2153 if (!(val & 1))
2154 icsk->icsk_ack.pingpong = 1; 2154 icsk->icsk_ack.pingpong = 1;
2155 } 2155 }
2156 } 2156 }
2157 break; 2157 break;
2158 2158
2159 #ifdef CONFIG_TCP_MD5SIG 2159 #ifdef CONFIG_TCP_MD5SIG
2160 case TCP_MD5SIG: 2160 case TCP_MD5SIG:
2161 /* Read the IP->Key mappings from userspace */ 2161 /* Read the IP->Key mappings from userspace */
2162 err = tp->af_specific->md5_parse(sk, optval, optlen); 2162 err = tp->af_specific->md5_parse(sk, optval, optlen);
2163 break; 2163 break;
2164 #endif 2164 #endif
2165 2165
2166 default: 2166 default:
2167 err = -ENOPROTOOPT; 2167 err = -ENOPROTOOPT;
2168 break; 2168 break;
2169 } 2169 }
2170 2170
2171 release_sock(sk); 2171 release_sock(sk);
2172 return err; 2172 return err;
2173 } 2173 }
2174 2174
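Most of the options handled above take a plain int from userspace, with TCP_CONGESTION as the one string-valued exception noted at the top of do_tcp_setsockopt(). A usage sketch (fd is an already-created TCP socket, and the congestion control name must match an algorithm the running kernel has available):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>

/* Sketch only: fd is a TCP socket created elsewhere. */
static void tune_socket(int fd)
{
	int one = 1;
	int idle = 60, intvl = 10, cnt = 5;
	const char *cc = "cubic";	/* must name a loaded algorithm */

	/* Disable Nagle; pending partial frames are pushed immediately. */
	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));

	/* Keepalive knobs mirror the TCP_KEEPIDLE/KEEPINTVL/KEEPCNT cases. */
	setsockopt(fd, SOL_SOCKET,  SO_KEEPALIVE,  &one,   sizeof(one));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE,  &idle,  sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT,   &cnt,   sizeof(cnt));

	/* The only string option: pick the congestion control algorithm. */
	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc));
}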
2175 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, 2175 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2176 int optlen) 2176 int optlen)
2177 { 2177 {
2178 struct inet_connection_sock *icsk = inet_csk(sk); 2178 struct inet_connection_sock *icsk = inet_csk(sk);
2179 2179
2180 if (level != SOL_TCP) 2180 if (level != SOL_TCP)
2181 return icsk->icsk_af_ops->setsockopt(sk, level, optname, 2181 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2182 optval, optlen); 2182 optval, optlen);
2183 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2183 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2184 } 2184 }
2185 2185
2186 #ifdef CONFIG_COMPAT 2186 #ifdef CONFIG_COMPAT
2187 int compat_tcp_setsockopt(struct sock *sk, int level, int optname, 2187 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2188 char __user *optval, int optlen) 2188 char __user *optval, int optlen)
2189 { 2189 {
2190 if (level != SOL_TCP) 2190 if (level != SOL_TCP)
2191 return inet_csk_compat_setsockopt(sk, level, optname, 2191 return inet_csk_compat_setsockopt(sk, level, optname,
2192 optval, optlen); 2192 optval, optlen);
2193 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2193 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2194 } 2194 }
2195 2195
2196 EXPORT_SYMBOL(compat_tcp_setsockopt); 2196 EXPORT_SYMBOL(compat_tcp_setsockopt);
2197 #endif 2197 #endif
2198 2198
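The TCP_DEFER_ACCEPT branch in do_tcp_setsockopt() deserves a worked example: the user passes seconds, but the kernel stores a count of SYN-ACK retransmission periods, doubling the initial timeout at each step. The standalone reproduction below assumes an initial retransmission timeout of 3 seconds, which is my reading of TCP_TIMEOUT_INIT for kernels of this vintage:

#include <stdio.h>

/* Seconds requested by the user -> number of retransmit periods to
 * hold the deferred connection. Assumes TCP_TIMEOUT_INIT / HZ == 3. */
static int defer_accept_retrans(int seconds)
{
	const int timeout_init = 3;
	int defer = 0;

	if (seconds <= 0)
		return 0;
	while (defer < 32 && seconds > (timeout_init << defer))
		defer++;
	return defer + 1;
}

int main(void)
{
	int secs[] = { 1, 3, 4, 10, 30 };

	for (unsigned int i = 0; i < sizeof(secs) / sizeof(secs[0]); i++)
		printf("%2d s -> %d retransmits\n",
		       secs[i], defer_accept_retrans(secs[i]));
	return 0;
}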
2199 /* Return information about state of tcp endpoint in API format. */ 2199 /* Return information about state of tcp endpoint in API format. */
2200 void tcp_get_info(struct sock *sk, struct tcp_info *info) 2200 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2201 { 2201 {
2202 struct tcp_sock *tp = tcp_sk(sk); 2202 struct tcp_sock *tp = tcp_sk(sk);
2203 const struct inet_connection_sock *icsk = inet_csk(sk); 2203 const struct inet_connection_sock *icsk = inet_csk(sk);
2204 u32 now = tcp_time_stamp; 2204 u32 now = tcp_time_stamp;
2205 2205
2206 memset(info, 0, sizeof(*info)); 2206 memset(info, 0, sizeof(*info));
2207 2207
2208 info->tcpi_state = sk->sk_state; 2208 info->tcpi_state = sk->sk_state;
2209 info->tcpi_ca_state = icsk->icsk_ca_state; 2209 info->tcpi_ca_state = icsk->icsk_ca_state;
2210 info->tcpi_retransmits = icsk->icsk_retransmits; 2210 info->tcpi_retransmits = icsk->icsk_retransmits;
2211 info->tcpi_probes = icsk->icsk_probes_out; 2211 info->tcpi_probes = icsk->icsk_probes_out;
2212 info->tcpi_backoff = icsk->icsk_backoff; 2212 info->tcpi_backoff = icsk->icsk_backoff;
2213 2213
2214 if (tp->rx_opt.tstamp_ok) 2214 if (tp->rx_opt.tstamp_ok)
2215 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 2215 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2216 if (tcp_is_sack(tp)) 2216 if (tcp_is_sack(tp))
2217 info->tcpi_options |= TCPI_OPT_SACK; 2217 info->tcpi_options |= TCPI_OPT_SACK;
2218 if (tp->rx_opt.wscale_ok) { 2218 if (tp->rx_opt.wscale_ok) {
2219 info->tcpi_options |= TCPI_OPT_WSCALE; 2219 info->tcpi_options |= TCPI_OPT_WSCALE;
2220 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; 2220 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2221 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; 2221 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2222 } 2222 }
2223 2223
2224 if (tp->ecn_flags&TCP_ECN_OK) 2224 if (tp->ecn_flags&TCP_ECN_OK)
2225 info->tcpi_options |= TCPI_OPT_ECN; 2225 info->tcpi_options |= TCPI_OPT_ECN;
2226 2226
2227 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); 2227 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2228 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); 2228 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2229 info->tcpi_snd_mss = tp->mss_cache; 2229 info->tcpi_snd_mss = tp->mss_cache;
2230 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; 2230 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2231 2231
2232 if (sk->sk_state == TCP_LISTEN) { 2232 if (sk->sk_state == TCP_LISTEN) {
2233 info->tcpi_unacked = sk->sk_ack_backlog; 2233 info->tcpi_unacked = sk->sk_ack_backlog;
2234 info->tcpi_sacked = sk->sk_max_ack_backlog; 2234 info->tcpi_sacked = sk->sk_max_ack_backlog;
2235 } else { 2235 } else {
2236 info->tcpi_unacked = tp->packets_out; 2236 info->tcpi_unacked = tp->packets_out;
2237 info->tcpi_sacked = tp->sacked_out; 2237 info->tcpi_sacked = tp->sacked_out;
2238 } 2238 }
2239 info->tcpi_lost = tp->lost_out; 2239 info->tcpi_lost = tp->lost_out;
2240 info->tcpi_retrans = tp->retrans_out; 2240 info->tcpi_retrans = tp->retrans_out;
2241 info->tcpi_fackets = tp->fackets_out; 2241 info->tcpi_fackets = tp->fackets_out;
2242 2242
2243 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); 2243 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2244 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); 2244 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2245 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 2245 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2246 2246
2247 info->tcpi_pmtu = icsk->icsk_pmtu_cookie; 2247 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2248 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; 2248 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2249 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; 2249 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2250 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; 2250 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2251 info->tcpi_snd_ssthresh = tp->snd_ssthresh; 2251 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2252 info->tcpi_snd_cwnd = tp->snd_cwnd; 2252 info->tcpi_snd_cwnd = tp->snd_cwnd;
2253 info->tcpi_advmss = tp->advmss; 2253 info->tcpi_advmss = tp->advmss;
2254 info->tcpi_reordering = tp->reordering; 2254 info->tcpi_reordering = tp->reordering;
2255 2255
2256 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; 2256 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2257 info->tcpi_rcv_space = tp->rcvq_space.space; 2257 info->tcpi_rcv_space = tp->rcvq_space.space;
2258 2258
2259 info->tcpi_total_retrans = tp->total_retrans; 2259 info->tcpi_total_retrans = tp->total_retrans;
2260 } 2260 }
2261 2261
2262 EXPORT_SYMBOL_GPL(tcp_get_info); 2262 EXPORT_SYMBOL_GPL(tcp_get_info);
2263 2263
2264 static int do_tcp_getsockopt(struct sock *sk, int level, 2264 static int do_tcp_getsockopt(struct sock *sk, int level,
2265 int optname, char __user *optval, int __user *optlen) 2265 int optname, char __user *optval, int __user *optlen)
2266 { 2266 {
2267 struct inet_connection_sock *icsk = inet_csk(sk); 2267 struct inet_connection_sock *icsk = inet_csk(sk);
2268 struct tcp_sock *tp = tcp_sk(sk); 2268 struct tcp_sock *tp = tcp_sk(sk);
2269 int val, len; 2269 int val, len;
2270 2270
2271 if (get_user(len, optlen)) 2271 if (get_user(len, optlen))
2272 return -EFAULT; 2272 return -EFAULT;
2273 2273
2274 len = min_t(unsigned int, len, sizeof(int)); 2274 len = min_t(unsigned int, len, sizeof(int));
2275 2275
2276 if (len < 0) 2276 if (len < 0)
2277 return -EINVAL; 2277 return -EINVAL;
2278 2278
2279 switch (optname) { 2279 switch (optname) {
2280 case TCP_MAXSEG: 2280 case TCP_MAXSEG:
2281 val = tp->mss_cache; 2281 val = tp->mss_cache;
2282 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2282 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2283 val = tp->rx_opt.user_mss; 2283 val = tp->rx_opt.user_mss;
2284 break; 2284 break;
2285 case TCP_NODELAY: 2285 case TCP_NODELAY:
2286 val = !!(tp->nonagle&TCP_NAGLE_OFF); 2286 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2287 break; 2287 break;
2288 case TCP_CORK: 2288 case TCP_CORK:
2289 val = !!(tp->nonagle&TCP_NAGLE_CORK); 2289 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2290 break; 2290 break;
2291 case TCP_KEEPIDLE: 2291 case TCP_KEEPIDLE:
2292 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; 2292 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2293 break; 2293 break;
2294 case TCP_KEEPINTVL: 2294 case TCP_KEEPINTVL:
2295 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; 2295 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2296 break; 2296 break;
2297 case TCP_KEEPCNT: 2297 case TCP_KEEPCNT:
2298 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; 2298 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2299 break; 2299 break;
2300 case TCP_SYNCNT: 2300 case TCP_SYNCNT:
2301 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 2301 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2302 break; 2302 break;
2303 case TCP_LINGER2: 2303 case TCP_LINGER2:
2304 val = tp->linger2; 2304 val = tp->linger2;
2305 if (val >= 0) 2305 if (val >= 0)
2306 val = (val ? : sysctl_tcp_fin_timeout) / HZ; 2306 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2307 break; 2307 break;
2308 case TCP_DEFER_ACCEPT: 2308 case TCP_DEFER_ACCEPT:
2309 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : 2309 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2310 ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); 2310 ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2311 break; 2311 break;
2312 case TCP_WINDOW_CLAMP: 2312 case TCP_WINDOW_CLAMP:
2313 val = tp->window_clamp; 2313 val = tp->window_clamp;
2314 break; 2314 break;
2315 case TCP_INFO: { 2315 case TCP_INFO: {
2316 struct tcp_info info; 2316 struct tcp_info info;
2317 2317
2318 if (get_user(len, optlen)) 2318 if (get_user(len, optlen))
2319 return -EFAULT; 2319 return -EFAULT;
2320 2320
2321 tcp_get_info(sk, &info); 2321 tcp_get_info(sk, &info);
2322 2322
2323 len = min_t(unsigned int, len, sizeof(info)); 2323 len = min_t(unsigned int, len, sizeof(info));
2324 if (put_user(len, optlen)) 2324 if (put_user(len, optlen))
2325 return -EFAULT; 2325 return -EFAULT;
2326 if (copy_to_user(optval, &info, len)) 2326 if (copy_to_user(optval, &info, len))
2327 return -EFAULT; 2327 return -EFAULT;
2328 return 0; 2328 return 0;
2329 } 2329 }
2330 case TCP_QUICKACK: 2330 case TCP_QUICKACK:
2331 val = !icsk->icsk_ack.pingpong; 2331 val = !icsk->icsk_ack.pingpong;
2332 break; 2332 break;
2333 2333
2334 case TCP_CONGESTION: 2334 case TCP_CONGESTION:
2335 if (get_user(len, optlen)) 2335 if (get_user(len, optlen))
2336 return -EFAULT; 2336 return -EFAULT;
2337 len = min_t(unsigned int, len, TCP_CA_NAME_MAX); 2337 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2338 if (put_user(len, optlen)) 2338 if (put_user(len, optlen))
2339 return -EFAULT; 2339 return -EFAULT;
2340 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) 2340 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2341 return -EFAULT; 2341 return -EFAULT;
2342 return 0; 2342 return 0;
2343 default: 2343 default:
2344 return -ENOPROTOOPT; 2344 return -ENOPROTOOPT;
2345 } 2345 }
2346 2346
2347 if (put_user(len, optlen)) 2347 if (put_user(len, optlen))
2348 return -EFAULT; 2348 return -EFAULT;
2349 if (copy_to_user(optval, &val, len)) 2349 if (copy_to_user(optval, &val, len))
2350 return -EFAULT; 2350 return -EFAULT;
2351 return 0; 2351 return 0;
2352 } 2352 }
2353 2353
2354 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, 2354 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2355 int __user *optlen) 2355 int __user *optlen)
2356 { 2356 {
2357 struct inet_connection_sock *icsk = inet_csk(sk); 2357 struct inet_connection_sock *icsk = inet_csk(sk);
2358 2358
2359 if (level != SOL_TCP) 2359 if (level != SOL_TCP)
2360 return icsk->icsk_af_ops->getsockopt(sk, level, optname, 2360 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2361 optval, optlen); 2361 optval, optlen);
2362 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2362 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2363 } 2363 }
2364 2364
2365 #ifdef CONFIG_COMPAT 2365 #ifdef CONFIG_COMPAT
2366 int compat_tcp_getsockopt(struct sock *sk, int level, int optname, 2366 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2367 char __user *optval, int __user *optlen) 2367 char __user *optval, int __user *optlen)
2368 { 2368 {
2369 if (level != SOL_TCP) 2369 if (level != SOL_TCP)
2370 return inet_csk_compat_getsockopt(sk, level, optname, 2370 return inet_csk_compat_getsockopt(sk, level, optname,
2371 optval, optlen); 2371 optval, optlen);
2372 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2372 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2373 } 2373 }
2374 2374
2375 EXPORT_SYMBOL(compat_tcp_getsockopt); 2375 EXPORT_SYMBOL(compat_tcp_getsockopt);
2376 #endif 2376 #endif
2377 2377
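On the read side, do_tcp_getsockopt() above exposes the same state: TCP_INFO returns the struct tcp_info filled in by tcp_get_info(), and TCP_CONGESTION returns the name of the congestion-control module in use. A minimal userspace sketch (illustrative only, not part of this diff; dump_tcp_info() is an invented name):

	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	/* Print a few of the fields tcp_get_info() populates, plus the
	 * congestion-control algorithm selected for this socket. */
	static void dump_tcp_info(int fd)
	{
		struct tcp_info info;
		char ca_name[16];	/* TCP_CA_NAME_MAX */
		socklen_t len = sizeof(info);

		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
			printf("rtt=%uus cwnd=%u total_retrans=%u\n",
			       info.tcpi_rtt, info.tcpi_snd_cwnd,
			       info.tcpi_total_retrans);

		memset(ca_name, 0, sizeof(ca_name));
		len = sizeof(ca_name);
		if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca_name, &len) == 0)
			printf("congestion control: %s\n", ca_name);
	}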
2378 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2378 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2379 { 2379 {
2380 struct sk_buff *segs = ERR_PTR(-EINVAL); 2380 struct sk_buff *segs = ERR_PTR(-EINVAL);
2381 struct tcphdr *th; 2381 struct tcphdr *th;
2382 unsigned thlen; 2382 unsigned thlen;
2383 unsigned int seq; 2383 unsigned int seq;
2384 __be32 delta; 2384 __be32 delta;
2385 unsigned int oldlen; 2385 unsigned int oldlen;
2386 unsigned int len; 2386 unsigned int len;
2387 2387
2388 if (!pskb_may_pull(skb, sizeof(*th))) 2388 if (!pskb_may_pull(skb, sizeof(*th)))
2389 goto out; 2389 goto out;
2390 2390
2391 th = tcp_hdr(skb); 2391 th = tcp_hdr(skb);
2392 thlen = th->doff * 4; 2392 thlen = th->doff * 4;
2393 if (thlen < sizeof(*th)) 2393 if (thlen < sizeof(*th))
2394 goto out; 2394 goto out;
2395 2395
2396 if (!pskb_may_pull(skb, thlen)) 2396 if (!pskb_may_pull(skb, thlen))
2397 goto out; 2397 goto out;
2398 2398
2399 oldlen = (u16)~skb->len; 2399 oldlen = (u16)~skb->len;
2400 __skb_pull(skb, thlen); 2400 __skb_pull(skb, thlen);
2401 2401
2402 if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { 2402 if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2403 /* Packet is from an untrusted source, reset gso_segs. */ 2403 /* Packet is from an untrusted source, reset gso_segs. */
2404 int type = skb_shinfo(skb)->gso_type; 2404 int type = skb_shinfo(skb)->gso_type;
2405 int mss; 2405 int mss;
2406 2406
2407 if (unlikely(type & 2407 if (unlikely(type &
2408 ~(SKB_GSO_TCPV4 | 2408 ~(SKB_GSO_TCPV4 |
2409 SKB_GSO_DODGY | 2409 SKB_GSO_DODGY |
2410 SKB_GSO_TCP_ECN | 2410 SKB_GSO_TCP_ECN |
2411 SKB_GSO_TCPV6 | 2411 SKB_GSO_TCPV6 |
2412 0) || 2412 0) ||
2413 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) 2413 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2414 goto out; 2414 goto out;
2415 2415
2416 mss = skb_shinfo(skb)->gso_size; 2416 mss = skb_shinfo(skb)->gso_size;
2417 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); 2417 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2418 2418
2419 segs = NULL; 2419 segs = NULL;
2420 goto out; 2420 goto out;
2421 } 2421 }
2422 2422
2423 segs = skb_segment(skb, features); 2423 segs = skb_segment(skb, features);
2424 if (IS_ERR(segs)) 2424 if (IS_ERR(segs))
2425 goto out; 2425 goto out;
2426 2426
2427 len = skb_shinfo(skb)->gso_size; 2427 len = skb_shinfo(skb)->gso_size;
2428 delta = htonl(oldlen + (thlen + len)); 2428 delta = htonl(oldlen + (thlen + len));
2429 2429
2430 skb = segs; 2430 skb = segs;
2431 th = tcp_hdr(skb); 2431 th = tcp_hdr(skb);
2432 seq = ntohl(th->seq); 2432 seq = ntohl(th->seq);
2433 2433
2434 do { 2434 do {
2435 th->fin = th->psh = 0; 2435 th->fin = th->psh = 0;
2436 2436
2437 th->check = ~csum_fold((__force __wsum)((__force u32)th->check + 2437 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2438 (__force u32)delta)); 2438 (__force u32)delta));
2439 if (skb->ip_summed != CHECKSUM_PARTIAL) 2439 if (skb->ip_summed != CHECKSUM_PARTIAL)
2440 th->check = 2440 th->check =
2441 csum_fold(csum_partial(skb_transport_header(skb), 2441 csum_fold(csum_partial(skb_transport_header(skb),
2442 thlen, skb->csum)); 2442 thlen, skb->csum));
2443 2443
2444 seq += len; 2444 seq += len;
2445 skb = skb->next; 2445 skb = skb->next;
2446 th = tcp_hdr(skb); 2446 th = tcp_hdr(skb);
2447 2447
2448 th->seq = htonl(seq); 2448 th->seq = htonl(seq);
2449 th->cwr = 0; 2449 th->cwr = 0;
2450 } while (skb->next); 2450 } while (skb->next);
2451 2451
2452 delta = htonl(oldlen + (skb->tail - skb->transport_header) + 2452 delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2453 skb->data_len); 2453 skb->data_len);
2454 th->check = ~csum_fold((__force __wsum)((__force u32)th->check + 2454 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2455 (__force u32)delta)); 2455 (__force u32)delta));
2456 if (skb->ip_summed != CHECKSUM_PARTIAL) 2456 if (skb->ip_summed != CHECKSUM_PARTIAL)
2457 th->check = csum_fold(csum_partial(skb_transport_header(skb), 2457 th->check = csum_fold(csum_partial(skb_transport_header(skb),
2458 thlen, skb->csum)); 2458 thlen, skb->csum));
2459 2459
2460 out: 2460 out:
2461 return segs; 2461 return segs;
2462 } 2462 }
2463 EXPORT_SYMBOL(tcp_tso_segment); 2463 EXPORT_SYMBOL(tcp_tso_segment);
2464 2464
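A note on the checksum arithmetic in tcp_tso_segment(), since it is easy to misread (this is a reading aid, not part of the change): oldlen holds the ones-complement of the original TCP length (header plus all payload), so adding the new per-segment length (thlen + gso_size) and folding the result into th->check effectively swaps the old length for the new one in each segment's pseudo-header checksum; the last segment, which is usually shorter, gets a second delta computed from its real remaining length. As a rough worked example, an skb carrying a 20-byte header plus 2896 bytes of payload with gso_size 1448 splits into two 1468-byte segments, and each template checksum is adjusted by (1468 - 2916) in ones-complement arithmetic.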
2465 #ifdef CONFIG_TCP_MD5SIG 2465 #ifdef CONFIG_TCP_MD5SIG
2466 static unsigned long tcp_md5sig_users; 2466 static unsigned long tcp_md5sig_users;
2467 static struct tcp_md5sig_pool **tcp_md5sig_pool; 2467 static struct tcp_md5sig_pool **tcp_md5sig_pool;
2468 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); 2468 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2469 2469
2470 static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) 2470 static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2471 { 2471 {
2472 int cpu; 2472 int cpu;
2473 for_each_possible_cpu(cpu) { 2473 for_each_possible_cpu(cpu) {
2474 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); 2474 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
2475 if (p) { 2475 if (p) {
2476 if (p->md5_desc.tfm) 2476 if (p->md5_desc.tfm)
2477 crypto_free_hash(p->md5_desc.tfm); 2477 crypto_free_hash(p->md5_desc.tfm);
2478 kfree(p); 2478 kfree(p);
2479 p = NULL; 2479 p = NULL;
2480 } 2480 }
2481 } 2481 }
2482 free_percpu(pool); 2482 free_percpu(pool);
2483 } 2483 }
2484 2484
2485 void tcp_free_md5sig_pool(void) 2485 void tcp_free_md5sig_pool(void)
2486 { 2486 {
2487 struct tcp_md5sig_pool **pool = NULL; 2487 struct tcp_md5sig_pool **pool = NULL;
2488 2488
2489 spin_lock_bh(&tcp_md5sig_pool_lock); 2489 spin_lock_bh(&tcp_md5sig_pool_lock);
2490 if (--tcp_md5sig_users == 0) { 2490 if (--tcp_md5sig_users == 0) {
2491 pool = tcp_md5sig_pool; 2491 pool = tcp_md5sig_pool;
2492 tcp_md5sig_pool = NULL; 2492 tcp_md5sig_pool = NULL;
2493 } 2493 }
2494 spin_unlock_bh(&tcp_md5sig_pool_lock); 2494 spin_unlock_bh(&tcp_md5sig_pool_lock);
2495 if (pool) 2495 if (pool)
2496 __tcp_free_md5sig_pool(pool); 2496 __tcp_free_md5sig_pool(pool);
2497 } 2497 }
2498 2498
2499 EXPORT_SYMBOL(tcp_free_md5sig_pool); 2499 EXPORT_SYMBOL(tcp_free_md5sig_pool);
2500 2500
2501 static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) 2501 static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2502 { 2502 {
2503 int cpu; 2503 int cpu;
2504 struct tcp_md5sig_pool **pool; 2504 struct tcp_md5sig_pool **pool;
2505 2505
2506 pool = alloc_percpu(struct tcp_md5sig_pool *); 2506 pool = alloc_percpu(struct tcp_md5sig_pool *);
2507 if (!pool) 2507 if (!pool)
2508 return NULL; 2508 return NULL;
2509 2509
2510 for_each_possible_cpu(cpu) { 2510 for_each_possible_cpu(cpu) {
2511 struct tcp_md5sig_pool *p; 2511 struct tcp_md5sig_pool *p;
2512 struct crypto_hash *hash; 2512 struct crypto_hash *hash;
2513 2513
2514 p = kzalloc(sizeof(*p), GFP_KERNEL); 2514 p = kzalloc(sizeof(*p), GFP_KERNEL);
2515 if (!p) 2515 if (!p)
2516 goto out_free; 2516 goto out_free;
2517 *per_cpu_ptr(pool, cpu) = p; 2517 *per_cpu_ptr(pool, cpu) = p;
2518 2518
2519 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 2519 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2520 if (!hash || IS_ERR(hash)) 2520 if (!hash || IS_ERR(hash))
2521 goto out_free; 2521 goto out_free;
2522 2522
2523 p->md5_desc.tfm = hash; 2523 p->md5_desc.tfm = hash;
2524 } 2524 }
2525 return pool; 2525 return pool;
2526 out_free: 2526 out_free:
2527 __tcp_free_md5sig_pool(pool); 2527 __tcp_free_md5sig_pool(pool);
2528 return NULL; 2528 return NULL;
2529 } 2529 }
2530 2530
2531 struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) 2531 struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
2532 { 2532 {
2533 struct tcp_md5sig_pool **pool; 2533 struct tcp_md5sig_pool **pool;
2534 int alloc = 0; 2534 int alloc = 0;
2535 2535
2536 retry: 2536 retry:
2537 spin_lock_bh(&tcp_md5sig_pool_lock); 2537 spin_lock_bh(&tcp_md5sig_pool_lock);
2538 pool = tcp_md5sig_pool; 2538 pool = tcp_md5sig_pool;
2539 if (tcp_md5sig_users++ == 0) { 2539 if (tcp_md5sig_users++ == 0) {
2540 alloc = 1; 2540 alloc = 1;
2541 spin_unlock_bh(&tcp_md5sig_pool_lock); 2541 spin_unlock_bh(&tcp_md5sig_pool_lock);
2542 } else if (!pool) { 2542 } else if (!pool) {
2543 tcp_md5sig_users--; 2543 tcp_md5sig_users--;
2544 spin_unlock_bh(&tcp_md5sig_pool_lock); 2544 spin_unlock_bh(&tcp_md5sig_pool_lock);
2545 cpu_relax(); 2545 cpu_relax();
2546 goto retry; 2546 goto retry;
2547 } else 2547 } else
2548 spin_unlock_bh(&tcp_md5sig_pool_lock); 2548 spin_unlock_bh(&tcp_md5sig_pool_lock);
2549 2549
2550 if (alloc) { 2550 if (alloc) {
2551 /* we cannot hold spinlock here because this may sleep. */ 2551 /* we cannot hold spinlock here because this may sleep. */
2552 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); 2552 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
2553 spin_lock_bh(&tcp_md5sig_pool_lock); 2553 spin_lock_bh(&tcp_md5sig_pool_lock);
2554 if (!p) { 2554 if (!p) {
2555 tcp_md5sig_users--; 2555 tcp_md5sig_users--;
2556 spin_unlock_bh(&tcp_md5sig_pool_lock); 2556 spin_unlock_bh(&tcp_md5sig_pool_lock);
2557 return NULL; 2557 return NULL;
2558 } 2558 }
2559 pool = tcp_md5sig_pool; 2559 pool = tcp_md5sig_pool;
2560 if (pool) { 2560 if (pool) {
2561 /* oops, it has already been assigned. */ 2561 /* oops, it has already been assigned. */
2562 spin_unlock_bh(&tcp_md5sig_pool_lock); 2562 spin_unlock_bh(&tcp_md5sig_pool_lock);
2563 __tcp_free_md5sig_pool(p); 2563 __tcp_free_md5sig_pool(p);
2564 } else { 2564 } else {
2565 tcp_md5sig_pool = pool = p; 2565 tcp_md5sig_pool = pool = p;
2566 spin_unlock_bh(&tcp_md5sig_pool_lock); 2566 spin_unlock_bh(&tcp_md5sig_pool_lock);
2567 } 2567 }
2568 } 2568 }
2569 return pool; 2569 return pool;
2570 } 2570 }
2571 2571
2572 EXPORT_SYMBOL(tcp_alloc_md5sig_pool); 2572 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2573 2573
2574 struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) 2574 struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
2575 { 2575 {
2576 struct tcp_md5sig_pool **p; 2576 struct tcp_md5sig_pool **p;
2577 spin_lock_bh(&tcp_md5sig_pool_lock); 2577 spin_lock_bh(&tcp_md5sig_pool_lock);
2578 p = tcp_md5sig_pool; 2578 p = tcp_md5sig_pool;
2579 if (p) 2579 if (p)
2580 tcp_md5sig_users++; 2580 tcp_md5sig_users++;
2581 spin_unlock_bh(&tcp_md5sig_pool_lock); 2581 spin_unlock_bh(&tcp_md5sig_pool_lock);
2582 return (p ? *per_cpu_ptr(p, cpu) : NULL); 2582 return (p ? *per_cpu_ptr(p, cpu) : NULL);
2583 } 2583 }
2584 2584
2585 EXPORT_SYMBOL(__tcp_get_md5sig_pool); 2585 EXPORT_SYMBOL(__tcp_get_md5sig_pool);
2586 2586
2587 void __tcp_put_md5sig_pool(void) 2587 void __tcp_put_md5sig_pool(void)
2588 { 2588 {
2589 tcp_free_md5sig_pool(); 2589 tcp_free_md5sig_pool();
2590 } 2590 }
2591 2591
2592 EXPORT_SYMBOL(__tcp_put_md5sig_pool); 2592 EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2593 2593
2594 int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, 2594 int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2595 struct tcphdr *th) 2595 struct tcphdr *th)
2596 { 2596 {
2597 struct scatterlist sg; 2597 struct scatterlist sg;
2598 int err; 2598 int err;
2599 2599
2600 __sum16 old_checksum = th->check; 2600 __sum16 old_checksum = th->check;
2601 th->check = 0; 2601 th->check = 0;
2602 /* options aren't included in the hash */ 2602 /* options aren't included in the hash */
2603 sg_init_one(&sg, th, sizeof(struct tcphdr)); 2603 sg_init_one(&sg, th, sizeof(struct tcphdr));
2604 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); 2604 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
2605 th->check = old_checksum; 2605 th->check = old_checksum;
2606 return err; 2606 return err;
2607 } 2607 }
2608 2608
2609 EXPORT_SYMBOL(tcp_md5_hash_header); 2609 EXPORT_SYMBOL(tcp_md5_hash_header);
2610 2610
2611 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 2611 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2612 struct sk_buff *skb, unsigned header_len) 2612 struct sk_buff *skb, unsigned header_len)
2613 { 2613 {
2614 struct scatterlist sg; 2614 struct scatterlist sg;
2615 const struct tcphdr *tp = tcp_hdr(skb); 2615 const struct tcphdr *tp = tcp_hdr(skb);
2616 struct hash_desc *desc = &hp->md5_desc; 2616 struct hash_desc *desc = &hp->md5_desc;
2617 unsigned i; 2617 unsigned i;
2618 const unsigned head_data_len = skb_headlen(skb) > header_len ? 2618 const unsigned head_data_len = skb_headlen(skb) > header_len ?
2619 skb_headlen(skb) - header_len : 0; 2619 skb_headlen(skb) - header_len : 0;
2620 const struct skb_shared_info *shi = skb_shinfo(skb); 2620 const struct skb_shared_info *shi = skb_shinfo(skb);
2621 2621
2622 sg_init_table(&sg, 1); 2622 sg_init_table(&sg, 1);
2623 2623
2624 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); 2624 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
2625 if (crypto_hash_update(desc, &sg, head_data_len)) 2625 if (crypto_hash_update(desc, &sg, head_data_len))
2626 return 1; 2626 return 1;
2627 2627
2628 for (i = 0; i < shi->nr_frags; ++i) { 2628 for (i = 0; i < shi->nr_frags; ++i) {
2629 const struct skb_frag_struct *f = &shi->frags[i]; 2629 const struct skb_frag_struct *f = &shi->frags[i];
2630 sg_set_page(&sg, f->page, f->size, f->page_offset); 2630 sg_set_page(&sg, f->page, f->size, f->page_offset);
2631 if (crypto_hash_update(desc, &sg, f->size)) 2631 if (crypto_hash_update(desc, &sg, f->size))
2632 return 1; 2632 return 1;
2633 } 2633 }
2634 2634
2635 return 0; 2635 return 0;
2636 } 2636 }
2637 2637
2638 EXPORT_SYMBOL(tcp_md5_hash_skb_data); 2638 EXPORT_SYMBOL(tcp_md5_hash_skb_data);
2639 2639
2640 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) 2640 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
2641 { 2641 {
2642 struct scatterlist sg; 2642 struct scatterlist sg;
2643 2643
2644 sg_init_one(&sg, key->key, key->keylen); 2644 sg_init_one(&sg, key->key, key->keylen);
2645 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); 2645 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
2646 } 2646 }
2647 2647
2648 EXPORT_SYMBOL(tcp_md5_hash_key); 2648 EXPORT_SYMBOL(tcp_md5_hash_key);
2649 2649
2650 #endif 2650 #endif
2651 2651
2652 void tcp_done(struct sock *sk) 2652 void tcp_done(struct sock *sk)
2653 { 2653 {
2654 if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 2654 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2655 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 2655 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
2656 2656
2657 tcp_set_state(sk, TCP_CLOSE); 2657 tcp_set_state(sk, TCP_CLOSE);
2658 tcp_clear_xmit_timers(sk); 2658 tcp_clear_xmit_timers(sk);
2659 2659
2660 sk->sk_shutdown = SHUTDOWN_MASK; 2660 sk->sk_shutdown = SHUTDOWN_MASK;
2661 2661
2662 if (!sock_flag(sk, SOCK_DEAD)) 2662 if (!sock_flag(sk, SOCK_DEAD))
2663 sk->sk_state_change(sk); 2663 sk->sk_state_change(sk);
2664 else 2664 else
2665 inet_csk_destroy_sock(sk); 2665 inet_csk_destroy_sock(sk);
2666 } 2666 }
2667 EXPORT_SYMBOL_GPL(tcp_done); 2667 EXPORT_SYMBOL_GPL(tcp_done);
2668 2668
2669 extern struct tcp_congestion_ops tcp_reno; 2669 extern struct tcp_congestion_ops tcp_reno;
2670 2670
2671 static __initdata unsigned long thash_entries; 2671 static __initdata unsigned long thash_entries;
2672 static int __init set_thash_entries(char *str) 2672 static int __init set_thash_entries(char *str)
2673 { 2673 {
2674 if (!str) 2674 if (!str)
2675 return 0; 2675 return 0;
2676 thash_entries = simple_strtoul(str, &str, 0); 2676 thash_entries = simple_strtoul(str, &str, 0);
2677 return 1; 2677 return 1;
2678 } 2678 }
2679 __setup("thash_entries=", set_thash_entries); 2679 __setup("thash_entries=", set_thash_entries);
2680 2680
2681 void __init tcp_init(void) 2681 void __init tcp_init(void)
2682 { 2682 {
2683 struct sk_buff *skb = NULL; 2683 struct sk_buff *skb = NULL;
2684 unsigned long nr_pages, limit; 2684 unsigned long nr_pages, limit;
2685 int order, i, max_share; 2685 int order, i, max_share;
2686 2686
2687 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 2687 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2688 2688
2689 tcp_hashinfo.bind_bucket_cachep = 2689 tcp_hashinfo.bind_bucket_cachep =
2690 kmem_cache_create("tcp_bind_bucket", 2690 kmem_cache_create("tcp_bind_bucket",
2691 sizeof(struct inet_bind_bucket), 0, 2691 sizeof(struct inet_bind_bucket), 0,
2692 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 2692 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2693 2693
2694 /* Size and allocate the main established and bind bucket 2694 /* Size and allocate the main established and bind bucket
2695 * hash tables. 2695 * hash tables.
2696 * 2696 *
2697 * The methodology is similar to that of the buffer cache. 2697 * The methodology is similar to that of the buffer cache.
2698 */ 2698 */
2699 tcp_hashinfo.ehash = 2699 tcp_hashinfo.ehash =
2700 alloc_large_system_hash("TCP established", 2700 alloc_large_system_hash("TCP established",
2701 sizeof(struct inet_ehash_bucket), 2701 sizeof(struct inet_ehash_bucket),
2702 thash_entries, 2702 thash_entries,
2703 (num_physpages >= 128 * 1024) ? 2703 (num_physpages >= 128 * 1024) ?
2704 13 : 15, 2704 13 : 15,
2705 0, 2705 0,
2706 &tcp_hashinfo.ehash_size, 2706 &tcp_hashinfo.ehash_size,
2707 NULL, 2707 NULL,
2708 thash_entries ? 0 : 512 * 1024); 2708 thash_entries ? 0 : 512 * 1024);
2709 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; 2709 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2710 for (i = 0; i < tcp_hashinfo.ehash_size; i++) { 2710 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2711 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); 2711 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2712 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); 2712 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2713 } 2713 }
2714 if (inet_ehash_locks_alloc(&tcp_hashinfo)) 2714 if (inet_ehash_locks_alloc(&tcp_hashinfo))
2715 panic("TCP: failed to alloc ehash_locks"); 2715 panic("TCP: failed to alloc ehash_locks");
2716 tcp_hashinfo.bhash = 2716 tcp_hashinfo.bhash =
2717 alloc_large_system_hash("TCP bind", 2717 alloc_large_system_hash("TCP bind",
2718 sizeof(struct inet_bind_hashbucket), 2718 sizeof(struct inet_bind_hashbucket),
2719 tcp_hashinfo.ehash_size, 2719 tcp_hashinfo.ehash_size,
2720 (num_physpages >= 128 * 1024) ? 2720 (num_physpages >= 128 * 1024) ?
2721 13 : 15, 2721 13 : 15,
2722 0, 2722 0,
2723 &tcp_hashinfo.bhash_size, 2723 &tcp_hashinfo.bhash_size,
2724 NULL, 2724 NULL,
2725 64 * 1024); 2725 64 * 1024);
2726 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; 2726 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2727 for (i = 0; i < tcp_hashinfo.bhash_size; i++) { 2727 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2728 spin_lock_init(&tcp_hashinfo.bhash[i].lock); 2728 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2729 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 2729 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2730 } 2730 }
2731 2731
2732 /* Try to be a bit smarter and adjust defaults depending 2732 /* Try to be a bit smarter and adjust defaults depending
2733 * on available memory. 2733 * on available memory.
2734 */ 2734 */
2735 for (order = 0; ((1 << order) << PAGE_SHIFT) < 2735 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2736 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); 2736 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2737 order++) 2737 order++)
2738 ; 2738 ;
2739 if (order >= 4) { 2739 if (order >= 4) {
2740 tcp_death_row.sysctl_max_tw_buckets = 180000; 2740 tcp_death_row.sysctl_max_tw_buckets = 180000;
2741 sysctl_tcp_max_orphans = 4096 << (order - 4); 2741 sysctl_tcp_max_orphans = 4096 << (order - 4);
2742 sysctl_max_syn_backlog = 1024; 2742 sysctl_max_syn_backlog = 1024;
2743 } else if (order < 3) { 2743 } else if (order < 3) {
2744 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); 2744 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2745 sysctl_tcp_max_orphans >>= (3 - order); 2745 sysctl_tcp_max_orphans >>= (3 - order);
2746 sysctl_max_syn_backlog = 128; 2746 sysctl_max_syn_backlog = 128;
2747 } 2747 }
2748 2748
2749 /* Set the pressure threshold to be a fraction of global memory that 2749 /* Set the pressure threshold to be a fraction of global memory that
2750 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of 2750 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
2751 * memory, with a floor of 128 pages. 2751 * memory, with a floor of 128 pages.
2752 */ 2752 */
2753 nr_pages = totalram_pages - totalhigh_pages; 2753 nr_pages = totalram_pages - totalhigh_pages;
2754 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); 2754 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2755 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); 2755 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2756 limit = max(limit, 128UL); 2756 limit = max(limit, 128UL);
2757 sysctl_tcp_mem[0] = limit / 4 * 3; 2757 sysctl_tcp_mem[0] = limit / 4 * 3;
2758 sysctl_tcp_mem[1] = limit; 2758 sysctl_tcp_mem[1] = limit;
2759 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; 2759 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
2760 2760
2761 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 2761 /* Set per-socket limits to no more than 1/128 the pressure threshold */
2762 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); 2762 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2763 max_share = min(4UL*1024*1024, limit); 2763 max_share = min(4UL*1024*1024, limit);
2764 2764
2765 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 2765 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
2766 sysctl_tcp_wmem[1] = 16*1024; 2766 sysctl_tcp_wmem[1] = 16*1024;
2767 sysctl_tcp_wmem[2] = max(64*1024, max_share); 2767 sysctl_tcp_wmem[2] = max(64*1024, max_share);
2768 2768
2769 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; 2769 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
2770 sysctl_tcp_rmem[1] = 87380; 2770 sysctl_tcp_rmem[1] = 87380;
2771 sysctl_tcp_rmem[2] = max(87380, max_share); 2771 sysctl_tcp_rmem[2] = max(87380, max_share);
2772 2772
2773 printk(KERN_INFO "TCP: Hash tables configured " 2773 printk(KERN_INFO "TCP: Hash tables configured "
2774 "(established %d bind %d)\n", 2774 "(established %d bind %d)\n",
2775 tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); 2775 tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
2776 2776
2777 tcp_register_congestion_control(&tcp_reno); 2777 tcp_register_congestion_control(&tcp_reno);
2778 } 2778 }
2779 2779
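tcp_init() sizes the hash tables from the thash_entries= boot parameter registered just above (or from num_physpages when it is not given) and then derives the memory sysctls. To make that arithmetic concrete, here is one assumed configuration (1 GiB of low memory, 4 KiB pages, so nr_pages = 262144; purely illustrative, not part of the commit):

	limit = min(262144, 1 << 16) >> 8                   = 256
	limit = (256 * (262144 >> 8)) >> (PAGE_SHIFT - 11)  = 131072 pages (512 MiB)
	sysctl_tcp_mem = { 98304, 131072, 196608 } pages    ~ { 384, 512, 768 } MiB
	per-socket cap = min(4 MiB, 131072 << 5 bytes)      = 4 MiB

so on such a machine tcp_wmem[2] and tcp_rmem[2] both come out at 4 MiB, and the tcp_mem pressure point sits at half of low memory.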
2780 EXPORT_SYMBOL(tcp_close); 2780 EXPORT_SYMBOL(tcp_close);
2781 EXPORT_SYMBOL(tcp_disconnect); 2781 EXPORT_SYMBOL(tcp_disconnect);
2782 EXPORT_SYMBOL(tcp_getsockopt); 2782 EXPORT_SYMBOL(tcp_getsockopt);
2783 EXPORT_SYMBOL(tcp_ioctl); 2783 EXPORT_SYMBOL(tcp_ioctl);
2784 EXPORT_SYMBOL(tcp_poll); 2784 EXPORT_SYMBOL(tcp_poll);
2785 EXPORT_SYMBOL(tcp_read_sock); 2785 EXPORT_SYMBOL(tcp_read_sock);
2786 EXPORT_SYMBOL(tcp_recvmsg); 2786 EXPORT_SYMBOL(tcp_recvmsg);
2787 EXPORT_SYMBOL(tcp_sendmsg); 2787 EXPORT_SYMBOL(tcp_sendmsg);
2788 EXPORT_SYMBOL(tcp_splice_read); 2788 EXPORT_SYMBOL(tcp_splice_read);
2789 EXPORT_SYMBOL(tcp_sendpage); 2789 EXPORT_SYMBOL(tcp_sendpage);
2790 EXPORT_SYMBOL(tcp_setsockopt); 2790 EXPORT_SYMBOL(tcp_setsockopt);
2791 EXPORT_SYMBOL(tcp_shutdown); 2791 EXPORT_SYMBOL(tcp_shutdown);
2792 2792
net/ipv4/tcp_minisocks.c
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Implementation of the Transmission Control Protocol(TCP). 6 * Implementation of the Transmission Control Protocol(TCP).
7 * 7 *
8 * Authors: Ross Biro 8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Mark Evans, <evansmp@uhura.aston.ac.uk> 10 * Mark Evans, <evansmp@uhura.aston.ac.uk>
11 * Corey Minyard <wf-rch!minyard@relay.EU.net> 11 * Corey Minyard <wf-rch!minyard@relay.EU.net>
12 * Florian La Roche, <flla@stud.uni-sb.de> 12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 13 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14 * Linus Torvalds, <torvalds@cs.helsinki.fi> 14 * Linus Torvalds, <torvalds@cs.helsinki.fi>
15 * Alan Cox, <gw4pts@gw4pts.ampr.org> 15 * Alan Cox, <gw4pts@gw4pts.ampr.org>
16 * Matthew Dillon, <dillon@apollo.west.oic.com> 16 * Matthew Dillon, <dillon@apollo.west.oic.com>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Jorge Cwik, <jorge@laser.satlink.net> 18 * Jorge Cwik, <jorge@laser.satlink.net>
19 */ 19 */
20 20
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/sysctl.h> 23 #include <linux/sysctl.h>
24 #include <linux/workqueue.h> 24 #include <linux/workqueue.h>
25 #include <net/tcp.h> 25 #include <net/tcp.h>
26 #include <net/inet_common.h> 26 #include <net/inet_common.h>
27 #include <net/xfrm.h> 27 #include <net/xfrm.h>
28 28
29 #ifdef CONFIG_SYSCTL 29 #ifdef CONFIG_SYSCTL
30 #define SYNC_INIT 0 /* let the user enable it */ 30 #define SYNC_INIT 0 /* let the user enable it */
31 #else 31 #else
32 #define SYNC_INIT 1 32 #define SYNC_INIT 1
33 #endif 33 #endif
34 34
35 int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; 35 int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
36 EXPORT_SYMBOL(sysctl_tcp_syncookies); 36 EXPORT_SYMBOL(sysctl_tcp_syncookies);
37 37
38 int sysctl_tcp_abort_on_overflow __read_mostly; 38 int sysctl_tcp_abort_on_overflow __read_mostly;
39 39
40 struct inet_timewait_death_row tcp_death_row = { 40 struct inet_timewait_death_row tcp_death_row = {
41 .sysctl_max_tw_buckets = NR_FILE * 2, 41 .sysctl_max_tw_buckets = NR_FILE * 2,
42 .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, 42 .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
43 .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), 43 .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
44 .hashinfo = &tcp_hashinfo, 44 .hashinfo = &tcp_hashinfo,
45 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, 45 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
46 (unsigned long)&tcp_death_row), 46 (unsigned long)&tcp_death_row),
47 .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, 47 .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
48 inet_twdr_twkill_work), 48 inet_twdr_twkill_work),
49 /* Short-time timewait calendar */ 49 /* Short-time timewait calendar */
50 50
51 .twcal_hand = -1, 51 .twcal_hand = -1,
52 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, 52 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
53 (unsigned long)&tcp_death_row), 53 (unsigned long)&tcp_death_row),
54 }; 54 };
55 55
56 EXPORT_SYMBOL_GPL(tcp_death_row); 56 EXPORT_SYMBOL_GPL(tcp_death_row);
57 57
58 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 58 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
59 { 59 {
60 if (seq == s_win) 60 if (seq == s_win)
61 return 1; 61 return 1;
62 if (after(end_seq, s_win) && before(seq, e_win)) 62 if (after(end_seq, s_win) && before(seq, e_win))
63 return 1; 63 return 1;
64 return (seq == e_win && seq == end_seq); 64 return (seq == e_win && seq == end_seq);
65 } 65 }
66 66
67 /* 67 /*
68 * * Main purpose of TIME-WAIT state is to close connection gracefully, 68 * * Main purpose of TIME-WAIT state is to close connection gracefully,
69 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN 69 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
70 * (and, probably, tail of data) and one or more our ACKs are lost. 70 * (and, probably, tail of data) and one or more our ACKs are lost.
71 * * What is TIME-WAIT timeout? It is associated with maximal packet 71 * * What is TIME-WAIT timeout? It is associated with maximal packet
72 * lifetime in the internet, which results in wrong conclusion, that 72 * lifetime in the internet, which results in wrong conclusion, that
73 * it is set to catch "old duplicate segments" wandering out of their path. 73 * it is set to catch "old duplicate segments" wandering out of their path.
74 * It is not quite correct. This timeout is calculated so that it exceeds 74 * It is not quite correct. This timeout is calculated so that it exceeds
75 * maximal retransmission timeout enough to allow to lose one (or more) 75 * maximal retransmission timeout enough to allow to lose one (or more)
76 * segments sent by peer and our ACKs. This time may be calculated from RTO. 76 * segments sent by peer and our ACKs. This time may be calculated from RTO.
77 * * When TIME-WAIT socket receives RST, it means that another end 77 * * When TIME-WAIT socket receives RST, it means that another end
78 * finally closed and we are allowed to kill TIME-WAIT too. 78 * finally closed and we are allowed to kill TIME-WAIT too.
79 * * Second purpose of TIME-WAIT is catching old duplicate segments. 79 * * Second purpose of TIME-WAIT is catching old duplicate segments.
80 * Well, certainly it is pure paranoia, but if we load TIME-WAIT 80 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
81 * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. 81 * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
82 * * If we invented some more clever way to catch duplicates 82 * * If we invented some more clever way to catch duplicates
83 * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. 83 * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
84 * 84 *
85 * The algorithm below is based on FORMAL INTERPRETATION of RFCs. 85 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
86 * When you compare it to RFCs, please, read section SEGMENT ARRIVES 86 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
87 * from the very beginning. 87 * from the very beginning.
88 * 88 *
89 * NOTE. With recycling (and later with fin-wait-2) TW bucket 89 * NOTE. With recycling (and later with fin-wait-2) TW bucket
90 * is _not_ stateless. It means, that strictly speaking we must 90 * is _not_ stateless. It means, that strictly speaking we must
91 * spinlock it. I do not want! Well, probability of misbehaviour 91 * spinlock it. I do not want! Well, probability of misbehaviour
92 * is ridiculously low and, seems, we could use some mb() tricks 92 * is ridiculously low and, seems, we could use some mb() tricks
93 * to avoid misread sequence numbers, states etc. --ANK 93 * to avoid misread sequence numbers, states etc. --ANK
94 */ 94 */
95 enum tcp_tw_status 95 enum tcp_tw_status
96 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 96 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
97 const struct tcphdr *th) 97 const struct tcphdr *th)
98 { 98 {
99 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 99 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
100 struct tcp_options_received tmp_opt; 100 struct tcp_options_received tmp_opt;
101 int paws_reject = 0; 101 int paws_reject = 0;
102 102
103 tmp_opt.saw_tstamp = 0; 103 tmp_opt.saw_tstamp = 0;
104 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 104 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
105 tcp_parse_options(skb, &tmp_opt, 0); 105 tcp_parse_options(skb, &tmp_opt, 0);
106 106
107 if (tmp_opt.saw_tstamp) { 107 if (tmp_opt.saw_tstamp) {
108 tmp_opt.ts_recent = tcptw->tw_ts_recent; 108 tmp_opt.ts_recent = tcptw->tw_ts_recent;
109 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 109 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
110 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 110 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
111 } 111 }
112 } 112 }
113 113
114 if (tw->tw_substate == TCP_FIN_WAIT2) { 114 if (tw->tw_substate == TCP_FIN_WAIT2) {
115 /* Just repeat all the checks of tcp_rcv_state_process() */ 115 /* Just repeat all the checks of tcp_rcv_state_process() */
116 116
117 /* Out of window, send ACK */ 117 /* Out of window, send ACK */
118 if (paws_reject || 118 if (paws_reject ||
119 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 119 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
120 tcptw->tw_rcv_nxt, 120 tcptw->tw_rcv_nxt,
121 tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) 121 tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
122 return TCP_TW_ACK; 122 return TCP_TW_ACK;
123 123
124 if (th->rst) 124 if (th->rst)
125 goto kill; 125 goto kill;
126 126
127 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) 127 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
128 goto kill_with_rst; 128 goto kill_with_rst;
129 129
130 /* Dup ACK? */ 130 /* Dup ACK? */
131 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || 131 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
132 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { 132 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
133 inet_twsk_put(tw); 133 inet_twsk_put(tw);
134 return TCP_TW_SUCCESS; 134 return TCP_TW_SUCCESS;
135 } 135 }
136 136
137 /* New data or FIN. If new data arrive after half-duplex close, 137 /* New data or FIN. If new data arrive after half-duplex close,
138 * reset. 138 * reset.
139 */ 139 */
140 if (!th->fin || 140 if (!th->fin ||
141 TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { 141 TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
142 kill_with_rst: 142 kill_with_rst:
143 inet_twsk_deschedule(tw, &tcp_death_row); 143 inet_twsk_deschedule(tw, &tcp_death_row);
144 inet_twsk_put(tw); 144 inet_twsk_put(tw);
145 return TCP_TW_RST; 145 return TCP_TW_RST;
146 } 146 }
147 147
148 /* FIN arrived, enter true time-wait state. */ 148 /* FIN arrived, enter true time-wait state. */
149 tw->tw_substate = TCP_TIME_WAIT; 149 tw->tw_substate = TCP_TIME_WAIT;
150 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; 150 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
151 if (tmp_opt.saw_tstamp) { 151 if (tmp_opt.saw_tstamp) {
152 tcptw->tw_ts_recent_stamp = get_seconds(); 152 tcptw->tw_ts_recent_stamp = get_seconds();
153 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 153 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
154 } 154 }
155 155
156 /* I am shamed, but failed to make it more elegant. 156 /* I am shamed, but failed to make it more elegant.
157 * Yes, it is direct reference to IP, which is impossible 157 * Yes, it is direct reference to IP, which is impossible
158 * to generalize to IPv6. Taking into account that IPv6 158 * to generalize to IPv6. Taking into account that IPv6
159 * do not understand recycling in any case, it not 159 * do not understand recycling in any case, it not
160 * a big problem in practice. --ANK */ 160 * a big problem in practice. --ANK */
161 if (tw->tw_family == AF_INET && 161 if (tw->tw_family == AF_INET &&
162 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && 162 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
163 tcp_v4_tw_remember_stamp(tw)) 163 tcp_v4_tw_remember_stamp(tw))
164 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, 164 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
165 TCP_TIMEWAIT_LEN); 165 TCP_TIMEWAIT_LEN);
166 else 166 else
167 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, 167 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
168 TCP_TIMEWAIT_LEN); 168 TCP_TIMEWAIT_LEN);
169 return TCP_TW_ACK; 169 return TCP_TW_ACK;
170 } 170 }
171 171
172 /* 172 /*
173 * Now real TIME-WAIT state. 173 * Now real TIME-WAIT state.
174 * 174 *
175 * RFC 1122: 175 * RFC 1122:
176 * "When a connection is [...] on TIME-WAIT state [...] 176 * "When a connection is [...] on TIME-WAIT state [...]
177 * [a TCP] MAY accept a new SYN from the remote TCP to 177 * [a TCP] MAY accept a new SYN from the remote TCP to
178 * reopen the connection directly, if it: 178 * reopen the connection directly, if it:
179 * 179 *
180 * (1) assigns its initial sequence number for the new 180 * (1) assigns its initial sequence number for the new
181 * connection to be larger than the largest sequence 181 * connection to be larger than the largest sequence
182 * number it used on the previous connection incarnation, 182 * number it used on the previous connection incarnation,
183 * and 183 * and
184 * 184 *
185 * (2) returns to TIME-WAIT state if the SYN turns out 185 * (2) returns to TIME-WAIT state if the SYN turns out
186 * to be an old duplicate". 186 * to be an old duplicate".
187 */ 187 */
188 188
189 if (!paws_reject && 189 if (!paws_reject &&
190 (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && 190 (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
191 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { 191 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
192 /* In window segment, it may be only reset or bare ack. */ 192 /* In window segment, it may be only reset or bare ack. */
193 193
194 if (th->rst) { 194 if (th->rst) {
195 /* This is TIME_WAIT assassination, in two flavors. 195 /* This is TIME_WAIT assassination, in two flavors.
196 * Oh well... nobody has a sufficient solution to this 196 * Oh well... nobody has a sufficient solution to this
197 * protocol bug yet. 197 * protocol bug yet.
198 */ 198 */
199 if (sysctl_tcp_rfc1337 == 0) { 199 if (sysctl_tcp_rfc1337 == 0) {
200 kill: 200 kill:
201 inet_twsk_deschedule(tw, &tcp_death_row); 201 inet_twsk_deschedule(tw, &tcp_death_row);
202 inet_twsk_put(tw); 202 inet_twsk_put(tw);
203 return TCP_TW_SUCCESS; 203 return TCP_TW_SUCCESS;
204 } 204 }
205 } 205 }
206 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, 206 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
207 TCP_TIMEWAIT_LEN); 207 TCP_TIMEWAIT_LEN);
208 208
209 if (tmp_opt.saw_tstamp) { 209 if (tmp_opt.saw_tstamp) {
210 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 210 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
211 tcptw->tw_ts_recent_stamp = get_seconds(); 211 tcptw->tw_ts_recent_stamp = get_seconds();
212 } 212 }
213 213
214 inet_twsk_put(tw); 214 inet_twsk_put(tw);
215 return TCP_TW_SUCCESS; 215 return TCP_TW_SUCCESS;
216 } 216 }
217 217
218 /* Out of window segment. 218 /* Out of window segment.
219 219
220 All the segments are ACKed immediately. 220 All the segments are ACKed immediately.
221 221
222 The only exception is new SYN. We accept it, if it is 222 The only exception is new SYN. We accept it, if it is
223 not old duplicate and we are not in danger to be killed 223 not old duplicate and we are not in danger to be killed
224 by delayed old duplicates. RFC check is that it has 224 by delayed old duplicates. RFC check is that it has
225 newer sequence number works at rates <40Mbit/sec. 225 newer sequence number works at rates <40Mbit/sec.
226 However, if paws works, it is reliable AND even more, 226 However, if paws works, it is reliable AND even more,
227 we even may relax silly seq space cutoff. 227 we even may relax silly seq space cutoff.
228 228
229 RED-PEN: we violate main RFC requirement, if this SYN will appear 229 RED-PEN: we violate main RFC requirement, if this SYN will appear
230 old duplicate (i.e. we receive RST in reply to SYN-ACK), 230 old duplicate (i.e. we receive RST in reply to SYN-ACK),
231 we must return socket to time-wait state. It is not good, 231 we must return socket to time-wait state. It is not good,
232 but not fatal yet. 232 but not fatal yet.
233 */ 233 */
234 234
235 if (th->syn && !th->rst && !th->ack && !paws_reject && 235 if (th->syn && !th->rst && !th->ack && !paws_reject &&
236 (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || 236 (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
237 (tmp_opt.saw_tstamp && 237 (tmp_opt.saw_tstamp &&
238 (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { 238 (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
239 u32 isn = tcptw->tw_snd_nxt + 65535 + 2; 239 u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
240 if (isn == 0) 240 if (isn == 0)
241 isn++; 241 isn++;
242 TCP_SKB_CB(skb)->when = isn; 242 TCP_SKB_CB(skb)->when = isn;
243 return TCP_TW_SYN; 243 return TCP_TW_SYN;
244 } 244 }
245 245
246 if (paws_reject) 246 if (paws_reject)
247 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); 247 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
248 248
249 if (!th->rst) { 249 if (!th->rst) {
250 /* In this case we must reset the TIMEWAIT timer. 250 /* In this case we must reset the TIMEWAIT timer.
251 * 251 *
252 * If it is ACKless SYN it may be both old duplicate 252 * If it is ACKless SYN it may be both old duplicate
253 * and new good SYN with random sequence number <rcv_nxt. 253 * and new good SYN with random sequence number <rcv_nxt.
254 * Do not reschedule in the last case. 254 * Do not reschedule in the last case.
255 */ 255 */
256 if (paws_reject || th->ack) 256 if (paws_reject || th->ack)
257 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, 257 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
258 TCP_TIMEWAIT_LEN); 258 TCP_TIMEWAIT_LEN);
259 259
260 /* Send ACK. Note, we do not put the bucket, 260 /* Send ACK. Note, we do not put the bucket,
261 * it will be released by caller. 261 * it will be released by caller.
262 */ 262 */
263 return TCP_TW_ACK; 263 return TCP_TW_ACK;
264 } 264 }
265 inet_twsk_put(tw); 265 inet_twsk_put(tw);
266 return TCP_TW_SUCCESS; 266 return TCP_TW_SUCCESS;
267 } 267 }
268 268
269 /* 269 /*
270 * Move a socket to time-wait or dead fin-wait-2 state. 270 * Move a socket to time-wait or dead fin-wait-2 state.
271 */ 271 */
272 void tcp_time_wait(struct sock *sk, int state, int timeo) 272 void tcp_time_wait(struct sock *sk, int state, int timeo)
273 { 273 {
274 struct inet_timewait_sock *tw = NULL; 274 struct inet_timewait_sock *tw = NULL;
275 const struct inet_connection_sock *icsk = inet_csk(sk); 275 const struct inet_connection_sock *icsk = inet_csk(sk);
276 const struct tcp_sock *tp = tcp_sk(sk); 276 const struct tcp_sock *tp = tcp_sk(sk);
277 int recycle_ok = 0; 277 int recycle_ok = 0;
278 278
279 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 279 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
280 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); 280 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
281 281
282 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 282 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
283 tw = inet_twsk_alloc(sk, state); 283 tw = inet_twsk_alloc(sk, state);
284 284
285 if (tw != NULL) { 285 if (tw != NULL) {
286 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 286 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
287 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 287 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
288 288
289 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 289 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
290 tcptw->tw_rcv_nxt = tp->rcv_nxt; 290 tcptw->tw_rcv_nxt = tp->rcv_nxt;
291 tcptw->tw_snd_nxt = tp->snd_nxt; 291 tcptw->tw_snd_nxt = tp->snd_nxt;
292 tcptw->tw_rcv_wnd = tcp_receive_window(tp); 292 tcptw->tw_rcv_wnd = tcp_receive_window(tp);
293 tcptw->tw_ts_recent = tp->rx_opt.ts_recent; 293 tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
294 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 294 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
295 295
296 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 296 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
297 if (tw->tw_family == PF_INET6) { 297 if (tw->tw_family == PF_INET6) {
298 struct ipv6_pinfo *np = inet6_sk(sk); 298 struct ipv6_pinfo *np = inet6_sk(sk);
299 struct inet6_timewait_sock *tw6; 299 struct inet6_timewait_sock *tw6;
300 300
301 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); 301 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
302 tw6 = inet6_twsk((struct sock *)tw); 302 tw6 = inet6_twsk((struct sock *)tw);
303 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); 303 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
304 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); 304 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
305 tw->tw_ipv6only = np->ipv6only; 305 tw->tw_ipv6only = np->ipv6only;
306 } 306 }
307 #endif 307 #endif
308 308
309 #ifdef CONFIG_TCP_MD5SIG 309 #ifdef CONFIG_TCP_MD5SIG
310 /* 310 /*
311 * The timewait bucket does not have the key DB from the 311 * The timewait bucket does not have the key DB from the
312 * sock structure. We just make a quick copy of the 312 * sock structure. We just make a quick copy of the
313 * md5 key being used (if indeed we are using one) 313 * md5 key being used (if indeed we are using one)
314 * so the timewait ack generating code has the key. 314 * so the timewait ack generating code has the key.
315 */ 315 */
316 do { 316 do {
317 struct tcp_md5sig_key *key; 317 struct tcp_md5sig_key *key;
318 memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); 318 memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
319 tcptw->tw_md5_keylen = 0; 319 tcptw->tw_md5_keylen = 0;
320 key = tp->af_specific->md5_lookup(sk, sk); 320 key = tp->af_specific->md5_lookup(sk, sk);
321 if (key != NULL) { 321 if (key != NULL) {
322 memcpy(&tcptw->tw_md5_key, key->key, key->keylen); 322 memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
323 tcptw->tw_md5_keylen = key->keylen; 323 tcptw->tw_md5_keylen = key->keylen;
324 if (tcp_alloc_md5sig_pool() == NULL) 324 if (tcp_alloc_md5sig_pool() == NULL)
325 BUG(); 325 BUG();
326 } 326 }
327 } while (0); 327 } while (0);
328 #endif 328 #endif
329 329
330 /* Linkage updates. */ 330 /* Linkage updates. */
331 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); 331 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
332 332
333 /* Get the TIME_WAIT timeout firing. */ 333 /* Get the TIME_WAIT timeout firing. */
334 if (timeo < rto) 334 if (timeo < rto)
335 timeo = rto; 335 timeo = rto;
336 336
337 if (recycle_ok) { 337 if (recycle_ok) {
338 tw->tw_timeout = rto; 338 tw->tw_timeout = rto;
339 } else { 339 } else {
340 tw->tw_timeout = TCP_TIMEWAIT_LEN; 340 tw->tw_timeout = TCP_TIMEWAIT_LEN;
341 if (state == TCP_TIME_WAIT) 341 if (state == TCP_TIME_WAIT)
342 timeo = TCP_TIMEWAIT_LEN; 342 timeo = TCP_TIMEWAIT_LEN;
343 } 343 }
344 344
345 inet_twsk_schedule(tw, &tcp_death_row, timeo, 345 inet_twsk_schedule(tw, &tcp_death_row, timeo,
346 TCP_TIMEWAIT_LEN); 346 TCP_TIMEWAIT_LEN);
347 inet_twsk_put(tw); 347 inet_twsk_put(tw);
348 } else { 348 } else {
349 /* Sorry, if we're out of memory, just CLOSE this 349 /* Sorry, if we're out of memory, just CLOSE this
350 * socket up. We've got bigger problems than 350 * socket up. We've got bigger problems than
351 * non-graceful socket closings. 351 * non-graceful socket closings.
352 */ 352 */
353 LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); 353 LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
354 } 354 }
355 355
356 tcp_update_metrics(sk); 356 tcp_update_metrics(sk);
357 tcp_done(sk); 357 tcp_done(sk);
358 } 358 }
359 359
360 void tcp_twsk_destructor(struct sock *sk) 360 void tcp_twsk_destructor(struct sock *sk)
361 { 361 {
362 #ifdef CONFIG_TCP_MD5SIG 362 #ifdef CONFIG_TCP_MD5SIG
363 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 363 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
364 if (twsk->tw_md5_keylen) 364 if (twsk->tw_md5_keylen)
365 tcp_put_md5sig_pool(); 365 tcp_put_md5sig_pool();
366 #endif 366 #endif
367 } 367 }
368 368
369 EXPORT_SYMBOL_GPL(tcp_twsk_destructor); 369 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
370 370
371 static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, 371 static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
372 struct request_sock *req) 372 struct request_sock *req)
373 { 373 {
374 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; 374 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
375 } 375 }
376 376
377 /* This is not only more efficient than what we used to do, it eliminates 377 /* This is not only more efficient than what we used to do, it eliminates
378 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 378 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
379 * 379 *
380 * Actually, we could avoid lots of memory writes here. tp of listening 380 * Actually, we could avoid lots of memory writes here. tp of listening
381 * socket contains all necessary default parameters. 381 * socket contains all necessary default parameters.
382 */ 382 */
383 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) 383 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
384 { 384 {
385 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); 385 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
386 386
387 if (newsk != NULL) { 387 if (newsk != NULL) {
388 const struct inet_request_sock *ireq = inet_rsk(req); 388 const struct inet_request_sock *ireq = inet_rsk(req);
389 struct tcp_request_sock *treq = tcp_rsk(req); 389 struct tcp_request_sock *treq = tcp_rsk(req);
390 struct inet_connection_sock *newicsk = inet_csk(newsk); 390 struct inet_connection_sock *newicsk = inet_csk(newsk);
391 struct tcp_sock *newtp; 391 struct tcp_sock *newtp;
392 392
393 /* Now setup tcp_sock */ 393 /* Now setup tcp_sock */
394 newtp = tcp_sk(newsk); 394 newtp = tcp_sk(newsk);
395 newtp->pred_flags = 0; 395 newtp->pred_flags = 0;
396 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; 396 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
397 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; 397 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
398 newtp->snd_up = treq->snt_isn + 1; 398 newtp->snd_up = treq->snt_isn + 1;
399 399
400 tcp_prequeue_init(newtp); 400 tcp_prequeue_init(newtp);
401 401
402 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); 402 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
403 403
404 newtp->srtt = 0; 404 newtp->srtt = 0;
405 newtp->mdev = TCP_TIMEOUT_INIT; 405 newtp->mdev = TCP_TIMEOUT_INIT;
406 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 406 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
407 407
408 newtp->packets_out = 0; 408 newtp->packets_out = 0;
409 newtp->retrans_out = 0; 409 newtp->retrans_out = 0;
410 newtp->sacked_out = 0; 410 newtp->sacked_out = 0;
411 newtp->fackets_out = 0; 411 newtp->fackets_out = 0;
412 newtp->snd_ssthresh = 0x7fffffff; 412 newtp->snd_ssthresh = 0x7fffffff;
413 413
414 /* So many TCP implementations out there (incorrectly) count the 414 /* So many TCP implementations out there (incorrectly) count the
415 * initial SYN frame in their delayed-ACK and congestion control 415 * initial SYN frame in their delayed-ACK and congestion control
416 * algorithms that we must have the following bandaid to talk 416 * algorithms that we must have the following bandaid to talk
417 * efficiently to them. -DaveM 417 * efficiently to them. -DaveM
418 */ 418 */
419 newtp->snd_cwnd = 2; 419 newtp->snd_cwnd = 2;
420 newtp->snd_cwnd_cnt = 0; 420 newtp->snd_cwnd_cnt = 0;
421 newtp->bytes_acked = 0; 421 newtp->bytes_acked = 0;
422 422
423 newtp->frto_counter = 0; 423 newtp->frto_counter = 0;
424 newtp->frto_highmark = 0; 424 newtp->frto_highmark = 0;
425 425
426 newicsk->icsk_ca_ops = &tcp_init_congestion_ops; 426 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
427 427
428 tcp_set_ca_state(newsk, TCP_CA_Open); 428 tcp_set_ca_state(newsk, TCP_CA_Open);
429 tcp_init_xmit_timers(newsk); 429 tcp_init_xmit_timers(newsk);
430 skb_queue_head_init(&newtp->out_of_order_queue); 430 skb_queue_head_init(&newtp->out_of_order_queue);
431 newtp->write_seq = treq->snt_isn + 1; 431 newtp->write_seq = treq->snt_isn + 1;
432 newtp->pushed_seq = newtp->write_seq; 432 newtp->pushed_seq = newtp->write_seq;
433 433
434 newtp->rx_opt.saw_tstamp = 0; 434 newtp->rx_opt.saw_tstamp = 0;
435 435
436 newtp->rx_opt.dsack = 0; 436 newtp->rx_opt.dsack = 0;
437 newtp->rx_opt.eff_sacks = 0; 437 newtp->rx_opt.eff_sacks = 0;
438 438
439 newtp->rx_opt.num_sacks = 0; 439 newtp->rx_opt.num_sacks = 0;
440 newtp->urg_data = 0; 440 newtp->urg_data = 0;
441 441
442 if (sock_flag(newsk, SOCK_KEEPOPEN)) 442 if (sock_flag(newsk, SOCK_KEEPOPEN))
443 inet_csk_reset_keepalive_timer(newsk, 443 inet_csk_reset_keepalive_timer(newsk,
444 keepalive_time_when(newtp)); 444 keepalive_time_when(newtp));
445 445
446 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; 446 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
447 if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { 447 if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
448 if (sysctl_tcp_fack) 448 if (sysctl_tcp_fack)
449 tcp_enable_fack(newtp); 449 tcp_enable_fack(newtp);
450 } 450 }
451 newtp->window_clamp = req->window_clamp; 451 newtp->window_clamp = req->window_clamp;
452 newtp->rcv_ssthresh = req->rcv_wnd; 452 newtp->rcv_ssthresh = req->rcv_wnd;
453 newtp->rcv_wnd = req->rcv_wnd; 453 newtp->rcv_wnd = req->rcv_wnd;
454 newtp->rx_opt.wscale_ok = ireq->wscale_ok; 454 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
455 if (newtp->rx_opt.wscale_ok) { 455 if (newtp->rx_opt.wscale_ok) {
456 newtp->rx_opt.snd_wscale = ireq->snd_wscale; 456 newtp->rx_opt.snd_wscale = ireq->snd_wscale;
457 newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; 457 newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
458 } else { 458 } else {
459 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; 459 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
460 newtp->window_clamp = min(newtp->window_clamp, 65535U); 460 newtp->window_clamp = min(newtp->window_clamp, 65535U);
461 } 461 }
462 newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << 462 newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
463 newtp->rx_opt.snd_wscale); 463 newtp->rx_opt.snd_wscale);
464 newtp->max_window = newtp->snd_wnd; 464 newtp->max_window = newtp->snd_wnd;
465 465
466 if (newtp->rx_opt.tstamp_ok) { 466 if (newtp->rx_opt.tstamp_ok) {
467 newtp->rx_opt.ts_recent = req->ts_recent; 467 newtp->rx_opt.ts_recent = req->ts_recent;
468 newtp->rx_opt.ts_recent_stamp = get_seconds(); 468 newtp->rx_opt.ts_recent_stamp = get_seconds();
469 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 469 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
470 } else { 470 } else {
471 newtp->rx_opt.ts_recent_stamp = 0; 471 newtp->rx_opt.ts_recent_stamp = 0;
472 newtp->tcp_header_len = sizeof(struct tcphdr); 472 newtp->tcp_header_len = sizeof(struct tcphdr);
473 } 473 }
474 #ifdef CONFIG_TCP_MD5SIG 474 #ifdef CONFIG_TCP_MD5SIG
475 newtp->md5sig_info = NULL; /*XXX*/ 475 newtp->md5sig_info = NULL; /*XXX*/
476 if (newtp->af_specific->md5_lookup(sk, newsk)) 476 if (newtp->af_specific->md5_lookup(sk, newsk))
477 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; 477 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
478 #endif 478 #endif
479 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) 479 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
480 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 480 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
481 newtp->rx_opt.mss_clamp = req->mss; 481 newtp->rx_opt.mss_clamp = req->mss;
482 TCP_ECN_openreq_child(newtp, req); 482 TCP_ECN_openreq_child(newtp, req);
483 483
484 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 484 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
485 } 485 }
486 return newsk; 486 return newsk;
487 } 487 }
488 488
489 /* 489 /*
490 * Process an incoming packet for SYN_RECV sockets represented 490 * Process an incoming packet for SYN_RECV sockets represented
491 * as a request_sock. 491 * as a request_sock.
492 */ 492 */
493 493
494 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, 494 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
495 struct request_sock *req, 495 struct request_sock *req,
496 struct request_sock **prev) 496 struct request_sock **prev)
497 { 497 {
498 const struct tcphdr *th = tcp_hdr(skb); 498 const struct tcphdr *th = tcp_hdr(skb);
499 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 499 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
500 int paws_reject = 0; 500 int paws_reject = 0;
501 struct tcp_options_received tmp_opt; 501 struct tcp_options_received tmp_opt;
502 struct sock *child; 502 struct sock *child;
503 503
504 tmp_opt.saw_tstamp = 0; 504 tmp_opt.saw_tstamp = 0;
505 if (th->doff > (sizeof(struct tcphdr)>>2)) { 505 if (th->doff > (sizeof(struct tcphdr)>>2)) {
506 tcp_parse_options(skb, &tmp_opt, 0); 506 tcp_parse_options(skb, &tmp_opt, 0);
507 507
508 if (tmp_opt.saw_tstamp) { 508 if (tmp_opt.saw_tstamp) {
509 tmp_opt.ts_recent = req->ts_recent; 509 tmp_opt.ts_recent = req->ts_recent;
510 /* We do not store true stamp, but it is not required, 510 /* We do not store true stamp, but it is not required,
511 * it can be estimated (approximately) 511 * it can be estimated (approximately)
512 * from other data. 512 * from other data.
513 */ 513 */
514 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); 514 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
515 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 515 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
516 } 516 }
517 } 517 }
518 518
519 /* Check for pure retransmitted SYN. */ 519 /* Check for pure retransmitted SYN. */
520 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && 520 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
521 flg == TCP_FLAG_SYN && 521 flg == TCP_FLAG_SYN &&
522 !paws_reject) { 522 !paws_reject) {
523 /* 523 /*
524 * RFC793 draws (Incorrectly! It was fixed in RFC1122) 524 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
525 * this case on figure 6 and figure 8, but formal 525 * this case on figure 6 and figure 8, but formal
526 * protocol description says NOTHING. 526 * protocol description says NOTHING.
527 * To be more exact, it says that we should send ACK, 527 * To be more exact, it says that we should send ACK,
528 * because this segment (at least, if it has no data) 528 * because this segment (at least, if it has no data)
529 * is out of window. 529 * is out of window.
530 * 530 *
531 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT 531 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
532 * describe SYN-RECV state. All the description 532 * describe SYN-RECV state. All the description
533 * is wrong; we cannot trust it and should 533 * is wrong; we cannot trust it and should
534 * rely only on common sense and implementation 534 * rely only on common sense and implementation
535 * experience. 535 * experience.
536 * 536 *
537 * Enforce "SYN-ACK" according to figure 8, figure 6 537 * Enforce "SYN-ACK" according to figure 8, figure 6
538 * of RFC793, fixed by RFC1122. 538 * of RFC793, fixed by RFC1122.
539 */ 539 */
540 req->rsk_ops->rtx_syn_ack(sk, req); 540 req->rsk_ops->rtx_syn_ack(sk, req);
541 return NULL; 541 return NULL;
542 } 542 }
543 543
544 /* Further reproduces section "SEGMENT ARRIVES" 544 /* Further reproduces section "SEGMENT ARRIVES"
545 for state SYN-RECEIVED of RFC793. 545 for state SYN-RECEIVED of RFC793.
546 It is broken; however, it fails only 546 It is broken; however, it fails only
547 when SYNs are crossed. 547 when SYNs are crossed.
548 548
549 You would think that SYN crossing is impossible here, since 549 You would think that SYN crossing is impossible here, since
550 we should have a SYN_SENT socket (from connect()) on our end, 550 we should have a SYN_SENT socket (from connect()) on our end,
551 but this is not true if the crossed SYNs were sent to both 551 but this is not true if the crossed SYNs were sent to both
552 ends by a malicious third party. We must defend against this, 552 ends by a malicious third party. We must defend against this,
553 and to do that we first verify the ACK (as per RFC793, page 553 and to do that we first verify the ACK (as per RFC793, page
554 36) and reset if it is invalid. Is this a true full defense? 554 36) and reset if it is invalid. Is this a true full defense?
555 To convince ourselves, let us consider a way in which the ACK 555 To convince ourselves, let us consider a way in which the ACK
556 test can still pass in this 'malicious crossed SYNs' case. 556 test can still pass in this 'malicious crossed SYNs' case.
557 Malicious sender sends identical SYNs (and thus identical sequence 557 Malicious sender sends identical SYNs (and thus identical sequence
558 numbers) to both A and B: 558 numbers) to both A and B:
559 559
560 A: gets SYN, seq=7 560 A: gets SYN, seq=7
561 B: gets SYN, seq=7 561 B: gets SYN, seq=7
562 562
563 By our good fortune, both A and B select the same initial 563 By our good fortune, both A and B select the same initial
564 send sequence number of seven :-) 564 send sequence number of seven :-)
565 565
566 A: sends SYN|ACK, seq=7, ack_seq=8 566 A: sends SYN|ACK, seq=7, ack_seq=8
567 B: sends SYN|ACK, seq=7, ack_seq=8 567 B: sends SYN|ACK, seq=7, ack_seq=8
568 568
569 So we are now A eating this SYN|ACK, ACK test passes. So 569 So we are now A eating this SYN|ACK, ACK test passes. So
570 does sequence test, SYN is truncated, and thus we consider 570 does sequence test, SYN is truncated, and thus we consider
571 it a bare ACK. 571 it a bare ACK.
572 572
573 If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this 573 If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
574 bare ACK. Otherwise, we create an established connection. Both 574 bare ACK. Otherwise, we create an established connection. Both
575 ends (listening sockets) accept the new incoming connection and try 575 ends (listening sockets) accept the new incoming connection and try
576 to talk to each other. 8-) 576 to talk to each other. 8-)
577 577
578 Note: This case is both harmless and rare. The probability is about the 578 Note: This case is both harmless and rare. The probability is about the
579 same as us discovering intelligent life on another planet tomorrow. 579 same as us discovering intelligent life on another planet tomorrow.
580 580
581 But generally, we should (the RFC lies!) accept an ACK 581 But generally, we should (the RFC lies!) accept an ACK
582 from SYNACK both here and in tcp_rcv_state_process(). 582 from SYNACK both here and in tcp_rcv_state_process().
583 tcp_rcv_state_process() does not, hence we do not either. 583 tcp_rcv_state_process() does not, hence we do not either.
584 584
585 Note that the case is absolutely generic: 585 Note that the case is absolutely generic:
586 we cannot optimize anything here without 586 we cannot optimize anything here without
587 violating protocol. All the checks must be made 587 violating protocol. All the checks must be made
588 before attempt to create socket. 588 before attempt to create socket.
589 */ 589 */
590 590
591 /* RFC793 page 36: "If the connection is in any non-synchronized state ... 591 /* RFC793 page 36: "If the connection is in any non-synchronized state ...
592 * and the incoming segment acknowledges something not yet 592 * and the incoming segment acknowledges something not yet
593 * sent (the segment carries an unacceptable ACK) ... 593 * sent (the segment carries an unacceptable ACK) ...
594 * a reset is sent." 594 * a reset is sent."
595 * 595 *
596 * Invalid ACK: reset will be sent by listening socket 596 * Invalid ACK: reset will be sent by listening socket
597 */ 597 */
598 if ((flg & TCP_FLAG_ACK) && 598 if ((flg & TCP_FLAG_ACK) &&
599 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) 599 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
600 return sk; 600 return sk;
601 601
602 /* Also, it would not be a bad idea to check rcv_tsecr, which 602 /* Also, it would not be a bad idea to check rcv_tsecr, which
603 * is essentially an ACK extension; too early or too late values 603 * is essentially an ACK extension; too early or too late values
604 * should cause a reset in unsynchronized states. 604 * should cause a reset in unsynchronized states.
605 */ 605 */
606 606
607 /* RFC793: "first check sequence number". */ 607 /* RFC793: "first check sequence number". */
608 608
609 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 609 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
610 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 610 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
611 /* Out of window: send ACK and drop. */ 611 /* Out of window: send ACK and drop. */
612 if (!(flg & TCP_FLAG_RST)) 612 if (!(flg & TCP_FLAG_RST))
613 req->rsk_ops->send_ack(sk, skb, req); 613 req->rsk_ops->send_ack(sk, skb, req);
614 if (paws_reject) 614 if (paws_reject)
615 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 615 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
616 return NULL; 616 return NULL;
617 } 617 }
618 618
619 /* In sequence, PAWS is OK. */ 619 /* In sequence, PAWS is OK. */
620 620
621 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 621 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
622 req->ts_recent = tmp_opt.rcv_tsval; 622 req->ts_recent = tmp_opt.rcv_tsval;
623 623
624 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 624 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
625 /* Truncate SYN, it is out of window starting 625 /* Truncate SYN, it is out of window starting
626 at tcp_rsk(req)->rcv_isn + 1. */ 626 at tcp_rsk(req)->rcv_isn + 1. */
627 flg &= ~TCP_FLAG_SYN; 627 flg &= ~TCP_FLAG_SYN;
628 } 628 }
629 629
630 /* RFC793: "second check the RST bit" and 630 /* RFC793: "second check the RST bit" and
631 * "fourth, check the SYN bit" 631 * "fourth, check the SYN bit"
632 */ 632 */
633 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { 633 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
634 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 634 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
635 goto embryonic_reset; 635 goto embryonic_reset;
636 } 636 }
637 637
638 /* ACK sequence verified above, just make sure ACK is 638 /* ACK sequence verified above, just make sure ACK is
639 * set. If ACK not set, just silently drop the packet. 639 * set. If ACK not set, just silently drop the packet.
640 */ 640 */
641 if (!(flg & TCP_FLAG_ACK)) 641 if (!(flg & TCP_FLAG_ACK))
642 return NULL; 642 return NULL;
643 643
644 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ 644 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
645 if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 645 if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
646 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 646 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
647 inet_rsk(req)->acked = 1; 647 inet_rsk(req)->acked = 1;
648 return NULL; 648 return NULL;
649 } 649 }
650 650
651 /* OK, ACK is valid, create big socket and 651 /* OK, ACK is valid, create big socket and
652 * feed this segment to it. It will repeat all 652 * feed this segment to it. It will repeat all
653 * the tests. THIS SEGMENT MUST MOVE SOCKET TO 653 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
654 * ESTABLISHED STATE. If it is dropped after the 654 * ESTABLISHED STATE. If it is dropped after the
655 * socket is created, expect trouble. 655 * socket is created, expect trouble.
656 */ 656 */
657 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); 657 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
658 if (child == NULL) 658 if (child == NULL)
659 goto listen_overflow; 659 goto listen_overflow;
660 #ifdef CONFIG_TCP_MD5SIG 660 #ifdef CONFIG_TCP_MD5SIG
661 else { 661 else {
662 /* Copy over the MD5 key from the original socket */ 662 /* Copy over the MD5 key from the original socket */
663 struct tcp_md5sig_key *key; 663 struct tcp_md5sig_key *key;
664 struct tcp_sock *tp = tcp_sk(sk); 664 struct tcp_sock *tp = tcp_sk(sk);
665 key = tp->af_specific->md5_lookup(sk, child); 665 key = tp->af_specific->md5_lookup(sk, child);
666 if (key != NULL) { 666 if (key != NULL) {
667 /* 667 /*
668 * We're using one, so create a matching key on the 668 * We're using one, so create a matching key on the
669 * newsk structure. If we fail to get memory then we 669 * newsk structure. If we fail to get memory then we
670 * end up not copying the key across. Shucks. 670 * end up not copying the key across. Shucks.
671 */ 671 */
672 char *newkey = kmemdup(key->key, key->keylen, 672 char *newkey = kmemdup(key->key, key->keylen,
673 GFP_ATOMIC); 673 GFP_ATOMIC);
674 if (newkey) { 674 if (newkey) {
675 if (!tcp_alloc_md5sig_pool()) 675 if (!tcp_alloc_md5sig_pool())
676 BUG(); 676 BUG();
677 tp->af_specific->md5_add(child, child, newkey, 677 tp->af_specific->md5_add(child, child, newkey,
678 key->keylen); 678 key->keylen);
679 } 679 }
680 } 680 }
681 } 681 }
682 #endif 682 #endif
683 683
684 inet_csk_reqsk_queue_unlink(sk, req, prev); 684 inet_csk_reqsk_queue_unlink(sk, req, prev);
685 inet_csk_reqsk_queue_removed(sk, req); 685 inet_csk_reqsk_queue_removed(sk, req);
686 686
687 inet_csk_reqsk_queue_add(sk, req, child); 687 inet_csk_reqsk_queue_add(sk, req, child);
688 return child; 688 return child;
689 689
690 listen_overflow: 690 listen_overflow:
691 if (!sysctl_tcp_abort_on_overflow) { 691 if (!sysctl_tcp_abort_on_overflow) {
692 inet_rsk(req)->acked = 1; 692 inet_rsk(req)->acked = 1;
693 return NULL; 693 return NULL;
694 } 694 }
695 695
696 embryonic_reset: 696 embryonic_reset:
697 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 697 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
698 if (!(flg & TCP_FLAG_RST)) 698 if (!(flg & TCP_FLAG_RST))
699 req->rsk_ops->send_reset(sk, skb); 699 req->rsk_ops->send_reset(sk, skb);
700 700
701 inet_csk_reqsk_queue_drop(sk, req, prev); 701 inet_csk_reqsk_queue_drop(sk, req, prev);
702 return NULL; 702 return NULL;
703 } 703 }
704 704
705 /* 705 /*
706 * Queue segment on the new socket if the new socket is active, 706 * Queue segment on the new socket if the new socket is active,
707 * otherwise we just shortcircuit this and continue with 707 * otherwise we just shortcircuit this and continue with
708 * the new socket. 708 * the new socket.
709 */ 709 */
710 710
711 int tcp_child_process(struct sock *parent, struct sock *child, 711 int tcp_child_process(struct sock *parent, struct sock *child,
712 struct sk_buff *skb) 712 struct sk_buff *skb)
713 { 713 {
714 int ret = 0; 714 int ret = 0;
715 int state = child->sk_state; 715 int state = child->sk_state;
716 716
717 if (!sock_owned_by_user(child)) { 717 if (!sock_owned_by_user(child)) {
718 ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), 718 ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
719 skb->len); 719 skb->len);
720 /* Wakeup parent, send SIGIO */ 720 /* Wakeup parent, send SIGIO */
721 if (state == TCP_SYN_RECV && child->sk_state != state) 721 if (state == TCP_SYN_RECV && child->sk_state != state)
722 parent->sk_data_ready(parent, 0); 722 parent->sk_data_ready(parent, 0);
723 } else { 723 } else {
724 /* Alas, it is possible again, because we do the lookup 724 /* Alas, it is possible again, because we do the lookup
725 * in the main socket hash table and the lock on the 725 * in the main socket hash table and the lock on the
726 * listening socket no longer protects us. 726 * listening socket no longer protects us.
727 */ 727 */
728 sk_add_backlog(child, skb); 728 sk_add_backlog(child, skb);
729 } 729 }
730 730
731 bh_unlock_sock(child); 731 bh_unlock_sock(child);
732 sock_put(child); 732 sock_put(child);
733 return ret; 733 return ret;
734 } 734 }
735 735
736 EXPORT_SYMBOL(tcp_check_req); 736 EXPORT_SYMBOL(tcp_check_req);
737 EXPORT_SYMBOL(tcp_child_process); 737 EXPORT_SYMBOL(tcp_child_process);
738 EXPORT_SYMBOL(tcp_create_openreq_child); 738 EXPORT_SYMBOL(tcp_create_openreq_child);
739 EXPORT_SYMBOL(tcp_timewait_state_process); 739 EXPORT_SYMBOL(tcp_timewait_state_process);
740 740
net/ipv4/tcp_yeah.c
1 /* 1 /*
2 * 2 *
3 * YeAH TCP 3 * YeAH TCP
4 * 4 *
5 * For further details look at: 5 * For further details look at:
6 * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf 6 * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
7 * 7 *
8 */ 8 */
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/skbuff.h> 11 #include <linux/skbuff.h>
12 #include <linux/inet_diag.h> 12 #include <linux/inet_diag.h>
13 13
14 #include <net/tcp.h> 14 #include <net/tcp.h>
15 15
16 #include "tcp_vegas.h" 16 #include "tcp_vegas.h"
17 17
18 #define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck 18 #define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck
19 #define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt 19 #define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt
20 #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss 20 #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
21 #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion 21 #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
22 #define TCP_YEAH_PHY 8 //lin maximum delta from base 22 #define TCP_YEAH_PHY 8 //lin maximum delta from base
23 #define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss 23 #define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss
24 #define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count 24 #define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count
25 25
26 #define TCP_SCALABLE_AI_CNT 100U 26 #define TCP_SCALABLE_AI_CNT 100U
27 27
28 /* YeAH variables */ 28 /* YeAH variables */
29 struct yeah { 29 struct yeah {
30 struct vegas vegas; /* must be first */ 30 struct vegas vegas; /* must be first */
31 31
32 /* YeAH */ 32 /* YeAH */
33 u32 lastQ; 33 u32 lastQ;
34 u32 doing_reno_now; 34 u32 doing_reno_now;
35 35
36 u32 reno_count; 36 u32 reno_count;
37 u32 fast_count; 37 u32 fast_count;
38 38
39 u32 pkts_acked; 39 u32 pkts_acked;
40 }; 40 };
41 41
42 static void tcp_yeah_init(struct sock *sk) 42 static void tcp_yeah_init(struct sock *sk)
43 { 43 {
44 struct tcp_sock *tp = tcp_sk(sk); 44 struct tcp_sock *tp = tcp_sk(sk);
45 struct yeah *yeah = inet_csk_ca(sk); 45 struct yeah *yeah = inet_csk_ca(sk);
46 46
47 tcp_vegas_init(sk); 47 tcp_vegas_init(sk);
48 48
49 yeah->doing_reno_now = 0; 49 yeah->doing_reno_now = 0;
50 yeah->lastQ = 0; 50 yeah->lastQ = 0;
51 51
52 yeah->reno_count = 2; 52 yeah->reno_count = 2;
53 53
54 /* Ensure the MD arithmetic works. This is somewhat pedantic, 54 /* Ensure the MD arithmetic works. This is somewhat pedantic,
55 * since I don't think we will see a cwnd this large. :) */ 55 * since I don't think we will see a cwnd this large. :) */
56 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); 56 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
57 57
58 } 58 }
59 59
60 60
61 static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) 61 static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
62 { 62 {
63 const struct inet_connection_sock *icsk = inet_csk(sk); 63 const struct inet_connection_sock *icsk = inet_csk(sk);
64 struct yeah *yeah = inet_csk_ca(sk); 64 struct yeah *yeah = inet_csk_ca(sk);
65 65
66 if (icsk->icsk_ca_state == TCP_CA_Open) 66 if (icsk->icsk_ca_state == TCP_CA_Open)
67 yeah->pkts_acked = pkts_acked; 67 yeah->pkts_acked = pkts_acked;
68 68
69 tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); 69 tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
70 } 70 }
71 71
72 static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) 72 static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
73 { 73 {
74 struct tcp_sock *tp = tcp_sk(sk); 74 struct tcp_sock *tp = tcp_sk(sk);
75 struct yeah *yeah = inet_csk_ca(sk); 75 struct yeah *yeah = inet_csk_ca(sk);
76 76
77 if (!tcp_is_cwnd_limited(sk, in_flight)) 77 if (!tcp_is_cwnd_limited(sk, in_flight))
78 return; 78 return;
79 79
80 if (tp->snd_cwnd <= tp->snd_ssthresh) 80 if (tp->snd_cwnd <= tp->snd_ssthresh)
81 tcp_slow_start(tp); 81 tcp_slow_start(tp);
82 82
83 else if (!yeah->doing_reno_now) { 83 else if (!yeah->doing_reno_now) {
84 /* Scalable */ 84 /* Scalable */
85 85
86 tp->snd_cwnd_cnt+=yeah->pkts_acked; 86 tp->snd_cwnd_cnt += yeah->pkts_acked;
87 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ 87 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
88 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 88 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
89 tp->snd_cwnd++; 89 tp->snd_cwnd++;
90 tp->snd_cwnd_cnt = 0; 90 tp->snd_cwnd_cnt = 0;
91 } 91 }
92 92
93 yeah->pkts_acked = 1; 93 yeah->pkts_acked = 1;
94 94
95 } else { 95 } else {
96 /* Reno */ 96 /* Reno */
97 97
98 if (tp->snd_cwnd_cnt < tp->snd_cwnd) 98 if (tp->snd_cwnd_cnt < tp->snd_cwnd)
99 tp->snd_cwnd_cnt++; 99 tp->snd_cwnd_cnt++;
100 100
101 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 101 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
102 tp->snd_cwnd++; 102 tp->snd_cwnd++;
103 tp->snd_cwnd_cnt = 0; 103 tp->snd_cwnd_cnt = 0;
104 } 104 }
105 } 105 }
106 106
107 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. 107 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
108 * 108 *
109 * These are so named because they represent the approximate values 109 * These are so named because they represent the approximate values
110 * of snd_una and snd_nxt at the beginning of the current RTT. More 110 * of snd_una and snd_nxt at the beginning of the current RTT. More
111 * precisely, they represent the amount of data sent during the RTT. 111 * precisely, they represent the amount of data sent during the RTT.
112 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, 112 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
113 * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding 113 * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding
114 * bytes of data have been ACKed during the course of the RTT, giving 114 * bytes of data have been ACKed during the course of the RTT, giving
115 * an "actual" rate of: 115 * an "actual" rate of:
116 * 116 *
117 * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) 117 * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration)
118 * 118 *
119 * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, 119 * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una,
120 * because delayed ACKs can cover more than one segment, so they 120 * because delayed ACKs can cover more than one segment, so they
121 * don't line up exactly with the boundaries of RTTs. 121 * don't line up exactly with the boundaries of RTTs.
122 * 122 *
123 * Another unfortunate fact of life is that delayed ACKs delay the 123 * Another unfortunate fact of life is that delayed ACKs delay the
124 * advance of the left edge of our send window, so that the number 124 * advance of the left edge of our send window, so that the number
125 * of bytes we send in an RTT is often less than our cwnd will allow. 125 * of bytes we send in an RTT is often less than our cwnd will allow.
126 * So we keep track of our cwnd separately, in v_beg_snd_cwnd. 126 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
127 */ 127 */
128 128
129 if (after(ack, yeah->vegas.beg_snd_nxt)) { 129 if (after(ack, yeah->vegas.beg_snd_nxt)) {
130 130
131 /* We do the Vegas calculations only if we got enough RTT 131 /* We do the Vegas calculations only if we got enough RTT
132 * samples that we can be reasonably sure that we got 132 * samples that we can be reasonably sure that we got
133 * at least one RTT sample that wasn't from a delayed ACK. 133 * at least one RTT sample that wasn't from a delayed ACK.
134 * If we only had 2 samples total, 134 * If we only had 2 samples total,
135 * then that means we're getting only 1 ACK per RTT, which 135 * then that means we're getting only 1 ACK per RTT, which
136 * means they're almost certainly delayed ACKs. 136 * means they're almost certainly delayed ACKs.
137 * If we have 3 samples, we should be OK. 137 * If we have 3 samples, we should be OK.
138 */ 138 */
139 139
140 if (yeah->vegas.cntRTT > 2) { 140 if (yeah->vegas.cntRTT > 2) {
141 u32 rtt, queue; 141 u32 rtt, queue;
142 u64 bw; 142 u64 bw;
143 143
144 /* We have enough RTT samples, so, using the Vegas 144 /* We have enough RTT samples, so, using the Vegas
145 * algorithm, we determine if we should increase or 145 * algorithm, we determine if we should increase or
146 * decrease cwnd, and by how much. 146 * decrease cwnd, and by how much.
147 */ 147 */
148 148
149 /* Pluck out the RTT we are using for the Vegas 149 /* Pluck out the RTT we are using for the Vegas
150 * calculations. This is the min RTT seen during the 150 * calculations. This is the min RTT seen during the
151 * last RTT. Taking the min filters out the effects 151 * last RTT. Taking the min filters out the effects
152 * of delayed ACKs, at the cost of noticing congestion 152 * of delayed ACKs, at the cost of noticing congestion
153 * a bit later. 153 * a bit later.
154 */ 154 */
155 rtt = yeah->vegas.minRTT; 155 rtt = yeah->vegas.minRTT;
156 156
157 /* Compute excess number of packets above bandwidth 157 /* Compute excess number of packets above bandwidth
158 * Avoid doing full 64 bit divide. 158 * Avoid doing full 64 bit divide.
159 */ 159 */
160 bw = tp->snd_cwnd; 160 bw = tp->snd_cwnd;
161 bw *= rtt - yeah->vegas.baseRTT; 161 bw *= rtt - yeah->vegas.baseRTT;
162 do_div(bw, rtt); 162 do_div(bw, rtt);
163 queue = bw; 163 queue = bw;
164 164
165 if (queue > TCP_YEAH_ALPHA || 165 if (queue > TCP_YEAH_ALPHA ||
166 rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { 166 rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
167 if (queue > TCP_YEAH_ALPHA 167 if (queue > TCP_YEAH_ALPHA
168 && tp->snd_cwnd > yeah->reno_count) { 168 && tp->snd_cwnd > yeah->reno_count) {
169 u32 reduction = min(queue / TCP_YEAH_GAMMA , 169 u32 reduction = min(queue / TCP_YEAH_GAMMA ,
170 tp->snd_cwnd >> TCP_YEAH_EPSILON); 170 tp->snd_cwnd >> TCP_YEAH_EPSILON);
171 171
172 tp->snd_cwnd -= reduction; 172 tp->snd_cwnd -= reduction;
173 173
174 tp->snd_cwnd = max(tp->snd_cwnd, 174 tp->snd_cwnd = max(tp->snd_cwnd,
175 yeah->reno_count); 175 yeah->reno_count);
176 176
177 tp->snd_ssthresh = tp->snd_cwnd; 177 tp->snd_ssthresh = tp->snd_cwnd;
178 } 178 }
179 179
180 if (yeah->reno_count <= 2) 180 if (yeah->reno_count <= 2)
181 yeah->reno_count = max(tp->snd_cwnd>>1, 2U); 181 yeah->reno_count = max(tp->snd_cwnd>>1, 2U);
182 else 182 else
183 yeah->reno_count++; 183 yeah->reno_count++;
184 184
185 yeah->doing_reno_now = min(yeah->doing_reno_now + 1, 185 yeah->doing_reno_now = min(yeah->doing_reno_now + 1,
186 0xffffffU); 186 0xffffffU);
187 } else { 187 } else {
188 yeah->fast_count++; 188 yeah->fast_count++;
189 189
190 if (yeah->fast_count > TCP_YEAH_ZETA) { 190 if (yeah->fast_count > TCP_YEAH_ZETA) {
191 yeah->reno_count = 2; 191 yeah->reno_count = 2;
192 yeah->fast_count = 0; 192 yeah->fast_count = 0;
193 } 193 }
194 194
195 yeah->doing_reno_now = 0; 195 yeah->doing_reno_now = 0;
196 } 196 }
197 197
198 yeah->lastQ = queue; 198 yeah->lastQ = queue;
199 199
200 } 200 }
201 201
202 /* Save the extent of the current window so we can use this 202 /* Save the extent of the current window so we can use this
203 * at the end of the next RTT. 203 * at the end of the next RTT.
204 */ 204 */
205 yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; 205 yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt;
206 yeah->vegas.beg_snd_nxt = tp->snd_nxt; 206 yeah->vegas.beg_snd_nxt = tp->snd_nxt;
207 yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; 207 yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;
208 208
209 /* Wipe the slate clean for the next RTT. */ 209 /* Wipe the slate clean for the next RTT. */
210 yeah->vegas.cntRTT = 0; 210 yeah->vegas.cntRTT = 0;
211 yeah->vegas.minRTT = 0x7fffffff; 211 yeah->vegas.minRTT = 0x7fffffff;
212 } 212 }
213 } 213 }
214 214
215 static u32 tcp_yeah_ssthresh(struct sock *sk) { 215 static u32 tcp_yeah_ssthresh(struct sock *sk) {
216 const struct tcp_sock *tp = tcp_sk(sk); 216 const struct tcp_sock *tp = tcp_sk(sk);
217 struct yeah *yeah = inet_csk_ca(sk); 217 struct yeah *yeah = inet_csk_ca(sk);
218 u32 reduction; 218 u32 reduction;
219 219
220 if (yeah->doing_reno_now < TCP_YEAH_RHO) { 220 if (yeah->doing_reno_now < TCP_YEAH_RHO) {
221 reduction = yeah->lastQ; 221 reduction = yeah->lastQ;
222 222
223 reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); 223 reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) );
224 224
225 reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); 225 reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
226 } else 226 } else
227 reduction = max(tp->snd_cwnd>>1,2U); 227 reduction = max(tp->snd_cwnd>>1, 2U);
228 228
229 yeah->fast_count = 0; 229 yeah->fast_count = 0;
230 yeah->reno_count = max(yeah->reno_count>>1, 2U); 230 yeah->reno_count = max(yeah->reno_count>>1, 2U);
231 231
232 return tp->snd_cwnd - reduction; 232 return tp->snd_cwnd - reduction;
233 } 233 }
234 234
235 static struct tcp_congestion_ops tcp_yeah = { 235 static struct tcp_congestion_ops tcp_yeah = {
236 .flags = TCP_CONG_RTT_STAMP, 236 .flags = TCP_CONG_RTT_STAMP,
237 .init = tcp_yeah_init, 237 .init = tcp_yeah_init,
238 .ssthresh = tcp_yeah_ssthresh, 238 .ssthresh = tcp_yeah_ssthresh,
239 .cong_avoid = tcp_yeah_cong_avoid, 239 .cong_avoid = tcp_yeah_cong_avoid,
240 .min_cwnd = tcp_reno_min_cwnd, 240 .min_cwnd = tcp_reno_min_cwnd,
241 .set_state = tcp_vegas_state, 241 .set_state = tcp_vegas_state,
242 .cwnd_event = tcp_vegas_cwnd_event, 242 .cwnd_event = tcp_vegas_cwnd_event,
243 .get_info = tcp_vegas_get_info, 243 .get_info = tcp_vegas_get_info,
244 .pkts_acked = tcp_yeah_pkts_acked, 244 .pkts_acked = tcp_yeah_pkts_acked,
245 245
246 .owner = THIS_MODULE, 246 .owner = THIS_MODULE,
247 .name = "yeah", 247 .name = "yeah",
248 }; 248 };
249 249
250 static int __init tcp_yeah_register(void) 250 static int __init tcp_yeah_register(void)
251 { 251 {
252 BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); 252 BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
253 tcp_register_congestion_control(&tcp_yeah); 253 tcp_register_congestion_control(&tcp_yeah);
254 return 0; 254 return 0;
255 } 255 }
256 256
257 static void __exit tcp_yeah_unregister(void) 257 static void __exit tcp_yeah_unregister(void)
258 { 258 {
259 tcp_unregister_congestion_control(&tcp_yeah); 259 tcp_unregister_congestion_control(&tcp_yeah);
260 } 260 }
261 261
262 module_init(tcp_yeah_register); 262 module_init(tcp_yeah_register);
263 module_exit(tcp_yeah_unregister); 263 module_exit(tcp_yeah_unregister);
264 264
265 MODULE_AUTHOR("Angelo P. Castellani"); 265 MODULE_AUTHOR("Angelo P. Castellani");
266 MODULE_LICENSE("GPL"); 266 MODULE_LICENSE("GPL");
267 MODULE_DESCRIPTION("YeAH TCP"); 267 MODULE_DESCRIPTION("YeAH TCP");
268 268
net/ipv4/xfrm4_policy.c
1 /* 1 /*
2 * xfrm4_policy.c 2 * xfrm4_policy.c
3 * 3 *
4 * Changes: 4 * Changes:
5 * Kazunori MIYAZAWA @USAGI 5 * Kazunori MIYAZAWA @USAGI
6 * YOSHIFUJI Hideaki @USAGI 6 * YOSHIFUJI Hideaki @USAGI
7 * Split up af-specific portion 7 * Split up af-specific portion
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/err.h> 11 #include <linux/err.h>
12 #include <linux/kernel.h> 12 #include <linux/kernel.h>
13 #include <linux/inetdevice.h> 13 #include <linux/inetdevice.h>
14 #include <net/dst.h> 14 #include <net/dst.h>
15 #include <net/xfrm.h> 15 #include <net/xfrm.h>
16 #include <net/ip.h> 16 #include <net/ip.h>
17 17
18 static struct dst_ops xfrm4_dst_ops; 18 static struct dst_ops xfrm4_dst_ops;
19 static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
20 20
21 static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, 21 static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr,
22 xfrm_address_t *daddr) 22 xfrm_address_t *daddr)
23 { 23 {
24 struct flowi fl = { 24 struct flowi fl = {
25 .nl_u = { 25 .nl_u = {
26 .ip4_u = { 26 .ip4_u = {
27 .tos = tos, 27 .tos = tos,
28 .daddr = daddr->a4, 28 .daddr = daddr->a4,
29 }, 29 },
30 }, 30 },
31 }; 31 };
32 struct dst_entry *dst; 32 struct dst_entry *dst;
33 struct rtable *rt; 33 struct rtable *rt;
34 int err; 34 int err;
35 35
36 if (saddr) 36 if (saddr)
37 fl.fl4_src = saddr->a4; 37 fl.fl4_src = saddr->a4;
38 38
39 err = __ip_route_output_key(&init_net, &rt, &fl); 39 err = __ip_route_output_key(&init_net, &rt, &fl);
40 dst = &rt->u.dst; 40 dst = &rt->u.dst;
41 if (err) 41 if (err)
42 dst = ERR_PTR(err); 42 dst = ERR_PTR(err);
43 return dst; 43 return dst;
44 } 44 }
45 45
46 static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) 46 static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr)
47 { 47 {
48 struct dst_entry *dst; 48 struct dst_entry *dst;
49 struct rtable *rt; 49 struct rtable *rt;
50 50
51 dst = xfrm4_dst_lookup(0, NULL, daddr); 51 dst = xfrm4_dst_lookup(0, NULL, daddr);
52 if (IS_ERR(dst)) 52 if (IS_ERR(dst))
53 return -EHOSTUNREACH; 53 return -EHOSTUNREACH;
54 54
55 rt = (struct rtable *)dst; 55 rt = (struct rtable *)dst;
56 saddr->a4 = rt->rt_src; 56 saddr->a4 = rt->rt_src;
57 dst_release(dst); 57 dst_release(dst);
58 return 0; 58 return 0;
59 } 59 }
60 60
61 static struct dst_entry * 61 static struct dst_entry *
62 __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) 62 __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
63 { 63 {
64 struct dst_entry *dst; 64 struct dst_entry *dst;
65 65
66 read_lock_bh(&policy->lock); 66 read_lock_bh(&policy->lock);
67 for (dst = policy->bundles; dst; dst = dst->next) { 67 for (dst = policy->bundles; dst; dst = dst->next) {
68 struct xfrm_dst *xdst = (struct xfrm_dst*)dst; 68 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
69 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ 69 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
70 xdst->u.rt.fl.fl4_dst == fl->fl4_dst && 70 xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
71 xdst->u.rt.fl.fl4_src == fl->fl4_src && 71 xdst->u.rt.fl.fl4_src == fl->fl4_src &&
72 xdst->u.rt.fl.fl4_tos == fl->fl4_tos && 72 xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
73 xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { 73 xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
74 dst_clone(dst); 74 dst_clone(dst);
75 break; 75 break;
76 } 76 }
77 } 77 }
78 read_unlock_bh(&policy->lock); 78 read_unlock_bh(&policy->lock);
79 return dst; 79 return dst;
80 } 80 }
81 81
82 static int xfrm4_get_tos(struct flowi *fl) 82 static int xfrm4_get_tos(struct flowi *fl)
83 { 83 {
84 return fl->fl4_tos; 84 return fl->fl4_tos;
85 } 85 }
86 86
87 static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 87 static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
88 int nfheader_len) 88 int nfheader_len)
89 { 89 {
90 return 0; 90 return 0;
91 } 91 }
92 92
93 static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) 93 static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev)
94 { 94 {
95 struct rtable *rt = (struct rtable *)xdst->route; 95 struct rtable *rt = (struct rtable *)xdst->route;
96 96
97 xdst->u.rt.fl = rt->fl; 97 xdst->u.rt.fl = rt->fl;
98 98
99 xdst->u.dst.dev = dev; 99 xdst->u.dst.dev = dev;
100 dev_hold(dev); 100 dev_hold(dev);
101 101
102 xdst->u.rt.idev = in_dev_get(dev); 102 xdst->u.rt.idev = in_dev_get(dev);
103 if (!xdst->u.rt.idev) 103 if (!xdst->u.rt.idev)
104 return -ENODEV; 104 return -ENODEV;
105 105
106 xdst->u.rt.peer = rt->peer; 106 xdst->u.rt.peer = rt->peer;
107 if (rt->peer) 107 if (rt->peer)
108 atomic_inc(&rt->peer->refcnt); 108 atomic_inc(&rt->peer->refcnt);
109 109
110 /* Sheit... I remember I did this right. Apparently, 110 /* Sheit... I remember I did this right. Apparently,
111 * it was magically lost, so this code needs audit */ 111 * it was magically lost, so this code needs audit */
112 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 112 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
113 RTCF_LOCAL); 113 RTCF_LOCAL);
114 xdst->u.rt.rt_type = rt->rt_type; 114 xdst->u.rt.rt_type = rt->rt_type;
115 xdst->u.rt.rt_src = rt->rt_src; 115 xdst->u.rt.rt_src = rt->rt_src;
116 xdst->u.rt.rt_dst = rt->rt_dst; 116 xdst->u.rt.rt_dst = rt->rt_dst;
117 xdst->u.rt.rt_gateway = rt->rt_gateway; 117 xdst->u.rt.rt_gateway = rt->rt_gateway;
118 xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; 118 xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
119 119
120 return 0; 120 return 0;
121 } 121 }
122 122
123 static void 123 static void
124 _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) 124 _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
125 { 125 {
126 struct iphdr *iph = ip_hdr(skb); 126 struct iphdr *iph = ip_hdr(skb);
127 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 127 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
128 128
129 memset(fl, 0, sizeof(struct flowi)); 129 memset(fl, 0, sizeof(struct flowi));
130 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 130 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
131 switch (iph->protocol) { 131 switch (iph->protocol) {
132 case IPPROTO_UDP: 132 case IPPROTO_UDP:
133 case IPPROTO_UDPLITE: 133 case IPPROTO_UDPLITE:
134 case IPPROTO_TCP: 134 case IPPROTO_TCP:
135 case IPPROTO_SCTP: 135 case IPPROTO_SCTP:
136 case IPPROTO_DCCP: 136 case IPPROTO_DCCP:
137 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 137 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
138 __be16 *ports = (__be16 *)xprth; 138 __be16 *ports = (__be16 *)xprth;
139 139
140 fl->fl_ip_sport = ports[!!reverse]; 140 fl->fl_ip_sport = ports[!!reverse];
141 fl->fl_ip_dport = ports[!reverse]; 141 fl->fl_ip_dport = ports[!reverse];
142 } 142 }
143 break; 143 break;
144 144
145 case IPPROTO_ICMP: 145 case IPPROTO_ICMP:
146 if (pskb_may_pull(skb, xprth + 2 - skb->data)) { 146 if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
147 u8 *icmp = xprth; 147 u8 *icmp = xprth;
148 148
149 fl->fl_icmp_type = icmp[0]; 149 fl->fl_icmp_type = icmp[0];
150 fl->fl_icmp_code = icmp[1]; 150 fl->fl_icmp_code = icmp[1];
151 } 151 }
152 break; 152 break;
153 153
154 case IPPROTO_ESP: 154 case IPPROTO_ESP:
155 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 155 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
156 __be32 *ehdr = (__be32 *)xprth; 156 __be32 *ehdr = (__be32 *)xprth;
157 157
158 fl->fl_ipsec_spi = ehdr[0]; 158 fl->fl_ipsec_spi = ehdr[0];
159 } 159 }
160 break; 160 break;
161 161
162 case IPPROTO_AH: 162 case IPPROTO_AH:
163 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 163 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
164 __be32 *ah_hdr = (__be32*)xprth; 164 __be32 *ah_hdr = (__be32*)xprth;
165 165
166 fl->fl_ipsec_spi = ah_hdr[1]; 166 fl->fl_ipsec_spi = ah_hdr[1];
167 } 167 }
168 break; 168 break;
169 169
170 case IPPROTO_COMP: 170 case IPPROTO_COMP:
171 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 171 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
172 __be16 *ipcomp_hdr = (__be16 *)xprth; 172 __be16 *ipcomp_hdr = (__be16 *)xprth;
173 173
174 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 174 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
175 } 175 }
176 break; 176 break;
177 default: 177 default:
178 fl->fl_ipsec_spi = 0; 178 fl->fl_ipsec_spi = 0;
179 break; 179 break;
180 } 180 }
181 } 181 }
182 fl->proto = iph->protocol; 182 fl->proto = iph->protocol;
183 fl->fl4_dst = reverse ? iph->saddr : iph->daddr; 183 fl->fl4_dst = reverse ? iph->saddr : iph->daddr;
184 fl->fl4_src = reverse ? iph->daddr : iph->saddr; 184 fl->fl4_src = reverse ? iph->daddr : iph->saddr;
185 fl->fl4_tos = iph->tos; 185 fl->fl4_tos = iph->tos;
186 } 186 }
187 187
188 static inline int xfrm4_garbage_collect(struct dst_ops *ops) 188 static inline int xfrm4_garbage_collect(struct dst_ops *ops)
189 { 189 {
190 xfrm4_policy_afinfo.garbage_collect(); 190 xfrm4_policy_afinfo.garbage_collect();
191 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); 191 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
192 } 192 }
193 193
194 static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 194 static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
195 { 195 {
196 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 196 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
197 struct dst_entry *path = xdst->route; 197 struct dst_entry *path = xdst->route;
198 198
199 path->ops->update_pmtu(path, mtu); 199 path->ops->update_pmtu(path, mtu);
200 } 200 }
201 201
202 static void xfrm4_dst_destroy(struct dst_entry *dst) 202 static void xfrm4_dst_destroy(struct dst_entry *dst)
203 { 203 {
204 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 204 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
205 205
206 if (likely(xdst->u.rt.idev)) 206 if (likely(xdst->u.rt.idev))
207 in_dev_put(xdst->u.rt.idev); 207 in_dev_put(xdst->u.rt.idev);
208 if (likely(xdst->u.rt.peer)) 208 if (likely(xdst->u.rt.peer))
209 inet_putpeer(xdst->u.rt.peer); 209 inet_putpeer(xdst->u.rt.peer);
210 xfrm_dst_destroy(xdst); 210 xfrm_dst_destroy(xdst);
211 } 211 }
212 212
213 static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 213 static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
214 int unregister) 214 int unregister)
215 { 215 {
216 struct xfrm_dst *xdst; 216 struct xfrm_dst *xdst;
217 217
218 if (!unregister) 218 if (!unregister)
219 return; 219 return;
220 220
221 xdst = (struct xfrm_dst *)dst; 221 xdst = (struct xfrm_dst *)dst;
222 if (xdst->u.rt.idev->dev == dev) { 222 if (xdst->u.rt.idev->dev == dev) {
223 struct in_device *loopback_idev = 223 struct in_device *loopback_idev =
224 in_dev_get(dev_net(dev)->loopback_dev); 224 in_dev_get(dev_net(dev)->loopback_dev);
225 BUG_ON(!loopback_idev); 225 BUG_ON(!loopback_idev);
226 226
227 do { 227 do {
228 in_dev_put(xdst->u.rt.idev); 228 in_dev_put(xdst->u.rt.idev);
229 xdst->u.rt.idev = loopback_idev; 229 xdst->u.rt.idev = loopback_idev;
230 in_dev_hold(loopback_idev); 230 in_dev_hold(loopback_idev);
231 xdst = (struct xfrm_dst *)xdst->u.dst.child; 231 xdst = (struct xfrm_dst *)xdst->u.dst.child;
232 } while (xdst->u.dst.xfrm); 232 } while (xdst->u.dst.xfrm);
233 233
234 __in_dev_put(loopback_idev); 234 __in_dev_put(loopback_idev);
235 } 235 }
236 236
237 xfrm_dst_ifdown(dst, dev); 237 xfrm_dst_ifdown(dst, dev);
238 } 238 }
239 239
240 static struct dst_ops xfrm4_dst_ops = { 240 static struct dst_ops xfrm4_dst_ops = {
241 .family = AF_INET, 241 .family = AF_INET,
242 .protocol = __constant_htons(ETH_P_IP), 242 .protocol = __constant_htons(ETH_P_IP),
243 .gc = xfrm4_garbage_collect, 243 .gc = xfrm4_garbage_collect,
244 .update_pmtu = xfrm4_update_pmtu, 244 .update_pmtu = xfrm4_update_pmtu,
245 .destroy = xfrm4_dst_destroy, 245 .destroy = xfrm4_dst_destroy,
246 .ifdown = xfrm4_dst_ifdown, 246 .ifdown = xfrm4_dst_ifdown,
247 .local_out = __ip_local_out, 247 .local_out = __ip_local_out,
248 .gc_thresh = 1024, 248 .gc_thresh = 1024,
249 .entry_size = sizeof(struct xfrm_dst), 249 .entry_size = sizeof(struct xfrm_dst),
250 .entries = ATOMIC_INIT(0), 250 .entries = ATOMIC_INIT(0),
251 }; 251 };
252 252
253 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 253 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
254 .family = AF_INET, 254 .family = AF_INET,
255 .dst_ops = &xfrm4_dst_ops, 255 .dst_ops = &xfrm4_dst_ops,
256 .dst_lookup = xfrm4_dst_lookup, 256 .dst_lookup = xfrm4_dst_lookup,
257 .get_saddr = xfrm4_get_saddr, 257 .get_saddr = xfrm4_get_saddr,
258 .find_bundle = __xfrm4_find_bundle, 258 .find_bundle = __xfrm4_find_bundle,
259 .decode_session = _decode_session4, 259 .decode_session = _decode_session4,
260 .get_tos = xfrm4_get_tos, 260 .get_tos = xfrm4_get_tos,
261 .init_path = xfrm4_init_path, 261 .init_path = xfrm4_init_path,
262 .fill_dst = xfrm4_fill_dst, 262 .fill_dst = xfrm4_fill_dst,
263 }; 263 };
264 264
265 static void __init xfrm4_policy_init(void) 265 static void __init xfrm4_policy_init(void)
266 { 266 {
267 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); 267 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
268 } 268 }
269 269
270 static void __exit xfrm4_policy_fini(void) 270 static void __exit xfrm4_policy_fini(void)
271 { 271 {
272 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); 272 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
273 } 273 }
274 274
275 void __init xfrm4_init(void) 275 void __init xfrm4_init(void)
276 { 276 {
277 xfrm4_state_init(); 277 xfrm4_state_init();
278 xfrm4_policy_init(); 278 xfrm4_policy_init();
279 } 279 }
280 280
281 281