Commit 5a5f3a8db9d70c90e9d55b46e02b2d8deb1c2c2e
Committed by David S. Miller
1 parent d9319100c1
Exists in master and in 7 other branches
net: clean up net/ipv4/ipip.c raw.c tcp.c tcp_minisocks.c tcp_yeah.c xfrm4_policy.c
Signed-off-by: Jianjun Kong <jianjun@zeuux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 6 changed files with 12 additions and 12 deletions
net/ipv4/ipip.c
1 | /* | 1 | /* |
2 | * Linux NET3: IP/IP protocol decoder. | 2 | * Linux NET3: IP/IP protocol decoder. |
3 | * | 3 | * |
4 | * Authors: | 4 | * Authors: |
5 | * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 | 5 | * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 |
6 | * | 6 | * |
7 | * Fixes: | 7 | * Fixes: |
8 | * Alan Cox : Merged and made usable non modular (its so tiny its silly as | 8 | * Alan Cox : Merged and made usable non modular (its so tiny its silly as |
9 | * a module taking up 2 pages). | 9 | * a module taking up 2 pages). |
10 | * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) | 10 | * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) |
11 | * to keep ip_forward happy. | 11 | * to keep ip_forward happy. |
12 | * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). | 12 | * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). |
13 | * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL | 13 | * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL |
14 | * David Woodhouse : Perform some basic ICMP handling. | 14 | * David Woodhouse : Perform some basic ICMP handling. |
15 | * IPIP Routing without decapsulation. | 15 | * IPIP Routing without decapsulation. |
16 | * Carlos Picoto : GRE over IP support | 16 | * Carlos Picoto : GRE over IP support |
17 | * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. | 17 | * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. |
18 | * I do not want to merge them together. | 18 | * I do not want to merge them together. |
19 | * | 19 | * |
20 | * This program is free software; you can redistribute it and/or | 20 | * This program is free software; you can redistribute it and/or |
21 | * modify it under the terms of the GNU General Public License | 21 | * modify it under the terms of the GNU General Public License |
22 | * as published by the Free Software Foundation; either version | 22 | * as published by the Free Software Foundation; either version |
23 | * 2 of the License, or (at your option) any later version. | 23 | * 2 of the License, or (at your option) any later version. |
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* tunnel.c: an IP tunnel driver | 27 | /* tunnel.c: an IP tunnel driver |
28 | 28 | ||
29 | The purpose of this driver is to provide an IP tunnel through | 29 | The purpose of this driver is to provide an IP tunnel through |
30 | which you can tunnel network traffic transparently across subnets. | 30 | which you can tunnel network traffic transparently across subnets. |
31 | 31 | ||
32 | This was written by looking at Nick Holloway's dummy driver | 32 | This was written by looking at Nick Holloway's dummy driver |
33 | Thanks for the great code! | 33 | Thanks for the great code! |
34 | 34 | ||
35 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 | 35 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 |
36 | 36 | ||
37 | Minor tweaks: | 37 | Minor tweaks: |
38 | Cleaned up the code a little and added some pre-1.3.0 tweaks. | 38 | Cleaned up the code a little and added some pre-1.3.0 tweaks. |
39 | dev->hard_header/hard_header_len changed to use no headers. | 39 | dev->hard_header/hard_header_len changed to use no headers. |
40 | Comments/bracketing tweaked. | 40 | Comments/bracketing tweaked. |
41 | Made the tunnels use dev->name not tunnel: when error reporting. | 41 | Made the tunnels use dev->name not tunnel: when error reporting. |
42 | Added tx_dropped stat | 42 | Added tx_dropped stat |
43 | 43 | ||
44 | -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 | 44 | -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 |
45 | 45 | ||
46 | Reworked: | 46 | Reworked: |
47 | Changed to tunnel to destination gateway in addition to the | 47 | Changed to tunnel to destination gateway in addition to the |
48 | tunnel's pointopoint address | 48 | tunnel's pointopoint address |
49 | Almost completely rewritten | 49 | Almost completely rewritten |
50 | Note: There is currently no firewall or ICMP handling done. | 50 | Note: There is currently no firewall or ICMP handling done. |
51 | 51 | ||
52 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 | 52 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 |
53 | 53 | ||
54 | */ | 54 | */ |
55 | 55 | ||
56 | /* Things I wish I had known when writing the tunnel driver: | 56 | /* Things I wish I had known when writing the tunnel driver: |
57 | 57 | ||
58 | When the tunnel_xmit() function is called, the skb contains the | 58 | When the tunnel_xmit() function is called, the skb contains the |
59 | packet to be sent (plus a great deal of extra info), and dev | 59 | packet to be sent (plus a great deal of extra info), and dev |
60 | contains the tunnel device that _we_ are. | 60 | contains the tunnel device that _we_ are. |
61 | 61 | ||
62 | When we are passed a packet, we are expected to fill in the | 62 | When we are passed a packet, we are expected to fill in the |
63 | source address with our source IP address. | 63 | source address with our source IP address. |
64 | 64 | ||
65 | What is the proper way to allocate, copy and free a buffer? | 65 | What is the proper way to allocate, copy and free a buffer? |
66 | After you allocate it, it is a "0 length" chunk of memory | 66 | After you allocate it, it is a "0 length" chunk of memory |
67 | starting at zero. If you want to add headers to the buffer | 67 | starting at zero. If you want to add headers to the buffer |
68 | later, you'll have to call "skb_reserve(skb, amount)" with | 68 | later, you'll have to call "skb_reserve(skb, amount)" with |
69 | the amount of memory you want reserved. Then, you call | 69 | the amount of memory you want reserved. Then, you call |
70 | "skb_put(skb, amount)" with the amount of space you want in | 70 | "skb_put(skb, amount)" with the amount of space you want in |
71 | the buffer. skb_put() returns a pointer to the top (#0) of | 71 | the buffer. skb_put() returns a pointer to the top (#0) of |
72 | that buffer. skb->len is set to the amount of space you have | 72 | that buffer. skb->len is set to the amount of space you have |
73 | "allocated" with skb_put(). You can then write up to skb->len | 73 | "allocated" with skb_put(). You can then write up to skb->len |
74 | bytes to that buffer. If you need more, you can call skb_put() | 74 | bytes to that buffer. If you need more, you can call skb_put() |
75 | again with the additional amount of space you need. You can | 75 | again with the additional amount of space you need. You can |
76 | find out how much more space you can allocate by calling | 76 | find out how much more space you can allocate by calling |
77 | "skb_tailroom(skb)". | 77 | "skb_tailroom(skb)". |
78 | Now, to add header space, call "skb_push(skb, header_len)". | 78 | Now, to add header space, call "skb_push(skb, header_len)". |
79 | This creates space at the beginning of the buffer and returns | 79 | This creates space at the beginning of the buffer and returns |
80 | a pointer to this new space. If later you need to strip a | 80 | a pointer to this new space. If later you need to strip a |
81 | header from a buffer, call "skb_pull(skb, header_len)". | 81 | header from a buffer, call "skb_pull(skb, header_len)". |
82 | skb_headroom() will return how much space is left at the top | 82 | skb_headroom() will return how much space is left at the top |
83 | of the buffer (before the main data). Remember, this headroom | 83 | of the buffer (before the main data). Remember, this headroom |
84 | space must be reserved before the skb_put() function is called. | 84 | space must be reserved before the skb_put() function is called. |
85 | */ | 85 | */ |
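The comment block above walks through the sk_buff calls used when building a packet. As a quick illustration of how those calls fit together, here is a minimal sketch; it is not part of ipip.c or this commit, and the function name, buffer sizes, and GFP flag are assumptions chosen only for the example.

/*
 * Illustrative sketch only (not from the kernel tree): the skb_reserve(),
 * skb_put() and skb_push() sequence described in the comment above.
 * header_len, payload_len and GFP_ATOMIC are example choices.
 */
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *example_build_skb(unsigned int header_len,
					 unsigned int payload_len)
{
	struct sk_buff *skb;
	unsigned char *data;

	/* Allocate room for the headers plus the payload. */
	skb = alloc_skb(header_len + payload_len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	/* Headroom must be reserved before any skb_put() call. */
	skb_reserve(skb, header_len);

	/* skb_put() opens payload_len bytes at the tail and returns a
	 * pointer to that area; skb->len grows by the same amount. */
	data = skb_put(skb, payload_len);
	memset(data, 0, payload_len);

	/* skb_push() later claims header space from the reserved headroom
	 * and returns a pointer to the new start of the buffer. */
	skb_push(skb, header_len);

	return skb;
}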
86 | 86 | ||
87 | /* | 87 | /* |
88 | This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c | 88 | This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c |
89 | 89 | ||
90 | For comments look at net/ipv4/ip_gre.c --ANK | 90 | For comments look at net/ipv4/ip_gre.c --ANK |
91 | */ | 91 | */ |
92 | 92 | ||
93 | 93 | ||
94 | #include <linux/capability.h> | 94 | #include <linux/capability.h> |
95 | #include <linux/module.h> | 95 | #include <linux/module.h> |
96 | #include <linux/types.h> | 96 | #include <linux/types.h> |
97 | #include <linux/kernel.h> | 97 | #include <linux/kernel.h> |
98 | #include <asm/uaccess.h> | 98 | #include <asm/uaccess.h> |
99 | #include <linux/skbuff.h> | 99 | #include <linux/skbuff.h> |
100 | #include <linux/netdevice.h> | 100 | #include <linux/netdevice.h> |
101 | #include <linux/in.h> | 101 | #include <linux/in.h> |
102 | #include <linux/tcp.h> | 102 | #include <linux/tcp.h> |
103 | #include <linux/udp.h> | 103 | #include <linux/udp.h> |
104 | #include <linux/if_arp.h> | 104 | #include <linux/if_arp.h> |
105 | #include <linux/mroute.h> | 105 | #include <linux/mroute.h> |
106 | #include <linux/init.h> | 106 | #include <linux/init.h> |
107 | #include <linux/netfilter_ipv4.h> | 107 | #include <linux/netfilter_ipv4.h> |
108 | #include <linux/if_ether.h> | 108 | #include <linux/if_ether.h> |
109 | 109 | ||
110 | #include <net/sock.h> | 110 | #include <net/sock.h> |
111 | #include <net/ip.h> | 111 | #include <net/ip.h> |
112 | #include <net/icmp.h> | 112 | #include <net/icmp.h> |
113 | #include <net/ipip.h> | 113 | #include <net/ipip.h> |
114 | #include <net/inet_ecn.h> | 114 | #include <net/inet_ecn.h> |
115 | #include <net/xfrm.h> | 115 | #include <net/xfrm.h> |
116 | #include <net/net_namespace.h> | 116 | #include <net/net_namespace.h> |
117 | #include <net/netns/generic.h> | 117 | #include <net/netns/generic.h> |
118 | 118 | ||
119 | #define HASH_SIZE 16 | 119 | #define HASH_SIZE 16 |
120 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) | 120 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) |
121 | 121 | ||
122 | static int ipip_net_id; | 122 | static int ipip_net_id; |
123 | struct ipip_net { | 123 | struct ipip_net { |
124 | struct ip_tunnel *tunnels_r_l[HASH_SIZE]; | 124 | struct ip_tunnel *tunnels_r_l[HASH_SIZE]; |
125 | struct ip_tunnel *tunnels_r[HASH_SIZE]; | 125 | struct ip_tunnel *tunnels_r[HASH_SIZE]; |
126 | struct ip_tunnel *tunnels_l[HASH_SIZE]; | 126 | struct ip_tunnel *tunnels_l[HASH_SIZE]; |
127 | struct ip_tunnel *tunnels_wc[1]; | 127 | struct ip_tunnel *tunnels_wc[1]; |
128 | struct ip_tunnel **tunnels[4]; | 128 | struct ip_tunnel **tunnels[4]; |
129 | 129 | ||
130 | struct net_device *fb_tunnel_dev; | 130 | struct net_device *fb_tunnel_dev; |
131 | }; | 131 | }; |
132 | 132 | ||
133 | static int ipip_fb_tunnel_init(struct net_device *dev); | 133 | static int ipip_fb_tunnel_init(struct net_device *dev); |
134 | static int ipip_tunnel_init(struct net_device *dev); | 134 | static int ipip_tunnel_init(struct net_device *dev); |
135 | static void ipip_tunnel_setup(struct net_device *dev); | 135 | static void ipip_tunnel_setup(struct net_device *dev); |
136 | 136 | ||
137 | static DEFINE_RWLOCK(ipip_lock); | 137 | static DEFINE_RWLOCK(ipip_lock); |
138 | 138 | ||
139 | static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, | 139 | static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, |
140 | __be32 remote, __be32 local) | 140 | __be32 remote, __be32 local) |
141 | { | 141 | { |
142 | unsigned h0 = HASH(remote); | 142 | unsigned h0 = HASH(remote); |
143 | unsigned h1 = HASH(local); | 143 | unsigned h1 = HASH(local); |
144 | struct ip_tunnel *t; | 144 | struct ip_tunnel *t; |
145 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 145 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
146 | 146 | ||
147 | for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { | 147 | for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { |
148 | if (local == t->parms.iph.saddr && | 148 | if (local == t->parms.iph.saddr && |
149 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | 149 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) |
150 | return t; | 150 | return t; |
151 | } | 151 | } |
152 | for (t = ipn->tunnels_r[h0]; t; t = t->next) { | 152 | for (t = ipn->tunnels_r[h0]; t; t = t->next) { |
153 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | 153 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) |
154 | return t; | 154 | return t; |
155 | } | 155 | } |
156 | for (t = ipn->tunnels_l[h1]; t; t = t->next) { | 156 | for (t = ipn->tunnels_l[h1]; t; t = t->next) { |
157 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | 157 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) |
158 | return t; | 158 | return t; |
159 | } | 159 | } |
160 | if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) | 160 | if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) |
161 | return t; | 161 | return t; |
162 | return NULL; | 162 | return NULL; |
163 | } | 163 | } |
164 | 164 | ||
165 | static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, | 165 | static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, |
166 | struct ip_tunnel_parm *parms) | 166 | struct ip_tunnel_parm *parms) |
167 | { | 167 | { |
168 | __be32 remote = parms->iph.daddr; | 168 | __be32 remote = parms->iph.daddr; |
169 | __be32 local = parms->iph.saddr; | 169 | __be32 local = parms->iph.saddr; |
170 | unsigned h = 0; | 170 | unsigned h = 0; |
171 | int prio = 0; | 171 | int prio = 0; |
172 | 172 | ||
173 | if (remote) { | 173 | if (remote) { |
174 | prio |= 2; | 174 | prio |= 2; |
175 | h ^= HASH(remote); | 175 | h ^= HASH(remote); |
176 | } | 176 | } |
177 | if (local) { | 177 | if (local) { |
178 | prio |= 1; | 178 | prio |= 1; |
179 | h ^= HASH(local); | 179 | h ^= HASH(local); |
180 | } | 180 | } |
181 | return &ipn->tunnels[prio][h]; | 181 | return &ipn->tunnels[prio][h]; |
182 | } | 182 | } |
183 | 183 | ||
184 | static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, | 184 | static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, |
185 | struct ip_tunnel *t) | 185 | struct ip_tunnel *t) |
186 | { | 186 | { |
187 | return __ipip_bucket(ipn, &t->parms); | 187 | return __ipip_bucket(ipn, &t->parms); |
188 | } | 188 | } |
189 | 189 | ||
190 | static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) | 190 | static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) |
191 | { | 191 | { |
192 | struct ip_tunnel **tp; | 192 | struct ip_tunnel **tp; |
193 | 193 | ||
194 | for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { | 194 | for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { |
195 | if (t == *tp) { | 195 | if (t == *tp) { |
196 | write_lock_bh(&ipip_lock); | 196 | write_lock_bh(&ipip_lock); |
197 | *tp = t->next; | 197 | *tp = t->next; |
198 | write_unlock_bh(&ipip_lock); | 198 | write_unlock_bh(&ipip_lock); |
199 | break; | 199 | break; |
200 | } | 200 | } |
201 | } | 201 | } |
202 | } | 202 | } |
203 | 203 | ||
204 | static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) | 204 | static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) |
205 | { | 205 | { |
206 | struct ip_tunnel **tp = ipip_bucket(ipn, t); | 206 | struct ip_tunnel **tp = ipip_bucket(ipn, t); |
207 | 207 | ||
208 | t->next = *tp; | 208 | t->next = *tp; |
209 | write_lock_bh(&ipip_lock); | 209 | write_lock_bh(&ipip_lock); |
210 | *tp = t; | 210 | *tp = t; |
211 | write_unlock_bh(&ipip_lock); | 211 | write_unlock_bh(&ipip_lock); |
212 | } | 212 | } |
213 | 213 | ||
214 | static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | 214 | static struct ip_tunnel * ipip_tunnel_locate(struct net *net, |
215 | struct ip_tunnel_parm *parms, int create) | 215 | struct ip_tunnel_parm *parms, int create) |
216 | { | 216 | { |
217 | __be32 remote = parms->iph.daddr; | 217 | __be32 remote = parms->iph.daddr; |
218 | __be32 local = parms->iph.saddr; | 218 | __be32 local = parms->iph.saddr; |
219 | struct ip_tunnel *t, **tp, *nt; | 219 | struct ip_tunnel *t, **tp, *nt; |
220 | struct net_device *dev; | 220 | struct net_device *dev; |
221 | char name[IFNAMSIZ]; | 221 | char name[IFNAMSIZ]; |
222 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 222 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
223 | 223 | ||
224 | for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { | 224 | for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { |
225 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | 225 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) |
226 | return t; | 226 | return t; |
227 | } | 227 | } |
228 | if (!create) | 228 | if (!create) |
229 | return NULL; | 229 | return NULL; |
230 | 230 | ||
231 | if (parms->name[0]) | 231 | if (parms->name[0]) |
232 | strlcpy(name, parms->name, IFNAMSIZ); | 232 | strlcpy(name, parms->name, IFNAMSIZ); |
233 | else | 233 | else |
234 | sprintf(name, "tunl%%d"); | 234 | sprintf(name, "tunl%%d"); |
235 | 235 | ||
236 | dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); | 236 | dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); |
237 | if (dev == NULL) | 237 | if (dev == NULL) |
238 | return NULL; | 238 | return NULL; |
239 | 239 | ||
240 | dev_net_set(dev, net); | 240 | dev_net_set(dev, net); |
241 | 241 | ||
242 | if (strchr(name, '%')) { | 242 | if (strchr(name, '%')) { |
243 | if (dev_alloc_name(dev, name) < 0) | 243 | if (dev_alloc_name(dev, name) < 0) |
244 | goto failed_free; | 244 | goto failed_free; |
245 | } | 245 | } |
246 | 246 | ||
247 | nt = netdev_priv(dev); | 247 | nt = netdev_priv(dev); |
248 | dev->init = ipip_tunnel_init; | 248 | dev->init = ipip_tunnel_init; |
249 | nt->parms = *parms; | 249 | nt->parms = *parms; |
250 | 250 | ||
251 | if (register_netdevice(dev) < 0) | 251 | if (register_netdevice(dev) < 0) |
252 | goto failed_free; | 252 | goto failed_free; |
253 | 253 | ||
254 | dev_hold(dev); | 254 | dev_hold(dev); |
255 | ipip_tunnel_link(ipn, nt); | 255 | ipip_tunnel_link(ipn, nt); |
256 | return nt; | 256 | return nt; |
257 | 257 | ||
258 | failed_free: | 258 | failed_free: |
259 | free_netdev(dev); | 259 | free_netdev(dev); |
260 | return NULL; | 260 | return NULL; |
261 | } | 261 | } |
262 | 262 | ||
263 | static void ipip_tunnel_uninit(struct net_device *dev) | 263 | static void ipip_tunnel_uninit(struct net_device *dev) |
264 | { | 264 | { |
265 | struct net *net = dev_net(dev); | 265 | struct net *net = dev_net(dev); |
266 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 266 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
267 | 267 | ||
268 | if (dev == ipn->fb_tunnel_dev) { | 268 | if (dev == ipn->fb_tunnel_dev) { |
269 | write_lock_bh(&ipip_lock); | 269 | write_lock_bh(&ipip_lock); |
270 | ipn->tunnels_wc[0] = NULL; | 270 | ipn->tunnels_wc[0] = NULL; |
271 | write_unlock_bh(&ipip_lock); | 271 | write_unlock_bh(&ipip_lock); |
272 | } else | 272 | } else |
273 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); | 273 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); |
274 | dev_put(dev); | 274 | dev_put(dev); |
275 | } | 275 | } |
276 | 276 | ||
277 | static int ipip_err(struct sk_buff *skb, u32 info) | 277 | static int ipip_err(struct sk_buff *skb, u32 info) |
278 | { | 278 | { |
279 | 279 | ||
280 | /* All the routers (except for Linux) return only | 280 | /* All the routers (except for Linux) return only |
281 | 8 bytes of packet payload. It means, that precise relaying of | 281 | 8 bytes of packet payload. It means, that precise relaying of |
282 | ICMP in the real Internet is absolutely infeasible. | 282 | ICMP in the real Internet is absolutely infeasible. |
283 | */ | 283 | */ |
284 | struct iphdr *iph = (struct iphdr*)skb->data; | 284 | struct iphdr *iph = (struct iphdr *)skb->data; |
285 | const int type = icmp_hdr(skb)->type; | 285 | const int type = icmp_hdr(skb)->type; |
286 | const int code = icmp_hdr(skb)->code; | 286 | const int code = icmp_hdr(skb)->code; |
287 | struct ip_tunnel *t; | 287 | struct ip_tunnel *t; |
288 | int err; | 288 | int err; |
289 | 289 | ||
290 | switch (type) { | 290 | switch (type) { |
291 | default: | 291 | default: |
292 | case ICMP_PARAMETERPROB: | 292 | case ICMP_PARAMETERPROB: |
293 | return 0; | 293 | return 0; |
294 | 294 | ||
295 | case ICMP_DEST_UNREACH: | 295 | case ICMP_DEST_UNREACH: |
296 | switch (code) { | 296 | switch (code) { |
297 | case ICMP_SR_FAILED: | 297 | case ICMP_SR_FAILED: |
298 | case ICMP_PORT_UNREACH: | 298 | case ICMP_PORT_UNREACH: |
299 | /* Impossible event. */ | 299 | /* Impossible event. */ |
300 | return 0; | 300 | return 0; |
301 | case ICMP_FRAG_NEEDED: | 301 | case ICMP_FRAG_NEEDED: |
302 | /* Soft state for pmtu is maintained by IP core. */ | 302 | /* Soft state for pmtu is maintained by IP core. */ |
303 | return 0; | 303 | return 0; |
304 | default: | 304 | default: |
305 | /* All others are translated to HOST_UNREACH. | 305 | /* All others are translated to HOST_UNREACH. |
306 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 306 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
307 | I believe they are just ether pollution. --ANK | 307 | I believe they are just ether pollution. --ANK |
308 | */ | 308 | */ |
309 | break; | 309 | break; |
310 | } | 310 | } |
311 | break; | 311 | break; |
312 | case ICMP_TIME_EXCEEDED: | 312 | case ICMP_TIME_EXCEEDED: |
313 | if (code != ICMP_EXC_TTL) | 313 | if (code != ICMP_EXC_TTL) |
314 | return 0; | 314 | return 0; |
315 | break; | 315 | break; |
316 | } | 316 | } |
317 | 317 | ||
318 | err = -ENOENT; | 318 | err = -ENOENT; |
319 | 319 | ||
320 | read_lock(&ipip_lock); | 320 | read_lock(&ipip_lock); |
321 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | 321 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); |
322 | if (t == NULL || t->parms.iph.daddr == 0) | 322 | if (t == NULL || t->parms.iph.daddr == 0) |
323 | goto out; | 323 | goto out; |
324 | 324 | ||
325 | err = 0; | 325 | err = 0; |
326 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) | 326 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) |
327 | goto out; | 327 | goto out; |
328 | 328 | ||
329 | if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) | 329 | if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) |
330 | t->err_count++; | 330 | t->err_count++; |
331 | else | 331 | else |
332 | t->err_count = 1; | 332 | t->err_count = 1; |
333 | t->err_time = jiffies; | 333 | t->err_time = jiffies; |
334 | out: | 334 | out: |
335 | read_unlock(&ipip_lock); | 335 | read_unlock(&ipip_lock); |
336 | return err; | 336 | return err; |
337 | } | 337 | } |
338 | 338 | ||
339 | static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, | 339 | static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, |
340 | struct sk_buff *skb) | 340 | struct sk_buff *skb) |
341 | { | 341 | { |
342 | struct iphdr *inner_iph = ip_hdr(skb); | 342 | struct iphdr *inner_iph = ip_hdr(skb); |
343 | 343 | ||
344 | if (INET_ECN_is_ce(outer_iph->tos)) | 344 | if (INET_ECN_is_ce(outer_iph->tos)) |
345 | IP_ECN_set_ce(inner_iph); | 345 | IP_ECN_set_ce(inner_iph); |
346 | } | 346 | } |
347 | 347 | ||
348 | static int ipip_rcv(struct sk_buff *skb) | 348 | static int ipip_rcv(struct sk_buff *skb) |
349 | { | 349 | { |
350 | struct ip_tunnel *tunnel; | 350 | struct ip_tunnel *tunnel; |
351 | const struct iphdr *iph = ip_hdr(skb); | 351 | const struct iphdr *iph = ip_hdr(skb); |
352 | 352 | ||
353 | read_lock(&ipip_lock); | 353 | read_lock(&ipip_lock); |
354 | if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), | 354 | if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), |
355 | iph->saddr, iph->daddr)) != NULL) { | 355 | iph->saddr, iph->daddr)) != NULL) { |
356 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { | 356 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
357 | read_unlock(&ipip_lock); | 357 | read_unlock(&ipip_lock); |
358 | kfree_skb(skb); | 358 | kfree_skb(skb); |
359 | return 0; | 359 | return 0; |
360 | } | 360 | } |
361 | 361 | ||
362 | secpath_reset(skb); | 362 | secpath_reset(skb); |
363 | 363 | ||
364 | skb->mac_header = skb->network_header; | 364 | skb->mac_header = skb->network_header; |
365 | skb_reset_network_header(skb); | 365 | skb_reset_network_header(skb); |
366 | skb->protocol = htons(ETH_P_IP); | 366 | skb->protocol = htons(ETH_P_IP); |
367 | skb->pkt_type = PACKET_HOST; | 367 | skb->pkt_type = PACKET_HOST; |
368 | 368 | ||
369 | tunnel->dev->stats.rx_packets++; | 369 | tunnel->dev->stats.rx_packets++; |
370 | tunnel->dev->stats.rx_bytes += skb->len; | 370 | tunnel->dev->stats.rx_bytes += skb->len; |
371 | skb->dev = tunnel->dev; | 371 | skb->dev = tunnel->dev; |
372 | dst_release(skb->dst); | 372 | dst_release(skb->dst); |
373 | skb->dst = NULL; | 373 | skb->dst = NULL; |
374 | nf_reset(skb); | 374 | nf_reset(skb); |
375 | ipip_ecn_decapsulate(iph, skb); | 375 | ipip_ecn_decapsulate(iph, skb); |
376 | netif_rx(skb); | 376 | netif_rx(skb); |
377 | read_unlock(&ipip_lock); | 377 | read_unlock(&ipip_lock); |
378 | return 0; | 378 | return 0; |
379 | } | 379 | } |
380 | read_unlock(&ipip_lock); | 380 | read_unlock(&ipip_lock); |
381 | 381 | ||
382 | return -1; | 382 | return -1; |
383 | } | 383 | } |
384 | 384 | ||
385 | /* | 385 | /* |
386 | * This function assumes it is being called from dev_queue_xmit() | 386 | * This function assumes it is being called from dev_queue_xmit() |
387 | * and that skb is filled properly by that function. | 387 | * and that skb is filled properly by that function. |
388 | */ | 388 | */ |
389 | 389 | ||
390 | static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 390 | static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
391 | { | 391 | { |
392 | struct ip_tunnel *tunnel = netdev_priv(dev); | 392 | struct ip_tunnel *tunnel = netdev_priv(dev); |
393 | struct net_device_stats *stats = &tunnel->dev->stats; | 393 | struct net_device_stats *stats = &tunnel->dev->stats; |
394 | struct iphdr *tiph = &tunnel->parms.iph; | 394 | struct iphdr *tiph = &tunnel->parms.iph; |
395 | u8 tos = tunnel->parms.iph.tos; | 395 | u8 tos = tunnel->parms.iph.tos; |
396 | __be16 df = tiph->frag_off; | 396 | __be16 df = tiph->frag_off; |
397 | struct rtable *rt; /* Route to the other host */ | 397 | struct rtable *rt; /* Route to the other host */ |
398 | struct net_device *tdev; /* Device to other host */ | 398 | struct net_device *tdev; /* Device to other host */ |
399 | struct iphdr *old_iph = ip_hdr(skb); | 399 | struct iphdr *old_iph = ip_hdr(skb); |
400 | struct iphdr *iph; /* Our new IP header */ | 400 | struct iphdr *iph; /* Our new IP header */ |
401 | unsigned int max_headroom; /* The extra header space needed */ | 401 | unsigned int max_headroom; /* The extra header space needed */ |
402 | __be32 dst = tiph->daddr; | 402 | __be32 dst = tiph->daddr; |
403 | int mtu; | 403 | int mtu; |
404 | 404 | ||
405 | if (tunnel->recursion++) { | 405 | if (tunnel->recursion++) { |
406 | stats->collisions++; | 406 | stats->collisions++; |
407 | goto tx_error; | 407 | goto tx_error; |
408 | } | 408 | } |
409 | 409 | ||
410 | if (skb->protocol != htons(ETH_P_IP)) | 410 | if (skb->protocol != htons(ETH_P_IP)) |
411 | goto tx_error; | 411 | goto tx_error; |
412 | 412 | ||
413 | if (tos&1) | 413 | if (tos&1) |
414 | tos = old_iph->tos; | 414 | tos = old_iph->tos; |
415 | 415 | ||
416 | if (!dst) { | 416 | if (!dst) { |
417 | /* NBMA tunnel */ | 417 | /* NBMA tunnel */ |
418 | if ((rt = skb->rtable) == NULL) { | 418 | if ((rt = skb->rtable) == NULL) { |
419 | stats->tx_fifo_errors++; | 419 | stats->tx_fifo_errors++; |
420 | goto tx_error; | 420 | goto tx_error; |
421 | } | 421 | } |
422 | if ((dst = rt->rt_gateway) == 0) | 422 | if ((dst = rt->rt_gateway) == 0) |
423 | goto tx_error_icmp; | 423 | goto tx_error_icmp; |
424 | } | 424 | } |
425 | 425 | ||
426 | { | 426 | { |
427 | struct flowi fl = { .oif = tunnel->parms.link, | 427 | struct flowi fl = { .oif = tunnel->parms.link, |
428 | .nl_u = { .ip4_u = | 428 | .nl_u = { .ip4_u = |
429 | { .daddr = dst, | 429 | { .daddr = dst, |
430 | .saddr = tiph->saddr, | 430 | .saddr = tiph->saddr, |
431 | .tos = RT_TOS(tos) } }, | 431 | .tos = RT_TOS(tos) } }, |
432 | .proto = IPPROTO_IPIP }; | 432 | .proto = IPPROTO_IPIP }; |
433 | if (ip_route_output_key(dev_net(dev), &rt, &fl)) { | 433 | if (ip_route_output_key(dev_net(dev), &rt, &fl)) { |
434 | stats->tx_carrier_errors++; | 434 | stats->tx_carrier_errors++; |
435 | goto tx_error_icmp; | 435 | goto tx_error_icmp; |
436 | } | 436 | } |
437 | } | 437 | } |
438 | tdev = rt->u.dst.dev; | 438 | tdev = rt->u.dst.dev; |
439 | 439 | ||
440 | if (tdev == dev) { | 440 | if (tdev == dev) { |
441 | ip_rt_put(rt); | 441 | ip_rt_put(rt); |
442 | stats->collisions++; | 442 | stats->collisions++; |
443 | goto tx_error; | 443 | goto tx_error; |
444 | } | 444 | } |
445 | 445 | ||
446 | if (tiph->frag_off) | 446 | if (tiph->frag_off) |
447 | mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); | 447 | mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); |
448 | else | 448 | else |
449 | mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; | 449 | mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; |
450 | 450 | ||
451 | if (mtu < 68) { | 451 | if (mtu < 68) { |
452 | stats->collisions++; | 452 | stats->collisions++; |
453 | ip_rt_put(rt); | 453 | ip_rt_put(rt); |
454 | goto tx_error; | 454 | goto tx_error; |
455 | } | 455 | } |
456 | if (skb->dst) | 456 | if (skb->dst) |
457 | skb->dst->ops->update_pmtu(skb->dst, mtu); | 457 | skb->dst->ops->update_pmtu(skb->dst, mtu); |
458 | 458 | ||
459 | df |= (old_iph->frag_off&htons(IP_DF)); | 459 | df |= (old_iph->frag_off&htons(IP_DF)); |
460 | 460 | ||
461 | if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { | 461 | if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { |
462 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | 462 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); |
463 | ip_rt_put(rt); | 463 | ip_rt_put(rt); |
464 | goto tx_error; | 464 | goto tx_error; |
465 | } | 465 | } |
466 | 466 | ||
467 | if (tunnel->err_count > 0) { | 467 | if (tunnel->err_count > 0) { |
468 | if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { | 468 | if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { |
469 | tunnel->err_count--; | 469 | tunnel->err_count--; |
470 | dst_link_failure(skb); | 470 | dst_link_failure(skb); |
471 | } else | 471 | } else |
472 | tunnel->err_count = 0; | 472 | tunnel->err_count = 0; |
473 | } | 473 | } |
474 | 474 | ||
475 | /* | 475 | /* |
476 | * Okay, now see if we can stuff it in the buffer as-is. | 476 | * Okay, now see if we can stuff it in the buffer as-is. |
477 | */ | 477 | */ |
478 | max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); | 478 | max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); |
479 | 479 | ||
480 | if (skb_headroom(skb) < max_headroom || skb_shared(skb) || | 480 | if (skb_headroom(skb) < max_headroom || skb_shared(skb) || |
481 | (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { | 481 | (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { |
482 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); | 482 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); |
483 | if (!new_skb) { | 483 | if (!new_skb) { |
484 | ip_rt_put(rt); | 484 | ip_rt_put(rt); |
485 | stats->tx_dropped++; | 485 | stats->tx_dropped++; |
486 | dev_kfree_skb(skb); | 486 | dev_kfree_skb(skb); |
487 | tunnel->recursion--; | 487 | tunnel->recursion--; |
488 | return 0; | 488 | return 0; |
489 | } | 489 | } |
490 | if (skb->sk) | 490 | if (skb->sk) |
491 | skb_set_owner_w(new_skb, skb->sk); | 491 | skb_set_owner_w(new_skb, skb->sk); |
492 | dev_kfree_skb(skb); | 492 | dev_kfree_skb(skb); |
493 | skb = new_skb; | 493 | skb = new_skb; |
494 | old_iph = ip_hdr(skb); | 494 | old_iph = ip_hdr(skb); |
495 | } | 495 | } |
496 | 496 | ||
497 | skb->transport_header = skb->network_header; | 497 | skb->transport_header = skb->network_header; |
498 | skb_push(skb, sizeof(struct iphdr)); | 498 | skb_push(skb, sizeof(struct iphdr)); |
499 | skb_reset_network_header(skb); | 499 | skb_reset_network_header(skb); |
500 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | 500 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); |
501 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | 501 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | |
502 | IPSKB_REROUTED); | 502 | IPSKB_REROUTED); |
503 | dst_release(skb->dst); | 503 | dst_release(skb->dst); |
504 | skb->dst = &rt->u.dst; | 504 | skb->dst = &rt->u.dst; |
505 | 505 | ||
506 | /* | 506 | /* |
507 | * Push down and install the IPIP header. | 507 | * Push down and install the IPIP header. |
508 | */ | 508 | */ |
509 | 509 | ||
510 | iph = ip_hdr(skb); | 510 | iph = ip_hdr(skb); |
511 | iph->version = 4; | 511 | iph->version = 4; |
512 | iph->ihl = sizeof(struct iphdr)>>2; | 512 | iph->ihl = sizeof(struct iphdr)>>2; |
513 | iph->frag_off = df; | 513 | iph->frag_off = df; |
514 | iph->protocol = IPPROTO_IPIP; | 514 | iph->protocol = IPPROTO_IPIP; |
515 | iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); | 515 | iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); |
516 | iph->daddr = rt->rt_dst; | 516 | iph->daddr = rt->rt_dst; |
517 | iph->saddr = rt->rt_src; | 517 | iph->saddr = rt->rt_src; |
518 | 518 | ||
519 | if ((iph->ttl = tiph->ttl) == 0) | 519 | if ((iph->ttl = tiph->ttl) == 0) |
520 | iph->ttl = old_iph->ttl; | 520 | iph->ttl = old_iph->ttl; |
521 | 521 | ||
522 | nf_reset(skb); | 522 | nf_reset(skb); |
523 | 523 | ||
524 | IPTUNNEL_XMIT(); | 524 | IPTUNNEL_XMIT(); |
525 | tunnel->recursion--; | 525 | tunnel->recursion--; |
526 | return 0; | 526 | return 0; |
527 | 527 | ||
528 | tx_error_icmp: | 528 | tx_error_icmp: |
529 | dst_link_failure(skb); | 529 | dst_link_failure(skb); |
530 | tx_error: | 530 | tx_error: |
531 | stats->tx_errors++; | 531 | stats->tx_errors++; |
532 | dev_kfree_skb(skb); | 532 | dev_kfree_skb(skb); |
533 | tunnel->recursion--; | 533 | tunnel->recursion--; |
534 | return 0; | 534 | return 0; |
535 | } | 535 | } |
536 | 536 | ||
537 | static void ipip_tunnel_bind_dev(struct net_device *dev) | 537 | static void ipip_tunnel_bind_dev(struct net_device *dev) |
538 | { | 538 | { |
539 | struct net_device *tdev = NULL; | 539 | struct net_device *tdev = NULL; |
540 | struct ip_tunnel *tunnel; | 540 | struct ip_tunnel *tunnel; |
541 | struct iphdr *iph; | 541 | struct iphdr *iph; |
542 | 542 | ||
543 | tunnel = netdev_priv(dev); | 543 | tunnel = netdev_priv(dev); |
544 | iph = &tunnel->parms.iph; | 544 | iph = &tunnel->parms.iph; |
545 | 545 | ||
546 | if (iph->daddr) { | 546 | if (iph->daddr) { |
547 | struct flowi fl = { .oif = tunnel->parms.link, | 547 | struct flowi fl = { .oif = tunnel->parms.link, |
548 | .nl_u = { .ip4_u = | 548 | .nl_u = { .ip4_u = |
549 | { .daddr = iph->daddr, | 549 | { .daddr = iph->daddr, |
550 | .saddr = iph->saddr, | 550 | .saddr = iph->saddr, |
551 | .tos = RT_TOS(iph->tos) } }, | 551 | .tos = RT_TOS(iph->tos) } }, |
552 | .proto = IPPROTO_IPIP }; | 552 | .proto = IPPROTO_IPIP }; |
553 | struct rtable *rt; | 553 | struct rtable *rt; |
554 | if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { | 554 | if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { |
555 | tdev = rt->u.dst.dev; | 555 | tdev = rt->u.dst.dev; |
556 | ip_rt_put(rt); | 556 | ip_rt_put(rt); |
557 | } | 557 | } |
558 | dev->flags |= IFF_POINTOPOINT; | 558 | dev->flags |= IFF_POINTOPOINT; |
559 | } | 559 | } |
560 | 560 | ||
561 | if (!tdev && tunnel->parms.link) | 561 | if (!tdev && tunnel->parms.link) |
562 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | 562 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); |
563 | 563 | ||
564 | if (tdev) { | 564 | if (tdev) { |
565 | dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); | 565 | dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); |
566 | dev->mtu = tdev->mtu - sizeof(struct iphdr); | 566 | dev->mtu = tdev->mtu - sizeof(struct iphdr); |
567 | } | 567 | } |
568 | dev->iflink = tunnel->parms.link; | 568 | dev->iflink = tunnel->parms.link; |
569 | } | 569 | } |
570 | 570 | ||
571 | static int | 571 | static int |
572 | ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | 572 | ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) |
573 | { | 573 | { |
574 | int err = 0; | 574 | int err = 0; |
575 | struct ip_tunnel_parm p; | 575 | struct ip_tunnel_parm p; |
576 | struct ip_tunnel *t; | 576 | struct ip_tunnel *t; |
577 | struct net *net = dev_net(dev); | 577 | struct net *net = dev_net(dev); |
578 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 578 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
579 | 579 | ||
580 | switch (cmd) { | 580 | switch (cmd) { |
581 | case SIOCGETTUNNEL: | 581 | case SIOCGETTUNNEL: |
582 | t = NULL; | 582 | t = NULL; |
583 | if (dev == ipn->fb_tunnel_dev) { | 583 | if (dev == ipn->fb_tunnel_dev) { |
584 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { | 584 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { |
585 | err = -EFAULT; | 585 | err = -EFAULT; |
586 | break; | 586 | break; |
587 | } | 587 | } |
588 | t = ipip_tunnel_locate(net, &p, 0); | 588 | t = ipip_tunnel_locate(net, &p, 0); |
589 | } | 589 | } |
590 | if (t == NULL) | 590 | if (t == NULL) |
591 | t = netdev_priv(dev); | 591 | t = netdev_priv(dev); |
592 | memcpy(&p, &t->parms, sizeof(p)); | 592 | memcpy(&p, &t->parms, sizeof(p)); |
593 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | 593 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) |
594 | err = -EFAULT; | 594 | err = -EFAULT; |
595 | break; | 595 | break; |
596 | 596 | ||
597 | case SIOCADDTUNNEL: | 597 | case SIOCADDTUNNEL: |
598 | case SIOCCHGTUNNEL: | 598 | case SIOCCHGTUNNEL: |
599 | err = -EPERM; | 599 | err = -EPERM; |
600 | if (!capable(CAP_NET_ADMIN)) | 600 | if (!capable(CAP_NET_ADMIN)) |
601 | goto done; | 601 | goto done; |
602 | 602 | ||
603 | err = -EFAULT; | 603 | err = -EFAULT; |
604 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | 604 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
605 | goto done; | 605 | goto done; |
606 | 606 | ||
607 | err = -EINVAL; | 607 | err = -EINVAL; |
608 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || | 608 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || |
609 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) | 609 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) |
610 | goto done; | 610 | goto done; |
611 | if (p.iph.ttl) | 611 | if (p.iph.ttl) |
612 | p.iph.frag_off |= htons(IP_DF); | 612 | p.iph.frag_off |= htons(IP_DF); |
613 | 613 | ||
614 | t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | 614 | t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); |
615 | 615 | ||
616 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | 616 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { |
617 | if (t != NULL) { | 617 | if (t != NULL) { |
618 | if (t->dev != dev) { | 618 | if (t->dev != dev) { |
619 | err = -EEXIST; | 619 | err = -EEXIST; |
620 | break; | 620 | break; |
621 | } | 621 | } |
622 | } else { | 622 | } else { |
623 | if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || | 623 | if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || |
624 | (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { | 624 | (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { |
625 | err = -EINVAL; | 625 | err = -EINVAL; |
626 | break; | 626 | break; |
627 | } | 627 | } |
628 | t = netdev_priv(dev); | 628 | t = netdev_priv(dev); |
629 | ipip_tunnel_unlink(ipn, t); | 629 | ipip_tunnel_unlink(ipn, t); |
630 | t->parms.iph.saddr = p.iph.saddr; | 630 | t->parms.iph.saddr = p.iph.saddr; |
631 | t->parms.iph.daddr = p.iph.daddr; | 631 | t->parms.iph.daddr = p.iph.daddr; |
632 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | 632 | memcpy(dev->dev_addr, &p.iph.saddr, 4); |
633 | memcpy(dev->broadcast, &p.iph.daddr, 4); | 633 | memcpy(dev->broadcast, &p.iph.daddr, 4); |
634 | ipip_tunnel_link(ipn, t); | 634 | ipip_tunnel_link(ipn, t); |
635 | netdev_state_change(dev); | 635 | netdev_state_change(dev); |
636 | } | 636 | } |
637 | } | 637 | } |
638 | 638 | ||
639 | if (t) { | 639 | if (t) { |
640 | err = 0; | 640 | err = 0; |
641 | if (cmd == SIOCCHGTUNNEL) { | 641 | if (cmd == SIOCCHGTUNNEL) { |
642 | t->parms.iph.ttl = p.iph.ttl; | 642 | t->parms.iph.ttl = p.iph.ttl; |
643 | t->parms.iph.tos = p.iph.tos; | 643 | t->parms.iph.tos = p.iph.tos; |
644 | t->parms.iph.frag_off = p.iph.frag_off; | 644 | t->parms.iph.frag_off = p.iph.frag_off; |
645 | if (t->parms.link != p.link) { | 645 | if (t->parms.link != p.link) { |
646 | t->parms.link = p.link; | 646 | t->parms.link = p.link; |
647 | ipip_tunnel_bind_dev(dev); | 647 | ipip_tunnel_bind_dev(dev); |
648 | netdev_state_change(dev); | 648 | netdev_state_change(dev); |
649 | } | 649 | } |
650 | } | 650 | } |
651 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) | 651 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) |
652 | err = -EFAULT; | 652 | err = -EFAULT; |
653 | } else | 653 | } else |
654 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | 654 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); |
655 | break; | 655 | break; |
656 | 656 | ||
657 | case SIOCDELTUNNEL: | 657 | case SIOCDELTUNNEL: |
658 | err = -EPERM; | 658 | err = -EPERM; |
659 | if (!capable(CAP_NET_ADMIN)) | 659 | if (!capable(CAP_NET_ADMIN)) |
660 | goto done; | 660 | goto done; |
661 | 661 | ||
662 | if (dev == ipn->fb_tunnel_dev) { | 662 | if (dev == ipn->fb_tunnel_dev) { |
663 | err = -EFAULT; | 663 | err = -EFAULT; |
664 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | 664 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
665 | goto done; | 665 | goto done; |
666 | err = -ENOENT; | 666 | err = -ENOENT; |
667 | if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) | 667 | if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) |
668 | goto done; | 668 | goto done; |
669 | err = -EPERM; | 669 | err = -EPERM; |
670 | if (t->dev == ipn->fb_tunnel_dev) | 670 | if (t->dev == ipn->fb_tunnel_dev) |
671 | goto done; | 671 | goto done; |
672 | dev = t->dev; | 672 | dev = t->dev; |
673 | } | 673 | } |
674 | unregister_netdevice(dev); | 674 | unregister_netdevice(dev); |
675 | err = 0; | 675 | err = 0; |
676 | break; | 676 | break; |
677 | 677 | ||
678 | default: | 678 | default: |
679 | err = -EINVAL; | 679 | err = -EINVAL; |
680 | } | 680 | } |
681 | 681 | ||
682 | done: | 682 | done: |
683 | return err; | 683 | return err; |
684 | } | 684 | } |
685 | 685 | ||
686 | static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) | 686 | static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) |
687 | { | 687 | { |
688 | if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) | 688 | if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) |
689 | return -EINVAL; | 689 | return -EINVAL; |
690 | dev->mtu = new_mtu; | 690 | dev->mtu = new_mtu; |
691 | return 0; | 691 | return 0; |
692 | } | 692 | } |
693 | 693 | ||
694 | static void ipip_tunnel_setup(struct net_device *dev) | 694 | static void ipip_tunnel_setup(struct net_device *dev) |
695 | { | 695 | { |
696 | dev->uninit = ipip_tunnel_uninit; | 696 | dev->uninit = ipip_tunnel_uninit; |
697 | dev->hard_start_xmit = ipip_tunnel_xmit; | 697 | dev->hard_start_xmit = ipip_tunnel_xmit; |
698 | dev->do_ioctl = ipip_tunnel_ioctl; | 698 | dev->do_ioctl = ipip_tunnel_ioctl; |
699 | dev->change_mtu = ipip_tunnel_change_mtu; | 699 | dev->change_mtu = ipip_tunnel_change_mtu; |
700 | dev->destructor = free_netdev; | 700 | dev->destructor = free_netdev; |
701 | 701 | ||
702 | dev->type = ARPHRD_TUNNEL; | 702 | dev->type = ARPHRD_TUNNEL; |
703 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | 703 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); |
704 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); | 704 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); |
705 | dev->flags = IFF_NOARP; | 705 | dev->flags = IFF_NOARP; |
706 | dev->iflink = 0; | 706 | dev->iflink = 0; |
707 | dev->addr_len = 4; | 707 | dev->addr_len = 4; |
708 | dev->features |= NETIF_F_NETNS_LOCAL; | 708 | dev->features |= NETIF_F_NETNS_LOCAL; |
709 | } | 709 | } |
710 | 710 | ||
711 | static int ipip_tunnel_init(struct net_device *dev) | 711 | static int ipip_tunnel_init(struct net_device *dev) |
712 | { | 712 | { |
713 | struct ip_tunnel *tunnel; | 713 | struct ip_tunnel *tunnel; |
714 | 714 | ||
715 | tunnel = netdev_priv(dev); | 715 | tunnel = netdev_priv(dev); |
716 | 716 | ||
717 | tunnel->dev = dev; | 717 | tunnel->dev = dev; |
718 | strcpy(tunnel->parms.name, dev->name); | 718 | strcpy(tunnel->parms.name, dev->name); |
719 | 719 | ||
720 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | 720 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); |
721 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | 721 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); |
722 | 722 | ||
723 | ipip_tunnel_bind_dev(dev); | 723 | ipip_tunnel_bind_dev(dev); |
724 | 724 | ||
725 | return 0; | 725 | return 0; |
726 | } | 726 | } |
727 | 727 | ||
728 | static int ipip_fb_tunnel_init(struct net_device *dev) | 728 | static int ipip_fb_tunnel_init(struct net_device *dev) |
729 | { | 729 | { |
730 | struct ip_tunnel *tunnel = netdev_priv(dev); | 730 | struct ip_tunnel *tunnel = netdev_priv(dev); |
731 | struct iphdr *iph = &tunnel->parms.iph; | 731 | struct iphdr *iph = &tunnel->parms.iph; |
732 | struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); | 732 | struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); |
733 | 733 | ||
734 | tunnel->dev = dev; | 734 | tunnel->dev = dev; |
735 | strcpy(tunnel->parms.name, dev->name); | 735 | strcpy(tunnel->parms.name, dev->name); |
736 | 736 | ||
737 | iph->version = 4; | 737 | iph->version = 4; |
738 | iph->protocol = IPPROTO_IPIP; | 738 | iph->protocol = IPPROTO_IPIP; |
739 | iph->ihl = 5; | 739 | iph->ihl = 5; |
740 | 740 | ||
741 | dev_hold(dev); | 741 | dev_hold(dev); |
742 | ipn->tunnels_wc[0] = tunnel; | 742 | ipn->tunnels_wc[0] = tunnel; |
743 | return 0; | 743 | return 0; |
744 | } | 744 | } |
745 | 745 | ||
746 | static struct xfrm_tunnel ipip_handler = { | 746 | static struct xfrm_tunnel ipip_handler = { |
747 | .handler = ipip_rcv, | 747 | .handler = ipip_rcv, |
748 | .err_handler = ipip_err, | 748 | .err_handler = ipip_err, |
749 | .priority = 1, | 749 | .priority = 1, |
750 | }; | 750 | }; |
751 | 751 | ||
752 | static char banner[] __initdata = | 752 | static char banner[] __initdata = |
753 | KERN_INFO "IPv4 over IPv4 tunneling driver\n"; | 753 | KERN_INFO "IPv4 over IPv4 tunneling driver\n"; |
754 | 754 | ||
755 | static void ipip_destroy_tunnels(struct ipip_net *ipn) | 755 | static void ipip_destroy_tunnels(struct ipip_net *ipn) |
756 | { | 756 | { |
757 | int prio; | 757 | int prio; |
758 | 758 | ||
759 | for (prio = 1; prio < 4; prio++) { | 759 | for (prio = 1; prio < 4; prio++) { |
760 | int h; | 760 | int h; |
761 | for (h = 0; h < HASH_SIZE; h++) { | 761 | for (h = 0; h < HASH_SIZE; h++) { |
762 | struct ip_tunnel *t; | 762 | struct ip_tunnel *t; |
763 | while ((t = ipn->tunnels[prio][h]) != NULL) | 763 | while ((t = ipn->tunnels[prio][h]) != NULL) |
764 | unregister_netdevice(t->dev); | 764 | unregister_netdevice(t->dev); |
765 | } | 765 | } |
766 | } | 766 | } |
767 | } | 767 | } |
768 | 768 | ||
769 | static int ipip_init_net(struct net *net) | 769 | static int ipip_init_net(struct net *net) |
770 | { | 770 | { |
771 | int err; | 771 | int err; |
772 | struct ipip_net *ipn; | 772 | struct ipip_net *ipn; |
773 | 773 | ||
774 | err = -ENOMEM; | 774 | err = -ENOMEM; |
775 | ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); | 775 | ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); |
776 | if (ipn == NULL) | 776 | if (ipn == NULL) |
777 | goto err_alloc; | 777 | goto err_alloc; |
778 | 778 | ||
779 | err = net_assign_generic(net, ipip_net_id, ipn); | 779 | err = net_assign_generic(net, ipip_net_id, ipn); |
780 | if (err < 0) | 780 | if (err < 0) |
781 | goto err_assign; | 781 | goto err_assign; |
782 | 782 | ||
783 | ipn->tunnels[0] = ipn->tunnels_wc; | 783 | ipn->tunnels[0] = ipn->tunnels_wc; |
784 | ipn->tunnels[1] = ipn->tunnels_l; | 784 | ipn->tunnels[1] = ipn->tunnels_l; |
785 | ipn->tunnels[2] = ipn->tunnels_r; | 785 | ipn->tunnels[2] = ipn->tunnels_r; |
786 | ipn->tunnels[3] = ipn->tunnels_r_l; | 786 | ipn->tunnels[3] = ipn->tunnels_r_l; |
787 | 787 | ||
788 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), | 788 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), |
789 | "tunl0", | 789 | "tunl0", |
790 | ipip_tunnel_setup); | 790 | ipip_tunnel_setup); |
791 | if (!ipn->fb_tunnel_dev) { | 791 | if (!ipn->fb_tunnel_dev) { |
792 | err = -ENOMEM; | 792 | err = -ENOMEM; |
793 | goto err_alloc_dev; | 793 | goto err_alloc_dev; |
794 | } | 794 | } |
795 | 795 | ||
796 | ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init; | 796 | ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init; |
797 | dev_net_set(ipn->fb_tunnel_dev, net); | 797 | dev_net_set(ipn->fb_tunnel_dev, net); |
798 | 798 | ||
799 | if ((err = register_netdev(ipn->fb_tunnel_dev))) | 799 | if ((err = register_netdev(ipn->fb_tunnel_dev))) |
800 | goto err_reg_dev; | 800 | goto err_reg_dev; |
801 | 801 | ||
802 | return 0; | 802 | return 0; |
803 | 803 | ||
804 | err_reg_dev: | 804 | err_reg_dev: |
805 | free_netdev(ipn->fb_tunnel_dev); | 805 | free_netdev(ipn->fb_tunnel_dev); |
806 | err_alloc_dev: | 806 | err_alloc_dev: |
807 | /* nothing */ | 807 | /* nothing */ |
808 | err_assign: | 808 | err_assign: |
809 | kfree(ipn); | 809 | kfree(ipn); |
810 | err_alloc: | 810 | err_alloc: |
811 | return err; | 811 | return err; |
812 | } | 812 | } |
813 | 813 | ||
814 | static void ipip_exit_net(struct net *net) | 814 | static void ipip_exit_net(struct net *net) |
815 | { | 815 | { |
816 | struct ipip_net *ipn; | 816 | struct ipip_net *ipn; |
817 | 817 | ||
818 | ipn = net_generic(net, ipip_net_id); | 818 | ipn = net_generic(net, ipip_net_id); |
819 | rtnl_lock(); | 819 | rtnl_lock(); |
820 | ipip_destroy_tunnels(ipn); | 820 | ipip_destroy_tunnels(ipn); |
821 | unregister_netdevice(ipn->fb_tunnel_dev); | 821 | unregister_netdevice(ipn->fb_tunnel_dev); |
822 | rtnl_unlock(); | 822 | rtnl_unlock(); |
823 | kfree(ipn); | 823 | kfree(ipn); |
824 | } | 824 | } |
825 | 825 | ||
826 | static struct pernet_operations ipip_net_ops = { | 826 | static struct pernet_operations ipip_net_ops = { |
827 | .init = ipip_init_net, | 827 | .init = ipip_init_net, |
828 | .exit = ipip_exit_net, | 828 | .exit = ipip_exit_net, |
829 | }; | 829 | }; |
830 | 830 | ||
831 | static int __init ipip_init(void) | 831 | static int __init ipip_init(void) |
832 | { | 832 | { |
833 | int err; | 833 | int err; |
834 | 834 | ||
835 | printk(banner); | 835 | printk(banner); |
836 | 836 | ||
837 | if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { | 837 | if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { |
838 | printk(KERN_INFO "ipip init: can't register tunnel\n"); | 838 | printk(KERN_INFO "ipip init: can't register tunnel\n"); |
839 | return -EAGAIN; | 839 | return -EAGAIN; |
840 | } | 840 | } |
841 | 841 | ||
842 | err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); | 842 | err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); |
843 | if (err) | 843 | if (err) |
844 | xfrm4_tunnel_deregister(&ipip_handler, AF_INET); | 844 | xfrm4_tunnel_deregister(&ipip_handler, AF_INET); |
845 | 845 | ||
846 | return err; | 846 | return err; |
847 | } | 847 | } |
848 | 848 | ||
849 | static void __exit ipip_fini(void) | 849 | static void __exit ipip_fini(void) |
850 | { | 850 | { |
851 | if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) | 851 | if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) |
852 | printk(KERN_INFO "ipip close: can't deregister tunnel\n"); | 852 | printk(KERN_INFO "ipip close: can't deregister tunnel\n"); |
853 | 853 | ||
854 | unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); | 854 | unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); |
855 | } | 855 | } |
856 | 856 | ||
857 | module_init(ipip_init); | 857 | module_init(ipip_init); |
858 | module_exit(ipip_fini); | 858 | module_exit(ipip_fini); |
859 | MODULE_LICENSE("GPL"); | 859 | MODULE_LICENSE("GPL"); |
860 | 860 |
net/ipv4/raw.c
1 | /* | 1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket | 3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. | 4 | * interface as the means of communication with the user level. |
5 | * | 5 | * |
6 | * RAW - implementation of IP "raw" sockets. | 6 | * RAW - implementation of IP "raw" sockets. |
7 | * | 7 | * |
8 | * Authors: Ross Biro | 8 | * Authors: Ross Biro |
9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
10 | * | 10 | * |
11 | * Fixes: | 11 | * Fixes: |
12 | * Alan Cox : verify_area() fixed up | 12 | * Alan Cox : verify_area() fixed up |
13 | * Alan Cox : ICMP error handling | 13 | * Alan Cox : ICMP error handling |
14 | * Alan Cox : EMSGSIZE if you send too big a packet | 14 | * Alan Cox : EMSGSIZE if you send too big a packet |
15 | * Alan Cox : Now uses generic datagrams and shared | 15 | * Alan Cox : Now uses generic datagrams and shared |
16 | * skbuff library. No more peek crashes, | 16 | * skbuff library. No more peek crashes, |
17 | * no more backlogs | 17 | * no more backlogs |
18 | * Alan Cox : Checks sk->broadcast. | 18 | * Alan Cox : Checks sk->broadcast. |
19 | * Alan Cox : Uses skb_free_datagram/skb_copy_datagram | 19 | * Alan Cox : Uses skb_free_datagram/skb_copy_datagram |
20 | * Alan Cox : Raw passes ip options too | 20 | * Alan Cox : Raw passes ip options too |
21 | * Alan Cox : Setsocketopt added | 21 | * Alan Cox : Setsocketopt added |
22 | * Alan Cox : Fixed error return for broadcasts | 22 | * Alan Cox : Fixed error return for broadcasts |
23 | * Alan Cox : Removed wake_up calls | 23 | * Alan Cox : Removed wake_up calls |
24 | * Alan Cox : Use ttl/tos | 24 | * Alan Cox : Use ttl/tos |
25 | * Alan Cox : Cleaned up old debugging | 25 | * Alan Cox : Cleaned up old debugging |
26 | * Alan Cox : Use new kernel side addresses | 26 | * Alan Cox : Use new kernel side addresses |
27 | * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. | 27 | * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. |
28 | * Alan Cox : BSD style RAW socket demultiplexing. | 28 | * Alan Cox : BSD style RAW socket demultiplexing. |
29 | * Alan Cox : Beginnings of mrouted support. | 29 | * Alan Cox : Beginnings of mrouted support. |
30 | * Alan Cox : Added IP_HDRINCL option. | 30 | * Alan Cox : Added IP_HDRINCL option. |
31 | * Alan Cox : Skip broadcast check if BSDism set. | 31 | * Alan Cox : Skip broadcast check if BSDism set. |
32 | * David S. Miller : New socket lookup architecture. | 32 | * David S. Miller : New socket lookup architecture. |
33 | * | 33 | * |
34 | * This program is free software; you can redistribute it and/or | 34 | * This program is free software; you can redistribute it and/or |
35 | * modify it under the terms of the GNU General Public License | 35 | * modify it under the terms of the GNU General Public License |
36 | * as published by the Free Software Foundation; either version | 36 | * as published by the Free Software Foundation; either version |
37 | * 2 of the License, or (at your option) any later version. | 37 | * 2 of the License, or (at your option) any later version. |
38 | */ | 38 | */ |
39 | 39 | ||
40 | #include <linux/types.h> | 40 | #include <linux/types.h> |
41 | #include <asm/atomic.h> | 41 | #include <asm/atomic.h> |
42 | #include <asm/byteorder.h> | 42 | #include <asm/byteorder.h> |
43 | #include <asm/current.h> | 43 | #include <asm/current.h> |
44 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
45 | #include <asm/ioctls.h> | 45 | #include <asm/ioctls.h> |
46 | #include <linux/stddef.h> | 46 | #include <linux/stddef.h> |
47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
48 | #include <linux/errno.h> | 48 | #include <linux/errno.h> |
49 | #include <linux/aio.h> | 49 | #include <linux/aio.h> |
50 | #include <linux/kernel.h> | 50 | #include <linux/kernel.h> |
51 | #include <linux/spinlock.h> | 51 | #include <linux/spinlock.h> |
52 | #include <linux/sockios.h> | 52 | #include <linux/sockios.h> |
53 | #include <linux/socket.h> | 53 | #include <linux/socket.h> |
54 | #include <linux/in.h> | 54 | #include <linux/in.h> |
55 | #include <linux/mroute.h> | 55 | #include <linux/mroute.h> |
56 | #include <linux/netdevice.h> | 56 | #include <linux/netdevice.h> |
57 | #include <linux/in_route.h> | 57 | #include <linux/in_route.h> |
58 | #include <linux/route.h> | 58 | #include <linux/route.h> |
59 | #include <linux/skbuff.h> | 59 | #include <linux/skbuff.h> |
60 | #include <net/net_namespace.h> | 60 | #include <net/net_namespace.h> |
61 | #include <net/dst.h> | 61 | #include <net/dst.h> |
62 | #include <net/sock.h> | 62 | #include <net/sock.h> |
63 | #include <linux/gfp.h> | 63 | #include <linux/gfp.h> |
64 | #include <linux/ip.h> | 64 | #include <linux/ip.h> |
65 | #include <linux/net.h> | 65 | #include <linux/net.h> |
66 | #include <net/ip.h> | 66 | #include <net/ip.h> |
67 | #include <net/icmp.h> | 67 | #include <net/icmp.h> |
68 | #include <net/udp.h> | 68 | #include <net/udp.h> |
69 | #include <net/raw.h> | 69 | #include <net/raw.h> |
70 | #include <net/snmp.h> | 70 | #include <net/snmp.h> |
71 | #include <net/tcp_states.h> | 71 | #include <net/tcp_states.h> |
72 | #include <net/inet_common.h> | 72 | #include <net/inet_common.h> |
73 | #include <net/checksum.h> | 73 | #include <net/checksum.h> |
74 | #include <net/xfrm.h> | 74 | #include <net/xfrm.h> |
75 | #include <linux/rtnetlink.h> | 75 | #include <linux/rtnetlink.h> |
76 | #include <linux/proc_fs.h> | 76 | #include <linux/proc_fs.h> |
77 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
78 | #include <linux/netfilter.h> | 78 | #include <linux/netfilter.h> |
79 | #include <linux/netfilter_ipv4.h> | 79 | #include <linux/netfilter_ipv4.h> |
80 | 80 | ||
81 | static struct raw_hashinfo raw_v4_hashinfo = { | 81 | static struct raw_hashinfo raw_v4_hashinfo = { |
82 | .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), | 82 | .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), |
83 | }; | 83 | }; |
84 | 84 | ||
85 | void raw_hash_sk(struct sock *sk) | 85 | void raw_hash_sk(struct sock *sk) |
86 | { | 86 | { |
87 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; | 87 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; |
88 | struct hlist_head *head; | 88 | struct hlist_head *head; |
89 | 89 | ||
90 | head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; | 90 | head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; |
91 | 91 | ||
92 | write_lock_bh(&h->lock); | 92 | write_lock_bh(&h->lock); |
93 | sk_add_node(sk, head); | 93 | sk_add_node(sk, head); |
94 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 94 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
95 | write_unlock_bh(&h->lock); | 95 | write_unlock_bh(&h->lock); |
96 | } | 96 | } |
97 | EXPORT_SYMBOL_GPL(raw_hash_sk); | 97 | EXPORT_SYMBOL_GPL(raw_hash_sk); |
98 | 98 | ||
99 | void raw_unhash_sk(struct sock *sk) | 99 | void raw_unhash_sk(struct sock *sk) |
100 | { | 100 | { |
101 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; | 101 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; |
102 | 102 | ||
103 | write_lock_bh(&h->lock); | 103 | write_lock_bh(&h->lock); |
104 | if (sk_del_node_init(sk)) | 104 | if (sk_del_node_init(sk)) |
105 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 105 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
106 | write_unlock_bh(&h->lock); | 106 | write_unlock_bh(&h->lock); |
107 | } | 107 | } |
108 | EXPORT_SYMBOL_GPL(raw_unhash_sk); | 108 | EXPORT_SYMBOL_GPL(raw_unhash_sk); |
109 | 109 | ||
110 | static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, | 110 | static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, |
111 | unsigned short num, __be32 raddr, __be32 laddr, int dif) | 111 | unsigned short num, __be32 raddr, __be32 laddr, int dif) |
112 | { | 112 | { |
113 | struct hlist_node *node; | 113 | struct hlist_node *node; |
114 | 114 | ||
115 | sk_for_each_from(sk, node) { | 115 | sk_for_each_from(sk, node) { |
116 | struct inet_sock *inet = inet_sk(sk); | 116 | struct inet_sock *inet = inet_sk(sk); |
117 | 117 | ||
118 | if (net_eq(sock_net(sk), net) && inet->num == num && | 118 | if (net_eq(sock_net(sk), net) && inet->num == num && |
119 | !(inet->daddr && inet->daddr != raddr) && | 119 | !(inet->daddr && inet->daddr != raddr) && |
120 | !(inet->rcv_saddr && inet->rcv_saddr != laddr) && | 120 | !(inet->rcv_saddr && inet->rcv_saddr != laddr) && |
121 | !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) | 121 | !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) |
122 | goto found; /* gotcha */ | 122 | goto found; /* gotcha */ |
123 | } | 123 | } |
124 | sk = NULL; | 124 | sk = NULL; |
125 | found: | 125 | found: |
126 | return sk; | 126 | return sk; |
127 | } | 127 | } |
128 | 128 | ||
129 | /* | 129 | /* |
130 | * 0 - deliver | 130 | * 0 - deliver |
131 | * 1 - block | 131 | * 1 - block |
132 | */ | 132 | */ |
133 | static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) | 133 | static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) |
134 | { | 134 | { |
135 | int type; | 135 | int type; |
136 | 136 | ||
137 | if (!pskb_may_pull(skb, sizeof(struct icmphdr))) | 137 | if (!pskb_may_pull(skb, sizeof(struct icmphdr))) |
138 | return 1; | 138 | return 1; |
139 | 139 | ||
140 | type = icmp_hdr(skb)->type; | 140 | type = icmp_hdr(skb)->type; |
141 | if (type < 32) { | 141 | if (type < 32) { |
142 | __u32 data = raw_sk(sk)->filter.data; | 142 | __u32 data = raw_sk(sk)->filter.data; |
143 | 143 | ||
144 | return ((1 << type) & data) != 0; | 144 | return ((1 << type) & data) != 0; |
145 | } | 145 | } |
146 | 146 | ||
147 | /* Do not block unknown ICMP types */ | 147 | /* Do not block unknown ICMP types */ |
148 | return 0; | 148 | return 0; |
149 | } | 149 | } |
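
The filter above treats raw_sk(sk)->filter.data as a bitmask indexed by ICMP type, with a set bit meaning "block". From userspace this is driven through the ICMP_FILTER option at SOL_RAW. A minimal sketch of the ping-style policy (assumes CAP_NET_RAW; error handling trimmed):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/icmp.h>      /* struct icmp_filter, ICMP_FILTER, ICMP_ECHOREPLY */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);   /* needs CAP_NET_RAW */
        struct icmp_filter filt;

        if (fd < 0) { perror("socket"); return 1; }

        /* A set bit blocks that ICMP type; this mask keeps only echo replies,
         * the classic ping(8) setup.  Types >= 32 are never blocked by the
         * kernel-side check above. */
        filt.data = ~(1U << ICMP_ECHOREPLY);
        if (setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt)) < 0)
            perror("ICMP_FILTER");
        return 0;
    }
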
150 | 150 | ||
151 | /* IP input processing comes here for RAW socket delivery. | 151 | /* IP input processing comes here for RAW socket delivery. |
152 | * Caller owns SKB, so we must make clones. | 152 | * Caller owns SKB, so we must make clones. |
153 | * | 153 | * |
154 | * RFC 1122: SHOULD pass TOS value up to the transport layer. | 154 | * RFC 1122: SHOULD pass TOS value up to the transport layer. |
155 | * -> It does. And not only TOS, but all IP header. | 155 | * -> It does. And not only TOS, but all IP header. |
156 | */ | 156 | */ |
157 | static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | 157 | static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) |
158 | { | 158 | { |
159 | struct sock *sk; | 159 | struct sock *sk; |
160 | struct hlist_head *head; | 160 | struct hlist_head *head; |
161 | int delivered = 0; | 161 | int delivered = 0; |
162 | struct net *net; | 162 | struct net *net; |
163 | 163 | ||
164 | read_lock(&raw_v4_hashinfo.lock); | 164 | read_lock(&raw_v4_hashinfo.lock); |
165 | head = &raw_v4_hashinfo.ht[hash]; | 165 | head = &raw_v4_hashinfo.ht[hash]; |
166 | if (hlist_empty(head)) | 166 | if (hlist_empty(head)) |
167 | goto out; | 167 | goto out; |
168 | 168 | ||
169 | net = dev_net(skb->dev); | 169 | net = dev_net(skb->dev); |
170 | sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, | 170 | sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, |
171 | iph->saddr, iph->daddr, | 171 | iph->saddr, iph->daddr, |
172 | skb->dev->ifindex); | 172 | skb->dev->ifindex); |
173 | 173 | ||
174 | while (sk) { | 174 | while (sk) { |
175 | delivered = 1; | 175 | delivered = 1; |
176 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { | 176 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { |
177 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); | 177 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); |
178 | 178 | ||
179 | /* Not releasing hash table! */ | 179 | /* Not releasing hash table! */ |
180 | if (clone) | 180 | if (clone) |
181 | raw_rcv(sk, clone); | 181 | raw_rcv(sk, clone); |
182 | } | 182 | } |
183 | sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, | 183 | sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, |
184 | iph->saddr, iph->daddr, | 184 | iph->saddr, iph->daddr, |
185 | skb->dev->ifindex); | 185 | skb->dev->ifindex); |
186 | } | 186 | } |
187 | out: | 187 | out: |
188 | read_unlock(&raw_v4_hashinfo.lock); | 188 | read_unlock(&raw_v4_hashinfo.lock); |
189 | return delivered; | 189 | return delivered; |
190 | } | 190 | } |
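
As the comment above says, raw IPv4 delivery hands the complete IP header up to every matching socket, each of which receives its own clone of the skb. A hedged userspace sketch of reading one such datagram (assumes CAP_NET_RAW; run ping elsewhere to generate ICMP traffic):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip.h>      /* struct iphdr */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        unsigned char buf[1500];
        ssize_t n;

        if (fd < 0) return 1;
        n = recv(fd, buf, sizeof(buf), 0);   /* blocks until an ICMP datagram arrives */
        if (n >= (ssize_t)sizeof(struct iphdr)) {
            /* IPv4 raw sockets include the IP header, so TOS and friends are visible */
            struct iphdr *iph = (struct iphdr *)buf;
            printf("proto %u, tos 0x%02x, %zd bytes total\n",
                   (unsigned)iph->protocol, (unsigned)iph->tos, n);
        }
        return 0;
    }
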
191 | 191 | ||
192 | int raw_local_deliver(struct sk_buff *skb, int protocol) | 192 | int raw_local_deliver(struct sk_buff *skb, int protocol) |
193 | { | 193 | { |
194 | int hash; | 194 | int hash; |
195 | struct sock *raw_sk; | 195 | struct sock *raw_sk; |
196 | 196 | ||
197 | hash = protocol & (RAW_HTABLE_SIZE - 1); | 197 | hash = protocol & (RAW_HTABLE_SIZE - 1); |
198 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); | 198 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); |
199 | 199 | ||
200 | /* If there may be a raw socket we must check - if not we | 200 | /* If there may be a raw socket we must check - if not we |
201 | * couldn't care less | 201 | * couldn't care less |
202 | */ | 202 | */ |
203 | if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) | 203 | if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) |
204 | raw_sk = NULL; | 204 | raw_sk = NULL; |
205 | 205 | ||
206 | return raw_sk != NULL; | 206 | return raw_sk != NULL; |
207 | 207 | ||
208 | } | 208 | } |
209 | 209 | ||
210 | static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) | 210 | static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) |
211 | { | 211 | { |
212 | struct inet_sock *inet = inet_sk(sk); | 212 | struct inet_sock *inet = inet_sk(sk); |
213 | const int type = icmp_hdr(skb)->type; | 213 | const int type = icmp_hdr(skb)->type; |
214 | const int code = icmp_hdr(skb)->code; | 214 | const int code = icmp_hdr(skb)->code; |
215 | int err = 0; | 215 | int err = 0; |
216 | int harderr = 0; | 216 | int harderr = 0; |
217 | 217 | ||
218 | /* Report error on raw socket, if: | 218 | /* Report error on raw socket, if: |
219 | 1. User requested ip_recverr. | 219 | 1. User requested ip_recverr. |
220 | 2. Socket is connected (otherwise the error indication | 220 | 2. Socket is connected (otherwise the error indication |
221 | is useless without ip_recverr and the error is hard). | 221 | is useless without ip_recverr and the error is hard). |
222 | */ | 222 | */ |
223 | if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) | 223 | if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) |
224 | return; | 224 | return; |
225 | 225 | ||
226 | switch (type) { | 226 | switch (type) { |
227 | default: | 227 | default: |
228 | case ICMP_TIME_EXCEEDED: | 228 | case ICMP_TIME_EXCEEDED: |
229 | err = EHOSTUNREACH; | 229 | err = EHOSTUNREACH; |
230 | break; | 230 | break; |
231 | case ICMP_SOURCE_QUENCH: | 231 | case ICMP_SOURCE_QUENCH: |
232 | return; | 232 | return; |
233 | case ICMP_PARAMETERPROB: | 233 | case ICMP_PARAMETERPROB: |
234 | err = EPROTO; | 234 | err = EPROTO; |
235 | harderr = 1; | 235 | harderr = 1; |
236 | break; | 236 | break; |
237 | case ICMP_DEST_UNREACH: | 237 | case ICMP_DEST_UNREACH: |
238 | err = EHOSTUNREACH; | 238 | err = EHOSTUNREACH; |
239 | if (code > NR_ICMP_UNREACH) | 239 | if (code > NR_ICMP_UNREACH) |
240 | break; | 240 | break; |
241 | err = icmp_err_convert[code].errno; | 241 | err = icmp_err_convert[code].errno; |
242 | harderr = icmp_err_convert[code].fatal; | 242 | harderr = icmp_err_convert[code].fatal; |
243 | if (code == ICMP_FRAG_NEEDED) { | 243 | if (code == ICMP_FRAG_NEEDED) { |
244 | harderr = inet->pmtudisc != IP_PMTUDISC_DONT; | 244 | harderr = inet->pmtudisc != IP_PMTUDISC_DONT; |
245 | err = EMSGSIZE; | 245 | err = EMSGSIZE; |
246 | } | 246 | } |
247 | } | 247 | } |
248 | 248 | ||
249 | if (inet->recverr) { | 249 | if (inet->recverr) { |
250 | struct iphdr *iph = (struct iphdr*)skb->data; | 250 | struct iphdr *iph = (struct iphdr *)skb->data; |
251 | u8 *payload = skb->data + (iph->ihl << 2); | 251 | u8 *payload = skb->data + (iph->ihl << 2); |
252 | 252 | ||
253 | if (inet->hdrincl) | 253 | if (inet->hdrincl) |
254 | payload = skb->data; | 254 | payload = skb->data; |
255 | ip_icmp_error(sk, skb, err, 0, info, payload); | 255 | ip_icmp_error(sk, skb, err, 0, info, payload); |
256 | } | 256 | } |
257 | 257 | ||
258 | if (inet->recverr || harderr) { | 258 | if (inet->recverr || harderr) { |
259 | sk->sk_err = err; | 259 | sk->sk_err = err; |
260 | sk->sk_error_report(sk); | 260 | sk->sk_error_report(sk); |
261 | } | 261 | } |
262 | } | 262 | } |
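
raw_err() only reports anything when ip_recverr is set or the socket is connected; with IP_RECVERR enabled, the queued ICMP errors are read back from the error queue with MSG_ERRQUEUE. A trimmed userspace sketch, assuming the socket already attempted a send that drew an ICMP error:

    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>
    #include <linux/errqueue.h>  /* struct sock_extended_err */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        int on = 1;
        char data[512], ctrl[512];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                              .msg_control = ctrl, .msg_controllen = sizeof(ctrl) };

        if (fd < 0) return 1;
        setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));

        /* After a send that drew an ICMP error, drain the error queue. */
        if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
            struct cmsghdr *cm;
            for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVERR) {
                    struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
                    printf("icmp err: errno=%u type=%u code=%u\n",
                           ee->ee_errno, (unsigned)ee->ee_type, (unsigned)ee->ee_code);
                }
            }
        }
        return 0;
    }
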
263 | 263 | ||
264 | void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) | 264 | void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) |
265 | { | 265 | { |
266 | int hash; | 266 | int hash; |
267 | struct sock *raw_sk; | 267 | struct sock *raw_sk; |
268 | struct iphdr *iph; | 268 | struct iphdr *iph; |
269 | struct net *net; | 269 | struct net *net; |
270 | 270 | ||
271 | hash = protocol & (RAW_HTABLE_SIZE - 1); | 271 | hash = protocol & (RAW_HTABLE_SIZE - 1); |
272 | 272 | ||
273 | read_lock(&raw_v4_hashinfo.lock); | 273 | read_lock(&raw_v4_hashinfo.lock); |
274 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); | 274 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); |
275 | if (raw_sk != NULL) { | 275 | if (raw_sk != NULL) { |
276 | iph = (struct iphdr *)skb->data; | 276 | iph = (struct iphdr *)skb->data; |
277 | net = dev_net(skb->dev); | 277 | net = dev_net(skb->dev); |
278 | 278 | ||
279 | while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, | 279 | while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, |
280 | iph->daddr, iph->saddr, | 280 | iph->daddr, iph->saddr, |
281 | skb->dev->ifindex)) != NULL) { | 281 | skb->dev->ifindex)) != NULL) { |
282 | raw_err(raw_sk, skb, info); | 282 | raw_err(raw_sk, skb, info); |
283 | raw_sk = sk_next(raw_sk); | 283 | raw_sk = sk_next(raw_sk); |
284 | iph = (struct iphdr *)skb->data; | 284 | iph = (struct iphdr *)skb->data; |
285 | } | 285 | } |
286 | } | 286 | } |
287 | read_unlock(&raw_v4_hashinfo.lock); | 287 | read_unlock(&raw_v4_hashinfo.lock); |
288 | } | 288 | } |
289 | 289 | ||
290 | static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) | 290 | static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) |
291 | { | 291 | { |
292 | /* Charge it to the socket. */ | 292 | /* Charge it to the socket. */ |
293 | 293 | ||
294 | if (sock_queue_rcv_skb(sk, skb) < 0) { | 294 | if (sock_queue_rcv_skb(sk, skb) < 0) { |
295 | atomic_inc(&sk->sk_drops); | 295 | atomic_inc(&sk->sk_drops); |
296 | kfree_skb(skb); | 296 | kfree_skb(skb); |
297 | return NET_RX_DROP; | 297 | return NET_RX_DROP; |
298 | } | 298 | } |
299 | 299 | ||
300 | return NET_RX_SUCCESS; | 300 | return NET_RX_SUCCESS; |
301 | } | 301 | } |
302 | 302 | ||
303 | int raw_rcv(struct sock *sk, struct sk_buff *skb) | 303 | int raw_rcv(struct sock *sk, struct sk_buff *skb) |
304 | { | 304 | { |
305 | if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { | 305 | if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { |
306 | atomic_inc(&sk->sk_drops); | 306 | atomic_inc(&sk->sk_drops); |
307 | kfree_skb(skb); | 307 | kfree_skb(skb); |
308 | return NET_RX_DROP; | 308 | return NET_RX_DROP; |
309 | } | 309 | } |
310 | nf_reset(skb); | 310 | nf_reset(skb); |
311 | 311 | ||
312 | skb_push(skb, skb->data - skb_network_header(skb)); | 312 | skb_push(skb, skb->data - skb_network_header(skb)); |
313 | 313 | ||
314 | raw_rcv_skb(sk, skb); | 314 | raw_rcv_skb(sk, skb); |
315 | return 0; | 315 | return 0; |
316 | } | 316 | } |
317 | 317 | ||
318 | static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, | 318 | static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, |
319 | struct rtable *rt, | 319 | struct rtable *rt, |
320 | unsigned int flags) | 320 | unsigned int flags) |
321 | { | 321 | { |
322 | struct inet_sock *inet = inet_sk(sk); | 322 | struct inet_sock *inet = inet_sk(sk); |
323 | struct net *net = sock_net(sk); | 323 | struct net *net = sock_net(sk); |
324 | struct iphdr *iph; | 324 | struct iphdr *iph; |
325 | struct sk_buff *skb; | 325 | struct sk_buff *skb; |
326 | unsigned int iphlen; | 326 | unsigned int iphlen; |
327 | int err; | 327 | int err; |
328 | 328 | ||
329 | if (length > rt->u.dst.dev->mtu) { | 329 | if (length > rt->u.dst.dev->mtu) { |
330 | ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, | 330 | ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, |
331 | rt->u.dst.dev->mtu); | 331 | rt->u.dst.dev->mtu); |
332 | return -EMSGSIZE; | 332 | return -EMSGSIZE; |
333 | } | 333 | } |
334 | if (flags&MSG_PROBE) | 334 | if (flags&MSG_PROBE) |
335 | goto out; | 335 | goto out; |
336 | 336 | ||
337 | skb = sock_alloc_send_skb(sk, | 337 | skb = sock_alloc_send_skb(sk, |
338 | length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, | 338 | length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, |
339 | flags & MSG_DONTWAIT, &err); | 339 | flags & MSG_DONTWAIT, &err); |
340 | if (skb == NULL) | 340 | if (skb == NULL) |
341 | goto error; | 341 | goto error; |
342 | skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); | 342 | skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); |
343 | 343 | ||
344 | skb->priority = sk->sk_priority; | 344 | skb->priority = sk->sk_priority; |
345 | skb->mark = sk->sk_mark; | 345 | skb->mark = sk->sk_mark; |
346 | skb->dst = dst_clone(&rt->u.dst); | 346 | skb->dst = dst_clone(&rt->u.dst); |
347 | 347 | ||
348 | skb_reset_network_header(skb); | 348 | skb_reset_network_header(skb); |
349 | iph = ip_hdr(skb); | 349 | iph = ip_hdr(skb); |
350 | skb_put(skb, length); | 350 | skb_put(skb, length); |
351 | 351 | ||
352 | skb->ip_summed = CHECKSUM_NONE; | 352 | skb->ip_summed = CHECKSUM_NONE; |
353 | 353 | ||
354 | skb->transport_header = skb->network_header; | 354 | skb->transport_header = skb->network_header; |
355 | err = memcpy_fromiovecend((void *)iph, from, 0, length); | 355 | err = memcpy_fromiovecend((void *)iph, from, 0, length); |
356 | if (err) | 356 | if (err) |
357 | goto error_fault; | 357 | goto error_fault; |
358 | 358 | ||
359 | /* We don't modify invalid header */ | 359 | /* We don't modify invalid header */ |
360 | iphlen = iph->ihl * 4; | 360 | iphlen = iph->ihl * 4; |
361 | if (iphlen >= sizeof(*iph) && iphlen <= length) { | 361 | if (iphlen >= sizeof(*iph) && iphlen <= length) { |
362 | if (!iph->saddr) | 362 | if (!iph->saddr) |
363 | iph->saddr = rt->rt_src; | 363 | iph->saddr = rt->rt_src; |
364 | iph->check = 0; | 364 | iph->check = 0; |
365 | iph->tot_len = htons(length); | 365 | iph->tot_len = htons(length); |
366 | if (!iph->id) | 366 | if (!iph->id) |
367 | ip_select_ident(iph, &rt->u.dst, NULL); | 367 | ip_select_ident(iph, &rt->u.dst, NULL); |
368 | 368 | ||
369 | iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); | 369 | iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); |
370 | } | 370 | } |
371 | if (iph->protocol == IPPROTO_ICMP) | 371 | if (iph->protocol == IPPROTO_ICMP) |
372 | icmp_out_count(net, ((struct icmphdr *) | 372 | icmp_out_count(net, ((struct icmphdr *) |
373 | skb_transport_header(skb))->type); | 373 | skb_transport_header(skb))->type); |
374 | 374 | ||
375 | err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, | 375 | err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, |
376 | dst_output); | 376 | dst_output); |
377 | if (err > 0) | 377 | if (err > 0) |
378 | err = inet->recverr ? net_xmit_errno(err) : 0; | 378 | err = inet->recverr ? net_xmit_errno(err) : 0; |
379 | if (err) | 379 | if (err) |
380 | goto error; | 380 | goto error; |
381 | out: | 381 | out: |
382 | return 0; | 382 | return 0; |
383 | 383 | ||
384 | error_fault: | 384 | error_fault: |
385 | err = -EFAULT; | 385 | err = -EFAULT; |
386 | kfree_skb(skb); | 386 | kfree_skb(skb); |
387 | error: | 387 | error: |
388 | IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); | 388 | IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); |
389 | return err; | 389 | return err; |
390 | } | 390 | } |
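
raw_send_hdrinc() trusts the caller's header but, as the code shows, fills in saddr and the IP ID when they are left zero and recomputes tot_len and the header checksum. A rough userspace counterpart (an IPPROTO_RAW socket, which implies header-include mode; 192.0.2.1 is only a placeholder destination):

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip.h>       /* struct iphdr */
    #include <netinet/ip_icmp.h>  /* struct icmphdr, ICMP_ECHO */
    #include <arpa/inet.h>

    /* RFC 1071 one's-complement checksum, needed for the ICMP part only:
     * the IP header checksum is recomputed by raw_send_hdrinc(). */
    static unsigned short csum16(const void *buf, size_t len)
    {
        const unsigned char *p = buf;
        unsigned long sum = 0;
        for (; len > 1; len -= 2, p += 2)
            sum += (p[0] << 8) | p[1];
        if (len)
            sum += p[0] << 8;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return htons((unsigned short)~sum);
    }

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);  /* IPPROTO_RAW implies IP_HDRINCL */
        unsigned char pkt[sizeof(struct iphdr) + sizeof(struct icmphdr)];
        struct iphdr *iph = (struct iphdr *)pkt;
        struct icmphdr *icmp = (struct icmphdr *)(pkt + sizeof(*iph));
        struct sockaddr_in dst;

        if (fd < 0) { perror("socket"); return 1; }
        memset(pkt, 0, sizeof(pkt));
        iph->version  = 4;
        iph->ihl      = 5;
        iph->ttl      = 64;
        iph->protocol = IPPROTO_ICMP;
        iph->daddr    = inet_addr("192.0.2.1");  /* placeholder (TEST-NET-1); substitute a real target */
        /* saddr, id, tot_len and check are left 0: raw_send_hdrinc() fills or recomputes them */

        icmp->type = ICMP_ECHO;
        icmp->checksum = csum16(icmp, sizeof(*icmp));

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_addr.s_addr = iph->daddr;
        sendto(fd, pkt, sizeof(pkt), 0, (struct sockaddr *)&dst, sizeof(dst));
        return 0;
    }
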
391 | 391 | ||
392 | static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) | 392 | static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) |
393 | { | 393 | { |
394 | struct iovec *iov; | 394 | struct iovec *iov; |
395 | u8 __user *type = NULL; | 395 | u8 __user *type = NULL; |
396 | u8 __user *code = NULL; | 396 | u8 __user *code = NULL; |
397 | int probed = 0; | 397 | int probed = 0; |
398 | unsigned int i; | 398 | unsigned int i; |
399 | 399 | ||
400 | if (!msg->msg_iov) | 400 | if (!msg->msg_iov) |
401 | return 0; | 401 | return 0; |
402 | 402 | ||
403 | for (i = 0; i < msg->msg_iovlen; i++) { | 403 | for (i = 0; i < msg->msg_iovlen; i++) { |
404 | iov = &msg->msg_iov[i]; | 404 | iov = &msg->msg_iov[i]; |
405 | if (!iov) | 405 | if (!iov) |
406 | continue; | 406 | continue; |
407 | 407 | ||
408 | switch (fl->proto) { | 408 | switch (fl->proto) { |
409 | case IPPROTO_ICMP: | 409 | case IPPROTO_ICMP: |
410 | /* check if one-byte field is readable or not. */ | 410 | /* check if one-byte field is readable or not. */ |
411 | if (iov->iov_base && iov->iov_len < 1) | 411 | if (iov->iov_base && iov->iov_len < 1) |
412 | break; | 412 | break; |
413 | 413 | ||
414 | if (!type) { | 414 | if (!type) { |
415 | type = iov->iov_base; | 415 | type = iov->iov_base; |
416 | /* check if code field is readable or not. */ | 416 | /* check if code field is readable or not. */ |
417 | if (iov->iov_len > 1) | 417 | if (iov->iov_len > 1) |
418 | code = type + 1; | 418 | code = type + 1; |
419 | } else if (!code) | 419 | } else if (!code) |
420 | code = iov->iov_base; | 420 | code = iov->iov_base; |
421 | 421 | ||
422 | if (type && code) { | 422 | if (type && code) { |
423 | if (get_user(fl->fl_icmp_type, type) || | 423 | if (get_user(fl->fl_icmp_type, type) || |
424 | get_user(fl->fl_icmp_code, code)) | 424 | get_user(fl->fl_icmp_code, code)) |
425 | return -EFAULT; | 425 | return -EFAULT; |
426 | probed = 1; | 426 | probed = 1; |
427 | } | 427 | } |
428 | break; | 428 | break; |
429 | default: | 429 | default: |
430 | probed = 1; | 430 | probed = 1; |
431 | break; | 431 | break; |
432 | } | 432 | } |
433 | if (probed) | 433 | if (probed) |
434 | break; | 434 | break; |
435 | } | 435 | } |
436 | return 0; | 436 | return 0; |
437 | } | 437 | } |
438 | 438 | ||
439 | static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 439 | static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
440 | size_t len) | 440 | size_t len) |
441 | { | 441 | { |
442 | struct inet_sock *inet = inet_sk(sk); | 442 | struct inet_sock *inet = inet_sk(sk); |
443 | struct ipcm_cookie ipc; | 443 | struct ipcm_cookie ipc; |
444 | struct rtable *rt = NULL; | 444 | struct rtable *rt = NULL; |
445 | int free = 0; | 445 | int free = 0; |
446 | __be32 daddr; | 446 | __be32 daddr; |
447 | __be32 saddr; | 447 | __be32 saddr; |
448 | u8 tos; | 448 | u8 tos; |
449 | int err; | 449 | int err; |
450 | 450 | ||
451 | err = -EMSGSIZE; | 451 | err = -EMSGSIZE; |
452 | if (len > 0xFFFF) | 452 | if (len > 0xFFFF) |
453 | goto out; | 453 | goto out; |
454 | 454 | ||
455 | /* | 455 | /* |
456 | * Check the flags. | 456 | * Check the flags. |
457 | */ | 457 | */ |
458 | 458 | ||
459 | err = -EOPNOTSUPP; | 459 | err = -EOPNOTSUPP; |
460 | if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ | 460 | if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ |
461 | goto out; /* compatibility */ | 461 | goto out; /* compatibility */ |
462 | 462 | ||
463 | /* | 463 | /* |
464 | * Get and verify the address. | 464 | * Get and verify the address. |
465 | */ | 465 | */ |
466 | 466 | ||
467 | if (msg->msg_namelen) { | 467 | if (msg->msg_namelen) { |
468 | struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; | 468 | struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; |
469 | err = -EINVAL; | 469 | err = -EINVAL; |
470 | if (msg->msg_namelen < sizeof(*usin)) | 470 | if (msg->msg_namelen < sizeof(*usin)) |
471 | goto out; | 471 | goto out; |
472 | if (usin->sin_family != AF_INET) { | 472 | if (usin->sin_family != AF_INET) { |
473 | static int complained; | 473 | static int complained; |
474 | if (!complained++) | 474 | if (!complained++) |
475 | printk(KERN_INFO "%s forgot to set AF_INET in " | 475 | printk(KERN_INFO "%s forgot to set AF_INET in " |
476 | "raw sendmsg. Fix it!\n", | 476 | "raw sendmsg. Fix it!\n", |
477 | current->comm); | 477 | current->comm); |
478 | err = -EAFNOSUPPORT; | 478 | err = -EAFNOSUPPORT; |
479 | if (usin->sin_family) | 479 | if (usin->sin_family) |
480 | goto out; | 480 | goto out; |
481 | } | 481 | } |
482 | daddr = usin->sin_addr.s_addr; | 482 | daddr = usin->sin_addr.s_addr; |
483 | /* ANK: I did not forget to get protocol from port field. | 483 | /* ANK: I did not forget to get protocol from port field. |
484 | * I just do not know who uses this weirdness. | 484 | * I just do not know who uses this weirdness. |
485 | * IP_HDRINCL is much more convenient. | 485 | * IP_HDRINCL is much more convenient. |
486 | */ | 486 | */ |
487 | } else { | 487 | } else { |
488 | err = -EDESTADDRREQ; | 488 | err = -EDESTADDRREQ; |
489 | if (sk->sk_state != TCP_ESTABLISHED) | 489 | if (sk->sk_state != TCP_ESTABLISHED) |
490 | goto out; | 490 | goto out; |
491 | daddr = inet->daddr; | 491 | daddr = inet->daddr; |
492 | } | 492 | } |
493 | 493 | ||
494 | ipc.addr = inet->saddr; | 494 | ipc.addr = inet->saddr; |
495 | ipc.opt = NULL; | 495 | ipc.opt = NULL; |
496 | ipc.oif = sk->sk_bound_dev_if; | 496 | ipc.oif = sk->sk_bound_dev_if; |
497 | 497 | ||
498 | if (msg->msg_controllen) { | 498 | if (msg->msg_controllen) { |
499 | err = ip_cmsg_send(sock_net(sk), msg, &ipc); | 499 | err = ip_cmsg_send(sock_net(sk), msg, &ipc); |
500 | if (err) | 500 | if (err) |
501 | goto out; | 501 | goto out; |
502 | if (ipc.opt) | 502 | if (ipc.opt) |
503 | free = 1; | 503 | free = 1; |
504 | } | 504 | } |
505 | 505 | ||
506 | saddr = ipc.addr; | 506 | saddr = ipc.addr; |
507 | ipc.addr = daddr; | 507 | ipc.addr = daddr; |
508 | 508 | ||
509 | if (!ipc.opt) | 509 | if (!ipc.opt) |
510 | ipc.opt = inet->opt; | 510 | ipc.opt = inet->opt; |
511 | 511 | ||
512 | if (ipc.opt) { | 512 | if (ipc.opt) { |
513 | err = -EINVAL; | 513 | err = -EINVAL; |
514 | /* Linux does not mangle headers on raw sockets, | 514 | /* Linux does not mangle headers on raw sockets, |
515 | * so that IP options + IP_HDRINCL is non-sense. | 515 | * so that IP options + IP_HDRINCL is non-sense. |
516 | */ | 516 | */ |
517 | if (inet->hdrincl) | 517 | if (inet->hdrincl) |
518 | goto done; | 518 | goto done; |
519 | if (ipc.opt->srr) { | 519 | if (ipc.opt->srr) { |
520 | if (!daddr) | 520 | if (!daddr) |
521 | goto done; | 521 | goto done; |
522 | daddr = ipc.opt->faddr; | 522 | daddr = ipc.opt->faddr; |
523 | } | 523 | } |
524 | } | 524 | } |
525 | tos = RT_CONN_FLAGS(sk); | 525 | tos = RT_CONN_FLAGS(sk); |
526 | if (msg->msg_flags & MSG_DONTROUTE) | 526 | if (msg->msg_flags & MSG_DONTROUTE) |
527 | tos |= RTO_ONLINK; | 527 | tos |= RTO_ONLINK; |
528 | 528 | ||
529 | if (ipv4_is_multicast(daddr)) { | 529 | if (ipv4_is_multicast(daddr)) { |
530 | if (!ipc.oif) | 530 | if (!ipc.oif) |
531 | ipc.oif = inet->mc_index; | 531 | ipc.oif = inet->mc_index; |
532 | if (!saddr) | 532 | if (!saddr) |
533 | saddr = inet->mc_addr; | 533 | saddr = inet->mc_addr; |
534 | } | 534 | } |
535 | 535 | ||
536 | { | 536 | { |
537 | struct flowi fl = { .oif = ipc.oif, | 537 | struct flowi fl = { .oif = ipc.oif, |
538 | .mark = sk->sk_mark, | 538 | .mark = sk->sk_mark, |
539 | .nl_u = { .ip4_u = | 539 | .nl_u = { .ip4_u = |
540 | { .daddr = daddr, | 540 | { .daddr = daddr, |
541 | .saddr = saddr, | 541 | .saddr = saddr, |
542 | .tos = tos } }, | 542 | .tos = tos } }, |
543 | .proto = inet->hdrincl ? IPPROTO_RAW : | 543 | .proto = inet->hdrincl ? IPPROTO_RAW : |
544 | sk->sk_protocol, | 544 | sk->sk_protocol, |
545 | }; | 545 | }; |
546 | if (!inet->hdrincl) { | 546 | if (!inet->hdrincl) { |
547 | err = raw_probe_proto_opt(&fl, msg); | 547 | err = raw_probe_proto_opt(&fl, msg); |
548 | if (err) | 548 | if (err) |
549 | goto done; | 549 | goto done; |
550 | } | 550 | } |
551 | 551 | ||
552 | security_sk_classify_flow(sk, &fl); | 552 | security_sk_classify_flow(sk, &fl); |
553 | err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); | 553 | err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); |
554 | } | 554 | } |
555 | if (err) | 555 | if (err) |
556 | goto done; | 556 | goto done; |
557 | 557 | ||
558 | err = -EACCES; | 558 | err = -EACCES; |
559 | if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) | 559 | if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) |
560 | goto done; | 560 | goto done; |
561 | 561 | ||
562 | if (msg->msg_flags & MSG_CONFIRM) | 562 | if (msg->msg_flags & MSG_CONFIRM) |
563 | goto do_confirm; | 563 | goto do_confirm; |
564 | back_from_confirm: | 564 | back_from_confirm: |
565 | 565 | ||
566 | if (inet->hdrincl) | 566 | if (inet->hdrincl) |
567 | err = raw_send_hdrinc(sk, msg->msg_iov, len, | 567 | err = raw_send_hdrinc(sk, msg->msg_iov, len, |
568 | rt, msg->msg_flags); | 568 | rt, msg->msg_flags); |
569 | 569 | ||
570 | else { | 570 | else { |
571 | if (!ipc.addr) | 571 | if (!ipc.addr) |
572 | ipc.addr = rt->rt_dst; | 572 | ipc.addr = rt->rt_dst; |
573 | lock_sock(sk); | 573 | lock_sock(sk); |
574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, | 574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, |
575 | &ipc, rt, msg->msg_flags); | 575 | &ipc, rt, msg->msg_flags); |
576 | if (err) | 576 | if (err) |
577 | ip_flush_pending_frames(sk); | 577 | ip_flush_pending_frames(sk); |
578 | else if (!(msg->msg_flags & MSG_MORE)) | 578 | else if (!(msg->msg_flags & MSG_MORE)) |
579 | err = ip_push_pending_frames(sk); | 579 | err = ip_push_pending_frames(sk); |
580 | release_sock(sk); | 580 | release_sock(sk); |
581 | } | 581 | } |
582 | done: | 582 | done: |
583 | if (free) | 583 | if (free) |
584 | kfree(ipc.opt); | 584 | kfree(ipc.opt); |
585 | ip_rt_put(rt); | 585 | ip_rt_put(rt); |
586 | 586 | ||
587 | out: | 587 | out: |
588 | if (err < 0) | 588 | if (err < 0) |
589 | return err; | 589 | return err; |
590 | return len; | 590 | return len; |
591 | 591 | ||
592 | do_confirm: | 592 | do_confirm: |
593 | dst_confirm(&rt->u.dst); | 593 | dst_confirm(&rt->u.dst); |
594 | if (!(msg->msg_flags & MSG_PROBE) || len) | 594 | if (!(msg->msg_flags & MSG_PROBE) || len) |
595 | goto back_from_confirm; | 595 | goto back_from_confirm; |
596 | err = 0; | 596 | err = 0; |
597 | goto done; | 597 | goto done; |
598 | } | 598 | } |
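
Without a destination in msg_name, raw_sendmsg() insists on a connected socket (EDESTADDRREQ otherwise); connect() on a raw socket goes through ip4_datagram_connect() and pins the peer. A small sketch of both cases (the ICMP checksum is deliberately left zero just to exercise the send path; a real echo request must fill it in):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip_icmp.h>  /* struct icmphdr, ICMP_ECHO */
    #include <arpa/inet.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        struct icmphdr icmp = { .type = ICMP_ECHO };
        struct sockaddr_in dst = { .sin_family = AF_INET };

        if (fd < 0) { perror("socket"); return 1; }

        /* No destination given and not connected: raw_sendmsg() returns -EDESTADDRREQ. */
        if (send(fd, &icmp, sizeof(icmp), 0) < 0 && errno == EDESTADDRREQ)
            puts("unconnected send rejected as expected");

        dst.sin_addr.s_addr = inet_addr("127.0.0.1");
        connect(fd, (struct sockaddr *)&dst, sizeof(dst));  /* sets the peer, state TCP_ESTABLISHED */

        if (send(fd, &icmp, sizeof(icmp), 0) == (ssize_t)sizeof(icmp))
            puts("connected send accepted");
        return 0;
    }
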
599 | 599 | ||
600 | static void raw_close(struct sock *sk, long timeout) | 600 | static void raw_close(struct sock *sk, long timeout) |
601 | { | 601 | { |
602 | /* | 602 | /* |
603 | * Raw sockets may have direct kernel references. Kill them. | 603 | * Raw sockets may have direct kernel references. Kill them. |
604 | */ | 604 | */ |
605 | ip_ra_control(sk, 0, NULL); | 605 | ip_ra_control(sk, 0, NULL); |
606 | 606 | ||
607 | sk_common_release(sk); | 607 | sk_common_release(sk); |
608 | } | 608 | } |
609 | 609 | ||
610 | static void raw_destroy(struct sock *sk) | 610 | static void raw_destroy(struct sock *sk) |
611 | { | 611 | { |
612 | lock_sock(sk); | 612 | lock_sock(sk); |
613 | ip_flush_pending_frames(sk); | 613 | ip_flush_pending_frames(sk); |
614 | release_sock(sk); | 614 | release_sock(sk); |
615 | } | 615 | } |
616 | 616 | ||
617 | /* This gets rid of all the nasties in af_inet. -DaveM */ | 617 | /* This gets rid of all the nasties in af_inet. -DaveM */ |
618 | static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) | 618 | static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) |
619 | { | 619 | { |
620 | struct inet_sock *inet = inet_sk(sk); | 620 | struct inet_sock *inet = inet_sk(sk); |
621 | struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; | 621 | struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; |
622 | int ret = -EINVAL; | 622 | int ret = -EINVAL; |
623 | int chk_addr_ret; | 623 | int chk_addr_ret; |
624 | 624 | ||
625 | if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) | 625 | if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) |
626 | goto out; | 626 | goto out; |
627 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); | 627 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); |
628 | ret = -EADDRNOTAVAIL; | 628 | ret = -EADDRNOTAVAIL; |
629 | if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && | 629 | if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && |
630 | chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) | 630 | chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) |
631 | goto out; | 631 | goto out; |
632 | inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; | 632 | inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; |
633 | if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) | 633 | if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) |
634 | inet->saddr = 0; /* Use device */ | 634 | inet->saddr = 0; /* Use device */ |
635 | sk_dst_reset(sk); | 635 | sk_dst_reset(sk); |
636 | ret = 0; | 636 | ret = 0; |
637 | out: return ret; | 637 | out: return ret; |
638 | } | 638 | } |
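
raw_bind() only validates that the address is local, multicast or broadcast and records it as rcv_saddr/saddr; there is no port for raw sockets. A minimal sketch, assuming 127.0.0.1 is configured on the host:

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        struct sockaddr_in local;

        if (fd < 0) return 1;
        memset(&local, 0, sizeof(local));
        local.sin_family = AF_INET;               /* sin_port is meaningless for raw sockets */
        local.sin_addr.s_addr = inet_addr("127.0.0.1");

        if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
            perror("bind");                       /* EADDRNOTAVAIL if the address is not local */
        return 0;
    }
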
639 | 639 | ||
640 | /* | 640 | /* |
641 | * This should be easy, if there is something there | 641 | * This should be easy, if there is something there |
642 | * we return it, otherwise we block. | 642 | * we return it, otherwise we block. |
643 | */ | 643 | */ |
644 | 644 | ||
645 | static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 645 | static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
646 | size_t len, int noblock, int flags, int *addr_len) | 646 | size_t len, int noblock, int flags, int *addr_len) |
647 | { | 647 | { |
648 | struct inet_sock *inet = inet_sk(sk); | 648 | struct inet_sock *inet = inet_sk(sk); |
649 | size_t copied = 0; | 649 | size_t copied = 0; |
650 | int err = -EOPNOTSUPP; | 650 | int err = -EOPNOTSUPP; |
651 | struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; | 651 | struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; |
652 | struct sk_buff *skb; | 652 | struct sk_buff *skb; |
653 | 653 | ||
654 | if (flags & MSG_OOB) | 654 | if (flags & MSG_OOB) |
655 | goto out; | 655 | goto out; |
656 | 656 | ||
657 | if (addr_len) | 657 | if (addr_len) |
658 | *addr_len = sizeof(*sin); | 658 | *addr_len = sizeof(*sin); |
659 | 659 | ||
660 | if (flags & MSG_ERRQUEUE) { | 660 | if (flags & MSG_ERRQUEUE) { |
661 | err = ip_recv_error(sk, msg, len); | 661 | err = ip_recv_error(sk, msg, len); |
662 | goto out; | 662 | goto out; |
663 | } | 663 | } |
664 | 664 | ||
665 | skb = skb_recv_datagram(sk, flags, noblock, &err); | 665 | skb = skb_recv_datagram(sk, flags, noblock, &err); |
666 | if (!skb) | 666 | if (!skb) |
667 | goto out; | 667 | goto out; |
668 | 668 | ||
669 | copied = skb->len; | 669 | copied = skb->len; |
670 | if (len < copied) { | 670 | if (len < copied) { |
671 | msg->msg_flags |= MSG_TRUNC; | 671 | msg->msg_flags |= MSG_TRUNC; |
672 | copied = len; | 672 | copied = len; |
673 | } | 673 | } |
674 | 674 | ||
675 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); | 675 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); |
676 | if (err) | 676 | if (err) |
677 | goto done; | 677 | goto done; |
678 | 678 | ||
679 | sock_recv_timestamp(msg, sk, skb); | 679 | sock_recv_timestamp(msg, sk, skb); |
680 | 680 | ||
681 | /* Copy the address. */ | 681 | /* Copy the address. */ |
682 | if (sin) { | 682 | if (sin) { |
683 | sin->sin_family = AF_INET; | 683 | sin->sin_family = AF_INET; |
684 | sin->sin_addr.s_addr = ip_hdr(skb)->saddr; | 684 | sin->sin_addr.s_addr = ip_hdr(skb)->saddr; |
685 | sin->sin_port = 0; | 685 | sin->sin_port = 0; |
686 | memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); | 686 | memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); |
687 | } | 687 | } |
688 | if (inet->cmsg_flags) | 688 | if (inet->cmsg_flags) |
689 | ip_cmsg_recv(msg, skb); | 689 | ip_cmsg_recv(msg, skb); |
690 | if (flags & MSG_TRUNC) | 690 | if (flags & MSG_TRUNC) |
691 | copied = skb->len; | 691 | copied = skb->len; |
692 | done: | 692 | done: |
693 | skb_free_datagram(sk, skb); | 693 | skb_free_datagram(sk, skb); |
694 | out: | 694 | out: |
695 | if (err) | 695 | if (err) |
696 | return err; | 696 | return err; |
697 | return copied; | 697 | return copied; |
698 | } | 698 | } |
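
Because the copied length is reset to skb->len when MSG_TRUNC is passed in flags, userspace can learn the true datagram size even with a too-small buffer. A brief sketch:

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        char tiny[1];
        ssize_t full;

        if (fd < 0) return 1;
        /* Blocks until an ICMP datagram arrives (e.g. ping running elsewhere).
         * With MSG_TRUNC in flags the return value is the full datagram length,
         * even though only one byte lands in the buffer; MSG_TRUNC is also set
         * in msg_flags when the copy was cut short. */
        full = recv(fd, tiny, sizeof(tiny), MSG_TRUNC);
        if (full >= 0)
            printf("datagram was %zd bytes\n", full);
        return 0;
    }
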
699 | 699 | ||
700 | static int raw_init(struct sock *sk) | 700 | static int raw_init(struct sock *sk) |
701 | { | 701 | { |
702 | struct raw_sock *rp = raw_sk(sk); | 702 | struct raw_sock *rp = raw_sk(sk); |
703 | 703 | ||
704 | if (inet_sk(sk)->num == IPPROTO_ICMP) | 704 | if (inet_sk(sk)->num == IPPROTO_ICMP) |
705 | memset(&rp->filter, 0, sizeof(rp->filter)); | 705 | memset(&rp->filter, 0, sizeof(rp->filter)); |
706 | return 0; | 706 | return 0; |
707 | } | 707 | } |
708 | 708 | ||
709 | static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) | 709 | static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) |
710 | { | 710 | { |
711 | if (optlen > sizeof(struct icmp_filter)) | 711 | if (optlen > sizeof(struct icmp_filter)) |
712 | optlen = sizeof(struct icmp_filter); | 712 | optlen = sizeof(struct icmp_filter); |
713 | if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) | 713 | if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) |
714 | return -EFAULT; | 714 | return -EFAULT; |
715 | return 0; | 715 | return 0; |
716 | } | 716 | } |
717 | 717 | ||
718 | static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) | 718 | static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) |
719 | { | 719 | { |
720 | int len, ret = -EFAULT; | 720 | int len, ret = -EFAULT; |
721 | 721 | ||
722 | if (get_user(len, optlen)) | 722 | if (get_user(len, optlen)) |
723 | goto out; | 723 | goto out; |
724 | ret = -EINVAL; | 724 | ret = -EINVAL; |
725 | if (len < 0) | 725 | if (len < 0) |
726 | goto out; | 726 | goto out; |
727 | if (len > sizeof(struct icmp_filter)) | 727 | if (len > sizeof(struct icmp_filter)) |
728 | len = sizeof(struct icmp_filter); | 728 | len = sizeof(struct icmp_filter); |
729 | ret = -EFAULT; | 729 | ret = -EFAULT; |
730 | if (put_user(len, optlen) || | 730 | if (put_user(len, optlen) || |
731 | copy_to_user(optval, &raw_sk(sk)->filter, len)) | 731 | copy_to_user(optval, &raw_sk(sk)->filter, len)) |
732 | goto out; | 732 | goto out; |
733 | ret = 0; | 733 | ret = 0; |
734 | out: return ret; | 734 | out: return ret; |
735 | } | 735 | } |
736 | 736 | ||
737 | static int do_raw_setsockopt(struct sock *sk, int level, int optname, | 737 | static int do_raw_setsockopt(struct sock *sk, int level, int optname, |
738 | char __user *optval, int optlen) | 738 | char __user *optval, int optlen) |
739 | { | 739 | { |
740 | if (optname == ICMP_FILTER) { | 740 | if (optname == ICMP_FILTER) { |
741 | if (inet_sk(sk)->num != IPPROTO_ICMP) | 741 | if (inet_sk(sk)->num != IPPROTO_ICMP) |
742 | return -EOPNOTSUPP; | 742 | return -EOPNOTSUPP; |
743 | else | 743 | else |
744 | return raw_seticmpfilter(sk, optval, optlen); | 744 | return raw_seticmpfilter(sk, optval, optlen); |
745 | } | 745 | } |
746 | return -ENOPROTOOPT; | 746 | return -ENOPROTOOPT; |
747 | } | 747 | } |
748 | 748 | ||
749 | static int raw_setsockopt(struct sock *sk, int level, int optname, | 749 | static int raw_setsockopt(struct sock *sk, int level, int optname, |
750 | char __user *optval, int optlen) | 750 | char __user *optval, int optlen) |
751 | { | 751 | { |
752 | if (level != SOL_RAW) | 752 | if (level != SOL_RAW) |
753 | return ip_setsockopt(sk, level, optname, optval, optlen); | 753 | return ip_setsockopt(sk, level, optname, optval, optlen); |
754 | return do_raw_setsockopt(sk, level, optname, optval, optlen); | 754 | return do_raw_setsockopt(sk, level, optname, optval, optlen); |
755 | } | 755 | } |
756 | 756 | ||
757 | #ifdef CONFIG_COMPAT | 757 | #ifdef CONFIG_COMPAT |
758 | static int compat_raw_setsockopt(struct sock *sk, int level, int optname, | 758 | static int compat_raw_setsockopt(struct sock *sk, int level, int optname, |
759 | char __user *optval, int optlen) | 759 | char __user *optval, int optlen) |
760 | { | 760 | { |
761 | if (level != SOL_RAW) | 761 | if (level != SOL_RAW) |
762 | return compat_ip_setsockopt(sk, level, optname, optval, optlen); | 762 | return compat_ip_setsockopt(sk, level, optname, optval, optlen); |
763 | return do_raw_setsockopt(sk, level, optname, optval, optlen); | 763 | return do_raw_setsockopt(sk, level, optname, optval, optlen); |
764 | } | 764 | } |
765 | #endif | 765 | #endif |
766 | 766 | ||
767 | static int do_raw_getsockopt(struct sock *sk, int level, int optname, | 767 | static int do_raw_getsockopt(struct sock *sk, int level, int optname, |
768 | char __user *optval, int __user *optlen) | 768 | char __user *optval, int __user *optlen) |
769 | { | 769 | { |
770 | if (optname == ICMP_FILTER) { | 770 | if (optname == ICMP_FILTER) { |
771 | if (inet_sk(sk)->num != IPPROTO_ICMP) | 771 | if (inet_sk(sk)->num != IPPROTO_ICMP) |
772 | return -EOPNOTSUPP; | 772 | return -EOPNOTSUPP; |
773 | else | 773 | else |
774 | return raw_geticmpfilter(sk, optval, optlen); | 774 | return raw_geticmpfilter(sk, optval, optlen); |
775 | } | 775 | } |
776 | return -ENOPROTOOPT; | 776 | return -ENOPROTOOPT; |
777 | } | 777 | } |
778 | 778 | ||
779 | static int raw_getsockopt(struct sock *sk, int level, int optname, | 779 | static int raw_getsockopt(struct sock *sk, int level, int optname, |
780 | char __user *optval, int __user *optlen) | 780 | char __user *optval, int __user *optlen) |
781 | { | 781 | { |
782 | if (level != SOL_RAW) | 782 | if (level != SOL_RAW) |
783 | return ip_getsockopt(sk, level, optname, optval, optlen); | 783 | return ip_getsockopt(sk, level, optname, optval, optlen); |
784 | return do_raw_getsockopt(sk, level, optname, optval, optlen); | 784 | return do_raw_getsockopt(sk, level, optname, optval, optlen); |
785 | } | 785 | } |
786 | 786 | ||
787 | #ifdef CONFIG_COMPAT | 787 | #ifdef CONFIG_COMPAT |
788 | static int compat_raw_getsockopt(struct sock *sk, int level, int optname, | 788 | static int compat_raw_getsockopt(struct sock *sk, int level, int optname, |
789 | char __user *optval, int __user *optlen) | 789 | char __user *optval, int __user *optlen) |
790 | { | 790 | { |
791 | if (level != SOL_RAW) | 791 | if (level != SOL_RAW) |
792 | return compat_ip_getsockopt(sk, level, optname, optval, optlen); | 792 | return compat_ip_getsockopt(sk, level, optname, optval, optlen); |
793 | return do_raw_getsockopt(sk, level, optname, optval, optlen); | 793 | return do_raw_getsockopt(sk, level, optname, optval, optlen); |
794 | } | 794 | } |
795 | #endif | 795 | #endif |
796 | 796 | ||
797 | static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) | 797 | static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) |
798 | { | 798 | { |
799 | switch (cmd) { | 799 | switch (cmd) { |
800 | case SIOCOUTQ: { | 800 | case SIOCOUTQ: { |
801 | int amount = atomic_read(&sk->sk_wmem_alloc); | 801 | int amount = atomic_read(&sk->sk_wmem_alloc); |
802 | return put_user(amount, (int __user *)arg); | 802 | return put_user(amount, (int __user *)arg); |
803 | } | 803 | } |
804 | case SIOCINQ: { | 804 | case SIOCINQ: { |
805 | struct sk_buff *skb; | 805 | struct sk_buff *skb; |
806 | int amount = 0; | 806 | int amount = 0; |
807 | 807 | ||
808 | spin_lock_bh(&sk->sk_receive_queue.lock); | 808 | spin_lock_bh(&sk->sk_receive_queue.lock); |
809 | skb = skb_peek(&sk->sk_receive_queue); | 809 | skb = skb_peek(&sk->sk_receive_queue); |
810 | if (skb != NULL) | 810 | if (skb != NULL) |
811 | amount = skb->len; | 811 | amount = skb->len; |
812 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 812 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
813 | return put_user(amount, (int __user *)arg); | 813 | return put_user(amount, (int __user *)arg); |
814 | } | 814 | } |
815 | 815 | ||
816 | default: | 816 | default: |
817 | #ifdef CONFIG_IP_MROUTE | 817 | #ifdef CONFIG_IP_MROUTE |
818 | return ipmr_ioctl(sk, cmd, (void __user *)arg); | 818 | return ipmr_ioctl(sk, cmd, (void __user *)arg); |
819 | #else | 819 | #else |
820 | return -ENOIOCTLCMD; | 820 | return -ENOIOCTLCMD; |
821 | #endif | 821 | #endif |
822 | } | 822 | } |
823 | } | 823 | } |
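
The ioctl handler above exposes queue state: SIOCOUTQ reads sk_wmem_alloc and SIOCINQ the length of the next datagram in the receive queue. A usage sketch:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/sockios.h>   /* SIOCINQ, SIOCOUTQ */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        int inq = 0, outq = 0;

        if (fd < 0) return 1;
        ioctl(fd, SIOCINQ, &inq);    /* length of the next queued datagram, 0 if the queue is empty */
        ioctl(fd, SIOCOUTQ, &outq);  /* bytes of skb memory still charged to the send side */
        printf("SIOCINQ=%d SIOCOUTQ=%d\n", inq, outq);
        return 0;
    }
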
824 | 824 | ||
825 | struct proto raw_prot = { | 825 | struct proto raw_prot = { |
826 | .name = "RAW", | 826 | .name = "RAW", |
827 | .owner = THIS_MODULE, | 827 | .owner = THIS_MODULE, |
828 | .close = raw_close, | 828 | .close = raw_close, |
829 | .destroy = raw_destroy, | 829 | .destroy = raw_destroy, |
830 | .connect = ip4_datagram_connect, | 830 | .connect = ip4_datagram_connect, |
831 | .disconnect = udp_disconnect, | 831 | .disconnect = udp_disconnect, |
832 | .ioctl = raw_ioctl, | 832 | .ioctl = raw_ioctl, |
833 | .init = raw_init, | 833 | .init = raw_init, |
834 | .setsockopt = raw_setsockopt, | 834 | .setsockopt = raw_setsockopt, |
835 | .getsockopt = raw_getsockopt, | 835 | .getsockopt = raw_getsockopt, |
836 | .sendmsg = raw_sendmsg, | 836 | .sendmsg = raw_sendmsg, |
837 | .recvmsg = raw_recvmsg, | 837 | .recvmsg = raw_recvmsg, |
838 | .bind = raw_bind, | 838 | .bind = raw_bind, |
839 | .backlog_rcv = raw_rcv_skb, | 839 | .backlog_rcv = raw_rcv_skb, |
840 | .hash = raw_hash_sk, | 840 | .hash = raw_hash_sk, |
841 | .unhash = raw_unhash_sk, | 841 | .unhash = raw_unhash_sk, |
842 | .obj_size = sizeof(struct raw_sock), | 842 | .obj_size = sizeof(struct raw_sock), |
843 | .h.raw_hash = &raw_v4_hashinfo, | 843 | .h.raw_hash = &raw_v4_hashinfo, |
844 | #ifdef CONFIG_COMPAT | 844 | #ifdef CONFIG_COMPAT |
845 | .compat_setsockopt = compat_raw_setsockopt, | 845 | .compat_setsockopt = compat_raw_setsockopt, |
846 | .compat_getsockopt = compat_raw_getsockopt, | 846 | .compat_getsockopt = compat_raw_getsockopt, |
847 | #endif | 847 | #endif |
848 | }; | 848 | }; |
849 | 849 | ||
850 | #ifdef CONFIG_PROC_FS | 850 | #ifdef CONFIG_PROC_FS |
851 | static struct sock *raw_get_first(struct seq_file *seq) | 851 | static struct sock *raw_get_first(struct seq_file *seq) |
852 | { | 852 | { |
853 | struct sock *sk; | 853 | struct sock *sk; |
854 | struct raw_iter_state* state = raw_seq_private(seq); | 854 | struct raw_iter_state *state = raw_seq_private(seq); |
855 | 855 | ||
856 | for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; | 856 | for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; |
857 | ++state->bucket) { | 857 | ++state->bucket) { |
858 | struct hlist_node *node; | 858 | struct hlist_node *node; |
859 | 859 | ||
860 | sk_for_each(sk, node, &state->h->ht[state->bucket]) | 860 | sk_for_each(sk, node, &state->h->ht[state->bucket]) |
861 | if (sock_net(sk) == seq_file_net(seq)) | 861 | if (sock_net(sk) == seq_file_net(seq)) |
862 | goto found; | 862 | goto found; |
863 | } | 863 | } |
864 | sk = NULL; | 864 | sk = NULL; |
865 | found: | 865 | found: |
866 | return sk; | 866 | return sk; |
867 | } | 867 | } |
868 | 868 | ||
869 | static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) | 869 | static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) |
870 | { | 870 | { |
871 | struct raw_iter_state* state = raw_seq_private(seq); | 871 | struct raw_iter_state *state = raw_seq_private(seq); |
872 | 872 | ||
873 | do { | 873 | do { |
874 | sk = sk_next(sk); | 874 | sk = sk_next(sk); |
875 | try_again: | 875 | try_again: |
876 | ; | 876 | ; |
877 | } while (sk && sock_net(sk) != seq_file_net(seq)); | 877 | } while (sk && sock_net(sk) != seq_file_net(seq)); |
878 | 878 | ||
879 | if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { | 879 | if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { |
880 | sk = sk_head(&state->h->ht[state->bucket]); | 880 | sk = sk_head(&state->h->ht[state->bucket]); |
881 | goto try_again; | 881 | goto try_again; |
882 | } | 882 | } |
883 | return sk; | 883 | return sk; |
884 | } | 884 | } |
885 | 885 | ||
886 | static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) | 886 | static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) |
887 | { | 887 | { |
888 | struct sock *sk = raw_get_first(seq); | 888 | struct sock *sk = raw_get_first(seq); |
889 | 889 | ||
890 | if (sk) | 890 | if (sk) |
891 | while (pos && (sk = raw_get_next(seq, sk)) != NULL) | 891 | while (pos && (sk = raw_get_next(seq, sk)) != NULL) |
892 | --pos; | 892 | --pos; |
893 | return pos ? NULL : sk; | 893 | return pos ? NULL : sk; |
894 | } | 894 | } |
895 | 895 | ||
896 | void *raw_seq_start(struct seq_file *seq, loff_t *pos) | 896 | void *raw_seq_start(struct seq_file *seq, loff_t *pos) |
897 | { | 897 | { |
898 | struct raw_iter_state *state = raw_seq_private(seq); | 898 | struct raw_iter_state *state = raw_seq_private(seq); |
899 | 899 | ||
900 | read_lock(&state->h->lock); | 900 | read_lock(&state->h->lock); |
901 | return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; | 901 | return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; |
902 | } | 902 | } |
903 | EXPORT_SYMBOL_GPL(raw_seq_start); | 903 | EXPORT_SYMBOL_GPL(raw_seq_start); |
904 | 904 | ||
905 | void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 905 | void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
906 | { | 906 | { |
907 | struct sock *sk; | 907 | struct sock *sk; |
908 | 908 | ||
909 | if (v == SEQ_START_TOKEN) | 909 | if (v == SEQ_START_TOKEN) |
910 | sk = raw_get_first(seq); | 910 | sk = raw_get_first(seq); |
911 | else | 911 | else |
912 | sk = raw_get_next(seq, v); | 912 | sk = raw_get_next(seq, v); |
913 | ++*pos; | 913 | ++*pos; |
914 | return sk; | 914 | return sk; |
915 | } | 915 | } |
916 | EXPORT_SYMBOL_GPL(raw_seq_next); | 916 | EXPORT_SYMBOL_GPL(raw_seq_next); |
917 | 917 | ||
918 | void raw_seq_stop(struct seq_file *seq, void *v) | 918 | void raw_seq_stop(struct seq_file *seq, void *v) |
919 | { | 919 | { |
920 | struct raw_iter_state *state = raw_seq_private(seq); | 920 | struct raw_iter_state *state = raw_seq_private(seq); |
921 | 921 | ||
922 | read_unlock(&state->h->lock); | 922 | read_unlock(&state->h->lock); |
923 | } | 923 | } |
924 | EXPORT_SYMBOL_GPL(raw_seq_stop); | 924 | EXPORT_SYMBOL_GPL(raw_seq_stop); |
925 | 925 | ||
926 | static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) | 926 | static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) |
927 | { | 927 | { |
928 | struct inet_sock *inet = inet_sk(sp); | 928 | struct inet_sock *inet = inet_sk(sp); |
929 | __be32 dest = inet->daddr, | 929 | __be32 dest = inet->daddr, |
930 | src = inet->rcv_saddr; | 930 | src = inet->rcv_saddr; |
931 | __u16 destp = 0, | 931 | __u16 destp = 0, |
932 | srcp = inet->num; | 932 | srcp = inet->num; |
933 | 933 | ||
934 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" | 934 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" |
935 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", | 935 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", |
936 | i, src, srcp, dest, destp, sp->sk_state, | 936 | i, src, srcp, dest, destp, sp->sk_state, |
937 | atomic_read(&sp->sk_wmem_alloc), | 937 | atomic_read(&sp->sk_wmem_alloc), |
938 | atomic_read(&sp->sk_rmem_alloc), | 938 | atomic_read(&sp->sk_rmem_alloc), |
939 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), | 939 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), |
940 | atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); | 940 | atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); |
941 | } | 941 | } |
942 | 942 | ||
943 | static int raw_seq_show(struct seq_file *seq, void *v) | 943 | static int raw_seq_show(struct seq_file *seq, void *v) |
944 | { | 944 | { |
945 | if (v == SEQ_START_TOKEN) | 945 | if (v == SEQ_START_TOKEN) |
946 | seq_printf(seq, " sl local_address rem_address st tx_queue " | 946 | seq_printf(seq, " sl local_address rem_address st tx_queue " |
947 | "rx_queue tr tm->when retrnsmt uid timeout " | 947 | "rx_queue tr tm->when retrnsmt uid timeout " |
948 | "inode ref pointer drops\n"); | 948 | "inode ref pointer drops\n"); |
949 | else | 949 | else |
950 | raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); | 950 | raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); |
951 | return 0; | 951 | return 0; |
952 | } | 952 | } |
953 | 953 | ||
954 | static const struct seq_operations raw_seq_ops = { | 954 | static const struct seq_operations raw_seq_ops = { |
955 | .start = raw_seq_start, | 955 | .start = raw_seq_start, |
956 | .next = raw_seq_next, | 956 | .next = raw_seq_next, |
957 | .stop = raw_seq_stop, | 957 | .stop = raw_seq_stop, |
958 | .show = raw_seq_show, | 958 | .show = raw_seq_show, |
959 | }; | 959 | }; |
960 | 960 | ||
961 | int raw_seq_open(struct inode *ino, struct file *file, | 961 | int raw_seq_open(struct inode *ino, struct file *file, |
962 | struct raw_hashinfo *h, const struct seq_operations *ops) | 962 | struct raw_hashinfo *h, const struct seq_operations *ops) |
963 | { | 963 | { |
964 | int err; | 964 | int err; |
965 | struct raw_iter_state *i; | 965 | struct raw_iter_state *i; |
966 | 966 | ||
967 | err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state)); | 967 | err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state)); |
968 | if (err < 0) | 968 | if (err < 0) |
969 | return err; | 969 | return err; |
970 | 970 | ||
971 | i = raw_seq_private((struct seq_file *)file->private_data); | 971 | i = raw_seq_private((struct seq_file *)file->private_data); |
972 | i->h = h; | 972 | i->h = h; |
973 | return 0; | 973 | return 0; |
974 | } | 974 | } |
975 | EXPORT_SYMBOL_GPL(raw_seq_open); | 975 | EXPORT_SYMBOL_GPL(raw_seq_open); |
976 | 976 | ||
977 | static int raw_v4_seq_open(struct inode *inode, struct file *file) | 977 | static int raw_v4_seq_open(struct inode *inode, struct file *file) |
978 | { | 978 | { |
979 | return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops); | 979 | return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops); |
980 | } | 980 | } |
981 | 981 | ||
982 | static const struct file_operations raw_seq_fops = { | 982 | static const struct file_operations raw_seq_fops = { |
983 | .owner = THIS_MODULE, | 983 | .owner = THIS_MODULE, |
984 | .open = raw_v4_seq_open, | 984 | .open = raw_v4_seq_open, |
985 | .read = seq_read, | 985 | .read = seq_read, |
986 | .llseek = seq_lseek, | 986 | .llseek = seq_lseek, |
987 | .release = seq_release_net, | 987 | .release = seq_release_net, |
988 | }; | 988 | }; |
989 | 989 | ||
990 | static __net_init int raw_init_net(struct net *net) | 990 | static __net_init int raw_init_net(struct net *net) |
991 | { | 991 | { |
992 | if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) | 992 | if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) |
993 | return -ENOMEM; | 993 | return -ENOMEM; |
994 | 994 | ||
995 | return 0; | 995 | return 0; |
996 | } | 996 | } |
997 | 997 | ||
998 | static __net_exit void raw_exit_net(struct net *net) | 998 | static __net_exit void raw_exit_net(struct net *net) |
999 | { | 999 | { |
1000 | proc_net_remove(net, "raw"); | 1000 | proc_net_remove(net, "raw"); |
1001 | } | 1001 | } |
1002 | 1002 | ||
1003 | static __net_initdata struct pernet_operations raw_net_ops = { | 1003 | static __net_initdata struct pernet_operations raw_net_ops = { |
1004 | .init = raw_init_net, | 1004 | .init = raw_init_net, |
1005 | .exit = raw_exit_net, | 1005 | .exit = raw_exit_net, |
1006 | }; | 1006 | }; |
1007 | 1007 | ||
1008 | int __init raw_proc_init(void) | 1008 | int __init raw_proc_init(void) |
1009 | { | 1009 | { |
1010 | return register_pernet_subsys(&raw_net_ops); | 1010 | return register_pernet_subsys(&raw_net_ops); |
1011 | } | 1011 | } |
1012 | 1012 | ||
1013 | void __init raw_proc_exit(void) | 1013 | void __init raw_proc_exit(void) |
1014 | { | 1014 | { |
1015 | unregister_pernet_subsys(&raw_net_ops); | 1015 | unregister_pernet_subsys(&raw_net_ops); |
1016 | } | 1016 | } |
1017 | #endif /* CONFIG_PROC_FS */ | 1017 | #endif /* CONFIG_PROC_FS */ |
1018 | 1018 |
net/ipv4/tcp.c
1 | /* | 1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket | 3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. | 4 | * interface as the means of communication with the user level. |
5 | * | 5 | * |
6 | * Implementation of the Transmission Control Protocol(TCP). | 6 | * Implementation of the Transmission Control Protocol(TCP). |
7 | * | 7 | * |
8 | * Authors: Ross Biro | 8 | * Authors: Ross Biro |
9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> | 10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> |
11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> | 11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> |
12 | * Florian La Roche, <flla@stud.uni-sb.de> | 12 | * Florian La Roche, <flla@stud.uni-sb.de> |
13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> | 13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> |
14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> | 14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> |
15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> | 15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> |
16 | * Matthew Dillon, <dillon@apollo.west.oic.com> | 16 | * Matthew Dillon, <dillon@apollo.west.oic.com> |
17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | 17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> |
18 | * Jorge Cwik, <jorge@laser.satlink.net> | 18 | * Jorge Cwik, <jorge@laser.satlink.net> |
19 | * | 19 | * |
20 | * Fixes: | 20 | * Fixes: |
21 | * Alan Cox : Numerous verify_area() calls | 21 | * Alan Cox : Numerous verify_area() calls |
22 | * Alan Cox : Set the ACK bit on a reset | 22 | * Alan Cox : Set the ACK bit on a reset |
23 | * Alan Cox : Stopped it crashing if it closed while | 23 | * Alan Cox : Stopped it crashing if it closed while |
24 | * sk->inuse=1 and was trying to connect | 24 | * sk->inuse=1 and was trying to connect |
25 | * (tcp_err()). | 25 | * (tcp_err()). |
26 | * Alan Cox : All icmp error handling was broken | 26 | * Alan Cox : All icmp error handling was broken |
27 | * pointers passed were wrong and the | 27 | * pointers passed were wrong and the |
28 | * socket was looked up backwards. Nobody | 28 | * socket was looked up backwards. Nobody |
29 | * tested any icmp error code obviously. | 29 | * tested any icmp error code obviously. |
30 | * Alan Cox : tcp_err() now handled properly. It | 30 | * Alan Cox : tcp_err() now handled properly. It |
31 | * wakes people on errors. poll | 31 | * wakes people on errors. poll |
32 | * behaves and the icmp error race | 32 | * behaves and the icmp error race |
33 | * has gone by moving it into sock.c | 33 | * has gone by moving it into sock.c |
34 | * Alan Cox : tcp_send_reset() fixed to work for | 34 | * Alan Cox : tcp_send_reset() fixed to work for |
35 | * everything not just packets for | 35 | * everything not just packets for |
36 | * unknown sockets. | 36 | * unknown sockets. |
37 | * Alan Cox : tcp option processing. | 37 | * Alan Cox : tcp option processing. |
38 | * Alan Cox : Reset tweaked (still not 100%) [Had | 38 | * Alan Cox : Reset tweaked (still not 100%) [Had |
39 | * syn rule wrong] | 39 | * syn rule wrong] |
40 | * Herp Rosmanith : More reset fixes | 40 | * Herp Rosmanith : More reset fixes |
41 | * Alan Cox : No longer acks invalid rst frames. | 41 | * Alan Cox : No longer acks invalid rst frames. |
42 | * Acking any kind of RST is right out. | 42 | * Acking any kind of RST is right out. |
43 | * Alan Cox : Sets an ignore me flag on an rst | 43 | * Alan Cox : Sets an ignore me flag on an rst |
44 | * receive otherwise odd bits of prattle | 44 | * receive otherwise odd bits of prattle |
45 | * escape still | 45 | * escape still |
46 | * Alan Cox : Fixed another acking RST frame bug. | 46 | * Alan Cox : Fixed another acking RST frame bug. |
47 | * Should stop LAN workplace lockups. | 47 | * Should stop LAN workplace lockups. |
48 | * Alan Cox : Some tidyups using the new skb list | 48 | * Alan Cox : Some tidyups using the new skb list |
49 | * facilities | 49 | * facilities |
50 | * Alan Cox : sk->keepopen now seems to work | 50 | * Alan Cox : sk->keepopen now seems to work |
51 | * Alan Cox : Pulls options out correctly on accepts | 51 | * Alan Cox : Pulls options out correctly on accepts |
52 | * Alan Cox : Fixed assorted sk->rqueue->next errors | 52 | * Alan Cox : Fixed assorted sk->rqueue->next errors |
53 | * Alan Cox : PSH doesn't end a TCP read. Switched a | 53 | * Alan Cox : PSH doesn't end a TCP read. Switched a |
54 | * bit to skb ops. | 54 | * bit to skb ops. |
55 | * Alan Cox : Tidied tcp_data to avoid a potential | 55 | * Alan Cox : Tidied tcp_data to avoid a potential |
56 | * nasty. | 56 | * nasty. |
57 | * Alan Cox : Added some better commenting, as the | 57 | * Alan Cox : Added some better commenting, as the |
58 | * tcp is hard to follow | 58 | * tcp is hard to follow |
59 | * Alan Cox : Removed incorrect check for 20 * psh | 59 | * Alan Cox : Removed incorrect check for 20 * psh |
60 | * Michael O'Reilly : ack < copied bug fix. | 60 | * Michael O'Reilly : ack < copied bug fix. |
61 | * Johannes Stille : Misc tcp fixes (not all in yet). | 61 | * Johannes Stille : Misc tcp fixes (not all in yet). |
62 | * Alan Cox : FIN with no memory -> CRASH | 62 | * Alan Cox : FIN with no memory -> CRASH |
63 | * Alan Cox : Added socket option proto entries. | 63 | * Alan Cox : Added socket option proto entries. |
64 | * Also added awareness of them to accept. | 64 | * Also added awareness of them to accept. |
65 | * Alan Cox : Added TCP options (SOL_TCP) | 65 | * Alan Cox : Added TCP options (SOL_TCP) |
66 | * Alan Cox : Switched wakeup calls to callbacks, | 66 | * Alan Cox : Switched wakeup calls to callbacks, |
67 | * so the kernel can layer network | 67 | * so the kernel can layer network |
68 | * sockets. | 68 | * sockets. |
69 | * Alan Cox : Use ip_tos/ip_ttl settings. | 69 | * Alan Cox : Use ip_tos/ip_ttl settings. |
70 | * Alan Cox : Handle FIN (more) properly (we hope). | 70 | * Alan Cox : Handle FIN (more) properly (we hope). |
71 | * Alan Cox : RST frames sent on unsynchronised | 71 | * Alan Cox : RST frames sent on unsynchronised |
72 | * state ack error. | 72 | * state ack error. |
73 | * Alan Cox : Put in missing check for SYN bit. | 73 | * Alan Cox : Put in missing check for SYN bit. |
74 | * Alan Cox : Added tcp_select_window() aka NET2E | 74 | * Alan Cox : Added tcp_select_window() aka NET2E |
75 | * window non shrink trick. | 75 | * window non shrink trick. |
76 | * Alan Cox : Added a couple of small NET2E timer | 76 | * Alan Cox : Added a couple of small NET2E timer |
77 | * fixes | 77 | * fixes |
78 | * Charles Hedrick : TCP fixes | 78 | * Charles Hedrick : TCP fixes |
79 | * Toomas Tamm : TCP window fixes | 79 | * Toomas Tamm : TCP window fixes |
80 | * Alan Cox : Small URG fix to rlogin ^C ack fight | 80 | * Alan Cox : Small URG fix to rlogin ^C ack fight |
81 | * Charles Hedrick : Rewrote most of it to actually work | 81 | * Charles Hedrick : Rewrote most of it to actually work |
82 | * Linus : Rewrote tcp_read() and URG handling | 82 | * Linus : Rewrote tcp_read() and URG handling |
83 | * completely | 83 | * completely |
84 | * Gerhard Koerting: Fixed some missing timer handling | 84 | * Gerhard Koerting: Fixed some missing timer handling |
85 | * Matthew Dillon : Reworked TCP machine states as per RFC | 85 | * Matthew Dillon : Reworked TCP machine states as per RFC |
86 | * Gerhard Koerting: PC/TCP workarounds | 86 | * Gerhard Koerting: PC/TCP workarounds |
87 | * Adam Caldwell : Assorted timer/timing errors | 87 | * Adam Caldwell : Assorted timer/timing errors |
88 | * Matthew Dillon : Fixed another RST bug | 88 | * Matthew Dillon : Fixed another RST bug |
89 | * Alan Cox : Move to kernel side addressing changes. | 89 | * Alan Cox : Move to kernel side addressing changes. |
90 | * Alan Cox : Beginning work on TCP fastpathing | 90 | * Alan Cox : Beginning work on TCP fastpathing |
91 | * (not yet usable) | 91 | * (not yet usable) |
92 | * Arnt Gulbrandsen: Turbocharged tcp_check() routine. | 92 | * Arnt Gulbrandsen: Turbocharged tcp_check() routine. |
93 | * Alan Cox : TCP fast path debugging | 93 | * Alan Cox : TCP fast path debugging |
94 | * Alan Cox : Window clamping | 94 | * Alan Cox : Window clamping |
95 | * Michael Riepe : Bug in tcp_check() | 95 | * Michael Riepe : Bug in tcp_check() |
96 | * Matt Dillon : More TCP improvements and RST bug fixes | 96 | * Matt Dillon : More TCP improvements and RST bug fixes |
97 | * Matt Dillon : Yet more small nasties removed from the | 97 | * Matt Dillon : Yet more small nasties removed from the |
98 | * TCP code (Be very nice to this man if | 98 | * TCP code (Be very nice to this man if |
99 | * tcp finally works 100%) 8) | 99 | * tcp finally works 100%) 8) |
100 | * Alan Cox : BSD accept semantics. | 100 | * Alan Cox : BSD accept semantics. |
101 | * Alan Cox : Reset on closedown bug. | 101 | * Alan Cox : Reset on closedown bug. |
102 | * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). | 102 | * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). |
103 | * Michael Pall : Handle poll() after URG properly in | 103 | * Michael Pall : Handle poll() after URG properly in |
104 | * all cases. | 104 | * all cases. |
105 | * Michael Pall : Undo the last fix in tcp_read_urg() | 105 | * Michael Pall : Undo the last fix in tcp_read_urg() |
106 | * (multi URG PUSH broke rlogin). | 106 | * (multi URG PUSH broke rlogin). |
107 | * Michael Pall : Fix the multi URG PUSH problem in | 107 | * Michael Pall : Fix the multi URG PUSH problem in |
108 | * tcp_readable(), poll() after URG | 108 | * tcp_readable(), poll() after URG |
109 | * works now. | 109 | * works now. |
110 | * Michael Pall : recv(...,MSG_OOB) never blocks in the | 110 | * Michael Pall : recv(...,MSG_OOB) never blocks in the |
111 | * BSD api. | 111 | * BSD api. |
112 | * Alan Cox : Changed the semantics of sk->socket to | 112 | * Alan Cox : Changed the semantics of sk->socket to |
113 | * fix a race and a signal problem with | 113 | * fix a race and a signal problem with |
114 | * accept() and async I/O. | 114 | * accept() and async I/O. |
115 | * Alan Cox : Relaxed the rules on tcp_sendto(). | 115 | * Alan Cox : Relaxed the rules on tcp_sendto(). |
116 | * Yury Shevchuk : Really fixed accept() blocking problem. | 116 | * Yury Shevchuk : Really fixed accept() blocking problem. |
117 | * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for | 117 | * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for |
118 | * clients/servers which listen in on | 118 | * clients/servers which listen in on |
119 | * fixed ports. | 119 | * fixed ports. |
120 | * Alan Cox : Cleaned the above up and shrank it to | 120 | * Alan Cox : Cleaned the above up and shrank it to |
121 | * a sensible code size. | 121 | * a sensible code size. |
122 | * Alan Cox : Self connect lockup fix. | 122 | * Alan Cox : Self connect lockup fix. |
123 | * Alan Cox : No connect to multicast. | 123 | * Alan Cox : No connect to multicast. |
124 | * Ross Biro : Close unaccepted children on master | 124 | * Ross Biro : Close unaccepted children on master |
125 | * socket close. | 125 | * socket close. |
126 | * Alan Cox : Reset tracing code. | 126 | * Alan Cox : Reset tracing code. |
127 | * Alan Cox : Spurious resets on shutdown. | 127 | * Alan Cox : Spurious resets on shutdown. |
128 | * Alan Cox : Giant 15 minute/60 second timer error | 128 | * Alan Cox : Giant 15 minute/60 second timer error |
129 | * Alan Cox : Small whoops in polling before an | 129 | * Alan Cox : Small whoops in polling before an |
130 | * accept. | 130 | * accept. |
131 | * Alan Cox : Kept the state trace facility since | 131 | * Alan Cox : Kept the state trace facility since |
132 | * it's handy for debugging. | 132 | * it's handy for debugging. |
133 | * Alan Cox : More reset handler fixes. | 133 | * Alan Cox : More reset handler fixes. |
134 | * Alan Cox : Started rewriting the code based on | 134 | * Alan Cox : Started rewriting the code based on |
135 | * the RFC's for other useful protocol | 135 | * the RFC's for other useful protocol |
136 | * references see: Comer, KA9Q NOS, and | 136 | * references see: Comer, KA9Q NOS, and |
137 | * for a reference on the difference | 137 | * for a reference on the difference |
138 | * between specifications and how BSD | 138 | * between specifications and how BSD |
139 | * works see the 4.4lite source. | 139 | * works see the 4.4lite source. |
140 | * A.N.Kuznetsov : Don't time wait on completion of tidy | 140 | * A.N.Kuznetsov : Don't time wait on completion of tidy |
141 | * close. | 141 | * close. |
142 | * Linus Torvalds : Fin/Shutdown & copied_seq changes. | 142 | * Linus Torvalds : Fin/Shutdown & copied_seq changes. |
143 | * Linus Torvalds : Fixed BSD port reuse to work first syn | 143 | * Linus Torvalds : Fixed BSD port reuse to work first syn |
144 | * Alan Cox : Reimplemented timers as per the RFC | 144 | * Alan Cox : Reimplemented timers as per the RFC |
145 | * and using multiple timers for sanity. | 145 | * and using multiple timers for sanity. |
146 | * Alan Cox : Small bug fixes, and a lot of new | 146 | * Alan Cox : Small bug fixes, and a lot of new |
147 | * comments. | 147 | * comments. |
148 | * Alan Cox : Fixed dual reader crash by locking | 148 | * Alan Cox : Fixed dual reader crash by locking |
149 | * the buffers (much like datagram.c) | 149 | * the buffers (much like datagram.c) |
150 | * Alan Cox : Fixed stuck sockets in probe. A probe | 150 | * Alan Cox : Fixed stuck sockets in probe. A probe |
151 | * now gets fed up of retrying without | 151 | * now gets fed up of retrying without |
152 | * (even a no space) answer. | 152 | * (even a no space) answer. |
153 | * Alan Cox : Extracted closing code better | 153 | * Alan Cox : Extracted closing code better |
154 | * Alan Cox : Fixed the closing state machine to | 154 | * Alan Cox : Fixed the closing state machine to |
155 | * resemble the RFC. | 155 | * resemble the RFC. |
156 | * Alan Cox : More 'per spec' fixes. | 156 | * Alan Cox : More 'per spec' fixes. |
157 | * Jorge Cwik : Even faster checksumming. | 157 | * Jorge Cwik : Even faster checksumming. |
158 | * Alan Cox : tcp_data() doesn't ack illegal PSH | 158 | * Alan Cox : tcp_data() doesn't ack illegal PSH |
159 | * only frames. At least one pc tcp stack | 159 | * only frames. At least one pc tcp stack |
160 | * generates them. | 160 | * generates them. |
161 | * Alan Cox : Cache last socket. | 161 | * Alan Cox : Cache last socket. |
162 | * Alan Cox : Per route irtt. | 162 | * Alan Cox : Per route irtt. |
163 | * Matt Day : poll()->select() match BSD precisely on error | 163 | * Matt Day : poll()->select() match BSD precisely on error |
164 | * Alan Cox : New buffers | 164 | * Alan Cox : New buffers |
165 | * Marc Tamsky : Various sk->prot->retransmits and | 165 | * Marc Tamsky : Various sk->prot->retransmits and |
166 | * sk->retransmits misupdating fixed. | 166 | * sk->retransmits misupdating fixed. |
167 | * Fixed tcp_write_timeout: stuck close, | 167 | * Fixed tcp_write_timeout: stuck close, |
168 | * and TCP syn retries gets used now. | 168 | * and TCP syn retries gets used now. |
169 | * Mark Yarvis : In tcp_read_wakeup(), don't send an | 169 | * Mark Yarvis : In tcp_read_wakeup(), don't send an |
170 | * ack if state is TCP_CLOSED. | 170 | * ack if state is TCP_CLOSED. |
171 | * Alan Cox : Look up device on a retransmit - routes may | 171 | * Alan Cox : Look up device on a retransmit - routes may |
172 | * change. Doesn't yet cope with MSS shrink right | 172 | * change. Doesn't yet cope with MSS shrink right |
173 | * but it's a start! | 173 | * but it's a start! |
174 | * Marc Tamsky : Closing in closing fixes. | 174 | * Marc Tamsky : Closing in closing fixes. |
175 | * Mike Shaver : RFC1122 verifications. | 175 | * Mike Shaver : RFC1122 verifications. |
176 | * Alan Cox : rcv_saddr errors. | 176 | * Alan Cox : rcv_saddr errors. |
177 | * Alan Cox : Block double connect(). | 177 | * Alan Cox : Block double connect(). |
178 | * Alan Cox : Small hooks for enSKIP. | 178 | * Alan Cox : Small hooks for enSKIP. |
179 | * Alexey Kuznetsov: Path MTU discovery. | 179 | * Alexey Kuznetsov: Path MTU discovery. |
180 | * Alan Cox : Support soft errors. | 180 | * Alan Cox : Support soft errors. |
181 | * Alan Cox : Fix MTU discovery pathological case | 181 | * Alan Cox : Fix MTU discovery pathological case |
182 | * when the remote claims no mtu! | 182 | * when the remote claims no mtu! |
183 | * Marc Tamsky : TCP_CLOSE fix. | 183 | * Marc Tamsky : TCP_CLOSE fix. |
184 | * Colin (G3TNE) : Send a reset on syn ack replies in | 184 | * Colin (G3TNE) : Send a reset on syn ack replies in |
185 | * window but wrong (fixes NT lpd problems) | 185 | * window but wrong (fixes NT lpd problems) |
186 | * Pedro Roque : Better TCP window handling, delayed ack. | 186 | * Pedro Roque : Better TCP window handling, delayed ack. |
187 | * Joerg Reuter : No modification of locked buffers in | 187 | * Joerg Reuter : No modification of locked buffers in |
188 | * tcp_do_retransmit() | 188 | * tcp_do_retransmit() |
189 | * Eric Schenk : Changed receiver side silly window | 189 | * Eric Schenk : Changed receiver side silly window |
190 | * avoidance algorithm to BSD style | 190 | * avoidance algorithm to BSD style |
191 | * algorithm. This doubles throughput | 191 | * algorithm. This doubles throughput |
192 | * against machines running Solaris, | 192 | * against machines running Solaris, |
193 | * and seems to result in general | 193 | * and seems to result in general |
194 | * improvement. | 194 | * improvement. |
195 | * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD | 195 | * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD |
196 | * Willy Konynenberg : Transparent proxying support. | 196 | * Willy Konynenberg : Transparent proxying support. |
197 | * Mike McLagan : Routing by source | 197 | * Mike McLagan : Routing by source |
198 | * Keith Owens : Do proper merging with partial SKB's in | 198 | * Keith Owens : Do proper merging with partial SKB's in |
199 | * tcp_do_sendmsg to avoid burstiness. | 199 | * tcp_do_sendmsg to avoid burstiness. |
200 | * Eric Schenk : Fix fast close down bug with | 200 | * Eric Schenk : Fix fast close down bug with |
201 | * shutdown() followed by close(). | 201 | * shutdown() followed by close(). |
202 | * Andi Kleen : Make poll agree with SIGIO | 202 | * Andi Kleen : Make poll agree with SIGIO |
203 | * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and | 203 | * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and |
204 | * lingertime == 0 (RFC 793 ABORT Call) | 204 | * lingertime == 0 (RFC 793 ABORT Call) |
205 | * Hirokazu Takahashi : Use copy_from_user() instead of | 205 | * Hirokazu Takahashi : Use copy_from_user() instead of |
206 | * csum_and_copy_from_user() if possible. | 206 | * csum_and_copy_from_user() if possible. |
207 | * | 207 | * |
208 | * This program is free software; you can redistribute it and/or | 208 | * This program is free software; you can redistribute it and/or |
209 | * modify it under the terms of the GNU General Public License | 209 | * modify it under the terms of the GNU General Public License |
210 | * as published by the Free Software Foundation; either version | 210 | * as published by the Free Software Foundation; either version |
211 | * 2 of the License, or (at your option) any later version. | 211 | * 2 of the License, or (at your option) any later version. |
212 | * | 212 | * |
213 | * Description of States: | 213 | * Description of States: |
214 | * | 214 | * |
215 | * TCP_SYN_SENT sent a connection request, waiting for ack | 215 | * TCP_SYN_SENT sent a connection request, waiting for ack |
216 | * | 216 | * |
217 | * TCP_SYN_RECV received a connection request, sent ack, | 217 | * TCP_SYN_RECV received a connection request, sent ack, |
218 | * waiting for final ack in three-way handshake. | 218 | * waiting for final ack in three-way handshake. |
219 | * | 219 | * |
220 | * TCP_ESTABLISHED connection established | 220 | * TCP_ESTABLISHED connection established |
221 | * | 221 | * |
222 | * TCP_FIN_WAIT1 our side has shutdown, waiting to complete | 222 | * TCP_FIN_WAIT1 our side has shutdown, waiting to complete |
223 | * transmission of remaining buffered data | 223 | * transmission of remaining buffered data |
224 | * | 224 | * |
225 | * TCP_FIN_WAIT2 all buffered data sent, waiting for remote | 225 | * TCP_FIN_WAIT2 all buffered data sent, waiting for remote |
226 | * to shutdown | 226 | * to shutdown |
227 | * | 227 | * |
228 | * TCP_CLOSING both sides have shutdown but we still have | 228 | * TCP_CLOSING both sides have shutdown but we still have |
229 | * data we have to finish sending | 229 | * data we have to finish sending |
230 | * | 230 | * |
231 | * TCP_TIME_WAIT timeout to catch resent junk before entering | 231 | * TCP_TIME_WAIT timeout to catch resent junk before entering |
232 | * closed, can only be entered from FIN_WAIT2 | 232 | * closed, can only be entered from FIN_WAIT2 |
233 | * or CLOSING. Required because the other end | 233 | * or CLOSING. Required because the other end |
234 | * may not have gotten our last ACK causing it | 234 | * may not have gotten our last ACK causing it |
235 | * to retransmit the data packet (which we ignore) | 235 | * to retransmit the data packet (which we ignore) |
236 | * | 236 | * |
237 | * TCP_CLOSE_WAIT remote side has shutdown and is waiting for | 237 | * TCP_CLOSE_WAIT remote side has shutdown and is waiting for |
238 | * us to finish writing our data and to shutdown | 238 | * us to finish writing our data and to shutdown |
239 | * (we have to close() to move on to LAST_ACK) | 239 | * (we have to close() to move on to LAST_ACK) |
240 | * | 240 | * |
241 | * TCP_LAST_ACK our side has shutdown after remote has | 241 | * TCP_LAST_ACK our side has shutdown after remote has |
242 | * shutdown. There may still be data in our | 242 | * shutdown. There may still be data in our |
243 | * buffer that we have to finish sending | 243 | * buffer that we have to finish sending |
244 | * | 244 | * |
245 | * TCP_CLOSE socket is finished | 245 | * TCP_CLOSE socket is finished |
246 | */ | 246 | */ |
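The state list in the comment above matches the TCP_* constants that userspace also sees: glibc's <netinet/tcp.h> defines the same enumeration (TCP_ESTABLISHED = 1 through TCP_CLOSING = 11), which is the numbering /proc/net/tcp prints in hex. A small hedged sketch of mapping a numeric state to its name, assuming that conventional numbering:

#include <stdio.h>
#include <netinet/tcp.h>	/* TCP_ESTABLISHED .. TCP_CLOSING */

static const char *tcp_state_name(int st)
{
	switch (st) {
	case TCP_ESTABLISHED:	return "ESTABLISHED";
	case TCP_SYN_SENT:	return "SYN_SENT";
	case TCP_SYN_RECV:	return "SYN_RECV";
	case TCP_FIN_WAIT1:	return "FIN_WAIT1";
	case TCP_FIN_WAIT2:	return "FIN_WAIT2";
	case TCP_TIME_WAIT:	return "TIME_WAIT";
	case TCP_CLOSE:		return "CLOSE";
	case TCP_CLOSE_WAIT:	return "CLOSE_WAIT";
	case TCP_LAST_ACK:	return "LAST_ACK";
	case TCP_LISTEN:	return "LISTEN";
	case TCP_CLOSING:	return "CLOSING";
	default:		return "UNKNOWN";
	}
}

int main(void)
{
	printf("state 0x01 = %s\n", tcp_state_name(1));	/* ESTABLISHED */
	return 0;
}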
247 | 247 | ||
248 | #include <linux/kernel.h> | 248 | #include <linux/kernel.h> |
249 | #include <linux/module.h> | 249 | #include <linux/module.h> |
250 | #include <linux/types.h> | 250 | #include <linux/types.h> |
251 | #include <linux/fcntl.h> | 251 | #include <linux/fcntl.h> |
252 | #include <linux/poll.h> | 252 | #include <linux/poll.h> |
253 | #include <linux/init.h> | 253 | #include <linux/init.h> |
254 | #include <linux/fs.h> | 254 | #include <linux/fs.h> |
255 | #include <linux/skbuff.h> | 255 | #include <linux/skbuff.h> |
256 | #include <linux/scatterlist.h> | 256 | #include <linux/scatterlist.h> |
257 | #include <linux/splice.h> | 257 | #include <linux/splice.h> |
258 | #include <linux/net.h> | 258 | #include <linux/net.h> |
259 | #include <linux/socket.h> | 259 | #include <linux/socket.h> |
260 | #include <linux/random.h> | 260 | #include <linux/random.h> |
261 | #include <linux/bootmem.h> | 261 | #include <linux/bootmem.h> |
262 | #include <linux/highmem.h> | 262 | #include <linux/highmem.h> |
263 | #include <linux/swap.h> | 263 | #include <linux/swap.h> |
264 | #include <linux/cache.h> | 264 | #include <linux/cache.h> |
265 | #include <linux/err.h> | 265 | #include <linux/err.h> |
266 | #include <linux/crypto.h> | 266 | #include <linux/crypto.h> |
267 | 267 | ||
268 | #include <net/icmp.h> | 268 | #include <net/icmp.h> |
269 | #include <net/tcp.h> | 269 | #include <net/tcp.h> |
270 | #include <net/xfrm.h> | 270 | #include <net/xfrm.h> |
271 | #include <net/ip.h> | 271 | #include <net/ip.h> |
272 | #include <net/netdma.h> | 272 | #include <net/netdma.h> |
273 | #include <net/sock.h> | 273 | #include <net/sock.h> |
274 | 274 | ||
275 | #include <asm/uaccess.h> | 275 | #include <asm/uaccess.h> |
276 | #include <asm/ioctls.h> | 276 | #include <asm/ioctls.h> |
277 | 277 | ||
278 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | 278 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; |
279 | 279 | ||
280 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); | 280 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); |
281 | 281 | ||
282 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 282 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
283 | 283 | ||
284 | int sysctl_tcp_mem[3] __read_mostly; | 284 | int sysctl_tcp_mem[3] __read_mostly; |
285 | int sysctl_tcp_wmem[3] __read_mostly; | 285 | int sysctl_tcp_wmem[3] __read_mostly; |
286 | int sysctl_tcp_rmem[3] __read_mostly; | 286 | int sysctl_tcp_rmem[3] __read_mostly; |
287 | 287 | ||
288 | EXPORT_SYMBOL(sysctl_tcp_mem); | 288 | EXPORT_SYMBOL(sysctl_tcp_mem); |
289 | EXPORT_SYMBOL(sysctl_tcp_rmem); | 289 | EXPORT_SYMBOL(sysctl_tcp_rmem); |
290 | EXPORT_SYMBOL(sysctl_tcp_wmem); | 290 | EXPORT_SYMBOL(sysctl_tcp_wmem); |
291 | 291 | ||
292 | atomic_t tcp_memory_allocated; /* Current allocated memory. */ | 292 | atomic_t tcp_memory_allocated; /* Current allocated memory. */ |
293 | atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ | 293 | atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ |
294 | 294 | ||
295 | EXPORT_SYMBOL(tcp_memory_allocated); | 295 | EXPORT_SYMBOL(tcp_memory_allocated); |
296 | EXPORT_SYMBOL(tcp_sockets_allocated); | 296 | EXPORT_SYMBOL(tcp_sockets_allocated); |
297 | 297 | ||
298 | /* | 298 | /* |
299 | * TCP splice context | 299 | * TCP splice context |
300 | */ | 300 | */ |
301 | struct tcp_splice_state { | 301 | struct tcp_splice_state { |
302 | struct pipe_inode_info *pipe; | 302 | struct pipe_inode_info *pipe; |
303 | size_t len; | 303 | size_t len; |
304 | unsigned int flags; | 304 | unsigned int flags; |
305 | }; | 305 | }; |
306 | 306 | ||
307 | /* | 307 | /* |
308 | * Pressure flag: try to collapse. | 308 | * Pressure flag: try to collapse. |
309 | * Technical note: it is used by multiple contexts non atomically. | 309 | * Technical note: it is used by multiple contexts non atomically. |
310 | * All the __sk_mem_schedule() is of this nature: accounting | 310 | * All the __sk_mem_schedule() is of this nature: accounting |
311 | * is strict, actions are advisory and have some latency. | 311 | * is strict, actions are advisory and have some latency. |
312 | */ | 312 | */ |
313 | int tcp_memory_pressure __read_mostly; | 313 | int tcp_memory_pressure __read_mostly; |
314 | 314 | ||
315 | EXPORT_SYMBOL(tcp_memory_pressure); | 315 | EXPORT_SYMBOL(tcp_memory_pressure); |
316 | 316 | ||
317 | void tcp_enter_memory_pressure(struct sock *sk) | 317 | void tcp_enter_memory_pressure(struct sock *sk) |
318 | { | 318 | { |
319 | if (!tcp_memory_pressure) { | 319 | if (!tcp_memory_pressure) { |
320 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); | 320 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); |
321 | tcp_memory_pressure = 1; | 321 | tcp_memory_pressure = 1; |
322 | } | 322 | } |
323 | } | 323 | } |
324 | 324 | ||
325 | EXPORT_SYMBOL(tcp_enter_memory_pressure); | 325 | EXPORT_SYMBOL(tcp_enter_memory_pressure); |
326 | 326 | ||
327 | /* | 327 | /* |
328 | * Wait for a TCP event. | 328 | * Wait for a TCP event. |
329 | * | 329 | * |
330 | * Note that we don't need to lock the socket, as the upper poll layers | 330 | * Note that we don't need to lock the socket, as the upper poll layers |
331 | * take care of normal races (between the test and the event) and we don't | 331 | * take care of normal races (between the test and the event) and we don't |
332 | * go look at any of the socket buffers directly. | 332 | * go look at any of the socket buffers directly. |
333 | */ | 333 | */ |
334 | unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | 334 | unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) |
335 | { | 335 | { |
336 | unsigned int mask; | 336 | unsigned int mask; |
337 | struct sock *sk = sock->sk; | 337 | struct sock *sk = sock->sk; |
338 | struct tcp_sock *tp = tcp_sk(sk); | 338 | struct tcp_sock *tp = tcp_sk(sk); |
339 | 339 | ||
340 | poll_wait(file, sk->sk_sleep, wait); | 340 | poll_wait(file, sk->sk_sleep, wait); |
341 | if (sk->sk_state == TCP_LISTEN) | 341 | if (sk->sk_state == TCP_LISTEN) |
342 | return inet_csk_listen_poll(sk); | 342 | return inet_csk_listen_poll(sk); |
343 | 343 | ||
344 | /* Socket is not locked. We are protected from async events | 344 | /* Socket is not locked. We are protected from async events |
345 | * by poll logic and correct handling of state changes | 345 | * by poll logic and correct handling of state changes |
346 | * made by other threads is impossible in any case. | 346 | * made by other threads is impossible in any case. |
347 | */ | 347 | */ |
348 | 348 | ||
349 | mask = 0; | 349 | mask = 0; |
350 | if (sk->sk_err) | 350 | if (sk->sk_err) |
351 | mask = POLLERR; | 351 | mask = POLLERR; |
352 | 352 | ||
353 | /* | 353 | /* |
354 | * POLLHUP is certainly not done right. But poll() doesn't | 354 | * POLLHUP is certainly not done right. But poll() doesn't |
355 | * have a notion of HUP in just one direction, and for a | 355 | * have a notion of HUP in just one direction, and for a |
356 | * socket the read side is more interesting. | 356 | * socket the read side is more interesting. |
357 | * | 357 | * |
358 | * Some poll() documentation says that POLLHUP is incompatible | 358 | * Some poll() documentation says that POLLHUP is incompatible |
359 | * with the POLLOUT/POLLWR flags, so somebody should check this | 359 | * with the POLLOUT/POLLWR flags, so somebody should check this |
360 | * all. But careful, it tends to be safer to return too many | 360 | * all. But careful, it tends to be safer to return too many |
361 | * bits than too few, and you can easily break real applications | 361 | * bits than too few, and you can easily break real applications |
362 | * if you don't tell them that something has hung up! | 362 | * if you don't tell them that something has hung up! |
363 | * | 363 | * |
364 | * Check-me. | 364 | * Check-me. |
365 | * | 365 | * |
366 | * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and | 366 | * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and |
367 | * our fs/select.c). It means that after we received EOF, | 367 | * our fs/select.c). It means that after we received EOF, |
368 | * poll always returns immediately, making poll() for write() impossible | 368 | * poll always returns immediately, making poll() for write() impossible |
369 | * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP | 369 | * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP |
370 | * if and only if shutdown has been made in both directions. | 370 | * if and only if shutdown has been made in both directions. |
371 | * Actually, it is interesting to look how Solaris and DUX | 371 | * Actually, it is interesting to look how Solaris and DUX |
372 | * solve this dilemma. I would prefer, if POLLHUP were maskable, | 372 | * solve this dilemma. I would prefer, if POLLHUP were maskable, |
373 | * then we could set it on SND_SHUTDOWN. BTW examples given | 373 | * then we could set it on SND_SHUTDOWN. BTW examples given |
374 | * in Stevens' books assume exactly this behaviour, it explains | 374 | * in Stevens' books assume exactly this behaviour, it explains |
375 | * why POLLHUP is incompatible with POLLOUT. --ANK | 375 | * why POLLHUP is incompatible with POLLOUT. --ANK |
376 | * | 376 | * |
377 | * NOTE. Check for TCP_CLOSE is added. The goal is to prevent | 377 | * NOTE. Check for TCP_CLOSE is added. The goal is to prevent |
378 | * blocking on fresh not-connected or disconnected socket. --ANK | 378 | * blocking on fresh not-connected or disconnected socket. --ANK |
379 | */ | 379 | */ |
380 | if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) | 380 | if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) |
381 | mask |= POLLHUP; | 381 | mask |= POLLHUP; |
382 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 382 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
383 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; | 383 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; |
384 | 384 | ||
385 | /* Connected? */ | 385 | /* Connected? */ |
386 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 386 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
387 | int target = sock_rcvlowat(sk, 0, INT_MAX); | 387 | int target = sock_rcvlowat(sk, 0, INT_MAX); |
388 | 388 | ||
389 | if (tp->urg_seq == tp->copied_seq && | 389 | if (tp->urg_seq == tp->copied_seq && |
390 | !sock_flag(sk, SOCK_URGINLINE) && | 390 | !sock_flag(sk, SOCK_URGINLINE) && |
391 | tp->urg_data) | 391 | tp->urg_data) |
392 | target--; | 392 | target--; |
393 | 393 | ||
394 | /* Potential race condition. If read of tp below will | 394 | /* Potential race condition. If read of tp below will |
395 | * escape above sk->sk_state, we can be illegally awakened | 395 | * escape above sk->sk_state, we can be illegally awakened |
396 | * in SYN_* states. */ | 396 | * in SYN_* states. */ |
397 | if (tp->rcv_nxt - tp->copied_seq >= target) | 397 | if (tp->rcv_nxt - tp->copied_seq >= target) |
398 | mask |= POLLIN | POLLRDNORM; | 398 | mask |= POLLIN | POLLRDNORM; |
399 | 399 | ||
400 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { | 400 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { |
401 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { | 401 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { |
402 | mask |= POLLOUT | POLLWRNORM; | 402 | mask |= POLLOUT | POLLWRNORM; |
403 | } else { /* send SIGIO later */ | 403 | } else { /* send SIGIO later */ |
404 | set_bit(SOCK_ASYNC_NOSPACE, | 404 | set_bit(SOCK_ASYNC_NOSPACE, |
405 | &sk->sk_socket->flags); | 405 | &sk->sk_socket->flags); |
406 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 406 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
407 | 407 | ||
408 | /* Race breaker. If space is freed after | 408 | /* Race breaker. If space is freed after |
409 | * wspace test but before the flags are set, | 409 | * wspace test but before the flags are set, |
410 | * IO signal will be lost. | 410 | * IO signal will be lost. |
411 | */ | 411 | */ |
412 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) | 412 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) |
413 | mask |= POLLOUT | POLLWRNORM; | 413 | mask |= POLLOUT | POLLWRNORM; |
414 | } | 414 | } |
415 | } | 415 | } |
416 | 416 | ||
417 | if (tp->urg_data & TCP_URG_VALID) | 417 | if (tp->urg_data & TCP_URG_VALID) |
418 | mask |= POLLPRI; | 418 | mask |= POLLPRI; |
419 | } | 419 | } |
420 | return mask; | 420 | return mask; |
421 | } | 421 | } |
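tcp_poll() above is what ultimately produces the revents bits a caller of poll(2) sees on a TCP socket. A hedged userspace sketch of how those bits are typically consumed (assumes an already-connected fd; POLLRDHUP needs _GNU_SOURCE), not part of this commit:

#define _GNU_SOURCE		/* for POLLRDHUP */
#include <poll.h>
#include <stdio.h>

/* Wait up to 5s for readability/writability on a connected TCP socket. */
int wait_on_socket(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLRDHUP };
	int n = poll(&pfd, 1, 5000);

	if (n <= 0)
		return n;				/* 0 = timeout, -1 = error */
	if (pfd.revents & POLLERR)
		fprintf(stderr, "socket error pending\n");
	if (pfd.revents & (POLLHUP | POLLRDHUP))
		fprintf(stderr, "peer shut down or connection closed\n");
	if (pfd.revents & POLLIN)
		fprintf(stderr, "data (>= SO_RCVLOWAT) available\n");
	if (pfd.revents & POLLOUT)
		fprintf(stderr, "send buffer has room\n");
	return n;
}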
422 | 422 | ||
423 | int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) | 423 | int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) |
424 | { | 424 | { |
425 | struct tcp_sock *tp = tcp_sk(sk); | 425 | struct tcp_sock *tp = tcp_sk(sk); |
426 | int answ; | 426 | int answ; |
427 | 427 | ||
428 | switch (cmd) { | 428 | switch (cmd) { |
429 | case SIOCINQ: | 429 | case SIOCINQ: |
430 | if (sk->sk_state == TCP_LISTEN) | 430 | if (sk->sk_state == TCP_LISTEN) |
431 | return -EINVAL; | 431 | return -EINVAL; |
432 | 432 | ||
433 | lock_sock(sk); | 433 | lock_sock(sk); |
434 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) | 434 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) |
435 | answ = 0; | 435 | answ = 0; |
436 | else if (sock_flag(sk, SOCK_URGINLINE) || | 436 | else if (sock_flag(sk, SOCK_URGINLINE) || |
437 | !tp->urg_data || | 437 | !tp->urg_data || |
438 | before(tp->urg_seq, tp->copied_seq) || | 438 | before(tp->urg_seq, tp->copied_seq) || |
439 | !before(tp->urg_seq, tp->rcv_nxt)) { | 439 | !before(tp->urg_seq, tp->rcv_nxt)) { |
440 | answ = tp->rcv_nxt - tp->copied_seq; | 440 | answ = tp->rcv_nxt - tp->copied_seq; |
441 | 441 | ||
442 | /* Subtract 1, if FIN is in queue. */ | 442 | /* Subtract 1, if FIN is in queue. */ |
443 | if (answ && !skb_queue_empty(&sk->sk_receive_queue)) | 443 | if (answ && !skb_queue_empty(&sk->sk_receive_queue)) |
444 | answ -= | 444 | answ -= |
445 | tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; | 445 | tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; |
446 | } else | 446 | } else |
447 | answ = tp->urg_seq - tp->copied_seq; | 447 | answ = tp->urg_seq - tp->copied_seq; |
448 | release_sock(sk); | 448 | release_sock(sk); |
449 | break; | 449 | break; |
450 | case SIOCATMARK: | 450 | case SIOCATMARK: |
451 | answ = tp->urg_data && tp->urg_seq == tp->copied_seq; | 451 | answ = tp->urg_data && tp->urg_seq == tp->copied_seq; |
452 | break; | 452 | break; |
453 | case SIOCOUTQ: | 453 | case SIOCOUTQ: |
454 | if (sk->sk_state == TCP_LISTEN) | 454 | if (sk->sk_state == TCP_LISTEN) |
455 | return -EINVAL; | 455 | return -EINVAL; |
456 | 456 | ||
457 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) | 457 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) |
458 | answ = 0; | 458 | answ = 0; |
459 | else | 459 | else |
460 | answ = tp->write_seq - tp->snd_una; | 460 | answ = tp->write_seq - tp->snd_una; |
461 | break; | 461 | break; |
462 | default: | 462 | default: |
463 | return -ENOIOCTLCMD; | 463 | return -ENOIOCTLCMD; |
464 | } | 464 | } |
465 | 465 | ||
466 | return put_user(answ, (int __user *)arg); | 466 | return put_user(answ, (int __user *)arg); |
467 | } | 467 | } |
468 | 468 | ||
469 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) | 469 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) |
470 | { | 470 | { |
471 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; | 471 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; |
472 | tp->pushed_seq = tp->write_seq; | 472 | tp->pushed_seq = tp->write_seq; |
473 | } | 473 | } |
474 | 474 | ||
475 | static inline int forced_push(struct tcp_sock *tp) | 475 | static inline int forced_push(struct tcp_sock *tp) |
476 | { | 476 | { |
477 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); | 477 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); |
478 | } | 478 | } |
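forced_push() relies on the kernel's wrap-safe sequence comparisons: before(a, b) is (s32)(a - b) < 0 and after(a, b) is before(b, a), so the push fires once write_seq is more than half of max_window ahead of pushed_seq even across 32-bit wraparound. A minimal sketch of the same arithmetic (illustrative, using that standard definition):

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe TCP sequence comparison, same idea as before()/after(). */
static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

int main(void)
{
	uint32_t pushed_seq = 0xffffff00u, max_window = 65535;
	uint32_t write_seq  = pushed_seq + (max_window >> 1) + 1;	/* wraps past 0 */

	/* Mirrors forced_push(): push once more than half a window is queued. */
	printf("forced push? %d\n",
	       seq_after(write_seq, pushed_seq + (max_window >> 1)));	/* prints 1 */
	return 0;
}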
479 | 479 | ||
480 | static inline void skb_entail(struct sock *sk, struct sk_buff *skb) | 480 | static inline void skb_entail(struct sock *sk, struct sk_buff *skb) |
481 | { | 481 | { |
482 | struct tcp_sock *tp = tcp_sk(sk); | 482 | struct tcp_sock *tp = tcp_sk(sk); |
483 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | 483 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
484 | 484 | ||
485 | skb->csum = 0; | 485 | skb->csum = 0; |
486 | tcb->seq = tcb->end_seq = tp->write_seq; | 486 | tcb->seq = tcb->end_seq = tp->write_seq; |
487 | tcb->flags = TCPCB_FLAG_ACK; | 487 | tcb->flags = TCPCB_FLAG_ACK; |
488 | tcb->sacked = 0; | 488 | tcb->sacked = 0; |
489 | skb_header_release(skb); | 489 | skb_header_release(skb); |
490 | tcp_add_write_queue_tail(sk, skb); | 490 | tcp_add_write_queue_tail(sk, skb); |
491 | sk->sk_wmem_queued += skb->truesize; | 491 | sk->sk_wmem_queued += skb->truesize; |
492 | sk_mem_charge(sk, skb->truesize); | 492 | sk_mem_charge(sk, skb->truesize); |
493 | if (tp->nonagle & TCP_NAGLE_PUSH) | 493 | if (tp->nonagle & TCP_NAGLE_PUSH) |
494 | tp->nonagle &= ~TCP_NAGLE_PUSH; | 494 | tp->nonagle &= ~TCP_NAGLE_PUSH; |
495 | } | 495 | } |
496 | 496 | ||
497 | static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, | 497 | static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, |
498 | struct sk_buff *skb) | 498 | struct sk_buff *skb) |
499 | { | 499 | { |
500 | if (flags & MSG_OOB) | 500 | if (flags & MSG_OOB) |
501 | tp->snd_up = tp->write_seq; | 501 | tp->snd_up = tp->write_seq; |
502 | } | 502 | } |
503 | 503 | ||
504 | static inline void tcp_push(struct sock *sk, int flags, int mss_now, | 504 | static inline void tcp_push(struct sock *sk, int flags, int mss_now, |
505 | int nonagle) | 505 | int nonagle) |
506 | { | 506 | { |
507 | struct tcp_sock *tp = tcp_sk(sk); | 507 | struct tcp_sock *tp = tcp_sk(sk); |
508 | 508 | ||
509 | if (tcp_send_head(sk)) { | 509 | if (tcp_send_head(sk)) { |
510 | struct sk_buff *skb = tcp_write_queue_tail(sk); | 510 | struct sk_buff *skb = tcp_write_queue_tail(sk); |
511 | if (!(flags & MSG_MORE) || forced_push(tp)) | 511 | if (!(flags & MSG_MORE) || forced_push(tp)) |
512 | tcp_mark_push(tp, skb); | 512 | tcp_mark_push(tp, skb); |
513 | tcp_mark_urg(tp, flags, skb); | 513 | tcp_mark_urg(tp, flags, skb); |
514 | __tcp_push_pending_frames(sk, mss_now, | 514 | __tcp_push_pending_frames(sk, mss_now, |
515 | (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); | 515 | (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); |
516 | } | 516 | } |
517 | } | 517 | } |
518 | 518 | ||
519 | static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, | 519 | static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, |
520 | unsigned int offset, size_t len) | 520 | unsigned int offset, size_t len) |
521 | { | 521 | { |
522 | struct tcp_splice_state *tss = rd_desc->arg.data; | 522 | struct tcp_splice_state *tss = rd_desc->arg.data; |
523 | 523 | ||
524 | return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); | 524 | return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); |
525 | } | 525 | } |
526 | 526 | ||
527 | static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) | 527 | static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) |
528 | { | 528 | { |
529 | /* Store TCP splice context information in read_descriptor_t. */ | 529 | /* Store TCP splice context information in read_descriptor_t. */ |
530 | read_descriptor_t rd_desc = { | 530 | read_descriptor_t rd_desc = { |
531 | .arg.data = tss, | 531 | .arg.data = tss, |
532 | }; | 532 | }; |
533 | 533 | ||
534 | return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); | 534 | return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); |
535 | } | 535 | } |
536 | 536 | ||
537 | /** | 537 | /** |
538 | * tcp_splice_read - splice data from TCP socket to a pipe | 538 | * tcp_splice_read - splice data from TCP socket to a pipe |
539 | * @sock: socket to splice from | 539 | * @sock: socket to splice from |
540 | * @ppos: position (not valid) | 540 | * @ppos: position (not valid) |
541 | * @pipe: pipe to splice to | 541 | * @pipe: pipe to splice to |
542 | * @len: number of bytes to splice | 542 | * @len: number of bytes to splice |
543 | * @flags: splice modifier flags | 543 | * @flags: splice modifier flags |
544 | * | 544 | * |
545 | * Description: | 545 | * Description: |
546 | * Will read pages from given socket and fill them into a pipe. | 546 | * Will read pages from given socket and fill them into a pipe. |
547 | * | 547 | * |
548 | **/ | 548 | **/ |
549 | ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, | 549 | ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, |
550 | struct pipe_inode_info *pipe, size_t len, | 550 | struct pipe_inode_info *pipe, size_t len, |
551 | unsigned int flags) | 551 | unsigned int flags) |
552 | { | 552 | { |
553 | struct sock *sk = sock->sk; | 553 | struct sock *sk = sock->sk; |
554 | struct tcp_splice_state tss = { | 554 | struct tcp_splice_state tss = { |
555 | .pipe = pipe, | 555 | .pipe = pipe, |
556 | .len = len, | 556 | .len = len, |
557 | .flags = flags, | 557 | .flags = flags, |
558 | }; | 558 | }; |
559 | long timeo; | 559 | long timeo; |
560 | ssize_t spliced; | 560 | ssize_t spliced; |
561 | int ret; | 561 | int ret; |
562 | 562 | ||
563 | /* | 563 | /* |
564 | * We can't seek on a socket input | 564 | * We can't seek on a socket input |
565 | */ | 565 | */ |
566 | if (unlikely(*ppos)) | 566 | if (unlikely(*ppos)) |
567 | return -ESPIPE; | 567 | return -ESPIPE; |
568 | 568 | ||
569 | ret = spliced = 0; | 569 | ret = spliced = 0; |
570 | 570 | ||
571 | lock_sock(sk); | 571 | lock_sock(sk); |
572 | 572 | ||
573 | timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); | 573 | timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); |
574 | while (tss.len) { | 574 | while (tss.len) { |
575 | ret = __tcp_splice_read(sk, &tss); | 575 | ret = __tcp_splice_read(sk, &tss); |
576 | if (ret < 0) | 576 | if (ret < 0) |
577 | break; | 577 | break; |
578 | else if (!ret) { | 578 | else if (!ret) { |
579 | if (spliced) | 579 | if (spliced) |
580 | break; | 580 | break; |
581 | if (flags & SPLICE_F_NONBLOCK) { | 581 | if (flags & SPLICE_F_NONBLOCK) { |
582 | ret = -EAGAIN; | 582 | ret = -EAGAIN; |
583 | break; | 583 | break; |
584 | } | 584 | } |
585 | if (sock_flag(sk, SOCK_DONE)) | 585 | if (sock_flag(sk, SOCK_DONE)) |
586 | break; | 586 | break; |
587 | if (sk->sk_err) { | 587 | if (sk->sk_err) { |
588 | ret = sock_error(sk); | 588 | ret = sock_error(sk); |
589 | break; | 589 | break; |
590 | } | 590 | } |
591 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 591 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
592 | break; | 592 | break; |
593 | if (sk->sk_state == TCP_CLOSE) { | 593 | if (sk->sk_state == TCP_CLOSE) { |
594 | /* | 594 | /* |
595 | * This occurs when user tries to read | 595 | * This occurs when user tries to read |
596 | * from a never-connected socket. | 596 | * from a never-connected socket. |
597 | */ | 597 | */ |
598 | if (!sock_flag(sk, SOCK_DONE)) | 598 | if (!sock_flag(sk, SOCK_DONE)) |
599 | ret = -ENOTCONN; | 599 | ret = -ENOTCONN; |
600 | break; | 600 | break; |
601 | } | 601 | } |
602 | if (!timeo) { | 602 | if (!timeo) { |
603 | ret = -EAGAIN; | 603 | ret = -EAGAIN; |
604 | break; | 604 | break; |
605 | } | 605 | } |
606 | sk_wait_data(sk, &timeo); | 606 | sk_wait_data(sk, &timeo); |
607 | if (signal_pending(current)) { | 607 | if (signal_pending(current)) { |
608 | ret = sock_intr_errno(timeo); | 608 | ret = sock_intr_errno(timeo); |
609 | break; | 609 | break; |
610 | } | 610 | } |
611 | continue; | 611 | continue; |
612 | } | 612 | } |
613 | tss.len -= ret; | 613 | tss.len -= ret; |
614 | spliced += ret; | 614 | spliced += ret; |
615 | 615 | ||
616 | release_sock(sk); | 616 | release_sock(sk); |
617 | lock_sock(sk); | 617 | lock_sock(sk); |
618 | 618 | ||
619 | if (sk->sk_err || sk->sk_state == TCP_CLOSE || | 619 | if (sk->sk_err || sk->sk_state == TCP_CLOSE || |
620 | (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || | 620 | (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || |
621 | signal_pending(current)) | 621 | signal_pending(current)) |
622 | break; | 622 | break; |
623 | } | 623 | } |
624 | 624 | ||
625 | release_sock(sk); | 625 | release_sock(sk); |
626 | 626 | ||
627 | if (spliced) | 627 | if (spliced) |
628 | return spliced; | 628 | return spliced; |
629 | 629 | ||
630 | return ret; | 630 | return ret; |
631 | } | 631 | } |
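tcp_splice_read() above backs the socket side of splice(2). A hedged userspace sketch, not part of this commit, that moves data from a connected TCP socket into a pipe and on to stdout without copying through userspace (assumes sockfd is connected and stdout is a file or pipe):

#define _GNU_SOURCE
#include <fcntl.h>		/* splice, SPLICE_F_MOVE */
#include <unistd.h>

/* Relay up to 64 KiB from a TCP socket to stdout via an intermediate pipe. */
int relay_from_socket(int sockfd)
{
	int p[2];
	ssize_t n;

	if (pipe(p) < 0)
		return -1;

	/* Socket -> pipe: this is the path that ends up in tcp_splice_read(). */
	n = splice(sockfd, NULL, p[1], NULL, 65536, SPLICE_F_MOVE);
	if (n > 0)
		/* Pipe -> stdout. */
		n = splice(p[0], NULL, STDOUT_FILENO, NULL, n, SPLICE_F_MOVE);

	close(p[0]);
	close(p[1]);
	return n < 0 ? -1 : 0;
}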
632 | 632 | ||
633 | struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) | 633 | struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) |
634 | { | 634 | { |
635 | struct sk_buff *skb; | 635 | struct sk_buff *skb; |
636 | 636 | ||
637 | /* The TCP header must be at least 32-bit aligned. */ | 637 | /* The TCP header must be at least 32-bit aligned. */ |
638 | size = ALIGN(size, 4); | 638 | size = ALIGN(size, 4); |
639 | 639 | ||
640 | skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); | 640 | skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); |
641 | if (skb) { | 641 | if (skb) { |
642 | if (sk_wmem_schedule(sk, skb->truesize)) { | 642 | if (sk_wmem_schedule(sk, skb->truesize)) { |
643 | /* | 643 | /* |
644 | * Make sure that we have exactly size bytes | 644 | * Make sure that we have exactly size bytes |
645 | * available to the caller, no more, no less. | 645 | * available to the caller, no more, no less. |
646 | */ | 646 | */ |
647 | skb_reserve(skb, skb_tailroom(skb) - size); | 647 | skb_reserve(skb, skb_tailroom(skb) - size); |
648 | return skb; | 648 | return skb; |
649 | } | 649 | } |
650 | __kfree_skb(skb); | 650 | __kfree_skb(skb); |
651 | } else { | 651 | } else { |
652 | sk->sk_prot->enter_memory_pressure(sk); | 652 | sk->sk_prot->enter_memory_pressure(sk); |
653 | sk_stream_moderate_sndbuf(sk); | 653 | sk_stream_moderate_sndbuf(sk); |
654 | } | 654 | } |
655 | return NULL; | 655 | return NULL; |
656 | } | 656 | } |
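sk_stream_alloc_skb() above rounds the requested size up with ALIGN(size, 4) so the TCP header stays 32-bit aligned. The kernel macro is the usual power-of-two round-up; a tiny illustration of that arithmetic (illustrative helper name, not the kernel's):

#include <stdio.h>

/* Round x up to the next multiple of the power-of-two a. */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	printf("%d %d %d\n", ALIGN_UP(5, 4), ALIGN_UP(8, 4), ALIGN_UP(9, 4));
	/* prints: 8 8 12 */
	return 0;
}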
657 | 657 | ||
658 | static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, | 658 | static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, |
659 | size_t psize, int flags) | 659 | size_t psize, int flags) |
660 | { | 660 | { |
661 | struct tcp_sock *tp = tcp_sk(sk); | 661 | struct tcp_sock *tp = tcp_sk(sk); |
662 | int mss_now, size_goal; | 662 | int mss_now, size_goal; |
663 | int err; | 663 | int err; |
664 | ssize_t copied; | 664 | ssize_t copied; |
665 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 665 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
666 | 666 | ||
667 | /* Wait for a connection to finish. */ | 667 | /* Wait for a connection to finish. */ |
668 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 668 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
669 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 669 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
670 | goto out_err; | 670 | goto out_err; |
671 | 671 | ||
672 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); | 672 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); |
673 | 673 | ||
674 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 674 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
675 | size_goal = tp->xmit_size_goal; | 675 | size_goal = tp->xmit_size_goal; |
676 | copied = 0; | 676 | copied = 0; |
677 | 677 | ||
678 | err = -EPIPE; | 678 | err = -EPIPE; |
679 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) | 679 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) |
680 | goto do_error; | 680 | goto do_error; |
681 | 681 | ||
682 | while (psize > 0) { | 682 | while (psize > 0) { |
683 | struct sk_buff *skb = tcp_write_queue_tail(sk); | 683 | struct sk_buff *skb = tcp_write_queue_tail(sk); |
684 | struct page *page = pages[poffset / PAGE_SIZE]; | 684 | struct page *page = pages[poffset / PAGE_SIZE]; |
685 | int copy, i, can_coalesce; | 685 | int copy, i, can_coalesce; |
686 | int offset = poffset % PAGE_SIZE; | 686 | int offset = poffset % PAGE_SIZE; |
687 | int size = min_t(size_t, psize, PAGE_SIZE - offset); | 687 | int size = min_t(size_t, psize, PAGE_SIZE - offset); |
688 | 688 | ||
689 | if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { | 689 | if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { |
690 | new_segment: | 690 | new_segment: |
691 | if (!sk_stream_memory_free(sk)) | 691 | if (!sk_stream_memory_free(sk)) |
692 | goto wait_for_sndbuf; | 692 | goto wait_for_sndbuf; |
693 | 693 | ||
694 | skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); | 694 | skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); |
695 | if (!skb) | 695 | if (!skb) |
696 | goto wait_for_memory; | 696 | goto wait_for_memory; |
697 | 697 | ||
698 | skb_entail(sk, skb); | 698 | skb_entail(sk, skb); |
699 | copy = size_goal; | 699 | copy = size_goal; |
700 | } | 700 | } |
701 | 701 | ||
702 | if (copy > size) | 702 | if (copy > size) |
703 | copy = size; | 703 | copy = size; |
704 | 704 | ||
705 | i = skb_shinfo(skb)->nr_frags; | 705 | i = skb_shinfo(skb)->nr_frags; |
706 | can_coalesce = skb_can_coalesce(skb, i, page, offset); | 706 | can_coalesce = skb_can_coalesce(skb, i, page, offset); |
707 | if (!can_coalesce && i >= MAX_SKB_FRAGS) { | 707 | if (!can_coalesce && i >= MAX_SKB_FRAGS) { |
708 | tcp_mark_push(tp, skb); | 708 | tcp_mark_push(tp, skb); |
709 | goto new_segment; | 709 | goto new_segment; |
710 | } | 710 | } |
711 | if (!sk_wmem_schedule(sk, copy)) | 711 | if (!sk_wmem_schedule(sk, copy)) |
712 | goto wait_for_memory; | 712 | goto wait_for_memory; |
713 | 713 | ||
714 | if (can_coalesce) { | 714 | if (can_coalesce) { |
715 | skb_shinfo(skb)->frags[i - 1].size += copy; | 715 | skb_shinfo(skb)->frags[i - 1].size += copy; |
716 | } else { | 716 | } else { |
717 | get_page(page); | 717 | get_page(page); |
718 | skb_fill_page_desc(skb, i, page, offset, copy); | 718 | skb_fill_page_desc(skb, i, page, offset, copy); |
719 | } | 719 | } |
720 | 720 | ||
721 | skb->len += copy; | 721 | skb->len += copy; |
722 | skb->data_len += copy; | 722 | skb->data_len += copy; |
723 | skb->truesize += copy; | 723 | skb->truesize += copy; |
724 | sk->sk_wmem_queued += copy; | 724 | sk->sk_wmem_queued += copy; |
725 | sk_mem_charge(sk, copy); | 725 | sk_mem_charge(sk, copy); |
726 | skb->ip_summed = CHECKSUM_PARTIAL; | 726 | skb->ip_summed = CHECKSUM_PARTIAL; |
727 | tp->write_seq += copy; | 727 | tp->write_seq += copy; |
728 | TCP_SKB_CB(skb)->end_seq += copy; | 728 | TCP_SKB_CB(skb)->end_seq += copy; |
729 | skb_shinfo(skb)->gso_segs = 0; | 729 | skb_shinfo(skb)->gso_segs = 0; |
730 | 730 | ||
731 | if (!copied) | 731 | if (!copied) |
732 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; | 732 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; |
733 | 733 | ||
734 | copied += copy; | 734 | copied += copy; |
735 | poffset += copy; | 735 | poffset += copy; |
736 | if (!(psize -= copy)) | 736 | if (!(psize -= copy)) |
737 | goto out; | 737 | goto out; |
738 | 738 | ||
739 | if (skb->len < size_goal || (flags & MSG_OOB)) | 739 | if (skb->len < size_goal || (flags & MSG_OOB)) |
740 | continue; | 740 | continue; |
741 | 741 | ||
742 | if (forced_push(tp)) { | 742 | if (forced_push(tp)) { |
743 | tcp_mark_push(tp, skb); | 743 | tcp_mark_push(tp, skb); |
744 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); | 744 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); |
745 | } else if (skb == tcp_send_head(sk)) | 745 | } else if (skb == tcp_send_head(sk)) |
746 | tcp_push_one(sk, mss_now); | 746 | tcp_push_one(sk, mss_now); |
747 | continue; | 747 | continue; |
748 | 748 | ||
749 | wait_for_sndbuf: | 749 | wait_for_sndbuf: |
750 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 750 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
751 | wait_for_memory: | 751 | wait_for_memory: |
752 | if (copied) | 752 | if (copied) |
753 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | 753 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); |
754 | 754 | ||
755 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 755 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
756 | goto do_error; | 756 | goto do_error; |
757 | 757 | ||
758 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 758 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
759 | size_goal = tp->xmit_size_goal; | 759 | size_goal = tp->xmit_size_goal; |
760 | } | 760 | } |
761 | 761 | ||
762 | out: | 762 | out: |
763 | if (copied) | 763 | if (copied) |
764 | tcp_push(sk, flags, mss_now, tp->nonagle); | 764 | tcp_push(sk, flags, mss_now, tp->nonagle); |
765 | return copied; | 765 | return copied; |
766 | 766 | ||
767 | do_error: | 767 | do_error: |
768 | if (copied) | 768 | if (copied) |
769 | goto out; | 769 | goto out; |
770 | out_err: | 770 | out_err: |
771 | return sk_stream_error(sk, flags, err); | 771 | return sk_stream_error(sk, flags, err); |
772 | } | 772 | } |
773 | 773 | ||
774 | ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, | 774 | ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, |
775 | size_t size, int flags) | 775 | size_t size, int flags) |
776 | { | 776 | { |
777 | ssize_t res; | 777 | ssize_t res; |
778 | struct sock *sk = sock->sk; | 778 | struct sock *sk = sock->sk; |
779 | 779 | ||
780 | if (!(sk->sk_route_caps & NETIF_F_SG) || | 780 | if (!(sk->sk_route_caps & NETIF_F_SG) || |
781 | !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) | 781 | !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) |
782 | return sock_no_sendpage(sock, page, offset, size, flags); | 782 | return sock_no_sendpage(sock, page, offset, size, flags); |
783 | 783 | ||
784 | lock_sock(sk); | 784 | lock_sock(sk); |
785 | TCP_CHECK_TIMER(sk); | 785 | TCP_CHECK_TIMER(sk); |
786 | res = do_tcp_sendpages(sk, &page, offset, size, flags); | 786 | res = do_tcp_sendpages(sk, &page, offset, size, flags); |
787 | TCP_CHECK_TIMER(sk); | 787 | TCP_CHECK_TIMER(sk); |
788 | release_sock(sk); | 788 | release_sock(sk); |
789 | return res; | 789 | return res; |
790 | } | 790 | } |
791 | 791 | ||
792 | #define TCP_PAGE(sk) (sk->sk_sndmsg_page) | 792 | #define TCP_PAGE(sk) (sk->sk_sndmsg_page) |
793 | #define TCP_OFF(sk) (sk->sk_sndmsg_off) | 793 | #define TCP_OFF(sk) (sk->sk_sndmsg_off) |
794 | 794 | ||
795 | static inline int select_size(struct sock *sk) | 795 | static inline int select_size(struct sock *sk) |
796 | { | 796 | { |
797 | struct tcp_sock *tp = tcp_sk(sk); | 797 | struct tcp_sock *tp = tcp_sk(sk); |
798 | int tmp = tp->mss_cache; | 798 | int tmp = tp->mss_cache; |
799 | 799 | ||
800 | if (sk->sk_route_caps & NETIF_F_SG) { | 800 | if (sk->sk_route_caps & NETIF_F_SG) { |
801 | if (sk_can_gso(sk)) | 801 | if (sk_can_gso(sk)) |
802 | tmp = 0; | 802 | tmp = 0; |
803 | else { | 803 | else { |
804 | int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); | 804 | int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); |
805 | 805 | ||
806 | if (tmp >= pgbreak && | 806 | if (tmp >= pgbreak && |
807 | tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) | 807 | tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) |
808 | tmp = pgbreak; | 808 | tmp = pgbreak; |
809 | } | 809 | } |
810 | } | 810 | } |
811 | 811 | ||
812 | return tmp; | 812 | return tmp; |
813 | } | 813 | } |
814 | 814 | ||
815 | int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | 815 | int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, |
816 | size_t size) | 816 | size_t size) |
817 | { | 817 | { |
818 | struct sock *sk = sock->sk; | 818 | struct sock *sk = sock->sk; |
819 | struct iovec *iov; | 819 | struct iovec *iov; |
820 | struct tcp_sock *tp = tcp_sk(sk); | 820 | struct tcp_sock *tp = tcp_sk(sk); |
821 | struct sk_buff *skb; | 821 | struct sk_buff *skb; |
822 | int iovlen, flags; | 822 | int iovlen, flags; |
823 | int mss_now, size_goal; | 823 | int mss_now, size_goal; |
824 | int err, copied; | 824 | int err, copied; |
825 | long timeo; | 825 | long timeo; |
826 | 826 | ||
827 | lock_sock(sk); | 827 | lock_sock(sk); |
828 | TCP_CHECK_TIMER(sk); | 828 | TCP_CHECK_TIMER(sk); |
829 | 829 | ||
830 | flags = msg->msg_flags; | 830 | flags = msg->msg_flags; |
831 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 831 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
832 | 832 | ||
833 | /* Wait for a connection to finish. */ | 833 | /* Wait for a connection to finish. */ |
834 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 834 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
835 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 835 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
836 | goto out_err; | 836 | goto out_err; |
837 | 837 | ||
838 | /* This should be in poll */ | 838 | /* This should be in poll */ |
839 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); | 839 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); |
840 | 840 | ||
841 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 841 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
842 | size_goal = tp->xmit_size_goal; | 842 | size_goal = tp->xmit_size_goal; |
843 | 843 | ||
844 | /* Ok commence sending. */ | 844 | /* Ok commence sending. */ |
845 | iovlen = msg->msg_iovlen; | 845 | iovlen = msg->msg_iovlen; |
846 | iov = msg->msg_iov; | 846 | iov = msg->msg_iov; |
847 | copied = 0; | 847 | copied = 0; |
848 | 848 | ||
849 | err = -EPIPE; | 849 | err = -EPIPE; |
850 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) | 850 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) |
851 | goto do_error; | 851 | goto do_error; |
852 | 852 | ||
853 | while (--iovlen >= 0) { | 853 | while (--iovlen >= 0) { |
854 | int seglen = iov->iov_len; | 854 | int seglen = iov->iov_len; |
855 | unsigned char __user *from = iov->iov_base; | 855 | unsigned char __user *from = iov->iov_base; |
856 | 856 | ||
857 | iov++; | 857 | iov++; |
858 | 858 | ||
859 | while (seglen > 0) { | 859 | while (seglen > 0) { |
860 | int copy; | 860 | int copy; |
861 | 861 | ||
862 | skb = tcp_write_queue_tail(sk); | 862 | skb = tcp_write_queue_tail(sk); |
863 | 863 | ||
864 | if (!tcp_send_head(sk) || | 864 | if (!tcp_send_head(sk) || |
865 | (copy = size_goal - skb->len) <= 0) { | 865 | (copy = size_goal - skb->len) <= 0) { |
866 | 866 | ||
867 | new_segment: | 867 | new_segment: |
868 | /* Allocate new segment. If the interface is SG, | 868 | /* Allocate new segment. If the interface is SG, |
869 | * allocate skb fitting to single page. | 869 | * allocate skb fitting to single page. |
870 | */ | 870 | */ |
871 | if (!sk_stream_memory_free(sk)) | 871 | if (!sk_stream_memory_free(sk)) |
872 | goto wait_for_sndbuf; | 872 | goto wait_for_sndbuf; |
873 | 873 | ||
874 | skb = sk_stream_alloc_skb(sk, select_size(sk), | 874 | skb = sk_stream_alloc_skb(sk, select_size(sk), |
875 | sk->sk_allocation); | 875 | sk->sk_allocation); |
876 | if (!skb) | 876 | if (!skb) |
877 | goto wait_for_memory; | 877 | goto wait_for_memory; |
878 | 878 | ||
879 | /* | 879 | /* |
880 | * Check whether we can use HW checksum. | 880 | * Check whether we can use HW checksum. |
881 | */ | 881 | */ |
882 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) | 882 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) |
883 | skb->ip_summed = CHECKSUM_PARTIAL; | 883 | skb->ip_summed = CHECKSUM_PARTIAL; |
884 | 884 | ||
885 | skb_entail(sk, skb); | 885 | skb_entail(sk, skb); |
886 | copy = size_goal; | 886 | copy = size_goal; |
887 | } | 887 | } |
888 | 888 | ||
889 | /* Try to append data to the end of skb. */ | 889 | /* Try to append data to the end of skb. */ |
890 | if (copy > seglen) | 890 | if (copy > seglen) |
891 | copy = seglen; | 891 | copy = seglen; |
892 | 892 | ||
893 | /* Where to copy to? */ | 893 | /* Where to copy to? */ |
894 | if (skb_tailroom(skb) > 0) { | 894 | if (skb_tailroom(skb) > 0) { |
895 | /* We have some space in skb head. Superb! */ | 895 | /* We have some space in skb head. Superb! */ |
896 | if (copy > skb_tailroom(skb)) | 896 | if (copy > skb_tailroom(skb)) |
897 | copy = skb_tailroom(skb); | 897 | copy = skb_tailroom(skb); |
898 | if ((err = skb_add_data(skb, from, copy)) != 0) | 898 | if ((err = skb_add_data(skb, from, copy)) != 0) |
899 | goto do_fault; | 899 | goto do_fault; |
900 | } else { | 900 | } else { |
901 | int merge = 0; | 901 | int merge = 0; |
902 | int i = skb_shinfo(skb)->nr_frags; | 902 | int i = skb_shinfo(skb)->nr_frags; |
903 | struct page *page = TCP_PAGE(sk); | 903 | struct page *page = TCP_PAGE(sk); |
904 | int off = TCP_OFF(sk); | 904 | int off = TCP_OFF(sk); |
905 | 905 | ||
906 | if (skb_can_coalesce(skb, i, page, off) && | 906 | if (skb_can_coalesce(skb, i, page, off) && |
907 | off != PAGE_SIZE) { | 907 | off != PAGE_SIZE) { |
908 | /* We can extend the last page | 908 | /* We can extend the last page |
909 | * fragment. */ | 909 | * fragment. */ |
910 | merge = 1; | 910 | merge = 1; |
911 | } else if (i == MAX_SKB_FRAGS || | 911 | } else if (i == MAX_SKB_FRAGS || |
912 | (!i && | 912 | (!i && |
913 | !(sk->sk_route_caps & NETIF_F_SG))) { | 913 | !(sk->sk_route_caps & NETIF_F_SG))) { |
914 | /* Need to add new fragment and cannot | 914 | /* Need to add new fragment and cannot |
915 | * do this because interface is non-SG, | 915 | * do this because interface is non-SG, |
916 | * or because all the page slots are | 916 | * or because all the page slots are |
917 | * busy. */ | 917 | * busy. */ |
918 | tcp_mark_push(tp, skb); | 918 | tcp_mark_push(tp, skb); |
919 | goto new_segment; | 919 | goto new_segment; |
920 | } else if (page) { | 920 | } else if (page) { |
921 | if (off == PAGE_SIZE) { | 921 | if (off == PAGE_SIZE) { |
922 | put_page(page); | 922 | put_page(page); |
923 | TCP_PAGE(sk) = page = NULL; | 923 | TCP_PAGE(sk) = page = NULL; |
924 | off = 0; | 924 | off = 0; |
925 | } | 925 | } |
926 | } else | 926 | } else |
927 | off = 0; | 927 | off = 0; |
928 | 928 | ||
929 | if (copy > PAGE_SIZE - off) | 929 | if (copy > PAGE_SIZE - off) |
930 | copy = PAGE_SIZE - off; | 930 | copy = PAGE_SIZE - off; |
931 | 931 | ||
932 | if (!sk_wmem_schedule(sk, copy)) | 932 | if (!sk_wmem_schedule(sk, copy)) |
933 | goto wait_for_memory; | 933 | goto wait_for_memory; |
934 | 934 | ||
935 | if (!page) { | 935 | if (!page) { |
936 | /* Allocate new cache page. */ | 936 | /* Allocate new cache page. */ |
937 | if (!(page = sk_stream_alloc_page(sk))) | 937 | if (!(page = sk_stream_alloc_page(sk))) |
938 | goto wait_for_memory; | 938 | goto wait_for_memory; |
939 | } | 939 | } |
940 | 940 | ||
941 | /* Time to copy data. We are close to | 941 | /* Time to copy data. We are close to |
942 | * the end! */ | 942 | * the end! */ |
943 | err = skb_copy_to_page(sk, from, skb, page, | 943 | err = skb_copy_to_page(sk, from, skb, page, |
944 | off, copy); | 944 | off, copy); |
945 | if (err) { | 945 | if (err) { |
946 | /* If this page was new, give it to the | 946 | /* If this page was new, give it to the |
947 | * socket so it does not get leaked. | 947 | * socket so it does not get leaked. |
948 | */ | 948 | */ |
949 | if (!TCP_PAGE(sk)) { | 949 | if (!TCP_PAGE(sk)) { |
950 | TCP_PAGE(sk) = page; | 950 | TCP_PAGE(sk) = page; |
951 | TCP_OFF(sk) = 0; | 951 | TCP_OFF(sk) = 0; |
952 | } | 952 | } |
953 | goto do_error; | 953 | goto do_error; |
954 | } | 954 | } |
955 | 955 | ||
956 | /* Update the skb. */ | 956 | /* Update the skb. */ |
957 | if (merge) { | 957 | if (merge) { |
958 | skb_shinfo(skb)->frags[i - 1].size += | 958 | skb_shinfo(skb)->frags[i - 1].size += |
959 | copy; | 959 | copy; |
960 | } else { | 960 | } else { |
961 | skb_fill_page_desc(skb, i, page, off, copy); | 961 | skb_fill_page_desc(skb, i, page, off, copy); |
962 | if (TCP_PAGE(sk)) { | 962 | if (TCP_PAGE(sk)) { |
963 | get_page(page); | 963 | get_page(page); |
964 | } else if (off + copy < PAGE_SIZE) { | 964 | } else if (off + copy < PAGE_SIZE) { |
965 | get_page(page); | 965 | get_page(page); |
966 | TCP_PAGE(sk) = page; | 966 | TCP_PAGE(sk) = page; |
967 | } | 967 | } |
968 | } | 968 | } |
969 | 969 | ||
970 | TCP_OFF(sk) = off + copy; | 970 | TCP_OFF(sk) = off + copy; |
971 | } | 971 | } |
972 | 972 | ||
973 | if (!copied) | 973 | if (!copied) |
974 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; | 974 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; |
975 | 975 | ||
976 | tp->write_seq += copy; | 976 | tp->write_seq += copy; |
977 | TCP_SKB_CB(skb)->end_seq += copy; | 977 | TCP_SKB_CB(skb)->end_seq += copy; |
978 | skb_shinfo(skb)->gso_segs = 0; | 978 | skb_shinfo(skb)->gso_segs = 0; |
979 | 979 | ||
980 | from += copy; | 980 | from += copy; |
981 | copied += copy; | 981 | copied += copy; |
982 | if ((seglen -= copy) == 0 && iovlen == 0) | 982 | if ((seglen -= copy) == 0 && iovlen == 0) |
983 | goto out; | 983 | goto out; |
984 | 984 | ||
985 | if (skb->len < size_goal || (flags & MSG_OOB)) | 985 | if (skb->len < size_goal || (flags & MSG_OOB)) |
986 | continue; | 986 | continue; |
987 | 987 | ||
988 | if (forced_push(tp)) { | 988 | if (forced_push(tp)) { |
989 | tcp_mark_push(tp, skb); | 989 | tcp_mark_push(tp, skb); |
990 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); | 990 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); |
991 | } else if (skb == tcp_send_head(sk)) | 991 | } else if (skb == tcp_send_head(sk)) |
992 | tcp_push_one(sk, mss_now); | 992 | tcp_push_one(sk, mss_now); |
993 | continue; | 993 | continue; |
994 | 994 | ||
995 | wait_for_sndbuf: | 995 | wait_for_sndbuf: |
996 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 996 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
997 | wait_for_memory: | 997 | wait_for_memory: |
998 | if (copied) | 998 | if (copied) |
999 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | 999 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); |
1000 | 1000 | ||
1001 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 1001 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
1002 | goto do_error; | 1002 | goto do_error; |
1003 | 1003 | ||
1004 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 1004 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
1005 | size_goal = tp->xmit_size_goal; | 1005 | size_goal = tp->xmit_size_goal; |
1006 | } | 1006 | } |
1007 | } | 1007 | } |
1008 | 1008 | ||
1009 | out: | 1009 | out: |
1010 | if (copied) | 1010 | if (copied) |
1011 | tcp_push(sk, flags, mss_now, tp->nonagle); | 1011 | tcp_push(sk, flags, mss_now, tp->nonagle); |
1012 | TCP_CHECK_TIMER(sk); | 1012 | TCP_CHECK_TIMER(sk); |
1013 | release_sock(sk); | 1013 | release_sock(sk); |
1014 | return copied; | 1014 | return copied; |
1015 | 1015 | ||
1016 | do_fault: | 1016 | do_fault: |
1017 | if (!skb->len) { | 1017 | if (!skb->len) { |
1018 | tcp_unlink_write_queue(skb, sk); | 1018 | tcp_unlink_write_queue(skb, sk); |
1019 | /* It is the one place in all of TCP, except connection | 1019 | /* It is the one place in all of TCP, except connection |
1020 | * reset, where we can be unlinking the send_head. | 1020 | * reset, where we can be unlinking the send_head. |
1021 | */ | 1021 | */ |
1022 | tcp_check_send_head(sk, skb); | 1022 | tcp_check_send_head(sk, skb); |
1023 | sk_wmem_free_skb(sk, skb); | 1023 | sk_wmem_free_skb(sk, skb); |
1024 | } | 1024 | } |
1025 | 1025 | ||
1026 | do_error: | 1026 | do_error: |
1027 | if (copied) | 1027 | if (copied) |
1028 | goto out; | 1028 | goto out; |
1029 | out_err: | 1029 | out_err: |
1030 | err = sk_stream_error(sk, flags, err); | 1030 | err = sk_stream_error(sk, flags, err); |
1031 | TCP_CHECK_TIMER(sk); | 1031 | TCP_CHECK_TIMER(sk); |
1032 | release_sock(sk); | 1032 | release_sock(sk); |
1033 | return err; | 1033 | return err; |
1034 | } | 1034 | } |
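For reference, a minimal user-space sketch (not from this commit) of the MSG_MORE behaviour tcp_sendmsg() handles above: the first send() asks TCP to hold back the push so small pieces can be coalesced, the second lets the data go out:

```c
#include <sys/socket.h>
#include <sys/types.h>

/* Send a small header and a body as one coalesced stream of segments. */
static ssize_t send_header_then_body(int sockfd,
				     const char *hdr, size_t hlen,
				     const char *body, size_t blen)
{
	if (send(sockfd, hdr, hlen, MSG_MORE) < 0)	/* hold back the push */
		return -1;
	return send(sockfd, body, blen, 0);		/* last piece: push now */
}
```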
1035 | 1035 | ||
1036 | /* | 1036 | /* |
1037 | * Handle reading urgent data. BSD has very simple semantics for | 1037 | * Handle reading urgent data. BSD has very simple semantics for |
1038 | * this, no blocking and very strange errors 8) | 1038 | * this, no blocking and very strange errors 8) |
1039 | */ | 1039 | */ |
1040 | 1040 | ||
1041 | static int tcp_recv_urg(struct sock *sk, long timeo, | 1041 | static int tcp_recv_urg(struct sock *sk, long timeo, |
1042 | struct msghdr *msg, int len, int flags, | 1042 | struct msghdr *msg, int len, int flags, |
1043 | int *addr_len) | 1043 | int *addr_len) |
1044 | { | 1044 | { |
1045 | struct tcp_sock *tp = tcp_sk(sk); | 1045 | struct tcp_sock *tp = tcp_sk(sk); |
1046 | 1046 | ||
1047 | /* No URG data to read. */ | 1047 | /* No URG data to read. */ |
1048 | if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || | 1048 | if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || |
1049 | tp->urg_data == TCP_URG_READ) | 1049 | tp->urg_data == TCP_URG_READ) |
1050 | return -EINVAL; /* Yes this is right ! */ | 1050 | return -EINVAL; /* Yes this is right ! */ |
1051 | 1051 | ||
1052 | if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) | 1052 | if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) |
1053 | return -ENOTCONN; | 1053 | return -ENOTCONN; |
1054 | 1054 | ||
1055 | if (tp->urg_data & TCP_URG_VALID) { | 1055 | if (tp->urg_data & TCP_URG_VALID) { |
1056 | int err = 0; | 1056 | int err = 0; |
1057 | char c = tp->urg_data; | 1057 | char c = tp->urg_data; |
1058 | 1058 | ||
1059 | if (!(flags & MSG_PEEK)) | 1059 | if (!(flags & MSG_PEEK)) |
1060 | tp->urg_data = TCP_URG_READ; | 1060 | tp->urg_data = TCP_URG_READ; |
1061 | 1061 | ||
1062 | /* Read urgent data. */ | 1062 | /* Read urgent data. */ |
1063 | msg->msg_flags |= MSG_OOB; | 1063 | msg->msg_flags |= MSG_OOB; |
1064 | 1064 | ||
1065 | if (len > 0) { | 1065 | if (len > 0) { |
1066 | if (!(flags & MSG_TRUNC)) | 1066 | if (!(flags & MSG_TRUNC)) |
1067 | err = memcpy_toiovec(msg->msg_iov, &c, 1); | 1067 | err = memcpy_toiovec(msg->msg_iov, &c, 1); |
1068 | len = 1; | 1068 | len = 1; |
1069 | } else | 1069 | } else |
1070 | msg->msg_flags |= MSG_TRUNC; | 1070 | msg->msg_flags |= MSG_TRUNC; |
1071 | 1071 | ||
1072 | return err ? -EFAULT : len; | 1072 | return err ? -EFAULT : len; |
1073 | } | 1073 | } |
1074 | 1074 | ||
1075 | if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) | 1075 | if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) |
1076 | return 0; | 1076 | return 0; |
1077 | 1077 | ||
1078 | /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and | 1078 | /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and |
1079 | * the available implementations agree in this case: | 1079 | * the available implementations agree in this case: |
1080 | * this call should never block, independent of the | 1080 | * this call should never block, independent of the |
1081 | * blocking state of the socket. | 1081 | * blocking state of the socket. |
1082 | * Mike <pall@rz.uni-karlsruhe.de> | 1082 | * Mike <pall@rz.uni-karlsruhe.de> |
1083 | */ | 1083 | */ |
1084 | return -EAGAIN; | 1084 | return -EAGAIN; |
1085 | } | 1085 | } |
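A small, hypothetical caller illustrating these semantics from user space: recv(..., MSG_OOB) never blocks waiting for urgent data, so the helper maps "not here yet" (EAGAIN) and "no urgent data to read" to distinct results:

```c
#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

/* Returns the urgent byte (0..255), -1 if it has not arrived yet (EAGAIN),
 * or -2 if there is no urgent data to read / some other error. */
static int read_oob_byte(int sockfd)
{
	char c;
	ssize_t n = recv(sockfd, &c, 1, MSG_OOB);

	if (n == 1)
		return (unsigned char)c;
	if (n < 0 && errno == EAGAIN)
		return -1;
	return -2;
}
```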
1086 | 1086 | ||
1087 | /* Clean up the receive buffer for full frames taken by the user, | 1087 | /* Clean up the receive buffer for full frames taken by the user, |
1088 | * then send an ACK if necessary. COPIED is the number of bytes | 1088 | * then send an ACK if necessary. COPIED is the number of bytes |
1089 | * tcp_recvmsg has given to the user so far, it speeds up the | 1089 | * tcp_recvmsg has given to the user so far, it speeds up the |
1090 | * calculation of whether or not we must ACK for the sake of | 1090 | * calculation of whether or not we must ACK for the sake of |
1091 | * a window update. | 1091 | * a window update. |
1092 | */ | 1092 | */ |
1093 | void tcp_cleanup_rbuf(struct sock *sk, int copied) | 1093 | void tcp_cleanup_rbuf(struct sock *sk, int copied) |
1094 | { | 1094 | { |
1095 | struct tcp_sock *tp = tcp_sk(sk); | 1095 | struct tcp_sock *tp = tcp_sk(sk); |
1096 | int time_to_ack = 0; | 1096 | int time_to_ack = 0; |
1097 | 1097 | ||
1098 | #if TCP_DEBUG | 1098 | #if TCP_DEBUG |
1099 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); | 1099 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
1100 | 1100 | ||
1101 | WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); | 1101 | WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); |
1102 | #endif | 1102 | #endif |
1103 | 1103 | ||
1104 | if (inet_csk_ack_scheduled(sk)) { | 1104 | if (inet_csk_ack_scheduled(sk)) { |
1105 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1105 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1106 | /* Delayed ACKs frequently hit locked sockets during bulk | 1106 | /* Delayed ACKs frequently hit locked sockets during bulk |
1107 | * receive. */ | 1107 | * receive. */ |
1108 | if (icsk->icsk_ack.blocked || | 1108 | if (icsk->icsk_ack.blocked || |
1109 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ | 1109 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ |
1110 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || | 1110 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || |
1111 | /* | 1111 | /* |
1112 | * If this read emptied read buffer, we send ACK, if | 1112 | * If this read emptied read buffer, we send ACK, if |
1113 | * connection is not bidirectional, user drained | 1113 | * connection is not bidirectional, user drained |
1114 | * receive buffer and there was a small segment | 1114 | * receive buffer and there was a small segment |
1115 | * in queue. | 1115 | * in queue. |
1116 | */ | 1116 | */ |
1117 | (copied > 0 && | 1117 | (copied > 0 && |
1118 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || | 1118 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || |
1119 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && | 1119 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
1120 | !icsk->icsk_ack.pingpong)) && | 1120 | !icsk->icsk_ack.pingpong)) && |
1121 | !atomic_read(&sk->sk_rmem_alloc))) | 1121 | !atomic_read(&sk->sk_rmem_alloc))) |
1122 | time_to_ack = 1; | 1122 | time_to_ack = 1; |
1123 | } | 1123 | } |
1124 | 1124 | ||
1125 | /* We send an ACK if we can now advertise a non-zero window | 1125 | /* We send an ACK if we can now advertise a non-zero window |
1126 | * which has been raised "significantly". | 1126 | * which has been raised "significantly". |
1127 | * | 1127 | * |
1128 | * Even if window raised up to infinity, do not send window open ACK | 1128 | * Even if window raised up to infinity, do not send window open ACK |
1129 | * in states, where we will not receive more. It is useless. | 1129 | * in states, where we will not receive more. It is useless. |
1130 | */ | 1130 | */ |
1131 | if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { | 1131 | if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { |
1132 | __u32 rcv_window_now = tcp_receive_window(tp); | 1132 | __u32 rcv_window_now = tcp_receive_window(tp); |
1133 | 1133 | ||
1134 | /* Optimize, __tcp_select_window() is not cheap. */ | 1134 | /* Optimize, __tcp_select_window() is not cheap. */ |
1135 | if (2*rcv_window_now <= tp->window_clamp) { | 1135 | if (2*rcv_window_now <= tp->window_clamp) { |
1136 | __u32 new_window = __tcp_select_window(sk); | 1136 | __u32 new_window = __tcp_select_window(sk); |
1137 | 1137 | ||
1138 | /* Send ACK now, if this read freed lots of space | 1138 | /* Send ACK now, if this read freed lots of space |
1139 | * in our buffer. Certainly, new_window is new window. | 1139 | * in our buffer. Certainly, new_window is new window. |
1140 | * We can advertise it now, if it is not less than current one. | 1140 | * We can advertise it now, if it is not less than current one. |
1141 | * "Lots" means "at least twice" here. | 1141 | * "Lots" means "at least twice" here. |
1142 | */ | 1142 | */ |
1143 | if (new_window && new_window >= 2 * rcv_window_now) | 1143 | if (new_window && new_window >= 2 * rcv_window_now) |
1144 | time_to_ack = 1; | 1144 | time_to_ack = 1; |
1145 | } | 1145 | } |
1146 | } | 1146 | } |
1147 | if (time_to_ack) | 1147 | if (time_to_ack) |
1148 | tcp_send_ack(sk); | 1148 | tcp_send_ack(sk); |
1149 | } | 1149 | } |
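A standalone mock (names local to this sketch) of the window-update test above: an ACK is only worth sending when the window we could now advertise is at least double what the peer last saw, and the current window is small enough relative to window_clamp to justify calling __tcp_select_window() at all:

```c
#include <stdbool.h>

/* Mock of the "Lots means at least twice" check in tcp_cleanup_rbuf(). */
static bool window_update_worth_an_ack(unsigned int rcv_window_now,
				       unsigned int new_window,
				       unsigned int window_clamp)
{
	if (2 * rcv_window_now > window_clamp)	/* window already large enough */
		return false;
	return new_window && new_window >= 2 * rcv_window_now;
}
```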
1150 | 1150 | ||
1151 | static void tcp_prequeue_process(struct sock *sk) | 1151 | static void tcp_prequeue_process(struct sock *sk) |
1152 | { | 1152 | { |
1153 | struct sk_buff *skb; | 1153 | struct sk_buff *skb; |
1154 | struct tcp_sock *tp = tcp_sk(sk); | 1154 | struct tcp_sock *tp = tcp_sk(sk); |
1155 | 1155 | ||
1156 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); | 1156 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); |
1157 | 1157 | ||
1158 | /* RX process wants to run with disabled BHs, though it is not | 1158 | /* RX process wants to run with disabled BHs, though it is not |
1159 | * necessary */ | 1159 | * necessary */ |
1160 | local_bh_disable(); | 1160 | local_bh_disable(); |
1161 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) | 1161 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) |
1162 | sk_backlog_rcv(sk, skb); | 1162 | sk_backlog_rcv(sk, skb); |
1163 | local_bh_enable(); | 1163 | local_bh_enable(); |
1164 | 1164 | ||
1165 | /* Clear memory counter. */ | 1165 | /* Clear memory counter. */ |
1166 | tp->ucopy.memory = 0; | 1166 | tp->ucopy.memory = 0; |
1167 | } | 1167 | } |
1168 | 1168 | ||
1169 | static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) | 1169 | static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) |
1170 | { | 1170 | { |
1171 | struct sk_buff *skb; | 1171 | struct sk_buff *skb; |
1172 | u32 offset; | 1172 | u32 offset; |
1173 | 1173 | ||
1174 | skb_queue_walk(&sk->sk_receive_queue, skb) { | 1174 | skb_queue_walk(&sk->sk_receive_queue, skb) { |
1175 | offset = seq - TCP_SKB_CB(skb)->seq; | 1175 | offset = seq - TCP_SKB_CB(skb)->seq; |
1176 | if (tcp_hdr(skb)->syn) | 1176 | if (tcp_hdr(skb)->syn) |
1177 | offset--; | 1177 | offset--; |
1178 | if (offset < skb->len || tcp_hdr(skb)->fin) { | 1178 | if (offset < skb->len || tcp_hdr(skb)->fin) { |
1179 | *off = offset; | 1179 | *off = offset; |
1180 | return skb; | 1180 | return skb; |
1181 | } | 1181 | } |
1182 | } | 1182 | } |
1183 | return NULL; | 1183 | return NULL; |
1184 | } | 1184 | } |
1185 | 1185 | ||
1186 | /* | 1186 | /* |
1187 | * This routine provides an alternative to tcp_recvmsg() for routines | 1187 | * This routine provides an alternative to tcp_recvmsg() for routines |
1188 | * that would like to handle copying from skbuffs directly in 'sendfile' | 1188 | * that would like to handle copying from skbuffs directly in 'sendfile' |
1189 | * fashion. | 1189 | * fashion. |
1190 | * Note: | 1190 | * Note: |
1191 | * - It is assumed that the socket was locked by the caller. | 1191 | * - It is assumed that the socket was locked by the caller. |
1192 | * - The routine does not block. | 1192 | * - The routine does not block. |
1193 | * - At present, there is no support for reading OOB data | 1193 | * - At present, there is no support for reading OOB data |
1194 | * or for 'peeking' the socket using this routine | 1194 | * or for 'peeking' the socket using this routine |
1195 | * (although both would be easy to implement). | 1195 | * (although both would be easy to implement). |
1196 | */ | 1196 | */ |
1197 | int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | 1197 | int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, |
1198 | sk_read_actor_t recv_actor) | 1198 | sk_read_actor_t recv_actor) |
1199 | { | 1199 | { |
1200 | struct sk_buff *skb; | 1200 | struct sk_buff *skb; |
1201 | struct tcp_sock *tp = tcp_sk(sk); | 1201 | struct tcp_sock *tp = tcp_sk(sk); |
1202 | u32 seq = tp->copied_seq; | 1202 | u32 seq = tp->copied_seq; |
1203 | u32 offset; | 1203 | u32 offset; |
1204 | int copied = 0; | 1204 | int copied = 0; |
1205 | 1205 | ||
1206 | if (sk->sk_state == TCP_LISTEN) | 1206 | if (sk->sk_state == TCP_LISTEN) |
1207 | return -ENOTCONN; | 1207 | return -ENOTCONN; |
1208 | while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { | 1208 | while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { |
1209 | if (offset < skb->len) { | 1209 | if (offset < skb->len) { |
1210 | int used; | 1210 | int used; |
1211 | size_t len; | 1211 | size_t len; |
1212 | 1212 | ||
1213 | len = skb->len - offset; | 1213 | len = skb->len - offset; |
1214 | /* Stop reading if we hit a patch of urgent data */ | 1214 | /* Stop reading if we hit a patch of urgent data */ |
1215 | if (tp->urg_data) { | 1215 | if (tp->urg_data) { |
1216 | u32 urg_offset = tp->urg_seq - seq; | 1216 | u32 urg_offset = tp->urg_seq - seq; |
1217 | if (urg_offset < len) | 1217 | if (urg_offset < len) |
1218 | len = urg_offset; | 1218 | len = urg_offset; |
1219 | if (!len) | 1219 | if (!len) |
1220 | break; | 1220 | break; |
1221 | } | 1221 | } |
1222 | used = recv_actor(desc, skb, offset, len); | 1222 | used = recv_actor(desc, skb, offset, len); |
1223 | if (used < 0) { | 1223 | if (used < 0) { |
1224 | if (!copied) | 1224 | if (!copied) |
1225 | copied = used; | 1225 | copied = used; |
1226 | break; | 1226 | break; |
1227 | } else if (used <= len) { | 1227 | } else if (used <= len) { |
1228 | seq += used; | 1228 | seq += used; |
1229 | copied += used; | 1229 | copied += used; |
1230 | offset += used; | 1230 | offset += used; |
1231 | } | 1231 | } |
1232 | /* | 1232 | /* |
1233 | * If recv_actor drops the lock (e.g. TCP splice | 1233 | * If recv_actor drops the lock (e.g. TCP splice |
1234 | * receive) the skb pointer might be invalid when | 1234 | * receive) the skb pointer might be invalid when |
1235 | * getting here: tcp_collapse might have deleted it | 1235 | * getting here: tcp_collapse might have deleted it |
1236 | * while aggregating skbs from the socket queue. | 1236 | * while aggregating skbs from the socket queue. |
1237 | */ | 1237 | */ |
1238 | skb = tcp_recv_skb(sk, seq-1, &offset); | 1238 | skb = tcp_recv_skb(sk, seq-1, &offset); |
1239 | if (!skb || (offset+1 != skb->len)) | 1239 | if (!skb || (offset+1 != skb->len)) |
1240 | break; | 1240 | break; |
1241 | } | 1241 | } |
1242 | if (tcp_hdr(skb)->fin) { | 1242 | if (tcp_hdr(skb)->fin) { |
1243 | sk_eat_skb(sk, skb, 0); | 1243 | sk_eat_skb(sk, skb, 0); |
1244 | ++seq; | 1244 | ++seq; |
1245 | break; | 1245 | break; |
1246 | } | 1246 | } |
1247 | sk_eat_skb(sk, skb, 0); | 1247 | sk_eat_skb(sk, skb, 0); |
1248 | if (!desc->count) | 1248 | if (!desc->count) |
1249 | break; | 1249 | break; |
1250 | } | 1250 | } |
1251 | tp->copied_seq = seq; | 1251 | tp->copied_seq = seq; |
1252 | 1252 | ||
1253 | tcp_rcv_space_adjust(sk); | 1253 | tcp_rcv_space_adjust(sk); |
1254 | 1254 | ||
1255 | /* Clean up data we have read: This will do ACK frames. */ | 1255 | /* Clean up data we have read: This will do ACK frames. */ |
1256 | if (copied > 0) | 1256 | if (copied > 0) |
1257 | tcp_cleanup_rbuf(sk, copied); | 1257 | tcp_cleanup_rbuf(sk, copied); |
1258 | return copied; | 1258 | return copied; |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | /* | 1261 | /* |
1262 | * This routine copies from a sock struct into the user buffer. | 1262 | * This routine copies from a sock struct into the user buffer. |
1263 | * | 1263 | * |
1264 | * Technical note: in 2.3 we work on _locked_ socket, so that | 1264 | * Technical note: in 2.3 we work on _locked_ socket, so that |
1265 | * tricks with *seq access order and skb->users are not required. | 1265 | * tricks with *seq access order and skb->users are not required. |
1266 | * Probably, code can be easily improved even more. | 1266 | * Probably, code can be easily improved even more. |
1267 | */ | 1267 | */ |
1268 | 1268 | ||
1269 | int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 1269 | int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
1270 | size_t len, int nonblock, int flags, int *addr_len) | 1270 | size_t len, int nonblock, int flags, int *addr_len) |
1271 | { | 1271 | { |
1272 | struct tcp_sock *tp = tcp_sk(sk); | 1272 | struct tcp_sock *tp = tcp_sk(sk); |
1273 | int copied = 0; | 1273 | int copied = 0; |
1274 | u32 peek_seq; | 1274 | u32 peek_seq; |
1275 | u32 *seq; | 1275 | u32 *seq; |
1276 | unsigned long used; | 1276 | unsigned long used; |
1277 | int err; | 1277 | int err; |
1278 | int target; /* Read at least this many bytes */ | 1278 | int target; /* Read at least this many bytes */ |
1279 | long timeo; | 1279 | long timeo; |
1280 | struct task_struct *user_recv = NULL; | 1280 | struct task_struct *user_recv = NULL; |
1281 | int copied_early = 0; | 1281 | int copied_early = 0; |
1282 | struct sk_buff *skb; | 1282 | struct sk_buff *skb; |
1283 | 1283 | ||
1284 | lock_sock(sk); | 1284 | lock_sock(sk); |
1285 | 1285 | ||
1286 | TCP_CHECK_TIMER(sk); | 1286 | TCP_CHECK_TIMER(sk); |
1287 | 1287 | ||
1288 | err = -ENOTCONN; | 1288 | err = -ENOTCONN; |
1289 | if (sk->sk_state == TCP_LISTEN) | 1289 | if (sk->sk_state == TCP_LISTEN) |
1290 | goto out; | 1290 | goto out; |
1291 | 1291 | ||
1292 | timeo = sock_rcvtimeo(sk, nonblock); | 1292 | timeo = sock_rcvtimeo(sk, nonblock); |
1293 | 1293 | ||
1294 | /* Urgent data needs to be handled specially. */ | 1294 | /* Urgent data needs to be handled specially. */ |
1295 | if (flags & MSG_OOB) | 1295 | if (flags & MSG_OOB) |
1296 | goto recv_urg; | 1296 | goto recv_urg; |
1297 | 1297 | ||
1298 | seq = &tp->copied_seq; | 1298 | seq = &tp->copied_seq; |
1299 | if (flags & MSG_PEEK) { | 1299 | if (flags & MSG_PEEK) { |
1300 | peek_seq = tp->copied_seq; | 1300 | peek_seq = tp->copied_seq; |
1301 | seq = &peek_seq; | 1301 | seq = &peek_seq; |
1302 | } | 1302 | } |
1303 | 1303 | ||
1304 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); | 1304 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); |
1305 | 1305 | ||
1306 | #ifdef CONFIG_NET_DMA | 1306 | #ifdef CONFIG_NET_DMA |
1307 | tp->ucopy.dma_chan = NULL; | 1307 | tp->ucopy.dma_chan = NULL; |
1308 | preempt_disable(); | 1308 | preempt_disable(); |
1309 | skb = skb_peek_tail(&sk->sk_receive_queue); | 1309 | skb = skb_peek_tail(&sk->sk_receive_queue); |
1310 | { | 1310 | { |
1311 | int available = 0; | 1311 | int available = 0; |
1312 | 1312 | ||
1313 | if (skb) | 1313 | if (skb) |
1314 | available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); | 1314 | available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); |
1315 | if ((available < target) && | 1315 | if ((available < target) && |
1316 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && | 1316 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && |
1317 | !sysctl_tcp_low_latency && | 1317 | !sysctl_tcp_low_latency && |
1318 | __get_cpu_var(softnet_data).net_dma) { | 1318 | __get_cpu_var(softnet_data).net_dma) { |
1319 | preempt_enable_no_resched(); | 1319 | preempt_enable_no_resched(); |
1320 | tp->ucopy.pinned_list = | 1320 | tp->ucopy.pinned_list = |
1321 | dma_pin_iovec_pages(msg->msg_iov, len); | 1321 | dma_pin_iovec_pages(msg->msg_iov, len); |
1322 | } else { | 1322 | } else { |
1323 | preempt_enable_no_resched(); | 1323 | preempt_enable_no_resched(); |
1324 | } | 1324 | } |
1325 | } | 1325 | } |
1326 | #endif | 1326 | #endif |
1327 | 1327 | ||
1328 | do { | 1328 | do { |
1329 | u32 offset; | 1329 | u32 offset; |
1330 | 1330 | ||
1331 | /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ | 1331 | /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ |
1332 | if (tp->urg_data && tp->urg_seq == *seq) { | 1332 | if (tp->urg_data && tp->urg_seq == *seq) { |
1333 | if (copied) | 1333 | if (copied) |
1334 | break; | 1334 | break; |
1335 | if (signal_pending(current)) { | 1335 | if (signal_pending(current)) { |
1336 | copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; | 1336 | copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; |
1337 | break; | 1337 | break; |
1338 | } | 1338 | } |
1339 | } | 1339 | } |
1340 | 1340 | ||
1341 | /* Next get a buffer. */ | 1341 | /* Next get a buffer. */ |
1342 | 1342 | ||
1343 | skb = skb_peek(&sk->sk_receive_queue); | 1343 | skb = skb_peek(&sk->sk_receive_queue); |
1344 | do { | 1344 | do { |
1345 | if (!skb) | 1345 | if (!skb) |
1346 | break; | 1346 | break; |
1347 | 1347 | ||
1348 | /* Now that we have two receive queues this | 1348 | /* Now that we have two receive queues this |
1349 | * shouldn't happen. | 1349 | * shouldn't happen. |
1350 | */ | 1350 | */ |
1351 | if (before(*seq, TCP_SKB_CB(skb)->seq)) { | 1351 | if (before(*seq, TCP_SKB_CB(skb)->seq)) { |
1352 | printk(KERN_INFO "recvmsg bug: copied %X " | 1352 | printk(KERN_INFO "recvmsg bug: copied %X " |
1353 | "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); | 1353 | "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); |
1354 | break; | 1354 | break; |
1355 | } | 1355 | } |
1356 | offset = *seq - TCP_SKB_CB(skb)->seq; | 1356 | offset = *seq - TCP_SKB_CB(skb)->seq; |
1357 | if (tcp_hdr(skb)->syn) | 1357 | if (tcp_hdr(skb)->syn) |
1358 | offset--; | 1358 | offset--; |
1359 | if (offset < skb->len) | 1359 | if (offset < skb->len) |
1360 | goto found_ok_skb; | 1360 | goto found_ok_skb; |
1361 | if (tcp_hdr(skb)->fin) | 1361 | if (tcp_hdr(skb)->fin) |
1362 | goto found_fin_ok; | 1362 | goto found_fin_ok; |
1363 | WARN_ON(!(flags & MSG_PEEK)); | 1363 | WARN_ON(!(flags & MSG_PEEK)); |
1364 | skb = skb->next; | 1364 | skb = skb->next; |
1365 | } while (skb != (struct sk_buff *)&sk->sk_receive_queue); | 1365 | } while (skb != (struct sk_buff *)&sk->sk_receive_queue); |
1366 | 1366 | ||
1367 | /* Well, if we have backlog, try to process it now yet. */ | 1367 | /* Well, if we have backlog, try to process it now yet. */ |
1368 | 1368 | ||
1369 | if (copied >= target && !sk->sk_backlog.tail) | 1369 | if (copied >= target && !sk->sk_backlog.tail) |
1370 | break; | 1370 | break; |
1371 | 1371 | ||
1372 | if (copied) { | 1372 | if (copied) { |
1373 | if (sk->sk_err || | 1373 | if (sk->sk_err || |
1374 | sk->sk_state == TCP_CLOSE || | 1374 | sk->sk_state == TCP_CLOSE || |
1375 | (sk->sk_shutdown & RCV_SHUTDOWN) || | 1375 | (sk->sk_shutdown & RCV_SHUTDOWN) || |
1376 | !timeo || | 1376 | !timeo || |
1377 | signal_pending(current) || | 1377 | signal_pending(current) || |
1378 | (flags & MSG_PEEK)) | 1378 | (flags & MSG_PEEK)) |
1379 | break; | 1379 | break; |
1380 | } else { | 1380 | } else { |
1381 | if (sock_flag(sk, SOCK_DONE)) | 1381 | if (sock_flag(sk, SOCK_DONE)) |
1382 | break; | 1382 | break; |
1383 | 1383 | ||
1384 | if (sk->sk_err) { | 1384 | if (sk->sk_err) { |
1385 | copied = sock_error(sk); | 1385 | copied = sock_error(sk); |
1386 | break; | 1386 | break; |
1387 | } | 1387 | } |
1388 | 1388 | ||
1389 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 1389 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
1390 | break; | 1390 | break; |
1391 | 1391 | ||
1392 | if (sk->sk_state == TCP_CLOSE) { | 1392 | if (sk->sk_state == TCP_CLOSE) { |
1393 | if (!sock_flag(sk, SOCK_DONE)) { | 1393 | if (!sock_flag(sk, SOCK_DONE)) { |
1394 | /* This occurs when user tries to read | 1394 | /* This occurs when user tries to read |
1395 | * from never connected socket. | 1395 | * from never connected socket. |
1396 | */ | 1396 | */ |
1397 | copied = -ENOTCONN; | 1397 | copied = -ENOTCONN; |
1398 | break; | 1398 | break; |
1399 | } | 1399 | } |
1400 | break; | 1400 | break; |
1401 | } | 1401 | } |
1402 | 1402 | ||
1403 | if (!timeo) { | 1403 | if (!timeo) { |
1404 | copied = -EAGAIN; | 1404 | copied = -EAGAIN; |
1405 | break; | 1405 | break; |
1406 | } | 1406 | } |
1407 | 1407 | ||
1408 | if (signal_pending(current)) { | 1408 | if (signal_pending(current)) { |
1409 | copied = sock_intr_errno(timeo); | 1409 | copied = sock_intr_errno(timeo); |
1410 | break; | 1410 | break; |
1411 | } | 1411 | } |
1412 | } | 1412 | } |
1413 | 1413 | ||
1414 | tcp_cleanup_rbuf(sk, copied); | 1414 | tcp_cleanup_rbuf(sk, copied); |
1415 | 1415 | ||
1416 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { | 1416 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { |
1417 | /* Install new reader */ | 1417 | /* Install new reader */ |
1418 | if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { | 1418 | if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { |
1419 | user_recv = current; | 1419 | user_recv = current; |
1420 | tp->ucopy.task = user_recv; | 1420 | tp->ucopy.task = user_recv; |
1421 | tp->ucopy.iov = msg->msg_iov; | 1421 | tp->ucopy.iov = msg->msg_iov; |
1422 | } | 1422 | } |
1423 | 1423 | ||
1424 | tp->ucopy.len = len; | 1424 | tp->ucopy.len = len; |
1425 | 1425 | ||
1426 | WARN_ON(tp->copied_seq != tp->rcv_nxt && | 1426 | WARN_ON(tp->copied_seq != tp->rcv_nxt && |
1427 | !(flags & (MSG_PEEK | MSG_TRUNC))); | 1427 | !(flags & (MSG_PEEK | MSG_TRUNC))); |
1428 | 1428 | ||
1429 | /* Ugly... If prequeue is not empty, we have to | 1429 | /* Ugly... If prequeue is not empty, we have to |
1430 | * process it before releasing socket, otherwise | 1430 | * process it before releasing socket, otherwise |
1431 | * order will be broken at second iteration. | 1431 | * order will be broken at second iteration. |
1432 | * More elegant solution is required!!! | 1432 | * More elegant solution is required!!! |
1433 | * | 1433 | * |
1434 | * Look: we have the following (pseudo)queues: | 1434 | * Look: we have the following (pseudo)queues: |
1435 | * | 1435 | * |
1436 | * 1. packets in flight | 1436 | * 1. packets in flight |
1437 | * 2. backlog | 1437 | * 2. backlog |
1438 | * 3. prequeue | 1438 | * 3. prequeue |
1439 | * 4. receive_queue | 1439 | * 4. receive_queue |
1440 | * | 1440 | * |
1441 | * Each queue can be processed only if the next ones | 1441 | * Each queue can be processed only if the next ones |
1442 | * are empty. At this point we have empty receive_queue. | 1442 | * are empty. At this point we have empty receive_queue. |
1443 | * But prequeue _can_ be not empty after 2nd iteration, | 1443 | * But prequeue _can_ be not empty after 2nd iteration, |
1444 | * when we jumped to start of loop because backlog | 1444 | * when we jumped to start of loop because backlog |
1445 | * processing added something to receive_queue. | 1445 | * processing added something to receive_queue. |
1446 | * We cannot release_sock(), because backlog contains | 1446 | * We cannot release_sock(), because backlog contains |
1447 | * packets arrived _after_ prequeued ones. | 1447 | * packets arrived _after_ prequeued ones. |
1448 | * | 1448 | * |
1449 | * Shortly, algorithm is clear --- to process all | 1449 | * Shortly, algorithm is clear --- to process all |
1450 | * the queues in order. We could make it more directly, | 1450 | * the queues in order. We could make it more directly, |
1451 | * requeueing packets from backlog to prequeue, if | 1451 | * requeueing packets from backlog to prequeue, if |
1452 | * is not empty. It is more elegant, but eats cycles, | 1452 | * is not empty. It is more elegant, but eats cycles, |
1453 | * unfortunately. | 1453 | * unfortunately. |
1454 | */ | 1454 | */ |
1455 | if (!skb_queue_empty(&tp->ucopy.prequeue)) | 1455 | if (!skb_queue_empty(&tp->ucopy.prequeue)) |
1456 | goto do_prequeue; | 1456 | goto do_prequeue; |
1457 | 1457 | ||
1458 | /* __ Set realtime policy in scheduler __ */ | 1458 | /* __ Set realtime policy in scheduler __ */ |
1459 | } | 1459 | } |
1460 | 1460 | ||
1461 | if (copied >= target) { | 1461 | if (copied >= target) { |
1462 | /* Do not sleep, just process backlog. */ | 1462 | /* Do not sleep, just process backlog. */ |
1463 | release_sock(sk); | 1463 | release_sock(sk); |
1464 | lock_sock(sk); | 1464 | lock_sock(sk); |
1465 | } else | 1465 | } else |
1466 | sk_wait_data(sk, &timeo); | 1466 | sk_wait_data(sk, &timeo); |
1467 | 1467 | ||
1468 | #ifdef CONFIG_NET_DMA | 1468 | #ifdef CONFIG_NET_DMA |
1469 | tp->ucopy.wakeup = 0; | 1469 | tp->ucopy.wakeup = 0; |
1470 | #endif | 1470 | #endif |
1471 | 1471 | ||
1472 | if (user_recv) { | 1472 | if (user_recv) { |
1473 | int chunk; | 1473 | int chunk; |
1474 | 1474 | ||
1475 | /* __ Restore normal policy in scheduler __ */ | 1475 | /* __ Restore normal policy in scheduler __ */ |
1476 | 1476 | ||
1477 | if ((chunk = len - tp->ucopy.len) != 0) { | 1477 | if ((chunk = len - tp->ucopy.len) != 0) { |
1478 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); | 1478 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); |
1479 | len -= chunk; | 1479 | len -= chunk; |
1480 | copied += chunk; | 1480 | copied += chunk; |
1481 | } | 1481 | } |
1482 | 1482 | ||
1483 | if (tp->rcv_nxt == tp->copied_seq && | 1483 | if (tp->rcv_nxt == tp->copied_seq && |
1484 | !skb_queue_empty(&tp->ucopy.prequeue)) { | 1484 | !skb_queue_empty(&tp->ucopy.prequeue)) { |
1485 | do_prequeue: | 1485 | do_prequeue: |
1486 | tcp_prequeue_process(sk); | 1486 | tcp_prequeue_process(sk); |
1487 | 1487 | ||
1488 | if ((chunk = len - tp->ucopy.len) != 0) { | 1488 | if ((chunk = len - tp->ucopy.len) != 0) { |
1489 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); | 1489 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); |
1490 | len -= chunk; | 1490 | len -= chunk; |
1491 | copied += chunk; | 1491 | copied += chunk; |
1492 | } | 1492 | } |
1493 | } | 1493 | } |
1494 | } | 1494 | } |
1495 | if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { | 1495 | if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { |
1496 | if (net_ratelimit()) | 1496 | if (net_ratelimit()) |
1497 | printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", | 1497 | printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", |
1498 | current->comm, task_pid_nr(current)); | 1498 | current->comm, task_pid_nr(current)); |
1499 | peek_seq = tp->copied_seq; | 1499 | peek_seq = tp->copied_seq; |
1500 | } | 1500 | } |
1501 | continue; | 1501 | continue; |
1502 | 1502 | ||
1503 | found_ok_skb: | 1503 | found_ok_skb: |
1504 | /* Ok so how much can we use? */ | 1504 | /* Ok so how much can we use? */ |
1505 | used = skb->len - offset; | 1505 | used = skb->len - offset; |
1506 | if (len < used) | 1506 | if (len < used) |
1507 | used = len; | 1507 | used = len; |
1508 | 1508 | ||
1509 | /* Do we have urgent data here? */ | 1509 | /* Do we have urgent data here? */ |
1510 | if (tp->urg_data) { | 1510 | if (tp->urg_data) { |
1511 | u32 urg_offset = tp->urg_seq - *seq; | 1511 | u32 urg_offset = tp->urg_seq - *seq; |
1512 | if (urg_offset < used) { | 1512 | if (urg_offset < used) { |
1513 | if (!urg_offset) { | 1513 | if (!urg_offset) { |
1514 | if (!sock_flag(sk, SOCK_URGINLINE)) { | 1514 | if (!sock_flag(sk, SOCK_URGINLINE)) { |
1515 | ++*seq; | 1515 | ++*seq; |
1516 | offset++; | 1516 | offset++; |
1517 | used--; | 1517 | used--; |
1518 | if (!used) | 1518 | if (!used) |
1519 | goto skip_copy; | 1519 | goto skip_copy; |
1520 | } | 1520 | } |
1521 | } else | 1521 | } else |
1522 | used = urg_offset; | 1522 | used = urg_offset; |
1523 | } | 1523 | } |
1524 | } | 1524 | } |
1525 | 1525 | ||
1526 | if (!(flags & MSG_TRUNC)) { | 1526 | if (!(flags & MSG_TRUNC)) { |
1527 | #ifdef CONFIG_NET_DMA | 1527 | #ifdef CONFIG_NET_DMA |
1528 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | 1528 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
1529 | tp->ucopy.dma_chan = get_softnet_dma(); | 1529 | tp->ucopy.dma_chan = get_softnet_dma(); |
1530 | 1530 | ||
1531 | if (tp->ucopy.dma_chan) { | 1531 | if (tp->ucopy.dma_chan) { |
1532 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( | 1532 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( |
1533 | tp->ucopy.dma_chan, skb, offset, | 1533 | tp->ucopy.dma_chan, skb, offset, |
1534 | msg->msg_iov, used, | 1534 | msg->msg_iov, used, |
1535 | tp->ucopy.pinned_list); | 1535 | tp->ucopy.pinned_list); |
1536 | 1536 | ||
1537 | if (tp->ucopy.dma_cookie < 0) { | 1537 | if (tp->ucopy.dma_cookie < 0) { |
1538 | 1538 | ||
1539 | printk(KERN_ALERT "dma_cookie < 0\n"); | 1539 | printk(KERN_ALERT "dma_cookie < 0\n"); |
1540 | 1540 | ||
1541 | /* Exception. Bailout! */ | 1541 | /* Exception. Bailout! */ |
1542 | if (!copied) | 1542 | if (!copied) |
1543 | copied = -EFAULT; | 1543 | copied = -EFAULT; |
1544 | break; | 1544 | break; |
1545 | } | 1545 | } |
1546 | if ((offset + used) == skb->len) | 1546 | if ((offset + used) == skb->len) |
1547 | copied_early = 1; | 1547 | copied_early = 1; |
1548 | 1548 | ||
1549 | } else | 1549 | } else |
1550 | #endif | 1550 | #endif |
1551 | { | 1551 | { |
1552 | err = skb_copy_datagram_iovec(skb, offset, | 1552 | err = skb_copy_datagram_iovec(skb, offset, |
1553 | msg->msg_iov, used); | 1553 | msg->msg_iov, used); |
1554 | if (err) { | 1554 | if (err) { |
1555 | /* Exception. Bailout! */ | 1555 | /* Exception. Bailout! */ |
1556 | if (!copied) | 1556 | if (!copied) |
1557 | copied = -EFAULT; | 1557 | copied = -EFAULT; |
1558 | break; | 1558 | break; |
1559 | } | 1559 | } |
1560 | } | 1560 | } |
1561 | } | 1561 | } |
1562 | 1562 | ||
1563 | *seq += used; | 1563 | *seq += used; |
1564 | copied += used; | 1564 | copied += used; |
1565 | len -= used; | 1565 | len -= used; |
1566 | 1566 | ||
1567 | tcp_rcv_space_adjust(sk); | 1567 | tcp_rcv_space_adjust(sk); |
1568 | 1568 | ||
1569 | skip_copy: | 1569 | skip_copy: |
1570 | if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { | 1570 | if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { |
1571 | tp->urg_data = 0; | 1571 | tp->urg_data = 0; |
1572 | tcp_fast_path_check(sk); | 1572 | tcp_fast_path_check(sk); |
1573 | } | 1573 | } |
1574 | if (used + offset < skb->len) | 1574 | if (used + offset < skb->len) |
1575 | continue; | 1575 | continue; |
1576 | 1576 | ||
1577 | if (tcp_hdr(skb)->fin) | 1577 | if (tcp_hdr(skb)->fin) |
1578 | goto found_fin_ok; | 1578 | goto found_fin_ok; |
1579 | if (!(flags & MSG_PEEK)) { | 1579 | if (!(flags & MSG_PEEK)) { |
1580 | sk_eat_skb(sk, skb, copied_early); | 1580 | sk_eat_skb(sk, skb, copied_early); |
1581 | copied_early = 0; | 1581 | copied_early = 0; |
1582 | } | 1582 | } |
1583 | continue; | 1583 | continue; |
1584 | 1584 | ||
1585 | found_fin_ok: | 1585 | found_fin_ok: |
1586 | /* Process the FIN. */ | 1586 | /* Process the FIN. */ |
1587 | ++*seq; | 1587 | ++*seq; |
1588 | if (!(flags & MSG_PEEK)) { | 1588 | if (!(flags & MSG_PEEK)) { |
1589 | sk_eat_skb(sk, skb, copied_early); | 1589 | sk_eat_skb(sk, skb, copied_early); |
1590 | copied_early = 0; | 1590 | copied_early = 0; |
1591 | } | 1591 | } |
1592 | break; | 1592 | break; |
1593 | } while (len > 0); | 1593 | } while (len > 0); |
1594 | 1594 | ||
1595 | if (user_recv) { | 1595 | if (user_recv) { |
1596 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { | 1596 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { |
1597 | int chunk; | 1597 | int chunk; |
1598 | 1598 | ||
1599 | tp->ucopy.len = copied > 0 ? len : 0; | 1599 | tp->ucopy.len = copied > 0 ? len : 0; |
1600 | 1600 | ||
1601 | tcp_prequeue_process(sk); | 1601 | tcp_prequeue_process(sk); |
1602 | 1602 | ||
1603 | if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { | 1603 | if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { |
1604 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); | 1604 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); |
1605 | len -= chunk; | 1605 | len -= chunk; |
1606 | copied += chunk; | 1606 | copied += chunk; |
1607 | } | 1607 | } |
1608 | } | 1608 | } |
1609 | 1609 | ||
1610 | tp->ucopy.task = NULL; | 1610 | tp->ucopy.task = NULL; |
1611 | tp->ucopy.len = 0; | 1611 | tp->ucopy.len = 0; |
1612 | } | 1612 | } |
1613 | 1613 | ||
1614 | #ifdef CONFIG_NET_DMA | 1614 | #ifdef CONFIG_NET_DMA |
1615 | if (tp->ucopy.dma_chan) { | 1615 | if (tp->ucopy.dma_chan) { |
1616 | dma_cookie_t done, used; | 1616 | dma_cookie_t done, used; |
1617 | 1617 | ||
1618 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); | 1618 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); |
1619 | 1619 | ||
1620 | while (dma_async_memcpy_complete(tp->ucopy.dma_chan, | 1620 | while (dma_async_memcpy_complete(tp->ucopy.dma_chan, |
1621 | tp->ucopy.dma_cookie, &done, | 1621 | tp->ucopy.dma_cookie, &done, |
1622 | &used) == DMA_IN_PROGRESS) { | 1622 | &used) == DMA_IN_PROGRESS) { |
1623 | /* do partial cleanup of sk_async_wait_queue */ | 1623 | /* do partial cleanup of sk_async_wait_queue */ |
1624 | while ((skb = skb_peek(&sk->sk_async_wait_queue)) && | 1624 | while ((skb = skb_peek(&sk->sk_async_wait_queue)) && |
1625 | (dma_async_is_complete(skb->dma_cookie, done, | 1625 | (dma_async_is_complete(skb->dma_cookie, done, |
1626 | used) == DMA_SUCCESS)) { | 1626 | used) == DMA_SUCCESS)) { |
1627 | __skb_dequeue(&sk->sk_async_wait_queue); | 1627 | __skb_dequeue(&sk->sk_async_wait_queue); |
1628 | kfree_skb(skb); | 1628 | kfree_skb(skb); |
1629 | } | 1629 | } |
1630 | } | 1630 | } |
1631 | 1631 | ||
1632 | /* Safe to free early-copied skbs now */ | 1632 | /* Safe to free early-copied skbs now */ |
1633 | __skb_queue_purge(&sk->sk_async_wait_queue); | 1633 | __skb_queue_purge(&sk->sk_async_wait_queue); |
1634 | dma_chan_put(tp->ucopy.dma_chan); | 1634 | dma_chan_put(tp->ucopy.dma_chan); |
1635 | tp->ucopy.dma_chan = NULL; | 1635 | tp->ucopy.dma_chan = NULL; |
1636 | } | 1636 | } |
1637 | if (tp->ucopy.pinned_list) { | 1637 | if (tp->ucopy.pinned_list) { |
1638 | dma_unpin_iovec_pages(tp->ucopy.pinned_list); | 1638 | dma_unpin_iovec_pages(tp->ucopy.pinned_list); |
1639 | tp->ucopy.pinned_list = NULL; | 1639 | tp->ucopy.pinned_list = NULL; |
1640 | } | 1640 | } |
1641 | #endif | 1641 | #endif |
1642 | 1642 | ||
1643 | /* According to UNIX98, msg_name/msg_namelen are ignored | 1643 | /* According to UNIX98, msg_name/msg_namelen are ignored |
1644 | * on connected socket. I was just happy when found this 8) --ANK | 1644 | * on connected socket. I was just happy when found this 8) --ANK |
1645 | */ | 1645 | */ |
1646 | 1646 | ||
1647 | /* Clean up data we have read: This will do ACK frames. */ | 1647 | /* Clean up data we have read: This will do ACK frames. */ |
1648 | tcp_cleanup_rbuf(sk, copied); | 1648 | tcp_cleanup_rbuf(sk, copied); |
1649 | 1649 | ||
1650 | TCP_CHECK_TIMER(sk); | 1650 | TCP_CHECK_TIMER(sk); |
1651 | release_sock(sk); | 1651 | release_sock(sk); |
1652 | return copied; | 1652 | return copied; |
1653 | 1653 | ||
1654 | out: | 1654 | out: |
1655 | TCP_CHECK_TIMER(sk); | 1655 | TCP_CHECK_TIMER(sk); |
1656 | release_sock(sk); | 1656 | release_sock(sk); |
1657 | return err; | 1657 | return err; |
1658 | 1658 | ||
1659 | recv_urg: | 1659 | recv_urg: |
1660 | err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); | 1660 | err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); |
1661 | goto out; | 1661 | goto out; |
1662 | } | 1662 | } |
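As a hedged user-space illustration of the target/MSG_WAITALL handling above: passing MSG_WAITALL raises the read target to the full length, so tcp_recvmsg() keeps looping until that many bytes arrive, the connection ends, or an error occurs:

```c
#include <sys/socket.h>
#include <sys/types.h>

/* Block until exactly len bytes arrive (or the connection ends / errors). */
static ssize_t recv_exact(int sockfd, void *buf, size_t len)
{
	ssize_t n = recv(sockfd, buf, len, MSG_WAITALL);

	return (n == (ssize_t)len) ? n : -1;	/* short read: EOF, signal, error */
}
```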
1663 | 1663 | ||
1664 | void tcp_set_state(struct sock *sk, int state) | 1664 | void tcp_set_state(struct sock *sk, int state) |
1665 | { | 1665 | { |
1666 | int oldstate = sk->sk_state; | 1666 | int oldstate = sk->sk_state; |
1667 | 1667 | ||
1668 | switch (state) { | 1668 | switch (state) { |
1669 | case TCP_ESTABLISHED: | 1669 | case TCP_ESTABLISHED: |
1670 | if (oldstate != TCP_ESTABLISHED) | 1670 | if (oldstate != TCP_ESTABLISHED) |
1671 | TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); | 1671 | TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); |
1672 | break; | 1672 | break; |
1673 | 1673 | ||
1674 | case TCP_CLOSE: | 1674 | case TCP_CLOSE: |
1675 | if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) | 1675 | if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) |
1676 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); | 1676 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); |
1677 | 1677 | ||
1678 | sk->sk_prot->unhash(sk); | 1678 | sk->sk_prot->unhash(sk); |
1679 | if (inet_csk(sk)->icsk_bind_hash && | 1679 | if (inet_csk(sk)->icsk_bind_hash && |
1680 | !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) | 1680 | !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) |
1681 | inet_put_port(sk); | 1681 | inet_put_port(sk); |
1682 | /* fall through */ | 1682 | /* fall through */ |
1683 | default: | 1683 | default: |
1684 | if (oldstate==TCP_ESTABLISHED) | 1684 | if (oldstate == TCP_ESTABLISHED) |
1685 | TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); | 1685 | TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); |
1686 | } | 1686 | } |
1687 | 1687 | ||
1688 | /* Change state AFTER socket is unhashed to avoid closed | 1688 | /* Change state AFTER socket is unhashed to avoid closed |
1689 | * socket sitting in hash tables. | 1689 | * socket sitting in hash tables. |
1690 | */ | 1690 | */ |
1691 | sk->sk_state = state; | 1691 | sk->sk_state = state; |
1692 | 1692 | ||
1693 | #ifdef STATE_TRACE | 1693 | #ifdef STATE_TRACE |
1694 | SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); | 1694 | SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); |
1695 | #endif | 1695 | #endif |
1696 | } | 1696 | } |
1697 | EXPORT_SYMBOL_GPL(tcp_set_state); | 1697 | EXPORT_SYMBOL_GPL(tcp_set_state); |
1698 | 1698 | ||
1699 | /* | 1699 | /* |
1700 | * State processing on a close. This implements the state shift for | 1700 | * State processing on a close. This implements the state shift for |
1701 | * sending our FIN frame. Note that we only send a FIN for some | 1701 | * sending our FIN frame. Note that we only send a FIN for some |
1702 | * states. A shutdown() may have already sent the FIN, or we may be | 1702 | * states. A shutdown() may have already sent the FIN, or we may be |
1703 | * closed. | 1703 | * closed. |
1704 | */ | 1704 | */ |
1705 | 1705 | ||
1706 | static const unsigned char new_state[16] = { | 1706 | static const unsigned char new_state[16] = { |
1707 | /* current state: new state: action: */ | 1707 | /* current state: new state: action: */ |
1708 | /* (Invalid) */ TCP_CLOSE, | 1708 | /* (Invalid) */ TCP_CLOSE, |
1709 | /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, | 1709 | /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, |
1710 | /* TCP_SYN_SENT */ TCP_CLOSE, | 1710 | /* TCP_SYN_SENT */ TCP_CLOSE, |
1711 | /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, | 1711 | /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, |
1712 | /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, | 1712 | /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, |
1713 | /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, | 1713 | /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, |
1714 | /* TCP_TIME_WAIT */ TCP_CLOSE, | 1714 | /* TCP_TIME_WAIT */ TCP_CLOSE, |
1715 | /* TCP_CLOSE */ TCP_CLOSE, | 1715 | /* TCP_CLOSE */ TCP_CLOSE, |
1716 | /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, | 1716 | /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, |
1717 | /* TCP_LAST_ACK */ TCP_LAST_ACK, | 1717 | /* TCP_LAST_ACK */ TCP_LAST_ACK, |
1718 | /* TCP_LISTEN */ TCP_CLOSE, | 1718 | /* TCP_LISTEN */ TCP_CLOSE, |
1719 | /* TCP_CLOSING */ TCP_CLOSING, | 1719 | /* TCP_CLOSING */ TCP_CLOSING, |
1720 | }; | 1720 | }; |
1721 | 1721 | ||
1722 | static int tcp_close_state(struct sock *sk) | 1722 | static int tcp_close_state(struct sock *sk) |
1723 | { | 1723 | { |
1724 | int next = (int)new_state[sk->sk_state]; | 1724 | int next = (int)new_state[sk->sk_state]; |
1725 | int ns = next & TCP_STATE_MASK; | 1725 | int ns = next & TCP_STATE_MASK; |
1726 | 1726 | ||
1727 | tcp_set_state(sk, ns); | 1727 | tcp_set_state(sk, ns); |
1728 | 1728 | ||
1729 | return next & TCP_ACTION_FIN; | 1729 | return next & TCP_ACTION_FIN; |
1730 | } | 1730 | } |
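A standalone mock of this table-driven close (the state numbers and flag bit below are assumed stand-ins, not taken from the kernel headers): the low bits of each entry carry the next state and a single flag bit records whether a FIN still has to be sent:

```c
#include <stdio.h>

/* Local stand-ins mirroring the encoding used above (assumed values). */
enum { ST_ESTABLISHED = 1, ST_FIN_WAIT1 = 4, ST_CLOSE_WAIT = 8, ST_LAST_ACK = 9 };
#define ACTION_FIN	(1 << 7)
#define STATE_MASK	0x0f

static const unsigned char next_on_close[16] = {
	[ST_ESTABLISHED] = ST_FIN_WAIT1 | ACTION_FIN,
	[ST_CLOSE_WAIT]  = ST_LAST_ACK  | ACTION_FIN,
};

int main(void)
{
	unsigned char next = next_on_close[ST_ESTABLISHED];

	printf("next state %d, send FIN: %s\n",
	       next & STATE_MASK, (next & ACTION_FIN) ? "yes" : "no");
	return 0;
}
```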
1731 | 1731 | ||
1732 | /* | 1732 | /* |
1733 | * Shutdown the sending side of a connection. Much like close except | 1733 | * Shutdown the sending side of a connection. Much like close except |
1734 | * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). | 1734 | * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). |
1735 | */ | 1735 | */ |
1736 | 1736 | ||
1737 | void tcp_shutdown(struct sock *sk, int how) | 1737 | void tcp_shutdown(struct sock *sk, int how) |
1738 | { | 1738 | { |
1739 | /* We need to grab some memory, and put together a FIN, | 1739 | /* We need to grab some memory, and put together a FIN, |
1740 | * and then put it into the queue to be sent. | 1740 | * and then put it into the queue to be sent. |
1741 | * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. | 1741 | * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. |
1742 | */ | 1742 | */ |
1743 | if (!(how & SEND_SHUTDOWN)) | 1743 | if (!(how & SEND_SHUTDOWN)) |
1744 | return; | 1744 | return; |
1745 | 1745 | ||
1746 | /* If we've already sent a FIN, or it's a closed state, skip this. */ | 1746 | /* If we've already sent a FIN, or it's a closed state, skip this. */ |
1747 | if ((1 << sk->sk_state) & | 1747 | if ((1 << sk->sk_state) & |
1748 | (TCPF_ESTABLISHED | TCPF_SYN_SENT | | 1748 | (TCPF_ESTABLISHED | TCPF_SYN_SENT | |
1749 | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { | 1749 | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { |
1750 | /* Clear out any half completed packets. FIN if needed. */ | 1750 | /* Clear out any half completed packets. FIN if needed. */ |
1751 | if (tcp_close_state(sk)) | 1751 | if (tcp_close_state(sk)) |
1752 | tcp_send_fin(sk); | 1752 | tcp_send_fin(sk); |
1753 | } | 1753 | } |
1754 | } | 1754 | } |
1755 | 1755 | ||
1756 | void tcp_close(struct sock *sk, long timeout) | 1756 | void tcp_close(struct sock *sk, long timeout) |
1757 | { | 1757 | { |
1758 | struct sk_buff *skb; | 1758 | struct sk_buff *skb; |
1759 | int data_was_unread = 0; | 1759 | int data_was_unread = 0; |
1760 | int state; | 1760 | int state; |
1761 | 1761 | ||
1762 | lock_sock(sk); | 1762 | lock_sock(sk); |
1763 | sk->sk_shutdown = SHUTDOWN_MASK; | 1763 | sk->sk_shutdown = SHUTDOWN_MASK; |
1764 | 1764 | ||
1765 | if (sk->sk_state == TCP_LISTEN) { | 1765 | if (sk->sk_state == TCP_LISTEN) { |
1766 | tcp_set_state(sk, TCP_CLOSE); | 1766 | tcp_set_state(sk, TCP_CLOSE); |
1767 | 1767 | ||
1768 | /* Special case. */ | 1768 | /* Special case. */ |
1769 | inet_csk_listen_stop(sk); | 1769 | inet_csk_listen_stop(sk); |
1770 | 1770 | ||
1771 | goto adjudge_to_death; | 1771 | goto adjudge_to_death; |
1772 | } | 1772 | } |
1773 | 1773 | ||
1774 | /* We need to flush the recv. buffs. We do this only on the | 1774 | /* We need to flush the recv. buffs. We do this only on the |
1775 | * descriptor close, not protocol-sourced closes, because the | 1775 | * descriptor close, not protocol-sourced closes, because the |
1776 | * reader process may not have drained the data yet! | 1776 | * reader process may not have drained the data yet! |
1777 | */ | 1777 | */ |
1778 | while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { | 1778 | while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { |
1779 | u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - | 1779 | u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - |
1780 | tcp_hdr(skb)->fin; | 1780 | tcp_hdr(skb)->fin; |
1781 | data_was_unread += len; | 1781 | data_was_unread += len; |
1782 | __kfree_skb(skb); | 1782 | __kfree_skb(skb); |
1783 | } | 1783 | } |
1784 | 1784 | ||
1785 | sk_mem_reclaim(sk); | 1785 | sk_mem_reclaim(sk); |
1786 | 1786 | ||
1787 | /* As outlined in RFC 2525, section 2.17, we send a RST here because | 1787 | /* As outlined in RFC 2525, section 2.17, we send a RST here because |
1788 | * data was lost. To witness the awful effects of the old behavior of | 1788 | * data was lost. To witness the awful effects of the old behavior of |
1789 | * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk | 1789 | * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk |
1790 | * GET in an FTP client, suspend the process, wait for the client to | 1790 | * GET in an FTP client, suspend the process, wait for the client to |
1791 | * advertise a zero window, then kill -9 the FTP client, wheee... | 1791 | * advertise a zero window, then kill -9 the FTP client, wheee... |
1792 | * Note: timeout is always zero in such a case. | 1792 | * Note: timeout is always zero in such a case. |
1793 | */ | 1793 | */ |
1794 | if (data_was_unread) { | 1794 | if (data_was_unread) { |
1795 | /* Unread data was tossed, zap the connection. */ | 1795 | /* Unread data was tossed, zap the connection. */ |
1796 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); | 1796 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); |
1797 | tcp_set_state(sk, TCP_CLOSE); | 1797 | tcp_set_state(sk, TCP_CLOSE); |
1798 | tcp_send_active_reset(sk, GFP_KERNEL); | 1798 | tcp_send_active_reset(sk, GFP_KERNEL); |
1799 | } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { | 1799 | } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { |
1800 | /* Check zero linger _after_ checking for unread data. */ | 1800 | /* Check zero linger _after_ checking for unread data. */ |
1801 | sk->sk_prot->disconnect(sk, 0); | 1801 | sk->sk_prot->disconnect(sk, 0); |
1802 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); | 1802 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); |
1803 | } else if (tcp_close_state(sk)) { | 1803 | } else if (tcp_close_state(sk)) { |
1804 | /* We FIN if the application ate all the data before | 1804 | /* We FIN if the application ate all the data before |
1805 | * zapping the connection. | 1805 | * zapping the connection. |
1806 | */ | 1806 | */ |
1807 | 1807 | ||
1808 | /* RED-PEN. Formally speaking, we have broken TCP state | 1808 | /* RED-PEN. Formally speaking, we have broken TCP state |
1809 | * machine. State transitions: | 1809 | * machine. State transitions: |
1810 | * | 1810 | * |
1811 | * TCP_ESTABLISHED -> TCP_FIN_WAIT1 | 1811 | * TCP_ESTABLISHED -> TCP_FIN_WAIT1 |
1812 | * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) | 1812 | * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) |
1813 | * TCP_CLOSE_WAIT -> TCP_LAST_ACK | 1813 | * TCP_CLOSE_WAIT -> TCP_LAST_ACK |
1814 | * | 1814 | * |
1815 | * are legal only when FIN has been sent (i.e. in window), | 1815 | * are legal only when FIN has been sent (i.e. in window), |
1816 | * rather than queued out of window. Purists blame. | 1816 | * rather than queued out of window. Purists blame. |
1817 | * | 1817 | * |
1818 | * F.e. "RFC state" is ESTABLISHED, | 1818 | * F.e. "RFC state" is ESTABLISHED, |
1819 | * if Linux state is FIN-WAIT-1, but FIN is still not sent. | 1819 | * if Linux state is FIN-WAIT-1, but FIN is still not sent. |
1820 | * | 1820 | * |
1821 | * The visible deviations are that sometimes | 1821 | * The visible deviations are that sometimes |
1822 | * we enter time-wait state, when it is not required really | 1822 | * we enter time-wait state, when it is not required really |
1823 | * (harmless), do not send active resets, when they are | 1823 | * (harmless), do not send active resets, when they are |
1824 | * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when | 1824 | * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when |
1825 | * they look as CLOSING or LAST_ACK for Linux) | 1825 | * they look as CLOSING or LAST_ACK for Linux) |
1826 | * Probably, I missed some more holelets. | 1826 | * Probably, I missed some more holelets. |
1827 | * --ANK | 1827 | * --ANK |
1828 | */ | 1828 | */ |
1829 | tcp_send_fin(sk); | 1829 | tcp_send_fin(sk); |
1830 | } | 1830 | } |
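The branch above chooses between an orderly FIN and an abortive RST: unread data at close time forces the RST (per RFC 2525), and a zero linger time takes the disconnect path. From userspace the abortive close can be requested explicitly with a zero-timeout SO_LINGER; a minimal sketch, with error handling trimmed:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <unistd.h>

	int close_with_reset(int fd)
	{
		struct linger lg = { .l_onoff = 1, .l_linger = 0 };

		if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
			return -1;
		return close(fd);   /* abortive close: RST rather than FIN */
	}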
1831 | 1831 | ||
1832 | sk_stream_wait_close(sk, timeout); | 1832 | sk_stream_wait_close(sk, timeout); |
1833 | 1833 | ||
1834 | adjudge_to_death: | 1834 | adjudge_to_death: |
1835 | state = sk->sk_state; | 1835 | state = sk->sk_state; |
1836 | sock_hold(sk); | 1836 | sock_hold(sk); |
1837 | sock_orphan(sk); | 1837 | sock_orphan(sk); |
1838 | atomic_inc(sk->sk_prot->orphan_count); | 1838 | atomic_inc(sk->sk_prot->orphan_count); |
1839 | 1839 | ||
1840 | /* It is the last release_sock in its life. It will remove backlog. */ | 1840 | /* It is the last release_sock in its life. It will remove backlog. */ |
1841 | release_sock(sk); | 1841 | release_sock(sk); |
1842 | 1842 | ||
1843 | 1843 | ||
1844 | /* Now socket is owned by kernel and we acquire BH lock | 1844 | /* Now socket is owned by kernel and we acquire BH lock |
1845 | to finish close. No need to check for user refs. | 1845 | to finish close. No need to check for user refs. |
1846 | */ | 1846 | */ |
1847 | local_bh_disable(); | 1847 | local_bh_disable(); |
1848 | bh_lock_sock(sk); | 1848 | bh_lock_sock(sk); |
1849 | WARN_ON(sock_owned_by_user(sk)); | 1849 | WARN_ON(sock_owned_by_user(sk)); |
1850 | 1850 | ||
1851 | /* Have we already been destroyed by a softirq or backlog? */ | 1851 | /* Have we already been destroyed by a softirq or backlog? */ |
1852 | if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) | 1852 | if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) |
1853 | goto out; | 1853 | goto out; |
1854 | 1854 | ||
1855 | /* This is a (useful) BSD violation of the RFC. There is a | 1855 | /* This is a (useful) BSD violation of the RFC. There is a |
1856 | * problem with TCP as specified in that the other end could | 1856 | * problem with TCP as specified in that the other end could |
1857 | * keep a socket open forever with no application left this end. | 1857 | * keep a socket open forever with no application left this end. |
1858 | * We use a 3 minute timeout (about the same as BSD) then kill | 1858 | * We use a 3 minute timeout (about the same as BSD) then kill |
1859 | * our end. If they send after that then tough - BUT: long enough | 1859 | * our end. If they send after that then tough - BUT: long enough |
1860 | * that we won't make the old 4*rto = almost no time - whoops | 1860 | * that we won't make the old 4*rto = almost no time - whoops |
1861 | * reset mistake. | 1861 | * reset mistake. |
1862 | * | 1862 | * |
1863 | * Nope, it was not a mistake. It is really the desired behaviour | 1863 | * Nope, it was not a mistake. It is really the desired behaviour |
1864 | * f.e. on http servers, when such sockets are useless, but | 1864 | * f.e. on http servers, when such sockets are useless, but |
1865 | * consume significant resources. Let's do it with special | 1865 | * consume significant resources. Let's do it with special |
1866 | * linger2 option. --ANK | 1866 | * linger2 option. --ANK |
1867 | */ | 1867 | */ |
1868 | 1868 | ||
1869 | if (sk->sk_state == TCP_FIN_WAIT2) { | 1869 | if (sk->sk_state == TCP_FIN_WAIT2) { |
1870 | struct tcp_sock *tp = tcp_sk(sk); | 1870 | struct tcp_sock *tp = tcp_sk(sk); |
1871 | if (tp->linger2 < 0) { | 1871 | if (tp->linger2 < 0) { |
1872 | tcp_set_state(sk, TCP_CLOSE); | 1872 | tcp_set_state(sk, TCP_CLOSE); |
1873 | tcp_send_active_reset(sk, GFP_ATOMIC); | 1873 | tcp_send_active_reset(sk, GFP_ATOMIC); |
1874 | NET_INC_STATS_BH(sock_net(sk), | 1874 | NET_INC_STATS_BH(sock_net(sk), |
1875 | LINUX_MIB_TCPABORTONLINGER); | 1875 | LINUX_MIB_TCPABORTONLINGER); |
1876 | } else { | 1876 | } else { |
1877 | const int tmo = tcp_fin_time(sk); | 1877 | const int tmo = tcp_fin_time(sk); |
1878 | 1878 | ||
1879 | if (tmo > TCP_TIMEWAIT_LEN) { | 1879 | if (tmo > TCP_TIMEWAIT_LEN) { |
1880 | inet_csk_reset_keepalive_timer(sk, | 1880 | inet_csk_reset_keepalive_timer(sk, |
1881 | tmo - TCP_TIMEWAIT_LEN); | 1881 | tmo - TCP_TIMEWAIT_LEN); |
1882 | } else { | 1882 | } else { |
1883 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 1883 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
1884 | goto out; | 1884 | goto out; |
1885 | } | 1885 | } |
1886 | } | 1886 | } |
1887 | } | 1887 | } |
1888 | if (sk->sk_state != TCP_CLOSE) { | 1888 | if (sk->sk_state != TCP_CLOSE) { |
1889 | sk_mem_reclaim(sk); | 1889 | sk_mem_reclaim(sk); |
1890 | if (tcp_too_many_orphans(sk, | 1890 | if (tcp_too_many_orphans(sk, |
1891 | atomic_read(sk->sk_prot->orphan_count))) { | 1891 | atomic_read(sk->sk_prot->orphan_count))) { |
1892 | if (net_ratelimit()) | 1892 | if (net_ratelimit()) |
1893 | printk(KERN_INFO "TCP: too many orphaned " | 1893 | printk(KERN_INFO "TCP: too many orphaned " |
1894 | "sockets\n"); | 1894 | "sockets\n"); |
1895 | tcp_set_state(sk, TCP_CLOSE); | 1895 | tcp_set_state(sk, TCP_CLOSE); |
1896 | tcp_send_active_reset(sk, GFP_ATOMIC); | 1896 | tcp_send_active_reset(sk, GFP_ATOMIC); |
1897 | NET_INC_STATS_BH(sock_net(sk), | 1897 | NET_INC_STATS_BH(sock_net(sk), |
1898 | LINUX_MIB_TCPABORTONMEMORY); | 1898 | LINUX_MIB_TCPABORTONMEMORY); |
1899 | } | 1899 | } |
1900 | } | 1900 | } |
1901 | 1901 | ||
1902 | if (sk->sk_state == TCP_CLOSE) | 1902 | if (sk->sk_state == TCP_CLOSE) |
1903 | inet_csk_destroy_sock(sk); | 1903 | inet_csk_destroy_sock(sk); |
1904 | /* Otherwise, socket is reprieved until protocol close. */ | 1904 | /* Otherwise, socket is reprieved until protocol close. */ |
1905 | 1905 | ||
1906 | out: | 1906 | out: |
1907 | bh_unlock_sock(sk); | 1907 | bh_unlock_sock(sk); |
1908 | local_bh_enable(); | 1908 | local_bh_enable(); |
1909 | sock_put(sk); | 1909 | sock_put(sk); |
1910 | } | 1910 | } |
1911 | 1911 | ||
1912 | /* These states need RST on ABORT according to RFC793 */ | 1912 | /* These states need RST on ABORT according to RFC793 */ |
1913 | 1913 | ||
1914 | static inline int tcp_need_reset(int state) | 1914 | static inline int tcp_need_reset(int state) |
1915 | { | 1915 | { |
1916 | return (1 << state) & | 1916 | return (1 << state) & |
1917 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | | 1917 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | |
1918 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); | 1918 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); |
1919 | } | 1919 | } |
1920 | 1920 | ||
1921 | int tcp_disconnect(struct sock *sk, int flags) | 1921 | int tcp_disconnect(struct sock *sk, int flags) |
1922 | { | 1922 | { |
1923 | struct inet_sock *inet = inet_sk(sk); | 1923 | struct inet_sock *inet = inet_sk(sk); |
1924 | struct inet_connection_sock *icsk = inet_csk(sk); | 1924 | struct inet_connection_sock *icsk = inet_csk(sk); |
1925 | struct tcp_sock *tp = tcp_sk(sk); | 1925 | struct tcp_sock *tp = tcp_sk(sk); |
1926 | int err = 0; | 1926 | int err = 0; |
1927 | int old_state = sk->sk_state; | 1927 | int old_state = sk->sk_state; |
1928 | 1928 | ||
1929 | if (old_state != TCP_CLOSE) | 1929 | if (old_state != TCP_CLOSE) |
1930 | tcp_set_state(sk, TCP_CLOSE); | 1930 | tcp_set_state(sk, TCP_CLOSE); |
1931 | 1931 | ||
1932 | /* ABORT function of RFC793 */ | 1932 | /* ABORT function of RFC793 */ |
1933 | if (old_state == TCP_LISTEN) { | 1933 | if (old_state == TCP_LISTEN) { |
1934 | inet_csk_listen_stop(sk); | 1934 | inet_csk_listen_stop(sk); |
1935 | } else if (tcp_need_reset(old_state) || | 1935 | } else if (tcp_need_reset(old_state) || |
1936 | (tp->snd_nxt != tp->write_seq && | 1936 | (tp->snd_nxt != tp->write_seq && |
1937 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { | 1937 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { |
1938 | /* The last check adjusts for discrepancy of Linux wrt. RFC | 1938 | /* The last check adjusts for discrepancy of Linux wrt. RFC |
1939 | * states | 1939 | * states |
1940 | */ | 1940 | */ |
1941 | tcp_send_active_reset(sk, gfp_any()); | 1941 | tcp_send_active_reset(sk, gfp_any()); |
1942 | sk->sk_err = ECONNRESET; | 1942 | sk->sk_err = ECONNRESET; |
1943 | } else if (old_state == TCP_SYN_SENT) | 1943 | } else if (old_state == TCP_SYN_SENT) |
1944 | sk->sk_err = ECONNRESET; | 1944 | sk->sk_err = ECONNRESET; |
1945 | 1945 | ||
1946 | tcp_clear_xmit_timers(sk); | 1946 | tcp_clear_xmit_timers(sk); |
1947 | __skb_queue_purge(&sk->sk_receive_queue); | 1947 | __skb_queue_purge(&sk->sk_receive_queue); |
1948 | tcp_write_queue_purge(sk); | 1948 | tcp_write_queue_purge(sk); |
1949 | __skb_queue_purge(&tp->out_of_order_queue); | 1949 | __skb_queue_purge(&tp->out_of_order_queue); |
1950 | #ifdef CONFIG_NET_DMA | 1950 | #ifdef CONFIG_NET_DMA |
1951 | __skb_queue_purge(&sk->sk_async_wait_queue); | 1951 | __skb_queue_purge(&sk->sk_async_wait_queue); |
1952 | #endif | 1952 | #endif |
1953 | 1953 | ||
1954 | inet->dport = 0; | 1954 | inet->dport = 0; |
1955 | 1955 | ||
1956 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) | 1956 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) |
1957 | inet_reset_saddr(sk); | 1957 | inet_reset_saddr(sk); |
1958 | 1958 | ||
1959 | sk->sk_shutdown = 0; | 1959 | sk->sk_shutdown = 0; |
1960 | sock_reset_flag(sk, SOCK_DONE); | 1960 | sock_reset_flag(sk, SOCK_DONE); |
1961 | tp->srtt = 0; | 1961 | tp->srtt = 0; |
1962 | if ((tp->write_seq += tp->max_window + 2) == 0) | 1962 | if ((tp->write_seq += tp->max_window + 2) == 0) |
1963 | tp->write_seq = 1; | 1963 | tp->write_seq = 1; |
1964 | icsk->icsk_backoff = 0; | 1964 | icsk->icsk_backoff = 0; |
1965 | tp->snd_cwnd = 2; | 1965 | tp->snd_cwnd = 2; |
1966 | icsk->icsk_probes_out = 0; | 1966 | icsk->icsk_probes_out = 0; |
1967 | tp->packets_out = 0; | 1967 | tp->packets_out = 0; |
1968 | tp->snd_ssthresh = 0x7fffffff; | 1968 | tp->snd_ssthresh = 0x7fffffff; |
1969 | tp->snd_cwnd_cnt = 0; | 1969 | tp->snd_cwnd_cnt = 0; |
1970 | tp->bytes_acked = 0; | 1970 | tp->bytes_acked = 0; |
1971 | tcp_set_ca_state(sk, TCP_CA_Open); | 1971 | tcp_set_ca_state(sk, TCP_CA_Open); |
1972 | tcp_clear_retrans(tp); | 1972 | tcp_clear_retrans(tp); |
1973 | inet_csk_delack_init(sk); | 1973 | inet_csk_delack_init(sk); |
1974 | tcp_init_send_head(sk); | 1974 | tcp_init_send_head(sk); |
1975 | memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); | 1975 | memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); |
1976 | __sk_dst_reset(sk); | 1976 | __sk_dst_reset(sk); |
1977 | 1977 | ||
1978 | WARN_ON(inet->num && !icsk->icsk_bind_hash); | 1978 | WARN_ON(inet->num && !icsk->icsk_bind_hash); |
1979 | 1979 | ||
1980 | sk->sk_error_report(sk); | 1980 | sk->sk_error_report(sk); |
1981 | return err; | 1981 | return err; |
1982 | } | 1982 | } |
1983 | 1983 | ||
1984 | /* | 1984 | /* |
1985 | * Socket option code for TCP. | 1985 | * Socket option code for TCP. |
1986 | */ | 1986 | */ |
1987 | static int do_tcp_setsockopt(struct sock *sk, int level, | 1987 | static int do_tcp_setsockopt(struct sock *sk, int level, |
1988 | int optname, char __user *optval, int optlen) | 1988 | int optname, char __user *optval, int optlen) |
1989 | { | 1989 | { |
1990 | struct tcp_sock *tp = tcp_sk(sk); | 1990 | struct tcp_sock *tp = tcp_sk(sk); |
1991 | struct inet_connection_sock *icsk = inet_csk(sk); | 1991 | struct inet_connection_sock *icsk = inet_csk(sk); |
1992 | int val; | 1992 | int val; |
1993 | int err = 0; | 1993 | int err = 0; |
1994 | 1994 | ||
1995 | /* This is a string value; all the others are int's */ | 1995 | /* This is a string value; all the others are int's */ |
1996 | if (optname == TCP_CONGESTION) { | 1996 | if (optname == TCP_CONGESTION) { |
1997 | char name[TCP_CA_NAME_MAX]; | 1997 | char name[TCP_CA_NAME_MAX]; |
1998 | 1998 | ||
1999 | if (optlen < 1) | 1999 | if (optlen < 1) |
2000 | return -EINVAL; | 2000 | return -EINVAL; |
2001 | 2001 | ||
2002 | val = strncpy_from_user(name, optval, | 2002 | val = strncpy_from_user(name, optval, |
2003 | min(TCP_CA_NAME_MAX-1, optlen)); | 2003 | min(TCP_CA_NAME_MAX-1, optlen)); |
2004 | if (val < 0) | 2004 | if (val < 0) |
2005 | return -EFAULT; | 2005 | return -EFAULT; |
2006 | name[val] = 0; | 2006 | name[val] = 0; |
2007 | 2007 | ||
2008 | lock_sock(sk); | 2008 | lock_sock(sk); |
2009 | err = tcp_set_congestion_control(sk, name); | 2009 | err = tcp_set_congestion_control(sk, name); |
2010 | release_sock(sk); | 2010 | release_sock(sk); |
2011 | return err; | 2011 | return err; |
2012 | } | 2012 | } |
2013 | 2013 | ||
2014 | if (optlen < sizeof(int)) | 2014 | if (optlen < sizeof(int)) |
2015 | return -EINVAL; | 2015 | return -EINVAL; |
2016 | 2016 | ||
2017 | if (get_user(val, (int __user *)optval)) | 2017 | if (get_user(val, (int __user *)optval)) |
2018 | return -EFAULT; | 2018 | return -EFAULT; |
2019 | 2019 | ||
2020 | lock_sock(sk); | 2020 | lock_sock(sk); |
2021 | 2021 | ||
2022 | switch (optname) { | 2022 | switch (optname) { |
2023 | case TCP_MAXSEG: | 2023 | case TCP_MAXSEG: |
2024 | /* Values greater than interface MTU won't take effect. However | 2024 | /* Values greater than interface MTU won't take effect. However |
2025 | * at the point when this call is done we typically don't yet | 2025 | * at the point when this call is done we typically don't yet |
2026 | * know which interface is going to be used */ | 2026 | * know which interface is going to be used */ |
2027 | if (val < 8 || val > MAX_TCP_WINDOW) { | 2027 | if (val < 8 || val > MAX_TCP_WINDOW) { |
2028 | err = -EINVAL; | 2028 | err = -EINVAL; |
2029 | break; | 2029 | break; |
2030 | } | 2030 | } |
2031 | tp->rx_opt.user_mss = val; | 2031 | tp->rx_opt.user_mss = val; |
2032 | break; | 2032 | break; |
2033 | 2033 | ||
2034 | case TCP_NODELAY: | 2034 | case TCP_NODELAY: |
2035 | if (val) { | 2035 | if (val) { |
2036 | /* TCP_NODELAY is weaker than TCP_CORK, so that | 2036 | /* TCP_NODELAY is weaker than TCP_CORK, so that |
2037 | * this option on corked socket is remembered, but | 2037 | * this option on corked socket is remembered, but |
2038 | * it is not activated until cork is cleared. | 2038 | * it is not activated until cork is cleared. |
2039 | * | 2039 | * |
2040 | * However, when TCP_NODELAY is set we make | 2040 | * However, when TCP_NODELAY is set we make |
2041 | * an explicit push, which overrides even TCP_CORK | 2041 | * an explicit push, which overrides even TCP_CORK |
2042 | * for currently queued segments. | 2042 | * for currently queued segments. |
2043 | */ | 2043 | */ |
2044 | tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; | 2044 | tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; |
2045 | tcp_push_pending_frames(sk); | 2045 | tcp_push_pending_frames(sk); |
2046 | } else { | 2046 | } else { |
2047 | tp->nonagle &= ~TCP_NAGLE_OFF; | 2047 | tp->nonagle &= ~TCP_NAGLE_OFF; |
2048 | } | 2048 | } |
2049 | break; | 2049 | break; |
2050 | 2050 | ||
2051 | case TCP_CORK: | 2051 | case TCP_CORK: |
2052 | /* When set indicates to always queue non-full frames. | 2052 | /* When set indicates to always queue non-full frames. |
2053 | * Later the user clears this option and we transmit | 2053 | * Later the user clears this option and we transmit |
2054 | * any pending partial frames in the queue. This is | 2054 | * any pending partial frames in the queue. This is |
2055 | * meant to be used alongside sendfile() to get properly | 2055 | * meant to be used alongside sendfile() to get properly |
2056 | * filled frames when the user (for example) must write | 2056 | * filled frames when the user (for example) must write |
2057 | * out headers with a write() call first and then use | 2057 | * out headers with a write() call first and then use |
2058 | * sendfile to send out the data parts. | 2058 | * sendfile to send out the data parts. |
2059 | * | 2059 | * |
2060 | * TCP_CORK can be set together with TCP_NODELAY and it is | 2060 | * TCP_CORK can be set together with TCP_NODELAY and it is |
2061 | * stronger than TCP_NODELAY. | 2061 | * stronger than TCP_NODELAY. |
2062 | */ | 2062 | */ |
2063 | if (val) { | 2063 | if (val) { |
2064 | tp->nonagle |= TCP_NAGLE_CORK; | 2064 | tp->nonagle |= TCP_NAGLE_CORK; |
2065 | } else { | 2065 | } else { |
2066 | tp->nonagle &= ~TCP_NAGLE_CORK; | 2066 | tp->nonagle &= ~TCP_NAGLE_CORK; |
2067 | if (tp->nonagle&TCP_NAGLE_OFF) | 2067 | if (tp->nonagle&TCP_NAGLE_OFF) |
2068 | tp->nonagle |= TCP_NAGLE_PUSH; | 2068 | tp->nonagle |= TCP_NAGLE_PUSH; |
2069 | tcp_push_pending_frames(sk); | 2069 | tcp_push_pending_frames(sk); |
2070 | } | 2070 | } |
2071 | break; | 2071 | break; |
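A userspace sketch of the header-then-payload pattern the TCP_CORK comment describes: cork the socket, write the header, hand the body to sendfile(), then uncork to flush the final partial frame. Error handling is omitted for brevity.

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/sendfile.h>
	#include <sys/socket.h>
	#include <unistd.h>

	void send_header_and_file(int sock, const char *hdr, size_t hdrlen,
				  int filefd, size_t filelen)
	{
		int on = 1, off = 0;

		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		write(sock, hdr, hdrlen);               /* queued, possibly a partial frame */
		sendfile(sock, filefd, NULL, filelen);  /* body follows in full frames */
		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));  /* flush */
	}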
2072 | 2072 | ||
2073 | case TCP_KEEPIDLE: | 2073 | case TCP_KEEPIDLE: |
2074 | if (val < 1 || val > MAX_TCP_KEEPIDLE) | 2074 | if (val < 1 || val > MAX_TCP_KEEPIDLE) |
2075 | err = -EINVAL; | 2075 | err = -EINVAL; |
2076 | else { | 2076 | else { |
2077 | tp->keepalive_time = val * HZ; | 2077 | tp->keepalive_time = val * HZ; |
2078 | if (sock_flag(sk, SOCK_KEEPOPEN) && | 2078 | if (sock_flag(sk, SOCK_KEEPOPEN) && |
2079 | !((1 << sk->sk_state) & | 2079 | !((1 << sk->sk_state) & |
2080 | (TCPF_CLOSE | TCPF_LISTEN))) { | 2080 | (TCPF_CLOSE | TCPF_LISTEN))) { |
2081 | __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; | 2081 | __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; |
2082 | if (tp->keepalive_time > elapsed) | 2082 | if (tp->keepalive_time > elapsed) |
2083 | elapsed = tp->keepalive_time - elapsed; | 2083 | elapsed = tp->keepalive_time - elapsed; |
2084 | else | 2084 | else |
2085 | elapsed = 0; | 2085 | elapsed = 0; |
2086 | inet_csk_reset_keepalive_timer(sk, elapsed); | 2086 | inet_csk_reset_keepalive_timer(sk, elapsed); |
2087 | } | 2087 | } |
2088 | } | 2088 | } |
2089 | break; | 2089 | break; |
2090 | case TCP_KEEPINTVL: | 2090 | case TCP_KEEPINTVL: |
2091 | if (val < 1 || val > MAX_TCP_KEEPINTVL) | 2091 | if (val < 1 || val > MAX_TCP_KEEPINTVL) |
2092 | err = -EINVAL; | 2092 | err = -EINVAL; |
2093 | else | 2093 | else |
2094 | tp->keepalive_intvl = val * HZ; | 2094 | tp->keepalive_intvl = val * HZ; |
2095 | break; | 2095 | break; |
2096 | case TCP_KEEPCNT: | 2096 | case TCP_KEEPCNT: |
2097 | if (val < 1 || val > MAX_TCP_KEEPCNT) | 2097 | if (val < 1 || val > MAX_TCP_KEEPCNT) |
2098 | err = -EINVAL; | 2098 | err = -EINVAL; |
2099 | else | 2099 | else |
2100 | tp->keepalive_probes = val; | 2100 | tp->keepalive_probes = val; |
2101 | break; | 2101 | break; |
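A userspace sketch mapping the three keepalive knobs handled above onto a connected socket: idle time before probing starts, interval between probes, and the number of unanswered probes before the connection is dropped. The numeric values are only examples.

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	int enable_keepalive(int sock)
	{
		int on = 1, idle = 60, intvl = 10, cnt = 5;

		if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0 ||
		    setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0 ||
		    setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0 ||
		    setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0)
			return -1;
		return 0;
	}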
2102 | case TCP_SYNCNT: | 2102 | case TCP_SYNCNT: |
2103 | if (val < 1 || val > MAX_TCP_SYNCNT) | 2103 | if (val < 1 || val > MAX_TCP_SYNCNT) |
2104 | err = -EINVAL; | 2104 | err = -EINVAL; |
2105 | else | 2105 | else |
2106 | icsk->icsk_syn_retries = val; | 2106 | icsk->icsk_syn_retries = val; |
2107 | break; | 2107 | break; |
2108 | 2108 | ||
2109 | case TCP_LINGER2: | 2109 | case TCP_LINGER2: |
2110 | if (val < 0) | 2110 | if (val < 0) |
2111 | tp->linger2 = -1; | 2111 | tp->linger2 = -1; |
2112 | else if (val > sysctl_tcp_fin_timeout / HZ) | 2112 | else if (val > sysctl_tcp_fin_timeout / HZ) |
2113 | tp->linger2 = 0; | 2113 | tp->linger2 = 0; |
2114 | else | 2114 | else |
2115 | tp->linger2 = val * HZ; | 2115 | tp->linger2 = val * HZ; |
2116 | break; | 2116 | break; |
2117 | 2117 | ||
2118 | case TCP_DEFER_ACCEPT: | 2118 | case TCP_DEFER_ACCEPT: |
2119 | icsk->icsk_accept_queue.rskq_defer_accept = 0; | 2119 | icsk->icsk_accept_queue.rskq_defer_accept = 0; |
2120 | if (val > 0) { | 2120 | if (val > 0) { |
2121 | /* Translate value in seconds to number of | 2121 | /* Translate value in seconds to number of |
2122 | * retransmits */ | 2122 | * retransmits */ |
2123 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && | 2123 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && |
2124 | val > ((TCP_TIMEOUT_INIT / HZ) << | 2124 | val > ((TCP_TIMEOUT_INIT / HZ) << |
2125 | icsk->icsk_accept_queue.rskq_defer_accept)) | 2125 | icsk->icsk_accept_queue.rskq_defer_accept)) |
2126 | icsk->icsk_accept_queue.rskq_defer_accept++; | 2126 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2127 | icsk->icsk_accept_queue.rskq_defer_accept++; | 2127 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2128 | } | 2128 | } |
2129 | break; | 2129 | break; |
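The TCP_DEFER_ACCEPT case above converts a user-supplied timeout in seconds into a retransmission count, since each SYN-ACK retransmit doubles the timeout. A standalone sketch of that translation, assuming an initial retransmission timeout of 3 seconds (the TCP_TIMEOUT_INIT / HZ of kernels of this era; illustrative here):

	#include <stdio.h>

	static int defer_accept_retrans(int seconds)
	{
		int init_timeout = 3;   /* assumed TCP_TIMEOUT_INIT / HZ */
		int retrans = 0;

		while (retrans < 32 && seconds > (init_timeout << retrans))
			retrans++;
		return retrans + 1;
	}

	int main(void)
	{
		printf("10s  -> %d retransmits\n", defer_accept_retrans(10));
		printf("120s -> %d retransmits\n", defer_accept_retrans(120));
		return 0;
	}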
2130 | 2130 | ||
2131 | case TCP_WINDOW_CLAMP: | 2131 | case TCP_WINDOW_CLAMP: |
2132 | if (!val) { | 2132 | if (!val) { |
2133 | if (sk->sk_state != TCP_CLOSE) { | 2133 | if (sk->sk_state != TCP_CLOSE) { |
2134 | err = -EINVAL; | 2134 | err = -EINVAL; |
2135 | break; | 2135 | break; |
2136 | } | 2136 | } |
2137 | tp->window_clamp = 0; | 2137 | tp->window_clamp = 0; |
2138 | } else | 2138 | } else |
2139 | tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? | 2139 | tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? |
2140 | SOCK_MIN_RCVBUF / 2 : val; | 2140 | SOCK_MIN_RCVBUF / 2 : val; |
2141 | break; | 2141 | break; |
2142 | 2142 | ||
2143 | case TCP_QUICKACK: | 2143 | case TCP_QUICKACK: |
2144 | if (!val) { | 2144 | if (!val) { |
2145 | icsk->icsk_ack.pingpong = 1; | 2145 | icsk->icsk_ack.pingpong = 1; |
2146 | } else { | 2146 | } else { |
2147 | icsk->icsk_ack.pingpong = 0; | 2147 | icsk->icsk_ack.pingpong = 0; |
2148 | if ((1 << sk->sk_state) & | 2148 | if ((1 << sk->sk_state) & |
2149 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && | 2149 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
2150 | inet_csk_ack_scheduled(sk)) { | 2150 | inet_csk_ack_scheduled(sk)) { |
2151 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; | 2151 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
2152 | tcp_cleanup_rbuf(sk, 1); | 2152 | tcp_cleanup_rbuf(sk, 1); |
2153 | if (!(val & 1)) | 2153 | if (!(val & 1)) |
2154 | icsk->icsk_ack.pingpong = 1; | 2154 | icsk->icsk_ack.pingpong = 1; |
2155 | } | 2155 | } |
2156 | } | 2156 | } |
2157 | break; | 2157 | break; |
2158 | 2158 | ||
2159 | #ifdef CONFIG_TCP_MD5SIG | 2159 | #ifdef CONFIG_TCP_MD5SIG |
2160 | case TCP_MD5SIG: | 2160 | case TCP_MD5SIG: |
2161 | /* Read the IP->Key mappings from userspace */ | 2161 | /* Read the IP->Key mappings from userspace */ |
2162 | err = tp->af_specific->md5_parse(sk, optval, optlen); | 2162 | err = tp->af_specific->md5_parse(sk, optval, optlen); |
2163 | break; | 2163 | break; |
2164 | #endif | 2164 | #endif |
2165 | 2165 | ||
2166 | default: | 2166 | default: |
2167 | err = -ENOPROTOOPT; | 2167 | err = -ENOPROTOOPT; |
2168 | break; | 2168 | break; |
2169 | } | 2169 | } |
2170 | 2170 | ||
2171 | release_sock(sk); | 2171 | release_sock(sk); |
2172 | return err; | 2172 | return err; |
2173 | } | 2173 | } |
2174 | 2174 | ||
2175 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | 2175 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, |
2176 | int optlen) | 2176 | int optlen) |
2177 | { | 2177 | { |
2178 | struct inet_connection_sock *icsk = inet_csk(sk); | 2178 | struct inet_connection_sock *icsk = inet_csk(sk); |
2179 | 2179 | ||
2180 | if (level != SOL_TCP) | 2180 | if (level != SOL_TCP) |
2181 | return icsk->icsk_af_ops->setsockopt(sk, level, optname, | 2181 | return icsk->icsk_af_ops->setsockopt(sk, level, optname, |
2182 | optval, optlen); | 2182 | optval, optlen); |
2183 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); | 2183 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); |
2184 | } | 2184 | } |
2185 | 2185 | ||
2186 | #ifdef CONFIG_COMPAT | 2186 | #ifdef CONFIG_COMPAT |
2187 | int compat_tcp_setsockopt(struct sock *sk, int level, int optname, | 2187 | int compat_tcp_setsockopt(struct sock *sk, int level, int optname, |
2188 | char __user *optval, int optlen) | 2188 | char __user *optval, int optlen) |
2189 | { | 2189 | { |
2190 | if (level != SOL_TCP) | 2190 | if (level != SOL_TCP) |
2191 | return inet_csk_compat_setsockopt(sk, level, optname, | 2191 | return inet_csk_compat_setsockopt(sk, level, optname, |
2192 | optval, optlen); | 2192 | optval, optlen); |
2193 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); | 2193 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); |
2194 | } | 2194 | } |
2195 | 2195 | ||
2196 | EXPORT_SYMBOL(compat_tcp_setsockopt); | 2196 | EXPORT_SYMBOL(compat_tcp_setsockopt); |
2197 | #endif | 2197 | #endif |
2198 | 2198 | ||
2199 | /* Return information about state of tcp endpoint in API format. */ | 2199 | /* Return information about state of tcp endpoint in API format. */ |
2200 | void tcp_get_info(struct sock *sk, struct tcp_info *info) | 2200 | void tcp_get_info(struct sock *sk, struct tcp_info *info) |
2201 | { | 2201 | { |
2202 | struct tcp_sock *tp = tcp_sk(sk); | 2202 | struct tcp_sock *tp = tcp_sk(sk); |
2203 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2203 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2204 | u32 now = tcp_time_stamp; | 2204 | u32 now = tcp_time_stamp; |
2205 | 2205 | ||
2206 | memset(info, 0, sizeof(*info)); | 2206 | memset(info, 0, sizeof(*info)); |
2207 | 2207 | ||
2208 | info->tcpi_state = sk->sk_state; | 2208 | info->tcpi_state = sk->sk_state; |
2209 | info->tcpi_ca_state = icsk->icsk_ca_state; | 2209 | info->tcpi_ca_state = icsk->icsk_ca_state; |
2210 | info->tcpi_retransmits = icsk->icsk_retransmits; | 2210 | info->tcpi_retransmits = icsk->icsk_retransmits; |
2211 | info->tcpi_probes = icsk->icsk_probes_out; | 2211 | info->tcpi_probes = icsk->icsk_probes_out; |
2212 | info->tcpi_backoff = icsk->icsk_backoff; | 2212 | info->tcpi_backoff = icsk->icsk_backoff; |
2213 | 2213 | ||
2214 | if (tp->rx_opt.tstamp_ok) | 2214 | if (tp->rx_opt.tstamp_ok) |
2215 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | 2215 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; |
2216 | if (tcp_is_sack(tp)) | 2216 | if (tcp_is_sack(tp)) |
2217 | info->tcpi_options |= TCPI_OPT_SACK; | 2217 | info->tcpi_options |= TCPI_OPT_SACK; |
2218 | if (tp->rx_opt.wscale_ok) { | 2218 | if (tp->rx_opt.wscale_ok) { |
2219 | info->tcpi_options |= TCPI_OPT_WSCALE; | 2219 | info->tcpi_options |= TCPI_OPT_WSCALE; |
2220 | info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; | 2220 | info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; |
2221 | info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; | 2221 | info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; |
2222 | } | 2222 | } |
2223 | 2223 | ||
2224 | if (tp->ecn_flags&TCP_ECN_OK) | 2224 | if (tp->ecn_flags&TCP_ECN_OK) |
2225 | info->tcpi_options |= TCPI_OPT_ECN; | 2225 | info->tcpi_options |= TCPI_OPT_ECN; |
2226 | 2226 | ||
2227 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); | 2227 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); |
2228 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); | 2228 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); |
2229 | info->tcpi_snd_mss = tp->mss_cache; | 2229 | info->tcpi_snd_mss = tp->mss_cache; |
2230 | info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; | 2230 | info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; |
2231 | 2231 | ||
2232 | if (sk->sk_state == TCP_LISTEN) { | 2232 | if (sk->sk_state == TCP_LISTEN) { |
2233 | info->tcpi_unacked = sk->sk_ack_backlog; | 2233 | info->tcpi_unacked = sk->sk_ack_backlog; |
2234 | info->tcpi_sacked = sk->sk_max_ack_backlog; | 2234 | info->tcpi_sacked = sk->sk_max_ack_backlog; |
2235 | } else { | 2235 | } else { |
2236 | info->tcpi_unacked = tp->packets_out; | 2236 | info->tcpi_unacked = tp->packets_out; |
2237 | info->tcpi_sacked = tp->sacked_out; | 2237 | info->tcpi_sacked = tp->sacked_out; |
2238 | } | 2238 | } |
2239 | info->tcpi_lost = tp->lost_out; | 2239 | info->tcpi_lost = tp->lost_out; |
2240 | info->tcpi_retrans = tp->retrans_out; | 2240 | info->tcpi_retrans = tp->retrans_out; |
2241 | info->tcpi_fackets = tp->fackets_out; | 2241 | info->tcpi_fackets = tp->fackets_out; |
2242 | 2242 | ||
2243 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); | 2243 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); |
2244 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); | 2244 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); |
2245 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); | 2245 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); |
2246 | 2246 | ||
2247 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; | 2247 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; |
2248 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; | 2248 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; |
2249 | info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; | 2249 | info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; |
2250 | info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; | 2250 | info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; |
2251 | info->tcpi_snd_ssthresh = tp->snd_ssthresh; | 2251 | info->tcpi_snd_ssthresh = tp->snd_ssthresh; |
2252 | info->tcpi_snd_cwnd = tp->snd_cwnd; | 2252 | info->tcpi_snd_cwnd = tp->snd_cwnd; |
2253 | info->tcpi_advmss = tp->advmss; | 2253 | info->tcpi_advmss = tp->advmss; |
2254 | info->tcpi_reordering = tp->reordering; | 2254 | info->tcpi_reordering = tp->reordering; |
2255 | 2255 | ||
2256 | info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; | 2256 | info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; |
2257 | info->tcpi_rcv_space = tp->rcvq_space.space; | 2257 | info->tcpi_rcv_space = tp->rcvq_space.space; |
2258 | 2258 | ||
2259 | info->tcpi_total_retrans = tp->total_retrans; | 2259 | info->tcpi_total_retrans = tp->total_retrans; |
2260 | } | 2260 | } |
2261 | 2261 | ||
2262 | EXPORT_SYMBOL_GPL(tcp_get_info); | 2262 | EXPORT_SYMBOL_GPL(tcp_get_info); |
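tcp_get_info() is what backs the TCP_INFO getsockopt handled further down; a userspace sketch of reading it, using the struct tcp_info that glibc exposes via <netinet/tcp.h> (error handling trimmed):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>
	#include <stdio.h>

	void dump_tcp_info(int sock)
	{
		struct tcp_info info;
		socklen_t len = sizeof(info);

		if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
			printf("rtt=%uus rttvar=%uus snd_cwnd=%u retrans=%u\n",
			       info.tcpi_rtt, info.tcpi_rttvar,
			       info.tcpi_snd_cwnd, info.tcpi_total_retrans);
	}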
2263 | 2263 | ||
2264 | static int do_tcp_getsockopt(struct sock *sk, int level, | 2264 | static int do_tcp_getsockopt(struct sock *sk, int level, |
2265 | int optname, char __user *optval, int __user *optlen) | 2265 | int optname, char __user *optval, int __user *optlen) |
2266 | { | 2266 | { |
2267 | struct inet_connection_sock *icsk = inet_csk(sk); | 2267 | struct inet_connection_sock *icsk = inet_csk(sk); |
2268 | struct tcp_sock *tp = tcp_sk(sk); | 2268 | struct tcp_sock *tp = tcp_sk(sk); |
2269 | int val, len; | 2269 | int val, len; |
2270 | 2270 | ||
2271 | if (get_user(len, optlen)) | 2271 | if (get_user(len, optlen)) |
2272 | return -EFAULT; | 2272 | return -EFAULT; |
2273 | 2273 | ||
2274 | len = min_t(unsigned int, len, sizeof(int)); | 2274 | len = min_t(unsigned int, len, sizeof(int)); |
2275 | 2275 | ||
2276 | if (len < 0) | 2276 | if (len < 0) |
2277 | return -EINVAL; | 2277 | return -EINVAL; |
2278 | 2278 | ||
2279 | switch (optname) { | 2279 | switch (optname) { |
2280 | case TCP_MAXSEG: | 2280 | case TCP_MAXSEG: |
2281 | val = tp->mss_cache; | 2281 | val = tp->mss_cache; |
2282 | if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) | 2282 | if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) |
2283 | val = tp->rx_opt.user_mss; | 2283 | val = tp->rx_opt.user_mss; |
2284 | break; | 2284 | break; |
2285 | case TCP_NODELAY: | 2285 | case TCP_NODELAY: |
2286 | val = !!(tp->nonagle&TCP_NAGLE_OFF); | 2286 | val = !!(tp->nonagle&TCP_NAGLE_OFF); |
2287 | break; | 2287 | break; |
2288 | case TCP_CORK: | 2288 | case TCP_CORK: |
2289 | val = !!(tp->nonagle&TCP_NAGLE_CORK); | 2289 | val = !!(tp->nonagle&TCP_NAGLE_CORK); |
2290 | break; | 2290 | break; |
2291 | case TCP_KEEPIDLE: | 2291 | case TCP_KEEPIDLE: |
2292 | val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; | 2292 | val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; |
2293 | break; | 2293 | break; |
2294 | case TCP_KEEPINTVL: | 2294 | case TCP_KEEPINTVL: |
2295 | val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; | 2295 | val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; |
2296 | break; | 2296 | break; |
2297 | case TCP_KEEPCNT: | 2297 | case TCP_KEEPCNT: |
2298 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; | 2298 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; |
2299 | break; | 2299 | break; |
2300 | case TCP_SYNCNT: | 2300 | case TCP_SYNCNT: |
2301 | val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 2301 | val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
2302 | break; | 2302 | break; |
2303 | case TCP_LINGER2: | 2303 | case TCP_LINGER2: |
2304 | val = tp->linger2; | 2304 | val = tp->linger2; |
2305 | if (val >= 0) | 2305 | if (val >= 0) |
2306 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; | 2306 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; |
2307 | break; | 2307 | break; |
2308 | case TCP_DEFER_ACCEPT: | 2308 | case TCP_DEFER_ACCEPT: |
2309 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : | 2309 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : |
2310 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); | 2310 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); |
2311 | break; | 2311 | break; |
2312 | case TCP_WINDOW_CLAMP: | 2312 | case TCP_WINDOW_CLAMP: |
2313 | val = tp->window_clamp; | 2313 | val = tp->window_clamp; |
2314 | break; | 2314 | break; |
2315 | case TCP_INFO: { | 2315 | case TCP_INFO: { |
2316 | struct tcp_info info; | 2316 | struct tcp_info info; |
2317 | 2317 | ||
2318 | if (get_user(len, optlen)) | 2318 | if (get_user(len, optlen)) |
2319 | return -EFAULT; | 2319 | return -EFAULT; |
2320 | 2320 | ||
2321 | tcp_get_info(sk, &info); | 2321 | tcp_get_info(sk, &info); |
2322 | 2322 | ||
2323 | len = min_t(unsigned int, len, sizeof(info)); | 2323 | len = min_t(unsigned int, len, sizeof(info)); |
2324 | if (put_user(len, optlen)) | 2324 | if (put_user(len, optlen)) |
2325 | return -EFAULT; | 2325 | return -EFAULT; |
2326 | if (copy_to_user(optval, &info, len)) | 2326 | if (copy_to_user(optval, &info, len)) |
2327 | return -EFAULT; | 2327 | return -EFAULT; |
2328 | return 0; | 2328 | return 0; |
2329 | } | 2329 | } |
2330 | case TCP_QUICKACK: | 2330 | case TCP_QUICKACK: |
2331 | val = !icsk->icsk_ack.pingpong; | 2331 | val = !icsk->icsk_ack.pingpong; |
2332 | break; | 2332 | break; |
2333 | 2333 | ||
2334 | case TCP_CONGESTION: | 2334 | case TCP_CONGESTION: |
2335 | if (get_user(len, optlen)) | 2335 | if (get_user(len, optlen)) |
2336 | return -EFAULT; | 2336 | return -EFAULT; |
2337 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); | 2337 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); |
2338 | if (put_user(len, optlen)) | 2338 | if (put_user(len, optlen)) |
2339 | return -EFAULT; | 2339 | return -EFAULT; |
2340 | if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) | 2340 | if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) |
2341 | return -EFAULT; | 2341 | return -EFAULT; |
2342 | return 0; | 2342 | return 0; |
2343 | default: | 2343 | default: |
2344 | return -ENOPROTOOPT; | 2344 | return -ENOPROTOOPT; |
2345 | } | 2345 | } |
2346 | 2346 | ||
2347 | if (put_user(len, optlen)) | 2347 | if (put_user(len, optlen)) |
2348 | return -EFAULT; | 2348 | return -EFAULT; |
2349 | if (copy_to_user(optval, &val, len)) | 2349 | if (copy_to_user(optval, &val, len)) |
2350 | return -EFAULT; | 2350 | return -EFAULT; |
2351 | return 0; | 2351 | return 0; |
2352 | } | 2352 | } |
2353 | 2353 | ||
2354 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | 2354 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, |
2355 | int __user *optlen) | 2355 | int __user *optlen) |
2356 | { | 2356 | { |
2357 | struct inet_connection_sock *icsk = inet_csk(sk); | 2357 | struct inet_connection_sock *icsk = inet_csk(sk); |
2358 | 2358 | ||
2359 | if (level != SOL_TCP) | 2359 | if (level != SOL_TCP) |
2360 | return icsk->icsk_af_ops->getsockopt(sk, level, optname, | 2360 | return icsk->icsk_af_ops->getsockopt(sk, level, optname, |
2361 | optval, optlen); | 2361 | optval, optlen); |
2362 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); | 2362 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); |
2363 | } | 2363 | } |
2364 | 2364 | ||
2365 | #ifdef CONFIG_COMPAT | 2365 | #ifdef CONFIG_COMPAT |
2366 | int compat_tcp_getsockopt(struct sock *sk, int level, int optname, | 2366 | int compat_tcp_getsockopt(struct sock *sk, int level, int optname, |
2367 | char __user *optval, int __user *optlen) | 2367 | char __user *optval, int __user *optlen) |
2368 | { | 2368 | { |
2369 | if (level != SOL_TCP) | 2369 | if (level != SOL_TCP) |
2370 | return inet_csk_compat_getsockopt(sk, level, optname, | 2370 | return inet_csk_compat_getsockopt(sk, level, optname, |
2371 | optval, optlen); | 2371 | optval, optlen); |
2372 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); | 2372 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); |
2373 | } | 2373 | } |
2374 | 2374 | ||
2375 | EXPORT_SYMBOL(compat_tcp_getsockopt); | 2375 | EXPORT_SYMBOL(compat_tcp_getsockopt); |
2376 | #endif | 2376 | #endif |
2377 | 2377 | ||
2378 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) | 2378 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) |
2379 | { | 2379 | { |
2380 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 2380 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
2381 | struct tcphdr *th; | 2381 | struct tcphdr *th; |
2382 | unsigned thlen; | 2382 | unsigned thlen; |
2383 | unsigned int seq; | 2383 | unsigned int seq; |
2384 | __be32 delta; | 2384 | __be32 delta; |
2385 | unsigned int oldlen; | 2385 | unsigned int oldlen; |
2386 | unsigned int len; | 2386 | unsigned int len; |
2387 | 2387 | ||
2388 | if (!pskb_may_pull(skb, sizeof(*th))) | 2388 | if (!pskb_may_pull(skb, sizeof(*th))) |
2389 | goto out; | 2389 | goto out; |
2390 | 2390 | ||
2391 | th = tcp_hdr(skb); | 2391 | th = tcp_hdr(skb); |
2392 | thlen = th->doff * 4; | 2392 | thlen = th->doff * 4; |
2393 | if (thlen < sizeof(*th)) | 2393 | if (thlen < sizeof(*th)) |
2394 | goto out; | 2394 | goto out; |
2395 | 2395 | ||
2396 | if (!pskb_may_pull(skb, thlen)) | 2396 | if (!pskb_may_pull(skb, thlen)) |
2397 | goto out; | 2397 | goto out; |
2398 | 2398 | ||
2399 | oldlen = (u16)~skb->len; | 2399 | oldlen = (u16)~skb->len; |
2400 | __skb_pull(skb, thlen); | 2400 | __skb_pull(skb, thlen); |
2401 | 2401 | ||
2402 | if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { | 2402 | if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { |
2403 | /* Packet is from an untrusted source, reset gso_segs. */ | 2403 | /* Packet is from an untrusted source, reset gso_segs. */ |
2404 | int type = skb_shinfo(skb)->gso_type; | 2404 | int type = skb_shinfo(skb)->gso_type; |
2405 | int mss; | 2405 | int mss; |
2406 | 2406 | ||
2407 | if (unlikely(type & | 2407 | if (unlikely(type & |
2408 | ~(SKB_GSO_TCPV4 | | 2408 | ~(SKB_GSO_TCPV4 | |
2409 | SKB_GSO_DODGY | | 2409 | SKB_GSO_DODGY | |
2410 | SKB_GSO_TCP_ECN | | 2410 | SKB_GSO_TCP_ECN | |
2411 | SKB_GSO_TCPV6 | | 2411 | SKB_GSO_TCPV6 | |
2412 | 0) || | 2412 | 0) || |
2413 | !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) | 2413 | !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) |
2414 | goto out; | 2414 | goto out; |
2415 | 2415 | ||
2416 | mss = skb_shinfo(skb)->gso_size; | 2416 | mss = skb_shinfo(skb)->gso_size; |
2417 | skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); | 2417 | skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); |
2418 | 2418 | ||
2419 | segs = NULL; | 2419 | segs = NULL; |
2420 | goto out; | 2420 | goto out; |
2421 | } | 2421 | } |
2422 | 2422 | ||
2423 | segs = skb_segment(skb, features); | 2423 | segs = skb_segment(skb, features); |
2424 | if (IS_ERR(segs)) | 2424 | if (IS_ERR(segs)) |
2425 | goto out; | 2425 | goto out; |
2426 | 2426 | ||
2427 | len = skb_shinfo(skb)->gso_size; | 2427 | len = skb_shinfo(skb)->gso_size; |
2428 | delta = htonl(oldlen + (thlen + len)); | 2428 | delta = htonl(oldlen + (thlen + len)); |
2429 | 2429 | ||
2430 | skb = segs; | 2430 | skb = segs; |
2431 | th = tcp_hdr(skb); | 2431 | th = tcp_hdr(skb); |
2432 | seq = ntohl(th->seq); | 2432 | seq = ntohl(th->seq); |
2433 | 2433 | ||
2434 | do { | 2434 | do { |
2435 | th->fin = th->psh = 0; | 2435 | th->fin = th->psh = 0; |
2436 | 2436 | ||
2437 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + | 2437 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + |
2438 | (__force u32)delta)); | 2438 | (__force u32)delta)); |
2439 | if (skb->ip_summed != CHECKSUM_PARTIAL) | 2439 | if (skb->ip_summed != CHECKSUM_PARTIAL) |
2440 | th->check = | 2440 | th->check = |
2441 | csum_fold(csum_partial(skb_transport_header(skb), | 2441 | csum_fold(csum_partial(skb_transport_header(skb), |
2442 | thlen, skb->csum)); | 2442 | thlen, skb->csum)); |
2443 | 2443 | ||
2444 | seq += len; | 2444 | seq += len; |
2445 | skb = skb->next; | 2445 | skb = skb->next; |
2446 | th = tcp_hdr(skb); | 2446 | th = tcp_hdr(skb); |
2447 | 2447 | ||
2448 | th->seq = htonl(seq); | 2448 | th->seq = htonl(seq); |
2449 | th->cwr = 0; | 2449 | th->cwr = 0; |
2450 | } while (skb->next); | 2450 | } while (skb->next); |
2451 | 2451 | ||
2452 | delta = htonl(oldlen + (skb->tail - skb->transport_header) + | 2452 | delta = htonl(oldlen + (skb->tail - skb->transport_header) + |
2453 | skb->data_len); | 2453 | skb->data_len); |
2454 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + | 2454 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + |
2455 | (__force u32)delta)); | 2455 | (__force u32)delta)); |
2456 | if (skb->ip_summed != CHECKSUM_PARTIAL) | 2456 | if (skb->ip_summed != CHECKSUM_PARTIAL) |
2457 | th->check = csum_fold(csum_partial(skb_transport_header(skb), | 2457 | th->check = csum_fold(csum_partial(skb_transport_header(skb), |
2458 | thlen, skb->csum)); | 2458 | thlen, skb->csum)); |
2459 | 2459 | ||
2460 | out: | 2460 | out: |
2461 | return segs; | 2461 | return segs; |
2462 | } | 2462 | } |
2463 | EXPORT_SYMBOL(tcp_tso_segment); | 2463 | EXPORT_SYMBOL(tcp_tso_segment); |
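tcp_tso_segment() never recomputes each segment's checksum over the payload; because the Internet checksum is one's-complement arithmetic, it folds in a delta that swaps the old covered length for the new one (the oldlen = (u16)~skb->len trick above). A standalone sketch of that fix-up; csum_fold16() and patch_len() are local helpers, not kernel APIs.

	#include <stdint.h>
	#include <stdio.h>

	static uint16_t csum_fold16(uint32_t sum)
	{
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)sum;
	}

	/* Patch a checksum so it covers new_len instead of old_len,
	 * without touching the rest of the covered data. */
	static uint16_t patch_len(uint16_t check, uint16_t old_len, uint16_t new_len)
	{
		uint32_t sum = (uint16_t)~check;   /* undo the final inversion */

		sum += (uint16_t)~old_len;         /* subtract the old length */
		sum += new_len;                    /* add the new length */
		return ~csum_fold16(sum);
	}

	int main(void)
	{
		/* toy demonstration: "checksum" over the length field alone */
		uint16_t old_len = 1400, new_len = 536;
		uint16_t check = ~csum_fold16(old_len);

		printf("patched: 0x%04x, recomputed: 0x%04x\n",
		       patch_len(check, old_len, new_len),
		       (uint16_t)~csum_fold16(new_len));
		return 0;
	}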
2464 | 2464 | ||
2465 | #ifdef CONFIG_TCP_MD5SIG | 2465 | #ifdef CONFIG_TCP_MD5SIG |
2466 | static unsigned long tcp_md5sig_users; | 2466 | static unsigned long tcp_md5sig_users; |
2467 | static struct tcp_md5sig_pool **tcp_md5sig_pool; | 2467 | static struct tcp_md5sig_pool **tcp_md5sig_pool; |
2468 | static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); | 2468 | static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); |
2469 | 2469 | ||
2470 | static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) | 2470 | static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) |
2471 | { | 2471 | { |
2472 | int cpu; | 2472 | int cpu; |
2473 | for_each_possible_cpu(cpu) { | 2473 | for_each_possible_cpu(cpu) { |
2474 | struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); | 2474 | struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); |
2475 | if (p) { | 2475 | if (p) { |
2476 | if (p->md5_desc.tfm) | 2476 | if (p->md5_desc.tfm) |
2477 | crypto_free_hash(p->md5_desc.tfm); | 2477 | crypto_free_hash(p->md5_desc.tfm); |
2478 | kfree(p); | 2478 | kfree(p); |
2479 | p = NULL; | 2479 | p = NULL; |
2480 | } | 2480 | } |
2481 | } | 2481 | } |
2482 | free_percpu(pool); | 2482 | free_percpu(pool); |
2483 | } | 2483 | } |
2484 | 2484 | ||
2485 | void tcp_free_md5sig_pool(void) | 2485 | void tcp_free_md5sig_pool(void) |
2486 | { | 2486 | { |
2487 | struct tcp_md5sig_pool **pool = NULL; | 2487 | struct tcp_md5sig_pool **pool = NULL; |
2488 | 2488 | ||
2489 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2489 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2490 | if (--tcp_md5sig_users == 0) { | 2490 | if (--tcp_md5sig_users == 0) { |
2491 | pool = tcp_md5sig_pool; | 2491 | pool = tcp_md5sig_pool; |
2492 | tcp_md5sig_pool = NULL; | 2492 | tcp_md5sig_pool = NULL; |
2493 | } | 2493 | } |
2494 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2494 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2495 | if (pool) | 2495 | if (pool) |
2496 | __tcp_free_md5sig_pool(pool); | 2496 | __tcp_free_md5sig_pool(pool); |
2497 | } | 2497 | } |
2498 | 2498 | ||
2499 | EXPORT_SYMBOL(tcp_free_md5sig_pool); | 2499 | EXPORT_SYMBOL(tcp_free_md5sig_pool); |
2500 | 2500 | ||
2501 | static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) | 2501 | static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) |
2502 | { | 2502 | { |
2503 | int cpu; | 2503 | int cpu; |
2504 | struct tcp_md5sig_pool **pool; | 2504 | struct tcp_md5sig_pool **pool; |
2505 | 2505 | ||
2506 | pool = alloc_percpu(struct tcp_md5sig_pool *); | 2506 | pool = alloc_percpu(struct tcp_md5sig_pool *); |
2507 | if (!pool) | 2507 | if (!pool) |
2508 | return NULL; | 2508 | return NULL; |
2509 | 2509 | ||
2510 | for_each_possible_cpu(cpu) { | 2510 | for_each_possible_cpu(cpu) { |
2511 | struct tcp_md5sig_pool *p; | 2511 | struct tcp_md5sig_pool *p; |
2512 | struct crypto_hash *hash; | 2512 | struct crypto_hash *hash; |
2513 | 2513 | ||
2514 | p = kzalloc(sizeof(*p), GFP_KERNEL); | 2514 | p = kzalloc(sizeof(*p), GFP_KERNEL); |
2515 | if (!p) | 2515 | if (!p) |
2516 | goto out_free; | 2516 | goto out_free; |
2517 | *per_cpu_ptr(pool, cpu) = p; | 2517 | *per_cpu_ptr(pool, cpu) = p; |
2518 | 2518 | ||
2519 | hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); | 2519 | hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); |
2520 | if (!hash || IS_ERR(hash)) | 2520 | if (!hash || IS_ERR(hash)) |
2521 | goto out_free; | 2521 | goto out_free; |
2522 | 2522 | ||
2523 | p->md5_desc.tfm = hash; | 2523 | p->md5_desc.tfm = hash; |
2524 | } | 2524 | } |
2525 | return pool; | 2525 | return pool; |
2526 | out_free: | 2526 | out_free: |
2527 | __tcp_free_md5sig_pool(pool); | 2527 | __tcp_free_md5sig_pool(pool); |
2528 | return NULL; | 2528 | return NULL; |
2529 | } | 2529 | } |
2530 | 2530 | ||
2531 | struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) | 2531 | struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) |
2532 | { | 2532 | { |
2533 | struct tcp_md5sig_pool **pool; | 2533 | struct tcp_md5sig_pool **pool; |
2534 | int alloc = 0; | 2534 | int alloc = 0; |
2535 | 2535 | ||
2536 | retry: | 2536 | retry: |
2537 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2537 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2538 | pool = tcp_md5sig_pool; | 2538 | pool = tcp_md5sig_pool; |
2539 | if (tcp_md5sig_users++ == 0) { | 2539 | if (tcp_md5sig_users++ == 0) { |
2540 | alloc = 1; | 2540 | alloc = 1; |
2541 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2541 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2542 | } else if (!pool) { | 2542 | } else if (!pool) { |
2543 | tcp_md5sig_users--; | 2543 | tcp_md5sig_users--; |
2544 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2544 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2545 | cpu_relax(); | 2545 | cpu_relax(); |
2546 | goto retry; | 2546 | goto retry; |
2547 | } else | 2547 | } else |
2548 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2548 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2549 | 2549 | ||
2550 | if (alloc) { | 2550 | if (alloc) { |
2551 | /* we cannot hold spinlock here because this may sleep. */ | 2551 | /* we cannot hold spinlock here because this may sleep. */ |
2552 | struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); | 2552 | struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); |
2553 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2553 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2554 | if (!p) { | 2554 | if (!p) { |
2555 | tcp_md5sig_users--; | 2555 | tcp_md5sig_users--; |
2556 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2556 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2557 | return NULL; | 2557 | return NULL; |
2558 | } | 2558 | } |
2559 | pool = tcp_md5sig_pool; | 2559 | pool = tcp_md5sig_pool; |
2560 | if (pool) { | 2560 | if (pool) { |
2561 | /* oops, it has already been assigned. */ | 2561 | /* oops, it has already been assigned. */ |
2562 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2562 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2563 | __tcp_free_md5sig_pool(p); | 2563 | __tcp_free_md5sig_pool(p); |
2564 | } else { | 2564 | } else { |
2565 | tcp_md5sig_pool = pool = p; | 2565 | tcp_md5sig_pool = pool = p; |
2566 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2566 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2567 | } | 2567 | } |
2568 | } | 2568 | } |
2569 | return pool; | 2569 | return pool; |
2570 | } | 2570 | } |
2571 | 2571 | ||
2572 | EXPORT_SYMBOL(tcp_alloc_md5sig_pool); | 2572 | EXPORT_SYMBOL(tcp_alloc_md5sig_pool); |
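tcp_alloc_md5sig_pool() has to drop the spinlock around the allocation because it may sleep, and must therefore cope with another CPU installing the pool in the meantime. A simplified pthread sketch of that shape (it omits the kernel's spin-and-retry for a concurrent first allocator); all names here are local.

	#include <pthread.h>
	#include <stdlib.h>

	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
	static void *shared_pool;
	static unsigned long pool_users;

	void *get_shared_pool(size_t size)
	{
		void *p;

		pthread_mutex_lock(&pool_lock);
		pool_users++;
		if (shared_pool) {                 /* fast path: already allocated */
			p = shared_pool;
			pthread_mutex_unlock(&pool_lock);
			return p;
		}
		pthread_mutex_unlock(&pool_lock);

		p = malloc(size);                  /* may sleep; done without the lock */

		pthread_mutex_lock(&pool_lock);
		if (!p) {
			pool_users--;              /* give back our reference on failure */
		} else if (shared_pool) {
			free(p);                   /* lost the race; use the winner's pool */
			p = shared_pool;
		} else {
			shared_pool = p;
		}
		pthread_mutex_unlock(&pool_lock);
		return p;
	}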
2573 | 2573 | ||
2574 | struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) | 2574 | struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) |
2575 | { | 2575 | { |
2576 | struct tcp_md5sig_pool **p; | 2576 | struct tcp_md5sig_pool **p; |
2577 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2577 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2578 | p = tcp_md5sig_pool; | 2578 | p = tcp_md5sig_pool; |
2579 | if (p) | 2579 | if (p) |
2580 | tcp_md5sig_users++; | 2580 | tcp_md5sig_users++; |
2581 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2581 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2582 | return (p ? *per_cpu_ptr(p, cpu) : NULL); | 2582 | return (p ? *per_cpu_ptr(p, cpu) : NULL); |
2583 | } | 2583 | } |
2584 | 2584 | ||
2585 | EXPORT_SYMBOL(__tcp_get_md5sig_pool); | 2585 | EXPORT_SYMBOL(__tcp_get_md5sig_pool); |
2586 | 2586 | ||
2587 | void __tcp_put_md5sig_pool(void) | 2587 | void __tcp_put_md5sig_pool(void) |
2588 | { | 2588 | { |
2589 | tcp_free_md5sig_pool(); | 2589 | tcp_free_md5sig_pool(); |
2590 | } | 2590 | } |
2591 | 2591 | ||
2592 | EXPORT_SYMBOL(__tcp_put_md5sig_pool); | 2592 | EXPORT_SYMBOL(__tcp_put_md5sig_pool); |
2593 | 2593 | ||
2594 | int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, | 2594 | int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, |
2595 | struct tcphdr *th) | 2595 | struct tcphdr *th) |
2596 | { | 2596 | { |
2597 | struct scatterlist sg; | 2597 | struct scatterlist sg; |
2598 | int err; | 2598 | int err; |
2599 | 2599 | ||
2600 | __sum16 old_checksum = th->check; | 2600 | __sum16 old_checksum = th->check; |
2601 | th->check = 0; | 2601 | th->check = 0; |
2602 | /* options aren't included in the hash */ | 2602 | /* options aren't included in the hash */ |
2603 | sg_init_one(&sg, th, sizeof(struct tcphdr)); | 2603 | sg_init_one(&sg, th, sizeof(struct tcphdr)); |
2604 | err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); | 2604 | err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); |
2605 | th->check = old_checksum; | 2605 | th->check = old_checksum; |
2606 | return err; | 2606 | return err; |
2607 | } | 2607 | } |
2608 | 2608 | ||
2609 | EXPORT_SYMBOL(tcp_md5_hash_header); | 2609 | EXPORT_SYMBOL(tcp_md5_hash_header); |
2610 | 2610 | ||
2611 | int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, | 2611 | int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, |
2612 | struct sk_buff *skb, unsigned header_len) | 2612 | struct sk_buff *skb, unsigned header_len) |
2613 | { | 2613 | { |
2614 | struct scatterlist sg; | 2614 | struct scatterlist sg; |
2615 | const struct tcphdr *tp = tcp_hdr(skb); | 2615 | const struct tcphdr *tp = tcp_hdr(skb); |
2616 | struct hash_desc *desc = &hp->md5_desc; | 2616 | struct hash_desc *desc = &hp->md5_desc; |
2617 | unsigned i; | 2617 | unsigned i; |
2618 | const unsigned head_data_len = skb_headlen(skb) > header_len ? | 2618 | const unsigned head_data_len = skb_headlen(skb) > header_len ? |
2619 | skb_headlen(skb) - header_len : 0; | 2619 | skb_headlen(skb) - header_len : 0; |
2620 | const struct skb_shared_info *shi = skb_shinfo(skb); | 2620 | const struct skb_shared_info *shi = skb_shinfo(skb); |
2621 | 2621 | ||
2622 | sg_init_table(&sg, 1); | 2622 | sg_init_table(&sg, 1); |
2623 | 2623 | ||
2624 | sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); | 2624 | sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); |
2625 | if (crypto_hash_update(desc, &sg, head_data_len)) | 2625 | if (crypto_hash_update(desc, &sg, head_data_len)) |
2626 | return 1; | 2626 | return 1; |
2627 | 2627 | ||
2628 | for (i = 0; i < shi->nr_frags; ++i) { | 2628 | for (i = 0; i < shi->nr_frags; ++i) { |
2629 | const struct skb_frag_struct *f = &shi->frags[i]; | 2629 | const struct skb_frag_struct *f = &shi->frags[i]; |
2630 | sg_set_page(&sg, f->page, f->size, f->page_offset); | 2630 | sg_set_page(&sg, f->page, f->size, f->page_offset); |
2631 | if (crypto_hash_update(desc, &sg, f->size)) | 2631 | if (crypto_hash_update(desc, &sg, f->size)) |
2632 | return 1; | 2632 | return 1; |
2633 | } | 2633 | } |
2634 | 2634 | ||
2635 | return 0; | 2635 | return 0; |
2636 | } | 2636 | } |
2637 | 2637 | ||
2638 | EXPORT_SYMBOL(tcp_md5_hash_skb_data); | 2638 | EXPORT_SYMBOL(tcp_md5_hash_skb_data); |
2639 | 2639 | ||
2640 | int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) | 2640 | int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) |
2641 | { | 2641 | { |
2642 | struct scatterlist sg; | 2642 | struct scatterlist sg; |
2643 | 2643 | ||
2644 | sg_init_one(&sg, key->key, key->keylen); | 2644 | sg_init_one(&sg, key->key, key->keylen); |
2645 | return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); | 2645 | return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); |
2646 | } | 2646 | } |
2647 | 2647 | ||
2648 | EXPORT_SYMBOL(tcp_md5_hash_key); | 2648 | EXPORT_SYMBOL(tcp_md5_hash_key); |
2649 | 2649 | ||
2650 | #endif | 2650 | #endif |
2651 | 2651 | ||
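The block above implements reference-counted, lazily allocated per-CPU MD5 pools: tcp_alloc_md5sig_pool() bumps tcp_md5sig_users under tcp_md5sig_pool_lock, drops the lock to do the sleeping allocation, then re-takes it and either installs the new pool or frees it if another caller won the race. A minimal userspace sketch of that idiom follows; the names are hypothetical and a pthread mutex stands in for the bh-disabling spinlock, so it illustrates the pattern rather than the kernel code itself.

/* Sketch of the "refcount under the lock, allocate outside it,
 * install-or-discard on re-lock" idiom used by tcp_alloc_md5sig_pool(). */
#include <pthread.h>
#include <stdlib.h>

struct pool { int dummy; };

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pool *global_pool;	/* plays the role of tcp_md5sig_pool */
static int pool_users;			/* plays the role of tcp_md5sig_users */

static struct pool *get_pool(void)
{
	struct pool *p = NULL;
	int need_alloc;

	pthread_mutex_lock(&pool_lock);
	pool_users++;
	need_alloc = (global_pool == NULL);	/* first user allocates */
	pthread_mutex_unlock(&pool_lock);

	if (!need_alloc)
		return global_pool;

	/* May sleep, so it must happen without the lock held. */
	p = calloc(1, sizeof(*p));

	pthread_mutex_lock(&pool_lock);
	if (!p) {
		pool_users--;			/* hand the reference back */
		pthread_mutex_unlock(&pool_lock);
		return NULL;
	}
	if (global_pool)
		free(p);			/* lost the race, discard ours */
	else
		global_pool = p;
	p = global_pool;
	pthread_mutex_unlock(&pool_lock);
	return p;
}

int main(void)
{
	return get_pool() ? 0 : 1;
}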
2652 | void tcp_done(struct sock *sk) | 2652 | void tcp_done(struct sock *sk) |
2653 | { | 2653 | { |
2654 | if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) | 2654 | if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) |
2655 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); | 2655 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
2656 | 2656 | ||
2657 | tcp_set_state(sk, TCP_CLOSE); | 2657 | tcp_set_state(sk, TCP_CLOSE); |
2658 | tcp_clear_xmit_timers(sk); | 2658 | tcp_clear_xmit_timers(sk); |
2659 | 2659 | ||
2660 | sk->sk_shutdown = SHUTDOWN_MASK; | 2660 | sk->sk_shutdown = SHUTDOWN_MASK; |
2661 | 2661 | ||
2662 | if (!sock_flag(sk, SOCK_DEAD)) | 2662 | if (!sock_flag(sk, SOCK_DEAD)) |
2663 | sk->sk_state_change(sk); | 2663 | sk->sk_state_change(sk); |
2664 | else | 2664 | else |
2665 | inet_csk_destroy_sock(sk); | 2665 | inet_csk_destroy_sock(sk); |
2666 | } | 2666 | } |
2667 | EXPORT_SYMBOL_GPL(tcp_done); | 2667 | EXPORT_SYMBOL_GPL(tcp_done); |
2668 | 2668 | ||
2669 | extern struct tcp_congestion_ops tcp_reno; | 2669 | extern struct tcp_congestion_ops tcp_reno; |
2670 | 2670 | ||
2671 | static __initdata unsigned long thash_entries; | 2671 | static __initdata unsigned long thash_entries; |
2672 | static int __init set_thash_entries(char *str) | 2672 | static int __init set_thash_entries(char *str) |
2673 | { | 2673 | { |
2674 | if (!str) | 2674 | if (!str) |
2675 | return 0; | 2675 | return 0; |
2676 | thash_entries = simple_strtoul(str, &str, 0); | 2676 | thash_entries = simple_strtoul(str, &str, 0); |
2677 | return 1; | 2677 | return 1; |
2678 | } | 2678 | } |
2679 | __setup("thash_entries=", set_thash_entries); | 2679 | __setup("thash_entries=", set_thash_entries); |
2680 | 2680 | ||
2681 | void __init tcp_init(void) | 2681 | void __init tcp_init(void) |
2682 | { | 2682 | { |
2683 | struct sk_buff *skb = NULL; | 2683 | struct sk_buff *skb = NULL; |
2684 | unsigned long nr_pages, limit; | 2684 | unsigned long nr_pages, limit; |
2685 | int order, i, max_share; | 2685 | int order, i, max_share; |
2686 | 2686 | ||
2687 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); | 2687 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); |
2688 | 2688 | ||
2689 | tcp_hashinfo.bind_bucket_cachep = | 2689 | tcp_hashinfo.bind_bucket_cachep = |
2690 | kmem_cache_create("tcp_bind_bucket", | 2690 | kmem_cache_create("tcp_bind_bucket", |
2691 | sizeof(struct inet_bind_bucket), 0, | 2691 | sizeof(struct inet_bind_bucket), 0, |
2692 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 2692 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
2693 | 2693 | ||
2694 | /* Size and allocate the main established and bind bucket | 2694 | /* Size and allocate the main established and bind bucket |
2695 | * hash tables. | 2695 | * hash tables. |
2696 | * | 2696 | * |
2697 | * The methodology is similar to that of the buffer cache. | 2697 | * The methodology is similar to that of the buffer cache. |
2698 | */ | 2698 | */ |
2699 | tcp_hashinfo.ehash = | 2699 | tcp_hashinfo.ehash = |
2700 | alloc_large_system_hash("TCP established", | 2700 | alloc_large_system_hash("TCP established", |
2701 | sizeof(struct inet_ehash_bucket), | 2701 | sizeof(struct inet_ehash_bucket), |
2702 | thash_entries, | 2702 | thash_entries, |
2703 | (num_physpages >= 128 * 1024) ? | 2703 | (num_physpages >= 128 * 1024) ? |
2704 | 13 : 15, | 2704 | 13 : 15, |
2705 | 0, | 2705 | 0, |
2706 | &tcp_hashinfo.ehash_size, | 2706 | &tcp_hashinfo.ehash_size, |
2707 | NULL, | 2707 | NULL, |
2708 | thash_entries ? 0 : 512 * 1024); | 2708 | thash_entries ? 0 : 512 * 1024); |
2709 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; | 2709 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; |
2710 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { | 2710 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { |
2711 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); | 2711 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); |
2712 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); | 2712 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); |
2713 | } | 2713 | } |
2714 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) | 2714 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) |
2715 | panic("TCP: failed to alloc ehash_locks"); | 2715 | panic("TCP: failed to alloc ehash_locks"); |
2716 | tcp_hashinfo.bhash = | 2716 | tcp_hashinfo.bhash = |
2717 | alloc_large_system_hash("TCP bind", | 2717 | alloc_large_system_hash("TCP bind", |
2718 | sizeof(struct inet_bind_hashbucket), | 2718 | sizeof(struct inet_bind_hashbucket), |
2719 | tcp_hashinfo.ehash_size, | 2719 | tcp_hashinfo.ehash_size, |
2720 | (num_physpages >= 128 * 1024) ? | 2720 | (num_physpages >= 128 * 1024) ? |
2721 | 13 : 15, | 2721 | 13 : 15, |
2722 | 0, | 2722 | 0, |
2723 | &tcp_hashinfo.bhash_size, | 2723 | &tcp_hashinfo.bhash_size, |
2724 | NULL, | 2724 | NULL, |
2725 | 64 * 1024); | 2725 | 64 * 1024); |
2726 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; | 2726 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; |
2727 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { | 2727 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { |
2728 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); | 2728 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); |
2729 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); | 2729 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); |
2730 | } | 2730 | } |
2731 | 2731 | ||
2732 | /* Try to be a bit smarter and adjust defaults depending | 2732 | /* Try to be a bit smarter and adjust defaults depending |
2733 | * on available memory. | 2733 | * on available memory. |
2734 | */ | 2734 | */ |
2735 | for (order = 0; ((1 << order) << PAGE_SHIFT) < | 2735 | for (order = 0; ((1 << order) << PAGE_SHIFT) < |
2736 | (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); | 2736 | (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); |
2737 | order++) | 2737 | order++) |
2738 | ; | 2738 | ; |
2739 | if (order >= 4) { | 2739 | if (order >= 4) { |
2740 | tcp_death_row.sysctl_max_tw_buckets = 180000; | 2740 | tcp_death_row.sysctl_max_tw_buckets = 180000; |
2741 | sysctl_tcp_max_orphans = 4096 << (order - 4); | 2741 | sysctl_tcp_max_orphans = 4096 << (order - 4); |
2742 | sysctl_max_syn_backlog = 1024; | 2742 | sysctl_max_syn_backlog = 1024; |
2743 | } else if (order < 3) { | 2743 | } else if (order < 3) { |
2744 | tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); | 2744 | tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); |
2745 | sysctl_tcp_max_orphans >>= (3 - order); | 2745 | sysctl_tcp_max_orphans >>= (3 - order); |
2746 | sysctl_max_syn_backlog = 128; | 2746 | sysctl_max_syn_backlog = 128; |
2747 | } | 2747 | } |
2748 | 2748 | ||
2749 | /* Set the pressure threshold to be a fraction of global memory that | 2749 | /* Set the pressure threshold to be a fraction of global memory that |
2750 | * is up to 1/2 at 256 MB, decreasing toward zero with the amount of | 2750 | * is up to 1/2 at 256 MB, decreasing toward zero with the amount of |
2751 | * memory, with a floor of 128 pages. | 2751 | * memory, with a floor of 128 pages. |
2752 | */ | 2752 | */ |
2753 | nr_pages = totalram_pages - totalhigh_pages; | 2753 | nr_pages = totalram_pages - totalhigh_pages; |
2754 | limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); | 2754 | limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); |
2755 | limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); | 2755 | limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); |
2756 | limit = max(limit, 128UL); | 2756 | limit = max(limit, 128UL); |
2757 | sysctl_tcp_mem[0] = limit / 4 * 3; | 2757 | sysctl_tcp_mem[0] = limit / 4 * 3; |
2758 | sysctl_tcp_mem[1] = limit; | 2758 | sysctl_tcp_mem[1] = limit; |
2759 | sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; | 2759 | sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; |
2760 | 2760 | ||
2761 | /* Set per-socket limits to no more than 1/128 the pressure threshold */ | 2761 | /* Set per-socket limits to no more than 1/128 the pressure threshold */ |
2762 | limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); | 2762 | limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); |
2763 | max_share = min(4UL*1024*1024, limit); | 2763 | max_share = min(4UL*1024*1024, limit); |
2764 | 2764 | ||
2765 | sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; | 2765 | sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; |
2766 | sysctl_tcp_wmem[1] = 16*1024; | 2766 | sysctl_tcp_wmem[1] = 16*1024; |
2767 | sysctl_tcp_wmem[2] = max(64*1024, max_share); | 2767 | sysctl_tcp_wmem[2] = max(64*1024, max_share); |
2768 | 2768 | ||
2769 | sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; | 2769 | sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; |
2770 | sysctl_tcp_rmem[1] = 87380; | 2770 | sysctl_tcp_rmem[1] = 87380; |
2771 | sysctl_tcp_rmem[2] = max(87380, max_share); | 2771 | sysctl_tcp_rmem[2] = max(87380, max_share); |
2772 | 2772 | ||
2773 | printk(KERN_INFO "TCP: Hash tables configured " | 2773 | printk(KERN_INFO "TCP: Hash tables configured " |
2774 | "(established %d bind %d)\n", | 2774 | "(established %d bind %d)\n", |
2775 | tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); | 2775 | tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); |
2776 | 2776 | ||
2777 | tcp_register_congestion_control(&tcp_reno); | 2777 | tcp_register_congestion_control(&tcp_reno); |
2778 | } | 2778 | } |
2779 | 2779 | ||
2780 | EXPORT_SYMBOL(tcp_close); | 2780 | EXPORT_SYMBOL(tcp_close); |
2781 | EXPORT_SYMBOL(tcp_disconnect); | 2781 | EXPORT_SYMBOL(tcp_disconnect); |
2782 | EXPORT_SYMBOL(tcp_getsockopt); | 2782 | EXPORT_SYMBOL(tcp_getsockopt); |
2783 | EXPORT_SYMBOL(tcp_ioctl); | 2783 | EXPORT_SYMBOL(tcp_ioctl); |
2784 | EXPORT_SYMBOL(tcp_poll); | 2784 | EXPORT_SYMBOL(tcp_poll); |
2785 | EXPORT_SYMBOL(tcp_read_sock); | 2785 | EXPORT_SYMBOL(tcp_read_sock); |
2786 | EXPORT_SYMBOL(tcp_recvmsg); | 2786 | EXPORT_SYMBOL(tcp_recvmsg); |
2787 | EXPORT_SYMBOL(tcp_sendmsg); | 2787 | EXPORT_SYMBOL(tcp_sendmsg); |
2788 | EXPORT_SYMBOL(tcp_splice_read); | 2788 | EXPORT_SYMBOL(tcp_splice_read); |
2789 | EXPORT_SYMBOL(tcp_sendpage); | 2789 | EXPORT_SYMBOL(tcp_sendpage); |
2790 | EXPORT_SYMBOL(tcp_setsockopt); | 2790 | EXPORT_SYMBOL(tcp_setsockopt); |
2791 | EXPORT_SYMBOL(tcp_shutdown); | 2791 | EXPORT_SYMBOL(tcp_shutdown); |
2792 | 2792 |
net/ipv4/tcp_minisocks.c
1 | /* | 1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket | 3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. | 4 | * interface as the means of communication with the user level. |
5 | * | 5 | * |
6 | * Implementation of the Transmission Control Protocol(TCP). | 6 | * Implementation of the Transmission Control Protocol(TCP). |
7 | * | 7 | * |
8 | * Authors: Ross Biro | 8 | * Authors: Ross Biro |
9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> | 10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> |
11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> | 11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> |
12 | * Florian La Roche, <flla@stud.uni-sb.de> | 12 | * Florian La Roche, <flla@stud.uni-sb.de> |
13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> | 13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> |
14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> | 14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> |
15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> | 15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> |
16 | * Matthew Dillon, <dillon@apollo.west.oic.com> | 16 | * Matthew Dillon, <dillon@apollo.west.oic.com> |
17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | 17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> |
18 | * Jorge Cwik, <jorge@laser.satlink.net> | 18 | * Jorge Cwik, <jorge@laser.satlink.net> |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/sysctl.h> | 23 | #include <linux/sysctl.h> |
24 | #include <linux/workqueue.h> | 24 | #include <linux/workqueue.h> |
25 | #include <net/tcp.h> | 25 | #include <net/tcp.h> |
26 | #include <net/inet_common.h> | 26 | #include <net/inet_common.h> |
27 | #include <net/xfrm.h> | 27 | #include <net/xfrm.h> |
28 | 28 | ||
29 | #ifdef CONFIG_SYSCTL | 29 | #ifdef CONFIG_SYSCTL |
30 | #define SYNC_INIT 0 /* let the user enable it */ | 30 | #define SYNC_INIT 0 /* let the user enable it */ |
31 | #else | 31 | #else |
32 | #define SYNC_INIT 1 | 32 | #define SYNC_INIT 1 |
33 | #endif | 33 | #endif |
34 | 34 | ||
35 | int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; | 35 | int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; |
36 | EXPORT_SYMBOL(sysctl_tcp_syncookies); | 36 | EXPORT_SYMBOL(sysctl_tcp_syncookies); |
37 | 37 | ||
38 | int sysctl_tcp_abort_on_overflow __read_mostly; | 38 | int sysctl_tcp_abort_on_overflow __read_mostly; |
39 | 39 | ||
40 | struct inet_timewait_death_row tcp_death_row = { | 40 | struct inet_timewait_death_row tcp_death_row = { |
41 | .sysctl_max_tw_buckets = NR_FILE * 2, | 41 | .sysctl_max_tw_buckets = NR_FILE * 2, |
42 | .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, | 42 | .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, |
43 | .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), | 43 | .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), |
44 | .hashinfo = &tcp_hashinfo, | 44 | .hashinfo = &tcp_hashinfo, |
45 | .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, | 45 | .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, |
46 | (unsigned long)&tcp_death_row), | 46 | (unsigned long)&tcp_death_row), |
47 | .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, | 47 | .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, |
48 | inet_twdr_twkill_work), | 48 | inet_twdr_twkill_work), |
49 | /* Short-time timewait calendar */ | 49 | /* Short-time timewait calendar */ |
50 | 50 | ||
51 | .twcal_hand = -1, | 51 | .twcal_hand = -1, |
52 | .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, | 52 | .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, |
53 | (unsigned long)&tcp_death_row), | 53 | (unsigned long)&tcp_death_row), |
54 | }; | 54 | }; |
55 | 55 | ||
56 | EXPORT_SYMBOL_GPL(tcp_death_row); | 56 | EXPORT_SYMBOL_GPL(tcp_death_row); |
57 | 57 | ||
58 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 58 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) |
59 | { | 59 | { |
60 | if (seq == s_win) | 60 | if (seq == s_win) |
61 | return 1; | 61 | return 1; |
62 | if (after(end_seq, s_win) && before(seq, e_win)) | 62 | if (after(end_seq, s_win) && before(seq, e_win)) |
63 | return 1; | 63 | return 1; |
64 | return (seq == e_win && seq == end_seq); | 64 | return (seq == e_win && seq == end_seq); |
65 | } | 65 | } |
66 | 66 | ||
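tcp_in_window() above accepts a segment if it starts exactly at the left edge of the receive window, overlaps the window at all, or is an empty segment sitting exactly on the right edge. A standalone copy of the test, with before()/after() expanded to the usual modulo-2^32 comparison and a few made-up sequence numbers:

/* Standalone copy of the tcp_in_window() acceptance test above. */
#include <stdio.h>
#include <stdint.h>

static int before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after(uint32_t a, uint32_t b)  { return before(b, a); }

static int tcp_in_window(uint32_t seq, uint32_t end_seq,
			 uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return 1;
	if (after(end_seq, s_win) && before(seq, e_win))
		return 1;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	/* receive window: [1000, 2000) */
	printf("%d\n", tcp_in_window(1000, 1100, 1000, 2000)); /* 1: starts at left edge */
	printf("%d\n", tcp_in_window( 900, 1050, 1000, 2000)); /* 1: overlaps the window */
	printf("%d\n", tcp_in_window( 800,  900, 1000, 2000)); /* 0: entirely old data */
	printf("%d\n", tcp_in_window(2000, 2000, 1000, 2000)); /* 1: empty segment at right edge */
	return 0;
}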
67 | /* | 67 | /* |
68 | * * Main purpose of TIME-WAIT state is to close connection gracefully, | 68 | * * Main purpose of TIME-WAIT state is to close connection gracefully, |
69 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN | 69 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN |
70 | * (and, probably, tail of data) and one or more our ACKs are lost. | 70 | * (and, probably, tail of data) and one or more our ACKs are lost. |
71 | * * What is TIME-WAIT timeout? It is associated with maximal packet | 71 | * * What is TIME-WAIT timeout? It is associated with maximal packet |
72 | * lifetime in the internet, which results in wrong conclusion, that | 72 | * lifetime in the internet, which results in wrong conclusion, that |
73 | * it is set to catch "old duplicate segments" wandering out of their path. | 73 | * it is set to catch "old duplicate segments" wandering out of their path. |
74 | * It is not quite correct. This timeout is calculated so that it exceeds | 74 | * It is not quite correct. This timeout is calculated so that it exceeds |
75 | * maximal retransmission timeout enough to allow to lose one (or more) | 75 | * maximal retransmission timeout enough to allow to lose one (or more) |
76 | * segments sent by peer and our ACKs. This time may be calculated from RTO. | 76 | * segments sent by peer and our ACKs. This time may be calculated from RTO. |
77 | * * When TIME-WAIT socket receives RST, it means that another end | 77 | * * When TIME-WAIT socket receives RST, it means that another end |
78 | * finally closed and we are allowed to kill TIME-WAIT too. | 78 | * finally closed and we are allowed to kill TIME-WAIT too. |
79 | * * Second purpose of TIME-WAIT is catching old duplicate segments. | 79 | * * Second purpose of TIME-WAIT is catching old duplicate segments. |
80 | * Well, certainly it is pure paranoia, but if we load TIME-WAIT | 80 | * Well, certainly it is pure paranoia, but if we load TIME-WAIT |
81 | * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. | 81 | * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. |
82 | * * If we invented some more clever way to catch duplicates | 82 | * * If we invented some more clever way to catch duplicates |
83 | * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. | 83 | * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. |
84 | * | 84 | * |
85 | * The algorithm below is based on FORMAL INTERPRETATION of RFCs. | 85 | * The algorithm below is based on FORMAL INTERPRETATION of RFCs. |
86 | * When you compare it to RFCs, please, read section SEGMENT ARRIVES | 86 | * When you compare it to RFCs, please, read section SEGMENT ARRIVES |
87 | * from the very beginning. | 87 | * from the very beginning. |
88 | * | 88 | * |
89 | * NOTE. With recycling (and later with fin-wait-2) TW bucket | 89 | * NOTE. With recycling (and later with fin-wait-2) TW bucket |
90 | * is _not_ stateless. It means, that strictly speaking we must | 90 | * is _not_ stateless. It means, that strictly speaking we must |
91 | * spinlock it. I do not want! Well, probability of misbehaviour | 91 | * spinlock it. I do not want! Well, probability of misbehaviour |
92 | * is ridiculously low and, seems, we could use some mb() tricks | 92 | * is ridiculously low and, seems, we could use some mb() tricks |
93 | * to avoid misread sequence numbers, states etc. --ANK | 93 | * to avoid misread sequence numbers, states etc. --ANK |
94 | */ | 94 | */ |
95 | enum tcp_tw_status | 95 | enum tcp_tw_status |
96 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | 96 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, |
97 | const struct tcphdr *th) | 97 | const struct tcphdr *th) |
98 | { | 98 | { |
99 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 99 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
100 | struct tcp_options_received tmp_opt; | 100 | struct tcp_options_received tmp_opt; |
101 | int paws_reject = 0; | 101 | int paws_reject = 0; |
102 | 102 | ||
103 | tmp_opt.saw_tstamp = 0; | 103 | tmp_opt.saw_tstamp = 0; |
104 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { | 104 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { |
105 | tcp_parse_options(skb, &tmp_opt, 0); | 105 | tcp_parse_options(skb, &tmp_opt, 0); |
106 | 106 | ||
107 | if (tmp_opt.saw_tstamp) { | 107 | if (tmp_opt.saw_tstamp) { |
108 | tmp_opt.ts_recent = tcptw->tw_ts_recent; | 108 | tmp_opt.ts_recent = tcptw->tw_ts_recent; |
109 | tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; | 109 | tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; |
110 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); | 110 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); |
111 | } | 111 | } |
112 | } | 112 | } |
113 | 113 | ||
114 | if (tw->tw_substate == TCP_FIN_WAIT2) { | 114 | if (tw->tw_substate == TCP_FIN_WAIT2) { |
115 | /* Just repeat all the checks of tcp_rcv_state_process() */ | 115 | /* Just repeat all the checks of tcp_rcv_state_process() */ |
116 | 116 | ||
117 | /* Out of window, send ACK */ | 117 | /* Out of window, send ACK */ |
118 | if (paws_reject || | 118 | if (paws_reject || |
119 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 119 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
120 | tcptw->tw_rcv_nxt, | 120 | tcptw->tw_rcv_nxt, |
121 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) | 121 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) |
122 | return TCP_TW_ACK; | 122 | return TCP_TW_ACK; |
123 | 123 | ||
124 | if (th->rst) | 124 | if (th->rst) |
125 | goto kill; | 125 | goto kill; |
126 | 126 | ||
127 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) | 127 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) |
128 | goto kill_with_rst; | 128 | goto kill_with_rst; |
129 | 129 | ||
130 | /* Dup ACK? */ | 130 | /* Dup ACK? */ |
131 | if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || | 131 | if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || |
132 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { | 132 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { |
133 | inet_twsk_put(tw); | 133 | inet_twsk_put(tw); |
134 | return TCP_TW_SUCCESS; | 134 | return TCP_TW_SUCCESS; |
135 | } | 135 | } |
136 | 136 | ||
137 | /* New data or FIN. If new data arrive after half-duplex close, | 137 | /* New data or FIN. If new data arrive after half-duplex close, |
138 | * reset. | 138 | * reset. |
139 | */ | 139 | */ |
140 | if (!th->fin || | 140 | if (!th->fin || |
141 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { | 141 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { |
142 | kill_with_rst: | 142 | kill_with_rst: |
143 | inet_twsk_deschedule(tw, &tcp_death_row); | 143 | inet_twsk_deschedule(tw, &tcp_death_row); |
144 | inet_twsk_put(tw); | 144 | inet_twsk_put(tw); |
145 | return TCP_TW_RST; | 145 | return TCP_TW_RST; |
146 | } | 146 | } |
147 | 147 | ||
148 | /* FIN arrived, enter true time-wait state. */ | 148 | /* FIN arrived, enter true time-wait state. */ |
149 | tw->tw_substate = TCP_TIME_WAIT; | 149 | tw->tw_substate = TCP_TIME_WAIT; |
150 | tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 150 | tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
151 | if (tmp_opt.saw_tstamp) { | 151 | if (tmp_opt.saw_tstamp) { |
152 | tcptw->tw_ts_recent_stamp = get_seconds(); | 152 | tcptw->tw_ts_recent_stamp = get_seconds(); |
153 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; | 153 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
154 | } | 154 | } |
155 | 155 | ||
156 | /* I am shamed, but failed to make it more elegant. | 156 | /* I am shamed, but failed to make it more elegant. |
157 | * Yes, it is direct reference to IP, which is impossible | 157 | * Yes, it is direct reference to IP, which is impossible |
158 | * to generalize to IPv6. Taking into account that IPv6 | 158 | * to generalize to IPv6. Taking into account that IPv6 |
159 | * does not understand recycling in any case, it is not | 159 | * does not understand recycling in any case, it is not |
160 | * a big problem in practice. --ANK */ | 160 | * a big problem in practice. --ANK */ |
161 | if (tw->tw_family == AF_INET && | 161 | if (tw->tw_family == AF_INET && |
162 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && | 162 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && |
163 | tcp_v4_tw_remember_stamp(tw)) | 163 | tcp_v4_tw_remember_stamp(tw)) |
164 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, | 164 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, |
165 | TCP_TIMEWAIT_LEN); | 165 | TCP_TIMEWAIT_LEN); |
166 | else | 166 | else |
167 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 167 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
168 | TCP_TIMEWAIT_LEN); | 168 | TCP_TIMEWAIT_LEN); |
169 | return TCP_TW_ACK; | 169 | return TCP_TW_ACK; |
170 | } | 170 | } |
171 | 171 | ||
172 | /* | 172 | /* |
173 | * Now real TIME-WAIT state. | 173 | * Now real TIME-WAIT state. |
174 | * | 174 | * |
175 | * RFC 1122: | 175 | * RFC 1122: |
176 | * "When a connection is [...] on TIME-WAIT state [...] | 176 | * "When a connection is [...] on TIME-WAIT state [...] |
177 | * [a TCP] MAY accept a new SYN from the remote TCP to | 177 | * [a TCP] MAY accept a new SYN from the remote TCP to |
178 | * reopen the connection directly, if it: | 178 | * reopen the connection directly, if it: |
179 | * | 179 | * |
180 | * (1) assigns its initial sequence number for the new | 180 | * (1) assigns its initial sequence number for the new |
181 | * connection to be larger than the largest sequence | 181 | * connection to be larger than the largest sequence |
182 | * number it used on the previous connection incarnation, | 182 | * number it used on the previous connection incarnation, |
183 | * and | 183 | * and |
184 | * | 184 | * |
185 | * (2) returns to TIME-WAIT state if the SYN turns out | 185 | * (2) returns to TIME-WAIT state if the SYN turns out |
186 | * to be an old duplicate". | 186 | * to be an old duplicate". |
187 | */ | 187 | */ |
188 | 188 | ||
189 | if (!paws_reject && | 189 | if (!paws_reject && |
190 | (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && | 190 | (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && |
191 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { | 191 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { |
192 | /* In window segment, it may be only reset or bare ack. */ | 192 | /* In window segment, it may be only reset or bare ack. */ |
193 | 193 | ||
194 | if (th->rst) { | 194 | if (th->rst) { |
195 | /* This is TIME_WAIT assassination, in two flavors. | 195 | /* This is TIME_WAIT assassination, in two flavors. |
196 | * Oh well... nobody has a sufficient solution to this | 196 | * Oh well... nobody has a sufficient solution to this |
197 | * protocol bug yet. | 197 | * protocol bug yet. |
198 | */ | 198 | */ |
199 | if (sysctl_tcp_rfc1337 == 0) { | 199 | if (sysctl_tcp_rfc1337 == 0) { |
200 | kill: | 200 | kill: |
201 | inet_twsk_deschedule(tw, &tcp_death_row); | 201 | inet_twsk_deschedule(tw, &tcp_death_row); |
202 | inet_twsk_put(tw); | 202 | inet_twsk_put(tw); |
203 | return TCP_TW_SUCCESS; | 203 | return TCP_TW_SUCCESS; |
204 | } | 204 | } |
205 | } | 205 | } |
206 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 206 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
207 | TCP_TIMEWAIT_LEN); | 207 | TCP_TIMEWAIT_LEN); |
208 | 208 | ||
209 | if (tmp_opt.saw_tstamp) { | 209 | if (tmp_opt.saw_tstamp) { |
210 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; | 210 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
211 | tcptw->tw_ts_recent_stamp = get_seconds(); | 211 | tcptw->tw_ts_recent_stamp = get_seconds(); |
212 | } | 212 | } |
213 | 213 | ||
214 | inet_twsk_put(tw); | 214 | inet_twsk_put(tw); |
215 | return TCP_TW_SUCCESS; | 215 | return TCP_TW_SUCCESS; |
216 | } | 216 | } |
217 | 217 | ||
218 | /* Out of window segment. | 218 | /* Out of window segment. |
219 | 219 | ||
220 | All the segments are ACKed immediately. | 220 | All the segments are ACKed immediately. |
221 | 221 | ||
222 | The only exception is new SYN. We accept it, if it is | 222 | The only exception is new SYN. We accept it, if it is |
223 | not old duplicate and we are not in danger to be killed | 223 | not old duplicate and we are not in danger to be killed |
224 | by delayed old duplicates. RFC check is that it has | 224 | by delayed old duplicates. RFC check is that it has |
225 | newer sequence number works at rates <40Mbit/sec. | 225 | newer sequence number works at rates <40Mbit/sec. |
226 | However, if paws works, it is reliable AND even more, | 226 | However, if paws works, it is reliable AND even more, |
227 | we even may relax silly seq space cutoff. | 227 | we even may relax silly seq space cutoff. |
228 | 228 | ||
229 | RED-PEN: we violate main RFC requirement, if this SYN will appear | 229 | RED-PEN: we violate main RFC requirement, if this SYN will appear |
230 | old duplicate (i.e. we receive RST in reply to SYN-ACK), | 230 | old duplicate (i.e. we receive RST in reply to SYN-ACK), |
231 | we must return socket to time-wait state. It is not good, | 231 | we must return socket to time-wait state. It is not good, |
232 | but not fatal yet. | 232 | but not fatal yet. |
233 | */ | 233 | */ |
234 | 234 | ||
235 | if (th->syn && !th->rst && !th->ack && !paws_reject && | 235 | if (th->syn && !th->rst && !th->ack && !paws_reject && |
236 | (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || | 236 | (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || |
237 | (tmp_opt.saw_tstamp && | 237 | (tmp_opt.saw_tstamp && |
238 | (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { | 238 | (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { |
239 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; | 239 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; |
240 | if (isn == 0) | 240 | if (isn == 0) |
241 | isn++; | 241 | isn++; |
242 | TCP_SKB_CB(skb)->when = isn; | 242 | TCP_SKB_CB(skb)->when = isn; |
243 | return TCP_TW_SYN; | 243 | return TCP_TW_SYN; |
244 | } | 244 | } |
245 | 245 | ||
246 | if (paws_reject) | 246 | if (paws_reject) |
247 | NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); | 247 | NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); |
248 | 248 | ||
249 | if (!th->rst) { | 249 | if (!th->rst) { |
250 | /* In this case we must reset the TIMEWAIT timer. | 250 | /* In this case we must reset the TIMEWAIT timer. |
251 | * | 251 | * |
252 | * If it is ACKless SYN it may be both old duplicate | 252 | * If it is ACKless SYN it may be both old duplicate |
253 | * and new good SYN with random sequence number <rcv_nxt. | 253 | * and new good SYN with random sequence number <rcv_nxt. |
254 | * Do not reschedule in the last case. | 254 | * Do not reschedule in the last case. |
255 | */ | 255 | */ |
256 | if (paws_reject || th->ack) | 256 | if (paws_reject || th->ack) |
257 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 257 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
258 | TCP_TIMEWAIT_LEN); | 258 | TCP_TIMEWAIT_LEN); |
259 | 259 | ||
260 | /* Send ACK. Note, we do not put the bucket, | 260 | /* Send ACK. Note, we do not put the bucket, |
261 | * it will be released by caller. | 261 | * it will be released by caller. |
262 | */ | 262 | */ |
263 | return TCP_TW_ACK; | 263 | return TCP_TW_ACK; |
264 | } | 264 | } |
265 | inet_twsk_put(tw); | 265 | inet_twsk_put(tw); |
266 | return TCP_TW_SUCCESS; | 266 | return TCP_TW_SUCCESS; |
267 | } | 267 | } |
268 | 268 | ||
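The SYN acceptance test near the end of tcp_timewait_state_process() relies on the wrap-safe timestamp comparison (s32)(tw_ts_recent - rcv_tsval) < 0, i.e. "the incoming timestamp is strictly newer modulo 2^32". A tiny standalone illustration with made-up values:

/* Wrap-safe "newer timestamp" test as used above:
 * (s32)(old - new) < 0  means  new is strictly after old, mod 2^32. */
#include <stdio.h>
#include <stdint.h>

static int ts_newer(uint32_t old, uint32_t new)
{
	return (int32_t)(old - new) < 0;
}

int main(void)
{
	printf("%d\n", ts_newer(1000, 1001));			/* 1 */
	printf("%d\n", ts_newer(1000, 1000));			/* 0 */
	printf("%d\n", ts_newer(0xfffffff0u, 0x00000010u));	/* 1: wrapped, but newer */
	return 0;
}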
269 | /* | 269 | /* |
270 | * Move a socket to time-wait or dead fin-wait-2 state. | 270 | * Move a socket to time-wait or dead fin-wait-2 state. |
271 | */ | 271 | */ |
272 | void tcp_time_wait(struct sock *sk, int state, int timeo) | 272 | void tcp_time_wait(struct sock *sk, int state, int timeo) |
273 | { | 273 | { |
274 | struct inet_timewait_sock *tw = NULL; | 274 | struct inet_timewait_sock *tw = NULL; |
275 | const struct inet_connection_sock *icsk = inet_csk(sk); | 275 | const struct inet_connection_sock *icsk = inet_csk(sk); |
276 | const struct tcp_sock *tp = tcp_sk(sk); | 276 | const struct tcp_sock *tp = tcp_sk(sk); |
277 | int recycle_ok = 0; | 277 | int recycle_ok = 0; |
278 | 278 | ||
279 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) | 279 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
280 | recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); | 280 | recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); |
281 | 281 | ||
282 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) | 282 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) |
283 | tw = inet_twsk_alloc(sk, state); | 283 | tw = inet_twsk_alloc(sk, state); |
284 | 284 | ||
285 | if (tw != NULL) { | 285 | if (tw != NULL) { |
286 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 286 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
287 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); | 287 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
288 | 288 | ||
289 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | 289 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
290 | tcptw->tw_rcv_nxt = tp->rcv_nxt; | 290 | tcptw->tw_rcv_nxt = tp->rcv_nxt; |
291 | tcptw->tw_snd_nxt = tp->snd_nxt; | 291 | tcptw->tw_snd_nxt = tp->snd_nxt; |
292 | tcptw->tw_rcv_wnd = tcp_receive_window(tp); | 292 | tcptw->tw_rcv_wnd = tcp_receive_window(tp); |
293 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; | 293 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; |
294 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | 294 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; |
295 | 295 | ||
296 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 296 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
297 | if (tw->tw_family == PF_INET6) { | 297 | if (tw->tw_family == PF_INET6) { |
298 | struct ipv6_pinfo *np = inet6_sk(sk); | 298 | struct ipv6_pinfo *np = inet6_sk(sk); |
299 | struct inet6_timewait_sock *tw6; | 299 | struct inet6_timewait_sock *tw6; |
300 | 300 | ||
301 | tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); | 301 | tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); |
302 | tw6 = inet6_twsk((struct sock *)tw); | 302 | tw6 = inet6_twsk((struct sock *)tw); |
303 | ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); | 303 | ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); |
304 | ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); | 304 | ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); |
305 | tw->tw_ipv6only = np->ipv6only; | 305 | tw->tw_ipv6only = np->ipv6only; |
306 | } | 306 | } |
307 | #endif | 307 | #endif |
308 | 308 | ||
309 | #ifdef CONFIG_TCP_MD5SIG | 309 | #ifdef CONFIG_TCP_MD5SIG |
310 | /* | 310 | /* |
311 | * The timewait bucket does not have the key DB from the | 311 | * The timewait bucket does not have the key DB from the |
312 | * sock structure. We just make a quick copy of the | 312 | * sock structure. We just make a quick copy of the |
313 | * md5 key being used (if indeed we are using one) | 313 | * md5 key being used (if indeed we are using one) |
314 | * so the timewait ack generating code has the key. | 314 | * so the timewait ack generating code has the key. |
315 | */ | 315 | */ |
316 | do { | 316 | do { |
317 | struct tcp_md5sig_key *key; | 317 | struct tcp_md5sig_key *key; |
318 | memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); | 318 | memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); |
319 | tcptw->tw_md5_keylen = 0; | 319 | tcptw->tw_md5_keylen = 0; |
320 | key = tp->af_specific->md5_lookup(sk, sk); | 320 | key = tp->af_specific->md5_lookup(sk, sk); |
321 | if (key != NULL) { | 321 | if (key != NULL) { |
322 | memcpy(&tcptw->tw_md5_key, key->key, key->keylen); | 322 | memcpy(&tcptw->tw_md5_key, key->key, key->keylen); |
323 | tcptw->tw_md5_keylen = key->keylen; | 323 | tcptw->tw_md5_keylen = key->keylen; |
324 | if (tcp_alloc_md5sig_pool() == NULL) | 324 | if (tcp_alloc_md5sig_pool() == NULL) |
325 | BUG(); | 325 | BUG(); |
326 | } | 326 | } |
327 | } while (0); | 327 | } while (0); |
328 | #endif | 328 | #endif |
329 | 329 | ||
330 | /* Linkage updates. */ | 330 | /* Linkage updates. */ |
331 | __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); | 331 | __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); |
332 | 332 | ||
333 | /* Get the TIME_WAIT timeout firing. */ | 333 | /* Get the TIME_WAIT timeout firing. */ |
334 | if (timeo < rto) | 334 | if (timeo < rto) |
335 | timeo = rto; | 335 | timeo = rto; |
336 | 336 | ||
337 | if (recycle_ok) { | 337 | if (recycle_ok) { |
338 | tw->tw_timeout = rto; | 338 | tw->tw_timeout = rto; |
339 | } else { | 339 | } else { |
340 | tw->tw_timeout = TCP_TIMEWAIT_LEN; | 340 | tw->tw_timeout = TCP_TIMEWAIT_LEN; |
341 | if (state == TCP_TIME_WAIT) | 341 | if (state == TCP_TIME_WAIT) |
342 | timeo = TCP_TIMEWAIT_LEN; | 342 | timeo = TCP_TIMEWAIT_LEN; |
343 | } | 343 | } |
344 | 344 | ||
345 | inet_twsk_schedule(tw, &tcp_death_row, timeo, | 345 | inet_twsk_schedule(tw, &tcp_death_row, timeo, |
346 | TCP_TIMEWAIT_LEN); | 346 | TCP_TIMEWAIT_LEN); |
347 | inet_twsk_put(tw); | 347 | inet_twsk_put(tw); |
348 | } else { | 348 | } else { |
349 | /* Sorry, if we're out of memory, just CLOSE this | 349 | /* Sorry, if we're out of memory, just CLOSE this |
350 | * socket up. We've got bigger problems than | 350 | * socket up. We've got bigger problems than |
351 | * non-graceful socket closings. | 351 | * non-graceful socket closings. |
352 | */ | 352 | */ |
353 | LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); | 353 | LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); |
354 | } | 354 | } |
355 | 355 | ||
356 | tcp_update_metrics(sk); | 356 | tcp_update_metrics(sk); |
357 | tcp_done(sk); | 357 | tcp_done(sk); |
358 | } | 358 | } |
359 | 359 | ||
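In tcp_time_wait() above, rto is (icsk_rto << 2) - (icsk_rto >> 1), i.e. 3.5 times the current retransmission timeout; with tw_recycle the bucket is scheduled for roughly that long, while a socket entering TCP_TIME_WAIT without recycling waits the full TCP_TIMEWAIT_LEN. A sketch of just the scheduled-timeout selection; HZ = 1000 and the example RTO are assumptions for the printout, not values from this commit.

/* Scheduled timeout chosen by tcp_time_wait() above (jiffies in, jiffies out). */
#include <stdio.h>

#define HZ			1000
#define TCP_TIMEWAIT_LEN	(60 * HZ)

static int tw_schedule_timeout(int icsk_rto, int timeo, int recycle_ok, int to_time_wait)
{
	const int rto = (icsk_rto << 2) - (icsk_rto >> 1);	/* 3.5 * RTO */

	if (timeo < rto)
		timeo = rto;
	if (!recycle_ok && to_time_wait)
		timeo = TCP_TIMEWAIT_LEN;	/* classic 60 second TIME-WAIT */
	return timeo;
}

int main(void)
{
	/* RTO of 200 jiffies (200 ms at HZ=1000), caller passed timeo=0 */
	printf("with tw_recycle:    %d\n", tw_schedule_timeout(200, 0, 1, 1));	/* 700   */
	printf("without tw_recycle: %d\n", tw_schedule_timeout(200, 0, 0, 1));	/* 60000 */
	return 0;
}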
360 | void tcp_twsk_destructor(struct sock *sk) | 360 | void tcp_twsk_destructor(struct sock *sk) |
361 | { | 361 | { |
362 | #ifdef CONFIG_TCP_MD5SIG | 362 | #ifdef CONFIG_TCP_MD5SIG |
363 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); | 363 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); |
364 | if (twsk->tw_md5_keylen) | 364 | if (twsk->tw_md5_keylen) |
365 | tcp_put_md5sig_pool(); | 365 | tcp_put_md5sig_pool(); |
366 | #endif | 366 | #endif |
367 | } | 367 | } |
368 | 368 | ||
369 | EXPORT_SYMBOL_GPL(tcp_twsk_destructor); | 369 | EXPORT_SYMBOL_GPL(tcp_twsk_destructor); |
370 | 370 | ||
371 | static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, | 371 | static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, |
372 | struct request_sock *req) | 372 | struct request_sock *req) |
373 | { | 373 | { |
374 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; | 374 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; |
375 | } | 375 | } |
376 | 376 | ||
377 | /* This is not only more efficient than what we used to do, it eliminates | 377 | /* This is not only more efficient than what we used to do, it eliminates |
378 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM | 378 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM |
379 | * | 379 | * |
380 | * Actually, we could avoid lots of memory writes here. tp of listening | 380 | * Actually, we could avoid lots of memory writes here. tp of listening |
381 | * socket contains all necessary default parameters. | 381 | * socket contains all necessary default parameters. |
382 | */ | 382 | */ |
383 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) | 383 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) |
384 | { | 384 | { |
385 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); | 385 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); |
386 | 386 | ||
387 | if (newsk != NULL) { | 387 | if (newsk != NULL) { |
388 | const struct inet_request_sock *ireq = inet_rsk(req); | 388 | const struct inet_request_sock *ireq = inet_rsk(req); |
389 | struct tcp_request_sock *treq = tcp_rsk(req); | 389 | struct tcp_request_sock *treq = tcp_rsk(req); |
390 | struct inet_connection_sock *newicsk = inet_csk(newsk); | 390 | struct inet_connection_sock *newicsk = inet_csk(newsk); |
391 | struct tcp_sock *newtp; | 391 | struct tcp_sock *newtp; |
392 | 392 | ||
393 | /* Now setup tcp_sock */ | 393 | /* Now setup tcp_sock */ |
394 | newtp = tcp_sk(newsk); | 394 | newtp = tcp_sk(newsk); |
395 | newtp->pred_flags = 0; | 395 | newtp->pred_flags = 0; |
396 | newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; | 396 | newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; |
397 | newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; | 397 | newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; |
398 | newtp->snd_up = treq->snt_isn + 1; | 398 | newtp->snd_up = treq->snt_isn + 1; |
399 | 399 | ||
400 | tcp_prequeue_init(newtp); | 400 | tcp_prequeue_init(newtp); |
401 | 401 | ||
402 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); | 402 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); |
403 | 403 | ||
404 | newtp->srtt = 0; | 404 | newtp->srtt = 0; |
405 | newtp->mdev = TCP_TIMEOUT_INIT; | 405 | newtp->mdev = TCP_TIMEOUT_INIT; |
406 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; | 406 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; |
407 | 407 | ||
408 | newtp->packets_out = 0; | 408 | newtp->packets_out = 0; |
409 | newtp->retrans_out = 0; | 409 | newtp->retrans_out = 0; |
410 | newtp->sacked_out = 0; | 410 | newtp->sacked_out = 0; |
411 | newtp->fackets_out = 0; | 411 | newtp->fackets_out = 0; |
412 | newtp->snd_ssthresh = 0x7fffffff; | 412 | newtp->snd_ssthresh = 0x7fffffff; |
413 | 413 | ||
414 | /* So many TCP implementations out there (incorrectly) count the | 414 | /* So many TCP implementations out there (incorrectly) count the |
415 | * initial SYN frame in their delayed-ACK and congestion control | 415 | * initial SYN frame in their delayed-ACK and congestion control |
416 | * algorithms that we must have the following bandaid to talk | 416 | * algorithms that we must have the following bandaid to talk |
417 | * efficiently to them. -DaveM | 417 | * efficiently to them. -DaveM |
418 | */ | 418 | */ |
419 | newtp->snd_cwnd = 2; | 419 | newtp->snd_cwnd = 2; |
420 | newtp->snd_cwnd_cnt = 0; | 420 | newtp->snd_cwnd_cnt = 0; |
421 | newtp->bytes_acked = 0; | 421 | newtp->bytes_acked = 0; |
422 | 422 | ||
423 | newtp->frto_counter = 0; | 423 | newtp->frto_counter = 0; |
424 | newtp->frto_highmark = 0; | 424 | newtp->frto_highmark = 0; |
425 | 425 | ||
426 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; | 426 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; |
427 | 427 | ||
428 | tcp_set_ca_state(newsk, TCP_CA_Open); | 428 | tcp_set_ca_state(newsk, TCP_CA_Open); |
429 | tcp_init_xmit_timers(newsk); | 429 | tcp_init_xmit_timers(newsk); |
430 | skb_queue_head_init(&newtp->out_of_order_queue); | 430 | skb_queue_head_init(&newtp->out_of_order_queue); |
431 | newtp->write_seq = treq->snt_isn + 1; | 431 | newtp->write_seq = treq->snt_isn + 1; |
432 | newtp->pushed_seq = newtp->write_seq; | 432 | newtp->pushed_seq = newtp->write_seq; |
433 | 433 | ||
434 | newtp->rx_opt.saw_tstamp = 0; | 434 | newtp->rx_opt.saw_tstamp = 0; |
435 | 435 | ||
436 | newtp->rx_opt.dsack = 0; | 436 | newtp->rx_opt.dsack = 0; |
437 | newtp->rx_opt.eff_sacks = 0; | 437 | newtp->rx_opt.eff_sacks = 0; |
438 | 438 | ||
439 | newtp->rx_opt.num_sacks = 0; | 439 | newtp->rx_opt.num_sacks = 0; |
440 | newtp->urg_data = 0; | 440 | newtp->urg_data = 0; |
441 | 441 | ||
442 | if (sock_flag(newsk, SOCK_KEEPOPEN)) | 442 | if (sock_flag(newsk, SOCK_KEEPOPEN)) |
443 | inet_csk_reset_keepalive_timer(newsk, | 443 | inet_csk_reset_keepalive_timer(newsk, |
444 | keepalive_time_when(newtp)); | 444 | keepalive_time_when(newtp)); |
445 | 445 | ||
446 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; | 446 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; |
447 | if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { | 447 | if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { |
448 | if (sysctl_tcp_fack) | 448 | if (sysctl_tcp_fack) |
449 | tcp_enable_fack(newtp); | 449 | tcp_enable_fack(newtp); |
450 | } | 450 | } |
451 | newtp->window_clamp = req->window_clamp; | 451 | newtp->window_clamp = req->window_clamp; |
452 | newtp->rcv_ssthresh = req->rcv_wnd; | 452 | newtp->rcv_ssthresh = req->rcv_wnd; |
453 | newtp->rcv_wnd = req->rcv_wnd; | 453 | newtp->rcv_wnd = req->rcv_wnd; |
454 | newtp->rx_opt.wscale_ok = ireq->wscale_ok; | 454 | newtp->rx_opt.wscale_ok = ireq->wscale_ok; |
455 | if (newtp->rx_opt.wscale_ok) { | 455 | if (newtp->rx_opt.wscale_ok) { |
456 | newtp->rx_opt.snd_wscale = ireq->snd_wscale; | 456 | newtp->rx_opt.snd_wscale = ireq->snd_wscale; |
457 | newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; | 457 | newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; |
458 | } else { | 458 | } else { |
459 | newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; | 459 | newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; |
460 | newtp->window_clamp = min(newtp->window_clamp, 65535U); | 460 | newtp->window_clamp = min(newtp->window_clamp, 65535U); |
461 | } | 461 | } |
462 | newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << | 462 | newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << |
463 | newtp->rx_opt.snd_wscale); | 463 | newtp->rx_opt.snd_wscale); |
464 | newtp->max_window = newtp->snd_wnd; | 464 | newtp->max_window = newtp->snd_wnd; |
465 | 465 | ||
466 | if (newtp->rx_opt.tstamp_ok) { | 466 | if (newtp->rx_opt.tstamp_ok) { |
467 | newtp->rx_opt.ts_recent = req->ts_recent; | 467 | newtp->rx_opt.ts_recent = req->ts_recent; |
468 | newtp->rx_opt.ts_recent_stamp = get_seconds(); | 468 | newtp->rx_opt.ts_recent_stamp = get_seconds(); |
469 | newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; | 469 | newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; |
470 | } else { | 470 | } else { |
471 | newtp->rx_opt.ts_recent_stamp = 0; | 471 | newtp->rx_opt.ts_recent_stamp = 0; |
472 | newtp->tcp_header_len = sizeof(struct tcphdr); | 472 | newtp->tcp_header_len = sizeof(struct tcphdr); |
473 | } | 473 | } |
474 | #ifdef CONFIG_TCP_MD5SIG | 474 | #ifdef CONFIG_TCP_MD5SIG |
475 | newtp->md5sig_info = NULL; /*XXX*/ | 475 | newtp->md5sig_info = NULL; /*XXX*/ |
476 | if (newtp->af_specific->md5_lookup(sk, newsk)) | 476 | if (newtp->af_specific->md5_lookup(sk, newsk)) |
477 | newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; | 477 | newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; |
478 | #endif | 478 | #endif |
479 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) | 479 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) |
480 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 480 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
481 | newtp->rx_opt.mss_clamp = req->mss; | 481 | newtp->rx_opt.mss_clamp = req->mss; |
482 | TCP_ECN_openreq_child(newtp, req); | 482 | TCP_ECN_openreq_child(newtp, req); |
483 | 483 | ||
484 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); | 484 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); |
485 | } | 485 | } |
486 | return newsk; | 486 | return newsk; |
487 | } | 487 | } |
488 | 488 | ||
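tcp_create_openreq_child() above derives the initial send window by shifting the 16-bit window field from the incoming TCP header by the negotiated send window scale, and falls back to a scale of zero (with window_clamp capped at 65535) when window scaling was not agreed. A small standalone sketch of that computation with made-up values:

/* Sketch of the initial send-window computation in
 * tcp_create_openreq_child() above (RFC 1323 window scaling). */
#include <stdio.h>
#include <stdint.h>

static uint32_t initial_snd_wnd(uint16_t hdr_window, int wscale_ok, int snd_wscale)
{
	if (!wscale_ok)
		snd_wscale = 0;		/* peer did not negotiate scaling */
	return (uint32_t)hdr_window << snd_wscale;
}

int main(void)
{
	/* 16-bit window of 5840 bytes, scale factor 7 -> ~730 KB */
	printf("%u\n", initial_snd_wnd(5840, 1, 7));
	/* no window scaling negotiated: raw 16-bit value */
	printf("%u\n", initial_snd_wnd(5840, 0, 7));
	return 0;
}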
489 | /* | 489 | /* |
490 | * Process an incoming packet for SYN_RECV sockets represented | 490 | * Process an incoming packet for SYN_RECV sockets represented |
491 | * as a request_sock. | 491 | * as a request_sock. |
492 | */ | 492 | */ |
493 | 493 | ||
494 | struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | 494 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, |
495 | struct request_sock *req, | 495 | struct request_sock *req, |
496 | struct request_sock **prev) | 496 | struct request_sock **prev) |
497 | { | 497 | { |
498 | const struct tcphdr *th = tcp_hdr(skb); | 498 | const struct tcphdr *th = tcp_hdr(skb); |
499 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 499 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
500 | int paws_reject = 0; | 500 | int paws_reject = 0; |
501 | struct tcp_options_received tmp_opt; | 501 | struct tcp_options_received tmp_opt; |
502 | struct sock *child; | 502 | struct sock *child; |
503 | 503 | ||
504 | tmp_opt.saw_tstamp = 0; | 504 | tmp_opt.saw_tstamp = 0; |
505 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 505 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
506 | tcp_parse_options(skb, &tmp_opt, 0); | 506 | tcp_parse_options(skb, &tmp_opt, 0); |
507 | 507 | ||
508 | if (tmp_opt.saw_tstamp) { | 508 | if (tmp_opt.saw_tstamp) { |
509 | tmp_opt.ts_recent = req->ts_recent; | 509 | tmp_opt.ts_recent = req->ts_recent; |
510 | /* We do not store true stamp, but it is not required, | 510 | /* We do not store true stamp, but it is not required, |
511 | * it can be estimated (approximately) | 511 | * it can be estimated (approximately) |
512 | * from another data. | 512 | * from another data. |
513 | */ | 513 | */ |
514 | tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); | 514 | tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); |
515 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); | 515 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); |
516 | } | 516 | } |
517 | } | 517 | } |
518 | 518 | ||
519 | /* Check for pure retransmitted SYN. */ | 519 | /* Check for pure retransmitted SYN. */ |
520 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && | 520 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && |
521 | flg == TCP_FLAG_SYN && | 521 | flg == TCP_FLAG_SYN && |
522 | !paws_reject) { | 522 | !paws_reject) { |
523 | /* | 523 | /* |
524 | * RFC793 draws (Incorrectly! It was fixed in RFC1122) | 524 | * RFC793 draws (Incorrectly! It was fixed in RFC1122) |
525 | * this case on figure 6 and figure 8, but formal | 525 | * this case on figure 6 and figure 8, but formal |
526 | * protocol description says NOTHING. | 526 | * protocol description says NOTHING. |
527 | * To be more exact, it says that we should send ACK, | 527 | * To be more exact, it says that we should send ACK, |
528 | * because this segment (at least, if it has no data) | 528 | * because this segment (at least, if it has no data) |
529 | * is out of window. | 529 | * is out of window. |
530 | * | 530 | * |
531 | * CONCLUSION: RFC793 (even with RFC1122) DOES NOT | 531 | * CONCLUSION: RFC793 (even with RFC1122) DOES NOT |
532 | * describe SYN-RECV state. All the description | 532 | * describe SYN-RECV state. All the description |
533 | * is wrong, we cannot believe to it and should | 533 | * is wrong, we cannot believe to it and should |
534 | * rely only on common sense and implementation | 534 | * rely only on common sense and implementation |
535 | * experience. | 535 | * experience. |
536 | * | 536 | * |
537 | * Enforce "SYN-ACK" according to figure 8, figure 6 | 537 | * Enforce "SYN-ACK" according to figure 8, figure 6 |
538 | * of RFC793, fixed by RFC1122. | 538 | * of RFC793, fixed by RFC1122. |
539 | */ | 539 | */ |
540 | req->rsk_ops->rtx_syn_ack(sk, req); | 540 | req->rsk_ops->rtx_syn_ack(sk, req); |
541 | return NULL; | 541 | return NULL; |
542 | } | 542 | } |
543 | 543 | ||
544 | /* Further reproduces section "SEGMENT ARRIVES" | 544 | /* Further reproduces section "SEGMENT ARRIVES" |
545 | for state SYN-RECEIVED of RFC793. | 545 | for state SYN-RECEIVED of RFC793. |
546 | It is broken, however, it does not work only | 546 | It is broken, however, it does not work only |
547 | when SYNs are crossed. | 547 | when SYNs are crossed. |
548 | 548 | ||
549 | You would think that SYN crossing is impossible here, since | 549 | You would think that SYN crossing is impossible here, since |
550 | we should have a SYN_SENT socket (from connect()) on our end, | 550 | we should have a SYN_SENT socket (from connect()) on our end, |
551 | but this is not true if the crossed SYNs were sent to both | 551 | but this is not true if the crossed SYNs were sent to both |
552 | ends by a malicious third party. We must defend against this, | 552 | ends by a malicious third party. We must defend against this, |
553 | and to do that we first verify the ACK (as per RFC793, page | 553 | and to do that we first verify the ACK (as per RFC793, page |
554 | 36) and reset if it is invalid. Is this a true full defense? | 554 | 36) and reset if it is invalid. Is this a true full defense? |
555 | To convince ourselves, let us consider a way in which the ACK | 555 | To convince ourselves, let us consider a way in which the ACK |
556 | test can still pass in this 'malicious crossed SYNs' case. | 556 | test can still pass in this 'malicious crossed SYNs' case. |
557 | Malicious sender sends identical SYNs (and thus identical sequence | 557 | Malicious sender sends identical SYNs (and thus identical sequence |
558 | numbers) to both A and B: | 558 | numbers) to both A and B: |
559 | 559 | ||
560 | A: gets SYN, seq=7 | 560 | A: gets SYN, seq=7 |
561 | B: gets SYN, seq=7 | 561 | B: gets SYN, seq=7 |
562 | 562 | ||
563 | By our good fortune, both A and B select the same initial | 563 | By our good fortune, both A and B select the same initial |
564 | send sequence number of seven :-) | 564 | send sequence number of seven :-) |
565 | 565 | ||
566 | A: sends SYN|ACK, seq=7, ack_seq=8 | 566 | A: sends SYN|ACK, seq=7, ack_seq=8 |
567 | B: sends SYN|ACK, seq=7, ack_seq=8 | 567 | B: sends SYN|ACK, seq=7, ack_seq=8 |
568 | 568 | ||
569 | So we are now A eating this SYN|ACK, ACK test passes. So | 569 | So we are now A eating this SYN|ACK, ACK test passes. So |
570 | does sequence test, SYN is truncated, and thus we consider | 570 | does sequence test, SYN is truncated, and thus we consider |
571 | it a bare ACK. | 571 | it a bare ACK. |
572 | 572 | ||
573 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this | 573 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this |
574 | bare ACK. Otherwise, we create an established connection. Both | 574 | bare ACK. Otherwise, we create an established connection. Both |
575 | ends (listening sockets) accept the new incoming connection and try | 575 | ends (listening sockets) accept the new incoming connection and try |
576 | to talk to each other. 8-) | 576 | to talk to each other. 8-) |
577 | 577 | ||
578 | Note: This case is both harmless and rare. The probability is about the | 578 | Note: This case is both harmless and rare. The probability is about the |
579 | same as us discovering intelligent life on another planet tomorrow. | 579 | same as us discovering intelligent life on another planet tomorrow. |
580 | 580 | ||
581 | But generally, we should (the RFC lies!) accept an ACK | 581 | But generally, we should (the RFC lies!) accept an ACK |
582 | from a SYNACK both here and in tcp_rcv_state_process(). | 582 | from a SYNACK both here and in tcp_rcv_state_process(). |
583 | tcp_rcv_state_process() does not, hence neither do we. | 583 | tcp_rcv_state_process() does not, hence neither do we. |
584 | 584 | ||
585 | Note that the case is absolutely generic: | 585 | Note that the case is absolutely generic: |
586 | we cannot optimize anything here without | 586 | we cannot optimize anything here without |
587 | violating the protocol. All the checks must be made | 587 | violating the protocol. All the checks must be made |
588 | before attempting to create the socket. | 588 | before attempting to create the socket. |
589 | */ | 589 | */ |
590 | 590 | ||
591 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... | 591 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... |
592 | * and the incoming segment acknowledges something not yet | 592 | * and the incoming segment acknowledges something not yet |
593 | * sent (the segment carries an unacceptable ACK) ... | 593 | * sent (the segment carries an unacceptable ACK) ... |
594 | * a reset is sent." | 594 | * a reset is sent." |
595 | * | 595 | * |
596 | * Invalid ACK: reset will be sent by listening socket | 596 | * Invalid ACK: reset will be sent by listening socket |
597 | */ | 597 | */ |
598 | if ((flg & TCP_FLAG_ACK) && | 598 | if ((flg & TCP_FLAG_ACK) && |
599 | (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) | 599 | (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) |
600 | return sk; | 600 | return sk; |
601 | 601 | ||
602 | /* Also, it would not be a bad idea to check rcv_tsecr, which | 602 | /* Also, it would not be a bad idea to check rcv_tsecr, which |
603 | * is essentially an ACK extension; too-early or too-late values | 603 | * is essentially an ACK extension; too-early or too-late values |
604 | * should cause a reset in unsynchronized states. | 604 | * should cause a reset in unsynchronized states. |
605 | */ | 605 | */ |
606 | 606 | ||
607 | /* RFC793: "first check sequence number". */ | 607 | /* RFC793: "first check sequence number". */ |
608 | 608 | ||
609 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 609 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
610 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { | 610 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { |
611 | /* Out of window: send ACK and drop. */ | 611 | /* Out of window: send ACK and drop. */ |
612 | if (!(flg & TCP_FLAG_RST)) | 612 | if (!(flg & TCP_FLAG_RST)) |
613 | req->rsk_ops->send_ack(sk, skb, req); | 613 | req->rsk_ops->send_ack(sk, skb, req); |
614 | if (paws_reject) | 614 | if (paws_reject) |
615 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | 615 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); |
616 | return NULL; | 616 | return NULL; |
617 | } | 617 | } |
618 | 618 | ||
619 | /* In sequence, PAWS is OK. */ | 619 | /* In sequence, PAWS is OK. */ |
620 | 620 | ||
621 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) | 621 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) |
622 | req->ts_recent = tmp_opt.rcv_tsval; | 622 | req->ts_recent = tmp_opt.rcv_tsval; |
623 | 623 | ||
624 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { | 624 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { |
625 | /* Truncate SYN, it is out of window starting | 625 | /* Truncate SYN, it is out of window starting |
626 | at tcp_rsk(req)->rcv_isn + 1. */ | 626 | at tcp_rsk(req)->rcv_isn + 1. */ |
627 | flg &= ~TCP_FLAG_SYN; | 627 | flg &= ~TCP_FLAG_SYN; |
628 | } | 628 | } |
629 | 629 | ||
630 | /* RFC793: "second check the RST bit" and | 630 | /* RFC793: "second check the RST bit" and |
631 | * "fourth, check the SYN bit" | 631 | * "fourth, check the SYN bit" |
632 | */ | 632 | */ |
633 | if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { | 633 | if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { |
634 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); | 634 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
635 | goto embryonic_reset; | 635 | goto embryonic_reset; |
636 | } | 636 | } |
637 | 637 | ||
638 | /* ACK sequence verified above, just make sure ACK is | 638 | /* ACK sequence verified above, just make sure ACK is |
639 | * set. If ACK not set, just silently drop the packet. | 639 | * set. If ACK not set, just silently drop the packet. |
640 | */ | 640 | */ |
641 | if (!(flg & TCP_FLAG_ACK)) | 641 | if (!(flg & TCP_FLAG_ACK)) |
642 | return NULL; | 642 | return NULL; |
643 | 643 | ||
644 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ | 644 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ |
645 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | 645 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
646 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 646 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { |
647 | inet_rsk(req)->acked = 1; | 647 | inet_rsk(req)->acked = 1; |
648 | return NULL; | 648 | return NULL; |
649 | } | 649 | } |
650 | 650 | ||
651 | /* OK, ACK is valid, create big socket and | 651 | /* OK, ACK is valid, create big socket and |
652 | * feed this segment to it. It will repeat all | 652 | * feed this segment to it. It will repeat all |
653 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO | 653 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO |
654 | * ESTABLISHED STATE. If it is dropped after the | 654 | * ESTABLISHED STATE. If it is dropped after the |
655 | * socket is created, expect trouble. | 655 | * socket is created, expect trouble. |
656 | */ | 656 | */ |
657 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); | 657 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); |
658 | if (child == NULL) | 658 | if (child == NULL) |
659 | goto listen_overflow; | 659 | goto listen_overflow; |
660 | #ifdef CONFIG_TCP_MD5SIG | 660 | #ifdef CONFIG_TCP_MD5SIG |
661 | else { | 661 | else { |
662 | /* Copy over the MD5 key from the original socket */ | 662 | /* Copy over the MD5 key from the original socket */ |
663 | struct tcp_md5sig_key *key; | 663 | struct tcp_md5sig_key *key; |
664 | struct tcp_sock *tp = tcp_sk(sk); | 664 | struct tcp_sock *tp = tcp_sk(sk); |
665 | key = tp->af_specific->md5_lookup(sk, child); | 665 | key = tp->af_specific->md5_lookup(sk, child); |
666 | if (key != NULL) { | 666 | if (key != NULL) { |
667 | /* | 667 | /* |
668 | * We're using one, so create a matching key on the | 668 | * We're using one, so create a matching key on the |
669 | * newsk structure. If we fail to get memory then we | 669 | * newsk structure. If we fail to get memory then we |
670 | * end up not copying the key across. Shucks. | 670 | * end up not copying the key across. Shucks. |
671 | */ | 671 | */ |
672 | char *newkey = kmemdup(key->key, key->keylen, | 672 | char *newkey = kmemdup(key->key, key->keylen, |
673 | GFP_ATOMIC); | 673 | GFP_ATOMIC); |
674 | if (newkey) { | 674 | if (newkey) { |
675 | if (!tcp_alloc_md5sig_pool()) | 675 | if (!tcp_alloc_md5sig_pool()) |
676 | BUG(); | 676 | BUG(); |
677 | tp->af_specific->md5_add(child, child, newkey, | 677 | tp->af_specific->md5_add(child, child, newkey, |
678 | key->keylen); | 678 | key->keylen); |
679 | } | 679 | } |
680 | } | 680 | } |
681 | } | 681 | } |
682 | #endif | 682 | #endif |
683 | 683 | ||
684 | inet_csk_reqsk_queue_unlink(sk, req, prev); | 684 | inet_csk_reqsk_queue_unlink(sk, req, prev); |
685 | inet_csk_reqsk_queue_removed(sk, req); | 685 | inet_csk_reqsk_queue_removed(sk, req); |
686 | 686 | ||
687 | inet_csk_reqsk_queue_add(sk, req, child); | 687 | inet_csk_reqsk_queue_add(sk, req, child); |
688 | return child; | 688 | return child; |
689 | 689 | ||
690 | listen_overflow: | 690 | listen_overflow: |
691 | if (!sysctl_tcp_abort_on_overflow) { | 691 | if (!sysctl_tcp_abort_on_overflow) { |
692 | inet_rsk(req)->acked = 1; | 692 | inet_rsk(req)->acked = 1; |
693 | return NULL; | 693 | return NULL; |
694 | } | 694 | } |
695 | 695 | ||
696 | embryonic_reset: | 696 | embryonic_reset: |
697 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | 697 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); |
698 | if (!(flg & TCP_FLAG_RST)) | 698 | if (!(flg & TCP_FLAG_RST)) |
699 | req->rsk_ops->send_reset(sk, skb); | 699 | req->rsk_ops->send_reset(sk, skb); |
700 | 700 | ||
701 | inet_csk_reqsk_queue_drop(sk, req, prev); | 701 | inet_csk_reqsk_queue_drop(sk, req, prev); |
702 | return NULL; | 702 | return NULL; |
703 | } | 703 | } |
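
The bare-ACK drop in the rskq_defer_accept branch above is what the TCP_DEFER_ACCEPT socket option buys a listener: accept() is not woken for a connection until data actually arrives. A minimal userspace sketch of enabling it (the timeout value and function name are illustrative, not taken from this commit):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Ask the kernel to hold embryonic connections until data arrives,
     * for up to ~5 seconds, instead of completing them on a bare ACK. */
    static int enable_defer_accept(int listen_fd)
    {
            int secs = 5;   /* illustrative timeout */

            return setsockopt(listen_fd, IPPROTO_TCP, TCP_DEFER_ACCEPT,
                              &secs, sizeof(secs));
    }
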
704 | 704 | ||
705 | /* | 705 | /* |
706 | * Queue segment on the new socket if the new socket is active, | 706 | * Queue segment on the new socket if the new socket is active, |
707 | * otherwise we just shortcircuit this and continue with | 707 | * otherwise we just shortcircuit this and continue with |
708 | * the new socket. | 708 | * the new socket. |
709 | */ | 709 | */ |
710 | 710 | ||
711 | int tcp_child_process(struct sock *parent, struct sock *child, | 711 | int tcp_child_process(struct sock *parent, struct sock *child, |
712 | struct sk_buff *skb) | 712 | struct sk_buff *skb) |
713 | { | 713 | { |
714 | int ret = 0; | 714 | int ret = 0; |
715 | int state = child->sk_state; | 715 | int state = child->sk_state; |
716 | 716 | ||
717 | if (!sock_owned_by_user(child)) { | 717 | if (!sock_owned_by_user(child)) { |
718 | ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), | 718 | ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), |
719 | skb->len); | 719 | skb->len); |
720 | /* Wakeup parent, send SIGIO */ | 720 | /* Wakeup parent, send SIGIO */ |
721 | if (state == TCP_SYN_RECV && child->sk_state != state) | 721 | if (state == TCP_SYN_RECV && child->sk_state != state) |
722 | parent->sk_data_ready(parent, 0); | 722 | parent->sk_data_ready(parent, 0); |
723 | } else { | 723 | } else { |
724 | /* Alas, it is possible again, because we do the lookup | 724 | /* Alas, it is possible again, because we do the lookup |
725 | * in the main socket hash table and the lock on the listening | 725 | * in the main socket hash table and the lock on the listening |
726 | * socket no longer protects us. | 726 | * socket no longer protects us. |
727 | */ | 727 | */ |
728 | sk_add_backlog(child, skb); | 728 | sk_add_backlog(child, skb); |
729 | } | 729 | } |
730 | 730 | ||
731 | bh_unlock_sock(child); | 731 | bh_unlock_sock(child); |
732 | sock_put(child); | 732 | sock_put(child); |
733 | return ret; | 733 | return ret; |
734 | } | 734 | } |
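
For context, the listening-side receive path pairs tcp_check_req() with tcp_child_process(): when the request lookup hands back a freshly minted child socket instead of the listener itself, the triggering segment is fed to that child here. A rough sketch of the call pattern, loosely modelled on the IPv4 receive path and not part of this diff (the helper name tcp_v4_hnd_req and the wrapper are assumptions for illustration):

    /* Sketch: how a listening socket's receive path hands a segment that
     * completes the handshake to the new child socket. */
    static int handle_listen_segment(struct sock *sk, struct sk_buff *skb)
    {
            struct sock *nsk = tcp_v4_hnd_req(sk, skb); /* may run tcp_check_req() */

            if (!nsk)
                    return 0;               /* request dropped, nothing to do */
            if (nsk != sk)                  /* got a child: feed it the segment */
                    return tcp_child_process(sk, nsk, skb); /* non-zero => send reset */
            return tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len);
    }
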
735 | 735 | ||
736 | EXPORT_SYMBOL(tcp_check_req); | 736 | EXPORT_SYMBOL(tcp_check_req); |
737 | EXPORT_SYMBOL(tcp_child_process); | 737 | EXPORT_SYMBOL(tcp_child_process); |
738 | EXPORT_SYMBOL(tcp_create_openreq_child); | 738 | EXPORT_SYMBOL(tcp_create_openreq_child); |
739 | EXPORT_SYMBOL(tcp_timewait_state_process); | 739 | EXPORT_SYMBOL(tcp_timewait_state_process); |
740 | 740 |
net/ipv4/tcp_yeah.c
1 | /* | 1 | /* |
2 | * | 2 | * |
3 | * YeAH TCP | 3 | * YeAH TCP |
4 | * | 4 | * |
5 | * For further details look at: | 5 | * For further details look at: |
6 | * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf | 6 | * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/skbuff.h> | 11 | #include <linux/skbuff.h> |
12 | #include <linux/inet_diag.h> | 12 | #include <linux/inet_diag.h> |
13 | 13 | ||
14 | #include <net/tcp.h> | 14 | #include <net/tcp.h> |
15 | 15 | ||
16 | #include "tcp_vegas.h" | 16 | #include "tcp_vegas.h" |
17 | 17 | ||
18 | #define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck | 18 | #define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck |
19 | #define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt | 19 | #define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt |
20 | #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss | 20 | #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss |
21 | #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion | 21 | #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion |
22 | #define TCP_YEAH_PHY 8 //lin maximum delta from base | 22 | #define TCP_YEAH_PHY 8 //lin maximum delta from base |
23 | #define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss | 23 | #define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss |
24 | #define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count | 24 | #define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count |
25 | 25 | ||
26 | #define TCP_SCALABLE_AI_CNT 100U | 26 | #define TCP_SCALABLE_AI_CNT 100U |
27 | 27 | ||
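
The //lin and //log tags above distinguish parameters used as plain divisors ("lin", a linear fraction) from parameters used as shift counts ("log", a power-of-two fraction). The early-decongestion step later in this file combines one of each; a small sketch of that arithmetic (the helper name is invented; u32 and min() come from the kernel headers this file already includes):

    /* Remove a linear fraction of the estimated queue, but never more
     * than a power-of-two fraction of the current congestion window. */
    static inline u32 yeah_early_reduction(u32 queue, u32 cwnd)
    {
            return min(queue / TCP_YEAH_GAMMA, cwnd >> TCP_YEAH_EPSILON);
    }
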
28 | /* YeAH variables */ | 28 | /* YeAH variables */ |
29 | struct yeah { | 29 | struct yeah { |
30 | struct vegas vegas; /* must be first */ | 30 | struct vegas vegas; /* must be first */ |
31 | 31 | ||
32 | /* YeAH */ | 32 | /* YeAH */ |
33 | u32 lastQ; | 33 | u32 lastQ; |
34 | u32 doing_reno_now; | 34 | u32 doing_reno_now; |
35 | 35 | ||
36 | u32 reno_count; | 36 | u32 reno_count; |
37 | u32 fast_count; | 37 | u32 fast_count; |
38 | 38 | ||
39 | u32 pkts_acked; | 39 | u32 pkts_acked; |
40 | }; | 40 | }; |
41 | 41 | ||
42 | static void tcp_yeah_init(struct sock *sk) | 42 | static void tcp_yeah_init(struct sock *sk) |
43 | { | 43 | { |
44 | struct tcp_sock *tp = tcp_sk(sk); | 44 | struct tcp_sock *tp = tcp_sk(sk); |
45 | struct yeah *yeah = inet_csk_ca(sk); | 45 | struct yeah *yeah = inet_csk_ca(sk); |
46 | 46 | ||
47 | tcp_vegas_init(sk); | 47 | tcp_vegas_init(sk); |
48 | 48 | ||
49 | yeah->doing_reno_now = 0; | 49 | yeah->doing_reno_now = 0; |
50 | yeah->lastQ = 0; | 50 | yeah->lastQ = 0; |
51 | 51 | ||
52 | yeah->reno_count = 2; | 52 | yeah->reno_count = 2; |
53 | 53 | ||
54 | /* Ensure the MD arithmetic works. This is somewhat pedantic, | 54 | /* Ensure the MD arithmetic works. This is somewhat pedantic, |
55 | * since I don't think we will see a cwnd this large. :) */ | 55 | * since I don't think we will see a cwnd this large. :) */ |
56 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); | 56 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); |
57 | 57 | ||
58 | } | 58 | } |
59 | 59 | ||
60 | 60 | ||
61 | static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) | 61 | static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) |
62 | { | 62 | { |
63 | const struct inet_connection_sock *icsk = inet_csk(sk); | 63 | const struct inet_connection_sock *icsk = inet_csk(sk); |
64 | struct yeah *yeah = inet_csk_ca(sk); | 64 | struct yeah *yeah = inet_csk_ca(sk); |
65 | 65 | ||
66 | if (icsk->icsk_ca_state == TCP_CA_Open) | 66 | if (icsk->icsk_ca_state == TCP_CA_Open) |
67 | yeah->pkts_acked = pkts_acked; | 67 | yeah->pkts_acked = pkts_acked; |
68 | 68 | ||
69 | tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); | 69 | tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); |
70 | } | 70 | } |
71 | 71 | ||
72 | static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | 72 | static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) |
73 | { | 73 | { |
74 | struct tcp_sock *tp = tcp_sk(sk); | 74 | struct tcp_sock *tp = tcp_sk(sk); |
75 | struct yeah *yeah = inet_csk_ca(sk); | 75 | struct yeah *yeah = inet_csk_ca(sk); |
76 | 76 | ||
77 | if (!tcp_is_cwnd_limited(sk, in_flight)) | 77 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
78 | return; | 78 | return; |
79 | 79 | ||
80 | if (tp->snd_cwnd <= tp->snd_ssthresh) | 80 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
81 | tcp_slow_start(tp); | 81 | tcp_slow_start(tp); |
82 | 82 | ||
83 | else if (!yeah->doing_reno_now) { | 83 | else if (!yeah->doing_reno_now) { |
84 | /* Scalable */ | 84 | /* Scalable */ |
85 | 85 | ||
86 | tp->snd_cwnd_cnt+=yeah->pkts_acked; | 86 | tp->snd_cwnd_cnt += yeah->pkts_acked; |
87 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | 87 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ |
88 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 88 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
89 | tp->snd_cwnd++; | 89 | tp->snd_cwnd++; |
90 | tp->snd_cwnd_cnt = 0; | 90 | tp->snd_cwnd_cnt = 0; |
91 | } | 91 | } |
92 | 92 | ||
93 | yeah->pkts_acked = 1; | 93 | yeah->pkts_acked = 1; |
94 | 94 | ||
95 | } else { | 95 | } else { |
96 | /* Reno */ | 96 | /* Reno */ |
97 | 97 | ||
98 | if (tp->snd_cwnd_cnt < tp->snd_cwnd) | 98 | if (tp->snd_cwnd_cnt < tp->snd_cwnd) |
99 | tp->snd_cwnd_cnt++; | 99 | tp->snd_cwnd_cnt++; |
100 | 100 | ||
101 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | 101 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { |
102 | tp->snd_cwnd++; | 102 | tp->snd_cwnd++; |
103 | tp->snd_cwnd_cnt = 0; | 103 | tp->snd_cwnd_cnt = 0; |
104 | } | 104 | } |
105 | } | 105 | } |
106 | 106 | ||
107 | /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. | 107 | /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. |
108 | * | 108 | * |
109 | * These are so named because they represent the approximate values | 109 | * These are so named because they represent the approximate values |
110 | * of snd_una and snd_nxt at the beginning of the current RTT. More | 110 | * of snd_una and snd_nxt at the beginning of the current RTT. More |
111 | * precisely, they represent the amount of data sent during the RTT. | 111 | * precisely, they represent the amount of data sent during the RTT. |
112 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | 112 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, |
113 | * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding | 113 | * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding |
114 | * bytes of data have been ACKed during the course of the RTT, giving | 114 | * bytes of data have been ACKed during the course of the RTT, giving |
115 | * an "actual" rate of: | 115 | * an "actual" rate of: |
116 | * | 116 | * |
117 | * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) | 117 | * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) |
118 | * | 118 | * |
119 | * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, | 119 | * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, |
120 | * because delayed ACKs can cover more than one segment, so they | 120 | * because delayed ACKs can cover more than one segment, so they |
121 | * don't line up exactly with the boundaries of RTTs. | 121 | * don't line up exactly with the boundaries of RTTs. |
122 | * | 122 | * |
123 | * Another unfortunate fact of life is that delayed ACKs delay the | 123 | * Another unfortunate fact of life is that delayed ACKs delay the |
124 | * advance of the left edge of our send window, so that the number | 124 | * advance of the left edge of our send window, so that the number |
125 | * of bytes we send in an RTT is often less than our cwnd will allow. | 125 | * of bytes we send in an RTT is often less than our cwnd will allow. |
126 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | 126 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. |
127 | */ | 127 | */ |
128 | 128 | ||
129 | if (after(ack, yeah->vegas.beg_snd_nxt)) { | 129 | if (after(ack, yeah->vegas.beg_snd_nxt)) { |
130 | 130 | ||
131 | /* We do the Vegas calculations only if we got enough RTT | 131 | /* We do the Vegas calculations only if we got enough RTT |
132 | * samples that we can be reasonably sure that we got | 132 | * samples that we can be reasonably sure that we got |
133 | * at least one RTT sample that wasn't from a delayed ACK. | 133 | * at least one RTT sample that wasn't from a delayed ACK. |
134 | * If we only had 2 samples total, | 134 | * If we only had 2 samples total, |
135 | * then that means we're getting only 1 ACK per RTT, which | 135 | * then that means we're getting only 1 ACK per RTT, which |
136 | * means they're almost certainly delayed ACKs. | 136 | * means they're almost certainly delayed ACKs. |
137 | * If we have 3 samples, we should be OK. | 137 | * If we have 3 samples, we should be OK. |
138 | */ | 138 | */ |
139 | 139 | ||
140 | if (yeah->vegas.cntRTT > 2) { | 140 | if (yeah->vegas.cntRTT > 2) { |
141 | u32 rtt, queue; | 141 | u32 rtt, queue; |
142 | u64 bw; | 142 | u64 bw; |
143 | 143 | ||
144 | /* We have enough RTT samples, so, using the Vegas | 144 | /* We have enough RTT samples, so, using the Vegas |
145 | * algorithm, we determine if we should increase or | 145 | * algorithm, we determine if we should increase or |
146 | * decrease cwnd, and by how much. | 146 | * decrease cwnd, and by how much. |
147 | */ | 147 | */ |
148 | 148 | ||
149 | /* Pluck out the RTT we are using for the Vegas | 149 | /* Pluck out the RTT we are using for the Vegas |
150 | * calculations. This is the min RTT seen during the | 150 | * calculations. This is the min RTT seen during the |
151 | * last RTT. Taking the min filters out the effects | 151 | * last RTT. Taking the min filters out the effects |
152 | * of delayed ACKs, at the cost of noticing congestion | 152 | * of delayed ACKs, at the cost of noticing congestion |
153 | * a bit later. | 153 | * a bit later. |
154 | */ | 154 | */ |
155 | rtt = yeah->vegas.minRTT; | 155 | rtt = yeah->vegas.minRTT; |
156 | 156 | ||
157 | /* Compute excess number of packets above bandwidth | 157 | /* Compute excess number of packets above bandwidth |
158 | * Avoid doing full 64 bit divide. | 158 | * Avoid doing full 64 bit divide. |
159 | */ | 159 | */ |
160 | bw = tp->snd_cwnd; | 160 | bw = tp->snd_cwnd; |
161 | bw *= rtt - yeah->vegas.baseRTT; | 161 | bw *= rtt - yeah->vegas.baseRTT; |
162 | do_div(bw, rtt); | 162 | do_div(bw, rtt); |
163 | queue = bw; | 163 | queue = bw; |
164 | 164 | ||
165 | if (queue > TCP_YEAH_ALPHA || | 165 | if (queue > TCP_YEAH_ALPHA || |
166 | rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { | 166 | rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { |
167 | if (queue > TCP_YEAH_ALPHA | 167 | if (queue > TCP_YEAH_ALPHA |
168 | && tp->snd_cwnd > yeah->reno_count) { | 168 | && tp->snd_cwnd > yeah->reno_count) { |
169 | u32 reduction = min(queue / TCP_YEAH_GAMMA , | 169 | u32 reduction = min(queue / TCP_YEAH_GAMMA , |
170 | tp->snd_cwnd >> TCP_YEAH_EPSILON); | 170 | tp->snd_cwnd >> TCP_YEAH_EPSILON); |
171 | 171 | ||
172 | tp->snd_cwnd -= reduction; | 172 | tp->snd_cwnd -= reduction; |
173 | 173 | ||
174 | tp->snd_cwnd = max(tp->snd_cwnd, | 174 | tp->snd_cwnd = max(tp->snd_cwnd, |
175 | yeah->reno_count); | 175 | yeah->reno_count); |
176 | 176 | ||
177 | tp->snd_ssthresh = tp->snd_cwnd; | 177 | tp->snd_ssthresh = tp->snd_cwnd; |
178 | } | 178 | } |
179 | 179 | ||
180 | if (yeah->reno_count <= 2) | 180 | if (yeah->reno_count <= 2) |
181 | yeah->reno_count = max(tp->snd_cwnd>>1, 2U); | 181 | yeah->reno_count = max(tp->snd_cwnd>>1, 2U); |
182 | else | 182 | else |
183 | yeah->reno_count++; | 183 | yeah->reno_count++; |
184 | 184 | ||
185 | yeah->doing_reno_now = min(yeah->doing_reno_now + 1, | 185 | yeah->doing_reno_now = min(yeah->doing_reno_now + 1, |
186 | 0xffffffU); | 186 | 0xffffffU); |
187 | } else { | 187 | } else { |
188 | yeah->fast_count++; | 188 | yeah->fast_count++; |
189 | 189 | ||
190 | if (yeah->fast_count > TCP_YEAH_ZETA) { | 190 | if (yeah->fast_count > TCP_YEAH_ZETA) { |
191 | yeah->reno_count = 2; | 191 | yeah->reno_count = 2; |
192 | yeah->fast_count = 0; | 192 | yeah->fast_count = 0; |
193 | } | 193 | } |
194 | 194 | ||
195 | yeah->doing_reno_now = 0; | 195 | yeah->doing_reno_now = 0; |
196 | } | 196 | } |
197 | 197 | ||
198 | yeah->lastQ = queue; | 198 | yeah->lastQ = queue; |
199 | 199 | ||
200 | } | 200 | } |
201 | 201 | ||
202 | /* Save the extent of the current window so we can use this | 202 | /* Save the extent of the current window so we can use this |
203 | * at the end of the next RTT. | 203 | * at the end of the next RTT. |
204 | */ | 204 | */ |
205 | yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; | 205 | yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; |
206 | yeah->vegas.beg_snd_nxt = tp->snd_nxt; | 206 | yeah->vegas.beg_snd_nxt = tp->snd_nxt; |
207 | yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; | 207 | yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; |
208 | 208 | ||
209 | /* Wipe the slate clean for the next RTT. */ | 209 | /* Wipe the slate clean for the next RTT. */ |
210 | yeah->vegas.cntRTT = 0; | 210 | yeah->vegas.cntRTT = 0; |
211 | yeah->vegas.minRTT = 0x7fffffff; | 211 | yeah->vegas.minRTT = 0x7fffffff; |
212 | } | 212 | } |
213 | } | 213 | } |
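
To make the backlog estimate above concrete, here is a worked example with invented numbers: snd_cwnd = 80 packets, minRTT = 110 ms over the last RTT and baseRTT = 100 ms give queue = 80 * (110 - 100) / 110, about 7 packets. That is well below TCP_YEAH_ALPHA (80), and the 10 ms of extra delay is below baseRTT / TCP_YEAH_PHY (12 in integer arithmetic), so this RTT bumps fast_count instead of triggering precautionary decongestion. A standalone sketch of the same arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    /* YeAH's bottleneck-queue estimate:
     * packets queued ~= cwnd * (rtt - base_rtt) / rtt.
     * Any consistent time unit works, since the units cancel. */
    static uint32_t yeah_queue_estimate(uint32_t cwnd, uint32_t rtt,
                                        uint32_t base_rtt)
    {
            uint64_t bw = (uint64_t)cwnd * (rtt - base_rtt);

            return (uint32_t)(bw / rtt);
    }

    int main(void)
    {
            /* Invented numbers: 80-packet window, 110 ms RTT vs 100 ms base. */
            printf("queue ~ %u packets\n",
                   (unsigned)yeah_queue_estimate(80, 110, 100));
            return 0;
    }
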
214 | 214 | ||
215 | static u32 tcp_yeah_ssthresh(struct sock *sk) { | 215 | static u32 tcp_yeah_ssthresh(struct sock *sk) { |
216 | const struct tcp_sock *tp = tcp_sk(sk); | 216 | const struct tcp_sock *tp = tcp_sk(sk); |
217 | struct yeah *yeah = inet_csk_ca(sk); | 217 | struct yeah *yeah = inet_csk_ca(sk); |
218 | u32 reduction; | 218 | u32 reduction; |
219 | 219 | ||
220 | if (yeah->doing_reno_now < TCP_YEAH_RHO) { | 220 | if (yeah->doing_reno_now < TCP_YEAH_RHO) { |
221 | reduction = yeah->lastQ; | 221 | reduction = yeah->lastQ; |
222 | 222 | ||
223 | reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); | 223 | reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); |
224 | 224 | ||
225 | reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); | 225 | reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); |
226 | } else | 226 | } else |
227 | reduction = max(tp->snd_cwnd>>1,2U); | 227 | reduction = max(tp->snd_cwnd>>1, 2U); |
228 | 228 | ||
229 | yeah->fast_count = 0; | 229 | yeah->fast_count = 0; |
230 | yeah->reno_count = max(yeah->reno_count>>1, 2U); | 230 | yeah->reno_count = max(yeah->reno_count>>1, 2U); |
231 | 231 | ||
232 | return tp->snd_cwnd - reduction; | 232 | return tp->snd_cwnd - reduction; |
233 | } | 233 | } |
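
A quick trace of the threshold above with invented numbers: while doing_reno_now is still below TCP_YEAH_RHO (16), a window of snd_cwnd = 100 with lastQ = 30 gives reduction = max(min(30, max(50, 2)), 100 >> TCP_YEAH_DELTA) = max(30, 12) = 30, so the new ssthresh is 70. Once Reno mode has persisted for RHO or more RTTs, the fallback simply halves the window, giving ssthresh = 50.
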
234 | 234 | ||
235 | static struct tcp_congestion_ops tcp_yeah = { | 235 | static struct tcp_congestion_ops tcp_yeah = { |
236 | .flags = TCP_CONG_RTT_STAMP, | 236 | .flags = TCP_CONG_RTT_STAMP, |
237 | .init = tcp_yeah_init, | 237 | .init = tcp_yeah_init, |
238 | .ssthresh = tcp_yeah_ssthresh, | 238 | .ssthresh = tcp_yeah_ssthresh, |
239 | .cong_avoid = tcp_yeah_cong_avoid, | 239 | .cong_avoid = tcp_yeah_cong_avoid, |
240 | .min_cwnd = tcp_reno_min_cwnd, | 240 | .min_cwnd = tcp_reno_min_cwnd, |
241 | .set_state = tcp_vegas_state, | 241 | .set_state = tcp_vegas_state, |
242 | .cwnd_event = tcp_vegas_cwnd_event, | 242 | .cwnd_event = tcp_vegas_cwnd_event, |
243 | .get_info = tcp_vegas_get_info, | 243 | .get_info = tcp_vegas_get_info, |
244 | .pkts_acked = tcp_yeah_pkts_acked, | 244 | .pkts_acked = tcp_yeah_pkts_acked, |
245 | 245 | ||
246 | .owner = THIS_MODULE, | 246 | .owner = THIS_MODULE, |
247 | .name = "yeah", | 247 | .name = "yeah", |
248 | }; | 248 | }; |
249 | 249 | ||
250 | static int __init tcp_yeah_register(void) | 250 | static int __init tcp_yeah_register(void) |
251 | { | 251 | { |
252 | BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); | 252 | BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); |
253 | tcp_register_congestion_control(&tcp_yeah); | 253 | tcp_register_congestion_control(&tcp_yeah); |
254 | return 0; | 254 | return 0; |
255 | } | 255 | } |
256 | 256 | ||
257 | static void __exit tcp_yeah_unregister(void) | 257 | static void __exit tcp_yeah_unregister(void) |
258 | { | 258 | { |
259 | tcp_unregister_congestion_control(&tcp_yeah); | 259 | tcp_unregister_congestion_control(&tcp_yeah); |
260 | } | 260 | } |
261 | 261 | ||
262 | module_init(tcp_yeah_register); | 262 | module_init(tcp_yeah_register); |
263 | module_exit(tcp_yeah_unregister); | 263 | module_exit(tcp_yeah_unregister); |
264 | 264 | ||
265 | MODULE_AUTHOR("Angelo P. Castellani"); | 265 | MODULE_AUTHOR("Angelo P. Castellani"); |
266 | MODULE_LICENSE("GPL"); | 266 | MODULE_LICENSE("GPL"); |
267 | MODULE_DESCRIPTION("YeAH TCP"); | 267 | MODULE_DESCRIPTION("YeAH TCP"); |
268 | 268 |
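
Once this module is registered, the algorithm can be selected system-wide through the net.ipv4.tcp_congestion_control sysctl or per connection with the TCP_CONGESTION socket option. A userspace sketch (the function name and lack of error handling are illustrative only):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Run this connection under the "yeah" congestion control module
     * (requires tcp_yeah to be built in or loaded). */
    static int use_yeah(int sock_fd)
    {
            const char name[] = "yeah";

            return setsockopt(sock_fd, IPPROTO_TCP, TCP_CONGESTION,
                              name, strlen(name));
    }
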
net/ipv4/xfrm4_policy.c
1 | /* | 1 | /* |
2 | * xfrm4_policy.c | 2 | * xfrm4_policy.c |
3 | * | 3 | * |
4 | * Changes: | 4 | * Changes: |
5 | * Kazunori MIYAZAWA @USAGI | 5 | * Kazunori MIYAZAWA @USAGI |
6 | * YOSHIFUJI Hideaki @USAGI | 6 | * YOSHIFUJI Hideaki @USAGI |
7 | * Split up af-specific portion | 7 | * Split up af-specific portion |
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/err.h> | 11 | #include <linux/err.h> |
12 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
13 | #include <linux/inetdevice.h> | 13 | #include <linux/inetdevice.h> |
14 | #include <net/dst.h> | 14 | #include <net/dst.h> |
15 | #include <net/xfrm.h> | 15 | #include <net/xfrm.h> |
16 | #include <net/ip.h> | 16 | #include <net/ip.h> |
17 | 17 | ||
18 | static struct dst_ops xfrm4_dst_ops; | 18 | static struct dst_ops xfrm4_dst_ops; |
19 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; | 19 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; |
20 | 20 | ||
21 | static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, | 21 | static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, |
22 | xfrm_address_t *daddr) | 22 | xfrm_address_t *daddr) |
23 | { | 23 | { |
24 | struct flowi fl = { | 24 | struct flowi fl = { |
25 | .nl_u = { | 25 | .nl_u = { |
26 | .ip4_u = { | 26 | .ip4_u = { |
27 | .tos = tos, | 27 | .tos = tos, |
28 | .daddr = daddr->a4, | 28 | .daddr = daddr->a4, |
29 | }, | 29 | }, |
30 | }, | 30 | }, |
31 | }; | 31 | }; |
32 | struct dst_entry *dst; | 32 | struct dst_entry *dst; |
33 | struct rtable *rt; | 33 | struct rtable *rt; |
34 | int err; | 34 | int err; |
35 | 35 | ||
36 | if (saddr) | 36 | if (saddr) |
37 | fl.fl4_src = saddr->a4; | 37 | fl.fl4_src = saddr->a4; |
38 | 38 | ||
39 | err = __ip_route_output_key(&init_net, &rt, &fl); | 39 | err = __ip_route_output_key(&init_net, &rt, &fl); |
40 | dst = &rt->u.dst; | 40 | dst = &rt->u.dst; |
41 | if (err) | 41 | if (err) |
42 | dst = ERR_PTR(err); | 42 | dst = ERR_PTR(err); |
43 | return dst; | 43 | return dst; |
44 | } | 44 | } |
45 | 45 | ||
46 | static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) | 46 | static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) |
47 | { | 47 | { |
48 | struct dst_entry *dst; | 48 | struct dst_entry *dst; |
49 | struct rtable *rt; | 49 | struct rtable *rt; |
50 | 50 | ||
51 | dst = xfrm4_dst_lookup(0, NULL, daddr); | 51 | dst = xfrm4_dst_lookup(0, NULL, daddr); |
52 | if (IS_ERR(dst)) | 52 | if (IS_ERR(dst)) |
53 | return -EHOSTUNREACH; | 53 | return -EHOSTUNREACH; |
54 | 54 | ||
55 | rt = (struct rtable *)dst; | 55 | rt = (struct rtable *)dst; |
56 | saddr->a4 = rt->rt_src; | 56 | saddr->a4 = rt->rt_src; |
57 | dst_release(dst); | 57 | dst_release(dst); |
58 | return 0; | 58 | return 0; |
59 | } | 59 | } |
60 | 60 | ||
61 | static struct dst_entry * | 61 | static struct dst_entry * |
62 | __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) | 62 | __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) |
63 | { | 63 | { |
64 | struct dst_entry *dst; | 64 | struct dst_entry *dst; |
65 | 65 | ||
66 | read_lock_bh(&policy->lock); | 66 | read_lock_bh(&policy->lock); |
67 | for (dst = policy->bundles; dst; dst = dst->next) { | 67 | for (dst = policy->bundles; dst; dst = dst->next) { |
68 | struct xfrm_dst *xdst = (struct xfrm_dst*)dst; | 68 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
69 | if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ | 69 | if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ |
70 | xdst->u.rt.fl.fl4_dst == fl->fl4_dst && | 70 | xdst->u.rt.fl.fl4_dst == fl->fl4_dst && |
71 | xdst->u.rt.fl.fl4_src == fl->fl4_src && | 71 | xdst->u.rt.fl.fl4_src == fl->fl4_src && |
72 | xdst->u.rt.fl.fl4_tos == fl->fl4_tos && | 72 | xdst->u.rt.fl.fl4_tos == fl->fl4_tos && |
73 | xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { | 73 | xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { |
74 | dst_clone(dst); | 74 | dst_clone(dst); |
75 | break; | 75 | break; |
76 | } | 76 | } |
77 | } | 77 | } |
78 | read_unlock_bh(&policy->lock); | 78 | read_unlock_bh(&policy->lock); |
79 | return dst; | 79 | return dst; |
80 | } | 80 | } |
81 | 81 | ||
82 | static int xfrm4_get_tos(struct flowi *fl) | 82 | static int xfrm4_get_tos(struct flowi *fl) |
83 | { | 83 | { |
84 | return fl->fl4_tos; | 84 | return fl->fl4_tos; |
85 | } | 85 | } |
86 | 86 | ||
87 | static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, | 87 | static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, |
88 | int nfheader_len) | 88 | int nfheader_len) |
89 | { | 89 | { |
90 | return 0; | 90 | return 0; |
91 | } | 91 | } |
92 | 92 | ||
93 | static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) | 93 | static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) |
94 | { | 94 | { |
95 | struct rtable *rt = (struct rtable *)xdst->route; | 95 | struct rtable *rt = (struct rtable *)xdst->route; |
96 | 96 | ||
97 | xdst->u.rt.fl = rt->fl; | 97 | xdst->u.rt.fl = rt->fl; |
98 | 98 | ||
99 | xdst->u.dst.dev = dev; | 99 | xdst->u.dst.dev = dev; |
100 | dev_hold(dev); | 100 | dev_hold(dev); |
101 | 101 | ||
102 | xdst->u.rt.idev = in_dev_get(dev); | 102 | xdst->u.rt.idev = in_dev_get(dev); |
103 | if (!xdst->u.rt.idev) | 103 | if (!xdst->u.rt.idev) |
104 | return -ENODEV; | 104 | return -ENODEV; |
105 | 105 | ||
106 | xdst->u.rt.peer = rt->peer; | 106 | xdst->u.rt.peer = rt->peer; |
107 | if (rt->peer) | 107 | if (rt->peer) |
108 | atomic_inc(&rt->peer->refcnt); | 108 | atomic_inc(&rt->peer->refcnt); |
109 | 109 | ||
110 | /* Sheit... I remember I did this right. Apparently, | 110 | /* Sheit... I remember I did this right. Apparently, |
111 | * it was magically lost, so this code needs audit */ | 111 | * it was magically lost, so this code needs audit */ |
112 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | | 112 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | |
113 | RTCF_LOCAL); | 113 | RTCF_LOCAL); |
114 | xdst->u.rt.rt_type = rt->rt_type; | 114 | xdst->u.rt.rt_type = rt->rt_type; |
115 | xdst->u.rt.rt_src = rt->rt_src; | 115 | xdst->u.rt.rt_src = rt->rt_src; |
116 | xdst->u.rt.rt_dst = rt->rt_dst; | 116 | xdst->u.rt.rt_dst = rt->rt_dst; |
117 | xdst->u.rt.rt_gateway = rt->rt_gateway; | 117 | xdst->u.rt.rt_gateway = rt->rt_gateway; |
118 | xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; | 118 | xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; |
119 | 119 | ||
120 | return 0; | 120 | return 0; |
121 | } | 121 | } |
122 | 122 | ||
123 | static void | 123 | static void |
124 | _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | 124 | _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) |
125 | { | 125 | { |
126 | struct iphdr *iph = ip_hdr(skb); | 126 | struct iphdr *iph = ip_hdr(skb); |
127 | u8 *xprth = skb_network_header(skb) + iph->ihl * 4; | 127 | u8 *xprth = skb_network_header(skb) + iph->ihl * 4; |
128 | 128 | ||
129 | memset(fl, 0, sizeof(struct flowi)); | 129 | memset(fl, 0, sizeof(struct flowi)); |
130 | if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { | 130 | if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { |
131 | switch (iph->protocol) { | 131 | switch (iph->protocol) { |
132 | case IPPROTO_UDP: | 132 | case IPPROTO_UDP: |
133 | case IPPROTO_UDPLITE: | 133 | case IPPROTO_UDPLITE: |
134 | case IPPROTO_TCP: | 134 | case IPPROTO_TCP: |
135 | case IPPROTO_SCTP: | 135 | case IPPROTO_SCTP: |
136 | case IPPROTO_DCCP: | 136 | case IPPROTO_DCCP: |
137 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 137 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
138 | __be16 *ports = (__be16 *)xprth; | 138 | __be16 *ports = (__be16 *)xprth; |
139 | 139 | ||
140 | fl->fl_ip_sport = ports[!!reverse]; | 140 | fl->fl_ip_sport = ports[!!reverse]; |
141 | fl->fl_ip_dport = ports[!reverse]; | 141 | fl->fl_ip_dport = ports[!reverse]; |
142 | } | 142 | } |
143 | break; | 143 | break; |
144 | 144 | ||
145 | case IPPROTO_ICMP: | 145 | case IPPROTO_ICMP: |
146 | if (pskb_may_pull(skb, xprth + 2 - skb->data)) { | 146 | if (pskb_may_pull(skb, xprth + 2 - skb->data)) { |
147 | u8 *icmp = xprth; | 147 | u8 *icmp = xprth; |
148 | 148 | ||
149 | fl->fl_icmp_type = icmp[0]; | 149 | fl->fl_icmp_type = icmp[0]; |
150 | fl->fl_icmp_code = icmp[1]; | 150 | fl->fl_icmp_code = icmp[1]; |
151 | } | 151 | } |
152 | break; | 152 | break; |
153 | 153 | ||
154 | case IPPROTO_ESP: | 154 | case IPPROTO_ESP: |
155 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 155 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
156 | __be32 *ehdr = (__be32 *)xprth; | 156 | __be32 *ehdr = (__be32 *)xprth; |
157 | 157 | ||
158 | fl->fl_ipsec_spi = ehdr[0]; | 158 | fl->fl_ipsec_spi = ehdr[0]; |
159 | } | 159 | } |
160 | break; | 160 | break; |
161 | 161 | ||
162 | case IPPROTO_AH: | 162 | case IPPROTO_AH: |
163 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { | 163 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { |
164 | __be32 *ah_hdr = (__be32*)xprth; | 164 | __be32 *ah_hdr = (__be32*)xprth; |
165 | 165 | ||
166 | fl->fl_ipsec_spi = ah_hdr[1]; | 166 | fl->fl_ipsec_spi = ah_hdr[1]; |
167 | } | 167 | } |
168 | break; | 168 | break; |
169 | 169 | ||
170 | case IPPROTO_COMP: | 170 | case IPPROTO_COMP: |
171 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 171 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
172 | __be16 *ipcomp_hdr = (__be16 *)xprth; | 172 | __be16 *ipcomp_hdr = (__be16 *)xprth; |
173 | 173 | ||
174 | fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); | 174 | fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); |
175 | } | 175 | } |
176 | break; | 176 | break; |
177 | default: | 177 | default: |
178 | fl->fl_ipsec_spi = 0; | 178 | fl->fl_ipsec_spi = 0; |
179 | break; | 179 | break; |
180 | } | 180 | } |
181 | } | 181 | } |
182 | fl->proto = iph->protocol; | 182 | fl->proto = iph->protocol; |
183 | fl->fl4_dst = reverse ? iph->saddr : iph->daddr; | 183 | fl->fl4_dst = reverse ? iph->saddr : iph->daddr; |
184 | fl->fl4_src = reverse ? iph->daddr : iph->saddr; | 184 | fl->fl4_src = reverse ? iph->daddr : iph->saddr; |
185 | fl->fl4_tos = iph->tos; | 185 | fl->fl4_tos = iph->tos; |
186 | } | 186 | } |
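
The ports[!!reverse] / ports[!reverse] indexing above works because every transport protocol listed stores the source port and then the destination port as the first two __be16 fields of its header: with reverse == 0 the selector reads ports[0] as the source and ports[1] as the destination, and any non-zero reverse swaps them. A standalone illustration of the same trick (struct and variable names are invented):

    #include <stdint.h>
    #include <stdio.h>

    /* First two 16-bit fields of TCP/UDP/UDP-Lite/SCTP/DCCP headers. */
    struct ports_hdr {
            uint16_t sport;
            uint16_t dport;
    };

    static void pick_ports(const struct ports_hdr *hdr, int reverse,
                           uint16_t *flow_sport, uint16_t *flow_dport)
    {
            const uint16_t *p = (const uint16_t *)hdr;

            /* !!reverse is 0 or 1, so the two lookups swap when reversed. */
            *flow_sport = p[!!reverse];
            *flow_dport = p[!reverse];
    }

    int main(void)
    {
            struct ports_hdr hdr = { .sport = 12345, .dport = 80 };
            uint16_t s, d;

            pick_ports(&hdr, 1, &s, &d);    /* reverse-direction flow */
            printf("sport=%u dport=%u\n", (unsigned)s, (unsigned)d); /* 80, 12345 */
            return 0;
    }
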
187 | 187 | ||
188 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) | 188 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) |
189 | { | 189 | { |
190 | xfrm4_policy_afinfo.garbage_collect(); | 190 | xfrm4_policy_afinfo.garbage_collect(); |
191 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); | 191 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); |
192 | } | 192 | } |
193 | 193 | ||
194 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) | 194 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) |
195 | { | 195 | { |
196 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | 196 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
197 | struct dst_entry *path = xdst->route; | 197 | struct dst_entry *path = xdst->route; |
198 | 198 | ||
199 | path->ops->update_pmtu(path, mtu); | 199 | path->ops->update_pmtu(path, mtu); |
200 | } | 200 | } |
201 | 201 | ||
202 | static void xfrm4_dst_destroy(struct dst_entry *dst) | 202 | static void xfrm4_dst_destroy(struct dst_entry *dst) |
203 | { | 203 | { |
204 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | 204 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
205 | 205 | ||
206 | if (likely(xdst->u.rt.idev)) | 206 | if (likely(xdst->u.rt.idev)) |
207 | in_dev_put(xdst->u.rt.idev); | 207 | in_dev_put(xdst->u.rt.idev); |
208 | if (likely(xdst->u.rt.peer)) | 208 | if (likely(xdst->u.rt.peer)) |
209 | inet_putpeer(xdst->u.rt.peer); | 209 | inet_putpeer(xdst->u.rt.peer); |
210 | xfrm_dst_destroy(xdst); | 210 | xfrm_dst_destroy(xdst); |
211 | } | 211 | } |
212 | 212 | ||
213 | static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | 213 | static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, |
214 | int unregister) | 214 | int unregister) |
215 | { | 215 | { |
216 | struct xfrm_dst *xdst; | 216 | struct xfrm_dst *xdst; |
217 | 217 | ||
218 | if (!unregister) | 218 | if (!unregister) |
219 | return; | 219 | return; |
220 | 220 | ||
221 | xdst = (struct xfrm_dst *)dst; | 221 | xdst = (struct xfrm_dst *)dst; |
222 | if (xdst->u.rt.idev->dev == dev) { | 222 | if (xdst->u.rt.idev->dev == dev) { |
223 | struct in_device *loopback_idev = | 223 | struct in_device *loopback_idev = |
224 | in_dev_get(dev_net(dev)->loopback_dev); | 224 | in_dev_get(dev_net(dev)->loopback_dev); |
225 | BUG_ON(!loopback_idev); | 225 | BUG_ON(!loopback_idev); |
226 | 226 | ||
227 | do { | 227 | do { |
228 | in_dev_put(xdst->u.rt.idev); | 228 | in_dev_put(xdst->u.rt.idev); |
229 | xdst->u.rt.idev = loopback_idev; | 229 | xdst->u.rt.idev = loopback_idev; |
230 | in_dev_hold(loopback_idev); | 230 | in_dev_hold(loopback_idev); |
231 | xdst = (struct xfrm_dst *)xdst->u.dst.child; | 231 | xdst = (struct xfrm_dst *)xdst->u.dst.child; |
232 | } while (xdst->u.dst.xfrm); | 232 | } while (xdst->u.dst.xfrm); |
233 | 233 | ||
234 | __in_dev_put(loopback_idev); | 234 | __in_dev_put(loopback_idev); |
235 | } | 235 | } |
236 | 236 | ||
237 | xfrm_dst_ifdown(dst, dev); | 237 | xfrm_dst_ifdown(dst, dev); |
238 | } | 238 | } |
239 | 239 | ||
240 | static struct dst_ops xfrm4_dst_ops = { | 240 | static struct dst_ops xfrm4_dst_ops = { |
241 | .family = AF_INET, | 241 | .family = AF_INET, |
242 | .protocol = __constant_htons(ETH_P_IP), | 242 | .protocol = __constant_htons(ETH_P_IP), |
243 | .gc = xfrm4_garbage_collect, | 243 | .gc = xfrm4_garbage_collect, |
244 | .update_pmtu = xfrm4_update_pmtu, | 244 | .update_pmtu = xfrm4_update_pmtu, |
245 | .destroy = xfrm4_dst_destroy, | 245 | .destroy = xfrm4_dst_destroy, |
246 | .ifdown = xfrm4_dst_ifdown, | 246 | .ifdown = xfrm4_dst_ifdown, |
247 | .local_out = __ip_local_out, | 247 | .local_out = __ip_local_out, |
248 | .gc_thresh = 1024, | 248 | .gc_thresh = 1024, |
249 | .entry_size = sizeof(struct xfrm_dst), | 249 | .entry_size = sizeof(struct xfrm_dst), |
250 | .entries = ATOMIC_INIT(0), | 250 | .entries = ATOMIC_INIT(0), |
251 | }; | 251 | }; |
252 | 252 | ||
253 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { | 253 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |
254 | .family = AF_INET, | 254 | .family = AF_INET, |
255 | .dst_ops = &xfrm4_dst_ops, | 255 | .dst_ops = &xfrm4_dst_ops, |
256 | .dst_lookup = xfrm4_dst_lookup, | 256 | .dst_lookup = xfrm4_dst_lookup, |
257 | .get_saddr = xfrm4_get_saddr, | 257 | .get_saddr = xfrm4_get_saddr, |
258 | .find_bundle = __xfrm4_find_bundle, | 258 | .find_bundle = __xfrm4_find_bundle, |
259 | .decode_session = _decode_session4, | 259 | .decode_session = _decode_session4, |
260 | .get_tos = xfrm4_get_tos, | 260 | .get_tos = xfrm4_get_tos, |
261 | .init_path = xfrm4_init_path, | 261 | .init_path = xfrm4_init_path, |
262 | .fill_dst = xfrm4_fill_dst, | 262 | .fill_dst = xfrm4_fill_dst, |
263 | }; | 263 | }; |
264 | 264 | ||
265 | static void __init xfrm4_policy_init(void) | 265 | static void __init xfrm4_policy_init(void) |
266 | { | 266 | { |
267 | xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); | 267 | xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); |
268 | } | 268 | } |
269 | 269 | ||
270 | static void __exit xfrm4_policy_fini(void) | 270 | static void __exit xfrm4_policy_fini(void) |
271 | { | 271 | { |
272 | xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); | 272 | xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); |
273 | } | 273 | } |
274 | 274 | ||
275 | void __init xfrm4_init(void) | 275 | void __init xfrm4_init(void) |
276 | { | 276 | { |
277 | xfrm4_state_init(); | 277 | xfrm4_state_init(); |
278 | xfrm4_policy_init(); | 278 | xfrm4_policy_init(); |
279 | } | 279 | } |
280 | 280 | ||
281 | 281 |