Commit 5a5f3a8db9d70c90e9d55b46e02b2d8deb1c2c2e
Committed by David S. Miller
1 parent d9319100c1
Exists in master and in 7 other branches
net: clean up net/ipv4/ipip.c raw.c tcp.c tcp_minisocks.c tcp_yeah.c xfrm4_policy.c
Signed-off-by: Jianjun Kong <jianjun@zeuux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 6 changed files with 12 additions and 12 deletions
net/ipv4/ipip.c
1 | /* | 1 | /* |
2 | * Linux NET3: IP/IP protocol decoder. | 2 | * Linux NET3: IP/IP protocol decoder. |
3 | * | 3 | * |
4 | * Authors: | 4 | * Authors: |
5 | * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 | 5 | * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 |
6 | * | 6 | * |
7 | * Fixes: | 7 | * Fixes: |
8 | * Alan Cox : Merged and made usable non modular (its so tiny its silly as | 8 | * Alan Cox : Merged and made usable non modular (its so tiny its silly as |
9 | * a module taking up 2 pages). | 9 | * a module taking up 2 pages). |
10 | * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) | 10 | * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) |
11 | * to keep ip_forward happy. | 11 | * to keep ip_forward happy. |
12 | * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). | 12 | * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). |
13 | * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL | 13 | * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL |
14 | * David Woodhouse : Perform some basic ICMP handling. | 14 | * David Woodhouse : Perform some basic ICMP handling. |
15 | * IPIP Routing without decapsulation. | 15 | * IPIP Routing without decapsulation. |
16 | * Carlos Picoto : GRE over IP support | 16 | * Carlos Picoto : GRE over IP support |
17 | * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. | 17 | * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. |
18 | * I do not want to merge them together. | 18 | * I do not want to merge them together. |
19 | * | 19 | * |
20 | * This program is free software; you can redistribute it and/or | 20 | * This program is free software; you can redistribute it and/or |
21 | * modify it under the terms of the GNU General Public License | 21 | * modify it under the terms of the GNU General Public License |
22 | * as published by the Free Software Foundation; either version | 22 | * as published by the Free Software Foundation; either version |
23 | * 2 of the License, or (at your option) any later version. | 23 | * 2 of the License, or (at your option) any later version. |
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* tunnel.c: an IP tunnel driver | 27 | /* tunnel.c: an IP tunnel driver |
28 | 28 | ||
29 | The purpose of this driver is to provide an IP tunnel through | 29 | The purpose of this driver is to provide an IP tunnel through |
30 | which you can tunnel network traffic transparently across subnets. | 30 | which you can tunnel network traffic transparently across subnets. |
31 | 31 | ||
32 | This was written by looking at Nick Holloway's dummy driver | 32 | This was written by looking at Nick Holloway's dummy driver |
33 | Thanks for the great code! | 33 | Thanks for the great code! |
34 | 34 | ||
35 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 | 35 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 |
36 | 36 | ||
37 | Minor tweaks: | 37 | Minor tweaks: |
38 | Cleaned up the code a little and added some pre-1.3.0 tweaks. | 38 | Cleaned up the code a little and added some pre-1.3.0 tweaks. |
39 | dev->hard_header/hard_header_len changed to use no headers. | 39 | dev->hard_header/hard_header_len changed to use no headers. |
40 | Comments/bracketing tweaked. | 40 | Comments/bracketing tweaked. |
41 | Made the tunnels use dev->name not tunnel: when error reporting. | 41 | Made the tunnels use dev->name not tunnel: when error reporting. |
42 | Added tx_dropped stat | 42 | Added tx_dropped stat |
43 | 43 | ||
44 | -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 | 44 | -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 |
45 | 45 | ||
46 | Reworked: | 46 | Reworked: |
47 | Changed to tunnel to destination gateway in addition to the | 47 | Changed to tunnel to destination gateway in addition to the |
48 | tunnel's pointopoint address | 48 | tunnel's pointopoint address |
49 | Almost completely rewritten | 49 | Almost completely rewritten |
50 | Note: There is currently no firewall or ICMP handling done. | 50 | Note: There is currently no firewall or ICMP handling done. |
51 | 51 | ||
52 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 | 52 | -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 |
53 | 53 | ||
54 | */ | 54 | */ |
55 | 55 | ||
56 | /* Things I wish I had known when writing the tunnel driver: | 56 | /* Things I wish I had known when writing the tunnel driver: |
57 | 57 | ||
58 | When the tunnel_xmit() function is called, the skb contains the | 58 | When the tunnel_xmit() function is called, the skb contains the |
59 | packet to be sent (plus a great deal of extra info), and dev | 59 | packet to be sent (plus a great deal of extra info), and dev |
60 | contains the tunnel device that _we_ are. | 60 | contains the tunnel device that _we_ are. |
61 | 61 | ||
62 | When we are passed a packet, we are expected to fill in the | 62 | When we are passed a packet, we are expected to fill in the |
63 | source address with our source IP address. | 63 | source address with our source IP address. |
64 | 64 | ||
65 | What is the proper way to allocate, copy and free a buffer? | 65 | What is the proper way to allocate, copy and free a buffer? |
66 | After you allocate it, it is a "0 length" chunk of memory | 66 | After you allocate it, it is a "0 length" chunk of memory |
67 | starting at zero. If you want to add headers to the buffer | 67 | starting at zero. If you want to add headers to the buffer |
68 | later, you'll have to call "skb_reserve(skb, amount)" with | 68 | later, you'll have to call "skb_reserve(skb, amount)" with |
69 | the amount of memory you want reserved. Then, you call | 69 | the amount of memory you want reserved. Then, you call |
70 | "skb_put(skb, amount)" with the amount of space you want in | 70 | "skb_put(skb, amount)" with the amount of space you want in |
71 | the buffer. skb_put() returns a pointer to the top (#0) of | 71 | the buffer. skb_put() returns a pointer to the top (#0) of |
72 | that buffer. skb->len is set to the amount of space you have | 72 | that buffer. skb->len is set to the amount of space you have |
73 | "allocated" with skb_put(). You can then write up to skb->len | 73 | "allocated" with skb_put(). You can then write up to skb->len |
74 | bytes to that buffer. If you need more, you can call skb_put() | 74 | bytes to that buffer. If you need more, you can call skb_put() |
75 | again with the additional amount of space you need. You can | 75 | again with the additional amount of space you need. You can |
76 | find out how much more space you can allocate by calling | 76 | find out how much more space you can allocate by calling |
77 | "skb_tailroom(skb)". | 77 | "skb_tailroom(skb)". |
78 | Now, to add header space, call "skb_push(skb, header_len)". | 78 | Now, to add header space, call "skb_push(skb, header_len)". |
79 | This creates space at the beginning of the buffer and returns | 79 | This creates space at the beginning of the buffer and returns |
80 | a pointer to this new space. If later you need to strip a | 80 | a pointer to this new space. If later you need to strip a |
81 | header from a buffer, call "skb_pull(skb, header_len)". | 81 | header from a buffer, call "skb_pull(skb, header_len)". |
82 | skb_headroom() will return how much space is left at the top | 82 | skb_headroom() will return how much space is left at the top |
83 | of the buffer (before the main data). Remember, this headroom | 83 | of the buffer (before the main data). Remember, this headroom |
84 | space must be reserved before the skb_put() function is called. | 84 | space must be reserved before the skb_put() function is called. |
85 | */ | 85 | */ |
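The comment block above walks through the sk_buff calls used when building a packet. As a quick illustration of how those calls fit together, here is a minimal sketch; it is not part of ipip.c or this commit, and the function name, buffer sizes, and GFP flag are assumptions chosen only for the example.

/*
 * Illustrative sketch only (not from the kernel tree): the skb_reserve(),
 * skb_put() and skb_push() sequence described in the comment above.
 * header_len, payload_len and GFP_ATOMIC are example choices.
 */
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *example_build_skb(unsigned int header_len,
					 unsigned int payload_len)
{
	struct sk_buff *skb;
	unsigned char *data;

	/* Allocate room for the headers plus the payload. */
	skb = alloc_skb(header_len + payload_len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	/* Headroom must be reserved before any skb_put() call. */
	skb_reserve(skb, header_len);

	/* skb_put() opens payload_len bytes at the tail and returns a
	 * pointer to that area; skb->len grows by the same amount. */
	data = skb_put(skb, payload_len);
	memset(data, 0, payload_len);

	/* skb_push() later claims header space from the reserved headroom
	 * and returns a pointer to the new start of the buffer. */
	skb_push(skb, header_len);

	return skb;
}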
86 | 86 | ||
87 | /* | 87 | /* |
88 | This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c | 88 | This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c |
89 | 89 | ||
90 | For comments look at net/ipv4/ip_gre.c --ANK | 90 | For comments look at net/ipv4/ip_gre.c --ANK |
91 | */ | 91 | */ |
92 | 92 | ||
93 | 93 | ||
94 | #include <linux/capability.h> | 94 | #include <linux/capability.h> |
95 | #include <linux/module.h> | 95 | #include <linux/module.h> |
96 | #include <linux/types.h> | 96 | #include <linux/types.h> |
97 | #include <linux/kernel.h> | 97 | #include <linux/kernel.h> |
98 | #include <asm/uaccess.h> | 98 | #include <asm/uaccess.h> |
99 | #include <linux/skbuff.h> | 99 | #include <linux/skbuff.h> |
100 | #include <linux/netdevice.h> | 100 | #include <linux/netdevice.h> |
101 | #include <linux/in.h> | 101 | #include <linux/in.h> |
102 | #include <linux/tcp.h> | 102 | #include <linux/tcp.h> |
103 | #include <linux/udp.h> | 103 | #include <linux/udp.h> |
104 | #include <linux/if_arp.h> | 104 | #include <linux/if_arp.h> |
105 | #include <linux/mroute.h> | 105 | #include <linux/mroute.h> |
106 | #include <linux/init.h> | 106 | #include <linux/init.h> |
107 | #include <linux/netfilter_ipv4.h> | 107 | #include <linux/netfilter_ipv4.h> |
108 | #include <linux/if_ether.h> | 108 | #include <linux/if_ether.h> |
109 | 109 | ||
110 | #include <net/sock.h> | 110 | #include <net/sock.h> |
111 | #include <net/ip.h> | 111 | #include <net/ip.h> |
112 | #include <net/icmp.h> | 112 | #include <net/icmp.h> |
113 | #include <net/ipip.h> | 113 | #include <net/ipip.h> |
114 | #include <net/inet_ecn.h> | 114 | #include <net/inet_ecn.h> |
115 | #include <net/xfrm.h> | 115 | #include <net/xfrm.h> |
116 | #include <net/net_namespace.h> | 116 | #include <net/net_namespace.h> |
117 | #include <net/netns/generic.h> | 117 | #include <net/netns/generic.h> |
118 | 118 | ||
119 | #define HASH_SIZE 16 | 119 | #define HASH_SIZE 16 |
120 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) | 120 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) |
121 | 121 | ||
122 | static int ipip_net_id; | 122 | static int ipip_net_id; |
123 | struct ipip_net { | 123 | struct ipip_net { |
124 | struct ip_tunnel *tunnels_r_l[HASH_SIZE]; | 124 | struct ip_tunnel *tunnels_r_l[HASH_SIZE]; |
125 | struct ip_tunnel *tunnels_r[HASH_SIZE]; | 125 | struct ip_tunnel *tunnels_r[HASH_SIZE]; |
126 | struct ip_tunnel *tunnels_l[HASH_SIZE]; | 126 | struct ip_tunnel *tunnels_l[HASH_SIZE]; |
127 | struct ip_tunnel *tunnels_wc[1]; | 127 | struct ip_tunnel *tunnels_wc[1]; |
128 | struct ip_tunnel **tunnels[4]; | 128 | struct ip_tunnel **tunnels[4]; |
129 | 129 | ||
130 | struct net_device *fb_tunnel_dev; | 130 | struct net_device *fb_tunnel_dev; |
131 | }; | 131 | }; |
132 | 132 | ||
133 | static int ipip_fb_tunnel_init(struct net_device *dev); | 133 | static int ipip_fb_tunnel_init(struct net_device *dev); |
134 | static int ipip_tunnel_init(struct net_device *dev); | 134 | static int ipip_tunnel_init(struct net_device *dev); |
135 | static void ipip_tunnel_setup(struct net_device *dev); | 135 | static void ipip_tunnel_setup(struct net_device *dev); |
136 | 136 | ||
137 | static DEFINE_RWLOCK(ipip_lock); | 137 | static DEFINE_RWLOCK(ipip_lock); |
138 | 138 | ||
139 | static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, | 139 | static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, |
140 | __be32 remote, __be32 local) | 140 | __be32 remote, __be32 local) |
141 | { | 141 | { |
142 | unsigned h0 = HASH(remote); | 142 | unsigned h0 = HASH(remote); |
143 | unsigned h1 = HASH(local); | 143 | unsigned h1 = HASH(local); |
144 | struct ip_tunnel *t; | 144 | struct ip_tunnel *t; |
145 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 145 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
146 | 146 | ||
147 | for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { | 147 | for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { |
148 | if (local == t->parms.iph.saddr && | 148 | if (local == t->parms.iph.saddr && |
149 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | 149 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) |
150 | return t; | 150 | return t; |
151 | } | 151 | } |
152 | for (t = ipn->tunnels_r[h0]; t; t = t->next) { | 152 | for (t = ipn->tunnels_r[h0]; t; t = t->next) { |
153 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | 153 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) |
154 | return t; | 154 | return t; |
155 | } | 155 | } |
156 | for (t = ipn->tunnels_l[h1]; t; t = t->next) { | 156 | for (t = ipn->tunnels_l[h1]; t; t = t->next) { |
157 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | 157 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) |
158 | return t; | 158 | return t; |
159 | } | 159 | } |
160 | if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) | 160 | if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) |
161 | return t; | 161 | return t; |
162 | return NULL; | 162 | return NULL; |
163 | } | 163 | } |
164 | 164 | ||
165 | static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, | 165 | static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, |
166 | struct ip_tunnel_parm *parms) | 166 | struct ip_tunnel_parm *parms) |
167 | { | 167 | { |
168 | __be32 remote = parms->iph.daddr; | 168 | __be32 remote = parms->iph.daddr; |
169 | __be32 local = parms->iph.saddr; | 169 | __be32 local = parms->iph.saddr; |
170 | unsigned h = 0; | 170 | unsigned h = 0; |
171 | int prio = 0; | 171 | int prio = 0; |
172 | 172 | ||
173 | if (remote) { | 173 | if (remote) { |
174 | prio |= 2; | 174 | prio |= 2; |
175 | h ^= HASH(remote); | 175 | h ^= HASH(remote); |
176 | } | 176 | } |
177 | if (local) { | 177 | if (local) { |
178 | prio |= 1; | 178 | prio |= 1; |
179 | h ^= HASH(local); | 179 | h ^= HASH(local); |
180 | } | 180 | } |
181 | return &ipn->tunnels[prio][h]; | 181 | return &ipn->tunnels[prio][h]; |
182 | } | 182 | } |
183 | 183 | ||
184 | static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, | 184 | static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, |
185 | struct ip_tunnel *t) | 185 | struct ip_tunnel *t) |
186 | { | 186 | { |
187 | return __ipip_bucket(ipn, &t->parms); | 187 | return __ipip_bucket(ipn, &t->parms); |
188 | } | 188 | } |
189 | 189 | ||
190 | static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) | 190 | static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) |
191 | { | 191 | { |
192 | struct ip_tunnel **tp; | 192 | struct ip_tunnel **tp; |
193 | 193 | ||
194 | for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { | 194 | for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { |
195 | if (t == *tp) { | 195 | if (t == *tp) { |
196 | write_lock_bh(&ipip_lock); | 196 | write_lock_bh(&ipip_lock); |
197 | *tp = t->next; | 197 | *tp = t->next; |
198 | write_unlock_bh(&ipip_lock); | 198 | write_unlock_bh(&ipip_lock); |
199 | break; | 199 | break; |
200 | } | 200 | } |
201 | } | 201 | } |
202 | } | 202 | } |
203 | 203 | ||
204 | static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) | 204 | static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) |
205 | { | 205 | { |
206 | struct ip_tunnel **tp = ipip_bucket(ipn, t); | 206 | struct ip_tunnel **tp = ipip_bucket(ipn, t); |
207 | 207 | ||
208 | t->next = *tp; | 208 | t->next = *tp; |
209 | write_lock_bh(&ipip_lock); | 209 | write_lock_bh(&ipip_lock); |
210 | *tp = t; | 210 | *tp = t; |
211 | write_unlock_bh(&ipip_lock); | 211 | write_unlock_bh(&ipip_lock); |
212 | } | 212 | } |
213 | 213 | ||
214 | static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | 214 | static struct ip_tunnel * ipip_tunnel_locate(struct net *net, |
215 | struct ip_tunnel_parm *parms, int create) | 215 | struct ip_tunnel_parm *parms, int create) |
216 | { | 216 | { |
217 | __be32 remote = parms->iph.daddr; | 217 | __be32 remote = parms->iph.daddr; |
218 | __be32 local = parms->iph.saddr; | 218 | __be32 local = parms->iph.saddr; |
219 | struct ip_tunnel *t, **tp, *nt; | 219 | struct ip_tunnel *t, **tp, *nt; |
220 | struct net_device *dev; | 220 | struct net_device *dev; |
221 | char name[IFNAMSIZ]; | 221 | char name[IFNAMSIZ]; |
222 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 222 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
223 | 223 | ||
224 | for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { | 224 | for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { |
225 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | 225 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) |
226 | return t; | 226 | return t; |
227 | } | 227 | } |
228 | if (!create) | 228 | if (!create) |
229 | return NULL; | 229 | return NULL; |
230 | 230 | ||
231 | if (parms->name[0]) | 231 | if (parms->name[0]) |
232 | strlcpy(name, parms->name, IFNAMSIZ); | 232 | strlcpy(name, parms->name, IFNAMSIZ); |
233 | else | 233 | else |
234 | sprintf(name, "tunl%%d"); | 234 | sprintf(name, "tunl%%d"); |
235 | 235 | ||
236 | dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); | 236 | dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); |
237 | if (dev == NULL) | 237 | if (dev == NULL) |
238 | return NULL; | 238 | return NULL; |
239 | 239 | ||
240 | dev_net_set(dev, net); | 240 | dev_net_set(dev, net); |
241 | 241 | ||
242 | if (strchr(name, '%')) { | 242 | if (strchr(name, '%')) { |
243 | if (dev_alloc_name(dev, name) < 0) | 243 | if (dev_alloc_name(dev, name) < 0) |
244 | goto failed_free; | 244 | goto failed_free; |
245 | } | 245 | } |
246 | 246 | ||
247 | nt = netdev_priv(dev); | 247 | nt = netdev_priv(dev); |
248 | dev->init = ipip_tunnel_init; | 248 | dev->init = ipip_tunnel_init; |
249 | nt->parms = *parms; | 249 | nt->parms = *parms; |
250 | 250 | ||
251 | if (register_netdevice(dev) < 0) | 251 | if (register_netdevice(dev) < 0) |
252 | goto failed_free; | 252 | goto failed_free; |
253 | 253 | ||
254 | dev_hold(dev); | 254 | dev_hold(dev); |
255 | ipip_tunnel_link(ipn, nt); | 255 | ipip_tunnel_link(ipn, nt); |
256 | return nt; | 256 | return nt; |
257 | 257 | ||
258 | failed_free: | 258 | failed_free: |
259 | free_netdev(dev); | 259 | free_netdev(dev); |
260 | return NULL; | 260 | return NULL; |
261 | } | 261 | } |
262 | 262 | ||
263 | static void ipip_tunnel_uninit(struct net_device *dev) | 263 | static void ipip_tunnel_uninit(struct net_device *dev) |
264 | { | 264 | { |
265 | struct net *net = dev_net(dev); | 265 | struct net *net = dev_net(dev); |
266 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 266 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
267 | 267 | ||
268 | if (dev == ipn->fb_tunnel_dev) { | 268 | if (dev == ipn->fb_tunnel_dev) { |
269 | write_lock_bh(&ipip_lock); | 269 | write_lock_bh(&ipip_lock); |
270 | ipn->tunnels_wc[0] = NULL; | 270 | ipn->tunnels_wc[0] = NULL; |
271 | write_unlock_bh(&ipip_lock); | 271 | write_unlock_bh(&ipip_lock); |
272 | } else | 272 | } else |
273 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); | 273 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); |
274 | dev_put(dev); | 274 | dev_put(dev); |
275 | } | 275 | } |
276 | 276 | ||
277 | static int ipip_err(struct sk_buff *skb, u32 info) | 277 | static int ipip_err(struct sk_buff *skb, u32 info) |
278 | { | 278 | { |
279 | 279 | ||
280 | /* All the routers (except for Linux) return only | 280 | /* All the routers (except for Linux) return only |
281 | 8 bytes of packet payload. It means, that precise relaying of | 281 | 8 bytes of packet payload. It means, that precise relaying of |
282 | ICMP in the real Internet is absolutely infeasible. | 282 | ICMP in the real Internet is absolutely infeasible. |
283 | */ | 283 | */ |
284 | struct iphdr *iph = (struct iphdr*)skb->data; | 284 | struct iphdr *iph = (struct iphdr *)skb->data; |
285 | const int type = icmp_hdr(skb)->type; | 285 | const int type = icmp_hdr(skb)->type; |
286 | const int code = icmp_hdr(skb)->code; | 286 | const int code = icmp_hdr(skb)->code; |
287 | struct ip_tunnel *t; | 287 | struct ip_tunnel *t; |
288 | int err; | 288 | int err; |
289 | 289 | ||
290 | switch (type) { | 290 | switch (type) { |
291 | default: | 291 | default: |
292 | case ICMP_PARAMETERPROB: | 292 | case ICMP_PARAMETERPROB: |
293 | return 0; | 293 | return 0; |
294 | 294 | ||
295 | case ICMP_DEST_UNREACH: | 295 | case ICMP_DEST_UNREACH: |
296 | switch (code) { | 296 | switch (code) { |
297 | case ICMP_SR_FAILED: | 297 | case ICMP_SR_FAILED: |
298 | case ICMP_PORT_UNREACH: | 298 | case ICMP_PORT_UNREACH: |
299 | /* Impossible event. */ | 299 | /* Impossible event. */ |
300 | return 0; | 300 | return 0; |
301 | case ICMP_FRAG_NEEDED: | 301 | case ICMP_FRAG_NEEDED: |
302 | /* Soft state for pmtu is maintained by IP core. */ | 302 | /* Soft state for pmtu is maintained by IP core. */ |
303 | return 0; | 303 | return 0; |
304 | default: | 304 | default: |
305 | /* All others are translated to HOST_UNREACH. | 305 | /* All others are translated to HOST_UNREACH. |
306 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 306 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
307 | I believe they are just ether pollution. --ANK | 307 | I believe they are just ether pollution. --ANK |
308 | */ | 308 | */ |
309 | break; | 309 | break; |
310 | } | 310 | } |
311 | break; | 311 | break; |
312 | case ICMP_TIME_EXCEEDED: | 312 | case ICMP_TIME_EXCEEDED: |
313 | if (code != ICMP_EXC_TTL) | 313 | if (code != ICMP_EXC_TTL) |
314 | return 0; | 314 | return 0; |
315 | break; | 315 | break; |
316 | } | 316 | } |
317 | 317 | ||
318 | err = -ENOENT; | 318 | err = -ENOENT; |
319 | 319 | ||
320 | read_lock(&ipip_lock); | 320 | read_lock(&ipip_lock); |
321 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | 321 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); |
322 | if (t == NULL || t->parms.iph.daddr == 0) | 322 | if (t == NULL || t->parms.iph.daddr == 0) |
323 | goto out; | 323 | goto out; |
324 | 324 | ||
325 | err = 0; | 325 | err = 0; |
326 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) | 326 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) |
327 | goto out; | 327 | goto out; |
328 | 328 | ||
329 | if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) | 329 | if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) |
330 | t->err_count++; | 330 | t->err_count++; |
331 | else | 331 | else |
332 | t->err_count = 1; | 332 | t->err_count = 1; |
333 | t->err_time = jiffies; | 333 | t->err_time = jiffies; |
334 | out: | 334 | out: |
335 | read_unlock(&ipip_lock); | 335 | read_unlock(&ipip_lock); |
336 | return err; | 336 | return err; |
337 | } | 337 | } |
338 | 338 | ||
339 | static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, | 339 | static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, |
340 | struct sk_buff *skb) | 340 | struct sk_buff *skb) |
341 | { | 341 | { |
342 | struct iphdr *inner_iph = ip_hdr(skb); | 342 | struct iphdr *inner_iph = ip_hdr(skb); |
343 | 343 | ||
344 | if (INET_ECN_is_ce(outer_iph->tos)) | 344 | if (INET_ECN_is_ce(outer_iph->tos)) |
345 | IP_ECN_set_ce(inner_iph); | 345 | IP_ECN_set_ce(inner_iph); |
346 | } | 346 | } |
347 | 347 | ||
348 | static int ipip_rcv(struct sk_buff *skb) | 348 | static int ipip_rcv(struct sk_buff *skb) |
349 | { | 349 | { |
350 | struct ip_tunnel *tunnel; | 350 | struct ip_tunnel *tunnel; |
351 | const struct iphdr *iph = ip_hdr(skb); | 351 | const struct iphdr *iph = ip_hdr(skb); |
352 | 352 | ||
353 | read_lock(&ipip_lock); | 353 | read_lock(&ipip_lock); |
354 | if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), | 354 | if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), |
355 | iph->saddr, iph->daddr)) != NULL) { | 355 | iph->saddr, iph->daddr)) != NULL) { |
356 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { | 356 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
357 | read_unlock(&ipip_lock); | 357 | read_unlock(&ipip_lock); |
358 | kfree_skb(skb); | 358 | kfree_skb(skb); |
359 | return 0; | 359 | return 0; |
360 | } | 360 | } |
361 | 361 | ||
362 | secpath_reset(skb); | 362 | secpath_reset(skb); |
363 | 363 | ||
364 | skb->mac_header = skb->network_header; | 364 | skb->mac_header = skb->network_header; |
365 | skb_reset_network_header(skb); | 365 | skb_reset_network_header(skb); |
366 | skb->protocol = htons(ETH_P_IP); | 366 | skb->protocol = htons(ETH_P_IP); |
367 | skb->pkt_type = PACKET_HOST; | 367 | skb->pkt_type = PACKET_HOST; |
368 | 368 | ||
369 | tunnel->dev->stats.rx_packets++; | 369 | tunnel->dev->stats.rx_packets++; |
370 | tunnel->dev->stats.rx_bytes += skb->len; | 370 | tunnel->dev->stats.rx_bytes += skb->len; |
371 | skb->dev = tunnel->dev; | 371 | skb->dev = tunnel->dev; |
372 | dst_release(skb->dst); | 372 | dst_release(skb->dst); |
373 | skb->dst = NULL; | 373 | skb->dst = NULL; |
374 | nf_reset(skb); | 374 | nf_reset(skb); |
375 | ipip_ecn_decapsulate(iph, skb); | 375 | ipip_ecn_decapsulate(iph, skb); |
376 | netif_rx(skb); | 376 | netif_rx(skb); |
377 | read_unlock(&ipip_lock); | 377 | read_unlock(&ipip_lock); |
378 | return 0; | 378 | return 0; |
379 | } | 379 | } |
380 | read_unlock(&ipip_lock); | 380 | read_unlock(&ipip_lock); |
381 | 381 | ||
382 | return -1; | 382 | return -1; |
383 | } | 383 | } |
384 | 384 | ||
385 | /* | 385 | /* |
386 | * This function assumes it is being called from dev_queue_xmit() | 386 | * This function assumes it is being called from dev_queue_xmit() |
387 | * and that skb is filled properly by that function. | 387 | * and that skb is filled properly by that function. |
388 | */ | 388 | */ |
389 | 389 | ||
390 | static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 390 | static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
391 | { | 391 | { |
392 | struct ip_tunnel *tunnel = netdev_priv(dev); | 392 | struct ip_tunnel *tunnel = netdev_priv(dev); |
393 | struct net_device_stats *stats = &tunnel->dev->stats; | 393 | struct net_device_stats *stats = &tunnel->dev->stats; |
394 | struct iphdr *tiph = &tunnel->parms.iph; | 394 | struct iphdr *tiph = &tunnel->parms.iph; |
395 | u8 tos = tunnel->parms.iph.tos; | 395 | u8 tos = tunnel->parms.iph.tos; |
396 | __be16 df = tiph->frag_off; | 396 | __be16 df = tiph->frag_off; |
397 | struct rtable *rt; /* Route to the other host */ | 397 | struct rtable *rt; /* Route to the other host */ |
398 | struct net_device *tdev; /* Device to other host */ | 398 | struct net_device *tdev; /* Device to other host */ |
399 | struct iphdr *old_iph = ip_hdr(skb); | 399 | struct iphdr *old_iph = ip_hdr(skb); |
400 | struct iphdr *iph; /* Our new IP header */ | 400 | struct iphdr *iph; /* Our new IP header */ |
401 | unsigned int max_headroom; /* The extra header space needed */ | 401 | unsigned int max_headroom; /* The extra header space needed */ |
402 | __be32 dst = tiph->daddr; | 402 | __be32 dst = tiph->daddr; |
403 | int mtu; | 403 | int mtu; |
404 | 404 | ||
405 | if (tunnel->recursion++) { | 405 | if (tunnel->recursion++) { |
406 | stats->collisions++; | 406 | stats->collisions++; |
407 | goto tx_error; | 407 | goto tx_error; |
408 | } | 408 | } |
409 | 409 | ||
410 | if (skb->protocol != htons(ETH_P_IP)) | 410 | if (skb->protocol != htons(ETH_P_IP)) |
411 | goto tx_error; | 411 | goto tx_error; |
412 | 412 | ||
413 | if (tos&1) | 413 | if (tos&1) |
414 | tos = old_iph->tos; | 414 | tos = old_iph->tos; |
415 | 415 | ||
416 | if (!dst) { | 416 | if (!dst) { |
417 | /* NBMA tunnel */ | 417 | /* NBMA tunnel */ |
418 | if ((rt = skb->rtable) == NULL) { | 418 | if ((rt = skb->rtable) == NULL) { |
419 | stats->tx_fifo_errors++; | 419 | stats->tx_fifo_errors++; |
420 | goto tx_error; | 420 | goto tx_error; |
421 | } | 421 | } |
422 | if ((dst = rt->rt_gateway) == 0) | 422 | if ((dst = rt->rt_gateway) == 0) |
423 | goto tx_error_icmp; | 423 | goto tx_error_icmp; |
424 | } | 424 | } |
425 | 425 | ||
426 | { | 426 | { |
427 | struct flowi fl = { .oif = tunnel->parms.link, | 427 | struct flowi fl = { .oif = tunnel->parms.link, |
428 | .nl_u = { .ip4_u = | 428 | .nl_u = { .ip4_u = |
429 | { .daddr = dst, | 429 | { .daddr = dst, |
430 | .saddr = tiph->saddr, | 430 | .saddr = tiph->saddr, |
431 | .tos = RT_TOS(tos) } }, | 431 | .tos = RT_TOS(tos) } }, |
432 | .proto = IPPROTO_IPIP }; | 432 | .proto = IPPROTO_IPIP }; |
433 | if (ip_route_output_key(dev_net(dev), &rt, &fl)) { | 433 | if (ip_route_output_key(dev_net(dev), &rt, &fl)) { |
434 | stats->tx_carrier_errors++; | 434 | stats->tx_carrier_errors++; |
435 | goto tx_error_icmp; | 435 | goto tx_error_icmp; |
436 | } | 436 | } |
437 | } | 437 | } |
438 | tdev = rt->u.dst.dev; | 438 | tdev = rt->u.dst.dev; |
439 | 439 | ||
440 | if (tdev == dev) { | 440 | if (tdev == dev) { |
441 | ip_rt_put(rt); | 441 | ip_rt_put(rt); |
442 | stats->collisions++; | 442 | stats->collisions++; |
443 | goto tx_error; | 443 | goto tx_error; |
444 | } | 444 | } |
445 | 445 | ||
446 | if (tiph->frag_off) | 446 | if (tiph->frag_off) |
447 | mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); | 447 | mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); |
448 | else | 448 | else |
449 | mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; | 449 | mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; |
450 | 450 | ||
451 | if (mtu < 68) { | 451 | if (mtu < 68) { |
452 | stats->collisions++; | 452 | stats->collisions++; |
453 | ip_rt_put(rt); | 453 | ip_rt_put(rt); |
454 | goto tx_error; | 454 | goto tx_error; |
455 | } | 455 | } |
456 | if (skb->dst) | 456 | if (skb->dst) |
457 | skb->dst->ops->update_pmtu(skb->dst, mtu); | 457 | skb->dst->ops->update_pmtu(skb->dst, mtu); |
458 | 458 | ||
459 | df |= (old_iph->frag_off&htons(IP_DF)); | 459 | df |= (old_iph->frag_off&htons(IP_DF)); |
460 | 460 | ||
461 | if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { | 461 | if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { |
462 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | 462 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); |
463 | ip_rt_put(rt); | 463 | ip_rt_put(rt); |
464 | goto tx_error; | 464 | goto tx_error; |
465 | } | 465 | } |
466 | 466 | ||
467 | if (tunnel->err_count > 0) { | 467 | if (tunnel->err_count > 0) { |
468 | if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { | 468 | if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { |
469 | tunnel->err_count--; | 469 | tunnel->err_count--; |
470 | dst_link_failure(skb); | 470 | dst_link_failure(skb); |
471 | } else | 471 | } else |
472 | tunnel->err_count = 0; | 472 | tunnel->err_count = 0; |
473 | } | 473 | } |
474 | 474 | ||
475 | /* | 475 | /* |
476 | * Okay, now see if we can stuff it in the buffer as-is. | 476 | * Okay, now see if we can stuff it in the buffer as-is. |
477 | */ | 477 | */ |
478 | max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); | 478 | max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); |
479 | 479 | ||
480 | if (skb_headroom(skb) < max_headroom || skb_shared(skb) || | 480 | if (skb_headroom(skb) < max_headroom || skb_shared(skb) || |
481 | (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { | 481 | (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { |
482 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); | 482 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); |
483 | if (!new_skb) { | 483 | if (!new_skb) { |
484 | ip_rt_put(rt); | 484 | ip_rt_put(rt); |
485 | stats->tx_dropped++; | 485 | stats->tx_dropped++; |
486 | dev_kfree_skb(skb); | 486 | dev_kfree_skb(skb); |
487 | tunnel->recursion--; | 487 | tunnel->recursion--; |
488 | return 0; | 488 | return 0; |
489 | } | 489 | } |
490 | if (skb->sk) | 490 | if (skb->sk) |
491 | skb_set_owner_w(new_skb, skb->sk); | 491 | skb_set_owner_w(new_skb, skb->sk); |
492 | dev_kfree_skb(skb); | 492 | dev_kfree_skb(skb); |
493 | skb = new_skb; | 493 | skb = new_skb; |
494 | old_iph = ip_hdr(skb); | 494 | old_iph = ip_hdr(skb); |
495 | } | 495 | } |
496 | 496 | ||
497 | skb->transport_header = skb->network_header; | 497 | skb->transport_header = skb->network_header; |
498 | skb_push(skb, sizeof(struct iphdr)); | 498 | skb_push(skb, sizeof(struct iphdr)); |
499 | skb_reset_network_header(skb); | 499 | skb_reset_network_header(skb); |
500 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | 500 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); |
501 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | 501 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | |
502 | IPSKB_REROUTED); | 502 | IPSKB_REROUTED); |
503 | dst_release(skb->dst); | 503 | dst_release(skb->dst); |
504 | skb->dst = &rt->u.dst; | 504 | skb->dst = &rt->u.dst; |
505 | 505 | ||
506 | /* | 506 | /* |
507 | * Push down and install the IPIP header. | 507 | * Push down and install the IPIP header. |
508 | */ | 508 | */ |
509 | 509 | ||
510 | iph = ip_hdr(skb); | 510 | iph = ip_hdr(skb); |
511 | iph->version = 4; | 511 | iph->version = 4; |
512 | iph->ihl = sizeof(struct iphdr)>>2; | 512 | iph->ihl = sizeof(struct iphdr)>>2; |
513 | iph->frag_off = df; | 513 | iph->frag_off = df; |
514 | iph->protocol = IPPROTO_IPIP; | 514 | iph->protocol = IPPROTO_IPIP; |
515 | iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); | 515 | iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); |
516 | iph->daddr = rt->rt_dst; | 516 | iph->daddr = rt->rt_dst; |
517 | iph->saddr = rt->rt_src; | 517 | iph->saddr = rt->rt_src; |
518 | 518 | ||
519 | if ((iph->ttl = tiph->ttl) == 0) | 519 | if ((iph->ttl = tiph->ttl) == 0) |
520 | iph->ttl = old_iph->ttl; | 520 | iph->ttl = old_iph->ttl; |
521 | 521 | ||
522 | nf_reset(skb); | 522 | nf_reset(skb); |
523 | 523 | ||
524 | IPTUNNEL_XMIT(); | 524 | IPTUNNEL_XMIT(); |
525 | tunnel->recursion--; | 525 | tunnel->recursion--; |
526 | return 0; | 526 | return 0; |
527 | 527 | ||
528 | tx_error_icmp: | 528 | tx_error_icmp: |
529 | dst_link_failure(skb); | 529 | dst_link_failure(skb); |
530 | tx_error: | 530 | tx_error: |
531 | stats->tx_errors++; | 531 | stats->tx_errors++; |
532 | dev_kfree_skb(skb); | 532 | dev_kfree_skb(skb); |
533 | tunnel->recursion--; | 533 | tunnel->recursion--; |
534 | return 0; | 534 | return 0; |
535 | } | 535 | } |
536 | 536 | ||
537 | static void ipip_tunnel_bind_dev(struct net_device *dev) | 537 | static void ipip_tunnel_bind_dev(struct net_device *dev) |
538 | { | 538 | { |
539 | struct net_device *tdev = NULL; | 539 | struct net_device *tdev = NULL; |
540 | struct ip_tunnel *tunnel; | 540 | struct ip_tunnel *tunnel; |
541 | struct iphdr *iph; | 541 | struct iphdr *iph; |
542 | 542 | ||
543 | tunnel = netdev_priv(dev); | 543 | tunnel = netdev_priv(dev); |
544 | iph = &tunnel->parms.iph; | 544 | iph = &tunnel->parms.iph; |
545 | 545 | ||
546 | if (iph->daddr) { | 546 | if (iph->daddr) { |
547 | struct flowi fl = { .oif = tunnel->parms.link, | 547 | struct flowi fl = { .oif = tunnel->parms.link, |
548 | .nl_u = { .ip4_u = | 548 | .nl_u = { .ip4_u = |
549 | { .daddr = iph->daddr, | 549 | { .daddr = iph->daddr, |
550 | .saddr = iph->saddr, | 550 | .saddr = iph->saddr, |
551 | .tos = RT_TOS(iph->tos) } }, | 551 | .tos = RT_TOS(iph->tos) } }, |
552 | .proto = IPPROTO_IPIP }; | 552 | .proto = IPPROTO_IPIP }; |
553 | struct rtable *rt; | 553 | struct rtable *rt; |
554 | if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { | 554 | if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { |
555 | tdev = rt->u.dst.dev; | 555 | tdev = rt->u.dst.dev; |
556 | ip_rt_put(rt); | 556 | ip_rt_put(rt); |
557 | } | 557 | } |
558 | dev->flags |= IFF_POINTOPOINT; | 558 | dev->flags |= IFF_POINTOPOINT; |
559 | } | 559 | } |
560 | 560 | ||
561 | if (!tdev && tunnel->parms.link) | 561 | if (!tdev && tunnel->parms.link) |
562 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | 562 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); |
563 | 563 | ||
564 | if (tdev) { | 564 | if (tdev) { |
565 | dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); | 565 | dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); |
566 | dev->mtu = tdev->mtu - sizeof(struct iphdr); | 566 | dev->mtu = tdev->mtu - sizeof(struct iphdr); |
567 | } | 567 | } |
568 | dev->iflink = tunnel->parms.link; | 568 | dev->iflink = tunnel->parms.link; |
569 | } | 569 | } |
570 | 570 | ||
571 | static int | 571 | static int |
572 | ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | 572 | ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) |
573 | { | 573 | { |
574 | int err = 0; | 574 | int err = 0; |
575 | struct ip_tunnel_parm p; | 575 | struct ip_tunnel_parm p; |
576 | struct ip_tunnel *t; | 576 | struct ip_tunnel *t; |
577 | struct net *net = dev_net(dev); | 577 | struct net *net = dev_net(dev); |
578 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 578 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
579 | 579 | ||
580 | switch (cmd) { | 580 | switch (cmd) { |
581 | case SIOCGETTUNNEL: | 581 | case SIOCGETTUNNEL: |
582 | t = NULL; | 582 | t = NULL; |
583 | if (dev == ipn->fb_tunnel_dev) { | 583 | if (dev == ipn->fb_tunnel_dev) { |
584 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { | 584 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { |
585 | err = -EFAULT; | 585 | err = -EFAULT; |
586 | break; | 586 | break; |
587 | } | 587 | } |
588 | t = ipip_tunnel_locate(net, &p, 0); | 588 | t = ipip_tunnel_locate(net, &p, 0); |
589 | } | 589 | } |
590 | if (t == NULL) | 590 | if (t == NULL) |
591 | t = netdev_priv(dev); | 591 | t = netdev_priv(dev); |
592 | memcpy(&p, &t->parms, sizeof(p)); | 592 | memcpy(&p, &t->parms, sizeof(p)); |
593 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | 593 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) |
594 | err = -EFAULT; | 594 | err = -EFAULT; |
595 | break; | 595 | break; |
596 | 596 | ||
597 | case SIOCADDTUNNEL: | 597 | case SIOCADDTUNNEL: |
598 | case SIOCCHGTUNNEL: | 598 | case SIOCCHGTUNNEL: |
599 | err = -EPERM; | 599 | err = -EPERM; |
600 | if (!capable(CAP_NET_ADMIN)) | 600 | if (!capable(CAP_NET_ADMIN)) |
601 | goto done; | 601 | goto done; |
602 | 602 | ||
603 | err = -EFAULT; | 603 | err = -EFAULT; |
604 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | 604 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
605 | goto done; | 605 | goto done; |
606 | 606 | ||
607 | err = -EINVAL; | 607 | err = -EINVAL; |
608 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || | 608 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || |
609 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) | 609 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) |
610 | goto done; | 610 | goto done; |
611 | if (p.iph.ttl) | 611 | if (p.iph.ttl) |
612 | p.iph.frag_off |= htons(IP_DF); | 612 | p.iph.frag_off |= htons(IP_DF); |
613 | 613 | ||
614 | t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | 614 | t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); |
615 | 615 | ||
616 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | 616 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { |
617 | if (t != NULL) { | 617 | if (t != NULL) { |
618 | if (t->dev != dev) { | 618 | if (t->dev != dev) { |
619 | err = -EEXIST; | 619 | err = -EEXIST; |
620 | break; | 620 | break; |
621 | } | 621 | } |
622 | } else { | 622 | } else { |
623 | if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || | 623 | if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || |
624 | (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { | 624 | (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { |
625 | err = -EINVAL; | 625 | err = -EINVAL; |
626 | break; | 626 | break; |
627 | } | 627 | } |
628 | t = netdev_priv(dev); | 628 | t = netdev_priv(dev); |
629 | ipip_tunnel_unlink(ipn, t); | 629 | ipip_tunnel_unlink(ipn, t); |
630 | t->parms.iph.saddr = p.iph.saddr; | 630 | t->parms.iph.saddr = p.iph.saddr; |
631 | t->parms.iph.daddr = p.iph.daddr; | 631 | t->parms.iph.daddr = p.iph.daddr; |
632 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | 632 | memcpy(dev->dev_addr, &p.iph.saddr, 4); |
633 | memcpy(dev->broadcast, &p.iph.daddr, 4); | 633 | memcpy(dev->broadcast, &p.iph.daddr, 4); |
634 | ipip_tunnel_link(ipn, t); | 634 | ipip_tunnel_link(ipn, t); |
635 | netdev_state_change(dev); | 635 | netdev_state_change(dev); |
636 | } | 636 | } |
637 | } | 637 | } |
638 | 638 | ||
639 | if (t) { | 639 | if (t) { |
640 | err = 0; | 640 | err = 0; |
641 | if (cmd == SIOCCHGTUNNEL) { | 641 | if (cmd == SIOCCHGTUNNEL) { |
642 | t->parms.iph.ttl = p.iph.ttl; | 642 | t->parms.iph.ttl = p.iph.ttl; |
643 | t->parms.iph.tos = p.iph.tos; | 643 | t->parms.iph.tos = p.iph.tos; |
644 | t->parms.iph.frag_off = p.iph.frag_off; | 644 | t->parms.iph.frag_off = p.iph.frag_off; |
645 | if (t->parms.link != p.link) { | 645 | if (t->parms.link != p.link) { |
646 | t->parms.link = p.link; | 646 | t->parms.link = p.link; |
647 | ipip_tunnel_bind_dev(dev); | 647 | ipip_tunnel_bind_dev(dev); |
648 | netdev_state_change(dev); | 648 | netdev_state_change(dev); |
649 | } | 649 | } |
650 | } | 650 | } |
651 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) | 651 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) |
652 | err = -EFAULT; | 652 | err = -EFAULT; |
653 | } else | 653 | } else |
654 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | 654 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); |
655 | break; | 655 | break; |
656 | 656 | ||
657 | case SIOCDELTUNNEL: | 657 | case SIOCDELTUNNEL: |
658 | err = -EPERM; | 658 | err = -EPERM; |
659 | if (!capable(CAP_NET_ADMIN)) | 659 | if (!capable(CAP_NET_ADMIN)) |
660 | goto done; | 660 | goto done; |
661 | 661 | ||
662 | if (dev == ipn->fb_tunnel_dev) { | 662 | if (dev == ipn->fb_tunnel_dev) { |
663 | err = -EFAULT; | 663 | err = -EFAULT; |
664 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | 664 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
665 | goto done; | 665 | goto done; |
666 | err = -ENOENT; | 666 | err = -ENOENT; |
667 | if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) | 667 | if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) |
668 | goto done; | 668 | goto done; |
669 | err = -EPERM; | 669 | err = -EPERM; |
670 | if (t->dev == ipn->fb_tunnel_dev) | 670 | if (t->dev == ipn->fb_tunnel_dev) |
671 | goto done; | 671 | goto done; |
672 | dev = t->dev; | 672 | dev = t->dev; |
673 | } | 673 | } |
674 | unregister_netdevice(dev); | 674 | unregister_netdevice(dev); |
675 | err = 0; | 675 | err = 0; |
676 | break; | 676 | break; |
677 | 677 | ||
678 | default: | 678 | default: |
679 | err = -EINVAL; | 679 | err = -EINVAL; |
680 | } | 680 | } |
681 | 681 | ||
682 | done: | 682 | done: |
683 | return err; | 683 | return err; |
684 | } | 684 | } |
685 | 685 | ||
686 | static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) | 686 | static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) |
687 | { | 687 | { |
688 | if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) | 688 | if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) |
689 | return -EINVAL; | 689 | return -EINVAL; |
690 | dev->mtu = new_mtu; | 690 | dev->mtu = new_mtu; |
691 | return 0; | 691 | return 0; |
692 | } | 692 | } |
693 | 693 | ||
694 | static void ipip_tunnel_setup(struct net_device *dev) | 694 | static void ipip_tunnel_setup(struct net_device *dev) |
695 | { | 695 | { |
696 | dev->uninit = ipip_tunnel_uninit; | 696 | dev->uninit = ipip_tunnel_uninit; |
697 | dev->hard_start_xmit = ipip_tunnel_xmit; | 697 | dev->hard_start_xmit = ipip_tunnel_xmit; |
698 | dev->do_ioctl = ipip_tunnel_ioctl; | 698 | dev->do_ioctl = ipip_tunnel_ioctl; |
699 | dev->change_mtu = ipip_tunnel_change_mtu; | 699 | dev->change_mtu = ipip_tunnel_change_mtu; |
700 | dev->destructor = free_netdev; | 700 | dev->destructor = free_netdev; |
701 | 701 | ||
702 | dev->type = ARPHRD_TUNNEL; | 702 | dev->type = ARPHRD_TUNNEL; |
703 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | 703 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); |
704 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); | 704 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); |
705 | dev->flags = IFF_NOARP; | 705 | dev->flags = IFF_NOARP; |
706 | dev->iflink = 0; | 706 | dev->iflink = 0; |
707 | dev->addr_len = 4; | 707 | dev->addr_len = 4; |
708 | dev->features |= NETIF_F_NETNS_LOCAL; | 708 | dev->features |= NETIF_F_NETNS_LOCAL; |
709 | } | 709 | } |
710 | 710 | ||
711 | static int ipip_tunnel_init(struct net_device *dev) | 711 | static int ipip_tunnel_init(struct net_device *dev) |
712 | { | 712 | { |
713 | struct ip_tunnel *tunnel; | 713 | struct ip_tunnel *tunnel; |
714 | 714 | ||
715 | tunnel = netdev_priv(dev); | 715 | tunnel = netdev_priv(dev); |
716 | 716 | ||
717 | tunnel->dev = dev; | 717 | tunnel->dev = dev; |
718 | strcpy(tunnel->parms.name, dev->name); | 718 | strcpy(tunnel->parms.name, dev->name); |
719 | 719 | ||
720 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | 720 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); |
721 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | 721 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); |
722 | 722 | ||
723 | ipip_tunnel_bind_dev(dev); | 723 | ipip_tunnel_bind_dev(dev); |
724 | 724 | ||
725 | return 0; | 725 | return 0; |
726 | } | 726 | } |
727 | 727 | ||
728 | static int ipip_fb_tunnel_init(struct net_device *dev) | 728 | static int ipip_fb_tunnel_init(struct net_device *dev) |
729 | { | 729 | { |
730 | struct ip_tunnel *tunnel = netdev_priv(dev); | 730 | struct ip_tunnel *tunnel = netdev_priv(dev); |
731 | struct iphdr *iph = &tunnel->parms.iph; | 731 | struct iphdr *iph = &tunnel->parms.iph; |
732 | struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); | 732 | struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); |
733 | 733 | ||
734 | tunnel->dev = dev; | 734 | tunnel->dev = dev; |
735 | strcpy(tunnel->parms.name, dev->name); | 735 | strcpy(tunnel->parms.name, dev->name); |
736 | 736 | ||
737 | iph->version = 4; | 737 | iph->version = 4; |
738 | iph->protocol = IPPROTO_IPIP; | 738 | iph->protocol = IPPROTO_IPIP; |
739 | iph->ihl = 5; | 739 | iph->ihl = 5; |
740 | 740 | ||
741 | dev_hold(dev); | 741 | dev_hold(dev); |
742 | ipn->tunnels_wc[0] = tunnel; | 742 | ipn->tunnels_wc[0] = tunnel; |
743 | return 0; | 743 | return 0; |
744 | } | 744 | } |
745 | 745 | ||
746 | static struct xfrm_tunnel ipip_handler = { | 746 | static struct xfrm_tunnel ipip_handler = { |
747 | .handler = ipip_rcv, | 747 | .handler = ipip_rcv, |
748 | .err_handler = ipip_err, | 748 | .err_handler = ipip_err, |
749 | .priority = 1, | 749 | .priority = 1, |
750 | }; | 750 | }; |
751 | 751 | ||
752 | static char banner[] __initdata = | 752 | static char banner[] __initdata = |
753 | KERN_INFO "IPv4 over IPv4 tunneling driver\n"; | 753 | KERN_INFO "IPv4 over IPv4 tunneling driver\n"; |
754 | 754 | ||
755 | static void ipip_destroy_tunnels(struct ipip_net *ipn) | 755 | static void ipip_destroy_tunnels(struct ipip_net *ipn) |
756 | { | 756 | { |
757 | int prio; | 757 | int prio; |
758 | 758 | ||
759 | for (prio = 1; prio < 4; prio++) { | 759 | for (prio = 1; prio < 4; prio++) { |
760 | int h; | 760 | int h; |
761 | for (h = 0; h < HASH_SIZE; h++) { | 761 | for (h = 0; h < HASH_SIZE; h++) { |
762 | struct ip_tunnel *t; | 762 | struct ip_tunnel *t; |
763 | while ((t = ipn->tunnels[prio][h]) != NULL) | 763 | while ((t = ipn->tunnels[prio][h]) != NULL) |
764 | unregister_netdevice(t->dev); | 764 | unregister_netdevice(t->dev); |
765 | } | 765 | } |
766 | } | 766 | } |
767 | } | 767 | } |
768 | 768 | ||
769 | static int ipip_init_net(struct net *net) | 769 | static int ipip_init_net(struct net *net) |
770 | { | 770 | { |
771 | int err; | 771 | int err; |
772 | struct ipip_net *ipn; | 772 | struct ipip_net *ipn; |
773 | 773 | ||
774 | err = -ENOMEM; | 774 | err = -ENOMEM; |
775 | ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); | 775 | ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); |
776 | if (ipn == NULL) | 776 | if (ipn == NULL) |
777 | goto err_alloc; | 777 | goto err_alloc; |
778 | 778 | ||
779 | err = net_assign_generic(net, ipip_net_id, ipn); | 779 | err = net_assign_generic(net, ipip_net_id, ipn); |
780 | if (err < 0) | 780 | if (err < 0) |
781 | goto err_assign; | 781 | goto err_assign; |
782 | 782 | ||
783 | ipn->tunnels[0] = ipn->tunnels_wc; | 783 | ipn->tunnels[0] = ipn->tunnels_wc; |
784 | ipn->tunnels[1] = ipn->tunnels_l; | 784 | ipn->tunnels[1] = ipn->tunnels_l; |
785 | ipn->tunnels[2] = ipn->tunnels_r; | 785 | ipn->tunnels[2] = ipn->tunnels_r; |
786 | ipn->tunnels[3] = ipn->tunnels_r_l; | 786 | ipn->tunnels[3] = ipn->tunnels_r_l; |
787 | 787 | ||
788 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), | 788 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), |
789 | "tunl0", | 789 | "tunl0", |
790 | ipip_tunnel_setup); | 790 | ipip_tunnel_setup); |
791 | if (!ipn->fb_tunnel_dev) { | 791 | if (!ipn->fb_tunnel_dev) { |
792 | err = -ENOMEM; | 792 | err = -ENOMEM; |
793 | goto err_alloc_dev; | 793 | goto err_alloc_dev; |
794 | } | 794 | } |
795 | 795 | ||
796 | ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init; | 796 | ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init; |
797 | dev_net_set(ipn->fb_tunnel_dev, net); | 797 | dev_net_set(ipn->fb_tunnel_dev, net); |
798 | 798 | ||
799 | if ((err = register_netdev(ipn->fb_tunnel_dev))) | 799 | if ((err = register_netdev(ipn->fb_tunnel_dev))) |
800 | goto err_reg_dev; | 800 | goto err_reg_dev; |
801 | 801 | ||
802 | return 0; | 802 | return 0; |
803 | 803 | ||
804 | err_reg_dev: | 804 | err_reg_dev: |
805 | free_netdev(ipn->fb_tunnel_dev); | 805 | free_netdev(ipn->fb_tunnel_dev); |
806 | err_alloc_dev: | 806 | err_alloc_dev: |
807 | /* nothing */ | 807 | /* nothing */ |
808 | err_assign: | 808 | err_assign: |
809 | kfree(ipn); | 809 | kfree(ipn); |
810 | err_alloc: | 810 | err_alloc: |
811 | return err; | 811 | return err; |
812 | } | 812 | } |
813 | 813 | ||
814 | static void ipip_exit_net(struct net *net) | 814 | static void ipip_exit_net(struct net *net) |
815 | { | 815 | { |
816 | struct ipip_net *ipn; | 816 | struct ipip_net *ipn; |
817 | 817 | ||
818 | ipn = net_generic(net, ipip_net_id); | 818 | ipn = net_generic(net, ipip_net_id); |
819 | rtnl_lock(); | 819 | rtnl_lock(); |
820 | ipip_destroy_tunnels(ipn); | 820 | ipip_destroy_tunnels(ipn); |
821 | unregister_netdevice(ipn->fb_tunnel_dev); | 821 | unregister_netdevice(ipn->fb_tunnel_dev); |
822 | rtnl_unlock(); | 822 | rtnl_unlock(); |
823 | kfree(ipn); | 823 | kfree(ipn); |
824 | } | 824 | } |
825 | 825 | ||
826 | static struct pernet_operations ipip_net_ops = { | 826 | static struct pernet_operations ipip_net_ops = { |
827 | .init = ipip_init_net, | 827 | .init = ipip_init_net, |
828 | .exit = ipip_exit_net, | 828 | .exit = ipip_exit_net, |
829 | }; | 829 | }; |
830 | 830 | ||
831 | static int __init ipip_init(void) | 831 | static int __init ipip_init(void) |
832 | { | 832 | { |
833 | int err; | 833 | int err; |
834 | 834 | ||
835 | printk(banner); | 835 | printk(banner); |
836 | 836 | ||
837 | if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { | 837 | if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { |
838 | printk(KERN_INFO "ipip init: can't register tunnel\n"); | 838 | printk(KERN_INFO "ipip init: can't register tunnel\n"); |
839 | return -EAGAIN; | 839 | return -EAGAIN; |
840 | } | 840 | } |
841 | 841 | ||
842 | err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); | 842 | err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); |
843 | if (err) | 843 | if (err) |
844 | xfrm4_tunnel_deregister(&ipip_handler, AF_INET); | 844 | xfrm4_tunnel_deregister(&ipip_handler, AF_INET); |
845 | 845 | ||
846 | return err; | 846 | return err; |
847 | } | 847 | } |
848 | 848 | ||
849 | static void __exit ipip_fini(void) | 849 | static void __exit ipip_fini(void) |
850 | { | 850 | { |
851 | if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) | 851 | if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) |
852 | printk(KERN_INFO "ipip close: can't deregister tunnel\n"); | 852 | printk(KERN_INFO "ipip close: can't deregister tunnel\n"); |
853 | 853 | ||
854 | unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); | 854 | unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); |
855 | } | 855 | } |
856 | 856 | ||
857 | module_init(ipip_init); | 857 | module_init(ipip_init); |
858 | module_exit(ipip_fini); | 858 | module_exit(ipip_fini); |
859 | MODULE_LICENSE("GPL"); | 859 | MODULE_LICENSE("GPL"); |
860 | 860 |
net/ipv4/raw.c
1 | /* | 1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket | 3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. | 4 | * interface as the means of communication with the user level. |
5 | * | 5 | * |
6 | * RAW - implementation of IP "raw" sockets. | 6 | * RAW - implementation of IP "raw" sockets. |
7 | * | 7 | * |
8 | * Authors: Ross Biro | 8 | * Authors: Ross Biro |
9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
10 | * | 10 | * |
11 | * Fixes: | 11 | * Fixes: |
12 | * Alan Cox : verify_area() fixed up | 12 | * Alan Cox : verify_area() fixed up |
13 | * Alan Cox : ICMP error handling | 13 | * Alan Cox : ICMP error handling |
14 | * Alan Cox : EMSGSIZE if you send too big a packet | 14 | * Alan Cox : EMSGSIZE if you send too big a packet |
15 | * Alan Cox : Now uses generic datagrams and shared | 15 | * Alan Cox : Now uses generic datagrams and shared |
16 | * skbuff library. No more peek crashes, | 16 | * skbuff library. No more peek crashes, |
17 | * no more backlogs | 17 | * no more backlogs |
18 | * Alan Cox : Checks sk->broadcast. | 18 | * Alan Cox : Checks sk->broadcast. |
19 | * Alan Cox : Uses skb_free_datagram/skb_copy_datagram | 19 | * Alan Cox : Uses skb_free_datagram/skb_copy_datagram |
20 | * Alan Cox : Raw passes ip options too | 20 | * Alan Cox : Raw passes ip options too |
21 | * Alan Cox : Setsocketopt added | 21 | * Alan Cox : Setsocketopt added |
22 | * Alan Cox : Fixed error return for broadcasts | 22 | * Alan Cox : Fixed error return for broadcasts |
23 | * Alan Cox : Removed wake_up calls | 23 | * Alan Cox : Removed wake_up calls |
24 | * Alan Cox : Use ttl/tos | 24 | * Alan Cox : Use ttl/tos |
25 | * Alan Cox : Cleaned up old debugging | 25 | * Alan Cox : Cleaned up old debugging |
26 | * Alan Cox : Use new kernel side addresses | 26 | * Alan Cox : Use new kernel side addresses |
27 | * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. | 27 | * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. |
28 | * Alan Cox : BSD style RAW socket demultiplexing. | 28 | * Alan Cox : BSD style RAW socket demultiplexing. |
29 | * Alan Cox : Beginnings of mrouted support. | 29 | * Alan Cox : Beginnings of mrouted support. |
30 | * Alan Cox : Added IP_HDRINCL option. | 30 | * Alan Cox : Added IP_HDRINCL option. |
31 | * Alan Cox : Skip broadcast check if BSDism set. | 31 | * Alan Cox : Skip broadcast check if BSDism set. |
32 | * David S. Miller : New socket lookup architecture. | 32 | * David S. Miller : New socket lookup architecture. |
33 | * | 33 | * |
34 | * This program is free software; you can redistribute it and/or | 34 | * This program is free software; you can redistribute it and/or |
35 | * modify it under the terms of the GNU General Public License | 35 | * modify it under the terms of the GNU General Public License |
36 | * as published by the Free Software Foundation; either version | 36 | * as published by the Free Software Foundation; either version |
37 | * 2 of the License, or (at your option) any later version. | 37 | * 2 of the License, or (at your option) any later version. |
38 | */ | 38 | */ |
39 | 39 | ||
40 | #include <linux/types.h> | 40 | #include <linux/types.h> |
41 | #include <asm/atomic.h> | 41 | #include <asm/atomic.h> |
42 | #include <asm/byteorder.h> | 42 | #include <asm/byteorder.h> |
43 | #include <asm/current.h> | 43 | #include <asm/current.h> |
44 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
45 | #include <asm/ioctls.h> | 45 | #include <asm/ioctls.h> |
46 | #include <linux/stddef.h> | 46 | #include <linux/stddef.h> |
47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
48 | #include <linux/errno.h> | 48 | #include <linux/errno.h> |
49 | #include <linux/aio.h> | 49 | #include <linux/aio.h> |
50 | #include <linux/kernel.h> | 50 | #include <linux/kernel.h> |
51 | #include <linux/spinlock.h> | 51 | #include <linux/spinlock.h> |
52 | #include <linux/sockios.h> | 52 | #include <linux/sockios.h> |
53 | #include <linux/socket.h> | 53 | #include <linux/socket.h> |
54 | #include <linux/in.h> | 54 | #include <linux/in.h> |
55 | #include <linux/mroute.h> | 55 | #include <linux/mroute.h> |
56 | #include <linux/netdevice.h> | 56 | #include <linux/netdevice.h> |
57 | #include <linux/in_route.h> | 57 | #include <linux/in_route.h> |
58 | #include <linux/route.h> | 58 | #include <linux/route.h> |
59 | #include <linux/skbuff.h> | 59 | #include <linux/skbuff.h> |
60 | #include <net/net_namespace.h> | 60 | #include <net/net_namespace.h> |
61 | #include <net/dst.h> | 61 | #include <net/dst.h> |
62 | #include <net/sock.h> | 62 | #include <net/sock.h> |
63 | #include <linux/gfp.h> | 63 | #include <linux/gfp.h> |
64 | #include <linux/ip.h> | 64 | #include <linux/ip.h> |
65 | #include <linux/net.h> | 65 | #include <linux/net.h> |
66 | #include <net/ip.h> | 66 | #include <net/ip.h> |
67 | #include <net/icmp.h> | 67 | #include <net/icmp.h> |
68 | #include <net/udp.h> | 68 | #include <net/udp.h> |
69 | #include <net/raw.h> | 69 | #include <net/raw.h> |
70 | #include <net/snmp.h> | 70 | #include <net/snmp.h> |
71 | #include <net/tcp_states.h> | 71 | #include <net/tcp_states.h> |
72 | #include <net/inet_common.h> | 72 | #include <net/inet_common.h> |
73 | #include <net/checksum.h> | 73 | #include <net/checksum.h> |
74 | #include <net/xfrm.h> | 74 | #include <net/xfrm.h> |
75 | #include <linux/rtnetlink.h> | 75 | #include <linux/rtnetlink.h> |
76 | #include <linux/proc_fs.h> | 76 | #include <linux/proc_fs.h> |
77 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
78 | #include <linux/netfilter.h> | 78 | #include <linux/netfilter.h> |
79 | #include <linux/netfilter_ipv4.h> | 79 | #include <linux/netfilter_ipv4.h> |
80 | 80 | ||
81 | static struct raw_hashinfo raw_v4_hashinfo = { | 81 | static struct raw_hashinfo raw_v4_hashinfo = { |
82 | .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), | 82 | .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), |
83 | }; | 83 | }; |
84 | 84 | ||
85 | void raw_hash_sk(struct sock *sk) | 85 | void raw_hash_sk(struct sock *sk) |
86 | { | 86 | { |
87 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; | 87 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; |
88 | struct hlist_head *head; | 88 | struct hlist_head *head; |
89 | 89 | ||
90 | head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; | 90 | head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; |
91 | 91 | ||
92 | write_lock_bh(&h->lock); | 92 | write_lock_bh(&h->lock); |
93 | sk_add_node(sk, head); | 93 | sk_add_node(sk, head); |
94 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 94 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
95 | write_unlock_bh(&h->lock); | 95 | write_unlock_bh(&h->lock); |
96 | } | 96 | } |
97 | EXPORT_SYMBOL_GPL(raw_hash_sk); | 97 | EXPORT_SYMBOL_GPL(raw_hash_sk); |
98 | 98 | ||
99 | void raw_unhash_sk(struct sock *sk) | 99 | void raw_unhash_sk(struct sock *sk) |
100 | { | 100 | { |
101 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; | 101 | struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; |
102 | 102 | ||
103 | write_lock_bh(&h->lock); | 103 | write_lock_bh(&h->lock); |
104 | if (sk_del_node_init(sk)) | 104 | if (sk_del_node_init(sk)) |
105 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 105 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
106 | write_unlock_bh(&h->lock); | 106 | write_unlock_bh(&h->lock); |
107 | } | 107 | } |
108 | EXPORT_SYMBOL_GPL(raw_unhash_sk); | 108 | EXPORT_SYMBOL_GPL(raw_unhash_sk); |
109 | 109 | ||
110 | static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, | 110 | static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, |
111 | unsigned short num, __be32 raddr, __be32 laddr, int dif) | 111 | unsigned short num, __be32 raddr, __be32 laddr, int dif) |
112 | { | 112 | { |
113 | struct hlist_node *node; | 113 | struct hlist_node *node; |
114 | 114 | ||
115 | sk_for_each_from(sk, node) { | 115 | sk_for_each_from(sk, node) { |
116 | struct inet_sock *inet = inet_sk(sk); | 116 | struct inet_sock *inet = inet_sk(sk); |
117 | 117 | ||
118 | if (net_eq(sock_net(sk), net) && inet->num == num && | 118 | if (net_eq(sock_net(sk), net) && inet->num == num && |
119 | !(inet->daddr && inet->daddr != raddr) && | 119 | !(inet->daddr && inet->daddr != raddr) && |
120 | !(inet->rcv_saddr && inet->rcv_saddr != laddr) && | 120 | !(inet->rcv_saddr && inet->rcv_saddr != laddr) && |
121 | !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) | 121 | !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) |
122 | goto found; /* gotcha */ | 122 | goto found; /* gotcha */ |
123 | } | 123 | } |
124 | sk = NULL; | 124 | sk = NULL; |
125 | found: | 125 | found: |
126 | return sk; | 126 | return sk; |
127 | } | 127 | } |
128 | 128 | ||
129 | /* | 129 | /* |
130 | * 0 - deliver | 130 | * 0 - deliver |
131 | * 1 - block | 131 | * 1 - block |
132 | */ | 132 | */ |
133 | static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) | 133 | static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) |
134 | { | 134 | { |
135 | int type; | 135 | int type; |
136 | 136 | ||
137 | if (!pskb_may_pull(skb, sizeof(struct icmphdr))) | 137 | if (!pskb_may_pull(skb, sizeof(struct icmphdr))) |
138 | return 1; | 138 | return 1; |
139 | 139 | ||
140 | type = icmp_hdr(skb)->type; | 140 | type = icmp_hdr(skb)->type; |
141 | if (type < 32) { | 141 | if (type < 32) { |
142 | __u32 data = raw_sk(sk)->filter.data; | 142 | __u32 data = raw_sk(sk)->filter.data; |
143 | 143 | ||
144 | return ((1 << type) & data) != 0; | 144 | return ((1 << type) & data) != 0; |
145 | } | 145 | } |
146 | 146 | ||
147 | /* Do not block unknown ICMP types */ | 147 | /* Do not block unknown ICMP types */ |
148 | return 0; | 148 | return 0; |
149 | } | 149 | } |
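
The filter above treats raw_sk(sk)->filter.data as a bitmask indexed by ICMP type, with a set bit meaning "block". From userspace this is driven through the ICMP_FILTER option at SOL_RAW. A minimal sketch of the ping-style policy (assumes CAP_NET_RAW; error handling trimmed):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/icmp.h>      /* struct icmp_filter, ICMP_FILTER, ICMP_ECHOREPLY */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);   /* needs CAP_NET_RAW */
        struct icmp_filter filt;

        if (fd < 0) { perror("socket"); return 1; }

        /* A set bit blocks that ICMP type; this mask keeps only echo replies,
         * the classic ping(8) setup.  Types >= 32 are never blocked by the
         * kernel-side check above. */
        filt.data = ~(1U << ICMP_ECHOREPLY);
        if (setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt)) < 0)
            perror("ICMP_FILTER");
        return 0;
    }
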
150 | 150 | ||
151 | /* IP input processing comes here for RAW socket delivery. | 151 | /* IP input processing comes here for RAW socket delivery. |
152 | * Caller owns SKB, so we must make clones. | 152 | * Caller owns SKB, so we must make clones. |
153 | * | 153 | * |
154 | * RFC 1122: SHOULD pass TOS value up to the transport layer. | 154 | * RFC 1122: SHOULD pass TOS value up to the transport layer. |
155 | * -> It does. And not only TOS, but all IP header. | 155 | * -> It does. And not only TOS, but all IP header. |
156 | */ | 156 | */ |
157 | static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | 157 | static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) |
158 | { | 158 | { |
159 | struct sock *sk; | 159 | struct sock *sk; |
160 | struct hlist_head *head; | 160 | struct hlist_head *head; |
161 | int delivered = 0; | 161 | int delivered = 0; |
162 | struct net *net; | 162 | struct net *net; |
163 | 163 | ||
164 | read_lock(&raw_v4_hashinfo.lock); | 164 | read_lock(&raw_v4_hashinfo.lock); |
165 | head = &raw_v4_hashinfo.ht[hash]; | 165 | head = &raw_v4_hashinfo.ht[hash]; |
166 | if (hlist_empty(head)) | 166 | if (hlist_empty(head)) |
167 | goto out; | 167 | goto out; |
168 | 168 | ||
169 | net = dev_net(skb->dev); | 169 | net = dev_net(skb->dev); |
170 | sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, | 170 | sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, |
171 | iph->saddr, iph->daddr, | 171 | iph->saddr, iph->daddr, |
172 | skb->dev->ifindex); | 172 | skb->dev->ifindex); |
173 | 173 | ||
174 | while (sk) { | 174 | while (sk) { |
175 | delivered = 1; | 175 | delivered = 1; |
176 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { | 176 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { |
177 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); | 177 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); |
178 | 178 | ||
179 | /* Not releasing hash table! */ | 179 | /* Not releasing hash table! */ |
180 | if (clone) | 180 | if (clone) |
181 | raw_rcv(sk, clone); | 181 | raw_rcv(sk, clone); |
182 | } | 182 | } |
183 | sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, | 183 | sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, |
184 | iph->saddr, iph->daddr, | 184 | iph->saddr, iph->daddr, |
185 | skb->dev->ifindex); | 185 | skb->dev->ifindex); |
186 | } | 186 | } |
187 | out: | 187 | out: |
188 | read_unlock(&raw_v4_hashinfo.lock); | 188 | read_unlock(&raw_v4_hashinfo.lock); |
189 | return delivered; | 189 | return delivered; |
190 | } | 190 | } |
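
As the comment above says, raw IPv4 delivery hands the complete IP header up to every matching socket, each of which receives its own clone of the skb. A hedged userspace sketch of reading one such datagram (assumes CAP_NET_RAW; run ping elsewhere to generate ICMP traffic):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip.h>      /* struct iphdr */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        unsigned char buf[1500];
        ssize_t n;

        if (fd < 0) return 1;
        n = recv(fd, buf, sizeof(buf), 0);   /* blocks until an ICMP datagram arrives */
        if (n >= (ssize_t)sizeof(struct iphdr)) {
            /* IPv4 raw sockets include the IP header, so TOS and friends are visible */
            struct iphdr *iph = (struct iphdr *)buf;
            printf("proto %u, tos 0x%02x, %zd bytes total\n",
                   (unsigned)iph->protocol, (unsigned)iph->tos, n);
        }
        return 0;
    }
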
191 | 191 | ||
192 | int raw_local_deliver(struct sk_buff *skb, int protocol) | 192 | int raw_local_deliver(struct sk_buff *skb, int protocol) |
193 | { | 193 | { |
194 | int hash; | 194 | int hash; |
195 | struct sock *raw_sk; | 195 | struct sock *raw_sk; |
196 | 196 | ||
197 | hash = protocol & (RAW_HTABLE_SIZE - 1); | 197 | hash = protocol & (RAW_HTABLE_SIZE - 1); |
198 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); | 198 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); |
199 | 199 | ||
200 | /* If there may be a raw socket we must check - if not we | 200 | /* If there may be a raw socket we must check - if not we |
201 | * couldn't care less | 201 | * couldn't care less |
202 | */ | 202 | */ |
203 | if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) | 203 | if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) |
204 | raw_sk = NULL; | 204 | raw_sk = NULL; |
205 | 205 | ||
206 | return raw_sk != NULL; | 206 | return raw_sk != NULL; |
207 | 207 | ||
208 | } | 208 | } |
209 | 209 | ||
210 | static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) | 210 | static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) |
211 | { | 211 | { |
212 | struct inet_sock *inet = inet_sk(sk); | 212 | struct inet_sock *inet = inet_sk(sk); |
213 | const int type = icmp_hdr(skb)->type; | 213 | const int type = icmp_hdr(skb)->type; |
214 | const int code = icmp_hdr(skb)->code; | 214 | const int code = icmp_hdr(skb)->code; |
215 | int err = 0; | 215 | int err = 0; |
216 | int harderr = 0; | 216 | int harderr = 0; |
217 | 217 | ||
218 | /* Report error on raw socket, if: | 218 | /* Report error on raw socket, if: |
219 | 1. User requested ip_recverr. | 219 | 1. User requested ip_recverr. |
220 | 2. Socket is connected (otherwise the error indication | 220 | 2. Socket is connected (otherwise the error indication |
221 | is useless without ip_recverr and the error is hard). | 221 | is useless without ip_recverr and the error is hard). |
222 | */ | 222 | */ |
223 | if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) | 223 | if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) |
224 | return; | 224 | return; |
225 | 225 | ||
226 | switch (type) { | 226 | switch (type) { |
227 | default: | 227 | default: |
228 | case ICMP_TIME_EXCEEDED: | 228 | case ICMP_TIME_EXCEEDED: |
229 | err = EHOSTUNREACH; | 229 | err = EHOSTUNREACH; |
230 | break; | 230 | break; |
231 | case ICMP_SOURCE_QUENCH: | 231 | case ICMP_SOURCE_QUENCH: |
232 | return; | 232 | return; |
233 | case ICMP_PARAMETERPROB: | 233 | case ICMP_PARAMETERPROB: |
234 | err = EPROTO; | 234 | err = EPROTO; |
235 | harderr = 1; | 235 | harderr = 1; |
236 | break; | 236 | break; |
237 | case ICMP_DEST_UNREACH: | 237 | case ICMP_DEST_UNREACH: |
238 | err = EHOSTUNREACH; | 238 | err = EHOSTUNREACH; |
239 | if (code > NR_ICMP_UNREACH) | 239 | if (code > NR_ICMP_UNREACH) |
240 | break; | 240 | break; |
241 | err = icmp_err_convert[code].errno; | 241 | err = icmp_err_convert[code].errno; |
242 | harderr = icmp_err_convert[code].fatal; | 242 | harderr = icmp_err_convert[code].fatal; |
243 | if (code == ICMP_FRAG_NEEDED) { | 243 | if (code == ICMP_FRAG_NEEDED) { |
244 | harderr = inet->pmtudisc != IP_PMTUDISC_DONT; | 244 | harderr = inet->pmtudisc != IP_PMTUDISC_DONT; |
245 | err = EMSGSIZE; | 245 | err = EMSGSIZE; |
246 | } | 246 | } |
247 | } | 247 | } |
248 | 248 | ||
249 | if (inet->recverr) { | 249 | if (inet->recverr) { |
250 | struct iphdr *iph = (struct iphdr*)skb->data; | 250 | struct iphdr *iph = (struct iphdr *)skb->data; |
251 | u8 *payload = skb->data + (iph->ihl << 2); | 251 | u8 *payload = skb->data + (iph->ihl << 2); |
252 | 252 | ||
253 | if (inet->hdrincl) | 253 | if (inet->hdrincl) |
254 | payload = skb->data; | 254 | payload = skb->data; |
255 | ip_icmp_error(sk, skb, err, 0, info, payload); | 255 | ip_icmp_error(sk, skb, err, 0, info, payload); |
256 | } | 256 | } |
257 | 257 | ||
258 | if (inet->recverr || harderr) { | 258 | if (inet->recverr || harderr) { |
259 | sk->sk_err = err; | 259 | sk->sk_err = err; |
260 | sk->sk_error_report(sk); | 260 | sk->sk_error_report(sk); |
261 | } | 261 | } |
262 | } | 262 | } |
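
raw_err() only reports anything when ip_recverr is set or the socket is connected; with IP_RECVERR enabled, the queued ICMP errors are read back from the error queue with MSG_ERRQUEUE. A trimmed userspace sketch, assuming the socket already attempted a send that drew an ICMP error:

    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>
    #include <linux/errqueue.h>  /* struct sock_extended_err */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        int on = 1;
        char data[512], ctrl[512];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                              .msg_control = ctrl, .msg_controllen = sizeof(ctrl) };

        if (fd < 0) return 1;
        setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));

        /* After a send that drew an ICMP error, drain the error queue. */
        if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
            struct cmsghdr *cm;
            for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVERR) {
                    struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
                    printf("icmp err: errno=%u type=%u code=%u\n",
                           ee->ee_errno, (unsigned)ee->ee_type, (unsigned)ee->ee_code);
                }
            }
        }
        return 0;
    }
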
263 | 263 | ||
264 | void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) | 264 | void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) |
265 | { | 265 | { |
266 | int hash; | 266 | int hash; |
267 | struct sock *raw_sk; | 267 | struct sock *raw_sk; |
268 | struct iphdr *iph; | 268 | struct iphdr *iph; |
269 | struct net *net; | 269 | struct net *net; |
270 | 270 | ||
271 | hash = protocol & (RAW_HTABLE_SIZE - 1); | 271 | hash = protocol & (RAW_HTABLE_SIZE - 1); |
272 | 272 | ||
273 | read_lock(&raw_v4_hashinfo.lock); | 273 | read_lock(&raw_v4_hashinfo.lock); |
274 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); | 274 | raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); |
275 | if (raw_sk != NULL) { | 275 | if (raw_sk != NULL) { |
276 | iph = (struct iphdr *)skb->data; | 276 | iph = (struct iphdr *)skb->data; |
277 | net = dev_net(skb->dev); | 277 | net = dev_net(skb->dev); |
278 | 278 | ||
279 | while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, | 279 | while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, |
280 | iph->daddr, iph->saddr, | 280 | iph->daddr, iph->saddr, |
281 | skb->dev->ifindex)) != NULL) { | 281 | skb->dev->ifindex)) != NULL) { |
282 | raw_err(raw_sk, skb, info); | 282 | raw_err(raw_sk, skb, info); |
283 | raw_sk = sk_next(raw_sk); | 283 | raw_sk = sk_next(raw_sk); |
284 | iph = (struct iphdr *)skb->data; | 284 | iph = (struct iphdr *)skb->data; |
285 | } | 285 | } |
286 | } | 286 | } |
287 | read_unlock(&raw_v4_hashinfo.lock); | 287 | read_unlock(&raw_v4_hashinfo.lock); |
288 | } | 288 | } |
289 | 289 | ||
290 | static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) | 290 | static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) |
291 | { | 291 | { |
292 | /* Charge it to the socket. */ | 292 | /* Charge it to the socket. */ |
293 | 293 | ||
294 | if (sock_queue_rcv_skb(sk, skb) < 0) { | 294 | if (sock_queue_rcv_skb(sk, skb) < 0) { |
295 | atomic_inc(&sk->sk_drops); | 295 | atomic_inc(&sk->sk_drops); |
296 | kfree_skb(skb); | 296 | kfree_skb(skb); |
297 | return NET_RX_DROP; | 297 | return NET_RX_DROP; |
298 | } | 298 | } |
299 | 299 | ||
300 | return NET_RX_SUCCESS; | 300 | return NET_RX_SUCCESS; |
301 | } | 301 | } |
302 | 302 | ||
303 | int raw_rcv(struct sock *sk, struct sk_buff *skb) | 303 | int raw_rcv(struct sock *sk, struct sk_buff *skb) |
304 | { | 304 | { |
305 | if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { | 305 | if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { |
306 | atomic_inc(&sk->sk_drops); | 306 | atomic_inc(&sk->sk_drops); |
307 | kfree_skb(skb); | 307 | kfree_skb(skb); |
308 | return NET_RX_DROP; | 308 | return NET_RX_DROP; |
309 | } | 309 | } |
310 | nf_reset(skb); | 310 | nf_reset(skb); |
311 | 311 | ||
312 | skb_push(skb, skb->data - skb_network_header(skb)); | 312 | skb_push(skb, skb->data - skb_network_header(skb)); |
313 | 313 | ||
314 | raw_rcv_skb(sk, skb); | 314 | raw_rcv_skb(sk, skb); |
315 | return 0; | 315 | return 0; |
316 | } | 316 | } |
317 | 317 | ||
318 | static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, | 318 | static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, |
319 | struct rtable *rt, | 319 | struct rtable *rt, |
320 | unsigned int flags) | 320 | unsigned int flags) |
321 | { | 321 | { |
322 | struct inet_sock *inet = inet_sk(sk); | 322 | struct inet_sock *inet = inet_sk(sk); |
323 | struct net *net = sock_net(sk); | 323 | struct net *net = sock_net(sk); |
324 | struct iphdr *iph; | 324 | struct iphdr *iph; |
325 | struct sk_buff *skb; | 325 | struct sk_buff *skb; |
326 | unsigned int iphlen; | 326 | unsigned int iphlen; |
327 | int err; | 327 | int err; |
328 | 328 | ||
329 | if (length > rt->u.dst.dev->mtu) { | 329 | if (length > rt->u.dst.dev->mtu) { |
330 | ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, | 330 | ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, |
331 | rt->u.dst.dev->mtu); | 331 | rt->u.dst.dev->mtu); |
332 | return -EMSGSIZE; | 332 | return -EMSGSIZE; |
333 | } | 333 | } |
334 | if (flags&MSG_PROBE) | 334 | if (flags&MSG_PROBE) |
335 | goto out; | 335 | goto out; |
336 | 336 | ||
337 | skb = sock_alloc_send_skb(sk, | 337 | skb = sock_alloc_send_skb(sk, |
338 | length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, | 338 | length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, |
339 | flags & MSG_DONTWAIT, &err); | 339 | flags & MSG_DONTWAIT, &err); |
340 | if (skb == NULL) | 340 | if (skb == NULL) |
341 | goto error; | 341 | goto error; |
342 | skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); | 342 | skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); |
343 | 343 | ||
344 | skb->priority = sk->sk_priority; | 344 | skb->priority = sk->sk_priority; |
345 | skb->mark = sk->sk_mark; | 345 | skb->mark = sk->sk_mark; |
346 | skb->dst = dst_clone(&rt->u.dst); | 346 | skb->dst = dst_clone(&rt->u.dst); |
347 | 347 | ||
348 | skb_reset_network_header(skb); | 348 | skb_reset_network_header(skb); |
349 | iph = ip_hdr(skb); | 349 | iph = ip_hdr(skb); |
350 | skb_put(skb, length); | 350 | skb_put(skb, length); |
351 | 351 | ||
352 | skb->ip_summed = CHECKSUM_NONE; | 352 | skb->ip_summed = CHECKSUM_NONE; |
353 | 353 | ||
354 | skb->transport_header = skb->network_header; | 354 | skb->transport_header = skb->network_header; |
355 | err = memcpy_fromiovecend((void *)iph, from, 0, length); | 355 | err = memcpy_fromiovecend((void *)iph, from, 0, length); |
356 | if (err) | 356 | if (err) |
357 | goto error_fault; | 357 | goto error_fault; |
358 | 358 | ||
359 | /* We don't modify invalid header */ | 359 | /* We don't modify invalid header */ |
360 | iphlen = iph->ihl * 4; | 360 | iphlen = iph->ihl * 4; |
361 | if (iphlen >= sizeof(*iph) && iphlen <= length) { | 361 | if (iphlen >= sizeof(*iph) && iphlen <= length) { |
362 | if (!iph->saddr) | 362 | if (!iph->saddr) |
363 | iph->saddr = rt->rt_src; | 363 | iph->saddr = rt->rt_src; |
364 | iph->check = 0; | 364 | iph->check = 0; |
365 | iph->tot_len = htons(length); | 365 | iph->tot_len = htons(length); |
366 | if (!iph->id) | 366 | if (!iph->id) |
367 | ip_select_ident(iph, &rt->u.dst, NULL); | 367 | ip_select_ident(iph, &rt->u.dst, NULL); |
368 | 368 | ||
369 | iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); | 369 | iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); |
370 | } | 370 | } |
371 | if (iph->protocol == IPPROTO_ICMP) | 371 | if (iph->protocol == IPPROTO_ICMP) |
372 | icmp_out_count(net, ((struct icmphdr *) | 372 | icmp_out_count(net, ((struct icmphdr *) |
373 | skb_transport_header(skb))->type); | 373 | skb_transport_header(skb))->type); |
374 | 374 | ||
375 | err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, | 375 | err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, |
376 | dst_output); | 376 | dst_output); |
377 | if (err > 0) | 377 | if (err > 0) |
378 | err = inet->recverr ? net_xmit_errno(err) : 0; | 378 | err = inet->recverr ? net_xmit_errno(err) : 0; |
379 | if (err) | 379 | if (err) |
380 | goto error; | 380 | goto error; |
381 | out: | 381 | out: |
382 | return 0; | 382 | return 0; |
383 | 383 | ||
384 | error_fault: | 384 | error_fault: |
385 | err = -EFAULT; | 385 | err = -EFAULT; |
386 | kfree_skb(skb); | 386 | kfree_skb(skb); |
387 | error: | 387 | error: |
388 | IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); | 388 | IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); |
389 | return err; | 389 | return err; |
390 | } | 390 | } |
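
raw_send_hdrinc() trusts the caller's header but, as the code shows, fills in saddr and the IP ID when they are left zero and recomputes tot_len and the header checksum. A rough userspace counterpart (an IPPROTO_RAW socket, which implies header-include mode; 192.0.2.1 is only a placeholder destination):

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip.h>       /* struct iphdr */
    #include <netinet/ip_icmp.h>  /* struct icmphdr, ICMP_ECHO */
    #include <arpa/inet.h>

    /* RFC 1071 one's-complement checksum, needed for the ICMP part only:
     * the IP header checksum is recomputed by raw_send_hdrinc(). */
    static unsigned short csum16(const void *buf, size_t len)
    {
        const unsigned char *p = buf;
        unsigned long sum = 0;
        for (; len > 1; len -= 2, p += 2)
            sum += (p[0] << 8) | p[1];
        if (len)
            sum += p[0] << 8;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return htons((unsigned short)~sum);
    }

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);  /* IPPROTO_RAW implies IP_HDRINCL */
        unsigned char pkt[sizeof(struct iphdr) + sizeof(struct icmphdr)];
        struct iphdr *iph = (struct iphdr *)pkt;
        struct icmphdr *icmp = (struct icmphdr *)(pkt + sizeof(*iph));
        struct sockaddr_in dst;

        if (fd < 0) { perror("socket"); return 1; }
        memset(pkt, 0, sizeof(pkt));
        iph->version  = 4;
        iph->ihl      = 5;
        iph->ttl      = 64;
        iph->protocol = IPPROTO_ICMP;
        iph->daddr    = inet_addr("192.0.2.1");  /* placeholder (TEST-NET-1); substitute a real target */
        /* saddr, id, tot_len and check are left 0: raw_send_hdrinc() fills or recomputes them */

        icmp->type = ICMP_ECHO;
        icmp->checksum = csum16(icmp, sizeof(*icmp));

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_addr.s_addr = iph->daddr;
        sendto(fd, pkt, sizeof(pkt), 0, (struct sockaddr *)&dst, sizeof(dst));
        return 0;
    }
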
391 | 391 | ||
392 | static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) | 392 | static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) |
393 | { | 393 | { |
394 | struct iovec *iov; | 394 | struct iovec *iov; |
395 | u8 __user *type = NULL; | 395 | u8 __user *type = NULL; |
396 | u8 __user *code = NULL; | 396 | u8 __user *code = NULL; |
397 | int probed = 0; | 397 | int probed = 0; |
398 | unsigned int i; | 398 | unsigned int i; |
399 | 399 | ||
400 | if (!msg->msg_iov) | 400 | if (!msg->msg_iov) |
401 | return 0; | 401 | return 0; |
402 | 402 | ||
403 | for (i = 0; i < msg->msg_iovlen; i++) { | 403 | for (i = 0; i < msg->msg_iovlen; i++) { |
404 | iov = &msg->msg_iov[i]; | 404 | iov = &msg->msg_iov[i]; |
405 | if (!iov) | 405 | if (!iov) |
406 | continue; | 406 | continue; |
407 | 407 | ||
408 | switch (fl->proto) { | 408 | switch (fl->proto) { |
409 | case IPPROTO_ICMP: | 409 | case IPPROTO_ICMP: |
410 | /* check if one-byte field is readable or not. */ | 410 | /* check if one-byte field is readable or not. */ |
411 | if (iov->iov_base && iov->iov_len < 1) | 411 | if (iov->iov_base && iov->iov_len < 1) |
412 | break; | 412 | break; |
413 | 413 | ||
414 | if (!type) { | 414 | if (!type) { |
415 | type = iov->iov_base; | 415 | type = iov->iov_base; |
416 | /* check if code field is readable or not. */ | 416 | /* check if code field is readable or not. */ |
417 | if (iov->iov_len > 1) | 417 | if (iov->iov_len > 1) |
418 | code = type + 1; | 418 | code = type + 1; |
419 | } else if (!code) | 419 | } else if (!code) |
420 | code = iov->iov_base; | 420 | code = iov->iov_base; |
421 | 421 | ||
422 | if (type && code) { | 422 | if (type && code) { |
423 | if (get_user(fl->fl_icmp_type, type) || | 423 | if (get_user(fl->fl_icmp_type, type) || |
424 | get_user(fl->fl_icmp_code, code)) | 424 | get_user(fl->fl_icmp_code, code)) |
425 | return -EFAULT; | 425 | return -EFAULT; |
426 | probed = 1; | 426 | probed = 1; |
427 | } | 427 | } |
428 | break; | 428 | break; |
429 | default: | 429 | default: |
430 | probed = 1; | 430 | probed = 1; |
431 | break; | 431 | break; |
432 | } | 432 | } |
433 | if (probed) | 433 | if (probed) |
434 | break; | 434 | break; |
435 | } | 435 | } |
436 | return 0; | 436 | return 0; |
437 | } | 437 | } |
438 | 438 | ||
439 | static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 439 | static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
440 | size_t len) | 440 | size_t len) |
441 | { | 441 | { |
442 | struct inet_sock *inet = inet_sk(sk); | 442 | struct inet_sock *inet = inet_sk(sk); |
443 | struct ipcm_cookie ipc; | 443 | struct ipcm_cookie ipc; |
444 | struct rtable *rt = NULL; | 444 | struct rtable *rt = NULL; |
445 | int free = 0; | 445 | int free = 0; |
446 | __be32 daddr; | 446 | __be32 daddr; |
447 | __be32 saddr; | 447 | __be32 saddr; |
448 | u8 tos; | 448 | u8 tos; |
449 | int err; | 449 | int err; |
450 | 450 | ||
451 | err = -EMSGSIZE; | 451 | err = -EMSGSIZE; |
452 | if (len > 0xFFFF) | 452 | if (len > 0xFFFF) |
453 | goto out; | 453 | goto out; |
454 | 454 | ||
455 | /* | 455 | /* |
456 | * Check the flags. | 456 | * Check the flags. |
457 | */ | 457 | */ |
458 | 458 | ||
459 | err = -EOPNOTSUPP; | 459 | err = -EOPNOTSUPP; |
460 | if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ | 460 | if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ |
461 | goto out; /* compatibility */ | 461 | goto out; /* compatibility */ |
462 | 462 | ||
463 | /* | 463 | /* |
464 | * Get and verify the address. | 464 | * Get and verify the address. |
465 | */ | 465 | */ |
466 | 466 | ||
467 | if (msg->msg_namelen) { | 467 | if (msg->msg_namelen) { |
468 | struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; | 468 | struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; |
469 | err = -EINVAL; | 469 | err = -EINVAL; |
470 | if (msg->msg_namelen < sizeof(*usin)) | 470 | if (msg->msg_namelen < sizeof(*usin)) |
471 | goto out; | 471 | goto out; |
472 | if (usin->sin_family != AF_INET) { | 472 | if (usin->sin_family != AF_INET) { |
473 | static int complained; | 473 | static int complained; |
474 | if (!complained++) | 474 | if (!complained++) |
475 | printk(KERN_INFO "%s forgot to set AF_INET in " | 475 | printk(KERN_INFO "%s forgot to set AF_INET in " |
476 | "raw sendmsg. Fix it!\n", | 476 | "raw sendmsg. Fix it!\n", |
477 | current->comm); | 477 | current->comm); |
478 | err = -EAFNOSUPPORT; | 478 | err = -EAFNOSUPPORT; |
479 | if (usin->sin_family) | 479 | if (usin->sin_family) |
480 | goto out; | 480 | goto out; |
481 | } | 481 | } |
482 | daddr = usin->sin_addr.s_addr; | 482 | daddr = usin->sin_addr.s_addr; |
483 | /* ANK: I did not forget to get protocol from port field. | 483 | /* ANK: I did not forget to get protocol from port field. |
484 | * I just do not know who uses this weirdness. | 484 | * I just do not know who uses this weirdness. |
485 | * IP_HDRINCL is much more convenient. | 485 | * IP_HDRINCL is much more convenient. |
486 | */ | 486 | */ |
487 | } else { | 487 | } else { |
488 | err = -EDESTADDRREQ; | 488 | err = -EDESTADDRREQ; |
489 | if (sk->sk_state != TCP_ESTABLISHED) | 489 | if (sk->sk_state != TCP_ESTABLISHED) |
490 | goto out; | 490 | goto out; |
491 | daddr = inet->daddr; | 491 | daddr = inet->daddr; |
492 | } | 492 | } |
493 | 493 | ||
494 | ipc.addr = inet->saddr; | 494 | ipc.addr = inet->saddr; |
495 | ipc.opt = NULL; | 495 | ipc.opt = NULL; |
496 | ipc.oif = sk->sk_bound_dev_if; | 496 | ipc.oif = sk->sk_bound_dev_if; |
497 | 497 | ||
498 | if (msg->msg_controllen) { | 498 | if (msg->msg_controllen) { |
499 | err = ip_cmsg_send(sock_net(sk), msg, &ipc); | 499 | err = ip_cmsg_send(sock_net(sk), msg, &ipc); |
500 | if (err) | 500 | if (err) |
501 | goto out; | 501 | goto out; |
502 | if (ipc.opt) | 502 | if (ipc.opt) |
503 | free = 1; | 503 | free = 1; |
504 | } | 504 | } |
505 | 505 | ||
506 | saddr = ipc.addr; | 506 | saddr = ipc.addr; |
507 | ipc.addr = daddr; | 507 | ipc.addr = daddr; |
508 | 508 | ||
509 | if (!ipc.opt) | 509 | if (!ipc.opt) |
510 | ipc.opt = inet->opt; | 510 | ipc.opt = inet->opt; |
511 | 511 | ||
512 | if (ipc.opt) { | 512 | if (ipc.opt) { |
513 | err = -EINVAL; | 513 | err = -EINVAL; |
514 | /* Linux does not mangle headers on raw sockets, | 514 | /* Linux does not mangle headers on raw sockets, |
515 | * so that IP options + IP_HDRINCL is non-sense. | 515 | * so that IP options + IP_HDRINCL is non-sense. |
516 | */ | 516 | */ |
517 | if (inet->hdrincl) | 517 | if (inet->hdrincl) |
518 | goto done; | 518 | goto done; |
519 | if (ipc.opt->srr) { | 519 | if (ipc.opt->srr) { |
520 | if (!daddr) | 520 | if (!daddr) |
521 | goto done; | 521 | goto done; |
522 | daddr = ipc.opt->faddr; | 522 | daddr = ipc.opt->faddr; |
523 | } | 523 | } |
524 | } | 524 | } |
525 | tos = RT_CONN_FLAGS(sk); | 525 | tos = RT_CONN_FLAGS(sk); |
526 | if (msg->msg_flags & MSG_DONTROUTE) | 526 | if (msg->msg_flags & MSG_DONTROUTE) |
527 | tos |= RTO_ONLINK; | 527 | tos |= RTO_ONLINK; |
528 | 528 | ||
529 | if (ipv4_is_multicast(daddr)) { | 529 | if (ipv4_is_multicast(daddr)) { |
530 | if (!ipc.oif) | 530 | if (!ipc.oif) |
531 | ipc.oif = inet->mc_index; | 531 | ipc.oif = inet->mc_index; |
532 | if (!saddr) | 532 | if (!saddr) |
533 | saddr = inet->mc_addr; | 533 | saddr = inet->mc_addr; |
534 | } | 534 | } |
535 | 535 | ||
536 | { | 536 | { |
537 | struct flowi fl = { .oif = ipc.oif, | 537 | struct flowi fl = { .oif = ipc.oif, |
538 | .mark = sk->sk_mark, | 538 | .mark = sk->sk_mark, |
539 | .nl_u = { .ip4_u = | 539 | .nl_u = { .ip4_u = |
540 | { .daddr = daddr, | 540 | { .daddr = daddr, |
541 | .saddr = saddr, | 541 | .saddr = saddr, |
542 | .tos = tos } }, | 542 | .tos = tos } }, |
543 | .proto = inet->hdrincl ? IPPROTO_RAW : | 543 | .proto = inet->hdrincl ? IPPROTO_RAW : |
544 | sk->sk_protocol, | 544 | sk->sk_protocol, |
545 | }; | 545 | }; |
546 | if (!inet->hdrincl) { | 546 | if (!inet->hdrincl) { |
547 | err = raw_probe_proto_opt(&fl, msg); | 547 | err = raw_probe_proto_opt(&fl, msg); |
548 | if (err) | 548 | if (err) |
549 | goto done; | 549 | goto done; |
550 | } | 550 | } |
551 | 551 | ||
552 | security_sk_classify_flow(sk, &fl); | 552 | security_sk_classify_flow(sk, &fl); |
553 | err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); | 553 | err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); |
554 | } | 554 | } |
555 | if (err) | 555 | if (err) |
556 | goto done; | 556 | goto done; |
557 | 557 | ||
558 | err = -EACCES; | 558 | err = -EACCES; |
559 | if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) | 559 | if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) |
560 | goto done; | 560 | goto done; |
561 | 561 | ||
562 | if (msg->msg_flags & MSG_CONFIRM) | 562 | if (msg->msg_flags & MSG_CONFIRM) |
563 | goto do_confirm; | 563 | goto do_confirm; |
564 | back_from_confirm: | 564 | back_from_confirm: |
565 | 565 | ||
566 | if (inet->hdrincl) | 566 | if (inet->hdrincl) |
567 | err = raw_send_hdrinc(sk, msg->msg_iov, len, | 567 | err = raw_send_hdrinc(sk, msg->msg_iov, len, |
568 | rt, msg->msg_flags); | 568 | rt, msg->msg_flags); |
569 | 569 | ||
570 | else { | 570 | else { |
571 | if (!ipc.addr) | 571 | if (!ipc.addr) |
572 | ipc.addr = rt->rt_dst; | 572 | ipc.addr = rt->rt_dst; |
573 | lock_sock(sk); | 573 | lock_sock(sk); |
574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, | 574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, |
575 | &ipc, rt, msg->msg_flags); | 575 | &ipc, rt, msg->msg_flags); |
576 | if (err) | 576 | if (err) |
577 | ip_flush_pending_frames(sk); | 577 | ip_flush_pending_frames(sk); |
578 | else if (!(msg->msg_flags & MSG_MORE)) | 578 | else if (!(msg->msg_flags & MSG_MORE)) |
579 | err = ip_push_pending_frames(sk); | 579 | err = ip_push_pending_frames(sk); |
580 | release_sock(sk); | 580 | release_sock(sk); |
581 | } | 581 | } |
582 | done: | 582 | done: |
583 | if (free) | 583 | if (free) |
584 | kfree(ipc.opt); | 584 | kfree(ipc.opt); |
585 | ip_rt_put(rt); | 585 | ip_rt_put(rt); |
586 | 586 | ||
587 | out: | 587 | out: |
588 | if (err < 0) | 588 | if (err < 0) |
589 | return err; | 589 | return err; |
590 | return len; | 590 | return len; |
591 | 591 | ||
592 | do_confirm: | 592 | do_confirm: |
593 | dst_confirm(&rt->u.dst); | 593 | dst_confirm(&rt->u.dst); |
594 | if (!(msg->msg_flags & MSG_PROBE) || len) | 594 | if (!(msg->msg_flags & MSG_PROBE) || len) |
595 | goto back_from_confirm; | 595 | goto back_from_confirm; |
596 | err = 0; | 596 | err = 0; |
597 | goto done; | 597 | goto done; |
598 | } | 598 | } |
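
Without a destination in msg_name, raw_sendmsg() insists on a connected socket (EDESTADDRREQ otherwise); connect() on a raw socket goes through ip4_datagram_connect() and pins the peer. A small sketch of both cases (the ICMP checksum is deliberately left zero just to exercise the send path; a real echo request must fill it in):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip_icmp.h>  /* struct icmphdr, ICMP_ECHO */
    #include <arpa/inet.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        struct icmphdr icmp = { .type = ICMP_ECHO };
        struct sockaddr_in dst = { .sin_family = AF_INET };

        if (fd < 0) { perror("socket"); return 1; }

        /* No destination given and not connected: raw_sendmsg() returns -EDESTADDRREQ. */
        if (send(fd, &icmp, sizeof(icmp), 0) < 0 && errno == EDESTADDRREQ)
            puts("unconnected send rejected as expected");

        dst.sin_addr.s_addr = inet_addr("127.0.0.1");
        connect(fd, (struct sockaddr *)&dst, sizeof(dst));  /* sets the peer, state TCP_ESTABLISHED */

        if (send(fd, &icmp, sizeof(icmp), 0) == (ssize_t)sizeof(icmp))
            puts("connected send accepted");
        return 0;
    }
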
599 | 599 | ||
600 | static void raw_close(struct sock *sk, long timeout) | 600 | static void raw_close(struct sock *sk, long timeout) |
601 | { | 601 | { |
602 | /* | 602 | /* |
603 | * Raw sockets may have direct kernel references. Kill them. | 603 | * Raw sockets may have direct kernel references. Kill them. |
604 | */ | 604 | */ |
605 | ip_ra_control(sk, 0, NULL); | 605 | ip_ra_control(sk, 0, NULL); |
606 | 606 | ||
607 | sk_common_release(sk); | 607 | sk_common_release(sk); |
608 | } | 608 | } |
609 | 609 | ||
610 | static void raw_destroy(struct sock *sk) | 610 | static void raw_destroy(struct sock *sk) |
611 | { | 611 | { |
612 | lock_sock(sk); | 612 | lock_sock(sk); |
613 | ip_flush_pending_frames(sk); | 613 | ip_flush_pending_frames(sk); |
614 | release_sock(sk); | 614 | release_sock(sk); |
615 | } | 615 | } |
616 | 616 | ||
617 | /* This gets rid of all the nasties in af_inet. -DaveM */ | 617 | /* This gets rid of all the nasties in af_inet. -DaveM */ |
618 | static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) | 618 | static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) |
619 | { | 619 | { |
620 | struct inet_sock *inet = inet_sk(sk); | 620 | struct inet_sock *inet = inet_sk(sk); |
621 | struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; | 621 | struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; |
622 | int ret = -EINVAL; | 622 | int ret = -EINVAL; |
623 | int chk_addr_ret; | 623 | int chk_addr_ret; |
624 | 624 | ||
625 | if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) | 625 | if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) |
626 | goto out; | 626 | goto out; |
627 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); | 627 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); |
628 | ret = -EADDRNOTAVAIL; | 628 | ret = -EADDRNOTAVAIL; |
629 | if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && | 629 | if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && |
630 | chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) | 630 | chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) |
631 | goto out; | 631 | goto out; |
632 | inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; | 632 | inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; |
633 | if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) | 633 | if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) |
634 | inet->saddr = 0; /* Use device */ | 634 | inet->saddr = 0; /* Use device */ |
635 | sk_dst_reset(sk); | 635 | sk_dst_reset(sk); |
636 | ret = 0; | 636 | ret = 0; |
637 | out: return ret; | 637 | out: return ret; |
638 | } | 638 | } |
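
raw_bind() only validates that the address is local, multicast or broadcast and records it as rcv_saddr/saddr; there is no port for raw sockets. A minimal sketch, assuming 127.0.0.1 is configured on the host:

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        struct sockaddr_in local;

        if (fd < 0) return 1;
        memset(&local, 0, sizeof(local));
        local.sin_family = AF_INET;               /* sin_port is meaningless for raw sockets */
        local.sin_addr.s_addr = inet_addr("127.0.0.1");

        if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
            perror("bind");                       /* EADDRNOTAVAIL if the address is not local */
        return 0;
    }
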
639 | 639 | ||
640 | /* | 640 | /* |
641 | * This should be easy, if there is something there | 641 | * This should be easy, if there is something there |
642 | * we return it, otherwise we block. | 642 | * we return it, otherwise we block. |
643 | */ | 643 | */ |
644 | 644 | ||
645 | static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 645 | static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
646 | size_t len, int noblock, int flags, int *addr_len) | 646 | size_t len, int noblock, int flags, int *addr_len) |
647 | { | 647 | { |
648 | struct inet_sock *inet = inet_sk(sk); | 648 | struct inet_sock *inet = inet_sk(sk); |
649 | size_t copied = 0; | 649 | size_t copied = 0; |
650 | int err = -EOPNOTSUPP; | 650 | int err = -EOPNOTSUPP; |
651 | struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; | 651 | struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; |
652 | struct sk_buff *skb; | 652 | struct sk_buff *skb; |
653 | 653 | ||
654 | if (flags & MSG_OOB) | 654 | if (flags & MSG_OOB) |
655 | goto out; | 655 | goto out; |
656 | 656 | ||
657 | if (addr_len) | 657 | if (addr_len) |
658 | *addr_len = sizeof(*sin); | 658 | *addr_len = sizeof(*sin); |
659 | 659 | ||
660 | if (flags & MSG_ERRQUEUE) { | 660 | if (flags & MSG_ERRQUEUE) { |
661 | err = ip_recv_error(sk, msg, len); | 661 | err = ip_recv_error(sk, msg, len); |
662 | goto out; | 662 | goto out; |
663 | } | 663 | } |
664 | 664 | ||
665 | skb = skb_recv_datagram(sk, flags, noblock, &err); | 665 | skb = skb_recv_datagram(sk, flags, noblock, &err); |
666 | if (!skb) | 666 | if (!skb) |
667 | goto out; | 667 | goto out; |
668 | 668 | ||
669 | copied = skb->len; | 669 | copied = skb->len; |
670 | if (len < copied) { | 670 | if (len < copied) { |
671 | msg->msg_flags |= MSG_TRUNC; | 671 | msg->msg_flags |= MSG_TRUNC; |
672 | copied = len; | 672 | copied = len; |
673 | } | 673 | } |
674 | 674 | ||
675 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); | 675 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); |
676 | if (err) | 676 | if (err) |
677 | goto done; | 677 | goto done; |
678 | 678 | ||
679 | sock_recv_timestamp(msg, sk, skb); | 679 | sock_recv_timestamp(msg, sk, skb); |
680 | 680 | ||
681 | /* Copy the address. */ | 681 | /* Copy the address. */ |
682 | if (sin) { | 682 | if (sin) { |
683 | sin->sin_family = AF_INET; | 683 | sin->sin_family = AF_INET; |
684 | sin->sin_addr.s_addr = ip_hdr(skb)->saddr; | 684 | sin->sin_addr.s_addr = ip_hdr(skb)->saddr; |
685 | sin->sin_port = 0; | 685 | sin->sin_port = 0; |
686 | memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); | 686 | memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); |
687 | } | 687 | } |
688 | if (inet->cmsg_flags) | 688 | if (inet->cmsg_flags) |
689 | ip_cmsg_recv(msg, skb); | 689 | ip_cmsg_recv(msg, skb); |
690 | if (flags & MSG_TRUNC) | 690 | if (flags & MSG_TRUNC) |
691 | copied = skb->len; | 691 | copied = skb->len; |
692 | done: | 692 | done: |
693 | skb_free_datagram(sk, skb); | 693 | skb_free_datagram(sk, skb); |
694 | out: | 694 | out: |
695 | if (err) | 695 | if (err) |
696 | return err; | 696 | return err; |
697 | return copied; | 697 | return copied; |
698 | } | 698 | } |
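
Because the copied length is reset to skb->len when MSG_TRUNC is passed in flags, userspace can learn the true datagram size even with a too-small buffer. A brief sketch:

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        char tiny[1];
        ssize_t full;

        if (fd < 0) return 1;
        /* Blocks until an ICMP datagram arrives (e.g. ping running elsewhere).
         * With MSG_TRUNC in flags the return value is the full datagram length,
         * even though only one byte lands in the buffer; MSG_TRUNC is also set
         * in msg_flags when the copy was cut short. */
        full = recv(fd, tiny, sizeof(tiny), MSG_TRUNC);
        if (full >= 0)
            printf("datagram was %zd bytes\n", full);
        return 0;
    }
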
699 | 699 | ||
700 | static int raw_init(struct sock *sk) | 700 | static int raw_init(struct sock *sk) |
701 | { | 701 | { |
702 | struct raw_sock *rp = raw_sk(sk); | 702 | struct raw_sock *rp = raw_sk(sk); |
703 | 703 | ||
704 | if (inet_sk(sk)->num == IPPROTO_ICMP) | 704 | if (inet_sk(sk)->num == IPPROTO_ICMP) |
705 | memset(&rp->filter, 0, sizeof(rp->filter)); | 705 | memset(&rp->filter, 0, sizeof(rp->filter)); |
706 | return 0; | 706 | return 0; |
707 | } | 707 | } |
708 | 708 | ||
709 | static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) | 709 | static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) |
710 | { | 710 | { |
711 | if (optlen > sizeof(struct icmp_filter)) | 711 | if (optlen > sizeof(struct icmp_filter)) |
712 | optlen = sizeof(struct icmp_filter); | 712 | optlen = sizeof(struct icmp_filter); |
713 | if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) | 713 | if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) |
714 | return -EFAULT; | 714 | return -EFAULT; |
715 | return 0; | 715 | return 0; |
716 | } | 716 | } |
717 | 717 | ||
718 | static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) | 718 | static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) |
719 | { | 719 | { |
720 | int len, ret = -EFAULT; | 720 | int len, ret = -EFAULT; |
721 | 721 | ||
722 | if (get_user(len, optlen)) | 722 | if (get_user(len, optlen)) |
723 | goto out; | 723 | goto out; |
724 | ret = -EINVAL; | 724 | ret = -EINVAL; |
725 | if (len < 0) | 725 | if (len < 0) |
726 | goto out; | 726 | goto out; |
727 | if (len > sizeof(struct icmp_filter)) | 727 | if (len > sizeof(struct icmp_filter)) |
728 | len = sizeof(struct icmp_filter); | 728 | len = sizeof(struct icmp_filter); |
729 | ret = -EFAULT; | 729 | ret = -EFAULT; |
730 | if (put_user(len, optlen) || | 730 | if (put_user(len, optlen) || |
731 | copy_to_user(optval, &raw_sk(sk)->filter, len)) | 731 | copy_to_user(optval, &raw_sk(sk)->filter, len)) |
732 | goto out; | 732 | goto out; |
733 | ret = 0; | 733 | ret = 0; |
734 | out: return ret; | 734 | out: return ret; |
735 | } | 735 | } |
736 | 736 | ||
737 | static int do_raw_setsockopt(struct sock *sk, int level, int optname, | 737 | static int do_raw_setsockopt(struct sock *sk, int level, int optname, |
738 | char __user *optval, int optlen) | 738 | char __user *optval, int optlen) |
739 | { | 739 | { |
740 | if (optname == ICMP_FILTER) { | 740 | if (optname == ICMP_FILTER) { |
741 | if (inet_sk(sk)->num != IPPROTO_ICMP) | 741 | if (inet_sk(sk)->num != IPPROTO_ICMP) |
742 | return -EOPNOTSUPP; | 742 | return -EOPNOTSUPP; |
743 | else | 743 | else |
744 | return raw_seticmpfilter(sk, optval, optlen); | 744 | return raw_seticmpfilter(sk, optval, optlen); |
745 | } | 745 | } |
746 | return -ENOPROTOOPT; | 746 | return -ENOPROTOOPT; |
747 | } | 747 | } |
748 | 748 | ||
749 | static int raw_setsockopt(struct sock *sk, int level, int optname, | 749 | static int raw_setsockopt(struct sock *sk, int level, int optname, |
750 | char __user *optval, int optlen) | 750 | char __user *optval, int optlen) |
751 | { | 751 | { |
752 | if (level != SOL_RAW) | 752 | if (level != SOL_RAW) |
753 | return ip_setsockopt(sk, level, optname, optval, optlen); | 753 | return ip_setsockopt(sk, level, optname, optval, optlen); |
754 | return do_raw_setsockopt(sk, level, optname, optval, optlen); | 754 | return do_raw_setsockopt(sk, level, optname, optval, optlen); |
755 | } | 755 | } |
756 | 756 | ||
757 | #ifdef CONFIG_COMPAT | 757 | #ifdef CONFIG_COMPAT |
758 | static int compat_raw_setsockopt(struct sock *sk, int level, int optname, | 758 | static int compat_raw_setsockopt(struct sock *sk, int level, int optname, |
759 | char __user *optval, int optlen) | 759 | char __user *optval, int optlen) |
760 | { | 760 | { |
761 | if (level != SOL_RAW) | 761 | if (level != SOL_RAW) |
762 | return compat_ip_setsockopt(sk, level, optname, optval, optlen); | 762 | return compat_ip_setsockopt(sk, level, optname, optval, optlen); |
763 | return do_raw_setsockopt(sk, level, optname, optval, optlen); | 763 | return do_raw_setsockopt(sk, level, optname, optval, optlen); |
764 | } | 764 | } |
765 | #endif | 765 | #endif |
766 | 766 | ||
767 | static int do_raw_getsockopt(struct sock *sk, int level, int optname, | 767 | static int do_raw_getsockopt(struct sock *sk, int level, int optname, |
768 | char __user *optval, int __user *optlen) | 768 | char __user *optval, int __user *optlen) |
769 | { | 769 | { |
770 | if (optname == ICMP_FILTER) { | 770 | if (optname == ICMP_FILTER) { |
771 | if (inet_sk(sk)->num != IPPROTO_ICMP) | 771 | if (inet_sk(sk)->num != IPPROTO_ICMP) |
772 | return -EOPNOTSUPP; | 772 | return -EOPNOTSUPP; |
773 | else | 773 | else |
774 | return raw_geticmpfilter(sk, optval, optlen); | 774 | return raw_geticmpfilter(sk, optval, optlen); |
775 | } | 775 | } |
776 | return -ENOPROTOOPT; | 776 | return -ENOPROTOOPT; |
777 | } | 777 | } |
778 | 778 | ||
779 | static int raw_getsockopt(struct sock *sk, int level, int optname, | 779 | static int raw_getsockopt(struct sock *sk, int level, int optname, |
780 | char __user *optval, int __user *optlen) | 780 | char __user *optval, int __user *optlen) |
781 | { | 781 | { |
782 | if (level != SOL_RAW) | 782 | if (level != SOL_RAW) |
783 | return ip_getsockopt(sk, level, optname, optval, optlen); | 783 | return ip_getsockopt(sk, level, optname, optval, optlen); |
784 | return do_raw_getsockopt(sk, level, optname, optval, optlen); | 784 | return do_raw_getsockopt(sk, level, optname, optval, optlen); |
785 | } | 785 | } |
786 | 786 | ||
787 | #ifdef CONFIG_COMPAT | 787 | #ifdef CONFIG_COMPAT |
788 | static int compat_raw_getsockopt(struct sock *sk, int level, int optname, | 788 | static int compat_raw_getsockopt(struct sock *sk, int level, int optname, |
789 | char __user *optval, int __user *optlen) | 789 | char __user *optval, int __user *optlen) |
790 | { | 790 | { |
791 | if (level != SOL_RAW) | 791 | if (level != SOL_RAW) |
792 | return compat_ip_getsockopt(sk, level, optname, optval, optlen); | 792 | return compat_ip_getsockopt(sk, level, optname, optval, optlen); |
793 | return do_raw_getsockopt(sk, level, optname, optval, optlen); | 793 | return do_raw_getsockopt(sk, level, optname, optval, optlen); |
794 | } | 794 | } |
795 | #endif | 795 | #endif |
796 | 796 | ||
797 | static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) | 797 | static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) |
798 | { | 798 | { |
799 | switch (cmd) { | 799 | switch (cmd) { |
800 | case SIOCOUTQ: { | 800 | case SIOCOUTQ: { |
801 | int amount = atomic_read(&sk->sk_wmem_alloc); | 801 | int amount = atomic_read(&sk->sk_wmem_alloc); |
802 | return put_user(amount, (int __user *)arg); | 802 | return put_user(amount, (int __user *)arg); |
803 | } | 803 | } |
804 | case SIOCINQ: { | 804 | case SIOCINQ: { |
805 | struct sk_buff *skb; | 805 | struct sk_buff *skb; |
806 | int amount = 0; | 806 | int amount = 0; |
807 | 807 | ||
808 | spin_lock_bh(&sk->sk_receive_queue.lock); | 808 | spin_lock_bh(&sk->sk_receive_queue.lock); |
809 | skb = skb_peek(&sk->sk_receive_queue); | 809 | skb = skb_peek(&sk->sk_receive_queue); |
810 | if (skb != NULL) | 810 | if (skb != NULL) |
811 | amount = skb->len; | 811 | amount = skb->len; |
812 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 812 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
813 | return put_user(amount, (int __user *)arg); | 813 | return put_user(amount, (int __user *)arg); |
814 | } | 814 | } |
815 | 815 | ||
816 | default: | 816 | default: |
817 | #ifdef CONFIG_IP_MROUTE | 817 | #ifdef CONFIG_IP_MROUTE |
818 | return ipmr_ioctl(sk, cmd, (void __user *)arg); | 818 | return ipmr_ioctl(sk, cmd, (void __user *)arg); |
819 | #else | 819 | #else |
820 | return -ENOIOCTLCMD; | 820 | return -ENOIOCTLCMD; |
821 | #endif | 821 | #endif |
822 | } | 822 | } |
823 | } | 823 | } |
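
The ioctl handler above exposes queue state: SIOCOUTQ reads sk_wmem_alloc and SIOCINQ the length of the next datagram in the receive queue. A usage sketch:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/sockios.h>   /* SIOCINQ, SIOCOUTQ */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        int inq = 0, outq = 0;

        if (fd < 0) return 1;
        ioctl(fd, SIOCINQ, &inq);    /* length of the next queued datagram, 0 if the queue is empty */
        ioctl(fd, SIOCOUTQ, &outq);  /* bytes of skb memory still charged to the send side */
        printf("SIOCINQ=%d SIOCOUTQ=%d\n", inq, outq);
        return 0;
    }
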
824 | 824 | ||
825 | struct proto raw_prot = { | 825 | struct proto raw_prot = { |
826 | .name = "RAW", | 826 | .name = "RAW", |
827 | .owner = THIS_MODULE, | 827 | .owner = THIS_MODULE, |
828 | .close = raw_close, | 828 | .close = raw_close, |
829 | .destroy = raw_destroy, | 829 | .destroy = raw_destroy, |
830 | .connect = ip4_datagram_connect, | 830 | .connect = ip4_datagram_connect, |
831 | .disconnect = udp_disconnect, | 831 | .disconnect = udp_disconnect, |
832 | .ioctl = raw_ioctl, | 832 | .ioctl = raw_ioctl, |
833 | .init = raw_init, | 833 | .init = raw_init, |
834 | .setsockopt = raw_setsockopt, | 834 | .setsockopt = raw_setsockopt, |
835 | .getsockopt = raw_getsockopt, | 835 | .getsockopt = raw_getsockopt, |
836 | .sendmsg = raw_sendmsg, | 836 | .sendmsg = raw_sendmsg, |
837 | .recvmsg = raw_recvmsg, | 837 | .recvmsg = raw_recvmsg, |
838 | .bind = raw_bind, | 838 | .bind = raw_bind, |
839 | .backlog_rcv = raw_rcv_skb, | 839 | .backlog_rcv = raw_rcv_skb, |
840 | .hash = raw_hash_sk, | 840 | .hash = raw_hash_sk, |
841 | .unhash = raw_unhash_sk, | 841 | .unhash = raw_unhash_sk, |
842 | .obj_size = sizeof(struct raw_sock), | 842 | .obj_size = sizeof(struct raw_sock), |
843 | .h.raw_hash = &raw_v4_hashinfo, | 843 | .h.raw_hash = &raw_v4_hashinfo, |
844 | #ifdef CONFIG_COMPAT | 844 | #ifdef CONFIG_COMPAT |
845 | .compat_setsockopt = compat_raw_setsockopt, | 845 | .compat_setsockopt = compat_raw_setsockopt, |
846 | .compat_getsockopt = compat_raw_getsockopt, | 846 | .compat_getsockopt = compat_raw_getsockopt, |
847 | #endif | 847 | #endif |
848 | }; | 848 | }; |
849 | 849 | ||
850 | #ifdef CONFIG_PROC_FS | 850 | #ifdef CONFIG_PROC_FS |
851 | static struct sock *raw_get_first(struct seq_file *seq) | 851 | static struct sock *raw_get_first(struct seq_file *seq) |
852 | { | 852 | { |
853 | struct sock *sk; | 853 | struct sock *sk; |
854 | struct raw_iter_state* state = raw_seq_private(seq); | 854 | struct raw_iter_state *state = raw_seq_private(seq); |
855 | 855 | ||
856 | for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; | 856 | for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; |
857 | ++state->bucket) { | 857 | ++state->bucket) { |
858 | struct hlist_node *node; | 858 | struct hlist_node *node; |
859 | 859 | ||
860 | sk_for_each(sk, node, &state->h->ht[state->bucket]) | 860 | sk_for_each(sk, node, &state->h->ht[state->bucket]) |
861 | if (sock_net(sk) == seq_file_net(seq)) | 861 | if (sock_net(sk) == seq_file_net(seq)) |
862 | goto found; | 862 | goto found; |
863 | } | 863 | } |
864 | sk = NULL; | 864 | sk = NULL; |
865 | found: | 865 | found: |
866 | return sk; | 866 | return sk; |
867 | } | 867 | } |
868 | 868 | ||
869 | static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) | 869 | static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) |
870 | { | 870 | { |
871 | struct raw_iter_state* state = raw_seq_private(seq); | 871 | struct raw_iter_state *state = raw_seq_private(seq); |
872 | 872 | ||
873 | do { | 873 | do { |
874 | sk = sk_next(sk); | 874 | sk = sk_next(sk); |
875 | try_again: | 875 | try_again: |
876 | ; | 876 | ; |
877 | } while (sk && sock_net(sk) != seq_file_net(seq)); | 877 | } while (sk && sock_net(sk) != seq_file_net(seq)); |
878 | 878 | ||
879 | if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { | 879 | if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { |
880 | sk = sk_head(&state->h->ht[state->bucket]); | 880 | sk = sk_head(&state->h->ht[state->bucket]); |
881 | goto try_again; | 881 | goto try_again; |
882 | } | 882 | } |
883 | return sk; | 883 | return sk; |
884 | } | 884 | } |
885 | 885 | ||
886 | static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) | 886 | static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) |
887 | { | 887 | { |
888 | struct sock *sk = raw_get_first(seq); | 888 | struct sock *sk = raw_get_first(seq); |
889 | 889 | ||
890 | if (sk) | 890 | if (sk) |
891 | while (pos && (sk = raw_get_next(seq, sk)) != NULL) | 891 | while (pos && (sk = raw_get_next(seq, sk)) != NULL) |
892 | --pos; | 892 | --pos; |
893 | return pos ? NULL : sk; | 893 | return pos ? NULL : sk; |
894 | } | 894 | } |
895 | 895 | ||
896 | void *raw_seq_start(struct seq_file *seq, loff_t *pos) | 896 | void *raw_seq_start(struct seq_file *seq, loff_t *pos) |
897 | { | 897 | { |
898 | struct raw_iter_state *state = raw_seq_private(seq); | 898 | struct raw_iter_state *state = raw_seq_private(seq); |
899 | 899 | ||
900 | read_lock(&state->h->lock); | 900 | read_lock(&state->h->lock); |
901 | return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; | 901 | return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; |
902 | } | 902 | } |
903 | EXPORT_SYMBOL_GPL(raw_seq_start); | 903 | EXPORT_SYMBOL_GPL(raw_seq_start); |
904 | 904 | ||
905 | void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 905 | void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
906 | { | 906 | { |
907 | struct sock *sk; | 907 | struct sock *sk; |
908 | 908 | ||
909 | if (v == SEQ_START_TOKEN) | 909 | if (v == SEQ_START_TOKEN) |
910 | sk = raw_get_first(seq); | 910 | sk = raw_get_first(seq); |
911 | else | 911 | else |
912 | sk = raw_get_next(seq, v); | 912 | sk = raw_get_next(seq, v); |
913 | ++*pos; | 913 | ++*pos; |
914 | return sk; | 914 | return sk; |
915 | } | 915 | } |
916 | EXPORT_SYMBOL_GPL(raw_seq_next); | 916 | EXPORT_SYMBOL_GPL(raw_seq_next); |
917 | 917 | ||
918 | void raw_seq_stop(struct seq_file *seq, void *v) | 918 | void raw_seq_stop(struct seq_file *seq, void *v) |
919 | { | 919 | { |
920 | struct raw_iter_state *state = raw_seq_private(seq); | 920 | struct raw_iter_state *state = raw_seq_private(seq); |
921 | 921 | ||
922 | read_unlock(&state->h->lock); | 922 | read_unlock(&state->h->lock); |
923 | } | 923 | } |
924 | EXPORT_SYMBOL_GPL(raw_seq_stop); | 924 | EXPORT_SYMBOL_GPL(raw_seq_stop); |
925 | 925 | ||
926 | static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) | 926 | static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) |
927 | { | 927 | { |
928 | struct inet_sock *inet = inet_sk(sp); | 928 | struct inet_sock *inet = inet_sk(sp); |
929 | __be32 dest = inet->daddr, | 929 | __be32 dest = inet->daddr, |
930 | src = inet->rcv_saddr; | 930 | src = inet->rcv_saddr; |
931 | __u16 destp = 0, | 931 | __u16 destp = 0, |
932 | srcp = inet->num; | 932 | srcp = inet->num; |
933 | 933 | ||
934 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" | 934 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" |
935 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", | 935 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", |
936 | i, src, srcp, dest, destp, sp->sk_state, | 936 | i, src, srcp, dest, destp, sp->sk_state, |
937 | atomic_read(&sp->sk_wmem_alloc), | 937 | atomic_read(&sp->sk_wmem_alloc), |
938 | atomic_read(&sp->sk_rmem_alloc), | 938 | atomic_read(&sp->sk_rmem_alloc), |
939 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), | 939 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), |
940 | atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); | 940 | atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); |
941 | } | 941 | } |
942 | 942 | ||
943 | static int raw_seq_show(struct seq_file *seq, void *v) | 943 | static int raw_seq_show(struct seq_file *seq, void *v) |
944 | { | 944 | { |
945 | if (v == SEQ_START_TOKEN) | 945 | if (v == SEQ_START_TOKEN) |
946 | seq_printf(seq, " sl local_address rem_address st tx_queue " | 946 | seq_printf(seq, " sl local_address rem_address st tx_queue " |
947 | "rx_queue tr tm->when retrnsmt uid timeout " | 947 | "rx_queue tr tm->when retrnsmt uid timeout " |
948 | "inode ref pointer drops\n"); | 948 | "inode ref pointer drops\n"); |
949 | else | 949 | else |
950 | raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); | 950 | raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); |
951 | return 0; | 951 | return 0; |
952 | } | 952 | } |
953 | 953 | ||
954 | static const struct seq_operations raw_seq_ops = { | 954 | static const struct seq_operations raw_seq_ops = { |
955 | .start = raw_seq_start, | 955 | .start = raw_seq_start, |
956 | .next = raw_seq_next, | 956 | .next = raw_seq_next, |
957 | .stop = raw_seq_stop, | 957 | .stop = raw_seq_stop, |
958 | .show = raw_seq_show, | 958 | .show = raw_seq_show, |
959 | }; | 959 | }; |
960 | 960 | ||
961 | int raw_seq_open(struct inode *ino, struct file *file, | 961 | int raw_seq_open(struct inode *ino, struct file *file, |
962 | struct raw_hashinfo *h, const struct seq_operations *ops) | 962 | struct raw_hashinfo *h, const struct seq_operations *ops) |
963 | { | 963 | { |
964 | int err; | 964 | int err; |
965 | struct raw_iter_state *i; | 965 | struct raw_iter_state *i; |
966 | 966 | ||
967 | err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state)); | 967 | err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state)); |
968 | if (err < 0) | 968 | if (err < 0) |
969 | return err; | 969 | return err; |
970 | 970 | ||
971 | i = raw_seq_private((struct seq_file *)file->private_data); | 971 | i = raw_seq_private((struct seq_file *)file->private_data); |
972 | i->h = h; | 972 | i->h = h; |
973 | return 0; | 973 | return 0; |
974 | } | 974 | } |
975 | EXPORT_SYMBOL_GPL(raw_seq_open); | 975 | EXPORT_SYMBOL_GPL(raw_seq_open); |
976 | 976 | ||
977 | static int raw_v4_seq_open(struct inode *inode, struct file *file) | 977 | static int raw_v4_seq_open(struct inode *inode, struct file *file) |
978 | { | 978 | { |
979 | return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops); | 979 | return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops); |
980 | } | 980 | } |
981 | 981 | ||
982 | static const struct file_operations raw_seq_fops = { | 982 | static const struct file_operations raw_seq_fops = { |
983 | .owner = THIS_MODULE, | 983 | .owner = THIS_MODULE, |
984 | .open = raw_v4_seq_open, | 984 | .open = raw_v4_seq_open, |
985 | .read = seq_read, | 985 | .read = seq_read, |
986 | .llseek = seq_lseek, | 986 | .llseek = seq_lseek, |
987 | .release = seq_release_net, | 987 | .release = seq_release_net, |
988 | }; | 988 | }; |
989 | 989 | ||
990 | static __net_init int raw_init_net(struct net *net) | 990 | static __net_init int raw_init_net(struct net *net) |
991 | { | 991 | { |
992 | if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) | 992 | if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) |
993 | return -ENOMEM; | 993 | return -ENOMEM; |
994 | 994 | ||
995 | return 0; | 995 | return 0; |
996 | } | 996 | } |
997 | 997 | ||
998 | static __net_exit void raw_exit_net(struct net *net) | 998 | static __net_exit void raw_exit_net(struct net *net) |
999 | { | 999 | { |
1000 | proc_net_remove(net, "raw"); | 1000 | proc_net_remove(net, "raw"); |
1001 | } | 1001 | } |
1002 | 1002 | ||
1003 | static __net_initdata struct pernet_operations raw_net_ops = { | 1003 | static __net_initdata struct pernet_operations raw_net_ops = { |
1004 | .init = raw_init_net, | 1004 | .init = raw_init_net, |
1005 | .exit = raw_exit_net, | 1005 | .exit = raw_exit_net, |
1006 | }; | 1006 | }; |
1007 | 1007 | ||
1008 | int __init raw_proc_init(void) | 1008 | int __init raw_proc_init(void) |
1009 | { | 1009 | { |
1010 | return register_pernet_subsys(&raw_net_ops); | 1010 | return register_pernet_subsys(&raw_net_ops); |
1011 | } | 1011 | } |
1012 | 1012 | ||
1013 | void __init raw_proc_exit(void) | 1013 | void __init raw_proc_exit(void) |
1014 | { | 1014 | { |
1015 | unregister_pernet_subsys(&raw_net_ops); | 1015 | unregister_pernet_subsys(&raw_net_ops); |
1016 | } | 1016 | } |
1017 | #endif /* CONFIG_PROC_FS */ | 1017 | #endif /* CONFIG_PROC_FS */ |
1018 | 1018 |
net/ipv4/tcp.c
1 | /* | 1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket | 3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. | 4 | * interface as the means of communication with the user level. |
5 | * | 5 | * |
6 | * Implementation of the Transmission Control Protocol(TCP). | 6 | * Implementation of the Transmission Control Protocol(TCP). |
7 | * | 7 | * |
8 | * Authors: Ross Biro | 8 | * Authors: Ross Biro |
9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> | 10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> |
11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> | 11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> |
12 | * Florian La Roche, <flla@stud.uni-sb.de> | 12 | * Florian La Roche, <flla@stud.uni-sb.de> |
13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> | 13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> |
14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> | 14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> |
15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> | 15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> |
16 | * Matthew Dillon, <dillon@apollo.west.oic.com> | 16 | * Matthew Dillon, <dillon@apollo.west.oic.com> |
17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | 17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> |
18 | * Jorge Cwik, <jorge@laser.satlink.net> | 18 | * Jorge Cwik, <jorge@laser.satlink.net> |
19 | * | 19 | * |
20 | * Fixes: | 20 | * Fixes: |
21 | * Alan Cox : Numerous verify_area() calls | 21 | * Alan Cox : Numerous verify_area() calls |
22 | * Alan Cox : Set the ACK bit on a reset | 22 | * Alan Cox : Set the ACK bit on a reset |
23 | * Alan Cox : Stopped it crashing if it closed while | 23 | * Alan Cox : Stopped it crashing if it closed while |
24 | * sk->inuse=1 and was trying to connect | 24 | * sk->inuse=1 and was trying to connect |
25 | * (tcp_err()). | 25 | * (tcp_err()). |
26 | * Alan Cox : All icmp error handling was broken | 26 | * Alan Cox : All icmp error handling was broken |
27 | * pointers passed were wrong and the | 27 | * pointers passed were wrong and the |
28 | * socket was looked up backwards. Nobody | 28 | * socket was looked up backwards. Nobody |
29 | * tested any icmp error code obviously. | 29 | * tested any icmp error code obviously. |
30 | * Alan Cox : tcp_err() now handled properly. It | 30 | * Alan Cox : tcp_err() now handled properly. It |
31 | * wakes people on errors. poll | 31 | * wakes people on errors. poll |
32 | * behaves and the icmp error race | 32 | * behaves and the icmp error race |
33 | * has gone by moving it into sock.c | 33 | * has gone by moving it into sock.c |
34 | * Alan Cox : tcp_send_reset() fixed to work for | 34 | * Alan Cox : tcp_send_reset() fixed to work for |
35 | * everything not just packets for | 35 | * everything not just packets for |
36 | * unknown sockets. | 36 | * unknown sockets. |
37 | * Alan Cox : tcp option processing. | 37 | * Alan Cox : tcp option processing. |
38 | * Alan Cox : Reset tweaked (still not 100%) [Had | 38 | * Alan Cox : Reset tweaked (still not 100%) [Had |
39 | * syn rule wrong] | 39 | * syn rule wrong] |
40 | * Herp Rosmanith : More reset fixes | 40 | * Herp Rosmanith : More reset fixes |
41 | * Alan Cox : No longer acks invalid rst frames. | 41 | * Alan Cox : No longer acks invalid rst frames. |
42 | * Acking any kind of RST is right out. | 42 | * Acking any kind of RST is right out. |
43 | * Alan Cox : Sets an ignore me flag on an rst | 43 | * Alan Cox : Sets an ignore me flag on an rst |
44 | * receive otherwise odd bits of prattle | 44 | * receive otherwise odd bits of prattle |
45 | * escape still | 45 | * escape still |
46 | * Alan Cox : Fixed another acking RST frame bug. | 46 | * Alan Cox : Fixed another acking RST frame bug. |
47 | * Should stop LAN workplace lockups. | 47 | * Should stop LAN workplace lockups. |
48 | * Alan Cox : Some tidyups using the new skb list | 48 | * Alan Cox : Some tidyups using the new skb list |
49 | * facilities | 49 | * facilities |
50 | * Alan Cox : sk->keepopen now seems to work | 50 | * Alan Cox : sk->keepopen now seems to work |
51 | * Alan Cox : Pulls options out correctly on accepts | 51 | * Alan Cox : Pulls options out correctly on accepts |
52 | * Alan Cox : Fixed assorted sk->rqueue->next errors | 52 | * Alan Cox : Fixed assorted sk->rqueue->next errors |
53 | * Alan Cox : PSH doesn't end a TCP read. Switched a | 53 | * Alan Cox : PSH doesn't end a TCP read. Switched a |
54 | * bit to skb ops. | 54 | * bit to skb ops. |
55 | * Alan Cox : Tidied tcp_data to avoid a potential | 55 | * Alan Cox : Tidied tcp_data to avoid a potential |
56 | * nasty. | 56 | * nasty. |
57 | * Alan Cox : Added some better commenting, as the | 57 | * Alan Cox : Added some better commenting, as the |
58 | * tcp is hard to follow | 58 | * tcp is hard to follow |
59 | * Alan Cox : Removed incorrect check for 20 * psh | 59 | * Alan Cox : Removed incorrect check for 20 * psh |
60 | * Michael O'Reilly : ack < copied bug fix. | 60 | * Michael O'Reilly : ack < copied bug fix. |
61 | * Johannes Stille : Misc tcp fixes (not all in yet). | 61 | * Johannes Stille : Misc tcp fixes (not all in yet). |
62 | * Alan Cox : FIN with no memory -> CRASH | 62 | * Alan Cox : FIN with no memory -> CRASH |
63 | * Alan Cox : Added socket option proto entries. | 63 | * Alan Cox : Added socket option proto entries. |
64 | * Also added awareness of them to accept. | 64 | * Also added awareness of them to accept. |
65 | * Alan Cox : Added TCP options (SOL_TCP) | 65 | * Alan Cox : Added TCP options (SOL_TCP) |
66 | * Alan Cox : Switched wakeup calls to callbacks, | 66 | * Alan Cox : Switched wakeup calls to callbacks, |
67 | * so the kernel can layer network | 67 | * so the kernel can layer network |
68 | * sockets. | 68 | * sockets. |
69 | * Alan Cox : Use ip_tos/ip_ttl settings. | 69 | * Alan Cox : Use ip_tos/ip_ttl settings. |
70 | * Alan Cox : Handle FIN (more) properly (we hope). | 70 | * Alan Cox : Handle FIN (more) properly (we hope). |
71 | * Alan Cox : RST frames sent on unsynchronised | 71 | * Alan Cox : RST frames sent on unsynchronised |
72 | * state ack error. | 72 | * state ack error. |
73 | * Alan Cox : Put in missing check for SYN bit. | 73 | * Alan Cox : Put in missing check for SYN bit. |
74 | * Alan Cox : Added tcp_select_window() aka NET2E | 74 | * Alan Cox : Added tcp_select_window() aka NET2E |
75 | * window non shrink trick. | 75 | * window non shrink trick. |
76 | * Alan Cox : Added a couple of small NET2E timer | 76 | * Alan Cox : Added a couple of small NET2E timer |
77 | * fixes | 77 | * fixes |
78 | * Charles Hedrick : TCP fixes | 78 | * Charles Hedrick : TCP fixes |
79 | * Toomas Tamm : TCP window fixes | 79 | * Toomas Tamm : TCP window fixes |
80 | * Alan Cox : Small URG fix to rlogin ^C ack fight | 80 | * Alan Cox : Small URG fix to rlogin ^C ack fight |
81 | * Charles Hedrick : Rewrote most of it to actually work | 81 | * Charles Hedrick : Rewrote most of it to actually work |
82 | * Linus : Rewrote tcp_read() and URG handling | 82 | * Linus : Rewrote tcp_read() and URG handling |
83 | * completely | 83 | * completely |
84 | * Gerhard Koerting: Fixed some missing timer handling | 84 | * Gerhard Koerting: Fixed some missing timer handling |
85 | * Matthew Dillon : Reworked TCP machine states as per RFC | 85 | * Matthew Dillon : Reworked TCP machine states as per RFC |
86 | * Gerhard Koerting: PC/TCP workarounds | 86 | * Gerhard Koerting: PC/TCP workarounds |
87 | * Adam Caldwell : Assorted timer/timing errors | 87 | * Adam Caldwell : Assorted timer/timing errors |
88 | * Matthew Dillon : Fixed another RST bug | 88 | * Matthew Dillon : Fixed another RST bug |
89 | * Alan Cox : Move to kernel side addressing changes. | 89 | * Alan Cox : Move to kernel side addressing changes. |
90 | * Alan Cox : Beginning work on TCP fastpathing | 90 | * Alan Cox : Beginning work on TCP fastpathing |
91 | * (not yet usable) | 91 | * (not yet usable) |
92 | * Arnt Gulbrandsen: Turbocharged tcp_check() routine. | 92 | * Arnt Gulbrandsen: Turbocharged tcp_check() routine. |
93 | * Alan Cox : TCP fast path debugging | 93 | * Alan Cox : TCP fast path debugging |
94 | * Alan Cox : Window clamping | 94 | * Alan Cox : Window clamping |
95 | * Michael Riepe : Bug in tcp_check() | 95 | * Michael Riepe : Bug in tcp_check() |
96 | * Matt Dillon : More TCP improvements and RST bug fixes | 96 | * Matt Dillon : More TCP improvements and RST bug fixes |
97 | * Matt Dillon : Yet more small nasties removed from the | 97 | * Matt Dillon : Yet more small nasties removed from the |
98 | * TCP code (Be very nice to this man if | 98 | * TCP code (Be very nice to this man if |
99 | * tcp finally works 100%) 8) | 99 | * tcp finally works 100%) 8) |
100 | * Alan Cox : BSD accept semantics. | 100 | * Alan Cox : BSD accept semantics. |
101 | * Alan Cox : Reset on closedown bug. | 101 | * Alan Cox : Reset on closedown bug. |
102 | * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). | 102 | * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). |
103 | * Michael Pall : Handle poll() after URG properly in | 103 | * Michael Pall : Handle poll() after URG properly in |
104 | * all cases. | 104 | * all cases. |
105 | * Michael Pall : Undo the last fix in tcp_read_urg() | 105 | * Michael Pall : Undo the last fix in tcp_read_urg() |
106 | * (multi URG PUSH broke rlogin). | 106 | * (multi URG PUSH broke rlogin). |
107 | * Michael Pall : Fix the multi URG PUSH problem in | 107 | * Michael Pall : Fix the multi URG PUSH problem in |
108 | * tcp_readable(), poll() after URG | 108 | * tcp_readable(), poll() after URG |
109 | * works now. | 109 | * works now. |
110 | * Michael Pall : recv(...,MSG_OOB) never blocks in the | 110 | * Michael Pall : recv(...,MSG_OOB) never blocks in the |
111 | * BSD api. | 111 | * BSD api. |
112 | * Alan Cox : Changed the semantics of sk->socket to | 112 | * Alan Cox : Changed the semantics of sk->socket to |
113 | * fix a race and a signal problem with | 113 | * fix a race and a signal problem with |
114 | * accept() and async I/O. | 114 | * accept() and async I/O. |
115 | * Alan Cox : Relaxed the rules on tcp_sendto(). | 115 | * Alan Cox : Relaxed the rules on tcp_sendto(). |
116 | * Yury Shevchuk : Really fixed accept() blocking problem. | 116 | * Yury Shevchuk : Really fixed accept() blocking problem. |
117 | * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for | 117 | * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for |
118 | * clients/servers which listen in on | 118 | * clients/servers which listen in on |
119 | * fixed ports. | 119 | * fixed ports. |
120 | * Alan Cox : Cleaned the above up and shrank it to | 120 | * Alan Cox : Cleaned the above up and shrank it to |
121 | * a sensible code size. | 121 | * a sensible code size. |
122 | * Alan Cox : Self connect lockup fix. | 122 | * Alan Cox : Self connect lockup fix. |
123 | * Alan Cox : No connect to multicast. | 123 | * Alan Cox : No connect to multicast. |
124 | * Ross Biro : Close unaccepted children on master | 124 | * Ross Biro : Close unaccepted children on master |
125 | * socket close. | 125 | * socket close. |
126 | * Alan Cox : Reset tracing code. | 126 | * Alan Cox : Reset tracing code. |
127 | * Alan Cox : Spurious resets on shutdown. | 127 | * Alan Cox : Spurious resets on shutdown. |
128 | * Alan Cox : Giant 15 minute/60 second timer error | 128 | * Alan Cox : Giant 15 minute/60 second timer error |
129 | * Alan Cox : Small whoops in polling before an | 129 | * Alan Cox : Small whoops in polling before an |
130 | * accept. | 130 | * accept. |
131 | * Alan Cox : Kept the state trace facility since | 131 | * Alan Cox : Kept the state trace facility since |
132 | * it's handy for debugging. | 132 | * it's handy for debugging. |
133 | * Alan Cox : More reset handler fixes. | 133 | * Alan Cox : More reset handler fixes. |
134 | * Alan Cox : Started rewriting the code based on | 134 | * Alan Cox : Started rewriting the code based on |
135 | * the RFC's for other useful protocol | 135 | * the RFC's for other useful protocol |
136 | * references see: Comer, KA9Q NOS, and | 136 | * references see: Comer, KA9Q NOS, and |
137 | * for a reference on the difference | 137 | * for a reference on the difference |
138 | * between specifications and how BSD | 138 | * between specifications and how BSD |
139 | * works see the 4.4lite source. | 139 | * works see the 4.4lite source. |
140 | * A.N.Kuznetsov : Don't time wait on completion of tidy | 140 | * A.N.Kuznetsov : Don't time wait on completion of tidy |
141 | * close. | 141 | * close. |
142 | * Linus Torvalds : Fin/Shutdown & copied_seq changes. | 142 | * Linus Torvalds : Fin/Shutdown & copied_seq changes. |
143 | * Linus Torvalds : Fixed BSD port reuse to work first syn | 143 | * Linus Torvalds : Fixed BSD port reuse to work first syn |
144 | * Alan Cox : Reimplemented timers as per the RFC | 144 | * Alan Cox : Reimplemented timers as per the RFC |
145 | * and using multiple timers for sanity. | 145 | * and using multiple timers for sanity. |
146 | * Alan Cox : Small bug fixes, and a lot of new | 146 | * Alan Cox : Small bug fixes, and a lot of new |
147 | * comments. | 147 | * comments. |
148 | * Alan Cox : Fixed dual reader crash by locking | 148 | * Alan Cox : Fixed dual reader crash by locking |
149 | * the buffers (much like datagram.c) | 149 | * the buffers (much like datagram.c) |
150 | * Alan Cox : Fixed stuck sockets in probe. A probe | 150 | * Alan Cox : Fixed stuck sockets in probe. A probe |
151 | * now gets fed up of retrying without | 151 | * now gets fed up of retrying without |
152 | * (even a no space) answer. | 152 | * (even a no space) answer. |
153 | * Alan Cox : Extracted closing code better | 153 | * Alan Cox : Extracted closing code better |
154 | * Alan Cox : Fixed the closing state machine to | 154 | * Alan Cox : Fixed the closing state machine to |
155 | * resemble the RFC. | 155 | * resemble the RFC. |
156 | * Alan Cox : More 'per spec' fixes. | 156 | * Alan Cox : More 'per spec' fixes. |
157 | * Jorge Cwik : Even faster checksumming. | 157 | * Jorge Cwik : Even faster checksumming. |
158 | * Alan Cox : tcp_data() doesn't ack illegal PSH | 158 | * Alan Cox : tcp_data() doesn't ack illegal PSH |
159 | * only frames. At least one pc tcp stack | 159 | * only frames. At least one pc tcp stack |
160 | * generates them. | 160 | * generates them. |
161 | * Alan Cox : Cache last socket. | 161 | * Alan Cox : Cache last socket. |
162 | * Alan Cox : Per route irtt. | 162 | * Alan Cox : Per route irtt. |
163 | * Matt Day : poll()->select() match BSD precisely on error | 163 | * Matt Day : poll()->select() match BSD precisely on error |
164 | * Alan Cox : New buffers | 164 | * Alan Cox : New buffers |
165 | * Marc Tamsky : Various sk->prot->retransmits and | 165 | * Marc Tamsky : Various sk->prot->retransmits and |
166 | * sk->retransmits misupdating fixed. | 166 | * sk->retransmits misupdating fixed. |
167 | * Fixed tcp_write_timeout: stuck close, | 167 | * Fixed tcp_write_timeout: stuck close, |
168 | * and TCP syn retries gets used now. | 168 | * and TCP syn retries gets used now. |
169 | * Mark Yarvis : In tcp_read_wakeup(), don't send an | 169 | * Mark Yarvis : In tcp_read_wakeup(), don't send an |
170 | * ack if state is TCP_CLOSED. | 170 | * ack if state is TCP_CLOSED. |
171 | * Alan Cox : Look up device on a retransmit - routes may | 171 | * Alan Cox : Look up device on a retransmit - routes may |
172 | * change. Doesn't yet cope with MSS shrink right | 172 | * change. Doesn't yet cope with MSS shrink right |
173 | * but it's a start! | 173 | * but it's a start! |
174 | * Marc Tamsky : Closing in closing fixes. | 174 | * Marc Tamsky : Closing in closing fixes. |
175 | * Mike Shaver : RFC1122 verifications. | 175 | * Mike Shaver : RFC1122 verifications. |
176 | * Alan Cox : rcv_saddr errors. | 176 | * Alan Cox : rcv_saddr errors. |
177 | * Alan Cox : Block double connect(). | 177 | * Alan Cox : Block double connect(). |
178 | * Alan Cox : Small hooks for enSKIP. | 178 | * Alan Cox : Small hooks for enSKIP. |
179 | * Alexey Kuznetsov: Path MTU discovery. | 179 | * Alexey Kuznetsov: Path MTU discovery. |
180 | * Alan Cox : Support soft errors. | 180 | * Alan Cox : Support soft errors. |
181 | * Alan Cox : Fix MTU discovery pathological case | 181 | * Alan Cox : Fix MTU discovery pathological case |
182 | * when the remote claims no mtu! | 182 | * when the remote claims no mtu! |
183 | * Marc Tamsky : TCP_CLOSE fix. | 183 | * Marc Tamsky : TCP_CLOSE fix. |
184 | * Colin (G3TNE) : Send a reset on syn ack replies in | 184 | * Colin (G3TNE) : Send a reset on syn ack replies in |
185 | * window but wrong (fixes NT lpd problems) | 185 | * window but wrong (fixes NT lpd problems) |
186 | * Pedro Roque : Better TCP window handling, delayed ack. | 186 | * Pedro Roque : Better TCP window handling, delayed ack. |
187 | * Joerg Reuter : No modification of locked buffers in | 187 | * Joerg Reuter : No modification of locked buffers in |
188 | * tcp_do_retransmit() | 188 | * tcp_do_retransmit() |
189 | * Eric Schenk : Changed receiver side silly window | 189 | * Eric Schenk : Changed receiver side silly window |
190 | * avoidance algorithm to BSD style | 190 | * avoidance algorithm to BSD style |
191 | * algorithm. This doubles throughput | 191 | * algorithm. This doubles throughput |
192 | * against machines running Solaris, | 192 | * against machines running Solaris, |
193 | * and seems to result in general | 193 | * and seems to result in general |
194 | * improvement. | 194 | * improvement. |
195 | * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD | 195 | * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD |
196 | * Willy Konynenberg : Transparent proxying support. | 196 | * Willy Konynenberg : Transparent proxying support. |
197 | * Mike McLagan : Routing by source | 197 | * Mike McLagan : Routing by source |
198 | * Keith Owens : Do proper merging with partial SKB's in | 198 | * Keith Owens : Do proper merging with partial SKB's in |
199 | * tcp_do_sendmsg to avoid burstiness. | 199 | * tcp_do_sendmsg to avoid burstiness. |
200 | * Eric Schenk : Fix fast close down bug with | 200 | * Eric Schenk : Fix fast close down bug with |
201 | * shutdown() followed by close(). | 201 | * shutdown() followed by close(). |
202 | * Andi Kleen : Make poll agree with SIGIO | 202 | * Andi Kleen : Make poll agree with SIGIO |
203 | * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and | 203 | * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and |
204 | * lingertime == 0 (RFC 793 ABORT Call) | 204 | * lingertime == 0 (RFC 793 ABORT Call) |
205 | * Hirokazu Takahashi : Use copy_from_user() instead of | 205 | * Hirokazu Takahashi : Use copy_from_user() instead of |
206 | * csum_and_copy_from_user() if possible. | 206 | * csum_and_copy_from_user() if possible. |
207 | * | 207 | * |
208 | * This program is free software; you can redistribute it and/or | 208 | * This program is free software; you can redistribute it and/or |
209 | * modify it under the terms of the GNU General Public License | 209 | * modify it under the terms of the GNU General Public License |
210 | * as published by the Free Software Foundation; either version | 210 | * as published by the Free Software Foundation; either version |
211 | * 2 of the License, or (at your option) any later version. | 211 | * 2 of the License, or (at your option) any later version. |
212 | * | 212 | * |
213 | * Description of States: | 213 | * Description of States: |
214 | * | 214 | * |
215 | * TCP_SYN_SENT sent a connection request, waiting for ack | 215 | * TCP_SYN_SENT sent a connection request, waiting for ack |
216 | * | 216 | * |
217 | * TCP_SYN_RECV received a connection request, sent ack, | 217 | * TCP_SYN_RECV received a connection request, sent ack, |
218 | * waiting for final ack in three-way handshake. | 218 | * waiting for final ack in three-way handshake. |
219 | * | 219 | * |
220 | * TCP_ESTABLISHED connection established | 220 | * TCP_ESTABLISHED connection established |
221 | * | 221 | * |
222 | * TCP_FIN_WAIT1 our side has shutdown, waiting to complete | 222 | * TCP_FIN_WAIT1 our side has shutdown, waiting to complete |
223 | * transmission of remaining buffered data | 223 | * transmission of remaining buffered data |
224 | * | 224 | * |
225 | * TCP_FIN_WAIT2 all buffered data sent, waiting for remote | 225 | * TCP_FIN_WAIT2 all buffered data sent, waiting for remote |
226 | * to shutdown | 226 | * to shutdown |
227 | * | 227 | * |
228 | * TCP_CLOSING both sides have shutdown but we still have | 228 | * TCP_CLOSING both sides have shutdown but we still have |
229 | * data we have to finish sending | 229 | * data we have to finish sending |
230 | * | 230 | * |
231 | * TCP_TIME_WAIT timeout to catch resent junk before entering | 231 | * TCP_TIME_WAIT timeout to catch resent junk before entering |
232 | * closed, can only be entered from FIN_WAIT2 | 232 | * closed, can only be entered from FIN_WAIT2 |
233 | * or CLOSING. Required because the other end | 233 | * or CLOSING. Required because the other end |
234 | * may not have gotten our last ACK causing it | 234 | * may not have gotten our last ACK causing it |
235 | * to retransmit the data packet (which we ignore) | 235 | * to retransmit the data packet (which we ignore) |
236 | * | 236 | * |
237 | * TCP_CLOSE_WAIT remote side has shutdown and is waiting for | 237 | * TCP_CLOSE_WAIT remote side has shutdown and is waiting for |
238 | * us to finish writing our data and to shutdown | 238 | * us to finish writing our data and to shutdown |
239 | * (we have to close() to move on to LAST_ACK) | 239 | * (we have to close() to move on to LAST_ACK) |
240 | * | 240 | * |
241 | * TCP_LAST_ACK our side has shutdown after remote has | 241 | * TCP_LAST_ACK our side has shutdown after remote has |
242 | * shutdown. There may still be data in our | 242 | * shutdown. There may still be data in our |
243 | * buffer that we have to finish sending | 243 | * buffer that we have to finish sending |
244 | * | 244 | * |
245 | * TCP_CLOSE socket is finished | 245 | * TCP_CLOSE socket is finished |
246 | */ | 246 | */ |
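The state list in the comment above matches the TCP_* constants that userspace also sees: glibc's <netinet/tcp.h> defines the same enumeration (TCP_ESTABLISHED = 1 through TCP_CLOSING = 11), which is the numbering /proc/net/tcp prints in hex. A small hedged sketch of mapping a numeric state to its name, assuming that conventional numbering:

#include <stdio.h>
#include <netinet/tcp.h>	/* TCP_ESTABLISHED .. TCP_CLOSING */

static const char *tcp_state_name(int st)
{
	switch (st) {
	case TCP_ESTABLISHED:	return "ESTABLISHED";
	case TCP_SYN_SENT:	return "SYN_SENT";
	case TCP_SYN_RECV:	return "SYN_RECV";
	case TCP_FIN_WAIT1:	return "FIN_WAIT1";
	case TCP_FIN_WAIT2:	return "FIN_WAIT2";
	case TCP_TIME_WAIT:	return "TIME_WAIT";
	case TCP_CLOSE:		return "CLOSE";
	case TCP_CLOSE_WAIT:	return "CLOSE_WAIT";
	case TCP_LAST_ACK:	return "LAST_ACK";
	case TCP_LISTEN:	return "LISTEN";
	case TCP_CLOSING:	return "CLOSING";
	default:		return "UNKNOWN";
	}
}

int main(void)
{
	printf("state 0x01 = %s\n", tcp_state_name(1));	/* ESTABLISHED */
	return 0;
}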
247 | 247 | ||
248 | #include <linux/kernel.h> | 248 | #include <linux/kernel.h> |
249 | #include <linux/module.h> | 249 | #include <linux/module.h> |
250 | #include <linux/types.h> | 250 | #include <linux/types.h> |
251 | #include <linux/fcntl.h> | 251 | #include <linux/fcntl.h> |
252 | #include <linux/poll.h> | 252 | #include <linux/poll.h> |
253 | #include <linux/init.h> | 253 | #include <linux/init.h> |
254 | #include <linux/fs.h> | 254 | #include <linux/fs.h> |
255 | #include <linux/skbuff.h> | 255 | #include <linux/skbuff.h> |
256 | #include <linux/scatterlist.h> | 256 | #include <linux/scatterlist.h> |
257 | #include <linux/splice.h> | 257 | #include <linux/splice.h> |
258 | #include <linux/net.h> | 258 | #include <linux/net.h> |
259 | #include <linux/socket.h> | 259 | #include <linux/socket.h> |
260 | #include <linux/random.h> | 260 | #include <linux/random.h> |
261 | #include <linux/bootmem.h> | 261 | #include <linux/bootmem.h> |
262 | #include <linux/highmem.h> | 262 | #include <linux/highmem.h> |
263 | #include <linux/swap.h> | 263 | #include <linux/swap.h> |
264 | #include <linux/cache.h> | 264 | #include <linux/cache.h> |
265 | #include <linux/err.h> | 265 | #include <linux/err.h> |
266 | #include <linux/crypto.h> | 266 | #include <linux/crypto.h> |
267 | 267 | ||
268 | #include <net/icmp.h> | 268 | #include <net/icmp.h> |
269 | #include <net/tcp.h> | 269 | #include <net/tcp.h> |
270 | #include <net/xfrm.h> | 270 | #include <net/xfrm.h> |
271 | #include <net/ip.h> | 271 | #include <net/ip.h> |
272 | #include <net/netdma.h> | 272 | #include <net/netdma.h> |
273 | #include <net/sock.h> | 273 | #include <net/sock.h> |
274 | 274 | ||
275 | #include <asm/uaccess.h> | 275 | #include <asm/uaccess.h> |
276 | #include <asm/ioctls.h> | 276 | #include <asm/ioctls.h> |
277 | 277 | ||
278 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | 278 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; |
279 | 279 | ||
280 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); | 280 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); |
281 | 281 | ||
282 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 282 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
283 | 283 | ||
284 | int sysctl_tcp_mem[3] __read_mostly; | 284 | int sysctl_tcp_mem[3] __read_mostly; |
285 | int sysctl_tcp_wmem[3] __read_mostly; | 285 | int sysctl_tcp_wmem[3] __read_mostly; |
286 | int sysctl_tcp_rmem[3] __read_mostly; | 286 | int sysctl_tcp_rmem[3] __read_mostly; |
287 | 287 | ||
288 | EXPORT_SYMBOL(sysctl_tcp_mem); | 288 | EXPORT_SYMBOL(sysctl_tcp_mem); |
289 | EXPORT_SYMBOL(sysctl_tcp_rmem); | 289 | EXPORT_SYMBOL(sysctl_tcp_rmem); |
290 | EXPORT_SYMBOL(sysctl_tcp_wmem); | 290 | EXPORT_SYMBOL(sysctl_tcp_wmem); |
291 | 291 | ||
292 | atomic_t tcp_memory_allocated; /* Current allocated memory. */ | 292 | atomic_t tcp_memory_allocated; /* Current allocated memory. */ |
293 | atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ | 293 | atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ |
294 | 294 | ||
295 | EXPORT_SYMBOL(tcp_memory_allocated); | 295 | EXPORT_SYMBOL(tcp_memory_allocated); |
296 | EXPORT_SYMBOL(tcp_sockets_allocated); | 296 | EXPORT_SYMBOL(tcp_sockets_allocated); |
297 | 297 | ||
298 | /* | 298 | /* |
299 | * TCP splice context | 299 | * TCP splice context |
300 | */ | 300 | */ |
301 | struct tcp_splice_state { | 301 | struct tcp_splice_state { |
302 | struct pipe_inode_info *pipe; | 302 | struct pipe_inode_info *pipe; |
303 | size_t len; | 303 | size_t len; |
304 | unsigned int flags; | 304 | unsigned int flags; |
305 | }; | 305 | }; |
306 | 306 | ||
307 | /* | 307 | /* |
308 | * Pressure flag: try to collapse. | 308 | * Pressure flag: try to collapse. |
309 | * Technical note: it is used by multiple contexts non atomically. | 309 | * Technical note: it is used by multiple contexts non atomically. |
310 | * All the __sk_mem_schedule() is of this nature: accounting | 310 | * All the __sk_mem_schedule() is of this nature: accounting |
311 | * is strict, actions are advisory and have some latency. | 311 | * is strict, actions are advisory and have some latency. |
312 | */ | 312 | */ |
313 | int tcp_memory_pressure __read_mostly; | 313 | int tcp_memory_pressure __read_mostly; |
314 | 314 | ||
315 | EXPORT_SYMBOL(tcp_memory_pressure); | 315 | EXPORT_SYMBOL(tcp_memory_pressure); |
316 | 316 | ||
317 | void tcp_enter_memory_pressure(struct sock *sk) | 317 | void tcp_enter_memory_pressure(struct sock *sk) |
318 | { | 318 | { |
319 | if (!tcp_memory_pressure) { | 319 | if (!tcp_memory_pressure) { |
320 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); | 320 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); |
321 | tcp_memory_pressure = 1; | 321 | tcp_memory_pressure = 1; |
322 | } | 322 | } |
323 | } | 323 | } |
324 | 324 | ||
325 | EXPORT_SYMBOL(tcp_enter_memory_pressure); | 325 | EXPORT_SYMBOL(tcp_enter_memory_pressure); |
326 | 326 | ||
327 | /* | 327 | /* |
328 | * Wait for a TCP event. | 328 | * Wait for a TCP event. |
329 | * | 329 | * |
330 | * Note that we don't need to lock the socket, as the upper poll layers | 330 | * Note that we don't need to lock the socket, as the upper poll layers |
331 | * take care of normal races (between the test and the event) and we don't | 331 | * take care of normal races (between the test and the event) and we don't |
332 | * go look at any of the socket buffers directly. | 332 | * go look at any of the socket buffers directly. |
333 | */ | 333 | */ |
334 | unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | 334 | unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) |
335 | { | 335 | { |
336 | unsigned int mask; | 336 | unsigned int mask; |
337 | struct sock *sk = sock->sk; | 337 | struct sock *sk = sock->sk; |
338 | struct tcp_sock *tp = tcp_sk(sk); | 338 | struct tcp_sock *tp = tcp_sk(sk); |
339 | 339 | ||
340 | poll_wait(file, sk->sk_sleep, wait); | 340 | poll_wait(file, sk->sk_sleep, wait); |
341 | if (sk->sk_state == TCP_LISTEN) | 341 | if (sk->sk_state == TCP_LISTEN) |
342 | return inet_csk_listen_poll(sk); | 342 | return inet_csk_listen_poll(sk); |
343 | 343 | ||
344 | /* Socket is not locked. We are protected from async events | 344 | /* Socket is not locked. We are protected from async events |
345 | * by poll logic and correct handling of state changes | 345 | * by poll logic and correct handling of state changes |
346 | * made by other threads is impossible in any case. | 346 | * made by other threads is impossible in any case. |
347 | */ | 347 | */ |
348 | 348 | ||
349 | mask = 0; | 349 | mask = 0; |
350 | if (sk->sk_err) | 350 | if (sk->sk_err) |
351 | mask = POLLERR; | 351 | mask = POLLERR; |
352 | 352 | ||
353 | /* | 353 | /* |
354 | * POLLHUP is certainly not done right. But poll() doesn't | 354 | * POLLHUP is certainly not done right. But poll() doesn't |
355 | * have a notion of HUP in just one direction, and for a | 355 | * have a notion of HUP in just one direction, and for a |
356 | * socket the read side is more interesting. | 356 | * socket the read side is more interesting. |
357 | * | 357 | * |
358 | * Some poll() documentation says that POLLHUP is incompatible | 358 | * Some poll() documentation says that POLLHUP is incompatible |
359 | * with the POLLOUT/POLLWR flags, so somebody should check this | 359 | * with the POLLOUT/POLLWR flags, so somebody should check this |
360 | * all. But careful, it tends to be safer to return too many | 360 | * all. But careful, it tends to be safer to return too many |
361 | * bits than too few, and you can easily break real applications | 361 | * bits than too few, and you can easily break real applications |
362 | * if you don't tell them that something has hung up! | 362 | * if you don't tell them that something has hung up! |
363 | * | 363 | * |
364 | * Check-me. | 364 | * Check-me. |
365 | * | 365 | * |
366 | * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and | 366 | * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and |
367 | * our fs/select.c). It means that after we received EOF, | 367 | * our fs/select.c). It means that after we received EOF, |
368 | * poll always returns immediately, making poll() for write() impossible | 368 | * poll always returns immediately, making poll() for write() impossible |
369 | * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP | 369 | * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP |
370 | * if and only if shutdown has been made in both directions. | 370 | * if and only if shutdown has been made in both directions. |
371 | * Actually, it is interesting to look how Solaris and DUX | 371 | * Actually, it is interesting to look how Solaris and DUX |
372 | * solve this dilemma. I would prefer, if POLLHUP were maskable, | 372 | * solve this dilemma. I would prefer, if POLLHUP were maskable, |
373 | * then we could set it on SND_SHUTDOWN. BTW examples given | 373 | * then we could set it on SND_SHUTDOWN. BTW examples given |
374 | * in Stevens' books assume exactly this behaviour, it explains | 374 | * in Stevens' books assume exactly this behaviour, it explains |
375 | * why POLLHUP is incompatible with POLLOUT. --ANK | 375 | * why POLLHUP is incompatible with POLLOUT. --ANK |
376 | * | 376 | * |
377 | * NOTE. Check for TCP_CLOSE is added. The goal is to prevent | 377 | * NOTE. Check for TCP_CLOSE is added. The goal is to prevent |
378 | * blocking on fresh not-connected or disconnected socket. --ANK | 378 | * blocking on fresh not-connected or disconnected socket. --ANK |
379 | */ | 379 | */ |
380 | if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) | 380 | if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) |
381 | mask |= POLLHUP; | 381 | mask |= POLLHUP; |
382 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 382 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
383 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; | 383 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; |
384 | 384 | ||
385 | /* Connected? */ | 385 | /* Connected? */ |
386 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 386 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
387 | int target = sock_rcvlowat(sk, 0, INT_MAX); | 387 | int target = sock_rcvlowat(sk, 0, INT_MAX); |
388 | 388 | ||
389 | if (tp->urg_seq == tp->copied_seq && | 389 | if (tp->urg_seq == tp->copied_seq && |
390 | !sock_flag(sk, SOCK_URGINLINE) && | 390 | !sock_flag(sk, SOCK_URGINLINE) && |
391 | tp->urg_data) | 391 | tp->urg_data) |
392 | target--; | 392 | target--; |
393 | 393 | ||
394 | /* Potential race condition. If read of tp below will | 394 | /* Potential race condition. If read of tp below will |
395 | * escape above sk->sk_state, we can be illegally awakened | 395 | * escape above sk->sk_state, we can be illegally awakened |
396 | * in SYN_* states. */ | 396 | * in SYN_* states. */ |
397 | if (tp->rcv_nxt - tp->copied_seq >= target) | 397 | if (tp->rcv_nxt - tp->copied_seq >= target) |
398 | mask |= POLLIN | POLLRDNORM; | 398 | mask |= POLLIN | POLLRDNORM; |
399 | 399 | ||
400 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { | 400 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { |
401 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { | 401 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { |
402 | mask |= POLLOUT | POLLWRNORM; | 402 | mask |= POLLOUT | POLLWRNORM; |
403 | } else { /* send SIGIO later */ | 403 | } else { /* send SIGIO later */ |
404 | set_bit(SOCK_ASYNC_NOSPACE, | 404 | set_bit(SOCK_ASYNC_NOSPACE, |
405 | &sk->sk_socket->flags); | 405 | &sk->sk_socket->flags); |
406 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 406 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
407 | 407 | ||
408 | /* Race breaker. If space is freed after | 408 | /* Race breaker. If space is freed after |
409 | * wspace test but before the flags are set, | 409 | * wspace test but before the flags are set, |
410 | * IO signal will be lost. | 410 | * IO signal will be lost. |
411 | */ | 411 | */ |
412 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) | 412 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) |
413 | mask |= POLLOUT | POLLWRNORM; | 413 | mask |= POLLOUT | POLLWRNORM; |
414 | } | 414 | } |
415 | } | 415 | } |
416 | 416 | ||
417 | if (tp->urg_data & TCP_URG_VALID) | 417 | if (tp->urg_data & TCP_URG_VALID) |
418 | mask |= POLLPRI; | 418 | mask |= POLLPRI; |
419 | } | 419 | } |
420 | return mask; | 420 | return mask; |
421 | } | 421 | } |
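tcp_poll() above is what ultimately produces the revents bits a caller of poll(2) sees on a TCP socket. A hedged userspace sketch of how those bits are typically consumed (assumes an already-connected fd; POLLRDHUP needs _GNU_SOURCE), not part of this commit:

#define _GNU_SOURCE		/* for POLLRDHUP */
#include <poll.h>
#include <stdio.h>

/* Wait up to 5s for readability/writability on a connected TCP socket. */
int wait_on_socket(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLRDHUP };
	int n = poll(&pfd, 1, 5000);

	if (n <= 0)
		return n;				/* 0 = timeout, -1 = error */
	if (pfd.revents & POLLERR)
		fprintf(stderr, "socket error pending\n");
	if (pfd.revents & (POLLHUP | POLLRDHUP))
		fprintf(stderr, "peer shut down or connection closed\n");
	if (pfd.revents & POLLIN)
		fprintf(stderr, "data (>= SO_RCVLOWAT) available\n");
	if (pfd.revents & POLLOUT)
		fprintf(stderr, "send buffer has room\n");
	return n;
}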
422 | 422 | ||
423 | int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) | 423 | int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) |
424 | { | 424 | { |
425 | struct tcp_sock *tp = tcp_sk(sk); | 425 | struct tcp_sock *tp = tcp_sk(sk); |
426 | int answ; | 426 | int answ; |
427 | 427 | ||
428 | switch (cmd) { | 428 | switch (cmd) { |
429 | case SIOCINQ: | 429 | case SIOCINQ: |
430 | if (sk->sk_state == TCP_LISTEN) | 430 | if (sk->sk_state == TCP_LISTEN) |
431 | return -EINVAL; | 431 | return -EINVAL; |
432 | 432 | ||
433 | lock_sock(sk); | 433 | lock_sock(sk); |
434 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) | 434 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) |
435 | answ = 0; | 435 | answ = 0; |
436 | else if (sock_flag(sk, SOCK_URGINLINE) || | 436 | else if (sock_flag(sk, SOCK_URGINLINE) || |
437 | !tp->urg_data || | 437 | !tp->urg_data || |
438 | before(tp->urg_seq, tp->copied_seq) || | 438 | before(tp->urg_seq, tp->copied_seq) || |
439 | !before(tp->urg_seq, tp->rcv_nxt)) { | 439 | !before(tp->urg_seq, tp->rcv_nxt)) { |
440 | answ = tp->rcv_nxt - tp->copied_seq; | 440 | answ = tp->rcv_nxt - tp->copied_seq; |
441 | 441 | ||
442 | /* Subtract 1, if FIN is in queue. */ | 442 | /* Subtract 1, if FIN is in queue. */ |
443 | if (answ && !skb_queue_empty(&sk->sk_receive_queue)) | 443 | if (answ && !skb_queue_empty(&sk->sk_receive_queue)) |
444 | answ -= | 444 | answ -= |
445 | tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; | 445 | tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; |
446 | } else | 446 | } else |
447 | answ = tp->urg_seq - tp->copied_seq; | 447 | answ = tp->urg_seq - tp->copied_seq; |
448 | release_sock(sk); | 448 | release_sock(sk); |
449 | break; | 449 | break; |
450 | case SIOCATMARK: | 450 | case SIOCATMARK: |
451 | answ = tp->urg_data && tp->urg_seq == tp->copied_seq; | 451 | answ = tp->urg_data && tp->urg_seq == tp->copied_seq; |
452 | break; | 452 | break; |
453 | case SIOCOUTQ: | 453 | case SIOCOUTQ: |
454 | if (sk->sk_state == TCP_LISTEN) | 454 | if (sk->sk_state == TCP_LISTEN) |
455 | return -EINVAL; | 455 | return -EINVAL; |
456 | 456 | ||
457 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) | 457 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) |
458 | answ = 0; | 458 | answ = 0; |
459 | else | 459 | else |
460 | answ = tp->write_seq - tp->snd_una; | 460 | answ = tp->write_seq - tp->snd_una; |
461 | break; | 461 | break; |
462 | default: | 462 | default: |
463 | return -ENOIOCTLCMD; | 463 | return -ENOIOCTLCMD; |
464 | } | 464 | } |
465 | 465 | ||
466 | return put_user(answ, (int __user *)arg); | 466 | return put_user(answ, (int __user *)arg); |
467 | } | 467 | } |
468 | 468 | ||
469 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) | 469 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) |
470 | { | 470 | { |
471 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; | 471 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; |
472 | tp->pushed_seq = tp->write_seq; | 472 | tp->pushed_seq = tp->write_seq; |
473 | } | 473 | } |
474 | 474 | ||
475 | static inline int forced_push(struct tcp_sock *tp) | 475 | static inline int forced_push(struct tcp_sock *tp) |
476 | { | 476 | { |
477 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); | 477 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); |
478 | } | 478 | } |
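forced_push() relies on the kernel's wrap-safe sequence comparisons: before(a, b) is (s32)(a - b) < 0 and after(a, b) is before(b, a), so the push fires once write_seq is more than half of max_window ahead of pushed_seq even across 32-bit wraparound. A minimal sketch of the same arithmetic (illustrative, using that standard definition):

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe TCP sequence comparison, same idea as before()/after(). */
static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

int main(void)
{
	uint32_t pushed_seq = 0xffffff00u, max_window = 65535;
	uint32_t write_seq  = pushed_seq + (max_window >> 1) + 1;	/* wraps past 0 */

	/* Mirrors forced_push(): push once more than half a window is queued. */
	printf("forced push? %d\n",
	       seq_after(write_seq, pushed_seq + (max_window >> 1)));	/* prints 1 */
	return 0;
}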
479 | 479 | ||
480 | static inline void skb_entail(struct sock *sk, struct sk_buff *skb) | 480 | static inline void skb_entail(struct sock *sk, struct sk_buff *skb) |
481 | { | 481 | { |
482 | struct tcp_sock *tp = tcp_sk(sk); | 482 | struct tcp_sock *tp = tcp_sk(sk); |
483 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | 483 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
484 | 484 | ||
485 | skb->csum = 0; | 485 | skb->csum = 0; |
486 | tcb->seq = tcb->end_seq = tp->write_seq; | 486 | tcb->seq = tcb->end_seq = tp->write_seq; |
487 | tcb->flags = TCPCB_FLAG_ACK; | 487 | tcb->flags = TCPCB_FLAG_ACK; |
488 | tcb->sacked = 0; | 488 | tcb->sacked = 0; |
489 | skb_header_release(skb); | 489 | skb_header_release(skb); |
490 | tcp_add_write_queue_tail(sk, skb); | 490 | tcp_add_write_queue_tail(sk, skb); |
491 | sk->sk_wmem_queued += skb->truesize; | 491 | sk->sk_wmem_queued += skb->truesize; |
492 | sk_mem_charge(sk, skb->truesize); | 492 | sk_mem_charge(sk, skb->truesize); |
493 | if (tp->nonagle & TCP_NAGLE_PUSH) | 493 | if (tp->nonagle & TCP_NAGLE_PUSH) |
494 | tp->nonagle &= ~TCP_NAGLE_PUSH; | 494 | tp->nonagle &= ~TCP_NAGLE_PUSH; |
495 | } | 495 | } |
496 | 496 | ||
497 | static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, | 497 | static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, |
498 | struct sk_buff *skb) | 498 | struct sk_buff *skb) |
499 | { | 499 | { |
500 | if (flags & MSG_OOB) | 500 | if (flags & MSG_OOB) |
501 | tp->snd_up = tp->write_seq; | 501 | tp->snd_up = tp->write_seq; |
502 | } | 502 | } |
503 | 503 | ||
504 | static inline void tcp_push(struct sock *sk, int flags, int mss_now, | 504 | static inline void tcp_push(struct sock *sk, int flags, int mss_now, |
505 | int nonagle) | 505 | int nonagle) |
506 | { | 506 | { |
507 | struct tcp_sock *tp = tcp_sk(sk); | 507 | struct tcp_sock *tp = tcp_sk(sk); |
508 | 508 | ||
509 | if (tcp_send_head(sk)) { | 509 | if (tcp_send_head(sk)) { |
510 | struct sk_buff *skb = tcp_write_queue_tail(sk); | 510 | struct sk_buff *skb = tcp_write_queue_tail(sk); |
511 | if (!(flags & MSG_MORE) || forced_push(tp)) | 511 | if (!(flags & MSG_MORE) || forced_push(tp)) |
512 | tcp_mark_push(tp, skb); | 512 | tcp_mark_push(tp, skb); |
513 | tcp_mark_urg(tp, flags, skb); | 513 | tcp_mark_urg(tp, flags, skb); |
514 | __tcp_push_pending_frames(sk, mss_now, | 514 | __tcp_push_pending_frames(sk, mss_now, |
515 | (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); | 515 | (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); |
516 | } | 516 | } |
517 | } | 517 | } |
518 | 518 | ||
519 | static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, | 519 | static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, |
520 | unsigned int offset, size_t len) | 520 | unsigned int offset, size_t len) |
521 | { | 521 | { |
522 | struct tcp_splice_state *tss = rd_desc->arg.data; | 522 | struct tcp_splice_state *tss = rd_desc->arg.data; |
523 | 523 | ||
524 | return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); | 524 | return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); |
525 | } | 525 | } |
526 | 526 | ||
527 | static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) | 527 | static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) |
528 | { | 528 | { |
529 | /* Store TCP splice context information in read_descriptor_t. */ | 529 | /* Store TCP splice context information in read_descriptor_t. */ |
530 | read_descriptor_t rd_desc = { | 530 | read_descriptor_t rd_desc = { |
531 | .arg.data = tss, | 531 | .arg.data = tss, |
532 | }; | 532 | }; |
533 | 533 | ||
534 | return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); | 534 | return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); |
535 | } | 535 | } |
536 | 536 | ||
537 | /** | 537 | /** |
538 | * tcp_splice_read - splice data from TCP socket to a pipe | 538 | * tcp_splice_read - splice data from TCP socket to a pipe |
539 | * @sock: socket to splice from | 539 | * @sock: socket to splice from |
540 | * @ppos: position (not valid) | 540 | * @ppos: position (not valid) |
541 | * @pipe: pipe to splice to | 541 | * @pipe: pipe to splice to |
542 | * @len: number of bytes to splice | 542 | * @len: number of bytes to splice |
543 | * @flags: splice modifier flags | 543 | * @flags: splice modifier flags |
544 | * | 544 | * |
545 | * Description: | 545 | * Description: |
546 | * Will read pages from given socket and fill them into a pipe. | 546 | * Will read pages from given socket and fill them into a pipe. |
547 | * | 547 | * |
548 | **/ | 548 | **/ |
549 | ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, | 549 | ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, |
550 | struct pipe_inode_info *pipe, size_t len, | 550 | struct pipe_inode_info *pipe, size_t len, |
551 | unsigned int flags) | 551 | unsigned int flags) |
552 | { | 552 | { |
553 | struct sock *sk = sock->sk; | 553 | struct sock *sk = sock->sk; |
554 | struct tcp_splice_state tss = { | 554 | struct tcp_splice_state tss = { |
555 | .pipe = pipe, | 555 | .pipe = pipe, |
556 | .len = len, | 556 | .len = len, |
557 | .flags = flags, | 557 | .flags = flags, |
558 | }; | 558 | }; |
559 | long timeo; | 559 | long timeo; |
560 | ssize_t spliced; | 560 | ssize_t spliced; |
561 | int ret; | 561 | int ret; |
562 | 562 | ||
563 | /* | 563 | /* |
564 | * We can't seek on a socket input | 564 | * We can't seek on a socket input |
565 | */ | 565 | */ |
566 | if (unlikely(*ppos)) | 566 | if (unlikely(*ppos)) |
567 | return -ESPIPE; | 567 | return -ESPIPE; |
568 | 568 | ||
569 | ret = spliced = 0; | 569 | ret = spliced = 0; |
570 | 570 | ||
571 | lock_sock(sk); | 571 | lock_sock(sk); |
572 | 572 | ||
573 | timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); | 573 | timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); |
574 | while (tss.len) { | 574 | while (tss.len) { |
575 | ret = __tcp_splice_read(sk, &tss); | 575 | ret = __tcp_splice_read(sk, &tss); |
576 | if (ret < 0) | 576 | if (ret < 0) |
577 | break; | 577 | break; |
578 | else if (!ret) { | 578 | else if (!ret) { |
579 | if (spliced) | 579 | if (spliced) |
580 | break; | 580 | break; |
581 | if (flags & SPLICE_F_NONBLOCK) { | 581 | if (flags & SPLICE_F_NONBLOCK) { |
582 | ret = -EAGAIN; | 582 | ret = -EAGAIN; |
583 | break; | 583 | break; |
584 | } | 584 | } |
585 | if (sock_flag(sk, SOCK_DONE)) | 585 | if (sock_flag(sk, SOCK_DONE)) |
586 | break; | 586 | break; |
587 | if (sk->sk_err) { | 587 | if (sk->sk_err) { |
588 | ret = sock_error(sk); | 588 | ret = sock_error(sk); |
589 | break; | 589 | break; |
590 | } | 590 | } |
591 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 591 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
592 | break; | 592 | break; |
593 | if (sk->sk_state == TCP_CLOSE) { | 593 | if (sk->sk_state == TCP_CLOSE) { |
594 | /* | 594 | /* |
595 | * This occurs when user tries to read | 595 | * This occurs when user tries to read |
596 | * from a never-connected socket. | 596 | * from a never-connected socket. |
597 | */ | 597 | */ |
598 | if (!sock_flag(sk, SOCK_DONE)) | 598 | if (!sock_flag(sk, SOCK_DONE)) |
599 | ret = -ENOTCONN; | 599 | ret = -ENOTCONN; |
600 | break; | 600 | break; |
601 | } | 601 | } |
602 | if (!timeo) { | 602 | if (!timeo) { |
603 | ret = -EAGAIN; | 603 | ret = -EAGAIN; |
604 | break; | 604 | break; |
605 | } | 605 | } |
606 | sk_wait_data(sk, &timeo); | 606 | sk_wait_data(sk, &timeo); |
607 | if (signal_pending(current)) { | 607 | if (signal_pending(current)) { |
608 | ret = sock_intr_errno(timeo); | 608 | ret = sock_intr_errno(timeo); |
609 | break; | 609 | break; |
610 | } | 610 | } |
611 | continue; | 611 | continue; |
612 | } | 612 | } |
613 | tss.len -= ret; | 613 | tss.len -= ret; |
614 | spliced += ret; | 614 | spliced += ret; |
615 | 615 | ||
616 | release_sock(sk); | 616 | release_sock(sk); |
617 | lock_sock(sk); | 617 | lock_sock(sk); |
618 | 618 | ||
619 | if (sk->sk_err || sk->sk_state == TCP_CLOSE || | 619 | if (sk->sk_err || sk->sk_state == TCP_CLOSE || |
620 | (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || | 620 | (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || |
621 | signal_pending(current)) | 621 | signal_pending(current)) |
622 | break; | 622 | break; |
623 | } | 623 | } |
624 | 624 | ||
625 | release_sock(sk); | 625 | release_sock(sk); |
626 | 626 | ||
627 | if (spliced) | 627 | if (spliced) |
628 | return spliced; | 628 | return spliced; |
629 | 629 | ||
630 | return ret; | 630 | return ret; |
631 | } | 631 | } |
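tcp_splice_read() above backs the socket side of splice(2). A hedged userspace sketch, not part of this commit, that moves data from a connected TCP socket into a pipe and on to stdout without copying through userspace (assumes sockfd is connected and stdout is a file or pipe):

#define _GNU_SOURCE
#include <fcntl.h>		/* splice, SPLICE_F_MOVE */
#include <unistd.h>

/* Relay up to 64 KiB from a TCP socket to stdout via an intermediate pipe. */
int relay_from_socket(int sockfd)
{
	int p[2];
	ssize_t n;

	if (pipe(p) < 0)
		return -1;

	/* Socket -> pipe: this is the path that ends up in tcp_splice_read(). */
	n = splice(sockfd, NULL, p[1], NULL, 65536, SPLICE_F_MOVE);
	if (n > 0)
		/* Pipe -> stdout. */
		n = splice(p[0], NULL, STDOUT_FILENO, NULL, n, SPLICE_F_MOVE);

	close(p[0]);
	close(p[1]);
	return n < 0 ? -1 : 0;
}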
632 | 632 | ||
633 | struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) | 633 | struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) |
634 | { | 634 | { |
635 | struct sk_buff *skb; | 635 | struct sk_buff *skb; |
636 | 636 | ||
637 | /* The TCP header must be at least 32-bit aligned. */ | 637 | /* The TCP header must be at least 32-bit aligned. */ |
638 | size = ALIGN(size, 4); | 638 | size = ALIGN(size, 4); |
639 | 639 | ||
640 | skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); | 640 | skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); |
641 | if (skb) { | 641 | if (skb) { |
642 | if (sk_wmem_schedule(sk, skb->truesize)) { | 642 | if (sk_wmem_schedule(sk, skb->truesize)) { |
643 | /* | 643 | /* |
644 | * Make sure that we have exactly size bytes | 644 | * Make sure that we have exactly size bytes |
645 | * available to the caller, no more, no less. | 645 | * available to the caller, no more, no less. |
646 | */ | 646 | */ |
647 | skb_reserve(skb, skb_tailroom(skb) - size); | 647 | skb_reserve(skb, skb_tailroom(skb) - size); |
648 | return skb; | 648 | return skb; |
649 | } | 649 | } |
650 | __kfree_skb(skb); | 650 | __kfree_skb(skb); |
651 | } else { | 651 | } else { |
652 | sk->sk_prot->enter_memory_pressure(sk); | 652 | sk->sk_prot->enter_memory_pressure(sk); |
653 | sk_stream_moderate_sndbuf(sk); | 653 | sk_stream_moderate_sndbuf(sk); |
654 | } | 654 | } |
655 | return NULL; | 655 | return NULL; |
656 | } | 656 | } |
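sk_stream_alloc_skb() above rounds the requested size up with ALIGN(size, 4) so the TCP header stays 32-bit aligned. The kernel macro is the usual power-of-two round-up; a tiny illustration of that arithmetic (illustrative helper name, not the kernel's):

#include <stdio.h>

/* Round x up to the next multiple of the power-of-two a. */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	printf("%d %d %d\n", ALIGN_UP(5, 4), ALIGN_UP(8, 4), ALIGN_UP(9, 4));
	/* prints: 8 8 12 */
	return 0;
}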
657 | 657 | ||
658 | static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, | 658 | static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, |
659 | size_t psize, int flags) | 659 | size_t psize, int flags) |
660 | { | 660 | { |
661 | struct tcp_sock *tp = tcp_sk(sk); | 661 | struct tcp_sock *tp = tcp_sk(sk); |
662 | int mss_now, size_goal; | 662 | int mss_now, size_goal; |
663 | int err; | 663 | int err; |
664 | ssize_t copied; | 664 | ssize_t copied; |
665 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 665 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
666 | 666 | ||
667 | /* Wait for a connection to finish. */ | 667 | /* Wait for a connection to finish. */ |
668 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 668 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
669 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 669 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
670 | goto out_err; | 670 | goto out_err; |
671 | 671 | ||
672 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); | 672 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); |
673 | 673 | ||
674 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 674 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
675 | size_goal = tp->xmit_size_goal; | 675 | size_goal = tp->xmit_size_goal; |
676 | copied = 0; | 676 | copied = 0; |
677 | 677 | ||
678 | err = -EPIPE; | 678 | err = -EPIPE; |
679 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) | 679 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) |
680 | goto do_error; | 680 | goto do_error; |
681 | 681 | ||
682 | while (psize > 0) { | 682 | while (psize > 0) { |
683 | struct sk_buff *skb = tcp_write_queue_tail(sk); | 683 | struct sk_buff *skb = tcp_write_queue_tail(sk); |
684 | struct page *page = pages[poffset / PAGE_SIZE]; | 684 | struct page *page = pages[poffset / PAGE_SIZE]; |
685 | int copy, i, can_coalesce; | 685 | int copy, i, can_coalesce; |
686 | int offset = poffset % PAGE_SIZE; | 686 | int offset = poffset % PAGE_SIZE; |
687 | int size = min_t(size_t, psize, PAGE_SIZE - offset); | 687 | int size = min_t(size_t, psize, PAGE_SIZE - offset); |
688 | 688 | ||
689 | if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { | 689 | if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { |
690 | new_segment: | 690 | new_segment: |
691 | if (!sk_stream_memory_free(sk)) | 691 | if (!sk_stream_memory_free(sk)) |
692 | goto wait_for_sndbuf; | 692 | goto wait_for_sndbuf; |
693 | 693 | ||
694 | skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); | 694 | skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); |
695 | if (!skb) | 695 | if (!skb) |
696 | goto wait_for_memory; | 696 | goto wait_for_memory; |
697 | 697 | ||
698 | skb_entail(sk, skb); | 698 | skb_entail(sk, skb); |
699 | copy = size_goal; | 699 | copy = size_goal; |
700 | } | 700 | } |
701 | 701 | ||
702 | if (copy > size) | 702 | if (copy > size) |
703 | copy = size; | 703 | copy = size; |
704 | 704 | ||
705 | i = skb_shinfo(skb)->nr_frags; | 705 | i = skb_shinfo(skb)->nr_frags; |
706 | can_coalesce = skb_can_coalesce(skb, i, page, offset); | 706 | can_coalesce = skb_can_coalesce(skb, i, page, offset); |
707 | if (!can_coalesce && i >= MAX_SKB_FRAGS) { | 707 | if (!can_coalesce && i >= MAX_SKB_FRAGS) { |
708 | tcp_mark_push(tp, skb); | 708 | tcp_mark_push(tp, skb); |
709 | goto new_segment; | 709 | goto new_segment; |
710 | } | 710 | } |
711 | if (!sk_wmem_schedule(sk, copy)) | 711 | if (!sk_wmem_schedule(sk, copy)) |
712 | goto wait_for_memory; | 712 | goto wait_for_memory; |
713 | 713 | ||
714 | if (can_coalesce) { | 714 | if (can_coalesce) { |
715 | skb_shinfo(skb)->frags[i - 1].size += copy; | 715 | skb_shinfo(skb)->frags[i - 1].size += copy; |
716 | } else { | 716 | } else { |
717 | get_page(page); | 717 | get_page(page); |
718 | skb_fill_page_desc(skb, i, page, offset, copy); | 718 | skb_fill_page_desc(skb, i, page, offset, copy); |
719 | } | 719 | } |
720 | 720 | ||
721 | skb->len += copy; | 721 | skb->len += copy; |
722 | skb->data_len += copy; | 722 | skb->data_len += copy; |
723 | skb->truesize += copy; | 723 | skb->truesize += copy; |
724 | sk->sk_wmem_queued += copy; | 724 | sk->sk_wmem_queued += copy; |
725 | sk_mem_charge(sk, copy); | 725 | sk_mem_charge(sk, copy); |
726 | skb->ip_summed = CHECKSUM_PARTIAL; | 726 | skb->ip_summed = CHECKSUM_PARTIAL; |
727 | tp->write_seq += copy; | 727 | tp->write_seq += copy; |
728 | TCP_SKB_CB(skb)->end_seq += copy; | 728 | TCP_SKB_CB(skb)->end_seq += copy; |
729 | skb_shinfo(skb)->gso_segs = 0; | 729 | skb_shinfo(skb)->gso_segs = 0; |
730 | 730 | ||
731 | if (!copied) | 731 | if (!copied) |
732 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; | 732 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; |
733 | 733 | ||
734 | copied += copy; | 734 | copied += copy; |
735 | poffset += copy; | 735 | poffset += copy; |
736 | if (!(psize -= copy)) | 736 | if (!(psize -= copy)) |
737 | goto out; | 737 | goto out; |
738 | 738 | ||
739 | if (skb->len < size_goal || (flags & MSG_OOB)) | 739 | if (skb->len < size_goal || (flags & MSG_OOB)) |
740 | continue; | 740 | continue; |
741 | 741 | ||
742 | if (forced_push(tp)) { | 742 | if (forced_push(tp)) { |
743 | tcp_mark_push(tp, skb); | 743 | tcp_mark_push(tp, skb); |
744 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); | 744 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); |
745 | } else if (skb == tcp_send_head(sk)) | 745 | } else if (skb == tcp_send_head(sk)) |
746 | tcp_push_one(sk, mss_now); | 746 | tcp_push_one(sk, mss_now); |
747 | continue; | 747 | continue; |
748 | 748 | ||
749 | wait_for_sndbuf: | 749 | wait_for_sndbuf: |
750 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 750 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
751 | wait_for_memory: | 751 | wait_for_memory: |
752 | if (copied) | 752 | if (copied) |
753 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | 753 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); |
754 | 754 | ||
755 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 755 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
756 | goto do_error; | 756 | goto do_error; |
757 | 757 | ||
758 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 758 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
759 | size_goal = tp->xmit_size_goal; | 759 | size_goal = tp->xmit_size_goal; |
760 | } | 760 | } |
761 | 761 | ||
762 | out: | 762 | out: |
763 | if (copied) | 763 | if (copied) |
764 | tcp_push(sk, flags, mss_now, tp->nonagle); | 764 | tcp_push(sk, flags, mss_now, tp->nonagle); |
765 | return copied; | 765 | return copied; |
766 | 766 | ||
767 | do_error: | 767 | do_error: |
768 | if (copied) | 768 | if (copied) |
769 | goto out; | 769 | goto out; |
770 | out_err: | 770 | out_err: |
771 | return sk_stream_error(sk, flags, err); | 771 | return sk_stream_error(sk, flags, err); |
772 | } | 772 | } |
773 | 773 | ||
774 | ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, | 774 | ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, |
775 | size_t size, int flags) | 775 | size_t size, int flags) |
776 | { | 776 | { |
777 | ssize_t res; | 777 | ssize_t res; |
778 | struct sock *sk = sock->sk; | 778 | struct sock *sk = sock->sk; |
779 | 779 | ||
780 | if (!(sk->sk_route_caps & NETIF_F_SG) || | 780 | if (!(sk->sk_route_caps & NETIF_F_SG) || |
781 | !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) | 781 | !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) |
782 | return sock_no_sendpage(sock, page, offset, size, flags); | 782 | return sock_no_sendpage(sock, page, offset, size, flags); |
783 | 783 | ||
784 | lock_sock(sk); | 784 | lock_sock(sk); |
785 | TCP_CHECK_TIMER(sk); | 785 | TCP_CHECK_TIMER(sk); |
786 | res = do_tcp_sendpages(sk, &page, offset, size, flags); | 786 | res = do_tcp_sendpages(sk, &page, offset, size, flags); |
787 | TCP_CHECK_TIMER(sk); | 787 | TCP_CHECK_TIMER(sk); |
788 | release_sock(sk); | 788 | release_sock(sk); |
789 | return res; | 789 | return res; |
790 | } | 790 | } |
791 | 791 | ||
792 | #define TCP_PAGE(sk) (sk->sk_sndmsg_page) | 792 | #define TCP_PAGE(sk) (sk->sk_sndmsg_page) |
793 | #define TCP_OFF(sk) (sk->sk_sndmsg_off) | 793 | #define TCP_OFF(sk) (sk->sk_sndmsg_off) |
794 | 794 | ||
795 | static inline int select_size(struct sock *sk) | 795 | static inline int select_size(struct sock *sk) |
796 | { | 796 | { |
797 | struct tcp_sock *tp = tcp_sk(sk); | 797 | struct tcp_sock *tp = tcp_sk(sk); |
798 | int tmp = tp->mss_cache; | 798 | int tmp = tp->mss_cache; |
799 | 799 | ||
800 | if (sk->sk_route_caps & NETIF_F_SG) { | 800 | if (sk->sk_route_caps & NETIF_F_SG) { |
801 | if (sk_can_gso(sk)) | 801 | if (sk_can_gso(sk)) |
802 | tmp = 0; | 802 | tmp = 0; |
803 | else { | 803 | else { |
804 | int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); | 804 | int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); |
805 | 805 | ||
806 | if (tmp >= pgbreak && | 806 | if (tmp >= pgbreak && |
807 | tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) | 807 | tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) |
808 | tmp = pgbreak; | 808 | tmp = pgbreak; |
809 | } | 809 | } |
810 | } | 810 | } |
811 | 811 | ||
812 | return tmp; | 812 | return tmp; |
813 | } | 813 | } |
814 | 814 | ||
815 | int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | 815 | int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, |
816 | size_t size) | 816 | size_t size) |
817 | { | 817 | { |
818 | struct sock *sk = sock->sk; | 818 | struct sock *sk = sock->sk; |
819 | struct iovec *iov; | 819 | struct iovec *iov; |
820 | struct tcp_sock *tp = tcp_sk(sk); | 820 | struct tcp_sock *tp = tcp_sk(sk); |
821 | struct sk_buff *skb; | 821 | struct sk_buff *skb; |
822 | int iovlen, flags; | 822 | int iovlen, flags; |
823 | int mss_now, size_goal; | 823 | int mss_now, size_goal; |
824 | int err, copied; | 824 | int err, copied; |
825 | long timeo; | 825 | long timeo; |
826 | 826 | ||
827 | lock_sock(sk); | 827 | lock_sock(sk); |
828 | TCP_CHECK_TIMER(sk); | 828 | TCP_CHECK_TIMER(sk); |
829 | 829 | ||
830 | flags = msg->msg_flags; | 830 | flags = msg->msg_flags; |
831 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 831 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
832 | 832 | ||
833 | /* Wait for a connection to finish. */ | 833 | /* Wait for a connection to finish. */ |
834 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 834 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
835 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 835 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
836 | goto out_err; | 836 | goto out_err; |
837 | 837 | ||
838 | /* This should be in poll */ | 838 | /* This should be in poll */ |
839 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); | 839 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); |
840 | 840 | ||
841 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 841 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
842 | size_goal = tp->xmit_size_goal; | 842 | size_goal = tp->xmit_size_goal; |
843 | 843 | ||
844 | /* Ok commence sending. */ | 844 | /* Ok commence sending. */ |
845 | iovlen = msg->msg_iovlen; | 845 | iovlen = msg->msg_iovlen; |
846 | iov = msg->msg_iov; | 846 | iov = msg->msg_iov; |
847 | copied = 0; | 847 | copied = 0; |
848 | 848 | ||
849 | err = -EPIPE; | 849 | err = -EPIPE; |
850 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) | 850 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) |
851 | goto do_error; | 851 | goto do_error; |
852 | 852 | ||
853 | while (--iovlen >= 0) { | 853 | while (--iovlen >= 0) { |
854 | int seglen = iov->iov_len; | 854 | int seglen = iov->iov_len; |
855 | unsigned char __user *from = iov->iov_base; | 855 | unsigned char __user *from = iov->iov_base; |
856 | 856 | ||
857 | iov++; | 857 | iov++; |
858 | 858 | ||
859 | while (seglen > 0) { | 859 | while (seglen > 0) { |
860 | int copy; | 860 | int copy; |
861 | 861 | ||
862 | skb = tcp_write_queue_tail(sk); | 862 | skb = tcp_write_queue_tail(sk); |
863 | 863 | ||
864 | if (!tcp_send_head(sk) || | 864 | if (!tcp_send_head(sk) || |
865 | (copy = size_goal - skb->len) <= 0) { | 865 | (copy = size_goal - skb->len) <= 0) { |
866 | 866 | ||
867 | new_segment: | 867 | new_segment: |
868 | /* Allocate new segment. If the interface is SG, | 868 | /* Allocate new segment. If the interface is SG, |
869 | * allocate skb fitting to single page. | 869 | * allocate skb fitting to single page. |
870 | */ | 870 | */ |
871 | if (!sk_stream_memory_free(sk)) | 871 | if (!sk_stream_memory_free(sk)) |
872 | goto wait_for_sndbuf; | 872 | goto wait_for_sndbuf; |
873 | 873 | ||
874 | skb = sk_stream_alloc_skb(sk, select_size(sk), | 874 | skb = sk_stream_alloc_skb(sk, select_size(sk), |
875 | sk->sk_allocation); | 875 | sk->sk_allocation); |
876 | if (!skb) | 876 | if (!skb) |
877 | goto wait_for_memory; | 877 | goto wait_for_memory; |
878 | 878 | ||
879 | /* | 879 | /* |
880 | * Check whether we can use HW checksum. | 880 | * Check whether we can use HW checksum. |
881 | */ | 881 | */ |
882 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) | 882 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) |
883 | skb->ip_summed = CHECKSUM_PARTIAL; | 883 | skb->ip_summed = CHECKSUM_PARTIAL; |
884 | 884 | ||
885 | skb_entail(sk, skb); | 885 | skb_entail(sk, skb); |
886 | copy = size_goal; | 886 | copy = size_goal; |
887 | } | 887 | } |
888 | 888 | ||
889 | /* Try to append data to the end of skb. */ | 889 | /* Try to append data to the end of skb. */ |
890 | if (copy > seglen) | 890 | if (copy > seglen) |
891 | copy = seglen; | 891 | copy = seglen; |
892 | 892 | ||
893 | /* Where to copy to? */ | 893 | /* Where to copy to? */ |
894 | if (skb_tailroom(skb) > 0) { | 894 | if (skb_tailroom(skb) > 0) { |
895 | /* We have some space in skb head. Superb! */ | 895 | /* We have some space in skb head. Superb! */ |
896 | if (copy > skb_tailroom(skb)) | 896 | if (copy > skb_tailroom(skb)) |
897 | copy = skb_tailroom(skb); | 897 | copy = skb_tailroom(skb); |
898 | if ((err = skb_add_data(skb, from, copy)) != 0) | 898 | if ((err = skb_add_data(skb, from, copy)) != 0) |
899 | goto do_fault; | 899 | goto do_fault; |
900 | } else { | 900 | } else { |
901 | int merge = 0; | 901 | int merge = 0; |
902 | int i = skb_shinfo(skb)->nr_frags; | 902 | int i = skb_shinfo(skb)->nr_frags; |
903 | struct page *page = TCP_PAGE(sk); | 903 | struct page *page = TCP_PAGE(sk); |
904 | int off = TCP_OFF(sk); | 904 | int off = TCP_OFF(sk); |
905 | 905 | ||
906 | if (skb_can_coalesce(skb, i, page, off) && | 906 | if (skb_can_coalesce(skb, i, page, off) && |
907 | off != PAGE_SIZE) { | 907 | off != PAGE_SIZE) { |
908 | /* We can extend the last page | 908 | /* We can extend the last page |
909 | * fragment. */ | 909 | * fragment. */ |
910 | merge = 1; | 910 | merge = 1; |
911 | } else if (i == MAX_SKB_FRAGS || | 911 | } else if (i == MAX_SKB_FRAGS || |
912 | (!i && | 912 | (!i && |
913 | !(sk->sk_route_caps & NETIF_F_SG))) { | 913 | !(sk->sk_route_caps & NETIF_F_SG))) { |
914 | /* Need to add new fragment and cannot | 914 | /* Need to add new fragment and cannot |
915 | * do this because interface is non-SG, | 915 | * do this because interface is non-SG, |
916 | * or because all the page slots are | 916 | * or because all the page slots are |
917 | * busy. */ | 917 | * busy. */ |
918 | tcp_mark_push(tp, skb); | 918 | tcp_mark_push(tp, skb); |
919 | goto new_segment; | 919 | goto new_segment; |
920 | } else if (page) { | 920 | } else if (page) { |
921 | if (off == PAGE_SIZE) { | 921 | if (off == PAGE_SIZE) { |
922 | put_page(page); | 922 | put_page(page); |
923 | TCP_PAGE(sk) = page = NULL; | 923 | TCP_PAGE(sk) = page = NULL; |
924 | off = 0; | 924 | off = 0; |
925 | } | 925 | } |
926 | } else | 926 | } else |
927 | off = 0; | 927 | off = 0; |
928 | 928 | ||
929 | if (copy > PAGE_SIZE - off) | 929 | if (copy > PAGE_SIZE - off) |
930 | copy = PAGE_SIZE - off; | 930 | copy = PAGE_SIZE - off; |
931 | 931 | ||
932 | if (!sk_wmem_schedule(sk, copy)) | 932 | if (!sk_wmem_schedule(sk, copy)) |
933 | goto wait_for_memory; | 933 | goto wait_for_memory; |
934 | 934 | ||
935 | if (!page) { | 935 | if (!page) { |
936 | /* Allocate new cache page. */ | 936 | /* Allocate new cache page. */ |
937 | if (!(page = sk_stream_alloc_page(sk))) | 937 | if (!(page = sk_stream_alloc_page(sk))) |
938 | goto wait_for_memory; | 938 | goto wait_for_memory; |
939 | } | 939 | } |
940 | 940 | ||
941 | /* Time to copy data. We are close to | 941 | /* Time to copy data. We are close to |
942 | * the end! */ | 942 | * the end! */ |
943 | err = skb_copy_to_page(sk, from, skb, page, | 943 | err = skb_copy_to_page(sk, from, skb, page, |
944 | off, copy); | 944 | off, copy); |
945 | if (err) { | 945 | if (err) { |
946 | /* If this page was new, give it to the | 946 | /* If this page was new, give it to the |
947 | * socket so it does not get leaked. | 947 | * socket so it does not get leaked. |
948 | */ | 948 | */ |
949 | if (!TCP_PAGE(sk)) { | 949 | if (!TCP_PAGE(sk)) { |
950 | TCP_PAGE(sk) = page; | 950 | TCP_PAGE(sk) = page; |
951 | TCP_OFF(sk) = 0; | 951 | TCP_OFF(sk) = 0; |
952 | } | 952 | } |
953 | goto do_error; | 953 | goto do_error; |
954 | } | 954 | } |
955 | 955 | ||
956 | /* Update the skb. */ | 956 | /* Update the skb. */ |
957 | if (merge) { | 957 | if (merge) { |
958 | skb_shinfo(skb)->frags[i - 1].size += | 958 | skb_shinfo(skb)->frags[i - 1].size += |
959 | copy; | 959 | copy; |
960 | } else { | 960 | } else { |
961 | skb_fill_page_desc(skb, i, page, off, copy); | 961 | skb_fill_page_desc(skb, i, page, off, copy); |
962 | if (TCP_PAGE(sk)) { | 962 | if (TCP_PAGE(sk)) { |
963 | get_page(page); | 963 | get_page(page); |
964 | } else if (off + copy < PAGE_SIZE) { | 964 | } else if (off + copy < PAGE_SIZE) { |
965 | get_page(page); | 965 | get_page(page); |
966 | TCP_PAGE(sk) = page; | 966 | TCP_PAGE(sk) = page; |
967 | } | 967 | } |
968 | } | 968 | } |
969 | 969 | ||
970 | TCP_OFF(sk) = off + copy; | 970 | TCP_OFF(sk) = off + copy; |
971 | } | 971 | } |
972 | 972 | ||
973 | if (!copied) | 973 | if (!copied) |
974 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; | 974 | TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; |
975 | 975 | ||
976 | tp->write_seq += copy; | 976 | tp->write_seq += copy; |
977 | TCP_SKB_CB(skb)->end_seq += copy; | 977 | TCP_SKB_CB(skb)->end_seq += copy; |
978 | skb_shinfo(skb)->gso_segs = 0; | 978 | skb_shinfo(skb)->gso_segs = 0; |
979 | 979 | ||
980 | from += copy; | 980 | from += copy; |
981 | copied += copy; | 981 | copied += copy; |
982 | if ((seglen -= copy) == 0 && iovlen == 0) | 982 | if ((seglen -= copy) == 0 && iovlen == 0) |
983 | goto out; | 983 | goto out; |
984 | 984 | ||
985 | if (skb->len < size_goal || (flags & MSG_OOB)) | 985 | if (skb->len < size_goal || (flags & MSG_OOB)) |
986 | continue; | 986 | continue; |
987 | 987 | ||
988 | if (forced_push(tp)) { | 988 | if (forced_push(tp)) { |
989 | tcp_mark_push(tp, skb); | 989 | tcp_mark_push(tp, skb); |
990 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); | 990 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); |
991 | } else if (skb == tcp_send_head(sk)) | 991 | } else if (skb == tcp_send_head(sk)) |
992 | tcp_push_one(sk, mss_now); | 992 | tcp_push_one(sk, mss_now); |
993 | continue; | 993 | continue; |
994 | 994 | ||
995 | wait_for_sndbuf: | 995 | wait_for_sndbuf: |
996 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 996 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
997 | wait_for_memory: | 997 | wait_for_memory: |
998 | if (copied) | 998 | if (copied) |
999 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | 999 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); |
1000 | 1000 | ||
1001 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 1001 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
1002 | goto do_error; | 1002 | goto do_error; |
1003 | 1003 | ||
1004 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); | 1004 | mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); |
1005 | size_goal = tp->xmit_size_goal; | 1005 | size_goal = tp->xmit_size_goal; |
1006 | } | 1006 | } |
1007 | } | 1007 | } |
1008 | 1008 | ||
1009 | out: | 1009 | out: |
1010 | if (copied) | 1010 | if (copied) |
1011 | tcp_push(sk, flags, mss_now, tp->nonagle); | 1011 | tcp_push(sk, flags, mss_now, tp->nonagle); |
1012 | TCP_CHECK_TIMER(sk); | 1012 | TCP_CHECK_TIMER(sk); |
1013 | release_sock(sk); | 1013 | release_sock(sk); |
1014 | return copied; | 1014 | return copied; |
1015 | 1015 | ||
1016 | do_fault: | 1016 | do_fault: |
1017 | if (!skb->len) { | 1017 | if (!skb->len) { |
1018 | tcp_unlink_write_queue(skb, sk); | 1018 | tcp_unlink_write_queue(skb, sk); |
1019 | /* It is the one place in all of TCP, except connection | 1019 | /* It is the one place in all of TCP, except connection |
1020 | * reset, where we can be unlinking the send_head. | 1020 | * reset, where we can be unlinking the send_head. |
1021 | */ | 1021 | */ |
1022 | tcp_check_send_head(sk, skb); | 1022 | tcp_check_send_head(sk, skb); |
1023 | sk_wmem_free_skb(sk, skb); | 1023 | sk_wmem_free_skb(sk, skb); |
1024 | } | 1024 | } |
1025 | 1025 | ||
1026 | do_error: | 1026 | do_error: |
1027 | if (copied) | 1027 | if (copied) |
1028 | goto out; | 1028 | goto out; |
1029 | out_err: | 1029 | out_err: |
1030 | err = sk_stream_error(sk, flags, err); | 1030 | err = sk_stream_error(sk, flags, err); |
1031 | TCP_CHECK_TIMER(sk); | 1031 | TCP_CHECK_TIMER(sk); |
1032 | release_sock(sk); | 1032 | release_sock(sk); |
1033 | return err; | 1033 | return err; |
1034 | } | 1034 | } |
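For reference, a minimal user-space sketch (not from this commit) of the MSG_MORE behaviour tcp_sendmsg() handles above: the first send() asks TCP to hold back the push so small pieces can be coalesced, the second lets the data go out:

```c
#include <sys/socket.h>
#include <sys/types.h>

/* Send a small header and a body as one coalesced stream of segments. */
static ssize_t send_header_then_body(int sockfd,
				     const char *hdr, size_t hlen,
				     const char *body, size_t blen)
{
	if (send(sockfd, hdr, hlen, MSG_MORE) < 0)	/* hold back the push */
		return -1;
	return send(sockfd, body, blen, 0);		/* last piece: push now */
}
```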
1035 | 1035 | ||
1036 | /* | 1036 | /* |
1037 | * Handle reading urgent data. BSD has very simple semantics for | 1037 | * Handle reading urgent data. BSD has very simple semantics for |
1038 | * this, no blocking and very strange errors 8) | 1038 | * this, no blocking and very strange errors 8) |
1039 | */ | 1039 | */ |
1040 | 1040 | ||
1041 | static int tcp_recv_urg(struct sock *sk, long timeo, | 1041 | static int tcp_recv_urg(struct sock *sk, long timeo, |
1042 | struct msghdr *msg, int len, int flags, | 1042 | struct msghdr *msg, int len, int flags, |
1043 | int *addr_len) | 1043 | int *addr_len) |
1044 | { | 1044 | { |
1045 | struct tcp_sock *tp = tcp_sk(sk); | 1045 | struct tcp_sock *tp = tcp_sk(sk); |
1046 | 1046 | ||
1047 | /* No URG data to read. */ | 1047 | /* No URG data to read. */ |
1048 | if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || | 1048 | if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || |
1049 | tp->urg_data == TCP_URG_READ) | 1049 | tp->urg_data == TCP_URG_READ) |
1050 | return -EINVAL; /* Yes this is right ! */ | 1050 | return -EINVAL; /* Yes this is right ! */ |
1051 | 1051 | ||
1052 | if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) | 1052 | if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) |
1053 | return -ENOTCONN; | 1053 | return -ENOTCONN; |
1054 | 1054 | ||
1055 | if (tp->urg_data & TCP_URG_VALID) { | 1055 | if (tp->urg_data & TCP_URG_VALID) { |
1056 | int err = 0; | 1056 | int err = 0; |
1057 | char c = tp->urg_data; | 1057 | char c = tp->urg_data; |
1058 | 1058 | ||
1059 | if (!(flags & MSG_PEEK)) | 1059 | if (!(flags & MSG_PEEK)) |
1060 | tp->urg_data = TCP_URG_READ; | 1060 | tp->urg_data = TCP_URG_READ; |
1061 | 1061 | ||
1062 | /* Read urgent data. */ | 1062 | /* Read urgent data. */ |
1063 | msg->msg_flags |= MSG_OOB; | 1063 | msg->msg_flags |= MSG_OOB; |
1064 | 1064 | ||
1065 | if (len > 0) { | 1065 | if (len > 0) { |
1066 | if (!(flags & MSG_TRUNC)) | 1066 | if (!(flags & MSG_TRUNC)) |
1067 | err = memcpy_toiovec(msg->msg_iov, &c, 1); | 1067 | err = memcpy_toiovec(msg->msg_iov, &c, 1); |
1068 | len = 1; | 1068 | len = 1; |
1069 | } else | 1069 | } else |
1070 | msg->msg_flags |= MSG_TRUNC; | 1070 | msg->msg_flags |= MSG_TRUNC; |
1071 | 1071 | ||
1072 | return err ? -EFAULT : len; | 1072 | return err ? -EFAULT : len; |
1073 | } | 1073 | } |
1074 | 1074 | ||
1075 | if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) | 1075 | if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) |
1076 | return 0; | 1076 | return 0; |
1077 | 1077 | ||
1078 | /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and | 1078 | /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and |
1079 | * the available implementations agree in this case: | 1079 | * the available implementations agree in this case: |
1080 | * this call should never block, independent of the | 1080 | * this call should never block, independent of the |
1081 | * blocking state of the socket. | 1081 | * blocking state of the socket. |
1082 | * Mike <pall@rz.uni-karlsruhe.de> | 1082 | * Mike <pall@rz.uni-karlsruhe.de> |
1083 | */ | 1083 | */ |
1084 | return -EAGAIN; | 1084 | return -EAGAIN; |
1085 | } | 1085 | } |
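A small, hypothetical caller illustrating these semantics from user space: recv(..., MSG_OOB) never blocks waiting for urgent data, so the helper maps "not here yet" (EAGAIN) and "no urgent data to read" to distinct results:

```c
#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

/* Returns the urgent byte (0..255), -1 if it has not arrived yet (EAGAIN),
 * or -2 if there is no urgent data to read / some other error. */
static int read_oob_byte(int sockfd)
{
	char c;
	ssize_t n = recv(sockfd, &c, 1, MSG_OOB);

	if (n == 1)
		return (unsigned char)c;
	if (n < 0 && errno == EAGAIN)
		return -1;
	return -2;
}
```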
1086 | 1086 | ||
1087 | /* Clean up the receive buffer for full frames taken by the user, | 1087 | /* Clean up the receive buffer for full frames taken by the user, |
1088 | * then send an ACK if necessary. COPIED is the number of bytes | 1088 | * then send an ACK if necessary. COPIED is the number of bytes |
1089 | * tcp_recvmsg has given to the user so far, it speeds up the | 1089 | * tcp_recvmsg has given to the user so far, it speeds up the |
1090 | * calculation of whether or not we must ACK for the sake of | 1090 | * calculation of whether or not we must ACK for the sake of |
1091 | * a window update. | 1091 | * a window update. |
1092 | */ | 1092 | */ |
1093 | void tcp_cleanup_rbuf(struct sock *sk, int copied) | 1093 | void tcp_cleanup_rbuf(struct sock *sk, int copied) |
1094 | { | 1094 | { |
1095 | struct tcp_sock *tp = tcp_sk(sk); | 1095 | struct tcp_sock *tp = tcp_sk(sk); |
1096 | int time_to_ack = 0; | 1096 | int time_to_ack = 0; |
1097 | 1097 | ||
1098 | #if TCP_DEBUG | 1098 | #if TCP_DEBUG |
1099 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); | 1099 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
1100 | 1100 | ||
1101 | WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); | 1101 | WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); |
1102 | #endif | 1102 | #endif |
1103 | 1103 | ||
1104 | if (inet_csk_ack_scheduled(sk)) { | 1104 | if (inet_csk_ack_scheduled(sk)) { |
1105 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1105 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1106 | /* Delayed ACKs frequently hit locked sockets during bulk | 1106 | /* Delayed ACKs frequently hit locked sockets during bulk |
1107 | * receive. */ | 1107 | * receive. */ |
1108 | if (icsk->icsk_ack.blocked || | 1108 | if (icsk->icsk_ack.blocked || |
1109 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ | 1109 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ |
1110 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || | 1110 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || |
1111 | /* | 1111 | /* |
1112 | * If this read emptied read buffer, we send ACK, if | 1112 | * If this read emptied read buffer, we send ACK, if |
1113 | * connection is not bidirectional, user drained | 1113 | * connection is not bidirectional, user drained |
1114 | * receive buffer and there was a small segment | 1114 | * receive buffer and there was a small segment |
1115 | * in queue. | 1115 | * in queue. |
1116 | */ | 1116 | */ |
1117 | (copied > 0 && | 1117 | (copied > 0 && |
1118 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || | 1118 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || |
1119 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && | 1119 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
1120 | !icsk->icsk_ack.pingpong)) && | 1120 | !icsk->icsk_ack.pingpong)) && |
1121 | !atomic_read(&sk->sk_rmem_alloc))) | 1121 | !atomic_read(&sk->sk_rmem_alloc))) |
1122 | time_to_ack = 1; | 1122 | time_to_ack = 1; |
1123 | } | 1123 | } |
1124 | 1124 | ||
1125 | /* We send an ACK if we can now advertise a non-zero window | 1125 | /* We send an ACK if we can now advertise a non-zero window |
1126 | * which has been raised "significantly". | 1126 | * which has been raised "significantly". |
1127 | * | 1127 | * |
1128 | * Even if window raised up to infinity, do not send window open ACK | 1128 | * Even if window raised up to infinity, do not send window open ACK |
1129 | * in states, where we will not receive more. It is useless. | 1129 | * in states, where we will not receive more. It is useless. |
1130 | */ | 1130 | */ |
1131 | if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { | 1131 | if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { |
1132 | __u32 rcv_window_now = tcp_receive_window(tp); | 1132 | __u32 rcv_window_now = tcp_receive_window(tp); |
1133 | 1133 | ||
1134 | /* Optimize, __tcp_select_window() is not cheap. */ | 1134 | /* Optimize, __tcp_select_window() is not cheap. */ |
1135 | if (2*rcv_window_now <= tp->window_clamp) { | 1135 | if (2*rcv_window_now <= tp->window_clamp) { |
1136 | __u32 new_window = __tcp_select_window(sk); | 1136 | __u32 new_window = __tcp_select_window(sk); |
1137 | 1137 | ||
1138 | /* Send ACK now, if this read freed lots of space | 1138 | /* Send ACK now, if this read freed lots of space |
1139 | * in our buffer. Certainly, new_window is new window. | 1139 | * in our buffer. Certainly, new_window is new window. |
1140 | * We can advertise it now, if it is not less than current one. | 1140 | * We can advertise it now, if it is not less than current one. |
1141 | * "Lots" means "at least twice" here. | 1141 | * "Lots" means "at least twice" here. |
1142 | */ | 1142 | */ |
1143 | if (new_window && new_window >= 2 * rcv_window_now) | 1143 | if (new_window && new_window >= 2 * rcv_window_now) |
1144 | time_to_ack = 1; | 1144 | time_to_ack = 1; |
1145 | } | 1145 | } |
1146 | } | 1146 | } |
1147 | if (time_to_ack) | 1147 | if (time_to_ack) |
1148 | tcp_send_ack(sk); | 1148 | tcp_send_ack(sk); |
1149 | } | 1149 | } |
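A standalone mock (names local to this sketch) of the window-update test above: an ACK is only worth sending when the window we could now advertise is at least double what the peer last saw, and the current window is small enough relative to window_clamp to justify calling __tcp_select_window() at all:

```c
#include <stdbool.h>

/* Mock of the "Lots means at least twice" check in tcp_cleanup_rbuf(). */
static bool window_update_worth_an_ack(unsigned int rcv_window_now,
				       unsigned int new_window,
				       unsigned int window_clamp)
{
	if (2 * rcv_window_now > window_clamp)	/* window already large enough */
		return false;
	return new_window && new_window >= 2 * rcv_window_now;
}
```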
1150 | 1150 | ||
1151 | static void tcp_prequeue_process(struct sock *sk) | 1151 | static void tcp_prequeue_process(struct sock *sk) |
1152 | { | 1152 | { |
1153 | struct sk_buff *skb; | 1153 | struct sk_buff *skb; |
1154 | struct tcp_sock *tp = tcp_sk(sk); | 1154 | struct tcp_sock *tp = tcp_sk(sk); |
1155 | 1155 | ||
1156 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); | 1156 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); |
1157 | 1157 | ||
1158 | /* RX process wants to run with disabled BHs, though it is not | 1158 | /* RX process wants to run with disabled BHs, though it is not |
1159 | * necessary */ | 1159 | * necessary */ |
1160 | local_bh_disable(); | 1160 | local_bh_disable(); |
1161 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) | 1161 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) |
1162 | sk_backlog_rcv(sk, skb); | 1162 | sk_backlog_rcv(sk, skb); |
1163 | local_bh_enable(); | 1163 | local_bh_enable(); |
1164 | 1164 | ||
1165 | /* Clear memory counter. */ | 1165 | /* Clear memory counter. */ |
1166 | tp->ucopy.memory = 0; | 1166 | tp->ucopy.memory = 0; |
1167 | } | 1167 | } |
1168 | 1168 | ||
1169 | static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) | 1169 | static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) |
1170 | { | 1170 | { |
1171 | struct sk_buff *skb; | 1171 | struct sk_buff *skb; |
1172 | u32 offset; | 1172 | u32 offset; |
1173 | 1173 | ||
1174 | skb_queue_walk(&sk->sk_receive_queue, skb) { | 1174 | skb_queue_walk(&sk->sk_receive_queue, skb) { |
1175 | offset = seq - TCP_SKB_CB(skb)->seq; | 1175 | offset = seq - TCP_SKB_CB(skb)->seq; |
1176 | if (tcp_hdr(skb)->syn) | 1176 | if (tcp_hdr(skb)->syn) |
1177 | offset--; | 1177 | offset--; |
1178 | if (offset < skb->len || tcp_hdr(skb)->fin) { | 1178 | if (offset < skb->len || tcp_hdr(skb)->fin) { |
1179 | *off = offset; | 1179 | *off = offset; |
1180 | return skb; | 1180 | return skb; |
1181 | } | 1181 | } |
1182 | } | 1182 | } |
1183 | return NULL; | 1183 | return NULL; |
1184 | } | 1184 | } |
1185 | 1185 | ||
1186 | /* | 1186 | /* |
1187 | * This routine provides an alternative to tcp_recvmsg() for routines | 1187 | * This routine provides an alternative to tcp_recvmsg() for routines |
1188 | * that would like to handle copying from skbuffs directly in 'sendfile' | 1188 | * that would like to handle copying from skbuffs directly in 'sendfile' |
1189 | * fashion. | 1189 | * fashion. |
1190 | * Note: | 1190 | * Note: |
1191 | * - It is assumed that the socket was locked by the caller. | 1191 | * - It is assumed that the socket was locked by the caller. |
1192 | * - The routine does not block. | 1192 | * - The routine does not block. |
1193 | * - At present, there is no support for reading OOB data | 1193 | * - At present, there is no support for reading OOB data |
1194 | * or for 'peeking' the socket using this routine | 1194 | * or for 'peeking' the socket using this routine |
1195 | * (although both would be easy to implement). | 1195 | * (although both would be easy to implement). |
1196 | */ | 1196 | */ |
1197 | int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | 1197 | int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, |
1198 | sk_read_actor_t recv_actor) | 1198 | sk_read_actor_t recv_actor) |
1199 | { | 1199 | { |
1200 | struct sk_buff *skb; | 1200 | struct sk_buff *skb; |
1201 | struct tcp_sock *tp = tcp_sk(sk); | 1201 | struct tcp_sock *tp = tcp_sk(sk); |
1202 | u32 seq = tp->copied_seq; | 1202 | u32 seq = tp->copied_seq; |
1203 | u32 offset; | 1203 | u32 offset; |
1204 | int copied = 0; | 1204 | int copied = 0; |
1205 | 1205 | ||
1206 | if (sk->sk_state == TCP_LISTEN) | 1206 | if (sk->sk_state == TCP_LISTEN) |
1207 | return -ENOTCONN; | 1207 | return -ENOTCONN; |
1208 | while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { | 1208 | while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { |
1209 | if (offset < skb->len) { | 1209 | if (offset < skb->len) { |
1210 | int used; | 1210 | int used; |
1211 | size_t len; | 1211 | size_t len; |
1212 | 1212 | ||
1213 | len = skb->len - offset; | 1213 | len = skb->len - offset; |
1214 | /* Stop reading if we hit a patch of urgent data */ | 1214 | /* Stop reading if we hit a patch of urgent data */ |
1215 | if (tp->urg_data) { | 1215 | if (tp->urg_data) { |
1216 | u32 urg_offset = tp->urg_seq - seq; | 1216 | u32 urg_offset = tp->urg_seq - seq; |
1217 | if (urg_offset < len) | 1217 | if (urg_offset < len) |
1218 | len = urg_offset; | 1218 | len = urg_offset; |
1219 | if (!len) | 1219 | if (!len) |
1220 | break; | 1220 | break; |
1221 | } | 1221 | } |
1222 | used = recv_actor(desc, skb, offset, len); | 1222 | used = recv_actor(desc, skb, offset, len); |
1223 | if (used < 0) { | 1223 | if (used < 0) { |
1224 | if (!copied) | 1224 | if (!copied) |
1225 | copied = used; | 1225 | copied = used; |
1226 | break; | 1226 | break; |
1227 | } else if (used <= len) { | 1227 | } else if (used <= len) { |
1228 | seq += used; | 1228 | seq += used; |
1229 | copied += used; | 1229 | copied += used; |
1230 | offset += used; | 1230 | offset += used; |
1231 | } | 1231 | } |
1232 | /* | 1232 | /* |
1233 | * If recv_actor drops the lock (e.g. TCP splice | 1233 | * If recv_actor drops the lock (e.g. TCP splice |
1234 | * receive) the skb pointer might be invalid when | 1234 | * receive) the skb pointer might be invalid when |
1235 | * getting here: tcp_collapse might have deleted it | 1235 | * getting here: tcp_collapse might have deleted it |
1236 | * while aggregating skbs from the socket queue. | 1236 | * while aggregating skbs from the socket queue. |
1237 | */ | 1237 | */ |
1238 | skb = tcp_recv_skb(sk, seq-1, &offset); | 1238 | skb = tcp_recv_skb(sk, seq-1, &offset); |
1239 | if (!skb || (offset+1 != skb->len)) | 1239 | if (!skb || (offset+1 != skb->len)) |
1240 | break; | 1240 | break; |
1241 | } | 1241 | } |
1242 | if (tcp_hdr(skb)->fin) { | 1242 | if (tcp_hdr(skb)->fin) { |
1243 | sk_eat_skb(sk, skb, 0); | 1243 | sk_eat_skb(sk, skb, 0); |
1244 | ++seq; | 1244 | ++seq; |
1245 | break; | 1245 | break; |
1246 | } | 1246 | } |
1247 | sk_eat_skb(sk, skb, 0); | 1247 | sk_eat_skb(sk, skb, 0); |
1248 | if (!desc->count) | 1248 | if (!desc->count) |
1249 | break; | 1249 | break; |
1250 | } | 1250 | } |
1251 | tp->copied_seq = seq; | 1251 | tp->copied_seq = seq; |
1252 | 1252 | ||
1253 | tcp_rcv_space_adjust(sk); | 1253 | tcp_rcv_space_adjust(sk); |
1254 | 1254 | ||
1255 | /* Clean up data we have read: This will do ACK frames. */ | 1255 | /* Clean up data we have read: This will do ACK frames. */ |
1256 | if (copied > 0) | 1256 | if (copied > 0) |
1257 | tcp_cleanup_rbuf(sk, copied); | 1257 | tcp_cleanup_rbuf(sk, copied); |
1258 | return copied; | 1258 | return copied; |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | /* | 1261 | /* |
1262 | * This routine copies from a sock struct into the user buffer. | 1262 | * This routine copies from a sock struct into the user buffer. |
1263 | * | 1263 | * |
1264 | * Technical note: in 2.3 we work on _locked_ socket, so that | 1264 | * Technical note: in 2.3 we work on _locked_ socket, so that |
1265 | * tricks with *seq access order and skb->users are not required. | 1265 | * tricks with *seq access order and skb->users are not required. |
1266 | * Probably, code can be easily improved even more. | 1266 | * Probably, code can be easily improved even more. |
1267 | */ | 1267 | */ |
1268 | 1268 | ||
1269 | int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 1269 | int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
1270 | size_t len, int nonblock, int flags, int *addr_len) | 1270 | size_t len, int nonblock, int flags, int *addr_len) |
1271 | { | 1271 | { |
1272 | struct tcp_sock *tp = tcp_sk(sk); | 1272 | struct tcp_sock *tp = tcp_sk(sk); |
1273 | int copied = 0; | 1273 | int copied = 0; |
1274 | u32 peek_seq; | 1274 | u32 peek_seq; |
1275 | u32 *seq; | 1275 | u32 *seq; |
1276 | unsigned long used; | 1276 | unsigned long used; |
1277 | int err; | 1277 | int err; |
1278 | int target; /* Read at least this many bytes */ | 1278 | int target; /* Read at least this many bytes */ |
1279 | long timeo; | 1279 | long timeo; |
1280 | struct task_struct *user_recv = NULL; | 1280 | struct task_struct *user_recv = NULL; |
1281 | int copied_early = 0; | 1281 | int copied_early = 0; |
1282 | struct sk_buff *skb; | 1282 | struct sk_buff *skb; |
1283 | 1283 | ||
1284 | lock_sock(sk); | 1284 | lock_sock(sk); |
1285 | 1285 | ||
1286 | TCP_CHECK_TIMER(sk); | 1286 | TCP_CHECK_TIMER(sk); |
1287 | 1287 | ||
1288 | err = -ENOTCONN; | 1288 | err = -ENOTCONN; |
1289 | if (sk->sk_state == TCP_LISTEN) | 1289 | if (sk->sk_state == TCP_LISTEN) |
1290 | goto out; | 1290 | goto out; |
1291 | 1291 | ||
1292 | timeo = sock_rcvtimeo(sk, nonblock); | 1292 | timeo = sock_rcvtimeo(sk, nonblock); |
1293 | 1293 | ||
1294 | /* Urgent data needs to be handled specially. */ | 1294 | /* Urgent data needs to be handled specially. */ |
1295 | if (flags & MSG_OOB) | 1295 | if (flags & MSG_OOB) |
1296 | goto recv_urg; | 1296 | goto recv_urg; |
1297 | 1297 | ||
1298 | seq = &tp->copied_seq; | 1298 | seq = &tp->copied_seq; |
1299 | if (flags & MSG_PEEK) { | 1299 | if (flags & MSG_PEEK) { |
1300 | peek_seq = tp->copied_seq; | 1300 | peek_seq = tp->copied_seq; |
1301 | seq = &peek_seq; | 1301 | seq = &peek_seq; |
1302 | } | 1302 | } |
1303 | 1303 | ||
1304 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); | 1304 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); |
1305 | 1305 | ||
1306 | #ifdef CONFIG_NET_DMA | 1306 | #ifdef CONFIG_NET_DMA |
1307 | tp->ucopy.dma_chan = NULL; | 1307 | tp->ucopy.dma_chan = NULL; |
1308 | preempt_disable(); | 1308 | preempt_disable(); |
1309 | skb = skb_peek_tail(&sk->sk_receive_queue); | 1309 | skb = skb_peek_tail(&sk->sk_receive_queue); |
1310 | { | 1310 | { |
1311 | int available = 0; | 1311 | int available = 0; |
1312 | 1312 | ||
1313 | if (skb) | 1313 | if (skb) |
1314 | available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); | 1314 | available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); |
1315 | if ((available < target) && | 1315 | if ((available < target) && |
1316 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && | 1316 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && |
1317 | !sysctl_tcp_low_latency && | 1317 | !sysctl_tcp_low_latency && |
1318 | __get_cpu_var(softnet_data).net_dma) { | 1318 | __get_cpu_var(softnet_data).net_dma) { |
1319 | preempt_enable_no_resched(); | 1319 | preempt_enable_no_resched(); |
1320 | tp->ucopy.pinned_list = | 1320 | tp->ucopy.pinned_list = |
1321 | dma_pin_iovec_pages(msg->msg_iov, len); | 1321 | dma_pin_iovec_pages(msg->msg_iov, len); |
1322 | } else { | 1322 | } else { |
1323 | preempt_enable_no_resched(); | 1323 | preempt_enable_no_resched(); |
1324 | } | 1324 | } |
1325 | } | 1325 | } |
1326 | #endif | 1326 | #endif |
1327 | 1327 | ||
1328 | do { | 1328 | do { |
1329 | u32 offset; | 1329 | u32 offset; |
1330 | 1330 | ||
1331 | /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ | 1331 | /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ |
1332 | if (tp->urg_data && tp->urg_seq == *seq) { | 1332 | if (tp->urg_data && tp->urg_seq == *seq) { |
1333 | if (copied) | 1333 | if (copied) |
1334 | break; | 1334 | break; |
1335 | if (signal_pending(current)) { | 1335 | if (signal_pending(current)) { |
1336 | copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; | 1336 | copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; |
1337 | break; | 1337 | break; |
1338 | } | 1338 | } |
1339 | } | 1339 | } |
1340 | 1340 | ||
1341 | /* Next get a buffer. */ | 1341 | /* Next get a buffer. */ |
1342 | 1342 | ||
1343 | skb = skb_peek(&sk->sk_receive_queue); | 1343 | skb = skb_peek(&sk->sk_receive_queue); |
1344 | do { | 1344 | do { |
1345 | if (!skb) | 1345 | if (!skb) |
1346 | break; | 1346 | break; |
1347 | 1347 | ||
1348 | /* Now that we have two receive queues this | 1348 | /* Now that we have two receive queues this |
1349 | * shouldn't happen. | 1349 | * shouldn't happen. |
1350 | */ | 1350 | */ |
1351 | if (before(*seq, TCP_SKB_CB(skb)->seq)) { | 1351 | if (before(*seq, TCP_SKB_CB(skb)->seq)) { |
1352 | printk(KERN_INFO "recvmsg bug: copied %X " | 1352 | printk(KERN_INFO "recvmsg bug: copied %X " |
1353 | "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); | 1353 | "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); |
1354 | break; | 1354 | break; |
1355 | } | 1355 | } |
1356 | offset = *seq - TCP_SKB_CB(skb)->seq; | 1356 | offset = *seq - TCP_SKB_CB(skb)->seq; |
1357 | if (tcp_hdr(skb)->syn) | 1357 | if (tcp_hdr(skb)->syn) |
1358 | offset--; | 1358 | offset--; |
1359 | if (offset < skb->len) | 1359 | if (offset < skb->len) |
1360 | goto found_ok_skb; | 1360 | goto found_ok_skb; |
1361 | if (tcp_hdr(skb)->fin) | 1361 | if (tcp_hdr(skb)->fin) |
1362 | goto found_fin_ok; | 1362 | goto found_fin_ok; |
1363 | WARN_ON(!(flags & MSG_PEEK)); | 1363 | WARN_ON(!(flags & MSG_PEEK)); |
1364 | skb = skb->next; | 1364 | skb = skb->next; |
1365 | } while (skb != (struct sk_buff *)&sk->sk_receive_queue); | 1365 | } while (skb != (struct sk_buff *)&sk->sk_receive_queue); |
1366 | 1366 | ||
1367 | /* Well, if we have backlog, try to process it now yet. */ | 1367 | /* Well, if we have backlog, try to process it now yet. */ |
1368 | 1368 | ||
1369 | if (copied >= target && !sk->sk_backlog.tail) | 1369 | if (copied >= target && !sk->sk_backlog.tail) |
1370 | break; | 1370 | break; |
1371 | 1371 | ||
1372 | if (copied) { | 1372 | if (copied) { |
1373 | if (sk->sk_err || | 1373 | if (sk->sk_err || |
1374 | sk->sk_state == TCP_CLOSE || | 1374 | sk->sk_state == TCP_CLOSE || |
1375 | (sk->sk_shutdown & RCV_SHUTDOWN) || | 1375 | (sk->sk_shutdown & RCV_SHUTDOWN) || |
1376 | !timeo || | 1376 | !timeo || |
1377 | signal_pending(current) || | 1377 | signal_pending(current) || |
1378 | (flags & MSG_PEEK)) | 1378 | (flags & MSG_PEEK)) |
1379 | break; | 1379 | break; |
1380 | } else { | 1380 | } else { |
1381 | if (sock_flag(sk, SOCK_DONE)) | 1381 | if (sock_flag(sk, SOCK_DONE)) |
1382 | break; | 1382 | break; |
1383 | 1383 | ||
1384 | if (sk->sk_err) { | 1384 | if (sk->sk_err) { |
1385 | copied = sock_error(sk); | 1385 | copied = sock_error(sk); |
1386 | break; | 1386 | break; |
1387 | } | 1387 | } |
1388 | 1388 | ||
1389 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 1389 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
1390 | break; | 1390 | break; |
1391 | 1391 | ||
1392 | if (sk->sk_state == TCP_CLOSE) { | 1392 | if (sk->sk_state == TCP_CLOSE) { |
1393 | if (!sock_flag(sk, SOCK_DONE)) { | 1393 | if (!sock_flag(sk, SOCK_DONE)) { |
1394 | /* This occurs when user tries to read | 1394 | /* This occurs when user tries to read |
1395 | * from never connected socket. | 1395 | * from never connected socket. |
1396 | */ | 1396 | */ |
1397 | copied = -ENOTCONN; | 1397 | copied = -ENOTCONN; |
1398 | break; | 1398 | break; |
1399 | } | 1399 | } |
1400 | break; | 1400 | break; |
1401 | } | 1401 | } |
1402 | 1402 | ||
1403 | if (!timeo) { | 1403 | if (!timeo) { |
1404 | copied = -EAGAIN; | 1404 | copied = -EAGAIN; |
1405 | break; | 1405 | break; |
1406 | } | 1406 | } |
1407 | 1407 | ||
1408 | if (signal_pending(current)) { | 1408 | if (signal_pending(current)) { |
1409 | copied = sock_intr_errno(timeo); | 1409 | copied = sock_intr_errno(timeo); |
1410 | break; | 1410 | break; |
1411 | } | 1411 | } |
1412 | } | 1412 | } |
1413 | 1413 | ||
1414 | tcp_cleanup_rbuf(sk, copied); | 1414 | tcp_cleanup_rbuf(sk, copied); |
1415 | 1415 | ||
1416 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { | 1416 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { |
1417 | /* Install new reader */ | 1417 | /* Install new reader */ |
1418 | if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { | 1418 | if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { |
1419 | user_recv = current; | 1419 | user_recv = current; |
1420 | tp->ucopy.task = user_recv; | 1420 | tp->ucopy.task = user_recv; |
1421 | tp->ucopy.iov = msg->msg_iov; | 1421 | tp->ucopy.iov = msg->msg_iov; |
1422 | } | 1422 | } |
1423 | 1423 | ||
1424 | tp->ucopy.len = len; | 1424 | tp->ucopy.len = len; |
1425 | 1425 | ||
1426 | WARN_ON(tp->copied_seq != tp->rcv_nxt && | 1426 | WARN_ON(tp->copied_seq != tp->rcv_nxt && |
1427 | !(flags & (MSG_PEEK | MSG_TRUNC))); | 1427 | !(flags & (MSG_PEEK | MSG_TRUNC))); |
1428 | 1428 | ||
1429 | /* Ugly... If prequeue is not empty, we have to | 1429 | /* Ugly... If prequeue is not empty, we have to |
1430 | * process it before releasing socket, otherwise | 1430 | * process it before releasing socket, otherwise |
1431 | * order will be broken at second iteration. | 1431 | * order will be broken at second iteration. |
1432 | * More elegant solution is required!!! | 1432 | * More elegant solution is required!!! |
1433 | * | 1433 | * |
1434 | * Look: we have the following (pseudo)queues: | 1434 | * Look: we have the following (pseudo)queues: |
1435 | * | 1435 | * |
1436 | * 1. packets in flight | 1436 | * 1. packets in flight |
1437 | * 2. backlog | 1437 | * 2. backlog |
1438 | * 3. prequeue | 1438 | * 3. prequeue |
1439 | * 4. receive_queue | 1439 | * 4. receive_queue |
1440 | * | 1440 | * |
1441 | * Each queue can be processed only if the next ones | 1441 | * Each queue can be processed only if the next ones |
1442 | * are empty. At this point we have empty receive_queue. | 1442 | * are empty. At this point we have empty receive_queue. |
1443 | * But prequeue _can_ be not empty after 2nd iteration, | 1443 | * But prequeue _can_ be not empty after 2nd iteration, |
1444 | * when we jumped to start of loop because backlog | 1444 | * when we jumped to start of loop because backlog |
1445 | * processing added something to receive_queue. | 1445 | * processing added something to receive_queue. |
1446 | * We cannot release_sock(), because backlog contains | 1446 | * We cannot release_sock(), because backlog contains |
1447 | * packets arrived _after_ prequeued ones. | 1447 | * packets arrived _after_ prequeued ones. |
1448 | * | 1448 | * |
1449 | * Shortly, algorithm is clear --- to process all | 1449 | * Shortly, algorithm is clear --- to process all |
1450 | * the queues in order. We could make it more directly, | 1450 | * the queues in order. We could make it more directly, |
1451 | * requeueing packets from backlog to prequeue, if | 1451 | * requeueing packets from backlog to prequeue, if |
1452 | * is not empty. It is more elegant, but eats cycles, | 1452 | * is not empty. It is more elegant, but eats cycles, |
1453 | * unfortunately. | 1453 | * unfortunately. |
1454 | */ | 1454 | */ |
1455 | if (!skb_queue_empty(&tp->ucopy.prequeue)) | 1455 | if (!skb_queue_empty(&tp->ucopy.prequeue)) |
1456 | goto do_prequeue; | 1456 | goto do_prequeue; |
1457 | 1457 | ||
1458 | /* __ Set realtime policy in scheduler __ */ | 1458 | /* __ Set realtime policy in scheduler __ */ |
1459 | } | 1459 | } |
1460 | 1460 | ||
1461 | if (copied >= target) { | 1461 | if (copied >= target) { |
1462 | /* Do not sleep, just process backlog. */ | 1462 | /* Do not sleep, just process backlog. */ |
1463 | release_sock(sk); | 1463 | release_sock(sk); |
1464 | lock_sock(sk); | 1464 | lock_sock(sk); |
1465 | } else | 1465 | } else |
1466 | sk_wait_data(sk, &timeo); | 1466 | sk_wait_data(sk, &timeo); |
1467 | 1467 | ||
1468 | #ifdef CONFIG_NET_DMA | 1468 | #ifdef CONFIG_NET_DMA |
1469 | tp->ucopy.wakeup = 0; | 1469 | tp->ucopy.wakeup = 0; |
1470 | #endif | 1470 | #endif |
1471 | 1471 | ||
1472 | if (user_recv) { | 1472 | if (user_recv) { |
1473 | int chunk; | 1473 | int chunk; |
1474 | 1474 | ||
1475 | /* __ Restore normal policy in scheduler __ */ | 1475 | /* __ Restore normal policy in scheduler __ */ |
1476 | 1476 | ||
1477 | if ((chunk = len - tp->ucopy.len) != 0) { | 1477 | if ((chunk = len - tp->ucopy.len) != 0) { |
1478 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); | 1478 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); |
1479 | len -= chunk; | 1479 | len -= chunk; |
1480 | copied += chunk; | 1480 | copied += chunk; |
1481 | } | 1481 | } |
1482 | 1482 | ||
1483 | if (tp->rcv_nxt == tp->copied_seq && | 1483 | if (tp->rcv_nxt == tp->copied_seq && |
1484 | !skb_queue_empty(&tp->ucopy.prequeue)) { | 1484 | !skb_queue_empty(&tp->ucopy.prequeue)) { |
1485 | do_prequeue: | 1485 | do_prequeue: |
1486 | tcp_prequeue_process(sk); | 1486 | tcp_prequeue_process(sk); |
1487 | 1487 | ||
1488 | if ((chunk = len - tp->ucopy.len) != 0) { | 1488 | if ((chunk = len - tp->ucopy.len) != 0) { |
1489 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); | 1489 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); |
1490 | len -= chunk; | 1490 | len -= chunk; |
1491 | copied += chunk; | 1491 | copied += chunk; |
1492 | } | 1492 | } |
1493 | } | 1493 | } |
1494 | } | 1494 | } |
1495 | if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { | 1495 | if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { |
1496 | if (net_ratelimit()) | 1496 | if (net_ratelimit()) |
1497 | printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", | 1497 | printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", |
1498 | current->comm, task_pid_nr(current)); | 1498 | current->comm, task_pid_nr(current)); |
1499 | peek_seq = tp->copied_seq; | 1499 | peek_seq = tp->copied_seq; |
1500 | } | 1500 | } |
1501 | continue; | 1501 | continue; |
1502 | 1502 | ||
1503 | found_ok_skb: | 1503 | found_ok_skb: |
1504 | /* Ok so how much can we use? */ | 1504 | /* Ok so how much can we use? */ |
1505 | used = skb->len - offset; | 1505 | used = skb->len - offset; |
1506 | if (len < used) | 1506 | if (len < used) |
1507 | used = len; | 1507 | used = len; |
1508 | 1508 | ||
1509 | /* Do we have urgent data here? */ | 1509 | /* Do we have urgent data here? */ |
1510 | if (tp->urg_data) { | 1510 | if (tp->urg_data) { |
1511 | u32 urg_offset = tp->urg_seq - *seq; | 1511 | u32 urg_offset = tp->urg_seq - *seq; |
1512 | if (urg_offset < used) { | 1512 | if (urg_offset < used) { |
1513 | if (!urg_offset) { | 1513 | if (!urg_offset) { |
1514 | if (!sock_flag(sk, SOCK_URGINLINE)) { | 1514 | if (!sock_flag(sk, SOCK_URGINLINE)) { |
1515 | ++*seq; | 1515 | ++*seq; |
1516 | offset++; | 1516 | offset++; |
1517 | used--; | 1517 | used--; |
1518 | if (!used) | 1518 | if (!used) |
1519 | goto skip_copy; | 1519 | goto skip_copy; |
1520 | } | 1520 | } |
1521 | } else | 1521 | } else |
1522 | used = urg_offset; | 1522 | used = urg_offset; |
1523 | } | 1523 | } |
1524 | } | 1524 | } |
1525 | 1525 | ||
1526 | if (!(flags & MSG_TRUNC)) { | 1526 | if (!(flags & MSG_TRUNC)) { |
1527 | #ifdef CONFIG_NET_DMA | 1527 | #ifdef CONFIG_NET_DMA |
1528 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | 1528 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
1529 | tp->ucopy.dma_chan = get_softnet_dma(); | 1529 | tp->ucopy.dma_chan = get_softnet_dma(); |
1530 | 1530 | ||
1531 | if (tp->ucopy.dma_chan) { | 1531 | if (tp->ucopy.dma_chan) { |
1532 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( | 1532 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( |
1533 | tp->ucopy.dma_chan, skb, offset, | 1533 | tp->ucopy.dma_chan, skb, offset, |
1534 | msg->msg_iov, used, | 1534 | msg->msg_iov, used, |
1535 | tp->ucopy.pinned_list); | 1535 | tp->ucopy.pinned_list); |
1536 | 1536 | ||
1537 | if (tp->ucopy.dma_cookie < 0) { | 1537 | if (tp->ucopy.dma_cookie < 0) { |
1538 | 1538 | ||
1539 | printk(KERN_ALERT "dma_cookie < 0\n"); | 1539 | printk(KERN_ALERT "dma_cookie < 0\n"); |
1540 | 1540 | ||
1541 | /* Exception. Bailout! */ | 1541 | /* Exception. Bailout! */ |
1542 | if (!copied) | 1542 | if (!copied) |
1543 | copied = -EFAULT; | 1543 | copied = -EFAULT; |
1544 | break; | 1544 | break; |
1545 | } | 1545 | } |
1546 | if ((offset + used) == skb->len) | 1546 | if ((offset + used) == skb->len) |
1547 | copied_early = 1; | 1547 | copied_early = 1; |
1548 | 1548 | ||
1549 | } else | 1549 | } else |
1550 | #endif | 1550 | #endif |
1551 | { | 1551 | { |
1552 | err = skb_copy_datagram_iovec(skb, offset, | 1552 | err = skb_copy_datagram_iovec(skb, offset, |
1553 | msg->msg_iov, used); | 1553 | msg->msg_iov, used); |
1554 | if (err) { | 1554 | if (err) { |
1555 | /* Exception. Bailout! */ | 1555 | /* Exception. Bailout! */ |
1556 | if (!copied) | 1556 | if (!copied) |
1557 | copied = -EFAULT; | 1557 | copied = -EFAULT; |
1558 | break; | 1558 | break; |
1559 | } | 1559 | } |
1560 | } | 1560 | } |
1561 | } | 1561 | } |
1562 | 1562 | ||
1563 | *seq += used; | 1563 | *seq += used; |
1564 | copied += used; | 1564 | copied += used; |
1565 | len -= used; | 1565 | len -= used; |
1566 | 1566 | ||
1567 | tcp_rcv_space_adjust(sk); | 1567 | tcp_rcv_space_adjust(sk); |
1568 | 1568 | ||
1569 | skip_copy: | 1569 | skip_copy: |
1570 | if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { | 1570 | if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { |
1571 | tp->urg_data = 0; | 1571 | tp->urg_data = 0; |
1572 | tcp_fast_path_check(sk); | 1572 | tcp_fast_path_check(sk); |
1573 | } | 1573 | } |
1574 | if (used + offset < skb->len) | 1574 | if (used + offset < skb->len) |
1575 | continue; | 1575 | continue; |
1576 | 1576 | ||
1577 | if (tcp_hdr(skb)->fin) | 1577 | if (tcp_hdr(skb)->fin) |
1578 | goto found_fin_ok; | 1578 | goto found_fin_ok; |
1579 | if (!(flags & MSG_PEEK)) { | 1579 | if (!(flags & MSG_PEEK)) { |
1580 | sk_eat_skb(sk, skb, copied_early); | 1580 | sk_eat_skb(sk, skb, copied_early); |
1581 | copied_early = 0; | 1581 | copied_early = 0; |
1582 | } | 1582 | } |
1583 | continue; | 1583 | continue; |
1584 | 1584 | ||
1585 | found_fin_ok: | 1585 | found_fin_ok: |
1586 | /* Process the FIN. */ | 1586 | /* Process the FIN. */ |
1587 | ++*seq; | 1587 | ++*seq; |
1588 | if (!(flags & MSG_PEEK)) { | 1588 | if (!(flags & MSG_PEEK)) { |
1589 | sk_eat_skb(sk, skb, copied_early); | 1589 | sk_eat_skb(sk, skb, copied_early); |
1590 | copied_early = 0; | 1590 | copied_early = 0; |
1591 | } | 1591 | } |
1592 | break; | 1592 | break; |
1593 | } while (len > 0); | 1593 | } while (len > 0); |
1594 | 1594 | ||
1595 | if (user_recv) { | 1595 | if (user_recv) { |
1596 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { | 1596 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { |
1597 | int chunk; | 1597 | int chunk; |
1598 | 1598 | ||
1599 | tp->ucopy.len = copied > 0 ? len : 0; | 1599 | tp->ucopy.len = copied > 0 ? len : 0; |
1600 | 1600 | ||
1601 | tcp_prequeue_process(sk); | 1601 | tcp_prequeue_process(sk); |
1602 | 1602 | ||
1603 | if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { | 1603 | if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { |
1604 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); | 1604 | NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); |
1605 | len -= chunk; | 1605 | len -= chunk; |
1606 | copied += chunk; | 1606 | copied += chunk; |
1607 | } | 1607 | } |
1608 | } | 1608 | } |
1609 | 1609 | ||
1610 | tp->ucopy.task = NULL; | 1610 | tp->ucopy.task = NULL; |
1611 | tp->ucopy.len = 0; | 1611 | tp->ucopy.len = 0; |
1612 | } | 1612 | } |
1613 | 1613 | ||
1614 | #ifdef CONFIG_NET_DMA | 1614 | #ifdef CONFIG_NET_DMA |
1615 | if (tp->ucopy.dma_chan) { | 1615 | if (tp->ucopy.dma_chan) { |
1616 | dma_cookie_t done, used; | 1616 | dma_cookie_t done, used; |
1617 | 1617 | ||
1618 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); | 1618 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); |
1619 | 1619 | ||
1620 | while (dma_async_memcpy_complete(tp->ucopy.dma_chan, | 1620 | while (dma_async_memcpy_complete(tp->ucopy.dma_chan, |
1621 | tp->ucopy.dma_cookie, &done, | 1621 | tp->ucopy.dma_cookie, &done, |
1622 | &used) == DMA_IN_PROGRESS) { | 1622 | &used) == DMA_IN_PROGRESS) { |
1623 | /* do partial cleanup of sk_async_wait_queue */ | 1623 | /* do partial cleanup of sk_async_wait_queue */ |
1624 | while ((skb = skb_peek(&sk->sk_async_wait_queue)) && | 1624 | while ((skb = skb_peek(&sk->sk_async_wait_queue)) && |
1625 | (dma_async_is_complete(skb->dma_cookie, done, | 1625 | (dma_async_is_complete(skb->dma_cookie, done, |
1626 | used) == DMA_SUCCESS)) { | 1626 | used) == DMA_SUCCESS)) { |
1627 | __skb_dequeue(&sk->sk_async_wait_queue); | 1627 | __skb_dequeue(&sk->sk_async_wait_queue); |
1628 | kfree_skb(skb); | 1628 | kfree_skb(skb); |
1629 | } | 1629 | } |
1630 | } | 1630 | } |
1631 | 1631 | ||
1632 | /* Safe to free early-copied skbs now */ | 1632 | /* Safe to free early-copied skbs now */ |
1633 | __skb_queue_purge(&sk->sk_async_wait_queue); | 1633 | __skb_queue_purge(&sk->sk_async_wait_queue); |
1634 | dma_chan_put(tp->ucopy.dma_chan); | 1634 | dma_chan_put(tp->ucopy.dma_chan); |
1635 | tp->ucopy.dma_chan = NULL; | 1635 | tp->ucopy.dma_chan = NULL; |
1636 | } | 1636 | } |
1637 | if (tp->ucopy.pinned_list) { | 1637 | if (tp->ucopy.pinned_list) { |
1638 | dma_unpin_iovec_pages(tp->ucopy.pinned_list); | 1638 | dma_unpin_iovec_pages(tp->ucopy.pinned_list); |
1639 | tp->ucopy.pinned_list = NULL; | 1639 | tp->ucopy.pinned_list = NULL; |
1640 | } | 1640 | } |
1641 | #endif | 1641 | #endif |
1642 | 1642 | ||
1643 | /* According to UNIX98, msg_name/msg_namelen are ignored | 1643 | /* According to UNIX98, msg_name/msg_namelen are ignored |
1644 | * on connected socket. I was just happy when found this 8) --ANK | 1644 | * on connected socket. I was just happy when found this 8) --ANK |
1645 | */ | 1645 | */ |
1646 | 1646 | ||
1647 | /* Clean up data we have read: This will do ACK frames. */ | 1647 | /* Clean up data we have read: This will do ACK frames. */ |
1648 | tcp_cleanup_rbuf(sk, copied); | 1648 | tcp_cleanup_rbuf(sk, copied); |
1649 | 1649 | ||
1650 | TCP_CHECK_TIMER(sk); | 1650 | TCP_CHECK_TIMER(sk); |
1651 | release_sock(sk); | 1651 | release_sock(sk); |
1652 | return copied; | 1652 | return copied; |
1653 | 1653 | ||
1654 | out: | 1654 | out: |
1655 | TCP_CHECK_TIMER(sk); | 1655 | TCP_CHECK_TIMER(sk); |
1656 | release_sock(sk); | 1656 | release_sock(sk); |
1657 | return err; | 1657 | return err; |
1658 | 1658 | ||
1659 | recv_urg: | 1659 | recv_urg: |
1660 | err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); | 1660 | err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); |
1661 | goto out; | 1661 | goto out; |
1662 | } | 1662 | } |
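As a hedged user-space illustration of the target/MSG_WAITALL handling above: passing MSG_WAITALL raises the read target to the full length, so tcp_recvmsg() keeps looping until that many bytes arrive, the connection ends, or an error occurs:

```c
#include <sys/socket.h>
#include <sys/types.h>

/* Block until exactly len bytes arrive (or the connection ends / errors). */
static ssize_t recv_exact(int sockfd, void *buf, size_t len)
{
	ssize_t n = recv(sockfd, buf, len, MSG_WAITALL);

	return (n == (ssize_t)len) ? n : -1;	/* short read: EOF, signal, error */
}
```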
1663 | 1663 | ||
1664 | void tcp_set_state(struct sock *sk, int state) | 1664 | void tcp_set_state(struct sock *sk, int state) |
1665 | { | 1665 | { |
1666 | int oldstate = sk->sk_state; | 1666 | int oldstate = sk->sk_state; |
1667 | 1667 | ||
1668 | switch (state) { | 1668 | switch (state) { |
1669 | case TCP_ESTABLISHED: | 1669 | case TCP_ESTABLISHED: |
1670 | if (oldstate != TCP_ESTABLISHED) | 1670 | if (oldstate != TCP_ESTABLISHED) |
1671 | TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); | 1671 | TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); |
1672 | break; | 1672 | break; |
1673 | 1673 | ||
1674 | case TCP_CLOSE: | 1674 | case TCP_CLOSE: |
1675 | if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) | 1675 | if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) |
1676 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); | 1676 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); |
1677 | 1677 | ||
1678 | sk->sk_prot->unhash(sk); | 1678 | sk->sk_prot->unhash(sk); |
1679 | if (inet_csk(sk)->icsk_bind_hash && | 1679 | if (inet_csk(sk)->icsk_bind_hash && |
1680 | !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) | 1680 | !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) |
1681 | inet_put_port(sk); | 1681 | inet_put_port(sk); |
1682 | /* fall through */ | 1682 | /* fall through */ |
1683 | default: | 1683 | default: |
1684 | if (oldstate==TCP_ESTABLISHED) | 1684 | if (oldstate == TCP_ESTABLISHED) |
1685 | TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); | 1685 | TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); |
1686 | } | 1686 | } |
1687 | 1687 | ||
1688 | /* Change state AFTER socket is unhashed to avoid closed | 1688 | /* Change state AFTER socket is unhashed to avoid closed |
1689 | * socket sitting in hash tables. | 1689 | * socket sitting in hash tables. |
1690 | */ | 1690 | */ |
1691 | sk->sk_state = state; | 1691 | sk->sk_state = state; |
1692 | 1692 | ||
1693 | #ifdef STATE_TRACE | 1693 | #ifdef STATE_TRACE |
1694 | SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); | 1694 | SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); |
1695 | #endif | 1695 | #endif |
1696 | } | 1696 | } |
1697 | EXPORT_SYMBOL_GPL(tcp_set_state); | 1697 | EXPORT_SYMBOL_GPL(tcp_set_state); |
1698 | 1698 | ||
1699 | /* | 1699 | /* |
1700 | * State processing on a close. This implements the state shift for | 1700 | * State processing on a close. This implements the state shift for |
1701 | * sending our FIN frame. Note that we only send a FIN for some | 1701 | * sending our FIN frame. Note that we only send a FIN for some |
1702 | * states. A shutdown() may have already sent the FIN, or we may be | 1702 | * states. A shutdown() may have already sent the FIN, or we may be |
1703 | * closed. | 1703 | * closed. |
1704 | */ | 1704 | */ |
1705 | 1705 | ||
1706 | static const unsigned char new_state[16] = { | 1706 | static const unsigned char new_state[16] = { |
1707 | /* current state: new state: action: */ | 1707 | /* current state: new state: action: */ |
1708 | /* (Invalid) */ TCP_CLOSE, | 1708 | /* (Invalid) */ TCP_CLOSE, |
1709 | /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, | 1709 | /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, |
1710 | /* TCP_SYN_SENT */ TCP_CLOSE, | 1710 | /* TCP_SYN_SENT */ TCP_CLOSE, |
1711 | /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, | 1711 | /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, |
1712 | /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, | 1712 | /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, |
1713 | /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, | 1713 | /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, |
1714 | /* TCP_TIME_WAIT */ TCP_CLOSE, | 1714 | /* TCP_TIME_WAIT */ TCP_CLOSE, |
1715 | /* TCP_CLOSE */ TCP_CLOSE, | 1715 | /* TCP_CLOSE */ TCP_CLOSE, |
1716 | /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, | 1716 | /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, |
1717 | /* TCP_LAST_ACK */ TCP_LAST_ACK, | 1717 | /* TCP_LAST_ACK */ TCP_LAST_ACK, |
1718 | /* TCP_LISTEN */ TCP_CLOSE, | 1718 | /* TCP_LISTEN */ TCP_CLOSE, |
1719 | /* TCP_CLOSING */ TCP_CLOSING, | 1719 | /* TCP_CLOSING */ TCP_CLOSING, |
1720 | }; | 1720 | }; |
1721 | 1721 | ||
1722 | static int tcp_close_state(struct sock *sk) | 1722 | static int tcp_close_state(struct sock *sk) |
1723 | { | 1723 | { |
1724 | int next = (int)new_state[sk->sk_state]; | 1724 | int next = (int)new_state[sk->sk_state]; |
1725 | int ns = next & TCP_STATE_MASK; | 1725 | int ns = next & TCP_STATE_MASK; |
1726 | 1726 | ||
1727 | tcp_set_state(sk, ns); | 1727 | tcp_set_state(sk, ns); |
1728 | 1728 | ||
1729 | return next & TCP_ACTION_FIN; | 1729 | return next & TCP_ACTION_FIN; |
1730 | } | 1730 | } |
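A standalone mock of this table-driven close (the state numbers and flag bit below are assumed stand-ins, not taken from the kernel headers): the low bits of each entry carry the next state and a single flag bit records whether a FIN still has to be sent:

```c
#include <stdio.h>

/* Local stand-ins mirroring the encoding used above (assumed values). */
enum { ST_ESTABLISHED = 1, ST_FIN_WAIT1 = 4, ST_CLOSE_WAIT = 8, ST_LAST_ACK = 9 };
#define ACTION_FIN	(1 << 7)
#define STATE_MASK	0x0f

static const unsigned char next_on_close[16] = {
	[ST_ESTABLISHED] = ST_FIN_WAIT1 | ACTION_FIN,
	[ST_CLOSE_WAIT]  = ST_LAST_ACK  | ACTION_FIN,
};

int main(void)
{
	unsigned char next = next_on_close[ST_ESTABLISHED];

	printf("next state %d, send FIN: %s\n",
	       next & STATE_MASK, (next & ACTION_FIN) ? "yes" : "no");
	return 0;
}
```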
1731 | 1731 | ||
1732 | /* | 1732 | /* |
1733 | * Shutdown the sending side of a connection. Much like close except | 1733 | * Shutdown the sending side of a connection. Much like close except |
1734 | * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). | 1734 | * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). |
1735 | */ | 1735 | */ |
1736 | 1736 | ||
1737 | void tcp_shutdown(struct sock *sk, int how) | 1737 | void tcp_shutdown(struct sock *sk, int how) |
1738 | { | 1738 | { |
1739 | /* We need to grab some memory, and put together a FIN, | 1739 | /* We need to grab some memory, and put together a FIN, |
1740 | * and then put it into the queue to be sent. | 1740 | * and then put it into the queue to be sent. |
1741 | * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. | 1741 | * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. |
1742 | */ | 1742 | */ |
1743 | if (!(how & SEND_SHUTDOWN)) | 1743 | if (!(how & SEND_SHUTDOWN)) |
1744 | return; | 1744 | return; |
1745 | 1745 | ||
1746 | /* If we've already sent a FIN, or it's a closed state, skip this. */ | 1746 | /* If we've already sent a FIN, or it's a closed state, skip this. */ |
1747 | if ((1 << sk->sk_state) & | 1747 | if ((1 << sk->sk_state) & |
1748 | (TCPF_ESTABLISHED | TCPF_SYN_SENT | | 1748 | (TCPF_ESTABLISHED | TCPF_SYN_SENT | |
1749 | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { | 1749 | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { |
1750 | /* Clear out any half completed packets. FIN if needed. */ | 1750 | /* Clear out any half completed packets. FIN if needed. */ |
1751 | if (tcp_close_state(sk)) | 1751 | if (tcp_close_state(sk)) |
1752 | tcp_send_fin(sk); | 1752 | tcp_send_fin(sk); |
1753 | } | 1753 | } |
1754 | } | 1754 | } |
1755 | 1755 | ||
1756 | void tcp_close(struct sock *sk, long timeout) | 1756 | void tcp_close(struct sock *sk, long timeout) |
1757 | { | 1757 | { |
1758 | struct sk_buff *skb; | 1758 | struct sk_buff *skb; |
1759 | int data_was_unread = 0; | 1759 | int data_was_unread = 0; |
1760 | int state; | 1760 | int state; |
1761 | 1761 | ||
1762 | lock_sock(sk); | 1762 | lock_sock(sk); |
1763 | sk->sk_shutdown = SHUTDOWN_MASK; | 1763 | sk->sk_shutdown = SHUTDOWN_MASK; |
1764 | 1764 | ||
1765 | if (sk->sk_state == TCP_LISTEN) { | 1765 | if (sk->sk_state == TCP_LISTEN) { |
1766 | tcp_set_state(sk, TCP_CLOSE); | 1766 | tcp_set_state(sk, TCP_CLOSE); |
1767 | 1767 | ||
1768 | /* Special case. */ | 1768 | /* Special case. */ |
1769 | inet_csk_listen_stop(sk); | 1769 | inet_csk_listen_stop(sk); |
1770 | 1770 | ||
1771 | goto adjudge_to_death; | 1771 | goto adjudge_to_death; |
1772 | } | 1772 | } |
1773 | 1773 | ||
1774 | /* We need to flush the recv. buffs. We do this only on the | 1774 | /* We need to flush the recv. buffs. We do this only on the |
1775 | * descriptor close, not protocol-sourced closes, because the | 1775 | * descriptor close, not protocol-sourced closes, because the |
1776 | * reader process may not have drained the data yet! | 1776 | * reader process may not have drained the data yet! |
1777 | */ | 1777 | */ |
1778 | while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { | 1778 | while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { |
1779 | u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - | 1779 | u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - |
1780 | tcp_hdr(skb)->fin; | 1780 | tcp_hdr(skb)->fin; |
1781 | data_was_unread += len; | 1781 | data_was_unread += len; |
1782 | __kfree_skb(skb); | 1782 | __kfree_skb(skb); |
1783 | } | 1783 | } |
1784 | 1784 | ||
1785 | sk_mem_reclaim(sk); | 1785 | sk_mem_reclaim(sk); |
1786 | 1786 | ||
1787 | /* As outlined in RFC 2525, section 2.17, we send a RST here because | 1787 | /* As outlined in RFC 2525, section 2.17, we send a RST here because |
1788 | * data was lost. To witness the awful effects of the old behavior of | 1788 | * data was lost. To witness the awful effects of the old behavior of |
1789 | * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk | 1789 | * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk |
1790 | * GET in an FTP client, suspend the process, wait for the client to | 1790 | * GET in an FTP client, suspend the process, wait for the client to |
1791 | * advertise a zero window, then kill -9 the FTP client, wheee... | 1791 | * advertise a zero window, then kill -9 the FTP client, wheee... |
1792 | * Note: timeout is always zero in such a case. | 1792 | * Note: timeout is always zero in such a case. |
1793 | */ | 1793 | */ |
1794 | if (data_was_unread) { | 1794 | if (data_was_unread) { |
1795 | /* Unread data was tossed, zap the connection. */ | 1795 | /* Unread data was tossed, zap the connection. */ |
1796 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); | 1796 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); |
1797 | tcp_set_state(sk, TCP_CLOSE); | 1797 | tcp_set_state(sk, TCP_CLOSE); |
1798 | tcp_send_active_reset(sk, GFP_KERNEL); | 1798 | tcp_send_active_reset(sk, GFP_KERNEL); |
1799 | } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { | 1799 | } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { |
1800 | /* Check zero linger _after_ checking for unread data. */ | 1800 | /* Check zero linger _after_ checking for unread data. */ |
1801 | sk->sk_prot->disconnect(sk, 0); | 1801 | sk->sk_prot->disconnect(sk, 0); |
1802 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); | 1802 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); |
1803 | } else if (tcp_close_state(sk)) { | 1803 | } else if (tcp_close_state(sk)) { |
1804 | /* We FIN if the application ate all the data before | 1804 | /* We FIN if the application ate all the data before |
1805 | * zapping the connection. | 1805 | * zapping the connection. |
1806 | */ | 1806 | */ |
1807 | 1807 | ||
1808 | /* RED-PEN. Formally speaking, we have broken TCP state | 1808 | /* RED-PEN. Formally speaking, we have broken TCP state |
1809 | * machine. State transitions: | 1809 | * machine. State transitions: |
1810 | * | 1810 | * |
1811 | * TCP_ESTABLISHED -> TCP_FIN_WAIT1 | 1811 | * TCP_ESTABLISHED -> TCP_FIN_WAIT1 |
1812 | * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) | 1812 | * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) |
1813 | * TCP_CLOSE_WAIT -> TCP_LAST_ACK | 1813 | * TCP_CLOSE_WAIT -> TCP_LAST_ACK |
1814 | * | 1814 | * |
1815 | * are legal only when FIN has been sent (i.e. in window), | 1815 | * are legal only when FIN has been sent (i.e. in window), |
1816 | * rather than queued out of window. Purists blame. | 1816 | * rather than queued out of window. Purists blame. |
1817 | * | 1817 | * |
1818 | * F.e. "RFC state" is ESTABLISHED, | 1818 | * F.e. "RFC state" is ESTABLISHED, |
1819 | * if Linux state is FIN-WAIT-1, but FIN is still not sent. | 1819 | * if Linux state is FIN-WAIT-1, but FIN is still not sent. |
1820 | * | 1820 | * |
1821 | * The visible deviations are that sometimes | 1821 | * The visible deviations are that sometimes |
1822 | * we enter time-wait state, when it is not required really | 1822 | * we enter time-wait state, when it is not required really |
1823 | * (harmless), do not send active resets, when they are | 1823 | * (harmless), do not send active resets, when they are |
1824 | * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when | 1824 | * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when |
1825 | * they look as CLOSING or LAST_ACK for Linux) | 1825 | * they look as CLOSING or LAST_ACK for Linux) |
1826 | * Probably, I missed some more holelets. | 1826 | * Probably, I missed some more holelets. |
1827 | * --ANK | 1827 | * --ANK |
1828 | */ | 1828 | */ |
1829 | tcp_send_fin(sk); | 1829 | tcp_send_fin(sk); |
1830 | } | 1830 | } |
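The branch above chooses between an orderly FIN and an abortive RST: unread data at close time forces the RST (per RFC 2525), and a zero linger time takes the disconnect path. From userspace the abortive close can be requested explicitly with a zero-timeout SO_LINGER; a minimal sketch, with error handling trimmed:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <unistd.h>

	int close_with_reset(int fd)
	{
		struct linger lg = { .l_onoff = 1, .l_linger = 0 };

		if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
			return -1;
		return close(fd);   /* abortive close: RST rather than FIN */
	}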
1831 | 1831 | ||
1832 | sk_stream_wait_close(sk, timeout); | 1832 | sk_stream_wait_close(sk, timeout); |
1833 | 1833 | ||
1834 | adjudge_to_death: | 1834 | adjudge_to_death: |
1835 | state = sk->sk_state; | 1835 | state = sk->sk_state; |
1836 | sock_hold(sk); | 1836 | sock_hold(sk); |
1837 | sock_orphan(sk); | 1837 | sock_orphan(sk); |
1838 | atomic_inc(sk->sk_prot->orphan_count); | 1838 | atomic_inc(sk->sk_prot->orphan_count); |
1839 | 1839 | ||
1840 | /* It is the last release_sock in its life. It will remove backlog. */ | 1840 | /* It is the last release_sock in its life. It will remove backlog. */ |
1841 | release_sock(sk); | 1841 | release_sock(sk); |
1842 | 1842 | ||
1843 | 1843 | ||
1844 | /* Now socket is owned by kernel and we acquire BH lock | 1844 | /* Now socket is owned by kernel and we acquire BH lock |
1845 | to finish close. No need to check for user refs. | 1845 | to finish close. No need to check for user refs. |
1846 | */ | 1846 | */ |
1847 | local_bh_disable(); | 1847 | local_bh_disable(); |
1848 | bh_lock_sock(sk); | 1848 | bh_lock_sock(sk); |
1849 | WARN_ON(sock_owned_by_user(sk)); | 1849 | WARN_ON(sock_owned_by_user(sk)); |
1850 | 1850 | ||
1851 | /* Have we already been destroyed by a softirq or backlog? */ | 1851 | /* Have we already been destroyed by a softirq or backlog? */ |
1852 | if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) | 1852 | if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) |
1853 | goto out; | 1853 | goto out; |
1854 | 1854 | ||
1855 | /* This is a (useful) BSD violation of the RFC. There is a | 1855 | /* This is a (useful) BSD violation of the RFC. There is a |
1856 | * problem with TCP as specified in that the other end could | 1856 | * problem with TCP as specified in that the other end could |
1857 | * keep a socket open forever with no application left this end. | 1857 | * keep a socket open forever with no application left this end. |
1858 | * We use a 3 minute timeout (about the same as BSD) then kill | 1858 | * We use a 3 minute timeout (about the same as BSD) then kill |
1859 | * our end. If they send after that then tough - BUT: long enough | 1859 | * our end. If they send after that then tough - BUT: long enough |
1860 | * that we won't make the old 4*rto = almost no time - whoops | 1860 | * that we won't make the old 4*rto = almost no time - whoops |
1861 | * reset mistake. | 1861 | * reset mistake. |
1862 | * | 1862 | * |
1863 | * Nope, it was not a mistake. It is really the desired behaviour | 1863 | * Nope, it was not a mistake. It is really the desired behaviour |
1864 | * f.e. on http servers, when such sockets are useless, but | 1864 | * f.e. on http servers, when such sockets are useless, but |
1865 | * consume significant resources. Let's do it with special | 1865 | * consume significant resources. Let's do it with special |
1866 | * linger2 option. --ANK | 1866 | * linger2 option. --ANK |
1867 | */ | 1867 | */ |
1868 | 1868 | ||
1869 | if (sk->sk_state == TCP_FIN_WAIT2) { | 1869 | if (sk->sk_state == TCP_FIN_WAIT2) { |
1870 | struct tcp_sock *tp = tcp_sk(sk); | 1870 | struct tcp_sock *tp = tcp_sk(sk); |
1871 | if (tp->linger2 < 0) { | 1871 | if (tp->linger2 < 0) { |
1872 | tcp_set_state(sk, TCP_CLOSE); | 1872 | tcp_set_state(sk, TCP_CLOSE); |
1873 | tcp_send_active_reset(sk, GFP_ATOMIC); | 1873 | tcp_send_active_reset(sk, GFP_ATOMIC); |
1874 | NET_INC_STATS_BH(sock_net(sk), | 1874 | NET_INC_STATS_BH(sock_net(sk), |
1875 | LINUX_MIB_TCPABORTONLINGER); | 1875 | LINUX_MIB_TCPABORTONLINGER); |
1876 | } else { | 1876 | } else { |
1877 | const int tmo = tcp_fin_time(sk); | 1877 | const int tmo = tcp_fin_time(sk); |
1878 | 1878 | ||
1879 | if (tmo > TCP_TIMEWAIT_LEN) { | 1879 | if (tmo > TCP_TIMEWAIT_LEN) { |
1880 | inet_csk_reset_keepalive_timer(sk, | 1880 | inet_csk_reset_keepalive_timer(sk, |
1881 | tmo - TCP_TIMEWAIT_LEN); | 1881 | tmo - TCP_TIMEWAIT_LEN); |
1882 | } else { | 1882 | } else { |
1883 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 1883 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
1884 | goto out; | 1884 | goto out; |
1885 | } | 1885 | } |
1886 | } | 1886 | } |
1887 | } | 1887 | } |
1888 | if (sk->sk_state != TCP_CLOSE) { | 1888 | if (sk->sk_state != TCP_CLOSE) { |
1889 | sk_mem_reclaim(sk); | 1889 | sk_mem_reclaim(sk); |
1890 | if (tcp_too_many_orphans(sk, | 1890 | if (tcp_too_many_orphans(sk, |
1891 | atomic_read(sk->sk_prot->orphan_count))) { | 1891 | atomic_read(sk->sk_prot->orphan_count))) { |
1892 | if (net_ratelimit()) | 1892 | if (net_ratelimit()) |
1893 | printk(KERN_INFO "TCP: too many orphaned " | 1893 | printk(KERN_INFO "TCP: too many orphaned " |
1894 | "sockets\n"); | 1894 | "sockets\n"); |
1895 | tcp_set_state(sk, TCP_CLOSE); | 1895 | tcp_set_state(sk, TCP_CLOSE); |
1896 | tcp_send_active_reset(sk, GFP_ATOMIC); | 1896 | tcp_send_active_reset(sk, GFP_ATOMIC); |
1897 | NET_INC_STATS_BH(sock_net(sk), | 1897 | NET_INC_STATS_BH(sock_net(sk), |
1898 | LINUX_MIB_TCPABORTONMEMORY); | 1898 | LINUX_MIB_TCPABORTONMEMORY); |
1899 | } | 1899 | } |
1900 | } | 1900 | } |
1901 | 1901 | ||
1902 | if (sk->sk_state == TCP_CLOSE) | 1902 | if (sk->sk_state == TCP_CLOSE) |
1903 | inet_csk_destroy_sock(sk); | 1903 | inet_csk_destroy_sock(sk); |
1904 | /* Otherwise, socket is reprieved until protocol close. */ | 1904 | /* Otherwise, socket is reprieved until protocol close. */ |
1905 | 1905 | ||
1906 | out: | 1906 | out: |
1907 | bh_unlock_sock(sk); | 1907 | bh_unlock_sock(sk); |
1908 | local_bh_enable(); | 1908 | local_bh_enable(); |
1909 | sock_put(sk); | 1909 | sock_put(sk); |
1910 | } | 1910 | } |
1911 | 1911 | ||
1912 | /* These states need RST on ABORT according to RFC793 */ | 1912 | /* These states need RST on ABORT according to RFC793 */ |
1913 | 1913 | ||
1914 | static inline int tcp_need_reset(int state) | 1914 | static inline int tcp_need_reset(int state) |
1915 | { | 1915 | { |
1916 | return (1 << state) & | 1916 | return (1 << state) & |
1917 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | | 1917 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | |
1918 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); | 1918 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); |
1919 | } | 1919 | } |
1920 | 1920 | ||
1921 | int tcp_disconnect(struct sock *sk, int flags) | 1921 | int tcp_disconnect(struct sock *sk, int flags) |
1922 | { | 1922 | { |
1923 | struct inet_sock *inet = inet_sk(sk); | 1923 | struct inet_sock *inet = inet_sk(sk); |
1924 | struct inet_connection_sock *icsk = inet_csk(sk); | 1924 | struct inet_connection_sock *icsk = inet_csk(sk); |
1925 | struct tcp_sock *tp = tcp_sk(sk); | 1925 | struct tcp_sock *tp = tcp_sk(sk); |
1926 | int err = 0; | 1926 | int err = 0; |
1927 | int old_state = sk->sk_state; | 1927 | int old_state = sk->sk_state; |
1928 | 1928 | ||
1929 | if (old_state != TCP_CLOSE) | 1929 | if (old_state != TCP_CLOSE) |
1930 | tcp_set_state(sk, TCP_CLOSE); | 1930 | tcp_set_state(sk, TCP_CLOSE); |
1931 | 1931 | ||
1932 | /* ABORT function of RFC793 */ | 1932 | /* ABORT function of RFC793 */ |
1933 | if (old_state == TCP_LISTEN) { | 1933 | if (old_state == TCP_LISTEN) { |
1934 | inet_csk_listen_stop(sk); | 1934 | inet_csk_listen_stop(sk); |
1935 | } else if (tcp_need_reset(old_state) || | 1935 | } else if (tcp_need_reset(old_state) || |
1936 | (tp->snd_nxt != tp->write_seq && | 1936 | (tp->snd_nxt != tp->write_seq && |
1937 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { | 1937 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { |
1938 | /* The last check adjusts for discrepancy of Linux wrt. RFC | 1938 | /* The last check adjusts for discrepancy of Linux wrt. RFC |
1939 | * states | 1939 | * states |
1940 | */ | 1940 | */ |
1941 | tcp_send_active_reset(sk, gfp_any()); | 1941 | tcp_send_active_reset(sk, gfp_any()); |
1942 | sk->sk_err = ECONNRESET; | 1942 | sk->sk_err = ECONNRESET; |
1943 | } else if (old_state == TCP_SYN_SENT) | 1943 | } else if (old_state == TCP_SYN_SENT) |
1944 | sk->sk_err = ECONNRESET; | 1944 | sk->sk_err = ECONNRESET; |
1945 | 1945 | ||
1946 | tcp_clear_xmit_timers(sk); | 1946 | tcp_clear_xmit_timers(sk); |
1947 | __skb_queue_purge(&sk->sk_receive_queue); | 1947 | __skb_queue_purge(&sk->sk_receive_queue); |
1948 | tcp_write_queue_purge(sk); | 1948 | tcp_write_queue_purge(sk); |
1949 | __skb_queue_purge(&tp->out_of_order_queue); | 1949 | __skb_queue_purge(&tp->out_of_order_queue); |
1950 | #ifdef CONFIG_NET_DMA | 1950 | #ifdef CONFIG_NET_DMA |
1951 | __skb_queue_purge(&sk->sk_async_wait_queue); | 1951 | __skb_queue_purge(&sk->sk_async_wait_queue); |
1952 | #endif | 1952 | #endif |
1953 | 1953 | ||
1954 | inet->dport = 0; | 1954 | inet->dport = 0; |
1955 | 1955 | ||
1956 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) | 1956 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) |
1957 | inet_reset_saddr(sk); | 1957 | inet_reset_saddr(sk); |
1958 | 1958 | ||
1959 | sk->sk_shutdown = 0; | 1959 | sk->sk_shutdown = 0; |
1960 | sock_reset_flag(sk, SOCK_DONE); | 1960 | sock_reset_flag(sk, SOCK_DONE); |
1961 | tp->srtt = 0; | 1961 | tp->srtt = 0; |
1962 | if ((tp->write_seq += tp->max_window + 2) == 0) | 1962 | if ((tp->write_seq += tp->max_window + 2) == 0) |
1963 | tp->write_seq = 1; | 1963 | tp->write_seq = 1; |
1964 | icsk->icsk_backoff = 0; | 1964 | icsk->icsk_backoff = 0; |
1965 | tp->snd_cwnd = 2; | 1965 | tp->snd_cwnd = 2; |
1966 | icsk->icsk_probes_out = 0; | 1966 | icsk->icsk_probes_out = 0; |
1967 | tp->packets_out = 0; | 1967 | tp->packets_out = 0; |
1968 | tp->snd_ssthresh = 0x7fffffff; | 1968 | tp->snd_ssthresh = 0x7fffffff; |
1969 | tp->snd_cwnd_cnt = 0; | 1969 | tp->snd_cwnd_cnt = 0; |
1970 | tp->bytes_acked = 0; | 1970 | tp->bytes_acked = 0; |
1971 | tcp_set_ca_state(sk, TCP_CA_Open); | 1971 | tcp_set_ca_state(sk, TCP_CA_Open); |
1972 | tcp_clear_retrans(tp); | 1972 | tcp_clear_retrans(tp); |
1973 | inet_csk_delack_init(sk); | 1973 | inet_csk_delack_init(sk); |
1974 | tcp_init_send_head(sk); | 1974 | tcp_init_send_head(sk); |
1975 | memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); | 1975 | memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); |
1976 | __sk_dst_reset(sk); | 1976 | __sk_dst_reset(sk); |
1977 | 1977 | ||
1978 | WARN_ON(inet->num && !icsk->icsk_bind_hash); | 1978 | WARN_ON(inet->num && !icsk->icsk_bind_hash); |
1979 | 1979 | ||
1980 | sk->sk_error_report(sk); | 1980 | sk->sk_error_report(sk); |
1981 | return err; | 1981 | return err; |
1982 | } | 1982 | } |
1983 | 1983 | ||
1984 | /* | 1984 | /* |
1985 | * Socket option code for TCP. | 1985 | * Socket option code for TCP. |
1986 | */ | 1986 | */ |
1987 | static int do_tcp_setsockopt(struct sock *sk, int level, | 1987 | static int do_tcp_setsockopt(struct sock *sk, int level, |
1988 | int optname, char __user *optval, int optlen) | 1988 | int optname, char __user *optval, int optlen) |
1989 | { | 1989 | { |
1990 | struct tcp_sock *tp = tcp_sk(sk); | 1990 | struct tcp_sock *tp = tcp_sk(sk); |
1991 | struct inet_connection_sock *icsk = inet_csk(sk); | 1991 | struct inet_connection_sock *icsk = inet_csk(sk); |
1992 | int val; | 1992 | int val; |
1993 | int err = 0; | 1993 | int err = 0; |
1994 | 1994 | ||
1995 | /* This is a string value; all the others are int's */ | 1995 | /* This is a string value; all the others are int's */ |
1996 | if (optname == TCP_CONGESTION) { | 1996 | if (optname == TCP_CONGESTION) { |
1997 | char name[TCP_CA_NAME_MAX]; | 1997 | char name[TCP_CA_NAME_MAX]; |
1998 | 1998 | ||
1999 | if (optlen < 1) | 1999 | if (optlen < 1) |
2000 | return -EINVAL; | 2000 | return -EINVAL; |
2001 | 2001 | ||
2002 | val = strncpy_from_user(name, optval, | 2002 | val = strncpy_from_user(name, optval, |
2003 | min(TCP_CA_NAME_MAX-1, optlen)); | 2003 | min(TCP_CA_NAME_MAX-1, optlen)); |
2004 | if (val < 0) | 2004 | if (val < 0) |
2005 | return -EFAULT; | 2005 | return -EFAULT; |
2006 | name[val] = 0; | 2006 | name[val] = 0; |
2007 | 2007 | ||
2008 | lock_sock(sk); | 2008 | lock_sock(sk); |
2009 | err = tcp_set_congestion_control(sk, name); | 2009 | err = tcp_set_congestion_control(sk, name); |
2010 | release_sock(sk); | 2010 | release_sock(sk); |
2011 | return err; | 2011 | return err; |
2012 | } | 2012 | } |
2013 | 2013 | ||
2014 | if (optlen < sizeof(int)) | 2014 | if (optlen < sizeof(int)) |
2015 | return -EINVAL; | 2015 | return -EINVAL; |
2016 | 2016 | ||
2017 | if (get_user(val, (int __user *)optval)) | 2017 | if (get_user(val, (int __user *)optval)) |
2018 | return -EFAULT; | 2018 | return -EFAULT; |
2019 | 2019 | ||
2020 | lock_sock(sk); | 2020 | lock_sock(sk); |
2021 | 2021 | ||
2022 | switch (optname) { | 2022 | switch (optname) { |
2023 | case TCP_MAXSEG: | 2023 | case TCP_MAXSEG: |
2024 | /* Values greater than interface MTU won't take effect. However | 2024 | /* Values greater than interface MTU won't take effect. However |
2025 | * at the point when this call is done we typically don't yet | 2025 | * at the point when this call is done we typically don't yet |
2026 | * know which interface is going to be used */ | 2026 | * know which interface is going to be used */ |
2027 | if (val < 8 || val > MAX_TCP_WINDOW) { | 2027 | if (val < 8 || val > MAX_TCP_WINDOW) { |
2028 | err = -EINVAL; | 2028 | err = -EINVAL; |
2029 | break; | 2029 | break; |
2030 | } | 2030 | } |
2031 | tp->rx_opt.user_mss = val; | 2031 | tp->rx_opt.user_mss = val; |
2032 | break; | 2032 | break; |
2033 | 2033 | ||
2034 | case TCP_NODELAY: | 2034 | case TCP_NODELAY: |
2035 | if (val) { | 2035 | if (val) { |
2036 | /* TCP_NODELAY is weaker than TCP_CORK, so that | 2036 | /* TCP_NODELAY is weaker than TCP_CORK, so that |
2037 | * this option on corked socket is remembered, but | 2037 | * this option on corked socket is remembered, but |
2038 | * it is not activated until cork is cleared. | 2038 | * it is not activated until cork is cleared. |
2039 | * | 2039 | * |
2040 | * However, when TCP_NODELAY is set we make | 2040 | * However, when TCP_NODELAY is set we make |
2041 | * an explicit push, which overrides even TCP_CORK | 2041 | * an explicit push, which overrides even TCP_CORK |
2042 | * for currently queued segments. | 2042 | * for currently queued segments. |
2043 | */ | 2043 | */ |
2044 | tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; | 2044 | tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; |
2045 | tcp_push_pending_frames(sk); | 2045 | tcp_push_pending_frames(sk); |
2046 | } else { | 2046 | } else { |
2047 | tp->nonagle &= ~TCP_NAGLE_OFF; | 2047 | tp->nonagle &= ~TCP_NAGLE_OFF; |
2048 | } | 2048 | } |
2049 | break; | 2049 | break; |
2050 | 2050 | ||
2051 | case TCP_CORK: | 2051 | case TCP_CORK: |
2052 | /* When set indicates to always queue non-full frames. | 2052 | /* When set indicates to always queue non-full frames. |
2053 | * Later the user clears this option and we transmit | 2053 | * Later the user clears this option and we transmit |
2054 | * any pending partial frames in the queue. This is | 2054 | * any pending partial frames in the queue. This is |
2055 | * meant to be used alongside sendfile() to get properly | 2055 | * meant to be used alongside sendfile() to get properly |
2056 | * filled frames when the user (for example) must write | 2056 | * filled frames when the user (for example) must write |
2057 | * out headers with a write() call first and then use | 2057 | * out headers with a write() call first and then use |
2058 | * sendfile to send out the data parts. | 2058 | * sendfile to send out the data parts. |
2059 | * | 2059 | * |
2060 | * TCP_CORK can be set together with TCP_NODELAY and it is | 2060 | * TCP_CORK can be set together with TCP_NODELAY and it is |
2061 | * stronger than TCP_NODELAY. | 2061 | * stronger than TCP_NODELAY. |
2062 | */ | 2062 | */ |
2063 | if (val) { | 2063 | if (val) { |
2064 | tp->nonagle |= TCP_NAGLE_CORK; | 2064 | tp->nonagle |= TCP_NAGLE_CORK; |
2065 | } else { | 2065 | } else { |
2066 | tp->nonagle &= ~TCP_NAGLE_CORK; | 2066 | tp->nonagle &= ~TCP_NAGLE_CORK; |
2067 | if (tp->nonagle&TCP_NAGLE_OFF) | 2067 | if (tp->nonagle&TCP_NAGLE_OFF) |
2068 | tp->nonagle |= TCP_NAGLE_PUSH; | 2068 | tp->nonagle |= TCP_NAGLE_PUSH; |
2069 | tcp_push_pending_frames(sk); | 2069 | tcp_push_pending_frames(sk); |
2070 | } | 2070 | } |
2071 | break; | 2071 | break; |
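A userspace sketch of the header-then-payload pattern the TCP_CORK comment describes: cork the socket, write the header, hand the body to sendfile(), then uncork to flush the final partial frame. Error handling is omitted for brevity.

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/sendfile.h>
	#include <sys/socket.h>
	#include <unistd.h>

	void send_header_and_file(int sock, const char *hdr, size_t hdrlen,
				  int filefd, size_t filelen)
	{
		int on = 1, off = 0;

		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		write(sock, hdr, hdrlen);               /* queued, possibly a partial frame */
		sendfile(sock, filefd, NULL, filelen);  /* body follows in full frames */
		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));  /* flush */
	}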
2072 | 2072 | ||
2073 | case TCP_KEEPIDLE: | 2073 | case TCP_KEEPIDLE: |
2074 | if (val < 1 || val > MAX_TCP_KEEPIDLE) | 2074 | if (val < 1 || val > MAX_TCP_KEEPIDLE) |
2075 | err = -EINVAL; | 2075 | err = -EINVAL; |
2076 | else { | 2076 | else { |
2077 | tp->keepalive_time = val * HZ; | 2077 | tp->keepalive_time = val * HZ; |
2078 | if (sock_flag(sk, SOCK_KEEPOPEN) && | 2078 | if (sock_flag(sk, SOCK_KEEPOPEN) && |
2079 | !((1 << sk->sk_state) & | 2079 | !((1 << sk->sk_state) & |
2080 | (TCPF_CLOSE | TCPF_LISTEN))) { | 2080 | (TCPF_CLOSE | TCPF_LISTEN))) { |
2081 | __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; | 2081 | __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; |
2082 | if (tp->keepalive_time > elapsed) | 2082 | if (tp->keepalive_time > elapsed) |
2083 | elapsed = tp->keepalive_time - elapsed; | 2083 | elapsed = tp->keepalive_time - elapsed; |
2084 | else | 2084 | else |
2085 | elapsed = 0; | 2085 | elapsed = 0; |
2086 | inet_csk_reset_keepalive_timer(sk, elapsed); | 2086 | inet_csk_reset_keepalive_timer(sk, elapsed); |
2087 | } | 2087 | } |
2088 | } | 2088 | } |
2089 | break; | 2089 | break; |
2090 | case TCP_KEEPINTVL: | 2090 | case TCP_KEEPINTVL: |
2091 | if (val < 1 || val > MAX_TCP_KEEPINTVL) | 2091 | if (val < 1 || val > MAX_TCP_KEEPINTVL) |
2092 | err = -EINVAL; | 2092 | err = -EINVAL; |
2093 | else | 2093 | else |
2094 | tp->keepalive_intvl = val * HZ; | 2094 | tp->keepalive_intvl = val * HZ; |
2095 | break; | 2095 | break; |
2096 | case TCP_KEEPCNT: | 2096 | case TCP_KEEPCNT: |
2097 | if (val < 1 || val > MAX_TCP_KEEPCNT) | 2097 | if (val < 1 || val > MAX_TCP_KEEPCNT) |
2098 | err = -EINVAL; | 2098 | err = -EINVAL; |
2099 | else | 2099 | else |
2100 | tp->keepalive_probes = val; | 2100 | tp->keepalive_probes = val; |
2101 | break; | 2101 | break; |
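A userspace sketch mapping the three keepalive knobs handled above onto a connected socket: idle time before probing starts, interval between probes, and the number of unanswered probes before the connection is dropped. The numeric values are only examples.

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	int enable_keepalive(int sock)
	{
		int on = 1, idle = 60, intvl = 10, cnt = 5;

		if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0 ||
		    setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0 ||
		    setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0 ||
		    setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0)
			return -1;
		return 0;
	}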
2102 | case TCP_SYNCNT: | 2102 | case TCP_SYNCNT: |
2103 | if (val < 1 || val > MAX_TCP_SYNCNT) | 2103 | if (val < 1 || val > MAX_TCP_SYNCNT) |
2104 | err = -EINVAL; | 2104 | err = -EINVAL; |
2105 | else | 2105 | else |
2106 | icsk->icsk_syn_retries = val; | 2106 | icsk->icsk_syn_retries = val; |
2107 | break; | 2107 | break; |
2108 | 2108 | ||
2109 | case TCP_LINGER2: | 2109 | case TCP_LINGER2: |
2110 | if (val < 0) | 2110 | if (val < 0) |
2111 | tp->linger2 = -1; | 2111 | tp->linger2 = -1; |
2112 | else if (val > sysctl_tcp_fin_timeout / HZ) | 2112 | else if (val > sysctl_tcp_fin_timeout / HZ) |
2113 | tp->linger2 = 0; | 2113 | tp->linger2 = 0; |
2114 | else | 2114 | else |
2115 | tp->linger2 = val * HZ; | 2115 | tp->linger2 = val * HZ; |
2116 | break; | 2116 | break; |
2117 | 2117 | ||
2118 | case TCP_DEFER_ACCEPT: | 2118 | case TCP_DEFER_ACCEPT: |
2119 | icsk->icsk_accept_queue.rskq_defer_accept = 0; | 2119 | icsk->icsk_accept_queue.rskq_defer_accept = 0; |
2120 | if (val > 0) { | 2120 | if (val > 0) { |
2121 | /* Translate value in seconds to number of | 2121 | /* Translate value in seconds to number of |
2122 | * retransmits */ | 2122 | * retransmits */ |
2123 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && | 2123 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && |
2124 | val > ((TCP_TIMEOUT_INIT / HZ) << | 2124 | val > ((TCP_TIMEOUT_INIT / HZ) << |
2125 | icsk->icsk_accept_queue.rskq_defer_accept)) | 2125 | icsk->icsk_accept_queue.rskq_defer_accept)) |
2126 | icsk->icsk_accept_queue.rskq_defer_accept++; | 2126 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2127 | icsk->icsk_accept_queue.rskq_defer_accept++; | 2127 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2128 | } | 2128 | } |
2129 | break; | 2129 | break; |
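The TCP_DEFER_ACCEPT case above converts a user-supplied timeout in seconds into a retransmission count, since each SYN-ACK retransmit doubles the timeout. A standalone sketch of that translation, assuming an initial retransmission timeout of 3 seconds (the TCP_TIMEOUT_INIT / HZ of kernels of this era; illustrative here):

	#include <stdio.h>

	static int defer_accept_retrans(int seconds)
	{
		int init_timeout = 3;   /* assumed TCP_TIMEOUT_INIT / HZ */
		int retrans = 0;

		while (retrans < 32 && seconds > (init_timeout << retrans))
			retrans++;
		return retrans + 1;
	}

	int main(void)
	{
		printf("10s  -> %d retransmits\n", defer_accept_retrans(10));
		printf("120s -> %d retransmits\n", defer_accept_retrans(120));
		return 0;
	}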
2130 | 2130 | ||
2131 | case TCP_WINDOW_CLAMP: | 2131 | case TCP_WINDOW_CLAMP: |
2132 | if (!val) { | 2132 | if (!val) { |
2133 | if (sk->sk_state != TCP_CLOSE) { | 2133 | if (sk->sk_state != TCP_CLOSE) { |
2134 | err = -EINVAL; | 2134 | err = -EINVAL; |
2135 | break; | 2135 | break; |
2136 | } | 2136 | } |
2137 | tp->window_clamp = 0; | 2137 | tp->window_clamp = 0; |
2138 | } else | 2138 | } else |
2139 | tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? | 2139 | tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? |
2140 | SOCK_MIN_RCVBUF / 2 : val; | 2140 | SOCK_MIN_RCVBUF / 2 : val; |
2141 | break; | 2141 | break; |
2142 | 2142 | ||
2143 | case TCP_QUICKACK: | 2143 | case TCP_QUICKACK: |
2144 | if (!val) { | 2144 | if (!val) { |
2145 | icsk->icsk_ack.pingpong = 1; | 2145 | icsk->icsk_ack.pingpong = 1; |
2146 | } else { | 2146 | } else { |
2147 | icsk->icsk_ack.pingpong = 0; | 2147 | icsk->icsk_ack.pingpong = 0; |
2148 | if ((1 << sk->sk_state) & | 2148 | if ((1 << sk->sk_state) & |
2149 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && | 2149 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
2150 | inet_csk_ack_scheduled(sk)) { | 2150 | inet_csk_ack_scheduled(sk)) { |
2151 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; | 2151 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
2152 | tcp_cleanup_rbuf(sk, 1); | 2152 | tcp_cleanup_rbuf(sk, 1); |
2153 | if (!(val & 1)) | 2153 | if (!(val & 1)) |
2154 | icsk->icsk_ack.pingpong = 1; | 2154 | icsk->icsk_ack.pingpong = 1; |
2155 | } | 2155 | } |
2156 | } | 2156 | } |
2157 | break; | 2157 | break; |
2158 | 2158 | ||
2159 | #ifdef CONFIG_TCP_MD5SIG | 2159 | #ifdef CONFIG_TCP_MD5SIG |
2160 | case TCP_MD5SIG: | 2160 | case TCP_MD5SIG: |
2161 | /* Read the IP->Key mappings from userspace */ | 2161 | /* Read the IP->Key mappings from userspace */ |
2162 | err = tp->af_specific->md5_parse(sk, optval, optlen); | 2162 | err = tp->af_specific->md5_parse(sk, optval, optlen); |
2163 | break; | 2163 | break; |
2164 | #endif | 2164 | #endif |
2165 | 2165 | ||
2166 | default: | 2166 | default: |
2167 | err = -ENOPROTOOPT; | 2167 | err = -ENOPROTOOPT; |
2168 | break; | 2168 | break; |
2169 | } | 2169 | } |
2170 | 2170 | ||
2171 | release_sock(sk); | 2171 | release_sock(sk); |
2172 | return err; | 2172 | return err; |
2173 | } | 2173 | } |
2174 | 2174 | ||
2175 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | 2175 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, |
2176 | int optlen) | 2176 | int optlen) |
2177 | { | 2177 | { |
2178 | struct inet_connection_sock *icsk = inet_csk(sk); | 2178 | struct inet_connection_sock *icsk = inet_csk(sk); |
2179 | 2179 | ||
2180 | if (level != SOL_TCP) | 2180 | if (level != SOL_TCP) |
2181 | return icsk->icsk_af_ops->setsockopt(sk, level, optname, | 2181 | return icsk->icsk_af_ops->setsockopt(sk, level, optname, |
2182 | optval, optlen); | 2182 | optval, optlen); |
2183 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); | 2183 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); |
2184 | } | 2184 | } |
2185 | 2185 | ||
2186 | #ifdef CONFIG_COMPAT | 2186 | #ifdef CONFIG_COMPAT |
2187 | int compat_tcp_setsockopt(struct sock *sk, int level, int optname, | 2187 | int compat_tcp_setsockopt(struct sock *sk, int level, int optname, |
2188 | char __user *optval, int optlen) | 2188 | char __user *optval, int optlen) |
2189 | { | 2189 | { |
2190 | if (level != SOL_TCP) | 2190 | if (level != SOL_TCP) |
2191 | return inet_csk_compat_setsockopt(sk, level, optname, | 2191 | return inet_csk_compat_setsockopt(sk, level, optname, |
2192 | optval, optlen); | 2192 | optval, optlen); |
2193 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); | 2193 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); |
2194 | } | 2194 | } |
2195 | 2195 | ||
2196 | EXPORT_SYMBOL(compat_tcp_setsockopt); | 2196 | EXPORT_SYMBOL(compat_tcp_setsockopt); |
2197 | #endif | 2197 | #endif |
2198 | 2198 | ||
2199 | /* Return information about state of tcp endpoint in API format. */ | 2199 | /* Return information about state of tcp endpoint in API format. */ |
2200 | void tcp_get_info(struct sock *sk, struct tcp_info *info) | 2200 | void tcp_get_info(struct sock *sk, struct tcp_info *info) |
2201 | { | 2201 | { |
2202 | struct tcp_sock *tp = tcp_sk(sk); | 2202 | struct tcp_sock *tp = tcp_sk(sk); |
2203 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2203 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2204 | u32 now = tcp_time_stamp; | 2204 | u32 now = tcp_time_stamp; |
2205 | 2205 | ||
2206 | memset(info, 0, sizeof(*info)); | 2206 | memset(info, 0, sizeof(*info)); |
2207 | 2207 | ||
2208 | info->tcpi_state = sk->sk_state; | 2208 | info->tcpi_state = sk->sk_state; |
2209 | info->tcpi_ca_state = icsk->icsk_ca_state; | 2209 | info->tcpi_ca_state = icsk->icsk_ca_state; |
2210 | info->tcpi_retransmits = icsk->icsk_retransmits; | 2210 | info->tcpi_retransmits = icsk->icsk_retransmits; |
2211 | info->tcpi_probes = icsk->icsk_probes_out; | 2211 | info->tcpi_probes = icsk->icsk_probes_out; |
2212 | info->tcpi_backoff = icsk->icsk_backoff; | 2212 | info->tcpi_backoff = icsk->icsk_backoff; |
2213 | 2213 | ||
2214 | if (tp->rx_opt.tstamp_ok) | 2214 | if (tp->rx_opt.tstamp_ok) |
2215 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | 2215 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; |
2216 | if (tcp_is_sack(tp)) | 2216 | if (tcp_is_sack(tp)) |
2217 | info->tcpi_options |= TCPI_OPT_SACK; | 2217 | info->tcpi_options |= TCPI_OPT_SACK; |
2218 | if (tp->rx_opt.wscale_ok) { | 2218 | if (tp->rx_opt.wscale_ok) { |
2219 | info->tcpi_options |= TCPI_OPT_WSCALE; | 2219 | info->tcpi_options |= TCPI_OPT_WSCALE; |
2220 | info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; | 2220 | info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; |
2221 | info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; | 2221 | info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; |
2222 | } | 2222 | } |
2223 | 2223 | ||
2224 | if (tp->ecn_flags&TCP_ECN_OK) | 2224 | if (tp->ecn_flags&TCP_ECN_OK) |
2225 | info->tcpi_options |= TCPI_OPT_ECN; | 2225 | info->tcpi_options |= TCPI_OPT_ECN; |
2226 | 2226 | ||
2227 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); | 2227 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); |
2228 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); | 2228 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); |
2229 | info->tcpi_snd_mss = tp->mss_cache; | 2229 | info->tcpi_snd_mss = tp->mss_cache; |
2230 | info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; | 2230 | info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; |
2231 | 2231 | ||
2232 | if (sk->sk_state == TCP_LISTEN) { | 2232 | if (sk->sk_state == TCP_LISTEN) { |
2233 | info->tcpi_unacked = sk->sk_ack_backlog; | 2233 | info->tcpi_unacked = sk->sk_ack_backlog; |
2234 | info->tcpi_sacked = sk->sk_max_ack_backlog; | 2234 | info->tcpi_sacked = sk->sk_max_ack_backlog; |
2235 | } else { | 2235 | } else { |
2236 | info->tcpi_unacked = tp->packets_out; | 2236 | info->tcpi_unacked = tp->packets_out; |
2237 | info->tcpi_sacked = tp->sacked_out; | 2237 | info->tcpi_sacked = tp->sacked_out; |
2238 | } | 2238 | } |
2239 | info->tcpi_lost = tp->lost_out; | 2239 | info->tcpi_lost = tp->lost_out; |
2240 | info->tcpi_retrans = tp->retrans_out; | 2240 | info->tcpi_retrans = tp->retrans_out; |
2241 | info->tcpi_fackets = tp->fackets_out; | 2241 | info->tcpi_fackets = tp->fackets_out; |
2242 | 2242 | ||
2243 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); | 2243 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); |
2244 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); | 2244 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); |
2245 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); | 2245 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); |
2246 | 2246 | ||
2247 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; | 2247 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; |
2248 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; | 2248 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; |
2249 | info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; | 2249 | info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; |
2250 | info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; | 2250 | info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; |
2251 | info->tcpi_snd_ssthresh = tp->snd_ssthresh; | 2251 | info->tcpi_snd_ssthresh = tp->snd_ssthresh; |
2252 | info->tcpi_snd_cwnd = tp->snd_cwnd; | 2252 | info->tcpi_snd_cwnd = tp->snd_cwnd; |
2253 | info->tcpi_advmss = tp->advmss; | 2253 | info->tcpi_advmss = tp->advmss; |
2254 | info->tcpi_reordering = tp->reordering; | 2254 | info->tcpi_reordering = tp->reordering; |
2255 | 2255 | ||
2256 | info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; | 2256 | info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; |
2257 | info->tcpi_rcv_space = tp->rcvq_space.space; | 2257 | info->tcpi_rcv_space = tp->rcvq_space.space; |
2258 | 2258 | ||
2259 | info->tcpi_total_retrans = tp->total_retrans; | 2259 | info->tcpi_total_retrans = tp->total_retrans; |
2260 | } | 2260 | } |
2261 | 2261 | ||
2262 | EXPORT_SYMBOL_GPL(tcp_get_info); | 2262 | EXPORT_SYMBOL_GPL(tcp_get_info); |
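tcp_get_info() is what backs the TCP_INFO getsockopt handled further down; a userspace sketch of reading it, using the struct tcp_info that glibc exposes via <netinet/tcp.h> (error handling trimmed):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>
	#include <stdio.h>

	void dump_tcp_info(int sock)
	{
		struct tcp_info info;
		socklen_t len = sizeof(info);

		if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
			printf("rtt=%uus rttvar=%uus snd_cwnd=%u retrans=%u\n",
			       info.tcpi_rtt, info.tcpi_rttvar,
			       info.tcpi_snd_cwnd, info.tcpi_total_retrans);
	}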
2263 | 2263 | ||
2264 | static int do_tcp_getsockopt(struct sock *sk, int level, | 2264 | static int do_tcp_getsockopt(struct sock *sk, int level, |
2265 | int optname, char __user *optval, int __user *optlen) | 2265 | int optname, char __user *optval, int __user *optlen) |
2266 | { | 2266 | { |
2267 | struct inet_connection_sock *icsk = inet_csk(sk); | 2267 | struct inet_connection_sock *icsk = inet_csk(sk); |
2268 | struct tcp_sock *tp = tcp_sk(sk); | 2268 | struct tcp_sock *tp = tcp_sk(sk); |
2269 | int val, len; | 2269 | int val, len; |
2270 | 2270 | ||
2271 | if (get_user(len, optlen)) | 2271 | if (get_user(len, optlen)) |
2272 | return -EFAULT; | 2272 | return -EFAULT; |
2273 | 2273 | ||
2274 | len = min_t(unsigned int, len, sizeof(int)); | 2274 | len = min_t(unsigned int, len, sizeof(int)); |
2275 | 2275 | ||
2276 | if (len < 0) | 2276 | if (len < 0) |
2277 | return -EINVAL; | 2277 | return -EINVAL; |
2278 | 2278 | ||
2279 | switch (optname) { | 2279 | switch (optname) { |
2280 | case TCP_MAXSEG: | 2280 | case TCP_MAXSEG: |
2281 | val = tp->mss_cache; | 2281 | val = tp->mss_cache; |
2282 | if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) | 2282 | if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) |
2283 | val = tp->rx_opt.user_mss; | 2283 | val = tp->rx_opt.user_mss; |
2284 | break; | 2284 | break; |
2285 | case TCP_NODELAY: | 2285 | case TCP_NODELAY: |
2286 | val = !!(tp->nonagle&TCP_NAGLE_OFF); | 2286 | val = !!(tp->nonagle&TCP_NAGLE_OFF); |
2287 | break; | 2287 | break; |
2288 | case TCP_CORK: | 2288 | case TCP_CORK: |
2289 | val = !!(tp->nonagle&TCP_NAGLE_CORK); | 2289 | val = !!(tp->nonagle&TCP_NAGLE_CORK); |
2290 | break; | 2290 | break; |
2291 | case TCP_KEEPIDLE: | 2291 | case TCP_KEEPIDLE: |
2292 | val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; | 2292 | val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; |
2293 | break; | 2293 | break; |
2294 | case TCP_KEEPINTVL: | 2294 | case TCP_KEEPINTVL: |
2295 | val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; | 2295 | val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; |
2296 | break; | 2296 | break; |
2297 | case TCP_KEEPCNT: | 2297 | case TCP_KEEPCNT: |
2298 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; | 2298 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; |
2299 | break; | 2299 | break; |
2300 | case TCP_SYNCNT: | 2300 | case TCP_SYNCNT: |
2301 | val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 2301 | val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
2302 | break; | 2302 | break; |
2303 | case TCP_LINGER2: | 2303 | case TCP_LINGER2: |
2304 | val = tp->linger2; | 2304 | val = tp->linger2; |
2305 | if (val >= 0) | 2305 | if (val >= 0) |
2306 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; | 2306 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; |
2307 | break; | 2307 | break; |
2308 | case TCP_DEFER_ACCEPT: | 2308 | case TCP_DEFER_ACCEPT: |
2309 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : | 2309 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : |
2310 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); | 2310 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); |
2311 | break; | 2311 | break; |
2312 | case TCP_WINDOW_CLAMP: | 2312 | case TCP_WINDOW_CLAMP: |
2313 | val = tp->window_clamp; | 2313 | val = tp->window_clamp; |
2314 | break; | 2314 | break; |
2315 | case TCP_INFO: { | 2315 | case TCP_INFO: { |
2316 | struct tcp_info info; | 2316 | struct tcp_info info; |
2317 | 2317 | ||
2318 | if (get_user(len, optlen)) | 2318 | if (get_user(len, optlen)) |
2319 | return -EFAULT; | 2319 | return -EFAULT; |
2320 | 2320 | ||
2321 | tcp_get_info(sk, &info); | 2321 | tcp_get_info(sk, &info); |
2322 | 2322 | ||
2323 | len = min_t(unsigned int, len, sizeof(info)); | 2323 | len = min_t(unsigned int, len, sizeof(info)); |
2324 | if (put_user(len, optlen)) | 2324 | if (put_user(len, optlen)) |
2325 | return -EFAULT; | 2325 | return -EFAULT; |
2326 | if (copy_to_user(optval, &info, len)) | 2326 | if (copy_to_user(optval, &info, len)) |
2327 | return -EFAULT; | 2327 | return -EFAULT; |
2328 | return 0; | 2328 | return 0; |
2329 | } | 2329 | } |
2330 | case TCP_QUICKACK: | 2330 | case TCP_QUICKACK: |
2331 | val = !icsk->icsk_ack.pingpong; | 2331 | val = !icsk->icsk_ack.pingpong; |
2332 | break; | 2332 | break; |
2333 | 2333 | ||
2334 | case TCP_CONGESTION: | 2334 | case TCP_CONGESTION: |
2335 | if (get_user(len, optlen)) | 2335 | if (get_user(len, optlen)) |
2336 | return -EFAULT; | 2336 | return -EFAULT; |
2337 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); | 2337 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); |
2338 | if (put_user(len, optlen)) | 2338 | if (put_user(len, optlen)) |
2339 | return -EFAULT; | 2339 | return -EFAULT; |
2340 | if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) | 2340 | if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) |
2341 | return -EFAULT; | 2341 | return -EFAULT; |
2342 | return 0; | 2342 | return 0; |
2343 | default: | 2343 | default: |
2344 | return -ENOPROTOOPT; | 2344 | return -ENOPROTOOPT; |
2345 | } | 2345 | } |
2346 | 2346 | ||
2347 | if (put_user(len, optlen)) | 2347 | if (put_user(len, optlen)) |
2348 | return -EFAULT; | 2348 | return -EFAULT; |
2349 | if (copy_to_user(optval, &val, len)) | 2349 | if (copy_to_user(optval, &val, len)) |
2350 | return -EFAULT; | 2350 | return -EFAULT; |
2351 | return 0; | 2351 | return 0; |
2352 | } | 2352 | } |
2353 | 2353 | ||
2354 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | 2354 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, |
2355 | int __user *optlen) | 2355 | int __user *optlen) |
2356 | { | 2356 | { |
2357 | struct inet_connection_sock *icsk = inet_csk(sk); | 2357 | struct inet_connection_sock *icsk = inet_csk(sk); |
2358 | 2358 | ||
2359 | if (level != SOL_TCP) | 2359 | if (level != SOL_TCP) |
2360 | return icsk->icsk_af_ops->getsockopt(sk, level, optname, | 2360 | return icsk->icsk_af_ops->getsockopt(sk, level, optname, |
2361 | optval, optlen); | 2361 | optval, optlen); |
2362 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); | 2362 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); |
2363 | } | 2363 | } |
2364 | 2364 | ||
2365 | #ifdef CONFIG_COMPAT | 2365 | #ifdef CONFIG_COMPAT |
2366 | int compat_tcp_getsockopt(struct sock *sk, int level, int optname, | 2366 | int compat_tcp_getsockopt(struct sock *sk, int level, int optname, |
2367 | char __user *optval, int __user *optlen) | 2367 | char __user *optval, int __user *optlen) |
2368 | { | 2368 | { |
2369 | if (level != SOL_TCP) | 2369 | if (level != SOL_TCP) |
2370 | return inet_csk_compat_getsockopt(sk, level, optname, | 2370 | return inet_csk_compat_getsockopt(sk, level, optname, |
2371 | optval, optlen); | 2371 | optval, optlen); |
2372 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); | 2372 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); |
2373 | } | 2373 | } |
2374 | 2374 | ||
2375 | EXPORT_SYMBOL(compat_tcp_getsockopt); | 2375 | EXPORT_SYMBOL(compat_tcp_getsockopt); |
2376 | #endif | 2376 | #endif |
2377 | 2377 | ||
2378 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) | 2378 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) |
2379 | { | 2379 | { |
2380 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 2380 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
2381 | struct tcphdr *th; | 2381 | struct tcphdr *th; |
2382 | unsigned thlen; | 2382 | unsigned thlen; |
2383 | unsigned int seq; | 2383 | unsigned int seq; |
2384 | __be32 delta; | 2384 | __be32 delta; |
2385 | unsigned int oldlen; | 2385 | unsigned int oldlen; |
2386 | unsigned int len; | 2386 | unsigned int len; |
2387 | 2387 | ||
2388 | if (!pskb_may_pull(skb, sizeof(*th))) | 2388 | if (!pskb_may_pull(skb, sizeof(*th))) |
2389 | goto out; | 2389 | goto out; |
2390 | 2390 | ||
2391 | th = tcp_hdr(skb); | 2391 | th = tcp_hdr(skb); |
2392 | thlen = th->doff * 4; | 2392 | thlen = th->doff * 4; |
2393 | if (thlen < sizeof(*th)) | 2393 | if (thlen < sizeof(*th)) |
2394 | goto out; | 2394 | goto out; |
2395 | 2395 | ||
2396 | if (!pskb_may_pull(skb, thlen)) | 2396 | if (!pskb_may_pull(skb, thlen)) |
2397 | goto out; | 2397 | goto out; |
2398 | 2398 | ||
2399 | oldlen = (u16)~skb->len; | 2399 | oldlen = (u16)~skb->len; |
2400 | __skb_pull(skb, thlen); | 2400 | __skb_pull(skb, thlen); |
2401 | 2401 | ||
2402 | if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { | 2402 | if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { |
2403 | /* Packet is from an untrusted source, reset gso_segs. */ | 2403 | /* Packet is from an untrusted source, reset gso_segs. */ |
2404 | int type = skb_shinfo(skb)->gso_type; | 2404 | int type = skb_shinfo(skb)->gso_type; |
2405 | int mss; | 2405 | int mss; |
2406 | 2406 | ||
2407 | if (unlikely(type & | 2407 | if (unlikely(type & |
2408 | ~(SKB_GSO_TCPV4 | | 2408 | ~(SKB_GSO_TCPV4 | |
2409 | SKB_GSO_DODGY | | 2409 | SKB_GSO_DODGY | |
2410 | SKB_GSO_TCP_ECN | | 2410 | SKB_GSO_TCP_ECN | |
2411 | SKB_GSO_TCPV6 | | 2411 | SKB_GSO_TCPV6 | |
2412 | 0) || | 2412 | 0) || |
2413 | !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) | 2413 | !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) |
2414 | goto out; | 2414 | goto out; |
2415 | 2415 | ||
2416 | mss = skb_shinfo(skb)->gso_size; | 2416 | mss = skb_shinfo(skb)->gso_size; |
2417 | skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); | 2417 | skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); |
2418 | 2418 | ||
2419 | segs = NULL; | 2419 | segs = NULL; |
2420 | goto out; | 2420 | goto out; |
2421 | } | 2421 | } |
2422 | 2422 | ||
2423 | segs = skb_segment(skb, features); | 2423 | segs = skb_segment(skb, features); |
2424 | if (IS_ERR(segs)) | 2424 | if (IS_ERR(segs)) |
2425 | goto out; | 2425 | goto out; |
2426 | 2426 | ||
2427 | len = skb_shinfo(skb)->gso_size; | 2427 | len = skb_shinfo(skb)->gso_size; |
2428 | delta = htonl(oldlen + (thlen + len)); | 2428 | delta = htonl(oldlen + (thlen + len)); |
2429 | 2429 | ||
2430 | skb = segs; | 2430 | skb = segs; |
2431 | th = tcp_hdr(skb); | 2431 | th = tcp_hdr(skb); |
2432 | seq = ntohl(th->seq); | 2432 | seq = ntohl(th->seq); |
2433 | 2433 | ||
2434 | do { | 2434 | do { |
2435 | th->fin = th->psh = 0; | 2435 | th->fin = th->psh = 0; |
2436 | 2436 | ||
2437 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + | 2437 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + |
2438 | (__force u32)delta)); | 2438 | (__force u32)delta)); |
2439 | if (skb->ip_summed != CHECKSUM_PARTIAL) | 2439 | if (skb->ip_summed != CHECKSUM_PARTIAL) |
2440 | th->check = | 2440 | th->check = |
2441 | csum_fold(csum_partial(skb_transport_header(skb), | 2441 | csum_fold(csum_partial(skb_transport_header(skb), |
2442 | thlen, skb->csum)); | 2442 | thlen, skb->csum)); |
2443 | 2443 | ||
2444 | seq += len; | 2444 | seq += len; |
2445 | skb = skb->next; | 2445 | skb = skb->next; |
2446 | th = tcp_hdr(skb); | 2446 | th = tcp_hdr(skb); |
2447 | 2447 | ||
2448 | th->seq = htonl(seq); | 2448 | th->seq = htonl(seq); |
2449 | th->cwr = 0; | 2449 | th->cwr = 0; |
2450 | } while (skb->next); | 2450 | } while (skb->next); |
2451 | 2451 | ||
2452 | delta = htonl(oldlen + (skb->tail - skb->transport_header) + | 2452 | delta = htonl(oldlen + (skb->tail - skb->transport_header) + |
2453 | skb->data_len); | 2453 | skb->data_len); |
2454 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + | 2454 | th->check = ~csum_fold((__force __wsum)((__force u32)th->check + |
2455 | (__force u32)delta)); | 2455 | (__force u32)delta)); |
2456 | if (skb->ip_summed != CHECKSUM_PARTIAL) | 2456 | if (skb->ip_summed != CHECKSUM_PARTIAL) |
2457 | th->check = csum_fold(csum_partial(skb_transport_header(skb), | 2457 | th->check = csum_fold(csum_partial(skb_transport_header(skb), |
2458 | thlen, skb->csum)); | 2458 | thlen, skb->csum)); |
2459 | 2459 | ||
2460 | out: | 2460 | out: |
2461 | return segs; | 2461 | return segs; |
2462 | } | 2462 | } |
2463 | EXPORT_SYMBOL(tcp_tso_segment); | 2463 | EXPORT_SYMBOL(tcp_tso_segment); |
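tcp_tso_segment() never recomputes each segment's checksum over the payload; because the Internet checksum is one's-complement arithmetic, it folds in a delta that swaps the old covered length for the new one (the oldlen = (u16)~skb->len trick above). A standalone sketch of that fix-up; csum_fold16() and patch_len() are local helpers, not kernel APIs.

	#include <stdint.h>
	#include <stdio.h>

	static uint16_t csum_fold16(uint32_t sum)
	{
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)sum;
	}

	/* Patch a checksum so it covers new_len instead of old_len,
	 * without touching the rest of the covered data. */
	static uint16_t patch_len(uint16_t check, uint16_t old_len, uint16_t new_len)
	{
		uint32_t sum = (uint16_t)~check;   /* undo the final inversion */

		sum += (uint16_t)~old_len;         /* subtract the old length */
		sum += new_len;                    /* add the new length */
		return ~csum_fold16(sum);
	}

	int main(void)
	{
		/* toy demonstration: "checksum" over the length field alone */
		uint16_t old_len = 1400, new_len = 536;
		uint16_t check = ~csum_fold16(old_len);

		printf("patched: 0x%04x, recomputed: 0x%04x\n",
		       patch_len(check, old_len, new_len),
		       (uint16_t)~csum_fold16(new_len));
		return 0;
	}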
2464 | 2464 | ||
2465 | #ifdef CONFIG_TCP_MD5SIG | 2465 | #ifdef CONFIG_TCP_MD5SIG |
2466 | static unsigned long tcp_md5sig_users; | 2466 | static unsigned long tcp_md5sig_users; |
2467 | static struct tcp_md5sig_pool **tcp_md5sig_pool; | 2467 | static struct tcp_md5sig_pool **tcp_md5sig_pool; |
2468 | static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); | 2468 | static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); |
2469 | 2469 | ||
2470 | static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) | 2470 | static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) |
2471 | { | 2471 | { |
2472 | int cpu; | 2472 | int cpu; |
2473 | for_each_possible_cpu(cpu) { | 2473 | for_each_possible_cpu(cpu) { |
2474 | struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); | 2474 | struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); |
2475 | if (p) { | 2475 | if (p) { |
2476 | if (p->md5_desc.tfm) | 2476 | if (p->md5_desc.tfm) |
2477 | crypto_free_hash(p->md5_desc.tfm); | 2477 | crypto_free_hash(p->md5_desc.tfm); |
2478 | kfree(p); | 2478 | kfree(p); |
2479 | p = NULL; | 2479 | p = NULL; |
2480 | } | 2480 | } |
2481 | } | 2481 | } |
2482 | free_percpu(pool); | 2482 | free_percpu(pool); |
2483 | } | 2483 | } |
2484 | 2484 | ||
2485 | void tcp_free_md5sig_pool(void) | 2485 | void tcp_free_md5sig_pool(void) |
2486 | { | 2486 | { |
2487 | struct tcp_md5sig_pool **pool = NULL; | 2487 | struct tcp_md5sig_pool **pool = NULL; |
2488 | 2488 | ||
2489 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2489 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2490 | if (--tcp_md5sig_users == 0) { | 2490 | if (--tcp_md5sig_users == 0) { |
2491 | pool = tcp_md5sig_pool; | 2491 | pool = tcp_md5sig_pool; |
2492 | tcp_md5sig_pool = NULL; | 2492 | tcp_md5sig_pool = NULL; |
2493 | } | 2493 | } |
2494 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2494 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2495 | if (pool) | 2495 | if (pool) |
2496 | __tcp_free_md5sig_pool(pool); | 2496 | __tcp_free_md5sig_pool(pool); |
2497 | } | 2497 | } |
2498 | 2498 | ||
2499 | EXPORT_SYMBOL(tcp_free_md5sig_pool); | 2499 | EXPORT_SYMBOL(tcp_free_md5sig_pool); |
2500 | 2500 | ||
2501 | static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) | 2501 | static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) |
2502 | { | 2502 | { |
2503 | int cpu; | 2503 | int cpu; |
2504 | struct tcp_md5sig_pool **pool; | 2504 | struct tcp_md5sig_pool **pool; |
2505 | 2505 | ||
2506 | pool = alloc_percpu(struct tcp_md5sig_pool *); | 2506 | pool = alloc_percpu(struct tcp_md5sig_pool *); |
2507 | if (!pool) | 2507 | if (!pool) |
2508 | return NULL; | 2508 | return NULL; |
2509 | 2509 | ||
2510 | for_each_possible_cpu(cpu) { | 2510 | for_each_possible_cpu(cpu) { |
2511 | struct tcp_md5sig_pool *p; | 2511 | struct tcp_md5sig_pool *p; |
2512 | struct crypto_hash *hash; | 2512 | struct crypto_hash *hash; |
2513 | 2513 | ||
2514 | p = kzalloc(sizeof(*p), GFP_KERNEL); | 2514 | p = kzalloc(sizeof(*p), GFP_KERNEL); |
2515 | if (!p) | 2515 | if (!p) |
2516 | goto out_free; | 2516 | goto out_free; |
2517 | *per_cpu_ptr(pool, cpu) = p; | 2517 | *per_cpu_ptr(pool, cpu) = p; |
2518 | 2518 | ||
2519 | hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); | 2519 | hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); |
2520 | if (!hash || IS_ERR(hash)) | 2520 | if (!hash || IS_ERR(hash)) |
2521 | goto out_free; | 2521 | goto out_free; |
2522 | 2522 | ||
2523 | p->md5_desc.tfm = hash; | 2523 | p->md5_desc.tfm = hash; |
2524 | } | 2524 | } |
2525 | return pool; | 2525 | return pool; |
2526 | out_free: | 2526 | out_free: |
2527 | __tcp_free_md5sig_pool(pool); | 2527 | __tcp_free_md5sig_pool(pool); |
2528 | return NULL; | 2528 | return NULL; |
2529 | } | 2529 | } |
2530 | 2530 | ||
2531 | struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) | 2531 | struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) |
2532 | { | 2532 | { |
2533 | struct tcp_md5sig_pool **pool; | 2533 | struct tcp_md5sig_pool **pool; |
2534 | int alloc = 0; | 2534 | int alloc = 0; |
2535 | 2535 | ||
2536 | retry: | 2536 | retry: |
2537 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2537 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2538 | pool = tcp_md5sig_pool; | 2538 | pool = tcp_md5sig_pool; |
2539 | if (tcp_md5sig_users++ == 0) { | 2539 | if (tcp_md5sig_users++ == 0) { |
2540 | alloc = 1; | 2540 | alloc = 1; |
2541 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2541 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2542 | } else if (!pool) { | 2542 | } else if (!pool) { |
2543 | tcp_md5sig_users--; | 2543 | tcp_md5sig_users--; |
2544 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2544 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2545 | cpu_relax(); | 2545 | cpu_relax(); |
2546 | goto retry; | 2546 | goto retry; |
2547 | } else | 2547 | } else |
2548 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2548 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2549 | 2549 | ||
2550 | if (alloc) { | 2550 | if (alloc) { |
2551 | /* we cannot hold spinlock here because this may sleep. */ | 2551 | /* we cannot hold spinlock here because this may sleep. */ |
2552 | struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); | 2552 | struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); |
2553 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2553 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2554 | if (!p) { | 2554 | if (!p) { |
2555 | tcp_md5sig_users--; | 2555 | tcp_md5sig_users--; |
2556 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2556 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2557 | return NULL; | 2557 | return NULL; |
2558 | } | 2558 | } |
2559 | pool = tcp_md5sig_pool; | 2559 | pool = tcp_md5sig_pool; |
2560 | if (pool) { | 2560 | if (pool) { |
2561 | /* oops, it has already been assigned. */ | 2561 | /* oops, it has already been assigned. */ |
2562 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2562 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2563 | __tcp_free_md5sig_pool(p); | 2563 | __tcp_free_md5sig_pool(p); |
2564 | } else { | 2564 | } else { |
2565 | tcp_md5sig_pool = pool = p; | 2565 | tcp_md5sig_pool = pool = p; |
2566 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2566 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2567 | } | 2567 | } |
2568 | } | 2568 | } |
2569 | return pool; | 2569 | return pool; |
2570 | } | 2570 | } |
2571 | 2571 | ||
2572 | EXPORT_SYMBOL(tcp_alloc_md5sig_pool); | 2572 | EXPORT_SYMBOL(tcp_alloc_md5sig_pool); |
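tcp_alloc_md5sig_pool() has to drop the spinlock around the allocation because it may sleep, and must therefore cope with another CPU installing the pool in the meantime. A simplified pthread sketch of that shape (it omits the kernel's spin-and-retry for a concurrent first allocator); all names here are local.

	#include <pthread.h>
	#include <stdlib.h>

	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
	static void *shared_pool;
	static unsigned long pool_users;

	void *get_shared_pool(size_t size)
	{
		void *p;

		pthread_mutex_lock(&pool_lock);
		pool_users++;
		if (shared_pool) {                 /* fast path: already allocated */
			p = shared_pool;
			pthread_mutex_unlock(&pool_lock);
			return p;
		}
		pthread_mutex_unlock(&pool_lock);

		p = malloc(size);                  /* may sleep; done without the lock */

		pthread_mutex_lock(&pool_lock);
		if (!p) {
			pool_users--;              /* give back our reference on failure */
		} else if (shared_pool) {
			free(p);                   /* lost the race; use the winner's pool */
			p = shared_pool;
		} else {
			shared_pool = p;
		}
		pthread_mutex_unlock(&pool_lock);
		return p;
	}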
2573 | 2573 | ||
2574 | struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) | 2574 | struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) |
2575 | { | 2575 | { |
2576 | struct tcp_md5sig_pool **p; | 2576 | struct tcp_md5sig_pool **p; |
2577 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2577 | spin_lock_bh(&tcp_md5sig_pool_lock); |
2578 | p = tcp_md5sig_pool; | 2578 | p = tcp_md5sig_pool; |
2579 | if (p) | 2579 | if (p) |
2580 | tcp_md5sig_users++; | 2580 | tcp_md5sig_users++; |
2581 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2581 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
2582 | return (p ? *per_cpu_ptr(p, cpu) : NULL); | 2582 | return (p ? *per_cpu_ptr(p, cpu) : NULL); |
2583 | } | 2583 | } |
2584 | 2584 | ||
2585 | EXPORT_SYMBOL(__tcp_get_md5sig_pool); | 2585 | EXPORT_SYMBOL(__tcp_get_md5sig_pool); |
2586 | 2586 | ||
2587 | void __tcp_put_md5sig_pool(void) | 2587 | void __tcp_put_md5sig_pool(void) |
2588 | { | 2588 | { |
2589 | tcp_free_md5sig_pool(); | 2589 | tcp_free_md5sig_pool(); |
2590 | } | 2590 | } |
2591 | 2591 | ||
2592 | EXPORT_SYMBOL(__tcp_put_md5sig_pool); | 2592 | EXPORT_SYMBOL(__tcp_put_md5sig_pool); |
2593 | 2593 | ||
2594 | int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, | 2594 | int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, |
2595 | struct tcphdr *th) | 2595 | struct tcphdr *th) |
2596 | { | 2596 | { |
2597 | struct scatterlist sg; | 2597 | struct scatterlist sg; |
2598 | int err; | 2598 | int err; |
2599 | 2599 | ||
2600 | __sum16 old_checksum = th->check; | 2600 | __sum16 old_checksum = th->check; |
2601 | th->check = 0; | 2601 | th->check = 0; |
2602 | /* options aren't included in the hash */ | 2602 | /* options aren't included in the hash */ |
2603 | sg_init_one(&sg, th, sizeof(struct tcphdr)); | 2603 | sg_init_one(&sg, th, sizeof(struct tcphdr)); |
2604 | err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); | 2604 | err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); |
2605 | th->check = old_checksum; | 2605 | th->check = old_checksum; |
2606 | return err; | 2606 | return err; |
2607 | } | 2607 | } |
2608 | 2608 | ||
2609 | EXPORT_SYMBOL(tcp_md5_hash_header); | 2609 | EXPORT_SYMBOL(tcp_md5_hash_header); |
2610 | 2610 | ||
2611 | int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, | 2611 | int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, |
2612 | struct sk_buff *skb, unsigned header_len) | 2612 | struct sk_buff *skb, unsigned header_len) |
2613 | { | 2613 | { |
2614 | struct scatterlist sg; | 2614 | struct scatterlist sg; |
2615 | const struct tcphdr *tp = tcp_hdr(skb); | 2615 | const struct tcphdr *tp = tcp_hdr(skb); |
2616 | struct hash_desc *desc = &hp->md5_desc; | 2616 | struct hash_desc *desc = &hp->md5_desc; |
2617 | unsigned i; | 2617 | unsigned i; |
2618 | const unsigned head_data_len = skb_headlen(skb) > header_len ? | 2618 | const unsigned head_data_len = skb_headlen(skb) > header_len ? |
2619 | skb_headlen(skb) - header_len : 0; | 2619 | skb_headlen(skb) - header_len : 0; |
2620 | const struct skb_shared_info *shi = skb_shinfo(skb); | 2620 | const struct skb_shared_info *shi = skb_shinfo(skb); |
2621 | 2621 | ||
2622 | sg_init_table(&sg, 1); | 2622 | sg_init_table(&sg, 1); |
2623 | 2623 | ||
2624 | sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); | 2624 | sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); |
2625 | if (crypto_hash_update(desc, &sg, head_data_len)) | 2625 | if (crypto_hash_update(desc, &sg, head_data_len)) |
2626 | return 1; | 2626 | return 1; |
2627 | 2627 | ||
2628 | for (i = 0; i < shi->nr_frags; ++i) { | 2628 | for (i = 0; i < shi->nr_frags; ++i) { |
2629 | const struct skb_frag_struct *f = &shi->frags[i]; | 2629 | const struct skb_frag_struct *f = &shi->frags[i]; |
2630 | sg_set_page(&sg, f->page, f->size, f->page_offset); | 2630 | sg_set_page(&sg, f->page, f->size, f->page_offset); |
2631 | if (crypto_hash_update(desc, &sg, f->size)) | 2631 | if (crypto_hash_update(desc, &sg, f->size)) |
2632 | return 1; | 2632 | return 1; |
2633 | } | 2633 | } |
2634 | 2634 | ||
2635 | return 0; | 2635 | return 0; |
2636 | } | 2636 | } |
2637 | 2637 | ||
2638 | EXPORT_SYMBOL(tcp_md5_hash_skb_data); | 2638 | EXPORT_SYMBOL(tcp_md5_hash_skb_data); |
2639 | 2639 | ||
2640 | int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) | 2640 | int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) |
2641 | { | 2641 | { |
2642 | struct scatterlist sg; | 2642 | struct scatterlist sg; |
2643 | 2643 | ||
2644 | sg_init_one(&sg, key->key, key->keylen); | 2644 | sg_init_one(&sg, key->key, key->keylen); |
2645 | return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); | 2645 | return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); |
2646 | } | 2646 | } |
2647 | 2647 | ||
2648 | EXPORT_SYMBOL(tcp_md5_hash_key); | 2648 | EXPORT_SYMBOL(tcp_md5_hash_key); |
2649 | 2649 | ||
2650 | #endif | 2650 | #endif |
2651 | 2651 | ||
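The block above implements reference-counted, lazily allocated per-CPU MD5 pools: tcp_alloc_md5sig_pool() bumps tcp_md5sig_users under tcp_md5sig_pool_lock, drops the lock to do the sleeping allocation, then re-takes it and either installs the new pool or frees it if another caller won the race. A minimal userspace sketch of that idiom follows; the names are hypothetical and a pthread mutex stands in for the bh-disabling spinlock, so it illustrates the pattern rather than the kernel code itself.

/* Sketch of the "refcount under the lock, allocate outside it,
 * install-or-discard on re-lock" idiom used by tcp_alloc_md5sig_pool(). */
#include <pthread.h>
#include <stdlib.h>

struct pool { int dummy; };

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pool *global_pool;	/* plays the role of tcp_md5sig_pool */
static int pool_users;			/* plays the role of tcp_md5sig_users */

static struct pool *get_pool(void)
{
	struct pool *p = NULL;
	int need_alloc;

	pthread_mutex_lock(&pool_lock);
	pool_users++;
	need_alloc = (global_pool == NULL);	/* first user allocates */
	pthread_mutex_unlock(&pool_lock);

	if (!need_alloc)
		return global_pool;

	/* May sleep, so it must happen without the lock held. */
	p = calloc(1, sizeof(*p));

	pthread_mutex_lock(&pool_lock);
	if (!p) {
		pool_users--;			/* hand the reference back */
		pthread_mutex_unlock(&pool_lock);
		return NULL;
	}
	if (global_pool)
		free(p);			/* lost the race, discard ours */
	else
		global_pool = p;
	p = global_pool;
	pthread_mutex_unlock(&pool_lock);
	return p;
}

int main(void)
{
	return get_pool() ? 0 : 1;
}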
2652 | void tcp_done(struct sock *sk) | 2652 | void tcp_done(struct sock *sk) |
2653 | { | 2653 | { |
2654 | if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) | 2654 | if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) |
2655 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); | 2655 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
2656 | 2656 | ||
2657 | tcp_set_state(sk, TCP_CLOSE); | 2657 | tcp_set_state(sk, TCP_CLOSE); |
2658 | tcp_clear_xmit_timers(sk); | 2658 | tcp_clear_xmit_timers(sk); |
2659 | 2659 | ||
2660 | sk->sk_shutdown = SHUTDOWN_MASK; | 2660 | sk->sk_shutdown = SHUTDOWN_MASK; |
2661 | 2661 | ||
2662 | if (!sock_flag(sk, SOCK_DEAD)) | 2662 | if (!sock_flag(sk, SOCK_DEAD)) |
2663 | sk->sk_state_change(sk); | 2663 | sk->sk_state_change(sk); |
2664 | else | 2664 | else |
2665 | inet_csk_destroy_sock(sk); | 2665 | inet_csk_destroy_sock(sk); |
2666 | } | 2666 | } |
2667 | EXPORT_SYMBOL_GPL(tcp_done); | 2667 | EXPORT_SYMBOL_GPL(tcp_done); |
2668 | 2668 | ||
2669 | extern struct tcp_congestion_ops tcp_reno; | 2669 | extern struct tcp_congestion_ops tcp_reno; |
2670 | 2670 | ||
2671 | static __initdata unsigned long thash_entries; | 2671 | static __initdata unsigned long thash_entries; |
2672 | static int __init set_thash_entries(char *str) | 2672 | static int __init set_thash_entries(char *str) |
2673 | { | 2673 | { |
2674 | if (!str) | 2674 | if (!str) |
2675 | return 0; | 2675 | return 0; |
2676 | thash_entries = simple_strtoul(str, &str, 0); | 2676 | thash_entries = simple_strtoul(str, &str, 0); |
2677 | return 1; | 2677 | return 1; |
2678 | } | 2678 | } |
2679 | __setup("thash_entries=", set_thash_entries); | 2679 | __setup("thash_entries=", set_thash_entries); |
2680 | 2680 | ||
2681 | void __init tcp_init(void) | 2681 | void __init tcp_init(void) |
2682 | { | 2682 | { |
2683 | struct sk_buff *skb = NULL; | 2683 | struct sk_buff *skb = NULL; |
2684 | unsigned long nr_pages, limit; | 2684 | unsigned long nr_pages, limit; |
2685 | int order, i, max_share; | 2685 | int order, i, max_share; |
2686 | 2686 | ||
2687 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); | 2687 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); |
2688 | 2688 | ||
2689 | tcp_hashinfo.bind_bucket_cachep = | 2689 | tcp_hashinfo.bind_bucket_cachep = |
2690 | kmem_cache_create("tcp_bind_bucket", | 2690 | kmem_cache_create("tcp_bind_bucket", |
2691 | sizeof(struct inet_bind_bucket), 0, | 2691 | sizeof(struct inet_bind_bucket), 0, |
2692 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 2692 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
2693 | 2693 | ||
2694 | /* Size and allocate the main established and bind bucket | 2694 | /* Size and allocate the main established and bind bucket |
2695 | * hash tables. | 2695 | * hash tables. |
2696 | * | 2696 | * |
2697 | * The methodology is similar to that of the buffer cache. | 2697 | * The methodology is similar to that of the buffer cache. |
2698 | */ | 2698 | */ |
2699 | tcp_hashinfo.ehash = | 2699 | tcp_hashinfo.ehash = |
2700 | alloc_large_system_hash("TCP established", | 2700 | alloc_large_system_hash("TCP established", |
2701 | sizeof(struct inet_ehash_bucket), | 2701 | sizeof(struct inet_ehash_bucket), |
2702 | thash_entries, | 2702 | thash_entries, |
2703 | (num_physpages >= 128 * 1024) ? | 2703 | (num_physpages >= 128 * 1024) ? |
2704 | 13 : 15, | 2704 | 13 : 15, |
2705 | 0, | 2705 | 0, |
2706 | &tcp_hashinfo.ehash_size, | 2706 | &tcp_hashinfo.ehash_size, |
2707 | NULL, | 2707 | NULL, |
2708 | thash_entries ? 0 : 512 * 1024); | 2708 | thash_entries ? 0 : 512 * 1024); |
2709 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; | 2709 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; |
2710 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { | 2710 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { |
2711 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); | 2711 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); |
2712 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); | 2712 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); |
2713 | } | 2713 | } |
2714 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) | 2714 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) |
2715 | panic("TCP: failed to alloc ehash_locks"); | 2715 | panic("TCP: failed to alloc ehash_locks"); |
2716 | tcp_hashinfo.bhash = | 2716 | tcp_hashinfo.bhash = |
2717 | alloc_large_system_hash("TCP bind", | 2717 | alloc_large_system_hash("TCP bind", |
2718 | sizeof(struct inet_bind_hashbucket), | 2718 | sizeof(struct inet_bind_hashbucket), |
2719 | tcp_hashinfo.ehash_size, | 2719 | tcp_hashinfo.ehash_size, |
2720 | (num_physpages >= 128 * 1024) ? | 2720 | (num_physpages >= 128 * 1024) ? |
2721 | 13 : 15, | 2721 | 13 : 15, |
2722 | 0, | 2722 | 0, |
2723 | &tcp_hashinfo.bhash_size, | 2723 | &tcp_hashinfo.bhash_size, |
2724 | NULL, | 2724 | NULL, |
2725 | 64 * 1024); | 2725 | 64 * 1024); |
2726 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; | 2726 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; |
2727 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { | 2727 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { |
2728 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); | 2728 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); |
2729 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); | 2729 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); |
2730 | } | 2730 | } |
2731 | 2731 | ||
2732 | /* Try to be a bit smarter and adjust defaults depending | 2732 | /* Try to be a bit smarter and adjust defaults depending |
2733 | * on available memory. | 2733 | * on available memory. |
2734 | */ | 2734 | */ |
2735 | for (order = 0; ((1 << order) << PAGE_SHIFT) < | 2735 | for (order = 0; ((1 << order) << PAGE_SHIFT) < |
2736 | (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); | 2736 | (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); |
2737 | order++) | 2737 | order++) |
2738 | ; | 2738 | ; |
2739 | if (order >= 4) { | 2739 | if (order >= 4) { |
2740 | tcp_death_row.sysctl_max_tw_buckets = 180000; | 2740 | tcp_death_row.sysctl_max_tw_buckets = 180000; |
2741 | sysctl_tcp_max_orphans = 4096 << (order - 4); | 2741 | sysctl_tcp_max_orphans = 4096 << (order - 4); |
2742 | sysctl_max_syn_backlog = 1024; | 2742 | sysctl_max_syn_backlog = 1024; |
2743 | } else if (order < 3) { | 2743 | } else if (order < 3) { |
2744 | tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); | 2744 | tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); |
2745 | sysctl_tcp_max_orphans >>= (3 - order); | 2745 | sysctl_tcp_max_orphans >>= (3 - order); |
2746 | sysctl_max_syn_backlog = 128; | 2746 | sysctl_max_syn_backlog = 128; |
2747 | } | 2747 | } |
2748 | 2748 | ||
2749 | /* Set the pressure threshold to be a fraction of global memory that | 2749 | /* Set the pressure threshold to be a fraction of global memory that |
2750 | * is up to 1/2 at 256 MB, decreasing toward zero with the amount of | 2750 | * is up to 1/2 at 256 MB, decreasing toward zero with the amount of |
2751 | * memory, with a floor of 128 pages. | 2751 | * memory, with a floor of 128 pages. |
2752 | */ | 2752 | */ |
2753 | nr_pages = totalram_pages - totalhigh_pages; | 2753 | nr_pages = totalram_pages - totalhigh_pages; |
2754 | limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); | 2754 | limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); |
2755 | limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); | 2755 | limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); |
2756 | limit = max(limit, 128UL); | 2756 | limit = max(limit, 128UL); |
2757 | sysctl_tcp_mem[0] = limit / 4 * 3; | 2757 | sysctl_tcp_mem[0] = limit / 4 * 3; |
2758 | sysctl_tcp_mem[1] = limit; | 2758 | sysctl_tcp_mem[1] = limit; |
2759 | sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; | 2759 | sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; |
2760 | 2760 | ||
2761 | /* Set per-socket limits to no more than 1/128 the pressure threshold */ | 2761 | /* Set per-socket limits to no more than 1/128 the pressure threshold */ |
2762 | limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); | 2762 | limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); |
2763 | max_share = min(4UL*1024*1024, limit); | 2763 | max_share = min(4UL*1024*1024, limit); |
2764 | 2764 | ||
2765 | sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; | 2765 | sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; |
2766 | sysctl_tcp_wmem[1] = 16*1024; | 2766 | sysctl_tcp_wmem[1] = 16*1024; |
2767 | sysctl_tcp_wmem[2] = max(64*1024, max_share); | 2767 | sysctl_tcp_wmem[2] = max(64*1024, max_share); |
2768 | 2768 | ||
2769 | sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; | 2769 | sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; |
2770 | sysctl_tcp_rmem[1] = 87380; | 2770 | sysctl_tcp_rmem[1] = 87380; |
2771 | sysctl_tcp_rmem[2] = max(87380, max_share); | 2771 | sysctl_tcp_rmem[2] = max(87380, max_share); |
2772 | 2772 | ||
2773 | printk(KERN_INFO "TCP: Hash tables configured " | 2773 | printk(KERN_INFO "TCP: Hash tables configured " |
2774 | "(established %d bind %d)\n", | 2774 | "(established %d bind %d)\n", |
2775 | tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); | 2775 | tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); |
2776 | 2776 | ||
2777 | tcp_register_congestion_control(&tcp_reno); | 2777 | tcp_register_congestion_control(&tcp_reno); |
2778 | } | 2778 | } |
2779 | 2779 | ||
2780 | EXPORT_SYMBOL(tcp_close); | 2780 | EXPORT_SYMBOL(tcp_close); |
2781 | EXPORT_SYMBOL(tcp_disconnect); | 2781 | EXPORT_SYMBOL(tcp_disconnect); |
2782 | EXPORT_SYMBOL(tcp_getsockopt); | 2782 | EXPORT_SYMBOL(tcp_getsockopt); |
2783 | EXPORT_SYMBOL(tcp_ioctl); | 2783 | EXPORT_SYMBOL(tcp_ioctl); |
2784 | EXPORT_SYMBOL(tcp_poll); | 2784 | EXPORT_SYMBOL(tcp_poll); |
2785 | EXPORT_SYMBOL(tcp_read_sock); | 2785 | EXPORT_SYMBOL(tcp_read_sock); |
2786 | EXPORT_SYMBOL(tcp_recvmsg); | 2786 | EXPORT_SYMBOL(tcp_recvmsg); |
2787 | EXPORT_SYMBOL(tcp_sendmsg); | 2787 | EXPORT_SYMBOL(tcp_sendmsg); |
2788 | EXPORT_SYMBOL(tcp_splice_read); | 2788 | EXPORT_SYMBOL(tcp_splice_read); |
2789 | EXPORT_SYMBOL(tcp_sendpage); | 2789 | EXPORT_SYMBOL(tcp_sendpage); |
2790 | EXPORT_SYMBOL(tcp_setsockopt); | 2790 | EXPORT_SYMBOL(tcp_setsockopt); |
2791 | EXPORT_SYMBOL(tcp_shutdown); | 2791 | EXPORT_SYMBOL(tcp_shutdown); |
2792 | 2792 |
net/ipv4/tcp_minisocks.c
1 | /* | 1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket | 3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. | 4 | * interface as the means of communication with the user level. |
5 | * | 5 | * |
6 | * Implementation of the Transmission Control Protocol(TCP). | 6 | * Implementation of the Transmission Control Protocol(TCP). |
7 | * | 7 | * |
8 | * Authors: Ross Biro | 8 | * Authors: Ross Biro |
9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> | 10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> |
11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> | 11 | * Corey Minyard <wf-rch!minyard@relay.EU.net> |
12 | * Florian La Roche, <flla@stud.uni-sb.de> | 12 | * Florian La Roche, <flla@stud.uni-sb.de> |
13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> | 13 | * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> |
14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> | 14 | * Linus Torvalds, <torvalds@cs.helsinki.fi> |
15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> | 15 | * Alan Cox, <gw4pts@gw4pts.ampr.org> |
16 | * Matthew Dillon, <dillon@apollo.west.oic.com> | 16 | * Matthew Dillon, <dillon@apollo.west.oic.com> |
17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | 17 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> |
18 | * Jorge Cwik, <jorge@laser.satlink.net> | 18 | * Jorge Cwik, <jorge@laser.satlink.net> |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/sysctl.h> | 23 | #include <linux/sysctl.h> |
24 | #include <linux/workqueue.h> | 24 | #include <linux/workqueue.h> |
25 | #include <net/tcp.h> | 25 | #include <net/tcp.h> |
26 | #include <net/inet_common.h> | 26 | #include <net/inet_common.h> |
27 | #include <net/xfrm.h> | 27 | #include <net/xfrm.h> |
28 | 28 | ||
29 | #ifdef CONFIG_SYSCTL | 29 | #ifdef CONFIG_SYSCTL |
30 | #define SYNC_INIT 0 /* let the user enable it */ | 30 | #define SYNC_INIT 0 /* let the user enable it */ |
31 | #else | 31 | #else |
32 | #define SYNC_INIT 1 | 32 | #define SYNC_INIT 1 |
33 | #endif | 33 | #endif |
34 | 34 | ||
35 | int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; | 35 | int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; |
36 | EXPORT_SYMBOL(sysctl_tcp_syncookies); | 36 | EXPORT_SYMBOL(sysctl_tcp_syncookies); |
37 | 37 | ||
38 | int sysctl_tcp_abort_on_overflow __read_mostly; | 38 | int sysctl_tcp_abort_on_overflow __read_mostly; |
39 | 39 | ||
40 | struct inet_timewait_death_row tcp_death_row = { | 40 | struct inet_timewait_death_row tcp_death_row = { |
41 | .sysctl_max_tw_buckets = NR_FILE * 2, | 41 | .sysctl_max_tw_buckets = NR_FILE * 2, |
42 | .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, | 42 | .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, |
43 | .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), | 43 | .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), |
44 | .hashinfo = &tcp_hashinfo, | 44 | .hashinfo = &tcp_hashinfo, |
45 | .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, | 45 | .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, |
46 | (unsigned long)&tcp_death_row), | 46 | (unsigned long)&tcp_death_row), |
47 | .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, | 47 | .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, |
48 | inet_twdr_twkill_work), | 48 | inet_twdr_twkill_work), |
49 | /* Short-time timewait calendar */ | 49 | /* Short-time timewait calendar */ |
50 | 50 | ||
51 | .twcal_hand = -1, | 51 | .twcal_hand = -1, |
52 | .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, | 52 | .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, |
53 | (unsigned long)&tcp_death_row), | 53 | (unsigned long)&tcp_death_row), |
54 | }; | 54 | }; |
55 | 55 | ||
56 | EXPORT_SYMBOL_GPL(tcp_death_row); | 56 | EXPORT_SYMBOL_GPL(tcp_death_row); |
57 | 57 | ||
58 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 58 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) |
59 | { | 59 | { |
60 | if (seq == s_win) | 60 | if (seq == s_win) |
61 | return 1; | 61 | return 1; |
62 | if (after(end_seq, s_win) && before(seq, e_win)) | 62 | if (after(end_seq, s_win) && before(seq, e_win)) |
63 | return 1; | 63 | return 1; |
64 | return (seq == e_win && seq == end_seq); | 64 | return (seq == e_win && seq == end_seq); |
65 | } | 65 | } |
66 | 66 | ||
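tcp_in_window() above accepts a segment if it starts exactly at the left edge of the receive window, overlaps the window at all, or is an empty segment sitting exactly on the right edge. A standalone copy of the test, with before()/after() expanded to the usual modulo-2^32 comparison and a few made-up sequence numbers:

/* Standalone copy of the tcp_in_window() acceptance test above. */
#include <stdio.h>
#include <stdint.h>

static int before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after(uint32_t a, uint32_t b)  { return before(b, a); }

static int tcp_in_window(uint32_t seq, uint32_t end_seq,
			 uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return 1;
	if (after(end_seq, s_win) && before(seq, e_win))
		return 1;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	/* receive window: [1000, 2000) */
	printf("%d\n", tcp_in_window(1000, 1100, 1000, 2000)); /* 1: starts at left edge */
	printf("%d\n", tcp_in_window( 900, 1050, 1000, 2000)); /* 1: overlaps the window */
	printf("%d\n", tcp_in_window( 800,  900, 1000, 2000)); /* 0: entirely old data */
	printf("%d\n", tcp_in_window(2000, 2000, 1000, 2000)); /* 1: empty segment at right edge */
	return 0;
}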
67 | /* | 67 | /* |
68 | * * Main purpose of TIME-WAIT state is to close connection gracefully, | 68 | * * Main purpose of TIME-WAIT state is to close connection gracefully, |
69 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN | 69 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN |
70 | * (and, probably, tail of data) and one or more our ACKs are lost. | 70 | * (and, probably, tail of data) and one or more our ACKs are lost. |
71 | * * What is TIME-WAIT timeout? It is associated with maximal packet | 71 | * * What is TIME-WAIT timeout? It is associated with maximal packet |
72 | * lifetime in the internet, which results in wrong conclusion, that | 72 | * lifetime in the internet, which results in wrong conclusion, that |
73 | * it is set to catch "old duplicate segments" wandering out of their path. | 73 | * it is set to catch "old duplicate segments" wandering out of their path. |
74 | * It is not quite correct. This timeout is calculated so that it exceeds | 74 | * It is not quite correct. This timeout is calculated so that it exceeds |
75 | * maximal retransmission timeout enough to allow to lose one (or more) | 75 | * maximal retransmission timeout enough to allow to lose one (or more) |
76 | * segments sent by peer and our ACKs. This time may be calculated from RTO. | 76 | * segments sent by peer and our ACKs. This time may be calculated from RTO. |
77 | * * When TIME-WAIT socket receives RST, it means that another end | 77 | * * When TIME-WAIT socket receives RST, it means that another end |
78 | * finally closed and we are allowed to kill TIME-WAIT too. | 78 | * finally closed and we are allowed to kill TIME-WAIT too. |
79 | * * Second purpose of TIME-WAIT is catching old duplicate segments. | 79 | * * Second purpose of TIME-WAIT is catching old duplicate segments. |
80 | * Well, certainly it is pure paranoia, but if we load TIME-WAIT | 80 | * Well, certainly it is pure paranoia, but if we load TIME-WAIT |
81 | * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. | 81 | * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. |
82 | * * If we invented some more clever way to catch duplicates | 82 | * * If we invented some more clever way to catch duplicates |
83 | * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. | 83 | * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. |
84 | * | 84 | * |
85 | * The algorithm below is based on FORMAL INTERPRETATION of RFCs. | 85 | * The algorithm below is based on FORMAL INTERPRETATION of RFCs. |
86 | * When you compare it to RFCs, please, read section SEGMENT ARRIVES | 86 | * When you compare it to RFCs, please, read section SEGMENT ARRIVES |
87 | * from the very beginning. | 87 | * from the very beginning. |
88 | * | 88 | * |
89 | * NOTE. With recycling (and later with fin-wait-2) TW bucket | 89 | * NOTE. With recycling (and later with fin-wait-2) TW bucket |
90 | * is _not_ stateless. It means, that strictly speaking we must | 90 | * is _not_ stateless. It means, that strictly speaking we must |
91 | * spinlock it. I do not want! Well, probability of misbehaviour | 91 | * spinlock it. I do not want! Well, probability of misbehaviour |
92 | * is ridiculously low and, seems, we could use some mb() tricks | 92 | * is ridiculously low and, seems, we could use some mb() tricks |
93 | * to avoid misread sequence numbers, states etc. --ANK | 93 | * to avoid misread sequence numbers, states etc. --ANK |
94 | */ | 94 | */ |
95 | enum tcp_tw_status | 95 | enum tcp_tw_status |
96 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | 96 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, |
97 | const struct tcphdr *th) | 97 | const struct tcphdr *th) |
98 | { | 98 | { |
99 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 99 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
100 | struct tcp_options_received tmp_opt; | 100 | struct tcp_options_received tmp_opt; |
101 | int paws_reject = 0; | 101 | int paws_reject = 0; |
102 | 102 | ||
103 | tmp_opt.saw_tstamp = 0; | 103 | tmp_opt.saw_tstamp = 0; |
104 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { | 104 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { |
105 | tcp_parse_options(skb, &tmp_opt, 0); | 105 | tcp_parse_options(skb, &tmp_opt, 0); |
106 | 106 | ||
107 | if (tmp_opt.saw_tstamp) { | 107 | if (tmp_opt.saw_tstamp) { |
108 | tmp_opt.ts_recent = tcptw->tw_ts_recent; | 108 | tmp_opt.ts_recent = tcptw->tw_ts_recent; |
109 | tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; | 109 | tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; |
110 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); | 110 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); |
111 | } | 111 | } |
112 | } | 112 | } |
113 | 113 | ||
114 | if (tw->tw_substate == TCP_FIN_WAIT2) { | 114 | if (tw->tw_substate == TCP_FIN_WAIT2) { |
115 | /* Just repeat all the checks of tcp_rcv_state_process() */ | 115 | /* Just repeat all the checks of tcp_rcv_state_process() */ |
116 | 116 | ||
117 | /* Out of window, send ACK */ | 117 | /* Out of window, send ACK */ |
118 | if (paws_reject || | 118 | if (paws_reject || |
119 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 119 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
120 | tcptw->tw_rcv_nxt, | 120 | tcptw->tw_rcv_nxt, |
121 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) | 121 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) |
122 | return TCP_TW_ACK; | 122 | return TCP_TW_ACK; |
123 | 123 | ||
124 | if (th->rst) | 124 | if (th->rst) |
125 | goto kill; | 125 | goto kill; |
126 | 126 | ||
127 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) | 127 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) |
128 | goto kill_with_rst; | 128 | goto kill_with_rst; |
129 | 129 | ||
130 | /* Dup ACK? */ | 130 | /* Dup ACK? */ |
131 | if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || | 131 | if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || |
132 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { | 132 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { |
133 | inet_twsk_put(tw); | 133 | inet_twsk_put(tw); |
134 | return TCP_TW_SUCCESS; | 134 | return TCP_TW_SUCCESS; |
135 | } | 135 | } |
136 | 136 | ||
137 | /* New data or FIN. If new data arrive after half-duplex close, | 137 | /* New data or FIN. If new data arrive after half-duplex close, |
138 | * reset. | 138 | * reset. |
139 | */ | 139 | */ |
140 | if (!th->fin || | 140 | if (!th->fin || |
141 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { | 141 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { |
142 | kill_with_rst: | 142 | kill_with_rst: |
143 | inet_twsk_deschedule(tw, &tcp_death_row); | 143 | inet_twsk_deschedule(tw, &tcp_death_row); |
144 | inet_twsk_put(tw); | 144 | inet_twsk_put(tw); |
145 | return TCP_TW_RST; | 145 | return TCP_TW_RST; |
146 | } | 146 | } |
147 | 147 | ||
148 | /* FIN arrived, enter true time-wait state. */ | 148 | /* FIN arrived, enter true time-wait state. */ |
149 | tw->tw_substate = TCP_TIME_WAIT; | 149 | tw->tw_substate = TCP_TIME_WAIT; |
150 | tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 150 | tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
151 | if (tmp_opt.saw_tstamp) { | 151 | if (tmp_opt.saw_tstamp) { |
152 | tcptw->tw_ts_recent_stamp = get_seconds(); | 152 | tcptw->tw_ts_recent_stamp = get_seconds(); |
153 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; | 153 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
154 | } | 154 | } |
155 | 155 | ||
156 | /* I am shamed, but failed to make it more elegant. | 156 | /* I am shamed, but failed to make it more elegant. |
157 | * Yes, it is direct reference to IP, which is impossible | 157 | * Yes, it is direct reference to IP, which is impossible |
158 | * to generalize to IPv6. Taking into account that IPv6 | 158 | * to generalize to IPv6. Taking into account that IPv6 |
159 | * does not understand recycling in any case, it is not | 159 | * does not understand recycling in any case, it is not |
160 | * a big problem in practice. --ANK */ | 160 | * a big problem in practice. --ANK */ |
161 | if (tw->tw_family == AF_INET && | 161 | if (tw->tw_family == AF_INET && |
162 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && | 162 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && |
163 | tcp_v4_tw_remember_stamp(tw)) | 163 | tcp_v4_tw_remember_stamp(tw)) |
164 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, | 164 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, |
165 | TCP_TIMEWAIT_LEN); | 165 | TCP_TIMEWAIT_LEN); |
166 | else | 166 | else |
167 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 167 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
168 | TCP_TIMEWAIT_LEN); | 168 | TCP_TIMEWAIT_LEN); |
169 | return TCP_TW_ACK; | 169 | return TCP_TW_ACK; |
170 | } | 170 | } |
171 | 171 | ||
172 | /* | 172 | /* |
173 | * Now real TIME-WAIT state. | 173 | * Now real TIME-WAIT state. |
174 | * | 174 | * |
175 | * RFC 1122: | 175 | * RFC 1122: |
176 | * "When a connection is [...] on TIME-WAIT state [...] | 176 | * "When a connection is [...] on TIME-WAIT state [...] |
177 | * [a TCP] MAY accept a new SYN from the remote TCP to | 177 | * [a TCP] MAY accept a new SYN from the remote TCP to |
178 | * reopen the connection directly, if it: | 178 | * reopen the connection directly, if it: |
179 | * | 179 | * |
180 | * (1) assigns its initial sequence number for the new | 180 | * (1) assigns its initial sequence number for the new |
181 | * connection to be larger than the largest sequence | 181 | * connection to be larger than the largest sequence |
182 | * number it used on the previous connection incarnation, | 182 | * number it used on the previous connection incarnation, |
183 | * and | 183 | * and |
184 | * | 184 | * |
185 | * (2) returns to TIME-WAIT state if the SYN turns out | 185 | * (2) returns to TIME-WAIT state if the SYN turns out |
186 | * to be an old duplicate". | 186 | * to be an old duplicate". |
187 | */ | 187 | */ |
188 | 188 | ||
189 | if (!paws_reject && | 189 | if (!paws_reject && |
190 | (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && | 190 | (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && |
191 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { | 191 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { |
192 | /* In window segment, it may be only reset or bare ack. */ | 192 | /* In window segment, it may be only reset or bare ack. */ |
193 | 193 | ||
194 | if (th->rst) { | 194 | if (th->rst) { |
195 | /* This is TIME_WAIT assassination, in two flavors. | 195 | /* This is TIME_WAIT assassination, in two flavors. |
196 | * Oh well... nobody has a sufficient solution to this | 196 | * Oh well... nobody has a sufficient solution to this |
197 | * protocol bug yet. | 197 | * protocol bug yet. |
198 | */ | 198 | */ |
199 | if (sysctl_tcp_rfc1337 == 0) { | 199 | if (sysctl_tcp_rfc1337 == 0) { |
200 | kill: | 200 | kill: |
201 | inet_twsk_deschedule(tw, &tcp_death_row); | 201 | inet_twsk_deschedule(tw, &tcp_death_row); |
202 | inet_twsk_put(tw); | 202 | inet_twsk_put(tw); |
203 | return TCP_TW_SUCCESS; | 203 | return TCP_TW_SUCCESS; |
204 | } | 204 | } |
205 | } | 205 | } |
206 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 206 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
207 | TCP_TIMEWAIT_LEN); | 207 | TCP_TIMEWAIT_LEN); |
208 | 208 | ||
209 | if (tmp_opt.saw_tstamp) { | 209 | if (tmp_opt.saw_tstamp) { |
210 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; | 210 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
211 | tcptw->tw_ts_recent_stamp = get_seconds(); | 211 | tcptw->tw_ts_recent_stamp = get_seconds(); |
212 | } | 212 | } |
213 | 213 | ||
214 | inet_twsk_put(tw); | 214 | inet_twsk_put(tw); |
215 | return TCP_TW_SUCCESS; | 215 | return TCP_TW_SUCCESS; |
216 | } | 216 | } |
217 | 217 | ||
218 | /* Out of window segment. | 218 | /* Out of window segment. |
219 | 219 | ||
220 | All the segments are ACKed immediately. | 220 | All the segments are ACKed immediately. |
221 | 221 | ||
222 | The only exception is new SYN. We accept it, if it is | 222 | The only exception is new SYN. We accept it, if it is |
223 | not old duplicate and we are not in danger to be killed | 223 | not old duplicate and we are not in danger to be killed |
224 | by delayed old duplicates. RFC check is that it has | 224 | by delayed old duplicates. RFC check is that it has |
225 | newer sequence number works at rates <40Mbit/sec. | 225 | newer sequence number works at rates <40Mbit/sec. |
226 | However, if paws works, it is reliable AND even more, | 226 | However, if paws works, it is reliable AND even more, |
227 | we even may relax silly seq space cutoff. | 227 | we even may relax silly seq space cutoff. |
228 | 228 | ||
229 | RED-PEN: we violate main RFC requirement, if this SYN will appear | 229 | RED-PEN: we violate main RFC requirement, if this SYN will appear |
230 | old duplicate (i.e. we receive RST in reply to SYN-ACK), | 230 | old duplicate (i.e. we receive RST in reply to SYN-ACK), |
231 | we must return socket to time-wait state. It is not good, | 231 | we must return socket to time-wait state. It is not good, |
232 | but not fatal yet. | 232 | but not fatal yet. |
233 | */ | 233 | */ |
234 | 234 | ||
235 | if (th->syn && !th->rst && !th->ack && !paws_reject && | 235 | if (th->syn && !th->rst && !th->ack && !paws_reject && |
236 | (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || | 236 | (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || |
237 | (tmp_opt.saw_tstamp && | 237 | (tmp_opt.saw_tstamp && |
238 | (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { | 238 | (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { |
239 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; | 239 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; |
240 | if (isn == 0) | 240 | if (isn == 0) |
241 | isn++; | 241 | isn++; |
242 | TCP_SKB_CB(skb)->when = isn; | 242 | TCP_SKB_CB(skb)->when = isn; |
243 | return TCP_TW_SYN; | 243 | return TCP_TW_SYN; |
244 | } | 244 | } |
245 | 245 | ||
246 | if (paws_reject) | 246 | if (paws_reject) |
247 | NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); | 247 | NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); |
248 | 248 | ||
249 | if (!th->rst) { | 249 | if (!th->rst) { |
250 | /* In this case we must reset the TIMEWAIT timer. | 250 | /* In this case we must reset the TIMEWAIT timer. |
251 | * | 251 | * |
252 | * If it is ACKless SYN it may be both old duplicate | 252 | * If it is ACKless SYN it may be both old duplicate |
253 | * and new good SYN with random sequence number <rcv_nxt. | 253 | * and new good SYN with random sequence number <rcv_nxt. |
254 | * Do not reschedule in the last case. | 254 | * Do not reschedule in the last case. |
255 | */ | 255 | */ |
256 | if (paws_reject || th->ack) | 256 | if (paws_reject || th->ack) |
257 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 257 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
258 | TCP_TIMEWAIT_LEN); | 258 | TCP_TIMEWAIT_LEN); |
259 | 259 | ||
260 | /* Send ACK. Note, we do not put the bucket, | 260 | /* Send ACK. Note, we do not put the bucket, |
261 | * it will be released by caller. | 261 | * it will be released by caller. |
262 | */ | 262 | */ |
263 | return TCP_TW_ACK; | 263 | return TCP_TW_ACK; |
264 | } | 264 | } |
265 | inet_twsk_put(tw); | 265 | inet_twsk_put(tw); |
266 | return TCP_TW_SUCCESS; | 266 | return TCP_TW_SUCCESS; |
267 | } | 267 | } |
268 | 268 | ||
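The SYN acceptance test near the end of tcp_timewait_state_process() relies on the wrap-safe timestamp comparison (s32)(tw_ts_recent - rcv_tsval) < 0, i.e. "the incoming timestamp is strictly newer modulo 2^32". A tiny standalone illustration with made-up values:

/* Wrap-safe "newer timestamp" test as used above:
 * (s32)(old - new) < 0  means  new is strictly after old, mod 2^32. */
#include <stdio.h>
#include <stdint.h>

static int ts_newer(uint32_t old, uint32_t new)
{
	return (int32_t)(old - new) < 0;
}

int main(void)
{
	printf("%d\n", ts_newer(1000, 1001));			/* 1 */
	printf("%d\n", ts_newer(1000, 1000));			/* 0 */
	printf("%d\n", ts_newer(0xfffffff0u, 0x00000010u));	/* 1: wrapped, but newer */
	return 0;
}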
269 | /* | 269 | /* |
270 | * Move a socket to time-wait or dead fin-wait-2 state. | 270 | * Move a socket to time-wait or dead fin-wait-2 state. |
271 | */ | 271 | */ |
272 | void tcp_time_wait(struct sock *sk, int state, int timeo) | 272 | void tcp_time_wait(struct sock *sk, int state, int timeo) |
273 | { | 273 | { |
274 | struct inet_timewait_sock *tw = NULL; | 274 | struct inet_timewait_sock *tw = NULL; |
275 | const struct inet_connection_sock *icsk = inet_csk(sk); | 275 | const struct inet_connection_sock *icsk = inet_csk(sk); |
276 | const struct tcp_sock *tp = tcp_sk(sk); | 276 | const struct tcp_sock *tp = tcp_sk(sk); |
277 | int recycle_ok = 0; | 277 | int recycle_ok = 0; |
278 | 278 | ||
279 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) | 279 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
280 | recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); | 280 | recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); |
281 | 281 | ||
282 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) | 282 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) |
283 | tw = inet_twsk_alloc(sk, state); | 283 | tw = inet_twsk_alloc(sk, state); |
284 | 284 | ||
285 | if (tw != NULL) { | 285 | if (tw != NULL) { |
286 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 286 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
287 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); | 287 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
288 | 288 | ||
289 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | 289 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
290 | tcptw->tw_rcv_nxt = tp->rcv_nxt; | 290 | tcptw->tw_rcv_nxt = tp->rcv_nxt; |
291 | tcptw->tw_snd_nxt = tp->snd_nxt; | 291 | tcptw->tw_snd_nxt = tp->snd_nxt; |
292 | tcptw->tw_rcv_wnd = tcp_receive_window(tp); | 292 | tcptw->tw_rcv_wnd = tcp_receive_window(tp); |
293 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; | 293 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; |
294 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | 294 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; |
295 | 295 | ||
296 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 296 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
297 | if (tw->tw_family == PF_INET6) { | 297 | if (tw->tw_family == PF_INET6) { |
298 | struct ipv6_pinfo *np = inet6_sk(sk); | 298 | struct ipv6_pinfo *np = inet6_sk(sk); |
299 | struct inet6_timewait_sock *tw6; | 299 | struct inet6_timewait_sock *tw6; |
300 | 300 | ||
301 | tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); | 301 | tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); |
302 | tw6 = inet6_twsk((struct sock *)tw); | 302 | tw6 = inet6_twsk((struct sock *)tw); |
303 | ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); | 303 | ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); |
304 | ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); | 304 | ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); |
305 | tw->tw_ipv6only = np->ipv6only; | 305 | tw->tw_ipv6only = np->ipv6only; |
306 | } | 306 | } |
307 | #endif | 307 | #endif |
308 | 308 | ||
309 | #ifdef CONFIG_TCP_MD5SIG | 309 | #ifdef CONFIG_TCP_MD5SIG |
310 | /* | 310 | /* |
311 | * The timewait bucket does not have the key DB from the | 311 | * The timewait bucket does not have the key DB from the |
312 | * sock structure. We just make a quick copy of the | 312 | * sock structure. We just make a quick copy of the |
313 | * md5 key being used (if indeed we are using one) | 313 | * md5 key being used (if indeed we are using one) |
314 | * so the timewait ack generating code has the key. | 314 | * so the timewait ack generating code has the key. |
315 | */ | 315 | */ |
316 | do { | 316 | do { |
317 | struct tcp_md5sig_key *key; | 317 | struct tcp_md5sig_key *key; |
318 | memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); | 318 | memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); |
319 | tcptw->tw_md5_keylen = 0; | 319 | tcptw->tw_md5_keylen = 0; |
320 | key = tp->af_specific->md5_lookup(sk, sk); | 320 | key = tp->af_specific->md5_lookup(sk, sk); |
321 | if (key != NULL) { | 321 | if (key != NULL) { |
322 | memcpy(&tcptw->tw_md5_key, key->key, key->keylen); | 322 | memcpy(&tcptw->tw_md5_key, key->key, key->keylen); |
323 | tcptw->tw_md5_keylen = key->keylen; | 323 | tcptw->tw_md5_keylen = key->keylen; |
324 | if (tcp_alloc_md5sig_pool() == NULL) | 324 | if (tcp_alloc_md5sig_pool() == NULL) |
325 | BUG(); | 325 | BUG(); |
326 | } | 326 | } |
327 | } while (0); | 327 | } while (0); |
328 | #endif | 328 | #endif |
329 | 329 | ||
330 | /* Linkage updates. */ | 330 | /* Linkage updates. */ |
331 | __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); | 331 | __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); |
332 | 332 | ||
333 | /* Get the TIME_WAIT timeout firing. */ | 333 | /* Get the TIME_WAIT timeout firing. */ |
334 | if (timeo < rto) | 334 | if (timeo < rto) |
335 | timeo = rto; | 335 | timeo = rto; |
336 | 336 | ||
337 | if (recycle_ok) { | 337 | if (recycle_ok) { |
338 | tw->tw_timeout = rto; | 338 | tw->tw_timeout = rto; |
339 | } else { | 339 | } else { |
340 | tw->tw_timeout = TCP_TIMEWAIT_LEN; | 340 | tw->tw_timeout = TCP_TIMEWAIT_LEN; |
341 | if (state == TCP_TIME_WAIT) | 341 | if (state == TCP_TIME_WAIT) |
342 | timeo = TCP_TIMEWAIT_LEN; | 342 | timeo = TCP_TIMEWAIT_LEN; |
343 | } | 343 | } |
344 | 344 | ||
345 | inet_twsk_schedule(tw, &tcp_death_row, timeo, | 345 | inet_twsk_schedule(tw, &tcp_death_row, timeo, |
346 | TCP_TIMEWAIT_LEN); | 346 | TCP_TIMEWAIT_LEN); |
347 | inet_twsk_put(tw); | 347 | inet_twsk_put(tw); |
348 | } else { | 348 | } else { |
349 | /* Sorry, if we're out of memory, just CLOSE this | 349 | /* Sorry, if we're out of memory, just CLOSE this |
350 | * socket up. We've got bigger problems than | 350 | * socket up. We've got bigger problems than |
351 | * non-graceful socket closings. | 351 | * non-graceful socket closings. |
352 | */ | 352 | */ |
353 | LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); | 353 | LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); |
354 | } | 354 | } |
355 | 355 | ||
356 | tcp_update_metrics(sk); | 356 | tcp_update_metrics(sk); |
357 | tcp_done(sk); | 357 | tcp_done(sk); |
358 | } | 358 | } |
359 | 359 | ||
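In tcp_time_wait() above, rto is (icsk_rto << 2) - (icsk_rto >> 1), i.e. 3.5 times the current retransmission timeout; with tw_recycle the bucket is scheduled for roughly that long, while a socket entering TCP_TIME_WAIT without recycling waits the full TCP_TIMEWAIT_LEN. A sketch of just the scheduled-timeout selection; HZ = 1000 and the example RTO are assumptions for the printout, not values from this commit.

/* Scheduled timeout chosen by tcp_time_wait() above (jiffies in, jiffies out). */
#include <stdio.h>

#define HZ			1000
#define TCP_TIMEWAIT_LEN	(60 * HZ)

static int tw_schedule_timeout(int icsk_rto, int timeo, int recycle_ok, int to_time_wait)
{
	const int rto = (icsk_rto << 2) - (icsk_rto >> 1);	/* 3.5 * RTO */

	if (timeo < rto)
		timeo = rto;
	if (!recycle_ok && to_time_wait)
		timeo = TCP_TIMEWAIT_LEN;	/* classic 60 second TIME-WAIT */
	return timeo;
}

int main(void)
{
	/* RTO of 200 jiffies (200 ms at HZ=1000), caller passed timeo=0 */
	printf("with tw_recycle:    %d\n", tw_schedule_timeout(200, 0, 1, 1));	/* 700   */
	printf("without tw_recycle: %d\n", tw_schedule_timeout(200, 0, 0, 1));	/* 60000 */
	return 0;
}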
360 | void tcp_twsk_destructor(struct sock *sk) | 360 | void tcp_twsk_destructor(struct sock *sk) |
361 | { | 361 | { |
362 | #ifdef CONFIG_TCP_MD5SIG | 362 | #ifdef CONFIG_TCP_MD5SIG |
363 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); | 363 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); |
364 | if (twsk->tw_md5_keylen) | 364 | if (twsk->tw_md5_keylen) |
365 | tcp_put_md5sig_pool(); | 365 | tcp_put_md5sig_pool(); |
366 | #endif | 366 | #endif |
367 | } | 367 | } |
368 | 368 | ||
369 | EXPORT_SYMBOL_GPL(tcp_twsk_destructor); | 369 | EXPORT_SYMBOL_GPL(tcp_twsk_destructor); |
370 | 370 | ||
371 | static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, | 371 | static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, |
372 | struct request_sock *req) | 372 | struct request_sock *req) |
373 | { | 373 | { |
374 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; | 374 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; |
375 | } | 375 | } |
376 | 376 | ||
377 | /* This is not only more efficient than what we used to do, it eliminates | 377 | /* This is not only more efficient than what we used to do, it eliminates |
378 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM | 378 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM |
379 | * | 379 | * |
380 | * Actually, we could avoid lots of memory writes here. tp of listening | 380 | * Actually, we could avoid lots of memory writes here. tp of listening |
381 | * socket contains all necessary default parameters. | 381 | * socket contains all necessary default parameters. |
382 | */ | 382 | */ |
383 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) | 383 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) |
384 | { | 384 | { |
385 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); | 385 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); |
386 | 386 | ||
387 | if (newsk != NULL) { | 387 | if (newsk != NULL) { |
388 | const struct inet_request_sock *ireq = inet_rsk(req); | 388 | const struct inet_request_sock *ireq = inet_rsk(req); |
389 | struct tcp_request_sock *treq = tcp_rsk(req); | 389 | struct tcp_request_sock *treq = tcp_rsk(req); |
390 | struct inet_connection_sock *newicsk = inet_csk(newsk); | 390 | struct inet_connection_sock *newicsk = inet_csk(newsk); |
391 | struct tcp_sock *newtp; | 391 | struct tcp_sock *newtp; |
392 | 392 | ||
393 | /* Now setup tcp_sock */ | 393 | /* Now setup tcp_sock */ |
394 | newtp = tcp_sk(newsk); | 394 | newtp = tcp_sk(newsk); |
395 | newtp->pred_flags = 0; | 395 | newtp->pred_flags = 0; |
396 | newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; | 396 | newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; |
397 | newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; | 397 | newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; |
398 | newtp->snd_up = treq->snt_isn + 1; | 398 | newtp->snd_up = treq->snt_isn + 1; |
399 | 399 | ||
400 | tcp_prequeue_init(newtp); | 400 | tcp_prequeue_init(newtp); |
401 | 401 | ||
402 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); | 402 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); |
403 | 403 | ||
404 | newtp->srtt = 0; | 404 | newtp->srtt = 0; |
405 | newtp->mdev = TCP_TIMEOUT_INIT; | 405 | newtp->mdev = TCP_TIMEOUT_INIT; |
406 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; | 406 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; |
407 | 407 | ||
408 | newtp->packets_out = 0; | 408 | newtp->packets_out = 0; |
409 | newtp->retrans_out = 0; | 409 | newtp->retrans_out = 0; |
410 | newtp->sacked_out = 0; | 410 | newtp->sacked_out = 0; |
411 | newtp->fackets_out = 0; | 411 | newtp->fackets_out = 0; |
412 | newtp->snd_ssthresh = 0x7fffffff; | 412 | newtp->snd_ssthresh = 0x7fffffff; |
413 | 413 | ||
414 | /* So many TCP implementations out there (incorrectly) count the | 414 | /* So many TCP implementations out there (incorrectly) count the |
415 | * initial SYN frame in their delayed-ACK and congestion control | 415 | * initial SYN frame in their delayed-ACK and congestion control |
416 | * algorithms that we must have the following bandaid to talk | 416 | * algorithms that we must have the following bandaid to talk |
417 | * efficiently to them. -DaveM | 417 | * efficiently to them. -DaveM |
418 | */ | 418 | */ |
419 | newtp->snd_cwnd = 2; | 419 | newtp->snd_cwnd = 2; |
420 | newtp->snd_cwnd_cnt = 0; | 420 | newtp->snd_cwnd_cnt = 0; |
421 | newtp->bytes_acked = 0; | 421 | newtp->bytes_acked = 0; |
422 | 422 | ||
423 | newtp->frto_counter = 0; | 423 | newtp->frto_counter = 0; |
424 | newtp->frto_highmark = 0; | 424 | newtp->frto_highmark = 0; |
425 | 425 | ||
426 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; | 426 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; |
427 | 427 | ||
428 | tcp_set_ca_state(newsk, TCP_CA_Open); | 428 | tcp_set_ca_state(newsk, TCP_CA_Open); |
429 | tcp_init_xmit_timers(newsk); | 429 | tcp_init_xmit_timers(newsk); |
430 | skb_queue_head_init(&newtp->out_of_order_queue); | 430 | skb_queue_head_init(&newtp->out_of_order_queue); |
431 | newtp->write_seq = treq->snt_isn + 1; | 431 | newtp->write_seq = treq->snt_isn + 1; |
432 | newtp->pushed_seq = newtp->write_seq; | 432 | newtp->pushed_seq = newtp->write_seq; |
433 | 433 | ||
434 | newtp->rx_opt.saw_tstamp = 0; | 434 | newtp->rx_opt.saw_tstamp = 0; |
435 | 435 | ||
436 | newtp->rx_opt.dsack = 0; | 436 | newtp->rx_opt.dsack = 0; |
437 | newtp->rx_opt.eff_sacks = 0; | 437 | newtp->rx_opt.eff_sacks = 0; |
438 | 438 | ||
439 | newtp->rx_opt.num_sacks = 0; | 439 | newtp->rx_opt.num_sacks = 0; |
440 | newtp->urg_data = 0; | 440 | newtp->urg_data = 0; |
441 | 441 | ||
442 | if (sock_flag(newsk, SOCK_KEEPOPEN)) | 442 | if (sock_flag(newsk, SOCK_KEEPOPEN)) |
443 | inet_csk_reset_keepalive_timer(newsk, | 443 | inet_csk_reset_keepalive_timer(newsk, |
444 | keepalive_time_when(newtp)); | 444 | keepalive_time_when(newtp)); |
445 | 445 | ||
446 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; | 446 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; |
447 | if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { | 447 | if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { |
448 | if (sysctl_tcp_fack) | 448 | if (sysctl_tcp_fack) |
449 | tcp_enable_fack(newtp); | 449 | tcp_enable_fack(newtp); |
450 | } | 450 | } |
451 | newtp->window_clamp = req->window_clamp; | 451 | newtp->window_clamp = req->window_clamp; |
452 | newtp->rcv_ssthresh = req->rcv_wnd; | 452 | newtp->rcv_ssthresh = req->rcv_wnd; |
453 | newtp->rcv_wnd = req->rcv_wnd; | 453 | newtp->rcv_wnd = req->rcv_wnd; |
454 | newtp->rx_opt.wscale_ok = ireq->wscale_ok; | 454 | newtp->rx_opt.wscale_ok = ireq->wscale_ok; |
455 | if (newtp->rx_opt.wscale_ok) { | 455 | if (newtp->rx_opt.wscale_ok) { |
456 | newtp->rx_opt.snd_wscale = ireq->snd_wscale; | 456 | newtp->rx_opt.snd_wscale = ireq->snd_wscale; |
457 | newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; | 457 | newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; |
458 | } else { | 458 | } else { |
459 | newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; | 459 | newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; |
460 | newtp->window_clamp = min(newtp->window_clamp, 65535U); | 460 | newtp->window_clamp = min(newtp->window_clamp, 65535U); |
461 | } | 461 | } |
462 | newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << | 462 | newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << |
463 | newtp->rx_opt.snd_wscale); | 463 | newtp->rx_opt.snd_wscale); |
464 | newtp->max_window = newtp->snd_wnd; | 464 | newtp->max_window = newtp->snd_wnd; |
465 | 465 | ||
466 | if (newtp->rx_opt.tstamp_ok) { | 466 | if (newtp->rx_opt.tstamp_ok) { |
467 | newtp->rx_opt.ts_recent = req->ts_recent; | 467 | newtp->rx_opt.ts_recent = req->ts_recent; |
468 | newtp->rx_opt.ts_recent_stamp = get_seconds(); | 468 | newtp->rx_opt.ts_recent_stamp = get_seconds(); |
469 | newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; | 469 | newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; |
470 | } else { | 470 | } else { |
471 | newtp->rx_opt.ts_recent_stamp = 0; | 471 | newtp->rx_opt.ts_recent_stamp = 0; |
472 | newtp->tcp_header_len = sizeof(struct tcphdr); | 472 | newtp->tcp_header_len = sizeof(struct tcphdr); |
473 | } | 473 | } |
474 | #ifdef CONFIG_TCP_MD5SIG | 474 | #ifdef CONFIG_TCP_MD5SIG |
475 | newtp->md5sig_info = NULL; /*XXX*/ | 475 | newtp->md5sig_info = NULL; /*XXX*/ |
476 | if (newtp->af_specific->md5_lookup(sk, newsk)) | 476 | if (newtp->af_specific->md5_lookup(sk, newsk)) |
477 | newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; | 477 | newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; |
478 | #endif | 478 | #endif |
479 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) | 479 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) |
480 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 480 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
481 | newtp->rx_opt.mss_clamp = req->mss; | 481 | newtp->rx_opt.mss_clamp = req->mss; |
482 | TCP_ECN_openreq_child(newtp, req); | 482 | TCP_ECN_openreq_child(newtp, req); |
483 | 483 | ||
484 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); | 484 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); |
485 | } | 485 | } |
486 | return newsk; | 486 | return newsk; |
487 | } | 487 | } |
488 | 488 | ||
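tcp_create_openreq_child() above derives the initial send window by shifting the 16-bit window field from the incoming TCP header by the negotiated send window scale, and falls back to a scale of zero (with window_clamp capped at 65535) when window scaling was not agreed. A small standalone sketch of that computation with made-up values:

/* Sketch of the initial send-window computation in
 * tcp_create_openreq_child() above (RFC 1323 window scaling). */
#include <stdio.h>
#include <stdint.h>

static uint32_t initial_snd_wnd(uint16_t hdr_window, int wscale_ok, int snd_wscale)
{
	if (!wscale_ok)
		snd_wscale = 0;		/* peer did not negotiate scaling */
	return (uint32_t)hdr_window << snd_wscale;
}

int main(void)
{
	/* 16-bit window of 5840 bytes, scale factor 7 -> ~730 KB */
	printf("%u\n", initial_snd_wnd(5840, 1, 7));
	/* no window scaling negotiated: raw 16-bit value */
	printf("%u\n", initial_snd_wnd(5840, 0, 7));
	return 0;
}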
489 | /* | 489 | /* |
490 | * Process an incoming packet for SYN_RECV sockets represented | 490 | * Process an incoming packet for SYN_RECV sockets represented |
491 | * as a request_sock. | 491 | * as a request_sock. |
492 | */ | 492 | */ |
493 | 493 | ||
494 | struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | 494 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, |
495 | struct request_sock *req, | 495 | struct request_sock *req, |
496 | struct request_sock **prev) | 496 | struct request_sock **prev) |
497 | { | 497 | { |
498 | const struct tcphdr *th = tcp_hdr(skb); | 498 | const struct tcphdr *th = tcp_hdr(skb); |
499 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 499 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
500 | int paws_reject = 0; | 500 | int paws_reject = 0; |
501 | struct tcp_options_received tmp_opt; | 501 | struct tcp_options_received tmp_opt; |
502 | struct sock *child; | 502 | struct sock *child; |
503 | 503 | ||
504 | tmp_opt.saw_tstamp = 0; | 504 | tmp_opt.saw_tstamp = 0; |
505 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 505 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
506 | tcp_parse_options(skb, &tmp_opt, 0); | 506 | tcp_parse_options(skb, &tmp_opt, 0); |
507 | 507 | ||
508 | if (tmp_opt.saw_tstamp) { | 508 | if (tmp_opt.saw_tstamp) { |
509 | tmp_opt.ts_recent = req->ts_recent; | 509 | tmp_opt.ts_recent = req->ts_recent; |
510 | /* We do not store true stamp, but it is not required, | 510 | /* We do not store true stamp, but it is not required, |
511 | * it can be estimated (approximately) | 511 | * it can be estimated (approximately) |
512 | * from another data. | 512 | * from another data. |
513 | */ | 513 | */ |
514 | tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); | 514 | tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); |
515 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); | 515 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); |
516 | } | 516 | } |
517 | } | 517 | } |
518 | 518 | ||
519 | /* Check for pure retransmitted SYN. */ | 519 | /* Check for pure retransmitted SYN. */ |
520 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && | 520 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && |
521 | flg == TCP_FLAG_SYN && | 521 | flg == TCP_FLAG_SYN && |
522 | !paws_reject) { | 522 | !paws_reject) { |
523 | /* | 523 | /* |
524 | * RFC793 draws (Incorrectly! It was fixed in RFC1122) | 524 | * RFC793 draws (Incorrectly! It was fixed in RFC1122) |
525 | * this case on figure 6 and figure 8, but formal | 525 | * this case on figure 6 and figure 8, but formal |
526 | * protocol description says NOTHING. | 526 | * protocol description says NOTHING. |
527 | * To be more exact, it says that we should send ACK, | 527 | * To be more exact, it says that we should send ACK, |
528 | * because this segment (at least, if it has no data) | 528 | * because this segment (at least, if it has no data) |
529 | * is out of window. | 529 | * is out of window. |
530 | * | 530 | * |
531 | * CONCLUSION: RFC793 (even with RFC1122) DOES NOT | 531 | * CONCLUSION: RFC793 (even with RFC1122) DOES NOT |
532 | * describe SYN-RECV state. All the description | 532 | * describe SYN-RECV state. All the description |
533 | * is wrong, we cannot believe to it and should | 533 | * is wrong, we cannot believe to it and should |
534 | * rely only on common sense and implementation | 534 | * rely only on common sense and implementation |
535 | * experience. | 535 | * experience. |
536 | * | 536 | * |
537 | * Enforce "SYN-ACK" according to figure 8, figure 6 | 537 | * Enforce "SYN-ACK" according to figure 8, figure 6 |
538 | * of RFC793, fixed by RFC1122. | 538 | * of RFC793, fixed by RFC1122. |
539 | */ | 539 | */ |
540 | req->rsk_ops->rtx_syn_ack(sk, req); | 540 | req->rsk_ops->rtx_syn_ack(sk, req); |
541 | return NULL; | 541 | return NULL; |
542 | } | 542 | } |
543 | 543 | ||
544 | /* Further reproduces section "SEGMENT ARRIVES" | 544 | /* Further reproduces section "SEGMENT ARRIVES" |
545 | for state SYN-RECEIVED of RFC793. | 545 | for state SYN-RECEIVED of RFC793. |
546 | It is broken, however, it does not work only | 546 | It is broken, however, it does not work only |
547 | when SYNs are crossed. | 547 | when SYNs are crossed. |
548 | 548 | ||
549 | You would think that SYN crossing is impossible here, since | 549 | You would think that SYN crossing is impossible here, since |
550 | we should have a SYN_SENT socket (from connect()) on our end, | 550 | we should have a SYN_SENT socket (from connect()) on our end, |
551 | but this is not true if the crossed SYNs were sent to both | 551 | but this is not true if the crossed SYNs were sent to both |
552 | ends by a malicious third party. We must defend against this, | 552 | ends by a malicious third party. We must defend against this, |
553 | and to do that we first verify the ACK (as per RFC793, page | 553 | and to do that we first verify the ACK (as per RFC793, page |
554 | 36) and reset if it is invalid. Is this a true full defense? | 554 | 36) and reset if it is invalid. Is this a true full defense? |
555 | To convince ourselves, let us consider a way in which the ACK | 555 | To convince ourselves, let us consider a way in which the ACK |
556 | test can still pass in this 'malicious crossed SYNs' case. | 556 | test can still pass in this 'malicious crossed SYNs' case. |
557 | Malicious sender sends identical SYNs (and thus identical sequence | 557 | Malicious sender sends identical SYNs (and thus identical sequence |
558 | numbers) to both A and B: | 558 | numbers) to both A and B: |
559 | 559 | ||
560 | A: gets SYN, seq=7 | 560 | A: gets SYN, seq=7 |
561 | B: gets SYN, seq=7 | 561 | B: gets SYN, seq=7 |
562 | 562 | ||
563 | By our good fortune, both A and B select the same initial | 563 | By our good fortune, both A and B select the same initial |
564 | send sequence number of seven :-) | 564 | send sequence number of seven :-) |
565 | 565 | ||
566 | A: sends SYN|ACK, seq=7, ack_seq=8 | 566 | A: sends SYN|ACK, seq=7, ack_seq=8 |
567 | B: sends SYN|ACK, seq=7, ack_seq=8 | 567 | B: sends SYN|ACK, seq=7, ack_seq=8 |
568 | 568 | ||
569 | So we are now A eating this SYN|ACK, ACK test passes. So | 569 | So we are now A eating this SYN|ACK, ACK test passes. So |
570 | does sequence test, SYN is truncated, and thus we consider | 570 | does sequence test, SYN is truncated, and thus we consider |
571 | it a bare ACK. | 571 | it a bare ACK. |
572 | 572 | ||
573 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this | 573 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this |
574 | bare ACK. Otherwise, we create an established connection. Both | 574 | bare ACK. Otherwise, we create an established connection. Both |
575 | ends (listening sockets) accept the new incoming connection and try | 575 | ends (listening sockets) accept the new incoming connection and try |
576 | to talk to each other. 8-) | 576 | to talk to each other. 8-) |
577 | 577 | ||
578 | Note: This case is both harmless and rare. The probability is about the | 578 | Note: This case is both harmless and rare. The probability is about the |
579 | same as us discovering intelligent life on another planet tomorrow. | 579 | same as us discovering intelligent life on another planet tomorrow. |
580 | 580 | ||
581 | But generally, we should (the RFC lies!) accept an ACK | 581 | But generally, we should (the RFC lies!) accept an ACK |
582 | from a SYNACK both here and in tcp_rcv_state_process(). | 582 | from a SYNACK both here and in tcp_rcv_state_process(). |
583 | tcp_rcv_state_process() does not, hence neither do we. | 583 | tcp_rcv_state_process() does not, hence neither do we. |
584 | 584 | ||
585 | Note that the case is absolutely generic: | 585 | Note that the case is absolutely generic: |
586 | we cannot optimize anything here without | 586 | we cannot optimize anything here without |
587 | violating the protocol. All the checks must be made | 587 | violating the protocol. All the checks must be made |
588 | before attempting to create the socket. | 588 | before attempting to create the socket. |
589 | */ | 589 | */ |
590 | 590 | ||
591 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... | 591 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... |
592 | * and the incoming segment acknowledges something not yet | 592 | * and the incoming segment acknowledges something not yet |
593 | * sent (the segment carries an unacceptable ACK) ... | 593 | * sent (the segment carries an unacceptable ACK) ... |
594 | * a reset is sent." | 594 | * a reset is sent." |
595 | * | 595 | * |
596 | * Invalid ACK: reset will be sent by listening socket | 596 | * Invalid ACK: reset will be sent by listening socket |
597 | */ | 597 | */ |
598 | if ((flg & TCP_FLAG_ACK) && | 598 | if ((flg & TCP_FLAG_ACK) && |
599 | (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) | 599 | (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) |
600 | return sk; | 600 | return sk; |
601 | 601 | ||
602 | /* Also, it would not be a bad idea to check rcv_tsecr, which | 602 | /* Also, it would not be a bad idea to check rcv_tsecr, which |
603 | * is essentially an ACK extension; too-early or too-late values | 603 | * is essentially an ACK extension; too-early or too-late values |
604 | * should cause a reset in unsynchronized states. | 604 | * should cause a reset in unsynchronized states. |
605 | */ | 605 | */ |
606 | 606 | ||
607 | /* RFC793: "first check sequence number". */ | 607 | /* RFC793: "first check sequence number". */ |
608 | 608 | ||
609 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 609 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
610 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { | 610 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { |
611 | /* Out of window: send ACK and drop. */ | 611 | /* Out of window: send ACK and drop. */ |
612 | if (!(flg & TCP_FLAG_RST)) | 612 | if (!(flg & TCP_FLAG_RST)) |
613 | req->rsk_ops->send_ack(sk, skb, req); | 613 | req->rsk_ops->send_ack(sk, skb, req); |
614 | if (paws_reject) | 614 | if (paws_reject) |
615 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | 615 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); |
616 | return NULL; | 616 | return NULL; |
617 | } | 617 | } |
618 | 618 | ||
619 | /* In sequence, PAWS is OK. */ | 619 | /* In sequence, PAWS is OK. */ |
620 | 620 | ||
621 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) | 621 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) |
622 | req->ts_recent = tmp_opt.rcv_tsval; | 622 | req->ts_recent = tmp_opt.rcv_tsval; |
623 | 623 | ||
624 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { | 624 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { |
625 | /* Truncate SYN, it is out of window starting | 625 | /* Truncate SYN, it is out of window starting |
626 | at tcp_rsk(req)->rcv_isn + 1. */ | 626 | at tcp_rsk(req)->rcv_isn + 1. */ |
627 | flg &= ~TCP_FLAG_SYN; | 627 | flg &= ~TCP_FLAG_SYN; |
628 | } | 628 | } |
629 | 629 | ||
630 | /* RFC793: "second check the RST bit" and | 630 | /* RFC793: "second check the RST bit" and |
631 | * "fourth, check the SYN bit" | 631 | * "fourth, check the SYN bit" |
632 | */ | 632 | */ |
633 | if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { | 633 | if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { |
634 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); | 634 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
635 | goto embryonic_reset; | 635 | goto embryonic_reset; |
636 | } | 636 | } |
637 | 637 | ||
638 | /* ACK sequence verified above, just make sure ACK is | 638 | /* ACK sequence verified above, just make sure ACK is |
639 | * set. If ACK not set, just silently drop the packet. | 639 | * set. If ACK not set, just silently drop the packet. |
640 | */ | 640 | */ |
641 | if (!(flg & TCP_FLAG_ACK)) | 641 | if (!(flg & TCP_FLAG_ACK)) |
642 | return NULL; | 642 | return NULL; |
643 | 643 | ||
644 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ | 644 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ |
645 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | 645 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
646 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 646 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { |
647 | inet_rsk(req)->acked = 1; | 647 | inet_rsk(req)->acked = 1; |
648 | return NULL; | 648 | return NULL; |
649 | } | 649 | } |
650 | 650 | ||
651 | /* OK, ACK is valid, create big socket and | 651 | /* OK, ACK is valid, create big socket and |
652 | * feed this segment to it. It will repeat all | 652 | * feed this segment to it. It will repeat all |
653 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO | 653 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO |
654 | * ESTABLISHED STATE. If it is dropped after the | 654 | * ESTABLISHED STATE. If it is dropped after the |
655 | * socket is created, expect trouble. | 655 | * socket is created, expect trouble. |
656 | */ | 656 | */ |
657 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); | 657 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); |
658 | if (child == NULL) | 658 | if (child == NULL) |
659 | goto listen_overflow; | 659 | goto listen_overflow; |
660 | #ifdef CONFIG_TCP_MD5SIG | 660 | #ifdef CONFIG_TCP_MD5SIG |
661 | else { | 661 | else { |
662 | /* Copy over the MD5 key from the original socket */ | 662 | /* Copy over the MD5 key from the original socket */ |
663 | struct tcp_md5sig_key *key; | 663 | struct tcp_md5sig_key *key; |
664 | struct tcp_sock *tp = tcp_sk(sk); | 664 | struct tcp_sock *tp = tcp_sk(sk); |
665 | key = tp->af_specific->md5_lookup(sk, child); | 665 | key = tp->af_specific->md5_lookup(sk, child); |
666 | if (key != NULL) { | 666 | if (key != NULL) { |
667 | /* | 667 | /* |
668 | * We're using one, so create a matching key on the | 668 | * We're using one, so create a matching key on the |
669 | * newsk structure. If we fail to get memory then we | 669 | * newsk structure. If we fail to get memory then we |
670 | * end up not copying the key across. Shucks. | 670 | * end up not copying the key across. Shucks. |
671 | */ | 671 | */ |
672 | char *newkey = kmemdup(key->key, key->keylen, | 672 | char *newkey = kmemdup(key->key, key->keylen, |
673 | GFP_ATOMIC); | 673 | GFP_ATOMIC); |
674 | if (newkey) { | 674 | if (newkey) { |
675 | if (!tcp_alloc_md5sig_pool()) | 675 | if (!tcp_alloc_md5sig_pool()) |
676 | BUG(); | 676 | BUG(); |
677 | tp->af_specific->md5_add(child, child, newkey, | 677 | tp->af_specific->md5_add(child, child, newkey, |
678 | key->keylen); | 678 | key->keylen); |
679 | } | 679 | } |
680 | } | 680 | } |
681 | } | 681 | } |
682 | #endif | 682 | #endif |
683 | 683 | ||
684 | inet_csk_reqsk_queue_unlink(sk, req, prev); | 684 | inet_csk_reqsk_queue_unlink(sk, req, prev); |
685 | inet_csk_reqsk_queue_removed(sk, req); | 685 | inet_csk_reqsk_queue_removed(sk, req); |
686 | 686 | ||
687 | inet_csk_reqsk_queue_add(sk, req, child); | 687 | inet_csk_reqsk_queue_add(sk, req, child); |
688 | return child; | 688 | return child; |
689 | 689 | ||
690 | listen_overflow: | 690 | listen_overflow: |
691 | if (!sysctl_tcp_abort_on_overflow) { | 691 | if (!sysctl_tcp_abort_on_overflow) { |
692 | inet_rsk(req)->acked = 1; | 692 | inet_rsk(req)->acked = 1; |
693 | return NULL; | 693 | return NULL; |
694 | } | 694 | } |
695 | 695 | ||
696 | embryonic_reset: | 696 | embryonic_reset: |
697 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | 697 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); |
698 | if (!(flg & TCP_FLAG_RST)) | 698 | if (!(flg & TCP_FLAG_RST)) |
699 | req->rsk_ops->send_reset(sk, skb); | 699 | req->rsk_ops->send_reset(sk, skb); |
700 | 700 | ||
701 | inet_csk_reqsk_queue_drop(sk, req, prev); | 701 | inet_csk_reqsk_queue_drop(sk, req, prev); |
702 | return NULL; | 702 | return NULL; |
703 | } | 703 | } |
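
The bare-ACK drop in the rskq_defer_accept branch above is what the TCP_DEFER_ACCEPT socket option buys a listener: accept() is not woken for a connection until data actually arrives. A minimal userspace sketch of enabling it (the timeout value and function name are illustrative, not taken from this commit):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Ask the kernel to hold embryonic connections until data arrives,
     * for up to ~5 seconds, instead of completing them on a bare ACK. */
    static int enable_defer_accept(int listen_fd)
    {
            int secs = 5;   /* illustrative timeout */

            return setsockopt(listen_fd, IPPROTO_TCP, TCP_DEFER_ACCEPT,
                              &secs, sizeof(secs));
    }
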
704 | 704 | ||
705 | /* | 705 | /* |
706 | * Queue segment on the new socket if the new socket is active, | 706 | * Queue segment on the new socket if the new socket is active, |
707 | * otherwise we just shortcircuit this and continue with | 707 | * otherwise we just shortcircuit this and continue with |
708 | * the new socket. | 708 | * the new socket. |
709 | */ | 709 | */ |
710 | 710 | ||
711 | int tcp_child_process(struct sock *parent, struct sock *child, | 711 | int tcp_child_process(struct sock *parent, struct sock *child, |
712 | struct sk_buff *skb) | 712 | struct sk_buff *skb) |
713 | { | 713 | { |
714 | int ret = 0; | 714 | int ret = 0; |
715 | int state = child->sk_state; | 715 | int state = child->sk_state; |
716 | 716 | ||
717 | if (!sock_owned_by_user(child)) { | 717 | if (!sock_owned_by_user(child)) { |
718 | ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), | 718 | ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), |
719 | skb->len); | 719 | skb->len); |
720 | /* Wakeup parent, send SIGIO */ | 720 | /* Wakeup parent, send SIGIO */ |
721 | if (state == TCP_SYN_RECV && child->sk_state != state) | 721 | if (state == TCP_SYN_RECV && child->sk_state != state) |
722 | parent->sk_data_ready(parent, 0); | 722 | parent->sk_data_ready(parent, 0); |
723 | } else { | 723 | } else { |
724 | /* Alas, it is possible again, because we do the lookup | 724 | /* Alas, it is possible again, because we do the lookup |
725 | * in the main socket hash table and the lock on the listening | 725 | * in the main socket hash table and the lock on the listening |
726 | * socket no longer protects us. | 726 | * socket no longer protects us. |
727 | */ | 727 | */ |
728 | sk_add_backlog(child, skb); | 728 | sk_add_backlog(child, skb); |
729 | } | 729 | } |
730 | 730 | ||
731 | bh_unlock_sock(child); | 731 | bh_unlock_sock(child); |
732 | sock_put(child); | 732 | sock_put(child); |
733 | return ret; | 733 | return ret; |
734 | } | 734 | } |
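
For context, the listening-side receive path pairs tcp_check_req() with tcp_child_process(): when the request lookup hands back a freshly minted child socket instead of the listener itself, the triggering segment is fed to that child here. A rough sketch of the call pattern, loosely modelled on the IPv4 receive path and not part of this diff (the helper name tcp_v4_hnd_req and the wrapper are assumptions for illustration):

    /* Sketch: how a listening socket's receive path hands a segment that
     * completes the handshake to the new child socket. */
    static int handle_listen_segment(struct sock *sk, struct sk_buff *skb)
    {
            struct sock *nsk = tcp_v4_hnd_req(sk, skb); /* may run tcp_check_req() */

            if (!nsk)
                    return 0;               /* request dropped, nothing to do */
            if (nsk != sk)                  /* got a child: feed it the segment */
                    return tcp_child_process(sk, nsk, skb); /* non-zero => send reset */
            return tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len);
    }
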
735 | 735 | ||
736 | EXPORT_SYMBOL(tcp_check_req); | 736 | EXPORT_SYMBOL(tcp_check_req); |
737 | EXPORT_SYMBOL(tcp_child_process); | 737 | EXPORT_SYMBOL(tcp_child_process); |
738 | EXPORT_SYMBOL(tcp_create_openreq_child); | 738 | EXPORT_SYMBOL(tcp_create_openreq_child); |
739 | EXPORT_SYMBOL(tcp_timewait_state_process); | 739 | EXPORT_SYMBOL(tcp_timewait_state_process); |
740 | 740 |
net/ipv4/tcp_yeah.c
1 | /* | 1 | /* |
2 | * | 2 | * |
3 | * YeAH TCP | 3 | * YeAH TCP |
4 | * | 4 | * |
5 | * For further details look at: | 5 | * For further details look at: |
6 | * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf | 6 | * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/skbuff.h> | 11 | #include <linux/skbuff.h> |
12 | #include <linux/inet_diag.h> | 12 | #include <linux/inet_diag.h> |
13 | 13 | ||
14 | #include <net/tcp.h> | 14 | #include <net/tcp.h> |
15 | 15 | ||
16 | #include "tcp_vegas.h" | 16 | #include "tcp_vegas.h" |
17 | 17 | ||
18 | #define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck | 18 | #define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck |
19 | #define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt | 19 | #define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt |
20 | #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss | 20 | #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss |
21 | #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion | 21 | #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion |
22 | #define TCP_YEAH_PHY 8 //lin maximum delta from base | 22 | #define TCP_YEAH_PHY 8 //lin maximum delta from base |
23 | #define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss | 23 | #define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss |
24 | #define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count | 24 | #define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count |
25 | 25 | ||
26 | #define TCP_SCALABLE_AI_CNT 100U | 26 | #define TCP_SCALABLE_AI_CNT 100U |
27 | 27 | ||
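
The //lin and //log tags above distinguish parameters used as plain divisors ("lin", a linear fraction) from parameters used as shift counts ("log", a power-of-two fraction). The early-decongestion step later in this file combines one of each; a small sketch of that arithmetic (the helper name is invented; u32 and min() come from the kernel headers this file already includes):

    /* Remove a linear fraction of the estimated queue, but never more
     * than a power-of-two fraction of the current congestion window. */
    static inline u32 yeah_early_reduction(u32 queue, u32 cwnd)
    {
            return min(queue / TCP_YEAH_GAMMA, cwnd >> TCP_YEAH_EPSILON);
    }
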
28 | /* YeAH variables */ | 28 | /* YeAH variables */ |
29 | struct yeah { | 29 | struct yeah { |
30 | struct vegas vegas; /* must be first */ | 30 | struct vegas vegas; /* must be first */ |
31 | 31 | ||
32 | /* YeAH */ | 32 | /* YeAH */ |
33 | u32 lastQ; | 33 | u32 lastQ; |
34 | u32 doing_reno_now; | 34 | u32 doing_reno_now; |
35 | 35 | ||
36 | u32 reno_count; | 36 | u32 reno_count; |
37 | u32 fast_count; | 37 | u32 fast_count; |
38 | 38 | ||
39 | u32 pkts_acked; | 39 | u32 pkts_acked; |
40 | }; | 40 | }; |
41 | 41 | ||
42 | static void tcp_yeah_init(struct sock *sk) | 42 | static void tcp_yeah_init(struct sock *sk) |
43 | { | 43 | { |
44 | struct tcp_sock *tp = tcp_sk(sk); | 44 | struct tcp_sock *tp = tcp_sk(sk); |
45 | struct yeah *yeah = inet_csk_ca(sk); | 45 | struct yeah *yeah = inet_csk_ca(sk); |
46 | 46 | ||
47 | tcp_vegas_init(sk); | 47 | tcp_vegas_init(sk); |
48 | 48 | ||
49 | yeah->doing_reno_now = 0; | 49 | yeah->doing_reno_now = 0; |
50 | yeah->lastQ = 0; | 50 | yeah->lastQ = 0; |
51 | 51 | ||
52 | yeah->reno_count = 2; | 52 | yeah->reno_count = 2; |
53 | 53 | ||
54 | /* Ensure the MD arithmetic works. This is somewhat pedantic, | 54 | /* Ensure the MD arithmetic works. This is somewhat pedantic, |
55 | * since I don't think we will see a cwnd this large. :) */ | 55 | * since I don't think we will see a cwnd this large. :) */ |
56 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); | 56 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); |
57 | 57 | ||
58 | } | 58 | } |
59 | 59 | ||
60 | 60 | ||
61 | static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) | 61 | static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) |
62 | { | 62 | { |
63 | const struct inet_connection_sock *icsk = inet_csk(sk); | 63 | const struct inet_connection_sock *icsk = inet_csk(sk); |
64 | struct yeah *yeah = inet_csk_ca(sk); | 64 | struct yeah *yeah = inet_csk_ca(sk); |
65 | 65 | ||
66 | if (icsk->icsk_ca_state == TCP_CA_Open) | 66 | if (icsk->icsk_ca_state == TCP_CA_Open) |
67 | yeah->pkts_acked = pkts_acked; | 67 | yeah->pkts_acked = pkts_acked; |
68 | 68 | ||
69 | tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); | 69 | tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); |
70 | } | 70 | } |
71 | 71 | ||
72 | static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | 72 | static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) |
73 | { | 73 | { |
74 | struct tcp_sock *tp = tcp_sk(sk); | 74 | struct tcp_sock *tp = tcp_sk(sk); |
75 | struct yeah *yeah = inet_csk_ca(sk); | 75 | struct yeah *yeah = inet_csk_ca(sk); |
76 | 76 | ||
77 | if (!tcp_is_cwnd_limited(sk, in_flight)) | 77 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
78 | return; | 78 | return; |
79 | 79 | ||
80 | if (tp->snd_cwnd <= tp->snd_ssthresh) | 80 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
81 | tcp_slow_start(tp); | 81 | tcp_slow_start(tp); |
82 | 82 | ||
83 | else if (!yeah->doing_reno_now) { | 83 | else if (!yeah->doing_reno_now) { |
84 | /* Scalable */ | 84 | /* Scalable */ |
85 | 85 | ||
86 | tp->snd_cwnd_cnt+=yeah->pkts_acked; | 86 | tp->snd_cwnd_cnt += yeah->pkts_acked; |
87 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | 87 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ |
88 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 88 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
89 | tp->snd_cwnd++; | 89 | tp->snd_cwnd++; |
90 | tp->snd_cwnd_cnt = 0; | 90 | tp->snd_cwnd_cnt = 0; |
91 | } | 91 | } |
92 | 92 | ||
93 | yeah->pkts_acked = 1; | 93 | yeah->pkts_acked = 1; |
94 | 94 | ||
95 | } else { | 95 | } else { |
96 | /* Reno */ | 96 | /* Reno */ |
97 | 97 | ||
98 | if (tp->snd_cwnd_cnt < tp->snd_cwnd) | 98 | if (tp->snd_cwnd_cnt < tp->snd_cwnd) |
99 | tp->snd_cwnd_cnt++; | 99 | tp->snd_cwnd_cnt++; |
100 | 100 | ||
101 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | 101 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { |
102 | tp->snd_cwnd++; | 102 | tp->snd_cwnd++; |
103 | tp->snd_cwnd_cnt = 0; | 103 | tp->snd_cwnd_cnt = 0; |
104 | } | 104 | } |
105 | } | 105 | } |
106 | 106 | ||
107 | /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. | 107 | /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. |
108 | * | 108 | * |
109 | * These are so named because they represent the approximate values | 109 | * These are so named because they represent the approximate values |
110 | * of snd_una and snd_nxt at the beginning of the current RTT. More | 110 | * of snd_una and snd_nxt at the beginning of the current RTT. More |
111 | * precisely, they represent the amount of data sent during the RTT. | 111 | * precisely, they represent the amount of data sent during the RTT. |
112 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | 112 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, |
113 | * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding | 113 | * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding |
114 | * bytes of data have been ACKed during the course of the RTT, giving | 114 | * bytes of data have been ACKed during the course of the RTT, giving |
115 | * an "actual" rate of: | 115 | * an "actual" rate of: |
116 | * | 116 | * |
117 | * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) | 117 | * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) |
118 | * | 118 | * |
119 | * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, | 119 | * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, |
120 | * because delayed ACKs can cover more than one segment, so they | 120 | * because delayed ACKs can cover more than one segment, so they |
121 | * don't line up exactly with the boundaries of RTTs. | 121 | * don't line up exactly with the boundaries of RTTs. |
122 | * | 122 | * |
123 | * Another unfortunate fact of life is that delayed ACKs delay the | 123 | * Another unfortunate fact of life is that delayed ACKs delay the |
124 | * advance of the left edge of our send window, so that the number | 124 | * advance of the left edge of our send window, so that the number |
125 | * of bytes we send in an RTT is often less than our cwnd will allow. | 125 | * of bytes we send in an RTT is often less than our cwnd will allow. |
126 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | 126 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. |
127 | */ | 127 | */ |
128 | 128 | ||
129 | if (after(ack, yeah->vegas.beg_snd_nxt)) { | 129 | if (after(ack, yeah->vegas.beg_snd_nxt)) { |
130 | 130 | ||
131 | /* We do the Vegas calculations only if we got enough RTT | 131 | /* We do the Vegas calculations only if we got enough RTT |
132 | * samples that we can be reasonably sure that we got | 132 | * samples that we can be reasonably sure that we got |
133 | * at least one RTT sample that wasn't from a delayed ACK. | 133 | * at least one RTT sample that wasn't from a delayed ACK. |
134 | * If we only had 2 samples total, | 134 | * If we only had 2 samples total, |
135 | * then that means we're getting only 1 ACK per RTT, which | 135 | * then that means we're getting only 1 ACK per RTT, which |
136 | * means they're almost certainly delayed ACKs. | 136 | * means they're almost certainly delayed ACKs. |
137 | * If we have 3 samples, we should be OK. | 137 | * If we have 3 samples, we should be OK. |
138 | */ | 138 | */ |
139 | 139 | ||
140 | if (yeah->vegas.cntRTT > 2) { | 140 | if (yeah->vegas.cntRTT > 2) { |
141 | u32 rtt, queue; | 141 | u32 rtt, queue; |
142 | u64 bw; | 142 | u64 bw; |
143 | 143 | ||
144 | /* We have enough RTT samples, so, using the Vegas | 144 | /* We have enough RTT samples, so, using the Vegas |
145 | * algorithm, we determine if we should increase or | 145 | * algorithm, we determine if we should increase or |
146 | * decrease cwnd, and by how much. | 146 | * decrease cwnd, and by how much. |
147 | */ | 147 | */ |
148 | 148 | ||
149 | /* Pluck out the RTT we are using for the Vegas | 149 | /* Pluck out the RTT we are using for the Vegas |
150 | * calculations. This is the min RTT seen during the | 150 | * calculations. This is the min RTT seen during the |
151 | * last RTT. Taking the min filters out the effects | 151 | * last RTT. Taking the min filters out the effects |
152 | * of delayed ACKs, at the cost of noticing congestion | 152 | * of delayed ACKs, at the cost of noticing congestion |
153 | * a bit later. | 153 | * a bit later. |
154 | */ | 154 | */ |
155 | rtt = yeah->vegas.minRTT; | 155 | rtt = yeah->vegas.minRTT; |
156 | 156 | ||
157 | /* Compute excess number of packets above bandwidth | 157 | /* Compute excess number of packets above bandwidth |
158 | * Avoid doing full 64 bit divide. | 158 | * Avoid doing full 64 bit divide. |
159 | */ | 159 | */ |
160 | bw = tp->snd_cwnd; | 160 | bw = tp->snd_cwnd; |
161 | bw *= rtt - yeah->vegas.baseRTT; | 161 | bw *= rtt - yeah->vegas.baseRTT; |
162 | do_div(bw, rtt); | 162 | do_div(bw, rtt); |
163 | queue = bw; | 163 | queue = bw; |
164 | 164 | ||
165 | if (queue > TCP_YEAH_ALPHA || | 165 | if (queue > TCP_YEAH_ALPHA || |
166 | rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { | 166 | rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { |
167 | if (queue > TCP_YEAH_ALPHA | 167 | if (queue > TCP_YEAH_ALPHA |
168 | && tp->snd_cwnd > yeah->reno_count) { | 168 | && tp->snd_cwnd > yeah->reno_count) { |
169 | u32 reduction = min(queue / TCP_YEAH_GAMMA , | 169 | u32 reduction = min(queue / TCP_YEAH_GAMMA , |
170 | tp->snd_cwnd >> TCP_YEAH_EPSILON); | 170 | tp->snd_cwnd >> TCP_YEAH_EPSILON); |
171 | 171 | ||
172 | tp->snd_cwnd -= reduction; | 172 | tp->snd_cwnd -= reduction; |
173 | 173 | ||
174 | tp->snd_cwnd = max(tp->snd_cwnd, | 174 | tp->snd_cwnd = max(tp->snd_cwnd, |
175 | yeah->reno_count); | 175 | yeah->reno_count); |
176 | 176 | ||
177 | tp->snd_ssthresh = tp->snd_cwnd; | 177 | tp->snd_ssthresh = tp->snd_cwnd; |
178 | } | 178 | } |
179 | 179 | ||
180 | if (yeah->reno_count <= 2) | 180 | if (yeah->reno_count <= 2) |
181 | yeah->reno_count = max(tp->snd_cwnd>>1, 2U); | 181 | yeah->reno_count = max(tp->snd_cwnd>>1, 2U); |
182 | else | 182 | else |
183 | yeah->reno_count++; | 183 | yeah->reno_count++; |
184 | 184 | ||
185 | yeah->doing_reno_now = min(yeah->doing_reno_now + 1, | 185 | yeah->doing_reno_now = min(yeah->doing_reno_now + 1, |
186 | 0xffffffU); | 186 | 0xffffffU); |
187 | } else { | 187 | } else { |
188 | yeah->fast_count++; | 188 | yeah->fast_count++; |
189 | 189 | ||
190 | if (yeah->fast_count > TCP_YEAH_ZETA) { | 190 | if (yeah->fast_count > TCP_YEAH_ZETA) { |
191 | yeah->reno_count = 2; | 191 | yeah->reno_count = 2; |
192 | yeah->fast_count = 0; | 192 | yeah->fast_count = 0; |
193 | } | 193 | } |
194 | 194 | ||
195 | yeah->doing_reno_now = 0; | 195 | yeah->doing_reno_now = 0; |
196 | } | 196 | } |
197 | 197 | ||
198 | yeah->lastQ = queue; | 198 | yeah->lastQ = queue; |
199 | 199 | ||
200 | } | 200 | } |
201 | 201 | ||
202 | /* Save the extent of the current window so we can use this | 202 | /* Save the extent of the current window so we can use this |
203 | * at the end of the next RTT. | 203 | * at the end of the next RTT. |
204 | */ | 204 | */ |
205 | yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; | 205 | yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; |
206 | yeah->vegas.beg_snd_nxt = tp->snd_nxt; | 206 | yeah->vegas.beg_snd_nxt = tp->snd_nxt; |
207 | yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; | 207 | yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; |
208 | 208 | ||
209 | /* Wipe the slate clean for the next RTT. */ | 209 | /* Wipe the slate clean for the next RTT. */ |
210 | yeah->vegas.cntRTT = 0; | 210 | yeah->vegas.cntRTT = 0; |
211 | yeah->vegas.minRTT = 0x7fffffff; | 211 | yeah->vegas.minRTT = 0x7fffffff; |
212 | } | 212 | } |
213 | } | 213 | } |
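
To make the backlog estimate above concrete, here is a worked example with invented numbers: snd_cwnd = 80 packets, minRTT = 110 ms over the last RTT and baseRTT = 100 ms give queue = 80 * (110 - 100) / 110, about 7 packets. That is well below TCP_YEAH_ALPHA (80), and the 10 ms of extra delay is below baseRTT / TCP_YEAH_PHY (12 in integer arithmetic), so this RTT bumps fast_count instead of triggering precautionary decongestion. A standalone sketch of the same arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    /* YeAH's bottleneck-queue estimate:
     * packets queued ~= cwnd * (rtt - base_rtt) / rtt.
     * Any consistent time unit works, since the units cancel. */
    static uint32_t yeah_queue_estimate(uint32_t cwnd, uint32_t rtt,
                                        uint32_t base_rtt)
    {
            uint64_t bw = (uint64_t)cwnd * (rtt - base_rtt);

            return (uint32_t)(bw / rtt);
    }

    int main(void)
    {
            /* Invented numbers: 80-packet window, 110 ms RTT vs 100 ms base. */
            printf("queue ~ %u packets\n",
                   (unsigned)yeah_queue_estimate(80, 110, 100));
            return 0;
    }
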
214 | 214 | ||
215 | static u32 tcp_yeah_ssthresh(struct sock *sk) { | 215 | static u32 tcp_yeah_ssthresh(struct sock *sk) { |
216 | const struct tcp_sock *tp = tcp_sk(sk); | 216 | const struct tcp_sock *tp = tcp_sk(sk); |
217 | struct yeah *yeah = inet_csk_ca(sk); | 217 | struct yeah *yeah = inet_csk_ca(sk); |
218 | u32 reduction; | 218 | u32 reduction; |
219 | 219 | ||
220 | if (yeah->doing_reno_now < TCP_YEAH_RHO) { | 220 | if (yeah->doing_reno_now < TCP_YEAH_RHO) { |
221 | reduction = yeah->lastQ; | 221 | reduction = yeah->lastQ; |
222 | 222 | ||
223 | reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); | 223 | reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); |
224 | 224 | ||
225 | reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); | 225 | reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); |
226 | } else | 226 | } else |
227 | reduction = max(tp->snd_cwnd>>1,2U); | 227 | reduction = max(tp->snd_cwnd>>1, 2U); |
228 | 228 | ||
229 | yeah->fast_count = 0; | 229 | yeah->fast_count = 0; |
230 | yeah->reno_count = max(yeah->reno_count>>1, 2U); | 230 | yeah->reno_count = max(yeah->reno_count>>1, 2U); |
231 | 231 | ||
232 | return tp->snd_cwnd - reduction; | 232 | return tp->snd_cwnd - reduction; |
233 | } | 233 | } |
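
A quick trace of the threshold above with invented numbers: while doing_reno_now is still below TCP_YEAH_RHO (16), a window of snd_cwnd = 100 with lastQ = 30 gives reduction = max(min(30, max(50, 2)), 100 >> TCP_YEAH_DELTA) = max(30, 12) = 30, so the new ssthresh is 70. Once Reno mode has persisted for RHO or more RTTs, the fallback simply halves the window, giving ssthresh = 50.
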
234 | 234 | ||
235 | static struct tcp_congestion_ops tcp_yeah = { | 235 | static struct tcp_congestion_ops tcp_yeah = { |
236 | .flags = TCP_CONG_RTT_STAMP, | 236 | .flags = TCP_CONG_RTT_STAMP, |
237 | .init = tcp_yeah_init, | 237 | .init = tcp_yeah_init, |
238 | .ssthresh = tcp_yeah_ssthresh, | 238 | .ssthresh = tcp_yeah_ssthresh, |
239 | .cong_avoid = tcp_yeah_cong_avoid, | 239 | .cong_avoid = tcp_yeah_cong_avoid, |
240 | .min_cwnd = tcp_reno_min_cwnd, | 240 | .min_cwnd = tcp_reno_min_cwnd, |
241 | .set_state = tcp_vegas_state, | 241 | .set_state = tcp_vegas_state, |
242 | .cwnd_event = tcp_vegas_cwnd_event, | 242 | .cwnd_event = tcp_vegas_cwnd_event, |
243 | .get_info = tcp_vegas_get_info, | 243 | .get_info = tcp_vegas_get_info, |
244 | .pkts_acked = tcp_yeah_pkts_acked, | 244 | .pkts_acked = tcp_yeah_pkts_acked, |
245 | 245 | ||
246 | .owner = THIS_MODULE, | 246 | .owner = THIS_MODULE, |
247 | .name = "yeah", | 247 | .name = "yeah", |
248 | }; | 248 | }; |
249 | 249 | ||
250 | static int __init tcp_yeah_register(void) | 250 | static int __init tcp_yeah_register(void) |
251 | { | 251 | { |
252 | BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); | 252 | BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); |
253 | tcp_register_congestion_control(&tcp_yeah); | 253 | tcp_register_congestion_control(&tcp_yeah); |
254 | return 0; | 254 | return 0; |
255 | } | 255 | } |
256 | 256 | ||
257 | static void __exit tcp_yeah_unregister(void) | 257 | static void __exit tcp_yeah_unregister(void) |
258 | { | 258 | { |
259 | tcp_unregister_congestion_control(&tcp_yeah); | 259 | tcp_unregister_congestion_control(&tcp_yeah); |
260 | } | 260 | } |
261 | 261 | ||
262 | module_init(tcp_yeah_register); | 262 | module_init(tcp_yeah_register); |
263 | module_exit(tcp_yeah_unregister); | 263 | module_exit(tcp_yeah_unregister); |
264 | 264 | ||
265 | MODULE_AUTHOR("Angelo P. Castellani"); | 265 | MODULE_AUTHOR("Angelo P. Castellani"); |
266 | MODULE_LICENSE("GPL"); | 266 | MODULE_LICENSE("GPL"); |
267 | MODULE_DESCRIPTION("YeAH TCP"); | 267 | MODULE_DESCRIPTION("YeAH TCP"); |
268 | 268 |
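
Once this module is registered, the algorithm can be selected system-wide through the net.ipv4.tcp_congestion_control sysctl or per connection with the TCP_CONGESTION socket option. A userspace sketch (the function name and lack of error handling are illustrative only):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Run this connection under the "yeah" congestion control module
     * (requires tcp_yeah to be built in or loaded). */
    static int use_yeah(int sock_fd)
    {
            const char name[] = "yeah";

            return setsockopt(sock_fd, IPPROTO_TCP, TCP_CONGESTION,
                              name, strlen(name));
    }
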
net/ipv4/xfrm4_policy.c
1 | /* | 1 | /* |
2 | * xfrm4_policy.c | 2 | * xfrm4_policy.c |
3 | * | 3 | * |
4 | * Changes: | 4 | * Changes: |
5 | * Kazunori MIYAZAWA @USAGI | 5 | * Kazunori MIYAZAWA @USAGI |
6 | * YOSHIFUJI Hideaki @USAGI | 6 | * YOSHIFUJI Hideaki @USAGI |
7 | * Split up af-specific portion | 7 | * Split up af-specific portion |
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/err.h> | 11 | #include <linux/err.h> |
12 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
13 | #include <linux/inetdevice.h> | 13 | #include <linux/inetdevice.h> |
14 | #include <net/dst.h> | 14 | #include <net/dst.h> |
15 | #include <net/xfrm.h> | 15 | #include <net/xfrm.h> |
16 | #include <net/ip.h> | 16 | #include <net/ip.h> |
17 | 17 | ||
18 | static struct dst_ops xfrm4_dst_ops; | 18 | static struct dst_ops xfrm4_dst_ops; |
19 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; | 19 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; |
20 | 20 | ||
21 | static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, | 21 | static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, |
22 | xfrm_address_t *daddr) | 22 | xfrm_address_t *daddr) |
23 | { | 23 | { |
24 | struct flowi fl = { | 24 | struct flowi fl = { |
25 | .nl_u = { | 25 | .nl_u = { |
26 | .ip4_u = { | 26 | .ip4_u = { |
27 | .tos = tos, | 27 | .tos = tos, |
28 | .daddr = daddr->a4, | 28 | .daddr = daddr->a4, |
29 | }, | 29 | }, |
30 | }, | 30 | }, |
31 | }; | 31 | }; |
32 | struct dst_entry *dst; | 32 | struct dst_entry *dst; |
33 | struct rtable *rt; | 33 | struct rtable *rt; |
34 | int err; | 34 | int err; |
35 | 35 | ||
36 | if (saddr) | 36 | if (saddr) |
37 | fl.fl4_src = saddr->a4; | 37 | fl.fl4_src = saddr->a4; |
38 | 38 | ||
39 | err = __ip_route_output_key(&init_net, &rt, &fl); | 39 | err = __ip_route_output_key(&init_net, &rt, &fl); |
40 | dst = &rt->u.dst; | 40 | dst = &rt->u.dst; |
41 | if (err) | 41 | if (err) |
42 | dst = ERR_PTR(err); | 42 | dst = ERR_PTR(err); |
43 | return dst; | 43 | return dst; |
44 | } | 44 | } |
45 | 45 | ||
46 | static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) | 46 | static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) |
47 | { | 47 | { |
48 | struct dst_entry *dst; | 48 | struct dst_entry *dst; |
49 | struct rtable *rt; | 49 | struct rtable *rt; |
50 | 50 | ||
51 | dst = xfrm4_dst_lookup(0, NULL, daddr); | 51 | dst = xfrm4_dst_lookup(0, NULL, daddr); |
52 | if (IS_ERR(dst)) | 52 | if (IS_ERR(dst)) |
53 | return -EHOSTUNREACH; | 53 | return -EHOSTUNREACH; |
54 | 54 | ||
55 | rt = (struct rtable *)dst; | 55 | rt = (struct rtable *)dst; |
56 | saddr->a4 = rt->rt_src; | 56 | saddr->a4 = rt->rt_src; |
57 | dst_release(dst); | 57 | dst_release(dst); |
58 | return 0; | 58 | return 0; |
59 | } | 59 | } |
60 | 60 | ||
61 | static struct dst_entry * | 61 | static struct dst_entry * |
62 | __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) | 62 | __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) |
63 | { | 63 | { |
64 | struct dst_entry *dst; | 64 | struct dst_entry *dst; |
65 | 65 | ||
66 | read_lock_bh(&policy->lock); | 66 | read_lock_bh(&policy->lock); |
67 | for (dst = policy->bundles; dst; dst = dst->next) { | 67 | for (dst = policy->bundles; dst; dst = dst->next) { |
68 | struct xfrm_dst *xdst = (struct xfrm_dst*)dst; | 68 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
69 | if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ | 69 | if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ |
70 | xdst->u.rt.fl.fl4_dst == fl->fl4_dst && | 70 | xdst->u.rt.fl.fl4_dst == fl->fl4_dst && |
71 | xdst->u.rt.fl.fl4_src == fl->fl4_src && | 71 | xdst->u.rt.fl.fl4_src == fl->fl4_src && |
72 | xdst->u.rt.fl.fl4_tos == fl->fl4_tos && | 72 | xdst->u.rt.fl.fl4_tos == fl->fl4_tos && |
73 | xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { | 73 | xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { |
74 | dst_clone(dst); | 74 | dst_clone(dst); |
75 | break; | 75 | break; |
76 | } | 76 | } |
77 | } | 77 | } |
78 | read_unlock_bh(&policy->lock); | 78 | read_unlock_bh(&policy->lock); |
79 | return dst; | 79 | return dst; |
80 | } | 80 | } |
81 | 81 | ||
82 | static int xfrm4_get_tos(struct flowi *fl) | 82 | static int xfrm4_get_tos(struct flowi *fl) |
83 | { | 83 | { |
84 | return fl->fl4_tos; | 84 | return fl->fl4_tos; |
85 | } | 85 | } |
86 | 86 | ||
87 | static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, | 87 | static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, |
88 | int nfheader_len) | 88 | int nfheader_len) |
89 | { | 89 | { |
90 | return 0; | 90 | return 0; |
91 | } | 91 | } |
92 | 92 | ||
93 | static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) | 93 | static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) |
94 | { | 94 | { |
95 | struct rtable *rt = (struct rtable *)xdst->route; | 95 | struct rtable *rt = (struct rtable *)xdst->route; |
96 | 96 | ||
97 | xdst->u.rt.fl = rt->fl; | 97 | xdst->u.rt.fl = rt->fl; |
98 | 98 | ||
99 | xdst->u.dst.dev = dev; | 99 | xdst->u.dst.dev = dev; |
100 | dev_hold(dev); | 100 | dev_hold(dev); |
101 | 101 | ||
102 | xdst->u.rt.idev = in_dev_get(dev); | 102 | xdst->u.rt.idev = in_dev_get(dev); |
103 | if (!xdst->u.rt.idev) | 103 | if (!xdst->u.rt.idev) |
104 | return -ENODEV; | 104 | return -ENODEV; |
105 | 105 | ||
106 | xdst->u.rt.peer = rt->peer; | 106 | xdst->u.rt.peer = rt->peer; |
107 | if (rt->peer) | 107 | if (rt->peer) |
108 | atomic_inc(&rt->peer->refcnt); | 108 | atomic_inc(&rt->peer->refcnt); |
109 | 109 | ||
110 | /* Sheit... I remember I did this right. Apparently, | 110 | /* Sheit... I remember I did this right. Apparently, |
111 | * it was magically lost, so this code needs audit */ | 111 | * it was magically lost, so this code needs audit */ |
112 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | | 112 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | |
113 | RTCF_LOCAL); | 113 | RTCF_LOCAL); |
114 | xdst->u.rt.rt_type = rt->rt_type; | 114 | xdst->u.rt.rt_type = rt->rt_type; |
115 | xdst->u.rt.rt_src = rt->rt_src; | 115 | xdst->u.rt.rt_src = rt->rt_src; |
116 | xdst->u.rt.rt_dst = rt->rt_dst; | 116 | xdst->u.rt.rt_dst = rt->rt_dst; |
117 | xdst->u.rt.rt_gateway = rt->rt_gateway; | 117 | xdst->u.rt.rt_gateway = rt->rt_gateway; |
118 | xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; | 118 | xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; |
119 | 119 | ||
120 | return 0; | 120 | return 0; |
121 | } | 121 | } |
122 | 122 | ||
123 | static void | 123 | static void |
124 | _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | 124 | _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) |
125 | { | 125 | { |
126 | struct iphdr *iph = ip_hdr(skb); | 126 | struct iphdr *iph = ip_hdr(skb); |
127 | u8 *xprth = skb_network_header(skb) + iph->ihl * 4; | 127 | u8 *xprth = skb_network_header(skb) + iph->ihl * 4; |
128 | 128 | ||
129 | memset(fl, 0, sizeof(struct flowi)); | 129 | memset(fl, 0, sizeof(struct flowi)); |
130 | if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { | 130 | if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { |
131 | switch (iph->protocol) { | 131 | switch (iph->protocol) { |
132 | case IPPROTO_UDP: | 132 | case IPPROTO_UDP: |
133 | case IPPROTO_UDPLITE: | 133 | case IPPROTO_UDPLITE: |
134 | case IPPROTO_TCP: | 134 | case IPPROTO_TCP: |
135 | case IPPROTO_SCTP: | 135 | case IPPROTO_SCTP: |
136 | case IPPROTO_DCCP: | 136 | case IPPROTO_DCCP: |
137 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 137 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
138 | __be16 *ports = (__be16 *)xprth; | 138 | __be16 *ports = (__be16 *)xprth; |
139 | 139 | ||
140 | fl->fl_ip_sport = ports[!!reverse]; | 140 | fl->fl_ip_sport = ports[!!reverse]; |
141 | fl->fl_ip_dport = ports[!reverse]; | 141 | fl->fl_ip_dport = ports[!reverse]; |
142 | } | 142 | } |
143 | break; | 143 | break; |
144 | 144 | ||
145 | case IPPROTO_ICMP: | 145 | case IPPROTO_ICMP: |
146 | if (pskb_may_pull(skb, xprth + 2 - skb->data)) { | 146 | if (pskb_may_pull(skb, xprth + 2 - skb->data)) { |
147 | u8 *icmp = xprth; | 147 | u8 *icmp = xprth; |
148 | 148 | ||
149 | fl->fl_icmp_type = icmp[0]; | 149 | fl->fl_icmp_type = icmp[0]; |
150 | fl->fl_icmp_code = icmp[1]; | 150 | fl->fl_icmp_code = icmp[1]; |
151 | } | 151 | } |
152 | break; | 152 | break; |
153 | 153 | ||
154 | case IPPROTO_ESP: | 154 | case IPPROTO_ESP: |
155 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 155 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
156 | __be32 *ehdr = (__be32 *)xprth; | 156 | __be32 *ehdr = (__be32 *)xprth; |
157 | 157 | ||
158 | fl->fl_ipsec_spi = ehdr[0]; | 158 | fl->fl_ipsec_spi = ehdr[0]; |
159 | } | 159 | } |
160 | break; | 160 | break; |
161 | 161 | ||
162 | case IPPROTO_AH: | 162 | case IPPROTO_AH: |
163 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { | 163 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { |
164 | __be32 *ah_hdr = (__be32*)xprth; | 164 | __be32 *ah_hdr = (__be32*)xprth; |
165 | 165 | ||
166 | fl->fl_ipsec_spi = ah_hdr[1]; | 166 | fl->fl_ipsec_spi = ah_hdr[1]; |
167 | } | 167 | } |
168 | break; | 168 | break; |
169 | 169 | ||
170 | case IPPROTO_COMP: | 170 | case IPPROTO_COMP: |
171 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { | 171 | if (pskb_may_pull(skb, xprth + 4 - skb->data)) { |
172 | __be16 *ipcomp_hdr = (__be16 *)xprth; | 172 | __be16 *ipcomp_hdr = (__be16 *)xprth; |
173 | 173 | ||
174 | fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); | 174 | fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); |
175 | } | 175 | } |
176 | break; | 176 | break; |
177 | default: | 177 | default: |
178 | fl->fl_ipsec_spi = 0; | 178 | fl->fl_ipsec_spi = 0; |
179 | break; | 179 | break; |
180 | } | 180 | } |
181 | } | 181 | } |
182 | fl->proto = iph->protocol; | 182 | fl->proto = iph->protocol; |
183 | fl->fl4_dst = reverse ? iph->saddr : iph->daddr; | 183 | fl->fl4_dst = reverse ? iph->saddr : iph->daddr; |
184 | fl->fl4_src = reverse ? iph->daddr : iph->saddr; | 184 | fl->fl4_src = reverse ? iph->daddr : iph->saddr; |
185 | fl->fl4_tos = iph->tos; | 185 | fl->fl4_tos = iph->tos; |
186 | } | 186 | } |
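
The ports[!!reverse] / ports[!reverse] indexing above works because every transport protocol listed stores the source port and then the destination port as the first two __be16 fields of its header: with reverse == 0 the selector reads ports[0] as the source and ports[1] as the destination, and any non-zero reverse swaps them. A standalone illustration of the same trick (struct and variable names are invented):

    #include <stdint.h>
    #include <stdio.h>

    /* First two 16-bit fields of TCP/UDP/UDP-Lite/SCTP/DCCP headers. */
    struct ports_hdr {
            uint16_t sport;
            uint16_t dport;
    };

    static void pick_ports(const struct ports_hdr *hdr, int reverse,
                           uint16_t *flow_sport, uint16_t *flow_dport)
    {
            const uint16_t *p = (const uint16_t *)hdr;

            /* !!reverse is 0 or 1, so the two lookups swap when reversed. */
            *flow_sport = p[!!reverse];
            *flow_dport = p[!reverse];
    }

    int main(void)
    {
            struct ports_hdr hdr = { .sport = 12345, .dport = 80 };
            uint16_t s, d;

            pick_ports(&hdr, 1, &s, &d);    /* reverse-direction flow */
            printf("sport=%u dport=%u\n", (unsigned)s, (unsigned)d); /* 80, 12345 */
            return 0;
    }
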
187 | 187 | ||
188 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) | 188 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) |
189 | { | 189 | { |
190 | xfrm4_policy_afinfo.garbage_collect(); | 190 | xfrm4_policy_afinfo.garbage_collect(); |
191 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); | 191 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); |
192 | } | 192 | } |
193 | 193 | ||
194 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) | 194 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) |
195 | { | 195 | { |
196 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | 196 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
197 | struct dst_entry *path = xdst->route; | 197 | struct dst_entry *path = xdst->route; |
198 | 198 | ||
199 | path->ops->update_pmtu(path, mtu); | 199 | path->ops->update_pmtu(path, mtu); |
200 | } | 200 | } |
201 | 201 | ||
202 | static void xfrm4_dst_destroy(struct dst_entry *dst) | 202 | static void xfrm4_dst_destroy(struct dst_entry *dst) |
203 | { | 203 | { |
204 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | 204 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
205 | 205 | ||
206 | if (likely(xdst->u.rt.idev)) | 206 | if (likely(xdst->u.rt.idev)) |
207 | in_dev_put(xdst->u.rt.idev); | 207 | in_dev_put(xdst->u.rt.idev); |
208 | if (likely(xdst->u.rt.peer)) | 208 | if (likely(xdst->u.rt.peer)) |
209 | inet_putpeer(xdst->u.rt.peer); | 209 | inet_putpeer(xdst->u.rt.peer); |
210 | xfrm_dst_destroy(xdst); | 210 | xfrm_dst_destroy(xdst); |
211 | } | 211 | } |
212 | 212 | ||
213 | static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | 213 | static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, |
214 | int unregister) | 214 | int unregister) |
215 | { | 215 | { |
216 | struct xfrm_dst *xdst; | 216 | struct xfrm_dst *xdst; |
217 | 217 | ||
218 | if (!unregister) | 218 | if (!unregister) |
219 | return; | 219 | return; |
220 | 220 | ||
221 | xdst = (struct xfrm_dst *)dst; | 221 | xdst = (struct xfrm_dst *)dst; |
222 | if (xdst->u.rt.idev->dev == dev) { | 222 | if (xdst->u.rt.idev->dev == dev) { |
223 | struct in_device *loopback_idev = | 223 | struct in_device *loopback_idev = |
224 | in_dev_get(dev_net(dev)->loopback_dev); | 224 | in_dev_get(dev_net(dev)->loopback_dev); |
225 | BUG_ON(!loopback_idev); | 225 | BUG_ON(!loopback_idev); |
226 | 226 | ||
227 | do { | 227 | do { |
228 | in_dev_put(xdst->u.rt.idev); | 228 | in_dev_put(xdst->u.rt.idev); |
229 | xdst->u.rt.idev = loopback_idev; | 229 | xdst->u.rt.idev = loopback_idev; |
230 | in_dev_hold(loopback_idev); | 230 | in_dev_hold(loopback_idev); |
231 | xdst = (struct xfrm_dst *)xdst->u.dst.child; | 231 | xdst = (struct xfrm_dst *)xdst->u.dst.child; |
232 | } while (xdst->u.dst.xfrm); | 232 | } while (xdst->u.dst.xfrm); |
233 | 233 | ||
234 | __in_dev_put(loopback_idev); | 234 | __in_dev_put(loopback_idev); |
235 | } | 235 | } |
236 | 236 | ||
237 | xfrm_dst_ifdown(dst, dev); | 237 | xfrm_dst_ifdown(dst, dev); |
238 | } | 238 | } |
239 | 239 | ||
240 | static struct dst_ops xfrm4_dst_ops = { | 240 | static struct dst_ops xfrm4_dst_ops = { |
241 | .family = AF_INET, | 241 | .family = AF_INET, |
242 | .protocol = __constant_htons(ETH_P_IP), | 242 | .protocol = __constant_htons(ETH_P_IP), |
243 | .gc = xfrm4_garbage_collect, | 243 | .gc = xfrm4_garbage_collect, |
244 | .update_pmtu = xfrm4_update_pmtu, | 244 | .update_pmtu = xfrm4_update_pmtu, |
245 | .destroy = xfrm4_dst_destroy, | 245 | .destroy = xfrm4_dst_destroy, |
246 | .ifdown = xfrm4_dst_ifdown, | 246 | .ifdown = xfrm4_dst_ifdown, |
247 | .local_out = __ip_local_out, | 247 | .local_out = __ip_local_out, |
248 | .gc_thresh = 1024, | 248 | .gc_thresh = 1024, |
249 | .entry_size = sizeof(struct xfrm_dst), | 249 | .entry_size = sizeof(struct xfrm_dst), |
250 | .entries = ATOMIC_INIT(0), | 250 | .entries = ATOMIC_INIT(0), |
251 | }; | 251 | }; |
252 | 252 | ||
253 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { | 253 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |
254 | .family = AF_INET, | 254 | .family = AF_INET, |
255 | .dst_ops = &xfrm4_dst_ops, | 255 | .dst_ops = &xfrm4_dst_ops, |
256 | .dst_lookup = xfrm4_dst_lookup, | 256 | .dst_lookup = xfrm4_dst_lookup, |
257 | .get_saddr = xfrm4_get_saddr, | 257 | .get_saddr = xfrm4_get_saddr, |
258 | .find_bundle = __xfrm4_find_bundle, | 258 | .find_bundle = __xfrm4_find_bundle, |
259 | .decode_session = _decode_session4, | 259 | .decode_session = _decode_session4, |
260 | .get_tos = xfrm4_get_tos, | 260 | .get_tos = xfrm4_get_tos, |
261 | .init_path = xfrm4_init_path, | 261 | .init_path = xfrm4_init_path, |
262 | .fill_dst = xfrm4_fill_dst, | 262 | .fill_dst = xfrm4_fill_dst, |
263 | }; | 263 | }; |
264 | 264 | ||
265 | static void __init xfrm4_policy_init(void) | 265 | static void __init xfrm4_policy_init(void) |
266 | { | 266 | { |
267 | xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); | 267 | xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); |
268 | } | 268 | } |
269 | 269 | ||
270 | static void __exit xfrm4_policy_fini(void) | 270 | static void __exit xfrm4_policy_fini(void) |
271 | { | 271 | { |
272 | xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); | 272 | xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); |
273 | } | 273 | } |
274 | 274 | ||
275 | void __init xfrm4_init(void) | 275 | void __init xfrm4_init(void) |
276 | { | 276 | { |
277 | xfrm4_state_init(); | 277 | xfrm4_state_init(); |
278 | xfrm4_policy_init(); | 278 | xfrm4_policy_init(); |
279 | } | 279 | } |
280 | 280 | ||
281 | 281 |