Commit 8e3bff96afa67369008153f3326fa5ce985cabab

Authored by Stephen Hemminger
Committed by David S. Miller
1 parent 22a9321614

net: more spelling fixes

Various spelling fixes in networking stack

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 5 changed files with 11 additions and 11 deletions

net/bridge/br_netlink.c
/*
 * Bridge netlink control interface
 *
 * Authors:
 * Stephen Hemminger <shemminger@osdl.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/etherdevice.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <uapi/linux/if_bridge.h>

#include "br_private.h"
#include "br_private_stp.h"

static inline size_t br_port_info_size(void)
{
        return nla_total_size(1)        /* IFLA_BRPORT_STATE */
                + nla_total_size(2)     /* IFLA_BRPORT_PRIORITY */
                + nla_total_size(4)     /* IFLA_BRPORT_COST */
                + nla_total_size(1)     /* IFLA_BRPORT_MODE */
                + nla_total_size(1)     /* IFLA_BRPORT_GUARD */
                + nla_total_size(1)     /* IFLA_BRPORT_PROTECT */
                + nla_total_size(1)     /* IFLA_BRPORT_FAST_LEAVE */
                + nla_total_size(1)     /* IFLA_BRPORT_LEARNING */
                + nla_total_size(1)     /* IFLA_BRPORT_UNICAST_FLOOD */
                + 0;
}

static inline size_t br_nlmsg_size(void)
{
        return NLMSG_ALIGN(sizeof(struct ifinfomsg))
                + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
                + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
                + nla_total_size(4) /* IFLA_MASTER */
                + nla_total_size(4) /* IFLA_MTU */
                + nla_total_size(4) /* IFLA_LINK */
                + nla_total_size(1) /* IFLA_OPERSTATE */
                + nla_total_size(br_port_info_size()); /* IFLA_PROTINFO */
}
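The two sizing helpers above compute a worst-case netlink message size: nla_total_size() is the attribute payload plus the 4-byte attribute header, rounded up to a 4-byte boundary, so the u8, u16 and u32 attributes each account for 8 bytes. A minimal user-space sketch of the same arithmetic, using only the NLA_ALIGN/NLA_HDRLEN macros from the uapi netlink header (illustrative only, not part of this commit):

/* Standalone sketch of the nla_total_size() arithmetic used above:
 * every attribute is NLA_HDRLEN (4 bytes) of header plus the payload,
 * rounded up to a 4-byte boundary.  Compiles as a normal user program.
 */
#include <stdio.h>
#include <linux/netlink.h>      /* NLA_HDRLEN, NLA_ALIGN */

static size_t total_size(int payload)
{
        return NLA_ALIGN(NLA_HDRLEN + payload); /* mirrors nla_total_size() */
}

int main(void)
{
        /* u8, u16 and u32 attributes all occupy 8 bytes on the wire */
        printf("u8  -> %zu\n", total_size(1));
        printf("u16 -> %zu\n", total_size(2));
        printf("u32 -> %zu\n", total_size(4));
        return 0;
}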

static int br_port_fill_attrs(struct sk_buff *skb,
                              const struct net_bridge_port *p)
{
        u8 mode = !!(p->flags & BR_HAIRPIN_MODE);

        if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
            nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) ||
            nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
            nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
            nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
            nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) ||
            nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
            nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
            nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)))
                return -EMSGSIZE;

        return 0;
}

/*
 * Create one netlink message for one interface
 * Contains port and master info as well as carrier and bridge state.
 */
static int br_fill_ifinfo(struct sk_buff *skb,
                          const struct net_bridge_port *port,
                          u32 pid, u32 seq, int event, unsigned int flags,
                          u32 filter_mask, const struct net_device *dev)
{
        const struct net_bridge *br;
        struct ifinfomsg *hdr;
        struct nlmsghdr *nlh;
        u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;

        if (port)
                br = port->br;
        else
                br = netdev_priv(dev);

        br_debug(br, "br_fill_info event %d port %s master %s\n",
                 event, dev->name, br->dev->name);

        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        hdr = nlmsg_data(nlh);
        hdr->ifi_family = AF_BRIDGE;
        hdr->__ifi_pad = 0;
        hdr->ifi_type = dev->type;
        hdr->ifi_index = dev->ifindex;
        hdr->ifi_flags = dev_get_flags(dev);
        hdr->ifi_change = 0;

        if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
            nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) ||
            nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
            nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
            (dev->addr_len &&
             nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
            (dev->ifindex != dev->iflink &&
             nla_put_u32(skb, IFLA_LINK, dev->iflink)))
                goto nla_put_failure;

        if (event == RTM_NEWLINK && port) {
                struct nlattr *nest
                        = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);

                if (nest == NULL || br_port_fill_attrs(skb, port) < 0)
                        goto nla_put_failure;
                nla_nest_end(skb, nest);
        }

        /* Check if the VID information is requested */
        if (filter_mask & RTEXT_FILTER_BRVLAN) {
                struct nlattr *af;
                const struct net_port_vlans *pv;
                struct bridge_vlan_info vinfo;
                u16 vid;
                u16 pvid;

                if (port)
                        pv = nbp_get_vlan_info(port);
                else
                        pv = br_get_vlan_info(br);

                if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID))
                        goto done;

                af = nla_nest_start(skb, IFLA_AF_SPEC);
                if (!af)
                        goto nla_put_failure;

                pvid = br_get_pvid(pv);
                for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
                        vinfo.vid = vid;
                        vinfo.flags = 0;
                        if (vid == pvid)
                                vinfo.flags |= BRIDGE_VLAN_INFO_PVID;

                        if (test_bit(vid, pv->untagged_bitmap))
                                vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED;

                        if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
                                    sizeof(vinfo), &vinfo))
                                goto nla_put_failure;
                }

                nla_nest_end(skb, af);
        }

done:
        return nlmsg_end(skb, nlh);

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Notify listeners of a change in port information
 */
void br_ifinfo_notify(int event, struct net_bridge_port *port)
{
        struct net *net;
        struct sk_buff *skb;
        int err = -ENOBUFS;

        if (!port)
                return;

        net = dev_net(port->dev);
        br_debug(port->br, "port %u(%s) event %d\n",
                 (unsigned int)port->port_no, port->dev->name, event);

        skb = nlmsg_new(br_nlmsg_size(), GFP_ATOMIC);
        if (skb == NULL)
                goto errout;

        err = br_fill_ifinfo(skb, port, 0, 0, event, 0, 0, port->dev);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in br_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}


/*
 * Dump information about all ports, in response to GETLINK
 */
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
               struct net_device *dev, u32 filter_mask)
{
        int err = 0;
        struct net_bridge_port *port = br_port_get_rtnl(dev);

        /* not a bridge port and */
        if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN))
                goto out;

        err = br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI,
                             filter_mask, dev);
out:
        return err;
}
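br_getlink() answers RTM_GETLINK dump requests whose ifinfomsg family is AF_BRIDGE; passing RTEXT_FILTER_BRVLAN in IFLA_EXT_MASK additionally asks for the per-port VLAN table built by br_fill_ifinfo(). The following user-space sketch shows one plausible way to issue such a dump over a raw NETLINK_ROUTE socket; the fixed request layout, the buffer size and the minimal error handling are assumptions for illustration, not part of this commit:

/* Hedged user-space sketch: dump AF_BRIDGE link info, which is served by
 * br_getlink() above for each bridge port.  Error handling is trimmed.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

int main(void)
{
        struct {
                struct nlmsghdr nlh;
                struct ifinfomsg ifm;
                struct rtattr ext_req;          /* IFLA_EXT_MASK */
                __u32 ext_filter_mask;
        } req;
        struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
        char buf[16384];
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0)
                return 1;

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len = sizeof(req);
        req.nlh.nlmsg_type = RTM_GETLINK;
        req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
        req.ifm.ifi_family = AF_BRIDGE;         /* route the dump to br_getlink() */
        req.ext_req.rta_type = IFLA_EXT_MASK;
        req.ext_req.rta_len = RTA_LENGTH(sizeof(__u32));
        req.ext_filter_mask = RTEXT_FILTER_BRVLAN; /* also ask for VLAN info */

        sendto(fd, &req, sizeof(req), 0, (struct sockaddr *)&sa, sizeof(sa));

        for (;;) {
                ssize_t len = recv(fd, buf, sizeof(buf), 0);
                struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

                if (len <= 0)
                        break;
                for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
                        if (nlh->nlmsg_type == NLMSG_DONE)
                                goto out;
                        if (nlh->nlmsg_type == RTM_NEWLINK) {
                                struct ifinfomsg *ifm = NLMSG_DATA(nlh);

                                /* walk IFLA_* attributes here if needed */
                                printf("ifindex %d\n", ifm->ifi_index);
                        }
                }
        }
out:
        close(fd);
        return 0;
}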

static const struct nla_policy ifla_br_policy[IFLA_MAX+1] = {
        [IFLA_BRIDGE_FLAGS]     = { .type = NLA_U16 },
        [IFLA_BRIDGE_MODE]      = { .type = NLA_U16 },
        [IFLA_BRIDGE_VLAN_INFO] = { .type = NLA_BINARY,
                                    .len = sizeof(struct bridge_vlan_info), },
};

static int br_afspec(struct net_bridge *br,
                     struct net_bridge_port *p,
                     struct nlattr *af_spec,
                     int cmd)
{
        struct nlattr *tb[IFLA_BRIDGE_MAX+1];
        int err = 0;

        err = nla_parse_nested(tb, IFLA_BRIDGE_MAX, af_spec, ifla_br_policy);
        if (err)
                return err;

        if (tb[IFLA_BRIDGE_VLAN_INFO]) {
                struct bridge_vlan_info *vinfo;

                vinfo = nla_data(tb[IFLA_BRIDGE_VLAN_INFO]);

                if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
                        return -EINVAL;

                switch (cmd) {
                case RTM_SETLINK:
                        if (p) {
                                err = nbp_vlan_add(p, vinfo->vid, vinfo->flags);
                                if (err)
                                        break;

                                if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER)
                                        err = br_vlan_add(p->br, vinfo->vid,
                                                          vinfo->flags);
                        } else
                                err = br_vlan_add(br, vinfo->vid, vinfo->flags);

                        if (err)
                                break;

                        break;

                case RTM_DELLINK:
                        if (p) {
                                nbp_vlan_delete(p, vinfo->vid);
                                if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER)
                                        br_vlan_delete(p->br, vinfo->vid);
                        } else
                                br_vlan_delete(br, vinfo->vid);
                        break;
                }
        }

        return err;
}
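For the RTM_SETLINK direction, br_afspec() expects an IFLA_AF_SPEC attribute carrying IFLA_BRIDGE_VLAN_INFO, i.e. a struct bridge_vlan_info. A hedged companion sketch that builds and sends such a request to add a VLAN on one port; the fixed-size message layout, the example ifindex and the flag choices are illustrative assumptions, not part of this commit:

/* Hedged sketch: add VLAN 100 as PVID/untagged on one bridge port via the
 * RTM_SETLINK path that ends up in br_afspec().  Error/ack handling trimmed.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <linux/if_bridge.h>

struct vlan_req {
        struct nlmsghdr nlh;
        struct ifinfomsg ifm;
        struct rtattr afspec;                   /* IFLA_AF_SPEC */
        struct rtattr vinfo_attr;               /* IFLA_BRIDGE_VLAN_INFO */
        struct bridge_vlan_info vinfo;
};

static void build_vlan_add(struct vlan_req *req, int port_ifindex, __u16 vid)
{
        memset(req, 0, sizeof(*req));
        req->nlh.nlmsg_len = sizeof(*req);
        req->nlh.nlmsg_type = RTM_SETLINK;
        req->nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        req->ifm.ifi_family = AF_BRIDGE;
        req->ifm.ifi_index = port_ifindex;

        req->afspec.rta_type = IFLA_AF_SPEC;
        req->afspec.rta_len = RTA_LENGTH(sizeof(req->vinfo_attr) +
                                         sizeof(req->vinfo));
        req->vinfo_attr.rta_type = IFLA_BRIDGE_VLAN_INFO;
        req->vinfo_attr.rta_len = RTA_LENGTH(sizeof(req->vinfo));
        req->vinfo.vid = vid;
        req->vinfo.flags = BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED;
}

int main(void)
{
        struct vlan_req req;
        struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0)
                return 1;
        build_vlan_add(&req, 4 /* example port ifindex */, 100);
        sendto(fd, &req, sizeof(req), 0, (struct sockaddr *)&sa, sizeof(sa));
        /* a real tool would read back the NLMSG_ERROR ack here */
        close(fd);
        return 0;
}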

static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = {
        [IFLA_BRPORT_STATE]     = { .type = NLA_U8 },
        [IFLA_BRPORT_COST]      = { .type = NLA_U32 },
        [IFLA_BRPORT_PRIORITY]  = { .type = NLA_U16 },
        [IFLA_BRPORT_MODE]      = { .type = NLA_U8 },
        [IFLA_BRPORT_GUARD]     = { .type = NLA_U8 },
        [IFLA_BRPORT_PROTECT]   = { .type = NLA_U8 },
        [IFLA_BRPORT_LEARNING]  = { .type = NLA_U8 },
        [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
};

/* Change the state of the port and notify spanning tree */
static int br_set_port_state(struct net_bridge_port *p, u8 state)
{
        if (state > BR_STATE_BLOCKING)
                return -EINVAL;

        /* if kernel STP is running, don't allow changes */
        if (p->br->stp_enabled == BR_KERNEL_STP)
                return -EBUSY;

        /* if device is not up, change is not allowed
         * if link is not present, only allowable state is disabled
         */
        if (!netif_running(p->dev) ||
            (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED))
                return -ENETDOWN;

        p->state = state;
        br_log_state(p);
        br_port_state_selection(p->br);
        return 0;
}

/* Set/clear or port flags based on attribute */
static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
                             int attrtype, unsigned long mask)
{
        if (tb[attrtype]) {
                u8 flag = nla_get_u8(tb[attrtype]);
                if (flag)
                        p->flags |= mask;
                else
                        p->flags &= ~mask;
        }
}

/* Process bridge protocol info on port */
static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
{
        int err;

        br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
        br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
        br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE);
        br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
        br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
        br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);

        if (tb[IFLA_BRPORT_COST]) {
                err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
                if (err)
                        return err;
        }

        if (tb[IFLA_BRPORT_PRIORITY]) {
                err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY]));
                if (err)
                        return err;
        }

        if (tb[IFLA_BRPORT_STATE]) {
                err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE]));
                if (err)
                        return err;
        }
        return 0;
}

/* Change state and parameters on port. */
int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
{
        struct nlattr *protinfo;
        struct nlattr *afspec;
        struct net_bridge_port *p;
        struct nlattr *tb[IFLA_BRPORT_MAX + 1];
        int err = 0;

        protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO);
        afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
        if (!protinfo && !afspec)
                return 0;

        p = br_port_get_rtnl(dev);
        /* We want to accept dev as bridge itself if the AF_SPEC
-        * is set to see if someone is setting vlan info on the brigde
+        * is set to see if someone is setting vlan info on the bridge
         */
        if (!p && !afspec)
                return -EINVAL;

        if (p && protinfo) {
                if (protinfo->nla_type & NLA_F_NESTED) {
                        err = nla_parse_nested(tb, IFLA_BRPORT_MAX,
                                               protinfo, ifla_brport_policy);
                        if (err)
                                return err;

                        spin_lock_bh(&p->br->lock);
                        err = br_setport(p, tb);
                        spin_unlock_bh(&p->br->lock);
                } else {
-                       /* Binary compatability with old RSTP */
+                       /* Binary compatibility with old RSTP */
                        if (nla_len(protinfo) < sizeof(u8))
                                return -EINVAL;

                        spin_lock_bh(&p->br->lock);
                        err = br_set_port_state(p, nla_get_u8(protinfo));
                        spin_unlock_bh(&p->br->lock);
                }
                if (err)
                        goto out;
        }

        if (afspec) {
                err = br_afspec((struct net_bridge *)netdev_priv(dev), p,
                                afspec, RTM_SETLINK);
        }

        if (err == 0)
                br_ifinfo_notify(RTM_NEWLINK, p);

out:
        return err;
}

/* Delete port information */
int br_dellink(struct net_device *dev, struct nlmsghdr *nlh)
{
        struct nlattr *afspec;
        struct net_bridge_port *p;
        int err;

        afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
        if (!afspec)
                return 0;

        p = br_port_get_rtnl(dev);
        /* We want to accept dev as bridge itself as well */
        if (!p && !(dev->priv_flags & IFF_EBRIDGE))
                return -EINVAL;

        err = br_afspec((struct net_bridge *)netdev_priv(dev), p,
                        afspec, RTM_DELLINK);

        return err;
}
static int br_validate(struct nlattr *tb[], struct nlattr *data[])
{
        if (tb[IFLA_ADDRESS]) {
                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
                        return -EINVAL;
                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
                        return -EADDRNOTAVAIL;
        }

        return 0;
}

static size_t br_get_link_af_size(const struct net_device *dev)
{
        struct net_port_vlans *pv;

        if (br_port_exists(dev))
                pv = nbp_get_vlan_info(br_port_get_rtnl(dev));
        else if (dev->priv_flags & IFF_EBRIDGE)
                pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev));
        else
                return 0;

        if (!pv)
                return 0;

        /* Each VLAN is returned in bridge_vlan_info along with flags */
        return pv->num_vlans * nla_total_size(sizeof(struct bridge_vlan_info));
}

static struct rtnl_af_ops br_af_ops = {
        .family                 = AF_BRIDGE,
        .get_link_af_size       = br_get_link_af_size,
};

struct rtnl_link_ops br_link_ops __read_mostly = {
        .kind           = "bridge",
        .priv_size      = sizeof(struct net_bridge),
        .setup          = br_dev_setup,
        .validate       = br_validate,
        .dellink        = br_dev_delete,
};

int __init br_netlink_init(void)
{
        int err;

        br_mdb_init();
        err = rtnl_af_register(&br_af_ops);
        if (err)
                goto out;

        err = rtnl_link_register(&br_link_ops);
        if (err)
                goto out_af;

        return 0;

out_af:
        rtnl_af_unregister(&br_af_ops);
out:
        br_mdb_uninit();
        return err;
}

void __exit br_netlink_fini(void)
{
        br_mdb_uninit();
        rtnl_af_unregister(&br_af_ops);
        rtnl_link_unregister(&br_link_ops);
}

net/core/net-sysfs.c
/*
 * net-sysfs.c - network device class and attributes
 *
 * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <net/sock.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/jiffies.h>
#include <linux/pm_runtime.h>

#include "net-sysfs.h"

#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
static const char fmt_udec[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";

static inline int dev_isalive(const struct net_device *dev)
{
        return dev->reg_state <= NETREG_REGISTERED;
}

/* use same locking rules as GIF* ioctl's */
static ssize_t netdev_show(const struct device *dev,
                           struct device_attribute *attr, char *buf,
                           ssize_t (*format)(const struct net_device *, char *))
{
        struct net_device *net = to_net_dev(dev);
        ssize_t ret = -EINVAL;

        read_lock(&dev_base_lock);
        if (dev_isalive(net))
                ret = (*format)(net, buf);
        read_unlock(&dev_base_lock);

        return ret;
}

/* generate a show function for simple field */
#define NETDEVICE_SHOW(field, format_string) \
static ssize_t format_##field(const struct net_device *net, char *buf) \
{ \
        return sprintf(buf, format_string, net->field); \
} \
static ssize_t field##_show(struct device *dev, \
                            struct device_attribute *attr, char *buf) \
{ \
        return netdev_show(dev, attr, buf, format_##field); \
} \

#define NETDEVICE_SHOW_RO(field, format_string) \
NETDEVICE_SHOW(field, format_string); \
static DEVICE_ATTR_RO(field)

#define NETDEVICE_SHOW_RW(field, format_string) \
NETDEVICE_SHOW(field, format_string); \
static DEVICE_ATTR_RW(field)
77 /* use same locking and permission rules as SIF* ioctl's */ 77 /* use same locking and permission rules as SIF* ioctl's */
78 static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, 78 static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
79 const char *buf, size_t len, 79 const char *buf, size_t len,
80 int (*set)(struct net_device *, unsigned long)) 80 int (*set)(struct net_device *, unsigned long))
81 { 81 {
82 struct net_device *netdev = to_net_dev(dev); 82 struct net_device *netdev = to_net_dev(dev);
83 struct net *net = dev_net(netdev); 83 struct net *net = dev_net(netdev);
84 unsigned long new; 84 unsigned long new;
85 int ret = -EINVAL; 85 int ret = -EINVAL;
86 86
87 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 87 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
88 return -EPERM; 88 return -EPERM;
89 89
90 ret = kstrtoul(buf, 0, &new); 90 ret = kstrtoul(buf, 0, &new);
91 if (ret) 91 if (ret)
92 goto err; 92 goto err;
93 93
94 if (!rtnl_trylock()) 94 if (!rtnl_trylock())
95 return restart_syscall(); 95 return restart_syscall();
96 96
97 if (dev_isalive(netdev)) { 97 if (dev_isalive(netdev)) {
98 if ((ret = (*set)(netdev, new)) == 0) 98 if ((ret = (*set)(netdev, new)) == 0)
99 ret = len; 99 ret = len;
100 } 100 }
101 rtnl_unlock(); 101 rtnl_unlock();
102 err: 102 err:
103 return ret; 103 return ret;
104 } 104 }
105 105
106 NETDEVICE_SHOW_RO(dev_id, fmt_hex); 106 NETDEVICE_SHOW_RO(dev_id, fmt_hex);
107 NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); 107 NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
108 NETDEVICE_SHOW_RO(addr_len, fmt_dec); 108 NETDEVICE_SHOW_RO(addr_len, fmt_dec);
109 NETDEVICE_SHOW_RO(iflink, fmt_dec); 109 NETDEVICE_SHOW_RO(iflink, fmt_dec);
110 NETDEVICE_SHOW_RO(ifindex, fmt_dec); 110 NETDEVICE_SHOW_RO(ifindex, fmt_dec);
111 NETDEVICE_SHOW_RO(type, fmt_dec); 111 NETDEVICE_SHOW_RO(type, fmt_dec);
112 NETDEVICE_SHOW_RO(link_mode, fmt_dec); 112 NETDEVICE_SHOW_RO(link_mode, fmt_dec);
113 113
114 /* use same locking rules as GIFHWADDR ioctl's */ 114 /* use same locking rules as GIFHWADDR ioctl's */
115 static ssize_t address_show(struct device *dev, struct device_attribute *attr, 115 static ssize_t address_show(struct device *dev, struct device_attribute *attr,
116 char *buf) 116 char *buf)
117 { 117 {
118 struct net_device *net = to_net_dev(dev); 118 struct net_device *net = to_net_dev(dev);
119 ssize_t ret = -EINVAL; 119 ssize_t ret = -EINVAL;
120 120
121 read_lock(&dev_base_lock); 121 read_lock(&dev_base_lock);
122 if (dev_isalive(net)) 122 if (dev_isalive(net))
123 ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); 123 ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len);
124 read_unlock(&dev_base_lock); 124 read_unlock(&dev_base_lock);
125 return ret; 125 return ret;
126 } 126 }
127 static DEVICE_ATTR_RO(address); 127 static DEVICE_ATTR_RO(address);
128 128
129 static ssize_t broadcast_show(struct device *dev, 129 static ssize_t broadcast_show(struct device *dev,
130 struct device_attribute *attr, char *buf) 130 struct device_attribute *attr, char *buf)
131 { 131 {
132 struct net_device *net = to_net_dev(dev); 132 struct net_device *net = to_net_dev(dev);
133 if (dev_isalive(net)) 133 if (dev_isalive(net))
134 return sysfs_format_mac(buf, net->broadcast, net->addr_len); 134 return sysfs_format_mac(buf, net->broadcast, net->addr_len);
135 return -EINVAL; 135 return -EINVAL;
136 } 136 }
137 static DEVICE_ATTR_RO(broadcast); 137 static DEVICE_ATTR_RO(broadcast);
138 138
139 static int change_carrier(struct net_device *net, unsigned long new_carrier) 139 static int change_carrier(struct net_device *net, unsigned long new_carrier)
140 { 140 {
141 if (!netif_running(net)) 141 if (!netif_running(net))
142 return -EINVAL; 142 return -EINVAL;
143 return dev_change_carrier(net, (bool) new_carrier); 143 return dev_change_carrier(net, (bool) new_carrier);
144 } 144 }
145 145
146 static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, 146 static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
147 const char *buf, size_t len) 147 const char *buf, size_t len)
148 { 148 {
149 return netdev_store(dev, attr, buf, len, change_carrier); 149 return netdev_store(dev, attr, buf, len, change_carrier);
150 } 150 }
151 151
152 static ssize_t carrier_show(struct device *dev, 152 static ssize_t carrier_show(struct device *dev,
153 struct device_attribute *attr, char *buf) 153 struct device_attribute *attr, char *buf)
154 { 154 {
155 struct net_device *netdev = to_net_dev(dev); 155 struct net_device *netdev = to_net_dev(dev);
156 if (netif_running(netdev)) { 156 if (netif_running(netdev)) {
157 return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); 157 return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
158 } 158 }
159 return -EINVAL; 159 return -EINVAL;
160 } 160 }
161 static DEVICE_ATTR_RW(carrier); 161 static DEVICE_ATTR_RW(carrier);
162 162
163 static ssize_t speed_show(struct device *dev, 163 static ssize_t speed_show(struct device *dev,
164 struct device_attribute *attr, char *buf) 164 struct device_attribute *attr, char *buf)
165 { 165 {
166 struct net_device *netdev = to_net_dev(dev); 166 struct net_device *netdev = to_net_dev(dev);
167 int ret = -EINVAL; 167 int ret = -EINVAL;
168 168
169 if (!rtnl_trylock()) 169 if (!rtnl_trylock())
170 return restart_syscall(); 170 return restart_syscall();
171 171
172 if (netif_running(netdev)) { 172 if (netif_running(netdev)) {
173 struct ethtool_cmd cmd; 173 struct ethtool_cmd cmd;
174 if (!__ethtool_get_settings(netdev, &cmd)) 174 if (!__ethtool_get_settings(netdev, &cmd))
175 ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); 175 ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
176 } 176 }
177 rtnl_unlock(); 177 rtnl_unlock();
178 return ret; 178 return ret;
179 } 179 }
180 static DEVICE_ATTR_RO(speed); 180 static DEVICE_ATTR_RO(speed);
181 181
182 static ssize_t duplex_show(struct device *dev, 182 static ssize_t duplex_show(struct device *dev,
183 struct device_attribute *attr, char *buf) 183 struct device_attribute *attr, char *buf)
184 { 184 {
185 struct net_device *netdev = to_net_dev(dev); 185 struct net_device *netdev = to_net_dev(dev);
186 int ret = -EINVAL; 186 int ret = -EINVAL;
187 187
188 if (!rtnl_trylock()) 188 if (!rtnl_trylock())
189 return restart_syscall(); 189 return restart_syscall();
190 190
191 if (netif_running(netdev)) { 191 if (netif_running(netdev)) {
192 struct ethtool_cmd cmd; 192 struct ethtool_cmd cmd;
193 if (!__ethtool_get_settings(netdev, &cmd)) { 193 if (!__ethtool_get_settings(netdev, &cmd)) {
194 const char *duplex; 194 const char *duplex;
195 switch (cmd.duplex) { 195 switch (cmd.duplex) {
196 case DUPLEX_HALF: 196 case DUPLEX_HALF:
197 duplex = "half"; 197 duplex = "half";
198 break; 198 break;
199 case DUPLEX_FULL: 199 case DUPLEX_FULL:
200 duplex = "full"; 200 duplex = "full";
201 break; 201 break;
202 default: 202 default:
203 duplex = "unknown"; 203 duplex = "unknown";
204 break; 204 break;
205 } 205 }
206 ret = sprintf(buf, "%s\n", duplex); 206 ret = sprintf(buf, "%s\n", duplex);
207 } 207 }
208 } 208 }
209 rtnl_unlock(); 209 rtnl_unlock();
210 return ret; 210 return ret;
211 } 211 }
212 static DEVICE_ATTR_RO(duplex); 212 static DEVICE_ATTR_RO(duplex);
213 213
214 static ssize_t dormant_show(struct device *dev, 214 static ssize_t dormant_show(struct device *dev,
215 struct device_attribute *attr, char *buf) 215 struct device_attribute *attr, char *buf)
216 { 216 {
217 struct net_device *netdev = to_net_dev(dev); 217 struct net_device *netdev = to_net_dev(dev);
218 218
219 if (netif_running(netdev)) 219 if (netif_running(netdev))
220 return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); 220 return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
221 221
222 return -EINVAL; 222 return -EINVAL;
223 } 223 }
224 static DEVICE_ATTR_RO(dormant); 224 static DEVICE_ATTR_RO(dormant);
225 225
226 static const char *const operstates[] = { 226 static const char *const operstates[] = {
227 "unknown", 227 "unknown",
228 "notpresent", /* currently unused */ 228 "notpresent", /* currently unused */
229 "down", 229 "down",
230 "lowerlayerdown", 230 "lowerlayerdown",
231 "testing", /* currently unused */ 231 "testing", /* currently unused */
232 "dormant", 232 "dormant",
233 "up" 233 "up"
234 }; 234 };
235 235
236 static ssize_t operstate_show(struct device *dev, 236 static ssize_t operstate_show(struct device *dev,
237 struct device_attribute *attr, char *buf) 237 struct device_attribute *attr, char *buf)
238 { 238 {
239 const struct net_device *netdev = to_net_dev(dev); 239 const struct net_device *netdev = to_net_dev(dev);
240 unsigned char operstate; 240 unsigned char operstate;
241 241
242 read_lock(&dev_base_lock); 242 read_lock(&dev_base_lock);
243 operstate = netdev->operstate; 243 operstate = netdev->operstate;
244 if (!netif_running(netdev)) 244 if (!netif_running(netdev))
245 operstate = IF_OPER_DOWN; 245 operstate = IF_OPER_DOWN;
246 read_unlock(&dev_base_lock); 246 read_unlock(&dev_base_lock);
247 247
248 if (operstate >= ARRAY_SIZE(operstates)) 248 if (operstate >= ARRAY_SIZE(operstates))
249 return -EINVAL; /* should not happen */ 249 return -EINVAL; /* should not happen */
250 250
251 return sprintf(buf, "%s\n", operstates[operstate]); 251 return sprintf(buf, "%s\n", operstates[operstate]);
252 } 252 }
253 static DEVICE_ATTR_RO(operstate); 253 static DEVICE_ATTR_RO(operstate);
254 254
255 /* read-write attributes */ 255 /* read-write attributes */
256 256
257 static int change_mtu(struct net_device *net, unsigned long new_mtu) 257 static int change_mtu(struct net_device *net, unsigned long new_mtu)
258 { 258 {
259 return dev_set_mtu(net, (int) new_mtu); 259 return dev_set_mtu(net, (int) new_mtu);
260 } 260 }
261 261
262 static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, 262 static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
263 const char *buf, size_t len) 263 const char *buf, size_t len)
264 { 264 {
265 return netdev_store(dev, attr, buf, len, change_mtu); 265 return netdev_store(dev, attr, buf, len, change_mtu);
266 } 266 }
267 NETDEVICE_SHOW_RW(mtu, fmt_dec); 267 NETDEVICE_SHOW_RW(mtu, fmt_dec);
268 268
269 static int change_flags(struct net_device *net, unsigned long new_flags) 269 static int change_flags(struct net_device *net, unsigned long new_flags)
270 { 270 {
271 return dev_change_flags(net, (unsigned int) new_flags); 271 return dev_change_flags(net, (unsigned int) new_flags);
272 } 272 }
273 273
274 static ssize_t flags_store(struct device *dev, struct device_attribute *attr, 274 static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
275 const char *buf, size_t len) 275 const char *buf, size_t len)
276 { 276 {
277 return netdev_store(dev, attr, buf, len, change_flags); 277 return netdev_store(dev, attr, buf, len, change_flags);
278 } 278 }
279 NETDEVICE_SHOW_RW(flags, fmt_hex); 279 NETDEVICE_SHOW_RW(flags, fmt_hex);
280 280
281 static int change_tx_queue_len(struct net_device *net, unsigned long new_len) 281 static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
282 { 282 {
283 net->tx_queue_len = new_len; 283 net->tx_queue_len = new_len;
284 return 0; 284 return 0;
285 } 285 }
286 286
287 static ssize_t tx_queue_len_store(struct device *dev, 287 static ssize_t tx_queue_len_store(struct device *dev,
288 struct device_attribute *attr, 288 struct device_attribute *attr,
289 const char *buf, size_t len) 289 const char *buf, size_t len)
290 { 290 {
291 if (!capable(CAP_NET_ADMIN)) 291 if (!capable(CAP_NET_ADMIN))
292 return -EPERM; 292 return -EPERM;
293 293
294 return netdev_store(dev, attr, buf, len, change_tx_queue_len); 294 return netdev_store(dev, attr, buf, len, change_tx_queue_len);
295 } 295 }
296 NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); 296 NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
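tx_queue_len, like mtu and flags above, is writable through netdev_store(), which checks CAP_NET_ADMIN and parses the value with kstrtoul(). From user space that is simply a write to the sysfs file; a minimal sketch, where the device name and the value are illustrative:

/* Write a new tx_queue_len through the store hook above.
 * Needs CAP_NET_ADMIN; "eth0" and the value 2000 are examples.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/class/net/eth0/tx_queue_len", "w");

        if (!f)
                return 1;
        fprintf(f, "%d\n", 2000);  /* parsed by kstrtoul() in netdev_store() */
        fclose(f);
        return 0;
}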

static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t len)
{
        struct net_device *netdev = to_net_dev(dev);
        struct net *net = dev_net(netdev);
        size_t count = len;
        ssize_t ret;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        /* ignore trailing newline */
        if (len > 0 && buf[len - 1] == '\n')
                --count;

        if (!rtnl_trylock())
                return restart_syscall();
        ret = dev_set_alias(netdev, buf, count);
        rtnl_unlock();

        return ret < 0 ? ret : len;
}

static ssize_t ifalias_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
{
        const struct net_device *netdev = to_net_dev(dev);
        ssize_t ret = 0;

        if (!rtnl_trylock())
                return restart_syscall();
        if (netdev->ifalias)
                ret = sprintf(buf, "%s\n", netdev->ifalias);
        rtnl_unlock();
        return ret;
}
static DEVICE_ATTR_RW(ifalias);

static int change_group(struct net_device *net, unsigned long new_group)
{
        dev_set_group(net, (int) new_group);
        return 0;
}

static ssize_t group_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t len)
{
        return netdev_store(dev, attr, buf, len, change_group);
}
NETDEVICE_SHOW(group, fmt_dec);
static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);

static ssize_t phys_port_id_show(struct device *dev,
                                 struct device_attribute *attr, char *buf)
{
        struct net_device *netdev = to_net_dev(dev);
        ssize_t ret = -EINVAL;

        if (!rtnl_trylock())
                return restart_syscall();

        if (dev_isalive(netdev)) {
                struct netdev_phys_port_id ppid;

                ret = dev_get_phys_port_id(netdev, &ppid);
                if (!ret)
                        ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
        }
        rtnl_unlock();

        return ret;
}
static DEVICE_ATTR_RO(phys_port_id);

static struct attribute *net_class_attrs[] = {
        &dev_attr_netdev_group.attr,
        &dev_attr_type.attr,
        &dev_attr_dev_id.attr,
        &dev_attr_iflink.attr,
        &dev_attr_ifindex.attr,
        &dev_attr_addr_assign_type.attr,
        &dev_attr_addr_len.attr,
        &dev_attr_link_mode.attr,
        &dev_attr_address.attr,
        &dev_attr_broadcast.attr,
        &dev_attr_speed.attr,
        &dev_attr_duplex.attr,
        &dev_attr_dormant.attr,
        &dev_attr_operstate.attr,
        &dev_attr_ifalias.attr,
        &dev_attr_carrier.attr,
        &dev_attr_mtu.attr,
        &dev_attr_flags.attr,
        &dev_attr_tx_queue_len.attr,
        &dev_attr_phys_port_id.attr,
        NULL,
};
ATTRIBUTE_GROUPS(net_class);

/* Show a given an attribute in the statistics group */
static ssize_t netstat_show(const struct device *d,
                            struct device_attribute *attr, char *buf,
                            unsigned long offset)
{
        struct net_device *dev = to_net_dev(d);
        ssize_t ret = -EINVAL;

        WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
                offset % sizeof(u64) != 0);

        read_lock(&dev_base_lock);
        if (dev_isalive(dev)) {
                struct rtnl_link_stats64 temp;
                const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);

                ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
        }
        read_unlock(&dev_base_lock);
        return ret;
}

/* generate a read-only statistics attribute */
#define NETSTAT_ENTRY(name) \
static ssize_t name##_show(struct device *d, \
                           struct device_attribute *attr, char *buf) \
{ \
        return netstat_show(d, attr, buf, \
                            offsetof(struct rtnl_link_stats64, name)); \
} \
static DEVICE_ATTR_RO(name)

NETSTAT_ENTRY(rx_packets);
NETSTAT_ENTRY(tx_packets);
NETSTAT_ENTRY(rx_bytes);
NETSTAT_ENTRY(tx_bytes);
NETSTAT_ENTRY(rx_errors);
NETSTAT_ENTRY(tx_errors);
NETSTAT_ENTRY(rx_dropped);
NETSTAT_ENTRY(tx_dropped);
NETSTAT_ENTRY(multicast);
NETSTAT_ENTRY(collisions);
NETSTAT_ENTRY(rx_length_errors);
NETSTAT_ENTRY(rx_over_errors);
NETSTAT_ENTRY(rx_crc_errors);
NETSTAT_ENTRY(rx_frame_errors);
NETSTAT_ENTRY(rx_fifo_errors);
NETSTAT_ENTRY(rx_missed_errors);
NETSTAT_ENTRY(tx_aborted_errors);
NETSTAT_ENTRY(tx_carrier_errors);
NETSTAT_ENTRY(tx_fifo_errors);
NETSTAT_ENTRY(tx_heartbeat_errors);
NETSTAT_ENTRY(tx_window_errors);
NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);

static struct attribute *netstat_attrs[] = {
        &dev_attr_rx_packets.attr,
        &dev_attr_tx_packets.attr,
        &dev_attr_rx_bytes.attr,
        &dev_attr_tx_bytes.attr,
        &dev_attr_rx_errors.attr,
        &dev_attr_tx_errors.attr,
        &dev_attr_rx_dropped.attr,
        &dev_attr_tx_dropped.attr,
        &dev_attr_multicast.attr,
        &dev_attr_collisions.attr,
        &dev_attr_rx_length_errors.attr,
        &dev_attr_rx_over_errors.attr,
        &dev_attr_rx_crc_errors.attr,
        &dev_attr_rx_frame_errors.attr,
        &dev_attr_rx_fifo_errors.attr,
        &dev_attr_rx_missed_errors.attr,
        &dev_attr_tx_aborted_errors.attr,
        &dev_attr_tx_carrier_errors.attr,
        &dev_attr_tx_fifo_errors.attr,
        &dev_attr_tx_heartbeat_errors.attr,
        &dev_attr_tx_window_errors.attr,
        &dev_attr_rx_compressed.attr,
        &dev_attr_tx_compressed.attr,
        NULL
};


static struct attribute_group netstat_group = {
        .name  = "statistics",
        .attrs  = netstat_attrs,
};
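Each NETSTAT_ENTRY above becomes a read-only file under /sys/class/net/<dev>/statistics/ that prints one 64-bit counter from rtnl_link_stats64. A minimal user-space reader, where the device name is an example:

/* Read one of the statistics attributes registered above.
 * "eth0" and the rx_packets attribute are illustrative choices.
 */
#include <stdio.h>

int main(void)
{
        unsigned long long rx_packets;
        FILE *f = fopen("/sys/class/net/eth0/statistics/rx_packets", "r");

        if (!f)
                return 1;
        if (fscanf(f, "%llu", &rx_packets) == 1)
                printf("rx_packets: %llu\n", rx_packets);
        fclose(f);
        return 0;
}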

#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
static struct attribute *wireless_attrs[] = {
        NULL
};

static struct attribute_group wireless_group = {
        .name = "wireless",
        .attrs = wireless_attrs,
};
#endif

#else /* CONFIG_SYSFS */
#define net_class_groups        NULL
#endif /* CONFIG_SYSFS */

#ifdef CONFIG_RPS
/*
 * RX queue sysfs structures and functions.
 */
struct rx_queue_attribute {
        struct attribute attr;
        ssize_t (*show)(struct netdev_rx_queue *queue,
            struct rx_queue_attribute *attr, char *buf);
        ssize_t (*store)(struct netdev_rx_queue *queue,
            struct rx_queue_attribute *attr, const char *buf, size_t len);
};
#define to_rx_queue_attr(_attr) container_of(_attr, \
    struct rx_queue_attribute, attr)

#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)

static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
                                  char *buf)
{
        struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
        struct netdev_rx_queue *queue = to_rx_queue(kobj);

        if (!attribute->show)
                return -EIO;

        return attribute->show(queue, attribute, buf);
}

static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
                                   const char *buf, size_t count)
{
        struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
        struct netdev_rx_queue *queue = to_rx_queue(kobj);

        if (!attribute->store)
                return -EIO;

        return attribute->store(queue, attribute, buf, count);
}

static const struct sysfs_ops rx_queue_sysfs_ops = {
        .show = rx_queue_attr_show,
        .store = rx_queue_attr_store,
};

static ssize_t show_rps_map(struct netdev_rx_queue *queue,
                            struct rx_queue_attribute *attribute, char *buf)
{
        struct rps_map *map;
        cpumask_var_t mask;
        size_t len = 0;
        int i;

        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        rcu_read_lock();
        map = rcu_dereference(queue->rps_map);
        if (map)
                for (i = 0; i < map->len; i++)
                        cpumask_set_cpu(map->cpus[i], mask);

        len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
        if (PAGE_SIZE - len < 3) {
                rcu_read_unlock();
                free_cpumask_var(mask);
                return -EINVAL;
        }
        rcu_read_unlock();

        free_cpumask_var(mask);
        len += sprintf(buf + len, "\n");
573 return len; 573 return len;
574 } 574 }
575 575
576 static ssize_t store_rps_map(struct netdev_rx_queue *queue, 576 static ssize_t store_rps_map(struct netdev_rx_queue *queue,
577 struct rx_queue_attribute *attribute, 577 struct rx_queue_attribute *attribute,
578 const char *buf, size_t len) 578 const char *buf, size_t len)
579 { 579 {
580 struct rps_map *old_map, *map; 580 struct rps_map *old_map, *map;
581 cpumask_var_t mask; 581 cpumask_var_t mask;
582 int err, cpu, i; 582 int err, cpu, i;
583 static DEFINE_SPINLOCK(rps_map_lock); 583 static DEFINE_SPINLOCK(rps_map_lock);
584 584
585 if (!capable(CAP_NET_ADMIN)) 585 if (!capable(CAP_NET_ADMIN))
586 return -EPERM; 586 return -EPERM;
587 587
588 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 588 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
589 return -ENOMEM; 589 return -ENOMEM;
590 590
591 err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); 591 err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
592 if (err) { 592 if (err) {
593 free_cpumask_var(mask); 593 free_cpumask_var(mask);
594 return err; 594 return err;
595 } 595 }
596 596
597 map = kzalloc(max_t(unsigned int, 597 map = kzalloc(max_t(unsigned int,
598 RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), 598 RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
599 GFP_KERNEL); 599 GFP_KERNEL);
600 if (!map) { 600 if (!map) {
601 free_cpumask_var(mask); 601 free_cpumask_var(mask);
602 return -ENOMEM; 602 return -ENOMEM;
603 } 603 }
604 604
605 i = 0; 605 i = 0;
606 for_each_cpu_and(cpu, mask, cpu_online_mask) 606 for_each_cpu_and(cpu, mask, cpu_online_mask)
607 map->cpus[i++] = cpu; 607 map->cpus[i++] = cpu;
608 608
609 if (i) 609 if (i)
610 map->len = i; 610 map->len = i;
611 else { 611 else {
612 kfree(map); 612 kfree(map);
613 map = NULL; 613 map = NULL;
614 } 614 }
615 615
616 spin_lock(&rps_map_lock); 616 spin_lock(&rps_map_lock);
617 old_map = rcu_dereference_protected(queue->rps_map, 617 old_map = rcu_dereference_protected(queue->rps_map,
618 lockdep_is_held(&rps_map_lock)); 618 lockdep_is_held(&rps_map_lock));
619 rcu_assign_pointer(queue->rps_map, map); 619 rcu_assign_pointer(queue->rps_map, map);
620 spin_unlock(&rps_map_lock); 620 spin_unlock(&rps_map_lock);
621 621
622 if (map) 622 if (map)
623 static_key_slow_inc(&rps_needed); 623 static_key_slow_inc(&rps_needed);
624 if (old_map) { 624 if (old_map) {
625 kfree_rcu(old_map, rcu); 625 kfree_rcu(old_map, rcu);
626 static_key_slow_dec(&rps_needed); 626 static_key_slow_dec(&rps_needed);
627 } 627 }
628 free_cpumask_var(mask); 628 free_cpumask_var(mask);
629 return len; 629 return len;
630 } 630 }
631 631
632 static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, 632 static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
633 struct rx_queue_attribute *attr, 633 struct rx_queue_attribute *attr,
634 char *buf) 634 char *buf)
635 { 635 {
636 struct rps_dev_flow_table *flow_table; 636 struct rps_dev_flow_table *flow_table;
637 unsigned long val = 0; 637 unsigned long val = 0;
638 638
639 rcu_read_lock(); 639 rcu_read_lock();
640 flow_table = rcu_dereference(queue->rps_flow_table); 640 flow_table = rcu_dereference(queue->rps_flow_table);
641 if (flow_table) 641 if (flow_table)
642 val = (unsigned long)flow_table->mask + 1; 642 val = (unsigned long)flow_table->mask + 1;
643 rcu_read_unlock(); 643 rcu_read_unlock();
644 644
645 return sprintf(buf, "%lu\n", val); 645 return sprintf(buf, "%lu\n", val);
646 } 646 }
647 647
648 static void rps_dev_flow_table_release(struct rcu_head *rcu) 648 static void rps_dev_flow_table_release(struct rcu_head *rcu)
649 { 649 {
650 struct rps_dev_flow_table *table = container_of(rcu, 650 struct rps_dev_flow_table *table = container_of(rcu,
651 struct rps_dev_flow_table, rcu); 651 struct rps_dev_flow_table, rcu);
652 vfree(table); 652 vfree(table);
653 } 653 }
654 654
655 static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, 655 static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
656 struct rx_queue_attribute *attr, 656 struct rx_queue_attribute *attr,
657 const char *buf, size_t len) 657 const char *buf, size_t len)
658 { 658 {
659 unsigned long mask, count; 659 unsigned long mask, count;
660 struct rps_dev_flow_table *table, *old_table; 660 struct rps_dev_flow_table *table, *old_table;
661 static DEFINE_SPINLOCK(rps_dev_flow_lock); 661 static DEFINE_SPINLOCK(rps_dev_flow_lock);
662 int rc; 662 int rc;
663 663
664 if (!capable(CAP_NET_ADMIN)) 664 if (!capable(CAP_NET_ADMIN))
665 return -EPERM; 665 return -EPERM;
666 666
667 rc = kstrtoul(buf, 0, &count); 667 rc = kstrtoul(buf, 0, &count);
668 if (rc < 0) 668 if (rc < 0)
669 return rc; 669 return rc;
670 670
671 if (count) { 671 if (count) {
672 mask = count - 1; 672 mask = count - 1;
673 /* mask = roundup_pow_of_two(count) - 1; 673 /* mask = roundup_pow_of_two(count) - 1;
674 * without overflows... 674 * without overflows...
675 */ 675 */
676 while ((mask | (mask >> 1)) != mask) 676 while ((mask | (mask >> 1)) != mask)
677 mask |= (mask >> 1); 677 mask |= (mask >> 1);
678 /* On 64 bit arches, must check mask fits in table->mask (u32), 678 /* On 64 bit arches, must check mask fits in table->mask (u32),
679 * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1) 679 * and on 32bit arches, must check
680 * doesnt overflow. 680 * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
681 */ 681 */
682 #if BITS_PER_LONG > 32 682 #if BITS_PER_LONG > 32
683 if (mask > (unsigned long)(u32)mask) 683 if (mask > (unsigned long)(u32)mask)
684 return -EINVAL; 684 return -EINVAL;
685 #else 685 #else
686 if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) 686 if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
687 / sizeof(struct rps_dev_flow)) { 687 / sizeof(struct rps_dev_flow)) {
688 /* Enforce a limit to prevent overflow */ 688 /* Enforce a limit to prevent overflow */
689 return -EINVAL; 689 return -EINVAL;
690 } 690 }
691 #endif 691 #endif
692 table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); 692 table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
693 if (!table) 693 if (!table)
694 return -ENOMEM; 694 return -ENOMEM;
695 695
696 table->mask = mask; 696 table->mask = mask;
697 for (count = 0; count <= mask; count++) 697 for (count = 0; count <= mask; count++)
698 table->flows[count].cpu = RPS_NO_CPU; 698 table->flows[count].cpu = RPS_NO_CPU;
699 } else 699 } else
700 table = NULL; 700 table = NULL;
701 701
702 spin_lock(&rps_dev_flow_lock); 702 spin_lock(&rps_dev_flow_lock);
703 old_table = rcu_dereference_protected(queue->rps_flow_table, 703 old_table = rcu_dereference_protected(queue->rps_flow_table,
704 lockdep_is_held(&rps_dev_flow_lock)); 704 lockdep_is_held(&rps_dev_flow_lock));
705 rcu_assign_pointer(queue->rps_flow_table, table); 705 rcu_assign_pointer(queue->rps_flow_table, table);
706 spin_unlock(&rps_dev_flow_lock); 706 spin_unlock(&rps_dev_flow_lock);
707 707
708 if (old_table) 708 if (old_table)
709 call_rcu(&old_table->rcu, rps_dev_flow_table_release); 709 call_rcu(&old_table->rcu, rps_dev_flow_table_release);
710 710
711 return len; 711 return len;
712 } 712 }
713 713
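The loop in store_rps_dev_flow_table_cnt() above rounds the requested flow count up to a power of two by repeatedly OR-ing mask with mask >> 1, which avoids the overflow that a plain roundup_pow_of_two(count) could hit for very large counts. A minimal standalone sketch of the same bit trick, using a hypothetical round_mask() helper that is not part of the kernel source:

#include <stdio.h>

/* Mirrors the loop in store_rps_dev_flow_table_cnt(): given a requested
 * table size, return a mask of the form 2^k - 1 that is at least count - 1,
 * without rounding count itself (which could overflow).
 */
static unsigned long round_mask(unsigned long count)
{
        unsigned long mask = count - 1;

        /* OR in shifted copies until every bit below the top set bit is set */
        while ((mask | (mask >> 1)) != mask)
                mask |= (mask >> 1);

        return mask;
}

int main(void)
{
        unsigned long sizes[] = { 1, 5, 100, 4096, 5000 };
        unsigned long i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned long mask = round_mask(sizes[i]);

                printf("count %5lu -> mask %5lu (table size %lu)\n",
                       sizes[i], mask, mask + 1);
        }
        return 0;
}

Writing, say, 5000 to rps_flow_cnt therefore ends up allocating a table of 8192 entries (mask 8191).
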
714 static struct rx_queue_attribute rps_cpus_attribute = 714 static struct rx_queue_attribute rps_cpus_attribute =
715 __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); 715 __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
716 716
717 717
718 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = 718 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
719 __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, 719 __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
720 show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); 720 show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
721 721
722 static struct attribute *rx_queue_default_attrs[] = { 722 static struct attribute *rx_queue_default_attrs[] = {
723 &rps_cpus_attribute.attr, 723 &rps_cpus_attribute.attr,
724 &rps_dev_flow_table_cnt_attribute.attr, 724 &rps_dev_flow_table_cnt_attribute.attr,
725 NULL 725 NULL
726 }; 726 };
727 727
728 static void rx_queue_release(struct kobject *kobj) 728 static void rx_queue_release(struct kobject *kobj)
729 { 729 {
730 struct netdev_rx_queue *queue = to_rx_queue(kobj); 730 struct netdev_rx_queue *queue = to_rx_queue(kobj);
731 struct rps_map *map; 731 struct rps_map *map;
732 struct rps_dev_flow_table *flow_table; 732 struct rps_dev_flow_table *flow_table;
733 733
734 734
735 map = rcu_dereference_protected(queue->rps_map, 1); 735 map = rcu_dereference_protected(queue->rps_map, 1);
736 if (map) { 736 if (map) {
737 RCU_INIT_POINTER(queue->rps_map, NULL); 737 RCU_INIT_POINTER(queue->rps_map, NULL);
738 kfree_rcu(map, rcu); 738 kfree_rcu(map, rcu);
739 } 739 }
740 740
741 flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); 741 flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
742 if (flow_table) { 742 if (flow_table) {
743 RCU_INIT_POINTER(queue->rps_flow_table, NULL); 743 RCU_INIT_POINTER(queue->rps_flow_table, NULL);
744 call_rcu(&flow_table->rcu, rps_dev_flow_table_release); 744 call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
745 } 745 }
746 746
747 memset(kobj, 0, sizeof(*kobj)); 747 memset(kobj, 0, sizeof(*kobj));
748 dev_put(queue->dev); 748 dev_put(queue->dev);
749 } 749 }
750 750
751 static struct kobj_type rx_queue_ktype = { 751 static struct kobj_type rx_queue_ktype = {
752 .sysfs_ops = &rx_queue_sysfs_ops, 752 .sysfs_ops = &rx_queue_sysfs_ops,
753 .release = rx_queue_release, 753 .release = rx_queue_release,
754 .default_attrs = rx_queue_default_attrs, 754 .default_attrs = rx_queue_default_attrs,
755 }; 755 };
756 756
757 static int rx_queue_add_kobject(struct net_device *net, int index) 757 static int rx_queue_add_kobject(struct net_device *net, int index)
758 { 758 {
759 struct netdev_rx_queue *queue = net->_rx + index; 759 struct netdev_rx_queue *queue = net->_rx + index;
760 struct kobject *kobj = &queue->kobj; 760 struct kobject *kobj = &queue->kobj;
761 int error = 0; 761 int error = 0;
762 762
763 kobj->kset = net->queues_kset; 763 kobj->kset = net->queues_kset;
764 error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, 764 error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
765 "rx-%u", index); 765 "rx-%u", index);
766 if (error) { 766 if (error) {
767 kobject_put(kobj); 767 kobject_put(kobj);
768 return error; 768 return error;
769 } 769 }
770 770
771 kobject_uevent(kobj, KOBJ_ADD); 771 kobject_uevent(kobj, KOBJ_ADD);
772 dev_hold(queue->dev); 772 dev_hold(queue->dev);
773 773
774 return error; 774 return error;
775 } 775 }
776 #endif /* CONFIG_RPS */ 776 #endif /* CONFIG_RPS */
777 777
778 int 778 int
779 net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) 779 net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
780 { 780 {
781 #ifdef CONFIG_RPS 781 #ifdef CONFIG_RPS
782 int i; 782 int i;
783 int error = 0; 783 int error = 0;
784 784
785 for (i = old_num; i < new_num; i++) { 785 for (i = old_num; i < new_num; i++) {
786 error = rx_queue_add_kobject(net, i); 786 error = rx_queue_add_kobject(net, i);
787 if (error) { 787 if (error) {
788 new_num = old_num; 788 new_num = old_num;
789 break; 789 break;
790 } 790 }
791 } 791 }
792 792
793 while (--i >= new_num) 793 while (--i >= new_num)
794 kobject_put(&net->_rx[i].kobj); 794 kobject_put(&net->_rx[i].kobj);
795 795
796 return error; 796 return error;
797 #else 797 #else
798 return 0; 798 return 0;
799 #endif 799 #endif
800 } 800 }
801 801
802 #ifdef CONFIG_SYSFS 802 #ifdef CONFIG_SYSFS
803 /* 803 /*
804 * netdev_queue sysfs structures and functions. 804 * netdev_queue sysfs structures and functions.
805 */ 805 */
806 struct netdev_queue_attribute { 806 struct netdev_queue_attribute {
807 struct attribute attr; 807 struct attribute attr;
808 ssize_t (*show)(struct netdev_queue *queue, 808 ssize_t (*show)(struct netdev_queue *queue,
809 struct netdev_queue_attribute *attr, char *buf); 809 struct netdev_queue_attribute *attr, char *buf);
810 ssize_t (*store)(struct netdev_queue *queue, 810 ssize_t (*store)(struct netdev_queue *queue,
811 struct netdev_queue_attribute *attr, const char *buf, size_t len); 811 struct netdev_queue_attribute *attr, const char *buf, size_t len);
812 }; 812 };
813 #define to_netdev_queue_attr(_attr) container_of(_attr, \ 813 #define to_netdev_queue_attr(_attr) container_of(_attr, \
814 struct netdev_queue_attribute, attr) 814 struct netdev_queue_attribute, attr)
815 815
816 #define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) 816 #define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
817 817
818 static ssize_t netdev_queue_attr_show(struct kobject *kobj, 818 static ssize_t netdev_queue_attr_show(struct kobject *kobj,
819 struct attribute *attr, char *buf) 819 struct attribute *attr, char *buf)
820 { 820 {
821 struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); 821 struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
822 struct netdev_queue *queue = to_netdev_queue(kobj); 822 struct netdev_queue *queue = to_netdev_queue(kobj);
823 823
824 if (!attribute->show) 824 if (!attribute->show)
825 return -EIO; 825 return -EIO;
826 826
827 return attribute->show(queue, attribute, buf); 827 return attribute->show(queue, attribute, buf);
828 } 828 }
829 829
830 static ssize_t netdev_queue_attr_store(struct kobject *kobj, 830 static ssize_t netdev_queue_attr_store(struct kobject *kobj,
831 struct attribute *attr, 831 struct attribute *attr,
832 const char *buf, size_t count) 832 const char *buf, size_t count)
833 { 833 {
834 struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); 834 struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
835 struct netdev_queue *queue = to_netdev_queue(kobj); 835 struct netdev_queue *queue = to_netdev_queue(kobj);
836 836
837 if (!attribute->store) 837 if (!attribute->store)
838 return -EIO; 838 return -EIO;
839 839
840 return attribute->store(queue, attribute, buf, count); 840 return attribute->store(queue, attribute, buf, count);
841 } 841 }
842 842
843 static const struct sysfs_ops netdev_queue_sysfs_ops = { 843 static const struct sysfs_ops netdev_queue_sysfs_ops = {
844 .show = netdev_queue_attr_show, 844 .show = netdev_queue_attr_show,
845 .store = netdev_queue_attr_store, 845 .store = netdev_queue_attr_store,
846 }; 846 };
847 847
848 static ssize_t show_trans_timeout(struct netdev_queue *queue, 848 static ssize_t show_trans_timeout(struct netdev_queue *queue,
849 struct netdev_queue_attribute *attribute, 849 struct netdev_queue_attribute *attribute,
850 char *buf) 850 char *buf)
851 { 851 {
852 unsigned long trans_timeout; 852 unsigned long trans_timeout;
853 853
854 spin_lock_irq(&queue->_xmit_lock); 854 spin_lock_irq(&queue->_xmit_lock);
855 trans_timeout = queue->trans_timeout; 855 trans_timeout = queue->trans_timeout;
856 spin_unlock_irq(&queue->_xmit_lock); 856 spin_unlock_irq(&queue->_xmit_lock);
857 857
858 return sprintf(buf, "%lu", trans_timeout); 858 return sprintf(buf, "%lu", trans_timeout);
859 } 859 }
860 860
861 static struct netdev_queue_attribute queue_trans_timeout = 861 static struct netdev_queue_attribute queue_trans_timeout =
862 __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); 862 __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
863 863
864 #ifdef CONFIG_BQL 864 #ifdef CONFIG_BQL
865 /* 865 /*
866 * Byte queue limits sysfs structures and functions. 866 * Byte queue limits sysfs structures and functions.
867 */ 867 */
868 static ssize_t bql_show(char *buf, unsigned int value) 868 static ssize_t bql_show(char *buf, unsigned int value)
869 { 869 {
870 return sprintf(buf, "%u\n", value); 870 return sprintf(buf, "%u\n", value);
871 } 871 }
872 872
873 static ssize_t bql_set(const char *buf, const size_t count, 873 static ssize_t bql_set(const char *buf, const size_t count,
874 unsigned int *pvalue) 874 unsigned int *pvalue)
875 { 875 {
876 unsigned int value; 876 unsigned int value;
877 int err; 877 int err;
878 878
879 if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) 879 if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
880 value = DQL_MAX_LIMIT; 880 value = DQL_MAX_LIMIT;
881 else { 881 else {
882 err = kstrtouint(buf, 10, &value); 882 err = kstrtouint(buf, 10, &value);
883 if (err < 0) 883 if (err < 0)
884 return err; 884 return err;
885 if (value > DQL_MAX_LIMIT) 885 if (value > DQL_MAX_LIMIT)
886 return -EINVAL; 886 return -EINVAL;
887 } 887 }
888 888
889 *pvalue = value; 889 *pvalue = value;
890 890
891 return count; 891 return count;
892 } 892 }
893 893
894 static ssize_t bql_show_hold_time(struct netdev_queue *queue, 894 static ssize_t bql_show_hold_time(struct netdev_queue *queue,
895 struct netdev_queue_attribute *attr, 895 struct netdev_queue_attribute *attr,
896 char *buf) 896 char *buf)
897 { 897 {
898 struct dql *dql = &queue->dql; 898 struct dql *dql = &queue->dql;
899 899
900 return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); 900 return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
901 } 901 }
902 902
903 static ssize_t bql_set_hold_time(struct netdev_queue *queue, 903 static ssize_t bql_set_hold_time(struct netdev_queue *queue,
904 struct netdev_queue_attribute *attribute, 904 struct netdev_queue_attribute *attribute,
905 const char *buf, size_t len) 905 const char *buf, size_t len)
906 { 906 {
907 struct dql *dql = &queue->dql; 907 struct dql *dql = &queue->dql;
908 unsigned int value; 908 unsigned int value;
909 int err; 909 int err;
910 910
911 err = kstrtouint(buf, 10, &value); 911 err = kstrtouint(buf, 10, &value);
912 if (err < 0) 912 if (err < 0)
913 return err; 913 return err;
914 914
915 dql->slack_hold_time = msecs_to_jiffies(value); 915 dql->slack_hold_time = msecs_to_jiffies(value);
916 916
917 return len; 917 return len;
918 } 918 }
919 919
920 static struct netdev_queue_attribute bql_hold_time_attribute = 920 static struct netdev_queue_attribute bql_hold_time_attribute =
921 __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, 921 __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
922 bql_set_hold_time); 922 bql_set_hold_time);
923 923
924 static ssize_t bql_show_inflight(struct netdev_queue *queue, 924 static ssize_t bql_show_inflight(struct netdev_queue *queue,
925 struct netdev_queue_attribute *attr, 925 struct netdev_queue_attribute *attr,
926 char *buf) 926 char *buf)
927 { 927 {
928 struct dql *dql = &queue->dql; 928 struct dql *dql = &queue->dql;
929 929
930 return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); 930 return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
931 } 931 }
932 932
933 static struct netdev_queue_attribute bql_inflight_attribute = 933 static struct netdev_queue_attribute bql_inflight_attribute =
934 __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL); 934 __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
935 935
936 #define BQL_ATTR(NAME, FIELD) \ 936 #define BQL_ATTR(NAME, FIELD) \
937 static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ 937 static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
938 struct netdev_queue_attribute *attr, \ 938 struct netdev_queue_attribute *attr, \
939 char *buf) \ 939 char *buf) \
940 { \ 940 { \
941 return bql_show(buf, queue->dql.FIELD); \ 941 return bql_show(buf, queue->dql.FIELD); \
942 } \ 942 } \
943 \ 943 \
944 static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ 944 static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
945 struct netdev_queue_attribute *attr, \ 945 struct netdev_queue_attribute *attr, \
946 const char *buf, size_t len) \ 946 const char *buf, size_t len) \
947 { \ 947 { \
948 return bql_set(buf, len, &queue->dql.FIELD); \ 948 return bql_set(buf, len, &queue->dql.FIELD); \
949 } \ 949 } \
950 \ 950 \
951 static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \ 951 static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
952 __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \ 952 __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
953 bql_set_ ## NAME); 953 bql_set_ ## NAME);
954 954
955 BQL_ATTR(limit, limit) 955 BQL_ATTR(limit, limit)
956 BQL_ATTR(limit_max, max_limit) 956 BQL_ATTR(limit_max, max_limit)
957 BQL_ATTR(limit_min, min_limit) 957 BQL_ATTR(limit_min, min_limit)
958 958
959 static struct attribute *dql_attrs[] = { 959 static struct attribute *dql_attrs[] = {
960 &bql_limit_attribute.attr, 960 &bql_limit_attribute.attr,
961 &bql_limit_max_attribute.attr, 961 &bql_limit_max_attribute.attr,
962 &bql_limit_min_attribute.attr, 962 &bql_limit_min_attribute.attr,
963 &bql_hold_time_attribute.attr, 963 &bql_hold_time_attribute.attr,
964 &bql_inflight_attribute.attr, 964 &bql_inflight_attribute.attr,
965 NULL 965 NULL
966 }; 966 };
967 967
968 static struct attribute_group dql_group = { 968 static struct attribute_group dql_group = {
969 .name = "byte_queue_limits", 969 .name = "byte_queue_limits",
970 .attrs = dql_attrs, 970 .attrs = dql_attrs,
971 }; 971 };
972 #endif /* CONFIG_BQL */ 972 #endif /* CONFIG_BQL */
973 973
974 #ifdef CONFIG_XPS 974 #ifdef CONFIG_XPS
975 static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) 975 static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
976 { 976 {
977 struct net_device *dev = queue->dev; 977 struct net_device *dev = queue->dev;
978 int i; 978 int i;
979 979
980 for (i = 0; i < dev->num_tx_queues; i++) 980 for (i = 0; i < dev->num_tx_queues; i++)
981 if (queue == &dev->_tx[i]) 981 if (queue == &dev->_tx[i])
982 break; 982 break;
983 983
984 BUG_ON(i >= dev->num_tx_queues); 984 BUG_ON(i >= dev->num_tx_queues);
985 985
986 return i; 986 return i;
987 } 987 }
988 988
989 989
990 static ssize_t show_xps_map(struct netdev_queue *queue, 990 static ssize_t show_xps_map(struct netdev_queue *queue,
991 struct netdev_queue_attribute *attribute, char *buf) 991 struct netdev_queue_attribute *attribute, char *buf)
992 { 992 {
993 struct net_device *dev = queue->dev; 993 struct net_device *dev = queue->dev;
994 struct xps_dev_maps *dev_maps; 994 struct xps_dev_maps *dev_maps;
995 cpumask_var_t mask; 995 cpumask_var_t mask;
996 unsigned long index; 996 unsigned long index;
997 size_t len = 0; 997 size_t len = 0;
998 int i; 998 int i;
999 999
1000 if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) 1000 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
1001 return -ENOMEM; 1001 return -ENOMEM;
1002 1002
1003 index = get_netdev_queue_index(queue); 1003 index = get_netdev_queue_index(queue);
1004 1004
1005 rcu_read_lock(); 1005 rcu_read_lock();
1006 dev_maps = rcu_dereference(dev->xps_maps); 1006 dev_maps = rcu_dereference(dev->xps_maps);
1007 if (dev_maps) { 1007 if (dev_maps) {
1008 for_each_possible_cpu(i) { 1008 for_each_possible_cpu(i) {
1009 struct xps_map *map = 1009 struct xps_map *map =
1010 rcu_dereference(dev_maps->cpu_map[i]); 1010 rcu_dereference(dev_maps->cpu_map[i]);
1011 if (map) { 1011 if (map) {
1012 int j; 1012 int j;
1013 for (j = 0; j < map->len; j++) { 1013 for (j = 0; j < map->len; j++) {
1014 if (map->queues[j] == index) { 1014 if (map->queues[j] == index) {
1015 cpumask_set_cpu(i, mask); 1015 cpumask_set_cpu(i, mask);
1016 break; 1016 break;
1017 } 1017 }
1018 } 1018 }
1019 } 1019 }
1020 } 1020 }
1021 } 1021 }
1022 rcu_read_unlock(); 1022 rcu_read_unlock();
1023 1023
1024 len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); 1024 len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
1025 if (PAGE_SIZE - len < 3) { 1025 if (PAGE_SIZE - len < 3) {
1026 free_cpumask_var(mask); 1026 free_cpumask_var(mask);
1027 return -EINVAL; 1027 return -EINVAL;
1028 } 1028 }
1029 1029
1030 free_cpumask_var(mask); 1030 free_cpumask_var(mask);
1031 len += sprintf(buf + len, "\n"); 1031 len += sprintf(buf + len, "\n");
1032 return len; 1032 return len;
1033 } 1033 }
1034 1034
1035 static ssize_t store_xps_map(struct netdev_queue *queue, 1035 static ssize_t store_xps_map(struct netdev_queue *queue,
1036 struct netdev_queue_attribute *attribute, 1036 struct netdev_queue_attribute *attribute,
1037 const char *buf, size_t len) 1037 const char *buf, size_t len)
1038 { 1038 {
1039 struct net_device *dev = queue->dev; 1039 struct net_device *dev = queue->dev;
1040 unsigned long index; 1040 unsigned long index;
1041 cpumask_var_t mask; 1041 cpumask_var_t mask;
1042 int err; 1042 int err;
1043 1043
1044 if (!capable(CAP_NET_ADMIN)) 1044 if (!capable(CAP_NET_ADMIN))
1045 return -EPERM; 1045 return -EPERM;
1046 1046
1047 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 1047 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
1048 return -ENOMEM; 1048 return -ENOMEM;
1049 1049
1050 index = get_netdev_queue_index(queue); 1050 index = get_netdev_queue_index(queue);
1051 1051
1052 err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); 1052 err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
1053 if (err) { 1053 if (err) {
1054 free_cpumask_var(mask); 1054 free_cpumask_var(mask);
1055 return err; 1055 return err;
1056 } 1056 }
1057 1057
1058 err = netif_set_xps_queue(dev, mask, index); 1058 err = netif_set_xps_queue(dev, mask, index);
1059 1059
1060 free_cpumask_var(mask); 1060 free_cpumask_var(mask);
1061 1061
1062 return err ? : len; 1062 return err ? : len;
1063 } 1063 }
1064 1064
1065 static struct netdev_queue_attribute xps_cpus_attribute = 1065 static struct netdev_queue_attribute xps_cpus_attribute =
1066 __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); 1066 __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
1067 #endif /* CONFIG_XPS */ 1067 #endif /* CONFIG_XPS */
1068 1068
1069 static struct attribute *netdev_queue_default_attrs[] = { 1069 static struct attribute *netdev_queue_default_attrs[] = {
1070 &queue_trans_timeout.attr, 1070 &queue_trans_timeout.attr,
1071 #ifdef CONFIG_XPS 1071 #ifdef CONFIG_XPS
1072 &xps_cpus_attribute.attr, 1072 &xps_cpus_attribute.attr,
1073 #endif 1073 #endif
1074 NULL 1074 NULL
1075 }; 1075 };
1076 1076
1077 static void netdev_queue_release(struct kobject *kobj) 1077 static void netdev_queue_release(struct kobject *kobj)
1078 { 1078 {
1079 struct netdev_queue *queue = to_netdev_queue(kobj); 1079 struct netdev_queue *queue = to_netdev_queue(kobj);
1080 1080
1081 memset(kobj, 0, sizeof(*kobj)); 1081 memset(kobj, 0, sizeof(*kobj));
1082 dev_put(queue->dev); 1082 dev_put(queue->dev);
1083 } 1083 }
1084 1084
1085 static struct kobj_type netdev_queue_ktype = { 1085 static struct kobj_type netdev_queue_ktype = {
1086 .sysfs_ops = &netdev_queue_sysfs_ops, 1086 .sysfs_ops = &netdev_queue_sysfs_ops,
1087 .release = netdev_queue_release, 1087 .release = netdev_queue_release,
1088 .default_attrs = netdev_queue_default_attrs, 1088 .default_attrs = netdev_queue_default_attrs,
1089 }; 1089 };
1090 1090
1091 static int netdev_queue_add_kobject(struct net_device *net, int index) 1091 static int netdev_queue_add_kobject(struct net_device *net, int index)
1092 { 1092 {
1093 struct netdev_queue *queue = net->_tx + index; 1093 struct netdev_queue *queue = net->_tx + index;
1094 struct kobject *kobj = &queue->kobj; 1094 struct kobject *kobj = &queue->kobj;
1095 int error = 0; 1095 int error = 0;
1096 1096
1097 kobj->kset = net->queues_kset; 1097 kobj->kset = net->queues_kset;
1098 error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, 1098 error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
1099 "tx-%u", index); 1099 "tx-%u", index);
1100 if (error) 1100 if (error)
1101 goto exit; 1101 goto exit;
1102 1102
1103 #ifdef CONFIG_BQL 1103 #ifdef CONFIG_BQL
1104 error = sysfs_create_group(kobj, &dql_group); 1104 error = sysfs_create_group(kobj, &dql_group);
1105 if (error) 1105 if (error)
1106 goto exit; 1106 goto exit;
1107 #endif 1107 #endif
1108 1108
1109 kobject_uevent(kobj, KOBJ_ADD); 1109 kobject_uevent(kobj, KOBJ_ADD);
1110 dev_hold(queue->dev); 1110 dev_hold(queue->dev);
1111 1111
1112 return 0; 1112 return 0;
1113 exit: 1113 exit:
1114 kobject_put(kobj); 1114 kobject_put(kobj);
1115 return error; 1115 return error;
1116 } 1116 }
1117 #endif /* CONFIG_SYSFS */ 1117 #endif /* CONFIG_SYSFS */
1118 1118
1119 int 1119 int
1120 netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) 1120 netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
1121 { 1121 {
1122 #ifdef CONFIG_SYSFS 1122 #ifdef CONFIG_SYSFS
1123 int i; 1123 int i;
1124 int error = 0; 1124 int error = 0;
1125 1125
1126 for (i = old_num; i < new_num; i++) { 1126 for (i = old_num; i < new_num; i++) {
1127 error = netdev_queue_add_kobject(net, i); 1127 error = netdev_queue_add_kobject(net, i);
1128 if (error) { 1128 if (error) {
1129 new_num = old_num; 1129 new_num = old_num;
1130 break; 1130 break;
1131 } 1131 }
1132 } 1132 }
1133 1133
1134 while (--i >= new_num) { 1134 while (--i >= new_num) {
1135 struct netdev_queue *queue = net->_tx + i; 1135 struct netdev_queue *queue = net->_tx + i;
1136 1136
1137 #ifdef CONFIG_BQL 1137 #ifdef CONFIG_BQL
1138 sysfs_remove_group(&queue->kobj, &dql_group); 1138 sysfs_remove_group(&queue->kobj, &dql_group);
1139 #endif 1139 #endif
1140 kobject_put(&queue->kobj); 1140 kobject_put(&queue->kobj);
1141 } 1141 }
1142 1142
1143 return error; 1143 return error;
1144 #else 1144 #else
1145 return 0; 1145 return 0;
1146 #endif /* CONFIG_SYSFS */ 1146 #endif /* CONFIG_SYSFS */
1147 } 1147 }
1148 1148
1149 static int register_queue_kobjects(struct net_device *net) 1149 static int register_queue_kobjects(struct net_device *net)
1150 { 1150 {
1151 int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; 1151 int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
1152 1152
1153 #ifdef CONFIG_SYSFS 1153 #ifdef CONFIG_SYSFS
1154 net->queues_kset = kset_create_and_add("queues", 1154 net->queues_kset = kset_create_and_add("queues",
1155 NULL, &net->dev.kobj); 1155 NULL, &net->dev.kobj);
1156 if (!net->queues_kset) 1156 if (!net->queues_kset)
1157 return -ENOMEM; 1157 return -ENOMEM;
1158 #endif 1158 #endif
1159 1159
1160 #ifdef CONFIG_RPS 1160 #ifdef CONFIG_RPS
1161 real_rx = net->real_num_rx_queues; 1161 real_rx = net->real_num_rx_queues;
1162 #endif 1162 #endif
1163 real_tx = net->real_num_tx_queues; 1163 real_tx = net->real_num_tx_queues;
1164 1164
1165 error = net_rx_queue_update_kobjects(net, 0, real_rx); 1165 error = net_rx_queue_update_kobjects(net, 0, real_rx);
1166 if (error) 1166 if (error)
1167 goto error; 1167 goto error;
1168 rxq = real_rx; 1168 rxq = real_rx;
1169 1169
1170 error = netdev_queue_update_kobjects(net, 0, real_tx); 1170 error = netdev_queue_update_kobjects(net, 0, real_tx);
1171 if (error) 1171 if (error)
1172 goto error; 1172 goto error;
1173 txq = real_tx; 1173 txq = real_tx;
1174 1174
1175 return 0; 1175 return 0;
1176 1176
1177 error: 1177 error:
1178 netdev_queue_update_kobjects(net, txq, 0); 1178 netdev_queue_update_kobjects(net, txq, 0);
1179 net_rx_queue_update_kobjects(net, rxq, 0); 1179 net_rx_queue_update_kobjects(net, rxq, 0);
1180 return error; 1180 return error;
1181 } 1181 }
1182 1182
1183 static void remove_queue_kobjects(struct net_device *net) 1183 static void remove_queue_kobjects(struct net_device *net)
1184 { 1184 {
1185 int real_rx = 0, real_tx = 0; 1185 int real_rx = 0, real_tx = 0;
1186 1186
1187 #ifdef CONFIG_RPS 1187 #ifdef CONFIG_RPS
1188 real_rx = net->real_num_rx_queues; 1188 real_rx = net->real_num_rx_queues;
1189 #endif 1189 #endif
1190 real_tx = net->real_num_tx_queues; 1190 real_tx = net->real_num_tx_queues;
1191 1191
1192 net_rx_queue_update_kobjects(net, real_rx, 0); 1192 net_rx_queue_update_kobjects(net, real_rx, 0);
1193 netdev_queue_update_kobjects(net, real_tx, 0); 1193 netdev_queue_update_kobjects(net, real_tx, 0);
1194 #ifdef CONFIG_SYSFS 1194 #ifdef CONFIG_SYSFS
1195 kset_unregister(net->queues_kset); 1195 kset_unregister(net->queues_kset);
1196 #endif 1196 #endif
1197 } 1197 }
1198 1198
1199 static bool net_current_may_mount(void) 1199 static bool net_current_may_mount(void)
1200 { 1200 {
1201 struct net *net = current->nsproxy->net_ns; 1201 struct net *net = current->nsproxy->net_ns;
1202 1202
1203 return ns_capable(net->user_ns, CAP_SYS_ADMIN); 1203 return ns_capable(net->user_ns, CAP_SYS_ADMIN);
1204 } 1204 }
1205 1205
1206 static void *net_grab_current_ns(void) 1206 static void *net_grab_current_ns(void)
1207 { 1207 {
1208 struct net *ns = current->nsproxy->net_ns; 1208 struct net *ns = current->nsproxy->net_ns;
1209 #ifdef CONFIG_NET_NS 1209 #ifdef CONFIG_NET_NS
1210 if (ns) 1210 if (ns)
1211 atomic_inc(&ns->passive); 1211 atomic_inc(&ns->passive);
1212 #endif 1212 #endif
1213 return ns; 1213 return ns;
1214 } 1214 }
1215 1215
1216 static const void *net_initial_ns(void) 1216 static const void *net_initial_ns(void)
1217 { 1217 {
1218 return &init_net; 1218 return &init_net;
1219 } 1219 }
1220 1220
1221 static const void *net_netlink_ns(struct sock *sk) 1221 static const void *net_netlink_ns(struct sock *sk)
1222 { 1222 {
1223 return sock_net(sk); 1223 return sock_net(sk);
1224 } 1224 }
1225 1225
1226 struct kobj_ns_type_operations net_ns_type_operations = { 1226 struct kobj_ns_type_operations net_ns_type_operations = {
1227 .type = KOBJ_NS_TYPE_NET, 1227 .type = KOBJ_NS_TYPE_NET,
1228 .current_may_mount = net_current_may_mount, 1228 .current_may_mount = net_current_may_mount,
1229 .grab_current_ns = net_grab_current_ns, 1229 .grab_current_ns = net_grab_current_ns,
1230 .netlink_ns = net_netlink_ns, 1230 .netlink_ns = net_netlink_ns,
1231 .initial_ns = net_initial_ns, 1231 .initial_ns = net_initial_ns,
1232 .drop_ns = net_drop_ns, 1232 .drop_ns = net_drop_ns,
1233 }; 1233 };
1234 EXPORT_SYMBOL_GPL(net_ns_type_operations); 1234 EXPORT_SYMBOL_GPL(net_ns_type_operations);
1235 1235
1236 static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) 1236 static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
1237 { 1237 {
1238 struct net_device *dev = to_net_dev(d); 1238 struct net_device *dev = to_net_dev(d);
1239 int retval; 1239 int retval;
1240 1240
1241 /* pass interface to uevent. */ 1241 /* pass interface to uevent. */
1242 retval = add_uevent_var(env, "INTERFACE=%s", dev->name); 1242 retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
1243 if (retval) 1243 if (retval)
1244 goto exit; 1244 goto exit;
1245 1245
1246 /* pass ifindex to uevent. 1246 /* pass ifindex to uevent.
1247 * ifindex is useful as it won't change (interface name may change) 1247 * ifindex is useful as it won't change (interface name may change)
1248 * and is what RtNetlink uses natively. */ 1248 * and is what RtNetlink uses natively. */
1249 retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); 1249 retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
1250 1250
1251 exit: 1251 exit:
1252 return retval; 1252 return retval;
1253 } 1253 }
1254 1254
1255 /* 1255 /*
1256 * netdev_release -- destroy and free a dead device. 1256 * netdev_release -- destroy and free a dead device.
1257 * Called when last reference to device kobject is gone. 1257 * Called when last reference to device kobject is gone.
1258 */ 1258 */
1259 static void netdev_release(struct device *d) 1259 static void netdev_release(struct device *d)
1260 { 1260 {
1261 struct net_device *dev = to_net_dev(d); 1261 struct net_device *dev = to_net_dev(d);
1262 1262
1263 BUG_ON(dev->reg_state != NETREG_RELEASED); 1263 BUG_ON(dev->reg_state != NETREG_RELEASED);
1264 1264
1265 kfree(dev->ifalias); 1265 kfree(dev->ifalias);
1266 netdev_freemem(dev); 1266 netdev_freemem(dev);
1267 } 1267 }
1268 1268
1269 static const void *net_namespace(struct device *d) 1269 static const void *net_namespace(struct device *d)
1270 { 1270 {
1271 struct net_device *dev; 1271 struct net_device *dev;
1272 dev = container_of(d, struct net_device, dev); 1272 dev = container_of(d, struct net_device, dev);
1273 return dev_net(dev); 1273 return dev_net(dev);
1274 } 1274 }
1275 1275
1276 static struct class net_class = { 1276 static struct class net_class = {
1277 .name = "net", 1277 .name = "net",
1278 .dev_release = netdev_release, 1278 .dev_release = netdev_release,
1279 .dev_groups = net_class_groups, 1279 .dev_groups = net_class_groups,
1280 .dev_uevent = netdev_uevent, 1280 .dev_uevent = netdev_uevent,
1281 .ns_type = &net_ns_type_operations, 1281 .ns_type = &net_ns_type_operations,
1282 .namespace = net_namespace, 1282 .namespace = net_namespace,
1283 }; 1283 };
1284 1284
1285 /* Delete sysfs entries but hold kobject reference until after all 1285 /* Delete sysfs entries but hold kobject reference until after all
1286 * netdev references are gone. 1286 * netdev references are gone.
1287 */ 1287 */
1288 void netdev_unregister_kobject(struct net_device * net) 1288 void netdev_unregister_kobject(struct net_device * net)
1289 { 1289 {
1290 struct device *dev = &(net->dev); 1290 struct device *dev = &(net->dev);
1291 1291
1292 kobject_get(&dev->kobj); 1292 kobject_get(&dev->kobj);
1293 1293
1294 remove_queue_kobjects(net); 1294 remove_queue_kobjects(net);
1295 1295
1296 pm_runtime_set_memalloc_noio(dev, false); 1296 pm_runtime_set_memalloc_noio(dev, false);
1297 1297
1298 device_del(dev); 1298 device_del(dev);
1299 } 1299 }
1300 1300
1301 /* Create sysfs entries for network device. */ 1301 /* Create sysfs entries for network device. */
1302 int netdev_register_kobject(struct net_device *net) 1302 int netdev_register_kobject(struct net_device *net)
1303 { 1303 {
1304 struct device *dev = &(net->dev); 1304 struct device *dev = &(net->dev);
1305 const struct attribute_group **groups = net->sysfs_groups; 1305 const struct attribute_group **groups = net->sysfs_groups;
1306 int error = 0; 1306 int error = 0;
1307 1307
1308 device_initialize(dev); 1308 device_initialize(dev);
1309 dev->class = &net_class; 1309 dev->class = &net_class;
1310 dev->platform_data = net; 1310 dev->platform_data = net;
1311 dev->groups = groups; 1311 dev->groups = groups;
1312 1312
1313 dev_set_name(dev, "%s", net->name); 1313 dev_set_name(dev, "%s", net->name);
1314 1314
1315 #ifdef CONFIG_SYSFS 1315 #ifdef CONFIG_SYSFS
1316 /* Allow for a device specific group */ 1316 /* Allow for a device specific group */
1317 if (*groups) 1317 if (*groups)
1318 groups++; 1318 groups++;
1319 1319
1320 *groups++ = &netstat_group; 1320 *groups++ = &netstat_group;
1321 1321
1322 #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) 1322 #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
1323 if (net->ieee80211_ptr) 1323 if (net->ieee80211_ptr)
1324 *groups++ = &wireless_group; 1324 *groups++ = &wireless_group;
1325 #if IS_ENABLED(CONFIG_WIRELESS_EXT) 1325 #if IS_ENABLED(CONFIG_WIRELESS_EXT)
1326 else if (net->wireless_handlers) 1326 else if (net->wireless_handlers)
1327 *groups++ = &wireless_group; 1327 *groups++ = &wireless_group;
1328 #endif 1328 #endif
1329 #endif 1329 #endif
1330 #endif /* CONFIG_SYSFS */ 1330 #endif /* CONFIG_SYSFS */
1331 1331
1332 error = device_add(dev); 1332 error = device_add(dev);
1333 if (error) 1333 if (error)
1334 return error; 1334 return error;
1335 1335
1336 error = register_queue_kobjects(net); 1336 error = register_queue_kobjects(net);
1337 if (error) { 1337 if (error) {
1338 device_del(dev); 1338 device_del(dev);
1339 return error; 1339 return error;
1340 } 1340 }
1341 1341
1342 pm_runtime_set_memalloc_noio(dev, true); 1342 pm_runtime_set_memalloc_noio(dev, true);
1343 1343
1344 return error; 1344 return error;
1345 } 1345 }
1346 1346
1347 int netdev_class_create_file_ns(struct class_attribute *class_attr, 1347 int netdev_class_create_file_ns(struct class_attribute *class_attr,
1348 const void *ns) 1348 const void *ns)
1349 { 1349 {
1350 return class_create_file_ns(&net_class, class_attr, ns); 1350 return class_create_file_ns(&net_class, class_attr, ns);
1351 } 1351 }
1352 EXPORT_SYMBOL(netdev_class_create_file_ns); 1352 EXPORT_SYMBOL(netdev_class_create_file_ns);
1353 1353
1354 void netdev_class_remove_file_ns(struct class_attribute *class_attr, 1354 void netdev_class_remove_file_ns(struct class_attribute *class_attr,
1355 const void *ns) 1355 const void *ns)
1356 { 1356 {
1357 class_remove_file_ns(&net_class, class_attr, ns); 1357 class_remove_file_ns(&net_class, class_attr, ns);
1358 } 1358 }
1359 EXPORT_SYMBOL(netdev_class_remove_file_ns); 1359 EXPORT_SYMBOL(netdev_class_remove_file_ns);
1360 1360
1361 int netdev_kobject_init(void) 1361 int netdev_kobject_init(void)
1362 { 1362 {
1363 kobj_ns_type_register(&net_ns_type_operations); 1363 kobj_ns_type_register(&net_ns_type_operations);
1364 return class_register(&net_class); 1364 return class_register(&net_class);
1365 } 1365 }
1366 1366
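For reference, the kobject and attribute names registered in this file determine the sysfs paths userspace sees: per-device counters under statistics/, and per-queue files under queues/rx-<n>/ and queues/tx-<n>/ (rps_cpus, rps_flow_cnt, tx_timeout, xps_cpus, byte_queue_limits/*). A minimal userspace sketch follows; the interface name eth0 and queue index 0 are assumptions for illustration, and writes require CAP_NET_ADMIN as enforced in the store handlers above.

/* Minimal userspace sketch of the sysfs files registered in net-sysfs.c.
 * Paths are built from the names used above ("queues", "rx-%u", "tx-%u",
 * "rps_cpus", "byte_queue_limits", "statistics"); "eth0" and queue 0 are
 * illustrative assumptions only.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void show(const char *path)
{
        char buf[256];
        ssize_t n;
        int fd = open(path, O_RDONLY);

        if (fd < 0) {
                perror(path);
                return;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("%-55s %s\n", path, buf);
        }
        close(fd);
}

int main(void)
{
        int fd;

        /* read-only examples: per-queue attributes and device statistics */
        show("/sys/class/net/eth0/queues/rx-0/rps_cpus");
        show("/sys/class/net/eth0/queues/rx-0/rps_flow_cnt");
        show("/sys/class/net/eth0/queues/tx-0/tx_timeout");
        show("/sys/class/net/eth0/queues/tx-0/byte_queue_limits/inflight");
        show("/sys/class/net/eth0/statistics/rx_packets");

        /* store_rps_map() parses a hex CPU bitmap; needs CAP_NET_ADMIN */
        fd = open("/sys/class/net/eth0/queues/rx-0/rps_cpus", O_WRONLY);
        if (fd >= 0) {
                if (write(fd, "f\n", 2) < 0)    /* steer rx-0 to CPUs 0-3 */
                        perror("rps_cpus");
                close(fd);
        }
        return 0;
}
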
net/core/netprio_cgroup.c
1 /* 1 /*
2 * net/core/netprio_cgroup.c Priority Control Group 2 * net/core/netprio_cgroup.c Priority Control Group
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License 5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version. 7 * 2 of the License, or (at your option) any later version.
8 * 8 *
9 * Authors: Neil Horman <nhorman@tuxdriver.com> 9 * Authors: Neil Horman <nhorman@tuxdriver.com>
10 */ 10 */
11 11
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/slab.h> 15 #include <linux/slab.h>
16 #include <linux/types.h> 16 #include <linux/types.h>
17 #include <linux/string.h> 17 #include <linux/string.h>
18 #include <linux/errno.h> 18 #include <linux/errno.h>
19 #include <linux/skbuff.h> 19 #include <linux/skbuff.h>
20 #include <linux/cgroup.h> 20 #include <linux/cgroup.h>
21 #include <linux/rcupdate.h> 21 #include <linux/rcupdate.h>
22 #include <linux/atomic.h> 22 #include <linux/atomic.h>
23 #include <net/rtnetlink.h> 23 #include <net/rtnetlink.h>
24 #include <net/pkt_cls.h> 24 #include <net/pkt_cls.h>
25 #include <net/sock.h> 25 #include <net/sock.h>
26 #include <net/netprio_cgroup.h> 26 #include <net/netprio_cgroup.h>
27 27
28 #include <linux/fdtable.h> 28 #include <linux/fdtable.h>
29 29
30 #define PRIOMAP_MIN_SZ 128 30 #define PRIOMAP_MIN_SZ 128
31 31
32 /* 32 /*
33 * Extend @dev->priomap so that it's large enough to accomodate 33 * Extend @dev->priomap so that it's large enough to accommodate
34 * @target_idx. @dev->priomap.priomap_len > @target_idx after successful 34 * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
35 * return. Must be called under rtnl lock. 35 * return. Must be called under rtnl lock.
36 */ 36 */
37 static int extend_netdev_table(struct net_device *dev, u32 target_idx) 37 static int extend_netdev_table(struct net_device *dev, u32 target_idx)
38 { 38 {
39 struct netprio_map *old, *new; 39 struct netprio_map *old, *new;
40 size_t new_sz, new_len; 40 size_t new_sz, new_len;
41 41
42 /* is the existing priomap large enough? */ 42 /* is the existing priomap large enough? */
43 old = rtnl_dereference(dev->priomap); 43 old = rtnl_dereference(dev->priomap);
44 if (old && old->priomap_len > target_idx) 44 if (old && old->priomap_len > target_idx)
45 return 0; 45 return 0;
46 46
47 /* 47 /*
48 * Determine the new size. Let's keep it power-of-two. We start 48 * Determine the new size. Let's keep it power-of-two. We start
49 * from PRIOMAP_MIN_SZ and double it until it's large enough to 49 * from PRIOMAP_MIN_SZ and double it until it's large enough to
50 * accommodate @target_idx. 50 * accommodate @target_idx.
51 */ 51 */
52 new_sz = PRIOMAP_MIN_SZ; 52 new_sz = PRIOMAP_MIN_SZ;
53 while (true) { 53 while (true) {
54 new_len = (new_sz - offsetof(struct netprio_map, priomap)) / 54 new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
55 sizeof(new->priomap[0]); 55 sizeof(new->priomap[0]);
56 if (new_len > target_idx) 56 if (new_len > target_idx)
57 break; 57 break;
58 new_sz *= 2; 58 new_sz *= 2;
59 /* overflowed? */ 59 /* overflowed? */
60 if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) 60 if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
61 return -ENOSPC; 61 return -ENOSPC;
62 } 62 }
63 63
64 /* allocate & copy */ 64 /* allocate & copy */
65 new = kzalloc(new_sz, GFP_KERNEL); 65 new = kzalloc(new_sz, GFP_KERNEL);
66 if (!new) 66 if (!new)
67 return -ENOMEM; 67 return -ENOMEM;
68 68
69 if (old) 69 if (old)
70 memcpy(new->priomap, old->priomap, 70 memcpy(new->priomap, old->priomap,
71 old->priomap_len * sizeof(old->priomap[0])); 71 old->priomap_len * sizeof(old->priomap[0]));
72 72
73 new->priomap_len = new_len; 73 new->priomap_len = new_len;
74 74
75 /* install the new priomap */ 75 /* install the new priomap */
76 rcu_assign_pointer(dev->priomap, new); 76 rcu_assign_pointer(dev->priomap, new);
77 if (old) 77 if (old)
78 kfree_rcu(old, rcu); 78 kfree_rcu(old, rcu);
79 return 0; 79 return 0;
80 } 80 }
81 81
82 /** 82 /**
83 * netprio_prio - return the effective netprio of a cgroup-net_device pair 83 * netprio_prio - return the effective netprio of a cgroup-net_device pair
84 * @css: css part of the target pair 84 * @css: css part of the target pair
85 * @dev: net_device part of the target pair 85 * @dev: net_device part of the target pair
86 * 86 *
87 * Should be called under RCU read or rtnl lock. 87 * Should be called under RCU read or rtnl lock.
88 */ 88 */
89 static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) 89 static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
90 { 90 {
91 struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); 91 struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
92 int id = css->cgroup->id; 92 int id = css->cgroup->id;
93 93
94 if (map && id < map->priomap_len) 94 if (map && id < map->priomap_len)
95 return map->priomap[id]; 95 return map->priomap[id];
96 return 0; 96 return 0;
97 } 97 }
98 98
99 /** 99 /**
100 * netprio_set_prio - set netprio on a cgroup-net_device pair 100 * netprio_set_prio - set netprio on a cgroup-net_device pair
101 * @css: css part of the target pair 101 * @css: css part of the target pair
102 * @dev: net_device part of the target pair 102 * @dev: net_device part of the target pair
103 * @prio: prio to set 103 * @prio: prio to set
104 * 104 *
105 * Set netprio to @prio on @css-@dev pair. Should be called under rtnl 105 * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
106 * lock and may fail under memory pressure for non-zero @prio. 106 * lock and may fail under memory pressure for non-zero @prio.
107 */ 107 */
108 static int netprio_set_prio(struct cgroup_subsys_state *css, 108 static int netprio_set_prio(struct cgroup_subsys_state *css,
109 struct net_device *dev, u32 prio) 109 struct net_device *dev, u32 prio)
110 { 110 {
111 struct netprio_map *map; 111 struct netprio_map *map;
112 int id = css->cgroup->id; 112 int id = css->cgroup->id;
113 int ret; 113 int ret;
114 114
115 /* avoid extending priomap for zero writes */ 115 /* avoid extending priomap for zero writes */
116 map = rtnl_dereference(dev->priomap); 116 map = rtnl_dereference(dev->priomap);
117 if (!prio && (!map || map->priomap_len <= id)) 117 if (!prio && (!map || map->priomap_len <= id))
118 return 0; 118 return 0;
119 119
120 ret = extend_netdev_table(dev, id); 120 ret = extend_netdev_table(dev, id);
121 if (ret) 121 if (ret)
122 return ret; 122 return ret;
123 123
124 map = rtnl_dereference(dev->priomap); 124 map = rtnl_dereference(dev->priomap);
125 map->priomap[id] = prio; 125 map->priomap[id] = prio;
126 return 0; 126 return 0;
127 } 127 }
128 128
129 static struct cgroup_subsys_state * 129 static struct cgroup_subsys_state *
130 cgrp_css_alloc(struct cgroup_subsys_state *parent_css) 130 cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
131 { 131 {
132 struct cgroup_subsys_state *css; 132 struct cgroup_subsys_state *css;
133 133
134 css = kzalloc(sizeof(*css), GFP_KERNEL); 134 css = kzalloc(sizeof(*css), GFP_KERNEL);
135 if (!css) 135 if (!css)
136 return ERR_PTR(-ENOMEM); 136 return ERR_PTR(-ENOMEM);
137 137
138 return css; 138 return css;
139 } 139 }
140 140
141 static int cgrp_css_online(struct cgroup_subsys_state *css) 141 static int cgrp_css_online(struct cgroup_subsys_state *css)
142 { 142 {
143 struct cgroup_subsys_state *parent_css = css_parent(css); 143 struct cgroup_subsys_state *parent_css = css_parent(css);
144 struct net_device *dev; 144 struct net_device *dev;
145 int ret = 0; 145 int ret = 0;
146 146
147 if (!parent_css) 147 if (!parent_css)
148 return 0; 148 return 0;
149 149
150 rtnl_lock(); 150 rtnl_lock();
151 /* 151 /*
152 * Inherit prios from the parent. As all prios are set during 152 * Inherit prios from the parent. As all prios are set during
153 * onlining, there is no need to clear them on offline. 153 * onlining, there is no need to clear them on offline.
154 */ 154 */
155 for_each_netdev(&init_net, dev) { 155 for_each_netdev(&init_net, dev) {
156 u32 prio = netprio_prio(parent_css, dev); 156 u32 prio = netprio_prio(parent_css, dev);
157 157
158 ret = netprio_set_prio(css, dev, prio); 158 ret = netprio_set_prio(css, dev, prio);
159 if (ret) 159 if (ret)
160 break; 160 break;
161 } 161 }
162 rtnl_unlock(); 162 rtnl_unlock();
163 return ret; 163 return ret;
164 } 164 }
165 165
166 static void cgrp_css_free(struct cgroup_subsys_state *css) 166 static void cgrp_css_free(struct cgroup_subsys_state *css)
167 { 167 {
168 kfree(css); 168 kfree(css);
169 } 169 }
170 170
171 static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) 171 static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
172 { 172 {
173 return css->cgroup->id; 173 return css->cgroup->id;
174 } 174 }
175 175
176 static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 176 static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
177 struct cgroup_map_cb *cb) 177 struct cgroup_map_cb *cb)
178 { 178 {
179 struct net_device *dev; 179 struct net_device *dev;
180 180
181 rcu_read_lock(); 181 rcu_read_lock();
182 for_each_netdev_rcu(&init_net, dev) 182 for_each_netdev_rcu(&init_net, dev)
183 cb->fill(cb, dev->name, netprio_prio(css, dev)); 183 cb->fill(cb, dev->name, netprio_prio(css, dev));
184 rcu_read_unlock(); 184 rcu_read_unlock();
185 return 0; 185 return 0;
186 } 186 }
187 187
188 static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 188 static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
189 const char *buffer) 189 const char *buffer)
190 { 190 {
191 char devname[IFNAMSIZ + 1]; 191 char devname[IFNAMSIZ + 1];
192 struct net_device *dev; 192 struct net_device *dev;
193 u32 prio; 193 u32 prio;
194 int ret; 194 int ret;
195 195
196 if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) 196 if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
197 return -EINVAL; 197 return -EINVAL;
198 198
199 dev = dev_get_by_name(&init_net, devname); 199 dev = dev_get_by_name(&init_net, devname);
200 if (!dev) 200 if (!dev)
201 return -ENODEV; 201 return -ENODEV;
202 202
203 rtnl_lock(); 203 rtnl_lock();
204 204
205 ret = netprio_set_prio(css, dev, prio); 205 ret = netprio_set_prio(css, dev, prio);
206 206
207 rtnl_unlock(); 207 rtnl_unlock();
208 dev_put(dev); 208 dev_put(dev);
209 return ret; 209 return ret;
210 } 210 }
211 211
212 static int update_netprio(const void *v, struct file *file, unsigned n) 212 static int update_netprio(const void *v, struct file *file, unsigned n)
213 { 213 {
214 int err; 214 int err;
215 struct socket *sock = sock_from_file(file, &err); 215 struct socket *sock = sock_from_file(file, &err);
216 if (sock) 216 if (sock)
217 sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; 217 sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
218 return 0; 218 return 0;
219 } 219 }
220 220
221 static void net_prio_attach(struct cgroup_subsys_state *css, 221 static void net_prio_attach(struct cgroup_subsys_state *css,
222 struct cgroup_taskset *tset) 222 struct cgroup_taskset *tset)
223 { 223 {
224 struct task_struct *p; 224 struct task_struct *p;
225 void *v = (void *)(unsigned long)css->cgroup->id; 225 void *v = (void *)(unsigned long)css->cgroup->id;
226 226
227 cgroup_taskset_for_each(p, css, tset) { 227 cgroup_taskset_for_each(p, css, tset) {
228 task_lock(p); 228 task_lock(p);
229 iterate_fd(p->files, 0, update_netprio, v); 229 iterate_fd(p->files, 0, update_netprio, v);
230 task_unlock(p); 230 task_unlock(p);
231 } 231 }
232 } 232 }
233 233
234 static struct cftype ss_files[] = { 234 static struct cftype ss_files[] = {
235 { 235 {
236 .name = "prioidx", 236 .name = "prioidx",
237 .read_u64 = read_prioidx, 237 .read_u64 = read_prioidx,
238 }, 238 },
239 { 239 {
240 .name = "ifpriomap", 240 .name = "ifpriomap",
241 .read_map = read_priomap, 241 .read_map = read_priomap,
242 .write_string = write_priomap, 242 .write_string = write_priomap,
243 }, 243 },
244 { } /* terminate */ 244 { } /* terminate */
245 }; 245 };
246 246
247 struct cgroup_subsys net_prio_subsys = { 247 struct cgroup_subsys net_prio_subsys = {
248 .name = "net_prio", 248 .name = "net_prio",
249 .css_alloc = cgrp_css_alloc, 249 .css_alloc = cgrp_css_alloc,
250 .css_online = cgrp_css_online, 250 .css_online = cgrp_css_online,
251 .css_free = cgrp_css_free, 251 .css_free = cgrp_css_free,
252 .attach = net_prio_attach, 252 .attach = net_prio_attach,
253 .subsys_id = net_prio_subsys_id, 253 .subsys_id = net_prio_subsys_id,
254 .base_cftypes = ss_files, 254 .base_cftypes = ss_files,
255 .module = THIS_MODULE, 255 .module = THIS_MODULE,
256 }; 256 };
257 257
258 static int netprio_device_event(struct notifier_block *unused, 258 static int netprio_device_event(struct notifier_block *unused,
259 unsigned long event, void *ptr) 259 unsigned long event, void *ptr)
260 { 260 {
261 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 261 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
262 struct netprio_map *old; 262 struct netprio_map *old;
263 263
264 /* 264 /*
265 * Note: this is called with rtnl_lock held, so we have update-side 265 * Note: this is called with rtnl_lock held, so we have update-side
266 * protection on our RCU assignments 266 * protection on our RCU assignments
267 */ 267 */
268 268
269 switch (event) { 269 switch (event) {
270 case NETDEV_UNREGISTER: 270 case NETDEV_UNREGISTER:
271 old = rtnl_dereference(dev->priomap); 271 old = rtnl_dereference(dev->priomap);
272 RCU_INIT_POINTER(dev->priomap, NULL); 272 RCU_INIT_POINTER(dev->priomap, NULL);
273 if (old) 273 if (old)
274 kfree_rcu(old, rcu); 274 kfree_rcu(old, rcu);
275 break; 275 break;
276 } 276 }
277 return NOTIFY_DONE; 277 return NOTIFY_DONE;
278 } 278 }
279 279
280 static struct notifier_block netprio_device_notifier = { 280 static struct notifier_block netprio_device_notifier = {
281 .notifier_call = netprio_device_event 281 .notifier_call = netprio_device_event
282 }; 282 };
283 283
284 static int __init init_cgroup_netprio(void) 284 static int __init init_cgroup_netprio(void)
285 { 285 {
286 int ret; 286 int ret;
287 287
288 ret = cgroup_load_subsys(&net_prio_subsys); 288 ret = cgroup_load_subsys(&net_prio_subsys);
289 if (ret) 289 if (ret)
290 goto out; 290 goto out;
291 291
292 register_netdevice_notifier(&netprio_device_notifier); 292 register_netdevice_notifier(&netprio_device_notifier);
293 293
294 out: 294 out:
295 return ret; 295 return ret;
296 } 296 }
297 297
298 static void __exit exit_cgroup_netprio(void) 298 static void __exit exit_cgroup_netprio(void)
299 { 299 {
300 struct netprio_map *old; 300 struct netprio_map *old;
301 struct net_device *dev; 301 struct net_device *dev;
302 302
303 unregister_netdevice_notifier(&netprio_device_notifier); 303 unregister_netdevice_notifier(&netprio_device_notifier);
304 304
305 cgroup_unload_subsys(&net_prio_subsys); 305 cgroup_unload_subsys(&net_prio_subsys);
306 306
307 rtnl_lock(); 307 rtnl_lock();
308 for_each_netdev(&init_net, dev) { 308 for_each_netdev(&init_net, dev) {
309 old = rtnl_dereference(dev->priomap); 309 old = rtnl_dereference(dev->priomap);
310 RCU_INIT_POINTER(dev->priomap, NULL); 310 RCU_INIT_POINTER(dev->priomap, NULL);
311 if (old) 311 if (old)
312 kfree_rcu(old, rcu); 312 kfree_rcu(old, rcu);
313 } 313 }
314 rtnl_unlock(); 314 rtnl_unlock();
315 } 315 }
316 316
317 module_init(init_cgroup_netprio); 317 module_init(init_cgroup_netprio);
318 module_exit(exit_cgroup_netprio); 318 module_exit(exit_cgroup_netprio);
319 MODULE_LICENSE("GPL v2"); 319 MODULE_LICENSE("GPL v2");
320 320
net/ipv4/ip_sockglue.c
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * The IP to API glue. 6 * The IP to API glue.
7 * 7 *
8 * Authors: see ip.c 8 * Authors: see ip.c
9 * 9 *
10 * Fixes: 10 * Fixes:
11 * Many : Split from ip.c , see ip.c for history. 11 * Many : Split from ip.c , see ip.c for history.
12 * Martin Mares : TOS setting fixed. 12 * Martin Mares : TOS setting fixed.
13 * Alan Cox : Fixed a couple of oopses in Martin's 13 * Alan Cox : Fixed a couple of oopses in Martin's
14 * TOS tweaks. 14 * TOS tweaks.
15 * Mike McLagan : Routing by source 15 * Mike McLagan : Routing by source
16 */ 16 */
17 17
18 #include <linux/module.h> 18 #include <linux/module.h>
19 #include <linux/types.h> 19 #include <linux/types.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/skbuff.h> 21 #include <linux/skbuff.h>
22 #include <linux/ip.h> 22 #include <linux/ip.h>
23 #include <linux/icmp.h> 23 #include <linux/icmp.h>
24 #include <linux/inetdevice.h> 24 #include <linux/inetdevice.h>
25 #include <linux/netdevice.h> 25 #include <linux/netdevice.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <net/sock.h> 27 #include <net/sock.h>
28 #include <net/ip.h> 28 #include <net/ip.h>
29 #include <net/icmp.h> 29 #include <net/icmp.h>
30 #include <net/tcp_states.h> 30 #include <net/tcp_states.h>
31 #include <linux/udp.h> 31 #include <linux/udp.h>
32 #include <linux/igmp.h> 32 #include <linux/igmp.h>
33 #include <linux/netfilter.h> 33 #include <linux/netfilter.h>
34 #include <linux/route.h> 34 #include <linux/route.h>
35 #include <linux/mroute.h> 35 #include <linux/mroute.h>
36 #include <net/inet_ecn.h> 36 #include <net/inet_ecn.h>
37 #include <net/route.h> 37 #include <net/route.h>
38 #include <net/xfrm.h> 38 #include <net/xfrm.h>
39 #include <net/compat.h> 39 #include <net/compat.h>
40 #if IS_ENABLED(CONFIG_IPV6) 40 #if IS_ENABLED(CONFIG_IPV6)
41 #include <net/transp_v6.h> 41 #include <net/transp_v6.h>
42 #endif 42 #endif
43 #include <net/ip_fib.h> 43 #include <net/ip_fib.h>
44 44
45 #include <linux/errqueue.h> 45 #include <linux/errqueue.h>
46 #include <asm/uaccess.h> 46 #include <asm/uaccess.h>
47 47
48 #define IP_CMSG_PKTINFO 1 48 #define IP_CMSG_PKTINFO 1
49 #define IP_CMSG_TTL 2 49 #define IP_CMSG_TTL 2
50 #define IP_CMSG_TOS 4 50 #define IP_CMSG_TOS 4
51 #define IP_CMSG_RECVOPTS 8 51 #define IP_CMSG_RECVOPTS 8
52 #define IP_CMSG_RETOPTS 16 52 #define IP_CMSG_RETOPTS 16
53 #define IP_CMSG_PASSSEC 32 53 #define IP_CMSG_PASSSEC 32
54 #define IP_CMSG_ORIGDSTADDR 64 54 #define IP_CMSG_ORIGDSTADDR 64
55 55
56 /* 56 /*
57 * SOL_IP control messages. 57 * SOL_IP control messages.
58 */ 58 */
59 #define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb)) 59 #define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb))
60 60
61 static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) 61 static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
62 { 62 {
63 struct in_pktinfo info = *PKTINFO_SKB_CB(skb); 63 struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
64 64
65 info.ipi_addr.s_addr = ip_hdr(skb)->daddr; 65 info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
66 66
67 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); 67 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
68 } 68 }
69 69
70 static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) 70 static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
71 { 71 {
72 int ttl = ip_hdr(skb)->ttl; 72 int ttl = ip_hdr(skb)->ttl;
73 put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); 73 put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
74 } 74 }
75 75
76 static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) 76 static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
77 { 77 {
78 put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos); 78 put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos);
79 } 79 }
80 80
81 static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) 81 static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
82 { 82 {
83 if (IPCB(skb)->opt.optlen == 0) 83 if (IPCB(skb)->opt.optlen == 0)
84 return; 84 return;
85 85
86 put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, 86 put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen,
87 ip_hdr(skb) + 1); 87 ip_hdr(skb) + 1);
88 } 88 }
89 89
90 90
91 static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) 91 static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
92 { 92 {
93 unsigned char optbuf[sizeof(struct ip_options) + 40]; 93 unsigned char optbuf[sizeof(struct ip_options) + 40];
94 struct ip_options *opt = (struct ip_options *)optbuf; 94 struct ip_options *opt = (struct ip_options *)optbuf;
95 95
96 if (IPCB(skb)->opt.optlen == 0) 96 if (IPCB(skb)->opt.optlen == 0)
97 return; 97 return;
98 98
99 if (ip_options_echo(opt, skb)) { 99 if (ip_options_echo(opt, skb)) {
100 msg->msg_flags |= MSG_CTRUNC; 100 msg->msg_flags |= MSG_CTRUNC;
101 return; 101 return;
102 } 102 }
103 ip_options_undo(opt); 103 ip_options_undo(opt);
104 104
105 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); 105 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
106 } 106 }
107 107
108 static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) 108 static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
109 { 109 {
110 char *secdata; 110 char *secdata;
111 u32 seclen, secid; 111 u32 seclen, secid;
112 int err; 112 int err;
113 113
114 err = security_socket_getpeersec_dgram(NULL, skb, &secid); 114 err = security_socket_getpeersec_dgram(NULL, skb, &secid);
115 if (err) 115 if (err)
116 return; 116 return;
117 117
118 err = security_secid_to_secctx(secid, &secdata, &seclen); 118 err = security_secid_to_secctx(secid, &secdata, &seclen);
119 if (err) 119 if (err)
120 return; 120 return;
121 121
122 put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata); 122 put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
123 security_release_secctx(secdata, seclen); 123 security_release_secctx(secdata, seclen);
124 } 124 }
125 125
126 static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) 126 static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
127 { 127 {
128 struct sockaddr_in sin; 128 struct sockaddr_in sin;
129 const struct iphdr *iph = ip_hdr(skb); 129 const struct iphdr *iph = ip_hdr(skb);
130 __be16 *ports = (__be16 *)skb_transport_header(skb); 130 __be16 *ports = (__be16 *)skb_transport_header(skb);
131 131
132 if (skb_transport_offset(skb) + 4 > skb->len) 132 if (skb_transport_offset(skb) + 4 > skb->len)
133 return; 133 return;
134 134
135 /* All current transport protocols have the port numbers in the 135 /* All current transport protocols have the port numbers in the
136 * first four bytes of the transport header and this function is 136 * first four bytes of the transport header and this function is
137 * written with this assumption in mind. 137 * written with this assumption in mind.
138 */ 138 */
139 139
140 sin.sin_family = AF_INET; 140 sin.sin_family = AF_INET;
141 sin.sin_addr.s_addr = iph->daddr; 141 sin.sin_addr.s_addr = iph->daddr;
142 sin.sin_port = ports[1]; 142 sin.sin_port = ports[1];
143 memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); 143 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
144 144
145 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); 145 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
146 } 146 }
147 147
148 void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) 148 void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
149 { 149 {
150 struct inet_sock *inet = inet_sk(skb->sk); 150 struct inet_sock *inet = inet_sk(skb->sk);
151 unsigned int flags = inet->cmsg_flags; 151 unsigned int flags = inet->cmsg_flags;
152 152
153 /* Ordered by supposed usage frequency */ 153 /* Ordered by supposed usage frequency */
154 if (flags & 1) 154 if (flags & 1)
155 ip_cmsg_recv_pktinfo(msg, skb); 155 ip_cmsg_recv_pktinfo(msg, skb);
156 if ((flags >>= 1) == 0) 156 if ((flags >>= 1) == 0)
157 return; 157 return;
158 158
159 if (flags & 1) 159 if (flags & 1)
160 ip_cmsg_recv_ttl(msg, skb); 160 ip_cmsg_recv_ttl(msg, skb);
161 if ((flags >>= 1) == 0) 161 if ((flags >>= 1) == 0)
162 return; 162 return;
163 163
164 if (flags & 1) 164 if (flags & 1)
165 ip_cmsg_recv_tos(msg, skb); 165 ip_cmsg_recv_tos(msg, skb);
166 if ((flags >>= 1) == 0) 166 if ((flags >>= 1) == 0)
167 return; 167 return;
168 168
169 if (flags & 1) 169 if (flags & 1)
170 ip_cmsg_recv_opts(msg, skb); 170 ip_cmsg_recv_opts(msg, skb);
171 if ((flags >>= 1) == 0) 171 if ((flags >>= 1) == 0)
172 return; 172 return;
173 173
174 if (flags & 1) 174 if (flags & 1)
175 ip_cmsg_recv_retopts(msg, skb); 175 ip_cmsg_recv_retopts(msg, skb);
176 if ((flags >>= 1) == 0) 176 if ((flags >>= 1) == 0)
177 return; 177 return;
178 178
179 if (flags & 1) 179 if (flags & 1)
180 ip_cmsg_recv_security(msg, skb); 180 ip_cmsg_recv_security(msg, skb);
181 181
182 if ((flags >>= 1) == 0) 182 if ((flags >>= 1) == 0)
183 return; 183 return;
184 if (flags & 1) 184 if (flags & 1)
185 ip_cmsg_recv_dstaddr(msg, skb); 185 ip_cmsg_recv_dstaddr(msg, skb);
186 186
187 } 187 }
188 EXPORT_SYMBOL(ip_cmsg_recv); 188 EXPORT_SYMBOL(ip_cmsg_recv);
189 189
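ip_cmsg_recv() above is the kernel half of per-packet ancillary data; the user-space half enables the wanted flags with setsockopt() and walks the control buffer returned by recvmsg(). A minimal sketch for IP_PKTINFO on a bound UDP socket (the helper name and buffer sizes are illustrative, not from this file):

#define _GNU_SOURCE	/* for struct in_pktinfo on older glibc */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Receive one datagram and print the ifindex and destination address that
 * ip_cmsg_recv_pktinfo() placed in the IP_PKTINFO control message.
 */
static int recv_with_pktinfo(int fd)
{
	char data[1500], cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	if (setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one)) < 0)
		return -1;
	if (recvmsg(fd, &msg, 0) < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo info;

			memcpy(&info, CMSG_DATA(cmsg), sizeof(info));
			printf("ifindex %d dst %s\n", info.ipi_ifindex,
			       inet_ntoa(info.ipi_addr));
		}
	}
	return 0;
}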
190 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) 190 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
191 { 191 {
192 int err, val; 192 int err, val;
193 struct cmsghdr *cmsg; 193 struct cmsghdr *cmsg;
194 194
195 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { 195 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
196 if (!CMSG_OK(msg, cmsg)) 196 if (!CMSG_OK(msg, cmsg))
197 return -EINVAL; 197 return -EINVAL;
198 if (cmsg->cmsg_level != SOL_IP) 198 if (cmsg->cmsg_level != SOL_IP)
199 continue; 199 continue;
200 switch (cmsg->cmsg_type) { 200 switch (cmsg->cmsg_type) {
201 case IP_RETOPTS: 201 case IP_RETOPTS:
202 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); 202 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
203 err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), 203 err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
204 err < 40 ? err : 40); 204 err < 40 ? err : 40);
205 if (err) 205 if (err)
206 return err; 206 return err;
207 break; 207 break;
208 case IP_PKTINFO: 208 case IP_PKTINFO:
209 { 209 {
210 struct in_pktinfo *info; 210 struct in_pktinfo *info;
211 if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) 211 if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
212 return -EINVAL; 212 return -EINVAL;
213 info = (struct in_pktinfo *)CMSG_DATA(cmsg); 213 info = (struct in_pktinfo *)CMSG_DATA(cmsg);
214 ipc->oif = info->ipi_ifindex; 214 ipc->oif = info->ipi_ifindex;
215 ipc->addr = info->ipi_spec_dst.s_addr; 215 ipc->addr = info->ipi_spec_dst.s_addr;
216 break; 216 break;
217 } 217 }
218 case IP_TTL: 218 case IP_TTL:
219 if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) 219 if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
220 return -EINVAL; 220 return -EINVAL;
221 val = *(int *)CMSG_DATA(cmsg); 221 val = *(int *)CMSG_DATA(cmsg);
222 if (val < 1 || val > 255) 222 if (val < 1 || val > 255)
223 return -EINVAL; 223 return -EINVAL;
224 ipc->ttl = val; 224 ipc->ttl = val;
225 break; 225 break;
226 case IP_TOS: 226 case IP_TOS:
227 if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) 227 if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
228 return -EINVAL; 228 return -EINVAL;
229 val = *(int *)CMSG_DATA(cmsg); 229 val = *(int *)CMSG_DATA(cmsg);
230 if (val < 0 || val > 255) 230 if (val < 0 || val > 255)
231 return -EINVAL; 231 return -EINVAL;
232 ipc->tos = val; 232 ipc->tos = val;
233 ipc->priority = rt_tos2priority(ipc->tos); 233 ipc->priority = rt_tos2priority(ipc->tos);
234 break; 234 break;
235 235
236 default: 236 default:
237 return -EINVAL; 237 return -EINVAL;
238 } 238 }
239 } 239 }
240 return 0; 240 return 0;
241 } 241 }
242 242
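Going the other way, ip_cmsg_send() above accepts IP_RETOPTS, IP_PKTINFO, IP_TTL and IP_TOS control messages on sendmsg(); the TTL/TOS variants expect an int payload in the ranges checked above. A hedged sketch sending one UDP datagram with a per-packet TOS (the helper name is illustrative):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Send a short datagram to dst with TOS 0x10 attached as an IP_TOS
 * control message, which ip_cmsg_send() copies into ipc->tos.
 */
static int send_with_tos(int fd, const struct sockaddr_in *dst)
{
	char data[] = "hi";
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) - 1 };
	struct msghdr msg = {
		.msg_name = (void *)dst, .msg_namelen = sizeof(*dst),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int tos = 0x10;

	memset(cbuf, 0, sizeof(cbuf));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = IPPROTO_IP;
	cmsg->cmsg_type = IP_TOS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(tos));
	memcpy(CMSG_DATA(cmsg), &tos, sizeof(tos));

	return sendmsg(fd, &msg, 0) < 0 ? -1 : 0;
}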
243 243
244 /* Special input handler for packets caught by router alert option. 244 /* Special input handler for packets caught by router alert option.
245 They are selected only by the protocol field, and then processed like 245 They are selected only by the protocol field, and then processed like
246 local ones; but only if someone wants them! Otherwise, a router 246 local ones; but only if someone wants them! Otherwise, a router
247 not running rsvpd will kill RSVP. 247 not running rsvpd will kill RSVP.
248 248
249 What user space does with them is a user-level problem. 249 What user space does with them is a user-level problem.
250 I have no idea how it will masquerade or NAT them (it is a joke, joke :-)), 250 I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
251 but the receiver should be clever enough, e.g., to forward mtrace requests 251 but the receiver should be clever enough, e.g., to forward mtrace requests
252 sent to a multicast group, to reach the destination's designated router. 252 sent to a multicast group, to reach the destination's designated router.
253 */ 253 */
254 struct ip_ra_chain __rcu *ip_ra_chain; 254 struct ip_ra_chain __rcu *ip_ra_chain;
255 static DEFINE_SPINLOCK(ip_ra_lock); 255 static DEFINE_SPINLOCK(ip_ra_lock);
256 256
257 257
258 static void ip_ra_destroy_rcu(struct rcu_head *head) 258 static void ip_ra_destroy_rcu(struct rcu_head *head)
259 { 259 {
260 struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); 260 struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
261 261
262 sock_put(ra->saved_sk); 262 sock_put(ra->saved_sk);
263 kfree(ra); 263 kfree(ra);
264 } 264 }
265 265
266 int ip_ra_control(struct sock *sk, unsigned char on, 266 int ip_ra_control(struct sock *sk, unsigned char on,
267 void (*destructor)(struct sock *)) 267 void (*destructor)(struct sock *))
268 { 268 {
269 struct ip_ra_chain *ra, *new_ra; 269 struct ip_ra_chain *ra, *new_ra;
270 struct ip_ra_chain __rcu **rap; 270 struct ip_ra_chain __rcu **rap;
271 271
272 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) 272 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
273 return -EINVAL; 273 return -EINVAL;
274 274
275 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 275 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
276 276
277 spin_lock_bh(&ip_ra_lock); 277 spin_lock_bh(&ip_ra_lock);
278 for (rap = &ip_ra_chain; 278 for (rap = &ip_ra_chain;
279 (ra = rcu_dereference_protected(*rap, 279 (ra = rcu_dereference_protected(*rap,
280 lockdep_is_held(&ip_ra_lock))) != NULL; 280 lockdep_is_held(&ip_ra_lock))) != NULL;
281 rap = &ra->next) { 281 rap = &ra->next) {
282 if (ra->sk == sk) { 282 if (ra->sk == sk) {
283 if (on) { 283 if (on) {
284 spin_unlock_bh(&ip_ra_lock); 284 spin_unlock_bh(&ip_ra_lock);
285 kfree(new_ra); 285 kfree(new_ra);
286 return -EADDRINUSE; 286 return -EADDRINUSE;
287 } 287 }
288 /* don't let ip_call_ra_chain() use sk again */ 288 /* don't let ip_call_ra_chain() use sk again */
289 ra->sk = NULL; 289 ra->sk = NULL;
290 rcu_assign_pointer(*rap, ra->next); 290 rcu_assign_pointer(*rap, ra->next);
291 spin_unlock_bh(&ip_ra_lock); 291 spin_unlock_bh(&ip_ra_lock);
292 292
293 if (ra->destructor) 293 if (ra->destructor)
294 ra->destructor(sk); 294 ra->destructor(sk);
295 /* 295 /*
296 * Delay sock_put(sk) and kfree(ra) after one rcu grace 296 * Delay sock_put(sk) and kfree(ra) after one rcu grace
297 * period. This guarantees ip_call_ra_chain() doesn't need 297 * period. This guarantees ip_call_ra_chain() doesn't need
298 * to mess with socket refcounts. 298 * to mess with socket refcounts.
299 */ 299 */
300 ra->saved_sk = sk; 300 ra->saved_sk = sk;
301 call_rcu(&ra->rcu, ip_ra_destroy_rcu); 301 call_rcu(&ra->rcu, ip_ra_destroy_rcu);
302 return 0; 302 return 0;
303 } 303 }
304 } 304 }
305 if (new_ra == NULL) { 305 if (new_ra == NULL) {
306 spin_unlock_bh(&ip_ra_lock); 306 spin_unlock_bh(&ip_ra_lock);
307 return -ENOBUFS; 307 return -ENOBUFS;
308 } 308 }
309 new_ra->sk = sk; 309 new_ra->sk = sk;
310 new_ra->destructor = destructor; 310 new_ra->destructor = destructor;
311 311
312 new_ra->next = ra; 312 new_ra->next = ra;
313 rcu_assign_pointer(*rap, new_ra); 313 rcu_assign_pointer(*rap, new_ra);
314 sock_hold(sk); 314 sock_hold(sk);
315 spin_unlock_bh(&ip_ra_lock); 315 spin_unlock_bh(&ip_ra_lock);
316 316
317 return 0; 317 return 0;
318 } 318 }
319 319
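ip_ra_control() above is reached through the IP_ROUTER_ALERT socket option and, per the check at its top, only from raw sockets whose protocol is not IPPROTO_RAW. A sketch of how a routing daemon might register for Router Alert packets (protocol choice and privileges are up to the caller; illustrative, not from the patch):

#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Ask the kernel to divert Router Alert packets of the given protocol
 * (e.g. IPPROTO_RSVP) to this raw socket; requires CAP_NET_RAW.
 */
static int register_router_alert(int protocol)
{
	int fd = socket(AF_INET, SOCK_RAW, protocol);
	int on = 1;

	if (fd < 0)
		return -1;
	if (setsockopt(fd, IPPROTO_IP, IP_ROUTER_ALERT, &on, sizeof(on)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}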
320 void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, 320 void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
321 __be16 port, u32 info, u8 *payload) 321 __be16 port, u32 info, u8 *payload)
322 { 322 {
323 struct sock_exterr_skb *serr; 323 struct sock_exterr_skb *serr;
324 324
325 skb = skb_clone(skb, GFP_ATOMIC); 325 skb = skb_clone(skb, GFP_ATOMIC);
326 if (!skb) 326 if (!skb)
327 return; 327 return;
328 328
329 serr = SKB_EXT_ERR(skb); 329 serr = SKB_EXT_ERR(skb);
330 serr->ee.ee_errno = err; 330 serr->ee.ee_errno = err;
331 serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; 331 serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
332 serr->ee.ee_type = icmp_hdr(skb)->type; 332 serr->ee.ee_type = icmp_hdr(skb)->type;
333 serr->ee.ee_code = icmp_hdr(skb)->code; 333 serr->ee.ee_code = icmp_hdr(skb)->code;
334 serr->ee.ee_pad = 0; 334 serr->ee.ee_pad = 0;
335 serr->ee.ee_info = info; 335 serr->ee.ee_info = info;
336 serr->ee.ee_data = 0; 336 serr->ee.ee_data = 0;
337 serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) - 337 serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) -
338 skb_network_header(skb); 338 skb_network_header(skb);
339 serr->port = port; 339 serr->port = port;
340 340
341 if (skb_pull(skb, payload - skb->data) != NULL) { 341 if (skb_pull(skb, payload - skb->data) != NULL) {
342 skb_reset_transport_header(skb); 342 skb_reset_transport_header(skb);
343 if (sock_queue_err_skb(sk, skb) == 0) 343 if (sock_queue_err_skb(sk, skb) == 0)
344 return; 344 return;
345 } 345 }
346 kfree_skb(skb); 346 kfree_skb(skb);
347 } 347 }
348 348
349 void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) 349 void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
350 { 350 {
351 struct inet_sock *inet = inet_sk(sk); 351 struct inet_sock *inet = inet_sk(sk);
352 struct sock_exterr_skb *serr; 352 struct sock_exterr_skb *serr;
353 struct iphdr *iph; 353 struct iphdr *iph;
354 struct sk_buff *skb; 354 struct sk_buff *skb;
355 355
356 if (!inet->recverr) 356 if (!inet->recverr)
357 return; 357 return;
358 358
359 skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); 359 skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
360 if (!skb) 360 if (!skb)
361 return; 361 return;
362 362
363 skb_put(skb, sizeof(struct iphdr)); 363 skb_put(skb, sizeof(struct iphdr));
364 skb_reset_network_header(skb); 364 skb_reset_network_header(skb);
365 iph = ip_hdr(skb); 365 iph = ip_hdr(skb);
366 iph->daddr = daddr; 366 iph->daddr = daddr;
367 367
368 serr = SKB_EXT_ERR(skb); 368 serr = SKB_EXT_ERR(skb);
369 serr->ee.ee_errno = err; 369 serr->ee.ee_errno = err;
370 serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; 370 serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
371 serr->ee.ee_type = 0; 371 serr->ee.ee_type = 0;
372 serr->ee.ee_code = 0; 372 serr->ee.ee_code = 0;
373 serr->ee.ee_pad = 0; 373 serr->ee.ee_pad = 0;
374 serr->ee.ee_info = info; 374 serr->ee.ee_info = info;
375 serr->ee.ee_data = 0; 375 serr->ee.ee_data = 0;
376 serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); 376 serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
377 serr->port = port; 377 serr->port = port;
378 378
379 __skb_pull(skb, skb_tail_pointer(skb) - skb->data); 379 __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
380 skb_reset_transport_header(skb); 380 skb_reset_transport_header(skb);
381 381
382 if (sock_queue_err_skb(sk, skb)) 382 if (sock_queue_err_skb(sk, skb))
383 kfree_skb(skb); 383 kfree_skb(skb);
384 } 384 }
385 385
386 /* 386 /*
387 * Handle MSG_ERRQUEUE 387 * Handle MSG_ERRQUEUE
388 */ 388 */
389 int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) 389 int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
390 { 390 {
391 struct sock_exterr_skb *serr; 391 struct sock_exterr_skb *serr;
392 struct sk_buff *skb, *skb2; 392 struct sk_buff *skb, *skb2;
393 struct sockaddr_in *sin; 393 struct sockaddr_in *sin;
394 struct { 394 struct {
395 struct sock_extended_err ee; 395 struct sock_extended_err ee;
396 struct sockaddr_in offender; 396 struct sockaddr_in offender;
397 } errhdr; 397 } errhdr;
398 int err; 398 int err;
399 int copied; 399 int copied;
400 400
401 err = -EAGAIN; 401 err = -EAGAIN;
402 skb = skb_dequeue(&sk->sk_error_queue); 402 skb = skb_dequeue(&sk->sk_error_queue);
403 if (skb == NULL) 403 if (skb == NULL)
404 goto out; 404 goto out;
405 405
406 copied = skb->len; 406 copied = skb->len;
407 if (copied > len) { 407 if (copied > len) {
408 msg->msg_flags |= MSG_TRUNC; 408 msg->msg_flags |= MSG_TRUNC;
409 copied = len; 409 copied = len;
410 } 410 }
411 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 411 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
412 if (err) 412 if (err)
413 goto out_free_skb; 413 goto out_free_skb;
414 414
415 sock_recv_timestamp(msg, sk, skb); 415 sock_recv_timestamp(msg, sk, skb);
416 416
417 serr = SKB_EXT_ERR(skb); 417 serr = SKB_EXT_ERR(skb);
418 418
419 sin = (struct sockaddr_in *)msg->msg_name; 419 sin = (struct sockaddr_in *)msg->msg_name;
420 if (sin) { 420 if (sin) {
421 sin->sin_family = AF_INET; 421 sin->sin_family = AF_INET;
422 sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + 422 sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
423 serr->addr_offset); 423 serr->addr_offset);
424 sin->sin_port = serr->port; 424 sin->sin_port = serr->port;
425 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); 425 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
426 *addr_len = sizeof(*sin); 426 *addr_len = sizeof(*sin);
427 } 427 }
428 428
429 memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); 429 memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
430 sin = &errhdr.offender; 430 sin = &errhdr.offender;
431 sin->sin_family = AF_UNSPEC; 431 sin->sin_family = AF_UNSPEC;
432 if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { 432 if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
433 struct inet_sock *inet = inet_sk(sk); 433 struct inet_sock *inet = inet_sk(sk);
434 434
435 sin->sin_family = AF_INET; 435 sin->sin_family = AF_INET;
436 sin->sin_addr.s_addr = ip_hdr(skb)->saddr; 436 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
437 sin->sin_port = 0; 437 sin->sin_port = 0;
438 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); 438 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
439 if (inet->cmsg_flags) 439 if (inet->cmsg_flags)
440 ip_cmsg_recv(msg, skb); 440 ip_cmsg_recv(msg, skb);
441 } 441 }
442 442
443 put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); 443 put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
444 444
445 /* Now we could try to dump offended packet options */ 445 /* Now we could try to dump offended packet options */
446 446
447 msg->msg_flags |= MSG_ERRQUEUE; 447 msg->msg_flags |= MSG_ERRQUEUE;
448 err = copied; 448 err = copied;
449 449
450 /* Reset and regenerate socket error */ 450 /* Reset and regenerate socket error */
451 spin_lock_bh(&sk->sk_error_queue.lock); 451 spin_lock_bh(&sk->sk_error_queue.lock);
452 sk->sk_err = 0; 452 sk->sk_err = 0;
453 skb2 = skb_peek(&sk->sk_error_queue); 453 skb2 = skb_peek(&sk->sk_error_queue);
454 if (skb2 != NULL) { 454 if (skb2 != NULL) {
455 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; 455 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
456 spin_unlock_bh(&sk->sk_error_queue.lock); 456 spin_unlock_bh(&sk->sk_error_queue.lock);
457 sk->sk_error_report(sk); 457 sk->sk_error_report(sk);
458 } else 458 } else
459 spin_unlock_bh(&sk->sk_error_queue.lock); 459 spin_unlock_bh(&sk->sk_error_queue.lock);
460 460
461 out_free_skb: 461 out_free_skb:
462 kfree_skb(skb); 462 kfree_skb(skb);
463 out: 463 out:
464 return err; 464 return err;
465 } 465 }
466 466
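ip_recv_error() above is what services recvmsg(..., MSG_ERRQUEUE); user space opts in with IP_RECVERR and then reads a struct sock_extended_err from the IP_RECVERR control message. A minimal sketch (the helper name and buffer sizes are illustrative):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/errqueue.h>

/* Drain one entry from the socket error queue and report the ICMP
 * type/code that ip_icmp_error() recorded, if that was the origin.
 */
static int read_icmp_error(int fd)
{
	char data[1500], cbuf[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int on = 1;

	if (setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on)) < 0)
		return -1;
	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return -1;	/* EAGAIN when the error queue is empty */

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVERR) {
			struct sock_extended_err ee;

			memcpy(&ee, CMSG_DATA(cmsg), sizeof(ee));
			if (ee.ee_origin == SO_EE_ORIGIN_ICMP)
				printf("icmp type %u code %u errno %u\n",
				       ee.ee_type, ee.ee_code, ee.ee_errno);
		}
	}
	return 0;
}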
467 467
468 /* 468 /*
469 * Socket option code for IP. This is the end of the line after any 469 * Socket option code for IP. This is the end of the line after any
470 * TCP,UDP etc options on an IP socket. 470 * TCP,UDP etc options on an IP socket.
471 */ 471 */
472 472
473 static int do_ip_setsockopt(struct sock *sk, int level, 473 static int do_ip_setsockopt(struct sock *sk, int level,
474 int optname, char __user *optval, unsigned int optlen) 474 int optname, char __user *optval, unsigned int optlen)
475 { 475 {
476 struct inet_sock *inet = inet_sk(sk); 476 struct inet_sock *inet = inet_sk(sk);
477 int val = 0, err; 477 int val = 0, err;
478 478
479 switch (optname) { 479 switch (optname) {
480 case IP_PKTINFO: 480 case IP_PKTINFO:
481 case IP_RECVTTL: 481 case IP_RECVTTL:
482 case IP_RECVOPTS: 482 case IP_RECVOPTS:
483 case IP_RECVTOS: 483 case IP_RECVTOS:
484 case IP_RETOPTS: 484 case IP_RETOPTS:
485 case IP_TOS: 485 case IP_TOS:
486 case IP_TTL: 486 case IP_TTL:
487 case IP_HDRINCL: 487 case IP_HDRINCL:
488 case IP_MTU_DISCOVER: 488 case IP_MTU_DISCOVER:
489 case IP_RECVERR: 489 case IP_RECVERR:
490 case IP_ROUTER_ALERT: 490 case IP_ROUTER_ALERT:
491 case IP_FREEBIND: 491 case IP_FREEBIND:
492 case IP_PASSSEC: 492 case IP_PASSSEC:
493 case IP_TRANSPARENT: 493 case IP_TRANSPARENT:
494 case IP_MINTTL: 494 case IP_MINTTL:
495 case IP_NODEFRAG: 495 case IP_NODEFRAG:
496 case IP_UNICAST_IF: 496 case IP_UNICAST_IF:
497 case IP_MULTICAST_TTL: 497 case IP_MULTICAST_TTL:
498 case IP_MULTICAST_ALL: 498 case IP_MULTICAST_ALL:
499 case IP_MULTICAST_LOOP: 499 case IP_MULTICAST_LOOP:
500 case IP_RECVORIGDSTADDR: 500 case IP_RECVORIGDSTADDR:
501 if (optlen >= sizeof(int)) { 501 if (optlen >= sizeof(int)) {
502 if (get_user(val, (int __user *) optval)) 502 if (get_user(val, (int __user *) optval))
503 return -EFAULT; 503 return -EFAULT;
504 } else if (optlen >= sizeof(char)) { 504 } else if (optlen >= sizeof(char)) {
505 unsigned char ucval; 505 unsigned char ucval;
506 506
507 if (get_user(ucval, (unsigned char __user *) optval)) 507 if (get_user(ucval, (unsigned char __user *) optval))
508 return -EFAULT; 508 return -EFAULT;
509 val = (int) ucval; 509 val = (int) ucval;
510 } 510 }
511 } 511 }
512 512
513 /* If optlen==0, it is equivalent to val == 0 */ 513 /* If optlen==0, it is equivalent to val == 0 */
514 514
515 if (ip_mroute_opt(optname)) 515 if (ip_mroute_opt(optname))
516 return ip_mroute_setsockopt(sk, optname, optval, optlen); 516 return ip_mroute_setsockopt(sk, optname, optval, optlen);
517 517
518 err = 0; 518 err = 0;
519 lock_sock(sk); 519 lock_sock(sk);
520 520
521 switch (optname) { 521 switch (optname) {
522 case IP_OPTIONS: 522 case IP_OPTIONS:
523 { 523 {
524 struct ip_options_rcu *old, *opt = NULL; 524 struct ip_options_rcu *old, *opt = NULL;
525 525
526 if (optlen > 40) 526 if (optlen > 40)
527 goto e_inval; 527 goto e_inval;
528 err = ip_options_get_from_user(sock_net(sk), &opt, 528 err = ip_options_get_from_user(sock_net(sk), &opt,
529 optval, optlen); 529 optval, optlen);
530 if (err) 530 if (err)
531 break; 531 break;
532 old = rcu_dereference_protected(inet->inet_opt, 532 old = rcu_dereference_protected(inet->inet_opt,
533 sock_owned_by_user(sk)); 533 sock_owned_by_user(sk));
534 if (inet->is_icsk) { 534 if (inet->is_icsk) {
535 struct inet_connection_sock *icsk = inet_csk(sk); 535 struct inet_connection_sock *icsk = inet_csk(sk);
536 #if IS_ENABLED(CONFIG_IPV6) 536 #if IS_ENABLED(CONFIG_IPV6)
537 if (sk->sk_family == PF_INET || 537 if (sk->sk_family == PF_INET ||
538 (!((1 << sk->sk_state) & 538 (!((1 << sk->sk_state) &
539 (TCPF_LISTEN | TCPF_CLOSE)) && 539 (TCPF_LISTEN | TCPF_CLOSE)) &&
540 inet->inet_daddr != LOOPBACK4_IPV6)) { 540 inet->inet_daddr != LOOPBACK4_IPV6)) {
541 #endif 541 #endif
542 if (old) 542 if (old)
543 icsk->icsk_ext_hdr_len -= old->opt.optlen; 543 icsk->icsk_ext_hdr_len -= old->opt.optlen;
544 if (opt) 544 if (opt)
545 icsk->icsk_ext_hdr_len += opt->opt.optlen; 545 icsk->icsk_ext_hdr_len += opt->opt.optlen;
546 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); 546 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
547 #if IS_ENABLED(CONFIG_IPV6) 547 #if IS_ENABLED(CONFIG_IPV6)
548 } 548 }
549 #endif 549 #endif
550 } 550 }
551 rcu_assign_pointer(inet->inet_opt, opt); 551 rcu_assign_pointer(inet->inet_opt, opt);
552 if (old) 552 if (old)
553 kfree_rcu(old, rcu); 553 kfree_rcu(old, rcu);
554 break; 554 break;
555 } 555 }
556 case IP_PKTINFO: 556 case IP_PKTINFO:
557 if (val) 557 if (val)
558 inet->cmsg_flags |= IP_CMSG_PKTINFO; 558 inet->cmsg_flags |= IP_CMSG_PKTINFO;
559 else 559 else
560 inet->cmsg_flags &= ~IP_CMSG_PKTINFO; 560 inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
561 break; 561 break;
562 case IP_RECVTTL: 562 case IP_RECVTTL:
563 if (val) 563 if (val)
564 inet->cmsg_flags |= IP_CMSG_TTL; 564 inet->cmsg_flags |= IP_CMSG_TTL;
565 else 565 else
566 inet->cmsg_flags &= ~IP_CMSG_TTL; 566 inet->cmsg_flags &= ~IP_CMSG_TTL;
567 break; 567 break;
568 case IP_RECVTOS: 568 case IP_RECVTOS:
569 if (val) 569 if (val)
570 inet->cmsg_flags |= IP_CMSG_TOS; 570 inet->cmsg_flags |= IP_CMSG_TOS;
571 else 571 else
572 inet->cmsg_flags &= ~IP_CMSG_TOS; 572 inet->cmsg_flags &= ~IP_CMSG_TOS;
573 break; 573 break;
574 case IP_RECVOPTS: 574 case IP_RECVOPTS:
575 if (val) 575 if (val)
576 inet->cmsg_flags |= IP_CMSG_RECVOPTS; 576 inet->cmsg_flags |= IP_CMSG_RECVOPTS;
577 else 577 else
578 inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; 578 inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
579 break; 579 break;
580 case IP_RETOPTS: 580 case IP_RETOPTS:
581 if (val) 581 if (val)
582 inet->cmsg_flags |= IP_CMSG_RETOPTS; 582 inet->cmsg_flags |= IP_CMSG_RETOPTS;
583 else 583 else
584 inet->cmsg_flags &= ~IP_CMSG_RETOPTS; 584 inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
585 break; 585 break;
586 case IP_PASSSEC: 586 case IP_PASSSEC:
587 if (val) 587 if (val)
588 inet->cmsg_flags |= IP_CMSG_PASSSEC; 588 inet->cmsg_flags |= IP_CMSG_PASSSEC;
589 else 589 else
590 inet->cmsg_flags &= ~IP_CMSG_PASSSEC; 590 inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
591 break; 591 break;
592 case IP_RECVORIGDSTADDR: 592 case IP_RECVORIGDSTADDR:
593 if (val) 593 if (val)
594 inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR; 594 inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
595 else 595 else
596 inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; 596 inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
597 break; 597 break;
598 case IP_TOS: /* This sets both TOS and Precedence */ 598 case IP_TOS: /* This sets both TOS and Precedence */
599 if (sk->sk_type == SOCK_STREAM) { 599 if (sk->sk_type == SOCK_STREAM) {
600 val &= ~INET_ECN_MASK; 600 val &= ~INET_ECN_MASK;
601 val |= inet->tos & INET_ECN_MASK; 601 val |= inet->tos & INET_ECN_MASK;
602 } 602 }
603 if (inet->tos != val) { 603 if (inet->tos != val) {
604 inet->tos = val; 604 inet->tos = val;
605 sk->sk_priority = rt_tos2priority(val); 605 sk->sk_priority = rt_tos2priority(val);
606 sk_dst_reset(sk); 606 sk_dst_reset(sk);
607 } 607 }
608 break; 608 break;
609 case IP_TTL: 609 case IP_TTL:
610 if (optlen < 1) 610 if (optlen < 1)
611 goto e_inval; 611 goto e_inval;
612 if (val != -1 && (val < 1 || val > 255)) 612 if (val != -1 && (val < 1 || val > 255))
613 goto e_inval; 613 goto e_inval;
614 inet->uc_ttl = val; 614 inet->uc_ttl = val;
615 break; 615 break;
616 case IP_HDRINCL: 616 case IP_HDRINCL:
617 if (sk->sk_type != SOCK_RAW) { 617 if (sk->sk_type != SOCK_RAW) {
618 err = -ENOPROTOOPT; 618 err = -ENOPROTOOPT;
619 break; 619 break;
620 } 620 }
621 inet->hdrincl = val ? 1 : 0; 621 inet->hdrincl = val ? 1 : 0;
622 break; 622 break;
623 case IP_NODEFRAG: 623 case IP_NODEFRAG:
624 if (sk->sk_type != SOCK_RAW) { 624 if (sk->sk_type != SOCK_RAW) {
625 err = -ENOPROTOOPT; 625 err = -ENOPROTOOPT;
626 break; 626 break;
627 } 627 }
628 inet->nodefrag = val ? 1 : 0; 628 inet->nodefrag = val ? 1 : 0;
629 break; 629 break;
630 case IP_MTU_DISCOVER: 630 case IP_MTU_DISCOVER:
631 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE) 631 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE)
632 goto e_inval; 632 goto e_inval;
633 inet->pmtudisc = val; 633 inet->pmtudisc = val;
634 break; 634 break;
635 case IP_RECVERR: 635 case IP_RECVERR:
636 inet->recverr = !!val; 636 inet->recverr = !!val;
637 if (!val) 637 if (!val)
638 skb_queue_purge(&sk->sk_error_queue); 638 skb_queue_purge(&sk->sk_error_queue);
639 break; 639 break;
640 case IP_MULTICAST_TTL: 640 case IP_MULTICAST_TTL:
641 if (sk->sk_type == SOCK_STREAM) 641 if (sk->sk_type == SOCK_STREAM)
642 goto e_inval; 642 goto e_inval;
643 if (optlen < 1) 643 if (optlen < 1)
644 goto e_inval; 644 goto e_inval;
645 if (val == -1) 645 if (val == -1)
646 val = 1; 646 val = 1;
647 if (val < 0 || val > 255) 647 if (val < 0 || val > 255)
648 goto e_inval; 648 goto e_inval;
649 inet->mc_ttl = val; 649 inet->mc_ttl = val;
650 break; 650 break;
651 case IP_MULTICAST_LOOP: 651 case IP_MULTICAST_LOOP:
652 if (optlen < 1) 652 if (optlen < 1)
653 goto e_inval; 653 goto e_inval;
654 inet->mc_loop = !!val; 654 inet->mc_loop = !!val;
655 break; 655 break;
656 case IP_UNICAST_IF: 656 case IP_UNICAST_IF:
657 { 657 {
658 struct net_device *dev = NULL; 658 struct net_device *dev = NULL;
659 int ifindex; 659 int ifindex;
660 660
661 if (optlen != sizeof(int)) 661 if (optlen != sizeof(int))
662 goto e_inval; 662 goto e_inval;
663 663
664 ifindex = (__force int)ntohl((__force __be32)val); 664 ifindex = (__force int)ntohl((__force __be32)val);
665 if (ifindex == 0) { 665 if (ifindex == 0) {
666 inet->uc_index = 0; 666 inet->uc_index = 0;
667 err = 0; 667 err = 0;
668 break; 668 break;
669 } 669 }
670 670
671 dev = dev_get_by_index(sock_net(sk), ifindex); 671 dev = dev_get_by_index(sock_net(sk), ifindex);
672 err = -EADDRNOTAVAIL; 672 err = -EADDRNOTAVAIL;
673 if (!dev) 673 if (!dev)
674 break; 674 break;
675 dev_put(dev); 675 dev_put(dev);
676 676
677 err = -EINVAL; 677 err = -EINVAL;
678 if (sk->sk_bound_dev_if) 678 if (sk->sk_bound_dev_if)
679 break; 679 break;
680 680
681 inet->uc_index = ifindex; 681 inet->uc_index = ifindex;
682 err = 0; 682 err = 0;
683 break; 683 break;
684 } 684 }
685 case IP_MULTICAST_IF: 685 case IP_MULTICAST_IF:
686 { 686 {
687 struct ip_mreqn mreq; 687 struct ip_mreqn mreq;
688 struct net_device *dev = NULL; 688 struct net_device *dev = NULL;
689 689
690 if (sk->sk_type == SOCK_STREAM) 690 if (sk->sk_type == SOCK_STREAM)
691 goto e_inval; 691 goto e_inval;
692 /* 692 /*
693 * Check the arguments are allowable 693 * Check the arguments are allowable
694 */ 694 */
695 695
696 if (optlen < sizeof(struct in_addr)) 696 if (optlen < sizeof(struct in_addr))
697 goto e_inval; 697 goto e_inval;
698 698
699 err = -EFAULT; 699 err = -EFAULT;
700 if (optlen >= sizeof(struct ip_mreqn)) { 700 if (optlen >= sizeof(struct ip_mreqn)) {
701 if (copy_from_user(&mreq, optval, sizeof(mreq))) 701 if (copy_from_user(&mreq, optval, sizeof(mreq)))
702 break; 702 break;
703 } else { 703 } else {
704 memset(&mreq, 0, sizeof(mreq)); 704 memset(&mreq, 0, sizeof(mreq));
705 if (optlen >= sizeof(struct ip_mreq)) { 705 if (optlen >= sizeof(struct ip_mreq)) {
706 if (copy_from_user(&mreq, optval, 706 if (copy_from_user(&mreq, optval,
707 sizeof(struct ip_mreq))) 707 sizeof(struct ip_mreq)))
708 break; 708 break;
709 } else if (optlen >= sizeof(struct in_addr)) { 709 } else if (optlen >= sizeof(struct in_addr)) {
710 if (copy_from_user(&mreq.imr_address, optval, 710 if (copy_from_user(&mreq.imr_address, optval,
711 sizeof(struct in_addr))) 711 sizeof(struct in_addr)))
712 break; 712 break;
713 } 713 }
714 } 714 }
715 715
716 if (!mreq.imr_ifindex) { 716 if (!mreq.imr_ifindex) {
717 if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { 717 if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
718 inet->mc_index = 0; 718 inet->mc_index = 0;
719 inet->mc_addr = 0; 719 inet->mc_addr = 0;
720 err = 0; 720 err = 0;
721 break; 721 break;
722 } 722 }
723 dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); 723 dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
724 if (dev) 724 if (dev)
725 mreq.imr_ifindex = dev->ifindex; 725 mreq.imr_ifindex = dev->ifindex;
726 } else 726 } else
727 dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex); 727 dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
728 728
729 729
730 err = -EADDRNOTAVAIL; 730 err = -EADDRNOTAVAIL;
731 if (!dev) 731 if (!dev)
732 break; 732 break;
733 dev_put(dev); 733 dev_put(dev);
734 734
735 err = -EINVAL; 735 err = -EINVAL;
736 if (sk->sk_bound_dev_if && 736 if (sk->sk_bound_dev_if &&
737 mreq.imr_ifindex != sk->sk_bound_dev_if) 737 mreq.imr_ifindex != sk->sk_bound_dev_if)
738 break; 738 break;
739 739
740 inet->mc_index = mreq.imr_ifindex; 740 inet->mc_index = mreq.imr_ifindex;
741 inet->mc_addr = mreq.imr_address.s_addr; 741 inet->mc_addr = mreq.imr_address.s_addr;
742 err = 0; 742 err = 0;
743 break; 743 break;
744 } 744 }
745 745
746 case IP_ADD_MEMBERSHIP: 746 case IP_ADD_MEMBERSHIP:
747 case IP_DROP_MEMBERSHIP: 747 case IP_DROP_MEMBERSHIP:
748 { 748 {
749 struct ip_mreqn mreq; 749 struct ip_mreqn mreq;
750 750
751 err = -EPROTO; 751 err = -EPROTO;
752 if (inet_sk(sk)->is_icsk) 752 if (inet_sk(sk)->is_icsk)
753 break; 753 break;
754 754
755 if (optlen < sizeof(struct ip_mreq)) 755 if (optlen < sizeof(struct ip_mreq))
756 goto e_inval; 756 goto e_inval;
757 err = -EFAULT; 757 err = -EFAULT;
758 if (optlen >= sizeof(struct ip_mreqn)) { 758 if (optlen >= sizeof(struct ip_mreqn)) {
759 if (copy_from_user(&mreq, optval, sizeof(mreq))) 759 if (copy_from_user(&mreq, optval, sizeof(mreq)))
760 break; 760 break;
761 } else { 761 } else {
762 memset(&mreq, 0, sizeof(mreq)); 762 memset(&mreq, 0, sizeof(mreq));
763 if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq))) 763 if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
764 break; 764 break;
765 } 765 }
766 766
767 if (optname == IP_ADD_MEMBERSHIP) 767 if (optname == IP_ADD_MEMBERSHIP)
768 err = ip_mc_join_group(sk, &mreq); 768 err = ip_mc_join_group(sk, &mreq);
769 else 769 else
770 err = ip_mc_leave_group(sk, &mreq); 770 err = ip_mc_leave_group(sk, &mreq);
771 break; 771 break;
772 } 772 }
773 case IP_MSFILTER: 773 case IP_MSFILTER:
774 { 774 {
775 struct ip_msfilter *msf; 775 struct ip_msfilter *msf;
776 776
777 if (optlen < IP_MSFILTER_SIZE(0)) 777 if (optlen < IP_MSFILTER_SIZE(0))
778 goto e_inval; 778 goto e_inval;
779 if (optlen > sysctl_optmem_max) { 779 if (optlen > sysctl_optmem_max) {
780 err = -ENOBUFS; 780 err = -ENOBUFS;
781 break; 781 break;
782 } 782 }
783 msf = kmalloc(optlen, GFP_KERNEL); 783 msf = kmalloc(optlen, GFP_KERNEL);
784 if (!msf) { 784 if (!msf) {
785 err = -ENOBUFS; 785 err = -ENOBUFS;
786 break; 786 break;
787 } 787 }
788 err = -EFAULT; 788 err = -EFAULT;
789 if (copy_from_user(msf, optval, optlen)) { 789 if (copy_from_user(msf, optval, optlen)) {
790 kfree(msf); 790 kfree(msf);
791 break; 791 break;
792 } 792 }
793 /* numsrc >= (1G-4) overflow in 32 bits */ 793 /* numsrc >= (1G-4) overflow in 32 bits */
794 if (msf->imsf_numsrc >= 0x3ffffffcU || 794 if (msf->imsf_numsrc >= 0x3ffffffcU ||
795 msf->imsf_numsrc > sysctl_igmp_max_msf) { 795 msf->imsf_numsrc > sysctl_igmp_max_msf) {
796 kfree(msf); 796 kfree(msf);
797 err = -ENOBUFS; 797 err = -ENOBUFS;
798 break; 798 break;
799 } 799 }
800 if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { 800 if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
801 kfree(msf); 801 kfree(msf);
802 err = -EINVAL; 802 err = -EINVAL;
803 break; 803 break;
804 } 804 }
805 err = ip_mc_msfilter(sk, msf, 0); 805 err = ip_mc_msfilter(sk, msf, 0);
806 kfree(msf); 806 kfree(msf);
807 break; 807 break;
808 } 808 }
809 case IP_BLOCK_SOURCE: 809 case IP_BLOCK_SOURCE:
810 case IP_UNBLOCK_SOURCE: 810 case IP_UNBLOCK_SOURCE:
811 case IP_ADD_SOURCE_MEMBERSHIP: 811 case IP_ADD_SOURCE_MEMBERSHIP:
812 case IP_DROP_SOURCE_MEMBERSHIP: 812 case IP_DROP_SOURCE_MEMBERSHIP:
813 { 813 {
814 struct ip_mreq_source mreqs; 814 struct ip_mreq_source mreqs;
815 int omode, add; 815 int omode, add;
816 816
817 if (optlen != sizeof(struct ip_mreq_source)) 817 if (optlen != sizeof(struct ip_mreq_source))
818 goto e_inval; 818 goto e_inval;
819 if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { 819 if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
820 err = -EFAULT; 820 err = -EFAULT;
821 break; 821 break;
822 } 822 }
823 if (optname == IP_BLOCK_SOURCE) { 823 if (optname == IP_BLOCK_SOURCE) {
824 omode = MCAST_EXCLUDE; 824 omode = MCAST_EXCLUDE;
825 add = 1; 825 add = 1;
826 } else if (optname == IP_UNBLOCK_SOURCE) { 826 } else if (optname == IP_UNBLOCK_SOURCE) {
827 omode = MCAST_EXCLUDE; 827 omode = MCAST_EXCLUDE;
828 add = 0; 828 add = 0;
829 } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { 829 } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
830 struct ip_mreqn mreq; 830 struct ip_mreqn mreq;
831 831
832 mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; 832 mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
833 mreq.imr_address.s_addr = mreqs.imr_interface; 833 mreq.imr_address.s_addr = mreqs.imr_interface;
834 mreq.imr_ifindex = 0; 834 mreq.imr_ifindex = 0;
835 err = ip_mc_join_group(sk, &mreq); 835 err = ip_mc_join_group(sk, &mreq);
836 if (err && err != -EADDRINUSE) 836 if (err && err != -EADDRINUSE)
837 break; 837 break;
838 omode = MCAST_INCLUDE; 838 omode = MCAST_INCLUDE;
839 add = 1; 839 add = 1;
840 } else /* IP_DROP_SOURCE_MEMBERSHIP */ { 840 } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
841 omode = MCAST_INCLUDE; 841 omode = MCAST_INCLUDE;
842 add = 0; 842 add = 0;
843 } 843 }
844 err = ip_mc_source(add, omode, sk, &mreqs, 0); 844 err = ip_mc_source(add, omode, sk, &mreqs, 0);
845 break; 845 break;
846 } 846 }
847 case MCAST_JOIN_GROUP: 847 case MCAST_JOIN_GROUP:
848 case MCAST_LEAVE_GROUP: 848 case MCAST_LEAVE_GROUP:
849 { 849 {
850 struct group_req greq; 850 struct group_req greq;
851 struct sockaddr_in *psin; 851 struct sockaddr_in *psin;
852 struct ip_mreqn mreq; 852 struct ip_mreqn mreq;
853 853
854 if (optlen < sizeof(struct group_req)) 854 if (optlen < sizeof(struct group_req))
855 goto e_inval; 855 goto e_inval;
856 err = -EFAULT; 856 err = -EFAULT;
857 if (copy_from_user(&greq, optval, sizeof(greq))) 857 if (copy_from_user(&greq, optval, sizeof(greq)))
858 break; 858 break;
859 psin = (struct sockaddr_in *)&greq.gr_group; 859 psin = (struct sockaddr_in *)&greq.gr_group;
860 if (psin->sin_family != AF_INET) 860 if (psin->sin_family != AF_INET)
861 goto e_inval; 861 goto e_inval;
862 memset(&mreq, 0, sizeof(mreq)); 862 memset(&mreq, 0, sizeof(mreq));
863 mreq.imr_multiaddr = psin->sin_addr; 863 mreq.imr_multiaddr = psin->sin_addr;
864 mreq.imr_ifindex = greq.gr_interface; 864 mreq.imr_ifindex = greq.gr_interface;
865 865
866 if (optname == MCAST_JOIN_GROUP) 866 if (optname == MCAST_JOIN_GROUP)
867 err = ip_mc_join_group(sk, &mreq); 867 err = ip_mc_join_group(sk, &mreq);
868 else 868 else
869 err = ip_mc_leave_group(sk, &mreq); 869 err = ip_mc_leave_group(sk, &mreq);
870 break; 870 break;
871 } 871 }
872 case MCAST_JOIN_SOURCE_GROUP: 872 case MCAST_JOIN_SOURCE_GROUP:
873 case MCAST_LEAVE_SOURCE_GROUP: 873 case MCAST_LEAVE_SOURCE_GROUP:
874 case MCAST_BLOCK_SOURCE: 874 case MCAST_BLOCK_SOURCE:
875 case MCAST_UNBLOCK_SOURCE: 875 case MCAST_UNBLOCK_SOURCE:
876 { 876 {
877 struct group_source_req greqs; 877 struct group_source_req greqs;
878 struct ip_mreq_source mreqs; 878 struct ip_mreq_source mreqs;
879 struct sockaddr_in *psin; 879 struct sockaddr_in *psin;
880 int omode, add; 880 int omode, add;
881 881
882 if (optlen != sizeof(struct group_source_req)) 882 if (optlen != sizeof(struct group_source_req))
883 goto e_inval; 883 goto e_inval;
884 if (copy_from_user(&greqs, optval, sizeof(greqs))) { 884 if (copy_from_user(&greqs, optval, sizeof(greqs))) {
885 err = -EFAULT; 885 err = -EFAULT;
886 break; 886 break;
887 } 887 }
888 if (greqs.gsr_group.ss_family != AF_INET || 888 if (greqs.gsr_group.ss_family != AF_INET ||
889 greqs.gsr_source.ss_family != AF_INET) { 889 greqs.gsr_source.ss_family != AF_INET) {
890 err = -EADDRNOTAVAIL; 890 err = -EADDRNOTAVAIL;
891 break; 891 break;
892 } 892 }
893 psin = (struct sockaddr_in *)&greqs.gsr_group; 893 psin = (struct sockaddr_in *)&greqs.gsr_group;
894 mreqs.imr_multiaddr = psin->sin_addr.s_addr; 894 mreqs.imr_multiaddr = psin->sin_addr.s_addr;
895 psin = (struct sockaddr_in *)&greqs.gsr_source; 895 psin = (struct sockaddr_in *)&greqs.gsr_source;
896 mreqs.imr_sourceaddr = psin->sin_addr.s_addr; 896 mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
897 mreqs.imr_interface = 0; /* use index for mc_source */ 897 mreqs.imr_interface = 0; /* use index for mc_source */
898 898
899 if (optname == MCAST_BLOCK_SOURCE) { 899 if (optname == MCAST_BLOCK_SOURCE) {
900 omode = MCAST_EXCLUDE; 900 omode = MCAST_EXCLUDE;
901 add = 1; 901 add = 1;
902 } else if (optname == MCAST_UNBLOCK_SOURCE) { 902 } else if (optname == MCAST_UNBLOCK_SOURCE) {
903 omode = MCAST_EXCLUDE; 903 omode = MCAST_EXCLUDE;
904 add = 0; 904 add = 0;
905 } else if (optname == MCAST_JOIN_SOURCE_GROUP) { 905 } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
906 struct ip_mreqn mreq; 906 struct ip_mreqn mreq;
907 907
908 psin = (struct sockaddr_in *)&greqs.gsr_group; 908 psin = (struct sockaddr_in *)&greqs.gsr_group;
909 mreq.imr_multiaddr = psin->sin_addr; 909 mreq.imr_multiaddr = psin->sin_addr;
910 mreq.imr_address.s_addr = 0; 910 mreq.imr_address.s_addr = 0;
911 mreq.imr_ifindex = greqs.gsr_interface; 911 mreq.imr_ifindex = greqs.gsr_interface;
912 err = ip_mc_join_group(sk, &mreq); 912 err = ip_mc_join_group(sk, &mreq);
913 if (err && err != -EADDRINUSE) 913 if (err && err != -EADDRINUSE)
914 break; 914 break;
915 greqs.gsr_interface = mreq.imr_ifindex; 915 greqs.gsr_interface = mreq.imr_ifindex;
916 omode = MCAST_INCLUDE; 916 omode = MCAST_INCLUDE;
917 add = 1; 917 add = 1;
918 } else /* MCAST_LEAVE_SOURCE_GROUP */ { 918 } else /* MCAST_LEAVE_SOURCE_GROUP */ {
919 omode = MCAST_INCLUDE; 919 omode = MCAST_INCLUDE;
920 add = 0; 920 add = 0;
921 } 921 }
922 err = ip_mc_source(add, omode, sk, &mreqs, 922 err = ip_mc_source(add, omode, sk, &mreqs,
923 greqs.gsr_interface); 923 greqs.gsr_interface);
924 break; 924 break;
925 } 925 }
926 case MCAST_MSFILTER: 926 case MCAST_MSFILTER:
927 { 927 {
928 struct sockaddr_in *psin; 928 struct sockaddr_in *psin;
929 struct ip_msfilter *msf = NULL; 929 struct ip_msfilter *msf = NULL;
930 struct group_filter *gsf = NULL; 930 struct group_filter *gsf = NULL;
931 int msize, i, ifindex; 931 int msize, i, ifindex;
932 932
933 if (optlen < GROUP_FILTER_SIZE(0)) 933 if (optlen < GROUP_FILTER_SIZE(0))
934 goto e_inval; 934 goto e_inval;
935 if (optlen > sysctl_optmem_max) { 935 if (optlen > sysctl_optmem_max) {
936 err = -ENOBUFS; 936 err = -ENOBUFS;
937 break; 937 break;
938 } 938 }
939 gsf = kmalloc(optlen, GFP_KERNEL); 939 gsf = kmalloc(optlen, GFP_KERNEL);
940 if (!gsf) { 940 if (!gsf) {
941 err = -ENOBUFS; 941 err = -ENOBUFS;
942 break; 942 break;
943 } 943 }
944 err = -EFAULT; 944 err = -EFAULT;
945 if (copy_from_user(gsf, optval, optlen)) 945 if (copy_from_user(gsf, optval, optlen))
946 goto mc_msf_out; 946 goto mc_msf_out;
947 947
948 /* numsrc >= (4G-140)/128 overflow in 32 bits */ 948 /* numsrc >= (4G-140)/128 overflow in 32 bits */
949 if (gsf->gf_numsrc >= 0x1ffffff || 949 if (gsf->gf_numsrc >= 0x1ffffff ||
950 gsf->gf_numsrc > sysctl_igmp_max_msf) { 950 gsf->gf_numsrc > sysctl_igmp_max_msf) {
951 err = -ENOBUFS; 951 err = -ENOBUFS;
952 goto mc_msf_out; 952 goto mc_msf_out;
953 } 953 }
954 if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { 954 if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
955 err = -EINVAL; 955 err = -EINVAL;
956 goto mc_msf_out; 956 goto mc_msf_out;
957 } 957 }
958 msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); 958 msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
959 msf = kmalloc(msize, GFP_KERNEL); 959 msf = kmalloc(msize, GFP_KERNEL);
960 if (!msf) { 960 if (!msf) {
961 err = -ENOBUFS; 961 err = -ENOBUFS;
962 goto mc_msf_out; 962 goto mc_msf_out;
963 } 963 }
964 ifindex = gsf->gf_interface; 964 ifindex = gsf->gf_interface;
965 psin = (struct sockaddr_in *)&gsf->gf_group; 965 psin = (struct sockaddr_in *)&gsf->gf_group;
966 if (psin->sin_family != AF_INET) { 966 if (psin->sin_family != AF_INET) {
967 err = -EADDRNOTAVAIL; 967 err = -EADDRNOTAVAIL;
968 goto mc_msf_out; 968 goto mc_msf_out;
969 } 969 }
970 msf->imsf_multiaddr = psin->sin_addr.s_addr; 970 msf->imsf_multiaddr = psin->sin_addr.s_addr;
971 msf->imsf_interface = 0; 971 msf->imsf_interface = 0;
972 msf->imsf_fmode = gsf->gf_fmode; 972 msf->imsf_fmode = gsf->gf_fmode;
973 msf->imsf_numsrc = gsf->gf_numsrc; 973 msf->imsf_numsrc = gsf->gf_numsrc;
974 err = -EADDRNOTAVAIL; 974 err = -EADDRNOTAVAIL;
975 for (i = 0; i < gsf->gf_numsrc; ++i) { 975 for (i = 0; i < gsf->gf_numsrc; ++i) {
976 psin = (struct sockaddr_in *)&gsf->gf_slist[i]; 976 psin = (struct sockaddr_in *)&gsf->gf_slist[i];
977 977
978 if (psin->sin_family != AF_INET) 978 if (psin->sin_family != AF_INET)
979 goto mc_msf_out; 979 goto mc_msf_out;
980 msf->imsf_slist[i] = psin->sin_addr.s_addr; 980 msf->imsf_slist[i] = psin->sin_addr.s_addr;
981 } 981 }
982 kfree(gsf); 982 kfree(gsf);
983 gsf = NULL; 983 gsf = NULL;
984 984
985 err = ip_mc_msfilter(sk, msf, ifindex); 985 err = ip_mc_msfilter(sk, msf, ifindex);
986 mc_msf_out: 986 mc_msf_out:
987 kfree(msf); 987 kfree(msf);
988 kfree(gsf); 988 kfree(gsf);
989 break; 989 break;
990 } 990 }
991 case IP_MULTICAST_ALL: 991 case IP_MULTICAST_ALL:
992 if (optlen < 1) 992 if (optlen < 1)
993 goto e_inval; 993 goto e_inval;
994 if (val != 0 && val != 1) 994 if (val != 0 && val != 1)
995 goto e_inval; 995 goto e_inval;
996 inet->mc_all = val; 996 inet->mc_all = val;
997 break; 997 break;
998 case IP_ROUTER_ALERT: 998 case IP_ROUTER_ALERT:
999 err = ip_ra_control(sk, val ? 1 : 0, NULL); 999 err = ip_ra_control(sk, val ? 1 : 0, NULL);
1000 break; 1000 break;
1001 1001
1002 case IP_FREEBIND: 1002 case IP_FREEBIND:
1003 if (optlen < 1) 1003 if (optlen < 1)
1004 goto e_inval; 1004 goto e_inval;
1005 inet->freebind = !!val; 1005 inet->freebind = !!val;
1006 break; 1006 break;
1007 1007
1008 case IP_IPSEC_POLICY: 1008 case IP_IPSEC_POLICY:
1009 case IP_XFRM_POLICY: 1009 case IP_XFRM_POLICY:
1010 err = -EPERM; 1010 err = -EPERM;
1011 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1011 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1012 break; 1012 break;
1013 err = xfrm_user_policy(sk, optname, optval, optlen); 1013 err = xfrm_user_policy(sk, optname, optval, optlen);
1014 break; 1014 break;
1015 1015
1016 case IP_TRANSPARENT: 1016 case IP_TRANSPARENT:
1017 if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1017 if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1018 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1018 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1019 err = -EPERM; 1019 err = -EPERM;
1020 break; 1020 break;
1021 } 1021 }
1022 if (optlen < 1) 1022 if (optlen < 1)
1023 goto e_inval; 1023 goto e_inval;
1024 inet->transparent = !!val; 1024 inet->transparent = !!val;
1025 break; 1025 break;
1026 1026
1027 case IP_MINTTL: 1027 case IP_MINTTL:
1028 if (optlen < 1) 1028 if (optlen < 1)
1029 goto e_inval; 1029 goto e_inval;
1030 if (val < 0 || val > 255) 1030 if (val < 0 || val > 255)
1031 goto e_inval; 1031 goto e_inval;
1032 inet->min_ttl = val; 1032 inet->min_ttl = val;
1033 break; 1033 break;
1034 1034
1035 default: 1035 default:
1036 err = -ENOPROTOOPT; 1036 err = -ENOPROTOOPT;
1037 break; 1037 break;
1038 } 1038 }
1039 release_sock(sk); 1039 release_sock(sk);
1040 return err; 1040 return err;
1041 1041
1042 e_inval: 1042 e_inval:
1043 release_sock(sk); 1043 release_sock(sk);
1044 return -EINVAL; 1044 return -EINVAL;
1045 } 1045 }
1046 1046
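do_ip_setsockopt() above is the single kernel entry point for every SOL_IP option; from user space these are ordinary setsockopt() calls on an IPv4 socket. A short sketch exercising a few of the cases handled above, on an already-created UDP socket (the multicast group address is an arbitrary example):

#define _GNU_SOURCE	/* for struct ip_mreqn on older glibc */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* IP_TOS, IP_MTU_DISCOVER and IP_ADD_MEMBERSHIP as seen from user space. */
static int tune_ip_socket(int fd)
{
	int tos = 0x10;			/* handled by the IP_TOS case          */
	int pmtu = IP_PMTUDISC_DO;	/* handled by the IP_MTU_DISCOVER case */
	struct ip_mreqn mreq;		/* handled by IP_ADD_MEMBERSHIP        */

	memset(&mreq, 0, sizeof(mreq));
	mreq.imr_multiaddr.s_addr = inet_addr("239.1.2.3");
	mreq.imr_ifindex = 0;		/* let routing pick the interface */

	if (setsockopt(fd, IPPROTO_IP, IP_TOS, &tos, sizeof(tos)) < 0)
		return -1;
	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &pmtu, sizeof(pmtu)) < 0)
		return -1;
	return setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
}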
1047 /** 1047 /**
1048 * ipv4_pktinfo_prepare - transfer some info from rtable to skb 1048 * ipv4_pktinfo_prepare - transfer some info from rtable to skb
1049 * @sk: socket 1049 * @sk: socket
1050 * @skb: buffer 1050 * @skb: buffer
1051 * 1051 *
1052 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific 1052 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
1053 * destination in skb->cb[] before dst drop. 1053 * destination in skb->cb[] before dst drop.
1054 * This way, receiver doesnt make cache line misses to read rtable. 1054 * This way, receiver doesn't make cache line misses to read rtable.
1055 */ 1055 */
1056 void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) 1056 void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
1057 { 1057 {
1058 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); 1058 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
1059 1059
1060 if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) && 1060 if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) &&
1061 skb_rtable(skb)) { 1061 skb_rtable(skb)) {
1062 pktinfo->ipi_ifindex = inet_iif(skb); 1062 pktinfo->ipi_ifindex = inet_iif(skb);
1063 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); 1063 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1064 } else { 1064 } else {
1065 pktinfo->ipi_ifindex = 0; 1065 pktinfo->ipi_ifindex = 0;
1066 pktinfo->ipi_spec_dst.s_addr = 0; 1066 pktinfo->ipi_spec_dst.s_addr = 0;
1067 } 1067 }
1068 skb_dst_drop(skb); 1068 skb_dst_drop(skb);
1069 } 1069 }
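
For orientation, a hedged sketch of the user-space consumer this function prepares data for: a datagram receiver that enables IP_PKTINFO and pulls the interface index and specific destination out of the ancillary data. The helper name read_pktinfo and the buffer sizes are invented for the example.

#define _GNU_SOURCE		/* struct in_pktinfo on older glibc */
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void read_pktinfo(int fd)	/* fd: a bound UDP socket */
{
	char data[1500], cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int on = 1;

	setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP &&
		    cmsg->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo info;

			memcpy(&info, CMSG_DATA(cmsg), sizeof(info));
			printf("ifindex %d dst %x\n", info.ipi_ifindex,
			       (unsigned int)info.ipi_spec_dst.s_addr);
		}
	}
}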
1070 1070
1071 int ip_setsockopt(struct sock *sk, int level, 1071 int ip_setsockopt(struct sock *sk, int level,
1072 int optname, char __user *optval, unsigned int optlen) 1072 int optname, char __user *optval, unsigned int optlen)
1073 { 1073 {
1074 int err; 1074 int err;
1075 1075
1076 if (level != SOL_IP) 1076 if (level != SOL_IP)
1077 return -ENOPROTOOPT; 1077 return -ENOPROTOOPT;
1078 1078
1079 err = do_ip_setsockopt(sk, level, optname, optval, optlen); 1079 err = do_ip_setsockopt(sk, level, optname, optval, optlen);
1080 #ifdef CONFIG_NETFILTER 1080 #ifdef CONFIG_NETFILTER
1081 /* we need to exclude all possible ENOPROTOOPTs except default case */ 1081 /* we need to exclude all possible ENOPROTOOPTs except default case */
1082 if (err == -ENOPROTOOPT && optname != IP_HDRINCL && 1082 if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
1083 optname != IP_IPSEC_POLICY && 1083 optname != IP_IPSEC_POLICY &&
1084 optname != IP_XFRM_POLICY && 1084 optname != IP_XFRM_POLICY &&
1085 !ip_mroute_opt(optname)) { 1085 !ip_mroute_opt(optname)) {
1086 lock_sock(sk); 1086 lock_sock(sk);
1087 err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); 1087 err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
1088 release_sock(sk); 1088 release_sock(sk);
1089 } 1089 }
1090 #endif 1090 #endif
1091 return err; 1091 return err;
1092 } 1092 }
1093 EXPORT_SYMBOL(ip_setsockopt); 1093 EXPORT_SYMBOL(ip_setsockopt);
1094 1094
1095 #ifdef CONFIG_COMPAT 1095 #ifdef CONFIG_COMPAT
1096 int compat_ip_setsockopt(struct sock *sk, int level, int optname, 1096 int compat_ip_setsockopt(struct sock *sk, int level, int optname,
1097 char __user *optval, unsigned int optlen) 1097 char __user *optval, unsigned int optlen)
1098 { 1098 {
1099 int err; 1099 int err;
1100 1100
1101 if (level != SOL_IP) 1101 if (level != SOL_IP)
1102 return -ENOPROTOOPT; 1102 return -ENOPROTOOPT;
1103 1103
1104 if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) 1104 if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
1105 return compat_mc_setsockopt(sk, level, optname, optval, optlen, 1105 return compat_mc_setsockopt(sk, level, optname, optval, optlen,
1106 ip_setsockopt); 1106 ip_setsockopt);
1107 1107
1108 err = do_ip_setsockopt(sk, level, optname, optval, optlen); 1108 err = do_ip_setsockopt(sk, level, optname, optval, optlen);
1109 #ifdef CONFIG_NETFILTER 1109 #ifdef CONFIG_NETFILTER
1110 /* we need to exclude all possible ENOPROTOOPTs except default case */ 1110 /* we need to exclude all possible ENOPROTOOPTs except default case */
1111 if (err == -ENOPROTOOPT && optname != IP_HDRINCL && 1111 if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
1112 optname != IP_IPSEC_POLICY && 1112 optname != IP_IPSEC_POLICY &&
1113 optname != IP_XFRM_POLICY && 1113 optname != IP_XFRM_POLICY &&
1114 !ip_mroute_opt(optname)) { 1114 !ip_mroute_opt(optname)) {
1115 lock_sock(sk); 1115 lock_sock(sk);
1116 err = compat_nf_setsockopt(sk, PF_INET, optname, 1116 err = compat_nf_setsockopt(sk, PF_INET, optname,
1117 optval, optlen); 1117 optval, optlen);
1118 release_sock(sk); 1118 release_sock(sk);
1119 } 1119 }
1120 #endif 1120 #endif
1121 return err; 1121 return err;
1122 } 1122 }
1123 EXPORT_SYMBOL(compat_ip_setsockopt); 1123 EXPORT_SYMBOL(compat_ip_setsockopt);
1124 #endif 1124 #endif
1125 1125
1126 /* 1126 /*
1127 * Get the options. Note for future reference. The GET of IP options gets 1127 * Get the options. Note for future reference. The GET of IP options gets
1128 * the _received_ ones. The set sets the _sent_ ones. 1128 * the _received_ ones. The set sets the _sent_ ones.
1129 */ 1129 */
1130 1130
1131 static int do_ip_getsockopt(struct sock *sk, int level, int optname, 1131 static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1132 char __user *optval, int __user *optlen, unsigned int flags) 1132 char __user *optval, int __user *optlen, unsigned int flags)
1133 { 1133 {
1134 struct inet_sock *inet = inet_sk(sk); 1134 struct inet_sock *inet = inet_sk(sk);
1135 int val; 1135 int val;
1136 int len; 1136 int len;
1137 1137
1138 if (level != SOL_IP) 1138 if (level != SOL_IP)
1139 return -EOPNOTSUPP; 1139 return -EOPNOTSUPP;
1140 1140
1141 if (ip_mroute_opt(optname)) 1141 if (ip_mroute_opt(optname))
1142 return ip_mroute_getsockopt(sk, optname, optval, optlen); 1142 return ip_mroute_getsockopt(sk, optname, optval, optlen);
1143 1143
1144 if (get_user(len, optlen)) 1144 if (get_user(len, optlen))
1145 return -EFAULT; 1145 return -EFAULT;
1146 if (len < 0) 1146 if (len < 0)
1147 return -EINVAL; 1147 return -EINVAL;
1148 1148
1149 lock_sock(sk); 1149 lock_sock(sk);
1150 1150
1151 switch (optname) { 1151 switch (optname) {
1152 case IP_OPTIONS: 1152 case IP_OPTIONS:
1153 { 1153 {
1154 unsigned char optbuf[sizeof(struct ip_options)+40]; 1154 unsigned char optbuf[sizeof(struct ip_options)+40];
1155 struct ip_options *opt = (struct ip_options *)optbuf; 1155 struct ip_options *opt = (struct ip_options *)optbuf;
1156 struct ip_options_rcu *inet_opt; 1156 struct ip_options_rcu *inet_opt;
1157 1157
1158 inet_opt = rcu_dereference_protected(inet->inet_opt, 1158 inet_opt = rcu_dereference_protected(inet->inet_opt,
1159 sock_owned_by_user(sk)); 1159 sock_owned_by_user(sk));
1160 opt->optlen = 0; 1160 opt->optlen = 0;
1161 if (inet_opt) 1161 if (inet_opt)
1162 memcpy(optbuf, &inet_opt->opt, 1162 memcpy(optbuf, &inet_opt->opt,
1163 sizeof(struct ip_options) + 1163 sizeof(struct ip_options) +
1164 inet_opt->opt.optlen); 1164 inet_opt->opt.optlen);
1165 release_sock(sk); 1165 release_sock(sk);
1166 1166
1167 if (opt->optlen == 0) 1167 if (opt->optlen == 0)
1168 return put_user(0, optlen); 1168 return put_user(0, optlen);
1169 1169
1170 ip_options_undo(opt); 1170 ip_options_undo(opt);
1171 1171
1172 len = min_t(unsigned int, len, opt->optlen); 1172 len = min_t(unsigned int, len, opt->optlen);
1173 if (put_user(len, optlen)) 1173 if (put_user(len, optlen))
1174 return -EFAULT; 1174 return -EFAULT;
1175 if (copy_to_user(optval, opt->__data, len)) 1175 if (copy_to_user(optval, opt->__data, len))
1176 return -EFAULT; 1176 return -EFAULT;
1177 return 0; 1177 return 0;
1178 } 1178 }
1179 case IP_PKTINFO: 1179 case IP_PKTINFO:
1180 val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; 1180 val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
1181 break; 1181 break;
1182 case IP_RECVTTL: 1182 case IP_RECVTTL:
1183 val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; 1183 val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
1184 break; 1184 break;
1185 case IP_RECVTOS: 1185 case IP_RECVTOS:
1186 val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; 1186 val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
1187 break; 1187 break;
1188 case IP_RECVOPTS: 1188 case IP_RECVOPTS:
1189 val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; 1189 val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
1190 break; 1190 break;
1191 case IP_RETOPTS: 1191 case IP_RETOPTS:
1192 val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; 1192 val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
1193 break; 1193 break;
1194 case IP_PASSSEC: 1194 case IP_PASSSEC:
1195 val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; 1195 val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
1196 break; 1196 break;
1197 case IP_RECVORIGDSTADDR: 1197 case IP_RECVORIGDSTADDR:
1198 val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; 1198 val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
1199 break; 1199 break;
1200 case IP_TOS: 1200 case IP_TOS:
1201 val = inet->tos; 1201 val = inet->tos;
1202 break; 1202 break;
1203 case IP_TTL: 1203 case IP_TTL:
1204 val = (inet->uc_ttl == -1 ? 1204 val = (inet->uc_ttl == -1 ?
1205 sysctl_ip_default_ttl : 1205 sysctl_ip_default_ttl :
1206 inet->uc_ttl); 1206 inet->uc_ttl);
1207 break; 1207 break;
1208 case IP_HDRINCL: 1208 case IP_HDRINCL:
1209 val = inet->hdrincl; 1209 val = inet->hdrincl;
1210 break; 1210 break;
1211 case IP_NODEFRAG: 1211 case IP_NODEFRAG:
1212 val = inet->nodefrag; 1212 val = inet->nodefrag;
1213 break; 1213 break;
1214 case IP_MTU_DISCOVER: 1214 case IP_MTU_DISCOVER:
1215 val = inet->pmtudisc; 1215 val = inet->pmtudisc;
1216 break; 1216 break;
1217 case IP_MTU: 1217 case IP_MTU:
1218 { 1218 {
1219 struct dst_entry *dst; 1219 struct dst_entry *dst;
1220 val = 0; 1220 val = 0;
1221 dst = sk_dst_get(sk); 1221 dst = sk_dst_get(sk);
1222 if (dst) { 1222 if (dst) {
1223 val = dst_mtu(dst); 1223 val = dst_mtu(dst);
1224 dst_release(dst); 1224 dst_release(dst);
1225 } 1225 }
1226 if (!val) { 1226 if (!val) {
1227 release_sock(sk); 1227 release_sock(sk);
1228 return -ENOTCONN; 1228 return -ENOTCONN;
1229 } 1229 }
1230 break; 1230 break;
1231 } 1231 }
1232 case IP_RECVERR: 1232 case IP_RECVERR:
1233 val = inet->recverr; 1233 val = inet->recverr;
1234 break; 1234 break;
1235 case IP_MULTICAST_TTL: 1235 case IP_MULTICAST_TTL:
1236 val = inet->mc_ttl; 1236 val = inet->mc_ttl;
1237 break; 1237 break;
1238 case IP_MULTICAST_LOOP: 1238 case IP_MULTICAST_LOOP:
1239 val = inet->mc_loop; 1239 val = inet->mc_loop;
1240 break; 1240 break;
1241 case IP_UNICAST_IF: 1241 case IP_UNICAST_IF:
1242 val = (__force int)htonl((__u32) inet->uc_index); 1242 val = (__force int)htonl((__u32) inet->uc_index);
1243 break; 1243 break;
1244 case IP_MULTICAST_IF: 1244 case IP_MULTICAST_IF:
1245 { 1245 {
1246 struct in_addr addr; 1246 struct in_addr addr;
1247 len = min_t(unsigned int, len, sizeof(struct in_addr)); 1247 len = min_t(unsigned int, len, sizeof(struct in_addr));
1248 addr.s_addr = inet->mc_addr; 1248 addr.s_addr = inet->mc_addr;
1249 release_sock(sk); 1249 release_sock(sk);
1250 1250
1251 if (put_user(len, optlen)) 1251 if (put_user(len, optlen))
1252 return -EFAULT; 1252 return -EFAULT;
1253 if (copy_to_user(optval, &addr, len)) 1253 if (copy_to_user(optval, &addr, len))
1254 return -EFAULT; 1254 return -EFAULT;
1255 return 0; 1255 return 0;
1256 } 1256 }
1257 case IP_MSFILTER: 1257 case IP_MSFILTER:
1258 { 1258 {
1259 struct ip_msfilter msf; 1259 struct ip_msfilter msf;
1260 int err; 1260 int err;
1261 1261
1262 if (len < IP_MSFILTER_SIZE(0)) { 1262 if (len < IP_MSFILTER_SIZE(0)) {
1263 release_sock(sk); 1263 release_sock(sk);
1264 return -EINVAL; 1264 return -EINVAL;
1265 } 1265 }
1266 if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { 1266 if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
1267 release_sock(sk); 1267 release_sock(sk);
1268 return -EFAULT; 1268 return -EFAULT;
1269 } 1269 }
1270 err = ip_mc_msfget(sk, &msf, 1270 err = ip_mc_msfget(sk, &msf,
1271 (struct ip_msfilter __user *)optval, optlen); 1271 (struct ip_msfilter __user *)optval, optlen);
1272 release_sock(sk); 1272 release_sock(sk);
1273 return err; 1273 return err;
1274 } 1274 }
1275 case MCAST_MSFILTER: 1275 case MCAST_MSFILTER:
1276 { 1276 {
1277 struct group_filter gsf; 1277 struct group_filter gsf;
1278 int err; 1278 int err;
1279 1279
1280 if (len < GROUP_FILTER_SIZE(0)) { 1280 if (len < GROUP_FILTER_SIZE(0)) {
1281 release_sock(sk); 1281 release_sock(sk);
1282 return -EINVAL; 1282 return -EINVAL;
1283 } 1283 }
1284 if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { 1284 if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
1285 release_sock(sk); 1285 release_sock(sk);
1286 return -EFAULT; 1286 return -EFAULT;
1287 } 1287 }
1288 err = ip_mc_gsfget(sk, &gsf, 1288 err = ip_mc_gsfget(sk, &gsf,
1289 (struct group_filter __user *)optval, 1289 (struct group_filter __user *)optval,
1290 optlen); 1290 optlen);
1291 release_sock(sk); 1291 release_sock(sk);
1292 return err; 1292 return err;
1293 } 1293 }
1294 case IP_MULTICAST_ALL: 1294 case IP_MULTICAST_ALL:
1295 val = inet->mc_all; 1295 val = inet->mc_all;
1296 break; 1296 break;
1297 case IP_PKTOPTIONS: 1297 case IP_PKTOPTIONS:
1298 { 1298 {
1299 struct msghdr msg; 1299 struct msghdr msg;
1300 1300
1301 release_sock(sk); 1301 release_sock(sk);
1302 1302
1303 if (sk->sk_type != SOCK_STREAM) 1303 if (sk->sk_type != SOCK_STREAM)
1304 return -ENOPROTOOPT; 1304 return -ENOPROTOOPT;
1305 1305
1306 msg.msg_control = optval; 1306 msg.msg_control = optval;
1307 msg.msg_controllen = len; 1307 msg.msg_controllen = len;
1308 msg.msg_flags = flags; 1308 msg.msg_flags = flags;
1309 1309
1310 if (inet->cmsg_flags & IP_CMSG_PKTINFO) { 1310 if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
1311 struct in_pktinfo info; 1311 struct in_pktinfo info;
1312 1312
1313 info.ipi_addr.s_addr = inet->inet_rcv_saddr; 1313 info.ipi_addr.s_addr = inet->inet_rcv_saddr;
1314 info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr; 1314 info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
1315 info.ipi_ifindex = inet->mc_index; 1315 info.ipi_ifindex = inet->mc_index;
1316 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); 1316 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
1317 } 1317 }
1318 if (inet->cmsg_flags & IP_CMSG_TTL) { 1318 if (inet->cmsg_flags & IP_CMSG_TTL) {
1319 int hlim = inet->mc_ttl; 1319 int hlim = inet->mc_ttl;
1320 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); 1320 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
1321 } 1321 }
1322 if (inet->cmsg_flags & IP_CMSG_TOS) { 1322 if (inet->cmsg_flags & IP_CMSG_TOS) {
1323 int tos = inet->rcv_tos; 1323 int tos = inet->rcv_tos;
1324 put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); 1324 put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
1325 } 1325 }
1326 len -= msg.msg_controllen; 1326 len -= msg.msg_controllen;
1327 return put_user(len, optlen); 1327 return put_user(len, optlen);
1328 } 1328 }
1329 case IP_FREEBIND: 1329 case IP_FREEBIND:
1330 val = inet->freebind; 1330 val = inet->freebind;
1331 break; 1331 break;
1332 case IP_TRANSPARENT: 1332 case IP_TRANSPARENT:
1333 val = inet->transparent; 1333 val = inet->transparent;
1334 break; 1334 break;
1335 case IP_MINTTL: 1335 case IP_MINTTL:
1336 val = inet->min_ttl; 1336 val = inet->min_ttl;
1337 break; 1337 break;
1338 default: 1338 default:
1339 release_sock(sk); 1339 release_sock(sk);
1340 return -ENOPROTOOPT; 1340 return -ENOPROTOOPT;
1341 } 1341 }
1342 release_sock(sk); 1342 release_sock(sk);
1343 1343
1344 if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { 1344 if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
1345 unsigned char ucval = (unsigned char)val; 1345 unsigned char ucval = (unsigned char)val;
1346 len = 1; 1346 len = 1;
1347 if (put_user(len, optlen)) 1347 if (put_user(len, optlen))
1348 return -EFAULT; 1348 return -EFAULT;
1349 if (copy_to_user(optval, &ucval, 1)) 1349 if (copy_to_user(optval, &ucval, 1))
1350 return -EFAULT; 1350 return -EFAULT;
1351 } else { 1351 } else {
1352 len = min_t(unsigned int, sizeof(int), len); 1352 len = min_t(unsigned int, sizeof(int), len);
1353 if (put_user(len, optlen)) 1353 if (put_user(len, optlen))
1354 return -EFAULT; 1354 return -EFAULT;
1355 if (copy_to_user(optval, &val, len)) 1355 if (copy_to_user(optval, &val, len))
1356 return -EFAULT; 1356 return -EFAULT;
1357 } 1357 }
1358 return 0; 1358 return 0;
1359 } 1359 }
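
A small sketch of the copy-out behaviour in the tail above: if the caller's optlen is shorter than sizeof(int) and the value fits in 0..255, only a single byte is returned, otherwise a full int is. Querying IP_TTL both ways on a fresh UDP socket (which reports the system default TTL) shows the difference; the program is illustrative only.

#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	unsigned char ttl_byte;
	int ttl_int;
	socklen_t len;

	len = sizeof(ttl_byte);		/* len < sizeof(int): one-byte path */
	getsockopt(fd, IPPROTO_IP, IP_TTL, &ttl_byte, &len);
	printf("IP_TTL as byte: %u (len %u)\n", ttl_byte, (unsigned int)len);

	len = sizeof(ttl_int);		/* full-int path */
	getsockopt(fd, IPPROTO_IP, IP_TTL, &ttl_int, &len);
	printf("IP_TTL as int:  %d (len %u)\n", ttl_int, (unsigned int)len);

	return 0;
}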
1360 1360
1361 int ip_getsockopt(struct sock *sk, int level, 1361 int ip_getsockopt(struct sock *sk, int level,
1362 int optname, char __user *optval, int __user *optlen) 1362 int optname, char __user *optval, int __user *optlen)
1363 { 1363 {
1364 int err; 1364 int err;
1365 1365
1366 err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); 1366 err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
1367 #ifdef CONFIG_NETFILTER 1367 #ifdef CONFIG_NETFILTER
1368 /* we need to exclude all possible ENOPROTOOPTs except default case */ 1368 /* we need to exclude all possible ENOPROTOOPTs except default case */
1369 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && 1369 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
1370 !ip_mroute_opt(optname)) { 1370 !ip_mroute_opt(optname)) {
1371 int len; 1371 int len;
1372 1372
1373 if (get_user(len, optlen)) 1373 if (get_user(len, optlen))
1374 return -EFAULT; 1374 return -EFAULT;
1375 1375
1376 lock_sock(sk); 1376 lock_sock(sk);
1377 err = nf_getsockopt(sk, PF_INET, optname, optval, 1377 err = nf_getsockopt(sk, PF_INET, optname, optval,
1378 &len); 1378 &len);
1379 release_sock(sk); 1379 release_sock(sk);
1380 if (err >= 0) 1380 if (err >= 0)
1381 err = put_user(len, optlen); 1381 err = put_user(len, optlen);
1382 return err; 1382 return err;
1383 } 1383 }
1384 #endif 1384 #endif
1385 return err; 1385 return err;
1386 } 1386 }
1387 EXPORT_SYMBOL(ip_getsockopt); 1387 EXPORT_SYMBOL(ip_getsockopt);
1388 1388
1389 #ifdef CONFIG_COMPAT 1389 #ifdef CONFIG_COMPAT
1390 int compat_ip_getsockopt(struct sock *sk, int level, int optname, 1390 int compat_ip_getsockopt(struct sock *sk, int level, int optname,
1391 char __user *optval, int __user *optlen) 1391 char __user *optval, int __user *optlen)
1392 { 1392 {
1393 int err; 1393 int err;
1394 1394
1395 if (optname == MCAST_MSFILTER) 1395 if (optname == MCAST_MSFILTER)
1396 return compat_mc_getsockopt(sk, level, optname, optval, optlen, 1396 return compat_mc_getsockopt(sk, level, optname, optval, optlen,
1397 ip_getsockopt); 1397 ip_getsockopt);
1398 1398
1399 err = do_ip_getsockopt(sk, level, optname, optval, optlen, 1399 err = do_ip_getsockopt(sk, level, optname, optval, optlen,
1400 MSG_CMSG_COMPAT); 1400 MSG_CMSG_COMPAT);
1401 1401
1402 #ifdef CONFIG_NETFILTER 1402 #ifdef CONFIG_NETFILTER
1403 /* we need to exclude all possible ENOPROTOOPTs except default case */ 1403 /* we need to exclude all possible ENOPROTOOPTs except default case */
1404 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && 1404 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
1405 !ip_mroute_opt(optname)) { 1405 !ip_mroute_opt(optname)) {
1406 int len; 1406 int len;
1407 1407
1408 if (get_user(len, optlen)) 1408 if (get_user(len, optlen))
1409 return -EFAULT; 1409 return -EFAULT;
1410 1410
1411 lock_sock(sk); 1411 lock_sock(sk);
1412 err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len); 1412 err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
1413 release_sock(sk); 1413 release_sock(sk);
1414 if (err >= 0) 1414 if (err >= 0)
1415 err = put_user(len, optlen); 1415 err = put_user(len, optlen);
1416 return err; 1416 return err;
1417 } 1417 }
1418 #endif 1418 #endif
1419 return err; 1419 return err;
1420 } 1420 }
1421 EXPORT_SYMBOL(compat_ip_getsockopt); 1421 EXPORT_SYMBOL(compat_ip_getsockopt);
1422 #endif 1422 #endif
1423 1423
net/ipv4/tcp_output.c
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Implementation of the Transmission Control Protocol(TCP). 6 * Implementation of the Transmission Control Protocol(TCP).
7 * 7 *
8 * Authors: Ross Biro 8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Mark Evans, <evansmp@uhura.aston.ac.uk> 10 * Mark Evans, <evansmp@uhura.aston.ac.uk>
11 * Corey Minyard <wf-rch!minyard@relay.EU.net> 11 * Corey Minyard <wf-rch!minyard@relay.EU.net>
12 * Florian La Roche, <flla@stud.uni-sb.de> 12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 13 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14 * Linus Torvalds, <torvalds@cs.helsinki.fi> 14 * Linus Torvalds, <torvalds@cs.helsinki.fi>
15 * Alan Cox, <gw4pts@gw4pts.ampr.org> 15 * Alan Cox, <gw4pts@gw4pts.ampr.org>
16 * Matthew Dillon, <dillon@apollo.west.oic.com> 16 * Matthew Dillon, <dillon@apollo.west.oic.com>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Jorge Cwik, <jorge@laser.satlink.net> 18 * Jorge Cwik, <jorge@laser.satlink.net>
19 */ 19 */
20 20
21 /* 21 /*
22 * Changes: Pedro Roque : Retransmit queue handled by TCP. 22 * Changes: Pedro Roque : Retransmit queue handled by TCP.
23 * : Fragmentation on mtu decrease 23 * : Fragmentation on mtu decrease
24 * : Segment collapse on retransmit 24 * : Segment collapse on retransmit
25 * : AF independence 25 * : AF independence
26 * 26 *
27 * Linus Torvalds : send_delayed_ack 27 * Linus Torvalds : send_delayed_ack
28 * David S. Miller : Charge memory using the right skb 28 * David S. Miller : Charge memory using the right skb
29 * during syn/ack processing. 29 * during syn/ack processing.
30 * David S. Miller : Output engine completely rewritten. 30 * David S. Miller : Output engine completely rewritten.
31 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. 31 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
32 * Cacophonix Gaul : draft-minshall-nagle-01 32 * Cacophonix Gaul : draft-minshall-nagle-01
33 * J Hadi Salim : ECN support 33 * J Hadi Salim : ECN support
34 * 34 *
35 */ 35 */
36 36
37 #define pr_fmt(fmt) "TCP: " fmt 37 #define pr_fmt(fmt) "TCP: " fmt
38 38
39 #include <net/tcp.h> 39 #include <net/tcp.h>
40 40
41 #include <linux/compiler.h> 41 #include <linux/compiler.h>
42 #include <linux/gfp.h> 42 #include <linux/gfp.h>
43 #include <linux/module.h> 43 #include <linux/module.h>
44 44
45 /* People can turn this off for buggy TCP's found in printers etc. */ 45 /* People can turn this off for buggy TCP's found in printers etc. */
46 int sysctl_tcp_retrans_collapse __read_mostly = 1; 46 int sysctl_tcp_retrans_collapse __read_mostly = 1;
47 47
48 /* People can turn this on to work with those rare, broken TCPs that 48 /* People can turn this on to work with those rare, broken TCPs that
49 * interpret the window field as a signed quantity. 49 * interpret the window field as a signed quantity.
50 */ 50 */
51 int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 51 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52 52
53 /* Default TSQ limit of two TSO segments */ 53 /* Default TSQ limit of two TSO segments */
54 int sysctl_tcp_limit_output_bytes __read_mostly = 131072; 54 int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55 55
56 /* This limits the percentage of the congestion window which we 56 /* This limits the percentage of the congestion window which we
57 * will allow a single TSO frame to consume. Building TSO frames 57 * will allow a single TSO frame to consume. Building TSO frames
58 * which are too large can cause TCP streams to be bursty. 58 * which are too large can cause TCP streams to be bursty.
59 */ 59 */
60 int sysctl_tcp_tso_win_divisor __read_mostly = 3; 60 int sysctl_tcp_tso_win_divisor __read_mostly = 3;
61 61
62 int sysctl_tcp_mtu_probing __read_mostly = 0; 62 int sysctl_tcp_mtu_probing __read_mostly = 0;
63 int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; 63 int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
64 64
65 /* By default, RFC2861 behavior. */ 65 /* By default, RFC2861 behavior. */
66 int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 66 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67 67
68 unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; 68 unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
69 EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); 69 EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
70 70
71 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 71 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72 int push_one, gfp_t gfp); 72 int push_one, gfp_t gfp);
73 73
74 /* Account for new data that has been sent to the network. */ 74 /* Account for new data that has been sent to the network. */
75 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 75 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
76 { 76 {
77 struct inet_connection_sock *icsk = inet_csk(sk); 77 struct inet_connection_sock *icsk = inet_csk(sk);
78 struct tcp_sock *tp = tcp_sk(sk); 78 struct tcp_sock *tp = tcp_sk(sk);
79 unsigned int prior_packets = tp->packets_out; 79 unsigned int prior_packets = tp->packets_out;
80 80
81 tcp_advance_send_head(sk, skb); 81 tcp_advance_send_head(sk, skb);
82 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 82 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
83 83
84 tp->packets_out += tcp_skb_pcount(skb); 84 tp->packets_out += tcp_skb_pcount(skb);
85 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 85 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
86 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 86 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
87 tcp_rearm_rto(sk); 87 tcp_rearm_rto(sk);
88 } 88 }
89 } 89 }
90 90
91 /* SND.NXT, if window was not shrunk. 91 /* SND.NXT, if window was not shrunk.
92 * If window has been shrunk, what should we make? It is not clear at all. 92 * If window has been shrunk, what should we make? It is not clear at all.
93 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( 93 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
94 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already 94 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
95 * invalid. OK, let's make this for now: 95 * invalid. OK, let's make this for now:
96 */ 96 */
97 static inline __u32 tcp_acceptable_seq(const struct sock *sk) 97 static inline __u32 tcp_acceptable_seq(const struct sock *sk)
98 { 98 {
99 const struct tcp_sock *tp = tcp_sk(sk); 99 const struct tcp_sock *tp = tcp_sk(sk);
100 100
101 if (!before(tcp_wnd_end(tp), tp->snd_nxt)) 101 if (!before(tcp_wnd_end(tp), tp->snd_nxt))
102 return tp->snd_nxt; 102 return tp->snd_nxt;
103 else 103 else
104 return tcp_wnd_end(tp); 104 return tcp_wnd_end(tp);
105 } 105 }
106 106
107 /* Calculate mss to advertise in SYN segment. 107 /* Calculate mss to advertise in SYN segment.
108 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: 108 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
109 * 109 *
110 * 1. It is independent of path mtu. 110 * 1. It is independent of path mtu.
111 * 2. Ideally, it is maximal possible segment size i.e. 65535-40. 111 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
112 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of 112 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
113 * attached devices, because some buggy hosts are confused by 113 * attached devices, because some buggy hosts are confused by
114 * large MSS. 114 * large MSS.
115 * 4. We do not make 3, we advertise MSS, calculated from first 115 * 4. We do not make 3, we advertise MSS, calculated from first
116 * hop device mtu, but allow to raise it to ip_rt_min_advmss. 116 * hop device mtu, but allow to raise it to ip_rt_min_advmss.
117 * This may be overridden via information stored in routing table. 117 * This may be overridden via information stored in routing table.
118 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, 118 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
119 * probably even Jumbo". 119 * probably even Jumbo".
120 */ 120 */
121 static __u16 tcp_advertise_mss(struct sock *sk) 121 static __u16 tcp_advertise_mss(struct sock *sk)
122 { 122 {
123 struct tcp_sock *tp = tcp_sk(sk); 123 struct tcp_sock *tp = tcp_sk(sk);
124 const struct dst_entry *dst = __sk_dst_get(sk); 124 const struct dst_entry *dst = __sk_dst_get(sk);
125 int mss = tp->advmss; 125 int mss = tp->advmss;
126 126
127 if (dst) { 127 if (dst) {
128 unsigned int metric = dst_metric_advmss(dst); 128 unsigned int metric = dst_metric_advmss(dst);
129 129
130 if (metric < mss) { 130 if (metric < mss) {
131 mss = metric; 131 mss = metric;
132 tp->advmss = mss; 132 tp->advmss = mss;
133 } 133 }
134 } 134 }
135 135
136 return (__u16)mss; 136 return (__u16)mss;
137 } 137 }
138 138
139 /* RFC2861. Reset CWND after idle period longer than RTO to "restart window". 139 /* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
140 * This is the first part of cwnd validation mechanism. */ 140 * This is the first part of cwnd validation mechanism. */
141 static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) 141 static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
142 { 142 {
143 struct tcp_sock *tp = tcp_sk(sk); 143 struct tcp_sock *tp = tcp_sk(sk);
144 s32 delta = tcp_time_stamp - tp->lsndtime; 144 s32 delta = tcp_time_stamp - tp->lsndtime;
145 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 145 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
146 u32 cwnd = tp->snd_cwnd; 146 u32 cwnd = tp->snd_cwnd;
147 147
148 tcp_ca_event(sk, CA_EVENT_CWND_RESTART); 148 tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
149 149
150 tp->snd_ssthresh = tcp_current_ssthresh(sk); 150 tp->snd_ssthresh = tcp_current_ssthresh(sk);
151 restart_cwnd = min(restart_cwnd, cwnd); 151 restart_cwnd = min(restart_cwnd, cwnd);
152 152
153 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) 153 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
154 cwnd >>= 1; 154 cwnd >>= 1;
155 tp->snd_cwnd = max(cwnd, restart_cwnd); 155 tp->snd_cwnd = max(cwnd, restart_cwnd);
156 tp->snd_cwnd_stamp = tcp_time_stamp; 156 tp->snd_cwnd_stamp = tcp_time_stamp;
157 tp->snd_cwnd_used = 0; 157 tp->snd_cwnd_used = 0;
158 } 158 }
159 159
160 /* Congestion state accounting after a packet has been sent. */ 160 /* Congestion state accounting after a packet has been sent. */
161 static void tcp_event_data_sent(struct tcp_sock *tp, 161 static void tcp_event_data_sent(struct tcp_sock *tp,
162 struct sock *sk) 162 struct sock *sk)
163 { 163 {
164 struct inet_connection_sock *icsk = inet_csk(sk); 164 struct inet_connection_sock *icsk = inet_csk(sk);
165 const u32 now = tcp_time_stamp; 165 const u32 now = tcp_time_stamp;
166 const struct dst_entry *dst = __sk_dst_get(sk); 166 const struct dst_entry *dst = __sk_dst_get(sk);
167 167
168 if (sysctl_tcp_slow_start_after_idle && 168 if (sysctl_tcp_slow_start_after_idle &&
169 (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) 169 (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
170 tcp_cwnd_restart(sk, __sk_dst_get(sk)); 170 tcp_cwnd_restart(sk, __sk_dst_get(sk));
171 171
172 tp->lsndtime = now; 172 tp->lsndtime = now;
173 173
174 /* If it is a reply for ato after last received 174 /* If it is a reply for ato after last received
175 * packet, enter pingpong mode. 175 * packet, enter pingpong mode.
176 */ 176 */
177 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && 177 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
178 (!dst || !dst_metric(dst, RTAX_QUICKACK))) 178 (!dst || !dst_metric(dst, RTAX_QUICKACK)))
179 icsk->icsk_ack.pingpong = 1; 179 icsk->icsk_ack.pingpong = 1;
180 } 180 }
181 181
182 /* Account for an ACK we sent. */ 182 /* Account for an ACK we sent. */
183 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 183 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
184 { 184 {
185 tcp_dec_quickack_mode(sk, pkts); 185 tcp_dec_quickack_mode(sk, pkts);
186 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 186 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
187 } 187 }
188 188
189 189
190 u32 tcp_default_init_rwnd(u32 mss) 190 u32 tcp_default_init_rwnd(u32 mss)
191 { 191 {
192 /* Initial receive window should be twice of TCP_INIT_CWND to 192 /* Initial receive window should be twice of TCP_INIT_CWND to
193 * enable proper sending of new unsent data during fast recovery 193 * enable proper sending of new unsent data during fast recovery
194 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a 194 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
195 * limit when mss is larger than 1460. 195 * limit when mss is larger than 1460.
196 */ 196 */
197 u32 init_rwnd = TCP_INIT_CWND * 2; 197 u32 init_rwnd = TCP_INIT_CWND * 2;
198 198
199 if (mss > 1460) 199 if (mss > 1460)
200 init_rwnd = max((1460 * init_rwnd) / mss, 2U); 200 init_rwnd = max((1460 * init_rwnd) / mss, 2U);
201 return init_rwnd; 201 return init_rwnd;
202 } 202 }
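
A short worked example of the clamp above, assuming TCP_INIT_CWND is 10 (its value in kernels of this vintage), so the nominal initial receive window is 20 segments:

/*
 *   mss = 1460  ->  init_rwnd = 20 segments             (29200 bytes)
 *   mss = 8960  ->  init_rwnd = (1460 * 20) / 8960 = 3  (26880 bytes)
 *   mss = 65000 ->  the division yields 0, so max() floors it at 2
 */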
203 203
204 /* Determine a window scaling and initial window to offer. 204 /* Determine a window scaling and initial window to offer.
205 * Based on the assumption that the given amount of space 205 * Based on the assumption that the given amount of space
206 * will be offered. Store the results in the tp structure. 206 * will be offered. Store the results in the tp structure.
207 * NOTE: for smooth operation initial space offering should 207 * NOTE: for smooth operation initial space offering should
208 * be a multiple of mss if possible. We assume here that mss >= 1. 208 * be a multiple of mss if possible. We assume here that mss >= 1.
209 * This MUST be enforced by all callers. 209 * This MUST be enforced by all callers.
210 */ 210 */
211 void tcp_select_initial_window(int __space, __u32 mss, 211 void tcp_select_initial_window(int __space, __u32 mss,
212 __u32 *rcv_wnd, __u32 *window_clamp, 212 __u32 *rcv_wnd, __u32 *window_clamp,
213 int wscale_ok, __u8 *rcv_wscale, 213 int wscale_ok, __u8 *rcv_wscale,
214 __u32 init_rcv_wnd) 214 __u32 init_rcv_wnd)
215 { 215 {
216 unsigned int space = (__space < 0 ? 0 : __space); 216 unsigned int space = (__space < 0 ? 0 : __space);
217 217
218 /* If no clamp set the clamp to the max possible scaled window */ 218 /* If no clamp set the clamp to the max possible scaled window */
219 if (*window_clamp == 0) 219 if (*window_clamp == 0)
220 (*window_clamp) = (65535 << 14); 220 (*window_clamp) = (65535 << 14);
221 space = min(*window_clamp, space); 221 space = min(*window_clamp, space);
222 222
223 /* Quantize space offering to a multiple of mss if possible. */ 223 /* Quantize space offering to a multiple of mss if possible. */
224 if (space > mss) 224 if (space > mss)
225 space = (space / mss) * mss; 225 space = (space / mss) * mss;
226 226
227 /* NOTE: offering an initial window larger than 32767 227 /* NOTE: offering an initial window larger than 32767
228 * will break some buggy TCP stacks. If the admin tells us 228 * will break some buggy TCP stacks. If the admin tells us
229 * it is likely we could be speaking with such a buggy stack 229 * it is likely we could be speaking with such a buggy stack
230 * we will truncate our initial window offering to 32K-1 230 * we will truncate our initial window offering to 32K-1
231 * unless the remote has sent us a window scaling option, 231 * unless the remote has sent us a window scaling option,
232 * which we interpret as a sign the remote TCP is not 232 * which we interpret as a sign the remote TCP is not
233 * misinterpreting the window field as a signed quantity. 233 * misinterpreting the window field as a signed quantity.
234 */ 234 */
235 if (sysctl_tcp_workaround_signed_windows) 235 if (sysctl_tcp_workaround_signed_windows)
236 (*rcv_wnd) = min(space, MAX_TCP_WINDOW); 236 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
237 else 237 else
238 (*rcv_wnd) = space; 238 (*rcv_wnd) = space;
239 239
240 (*rcv_wscale) = 0; 240 (*rcv_wscale) = 0;
241 if (wscale_ok) { 241 if (wscale_ok) {
242 /* Set window scaling on max possible window 242 /* Set window scaling on max possible window
243 * See RFC1323 for an explanation of the limit to 14 243 * See RFC1323 for an explanation of the limit to 14
244 */ 244 */
245 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); 245 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
246 space = min_t(u32, space, *window_clamp); 246 space = min_t(u32, space, *window_clamp);
247 while (space > 65535 && (*rcv_wscale) < 14) { 247 while (space > 65535 && (*rcv_wscale) < 14) {
248 space >>= 1; 248 space >>= 1;
249 (*rcv_wscale)++; 249 (*rcv_wscale)++;
250 } 250 }
251 } 251 }
252 252
253 if (mss > (1 << *rcv_wscale)) { 253 if (mss > (1 << *rcv_wscale)) {
254 if (!init_rcv_wnd) /* Use default unless specified otherwise */ 254 if (!init_rcv_wnd) /* Use default unless specified otherwise */
255 init_rcv_wnd = tcp_default_init_rwnd(mss); 255 init_rcv_wnd = tcp_default_init_rwnd(mss);
256 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); 256 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
257 } 257 }
258 258
259 /* Set the clamp no higher than max representable value */ 259 /* Set the clamp no higher than max representable value */
260 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); 260 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
261 } 261 }
262 EXPORT_SYMBOL(tcp_select_initial_window); 262 EXPORT_SYMBOL(tcp_select_initial_window);
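
The scale-selection loop can be read in isolation: shift the available buffer space right until it fits the 16-bit window field, capping the shift at 14 per RFC 1323. A stand-alone sketch (the helper name pick_rcv_wscale is invented here):

#include <stdio.h>

static unsigned int pick_rcv_wscale(unsigned int space)
{
	unsigned int wscale = 0;

	while (space > 65535 && wscale < 14) {
		space >>= 1;
		wscale++;
	}
	return wscale;
}

int main(void)
{
	/* 6 MiB of receive buffer needs a shift of 7: 6291456 >> 7 == 49152 */
	printf("wscale = %u\n", pick_rcv_wscale(6 * 1024 * 1024));
	return 0;
}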
263 263
264 /* Choose a new window to advertise, update state in tcp_sock for the 264 /* Choose a new window to advertise, update state in tcp_sock for the
265 * socket, and return result with RFC1323 scaling applied. The return 265 * socket, and return result with RFC1323 scaling applied. The return
266 * value can be stuffed directly into th->window for an outgoing 266 * value can be stuffed directly into th->window for an outgoing
267 * frame. 267 * frame.
268 */ 268 */
269 static u16 tcp_select_window(struct sock *sk) 269 static u16 tcp_select_window(struct sock *sk)
270 { 270 {
271 struct tcp_sock *tp = tcp_sk(sk); 271 struct tcp_sock *tp = tcp_sk(sk);
272 u32 cur_win = tcp_receive_window(tp); 272 u32 cur_win = tcp_receive_window(tp);
273 u32 new_win = __tcp_select_window(sk); 273 u32 new_win = __tcp_select_window(sk);
274 274
275 /* Never shrink the offered window */ 275 /* Never shrink the offered window */
276 if (new_win < cur_win) { 276 if (new_win < cur_win) {
277 /* Danger Will Robinson! 277 /* Danger Will Robinson!
278 * Don't update rcv_wup/rcv_wnd here or else 278 * Don't update rcv_wup/rcv_wnd here or else
279 * we will not be able to advertise a zero 279 * we will not be able to advertise a zero
280 * window in time. --DaveM 280 * window in time. --DaveM
281 * 281 *
282 * Relax Will Robinson. 282 * Relax Will Robinson.
283 */ 283 */
284 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); 284 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
285 } 285 }
286 tp->rcv_wnd = new_win; 286 tp->rcv_wnd = new_win;
287 tp->rcv_wup = tp->rcv_nxt; 287 tp->rcv_wup = tp->rcv_nxt;
288 288
289 /* Make sure we do not exceed the maximum possible 289 /* Make sure we do not exceed the maximum possible
290 * scaled window. 290 * scaled window.
291 */ 291 */
292 if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) 292 if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
293 new_win = min(new_win, MAX_TCP_WINDOW); 293 new_win = min(new_win, MAX_TCP_WINDOW);
294 else 294 else
295 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); 295 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
296 296
297 /* RFC1323 scaling applied */ 297 /* RFC1323 scaling applied */
298 new_win >>= tp->rx_opt.rcv_wscale; 298 new_win >>= tp->rx_opt.rcv_wscale;
299 299
300 /* If we advertise zero window, disable fast path. */ 300 /* If we advertise zero window, disable fast path. */
301 if (new_win == 0) 301 if (new_win == 0)
302 tp->pred_flags = 0; 302 tp->pred_flags = 0;
303 303
304 return new_win; 304 return new_win;
305 } 305 }
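
A worked example of the no-shrink clamp above, assuming rcv_wscale is 7 (window granularity 128 bytes):

/*
 *   cur_win = 10000:  advertising 10000 >> 7 == 78 would let the peer see
 *   78 * 128 == 9984 bytes, i.e. a shrunken window.  ALIGN(10000, 128)
 *   == 10112 rounds up to 79 units, so the offered window stays at or
 *   above cur_win.
 */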
306 306
307 /* Packet ECN state for a SYN-ACK */ 307 /* Packet ECN state for a SYN-ACK */
308 static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) 308 static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
309 { 309 {
310 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 310 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
311 if (!(tp->ecn_flags & TCP_ECN_OK)) 311 if (!(tp->ecn_flags & TCP_ECN_OK))
312 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 312 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
313 } 313 }
314 314
315 /* Packet ECN state for a SYN. */ 315 /* Packet ECN state for a SYN. */
316 static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) 316 static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
317 { 317 {
318 struct tcp_sock *tp = tcp_sk(sk); 318 struct tcp_sock *tp = tcp_sk(sk);
319 319
320 tp->ecn_flags = 0; 320 tp->ecn_flags = 0;
321 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { 321 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
322 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 322 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
323 tp->ecn_flags = TCP_ECN_OK; 323 tp->ecn_flags = TCP_ECN_OK;
324 } 324 }
325 } 325 }
326 326
327 static __inline__ void 327 static __inline__ void
328 TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) 328 TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
329 { 329 {
330 if (inet_rsk(req)->ecn_ok) 330 if (inet_rsk(req)->ecn_ok)
331 th->ece = 1; 331 th->ece = 1;
332 } 332 }
333 333
334 /* Set up ECN state for a packet on a ESTABLISHED socket that is about to 334 /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
335 * be sent. 335 * be sent.
336 */ 336 */
337 static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, 337 static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
338 int tcp_header_len) 338 int tcp_header_len)
339 { 339 {
340 struct tcp_sock *tp = tcp_sk(sk); 340 struct tcp_sock *tp = tcp_sk(sk);
341 341
342 if (tp->ecn_flags & TCP_ECN_OK) { 342 if (tp->ecn_flags & TCP_ECN_OK) {
343 /* Not-retransmitted data segment: set ECT and inject CWR. */ 343 /* Not-retransmitted data segment: set ECT and inject CWR. */
344 if (skb->len != tcp_header_len && 344 if (skb->len != tcp_header_len &&
345 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { 345 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
346 INET_ECN_xmit(sk); 346 INET_ECN_xmit(sk);
347 if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { 347 if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
348 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; 348 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
349 tcp_hdr(skb)->cwr = 1; 349 tcp_hdr(skb)->cwr = 1;
350 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; 350 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
351 } 351 }
352 } else { 352 } else {
353 /* ACK or retransmitted segment: clear ECT|CE */ 353 /* ACK or retransmitted segment: clear ECT|CE */
354 INET_ECN_dontxmit(sk); 354 INET_ECN_dontxmit(sk);
355 } 355 }
356 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) 356 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
357 tcp_hdr(skb)->ece = 1; 357 tcp_hdr(skb)->ece = 1;
358 } 358 }
359 } 359 }
360 360
361 /* Constructs common control bits of non-data skb. If SYN/FIN is present, 361 /* Constructs common control bits of non-data skb. If SYN/FIN is present,
362 * auto increment end seqno. 362 * auto increment end seqno.
363 */ 363 */
364 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) 364 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
365 { 365 {
366 struct skb_shared_info *shinfo = skb_shinfo(skb); 366 struct skb_shared_info *shinfo = skb_shinfo(skb);
367 367
368 skb->ip_summed = CHECKSUM_PARTIAL; 368 skb->ip_summed = CHECKSUM_PARTIAL;
369 skb->csum = 0; 369 skb->csum = 0;
370 370
371 TCP_SKB_CB(skb)->tcp_flags = flags; 371 TCP_SKB_CB(skb)->tcp_flags = flags;
372 TCP_SKB_CB(skb)->sacked = 0; 372 TCP_SKB_CB(skb)->sacked = 0;
373 373
374 shinfo->gso_segs = 1; 374 shinfo->gso_segs = 1;
375 shinfo->gso_size = 0; 375 shinfo->gso_size = 0;
376 shinfo->gso_type = 0; 376 shinfo->gso_type = 0;
377 377
378 TCP_SKB_CB(skb)->seq = seq; 378 TCP_SKB_CB(skb)->seq = seq;
379 if (flags & (TCPHDR_SYN | TCPHDR_FIN)) 379 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
380 seq++; 380 seq++;
381 TCP_SKB_CB(skb)->end_seq = seq; 381 TCP_SKB_CB(skb)->end_seq = seq;
382 } 382 }
383 383
384 static inline bool tcp_urg_mode(const struct tcp_sock *tp) 384 static inline bool tcp_urg_mode(const struct tcp_sock *tp)
385 { 385 {
386 return tp->snd_una != tp->snd_up; 386 return tp->snd_una != tp->snd_up;
387 } 387 }
388 388
389 #define OPTION_SACK_ADVERTISE (1 << 0) 389 #define OPTION_SACK_ADVERTISE (1 << 0)
390 #define OPTION_TS (1 << 1) 390 #define OPTION_TS (1 << 1)
391 #define OPTION_MD5 (1 << 2) 391 #define OPTION_MD5 (1 << 2)
392 #define OPTION_WSCALE (1 << 3) 392 #define OPTION_WSCALE (1 << 3)
393 #define OPTION_FAST_OPEN_COOKIE (1 << 8) 393 #define OPTION_FAST_OPEN_COOKIE (1 << 8)
394 394
395 struct tcp_out_options { 395 struct tcp_out_options {
396 u16 options; /* bit field of OPTION_* */ 396 u16 options; /* bit field of OPTION_* */
397 u16 mss; /* 0 to disable */ 397 u16 mss; /* 0 to disable */
398 u8 ws; /* window scale, 0 to disable */ 398 u8 ws; /* window scale, 0 to disable */
399 u8 num_sack_blocks; /* number of SACK blocks to include */ 399 u8 num_sack_blocks; /* number of SACK blocks to include */
400 u8 hash_size; /* bytes in hash_location */ 400 u8 hash_size; /* bytes in hash_location */
401 __u8 *hash_location; /* temporary pointer, overloaded */ 401 __u8 *hash_location; /* temporary pointer, overloaded */
402 __u32 tsval, tsecr; /* need to include OPTION_TS */ 402 __u32 tsval, tsecr; /* need to include OPTION_TS */
403 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ 403 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
404 }; 404 };
405 405
406 /* Write previously computed TCP options to the packet. 406 /* Write previously computed TCP options to the packet.
407 * 407 *
408 * Beware: Something in the Internet is very sensitive to the ordering of 408 * Beware: Something in the Internet is very sensitive to the ordering of
409 * TCP options, we learned this through the hard way, so be careful here. 409 * TCP options, we learned this through the hard way, so be careful here.
410 * Luckily we can at least blame others for their non-compliance but from 410 * Luckily we can at least blame others for their non-compliance but from
411 * inter-operatibility perspective it seems that we're somewhat stuck with 411 * inter-operability perspective it seems that we're somewhat stuck with
412 * the ordering which we have been using if we want to keep working with 412 * the ordering which we have been using if we want to keep working with
413 * those broken things (not that it currently hurts anybody as there isn't 413 * those broken things (not that it currently hurts anybody as there isn't
414 * particular reason why the ordering would need to be changed). 414 * particular reason why the ordering would need to be changed).
415 * 415 *
416 * At least SACK_PERM as the first option is known to lead to a disaster 416 * At least SACK_PERM as the first option is known to lead to a disaster
417 * (but it may well be that other scenarios fail similarly). 417 * (but it may well be that other scenarios fail similarly).
418 */ 418 */
419 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 419 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
420 struct tcp_out_options *opts) 420 struct tcp_out_options *opts)
421 { 421 {
422 u16 options = opts->options; /* mungable copy */ 422 u16 options = opts->options; /* mungable copy */
423 423
424 if (unlikely(OPTION_MD5 & options)) { 424 if (unlikely(OPTION_MD5 & options)) {
425 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 425 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
426 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); 426 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
427 /* overload cookie hash location */ 427 /* overload cookie hash location */
428 opts->hash_location = (__u8 *)ptr; 428 opts->hash_location = (__u8 *)ptr;
429 ptr += 4; 429 ptr += 4;
430 } 430 }
431 431
432 if (unlikely(opts->mss)) { 432 if (unlikely(opts->mss)) {
433 *ptr++ = htonl((TCPOPT_MSS << 24) | 433 *ptr++ = htonl((TCPOPT_MSS << 24) |
434 (TCPOLEN_MSS << 16) | 434 (TCPOLEN_MSS << 16) |
435 opts->mss); 435 opts->mss);
436 } 436 }
437 437
438 if (likely(OPTION_TS & options)) { 438 if (likely(OPTION_TS & options)) {
439 if (unlikely(OPTION_SACK_ADVERTISE & options)) { 439 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
440 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | 440 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
441 (TCPOLEN_SACK_PERM << 16) | 441 (TCPOLEN_SACK_PERM << 16) |
442 (TCPOPT_TIMESTAMP << 8) | 442 (TCPOPT_TIMESTAMP << 8) |
443 TCPOLEN_TIMESTAMP); 443 TCPOLEN_TIMESTAMP);
444 options &= ~OPTION_SACK_ADVERTISE; 444 options &= ~OPTION_SACK_ADVERTISE;
445 } else { 445 } else {
446 *ptr++ = htonl((TCPOPT_NOP << 24) | 446 *ptr++ = htonl((TCPOPT_NOP << 24) |
447 (TCPOPT_NOP << 16) | 447 (TCPOPT_NOP << 16) |
448 (TCPOPT_TIMESTAMP << 8) | 448 (TCPOPT_TIMESTAMP << 8) |
449 TCPOLEN_TIMESTAMP); 449 TCPOLEN_TIMESTAMP);
450 } 450 }
451 *ptr++ = htonl(opts->tsval); 451 *ptr++ = htonl(opts->tsval);
452 *ptr++ = htonl(opts->tsecr); 452 *ptr++ = htonl(opts->tsecr);
453 } 453 }
454 454
455 if (unlikely(OPTION_SACK_ADVERTISE & options)) { 455 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
456 *ptr++ = htonl((TCPOPT_NOP << 24) | 456 *ptr++ = htonl((TCPOPT_NOP << 24) |
457 (TCPOPT_NOP << 16) | 457 (TCPOPT_NOP << 16) |
458 (TCPOPT_SACK_PERM << 8) | 458 (TCPOPT_SACK_PERM << 8) |
459 TCPOLEN_SACK_PERM); 459 TCPOLEN_SACK_PERM);
460 } 460 }
461 461
462 if (unlikely(OPTION_WSCALE & options)) { 462 if (unlikely(OPTION_WSCALE & options)) {
463 *ptr++ = htonl((TCPOPT_NOP << 24) | 463 *ptr++ = htonl((TCPOPT_NOP << 24) |
464 (TCPOPT_WINDOW << 16) | 464 (TCPOPT_WINDOW << 16) |
465 (TCPOLEN_WINDOW << 8) | 465 (TCPOLEN_WINDOW << 8) |
466 opts->ws); 466 opts->ws);
467 } 467 }
468 468
469 if (unlikely(opts->num_sack_blocks)) { 469 if (unlikely(opts->num_sack_blocks)) {
470 struct tcp_sack_block *sp = tp->rx_opt.dsack ? 470 struct tcp_sack_block *sp = tp->rx_opt.dsack ?
471 tp->duplicate_sack : tp->selective_acks; 471 tp->duplicate_sack : tp->selective_acks;
472 int this_sack; 472 int this_sack;
473 473
474 *ptr++ = htonl((TCPOPT_NOP << 24) | 474 *ptr++ = htonl((TCPOPT_NOP << 24) |
475 (TCPOPT_NOP << 16) | 475 (TCPOPT_NOP << 16) |
476 (TCPOPT_SACK << 8) | 476 (TCPOPT_SACK << 8) |
477 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * 477 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
478 TCPOLEN_SACK_PERBLOCK))); 478 TCPOLEN_SACK_PERBLOCK)));
479 479
480 for (this_sack = 0; this_sack < opts->num_sack_blocks; 480 for (this_sack = 0; this_sack < opts->num_sack_blocks;
481 ++this_sack) { 481 ++this_sack) {
482 *ptr++ = htonl(sp[this_sack].start_seq); 482 *ptr++ = htonl(sp[this_sack].start_seq);
483 *ptr++ = htonl(sp[this_sack].end_seq); 483 *ptr++ = htonl(sp[this_sack].end_seq);
484 } 484 }
485 485
486 tp->rx_opt.dsack = 0; 486 tp->rx_opt.dsack = 0;
487 } 487 }
488 488
489 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { 489 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
490 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; 490 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
491 491
492 *ptr++ = htonl((TCPOPT_EXP << 24) | 492 *ptr++ = htonl((TCPOPT_EXP << 24) |
493 ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | 493 ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
494 TCPOPT_FASTOPEN_MAGIC); 494 TCPOPT_FASTOPEN_MAGIC);
495 495
496 memcpy(ptr, foc->val, foc->len); 496 memcpy(ptr, foc->val, foc->len);
497 if ((foc->len & 3) == 2) { 497 if ((foc->len & 3) == 2) {
498 u8 *align = ((u8 *)ptr) + foc->len; 498 u8 *align = ((u8 *)ptr) + foc->len;
499 align[0] = align[1] = TCPOPT_NOP; 499 align[0] = align[1] = TCPOPT_NOP;
500 } 500 }
501 ptr += (foc->len + 3) >> 2; 501 ptr += (foc->len + 3) >> 2;
502 } 502 }
503 } 503 }
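
Each of the htonl() stores above packs one 32-bit option word. As a tiny stand-alone check of the layout, the classic NOP,NOP,TIMESTAMP prefix comes out as 0x0101080a; the constants are spelled out here rather than taken from the kernel headers:

#include <stdio.h>

#define TCPOPT_NOP		1
#define TCPOPT_TIMESTAMP	8
#define TCPOLEN_TIMESTAMP	10

int main(void)
{
	unsigned int word = (TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			    (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP;

	printf("0x%08x\n", word);	/* prints 0x0101080a */
	return 0;
}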
504 504
505 /* Compute TCP options for SYN packets. This is not the final 505 /* Compute TCP options for SYN packets. This is not the final
506 * network wire format yet. 506 * network wire format yet.
507 */ 507 */
508 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, 508 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
509 struct tcp_out_options *opts, 509 struct tcp_out_options *opts,
510 struct tcp_md5sig_key **md5) 510 struct tcp_md5sig_key **md5)
511 { 511 {
512 struct tcp_sock *tp = tcp_sk(sk); 512 struct tcp_sock *tp = tcp_sk(sk);
513 unsigned int remaining = MAX_TCP_OPTION_SPACE; 513 unsigned int remaining = MAX_TCP_OPTION_SPACE;
514 struct tcp_fastopen_request *fastopen = tp->fastopen_req; 514 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
515 515
516 #ifdef CONFIG_TCP_MD5SIG 516 #ifdef CONFIG_TCP_MD5SIG
517 *md5 = tp->af_specific->md5_lookup(sk, sk); 517 *md5 = tp->af_specific->md5_lookup(sk, sk);
518 if (*md5) { 518 if (*md5) {
519 opts->options |= OPTION_MD5; 519 opts->options |= OPTION_MD5;
520 remaining -= TCPOLEN_MD5SIG_ALIGNED; 520 remaining -= TCPOLEN_MD5SIG_ALIGNED;
521 } 521 }
522 #else 522 #else
523 *md5 = NULL; 523 *md5 = NULL;
524 #endif 524 #endif
525 525
526 /* We always get an MSS option. The option bytes which will be seen in 526 /* We always get an MSS option. The option bytes which will be seen in
527 * normal data packets should timestamps be used, must be in the MSS 527 * normal data packets should timestamps be used, must be in the MSS
528 * advertised. But we subtract them from tp->mss_cache so that 528 * advertised. But we subtract them from tp->mss_cache so that
529 * calculations in tcp_sendmsg are simpler etc. So account for this 529 * calculations in tcp_sendmsg are simpler etc. So account for this
530 * fact here if necessary. If we don't do this correctly, as a 530 * fact here if necessary. If we don't do this correctly, as a
531 * receiver we won't recognize data packets as being full sized when we 531 * receiver we won't recognize data packets as being full sized when we
532 * should, and thus we won't abide by the delayed ACK rules correctly. 532 * should, and thus we won't abide by the delayed ACK rules correctly.
533 * SACKs don't matter, we never delay an ACK when we have any of those 533 * SACKs don't matter, we never delay an ACK when we have any of those
534 * going out. */ 534 * going out. */
535 opts->mss = tcp_advertise_mss(sk); 535 opts->mss = tcp_advertise_mss(sk);
536 remaining -= TCPOLEN_MSS_ALIGNED; 536 remaining -= TCPOLEN_MSS_ALIGNED;
537 537
538 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { 538 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
539 opts->options |= OPTION_TS; 539 opts->options |= OPTION_TS;
540 opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; 540 opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;
541 opts->tsecr = tp->rx_opt.ts_recent; 541 opts->tsecr = tp->rx_opt.ts_recent;
542 remaining -= TCPOLEN_TSTAMP_ALIGNED; 542 remaining -= TCPOLEN_TSTAMP_ALIGNED;
543 } 543 }
544 if (likely(sysctl_tcp_window_scaling)) { 544 if (likely(sysctl_tcp_window_scaling)) {
545 opts->ws = tp->rx_opt.rcv_wscale; 545 opts->ws = tp->rx_opt.rcv_wscale;
546 opts->options |= OPTION_WSCALE; 546 opts->options |= OPTION_WSCALE;
547 remaining -= TCPOLEN_WSCALE_ALIGNED; 547 remaining -= TCPOLEN_WSCALE_ALIGNED;
548 } 548 }
549 if (likely(sysctl_tcp_sack)) { 549 if (likely(sysctl_tcp_sack)) {
550 opts->options |= OPTION_SACK_ADVERTISE; 550 opts->options |= OPTION_SACK_ADVERTISE;
551 if (unlikely(!(OPTION_TS & opts->options))) 551 if (unlikely(!(OPTION_TS & opts->options)))
552 remaining -= TCPOLEN_SACKPERM_ALIGNED; 552 remaining -= TCPOLEN_SACKPERM_ALIGNED;
553 } 553 }
554 554
555 if (fastopen && fastopen->cookie.len >= 0) { 555 if (fastopen && fastopen->cookie.len >= 0) {
556 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; 556 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
557 need = (need + 3) & ~3U; /* Align to 32 bits */ 557 need = (need + 3) & ~3U; /* Align to 32 bits */
558 if (remaining >= need) { 558 if (remaining >= need) {
559 opts->options |= OPTION_FAST_OPEN_COOKIE; 559 opts->options |= OPTION_FAST_OPEN_COOKIE;
560 opts->fastopen_cookie = &fastopen->cookie; 560 opts->fastopen_cookie = &fastopen->cookie;
561 remaining -= need; 561 remaining -= need;
562 tp->syn_fastopen = 1; 562 tp->syn_fastopen = 1;
563 } 563 }
564 } 564 }
565 565
566 return MAX_TCP_OPTION_SPACE - remaining; 566 return MAX_TCP_OPTION_SPACE - remaining;
567 } 567 }
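
A worked byte budget for a typical SYN built by this function (timestamps, window scaling and SACK permitted, no MD5), using the aligned option sizes from the kernel headers (4, 12 and 4 bytes; SACK_PERM shares the timestamp word):

/*
 *   MAX_TCP_OPTION_SPACE                 40
 *   - MSS                                 4
 *   - timestamps (+ SACK_PERM)           12
 *   - window scale                        4
 *                                       ----
 *   left for a Fast Open cookie          20   (an 8-byte cookie needs 12)
 */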
568 568
569 /* Set up TCP options for SYN-ACKs. */ 569 /* Set up TCP options for SYN-ACKs. */
570 static unsigned int tcp_synack_options(struct sock *sk, 570 static unsigned int tcp_synack_options(struct sock *sk,
571 struct request_sock *req, 571 struct request_sock *req,
572 unsigned int mss, struct sk_buff *skb, 572 unsigned int mss, struct sk_buff *skb,
573 struct tcp_out_options *opts, 573 struct tcp_out_options *opts,
574 struct tcp_md5sig_key **md5, 574 struct tcp_md5sig_key **md5,
575 struct tcp_fastopen_cookie *foc) 575 struct tcp_fastopen_cookie *foc)
576 { 576 {
577 struct inet_request_sock *ireq = inet_rsk(req); 577 struct inet_request_sock *ireq = inet_rsk(req);
578 unsigned int remaining = MAX_TCP_OPTION_SPACE; 578 unsigned int remaining = MAX_TCP_OPTION_SPACE;
579 579
580 #ifdef CONFIG_TCP_MD5SIG 580 #ifdef CONFIG_TCP_MD5SIG
581 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); 581 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
582 if (*md5) { 582 if (*md5) {
583 opts->options |= OPTION_MD5; 583 opts->options |= OPTION_MD5;
584 remaining -= TCPOLEN_MD5SIG_ALIGNED; 584 remaining -= TCPOLEN_MD5SIG_ALIGNED;
585 585
586 /* We can't fit any SACK blocks in a packet with MD5 + TS 586 /* We can't fit any SACK blocks in a packet with MD5 + TS
587 * options. There was discussion about disabling SACK 587 * options. There was discussion about disabling SACK
588 * rather than TS in order to fit in better with old, 588 * rather than TS in order to fit in better with old,
589 * buggy kernels, but that was deemed to be unnecessary. 589 * buggy kernels, but that was deemed to be unnecessary.
590 */ 590 */
591 ireq->tstamp_ok &= !ireq->sack_ok; 591 ireq->tstamp_ok &= !ireq->sack_ok;
592 } 592 }
593 #else 593 #else
594 *md5 = NULL; 594 *md5 = NULL;
595 #endif 595 #endif
596 596
597 /* We always send an MSS option. */ 597 /* We always send an MSS option. */
598 opts->mss = mss; 598 opts->mss = mss;
599 remaining -= TCPOLEN_MSS_ALIGNED; 599 remaining -= TCPOLEN_MSS_ALIGNED;
600 600
601 if (likely(ireq->wscale_ok)) { 601 if (likely(ireq->wscale_ok)) {
602 opts->ws = ireq->rcv_wscale; 602 opts->ws = ireq->rcv_wscale;
603 opts->options |= OPTION_WSCALE; 603 opts->options |= OPTION_WSCALE;
604 remaining -= TCPOLEN_WSCALE_ALIGNED; 604 remaining -= TCPOLEN_WSCALE_ALIGNED;
605 } 605 }
606 if (likely(ireq->tstamp_ok)) { 606 if (likely(ireq->tstamp_ok)) {
607 opts->options |= OPTION_TS; 607 opts->options |= OPTION_TS;
608 opts->tsval = TCP_SKB_CB(skb)->when; 608 opts->tsval = TCP_SKB_CB(skb)->when;
609 opts->tsecr = req->ts_recent; 609 opts->tsecr = req->ts_recent;
610 remaining -= TCPOLEN_TSTAMP_ALIGNED; 610 remaining -= TCPOLEN_TSTAMP_ALIGNED;
611 } 611 }
612 if (likely(ireq->sack_ok)) { 612 if (likely(ireq->sack_ok)) {
613 opts->options |= OPTION_SACK_ADVERTISE; 613 opts->options |= OPTION_SACK_ADVERTISE;
614 if (unlikely(!ireq->tstamp_ok)) 614 if (unlikely(!ireq->tstamp_ok))
615 remaining -= TCPOLEN_SACKPERM_ALIGNED; 615 remaining -= TCPOLEN_SACKPERM_ALIGNED;
616 } 616 }
617 if (foc != NULL) { 617 if (foc != NULL) {
618 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; 618 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
619 need = (need + 3) & ~3U; /* Align to 32 bits */ 619 need = (need + 3) & ~3U; /* Align to 32 bits */
620 if (remaining >= need) { 620 if (remaining >= need) {
621 opts->options |= OPTION_FAST_OPEN_COOKIE; 621 opts->options |= OPTION_FAST_OPEN_COOKIE;
622 opts->fastopen_cookie = foc; 622 opts->fastopen_cookie = foc;
623 remaining -= need; 623 remaining -= need;
624 } 624 }
625 } 625 }
626 626
627 return MAX_TCP_OPTION_SPACE - remaining; 627 return MAX_TCP_OPTION_SPACE - remaining;
628 } 628 }
629 629
630 /* Compute TCP options for ESTABLISHED sockets. This is not the 630 /* Compute TCP options for ESTABLISHED sockets. This is not the
631 * final wire format yet. 631 * final wire format yet.
632 */ 632 */
633 static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, 633 static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
634 struct tcp_out_options *opts, 634 struct tcp_out_options *opts,
635 struct tcp_md5sig_key **md5) 635 struct tcp_md5sig_key **md5)
636 { 636 {
637 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; 637 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
638 struct tcp_sock *tp = tcp_sk(sk); 638 struct tcp_sock *tp = tcp_sk(sk);
639 unsigned int size = 0; 639 unsigned int size = 0;
640 unsigned int eff_sacks; 640 unsigned int eff_sacks;
641 641
642 opts->options = 0; 642 opts->options = 0;
643 643
644 #ifdef CONFIG_TCP_MD5SIG 644 #ifdef CONFIG_TCP_MD5SIG
645 *md5 = tp->af_specific->md5_lookup(sk, sk); 645 *md5 = tp->af_specific->md5_lookup(sk, sk);
646 if (unlikely(*md5)) { 646 if (unlikely(*md5)) {
647 opts->options |= OPTION_MD5; 647 opts->options |= OPTION_MD5;
648 size += TCPOLEN_MD5SIG_ALIGNED; 648 size += TCPOLEN_MD5SIG_ALIGNED;
649 } 649 }
650 #else 650 #else
651 *md5 = NULL; 651 *md5 = NULL;
652 #endif 652 #endif
653 653
654 if (likely(tp->rx_opt.tstamp_ok)) { 654 if (likely(tp->rx_opt.tstamp_ok)) {
655 opts->options |= OPTION_TS; 655 opts->options |= OPTION_TS;
656 opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; 656 opts->tsval = tcb ? tcb->when + tp->tsoffset : 0;
657 opts->tsecr = tp->rx_opt.ts_recent; 657 opts->tsecr = tp->rx_opt.ts_recent;
658 size += TCPOLEN_TSTAMP_ALIGNED; 658 size += TCPOLEN_TSTAMP_ALIGNED;
659 } 659 }
660 660
661 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; 661 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
662 if (unlikely(eff_sacks)) { 662 if (unlikely(eff_sacks)) {
663 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; 663 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
664 opts->num_sack_blocks = 664 opts->num_sack_blocks =
665 min_t(unsigned int, eff_sacks, 665 min_t(unsigned int, eff_sacks,
666 (remaining - TCPOLEN_SACK_BASE_ALIGNED) / 666 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
667 TCPOLEN_SACK_PERBLOCK); 667 TCPOLEN_SACK_PERBLOCK);
668 size += TCPOLEN_SACK_BASE_ALIGNED + 668 size += TCPOLEN_SACK_BASE_ALIGNED +
669 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; 669 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
670 } 670 }
671 671
672 return size; 672 return size;
673 } 673 }
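The SACK sizing in tcp_established_options() is easy to sanity-check outside the kernel. A small sketch, assuming the conventional option sizes (40 bytes of total option space, 12 for aligned timestamps, 4 for the aligned SACK base, 8 per block); with timestamps on, it reproduces the familiar limit of three SACK blocks per segment:

#include <stdio.h>

/* assumed values mirroring the usual definitions */
#define MAX_TCP_OPTION_SPACE        40
#define TCPOLEN_TSTAMP_ALIGNED      12
#define TCPOLEN_SACK_BASE_ALIGNED    4
#define TCPOLEN_SACK_PERBLOCK        8

int main(void)
{
        unsigned int size = TCPOLEN_TSTAMP_ALIGNED;     /* timestamps enabled */
        unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
        unsigned int eff_sacks = 4;                     /* pending SACK/D-SACK blocks */
        unsigned int fit = (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
        unsigned int blocks = eff_sacks < fit ? eff_sacks : fit;

        printf("%u SACK blocks fit, %u option bytes used\n", blocks,
               size + TCPOLEN_SACK_BASE_ALIGNED + blocks * TCPOLEN_SACK_PERBLOCK);
        return 0;
}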
674 674
675 675
676 /* TCP SMALL QUEUES (TSQ) 676 /* TCP SMALL QUEUES (TSQ)
677 * 677 *
678 * TSQ's goal is to keep a small number of skbs per tcp flow in tx queues (qdisc+dev) 678 * TSQ's goal is to keep a small number of skbs per tcp flow in tx queues (qdisc+dev)
679 * to reduce RTT and bufferbloat. 679 * to reduce RTT and bufferbloat.
680 * We do this using a special skb destructor (tcp_wfree). 680 * We do this using a special skb destructor (tcp_wfree).
681 * 681 *
682 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event skb 682 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event skb
683 * needs to be reallocated in a driver. 683 * needs to be reallocated in a driver.
684 * The invariant being skb->truesize substracted from sk->sk_wmem_alloc 684 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
685 * 685 *
686 * Since transmit from skb destructor is forbidden, we use a tasklet 686 * Since transmit from skb destructor is forbidden, we use a tasklet
687 * to process all sockets that eventually need to send more skbs. 687 * to process all sockets that eventually need to send more skbs.
688 * We use one tasklet per cpu, with its own queue of sockets. 688 * We use one tasklet per cpu, with its own queue of sockets.
689 */ 689 */
690 struct tsq_tasklet { 690 struct tsq_tasklet {
691 struct tasklet_struct tasklet; 691 struct tasklet_struct tasklet;
692 struct list_head head; /* queue of tcp sockets */ 692 struct list_head head; /* queue of tcp sockets */
693 }; 693 };
694 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); 694 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
695 695
696 static void tcp_tsq_handler(struct sock *sk) 696 static void tcp_tsq_handler(struct sock *sk)
697 { 697 {
698 if ((1 << sk->sk_state) & 698 if ((1 << sk->sk_state) &
699 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | 699 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
700 TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) 700 TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
701 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); 701 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
702 } 702 }
703 /* 703 /*
704 * One tasklest per cpu tries to send more skbs. 704 * One tasklet per cpu tries to send more skbs.
705 * We run in tasklet context but need to disable irqs when 705 * We run in tasklet context but need to disable irqs when
706 * transfering tsq->head because tcp_wfree() might 706 * transferring tsq->head because tcp_wfree() might
707 * interrupt us (non NAPI drivers) 707 * interrupt us (non NAPI drivers)
708 */ 708 */
709 static void tcp_tasklet_func(unsigned long data) 709 static void tcp_tasklet_func(unsigned long data)
710 { 710 {
711 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; 711 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
712 LIST_HEAD(list); 712 LIST_HEAD(list);
713 unsigned long flags; 713 unsigned long flags;
714 struct list_head *q, *n; 714 struct list_head *q, *n;
715 struct tcp_sock *tp; 715 struct tcp_sock *tp;
716 struct sock *sk; 716 struct sock *sk;
717 717
718 local_irq_save(flags); 718 local_irq_save(flags);
719 list_splice_init(&tsq->head, &list); 719 list_splice_init(&tsq->head, &list);
720 local_irq_restore(flags); 720 local_irq_restore(flags);
721 721
722 list_for_each_safe(q, n, &list) { 722 list_for_each_safe(q, n, &list) {
723 tp = list_entry(q, struct tcp_sock, tsq_node); 723 tp = list_entry(q, struct tcp_sock, tsq_node);
724 list_del(&tp->tsq_node); 724 list_del(&tp->tsq_node);
725 725
726 sk = (struct sock *)tp; 726 sk = (struct sock *)tp;
727 bh_lock_sock(sk); 727 bh_lock_sock(sk);
728 728
729 if (!sock_owned_by_user(sk)) { 729 if (!sock_owned_by_user(sk)) {
730 tcp_tsq_handler(sk); 730 tcp_tsq_handler(sk);
731 } else { 731 } else {
732 /* defer the work to tcp_release_cb() */ 732 /* defer the work to tcp_release_cb() */
733 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); 733 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
734 } 734 }
735 bh_unlock_sock(sk); 735 bh_unlock_sock(sk);
736 736
737 clear_bit(TSQ_QUEUED, &tp->tsq_flags); 737 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
738 sk_free(sk); 738 sk_free(sk);
739 } 739 }
740 } 740 }
741 741
742 #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ 742 #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
743 (1UL << TCP_WRITE_TIMER_DEFERRED) | \ 743 (1UL << TCP_WRITE_TIMER_DEFERRED) | \
744 (1UL << TCP_DELACK_TIMER_DEFERRED) | \ 744 (1UL << TCP_DELACK_TIMER_DEFERRED) | \
745 (1UL << TCP_MTU_REDUCED_DEFERRED)) 745 (1UL << TCP_MTU_REDUCED_DEFERRED))
746 /** 746 /**
747 * tcp_release_cb - tcp release_sock() callback 747 * tcp_release_cb - tcp release_sock() callback
748 * @sk: socket 748 * @sk: socket
749 * 749 *
750 * called from release_sock() to perform protocol dependent 750 * called from release_sock() to perform protocol dependent
751 * actions before socket release. 751 * actions before socket release.
752 */ 752 */
753 void tcp_release_cb(struct sock *sk) 753 void tcp_release_cb(struct sock *sk)
754 { 754 {
755 struct tcp_sock *tp = tcp_sk(sk); 755 struct tcp_sock *tp = tcp_sk(sk);
756 unsigned long flags, nflags; 756 unsigned long flags, nflags;
757 757
758 /* perform an atomic operation only if at least one flag is set */ 758 /* perform an atomic operation only if at least one flag is set */
759 do { 759 do {
760 flags = tp->tsq_flags; 760 flags = tp->tsq_flags;
761 if (!(flags & TCP_DEFERRED_ALL)) 761 if (!(flags & TCP_DEFERRED_ALL))
762 return; 762 return;
763 nflags = flags & ~TCP_DEFERRED_ALL; 763 nflags = flags & ~TCP_DEFERRED_ALL;
764 } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); 764 } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
765 765
766 if (flags & (1UL << TCP_TSQ_DEFERRED)) 766 if (flags & (1UL << TCP_TSQ_DEFERRED))
767 tcp_tsq_handler(sk); 767 tcp_tsq_handler(sk);
768 768
769 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { 769 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
770 tcp_write_timer_handler(sk); 770 tcp_write_timer_handler(sk);
771 __sock_put(sk); 771 __sock_put(sk);
772 } 772 }
773 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { 773 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
774 tcp_delack_timer_handler(sk); 774 tcp_delack_timer_handler(sk);
775 __sock_put(sk); 775 __sock_put(sk);
776 } 776 }
777 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { 777 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
778 sk->sk_prot->mtu_reduced(sk); 778 sk->sk_prot->mtu_reduced(sk);
779 __sock_put(sk); 779 __sock_put(sk);
780 } 780 }
781 } 781 }
782 EXPORT_SYMBOL(tcp_release_cb); 782 EXPORT_SYMBOL(tcp_release_cb);
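The flag handling in tcp_release_cb() is a claim-and-clear loop: read the flags, return cheaply if nothing is deferred, otherwise clear the deferred bits with a compare-and-swap and act only on the bits that were actually claimed. A user-space sketch of the same pattern with C11 atomics (DEFERRED_ALL here is an arbitrary stand-in mask, not the kernel constant):

#include <stdatomic.h>
#include <stdio.h>

#define DEFERRED_ALL 0x0fUL             /* stand-in for TCP_DEFERRED_ALL */

static unsigned long claim_deferred(atomic_ulong *tsq_flags)
{
        unsigned long flags, nflags;

        do {
                flags = atomic_load(tsq_flags);
                if (!(flags & DEFERRED_ALL))
                        return 0;       /* fast path: no atomic RMW needed */
                nflags = flags & ~DEFERRED_ALL;
        } while (!atomic_compare_exchange_weak(tsq_flags, &flags, nflags));

        return flags & DEFERRED_ALL;    /* these bits are now owned by the caller */
}

int main(void)
{
        atomic_ulong tsq_flags = 0x05;

        printf("claimed %#lx, remaining %#lx\n",
               claim_deferred(&tsq_flags), (unsigned long)atomic_load(&tsq_flags));
        return 0;
}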
783 783
784 void __init tcp_tasklet_init(void) 784 void __init tcp_tasklet_init(void)
785 { 785 {
786 int i; 786 int i;
787 787
788 for_each_possible_cpu(i) { 788 for_each_possible_cpu(i) {
789 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); 789 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
790 790
791 INIT_LIST_HEAD(&tsq->head); 791 INIT_LIST_HEAD(&tsq->head);
792 tasklet_init(&tsq->tasklet, 792 tasklet_init(&tsq->tasklet,
793 tcp_tasklet_func, 793 tcp_tasklet_func,
794 (unsigned long)tsq); 794 (unsigned long)tsq);
795 } 795 }
796 } 796 }
797 797
798 /* 798 /*
799 * Write buffer destructor automatically called from kfree_skb. 799 * Write buffer destructor automatically called from kfree_skb.
800 * We cant xmit new skbs from this context, as we might already 800 * We can't xmit new skbs from this context, as we might already
801 * hold qdisc lock. 801 * hold qdisc lock.
802 */ 802 */
803 void tcp_wfree(struct sk_buff *skb) 803 void tcp_wfree(struct sk_buff *skb)
804 { 804 {
805 struct sock *sk = skb->sk; 805 struct sock *sk = skb->sk;
806 struct tcp_sock *tp = tcp_sk(sk); 806 struct tcp_sock *tp = tcp_sk(sk);
807 807
808 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && 808 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
809 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { 809 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
810 unsigned long flags; 810 unsigned long flags;
811 struct tsq_tasklet *tsq; 811 struct tsq_tasklet *tsq;
812 812
813 /* Keep a ref on socket. 813 /* Keep a ref on socket.
814 * This last ref will be released in tcp_tasklet_func() 814 * This last ref will be released in tcp_tasklet_func()
815 */ 815 */
816 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); 816 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
817 817
818 /* queue this socket to tasklet queue */ 818 /* queue this socket to tasklet queue */
819 local_irq_save(flags); 819 local_irq_save(flags);
820 tsq = &__get_cpu_var(tsq_tasklet); 820 tsq = &__get_cpu_var(tsq_tasklet);
821 list_add(&tp->tsq_node, &tsq->head); 821 list_add(&tp->tsq_node, &tsq->head);
822 tasklet_schedule(&tsq->tasklet); 822 tasklet_schedule(&tsq->tasklet);
823 local_irq_restore(flags); 823 local_irq_restore(flags);
824 } else { 824 } else {
825 sock_wfree(skb); 825 sock_wfree(skb);
826 } 826 }
827 } 827 }
828 828
829 /* This routine actually transmits TCP packets queued in by 829 /* This routine actually transmits TCP packets queued in by
830 * tcp_do_sendmsg(). This is used by both the initial 830 * tcp_do_sendmsg(). This is used by both the initial
831 * transmission and possible later retransmissions. 831 * transmission and possible later retransmissions.
832 * All SKB's seen here are completely headerless. It is our 832 * All SKB's seen here are completely headerless. It is our
833 * job to build the TCP header, and pass the packet down to 833 * job to build the TCP header, and pass the packet down to
834 * IP so it can do the same plus pass the packet off to the 834 * IP so it can do the same plus pass the packet off to the
835 * device. 835 * device.
836 * 836 *
837 * We are working here with either a clone of the original 837 * We are working here with either a clone of the original
838 * SKB, or a fresh unique copy made by the retransmit engine. 838 * SKB, or a fresh unique copy made by the retransmit engine.
839 */ 839 */
840 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, 840 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
841 gfp_t gfp_mask) 841 gfp_t gfp_mask)
842 { 842 {
843 const struct inet_connection_sock *icsk = inet_csk(sk); 843 const struct inet_connection_sock *icsk = inet_csk(sk);
844 struct inet_sock *inet; 844 struct inet_sock *inet;
845 struct tcp_sock *tp; 845 struct tcp_sock *tp;
846 struct tcp_skb_cb *tcb; 846 struct tcp_skb_cb *tcb;
847 struct tcp_out_options opts; 847 struct tcp_out_options opts;
848 unsigned int tcp_options_size, tcp_header_size; 848 unsigned int tcp_options_size, tcp_header_size;
849 struct tcp_md5sig_key *md5; 849 struct tcp_md5sig_key *md5;
850 struct tcphdr *th; 850 struct tcphdr *th;
851 int err; 851 int err;
852 852
853 BUG_ON(!skb || !tcp_skb_pcount(skb)); 853 BUG_ON(!skb || !tcp_skb_pcount(skb));
854 854
855 if (clone_it) { 855 if (clone_it) {
856 const struct sk_buff *fclone = skb + 1; 856 const struct sk_buff *fclone = skb + 1;
857 857
858 /* If congestion control is doing timestamping, we must 858 /* If congestion control is doing timestamping, we must
859 * take such a timestamp before we potentially clone/copy. 859 * take such a timestamp before we potentially clone/copy.
860 */ 860 */
861 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) 861 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
862 __net_timestamp(skb); 862 __net_timestamp(skb);
863 863
864 if (unlikely(skb->fclone == SKB_FCLONE_ORIG && 864 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
865 fclone->fclone == SKB_FCLONE_CLONE)) 865 fclone->fclone == SKB_FCLONE_CLONE))
866 NET_INC_STATS_BH(sock_net(sk), 866 NET_INC_STATS_BH(sock_net(sk),
867 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); 867 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
868 868
869 if (unlikely(skb_cloned(skb))) 869 if (unlikely(skb_cloned(skb)))
870 skb = pskb_copy(skb, gfp_mask); 870 skb = pskb_copy(skb, gfp_mask);
871 else 871 else
872 skb = skb_clone(skb, gfp_mask); 872 skb = skb_clone(skb, gfp_mask);
873 if (unlikely(!skb)) 873 if (unlikely(!skb))
874 return -ENOBUFS; 874 return -ENOBUFS;
875 } 875 }
876 876
877 inet = inet_sk(sk); 877 inet = inet_sk(sk);
878 tp = tcp_sk(sk); 878 tp = tcp_sk(sk);
879 tcb = TCP_SKB_CB(skb); 879 tcb = TCP_SKB_CB(skb);
880 memset(&opts, 0, sizeof(opts)); 880 memset(&opts, 0, sizeof(opts));
881 881
882 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) 882 if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
883 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); 883 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
884 else 884 else
885 tcp_options_size = tcp_established_options(sk, skb, &opts, 885 tcp_options_size = tcp_established_options(sk, skb, &opts,
886 &md5); 886 &md5);
887 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); 887 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
888 888
889 if (tcp_packets_in_flight(tp) == 0) 889 if (tcp_packets_in_flight(tp) == 0)
890 tcp_ca_event(sk, CA_EVENT_TX_START); 890 tcp_ca_event(sk, CA_EVENT_TX_START);
891 891
892 /* if no packet is in qdisc/device queue, then allow XPS to select 892 /* if no packet is in qdisc/device queue, then allow XPS to select
893 * another queue. 893 * another queue.
894 */ 894 */
895 skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; 895 skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
896 896
897 skb_push(skb, tcp_header_size); 897 skb_push(skb, tcp_header_size);
898 skb_reset_transport_header(skb); 898 skb_reset_transport_header(skb);
899 899
900 skb_orphan(skb); 900 skb_orphan(skb);
901 skb->sk = sk; 901 skb->sk = sk;
902 skb->destructor = tcp_wfree; 902 skb->destructor = tcp_wfree;
903 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 903 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
904 904
905 /* Build TCP header and checksum it. */ 905 /* Build TCP header and checksum it. */
906 th = tcp_hdr(skb); 906 th = tcp_hdr(skb);
907 th->source = inet->inet_sport; 907 th->source = inet->inet_sport;
908 th->dest = inet->inet_dport; 908 th->dest = inet->inet_dport;
909 th->seq = htonl(tcb->seq); 909 th->seq = htonl(tcb->seq);
910 th->ack_seq = htonl(tp->rcv_nxt); 910 th->ack_seq = htonl(tp->rcv_nxt);
911 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 911 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
912 tcb->tcp_flags); 912 tcb->tcp_flags);
913 913
914 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { 914 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
915 /* RFC1323: The window in SYN & SYN/ACK segments 915 /* RFC1323: The window in SYN & SYN/ACK segments
916 * is never scaled. 916 * is never scaled.
917 */ 917 */
918 th->window = htons(min(tp->rcv_wnd, 65535U)); 918 th->window = htons(min(tp->rcv_wnd, 65535U));
919 } else { 919 } else {
920 th->window = htons(tcp_select_window(sk)); 920 th->window = htons(tcp_select_window(sk));
921 } 921 }
922 th->check = 0; 922 th->check = 0;
923 th->urg_ptr = 0; 923 th->urg_ptr = 0;
924 924
925 /* The urg_mode check is necessary during a below snd_una win probe */ 925 /* The urg_mode check is necessary during a below snd_una win probe */
926 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { 926 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
927 if (before(tp->snd_up, tcb->seq + 0x10000)) { 927 if (before(tp->snd_up, tcb->seq + 0x10000)) {
928 th->urg_ptr = htons(tp->snd_up - tcb->seq); 928 th->urg_ptr = htons(tp->snd_up - tcb->seq);
929 th->urg = 1; 929 th->urg = 1;
930 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { 930 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
931 th->urg_ptr = htons(0xFFFF); 931 th->urg_ptr = htons(0xFFFF);
932 th->urg = 1; 932 th->urg = 1;
933 } 933 }
934 } 934 }
935 935
936 tcp_options_write((__be32 *)(th + 1), tp, &opts); 936 tcp_options_write((__be32 *)(th + 1), tp, &opts);
937 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) 937 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
938 TCP_ECN_send(sk, skb, tcp_header_size); 938 TCP_ECN_send(sk, skb, tcp_header_size);
939 939
940 #ifdef CONFIG_TCP_MD5SIG 940 #ifdef CONFIG_TCP_MD5SIG
941 /* Calculate the MD5 hash, as we have all we need now */ 941 /* Calculate the MD5 hash, as we have all we need now */
942 if (md5) { 942 if (md5) {
943 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 943 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
944 tp->af_specific->calc_md5_hash(opts.hash_location, 944 tp->af_specific->calc_md5_hash(opts.hash_location,
945 md5, sk, NULL, skb); 945 md5, sk, NULL, skb);
946 } 946 }
947 #endif 947 #endif
948 948
949 icsk->icsk_af_ops->send_check(sk, skb); 949 icsk->icsk_af_ops->send_check(sk, skb);
950 950
951 if (likely(tcb->tcp_flags & TCPHDR_ACK)) 951 if (likely(tcb->tcp_flags & TCPHDR_ACK))
952 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 952 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
953 953
954 if (skb->len != tcp_header_size) 954 if (skb->len != tcp_header_size)
955 tcp_event_data_sent(tp, sk); 955 tcp_event_data_sent(tp, sk);
956 956
957 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) 957 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
958 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, 958 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
959 tcp_skb_pcount(skb)); 959 tcp_skb_pcount(skb));
960 960
961 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); 961 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
962 if (likely(err <= 0)) 962 if (likely(err <= 0))
963 return err; 963 return err;
964 964
965 tcp_enter_cwr(sk, 1); 965 tcp_enter_cwr(sk, 1);
966 966
967 return net_xmit_eval(err); 967 return net_xmit_eval(err);
968 } 968 }
969 969
970 /* This routine just queues the buffer for sending. 970 /* This routine just queues the buffer for sending.
971 * 971 *
972 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, 972 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
973 * otherwise socket can stall. 973 * otherwise socket can stall.
974 */ 974 */
975 static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) 975 static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
976 { 976 {
977 struct tcp_sock *tp = tcp_sk(sk); 977 struct tcp_sock *tp = tcp_sk(sk);
978 978
979 /* Advance write_seq and place onto the write_queue. */ 979 /* Advance write_seq and place onto the write_queue. */
980 tp->write_seq = TCP_SKB_CB(skb)->end_seq; 980 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
981 skb_header_release(skb); 981 skb_header_release(skb);
982 tcp_add_write_queue_tail(sk, skb); 982 tcp_add_write_queue_tail(sk, skb);
983 sk->sk_wmem_queued += skb->truesize; 983 sk->sk_wmem_queued += skb->truesize;
984 sk_mem_charge(sk, skb->truesize); 984 sk_mem_charge(sk, skb->truesize);
985 } 985 }
986 986
987 /* Initialize TSO segments for a packet. */ 987 /* Initialize TSO segments for a packet. */
988 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, 988 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
989 unsigned int mss_now) 989 unsigned int mss_now)
990 { 990 {
991 struct skb_shared_info *shinfo = skb_shinfo(skb); 991 struct skb_shared_info *shinfo = skb_shinfo(skb);
992 992
993 /* Make sure we own this skb before messing gso_size/gso_segs */ 993 /* Make sure we own this skb before messing gso_size/gso_segs */
994 WARN_ON_ONCE(skb_cloned(skb)); 994 WARN_ON_ONCE(skb_cloned(skb));
995 995
996 if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { 996 if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
997 /* Avoid the costly divide in the normal 997 /* Avoid the costly divide in the normal
998 * non-TSO case. 998 * non-TSO case.
999 */ 999 */
1000 shinfo->gso_segs = 1; 1000 shinfo->gso_segs = 1;
1001 shinfo->gso_size = 0; 1001 shinfo->gso_size = 0;
1002 shinfo->gso_type = 0; 1002 shinfo->gso_type = 0;
1003 } else { 1003 } else {
1004 shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); 1004 shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
1005 shinfo->gso_size = mss_now; 1005 shinfo->gso_size = mss_now;
1006 shinfo->gso_type = sk->sk_gso_type; 1006 shinfo->gso_type = sk->sk_gso_type;
1007 } 1007 }
1008 } 1008 }
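The gso_segs value above is just a ceiling division of the payload by the current MSS, with the sub-MSS case short-circuited to a single segment. A quick sketch (the MSS of 1448 is only an example, as for a connection with timestamps):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))      /* as in the kernel */

int main(void)
{
        unsigned int mss_now = 1448;
        unsigned int lens[] = { 1000, 1448, 1449, 64000 };
        unsigned int i;

        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
                printf("len %5u -> gso_segs %u\n", lens[i],
                       lens[i] <= mss_now ? 1 : DIV_ROUND_UP(lens[i], mss_now));
        return 0;
}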
1009 1009
1010 /* When a modification to fackets_out becomes necessary, we need to check 1010 /* When a modification to fackets_out becomes necessary, we need to check
1011 * whether the skb is counted in fackets_out or not. 1011 * whether the skb is counted in fackets_out or not.
1012 */ 1012 */
1013 static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, 1013 static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
1014 int decr) 1014 int decr)
1015 { 1015 {
1016 struct tcp_sock *tp = tcp_sk(sk); 1016 struct tcp_sock *tp = tcp_sk(sk);
1017 1017
1018 if (!tp->sacked_out || tcp_is_reno(tp)) 1018 if (!tp->sacked_out || tcp_is_reno(tp))
1019 return; 1019 return;
1020 1020
1021 if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) 1021 if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
1022 tp->fackets_out -= decr; 1022 tp->fackets_out -= decr;
1023 } 1023 }
1024 1024
1025 /* Pcount in the middle of the write queue got changed; we need to do various 1025 /* Pcount in the middle of the write queue got changed; we need to do various
1026 * tweaks to fix the counters 1026 * tweaks to fix the counters
1027 */ 1027 */
1028 static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) 1028 static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1029 { 1029 {
1030 struct tcp_sock *tp = tcp_sk(sk); 1030 struct tcp_sock *tp = tcp_sk(sk);
1031 1031
1032 tp->packets_out -= decr; 1032 tp->packets_out -= decr;
1033 1033
1034 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 1034 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1035 tp->sacked_out -= decr; 1035 tp->sacked_out -= decr;
1036 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) 1036 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1037 tp->retrans_out -= decr; 1037 tp->retrans_out -= decr;
1038 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) 1038 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1039 tp->lost_out -= decr; 1039 tp->lost_out -= decr;
1040 1040
1041 /* Reno case is special. Sigh... */ 1041 /* Reno case is special. Sigh... */
1042 if (tcp_is_reno(tp) && decr > 0) 1042 if (tcp_is_reno(tp) && decr > 0)
1043 tp->sacked_out -= min_t(u32, tp->sacked_out, decr); 1043 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1044 1044
1045 tcp_adjust_fackets_out(sk, skb, decr); 1045 tcp_adjust_fackets_out(sk, skb, decr);
1046 1046
1047 if (tp->lost_skb_hint && 1047 if (tp->lost_skb_hint &&
1048 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && 1048 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1049 (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) 1049 (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
1050 tp->lost_cnt_hint -= decr; 1050 tp->lost_cnt_hint -= decr;
1051 1051
1052 tcp_verify_left_out(tp); 1052 tcp_verify_left_out(tp);
1053 } 1053 }
1054 1054
1055 /* Function to create two new TCP segments. Shrinks the given segment 1055 /* Function to create two new TCP segments. Shrinks the given segment
1056 * to the specified size and appends a new segment with the rest of the 1056 * to the specified size and appends a new segment with the rest of the
1057 * packet to the list. This won't be called frequently, I hope. 1057 * packet to the list. This won't be called frequently, I hope.
1058 * Remember, these are still headerless SKBs at this point. 1058 * Remember, these are still headerless SKBs at this point.
1059 */ 1059 */
1060 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, 1060 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1061 unsigned int mss_now) 1061 unsigned int mss_now)
1062 { 1062 {
1063 struct tcp_sock *tp = tcp_sk(sk); 1063 struct tcp_sock *tp = tcp_sk(sk);
1064 struct sk_buff *buff; 1064 struct sk_buff *buff;
1065 int nsize, old_factor; 1065 int nsize, old_factor;
1066 int nlen; 1066 int nlen;
1067 u8 flags; 1067 u8 flags;
1068 1068
1069 if (WARN_ON(len > skb->len)) 1069 if (WARN_ON(len > skb->len))
1070 return -EINVAL; 1070 return -EINVAL;
1071 1071
1072 nsize = skb_headlen(skb) - len; 1072 nsize = skb_headlen(skb) - len;
1073 if (nsize < 0) 1073 if (nsize < 0)
1074 nsize = 0; 1074 nsize = 0;
1075 1075
1076 if (skb_unclone(skb, GFP_ATOMIC)) 1076 if (skb_unclone(skb, GFP_ATOMIC))
1077 return -ENOMEM; 1077 return -ENOMEM;
1078 1078
1079 /* Get a new skb... force flag on. */ 1079 /* Get a new skb... force flag on. */
1080 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); 1080 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
1081 if (buff == NULL) 1081 if (buff == NULL)
1082 return -ENOMEM; /* We'll just try again later. */ 1082 return -ENOMEM; /* We'll just try again later. */
1083 1083
1084 sk->sk_wmem_queued += buff->truesize; 1084 sk->sk_wmem_queued += buff->truesize;
1085 sk_mem_charge(sk, buff->truesize); 1085 sk_mem_charge(sk, buff->truesize);
1086 nlen = skb->len - len - nsize; 1086 nlen = skb->len - len - nsize;
1087 buff->truesize += nlen; 1087 buff->truesize += nlen;
1088 skb->truesize -= nlen; 1088 skb->truesize -= nlen;
1089 1089
1090 /* Correct the sequence numbers. */ 1090 /* Correct the sequence numbers. */
1091 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; 1091 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1092 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; 1092 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1093 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; 1093 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1094 1094
1095 /* PSH and FIN should only be set in the second packet. */ 1095 /* PSH and FIN should only be set in the second packet. */
1096 flags = TCP_SKB_CB(skb)->tcp_flags; 1096 flags = TCP_SKB_CB(skb)->tcp_flags;
1097 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); 1097 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1098 TCP_SKB_CB(buff)->tcp_flags = flags; 1098 TCP_SKB_CB(buff)->tcp_flags = flags;
1099 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; 1099 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1100 1100
1101 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { 1101 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
1102 /* Copy and checksum data tail into the new buffer. */ 1102 /* Copy and checksum data tail into the new buffer. */
1103 buff->csum = csum_partial_copy_nocheck(skb->data + len, 1103 buff->csum = csum_partial_copy_nocheck(skb->data + len,
1104 skb_put(buff, nsize), 1104 skb_put(buff, nsize),
1105 nsize, 0); 1105 nsize, 0);
1106 1106
1107 skb_trim(skb, len); 1107 skb_trim(skb, len);
1108 1108
1109 skb->csum = csum_block_sub(skb->csum, buff->csum, len); 1109 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
1110 } else { 1110 } else {
1111 skb->ip_summed = CHECKSUM_PARTIAL; 1111 skb->ip_summed = CHECKSUM_PARTIAL;
1112 skb_split(skb, buff, len); 1112 skb_split(skb, buff, len);
1113 } 1113 }
1114 1114
1115 buff->ip_summed = skb->ip_summed; 1115 buff->ip_summed = skb->ip_summed;
1116 1116
1117 /* Looks stupid, but our code really uses the 'when' field of 1117 /* Looks stupid, but our code really uses the 'when' field of
1118 * skbs which it has never sent before. --ANK 1118 * skbs which it has never sent before. --ANK
1119 */ 1119 */
1120 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 1120 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
1121 buff->tstamp = skb->tstamp; 1121 buff->tstamp = skb->tstamp;
1122 1122
1123 old_factor = tcp_skb_pcount(skb); 1123 old_factor = tcp_skb_pcount(skb);
1124 1124
1125 /* Fix up tso_factor for both original and new SKB. */ 1125 /* Fix up tso_factor for both original and new SKB. */
1126 tcp_set_skb_tso_segs(sk, skb, mss_now); 1126 tcp_set_skb_tso_segs(sk, skb, mss_now);
1127 tcp_set_skb_tso_segs(sk, buff, mss_now); 1127 tcp_set_skb_tso_segs(sk, buff, mss_now);
1128 1128
1129 /* If this packet has been sent out already, we must 1129 /* If this packet has been sent out already, we must
1130 * adjust the various packet counters. 1130 * adjust the various packet counters.
1131 */ 1131 */
1132 if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { 1132 if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1133 int diff = old_factor - tcp_skb_pcount(skb) - 1133 int diff = old_factor - tcp_skb_pcount(skb) -
1134 tcp_skb_pcount(buff); 1134 tcp_skb_pcount(buff);
1135 1135
1136 if (diff) 1136 if (diff)
1137 tcp_adjust_pcount(sk, skb, diff); 1137 tcp_adjust_pcount(sk, skb, diff);
1138 } 1138 }
1139 1139
1140 /* Link BUFF into the send queue. */ 1140 /* Link BUFF into the send queue. */
1141 skb_header_release(buff); 1141 skb_header_release(buff);
1142 tcp_insert_write_queue_after(skb, buff, sk); 1142 tcp_insert_write_queue_after(skb, buff, sk);
1143 1143
1144 return 0; 1144 return 0;
1145 } 1145 }
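The sequence bookkeeping in tcp_fragment() is easiest to follow with concrete numbers. A sketch using a hypothetical segment covering [1000, 4000) split at len = 1460 (values chosen only for illustration):

#include <stdio.h>

int main(void)
{
        unsigned int seq = 1000, end_seq = 4000, len = 1460;

        unsigned int buff_seq = seq + len;      /* new skb starts at the cut */
        unsigned int buff_end = end_seq;        /* and inherits the old end */

        end_seq = buff_seq;                     /* original skb now ends at the cut */

        printf("skb  [%u, %u)\n", seq, end_seq);        /* [1000, 2460) */
        printf("buff [%u, %u)\n", buff_seq, buff_end);  /* [2460, 4000) */
        return 0;
}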
1146 1146
1147 /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c 1147 /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
1148 * eventually). The difference is that the pulled data is not copied, but 1148 * eventually). The difference is that the pulled data is not copied, but
1149 * immediately discarded. 1149 * immediately discarded.
1150 */ 1150 */
1151 static void __pskb_trim_head(struct sk_buff *skb, int len) 1151 static void __pskb_trim_head(struct sk_buff *skb, int len)
1152 { 1152 {
1153 struct skb_shared_info *shinfo; 1153 struct skb_shared_info *shinfo;
1154 int i, k, eat; 1154 int i, k, eat;
1155 1155
1156 eat = min_t(int, len, skb_headlen(skb)); 1156 eat = min_t(int, len, skb_headlen(skb));
1157 if (eat) { 1157 if (eat) {
1158 __skb_pull(skb, eat); 1158 __skb_pull(skb, eat);
1159 len -= eat; 1159 len -= eat;
1160 if (!len) 1160 if (!len)
1161 return; 1161 return;
1162 } 1162 }
1163 eat = len; 1163 eat = len;
1164 k = 0; 1164 k = 0;
1165 shinfo = skb_shinfo(skb); 1165 shinfo = skb_shinfo(skb);
1166 for (i = 0; i < shinfo->nr_frags; i++) { 1166 for (i = 0; i < shinfo->nr_frags; i++) {
1167 int size = skb_frag_size(&shinfo->frags[i]); 1167 int size = skb_frag_size(&shinfo->frags[i]);
1168 1168
1169 if (size <= eat) { 1169 if (size <= eat) {
1170 skb_frag_unref(skb, i); 1170 skb_frag_unref(skb, i);
1171 eat -= size; 1171 eat -= size;
1172 } else { 1172 } else {
1173 shinfo->frags[k] = shinfo->frags[i]; 1173 shinfo->frags[k] = shinfo->frags[i];
1174 if (eat) { 1174 if (eat) {
1175 shinfo->frags[k].page_offset += eat; 1175 shinfo->frags[k].page_offset += eat;
1176 skb_frag_size_sub(&shinfo->frags[k], eat); 1176 skb_frag_size_sub(&shinfo->frags[k], eat);
1177 eat = 0; 1177 eat = 0;
1178 } 1178 }
1179 k++; 1179 k++;
1180 } 1180 }
1181 } 1181 }
1182 shinfo->nr_frags = k; 1182 shinfo->nr_frags = k;
1183 1183
1184 skb_reset_tail_pointer(skb); 1184 skb_reset_tail_pointer(skb);
1185 skb->data_len -= len; 1185 skb->data_len -= len;
1186 skb->len = skb->data_len; 1186 skb->len = skb->data_len;
1187 } 1187 }
1188 1188
1189 /* Remove acked data from a packet in the transmit queue. */ 1189 /* Remove acked data from a packet in the transmit queue. */
1190 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) 1190 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1191 { 1191 {
1192 if (skb_unclone(skb, GFP_ATOMIC)) 1192 if (skb_unclone(skb, GFP_ATOMIC))
1193 return -ENOMEM; 1193 return -ENOMEM;
1194 1194
1195 __pskb_trim_head(skb, len); 1195 __pskb_trim_head(skb, len);
1196 1196
1197 TCP_SKB_CB(skb)->seq += len; 1197 TCP_SKB_CB(skb)->seq += len;
1198 skb->ip_summed = CHECKSUM_PARTIAL; 1198 skb->ip_summed = CHECKSUM_PARTIAL;
1199 1199
1200 skb->truesize -= len; 1200 skb->truesize -= len;
1201 sk->sk_wmem_queued -= len; 1201 sk->sk_wmem_queued -= len;
1202 sk_mem_uncharge(sk, len); 1202 sk_mem_uncharge(sk, len);
1203 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); 1203 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1204 1204
1205 /* Any change of skb->len requires recalculation of tso factor. */ 1205 /* Any change of skb->len requires recalculation of tso factor. */
1206 if (tcp_skb_pcount(skb) > 1) 1206 if (tcp_skb_pcount(skb) > 1)
1207 tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); 1207 tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
1208 1208
1209 return 0; 1209 return 0;
1210 } 1210 }
1211 1211
1212 /* Calculate MSS, not accounting for any TCP options. */ 1212 /* Calculate MSS, not accounting for any TCP options. */
1213 static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) 1213 static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
1214 { 1214 {
1215 const struct tcp_sock *tp = tcp_sk(sk); 1215 const struct tcp_sock *tp = tcp_sk(sk);
1216 const struct inet_connection_sock *icsk = inet_csk(sk); 1216 const struct inet_connection_sock *icsk = inet_csk(sk);
1217 int mss_now; 1217 int mss_now;
1218 1218
1219 /* Calculate base mss without TCP options: 1219 /* Calculate base mss without TCP options:
1220 It is MMS_S - sizeof(tcphdr) of rfc1122 1220 It is MMS_S - sizeof(tcphdr) of rfc1122
1221 */ 1221 */
1222 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); 1222 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1223 1223
1224 /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ 1224 /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1225 if (icsk->icsk_af_ops->net_frag_header_len) { 1225 if (icsk->icsk_af_ops->net_frag_header_len) {
1226 const struct dst_entry *dst = __sk_dst_get(sk); 1226 const struct dst_entry *dst = __sk_dst_get(sk);
1227 1227
1228 if (dst && dst_allfrag(dst)) 1228 if (dst && dst_allfrag(dst))
1229 mss_now -= icsk->icsk_af_ops->net_frag_header_len; 1229 mss_now -= icsk->icsk_af_ops->net_frag_header_len;
1230 } 1230 }
1231 1231
1232 /* Clamp it (mss_clamp does not include tcp options) */ 1232 /* Clamp it (mss_clamp does not include tcp options) */
1233 if (mss_now > tp->rx_opt.mss_clamp) 1233 if (mss_now > tp->rx_opt.mss_clamp)
1234 mss_now = tp->rx_opt.mss_clamp; 1234 mss_now = tp->rx_opt.mss_clamp;
1235 1235
1236 /* Now subtract optional transport overhead */ 1236 /* Now subtract optional transport overhead */
1237 mss_now -= icsk->icsk_ext_hdr_len; 1237 mss_now -= icsk->icsk_ext_hdr_len;
1238 1238
1239 /* Then reserve room for full set of TCP options and 8 bytes of data */ 1239 /* Then reserve room for full set of TCP options and 8 bytes of data */
1240 if (mss_now < 48) 1240 if (mss_now < 48)
1241 mss_now = 48; 1241 mss_now = 48;
1242 return mss_now; 1242 return mss_now;
1243 } 1243 }
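Plugging ordinary IPv4-over-Ethernet numbers into __tcp_mtu_to_mss() makes the bookkeeping concrete. A sketch with assumed values (1500-byte path MTU, 20-byte IPv4 header, no extension headers, default clamp), not read from a live socket:

#include <stdio.h>

int main(void)
{
        int pmtu = 1500;                /* assumed Ethernet path MTU */
        int net_header_len = 20;        /* IPv4 without options */
        int ext_hdr_len = 0;            /* no IPsec etc. */
        int mss_clamp = 65535;
        int mss_now = pmtu - net_header_len - 20;       /* 20 == sizeof(struct tcphdr) */

        if (mss_now > mss_clamp)
                mss_now = mss_clamp;
        mss_now -= ext_hdr_len;
        if (mss_now < 48)               /* room for full options + 8 data bytes */
                mss_now = 48;

        printf("base mss = %d\n", mss_now);     /* 1460 for plain IPv4/Ethernet */
        return 0;
}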
1244 1244
1245 /* Calculate MSS. Not accounting for SACKs here. */ 1245 /* Calculate MSS. Not accounting for SACKs here. */
1246 int tcp_mtu_to_mss(struct sock *sk, int pmtu) 1246 int tcp_mtu_to_mss(struct sock *sk, int pmtu)
1247 { 1247 {
1248 /* Subtract TCP options size, not including SACKs */ 1248 /* Subtract TCP options size, not including SACKs */
1249 return __tcp_mtu_to_mss(sk, pmtu) - 1249 return __tcp_mtu_to_mss(sk, pmtu) -
1250 (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); 1250 (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1251 } 1251 }
1252 1252
1253 /* Inverse of above */ 1253 /* Inverse of above */
1254 int tcp_mss_to_mtu(struct sock *sk, int mss) 1254 int tcp_mss_to_mtu(struct sock *sk, int mss)
1255 { 1255 {
1256 const struct tcp_sock *tp = tcp_sk(sk); 1256 const struct tcp_sock *tp = tcp_sk(sk);
1257 const struct inet_connection_sock *icsk = inet_csk(sk); 1257 const struct inet_connection_sock *icsk = inet_csk(sk);
1258 int mtu; 1258 int mtu;
1259 1259
1260 mtu = mss + 1260 mtu = mss +
1261 tp->tcp_header_len + 1261 tp->tcp_header_len +
1262 icsk->icsk_ext_hdr_len + 1262 icsk->icsk_ext_hdr_len +
1263 icsk->icsk_af_ops->net_header_len; 1263 icsk->icsk_af_ops->net_header_len;
1264 1264
1265 /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ 1265 /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1266 if (icsk->icsk_af_ops->net_frag_header_len) { 1266 if (icsk->icsk_af_ops->net_frag_header_len) {
1267 const struct dst_entry *dst = __sk_dst_get(sk); 1267 const struct dst_entry *dst = __sk_dst_get(sk);
1268 1268
1269 if (dst && dst_allfrag(dst)) 1269 if (dst && dst_allfrag(dst))
1270 mtu += icsk->icsk_af_ops->net_frag_header_len; 1270 mtu += icsk->icsk_af_ops->net_frag_header_len;
1271 } 1271 }
1272 return mtu; 1272 return mtu;
1273 } 1273 }
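Because tcp_mtu_to_mss() subtracts the connection's fixed option overhead (tp->tcp_header_len minus the bare header) on top of the base calculation, and tcp_mss_to_mtu() adds the same pieces back, the two functions are inverses. A sketch simplified under assumed values (timestamps negotiated, so tcp_header_len is 32; no clamp or extension headers in play):

#include <stdio.h>

int main(void)
{
        int pmtu = 1500, net_hdr = 20, ext_hdr = 0;
        int tcp_header_len = 32;        /* 20-byte header + 12 for timestamps */

        /* forward direction, simplified from tcp_mtu_to_mss() */
        int mss = pmtu - net_hdr - tcp_header_len;              /* 1448 */
        /* inverse direction, as in tcp_mss_to_mtu() */
        int mtu = mss + tcp_header_len + ext_hdr + net_hdr;     /* back to 1500 */

        printf("mss = %d, mtu back = %d\n", mss, mtu);
        return 0;
}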
1274 1274
1275 /* MTU probing init per socket */ 1275 /* MTU probing init per socket */
1276 void tcp_mtup_init(struct sock *sk) 1276 void tcp_mtup_init(struct sock *sk)
1277 { 1277 {
1278 struct tcp_sock *tp = tcp_sk(sk); 1278 struct tcp_sock *tp = tcp_sk(sk);
1279 struct inet_connection_sock *icsk = inet_csk(sk); 1279 struct inet_connection_sock *icsk = inet_csk(sk);
1280 1280
1281 icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; 1281 icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
1282 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + 1282 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1283 icsk->icsk_af_ops->net_header_len; 1283 icsk->icsk_af_ops->net_header_len;
1284 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); 1284 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
1285 icsk->icsk_mtup.probe_size = 0; 1285 icsk->icsk_mtup.probe_size = 0;
1286 } 1286 }
1287 EXPORT_SYMBOL(tcp_mtup_init); 1287 EXPORT_SYMBOL(tcp_mtup_init);
1288 1288
1289 /* This function synchronizes snd mss to the current pmtu/exthdr set. 1289 /* This function synchronizes snd mss to the current pmtu/exthdr set.
1290 1290
1291 tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT account 1291 tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT account
1292 for TCP options, but includes only the bare TCP header. 1292 for TCP options, but includes only the bare TCP header.
1293 1293
1294 tp->rx_opt.mss_clamp is mss negotiated at connection setup. 1294 tp->rx_opt.mss_clamp is mss negotiated at connection setup.
1295 It is minimum of user_mss and mss received with SYN. 1295 It is minimum of user_mss and mss received with SYN.
1296 It also does not include TCP options. 1296 It also does not include TCP options.
1297 1297
1298 inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. 1298 inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
1299 1299
1300 tp->mss_cache is current effective sending mss, including 1300 tp->mss_cache is current effective sending mss, including
1301 all tcp options except for SACKs. It is evaluated, 1301 all tcp options except for SACKs. It is evaluated,
1302 taking into account current pmtu, but never exceeds 1302 taking into account current pmtu, but never exceeds
1303 tp->rx_opt.mss_clamp. 1303 tp->rx_opt.mss_clamp.
1304 1304
1305 NOTE1. rfc1122 clearly states that advertised MSS 1305 NOTE1. rfc1122 clearly states that advertised MSS
1306 DOES NOT include either tcp or ip options. 1306 DOES NOT include either tcp or ip options.
1307 1307
1308 NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache 1308 NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
1309 are READ ONLY outside this function. --ANK (980731) 1309 are READ ONLY outside this function. --ANK (980731)
1310 */ 1310 */
1311 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) 1311 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1312 { 1312 {
1313 struct tcp_sock *tp = tcp_sk(sk); 1313 struct tcp_sock *tp = tcp_sk(sk);
1314 struct inet_connection_sock *icsk = inet_csk(sk); 1314 struct inet_connection_sock *icsk = inet_csk(sk);
1315 int mss_now; 1315 int mss_now;
1316 1316
1317 if (icsk->icsk_mtup.search_high > pmtu) 1317 if (icsk->icsk_mtup.search_high > pmtu)
1318 icsk->icsk_mtup.search_high = pmtu; 1318 icsk->icsk_mtup.search_high = pmtu;
1319 1319
1320 mss_now = tcp_mtu_to_mss(sk, pmtu); 1320 mss_now = tcp_mtu_to_mss(sk, pmtu);
1321 mss_now = tcp_bound_to_half_wnd(tp, mss_now); 1321 mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1322 1322
1323 /* And store cached results */ 1323 /* And store cached results */
1324 icsk->icsk_pmtu_cookie = pmtu; 1324 icsk->icsk_pmtu_cookie = pmtu;
1325 if (icsk->icsk_mtup.enabled) 1325 if (icsk->icsk_mtup.enabled)
1326 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); 1326 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1327 tp->mss_cache = mss_now; 1327 tp->mss_cache = mss_now;
1328 1328
1329 return mss_now; 1329 return mss_now;
1330 } 1330 }
1331 EXPORT_SYMBOL(tcp_sync_mss); 1331 EXPORT_SYMBOL(tcp_sync_mss);
1332 1332
1333 /* Compute the current effective MSS, taking SACKs and IP options, 1333 /* Compute the current effective MSS, taking SACKs and IP options,
1334 * and even PMTU discovery events into account. 1334 * and even PMTU discovery events into account.
1335 */ 1335 */
1336 unsigned int tcp_current_mss(struct sock *sk) 1336 unsigned int tcp_current_mss(struct sock *sk)
1337 { 1337 {
1338 const struct tcp_sock *tp = tcp_sk(sk); 1338 const struct tcp_sock *tp = tcp_sk(sk);
1339 const struct dst_entry *dst = __sk_dst_get(sk); 1339 const struct dst_entry *dst = __sk_dst_get(sk);
1340 u32 mss_now; 1340 u32 mss_now;
1341 unsigned int header_len; 1341 unsigned int header_len;
1342 struct tcp_out_options opts; 1342 struct tcp_out_options opts;
1343 struct tcp_md5sig_key *md5; 1343 struct tcp_md5sig_key *md5;
1344 1344
1345 mss_now = tp->mss_cache; 1345 mss_now = tp->mss_cache;
1346 1346
1347 if (dst) { 1347 if (dst) {
1348 u32 mtu = dst_mtu(dst); 1348 u32 mtu = dst_mtu(dst);
1349 if (mtu != inet_csk(sk)->icsk_pmtu_cookie) 1349 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1350 mss_now = tcp_sync_mss(sk, mtu); 1350 mss_now = tcp_sync_mss(sk, mtu);
1351 } 1351 }
1352 1352
1353 header_len = tcp_established_options(sk, NULL, &opts, &md5) + 1353 header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1354 sizeof(struct tcphdr); 1354 sizeof(struct tcphdr);
1355 /* The mss_cache is sized based on tp->tcp_header_len, which assumes 1355 /* The mss_cache is sized based on tp->tcp_header_len, which assumes
1356 * some common options. If this is an odd packet (because we have SACK 1356 * some common options. If this is an odd packet (because we have SACK
1357 * blocks etc) then our calculated header_len will be different, and 1357 * blocks etc) then our calculated header_len will be different, and
1358 * we have to adjust mss_now correspondingly */ 1358 * we have to adjust mss_now correspondingly */
1359 if (header_len != tp->tcp_header_len) { 1359 if (header_len != tp->tcp_header_len) {
1360 int delta = (int) header_len - tp->tcp_header_len; 1360 int delta = (int) header_len - tp->tcp_header_len;
1361 mss_now -= delta; 1361 mss_now -= delta;
1362 } 1362 }
1363 1363
1364 return mss_now; 1364 return mss_now;
1365 } 1365 }
1366 1366
1367 /* Congestion window validation. (RFC2861) */ 1367 /* Congestion window validation. (RFC2861) */
1368 static void tcp_cwnd_validate(struct sock *sk) 1368 static void tcp_cwnd_validate(struct sock *sk)
1369 { 1369 {
1370 struct tcp_sock *tp = tcp_sk(sk); 1370 struct tcp_sock *tp = tcp_sk(sk);
1371 1371
1372 if (tp->packets_out >= tp->snd_cwnd) { 1372 if (tp->packets_out >= tp->snd_cwnd) {
1373 /* Network is fed fully. */ 1373 /* Network is fed fully. */
1374 tp->snd_cwnd_used = 0; 1374 tp->snd_cwnd_used = 0;
1375 tp->snd_cwnd_stamp = tcp_time_stamp; 1375 tp->snd_cwnd_stamp = tcp_time_stamp;
1376 } else { 1376 } else {
1377 /* Network starves. */ 1377 /* Network starves. */
1378 if (tp->packets_out > tp->snd_cwnd_used) 1378 if (tp->packets_out > tp->snd_cwnd_used)
1379 tp->snd_cwnd_used = tp->packets_out; 1379 tp->snd_cwnd_used = tp->packets_out;
1380 1380
1381 if (sysctl_tcp_slow_start_after_idle && 1381 if (sysctl_tcp_slow_start_after_idle &&
1382 (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) 1382 (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
1383 tcp_cwnd_application_limited(sk); 1383 tcp_cwnd_application_limited(sk);
1384 } 1384 }
1385 } 1385 }
1386 1386
1387 /* Returns the portion of skb which can be sent right away without 1387 /* Returns the portion of skb which can be sent right away without
1388 * introducing MSS oddities to segment boundaries. In rare cases where 1388 * introducing MSS oddities to segment boundaries. In rare cases where
1389 * mss_now != mss_cache, we will request the caller to create a small skb 1389 * mss_now != mss_cache, we will request the caller to create a small skb
1390 * per input skb which could be mostly avoided here (if desired). 1390 * per input skb which could be mostly avoided here (if desired).
1391 * 1391 *
1392 * We explicitly want to create a request for splitting write queue tail 1392 * We explicitly want to create a request for splitting write queue tail
1393 * to a small skb for Nagle purposes while avoiding unnecessary modulos, 1393 * to a small skb for Nagle purposes while avoiding unnecessary modulos,
1394 * thus all the complexity (cwnd_len is always MSS multiple which we 1394 * thus all the complexity (cwnd_len is always MSS multiple which we
1395 * return whenever allowed by the other factors). Basically we need the 1395 * return whenever allowed by the other factors). Basically we need the
1396 * modulo only when the receiver window alone is the limiting factor or 1396 * modulo only when the receiver window alone is the limiting factor or
1397 * when we would be allowed to send the split-due-to-Nagle skb fully. 1397 * when we would be allowed to send the split-due-to-Nagle skb fully.
1398 */ 1398 */
1399 static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, 1399 static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
1400 unsigned int mss_now, unsigned int max_segs) 1400 unsigned int mss_now, unsigned int max_segs)
1401 { 1401 {
1402 const struct tcp_sock *tp = tcp_sk(sk); 1402 const struct tcp_sock *tp = tcp_sk(sk);
1403 u32 needed, window, max_len; 1403 u32 needed, window, max_len;
1404 1404
1405 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; 1405 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1406 max_len = mss_now * max_segs; 1406 max_len = mss_now * max_segs;
1407 1407
1408 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) 1408 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1409 return max_len; 1409 return max_len;
1410 1410
1411 needed = min(skb->len, window); 1411 needed = min(skb->len, window);
1412 1412
1413 if (max_len <= needed) 1413 if (max_len <= needed)
1414 return max_len; 1414 return max_len;
1415 1415
1416 return needed - needed % mss_now; 1416 return needed - needed % mss_now;
1417 } 1417 }
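A worked example of the split point: with an assumed mss_now of 1460, max_segs of 4, a 5000-byte send window and a 4000-byte tail skb, the cap of 5840 bytes exceeds what is needed, so the amount sent now is trimmed down to the nearest MSS multiple:

#include <stdio.h>

int main(void)
{
        unsigned int mss_now = 1460, max_segs = 4;      /* assumed values */
        unsigned int window = 5000, skb_len = 4000;
        unsigned int max_len = mss_now * max_segs;                      /* 5840 */
        unsigned int needed = skb_len < window ? skb_len : window;      /* 4000 */
        unsigned int split = max_len <= needed ? max_len
                                               : needed - needed % mss_now;

        printf("send %u of %u bytes now\n", split, skb_len);    /* 2920 of 4000 */
        return 0;
}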
1418 1418
1419 /* Can at least one segment of SKB be sent right now, according to the 1419 /* Can at least one segment of SKB be sent right now, according to the
1420 * congestion window rules? If so, return how many segments are allowed. 1420 * congestion window rules? If so, return how many segments are allowed.
1421 */ 1421 */
1422 static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, 1422 static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1423 const struct sk_buff *skb) 1423 const struct sk_buff *skb)
1424 { 1424 {
1425 u32 in_flight, cwnd; 1425 u32 in_flight, cwnd;
1426 1426
1427 /* Don't be strict about the congestion window for the final FIN. */ 1427 /* Don't be strict about the congestion window for the final FIN. */
1428 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && 1428 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
1429 tcp_skb_pcount(skb) == 1) 1429 tcp_skb_pcount(skb) == 1)
1430 return 1; 1430 return 1;
1431 1431
1432 in_flight = tcp_packets_in_flight(tp); 1432 in_flight = tcp_packets_in_flight(tp);
1433 cwnd = tp->snd_cwnd; 1433 cwnd = tp->snd_cwnd;
1434 if (in_flight < cwnd) 1434 if (in_flight < cwnd)
1435 return (cwnd - in_flight); 1435 return (cwnd - in_flight);
1436 1436
1437 return 0; 1437 return 0;
1438 } 1438 }
1439 1439
1440 /* Initialize TSO state of a skb. 1440 /* Initialize TSO state of a skb.
1441 * This must be invoked the first time we consider transmitting 1441 * This must be invoked the first time we consider transmitting
1442 * SKB onto the wire. 1442 * SKB onto the wire.
1443 */ 1443 */
1444 static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, 1444 static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
1445 unsigned int mss_now) 1445 unsigned int mss_now)
1446 { 1446 {
1447 int tso_segs = tcp_skb_pcount(skb); 1447 int tso_segs = tcp_skb_pcount(skb);
1448 1448
1449 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { 1449 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1450 tcp_set_skb_tso_segs(sk, skb, mss_now); 1450 tcp_set_skb_tso_segs(sk, skb, mss_now);
1451 tso_segs = tcp_skb_pcount(skb); 1451 tso_segs = tcp_skb_pcount(skb);
1452 } 1452 }
1453 return tso_segs; 1453 return tso_segs;
1454 } 1454 }
1455 1455
1456 /* Minshall's variant of the Nagle send check. */ 1456 /* Minshall's variant of the Nagle send check. */
1457 static inline bool tcp_minshall_check(const struct tcp_sock *tp) 1457 static inline bool tcp_minshall_check(const struct tcp_sock *tp)
1458 { 1458 {
1459 return after(tp->snd_sml, tp->snd_una) && 1459 return after(tp->snd_sml, tp->snd_una) &&
1460 !after(tp->snd_sml, tp->snd_nxt); 1460 !after(tp->snd_sml, tp->snd_nxt);
1461 } 1461 }
1462 1462
1463 /* Return false if the packet can be sent now without violating Nagle's rules: 1463 /* Return false if the packet can be sent now without violating Nagle's rules:
1464 * 1. It is full sized. 1464 * 1. It is full sized.
1465 * 2. Or it contains FIN. (already checked by caller) 1465 * 2. Or it contains FIN. (already checked by caller)
1466 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. 1466 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1467 * 4. Or TCP_CORK is not set, and all sent packets are ACKed. 1467 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1468 * With Minshall's modification: all sent small packets are ACKed. 1468 * With Minshall's modification: all sent small packets are ACKed.
1469 */ 1469 */
1470 static inline bool tcp_nagle_check(const struct tcp_sock *tp, 1470 static inline bool tcp_nagle_check(const struct tcp_sock *tp,
1471 const struct sk_buff *skb, 1471 const struct sk_buff *skb,
1472 unsigned int mss_now, int nonagle) 1472 unsigned int mss_now, int nonagle)
1473 { 1473 {
1474 return skb->len < mss_now && 1474 return skb->len < mss_now &&
1475 ((nonagle & TCP_NAGLE_CORK) || 1475 ((nonagle & TCP_NAGLE_CORK) ||
1476 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); 1476 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1477 } 1477 }
1478 1478
1479 /* Return true if the Nagle test allows this packet to be 1479 /* Return true if the Nagle test allows this packet to be
1480 * sent now. 1480 * sent now.
1481 */ 1481 */
1482 static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, 1482 static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1483 unsigned int cur_mss, int nonagle) 1483 unsigned int cur_mss, int nonagle)
1484 { 1484 {
1485 /* The Nagle rule does not apply to frames which sit in the middle of the 1485 /* The Nagle rule does not apply to frames which sit in the middle of the
1486 * write_queue (they have no chance to get new data). 1486 * write_queue (they have no chance to get new data).
1487 * 1487 *
1488 * This is implemented in the callers, where they modify the 'nonagle' 1488 * This is implemented in the callers, where they modify the 'nonagle'
1489 * argument based upon the location of SKB in the send queue. 1489 * argument based upon the location of SKB in the send queue.
1490 */ 1490 */
1491 if (nonagle & TCP_NAGLE_PUSH) 1491 if (nonagle & TCP_NAGLE_PUSH)
1492 return true; 1492 return true;
1493 1493
1494 /* Don't use the nagle rule for urgent data (or for the final FIN). */ 1494 /* Don't use the nagle rule for urgent data (or for the final FIN). */
1495 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) 1495 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1496 return true; 1496 return true;
1497 1497
1498 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) 1498 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
1499 return true; 1499 return true;
1500 1500
1501 return false; 1501 return false;
1502 } 1502 }
1503 1503
1504 /* Does at least the first segment of SKB fit into the send window? */ 1504 /* Does at least the first segment of SKB fit into the send window? */
1505 static bool tcp_snd_wnd_test(const struct tcp_sock *tp, 1505 static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1506 const struct sk_buff *skb, 1506 const struct sk_buff *skb,
1507 unsigned int cur_mss) 1507 unsigned int cur_mss)
1508 { 1508 {
1509 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 1509 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1510 1510
1511 if (skb->len > cur_mss) 1511 if (skb->len > cur_mss)
1512 end_seq = TCP_SKB_CB(skb)->seq + cur_mss; 1512 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1513 1513
1514 return !after(end_seq, tcp_wnd_end(tp)); 1514 return !after(end_seq, tcp_wnd_end(tp));
1515 } 1515 }
1516 1516
1517 /* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) 1517 /* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
1518 * should be put on the wire right now. If so, it returns the number of 1518 * should be put on the wire right now. If so, it returns the number of
1519 * packets allowed by the congestion window. 1519 * packets allowed by the congestion window.
1520 */ 1520 */
1521 static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, 1521 static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
1522 unsigned int cur_mss, int nonagle) 1522 unsigned int cur_mss, int nonagle)
1523 { 1523 {
1524 const struct tcp_sock *tp = tcp_sk(sk); 1524 const struct tcp_sock *tp = tcp_sk(sk);
1525 unsigned int cwnd_quota; 1525 unsigned int cwnd_quota;
1526 1526
1527 tcp_init_tso_segs(sk, skb, cur_mss); 1527 tcp_init_tso_segs(sk, skb, cur_mss);
1528 1528
1529 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) 1529 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
1530 return 0; 1530 return 0;
1531 1531
1532 cwnd_quota = tcp_cwnd_test(tp, skb); 1532 cwnd_quota = tcp_cwnd_test(tp, skb);
1533 if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) 1533 if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
1534 cwnd_quota = 0; 1534 cwnd_quota = 0;
1535 1535
1536 return cwnd_quota; 1536 return cwnd_quota;
1537 } 1537 }
1538 1538
1539 /* Test if sending is allowed right now. */ 1539 /* Test if sending is allowed right now. */
1540 bool tcp_may_send_now(struct sock *sk) 1540 bool tcp_may_send_now(struct sock *sk)
1541 { 1541 {
1542 const struct tcp_sock *tp = tcp_sk(sk); 1542 const struct tcp_sock *tp = tcp_sk(sk);
1543 struct sk_buff *skb = tcp_send_head(sk); 1543 struct sk_buff *skb = tcp_send_head(sk);
1544 1544
1545 return skb && 1545 return skb &&
1546 tcp_snd_test(sk, skb, tcp_current_mss(sk), 1546 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1547 (tcp_skb_is_last(sk, skb) ? 1547 (tcp_skb_is_last(sk, skb) ?
1548 tp->nonagle : TCP_NAGLE_PUSH)); 1548 tp->nonagle : TCP_NAGLE_PUSH));
1549 } 1549 }
1550 1550
1551 /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet 1551 /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
1552 * which is put after SKB on the list. It is very much like 1552 * which is put after SKB on the list. It is very much like
1553 * tcp_fragment() except that it may make several kinds of assumptions 1553 * tcp_fragment() except that it may make several kinds of assumptions
1554 * in order to speed up the splitting operation. In particular, we 1554 * in order to speed up the splitting operation. In particular, we
1555 * know that all the data is in scatter-gather pages, and that the 1555 * know that all the data is in scatter-gather pages, and that the
1556 * packet has never been sent out before (and thus is not cloned). 1556 * packet has never been sent out before (and thus is not cloned).
1557 */ 1557 */
1558 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, 1558 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1559 unsigned int mss_now, gfp_t gfp) 1559 unsigned int mss_now, gfp_t gfp)
1560 { 1560 {
1561 struct sk_buff *buff; 1561 struct sk_buff *buff;
1562 int nlen = skb->len - len; 1562 int nlen = skb->len - len;
1563 u8 flags; 1563 u8 flags;
1564 1564
1565 /* All of a TSO frame must be composed of paged data. */ 1565 /* All of a TSO frame must be composed of paged data. */
1566 if (skb->len != skb->data_len) 1566 if (skb->len != skb->data_len)
1567 return tcp_fragment(sk, skb, len, mss_now); 1567 return tcp_fragment(sk, skb, len, mss_now);
1568 1568
1569 buff = sk_stream_alloc_skb(sk, 0, gfp); 1569 buff = sk_stream_alloc_skb(sk, 0, gfp);
1570 if (unlikely(buff == NULL)) 1570 if (unlikely(buff == NULL))
1571 return -ENOMEM; 1571 return -ENOMEM;
1572 1572
1573 sk->sk_wmem_queued += buff->truesize; 1573 sk->sk_wmem_queued += buff->truesize;
1574 sk_mem_charge(sk, buff->truesize); 1574 sk_mem_charge(sk, buff->truesize);
1575 buff->truesize += nlen; 1575 buff->truesize += nlen;
1576 skb->truesize -= nlen; 1576 skb->truesize -= nlen;
1577 1577
1578 /* Correct the sequence numbers. */ 1578 /* Correct the sequence numbers. */
1579 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; 1579 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1580 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; 1580 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1581 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; 1581 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1582 1582
1583 /* PSH and FIN should only be set in the second packet. */ 1583 /* PSH and FIN should only be set in the second packet. */
1584 flags = TCP_SKB_CB(skb)->tcp_flags; 1584 flags = TCP_SKB_CB(skb)->tcp_flags;
1585 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); 1585 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1586 TCP_SKB_CB(buff)->tcp_flags = flags; 1586 TCP_SKB_CB(buff)->tcp_flags = flags;
1587 1587
1588 /* This packet was never sent out yet, so no SACK bits. */ 1588 /* This packet was never sent out yet, so no SACK bits. */
1589 TCP_SKB_CB(buff)->sacked = 0; 1589 TCP_SKB_CB(buff)->sacked = 0;
1590 1590
1591 buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; 1591 buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
1592 skb_split(skb, buff, len); 1592 skb_split(skb, buff, len);
1593 1593
1594 /* Fix up tso_factor for both original and new SKB. */ 1594 /* Fix up tso_factor for both original and new SKB. */
1595 tcp_set_skb_tso_segs(sk, skb, mss_now); 1595 tcp_set_skb_tso_segs(sk, skb, mss_now);
1596 tcp_set_skb_tso_segs(sk, buff, mss_now); 1596 tcp_set_skb_tso_segs(sk, buff, mss_now);
1597 1597
1598 /* Link BUFF into the send queue. */ 1598 /* Link BUFF into the send queue. */
1599 skb_header_release(buff); 1599 skb_header_release(buff);
1600 tcp_insert_write_queue_after(skb, buff, sk); 1600 tcp_insert_write_queue_after(skb, buff, sk);
1601 1601
1602 return 0; 1602 return 0;
1603 } 1603 }
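The sequence bookkeeping done by tso_fragment() above is easy to check by hand: the new buffer takes over the tail [seq + len, end_seq) while the original skb keeps [seq, seq + len). Here is a small user-space model; the struct is a hypothetical stand-in for TCP_SKB_CB(), not the kernel structure.

#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, end_seq; };

static void split_seg(struct seg *orig, struct seg *tail, uint32_t len)
{
	tail->seq     = orig->seq + len;	/* tail starts after the first len bytes */
	tail->end_seq = orig->end_seq;		/* tail inherits the original end */
	orig->end_seq = tail->seq;		/* original keeps only [seq, seq + len) */
}

int main(void)
{
	struct seg skb = { .seq = 1000, .end_seq = 1000 + 3 * 1460 };
	struct seg buff;

	split_seg(&skb, &buff, 1460);
	printf("skb  [%u, %u)\nbuff [%u, %u)\n",
	       skb.seq, skb.end_seq, buff.seq, buff.end_seq);
	return 0;
}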
1604 1604
1605 /* Try to defer sending, if possible, in order to minimize the amount 1605 /* Try to defer sending, if possible, in order to minimize the amount
1606 * of TSO splitting we do. View it as a kind of TSO Nagle test. 1606 * of TSO splitting we do. View it as a kind of TSO Nagle test.
1607 * 1607 *
1608 * This algorithm is from John Heffner. 1608 * This algorithm is from John Heffner.
1609 */ 1609 */
1610 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) 1610 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1611 { 1611 {
1612 struct tcp_sock *tp = tcp_sk(sk); 1612 struct tcp_sock *tp = tcp_sk(sk);
1613 const struct inet_connection_sock *icsk = inet_csk(sk); 1613 const struct inet_connection_sock *icsk = inet_csk(sk);
1614 u32 send_win, cong_win, limit, in_flight; 1614 u32 send_win, cong_win, limit, in_flight;
1615 int win_divisor; 1615 int win_divisor;
1616 1616
1617 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 1617 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1618 goto send_now; 1618 goto send_now;
1619 1619
1620 if (icsk->icsk_ca_state != TCP_CA_Open) 1620 if (icsk->icsk_ca_state != TCP_CA_Open)
1621 goto send_now; 1621 goto send_now;
1622 1622
1623 /* Defer for less than two clock ticks. */ 1623 /* Defer for less than two clock ticks. */
1624 if (tp->tso_deferred && 1624 if (tp->tso_deferred &&
1625 (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) 1625 (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
1626 goto send_now; 1626 goto send_now;
1627 1627
1628 in_flight = tcp_packets_in_flight(tp); 1628 in_flight = tcp_packets_in_flight(tp);
1629 1629
1630 BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); 1630 BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
1631 1631
1632 send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; 1632 send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1633 1633
1634 /* From in_flight test above, we know that cwnd > in_flight. */ 1634 /* From in_flight test above, we know that cwnd > in_flight. */
1635 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; 1635 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
1636 1636
1637 limit = min(send_win, cong_win); 1637 limit = min(send_win, cong_win);
1638 1638
1639 /* If a full-sized TSO skb can be sent, do it. */ 1639 /* If a full-sized TSO skb can be sent, do it. */
1640 if (limit >= min_t(unsigned int, sk->sk_gso_max_size, 1640 if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1641 tp->xmit_size_goal_segs * tp->mss_cache)) 1641 tp->xmit_size_goal_segs * tp->mss_cache))
1642 goto send_now; 1642 goto send_now;
1643 1643
1644 /* Middle in queue won't get any more data, full sendable already? */ 1644 /* Middle in queue won't get any more data, full sendable already? */
1645 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) 1645 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1646 goto send_now; 1646 goto send_now;
1647 1647
1648 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); 1648 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1649 if (win_divisor) { 1649 if (win_divisor) {
1650 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1650 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1651 1651
1652 /* If at least some fraction of a window is available, 1652 /* If at least some fraction of a window is available,
1653 * just use it. 1653 * just use it.
1654 */ 1654 */
1655 chunk /= win_divisor; 1655 chunk /= win_divisor;
1656 if (limit >= chunk) 1656 if (limit >= chunk)
1657 goto send_now; 1657 goto send_now;
1658 } else { 1658 } else {
1659 /* Different approach, try not to defer past a single 1659 /* Different approach, try not to defer past a single
1660 * ACK. Receiver should ACK every other full sized 1660 * ACK. Receiver should ACK every other full sized
1661 * frame, so if we have space for more than 3 frames 1661 * frame, so if we have space for more than 3 frames
1662 * then send now. 1662 * then send now.
1663 */ 1663 */
1664 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) 1664 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
1665 goto send_now; 1665 goto send_now;
1666 } 1666 }
1667 1667
1668 /* Ok, it looks like it is advisable to defer. 1668 /* Ok, it looks like it is advisable to defer.
1669 * Do not rearm the timer if already set to not break TCP ACK clocking. 1669 * Do not rearm the timer if already set to not break TCP ACK clocking.
1670 */ 1670 */
1671 if (!tp->tso_deferred) 1671 if (!tp->tso_deferred)
1672 tp->tso_deferred = 1 | (jiffies << 1); 1672 tp->tso_deferred = 1 | (jiffies << 1);
1673 1673
1674 return true; 1674 return true;
1675 1675
1676 send_now: 1676 send_now:
1677 tp->tso_deferred = 0; 1677 tp->tso_deferred = 0;
1678 return false; 1678 return false;
1679 } 1679 }
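The defer-or-send arithmetic above can be illustrated with concrete numbers. The sketch below is a standalone user-space model of only the two final checks (it omits the earlier full-TSO-burst and queue-tail shortcuts); the divisor value of 3 and the 3-frame threshold follow the comments above and are assumptions of the sketch.

#include <stdint.h>
#include <stdio.h>

static int should_send_now(uint32_t send_win, uint32_t cong_win, uint32_t snd_wnd,
			   uint32_t cwnd, uint32_t mss, int win_divisor)
{
	uint32_t limit = send_win < cong_win ? send_win : cong_win;

	if (win_divisor) {
		/* Send once a fraction (1/win_divisor) of the window is usable. */
		uint32_t wnd = snd_wnd < cwnd * mss ? snd_wnd : cwnd * mss;

		return limit >= wnd / win_divisor;
	}
	/* Otherwise send as soon as there is room for more than ~3 full frames. */
	return limit > 3 * mss;
}

int main(void)
{
	/* cwnd 20, 10 packets in flight, 8 KB usable send window, MSS 1460 */
	uint32_t mss = 1460, cong_win = (20 - 10) * mss, send_win = 8192;

	printf("win_divisor=3: %s\n",
	       should_send_now(send_win, cong_win, 65535, 20, mss, 3) ? "send" : "defer");
	printf("win_divisor=0: %s\n",
	       should_send_now(send_win, cong_win, 65535, 20, mss, 0) ? "send" : "defer");
	return 0;
}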
1680 1680
1681 /* Create a new MTU probe if we are ready. 1681 /* Create a new MTU probe if we are ready.
1682 * MTU probing regularly attempts to increase the path MTU by 1682 * MTU probing regularly attempts to increase the path MTU by
1683 * deliberately sending larger packets. This discovers routing 1683 * deliberately sending larger packets. This discovers routing
1684 * changes resulting in larger path MTUs. 1684 * changes resulting in larger path MTUs.
1685 * 1685 *
1686 * Returns 0 if we should wait to probe (no cwnd available), 1686 * Returns 0 if we should wait to probe (no cwnd available),
1687 * 1 if a probe was sent, 1687 * 1 if a probe was sent,
1688 * -1 otherwise 1688 * -1 otherwise
1689 */ 1689 */
1690 static int tcp_mtu_probe(struct sock *sk) 1690 static int tcp_mtu_probe(struct sock *sk)
1691 { 1691 {
1692 struct tcp_sock *tp = tcp_sk(sk); 1692 struct tcp_sock *tp = tcp_sk(sk);
1693 struct inet_connection_sock *icsk = inet_csk(sk); 1693 struct inet_connection_sock *icsk = inet_csk(sk);
1694 struct sk_buff *skb, *nskb, *next; 1694 struct sk_buff *skb, *nskb, *next;
1695 int len; 1695 int len;
1696 int probe_size; 1696 int probe_size;
1697 int size_needed; 1697 int size_needed;
1698 int copy; 1698 int copy;
1699 int mss_now; 1699 int mss_now;
1700 1700
1701 /* Not currently probing/verifying, 1701 /* Not currently probing/verifying,
1702 * not in recovery, 1702 * not in recovery,
1703 * have enough cwnd, and 1703 * have enough cwnd, and
1704 * not SACKing (the variable headers throw things off) */ 1704 * not SACKing (the variable headers throw things off) */
1705 if (!icsk->icsk_mtup.enabled || 1705 if (!icsk->icsk_mtup.enabled ||
1706 icsk->icsk_mtup.probe_size || 1706 icsk->icsk_mtup.probe_size ||
1707 inet_csk(sk)->icsk_ca_state != TCP_CA_Open || 1707 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1708 tp->snd_cwnd < 11 || 1708 tp->snd_cwnd < 11 ||
1709 tp->rx_opt.num_sacks || tp->rx_opt.dsack) 1709 tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1710 return -1; 1710 return -1;
1711 1711
1712 /* Very simple search strategy: just double the MSS. */ 1712 /* Very simple search strategy: just double the MSS. */
1713 mss_now = tcp_current_mss(sk); 1713 mss_now = tcp_current_mss(sk);
1714 probe_size = 2 * tp->mss_cache; 1714 probe_size = 2 * tp->mss_cache;
1715 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; 1715 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1716 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { 1716 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1717 /* TODO: set timer for probe_converge_event */ 1717 /* TODO: set timer for probe_converge_event */
1718 return -1; 1718 return -1;
1719 } 1719 }
1720 1720
1721 /* Have enough data in the send queue to probe? */ 1721 /* Have enough data in the send queue to probe? */
1722 if (tp->write_seq - tp->snd_nxt < size_needed) 1722 if (tp->write_seq - tp->snd_nxt < size_needed)
1723 return -1; 1723 return -1;
1724 1724
1725 if (tp->snd_wnd < size_needed) 1725 if (tp->snd_wnd < size_needed)
1726 return -1; 1726 return -1;
1727 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) 1727 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
1728 return 0; 1728 return 0;
1729 1729
1730 /* Do we need to wait to drain cwnd? With none in flight, don't stall */ 1730 /* Do we need to wait to drain cwnd? With none in flight, don't stall */
1731 if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) { 1731 if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
1732 if (!tcp_packets_in_flight(tp)) 1732 if (!tcp_packets_in_flight(tp))
1733 return -1; 1733 return -1;
1734 else 1734 else
1735 return 0; 1735 return 0;
1736 } 1736 }
1737 1737
1738 /* We're allowed to probe. Build it now. */ 1738 /* We're allowed to probe. Build it now. */
1739 if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) 1739 if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1740 return -1; 1740 return -1;
1741 sk->sk_wmem_queued += nskb->truesize; 1741 sk->sk_wmem_queued += nskb->truesize;
1742 sk_mem_charge(sk, nskb->truesize); 1742 sk_mem_charge(sk, nskb->truesize);
1743 1743
1744 skb = tcp_send_head(sk); 1744 skb = tcp_send_head(sk);
1745 1745
1746 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; 1746 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1747 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; 1747 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1748 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; 1748 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
1749 TCP_SKB_CB(nskb)->sacked = 0; 1749 TCP_SKB_CB(nskb)->sacked = 0;
1750 nskb->csum = 0; 1750 nskb->csum = 0;
1751 nskb->ip_summed = skb->ip_summed; 1751 nskb->ip_summed = skb->ip_summed;
1752 1752
1753 tcp_insert_write_queue_before(nskb, skb, sk); 1753 tcp_insert_write_queue_before(nskb, skb, sk);
1754 1754
1755 len = 0; 1755 len = 0;
1756 tcp_for_write_queue_from_safe(skb, next, sk) { 1756 tcp_for_write_queue_from_safe(skb, next, sk) {
1757 copy = min_t(int, skb->len, probe_size - len); 1757 copy = min_t(int, skb->len, probe_size - len);
1758 if (nskb->ip_summed) 1758 if (nskb->ip_summed)
1759 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); 1759 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1760 else 1760 else
1761 nskb->csum = skb_copy_and_csum_bits(skb, 0, 1761 nskb->csum = skb_copy_and_csum_bits(skb, 0,
1762 skb_put(nskb, copy), 1762 skb_put(nskb, copy),
1763 copy, nskb->csum); 1763 copy, nskb->csum);
1764 1764
1765 if (skb->len <= copy) { 1765 if (skb->len <= copy) {
1766 /* We've eaten all the data from this skb. 1766 /* We've eaten all the data from this skb.
1767 * Throw it away. */ 1767 * Throw it away. */
1768 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1768 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1769 tcp_unlink_write_queue(skb, sk); 1769 tcp_unlink_write_queue(skb, sk);
1770 sk_wmem_free_skb(sk, skb); 1770 sk_wmem_free_skb(sk, skb);
1771 } else { 1771 } else {
1772 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & 1772 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
1773 ~(TCPHDR_FIN|TCPHDR_PSH); 1773 ~(TCPHDR_FIN|TCPHDR_PSH);
1774 if (!skb_shinfo(skb)->nr_frags) { 1774 if (!skb_shinfo(skb)->nr_frags) {
1775 skb_pull(skb, copy); 1775 skb_pull(skb, copy);
1776 if (skb->ip_summed != CHECKSUM_PARTIAL) 1776 if (skb->ip_summed != CHECKSUM_PARTIAL)
1777 skb->csum = csum_partial(skb->data, 1777 skb->csum = csum_partial(skb->data,
1778 skb->len, 0); 1778 skb->len, 0);
1779 } else { 1779 } else {
1780 __pskb_trim_head(skb, copy); 1780 __pskb_trim_head(skb, copy);
1781 tcp_set_skb_tso_segs(sk, skb, mss_now); 1781 tcp_set_skb_tso_segs(sk, skb, mss_now);
1782 } 1782 }
1783 TCP_SKB_CB(skb)->seq += copy; 1783 TCP_SKB_CB(skb)->seq += copy;
1784 } 1784 }
1785 1785
1786 len += copy; 1786 len += copy;
1787 1787
1788 if (len >= probe_size) 1788 if (len >= probe_size)
1789 break; 1789 break;
1790 } 1790 }
1791 tcp_init_tso_segs(sk, nskb, nskb->len); 1791 tcp_init_tso_segs(sk, nskb, nskb->len);
1792 1792
1793 /* We're ready to send. If this fails, the probe will 1793 /* We're ready to send. If this fails, the probe will
1794 * be resegmented into mss-sized pieces by tcp_write_xmit(). */ 1794 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1795 TCP_SKB_CB(nskb)->when = tcp_time_stamp; 1795 TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1796 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { 1796 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1797 /* Decrement cwnd here because we are sending 1797 /* Decrement cwnd here because we are sending
1798 * effectively two packets. */ 1798 * effectively two packets. */
1799 tp->snd_cwnd--; 1799 tp->snd_cwnd--;
1800 tcp_event_new_data_sent(sk, nskb); 1800 tcp_event_new_data_sent(sk, nskb);
1801 1801
1802 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); 1802 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1803 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; 1803 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1804 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; 1804 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1805 1805
1806 return 1; 1806 return 1;
1807 } 1807 }
1808 1808
1809 return -1; 1809 return -1;
1810 } 1810 }
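The probe sizing in tcp_mtu_probe() above is simple arithmetic: the probe carries twice the cached MSS, and the send queue and window must additionally cover (reordering + 1) more MSS-sized segments before a probe is attempted. A tiny sketch with illustrative values:

#include <stdio.h>

int main(void)
{
	int mss_cache = 1400, reordering = 3;				/* illustrative values */
	int probe_size  = 2 * mss_cache;				/* 2800 bytes on the wire */
	int size_needed = probe_size + (reordering + 1) * mss_cache;	/* 8400 bytes */

	printf("probe_size=%d size_needed=%d\n", probe_size, size_needed);
	return 0;
}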
1811 1811
1812 /* This routine writes packets to the network. It advances the 1812 /* This routine writes packets to the network. It advances the
1813 * send_head. This happens as incoming acks open up the remote 1813 * send_head. This happens as incoming acks open up the remote
1814 * window for us. 1814 * window for us.
1815 * 1815 *
1816 * LARGESEND note: !tcp_urg_mode is overkill, only frames between 1816 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
1817 * snd_up-64k-mss .. snd_up cannot be large. However, taking into 1817 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1818 * account rare use of URG, this is not a big flaw. 1818 * account rare use of URG, this is not a big flaw.
1819 * 1819 *
1820 * Send at most one packet when push_one > 0. Temporarily ignore 1820 * Send at most one packet when push_one > 0. Temporarily ignore
1821 * cwnd limit to force at most one packet out when push_one == 2. 1821 * cwnd limit to force at most one packet out when push_one == 2.
1822 * 1822 *
1823 * Returns true if no segments are in flight and we have queued segments, 1823 * Returns true if no segments are in flight and we have queued segments,
1824 * but cannot send anything now because of SWS or another problem. 1824 * but cannot send anything now because of SWS or another problem.
1825 */ 1825 */
1826 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 1826 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1827 int push_one, gfp_t gfp) 1827 int push_one, gfp_t gfp)
1828 { 1828 {
1829 struct tcp_sock *tp = tcp_sk(sk); 1829 struct tcp_sock *tp = tcp_sk(sk);
1830 struct sk_buff *skb; 1830 struct sk_buff *skb;
1831 unsigned int tso_segs, sent_pkts; 1831 unsigned int tso_segs, sent_pkts;
1832 int cwnd_quota; 1832 int cwnd_quota;
1833 int result; 1833 int result;
1834 1834
1835 sent_pkts = 0; 1835 sent_pkts = 0;
1836 1836
1837 if (!push_one) { 1837 if (!push_one) {
1838 /* Do MTU probing. */ 1838 /* Do MTU probing. */
1839 result = tcp_mtu_probe(sk); 1839 result = tcp_mtu_probe(sk);
1840 if (!result) { 1840 if (!result) {
1841 return false; 1841 return false;
1842 } else if (result > 0) { 1842 } else if (result > 0) {
1843 sent_pkts = 1; 1843 sent_pkts = 1;
1844 } 1844 }
1845 } 1845 }
1846 1846
1847 while ((skb = tcp_send_head(sk))) { 1847 while ((skb = tcp_send_head(sk))) {
1848 unsigned int limit; 1848 unsigned int limit;
1849 1849
1850 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1850 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1851 BUG_ON(!tso_segs); 1851 BUG_ON(!tso_segs);
1852 1852
1853 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) 1853 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
1854 goto repair; /* Skip network transmission */ 1854 goto repair; /* Skip network transmission */
1855 1855
1856 cwnd_quota = tcp_cwnd_test(tp, skb); 1856 cwnd_quota = tcp_cwnd_test(tp, skb);
1857 if (!cwnd_quota) { 1857 if (!cwnd_quota) {
1858 if (push_one == 2) 1858 if (push_one == 2)
1859 /* Force out a loss probe pkt. */ 1859 /* Force out a loss probe pkt. */
1860 cwnd_quota = 1; 1860 cwnd_quota = 1;
1861 else 1861 else
1862 break; 1862 break;
1863 } 1863 }
1864 1864
1865 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) 1865 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
1866 break; 1866 break;
1867 1867
1868 if (tso_segs == 1) { 1868 if (tso_segs == 1) {
1869 if (unlikely(!tcp_nagle_test(tp, skb, mss_now, 1869 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1870 (tcp_skb_is_last(sk, skb) ? 1870 (tcp_skb_is_last(sk, skb) ?
1871 nonagle : TCP_NAGLE_PUSH)))) 1871 nonagle : TCP_NAGLE_PUSH))))
1872 break; 1872 break;
1873 } else { 1873 } else {
1874 if (!push_one && tcp_tso_should_defer(sk, skb)) 1874 if (!push_one && tcp_tso_should_defer(sk, skb))
1875 break; 1875 break;
1876 } 1876 }
1877 1877
1878 /* TCP Small Queues : 1878 /* TCP Small Queues :
1879 * Control the number of packets queued in qdisc/devices to two packets or ~1 ms of data. 1879 * Control the number of packets queued in qdisc/devices to two packets or ~1 ms of data.
1880 * This allows for : 1880 * This allows for :
1881 * - better RTT estimation and ACK scheduling 1881 * - better RTT estimation and ACK scheduling
1882 * - faster recovery 1882 * - faster recovery
1883 * - high rates 1883 * - high rates
1884 * Alas, some drivers / subsystems require a fair amount 1884 * Alas, some drivers / subsystems require a fair amount
1885 * of queued bytes to ensure line rate. 1885 * of queued bytes to ensure line rate.
1886 * One example is wifi aggregation (802.11 AMPDU) 1886 * One example is wifi aggregation (802.11 AMPDU)
1887 */ 1887 */
1888 limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, 1888 limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
1889 sk->sk_pacing_rate >> 10); 1889 sk->sk_pacing_rate >> 10);
1890 1890
1891 if (atomic_read(&sk->sk_wmem_alloc) > limit) { 1891 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
1892 set_bit(TSQ_THROTTLED, &tp->tsq_flags); 1892 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1893 break; 1893 break;
1894 } 1894 }
1895 1895
1896 limit = mss_now; 1896 limit = mss_now;
1897 if (tso_segs > 1 && !tcp_urg_mode(tp)) 1897 if (tso_segs > 1 && !tcp_urg_mode(tp))
1898 limit = tcp_mss_split_point(sk, skb, mss_now, 1898 limit = tcp_mss_split_point(sk, skb, mss_now,
1899 min_t(unsigned int, 1899 min_t(unsigned int,
1900 cwnd_quota, 1900 cwnd_quota,
1901 sk->sk_gso_max_segs)); 1901 sk->sk_gso_max_segs));
1902 1902
1903 if (skb->len > limit && 1903 if (skb->len > limit &&
1904 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 1904 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
1905 break; 1905 break;
1906 1906
1907 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1907 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1908 1908
1909 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) 1909 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
1910 break; 1910 break;
1911 1911
1912 repair: 1912 repair:
1913 /* Advance the send_head. This one is sent out. 1913 /* Advance the send_head. This one is sent out.
1914 * This call will increment packets_out. 1914 * This call will increment packets_out.
1915 */ 1915 */
1916 tcp_event_new_data_sent(sk, skb); 1916 tcp_event_new_data_sent(sk, skb);
1917 1917
1918 tcp_minshall_update(tp, mss_now, skb); 1918 tcp_minshall_update(tp, mss_now, skb);
1919 sent_pkts += tcp_skb_pcount(skb); 1919 sent_pkts += tcp_skb_pcount(skb);
1920 1920
1921 if (push_one) 1921 if (push_one)
1922 break; 1922 break;
1923 } 1923 }
1924 1924
1925 if (likely(sent_pkts)) { 1925 if (likely(sent_pkts)) {
1926 if (tcp_in_cwnd_reduction(sk)) 1926 if (tcp_in_cwnd_reduction(sk))
1927 tp->prr_out += sent_pkts; 1927 tp->prr_out += sent_pkts;
1928 1928
1929 /* Send one loss probe per tail loss episode. */ 1929 /* Send one loss probe per tail loss episode. */
1930 if (push_one != 2) 1930 if (push_one != 2)
1931 tcp_schedule_loss_probe(sk); 1931 tcp_schedule_loss_probe(sk);
1932 tcp_cwnd_validate(sk); 1932 tcp_cwnd_validate(sk);
1933 return false; 1933 return false;
1934 } 1934 }
1935 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); 1935 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
1936 } 1936 }
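The TCP Small Queues limit computed inside tcp_write_xmit() above works out to roughly max(tcp_limit_output_bytes, one millisecond of data at the pacing rate), since sk_pacing_rate is in bytes per second and a right shift by 10 divides by 1024. Below is a user-space sketch with assumed rates; the 131072-byte sysctl value is only an illustrative default.

#include <stdint.h>
#include <stdio.h>

static uint64_t tsq_limit(uint64_t pacing_rate_Bps, uint64_t sysctl_limit)
{
	uint64_t ms_worth = pacing_rate_Bps >> 10;	/* /1024 ~= 1 ms of data */

	return ms_worth > sysctl_limit ? ms_worth : sysctl_limit;
}

int main(void)
{
	printf("10 Mbit/s flow: %llu bytes queued below the stack\n",
	       (unsigned long long)tsq_limit(10000000ULL / 8, 131072));
	printf("10 Gbit/s flow: %llu bytes queued below the stack\n",
	       (unsigned long long)tsq_limit(10000000000ULL / 8, 131072));
	return 0;
}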
1937 1937
1938 bool tcp_schedule_loss_probe(struct sock *sk) 1938 bool tcp_schedule_loss_probe(struct sock *sk)
1939 { 1939 {
1940 struct inet_connection_sock *icsk = inet_csk(sk); 1940 struct inet_connection_sock *icsk = inet_csk(sk);
1941 struct tcp_sock *tp = tcp_sk(sk); 1941 struct tcp_sock *tp = tcp_sk(sk);
1942 u32 timeout, tlp_time_stamp, rto_time_stamp; 1942 u32 timeout, tlp_time_stamp, rto_time_stamp;
1943 u32 rtt = tp->srtt >> 3; 1943 u32 rtt = tp->srtt >> 3;
1944 1944
1945 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) 1945 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
1946 return false; 1946 return false;
1947 /* No consecutive loss probes. */ 1947 /* No consecutive loss probes. */
1948 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { 1948 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
1949 tcp_rearm_rto(sk); 1949 tcp_rearm_rto(sk);
1950 return false; 1950 return false;
1951 } 1951 }
1952 /* Don't do any loss probe on a Fast Open connection before 3WHS 1952 /* Don't do any loss probe on a Fast Open connection before 3WHS
1953 * finishes. 1953 * finishes.
1954 */ 1954 */
1955 if (sk->sk_state == TCP_SYN_RECV) 1955 if (sk->sk_state == TCP_SYN_RECV)
1956 return false; 1956 return false;
1957 1957
1958 /* TLP is only scheduled when next timer event is RTO. */ 1958 /* TLP is only scheduled when next timer event is RTO. */
1959 if (icsk->icsk_pending != ICSK_TIME_RETRANS) 1959 if (icsk->icsk_pending != ICSK_TIME_RETRANS)
1960 return false; 1960 return false;
1961 1961
1962 /* Schedule a loss probe in 2*RTT for SACK-capable connections 1962 /* Schedule a loss probe in 2*RTT for SACK-capable connections
1963 * in Open state that are limited by either cwnd or the application. 1963 * in Open state that are limited by either cwnd or the application.
1964 */ 1964 */
1965 if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || 1965 if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
1966 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 1966 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
1967 return false; 1967 return false;
1968 1968
1969 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && 1969 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
1970 tcp_send_head(sk)) 1970 tcp_send_head(sk))
1971 return false; 1971 return false;
1972 1972
1973 /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account 1973 /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
1974 * for delayed ack when there's one outstanding packet. 1974 * for delayed ack when there's one outstanding packet.
1975 */ 1975 */
1976 timeout = rtt << 1; 1976 timeout = rtt << 1;
1977 if (tp->packets_out == 1) 1977 if (tp->packets_out == 1)
1978 timeout = max_t(u32, timeout, 1978 timeout = max_t(u32, timeout,
1979 (rtt + (rtt >> 1) + TCP_DELACK_MAX)); 1979 (rtt + (rtt >> 1) + TCP_DELACK_MAX));
1980 timeout = max_t(u32, timeout, msecs_to_jiffies(10)); 1980 timeout = max_t(u32, timeout, msecs_to_jiffies(10));
1981 1981
1982 /* If RTO is shorter, just schedule TLP in its place. */ 1982 /* If RTO is shorter, just schedule TLP in its place. */
1983 tlp_time_stamp = tcp_time_stamp + timeout; 1983 tlp_time_stamp = tcp_time_stamp + timeout;
1984 rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; 1984 rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
1985 if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { 1985 if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
1986 s32 delta = rto_time_stamp - tcp_time_stamp; 1986 s32 delta = rto_time_stamp - tcp_time_stamp;
1987 if (delta > 0) 1987 if (delta > 0)
1988 timeout = delta; 1988 timeout = delta;
1989 } 1989 }
1990 1990
1991 inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, 1991 inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
1992 TCP_RTO_MAX); 1992 TCP_RTO_MAX);
1993 return true; 1993 return true;
1994 } 1994 }
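The timeout chosen by tcp_schedule_loss_probe() above starts at 2*SRTT (tp->srtt stores the smoothed RTT shifted left by 3, hence the >> 3), is raised to 1.5*SRTT plus the delayed-ACK allowance when only one packet is outstanding, is floored at 10 ms, and is never scheduled past the pending RTO. Below is a standalone model in milliseconds; the 200 ms delayed-ACK allowance is an assumption of the sketch.

#include <stdio.h>

static unsigned int tlp_timeout_ms(unsigned int srtt_ms, unsigned int packets_out,
				   unsigned int time_to_rto_ms)
{
	unsigned int timeout = 2 * srtt_ms;

	if (packets_out == 1 && timeout < srtt_ms + srtt_ms / 2 + 200)
		timeout = srtt_ms + srtt_ms / 2 + 200;	/* allow for a delayed ACK */
	if (timeout < 10)
		timeout = 10;				/* lower bound */
	if (timeout > time_to_rto_ms)
		timeout = time_to_rto_ms;		/* never fire after the RTO would */
	return timeout;
}

int main(void)
{
	printf("srtt=40ms, 5 pkts out, RTO in 300ms -> PTO %u ms\n",
	       tlp_timeout_ms(40, 5, 300));
	printf("srtt=40ms, 1 pkt out,  RTO in 300ms -> PTO %u ms\n",
	       tlp_timeout_ms(40, 1, 300));
	return 0;
}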
1995 1995
1996 /* When probe timeout (PTO) fires, send a new segment if one exists, else 1996 /* When probe timeout (PTO) fires, send a new segment if one exists, else
1997 * retransmit the last segment. 1997 * retransmit the last segment.
1998 */ 1998 */
1999 void tcp_send_loss_probe(struct sock *sk) 1999 void tcp_send_loss_probe(struct sock *sk)
2000 { 2000 {
2001 struct tcp_sock *tp = tcp_sk(sk); 2001 struct tcp_sock *tp = tcp_sk(sk);
2002 struct sk_buff *skb; 2002 struct sk_buff *skb;
2003 int pcount; 2003 int pcount;
2004 int mss = tcp_current_mss(sk); 2004 int mss = tcp_current_mss(sk);
2005 int err = -1; 2005 int err = -1;
2006 2006
2007 if (tcp_send_head(sk) != NULL) { 2007 if (tcp_send_head(sk) != NULL) {
2008 err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); 2008 err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2009 goto rearm_timer; 2009 goto rearm_timer;
2010 } 2010 }
2011 2011
2012 /* At most one outstanding TLP retransmission. */ 2012 /* At most one outstanding TLP retransmission. */
2013 if (tp->tlp_high_seq) 2013 if (tp->tlp_high_seq)
2014 goto rearm_timer; 2014 goto rearm_timer;
2015 2015
2016 /* Retransmit last segment. */ 2016 /* Retransmit last segment. */
2017 skb = tcp_write_queue_tail(sk); 2017 skb = tcp_write_queue_tail(sk);
2018 if (WARN_ON(!skb)) 2018 if (WARN_ON(!skb))
2019 goto rearm_timer; 2019 goto rearm_timer;
2020 2020
2021 pcount = tcp_skb_pcount(skb); 2021 pcount = tcp_skb_pcount(skb);
2022 if (WARN_ON(!pcount)) 2022 if (WARN_ON(!pcount))
2023 goto rearm_timer; 2023 goto rearm_timer;
2024 2024
2025 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { 2025 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2026 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) 2026 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
2027 goto rearm_timer; 2027 goto rearm_timer;
2028 skb = tcp_write_queue_tail(sk); 2028 skb = tcp_write_queue_tail(sk);
2029 } 2029 }
2030 2030
2031 if (WARN_ON(!skb || !tcp_skb_pcount(skb))) 2031 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2032 goto rearm_timer; 2032 goto rearm_timer;
2033 2033
2034 /* Probe with zero data doesn't trigger fast recovery. */ 2034 /* Probe with zero data doesn't trigger fast recovery. */
2035 if (skb->len > 0) 2035 if (skb->len > 0)
2036 err = __tcp_retransmit_skb(sk, skb); 2036 err = __tcp_retransmit_skb(sk, skb);
2037 2037
2038 /* Record snd_nxt for loss detection. */ 2038 /* Record snd_nxt for loss detection. */
2039 if (likely(!err)) 2039 if (likely(!err))
2040 tp->tlp_high_seq = tp->snd_nxt; 2040 tp->tlp_high_seq = tp->snd_nxt;
2041 2041
2042 rearm_timer: 2042 rearm_timer:
2043 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2043 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2044 inet_csk(sk)->icsk_rto, 2044 inet_csk(sk)->icsk_rto,
2045 TCP_RTO_MAX); 2045 TCP_RTO_MAX);
2046 2046
2047 if (likely(!err)) 2047 if (likely(!err))
2048 NET_INC_STATS_BH(sock_net(sk), 2048 NET_INC_STATS_BH(sock_net(sk),
2049 LINUX_MIB_TCPLOSSPROBES); 2049 LINUX_MIB_TCPLOSSPROBES);
2050 return; 2050 return;
2051 } 2051 }
2052 2052
2053 /* Push out any pending frames which were held back due to 2053 /* Push out any pending frames which were held back due to
2054 * TCP_CORK or attempt at coalescing tiny packets. 2054 * TCP_CORK or attempt at coalescing tiny packets.
2055 * The socket must be locked by the caller. 2055 * The socket must be locked by the caller.
2056 */ 2056 */
2057 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, 2057 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2058 int nonagle) 2058 int nonagle)
2059 { 2059 {
2060 /* If we are closed, the bytes will have to remain here. 2060 /* If we are closed, the bytes will have to remain here.
2061 * In time closedown will finish, we empty the write queue and 2061 * In time closedown will finish, we empty the write queue and
2062 * all will be happy. 2062 * all will be happy.
2063 */ 2063 */
2064 if (unlikely(sk->sk_state == TCP_CLOSE)) 2064 if (unlikely(sk->sk_state == TCP_CLOSE))
2065 return; 2065 return;
2066 2066
2067 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, 2067 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2068 sk_gfp_atomic(sk, GFP_ATOMIC))) 2068 sk_gfp_atomic(sk, GFP_ATOMIC)))
2069 tcp_check_probe_timer(sk); 2069 tcp_check_probe_timer(sk);
2070 } 2070 }
2071 2071
2072 /* Send _single_ skb sitting at the send head. This function requires 2072 /* Send _single_ skb sitting at the send head. This function requires
2073 * true push pending frames to setup probe timer etc. 2073 * true push pending frames to setup probe timer etc.
2074 */ 2074 */
2075 void tcp_push_one(struct sock *sk, unsigned int mss_now) 2075 void tcp_push_one(struct sock *sk, unsigned int mss_now)
2076 { 2076 {
2077 struct sk_buff *skb = tcp_send_head(sk); 2077 struct sk_buff *skb = tcp_send_head(sk);
2078 2078
2079 BUG_ON(!skb || skb->len < mss_now); 2079 BUG_ON(!skb || skb->len < mss_now);
2080 2080
2081 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); 2081 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2082 } 2082 }
2083 2083
2084 /* This function returns the amount that we can raise the 2084 /* This function returns the amount that we can raise the
2085 * usable window based on the following constraints 2085 * usable window based on the following constraints
2086 * 2086 *
2087 * 1. The window can never be shrunk once it is offered (RFC 793) 2087 * 1. The window can never be shrunk once it is offered (RFC 793)
2088 * 2. We limit memory per socket 2088 * 2. We limit memory per socket
2089 * 2089 *
2090 * RFC 1122: 2090 * RFC 1122:
2091 * "the suggested [SWS] avoidance algorithm for the receiver is to keep 2091 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
2092 * RECV.NEXT + RCV.WIN fixed until: 2092 * RECV.NEXT + RCV.WIN fixed until:
2093 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" 2093 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
2094 * 2094 *
2095 * i.e. don't raise the right edge of the window until you can raise 2095 * i.e. don't raise the right edge of the window until you can raise
2096 * it at least MSS bytes. 2096 * it at least MSS bytes.
2097 * 2097 *
2098 * Unfortunately, the recommended algorithm breaks header prediction, 2098 * Unfortunately, the recommended algorithm breaks header prediction,
2099 * since header prediction assumes th->window stays fixed. 2099 * since header prediction assumes th->window stays fixed.
2100 * 2100 *
2101 * Strictly speaking, keeping th->window fixed violates the receiver 2101 * Strictly speaking, keeping th->window fixed violates the receiver
2102 * side SWS prevention criteria. The problem is that under this rule 2102 * side SWS prevention criteria. The problem is that under this rule
2103 * a stream of single byte packets will cause the right side of the 2103 * a stream of single byte packets will cause the right side of the
2104 * window to always advance by a single byte. 2104 * window to always advance by a single byte.
2105 * 2105 *
2106 * Of course, if the sender implements sender side SWS prevention 2106 * Of course, if the sender implements sender side SWS prevention
2107 * then this will not be a problem. 2107 * then this will not be a problem.
2108 * 2108 *
2109 * BSD seems to make the following compromise: 2109 * BSD seems to make the following compromise:
2110 * 2110 *
2111 * If the free space is less than the 1/4 of the maximum 2111 * If the free space is less than the 1/4 of the maximum
2112 * space available and the free space is less than 1/2 mss, 2112 * space available and the free space is less than 1/2 mss,
2113 * then set the window to 0. 2113 * then set the window to 0.
2114 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] 2114 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
2115 * Otherwise, just prevent the window from shrinking 2115 * Otherwise, just prevent the window from shrinking
2116 * and from being larger than the largest representable value. 2116 * and from being larger than the largest representable value.
2117 * 2117 *
2118 * This prevents incremental opening of the window in the regime 2118 * This prevents incremental opening of the window in the regime
2119 * where TCP is limited by the speed of the reader side taking 2119 * where TCP is limited by the speed of the reader side taking
2120 * data out of the TCP receive queue. It does nothing about 2120 * data out of the TCP receive queue. It does nothing about
2121 * those cases where the window is constrained on the sender side 2121 * those cases where the window is constrained on the sender side
2122 * because the pipeline is full. 2122 * because the pipeline is full.
2123 * 2123 *
2124 * BSD also seems to "accidentally" limit itself to windows that are a 2124 * BSD also seems to "accidentally" limit itself to windows that are a
2125 * multiple of MSS, at least until the free space gets quite small. 2125 * multiple of MSS, at least until the free space gets quite small.
2126 * This would appear to be a side effect of the mbuf implementation. 2126 * This would appear to be a side effect of the mbuf implementation.
2127 * Combining these two algorithms results in the observed behavior 2127 * Combining these two algorithms results in the observed behavior
2128 * of having a fixed window size at almost all times. 2128 * of having a fixed window size at almost all times.
2129 * 2129 *
2130 * Below we obtain similar behavior by forcing the offered window to 2130 * Below we obtain similar behavior by forcing the offered window to
2131 * a multiple of the mss when it is feasible to do so. 2131 * a multiple of the mss when it is feasible to do so.
2132 * 2132 *
2133 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. 2133 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
2134 * Regular options like TIMESTAMP are taken into account. 2134 * Regular options like TIMESTAMP are taken into account.
2135 */ 2135 */
2136 u32 __tcp_select_window(struct sock *sk) 2136 u32 __tcp_select_window(struct sock *sk)
2137 { 2137 {
2138 struct inet_connection_sock *icsk = inet_csk(sk); 2138 struct inet_connection_sock *icsk = inet_csk(sk);
2139 struct tcp_sock *tp = tcp_sk(sk); 2139 struct tcp_sock *tp = tcp_sk(sk);
2140 /* MSS for the peer's data. Previous versions used mss_clamp 2140 /* MSS for the peer's data. Previous versions used mss_clamp
2141 * here. I don't know if the value based on our guesses 2141 * here. I don't know if the value based on our guesses
2142 * of peer's MSS is better for the performance. It's more correct 2142 * of peer's MSS is better for the performance. It's more correct
2143 * but may be worse for the performance because of rcv_mss 2143 * but may be worse for the performance because of rcv_mss
2144 * fluctuations. --SAW 1998/11/1 2144 * fluctuations. --SAW 1998/11/1
2145 */ 2145 */
2146 int mss = icsk->icsk_ack.rcv_mss; 2146 int mss = icsk->icsk_ack.rcv_mss;
2147 int free_space = tcp_space(sk); 2147 int free_space = tcp_space(sk);
2148 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); 2148 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
2149 int window; 2149 int window;
2150 2150
2151 if (mss > full_space) 2151 if (mss > full_space)
2152 mss = full_space; 2152 mss = full_space;
2153 2153
2154 if (free_space < (full_space >> 1)) { 2154 if (free_space < (full_space >> 1)) {
2155 icsk->icsk_ack.quick = 0; 2155 icsk->icsk_ack.quick = 0;
2156 2156
2157 if (sk_under_memory_pressure(sk)) 2157 if (sk_under_memory_pressure(sk))
2158 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 2158 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2159 4U * tp->advmss); 2159 4U * tp->advmss);
2160 2160
2161 if (free_space < mss) 2161 if (free_space < mss)
2162 return 0; 2162 return 0;
2163 } 2163 }
2164 2164
2165 if (free_space > tp->rcv_ssthresh) 2165 if (free_space > tp->rcv_ssthresh)
2166 free_space = tp->rcv_ssthresh; 2166 free_space = tp->rcv_ssthresh;
2167 2167
2168 /* Don't do rounding if we are using window scaling, since the 2168 /* Don't do rounding if we are using window scaling, since the
2169 * scaled window will not line up with the MSS boundary anyway. 2169 * scaled window will not line up with the MSS boundary anyway.
2170 */ 2170 */
2171 window = tp->rcv_wnd; 2171 window = tp->rcv_wnd;
2172 if (tp->rx_opt.rcv_wscale) { 2172 if (tp->rx_opt.rcv_wscale) {
2173 window = free_space; 2173 window = free_space;
2174 2174
2175 /* Advertise enough space so that it won't get scaled away. 2175 /* Advertise enough space so that it won't get scaled away.
2176 * Important case: prevent zero window announcement if 2176 * Important case: prevent zero window announcement if
2177 * 1<<rcv_wscale > mss. 2177 * 1<<rcv_wscale > mss.
2178 */ 2178 */
2179 if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) 2179 if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
2180 window = (((window >> tp->rx_opt.rcv_wscale) + 1) 2180 window = (((window >> tp->rx_opt.rcv_wscale) + 1)
2181 << tp->rx_opt.rcv_wscale); 2181 << tp->rx_opt.rcv_wscale);
2182 } else { 2182 } else {
2183 /* Get the largest window that is a nice multiple of mss. 2183 /* Get the largest window that is a nice multiple of mss.
2184 * Window clamp already applied above. 2184 * Window clamp already applied above.
2185 * If our current window offering is within 1 mss of the 2185 * If our current window offering is within 1 mss of the
2186 * free space we just keep it. This prevents the divide 2186 * free space we just keep it. This prevents the divide
2187 * and multiply from happening most of the time. 2187 * and multiply from happening most of the time.
2188 * We also don't do any window rounding when the free space 2188 * We also don't do any window rounding when the free space
2189 * is too small. 2189 * is too small.
2190 */ 2190 */
2191 if (window <= free_space - mss || window > free_space) 2191 if (window <= free_space - mss || window > free_space)
2192 window = (free_space / mss) * mss; 2192 window = (free_space / mss) * mss;
2193 else if (mss == full_space && 2193 else if (mss == full_space &&
2194 free_space > window + (full_space >> 1)) 2194 free_space > window + (full_space >> 1))
2195 window = free_space; 2195 window = free_space;
2196 } 2196 }
2197 2197
2198 return window; 2198 return window;
2199 } 2199 }
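The two rounding branches at the end of __tcp_select_window() above behave quite differently: without window scaling the offer is rounded down to a multiple of the MSS unless the current offer is already within one MSS of free space, while with scaling the offer is rounded up to a multiple of 1 << rcv_wscale so a small window is not truncated to zero when advertised. Here is a user-space sketch with illustrative values (it omits the mss == full_space special case).

#include <stdio.h>

int main(void)
{
	int mss = 1460, free_space = 10000, cur_window = 5840, wscale = 7;
	int window;

	/* No window scaling: keep the current offer if it is within one MSS of
	 * free space, otherwise round free space down to a multiple of MSS. */
	window = cur_window;
	if (window <= free_space - mss || window > free_space)
		window = (free_space / mss) * mss;
	printf("unscaled offer: %d\n", window);

	/* Window scaling: round up to a multiple of 1 << wscale so the shifted
	 * advertisement of a small window does not become zero. */
	window = free_space;
	if (((window >> wscale) << wscale) != window)
		window = ((window >> wscale) + 1) << wscale;
	printf("scaled offer: %d (advertised as %d)\n", window, window >> wscale);
	return 0;
}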
2200 2200
2201 /* Collapses two adjacent SKB's during retransmission. */ 2201 /* Collapses two adjacent SKB's during retransmission. */
2202 static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) 2202 static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2203 { 2203 {
2204 struct tcp_sock *tp = tcp_sk(sk); 2204 struct tcp_sock *tp = tcp_sk(sk);
2205 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 2205 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
2206 int skb_size, next_skb_size; 2206 int skb_size, next_skb_size;
2207 2207
2208 skb_size = skb->len; 2208 skb_size = skb->len;
2209 next_skb_size = next_skb->len; 2209 next_skb_size = next_skb->len;
2210 2210
2211 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 2211 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
2212 2212
2213 tcp_highest_sack_combine(sk, next_skb, skb); 2213 tcp_highest_sack_combine(sk, next_skb, skb);
2214 2214
2215 tcp_unlink_write_queue(next_skb, sk); 2215 tcp_unlink_write_queue(next_skb, sk);
2216 2216
2217 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), 2217 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
2218 next_skb_size); 2218 next_skb_size);
2219 2219
2220 if (next_skb->ip_summed == CHECKSUM_PARTIAL) 2220 if (next_skb->ip_summed == CHECKSUM_PARTIAL)
2221 skb->ip_summed = CHECKSUM_PARTIAL; 2221 skb->ip_summed = CHECKSUM_PARTIAL;
2222 2222
2223 if (skb->ip_summed != CHECKSUM_PARTIAL) 2223 if (skb->ip_summed != CHECKSUM_PARTIAL)
2224 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); 2224 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
2225 2225
2226 /* Update sequence range on original skb. */ 2226 /* Update sequence range on original skb. */
2227 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; 2227 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2228 2228
2229 /* Merge over control information. This moves PSH/FIN etc. over */ 2229 /* Merge over control information. This moves PSH/FIN etc. over */
2230 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; 2230 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
2231 2231
2232 /* All done, get rid of second SKB and account for it so 2232 /* All done, get rid of second SKB and account for it so
2233 * packet counting does not break. 2233 * packet counting does not break.
2234 */ 2234 */
2235 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; 2235 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
2236 2236
2237 /* changed transmit queue under us so clear hints */ 2237 /* changed transmit queue under us so clear hints */
2238 tcp_clear_retrans_hints_partial(tp); 2238 tcp_clear_retrans_hints_partial(tp);
2239 if (next_skb == tp->retransmit_skb_hint) 2239 if (next_skb == tp->retransmit_skb_hint)
2240 tp->retransmit_skb_hint = skb; 2240 tp->retransmit_skb_hint = skb;
2241 2241
2242 tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); 2242 tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
2243 2243
2244 sk_wmem_free_skb(sk, next_skb); 2244 sk_wmem_free_skb(sk, next_skb);
2245 } 2245 }
2246 2246
2247 /* Check if coalescing SKBs is legal. */ 2247 /* Check if coalescing SKBs is legal. */
2248 static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) 2248 static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2249 { 2249 {
2250 if (tcp_skb_pcount(skb) > 1) 2250 if (tcp_skb_pcount(skb) > 1)
2251 return false; 2251 return false;
2252 /* TODO: SACK collapsing could be used to remove this condition */ 2252 /* TODO: SACK collapsing could be used to remove this condition */
2253 if (skb_shinfo(skb)->nr_frags != 0) 2253 if (skb_shinfo(skb)->nr_frags != 0)
2254 return false; 2254 return false;
2255 if (skb_cloned(skb)) 2255 if (skb_cloned(skb))
2256 return false; 2256 return false;
2257 if (skb == tcp_send_head(sk)) 2257 if (skb == tcp_send_head(sk))
2258 return false; 2258 return false;
2259 /* Some heuristics for collapsing over SACK'd could be invented */ 2259 /* Some heuristics for collapsing over SACK'd could be invented */
2260 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 2260 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2261 return false; 2261 return false;
2262 2262
2263 return true; 2263 return true;
2264 } 2264 }
2265 2265
2266 /* Collapse packets in the retransmit queue to create fewer 2266 /* Collapse packets in the retransmit queue to create fewer
2267 * packets on the wire. This is only done on retransmission. 2267 * packets on the wire. This is only done on retransmission.
2268 */ 2268 */
2269 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, 2269 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2270 int space) 2270 int space)
2271 { 2271 {
2272 struct tcp_sock *tp = tcp_sk(sk); 2272 struct tcp_sock *tp = tcp_sk(sk);
2273 struct sk_buff *skb = to, *tmp; 2273 struct sk_buff *skb = to, *tmp;
2274 bool first = true; 2274 bool first = true;
2275 2275
2276 if (!sysctl_tcp_retrans_collapse) 2276 if (!sysctl_tcp_retrans_collapse)
2277 return; 2277 return;
2278 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) 2278 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2279 return; 2279 return;
2280 2280
2281 tcp_for_write_queue_from_safe(skb, tmp, sk) { 2281 tcp_for_write_queue_from_safe(skb, tmp, sk) {
2282 if (!tcp_can_collapse(sk, skb)) 2282 if (!tcp_can_collapse(sk, skb))
2283 break; 2283 break;
2284 2284
2285 space -= skb->len; 2285 space -= skb->len;
2286 2286
2287 if (first) { 2287 if (first) {
2288 first = false; 2288 first = false;
2289 continue; 2289 continue;
2290 } 2290 }
2291 2291
2292 if (space < 0) 2292 if (space < 0)
2293 break; 2293 break;
2294 /* Punt if not enough space exists in the first SKB for 2294 /* Punt if not enough space exists in the first SKB for
2295 * the data in the second 2295 * the data in the second
2296 */ 2296 */
2297 if (skb->len > skb_availroom(to)) 2297 if (skb->len > skb_availroom(to))
2298 break; 2298 break;
2299 2299
2300 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) 2300 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
2301 break; 2301 break;
2302 2302
2303 tcp_collapse_retrans(sk, to); 2303 tcp_collapse_retrans(sk, to);
2304 } 2304 }
2305 } 2305 }
2306 2306
2307 /* This retransmits one SKB. Policy decisions and retransmit queue 2307 /* This retransmits one SKB. Policy decisions and retransmit queue
2308 * state updates are done by the caller. Returns non-zero if an 2308 * state updates are done by the caller. Returns non-zero if an
2309 * error occurred which prevented the send. 2309 * error occurred which prevented the send.
2310 */ 2310 */
2311 int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 2311 int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2312 { 2312 {
2313 struct tcp_sock *tp = tcp_sk(sk); 2313 struct tcp_sock *tp = tcp_sk(sk);
2314 struct inet_connection_sock *icsk = inet_csk(sk); 2314 struct inet_connection_sock *icsk = inet_csk(sk);
2315 unsigned int cur_mss; 2315 unsigned int cur_mss;
2316 2316
2317 /* Inconclusive MTU probe */ 2317 /* Inconclusive MTU probe */
2318 if (icsk->icsk_mtup.probe_size) { 2318 if (icsk->icsk_mtup.probe_size) {
2319 icsk->icsk_mtup.probe_size = 0; 2319 icsk->icsk_mtup.probe_size = 0;
2320 } 2320 }
2321 2321
2322 /* Do not send more than we queued. 1/4 is reserved for possible 2322 /* Do not send more than we queued. 1/4 is reserved for possible
2323 * copying overhead: fragmentation, tunneling, mangling etc. 2323 * copying overhead: fragmentation, tunneling, mangling etc.
2324 */ 2324 */
2325 if (atomic_read(&sk->sk_wmem_alloc) > 2325 if (atomic_read(&sk->sk_wmem_alloc) >
2326 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) 2326 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
2327 return -EAGAIN; 2327 return -EAGAIN;
2328 2328
2329 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { 2329 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2330 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) 2330 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
2331 BUG(); 2331 BUG();
2332 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 2332 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
2333 return -ENOMEM; 2333 return -ENOMEM;
2334 } 2334 }
2335 2335
2336 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) 2336 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
2337 return -EHOSTUNREACH; /* Routing failure or similar. */ 2337 return -EHOSTUNREACH; /* Routing failure or similar. */
2338 2338
2339 cur_mss = tcp_current_mss(sk); 2339 cur_mss = tcp_current_mss(sk);
2340 2340
2341 /* If receiver has shrunk his window, and skb is out of 2341 /* If receiver has shrunk his window, and skb is out of
2342 * new window, do not retransmit it. The exception is the 2342 * new window, do not retransmit it. The exception is the
2343 * case when the window is shrunk to zero. In this case 2343 * case when the window is shrunk to zero. In this case
2344 * our retransmit serves as a zero window probe. 2344 * our retransmit serves as a zero window probe.
2345 */ 2345 */
2346 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && 2346 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
2347 TCP_SKB_CB(skb)->seq != tp->snd_una) 2347 TCP_SKB_CB(skb)->seq != tp->snd_una)
2348 return -EAGAIN; 2348 return -EAGAIN;
2349 2349
2350 if (skb->len > cur_mss) { 2350 if (skb->len > cur_mss) {
2351 if (tcp_fragment(sk, skb, cur_mss, cur_mss)) 2351 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
2352 return -ENOMEM; /* We'll try again later. */ 2352 return -ENOMEM; /* We'll try again later. */
2353 } else { 2353 } else {
2354 int oldpcount = tcp_skb_pcount(skb); 2354 int oldpcount = tcp_skb_pcount(skb);
2355 2355
2356 if (unlikely(oldpcount > 1)) { 2356 if (unlikely(oldpcount > 1)) {
2357 if (skb_unclone(skb, GFP_ATOMIC)) 2357 if (skb_unclone(skb, GFP_ATOMIC))
2358 return -ENOMEM; 2358 return -ENOMEM;
2359 tcp_init_tso_segs(sk, skb, cur_mss); 2359 tcp_init_tso_segs(sk, skb, cur_mss);
2360 tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); 2360 tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
2361 } 2361 }
2362 } 2362 }
2363 2363
2364 tcp_retrans_try_collapse(sk, skb, cur_mss); 2364 tcp_retrans_try_collapse(sk, skb, cur_mss);
2365 2365
2366 /* Make a copy if the first transmission SKB clone we made 2366 /* Make a copy if the first transmission SKB clone we made
2367 * is still in somebody's hands; otherwise make a clone. 2367 * is still in somebody's hands; otherwise make a clone.
2368 */ 2368 */
2369 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2369 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2370 2370
2371 /* make sure skb->data is aligned on arches that require it 2371 /* make sure skb->data is aligned on arches that require it
2372 * and check if ack-trimming & collapsing extended the headroom 2372 * and check if ack-trimming & collapsing extended the headroom
2373 * beyond what csum_start can cover. 2373 * beyond what csum_start can cover.
2374 */ 2374 */
2375 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || 2375 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
2376 skb_headroom(skb) >= 0xFFFF)) { 2376 skb_headroom(skb) >= 0xFFFF)) {
2377 struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, 2377 struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
2378 GFP_ATOMIC); 2378 GFP_ATOMIC);
2379 return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : 2379 return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2380 -ENOBUFS; 2380 -ENOBUFS;
2381 } else { 2381 } else {
2382 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2382 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2383 } 2383 }
2384 } 2384 }
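
For reference, a minimal userspace sketch (not kernel code) of the send-buffer budget check at the top of __tcp_retransmit_skb(): a retransmit is allowed only while allocated write memory stays under min(queued + queued/4, sndbuf). The helper name and the byte values are illustrative assumptions.

#include <stdbool.h>
#include <stdio.h>

static bool retransmit_within_budget(unsigned int wmem_alloc,
				     unsigned int wmem_queued,
				     unsigned int sndbuf)
{
	/* 1/4 of the queued bytes is reserved for copying overhead. */
	unsigned int limit = wmem_queued + (wmem_queued >> 2);

	if (limit > sndbuf)
		limit = sndbuf;
	return wmem_alloc <= limit;
}

int main(void)
{
	/* 100 KB queued, 160 KB send buffer => limit is 125 KB. */
	printf("%d\n", retransmit_within_budget(120 * 1024, 100 * 1024, 160 * 1024)); /* 1 */
	printf("%d\n", retransmit_within_budget(130 * 1024, 100 * 1024, 160 * 1024)); /* 0 */
	return 0;
}
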
2385 2385
2386 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 2386 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2387 { 2387 {
2388 struct tcp_sock *tp = tcp_sk(sk); 2388 struct tcp_sock *tp = tcp_sk(sk);
2389 int err = __tcp_retransmit_skb(sk, skb); 2389 int err = __tcp_retransmit_skb(sk, skb);
2390 2390
2391 if (err == 0) { 2391 if (err == 0) {
2392 /* Update global TCP statistics. */ 2392 /* Update global TCP statistics. */
2393 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); 2393 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
2394 2394
2395 tp->total_retrans++; 2395 tp->total_retrans++;
2396 2396
2397 #if FASTRETRANS_DEBUG > 0 2397 #if FASTRETRANS_DEBUG > 0
2398 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { 2398 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2399 net_dbg_ratelimited("retrans_out leaked\n"); 2399 net_dbg_ratelimited("retrans_out leaked\n");
2400 } 2400 }
2401 #endif 2401 #endif
2402 if (!tp->retrans_out) 2402 if (!tp->retrans_out)
2403 tp->lost_retrans_low = tp->snd_nxt; 2403 tp->lost_retrans_low = tp->snd_nxt;
2404 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; 2404 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
2405 tp->retrans_out += tcp_skb_pcount(skb); 2405 tp->retrans_out += tcp_skb_pcount(skb);
2406 2406
2407 /* Save stamp of the first retransmit. */ 2407 /* Save stamp of the first retransmit. */
2408 if (!tp->retrans_stamp) 2408 if (!tp->retrans_stamp)
2409 tp->retrans_stamp = TCP_SKB_CB(skb)->when; 2409 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
2410 2410
2411 tp->undo_retrans += tcp_skb_pcount(skb); 2411 tp->undo_retrans += tcp_skb_pcount(skb);
2412 2412
2413 /* snd_nxt is stored to detect loss of retransmitted segment, 2413 /* snd_nxt is stored to detect loss of retransmitted segment,
2414 * see tcp_input.c tcp_sacktag_write_queue(). 2414 * see tcp_input.c tcp_sacktag_write_queue().
2415 */ 2415 */
2416 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; 2416 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
2417 } else { 2417 } else {
2418 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); 2418 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2419 } 2419 }
2420 return err; 2420 return err;
2421 } 2421 }
2422 2422
2423 /* Check if forward retransmits are possible in the current 2423 /* Check if forward retransmits are possible in the current
2424 * window/congestion state. 2424 * window/congestion state.
2425 */ 2425 */
2426 static bool tcp_can_forward_retransmit(struct sock *sk) 2426 static bool tcp_can_forward_retransmit(struct sock *sk)
2427 { 2427 {
2428 const struct inet_connection_sock *icsk = inet_csk(sk); 2428 const struct inet_connection_sock *icsk = inet_csk(sk);
2429 const struct tcp_sock *tp = tcp_sk(sk); 2429 const struct tcp_sock *tp = tcp_sk(sk);
2430 2430
2431 /* Forward retransmissions are possible only during Recovery. */ 2431 /* Forward retransmissions are possible only during Recovery. */
2432 if (icsk->icsk_ca_state != TCP_CA_Recovery) 2432 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2433 return false; 2433 return false;
2434 2434
2435 /* No forward retransmissions in Reno are possible. */ 2435 /* No forward retransmissions in Reno are possible. */
2436 if (tcp_is_reno(tp)) 2436 if (tcp_is_reno(tp))
2437 return false; 2437 return false;
2438 2438
2439 /* Yeah, we have to make a difficult choice between forward transmission 2439 /* Yeah, we have to make a difficult choice between forward transmission
2440 * and retransmission... Both ways have their merits... 2440 * and retransmission... Both ways have their merits...
2441 * 2441 *
2442 * For now we do not retransmit anything, while we have some new 2442 * For now we do not retransmit anything, while we have some new
2443 * segments to send. In the other cases, follow rule 3 for 2443 * segments to send. In the other cases, follow rule 3 for
2444 * NextSeg() specified in RFC3517. 2444 * NextSeg() specified in RFC3517.
2445 */ 2445 */
2446 2446
2447 if (tcp_may_send_now(sk)) 2447 if (tcp_may_send_now(sk))
2448 return false; 2448 return false;
2449 2449
2450 return true; 2450 return true;
2451 } 2451 }
2452 2452
2453 /* This gets called after a retransmit timeout, and the initially 2453 /* This gets called after a retransmit timeout, and the initially
2454 * retransmitted data is acknowledged. It tries to continue 2454 * retransmitted data is acknowledged. It tries to continue
2455 * resending the rest of the retransmit queue, until either 2455 * resending the rest of the retransmit queue, until either
2456 * we've sent it all or the congestion window limit is reached. 2456 * we've sent it all or the congestion window limit is reached.
2457 * If doing SACK, the first ACK which comes back for a timeout 2457 * If doing SACK, the first ACK which comes back for a timeout
2458 * based retransmit packet might feed us FACK information again. 2458 * based retransmit packet might feed us FACK information again.
2459 * If so, we use it to avoid unnecessary retransmissions. 2459 * If so, we use it to avoid unnecessary retransmissions.
2460 */ 2460 */
2461 void tcp_xmit_retransmit_queue(struct sock *sk) 2461 void tcp_xmit_retransmit_queue(struct sock *sk)
2462 { 2462 {
2463 const struct inet_connection_sock *icsk = inet_csk(sk); 2463 const struct inet_connection_sock *icsk = inet_csk(sk);
2464 struct tcp_sock *tp = tcp_sk(sk); 2464 struct tcp_sock *tp = tcp_sk(sk);
2465 struct sk_buff *skb; 2465 struct sk_buff *skb;
2466 struct sk_buff *hole = NULL; 2466 struct sk_buff *hole = NULL;
2467 u32 last_lost; 2467 u32 last_lost;
2468 int mib_idx; 2468 int mib_idx;
2469 int fwd_rexmitting = 0; 2469 int fwd_rexmitting = 0;
2470 2470
2471 if (!tp->packets_out) 2471 if (!tp->packets_out)
2472 return; 2472 return;
2473 2473
2474 if (!tp->lost_out) 2474 if (!tp->lost_out)
2475 tp->retransmit_high = tp->snd_una; 2475 tp->retransmit_high = tp->snd_una;
2476 2476
2477 if (tp->retransmit_skb_hint) { 2477 if (tp->retransmit_skb_hint) {
2478 skb = tp->retransmit_skb_hint; 2478 skb = tp->retransmit_skb_hint;
2479 last_lost = TCP_SKB_CB(skb)->end_seq; 2479 last_lost = TCP_SKB_CB(skb)->end_seq;
2480 if (after(last_lost, tp->retransmit_high)) 2480 if (after(last_lost, tp->retransmit_high))
2481 last_lost = tp->retransmit_high; 2481 last_lost = tp->retransmit_high;
2482 } else { 2482 } else {
2483 skb = tcp_write_queue_head(sk); 2483 skb = tcp_write_queue_head(sk);
2484 last_lost = tp->snd_una; 2484 last_lost = tp->snd_una;
2485 } 2485 }
2486 2486
2487 tcp_for_write_queue_from(skb, sk) { 2487 tcp_for_write_queue_from(skb, sk) {
2488 __u8 sacked = TCP_SKB_CB(skb)->sacked; 2488 __u8 sacked = TCP_SKB_CB(skb)->sacked;
2489 2489
2490 if (skb == tcp_send_head(sk)) 2490 if (skb == tcp_send_head(sk))
2491 break; 2491 break;
2492 /* we could do better than to assign each time */ 2492 /* we could do better than to assign each time */
2493 if (hole == NULL) 2493 if (hole == NULL)
2494 tp->retransmit_skb_hint = skb; 2494 tp->retransmit_skb_hint = skb;
2495 2495
2496 /* Assume this retransmit will generate 2496 /* Assume this retransmit will generate
2497 * only one packet for congestion window 2497 * only one packet for congestion window
2498 * calculation purposes. This works because 2498 * calculation purposes. This works because
2499 * tcp_retransmit_skb() will chop up the 2499 * tcp_retransmit_skb() will chop up the
2500 * packet to be MSS sized and all the 2500 * packet to be MSS sized and all the
2501 * packet counting works out. 2501 * packet counting works out.
2502 */ 2502 */
2503 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) 2503 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2504 return; 2504 return;
2505 2505
2506 if (fwd_rexmitting) { 2506 if (fwd_rexmitting) {
2507 begin_fwd: 2507 begin_fwd:
2508 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) 2508 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2509 break; 2509 break;
2510 mib_idx = LINUX_MIB_TCPFORWARDRETRANS; 2510 mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
2511 2511
2512 } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { 2512 } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2513 tp->retransmit_high = last_lost; 2513 tp->retransmit_high = last_lost;
2514 if (!tcp_can_forward_retransmit(sk)) 2514 if (!tcp_can_forward_retransmit(sk))
2515 break; 2515 break;
2516 /* Backtrack if necessary to non-L'ed skb */ 2516 /* Backtrack if necessary to non-L'ed skb */
2517 if (hole != NULL) { 2517 if (hole != NULL) {
2518 skb = hole; 2518 skb = hole;
2519 hole = NULL; 2519 hole = NULL;
2520 } 2520 }
2521 fwd_rexmitting = 1; 2521 fwd_rexmitting = 1;
2522 goto begin_fwd; 2522 goto begin_fwd;
2523 2523
2524 } else if (!(sacked & TCPCB_LOST)) { 2524 } else if (!(sacked & TCPCB_LOST)) {
2525 if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) 2525 if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2526 hole = skb; 2526 hole = skb;
2527 continue; 2527 continue;
2528 2528
2529 } else { 2529 } else {
2530 last_lost = TCP_SKB_CB(skb)->end_seq; 2530 last_lost = TCP_SKB_CB(skb)->end_seq;
2531 if (icsk->icsk_ca_state != TCP_CA_Loss) 2531 if (icsk->icsk_ca_state != TCP_CA_Loss)
2532 mib_idx = LINUX_MIB_TCPFASTRETRANS; 2532 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2533 else 2533 else
2534 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; 2534 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
2535 } 2535 }
2536 2536
2537 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) 2537 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2538 continue; 2538 continue;
2539 2539
2540 if (tcp_retransmit_skb(sk, skb)) 2540 if (tcp_retransmit_skb(sk, skb))
2541 return; 2541 return;
2542 2542
2543 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2543 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2544 2544
2545 if (tcp_in_cwnd_reduction(sk)) 2545 if (tcp_in_cwnd_reduction(sk))
2546 tp->prr_out += tcp_skb_pcount(skb); 2546 tp->prr_out += tcp_skb_pcount(skb);
2547 2547
2548 if (skb == tcp_write_queue_head(sk)) 2548 if (skb == tcp_write_queue_head(sk))
2549 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2549 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2550 inet_csk(sk)->icsk_rto, 2550 inet_csk(sk)->icsk_rto,
2551 TCP_RTO_MAX); 2551 TCP_RTO_MAX);
2552 } 2552 }
2553 } 2553 }
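
A rough sketch of the congestion-window gate inside the loop above: tcp_xmit_retransmit_queue() keeps retransmitting only while the packets already in flight stay below snd_cwnd. The numbers below are illustrative, not taken from the code.

#include <stdio.h>

int main(void)
{
	unsigned int snd_cwnd = 10;	/* assumed congestion window (packets) */
	unsigned int in_flight = 7;	/* assumed packets currently outstanding */
	unsigned int lost = 6;		/* segments marked lost and eligible */
	unsigned int sent;

	/* Each retransmit is assumed to add one packet in flight. */
	for (sent = 0; sent < lost && in_flight < snd_cwnd; sent++)
		in_flight++;

	printf("retransmitted %u of %u lost segments\n", sent, lost); /* 3 of 6 */
	return 0;
}
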
2554 2554
2555 /* Send a fin. The caller locks the socket for us. This cannot be 2555 /* Send a fin. The caller locks the socket for us. This cannot be
2556 * allowed to fail queueing a FIN frame under any circumstances. 2556 * allowed to fail queueing a FIN frame under any circumstances.
2557 */ 2557 */
2558 void tcp_send_fin(struct sock *sk) 2558 void tcp_send_fin(struct sock *sk)
2559 { 2559 {
2560 struct tcp_sock *tp = tcp_sk(sk); 2560 struct tcp_sock *tp = tcp_sk(sk);
2561 struct sk_buff *skb = tcp_write_queue_tail(sk); 2561 struct sk_buff *skb = tcp_write_queue_tail(sk);
2562 int mss_now; 2562 int mss_now;
2563 2563
2564 /* Optimization, tack on the FIN if we have a queue of 2564 /* Optimization, tack on the FIN if we have a queue of
2565 * unsent frames. But be careful about outgoing SACKS 2565 * unsent frames. But be careful about outgoing SACKS
2566 * and IP options. 2566 * and IP options.
2567 */ 2567 */
2568 mss_now = tcp_current_mss(sk); 2568 mss_now = tcp_current_mss(sk);
2569 2569
2570 if (tcp_send_head(sk) != NULL) { 2570 if (tcp_send_head(sk) != NULL) {
2571 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; 2571 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
2572 TCP_SKB_CB(skb)->end_seq++; 2572 TCP_SKB_CB(skb)->end_seq++;
2573 tp->write_seq++; 2573 tp->write_seq++;
2574 } else { 2574 } else {
2575 /* Socket is locked, keep trying until memory is available. */ 2575 /* Socket is locked, keep trying until memory is available. */
2576 for (;;) { 2576 for (;;) {
2577 skb = alloc_skb_fclone(MAX_TCP_HEADER, 2577 skb = alloc_skb_fclone(MAX_TCP_HEADER,
2578 sk->sk_allocation); 2578 sk->sk_allocation);
2579 if (skb) 2579 if (skb)
2580 break; 2580 break;
2581 yield(); 2581 yield();
2582 } 2582 }
2583 2583
2584 /* Reserve space for headers and prepare control bits. */ 2584 /* Reserve space for headers and prepare control bits. */
2585 skb_reserve(skb, MAX_TCP_HEADER); 2585 skb_reserve(skb, MAX_TCP_HEADER);
2586 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ 2586 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2587 tcp_init_nondata_skb(skb, tp->write_seq, 2587 tcp_init_nondata_skb(skb, tp->write_seq,
2588 TCPHDR_ACK | TCPHDR_FIN); 2588 TCPHDR_ACK | TCPHDR_FIN);
2589 tcp_queue_skb(sk, skb); 2589 tcp_queue_skb(sk, skb);
2590 } 2590 }
2591 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); 2591 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
2592 } 2592 }
2593 2593
2594 /* We get here when a process closes a file descriptor (either due to 2594 /* We get here when a process closes a file descriptor (either due to
2595 * an explicit close() or as a byproduct of exit()'ing) and there 2595 * an explicit close() or as a byproduct of exit()'ing) and there
2596 * was unread data in the receive queue. This behavior is recommended 2596 * was unread data in the receive queue. This behavior is recommended
2597 * by RFC 2525, section 2.17. -DaveM 2597 * by RFC 2525, section 2.17. -DaveM
2598 */ 2598 */
2599 void tcp_send_active_reset(struct sock *sk, gfp_t priority) 2599 void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2600 { 2600 {
2601 struct sk_buff *skb; 2601 struct sk_buff *skb;
2602 2602
2603 /* NOTE: No TCP options attached and we never retransmit this. */ 2603 /* NOTE: No TCP options attached and we never retransmit this. */
2604 skb = alloc_skb(MAX_TCP_HEADER, priority); 2604 skb = alloc_skb(MAX_TCP_HEADER, priority);
2605 if (!skb) { 2605 if (!skb) {
2606 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); 2606 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2607 return; 2607 return;
2608 } 2608 }
2609 2609
2610 /* Reserve space for headers and prepare control bits. */ 2610 /* Reserve space for headers and prepare control bits. */
2611 skb_reserve(skb, MAX_TCP_HEADER); 2611 skb_reserve(skb, MAX_TCP_HEADER);
2612 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), 2612 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2613 TCPHDR_ACK | TCPHDR_RST); 2613 TCPHDR_ACK | TCPHDR_RST);
2614 /* Send it off. */ 2614 /* Send it off. */
2615 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2615 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2616 if (tcp_transmit_skb(sk, skb, 0, priority)) 2616 if (tcp_transmit_skb(sk, skb, 0, priority))
2617 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); 2617 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2618 2618
2619 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); 2619 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
2620 } 2620 }
2621 2621
2622 /* Send a crossed SYN-ACK during socket establishment. 2622 /* Send a crossed SYN-ACK during socket establishment.
2623 * WARNING: This routine must only be called when we have already sent 2623 * WARNING: This routine must only be called when we have already sent
2624 * a SYN packet that crossed the incoming SYN that caused this routine 2624 * a SYN packet that crossed the incoming SYN that caused this routine
2625 * to get called. If this assumption fails then the initial rcv_wnd 2625 * to get called. If this assumption fails then the initial rcv_wnd
2626 * and rcv_wscale values will not be correct. 2626 * and rcv_wscale values will not be correct.
2627 */ 2627 */
2628 int tcp_send_synack(struct sock *sk) 2628 int tcp_send_synack(struct sock *sk)
2629 { 2629 {
2630 struct sk_buff *skb; 2630 struct sk_buff *skb;
2631 2631
2632 skb = tcp_write_queue_head(sk); 2632 skb = tcp_write_queue_head(sk);
2633 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 2633 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2634 pr_debug("%s: wrong queue state\n", __func__); 2634 pr_debug("%s: wrong queue state\n", __func__);
2635 return -EFAULT; 2635 return -EFAULT;
2636 } 2636 }
2637 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { 2637 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
2638 if (skb_cloned(skb)) { 2638 if (skb_cloned(skb)) {
2639 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 2639 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2640 if (nskb == NULL) 2640 if (nskb == NULL)
2641 return -ENOMEM; 2641 return -ENOMEM;
2642 tcp_unlink_write_queue(skb, sk); 2642 tcp_unlink_write_queue(skb, sk);
2643 skb_header_release(nskb); 2643 skb_header_release(nskb);
2644 __tcp_add_write_queue_head(sk, nskb); 2644 __tcp_add_write_queue_head(sk, nskb);
2645 sk_wmem_free_skb(sk, skb); 2645 sk_wmem_free_skb(sk, skb);
2646 sk->sk_wmem_queued += nskb->truesize; 2646 sk->sk_wmem_queued += nskb->truesize;
2647 sk_mem_charge(sk, nskb->truesize); 2647 sk_mem_charge(sk, nskb->truesize);
2648 skb = nskb; 2648 skb = nskb;
2649 } 2649 }
2650 2650
2651 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2651 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2652 TCP_ECN_send_synack(tcp_sk(sk), skb); 2652 TCP_ECN_send_synack(tcp_sk(sk), skb);
2653 } 2653 }
2654 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2654 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2655 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2655 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2656 } 2656 }
2657 2657
2658 /** 2658 /**
2659 * tcp_make_synack - Prepare a SYN-ACK. 2659 * tcp_make_synack - Prepare a SYN-ACK.
2660 * sk: listener socket 2660 * sk: listener socket
2661 * dst: dst entry attached to the SYNACK 2661 * dst: dst entry attached to the SYNACK
2662 * req: request_sock pointer 2662 * req: request_sock pointer
2663 * 2663 *
2664 * Allocate one skb and build a SYNACK packet. 2664 * Allocate one skb and build a SYNACK packet.
2665 * @dst is consumed : Caller should not use it again. 2665 * @dst is consumed : Caller should not use it again.
2666 */ 2666 */
2667 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2667 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2668 struct request_sock *req, 2668 struct request_sock *req,
2669 struct tcp_fastopen_cookie *foc) 2669 struct tcp_fastopen_cookie *foc)
2670 { 2670 {
2671 struct tcp_out_options opts; 2671 struct tcp_out_options opts;
2672 struct inet_request_sock *ireq = inet_rsk(req); 2672 struct inet_request_sock *ireq = inet_rsk(req);
2673 struct tcp_sock *tp = tcp_sk(sk); 2673 struct tcp_sock *tp = tcp_sk(sk);
2674 struct tcphdr *th; 2674 struct tcphdr *th;
2675 struct sk_buff *skb; 2675 struct sk_buff *skb;
2676 struct tcp_md5sig_key *md5; 2676 struct tcp_md5sig_key *md5;
2677 int tcp_header_size; 2677 int tcp_header_size;
2678 int mss; 2678 int mss;
2679 2679
2680 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2680 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
2681 if (unlikely(!skb)) { 2681 if (unlikely(!skb)) {
2682 dst_release(dst); 2682 dst_release(dst);
2683 return NULL; 2683 return NULL;
2684 } 2684 }
2685 /* Reserve space for headers. */ 2685 /* Reserve space for headers. */
2686 skb_reserve(skb, MAX_TCP_HEADER); 2686 skb_reserve(skb, MAX_TCP_HEADER);
2687 2687
2688 skb_dst_set(skb, dst); 2688 skb_dst_set(skb, dst);
2689 security_skb_owned_by(skb, sk); 2689 security_skb_owned_by(skb, sk);
2690 2690
2691 mss = dst_metric_advmss(dst); 2691 mss = dst_metric_advmss(dst);
2692 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2692 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2693 mss = tp->rx_opt.user_mss; 2693 mss = tp->rx_opt.user_mss;
2694 2694
2695 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 2695 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2696 __u8 rcv_wscale; 2696 __u8 rcv_wscale;
2697 /* Set this up on the first call only */ 2697 /* Set this up on the first call only */
2698 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2698 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2699 2699
2700 /* limit the window selection if the user enforces a smaller rx buffer */ 2700 /* limit the window selection if the user enforces a smaller rx buffer */
2701 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && 2701 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2702 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) 2702 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2703 req->window_clamp = tcp_full_space(sk); 2703 req->window_clamp = tcp_full_space(sk);
2704 2704
2705 /* tcp_full_space because it is guaranteed to be the first packet */ 2705 /* tcp_full_space because it is guaranteed to be the first packet */
2706 tcp_select_initial_window(tcp_full_space(sk), 2706 tcp_select_initial_window(tcp_full_space(sk),
2707 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2707 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2708 &req->rcv_wnd, 2708 &req->rcv_wnd,
2709 &req->window_clamp, 2709 &req->window_clamp,
2710 ireq->wscale_ok, 2710 ireq->wscale_ok,
2711 &rcv_wscale, 2711 &rcv_wscale,
2712 dst_metric(dst, RTAX_INITRWND)); 2712 dst_metric(dst, RTAX_INITRWND));
2713 ireq->rcv_wscale = rcv_wscale; 2713 ireq->rcv_wscale = rcv_wscale;
2714 } 2714 }
2715 2715
2716 memset(&opts, 0, sizeof(opts)); 2716 memset(&opts, 0, sizeof(opts));
2717 #ifdef CONFIG_SYN_COOKIES 2717 #ifdef CONFIG_SYN_COOKIES
2718 if (unlikely(req->cookie_ts)) 2718 if (unlikely(req->cookie_ts))
2719 TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); 2719 TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
2720 else 2720 else
2721 #endif 2721 #endif
2722 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2722 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2723 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, 2723 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
2724 foc) + sizeof(*th); 2724 foc) + sizeof(*th);
2725 2725
2726 skb_push(skb, tcp_header_size); 2726 skb_push(skb, tcp_header_size);
2727 skb_reset_transport_header(skb); 2727 skb_reset_transport_header(skb);
2728 2728
2729 th = tcp_hdr(skb); 2729 th = tcp_hdr(skb);
2730 memset(th, 0, sizeof(struct tcphdr)); 2730 memset(th, 0, sizeof(struct tcphdr));
2731 th->syn = 1; 2731 th->syn = 1;
2732 th->ack = 1; 2732 th->ack = 1;
2733 TCP_ECN_make_synack(req, th); 2733 TCP_ECN_make_synack(req, th);
2734 th->source = htons(ireq->ir_num); 2734 th->source = htons(ireq->ir_num);
2735 th->dest = ireq->ir_rmt_port; 2735 th->dest = ireq->ir_rmt_port;
2736 /* Setting of flags is superfluous here for callers (and ECE is 2736 /* Setting of flags is superfluous here for callers (and ECE is
2737 * not even correctly set) 2737 * not even correctly set)
2738 */ 2738 */
2739 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, 2739 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2740 TCPHDR_SYN | TCPHDR_ACK); 2740 TCPHDR_SYN | TCPHDR_ACK);
2741 2741
2742 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2742 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2743 /* XXX data is queued and acked as is. No buffer/window check */ 2743 /* XXX data is queued and acked as is. No buffer/window check */
2744 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); 2744 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2745 2745
2746 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2746 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2747 th->window = htons(min(req->rcv_wnd, 65535U)); 2747 th->window = htons(min(req->rcv_wnd, 65535U));
2748 tcp_options_write((__be32 *)(th + 1), tp, &opts); 2748 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2749 th->doff = (tcp_header_size >> 2); 2749 th->doff = (tcp_header_size >> 2);
2750 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); 2750 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
2751 2751
2752 #ifdef CONFIG_TCP_MD5SIG 2752 #ifdef CONFIG_TCP_MD5SIG
2753 /* Okay, we have all we need - do the md5 hash if needed */ 2753 /* Okay, we have all we need - do the md5 hash if needed */
2754 if (md5) { 2754 if (md5) {
2755 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, 2755 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
2756 md5, NULL, req, skb); 2756 md5, NULL, req, skb);
2757 } 2757 }
2758 #endif 2758 #endif
2759 2759
2760 return skb; 2760 return skb;
2761 } 2761 }
2762 EXPORT_SYMBOL(tcp_make_synack); 2762 EXPORT_SYMBOL(tcp_make_synack);
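
A small sketch of the RFC 1323 rule applied a few lines above: the window field in a SYN or SYN-ACK is never scaled, so whatever receive window was chosen is clamped to the 16-bit maximum of 65535 before it goes on the wire. The window values used here are made up for illustration.

#include <stdio.h>

static unsigned int synack_window(unsigned int rcv_wnd)
{
	/* SYN and SYN-ACK segments carry an unscaled 16-bit window. */
	return rcv_wnd < 65535U ? rcv_wnd : 65535U;
}

int main(void)
{
	printf("%u\n", synack_window(29200));	/* fits as-is: 29200 */
	printf("%u\n", synack_window(262144));	/* clamped: 65535 */
	return 0;
}
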
2763 2763
2764 /* Do all connect socket setups that can be done AF independent. */ 2764 /* Do all connect socket setups that can be done AF independent. */
2765 void tcp_connect_init(struct sock *sk) 2765 void tcp_connect_init(struct sock *sk)
2766 { 2766 {
2767 const struct dst_entry *dst = __sk_dst_get(sk); 2767 const struct dst_entry *dst = __sk_dst_get(sk);
2768 struct tcp_sock *tp = tcp_sk(sk); 2768 struct tcp_sock *tp = tcp_sk(sk);
2769 __u8 rcv_wscale; 2769 __u8 rcv_wscale;
2770 2770
2771 /* We'll fix this up when we get a response from the other end. 2771 /* We'll fix this up when we get a response from the other end.
2772 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. 2772 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
2773 */ 2773 */
2774 tp->tcp_header_len = sizeof(struct tcphdr) + 2774 tp->tcp_header_len = sizeof(struct tcphdr) +
2775 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); 2775 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
2776 2776
2777 #ifdef CONFIG_TCP_MD5SIG 2777 #ifdef CONFIG_TCP_MD5SIG
2778 if (tp->af_specific->md5_lookup(sk, sk) != NULL) 2778 if (tp->af_specific->md5_lookup(sk, sk) != NULL)
2779 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; 2779 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
2780 #endif 2780 #endif
2781 2781
2782 /* If user gave his TCP_MAXSEG, record it to clamp */ 2782 /* If user gave his TCP_MAXSEG, record it to clamp */
2783 if (tp->rx_opt.user_mss) 2783 if (tp->rx_opt.user_mss)
2784 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; 2784 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2785 tp->max_window = 0; 2785 tp->max_window = 0;
2786 tcp_mtup_init(sk); 2786 tcp_mtup_init(sk);
2787 tcp_sync_mss(sk, dst_mtu(dst)); 2787 tcp_sync_mss(sk, dst_mtu(dst));
2788 2788
2789 if (!tp->window_clamp) 2789 if (!tp->window_clamp)
2790 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2790 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2791 tp->advmss = dst_metric_advmss(dst); 2791 tp->advmss = dst_metric_advmss(dst);
2792 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) 2792 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2793 tp->advmss = tp->rx_opt.user_mss; 2793 tp->advmss = tp->rx_opt.user_mss;
2794 2794
2795 tcp_initialize_rcv_mss(sk); 2795 tcp_initialize_rcv_mss(sk);
2796 2796
2797 /* limit the window selection if the user enforces a smaller rx buffer */ 2797 /* limit the window selection if the user enforces a smaller rx buffer */
2798 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && 2798 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2799 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) 2799 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2800 tp->window_clamp = tcp_full_space(sk); 2800 tp->window_clamp = tcp_full_space(sk);
2801 2801
2802 tcp_select_initial_window(tcp_full_space(sk), 2802 tcp_select_initial_window(tcp_full_space(sk),
2803 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2803 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2804 &tp->rcv_wnd, 2804 &tp->rcv_wnd,
2805 &tp->window_clamp, 2805 &tp->window_clamp,
2806 sysctl_tcp_window_scaling, 2806 sysctl_tcp_window_scaling,
2807 &rcv_wscale, 2807 &rcv_wscale,
2808 dst_metric(dst, RTAX_INITRWND)); 2808 dst_metric(dst, RTAX_INITRWND));
2809 2809
2810 tp->rx_opt.rcv_wscale = rcv_wscale; 2810 tp->rx_opt.rcv_wscale = rcv_wscale;
2811 tp->rcv_ssthresh = tp->rcv_wnd; 2811 tp->rcv_ssthresh = tp->rcv_wnd;
2812 2812
2813 sk->sk_err = 0; 2813 sk->sk_err = 0;
2814 sock_reset_flag(sk, SOCK_DONE); 2814 sock_reset_flag(sk, SOCK_DONE);
2815 tp->snd_wnd = 0; 2815 tp->snd_wnd = 0;
2816 tcp_init_wl(tp, 0); 2816 tcp_init_wl(tp, 0);
2817 tp->snd_una = tp->write_seq; 2817 tp->snd_una = tp->write_seq;
2818 tp->snd_sml = tp->write_seq; 2818 tp->snd_sml = tp->write_seq;
2819 tp->snd_up = tp->write_seq; 2819 tp->snd_up = tp->write_seq;
2820 tp->snd_nxt = tp->write_seq; 2820 tp->snd_nxt = tp->write_seq;
2821 2821
2822 if (likely(!tp->repair)) 2822 if (likely(!tp->repair))
2823 tp->rcv_nxt = 0; 2823 tp->rcv_nxt = 0;
2824 else 2824 else
2825 tp->rcv_tstamp = tcp_time_stamp; 2825 tp->rcv_tstamp = tcp_time_stamp;
2826 tp->rcv_wup = tp->rcv_nxt; 2826 tp->rcv_wup = tp->rcv_nxt;
2827 tp->copied_seq = tp->rcv_nxt; 2827 tp->copied_seq = tp->rcv_nxt;
2828 2828
2829 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 2829 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2830 inet_csk(sk)->icsk_retransmits = 0; 2830 inet_csk(sk)->icsk_retransmits = 0;
2831 tcp_clear_retrans(tp); 2831 tcp_clear_retrans(tp);
2832 } 2832 }
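
For the header-length bookkeeping in tcp_connect_init() above, a rough userspace sketch: 20 bytes of base TCP header, plus 12 bytes when timestamps are enabled and 20 more when an MD5 key is configured (the aligned option sizes used by the kernel). The helper below is purely illustrative.

#include <stdio.h>

#define BASE_TCP_HDR	20	/* sizeof(struct tcphdr) */
#define TSTAMP_ALIGNED	12	/* TCPOLEN_TSTAMP_ALIGNED */
#define MD5SIG_ALIGNED	20	/* TCPOLEN_MD5SIG_ALIGNED */

static int syn_header_len(int timestamps, int md5)
{
	int len = BASE_TCP_HDR;

	if (timestamps)
		len += TSTAMP_ALIGNED;
	if (md5)
		len += MD5SIG_ALIGNED;
	return len;
}

int main(void)
{
	printf("%d\n", syn_header_len(1, 0));	/* 32 */
	printf("%d\n", syn_header_len(1, 1));	/* 52 */
	return 0;
}
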
2833 2833
2834 static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) 2834 static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
2835 { 2835 {
2836 struct tcp_sock *tp = tcp_sk(sk); 2836 struct tcp_sock *tp = tcp_sk(sk);
2837 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 2837 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2838 2838
2839 tcb->end_seq += skb->len; 2839 tcb->end_seq += skb->len;
2840 skb_header_release(skb); 2840 skb_header_release(skb);
2841 __tcp_add_write_queue_tail(sk, skb); 2841 __tcp_add_write_queue_tail(sk, skb);
2842 sk->sk_wmem_queued += skb->truesize; 2842 sk->sk_wmem_queued += skb->truesize;
2843 sk_mem_charge(sk, skb->truesize); 2843 sk_mem_charge(sk, skb->truesize);
2844 tp->write_seq = tcb->end_seq; 2844 tp->write_seq = tcb->end_seq;
2845 tp->packets_out += tcp_skb_pcount(skb); 2845 tp->packets_out += tcp_skb_pcount(skb);
2846 } 2846 }
2847 2847
2848 /* Build and send a SYN with data and (cached) Fast Open cookie. However, 2848 /* Build and send a SYN with data and (cached) Fast Open cookie. However,
2849 * queue a data-only packet after the regular SYN, such that regular SYNs 2849 * queue a data-only packet after the regular SYN, such that regular SYNs
2850 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges 2850 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
2851 * only the SYN sequence, the data are retransmitted in the first ACK. 2851 * only the SYN sequence, the data are retransmitted in the first ACK.
2852 * If the cookie is not cached or another error occurs, fall back to sending a 2852 * If the cookie is not cached or another error occurs, fall back to sending a
2853 * regular SYN with a Fast Open cookie request option. 2853 * regular SYN with a Fast Open cookie request option.
2854 */ 2854 */
2855 static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) 2855 static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
2856 { 2856 {
2857 struct tcp_sock *tp = tcp_sk(sk); 2857 struct tcp_sock *tp = tcp_sk(sk);
2858 struct tcp_fastopen_request *fo = tp->fastopen_req; 2858 struct tcp_fastopen_request *fo = tp->fastopen_req;
2859 int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; 2859 int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
2860 struct sk_buff *syn_data = NULL, *data; 2860 struct sk_buff *syn_data = NULL, *data;
2861 unsigned long last_syn_loss = 0; 2861 unsigned long last_syn_loss = 0;
2862 2862
2863 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ 2863 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
2864 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, 2864 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
2865 &syn_loss, &last_syn_loss); 2865 &syn_loss, &last_syn_loss);
2866 /* Recurring FO SYN losses: revert to regular handshake temporarily */ 2866 /* Recurring FO SYN losses: revert to regular handshake temporarily */
2867 if (syn_loss > 1 && 2867 if (syn_loss > 1 &&
2868 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { 2868 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
2869 fo->cookie.len = -1; 2869 fo->cookie.len = -1;
2870 goto fallback; 2870 goto fallback;
2871 } 2871 }
2872 2872
2873 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) 2873 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
2874 fo->cookie.len = -1; 2874 fo->cookie.len = -1;
2875 else if (fo->cookie.len <= 0) 2875 else if (fo->cookie.len <= 0)
2876 goto fallback; 2876 goto fallback;
2877 2877
2878 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and 2878 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
2879 * user-MSS. Reserve maximum option space for middleboxes that add 2879 * user-MSS. Reserve maximum option space for middleboxes that add
2880 * private TCP options. The cost is reduced data space in SYN :( 2880 * private TCP options. The cost is reduced data space in SYN :(
2881 */ 2881 */
2882 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) 2882 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
2883 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; 2883 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2884 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - 2884 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
2885 MAX_TCP_OPTION_SPACE; 2885 MAX_TCP_OPTION_SPACE;
2886 2886
2887 syn_data = skb_copy_expand(syn, skb_headroom(syn), space, 2887 syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
2888 sk->sk_allocation); 2888 sk->sk_allocation);
2889 if (syn_data == NULL) 2889 if (syn_data == NULL)
2890 goto fallback; 2890 goto fallback;
2891 2891
2892 for (i = 0; i < iovlen && syn_data->len < space; ++i) { 2892 for (i = 0; i < iovlen && syn_data->len < space; ++i) {
2893 struct iovec *iov = &fo->data->msg_iov[i]; 2893 struct iovec *iov = &fo->data->msg_iov[i];
2894 unsigned char __user *from = iov->iov_base; 2894 unsigned char __user *from = iov->iov_base;
2895 int len = iov->iov_len; 2895 int len = iov->iov_len;
2896 2896
2897 if (syn_data->len + len > space) 2897 if (syn_data->len + len > space)
2898 len = space - syn_data->len; 2898 len = space - syn_data->len;
2899 else if (i + 1 == iovlen) 2899 else if (i + 1 == iovlen)
2900 /* No more data pending in inet_wait_for_connect() */ 2900 /* No more data pending in inet_wait_for_connect() */
2901 fo->data = NULL; 2901 fo->data = NULL;
2902 2902
2903 if (skb_add_data(syn_data, from, len)) 2903 if (skb_add_data(syn_data, from, len))
2904 goto fallback; 2904 goto fallback;
2905 } 2905 }
2906 2906
2907 /* Queue a data-only packet after the regular SYN for retransmission */ 2907 /* Queue a data-only packet after the regular SYN for retransmission */
2908 data = pskb_copy(syn_data, sk->sk_allocation); 2908 data = pskb_copy(syn_data, sk->sk_allocation);
2909 if (data == NULL) 2909 if (data == NULL)
2910 goto fallback; 2910 goto fallback;
2911 TCP_SKB_CB(data)->seq++; 2911 TCP_SKB_CB(data)->seq++;
2912 TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; 2912 TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
2913 TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); 2913 TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
2914 tcp_connect_queue_skb(sk, data); 2914 tcp_connect_queue_skb(sk, data);
2915 fo->copied = data->len; 2915 fo->copied = data->len;
2916 2916
2917 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { 2917 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
2918 tp->syn_data = (fo->copied > 0); 2918 tp->syn_data = (fo->copied > 0);
2919 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); 2919 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
2920 goto done; 2920 goto done;
2921 } 2921 }
2922 syn_data = NULL; 2922 syn_data = NULL;
2923 2923
2924 fallback: 2924 fallback:
2925 /* Send a regular SYN with Fast Open cookie request option */ 2925 /* Send a regular SYN with Fast Open cookie request option */
2926 if (fo->cookie.len > 0) 2926 if (fo->cookie.len > 0)
2927 fo->cookie.len = 0; 2927 fo->cookie.len = 0;
2928 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); 2928 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
2929 if (err) 2929 if (err)
2930 tp->syn_fastopen = 0; 2930 tp->syn_fastopen = 0;
2931 kfree_skb(syn_data); 2931 kfree_skb(syn_data);
2932 done: 2932 done:
2933 fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ 2933 fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
2934 return err; 2934 return err;
2935 } 2935 }
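
A back-of-the-envelope sketch of the SYN-data sizing in tcp_send_syn_data(): the payload carried in the Fast Open SYN is limited to the MSS derived from the path MTU minus the full 40 bytes of TCP option space reserved for middleboxes. The MSS below is an assumed example, not a value read from the code.

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE	40	/* worst-case TCP options */

int main(void)
{
	int mss = 1460;				/* assumed MSS for a 1500-byte MTU path */
	int space = mss - MAX_TCP_OPTION_SPACE;	/* payload room left in the SYN */

	printf("SYN can carry up to %d bytes of data\n", space); /* 1420 */
	return 0;
}
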
2936 2936
2937 /* Build a SYN and send it off. */ 2937 /* Build a SYN and send it off. */
2938 int tcp_connect(struct sock *sk) 2938 int tcp_connect(struct sock *sk)
2939 { 2939 {
2940 struct tcp_sock *tp = tcp_sk(sk); 2940 struct tcp_sock *tp = tcp_sk(sk);
2941 struct sk_buff *buff; 2941 struct sk_buff *buff;
2942 int err; 2942 int err;
2943 2943
2944 tcp_connect_init(sk); 2944 tcp_connect_init(sk);
2945 2945
2946 if (unlikely(tp->repair)) { 2946 if (unlikely(tp->repair)) {
2947 tcp_finish_connect(sk, NULL); 2947 tcp_finish_connect(sk, NULL);
2948 return 0; 2948 return 0;
2949 } 2949 }
2950 2950
2951 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); 2951 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
2952 if (unlikely(buff == NULL)) 2952 if (unlikely(buff == NULL))
2953 return -ENOBUFS; 2953 return -ENOBUFS;
2954 2954
2955 /* Reserve space for headers. */ 2955 /* Reserve space for headers. */
2956 skb_reserve(buff, MAX_TCP_HEADER); 2956 skb_reserve(buff, MAX_TCP_HEADER);
2957 2957
2958 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 2958 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2959 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; 2959 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
2960 tcp_connect_queue_skb(sk, buff); 2960 tcp_connect_queue_skb(sk, buff);
2961 TCP_ECN_send_syn(sk, buff); 2961 TCP_ECN_send_syn(sk, buff);
2962 2962
2963 /* Send off SYN; include data in Fast Open. */ 2963 /* Send off SYN; include data in Fast Open. */
2964 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : 2964 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
2965 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); 2965 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2966 if (err == -ECONNREFUSED) 2966 if (err == -ECONNREFUSED)
2967 return err; 2967 return err;
2968 2968
2969 /* We change tp->snd_nxt after the tcp_transmit_skb() call 2969 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2970 * in order to make this packet get counted in tcpOutSegs. 2970 * in order to make this packet get counted in tcpOutSegs.
2971 */ 2971 */
2972 tp->snd_nxt = tp->write_seq; 2972 tp->snd_nxt = tp->write_seq;
2973 tp->pushed_seq = tp->write_seq; 2973 tp->pushed_seq = tp->write_seq;
2974 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); 2974 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
2975 2975
2976 /* Timer for repeating the SYN until an answer. */ 2976 /* Timer for repeating the SYN until an answer. */
2977 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2977 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2978 inet_csk(sk)->icsk_rto, TCP_RTO_MAX); 2978 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
2979 return 0; 2979 return 0;
2980 } 2980 }
2981 EXPORT_SYMBOL(tcp_connect); 2981 EXPORT_SYMBOL(tcp_connect);
2982 2982
2983 /* Send out a delayed ack, the caller does the policy checking 2983 /* Send out a delayed ack, the caller does the policy checking
2984 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() 2984 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
2985 * for details. 2985 * for details.
2986 */ 2986 */
2987 void tcp_send_delayed_ack(struct sock *sk) 2987 void tcp_send_delayed_ack(struct sock *sk)
2988 { 2988 {
2989 struct inet_connection_sock *icsk = inet_csk(sk); 2989 struct inet_connection_sock *icsk = inet_csk(sk);
2990 int ato = icsk->icsk_ack.ato; 2990 int ato = icsk->icsk_ack.ato;
2991 unsigned long timeout; 2991 unsigned long timeout;
2992 2992
2993 if (ato > TCP_DELACK_MIN) { 2993 if (ato > TCP_DELACK_MIN) {
2994 const struct tcp_sock *tp = tcp_sk(sk); 2994 const struct tcp_sock *tp = tcp_sk(sk);
2995 int max_ato = HZ / 2; 2995 int max_ato = HZ / 2;
2996 2996
2997 if (icsk->icsk_ack.pingpong || 2997 if (icsk->icsk_ack.pingpong ||
2998 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) 2998 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
2999 max_ato = TCP_DELACK_MAX; 2999 max_ato = TCP_DELACK_MAX;
3000 3000
3001 /* Slow path, intersegment interval is "high". */ 3001 /* Slow path, intersegment interval is "high". */
3002 3002
3003 /* If some rtt estimate is known, use it to bound delayed ack. 3003 /* If some rtt estimate is known, use it to bound delayed ack.
3004 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements 3004 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
3005 * directly. 3005 * directly.
3006 */ 3006 */
3007 if (tp->srtt) { 3007 if (tp->srtt) {
3008 int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); 3008 int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
3009 3009
3010 if (rtt < max_ato) 3010 if (rtt < max_ato)
3011 max_ato = rtt; 3011 max_ato = rtt;
3012 } 3012 }
3013 3013
3014 ato = min(ato, max_ato); 3014 ato = min(ato, max_ato);
3015 } 3015 }
3016 3016
3017 /* Stay within the limit we were given */ 3017 /* Stay within the limit we were given */
3018 timeout = jiffies + ato; 3018 timeout = jiffies + ato;
3019 3019
3020 /* Use new timeout only if there wasn't an older one earlier. */ 3020 /* Use new timeout only if there wasn't an older one earlier. */
3021 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { 3021 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3022 /* If delack timer was blocked or is about to expire, 3022 /* If delack timer was blocked or is about to expire,
3023 * send ACK now. 3023 * send ACK now.
3024 */ 3024 */
3025 if (icsk->icsk_ack.blocked || 3025 if (icsk->icsk_ack.blocked ||
3026 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { 3026 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3027 tcp_send_ack(sk); 3027 tcp_send_ack(sk);
3028 return; 3028 return;
3029 } 3029 }
3030 3030
3031 if (!time_before(timeout, icsk->icsk_ack.timeout)) 3031 if (!time_before(timeout, icsk->icsk_ack.timeout))
3032 timeout = icsk->icsk_ack.timeout; 3032 timeout = icsk->icsk_ack.timeout;
3033 } 3033 }
3034 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 3034 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3035 icsk->icsk_ack.timeout = timeout; 3035 icsk->icsk_ack.timeout = timeout;
3036 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); 3036 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3037 } 3037 }
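
A simplified, millisecond-based sketch of the delayed-ACK clamping in tcp_send_delayed_ack(): the ack timeout never exceeds half a second (or the delayed-ACK maximum in pingpong mode), and when an RTT estimate exists it is further bounded by that estimate, never dropping below the delayed-ACK minimum. The millisecond constants are rough translations of the kernel values; the inputs are illustrative.

#include <stdio.h>

#define DELACK_MIN_MS	40	/* roughly TCP_DELACK_MIN */
#define DELACK_MAX_MS	200	/* roughly TCP_DELACK_MAX */

static int delayed_ack_ms(int ato_ms, int srtt_ms, int pingpong)
{
	int max_ato = pingpong ? DELACK_MAX_MS : 500;	/* HZ / 2 */

	if (srtt_ms > 0) {
		int rtt = srtt_ms > DELACK_MIN_MS ? srtt_ms : DELACK_MIN_MS;

		if (rtt < max_ato)
			max_ato = rtt;
	}
	return ato_ms < max_ato ? ato_ms : max_ato;
}

int main(void)
{
	printf("%d\n", delayed_ack_ms(300, 80, 0));	/* bounded by the 80 ms RTT */
	printf("%d\n", delayed_ack_ms(300, 0, 1));	/* bounded by DELACK_MAX_MS */
	return 0;
}
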
3038 3038
3039 /* This routine sends an ack and also updates the window. */ 3039 /* This routine sends an ack and also updates the window. */
3040 void tcp_send_ack(struct sock *sk) 3040 void tcp_send_ack(struct sock *sk)
3041 { 3041 {
3042 struct sk_buff *buff; 3042 struct sk_buff *buff;
3043 3043
3044 /* If we have been reset, we may not send again. */ 3044 /* If we have been reset, we may not send again. */
3045 if (sk->sk_state == TCP_CLOSE) 3045 if (sk->sk_state == TCP_CLOSE)
3046 return; 3046 return;
3047 3047
3048 /* We are not putting this on the write queue, so 3048 /* We are not putting this on the write queue, so
3049 * tcp_transmit_skb() will set the ownership to this 3049 * tcp_transmit_skb() will set the ownership to this
3050 * sock. 3050 * sock.
3051 */ 3051 */
3052 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 3052 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3053 if (buff == NULL) { 3053 if (buff == NULL) {
3054 inet_csk_schedule_ack(sk); 3054 inet_csk_schedule_ack(sk);
3055 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; 3055 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3056 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 3056 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3057 TCP_DELACK_MAX, TCP_RTO_MAX); 3057 TCP_DELACK_MAX, TCP_RTO_MAX);
3058 return; 3058 return;
3059 } 3059 }
3060 3060
3061 /* Reserve space for headers and prepare control bits. */ 3061 /* Reserve space for headers and prepare control bits. */
3062 skb_reserve(buff, MAX_TCP_HEADER); 3062 skb_reserve(buff, MAX_TCP_HEADER);
3063 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); 3063 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3064 3064
3065 /* Send it off, this clears delayed acks for us. */ 3065 /* Send it off, this clears delayed acks for us. */
3066 TCP_SKB_CB(buff)->when = tcp_time_stamp; 3066 TCP_SKB_CB(buff)->when = tcp_time_stamp;
3067 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); 3067 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
3068 } 3068 }
3069 3069
3070 /* This routine sends a packet with an out of date sequence 3070 /* This routine sends a packet with an out of date sequence
3071 * number. It assumes the other end will try to ack it. 3071 * number. It assumes the other end will try to ack it.
3072 * 3072 *
3073 * Question: what should we send while in urgent mode? 3073 * Question: what should we send while in urgent mode?
3074 * 4.4BSD forces sending single byte of data. We cannot send 3074 * 4.4BSD forces sending single byte of data. We cannot send
3075 * out of window data, because we have SND.NXT==SND.MAX... 3075 * out of window data, because we have SND.NXT==SND.MAX...
3076 * 3076 *
3077 * Current solution: to send TWO zero-length segments in urgent mode: 3077 * Current solution: to send TWO zero-length segments in urgent mode:
3078 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is 3078 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
3079 * out-of-date with SND.UNA-1 to probe window. 3079 * out-of-date with SND.UNA-1 to probe window.
3080 */ 3080 */
3081 static int tcp_xmit_probe_skb(struct sock *sk, int urgent) 3081 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3082 { 3082 {
3083 struct tcp_sock *tp = tcp_sk(sk); 3083 struct tcp_sock *tp = tcp_sk(sk);
3084 struct sk_buff *skb; 3084 struct sk_buff *skb;
3085 3085
3086 /* We don't queue it, tcp_transmit_skb() sets ownership. */ 3086 /* We don't queue it, tcp_transmit_skb() sets ownership. */
3087 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 3087 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3088 if (skb == NULL) 3088 if (skb == NULL)
3089 return -1; 3089 return -1;
3090 3090
3091 /* Reserve space for headers and set control bits. */ 3091 /* Reserve space for headers and set control bits. */
3092 skb_reserve(skb, MAX_TCP_HEADER); 3092 skb_reserve(skb, MAX_TCP_HEADER);
3093 /* Use a previous sequence. This should cause the other 3093 /* Use a previous sequence. This should cause the other
3094 * end to send an ack. Don't queue or clone SKB, just 3094 * end to send an ack. Don't queue or clone SKB, just
3095 * send it. 3095 * send it.
3096 */ 3096 */
3097 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); 3097 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3098 TCP_SKB_CB(skb)->when = tcp_time_stamp; 3098 TCP_SKB_CB(skb)->when = tcp_time_stamp;
3099 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 3099 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
3100 } 3100 }
3101 3101
3102 void tcp_send_window_probe(struct sock *sk) 3102 void tcp_send_window_probe(struct sock *sk)
3103 { 3103 {
3104 if (sk->sk_state == TCP_ESTABLISHED) { 3104 if (sk->sk_state == TCP_ESTABLISHED) {
3105 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; 3105 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3106 tcp_xmit_probe_skb(sk, 0); 3106 tcp_xmit_probe_skb(sk, 0);
3107 } 3107 }
3108 } 3108 }
3109 3109
3110 /* Initiate keepalive or window probe from timer. */ 3110 /* Initiate keepalive or window probe from timer. */
3111 int tcp_write_wakeup(struct sock *sk) 3111 int tcp_write_wakeup(struct sock *sk)
3112 { 3112 {
3113 struct tcp_sock *tp = tcp_sk(sk); 3113 struct tcp_sock *tp = tcp_sk(sk);
3114 struct sk_buff *skb; 3114 struct sk_buff *skb;
3115 3115
3116 if (sk->sk_state == TCP_CLOSE) 3116 if (sk->sk_state == TCP_CLOSE)
3117 return -1; 3117 return -1;
3118 3118
3119 if ((skb = tcp_send_head(sk)) != NULL && 3119 if ((skb = tcp_send_head(sk)) != NULL &&
3120 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { 3120 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3121 int err; 3121 int err;
3122 unsigned int mss = tcp_current_mss(sk); 3122 unsigned int mss = tcp_current_mss(sk);
3123 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; 3123 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3124 3124
3125 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) 3125 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
3126 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; 3126 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
3127 3127
3128 /* We are probing the opening of a window 3128 /* We are probing the opening of a window
3129 * but the window size is != 0; this must have been 3129 * but the window size is != 0; this must have been
3130 * the result of SWS avoidance (sender side). 3130 * the result of SWS avoidance (sender side).
3131 */ 3131 */
3132 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || 3132 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3133 skb->len > mss) { 3133 skb->len > mss) {
3134 seg_size = min(seg_size, mss); 3134 seg_size = min(seg_size, mss);
3135 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 3135 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3136 if (tcp_fragment(sk, skb, seg_size, mss)) 3136 if (tcp_fragment(sk, skb, seg_size, mss))
3137 return -1; 3137 return -1;
3138 } else if (!tcp_skb_pcount(skb)) 3138 } else if (!tcp_skb_pcount(skb))
3139 tcp_set_skb_tso_segs(sk, skb, mss); 3139 tcp_set_skb_tso_segs(sk, skb, mss);
3140 3140
3141 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 3141 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3142 TCP_SKB_CB(skb)->when = tcp_time_stamp; 3142 TCP_SKB_CB(skb)->when = tcp_time_stamp;
3143 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 3143 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3144 if (!err) 3144 if (!err)
3145 tcp_event_new_data_sent(sk, skb); 3145 tcp_event_new_data_sent(sk, skb);
3146 return err; 3146 return err;
3147 } else { 3147 } else {
3148 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) 3148 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
3149 tcp_xmit_probe_skb(sk, 1); 3149 tcp_xmit_probe_skb(sk, 1);
3150 return tcp_xmit_probe_skb(sk, 0); 3150 return tcp_xmit_probe_skb(sk, 0);
3151 } 3151 }
3152 } 3152 }
3153 3153
3154 /* A window probe timeout has occurred. If the window is not closed, send 3154 /* A window probe timeout has occurred. If the window is not closed, send
3155 * a partial packet, else send a zero-window probe. 3155 * a partial packet, else send a zero-window probe.
3156 */ 3156 */
3157 void tcp_send_probe0(struct sock *sk) 3157 void tcp_send_probe0(struct sock *sk)
3158 { 3158 {
3159 struct inet_connection_sock *icsk = inet_csk(sk); 3159 struct inet_connection_sock *icsk = inet_csk(sk);
3160 struct tcp_sock *tp = tcp_sk(sk); 3160 struct tcp_sock *tp = tcp_sk(sk);
3161 int err; 3161 int err;
3162 3162
3163 err = tcp_write_wakeup(sk); 3163 err = tcp_write_wakeup(sk);
3164 3164
3165 if (tp->packets_out || !tcp_send_head(sk)) { 3165 if (tp->packets_out || !tcp_send_head(sk)) {
3166 /* Cancel probe timer, if it is not required. */ 3166 /* Cancel probe timer, if it is not required. */
3167 icsk->icsk_probes_out = 0; 3167 icsk->icsk_probes_out = 0;
3168 icsk->icsk_backoff = 0; 3168 icsk->icsk_backoff = 0;
3169 return; 3169 return;
3170 } 3170 }
3171 3171
3172 if (err <= 0) { 3172 if (err <= 0) {
3173 if (icsk->icsk_backoff < sysctl_tcp_retries2) 3173 if (icsk->icsk_backoff < sysctl_tcp_retries2)
3174 icsk->icsk_backoff++; 3174 icsk->icsk_backoff++;
3175 icsk->icsk_probes_out++; 3175 icsk->icsk_probes_out++;
3176 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 3176 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3177 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), 3177 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
3178 TCP_RTO_MAX); 3178 TCP_RTO_MAX);
3179 } else { 3179 } else {
3180 /* If packet was not sent due to local congestion, 3180 /* If packet was not sent due to local congestion,
3181 * do not back off and do not remember icsk_probes_out. 3181 * do not back off and do not remember icsk_probes_out.
3182 * Let local senders fight for local resources. 3182 * Let local senders fight for local resources.
3183 * 3183 *
3184 * Still use the accumulated backoff, though. 3184 * Still use the accumulated backoff, though.
3185 */ 3185 */
3186 if (!icsk->icsk_probes_out) 3186 if (!icsk->icsk_probes_out)
3187 icsk->icsk_probes_out = 1; 3187 icsk->icsk_probes_out = 1;
3188 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 3188 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3189 min(icsk->icsk_rto << icsk->icsk_backoff, 3189 min(icsk->icsk_rto << icsk->icsk_backoff,
3190 TCP_RESOURCE_PROBE_INTERVAL), 3190 TCP_RESOURCE_PROBE_INTERVAL),
3191 TCP_RTO_MAX); 3191 TCP_RTO_MAX);
3192 } 3192 }
3193 } 3193 }
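
Finally, an illustrative sketch of the zero-window-probe backoff in tcp_send_probe0(): each unanswered probe doubles the timeout (icsk_rto shifted left by icsk_backoff) until the value is capped at TCP_RTO_MAX, i.e. 120 seconds. The starting RTO here is an assumed figure.

#include <stdio.h>

#define RTO_MAX_MS	(120 * 1000)	/* TCP_RTO_MAX expressed in ms */

int main(void)
{
	long rto_ms = 200;	/* assumed current retransmission timeout */
	int backoff;

	for (backoff = 0; backoff < 12; backoff++) {
		long t = rto_ms << backoff;

		if (t > RTO_MAX_MS)
			t = RTO_MAX_MS;
		printf("probe %2d: %6ld ms\n", backoff, t);
	}
	return 0;
}
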
3194 3194