Commit 617417f9990b33c162c08a2e29cc356c50ce3943

Authored by Eric Dumazet
Committed by Greg Kroah-Hartman
1 parent 7a0189390e

tcp: ipv4: initialize unicast_sock sk_pacing_rate

[ Upstream commit 811230cd853d62f09ed0addd0ce9a1b9b0e13fb5 ]

When I added the sk_pacing_rate field, I forgot to initialize its value
in the per-cpu unicast_sock used in ip_send_unicast_reply().

This means that for sch_fq users, RST packets or ACK packets sent
on behalf of TIME_WAIT sockets might be sent too slowly or even dropped
once we reach the per-flow limit.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: 95bd09eb2750 ("tcp: TSO packets automatic sizing")
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
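The one-line change itself lands in the static per-cpu unicast_sock initializer in net/ipv4/ip_output.c, past the end of the excerpt below. Going by the upstream commit referenced above, the added initializer looks roughly like this, with ~0U meaning "no pacing cap":

 		.sk_allocation	= GFP_ATOMIC,
 		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
+		.sk_pacing_rate = ~0U,
 	},

With the field left zero-initialized, fq paces these reply packets at the lowest possible rate instead of leaving them unpaced, so they can back up and hit fq's per-flow limit, the slow-send/drop behavior the changelog describes.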

Showing 1 changed file with 1 addition and 0 deletions

net/ipv4/ip_output.c
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * The Internet Protocol (IP) output module. 6 * The Internet Protocol (IP) output module.
7 * 7 *
8 * Authors: Ross Biro 8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org> 10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org> 11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood 12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de> 13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net> 14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp> 16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 * 17 *
18 * See ip_input.c for original log 18 * See ip_input.c for original log
19 * 19 *
20 * Fixes: 20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit. 21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit. 22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when 23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found. 24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit 25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by 26 * (in case if packet not accepted by
27 * output firewall rules) 27 * output firewall rules)
28 * Mike McLagan : Routing by source 28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache 29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove 30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests. 31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma. 32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply. 33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path 34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86 35 * for decreased register pressure on x86
36 * and more readibility. 36 * and more readibility.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE, 37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM. 38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments. 39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP 40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams. 41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now. 42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */ 43 */
44 44
45 #include <asm/uaccess.h> 45 #include <asm/uaccess.h>
46 #include <linux/module.h> 46 #include <linux/module.h>
47 #include <linux/types.h> 47 #include <linux/types.h>
48 #include <linux/kernel.h> 48 #include <linux/kernel.h>
49 #include <linux/mm.h> 49 #include <linux/mm.h>
50 #include <linux/string.h> 50 #include <linux/string.h>
51 #include <linux/errno.h> 51 #include <linux/errno.h>
52 #include <linux/highmem.h> 52 #include <linux/highmem.h>
53 #include <linux/slab.h> 53 #include <linux/slab.h>
54 54
55 #include <linux/socket.h> 55 #include <linux/socket.h>
56 #include <linux/sockios.h> 56 #include <linux/sockios.h>
57 #include <linux/in.h> 57 #include <linux/in.h>
58 #include <linux/inet.h> 58 #include <linux/inet.h>
59 #include <linux/netdevice.h> 59 #include <linux/netdevice.h>
60 #include <linux/etherdevice.h> 60 #include <linux/etherdevice.h>
61 #include <linux/proc_fs.h> 61 #include <linux/proc_fs.h>
62 #include <linux/stat.h> 62 #include <linux/stat.h>
63 #include <linux/init.h> 63 #include <linux/init.h>
64 64
65 #include <net/snmp.h> 65 #include <net/snmp.h>
66 #include <net/ip.h> 66 #include <net/ip.h>
67 #include <net/protocol.h> 67 #include <net/protocol.h>
68 #include <net/route.h> 68 #include <net/route.h>
69 #include <net/xfrm.h> 69 #include <net/xfrm.h>
70 #include <linux/skbuff.h> 70 #include <linux/skbuff.h>
71 #include <net/sock.h> 71 #include <net/sock.h>
72 #include <net/arp.h> 72 #include <net/arp.h>
73 #include <net/icmp.h> 73 #include <net/icmp.h>
74 #include <net/checksum.h> 74 #include <net/checksum.h>
75 #include <net/inetpeer.h> 75 #include <net/inetpeer.h>
76 #include <linux/igmp.h> 76 #include <linux/igmp.h>
77 #include <linux/netfilter_ipv4.h> 77 #include <linux/netfilter_ipv4.h>
78 #include <linux/netfilter_bridge.h> 78 #include <linux/netfilter_bridge.h>
79 #include <linux/mroute.h> 79 #include <linux/mroute.h>
80 #include <linux/netlink.h> 80 #include <linux/netlink.h>
81 #include <linux/tcp.h> 81 #include <linux/tcp.h>
82 82
83 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 83 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84 EXPORT_SYMBOL(sysctl_ip_default_ttl); 84 EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 85
86 /* Generate a checksum for an outgoing IP datagram. */ 86 /* Generate a checksum for an outgoing IP datagram. */
87 void ip_send_check(struct iphdr *iph) 87 void ip_send_check(struct iphdr *iph)
88 { 88 {
89 iph->check = 0; 89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91 } 91 }
92 EXPORT_SYMBOL(ip_send_check); 92 EXPORT_SYMBOL(ip_send_check);
93 93
94 int __ip_local_out(struct sk_buff *skb) 94 int __ip_local_out(struct sk_buff *skb)
95 { 95 {
96 struct iphdr *iph = ip_hdr(skb); 96 struct iphdr *iph = ip_hdr(skb);
97 97
98 iph->tot_len = htons(skb->len); 98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph); 99 ip_send_check(iph);
100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, 100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 skb_dst(skb)->dev, dst_output); 101 skb_dst(skb)->dev, dst_output);
102 } 102 }
103 103
104 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) 104 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
105 { 105 {
106 int err; 106 int err;
107 107
108 err = __ip_local_out(skb); 108 err = __ip_local_out(skb);
109 if (likely(err == 1)) 109 if (likely(err == 1))
110 err = dst_output_sk(sk, skb); 110 err = dst_output_sk(sk, skb);
111 111
112 return err; 112 return err;
113 } 113 }
114 EXPORT_SYMBOL_GPL(ip_local_out_sk); 114 EXPORT_SYMBOL_GPL(ip_local_out_sk);
115 115
116 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 116 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
117 { 117 {
118 int ttl = inet->uc_ttl; 118 int ttl = inet->uc_ttl;
119 119
120 if (ttl < 0) 120 if (ttl < 0)
121 ttl = ip4_dst_hoplimit(dst); 121 ttl = ip4_dst_hoplimit(dst);
122 return ttl; 122 return ttl;
123 } 123 }
124 124
125 /* 125 /*
126 * Add an ip header to a skbuff and send it out. 126 * Add an ip header to a skbuff and send it out.
127 * 127 *
128 */ 128 */
129 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, 129 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
130 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) 130 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
131 { 131 {
132 struct inet_sock *inet = inet_sk(sk); 132 struct inet_sock *inet = inet_sk(sk);
133 struct rtable *rt = skb_rtable(skb); 133 struct rtable *rt = skb_rtable(skb);
134 struct iphdr *iph; 134 struct iphdr *iph;
135 135
136 /* Build the IP header. */ 136 /* Build the IP header. */
137 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); 137 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
138 skb_reset_network_header(skb); 138 skb_reset_network_header(skb);
139 iph = ip_hdr(skb); 139 iph = ip_hdr(skb);
140 iph->version = 4; 140 iph->version = 4;
141 iph->ihl = 5; 141 iph->ihl = 5;
142 iph->tos = inet->tos; 142 iph->tos = inet->tos;
143 if (ip_dont_fragment(sk, &rt->dst)) 143 if (ip_dont_fragment(sk, &rt->dst))
144 iph->frag_off = htons(IP_DF); 144 iph->frag_off = htons(IP_DF);
145 else 145 else
146 iph->frag_off = 0; 146 iph->frag_off = 0;
147 iph->ttl = ip_select_ttl(inet, &rt->dst); 147 iph->ttl = ip_select_ttl(inet, &rt->dst);
148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); 148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149 iph->saddr = saddr; 149 iph->saddr = saddr;
150 iph->protocol = sk->sk_protocol; 150 iph->protocol = sk->sk_protocol;
151 ip_select_ident(skb, sk); 151 ip_select_ident(skb, sk);
152 152
153 if (opt && opt->opt.optlen) { 153 if (opt && opt->opt.optlen) {
154 iph->ihl += opt->opt.optlen>>2; 154 iph->ihl += opt->opt.optlen>>2;
155 ip_options_build(skb, &opt->opt, daddr, rt, 0); 155 ip_options_build(skb, &opt->opt, daddr, rt, 0);
156 } 156 }
157 157
158 skb->priority = sk->sk_priority; 158 skb->priority = sk->sk_priority;
159 skb->mark = sk->sk_mark; 159 skb->mark = sk->sk_mark;
160 160
161 /* Send it out. */ 161 /* Send it out. */
162 return ip_local_out(skb); 162 return ip_local_out(skb);
163 } 163 }
164 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); 164 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
165 165
166 static inline int ip_finish_output2(struct sk_buff *skb) 166 static inline int ip_finish_output2(struct sk_buff *skb)
167 { 167 {
168 struct dst_entry *dst = skb_dst(skb); 168 struct dst_entry *dst = skb_dst(skb);
169 struct rtable *rt = (struct rtable *)dst; 169 struct rtable *rt = (struct rtable *)dst;
170 struct net_device *dev = dst->dev; 170 struct net_device *dev = dst->dev;
171 unsigned int hh_len = LL_RESERVED_SPACE(dev); 171 unsigned int hh_len = LL_RESERVED_SPACE(dev);
172 struct neighbour *neigh; 172 struct neighbour *neigh;
173 u32 nexthop; 173 u32 nexthop;
174 174
175 if (rt->rt_type == RTN_MULTICAST) { 175 if (rt->rt_type == RTN_MULTICAST) {
176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
177 } else if (rt->rt_type == RTN_BROADCAST) 177 } else if (rt->rt_type == RTN_BROADCAST)
178 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); 178 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
179 179
180 /* Be paranoid, rather than too clever. */ 180 /* Be paranoid, rather than too clever. */
181 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 181 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
182 struct sk_buff *skb2; 182 struct sk_buff *skb2;
183 183
184 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 184 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
185 if (skb2 == NULL) { 185 if (skb2 == NULL) {
186 kfree_skb(skb); 186 kfree_skb(skb);
187 return -ENOMEM; 187 return -ENOMEM;
188 } 188 }
189 if (skb->sk) 189 if (skb->sk)
190 skb_set_owner_w(skb2, skb->sk); 190 skb_set_owner_w(skb2, skb->sk);
191 consume_skb(skb); 191 consume_skb(skb);
192 skb = skb2; 192 skb = skb2;
193 } 193 }
194 194
195 rcu_read_lock_bh(); 195 rcu_read_lock_bh();
196 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); 196 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
198 if (unlikely(!neigh)) 198 if (unlikely(!neigh))
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
200 if (!IS_ERR(neigh)) { 200 if (!IS_ERR(neigh)) {
201 int res = dst_neigh_output(dst, neigh, skb); 201 int res = dst_neigh_output(dst, neigh, skb);
202 202
203 rcu_read_unlock_bh(); 203 rcu_read_unlock_bh();
204 return res; 204 return res;
205 } 205 }
206 rcu_read_unlock_bh(); 206 rcu_read_unlock_bh();
207 207
208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n", 208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
209 __func__); 209 __func__);
210 kfree_skb(skb); 210 kfree_skb(skb);
211 return -EINVAL; 211 return -EINVAL;
212 } 212 }
213 213
214 static int ip_finish_output_gso(struct sk_buff *skb) 214 static int ip_finish_output_gso(struct sk_buff *skb)
215 { 215 {
216 netdev_features_t features; 216 netdev_features_t features;
217 struct sk_buff *segs; 217 struct sk_buff *segs;
218 int ret = 0; 218 int ret = 0;
219 219
220 /* common case: locally created skb or seglen is <= mtu */ 220 /* common case: locally created skb or seglen is <= mtu */
221 if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || 221 if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
222 skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) 222 skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
223 return ip_finish_output2(skb); 223 return ip_finish_output2(skb);
224 224
225 /* Slowpath - GSO segment length is exceeding the dst MTU. 225 /* Slowpath - GSO segment length is exceeding the dst MTU.
226 * 226 *
227 * This can happen in two cases: 227 * This can happen in two cases:
228 * 1) TCP GRO packet, DF bit not set 228 * 1) TCP GRO packet, DF bit not set
229 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly 229 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
230 * from host network stack. 230 * from host network stack.
231 */ 231 */
232 features = netif_skb_features(skb); 232 features = netif_skb_features(skb);
233 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 233 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
234 if (IS_ERR_OR_NULL(segs)) { 234 if (IS_ERR_OR_NULL(segs)) {
235 kfree_skb(skb); 235 kfree_skb(skb);
236 return -ENOMEM; 236 return -ENOMEM;
237 } 237 }
238 238
239 consume_skb(skb); 239 consume_skb(skb);
240 240
241 do { 241 do {
242 struct sk_buff *nskb = segs->next; 242 struct sk_buff *nskb = segs->next;
243 int err; 243 int err;
244 244
245 segs->next = NULL; 245 segs->next = NULL;
246 err = ip_fragment(segs, ip_finish_output2); 246 err = ip_fragment(segs, ip_finish_output2);
247 247
248 if (err && ret == 0) 248 if (err && ret == 0)
249 ret = err; 249 ret = err;
250 segs = nskb; 250 segs = nskb;
251 } while (segs); 251 } while (segs);
252 252
253 return ret; 253 return ret;
254 } 254 }
255 255
256 static int ip_finish_output(struct sk_buff *skb) 256 static int ip_finish_output(struct sk_buff *skb)
257 { 257 {
258 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 258 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
259 /* Policy lookup after SNAT yielded a new policy */ 259 /* Policy lookup after SNAT yielded a new policy */
260 if (skb_dst(skb)->xfrm != NULL) { 260 if (skb_dst(skb)->xfrm != NULL) {
261 IPCB(skb)->flags |= IPSKB_REROUTED; 261 IPCB(skb)->flags |= IPSKB_REROUTED;
262 return dst_output(skb); 262 return dst_output(skb);
263 } 263 }
264 #endif 264 #endif
265 if (skb_is_gso(skb)) 265 if (skb_is_gso(skb))
266 return ip_finish_output_gso(skb); 266 return ip_finish_output_gso(skb);
267 267
268 if (skb->len > ip_skb_dst_mtu(skb)) 268 if (skb->len > ip_skb_dst_mtu(skb))
269 return ip_fragment(skb, ip_finish_output2); 269 return ip_fragment(skb, ip_finish_output2);
270 270
271 return ip_finish_output2(skb); 271 return ip_finish_output2(skb);
272 } 272 }
273 273
274 int ip_mc_output(struct sock *sk, struct sk_buff *skb) 274 int ip_mc_output(struct sock *sk, struct sk_buff *skb)
275 { 275 {
276 struct rtable *rt = skb_rtable(skb); 276 struct rtable *rt = skb_rtable(skb);
277 struct net_device *dev = rt->dst.dev; 277 struct net_device *dev = rt->dst.dev;
278 278
279 /* 279 /*
280 * If the indicated interface is up and running, send the packet. 280 * If the indicated interface is up and running, send the packet.
281 */ 281 */
282 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); 282 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
283 283
284 skb->dev = dev; 284 skb->dev = dev;
285 skb->protocol = htons(ETH_P_IP); 285 skb->protocol = htons(ETH_P_IP);
286 286
287 /* 287 /*
288 * Multicasts are looped back for other local users 288 * Multicasts are looped back for other local users
289 */ 289 */
290 290
291 if (rt->rt_flags&RTCF_MULTICAST) { 291 if (rt->rt_flags&RTCF_MULTICAST) {
292 if (sk_mc_loop(sk) 292 if (sk_mc_loop(sk)
293 #ifdef CONFIG_IP_MROUTE 293 #ifdef CONFIG_IP_MROUTE
294 /* Small optimization: do not loopback not local frames, 294 /* Small optimization: do not loopback not local frames,
295 which returned after forwarding; they will be dropped 295 which returned after forwarding; they will be dropped
296 by ip_mr_input in any case. 296 by ip_mr_input in any case.
297 Note, that local frames are looped back to be delivered 297 Note, that local frames are looped back to be delivered
298 to local recipients. 298 to local recipients.
299 299
300 This check is duplicated in ip_mr_input at the moment. 300 This check is duplicated in ip_mr_input at the moment.
301 */ 301 */
302 && 302 &&
303 ((rt->rt_flags & RTCF_LOCAL) || 303 ((rt->rt_flags & RTCF_LOCAL) ||
304 !(IPCB(skb)->flags & IPSKB_FORWARDED)) 304 !(IPCB(skb)->flags & IPSKB_FORWARDED))
305 #endif 305 #endif
306 ) { 306 ) {
307 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 307 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
308 if (newskb) 308 if (newskb)
309 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 309 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
310 newskb, NULL, newskb->dev, 310 newskb, NULL, newskb->dev,
311 dev_loopback_xmit); 311 dev_loopback_xmit);
312 } 312 }
313 313
314 /* Multicasts with ttl 0 must not go beyond the host */ 314 /* Multicasts with ttl 0 must not go beyond the host */
315 315
316 if (ip_hdr(skb)->ttl == 0) { 316 if (ip_hdr(skb)->ttl == 0) {
317 kfree_skb(skb); 317 kfree_skb(skb);
318 return 0; 318 return 0;
319 } 319 }
320 } 320 }
321 321
322 if (rt->rt_flags&RTCF_BROADCAST) { 322 if (rt->rt_flags&RTCF_BROADCAST) {
323 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 323 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
324 if (newskb) 324 if (newskb)
325 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 325 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
326 NULL, newskb->dev, dev_loopback_xmit); 326 NULL, newskb->dev, dev_loopback_xmit);
327 } 327 }
328 328
329 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 329 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
330 skb->dev, ip_finish_output, 330 skb->dev, ip_finish_output,
331 !(IPCB(skb)->flags & IPSKB_REROUTED)); 331 !(IPCB(skb)->flags & IPSKB_REROUTED));
332 } 332 }
333 333
334 int ip_output(struct sock *sk, struct sk_buff *skb) 334 int ip_output(struct sock *sk, struct sk_buff *skb)
335 { 335 {
336 struct net_device *dev = skb_dst(skb)->dev; 336 struct net_device *dev = skb_dst(skb)->dev;
337 337
338 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); 338 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
339 339
340 skb->dev = dev; 340 skb->dev = dev;
341 skb->protocol = htons(ETH_P_IP); 341 skb->protocol = htons(ETH_P_IP);
342 342
343 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, 343 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
344 ip_finish_output, 344 ip_finish_output,
345 !(IPCB(skb)->flags & IPSKB_REROUTED)); 345 !(IPCB(skb)->flags & IPSKB_REROUTED));
346 } 346 }
347 347
348 /* 348 /*
349 * copy saddr and daddr, possibly using 64bit load/stores 349 * copy saddr and daddr, possibly using 64bit load/stores
350 * Equivalent to : 350 * Equivalent to :
351 * iph->saddr = fl4->saddr; 351 * iph->saddr = fl4->saddr;
352 * iph->daddr = fl4->daddr; 352 * iph->daddr = fl4->daddr;
353 */ 353 */
354 static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) 354 static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
355 { 355 {
356 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != 356 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
357 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); 357 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
358 memcpy(&iph->saddr, &fl4->saddr, 358 memcpy(&iph->saddr, &fl4->saddr,
359 sizeof(fl4->saddr) + sizeof(fl4->daddr)); 359 sizeof(fl4->saddr) + sizeof(fl4->daddr));
360 } 360 }
361 361
362 /* Note: skb->sk can be different from sk, in case of tunnels */ 362 /* Note: skb->sk can be different from sk, in case of tunnels */
363 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) 363 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
364 { 364 {
365 struct inet_sock *inet = inet_sk(sk); 365 struct inet_sock *inet = inet_sk(sk);
366 struct ip_options_rcu *inet_opt; 366 struct ip_options_rcu *inet_opt;
367 struct flowi4 *fl4; 367 struct flowi4 *fl4;
368 struct rtable *rt; 368 struct rtable *rt;
369 struct iphdr *iph; 369 struct iphdr *iph;
370 int res; 370 int res;
371 371
372 /* Skip all of this if the packet is already routed, 372 /* Skip all of this if the packet is already routed,
373 * f.e. by something like SCTP. 373 * f.e. by something like SCTP.
374 */ 374 */
375 rcu_read_lock(); 375 rcu_read_lock();
376 inet_opt = rcu_dereference(inet->inet_opt); 376 inet_opt = rcu_dereference(inet->inet_opt);
377 fl4 = &fl->u.ip4; 377 fl4 = &fl->u.ip4;
378 rt = skb_rtable(skb); 378 rt = skb_rtable(skb);
379 if (rt != NULL) 379 if (rt != NULL)
380 goto packet_routed; 380 goto packet_routed;
381 381
382 /* Make sure we can route this packet. */ 382 /* Make sure we can route this packet. */
383 rt = (struct rtable *)__sk_dst_check(sk, 0); 383 rt = (struct rtable *)__sk_dst_check(sk, 0);
384 if (rt == NULL) { 384 if (rt == NULL) {
385 __be32 daddr; 385 __be32 daddr;
386 386
387 /* Use correct destination address if we have options. */ 387 /* Use correct destination address if we have options. */
388 daddr = inet->inet_daddr; 388 daddr = inet->inet_daddr;
389 if (inet_opt && inet_opt->opt.srr) 389 if (inet_opt && inet_opt->opt.srr)
390 daddr = inet_opt->opt.faddr; 390 daddr = inet_opt->opt.faddr;
391 391
392 /* If this fails, retransmit mechanism of transport layer will 392 /* If this fails, retransmit mechanism of transport layer will
393 * keep trying until route appears or the connection times 393 * keep trying until route appears or the connection times
394 * itself out. 394 * itself out.
395 */ 395 */
396 rt = ip_route_output_ports(sock_net(sk), fl4, sk, 396 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
397 daddr, inet->inet_saddr, 397 daddr, inet->inet_saddr,
398 inet->inet_dport, 398 inet->inet_dport,
399 inet->inet_sport, 399 inet->inet_sport,
400 sk->sk_protocol, 400 sk->sk_protocol,
401 RT_CONN_FLAGS(sk), 401 RT_CONN_FLAGS(sk),
402 sk->sk_bound_dev_if); 402 sk->sk_bound_dev_if);
403 if (IS_ERR(rt)) 403 if (IS_ERR(rt))
404 goto no_route; 404 goto no_route;
405 sk_setup_caps(sk, &rt->dst); 405 sk_setup_caps(sk, &rt->dst);
406 } 406 }
407 skb_dst_set_noref(skb, &rt->dst); 407 skb_dst_set_noref(skb, &rt->dst);
408 408
409 packet_routed: 409 packet_routed:
410 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) 410 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
411 goto no_route; 411 goto no_route;
412 412
413 /* OK, we know where to send it, allocate and build IP header. */ 413 /* OK, we know where to send it, allocate and build IP header. */
414 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); 414 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
415 skb_reset_network_header(skb); 415 skb_reset_network_header(skb);
416 iph = ip_hdr(skb); 416 iph = ip_hdr(skb);
417 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 417 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
418 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) 418 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
419 iph->frag_off = htons(IP_DF); 419 iph->frag_off = htons(IP_DF);
420 else 420 else
421 iph->frag_off = 0; 421 iph->frag_off = 0;
422 iph->ttl = ip_select_ttl(inet, &rt->dst); 422 iph->ttl = ip_select_ttl(inet, &rt->dst);
423 iph->protocol = sk->sk_protocol; 423 iph->protocol = sk->sk_protocol;
424 ip_copy_addrs(iph, fl4); 424 ip_copy_addrs(iph, fl4);
425 425
426 /* Transport layer set skb->h.foo itself. */ 426 /* Transport layer set skb->h.foo itself. */
427 427
428 if (inet_opt && inet_opt->opt.optlen) { 428 if (inet_opt && inet_opt->opt.optlen) {
429 iph->ihl += inet_opt->opt.optlen >> 2; 429 iph->ihl += inet_opt->opt.optlen >> 2;
430 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); 430 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
431 } 431 }
432 432
433 ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); 433 ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
434 434
435 /* TODO : should we use skb->sk here instead of sk ? */ 435 /* TODO : should we use skb->sk here instead of sk ? */
436 skb->priority = sk->sk_priority; 436 skb->priority = sk->sk_priority;
437 skb->mark = sk->sk_mark; 437 skb->mark = sk->sk_mark;
438 438
439 res = ip_local_out(skb); 439 res = ip_local_out(skb);
440 rcu_read_unlock(); 440 rcu_read_unlock();
441 return res; 441 return res;
442 442
443 no_route: 443 no_route:
444 rcu_read_unlock(); 444 rcu_read_unlock();
445 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 445 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
446 kfree_skb(skb); 446 kfree_skb(skb);
447 return -EHOSTUNREACH; 447 return -EHOSTUNREACH;
448 } 448 }
449 EXPORT_SYMBOL(ip_queue_xmit); 449 EXPORT_SYMBOL(ip_queue_xmit);
450 450
451 451
452 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 452 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
453 { 453 {
454 to->pkt_type = from->pkt_type; 454 to->pkt_type = from->pkt_type;
455 to->priority = from->priority; 455 to->priority = from->priority;
456 to->protocol = from->protocol; 456 to->protocol = from->protocol;
457 skb_dst_drop(to); 457 skb_dst_drop(to);
458 skb_dst_copy(to, from); 458 skb_dst_copy(to, from);
459 to->dev = from->dev; 459 to->dev = from->dev;
460 to->mark = from->mark; 460 to->mark = from->mark;
461 461
462 /* Copy the flags to each fragment. */ 462 /* Copy the flags to each fragment. */
463 IPCB(to)->flags = IPCB(from)->flags; 463 IPCB(to)->flags = IPCB(from)->flags;
464 464
465 #ifdef CONFIG_NET_SCHED 465 #ifdef CONFIG_NET_SCHED
466 to->tc_index = from->tc_index; 466 to->tc_index = from->tc_index;
467 #endif 467 #endif
468 nf_copy(to, from); 468 nf_copy(to, from);
469 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 469 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
470 to->ipvs_property = from->ipvs_property; 470 to->ipvs_property = from->ipvs_property;
471 #endif 471 #endif
472 skb_copy_secmark(to, from); 472 skb_copy_secmark(to, from);
473 } 473 }
474 474
475 /* 475 /*
476 * This IP datagram is too large to be sent in one piece. Break it up into 476 * This IP datagram is too large to be sent in one piece. Break it up into
477 * smaller pieces (each of size equal to IP header plus 477 * smaller pieces (each of size equal to IP header plus
478 * a block of the data of the original IP data part) that will yet fit in a 478 * a block of the data of the original IP data part) that will yet fit in a
479 * single device frame, and queue such a frame for sending. 479 * single device frame, and queue such a frame for sending.
480 */ 480 */
481 481
482 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 482 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
483 { 483 {
484 struct iphdr *iph; 484 struct iphdr *iph;
485 int ptr; 485 int ptr;
486 struct net_device *dev; 486 struct net_device *dev;
487 struct sk_buff *skb2; 487 struct sk_buff *skb2;
488 unsigned int mtu, hlen, left, len, ll_rs; 488 unsigned int mtu, hlen, left, len, ll_rs;
489 int offset; 489 int offset;
490 __be16 not_last_frag; 490 __be16 not_last_frag;
491 struct rtable *rt = skb_rtable(skb); 491 struct rtable *rt = skb_rtable(skb);
492 int err = 0; 492 int err = 0;
493 493
494 dev = rt->dst.dev; 494 dev = rt->dst.dev;
495 495
496 /* 496 /*
497 * Point into the IP datagram header. 497 * Point into the IP datagram header.
498 */ 498 */
499 499
500 iph = ip_hdr(skb); 500 iph = ip_hdr(skb);
501 501
502 mtu = ip_skb_dst_mtu(skb); 502 mtu = ip_skb_dst_mtu(skb);
503 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || 503 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
504 (IPCB(skb)->frag_max_size && 504 (IPCB(skb)->frag_max_size &&
505 IPCB(skb)->frag_max_size > mtu))) { 505 IPCB(skb)->frag_max_size > mtu))) {
506 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 506 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
507 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 507 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
508 htonl(mtu)); 508 htonl(mtu));
509 kfree_skb(skb); 509 kfree_skb(skb);
510 return -EMSGSIZE; 510 return -EMSGSIZE;
511 } 511 }
512 512
513 /* 513 /*
514 * Setup starting values. 514 * Setup starting values.
515 */ 515 */
516 516
517 hlen = iph->ihl * 4; 517 hlen = iph->ihl * 4;
518 mtu = mtu - hlen; /* Size of data space */ 518 mtu = mtu - hlen; /* Size of data space */
519 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 519 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
520 if (skb->nf_bridge) 520 if (skb->nf_bridge)
521 mtu -= nf_bridge_mtu_reduction(skb); 521 mtu -= nf_bridge_mtu_reduction(skb);
522 #endif 522 #endif
523 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; 523 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
524 524
525 /* When frag_list is given, use it. First, check its validity: 525 /* When frag_list is given, use it. First, check its validity:
526 * some transformers could create wrong frag_list or break existing 526 * some transformers could create wrong frag_list or break existing
527 * one, it is not prohibited. In this case fall back to copying. 527 * one, it is not prohibited. In this case fall back to copying.
528 * 528 *
529 * LATER: this step can be merged to real generation of fragments, 529 * LATER: this step can be merged to real generation of fragments,
530 * we can switch to copy when see the first bad fragment. 530 * we can switch to copy when see the first bad fragment.
531 */ 531 */
532 if (skb_has_frag_list(skb)) { 532 if (skb_has_frag_list(skb)) {
533 struct sk_buff *frag, *frag2; 533 struct sk_buff *frag, *frag2;
534 int first_len = skb_pagelen(skb); 534 int first_len = skb_pagelen(skb);
535 535
536 if (first_len - hlen > mtu || 536 if (first_len - hlen > mtu ||
537 ((first_len - hlen) & 7) || 537 ((first_len - hlen) & 7) ||
538 ip_is_fragment(iph) || 538 ip_is_fragment(iph) ||
539 skb_cloned(skb)) 539 skb_cloned(skb))
540 goto slow_path; 540 goto slow_path;
541 541
542 skb_walk_frags(skb, frag) { 542 skb_walk_frags(skb, frag) {
543 /* Correct geometry. */ 543 /* Correct geometry. */
544 if (frag->len > mtu || 544 if (frag->len > mtu ||
545 ((frag->len & 7) && frag->next) || 545 ((frag->len & 7) && frag->next) ||
546 skb_headroom(frag) < hlen) 546 skb_headroom(frag) < hlen)
547 goto slow_path_clean; 547 goto slow_path_clean;
548 548
549 /* Partially cloned skb? */ 549 /* Partially cloned skb? */
550 if (skb_shared(frag)) 550 if (skb_shared(frag))
551 goto slow_path_clean; 551 goto slow_path_clean;
552 552
553 BUG_ON(frag->sk); 553 BUG_ON(frag->sk);
554 if (skb->sk) { 554 if (skb->sk) {
555 frag->sk = skb->sk; 555 frag->sk = skb->sk;
556 frag->destructor = sock_wfree; 556 frag->destructor = sock_wfree;
557 } 557 }
558 skb->truesize -= frag->truesize; 558 skb->truesize -= frag->truesize;
559 } 559 }
560 560
561 /* Everything is OK. Generate! */ 561 /* Everything is OK. Generate! */
562 562
563 err = 0; 563 err = 0;
564 offset = 0; 564 offset = 0;
565 frag = skb_shinfo(skb)->frag_list; 565 frag = skb_shinfo(skb)->frag_list;
566 skb_frag_list_init(skb); 566 skb_frag_list_init(skb);
567 skb->data_len = first_len - skb_headlen(skb); 567 skb->data_len = first_len - skb_headlen(skb);
568 skb->len = first_len; 568 skb->len = first_len;
569 iph->tot_len = htons(first_len); 569 iph->tot_len = htons(first_len);
570 iph->frag_off = htons(IP_MF); 570 iph->frag_off = htons(IP_MF);
571 ip_send_check(iph); 571 ip_send_check(iph);
572 572
573 for (;;) { 573 for (;;) {
574 /* Prepare header of the next frame, 574 /* Prepare header of the next frame,
575 * before previous one went down. */ 575 * before previous one went down. */
576 if (frag) { 576 if (frag) {
577 frag->ip_summed = CHECKSUM_NONE; 577 frag->ip_summed = CHECKSUM_NONE;
578 skb_reset_transport_header(frag); 578 skb_reset_transport_header(frag);
579 __skb_push(frag, hlen); 579 __skb_push(frag, hlen);
580 skb_reset_network_header(frag); 580 skb_reset_network_header(frag);
581 memcpy(skb_network_header(frag), iph, hlen); 581 memcpy(skb_network_header(frag), iph, hlen);
582 iph = ip_hdr(frag); 582 iph = ip_hdr(frag);
583 iph->tot_len = htons(frag->len); 583 iph->tot_len = htons(frag->len);
584 ip_copy_metadata(frag, skb); 584 ip_copy_metadata(frag, skb);
585 if (offset == 0) 585 if (offset == 0)
586 ip_options_fragment(frag); 586 ip_options_fragment(frag);
587 offset += skb->len - hlen; 587 offset += skb->len - hlen;
588 iph->frag_off = htons(offset>>3); 588 iph->frag_off = htons(offset>>3);
589 if (frag->next != NULL) 589 if (frag->next != NULL)
590 iph->frag_off |= htons(IP_MF); 590 iph->frag_off |= htons(IP_MF);
591 /* Ready, complete checksum */ 591 /* Ready, complete checksum */
592 ip_send_check(iph); 592 ip_send_check(iph);
593 } 593 }
594 594
595 err = output(skb); 595 err = output(skb);
596 596
597 if (!err) 597 if (!err)
598 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 598 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
599 if (err || !frag) 599 if (err || !frag)
600 break; 600 break;
601 601
602 skb = frag; 602 skb = frag;
603 frag = skb->next; 603 frag = skb->next;
604 skb->next = NULL; 604 skb->next = NULL;
605 } 605 }
606 606
607 if (err == 0) { 607 if (err == 0) {
608 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 608 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
609 return 0; 609 return 0;
610 } 610 }
611 611
612 while (frag) { 612 while (frag) {
613 skb = frag->next; 613 skb = frag->next;
614 kfree_skb(frag); 614 kfree_skb(frag);
615 frag = skb; 615 frag = skb;
616 } 616 }
617 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 617 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
618 return err; 618 return err;
619 619
620 slow_path_clean: 620 slow_path_clean:
621 skb_walk_frags(skb, frag2) { 621 skb_walk_frags(skb, frag2) {
622 if (frag2 == frag) 622 if (frag2 == frag)
623 break; 623 break;
624 frag2->sk = NULL; 624 frag2->sk = NULL;
625 frag2->destructor = NULL; 625 frag2->destructor = NULL;
626 skb->truesize += frag2->truesize; 626 skb->truesize += frag2->truesize;
627 } 627 }
628 } 628 }
629 629
630 slow_path: 630 slow_path:
631 /* for offloaded checksums cleanup checksum before fragmentation */ 631 /* for offloaded checksums cleanup checksum before fragmentation */
632 if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) 632 if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
633 goto fail; 633 goto fail;
634 iph = ip_hdr(skb); 634 iph = ip_hdr(skb);
635 635
636 left = skb->len - hlen; /* Space per frame */ 636 left = skb->len - hlen; /* Space per frame */
637 ptr = hlen; /* Where to start from */ 637 ptr = hlen; /* Where to start from */
638 638
639 /* for bridged IP traffic encapsulated inside f.e. a vlan header, 639 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
640 * we need to make room for the encapsulating header 640 * we need to make room for the encapsulating header
641 */ 641 */
642 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); 642 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
643 643
644 /* 644 /*
645 * Fragment the datagram. 645 * Fragment the datagram.
646 */ 646 */
647 647
648 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; 648 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
649 not_last_frag = iph->frag_off & htons(IP_MF); 649 not_last_frag = iph->frag_off & htons(IP_MF);
650 650
651 /* 651 /*
652 * Keep copying data until we run out. 652 * Keep copying data until we run out.
653 */ 653 */
654 654
655 while (left > 0) { 655 while (left > 0) {
656 len = left; 656 len = left;
657 /* IF: it doesn't fit, use 'mtu' - the data space left */ 657 /* IF: it doesn't fit, use 'mtu' - the data space left */
658 if (len > mtu) 658 if (len > mtu)
659 len = mtu; 659 len = mtu;
660 /* IF: we are not sending up to and including the packet end 660 /* IF: we are not sending up to and including the packet end
661 then align the next start on an eight byte boundary */ 661 then align the next start on an eight byte boundary */
662 if (len < left) { 662 if (len < left) {
663 len &= ~7; 663 len &= ~7;
664 } 664 }
665 /* 665 /*
666 * Allocate buffer. 666 * Allocate buffer.
667 */ 667 */
668 668
669 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { 669 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
670 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); 670 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
671 err = -ENOMEM; 671 err = -ENOMEM;
672 goto fail; 672 goto fail;
673 } 673 }
674 674
675 /* 675 /*
676 * Set up data on packet 676 * Set up data on packet
677 */ 677 */
678 678
679 ip_copy_metadata(skb2, skb); 679 ip_copy_metadata(skb2, skb);
680 skb_reserve(skb2, ll_rs); 680 skb_reserve(skb2, ll_rs);
681 skb_put(skb2, len + hlen); 681 skb_put(skb2, len + hlen);
682 skb_reset_network_header(skb2); 682 skb_reset_network_header(skb2);
683 skb2->transport_header = skb2->network_header + hlen; 683 skb2->transport_header = skb2->network_header + hlen;
684 684
685 /* 685 /*
686 * Charge the memory for the fragment to any owner 686 * Charge the memory for the fragment to any owner
687 * it might possess 687 * it might possess
688 */ 688 */
689 689
690 if (skb->sk) 690 if (skb->sk)
691 skb_set_owner_w(skb2, skb->sk); 691 skb_set_owner_w(skb2, skb->sk);
692 692
693 /* 693 /*
694 * Copy the packet header into the new buffer. 694 * Copy the packet header into the new buffer.
695 */ 695 */
696 696
697 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); 697 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
698 698
699 /* 699 /*
700 * Copy a block of the IP datagram. 700 * Copy a block of the IP datagram.
701 */ 701 */
702 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) 702 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
703 BUG(); 703 BUG();
704 left -= len; 704 left -= len;
705 705
706 /* 706 /*
707 * Fill in the new header fields. 707 * Fill in the new header fields.
708 */ 708 */
709 iph = ip_hdr(skb2); 709 iph = ip_hdr(skb2);
710 iph->frag_off = htons((offset >> 3)); 710 iph->frag_off = htons((offset >> 3));
711 711
712 /* ANK: dirty, but effective trick. Upgrade options only if 712 /* ANK: dirty, but effective trick. Upgrade options only if
713 * the segment to be fragmented was THE FIRST (otherwise, 713 * the segment to be fragmented was THE FIRST (otherwise,
714 * options are already fixed) and make it ONCE 714 * options are already fixed) and make it ONCE
715 * on the initial skb, so that all the following fragments 715 * on the initial skb, so that all the following fragments
716 * will inherit fixed options. 716 * will inherit fixed options.
717 */ 717 */
718 if (offset == 0) 718 if (offset == 0)
719 ip_options_fragment(skb); 719 ip_options_fragment(skb);
720 720
721 /* 721 /*
722 * Added AC : If we are fragmenting a fragment that's not the 722 * Added AC : If we are fragmenting a fragment that's not the
723 * last fragment then keep MF on each bit 723 * last fragment then keep MF on each bit
724 */ 724 */
725 if (left > 0 || not_last_frag) 725 if (left > 0 || not_last_frag)
726 iph->frag_off |= htons(IP_MF); 726 iph->frag_off |= htons(IP_MF);
727 ptr += len; 727 ptr += len;
728 offset += len; 728 offset += len;
729 729
730 /* 730 /*
731 * Put this fragment into the sending queue. 731 * Put this fragment into the sending queue.
732 */ 732 */
733 iph->tot_len = htons(len + hlen); 733 iph->tot_len = htons(len + hlen);
734 734
735 ip_send_check(iph); 735 ip_send_check(iph);
736 736
737 err = output(skb2); 737 err = output(skb2);
738 if (err) 738 if (err)
739 goto fail; 739 goto fail;
740 740
741 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 741 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
742 } 742 }
743 consume_skb(skb); 743 consume_skb(skb);
744 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 744 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
745 return err; 745 return err;
746 746
747 fail: 747 fail:
748 kfree_skb(skb); 748 kfree_skb(skb);
749 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 749 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
750 return err; 750 return err;
751 } 751 }
752 EXPORT_SYMBOL(ip_fragment); 752 EXPORT_SYMBOL(ip_fragment);
753 753
754 int 754 int
755 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) 755 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
756 { 756 {
757 struct iovec *iov = from; 757 struct iovec *iov = from;
758 758
759 if (skb->ip_summed == CHECKSUM_PARTIAL) { 759 if (skb->ip_summed == CHECKSUM_PARTIAL) {
760 if (memcpy_fromiovecend(to, iov, offset, len) < 0) 760 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
761 return -EFAULT; 761 return -EFAULT;
762 } else { 762 } else {
763 __wsum csum = 0; 763 __wsum csum = 0;
764 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) 764 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
765 return -EFAULT; 765 return -EFAULT;
766 skb->csum = csum_block_add(skb->csum, csum, odd); 766 skb->csum = csum_block_add(skb->csum, csum, odd);
767 } 767 }
768 return 0; 768 return 0;
769 } 769 }
770 EXPORT_SYMBOL(ip_generic_getfrag); 770 EXPORT_SYMBOL(ip_generic_getfrag);
771 771
772 static inline __wsum 772 static inline __wsum
773 csum_page(struct page *page, int offset, int copy) 773 csum_page(struct page *page, int offset, int copy)
774 { 774 {
775 char *kaddr; 775 char *kaddr;
776 __wsum csum; 776 __wsum csum;
777 kaddr = kmap(page); 777 kaddr = kmap(page);
778 csum = csum_partial(kaddr + offset, copy, 0); 778 csum = csum_partial(kaddr + offset, copy, 0);
779 kunmap(page); 779 kunmap(page);
780 return csum; 780 return csum;
781 } 781 }
782 782
783 static inline int ip_ufo_append_data(struct sock *sk, 783 static inline int ip_ufo_append_data(struct sock *sk,
784 struct sk_buff_head *queue, 784 struct sk_buff_head *queue,
785 int getfrag(void *from, char *to, int offset, int len, 785 int getfrag(void *from, char *to, int offset, int len,
786 int odd, struct sk_buff *skb), 786 int odd, struct sk_buff *skb),
787 void *from, int length, int hh_len, int fragheaderlen, 787 void *from, int length, int hh_len, int fragheaderlen,
788 int transhdrlen, int maxfraglen, unsigned int flags) 788 int transhdrlen, int maxfraglen, unsigned int flags)
789 { 789 {
790 struct sk_buff *skb; 790 struct sk_buff *skb;
791 int err; 791 int err;
792 792
793 /* There is support for UDP fragmentation offload by network 793 /* There is support for UDP fragmentation offload by network
794 * device, so create one single skb packet containing complete 794 * device, so create one single skb packet containing complete
795 * udp datagram 795 * udp datagram
796 */ 796 */
797 if ((skb = skb_peek_tail(queue)) == NULL) { 797 if ((skb = skb_peek_tail(queue)) == NULL) {
798 skb = sock_alloc_send_skb(sk, 798 skb = sock_alloc_send_skb(sk,
799 hh_len + fragheaderlen + transhdrlen + 20, 799 hh_len + fragheaderlen + transhdrlen + 20,
800 (flags & MSG_DONTWAIT), &err); 800 (flags & MSG_DONTWAIT), &err);
801 801
802 if (skb == NULL) 802 if (skb == NULL)
803 return err; 803 return err;
804 804
805 /* reserve space for Hardware header */ 805 /* reserve space for Hardware header */
806 skb_reserve(skb, hh_len); 806 skb_reserve(skb, hh_len);
807 807
808 /* create space for UDP/IP header */ 808 /* create space for UDP/IP header */
809 skb_put(skb, fragheaderlen + transhdrlen); 809 skb_put(skb, fragheaderlen + transhdrlen);
810 810
811 /* initialize network header pointer */ 811 /* initialize network header pointer */
812 skb_reset_network_header(skb); 812 skb_reset_network_header(skb);
813 813
814 /* initialize protocol header pointer */ 814 /* initialize protocol header pointer */
815 skb->transport_header = skb->network_header + fragheaderlen; 815 skb->transport_header = skb->network_header + fragheaderlen;
816 816
817 skb->csum = 0; 817 skb->csum = 0;
818 818
819 819
820 __skb_queue_tail(queue, skb); 820 __skb_queue_tail(queue, skb);
821 } else if (skb_is_gso(skb)) { 821 } else if (skb_is_gso(skb)) {
822 goto append; 822 goto append;
823 } 823 }
824 824
825 skb->ip_summed = CHECKSUM_PARTIAL; 825 skb->ip_summed = CHECKSUM_PARTIAL;
826 /* specify the length of each IP datagram fragment */ 826 /* specify the length of each IP datagram fragment */
827 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; 827 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
828 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 828 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
829 829
830 append: 830 append:
831 return skb_append_datato_frags(sk, skb, getfrag, from, 831 return skb_append_datato_frags(sk, skb, getfrag, from,
832 (length - transhdrlen)); 832 (length - transhdrlen));
833 } 833 }
834 834
835 static int __ip_append_data(struct sock *sk, 835 static int __ip_append_data(struct sock *sk,
836 struct flowi4 *fl4, 836 struct flowi4 *fl4,
837 struct sk_buff_head *queue, 837 struct sk_buff_head *queue,
838 struct inet_cork *cork, 838 struct inet_cork *cork,
839 struct page_frag *pfrag, 839 struct page_frag *pfrag,
840 int getfrag(void *from, char *to, int offset, 840 int getfrag(void *from, char *to, int offset,
841 int len, int odd, struct sk_buff *skb), 841 int len, int odd, struct sk_buff *skb),
842 void *from, int length, int transhdrlen, 842 void *from, int length, int transhdrlen,
843 unsigned int flags) 843 unsigned int flags)
844 { 844 {
845 struct inet_sock *inet = inet_sk(sk); 845 struct inet_sock *inet = inet_sk(sk);
846 struct sk_buff *skb; 846 struct sk_buff *skb;
847 847
848 struct ip_options *opt = cork->opt; 848 struct ip_options *opt = cork->opt;
849 int hh_len; 849 int hh_len;
850 int exthdrlen; 850 int exthdrlen;
851 int mtu; 851 int mtu;
852 int copy; 852 int copy;
853 int err; 853 int err;
854 int offset = 0; 854 int offset = 0;
855 unsigned int maxfraglen, fragheaderlen, maxnonfragsize; 855 unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
856 int csummode = CHECKSUM_NONE; 856 int csummode = CHECKSUM_NONE;
857 struct rtable *rt = (struct rtable *)cork->dst; 857 struct rtable *rt = (struct rtable *)cork->dst;
858 u32 tskey = 0; 858 u32 tskey = 0;
859 859
860 skb = skb_peek_tail(queue); 860 skb = skb_peek_tail(queue);
861 861
862 exthdrlen = !skb ? rt->dst.header_len : 0; 862 exthdrlen = !skb ? rt->dst.header_len : 0;
863 mtu = cork->fragsize; 863 mtu = cork->fragsize;
864 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && 864 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
865 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) 865 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
866 tskey = sk->sk_tskey++; 866 tskey = sk->sk_tskey++;
867 867
868 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 868 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
869 869
870 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 870 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
871 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 871 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
872 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; 872 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
873 873
874 if (cork->length + length > maxnonfragsize - fragheaderlen) { 874 if (cork->length + length > maxnonfragsize - fragheaderlen) {
875 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 875 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
876 mtu - (opt ? opt->optlen : 0)); 876 mtu - (opt ? opt->optlen : 0));
877 return -EMSGSIZE; 877 return -EMSGSIZE;
878 } 878 }
879 879
880 /* 880 /*
881 * transhdrlen > 0 means that this is the first fragment and we wish 881 * transhdrlen > 0 means that this is the first fragment and we wish
882 * it won't be fragmented in the future. 882 * it won't be fragmented in the future.
883 */ 883 */
884 if (transhdrlen && 884 if (transhdrlen &&
885 length + fragheaderlen <= mtu && 885 length + fragheaderlen <= mtu &&
886 rt->dst.dev->features & NETIF_F_V4_CSUM && 886 rt->dst.dev->features & NETIF_F_V4_CSUM &&
887 !exthdrlen) 887 !exthdrlen)
888 csummode = CHECKSUM_PARTIAL; 888 csummode = CHECKSUM_PARTIAL;
889 889
890 cork->length += length; 890 cork->length += length;
891 if (((length > mtu) || (skb && skb_is_gso(skb))) && 891 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
892 (sk->sk_protocol == IPPROTO_UDP) && 892 (sk->sk_protocol == IPPROTO_UDP) &&
893 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { 893 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
894 err = ip_ufo_append_data(sk, queue, getfrag, from, length, 894 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
895 hh_len, fragheaderlen, transhdrlen, 895 hh_len, fragheaderlen, transhdrlen,
896 maxfraglen, flags); 896 maxfraglen, flags);
897 if (err) 897 if (err)
898 goto error; 898 goto error;
899 return 0; 899 return 0;
900 } 900 }
901 901
902 /* So, what's going on in the loop below? 902 /* So, what's going on in the loop below?
903 * 903 *
904 * We use calculated fragment length to generate chained skb, 904 * We use calculated fragment length to generate chained skb,
905 * each of segments is IP fragment ready for sending to network after 905 * each of segments is IP fragment ready for sending to network after
906 * adding appropriate IP header. 906 * adding appropriate IP header.
907 */ 907 */
908 908
909 if (!skb) 909 if (!skb)
910 goto alloc_new_skb; 910 goto alloc_new_skb;
911 911
912 while (length > 0) { 912 while (length > 0) {
913 /* Check if the remaining data fits into current packet. */ 913 /* Check if the remaining data fits into current packet. */
914 copy = mtu - skb->len; 914 copy = mtu - skb->len;
915 if (copy < length) 915 if (copy < length)
916 copy = maxfraglen - skb->len; 916 copy = maxfraglen - skb->len;
917 if (copy <= 0) { 917 if (copy <= 0) {
918 char *data; 918 char *data;
919 unsigned int datalen; 919 unsigned int datalen;
920 unsigned int fraglen; 920 unsigned int fraglen;
921 unsigned int fraggap; 921 unsigned int fraggap;
922 unsigned int alloclen; 922 unsigned int alloclen;
923 struct sk_buff *skb_prev; 923 struct sk_buff *skb_prev;
924 alloc_new_skb: 924 alloc_new_skb:
925 skb_prev = skb; 925 skb_prev = skb;
926 if (skb_prev) 926 if (skb_prev)
927 fraggap = skb_prev->len - maxfraglen; 927 fraggap = skb_prev->len - maxfraglen;
928 else 928 else
929 fraggap = 0; 929 fraggap = 0;
930 930
931 /* 931 /*
932 * If remaining data exceeds the mtu, 932 * If remaining data exceeds the mtu,
933 * we know we need more fragment(s). 933 * we know we need more fragment(s).
934 */ 934 */
935 datalen = length + fraggap; 935 datalen = length + fraggap;
936 if (datalen > mtu - fragheaderlen) 936 if (datalen > mtu - fragheaderlen)
937 datalen = maxfraglen - fragheaderlen; 937 datalen = maxfraglen - fragheaderlen;
938 fraglen = datalen + fragheaderlen; 938 fraglen = datalen + fragheaderlen;
939 939
940 if ((flags & MSG_MORE) && 940 if ((flags & MSG_MORE) &&
941 !(rt->dst.dev->features&NETIF_F_SG)) 941 !(rt->dst.dev->features&NETIF_F_SG))
942 alloclen = mtu; 942 alloclen = mtu;
943 else 943 else
944 alloclen = fraglen; 944 alloclen = fraglen;
945 945
			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

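/* Two copy strategies in the loop above: a device without NETIF_F_SG
 * gets linear skbs filled via skb_put()/getfrag(), while a
 * scatter-gather device gets page fragments from the per-socket
 * page_frag, coalesced into the last fragment when skb_can_coalesce()
 * allows it.
 */
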
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal the reference to this route; the caller must not release it.
	 */
	*rtp = NULL;
	cork->fragsize = ip_sk_use_pmtu(sk) ?
			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->ttl = ipc->ttl;
	cork->tos = ipc->tos;
	cork->priority = ipc->priority;
	cork->tx_flags = ipc->tx_flags;

	return 0;
}

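/* The 40 passed to kmalloc() above is the maximum size of IP options:
 * a 60-byte maximum header minus the 20-byte fixed part. The cork also
 * takes over the caller's route reference (*rtp is cleared), which
 * ip_cork_release() later drops via dst_release().
 */
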
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece is held on the socket until
 *	ip_push_pending_frames() is called. Each piece can be page or
 *	non-page data.
 *
 *	Not only UDP, but other transport protocols - e.g. raw sockets -
 *	can potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
				from, length, transhdrlen, flags);
}

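/* Only the first call on an empty write queue sets up the cork and may
 * carry a transport header (transhdrlen); follow-up appends while the
 * socket stays corked pass transhdrlen == 0 so the header is not
 * added twice.
 */
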
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;

	if (cork->length + size > maxnonfragsize - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
								   skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

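/* ip_append_page() is the zero-copy path (used by e.g. udp_sendpage()):
 * instead of copying, it takes a reference on the caller's page and
 * hangs it off the tail skb, which is why it insists on NETIF_F_SG and
 * refuses IP_HDRINCL sockets.
 */
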
static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO),
	 * we allow the frame generated here to be fragmented. No matter
	 * how transforms change the size of the packet, it will come out.
	 */
	skb->ignore_df = ip_sk_ignore_df(sk);

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If ignore_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (cork->ttl != 0)
		ttl = cork->ttl;
	else if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);
	ip_select_ident(skb, sk);

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = (cork->tos != -1) ? cork->priority : sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
				     skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

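/* The queue is collapsed into a single skb here: the first skb becomes
 * the head and every later fragment is chained onto its frag_list, so
 * one IP header (built just above) covers the whole datagram and the
 * output path re-fragments it if needed.
 */
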
int ip_send_skb(struct net *net, struct sk_buff *skb)
{
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

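/* ip_local_out() can return positive NET_XMIT_* codes; net_xmit_errno()
 * maps them to 0 or a negative errno (e.g. NET_XMIT_DROP becomes
 * -ENOBUFS), so only real failures are counted as output discards.
 */
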
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(sock_net(sk), skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork,
			       &current->task_frag, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

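/* Unlike ip_append_data(), ip_make_skb() works on a private queue and
 * an on-stack cork, never touching sk->sk_write_queue; this lets
 * callers such as the lockless UDP transmit path build a one-shot
 * datagram without corking the socket.
 */
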
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send some TCP resets/acks so far.
 *
 *	Use a fake percpu inet socket to avoid false sharing and contention.
 */
static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
	.sk = {
		.__sk_common = {
			.skc_refcnt = ATOMIC_INIT(1),
		},
		.sk_wmem_alloc = ATOMIC_INIT(1),
		.sk_allocation = GFP_ATOMIC,
		.sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
+		.sk_pacing_rate = ~0U,
	},
	.pmtudisc = IP_PMTUDISC_WANT,
	.uc_ttl = -1,
};

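/* The one added line above (marked +) is the whole fix. A static percpu
 * struct starts out zeroed, so unicast_sock previously had
 * sk_pacing_rate == 0. sk_pacing_rate is expressed in bytes per second
 * and is honoured by pacing-aware qdiscs such as sch_fq; ~0U is the
 * conventional "unlimited" value, which exempts replies built on this
 * socket from pacing.
 */
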
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
			   const struct ip_options *sopt,
			   __be32 daddr, __be32 saddr,
			   const struct ip_reply_arg *arg,
			   unsigned int len)
{
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);
	struct sk_buff *nskb;
	struct sock *sk;
	struct inet_sock *inet;
	int err;

	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;
	ipc.ttl = 0;
	ipc.tos = -1;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if,
			   IP4_REPLY_MARK(net, skb->mark),
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, saddr,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	inet = &get_cpu_var(unicast_sock);

	inet->tos = arg->tos;
	sk = &inet->sk;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	sock_net_set(sk, net);
	__skb_queue_head_init(&sk->sk_write_queue);
	sk->sk_sndbuf = sysctl_wmem_default;
	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
			     len, 0, &ipc, &rt, MSG_DONTWAIT);
	if (unlikely(err)) {
		ip_flush_pending_frames(sk);
		goto out;
	}

	nskb = skb_peek(&sk->sk_write_queue);
	if (nskb) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(nskb) +
			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
								arg->csum));
		nskb->ip_summed = CHECKSUM_NONE;
		skb_orphan(nskb);
		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
		ip_push_pending_frames(sk, &fl4);
	}
out:
	put_cpu_var(unicast_sock);

	ip_rt_put(rt);
}

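/* Each CPU borrows its unicast_sock under get_cpu_var(), stamps the
 * per-reply fields (tos, priority, protocol, bound device, netns),
 * builds the reply through the ordinary ip_append_data() /
 * ip_push_pending_frames() machinery, then releases it with
 * put_cpu_var(). skb_orphan() detaches the skb from the fake socket so
 * nothing references it once the reply has left.
 */
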
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST)
	igmp_mc_init();
#endif
}

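For context on why ~0U behaves as "no pacing": fq-style schedulers space a
flow's packets in time according to the socket's pacing rate. The sketch
below is a simplification for illustration only, not the code in
net/sched/sch_fq.c, and pacing_delay_ns() is a hypothetical helper name.

	#include <linux/kernel.h>
	#include <linux/math64.h>
	#include <linux/time.h>

	/* Hypothetical sketch of the pacing computation: the inter-packet
	 * gap is roughly len / pacing_rate seconds. ~0U means unlimited;
	 * a rate accidentally left at zero would stall the flow, which is
	 * the situation the unicast_sock initializer above avoids.
	 */
	static u64 pacing_delay_ns(u32 len, u32 pacing_rate)
	{
		if (pacing_rate == ~0U)
			return 0;		/* unlimited: no gap */
		if (!pacing_rate)
			return U64_MAX;		/* degenerate: never sends */
		return div_u64((u64)len * NSEC_PER_SEC, pacing_rate);
	}

With pacing_rate = ~0U, every reply is eligible to leave immediately, so the
RST/ACK packets generated by ip_send_unicast_reply() are not queued behind a
per-flow pacing limit.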