Blame view
net/ipv4/ip_output.c
43 KB
457c89965
|
1 |
// SPDX-License-Identifier: GPL-2.0-only |
1da177e4c
|
2 3 4 5 6 7 8 |
/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) output module. * |
02c30a84e
|
9 |
* Authors: Ross Biro |
1da177e4c
|
10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <Alan.Cox@linux.org> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Hirokazu Takahashi, <taka@valinux.co.jp> * * See ip_input.c for original log * * Fixes: * Alan Cox : Missing nonblock feature in ip_build_xmit. * Mike Kilburn : htons() missing in ip_build_xmit. |
e905a9eda
|
24 |
* Bradford Johnson: Fix faulty handling of some frames when |
1da177e4c
|
25 26 27 28 29 30 31 32 33 34 |
* no route is found. * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit * (in case if packet not accepted by * output firewall rules) * Mike McLagan : Routing by source * Alexey Kuznetsov: use new route cache * Andi Kleen: Fix broken PMTU recovery and remove * some redundant tests. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Andi Kleen : Replace ip_reply with ip_send_reply. |
e905a9eda
|
35 36 37 |
* Andi Kleen : Split fast and slow ip_build_xmit path * for decreased register pressure on x86 * and more readibility. |
1da177e4c
|
38 39 40 41 42 43 44 |
* Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. * Hirokazu Takahashi: HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi: sendfile() on UDP works now. */ |
7c0f6ba68
|
45 |
#include <linux/uaccess.h> |
1da177e4c
|
46 47 48 |
#include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> |
1da177e4c
|
49 50 51 |
#include <linux/mm.h> #include <linux/string.h> #include <linux/errno.h> |
a1f8e7f7f
|
52 |
#include <linux/highmem.h> |
5a0e3ad6a
|
53 |
#include <linux/slab.h> |
1da177e4c
|
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
#include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> |
cfacb0577
|
69 |
#include <net/xfrm.h> |
1da177e4c
|
70 71 72 73 |
#include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> |
1da177e4c
|
74 75 |
#include <net/checksum.h> #include <net/inetpeer.h> |
ba9e04a7d
|
76 |
#include <net/inet_ecn.h> |
14972cbd3
|
77 |
#include <net/lwtunnel.h> |
33b486793
|
78 |
#include <linux/bpf-cgroup.h> |
1da177e4c
|
79 80 81 |
#include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> |
1da177e4c
|
82 |
#include <linux/netlink.h> |
6cbb0df78
|
83 |
#include <linux/tcp.h> |
1da177e4c
|
84 |
|
694869b3c
|
85 86 87 88 |
static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, int (*output)(struct net *, struct sock *, struct sk_buff *)); |
49d16b23c
|
89 |
|
1da177e4c
|
90 |
/* Generate a checksum for an outgoing IP datagram. */ |
2fbd96797
|
91 |
void ip_send_check(struct iphdr *iph) |
1da177e4c
|
92 93 94 95 |
{ iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } |
4bc2f18ba
|
96 |
EXPORT_SYMBOL(ip_send_check); |
1da177e4c
|
97 |
|
cf91a99da
|
98 |
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) |
c439cb2e4
|
99 100 101 102 103 |
{ struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); |
a8e3e1a9f
|
104 105 106 107 108 109 110 |
/* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_out(sk, skb); if (unlikely(!skb)) return 0; |
f41804391
|
111 |
skb->protocol = htons(ETH_P_IP); |
29a26a568
|
112 113 |
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, |
13206b6bf
|
114 |
dst_output); |
7026b1ddb
|
115 |
} |
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(net, sk, skb);
	/* A return of 1 means the netfilter hook passed the skb through;
	 * hand it to the dst output path.
	 */
	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
c439cb2e4
|
126 |
|
1da177e4c
|
127 128 129 130 131 |
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { int ttl = inet->uc_ttl; if (ttl < 0) |
323e126f0
|
132 |
ttl = ip4_dst_hoplimit(dst); |
1da177e4c
|
133 134 |
return ttl; } |
e905a9eda
|
135 |
/* |
1da177e4c
|
136 137 138 |
* Add an ip header to a skbuff and send it out. * */ |
cfe673b0a
|
139 |
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, |
de033b7d1
|
140 141 |
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos) |
1da177e4c
|
142 143 |
{ struct inet_sock *inet = inet_sk(sk); |
511c3f92a
|
144 |
struct rtable *rt = skb_rtable(skb); |
77589ce0f
|
145 |
struct net *net = sock_net(sk); |
1da177e4c
|
146 147 148 |
struct iphdr *iph; /* Build the IP header. */ |
f6d8bd051
|
149 |
skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); |
8856dfa3e
|
150 |
skb_reset_network_header(skb); |
eddc9ec53
|
151 |
iph = ip_hdr(skb); |
1da177e4c
|
152 153 |
iph->version = 4; iph->ihl = 5; |
de033b7d1
|
154 |
iph->tos = tos; |
d8d1f30b9
|
155 |
iph->ttl = ip_select_ttl(inet, &rt->dst); |
dd927a269
|
156 157 |
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; |
1da177e4c
|
158 |
iph->protocol = sk->sk_protocol; |
cfe673b0a
|
159 160 161 162 163 |
if (ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; |
77589ce0f
|
164 |
__ip_select_ident(net, iph, 1); |
cfe673b0a
|
165 |
} |
1da177e4c
|
166 |
|
f6d8bd051
|
167 168 169 |
if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; ip_options_build(skb, &opt->opt, daddr, rt, 0); |
1da177e4c
|
170 |
} |
1da177e4c
|
171 172 |
skb->priority = sk->sk_priority; |
e05a90ec9
|
173 174 |
if (!skb->mark) skb->mark = sk->sk_mark; |
1da177e4c
|
175 176 |
/* Send it out. */ |
33224b16f
|
177 |
return ip_local_out(net, skb->sk, skb); |
1da177e4c
|
178 |
} |
d8c97a945
|
179 |
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); |
694869b3c
|
180 |
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) |
1da177e4c
|
181 |
{ |
adf30907d
|
182 |
struct dst_entry *dst = skb_dst(skb); |
80787ebc2
|
183 |
struct rtable *rt = (struct rtable *)dst; |
1da177e4c
|
184 |
struct net_device *dev = dst->dev; |
c2636b4d9
|
185 |
unsigned int hh_len = LL_RESERVED_SPACE(dev); |
f6b72b621
|
186 |
struct neighbour *neigh; |
5c9f7c1df
|
187 |
bool is_v6gw = false; |
1da177e4c
|
188 |
|
edf391ff1
|
189 |
if (rt->rt_type == RTN_MULTICAST) { |
4ba1bf429
|
190 |
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); |
edf391ff1
|
191 |
} else if (rt->rt_type == RTN_BROADCAST) |
4ba1bf429
|
192 |
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); |
80787ebc2
|
193 |
|
1da177e4c
|
194 |
/* Be paranoid, rather than too clever. */ |
3b04ddde0
|
195 |
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { |
1da177e4c
|
196 197 198 |
struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); |
51456b291
|
199 |
if (!skb2) { |
1da177e4c
|
200 201 202 203 204 |
kfree_skb(skb); return -ENOMEM; } if (skb->sk) skb_set_owner_w(skb2, skb->sk); |
5d0ba55b6
|
205 |
consume_skb(skb); |
1da177e4c
|
206 207 |
skb = skb2; } |
14972cbd3
|
208 209 210 211 212 213 |
if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); if (res < 0 || res == LWTUNNEL_XMIT_DONE) return res; } |
a263b3093
|
214 |
rcu_read_lock_bh(); |
5c9f7c1df
|
215 |
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); |
9871f1ad6
|
216 |
if (!IS_ERR(neigh)) { |
4ff062035
|
217 218 219 |
int res; sock_confirm_neigh(skb, neigh); |
5c9f7c1df
|
220 221 |
/* if crossing protocols, can not use the cached header */ res = neigh_output(neigh, skb, is_v6gw); |
a263b3093
|
222 |
rcu_read_unlock_bh(); |
f2c31e32b
|
223 224 |
return res; } |
a263b3093
|
225 |
rcu_read_unlock_bh(); |
05e3aa094
|
226 |
|
e87cc4728
|
227 228 229 |
net_dbg_ratelimited("%s: No header cache and no neighbour! ", __func__); |
1da177e4c
|
230 231 232 |
kfree_skb(skb); return -EINVAL; } |
694869b3c
|
233 234 |
static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) |
c7ba65d7b
|
235 |
{ |
88bebdf5b
|
236 |
struct sk_buff *segs, *nskb; |
c7ba65d7b
|
237 |
netdev_features_t features; |
c7ba65d7b
|
238 |
int ret = 0; |
9ee6c5dc8
|
239 |
/* common case: seglen is <= mtu |
359ebda25
|
240 |
*/ |
779b7931b
|
241 |
if (skb_gso_validate_network_len(skb, mtu)) |
694869b3c
|
242 |
return ip_finish_output2(net, sk, skb); |
c7ba65d7b
|
243 |
|
0ace81ec7
|
244 |
/* Slowpath - GSO segment length exceeds the egress MTU. |
c7ba65d7b
|
245 |
* |
0ace81ec7
|
246 247 248 249 250 251 252 253 254 255 |
* This can happen in several cases: * - Forwarding of a TCP GRO skb, when DF flag is not set. * - Forwarding of an skb that arrived on a virtualization interface * (virtio-net/vhost/tap) with TSO/GSO size set by other network * stack. * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an * interface with a smaller MTU. * - Arriving GRO skb (or GSO skb in a virtualized environment) that is * bridged to a NETIF_F_TSO tunnel stacked over an interface with an * insufficent MTU. |
c7ba65d7b
|
256 257 |
*/ features = netif_skb_features(skb); |
a08e7fd91
|
258 |
BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); |
c7ba65d7b
|
259 |
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); |
330966e50
|
260 |
if (IS_ERR_OR_NULL(segs)) { |
c7ba65d7b
|
261 262 263 264 265 |
kfree_skb(skb); return -ENOMEM; } consume_skb(skb); |
88bebdf5b
|
266 |
skb_list_walk_safe(segs, segs, nskb) { |
c7ba65d7b
|
267 |
int err; |
a8305bff6
|
268 |
skb_mark_not_on_list(segs); |
694869b3c
|
269 |
err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); |
c7ba65d7b
|
270 271 272 |
if (err && ret == 0) ret = err; |
88bebdf5b
|
273 |
} |
c7ba65d7b
|
274 275 276 |
return ret; } |
956fe2190
|
277 |
static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) |
1da177e4c
|
278 |
{ |
c5501eb34
|
279 |
unsigned int mtu; |
5c901daae
|
280 281 |
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ |
00db41243
|
282 |
if (skb_dst(skb)->xfrm) { |
48d5cad87
|
283 |
IPCB(skb)->flags |= IPSKB_REROUTED; |
13206b6bf
|
284 |
return dst_output(net, sk, skb); |
48d5cad87
|
285 |
} |
5c901daae
|
286 |
#endif |
fedbb6b4f
|
287 |
mtu = ip_skb_dst_mtu(sk, skb); |
c7ba65d7b
|
288 |
if (skb_is_gso(skb)) |
694869b3c
|
289 |
return ip_finish_output_gso(net, sk, skb, mtu); |
c7ba65d7b
|
290 |
|
b0ff6d00e
|
291 |
if (skb->len > mtu || IPCB(skb)->frag_max_size) |
694869b3c
|
292 |
return ip_fragment(net, sk, skb, mtu, ip_finish_output2); |
c7ba65d7b
|
293 |
|
694869b3c
|
294 |
return ip_finish_output2(net, sk, skb); |
1da177e4c
|
295 |
} |
956fe2190
|
296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: return __ip_finish_output(net, sk, skb); case NET_XMIT_CN: return __ip_finish_output(net, sk, skb) ? : ret; default: kfree_skb(skb); return ret; } } |
33b486793
|
311 312 313 |
static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { |
5b18f1289
|
314 |
struct rtable *new_rt; |
d96ff269a
|
315 316 |
bool do_cn = false; int ret, err; |
33b486793
|
317 318 |
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); |
956fe2190
|
319 |
switch (ret) { |
956fe2190
|
320 |
case NET_XMIT_CN: |
d96ff269a
|
321 |
do_cn = true; |
a8eceea84
|
322 |
fallthrough; |
d96ff269a
|
323 324 |
case NET_XMIT_SUCCESS: break; |
956fe2190
|
325 |
default: |
33b486793
|
326 327 328 |
kfree_skb(skb); return ret; } |
5b18f1289
|
329 330 331 332 333 334 335 336 337 338 |
/* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten, * see ipv4_pktinfo_prepare(). */ new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb)); if (new_rt) { new_rt->rt_iif = 0; skb_dst_drop(skb); skb_dst_set(skb, &new_rt->dst); } |
d96ff269a
|
339 340 |
err = dev_loopback_xmit(net, sk, skb); return (do_cn && err) ? ret : err; |
33b486793
|
341 |
} |
ede2059db
|
342 |
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) |
1da177e4c
|
343 |
{ |
511c3f92a
|
344 |
struct rtable *rt = skb_rtable(skb); |
d8d1f30b9
|
345 |
struct net_device *dev = rt->dst.dev; |
1da177e4c
|
346 347 348 349 |
/* * If the indicated interface is up and running, send the packet. */ |
88f5cc245
|
350 |
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); |
1da177e4c
|
351 352 353 354 355 356 357 358 359 |
skb->dev = dev; skb->protocol = htons(ETH_P_IP); /* * Multicasts are looped back for other local users */ if (rt->rt_flags&RTCF_MULTICAST) { |
7ad6848c7
|
360 |
if (sk_mc_loop(sk) |
1da177e4c
|
361 362 363 364 365 366 367 368 369 |
#ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped by ip_mr_input in any case. Note, that local frames are looped back to be delivered to local recipients. This check is duplicated in ip_mr_input at the moment. */ |
9d4fb27db
|
370 371 372 |
&& ((rt->rt_flags & RTCF_LOCAL) || !(IPCB(skb)->flags & IPSKB_FORWARDED)) |
1da177e4c
|
373 |
#endif |
9d4fb27db
|
374 |
) { |
1da177e4c
|
375 376 |
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) |
9bbc768aa
|
377 |
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, |
29a26a568
|
378 |
net, sk, newskb, NULL, newskb->dev, |
33b486793
|
379 |
ip_mc_finish_output); |
1da177e4c
|
380 381 382 |
} /* Multicasts with ttl 0 must not go beyond the host */ |
eddc9ec53
|
383 |
if (ip_hdr(skb)->ttl == 0) { |
1da177e4c
|
384 385 386 387 388 389 390 391 |
kfree_skb(skb); return 0; } } if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) |
29a26a568
|
392 393 |
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, |
33b486793
|
394 |
ip_mc_finish_output); |
1da177e4c
|
395 |
} |
29a26a568
|
396 397 398 |
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, ip_finish_output, |
48d5cad87
|
399 |
!(IPCB(skb)->flags & IPSKB_REROUTED)); |
1da177e4c
|
400 |
} |
ede2059db
|
401 |
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) |
1da177e4c
|
402 |
{ |
28f8bfd1a
|
403 |
struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; |
1bd9bef6f
|
404 |
|
88f5cc245
|
405 |
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); |
1da177e4c
|
406 |
|
1bd9bef6f
|
407 408 |
skb->dev = dev; skb->protocol = htons(ETH_P_IP); |
29a26a568
|
409 |
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, |
28f8bfd1a
|
410 |
net, sk, skb, indev, dev, |
e905a9eda
|
411 |
ip_finish_output, |
48d5cad87
|
412 |
!(IPCB(skb)->flags & IPSKB_REROUTED)); |
1da177e4c
|
413 |
} |
84f9307c5
|
414 415 416 417 418 419 420 421 422 423 424 425 426 |
/* * copy saddr and daddr, possibly using 64bit load/stores * Equivalent to : * iph->saddr = fl4->saddr; * iph->daddr = fl4->daddr; */ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); memcpy(&iph->saddr, &fl4->saddr, sizeof(fl4->saddr) + sizeof(fl4->daddr)); } |
b0270e910
|
427 |
/* Note: skb->sk can be different from sk, in case of tunnels */ |
69b9e1e07
|
428 429 |
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos) |
1da177e4c
|
430 |
{ |
1da177e4c
|
431 |
struct inet_sock *inet = inet_sk(sk); |
77589ce0f
|
432 |
struct net *net = sock_net(sk); |
f6d8bd051
|
433 |
struct ip_options_rcu *inet_opt; |
b57ae01a8
|
434 |
struct flowi4 *fl4; |
1da177e4c
|
435 436 |
struct rtable *rt; struct iphdr *iph; |
ab6e3feba
|
437 |
int res; |
1da177e4c
|
438 439 440 441 |
/* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ |
ab6e3feba
|
442 |
rcu_read_lock(); |
f6d8bd051
|
443 |
inet_opt = rcu_dereference(inet->inet_opt); |
ea4fc0d61
|
444 |
fl4 = &fl->u.ip4; |
511c3f92a
|
445 |
rt = skb_rtable(skb); |
00db41243
|
446 |
if (rt) |
1da177e4c
|
447 448 449 450 |
goto packet_routed; /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); |
51456b291
|
451 |
if (!rt) { |
3ca3c68e7
|
452 |
__be32 daddr; |
1da177e4c
|
453 454 |
/* Use correct destination address if we have options. */ |
c720c7e83
|
455 |
daddr = inet->inet_daddr; |
f6d8bd051
|
456 457 |
if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; |
1da177e4c
|
458 |
|
78fbfd8a6
|
459 460 461 462 |
/* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ |
77589ce0f
|
463 |
rt = ip_route_output_ports(net, fl4, sk, |
78fbfd8a6
|
464 465 466 467 |
daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, |
69b9e1e07
|
468 |
RT_CONN_FLAGS_TOS(sk, tos), |
78fbfd8a6
|
469 470 471 |
sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; |
d8d1f30b9
|
472 |
sk_setup_caps(sk, &rt->dst); |
1da177e4c
|
473 |
} |
d8d1f30b9
|
474 |
skb_dst_set_noref(skb, &rt->dst); |
1da177e4c
|
475 476 |
packet_routed: |
77d5bc7e6
|
477 |
if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) |
1da177e4c
|
478 479 480 |
goto no_route; /* OK, we know where to send it, allocate and build IP header. */ |
f6d8bd051
|
481 |
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); |
8856dfa3e
|
482 |
skb_reset_network_header(skb); |
eddc9ec53
|
483 |
iph = ip_hdr(skb); |
69b9e1e07
|
484 |
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); |
60ff74673
|
485 |
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) |
1da177e4c
|
486 487 488 |
iph->frag_off = htons(IP_DF); else iph->frag_off = 0; |
d8d1f30b9
|
489 |
iph->ttl = ip_select_ttl(inet, &rt->dst); |
1da177e4c
|
490 |
iph->protocol = sk->sk_protocol; |
84f9307c5
|
491 |
ip_copy_addrs(iph, fl4); |
1da177e4c
|
492 |
/* Transport layer set skb->h.foo itself. */ |
f6d8bd051
|
493 494 495 |
if (inet_opt && inet_opt->opt.optlen) { iph->ihl += inet_opt->opt.optlen >> 2; ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); |
1da177e4c
|
496 |
} |
77589ce0f
|
497 |
ip_select_ident_segs(net, skb, sk, |
b6a7719ae
|
498 |
skb_shinfo(skb)->gso_segs ?: 1); |
1da177e4c
|
499 |
|
b0270e910
|
500 |
/* TODO : should we use skb->sk here instead of sk ? */ |
1da177e4c
|
501 |
skb->priority = sk->sk_priority; |
4a19ec580
|
502 |
skb->mark = sk->sk_mark; |
1da177e4c
|
503 |
|
33224b16f
|
504 |
res = ip_local_out(net, sk, skb); |
ab6e3feba
|
505 506 |
rcu_read_unlock(); return res; |
1da177e4c
|
507 508 |
no_route: |
ab6e3feba
|
509 |
rcu_read_unlock(); |
77589ce0f
|
510 |
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); |
1da177e4c
|
511 512 513 |
kfree_skb(skb); return -EHOSTUNREACH; } |
69b9e1e07
|
514 |
EXPORT_SYMBOL(__ip_queue_xmit); |
1da177e4c
|
515 |
|
05e22e839
|
516 517 518 519 520 |
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); } EXPORT_SYMBOL(ip_queue_xmit); |
1da177e4c
|
521 522 523 524 525 |
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; |
d2f0c9611
|
526 |
to->skb_iif = from->skb_iif; |
adf30907d
|
527 |
skb_dst_drop(to); |
fe76cda30
|
528 |
skb_dst_copy(to, from); |
1da177e4c
|
529 |
to->dev = from->dev; |
82e91ffef
|
530 |
to->mark = from->mark; |
1da177e4c
|
531 |
|
3dd1c9a12
|
532 |
skb_copy_hash(to, from); |
1da177e4c
|
533 534 535 |
#ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif |
e7ac05f34
|
536 |
nf_copy(to, from); |
df5042f4c
|
537 |
skb_ext_copy(to, from); |
6ca40d4e8
|
538 |
#if IS_ENABLED(CONFIG_IP_VS) |
c98d80edc
|
539 540 |
to->ipvs_property = from->ipvs_property; #endif |
984bc16cc
|
541 |
skb_copy_secmark(to, from); |
1da177e4c
|
542 |
} |
694869b3c
|
543 |
static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, |
c5501eb34
|
544 |
unsigned int mtu, |
694869b3c
|
545 |
int (*output)(struct net *, struct sock *, struct sk_buff *)) |
49d16b23c
|
546 547 |
{ struct iphdr *iph = ip_hdr(skb); |
49d16b23c
|
548 |
|
d6b915e29
|
549 |
if ((iph->frag_off & htons(IP_DF)) == 0) |
694869b3c
|
550 |
return ip_do_fragment(net, sk, skb, output); |
d6b915e29
|
551 552 |
if (unlikely(!skb->ignore_df || |
49d16b23c
|
553 554 |
(IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { |
9479b0af4
|
555 |
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); |
49d16b23c
|
556 557 558 559 560 |
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } |
694869b3c
|
561 |
return ip_do_fragment(net, sk, skb, output); |
49d16b23c
|
562 |
} |
c8b17be0b
|
563 564 565 566 |
void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter) { unsigned int first_len = skb_pagelen(skb); |
b70341467
|
567 |
iter->frag = skb_shinfo(skb)->frag_list; |
c8b17be0b
|
568 569 570 571 572 573 574 575 576 577 578 579 580 |
skb_frag_list_init(skb); iter->offset = 0; iter->iph = iph; iter->hlen = hlen; skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_init); |
19c3401a9
|
581 582 583 584 585 586 587 588 589 590 591 |
static void ip_fraglist_ipcb_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) { struct sk_buff *to = iter->frag; /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(skb)->flags; if (iter->offset == 0) ip_options_fragment(to); } |
c8b17be0b
|
592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 |
void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) { unsigned int hlen = iter->hlen; struct iphdr *iph = iter->iph; struct sk_buff *frag; frag = iter->frag; frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iph, hlen); iter->iph = ip_hdr(frag); iph = iter->iph; iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); |
c8b17be0b
|
608 609 610 611 612 613 614 615 |
iter->offset += skb->len - hlen; iph->frag_off = htons(iter->offset >> 3); if (frag->next) iph->frag_off |= htons(IP_MF); /* Ready, complete checksum */ ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_prepare); |
065ff79f8
|
616 |
void ip_frag_init(struct sk_buff *skb, unsigned int hlen, |
e7a409c3f
|
617 |
unsigned int ll_rs, unsigned int mtu, bool DF, |
065ff79f8
|
618 619 620 |
struct ip_frag_state *state) { struct iphdr *iph = ip_hdr(skb); |
e7a409c3f
|
621 |
state->DF = DF; |
065ff79f8
|
622 623 624 625 626 627 628 629 630 631 632 |
state->hlen = hlen; state->ll_rs = ll_rs; state->mtu = mtu; state->left = skb->len - hlen; /* Space per frame */ state->ptr = hlen; /* Where to start from */ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; state->not_last_frag = iph->frag_off & htons(IP_MF); } EXPORT_SYMBOL(ip_frag_init); |
19c3401a9
|
633 634 635 636 637 |
static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, bool first_frag, struct ip_frag_state *state) { /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; |
19c3401a9
|
638 639 640 641 642 643 644 645 646 |
/* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE * on the initial skb, so that all the following fragments * will inherit fixed options. */ if (first_frag) ip_options_fragment(from); } |
065ff79f8
|
647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 |
struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) { unsigned int len = state->left; struct sk_buff *skb2; struct iphdr *iph; len = state->left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu) len = state->mtu; /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < state->left) { len &= ~7; } /* Allocate buffer */ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); /* * Set up data on packet */ ip_copy_metadata(skb2, skb); skb_reserve(skb2, state->ll_rs); skb_put(skb2, len + state->hlen); skb_reset_network_header(skb2); skb2->transport_header = skb2->network_header + state->hlen; /* * Charge the memory for the fragment to any owner * it might possess */ if (skb->sk) skb_set_owner_w(skb2, skb->sk); /* * Copy the packet header into the new buffer. */ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen); /* * Copy a block of the IP datagram. */ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len)) BUG(); state->left -= len; /* * Fill in the new header fields. */ iph = ip_hdr(skb2); iph->frag_off = htons((state->offset >> 3)); |
e7a409c3f
|
704 705 |
if (state->DF) iph->frag_off |= htons(IP_DF); |
065ff79f8
|
706 |
|
065ff79f8
|
707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 |
/* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit */ if (state->left > 0 || state->not_last_frag) iph->frag_off |= htons(IP_MF); state->ptr += len; state->offset += len; iph->tot_len = htons(len + state->hlen); ip_send_check(iph); return skb2; } EXPORT_SYMBOL(ip_frag_next); |
1da177e4c
|
723 724 725 726 727 728 |
/* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending. */ |
694869b3c
|
729 730 |
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) |
1da177e4c
|
731 732 |
{ struct iphdr *iph; |
1da177e4c
|
733 |
struct sk_buff *skb2; |
511c3f92a
|
734 |
struct rtable *rt = skb_rtable(skb); |
065ff79f8
|
735 |
unsigned int mtu, hlen, ll_rs; |
c8b17be0b
|
736 |
struct ip_fraglist_iter iter; |
9669fffc1
|
737 |
ktime_t tstamp = skb->tstamp; |
065ff79f8
|
738 |
struct ip_frag_state state; |
1da177e4c
|
739 |
int err = 0; |
dbd3393c5
|
740 741 742 743 |
/* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto fail; |
1da177e4c
|
744 745 746 |
/* * Point into the IP datagram header. */ |
eddc9ec53
|
747 |
iph = ip_hdr(skb); |
1da177e4c
|
748 |
|
fedbb6b4f
|
749 |
mtu = ip_skb_dst_mtu(sk, skb); |
d6b915e29
|
750 751 |
if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) mtu = IPCB(skb)->frag_max_size; |
1da177e4c
|
752 753 754 755 756 757 |
/* * Setup starting values. */ hlen = iph->ihl * 4; |
f87c10a8a
|
758 |
mtu = mtu - hlen; /* Size of data space */ |
89cee8b1c
|
759 |
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; |
254d900b8
|
760 |
ll_rs = LL_RESERVED_SPACE(rt->dst.dev); |
1da177e4c
|
761 762 763 764 765 766 767 768 |
/* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing * one, it is not prohibited. In this case fall back to copying. * * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ |
21dc33015
|
769 |
if (skb_has_frag_list(skb)) { |
3d13008e7
|
770 |
struct sk_buff *frag, *frag2; |
c72d8cdaa
|
771 |
unsigned int first_len = skb_pagelen(skb); |
1da177e4c
|
772 773 774 |
if (first_len - hlen > mtu || ((first_len - hlen) & 7) || |
56f8a75c1
|
775 |
ip_is_fragment(iph) || |
254d900b8
|
776 777 |
skb_cloned(skb) || skb_headroom(skb) < ll_rs) |
1da177e4c
|
778 |
goto slow_path; |
d7fcf1a5c
|
779 |
skb_walk_frags(skb, frag) { |
1da177e4c
|
780 781 782 |
/* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || |
254d900b8
|
783 |
skb_headroom(frag) < hlen + ll_rs) |
3d13008e7
|
784 |
goto slow_path_clean; |
1da177e4c
|
785 786 787 |
/* Partially cloned skb? */ if (skb_shared(frag)) |
3d13008e7
|
788 |
goto slow_path_clean; |
2fdba6b08
|
789 790 791 |
BUG_ON(frag->sk); if (skb->sk) { |
2fdba6b08
|
792 793 |
frag->sk = skb->sk; frag->destructor = sock_wfree; |
2fdba6b08
|
794 |
} |
3d13008e7
|
795 |
skb->truesize -= frag->truesize; |
1da177e4c
|
796 797 798 |
} /* Everything is OK. Generate! */ |
c8b17be0b
|
799 |
ip_fraglist_init(skb, iph, hlen, &iter); |
1da177e4c
|
800 801 802 803 |
for (;;) { /* Prepare header of the next frame, * before previous one went down. */ |
19c3401a9
|
804 805 |
if (iter.frag) { ip_fraglist_ipcb_prepare(skb, &iter); |
c8b17be0b
|
806 |
ip_fraglist_prepare(skb, &iter); |
19c3401a9
|
807 |
} |
1da177e4c
|
808 |
|
9669fffc1
|
809 |
skb->tstamp = tstamp; |
694869b3c
|
810 |
err = output(net, sk, skb); |
1da177e4c
|
811 |
|
dafee4908
|
812 |
if (!err) |
26a949dbd
|
813 |
IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); |
c8b17be0b
|
814 |
if (err || !iter.frag) |
1da177e4c
|
815 |
break; |
c8b17be0b
|
816 |
skb = ip_fraglist_next(&iter); |
1da177e4c
|
817 818 819 |
} if (err == 0) { |
26a949dbd
|
820 |
IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); |
1da177e4c
|
821 822 |
return 0; } |
b70341467
|
823 |
kfree_skb_list(iter.frag); |
942f146a6
|
824 |
|
26a949dbd
|
825 |
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); |
1da177e4c
|
826 |
return err; |
3d13008e7
|
827 828 829 830 831 832 833 834 835 |
slow_path_clean: skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } |
1da177e4c
|
836 837 838 |
} slow_path: |
1da177e4c
|
839 840 841 |
/* * Fragment the datagram. */ |
e7a409c3f
|
842 843 |
ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU, &state); |
1da177e4c
|
844 845 846 847 |
/* * Keep copying data until we run out. */ |
065ff79f8
|
848 |
while (state.left > 0) { |
19c3401a9
|
849 |
bool first_frag = (state.offset == 0); |
065ff79f8
|
850 851 852 |
skb2 = ip_frag_next(skb, &state); if (IS_ERR(skb2)) { err = PTR_ERR(skb2); |
1da177e4c
|
853 854 |
goto fail; } |
19c3401a9
|
855 |
ip_frag_ipcb(skb, skb2, first_frag, &state); |
1da177e4c
|
856 857 |
/* |
1da177e4c
|
858 859 |
* Put this fragment into the sending queue. */ |
9669fffc1
|
860 |
skb2->tstamp = tstamp; |
694869b3c
|
861 |
err = output(net, sk, skb2); |
1da177e4c
|
862 863 |
if (err) goto fail; |
dafee4908
|
864 |
|
26a949dbd
|
865 |
IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); |
1da177e4c
|
866 |
} |
5d0ba55b6
|
867 |
consume_skb(skb); |
26a949dbd
|
868 |
IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); |
1da177e4c
|
869 870 871 |
return err; fail: |
e905a9eda
|
872 |
kfree_skb(skb); |
26a949dbd
|
873 |
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); |
1da177e4c
|
874 875 |
return err; } |
49d16b23c
|
876 |
EXPORT_SYMBOL(ip_do_fragment); |
2e2f7aefa
|
877 |
|
1da177e4c
|
878 879 880 |
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { |
f69e6d131
|
881 |
struct msghdr *msg = from; |
1da177e4c
|
882 |
|
84fa7933a
|
883 |
if (skb->ip_summed == CHECKSUM_PARTIAL) { |
0b62fca26
|
884 |
if (!copy_from_iter_full(to, len, &msg->msg_iter)) |
1da177e4c
|
885 886 |
return -EFAULT; } else { |
44bb93633
|
887 |
__wsum csum = 0; |
0b62fca26
|
888 |
if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter)) |
1da177e4c
|
889 890 891 892 893 |
return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } return 0; } |
4bc2f18ba
|
894 |
EXPORT_SYMBOL(ip_generic_getfrag); |
1da177e4c
|
895 |
|
44bb93633
|
896 |
static inline __wsum |
1da177e4c
|
897 898 899 |
csum_page(struct page *page, int offset, int copy) { char *kaddr; |
44bb93633
|
900 |
__wsum csum; |
1da177e4c
|
901 902 903 904 905 |
kaddr = kmap(page); csum = csum_partial(kaddr + offset, copy, 0); kunmap(page); return csum; } |
f5fca6086
|
906 907 908 |
static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, |
1470ddf7f
|
909 |
struct inet_cork *cork, |
5640f7685
|
910 |
struct page_frag *pfrag, |
1470ddf7f
|
911 912 913 914 |
int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, unsigned int flags) |
1da177e4c
|
915 916 |
{ struct inet_sock *inet = inet_sk(sk); |
b5947e5d1
|
917 |
struct ubuf_info *uarg = NULL; |
1da177e4c
|
918 |
struct sk_buff *skb; |
07df5294a
|
919 |
struct ip_options *opt = cork->opt; |
1da177e4c
|
920 921 922 923 924 925 |
int hh_len; int exthdrlen; int mtu; int copy; int err; int offset = 0; |
daba287b2
|
926 |
unsigned int maxfraglen, fragheaderlen, maxnonfragsize; |
1da177e4c
|
927 |
int csummode = CHECKSUM_NONE; |
1470ddf7f
|
928 |
struct rtable *rt = (struct rtable *)cork->dst; |
694aba690
|
929 |
unsigned int wmem_alloc_delta = 0; |
100f6d8e0
|
930 |
bool paged, extra_uref = false; |
09c2d251b
|
931 |
u32 tskey = 0; |
1da177e4c
|
932 |
|
96d7303e9
|
933 934 935 |
skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; |
bec1f6f69
|
936 |
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; |
15e36f5b8
|
937 |
paged = !!cork->gso_size; |
bec1f6f69
|
938 |
|
09c2d251b
|
939 940 941 |
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; |
1da177e4c
|
942 |
|
d8d1f30b9
|
943 |
hh_len = LL_RESERVED_SPACE(rt->dst.dev); |
1da177e4c
|
944 945 946 |
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; |
cbc08a331
|
947 |
maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu; |
1da177e4c
|
948 |
|
daba287b2
|
949 |
if (cork->length + length > maxnonfragsize - fragheaderlen) { |
f5fca6086
|
950 |
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, |
61e7f09d0
|
951 |
mtu - (opt ? opt->optlen : 0)); |
1da177e4c
|
952 953 954 955 956 957 958 959 960 |
return -EMSGSIZE; } /* * transhdrlen > 0 means that this is the first fragment and we wish * it won't be fragmented in the future. */ if (transhdrlen && length + fragheaderlen <= mtu && |
c8cd0989b
|
961 |
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && |
bec1f6f69
|
962 |
(!(flags & MSG_MORE) || cork->gso_size) && |
cd027a543
|
963 |
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) |
84fa7933a
|
964 |
csummode = CHECKSUM_PARTIAL; |
1da177e4c
|
965 |
|
b5947e5d1
|
966 967 968 969 |
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; |
522924b58
|
970 |
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ |
b5947e5d1
|
971 972 973 974 975 |
if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; } else { uarg->zerocopy = 0; |
52900d222
|
976 |
skb_zcopy_set(skb, uarg, &extra_uref); |
b5947e5d1
|
977 978 |
} } |
1470ddf7f
|
979 |
cork->length += length; |
1da177e4c
|
980 981 982 983 984 985 986 |
/* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, * each of segments is IP fragment ready for sending to network after * adding appropriate IP header. */ |
26cde9f7e
|
987 |
if (!skb) |
1da177e4c
|
988 989 990 991 992 993 994 995 996 997 998 999 1000 |
goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = mtu - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; |
aba36930a
|
1001 |
unsigned int pagedlen; |
1da177e4c
|
1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 |
struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; if (skb_prev) fraggap = skb_prev->len - maxfraglen; else fraggap = 0; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; |
aba36930a
|
1018 |
pagedlen = 0; |
1da177e4c
|
1019 |
|
e905a9eda
|
1020 |
if ((flags & MSG_MORE) && |
d8d1f30b9
|
1021 |
!(rt->dst.dev->features&NETIF_F_SG)) |
1da177e4c
|
1022 |
alloclen = mtu; |
15e36f5b8
|
1023 |
else if (!paged) |
59104f062
|
1024 |
alloclen = fraglen; |
15e36f5b8
|
1025 1026 1027 1028 |
else { alloclen = min_t(int, fraglen, MAX_HEADER); pagedlen = fraglen - alloclen; } |
1da177e4c
|
1029 |
|
353e5c9ab
|
1030 |
alloclen += exthdrlen; |
1da177e4c
|
1031 1032 1033 1034 1035 |
/* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ |
33f99dc7f
|
1036 |
if (datalen == length + fraggap) |
d8d1f30b9
|
1037 |
alloclen += rt->dst.trailer_len; |
33f99dc7f
|
1038 |
|
1da177e4c
|
1039 |
if (transhdrlen) { |
e905a9eda
|
1040 |
skb = sock_alloc_send_skb(sk, |
1da177e4c
|
1041 1042 1043 1044 |
alloclen + hh_len + 15, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; |
694aba690
|
1045 |
if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= |
1da177e4c
|
1046 |
2 * sk->sk_sndbuf) |
694aba690
|
1047 1048 |
skb = alloc_skb(alloclen + hh_len + 15, sk->sk_allocation); |
51456b291
|
1049 |
if (unlikely(!skb)) |
1da177e4c
|
1050 1051 |
err = -ENOBUFS; } |
51456b291
|
1052 |
if (!skb) |
1da177e4c
|
1053 1054 1055 1056 1057 1058 1059 1060 |
goto error; /* * Fill in the control structures */ skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); |
11878b40e
|
1061 |
|
1da177e4c
|
1062 1063 1064 |
/* * Find where to start putting bytes. */ |
15e36f5b8
|
1065 |
data = skb_put(skb, fraglen + exthdrlen - pagedlen); |
c14d2450c
|
1066 |
skb_set_network_header(skb, exthdrlen); |
b0e380b1d
|
1067 1068 |
skb->transport_header = (skb->network_header + fragheaderlen); |
353e5c9ab
|
1069 |
data += fragheaderlen + exthdrlen; |
1da177e4c
|
1070 1071 1072 1073 |
if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, |
8d5930dfb
|
1074 |
data + transhdrlen, fraggap); |
1da177e4c
|
1075 1076 1077 |
skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; |
e9fa4f7bd
|
1078 |
pskb_trim_unique(skb_prev, maxfraglen); |
1da177e4c
|
1079 |
} |
15e36f5b8
|
1080 |
copy = datalen - transhdrlen - fraggap - pagedlen; |
1da177e4c
|
1081 1082 1083 1084 1085 1086 1087 |
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } offset += copy; |
15e36f5b8
|
1088 |
length -= copy + transhdrlen; |
1da177e4c
|
1089 1090 1091 |
transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; |
52900d222
|
1092 1093 1094 1095 1096 1097 |
/* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; skb_zcopy_set(skb, uarg, &extra_uref); |
0dec879f6
|
1098 1099 |
if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); |
1da177e4c
|
1100 1101 1102 |
/* * Put the packet on the pending queue. */ |
694aba690
|
1103 1104 1105 1106 1107 |
if (!skb->destructor) { skb->destructor = sock_wfree; skb->sk = sk; wmem_alloc_delta += skb->truesize; } |
1470ddf7f
|
1108 |
__skb_queue_tail(queue, skb); |
1da177e4c
|
1109 1110 1111 1112 1113 |
continue; } if (copy > length) copy = length; |
113f99c33
|
1114 1115 |
if (!(rt->dst.dev->features&NETIF_F_SG) && skb_tailroom(skb) >= copy) { |
1da177e4c
|
1116 1117 1118 |
unsigned int off; off = skb->len; |
e905a9eda
|
1119 |
if (getfrag(from, skb_put(skb, copy), |
1da177e4c
|
1120 1121 1122 1123 1124 |
offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } |
b5947e5d1
|
1125 |
} else if (!uarg || !uarg->zerocopy) { |
1da177e4c
|
1126 |
int i = skb_shinfo(skb)->nr_frags; |
1da177e4c
|
1127 |
|
5640f7685
|
1128 1129 |
err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) |
1da177e4c
|
1130 |
goto error; |
5640f7685
|
1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 |
if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); |
1da177e4c
|
1142 |
} |
5640f7685
|
1143 1144 1145 1146 1147 1148 1149 1150 |
copy = min_t(int, copy, pfrag->size - pfrag->offset); if (getfrag(from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); |
1da177e4c
|
1151 1152 |
skb->len += copy; skb->data_len += copy; |
f945fa7ad
|
1153 |
skb->truesize += copy; |
694aba690
|
1154 |
wmem_alloc_delta += copy; |
b5947e5d1
|
1155 1156 1157 1158 |
} else { err = skb_zerocopy_iter_dgram(skb, from, copy); if (err < 0) goto error; |
1da177e4c
|
1159 1160 1161 1162 |
} offset += copy; length -= copy; } |
9e8445a56
|
1163 1164 |
if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); |
1da177e4c
|
1165 |
return 0; |
5640f7685
|
1166 1167 |
error_efault: err = -EFAULT; |
1da177e4c
|
1168 |
error: |
97ef7b4c5
|
1169 1170 |
if (uarg) sock_zerocopy_put_abort(uarg, extra_uref); |
1470ddf7f
|
1171 |
cork->length -= length; |
5e38e2704
|
1172 |
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); |
694aba690
|
1173 |
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); |
e905a9eda
|
1174 |
return err; |
1da177e4c
|
1175 |
} |
1470ddf7f
|
1176 1177 1178 |
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { |
f6d8bd051
|
1179 |
struct ip_options_rcu *opt; |
1470ddf7f
|
1180 |
struct rtable *rt; |
9783ccd0f
|
1181 1182 1183 |
rt = *rtp; if (unlikely(!rt)) return -EFAULT; |
1470ddf7f
|
1184 1185 1186 1187 1188 |
/* * setup for corking. */ opt = ipc->opt; if (opt) { |
51456b291
|
1189 |
if (!cork->opt) { |
1470ddf7f
|
1190 1191 |
cork->opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); |
51456b291
|
1192 |
if (unlikely(!cork->opt)) |
1470ddf7f
|
1193 1194 |
return -ENOBUFS; } |
f6d8bd051
|
1195 |
memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); |
1470ddf7f
|
1196 1197 1198 |
cork->flags |= IPCORK_OPT; cork->addr = ipc->addr; } |
9783ccd0f
|
1199 |
|
482fc6094
|
1200 |
cork->fragsize = ip_sk_use_pmtu(sk) ? |
501a90c94
|
1201 1202 1203 1204 |
dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); if (!inetdev_valid_mtu(cork->fragsize)) return -ENETUNREACH; |
bec1f6f69
|
1205 |
|
fbf478136
|
1206 |
cork->gso_size = ipc->gso_size; |
501a90c94
|
1207 |
|
1470ddf7f
|
1208 |
cork->dst = &rt->dst; |
501a90c94
|
1209 1210 |
/* We stole this route, caller should not release it. */ *rtp = NULL; |
1470ddf7f
|
1211 |
cork->length = 0; |
aa6615814
|
1212 1213 |
cork->ttl = ipc->ttl; cork->tos = ipc->tos; |
c6af0c227
|
1214 |
cork->mark = ipc->sockc.mark; |
aa6615814
|
1215 |
cork->priority = ipc->priority; |
bc969a977
|
1216 |
cork->transmit_time = ipc->sockc.transmit_time; |
678ca42d6
|
1217 1218 |
cork->tx_flags = 0; sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); |
1470ddf7f
|
1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 |
return 0; } /* * ip_append_data() and ip_append_page() can make one large IP datagram * from many pieces of data. Each pieces will be holded on the socket * until ip_push_pending_frames() is called. Each piece can be a page * or non-page data. * * Not only UDP, other transport protocols - e.g. raw sockets - can use * this interface potentially. * * LATER: length must be adjusted by pad at tail, when it is required. */ |
f5fca6086
|
1234 |
int ip_append_data(struct sock *sk, struct flowi4 *fl4, |
1470ddf7f
|
1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 |
int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); int err; if (flags&MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) { |
bdc712b4c
|
1248 |
err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); |
1470ddf7f
|
1249 1250 1251 1252 1253 |
if (err) return err; } else { transhdrlen = 0; } |
5640f7685
|
1254 1255 |
return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, sk_page_frag(sk), getfrag, |
1470ddf7f
|
1256 1257 |
from, length, transhdrlen, flags); } |
f5fca6086
|
1258 |
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, |
1da177e4c
|
1259 1260 1261 1262 1263 1264 |
int offset, size_t size, int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct rtable *rt; struct ip_options *opt = NULL; |
bdc712b4c
|
1265 |
struct inet_cork *cork; |
1da177e4c
|
1266 1267 1268 1269 |
int hh_len; int mtu; int len; int err; |
daba287b2
|
1270 |
unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize; |
1da177e4c
|
1271 1272 1273 1274 1275 1276 1277 1278 1279 |
if (inet->hdrincl) return -EPERM; if (flags&MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; |
bdc712b4c
|
1280 1281 1282 1283 |
cork = &inet->cork.base; rt = (struct rtable *)cork->dst; if (cork->flags & IPCORK_OPT) opt = cork->opt; |
1da177e4c
|
1284 |
|
343d8c601
|
1285 |
if (!(rt->dst.dev->features & NETIF_F_SG)) |
1da177e4c
|
1286 |
return -EOPNOTSUPP; |
d8d1f30b9
|
1287 |
hh_len = LL_RESERVED_SPACE(rt->dst.dev); |
bec1f6f69
|
1288 |
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; |
1da177e4c
|
1289 1290 1291 |
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; |
60ff74673
|
1292 |
maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; |
1da177e4c
|
1293 |
|
daba287b2
|
1294 |
if (cork->length + size > maxnonfragsize - fragheaderlen) { |
61e7f09d0
|
1295 1296 |
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu - (opt ? opt->optlen : 0)); |
1da177e4c
|
1297 1298 |
return -EMSGSIZE; } |
51456b291
|
1299 1300 |
skb = skb_peek_tail(&sk->sk_write_queue); if (!skb) |
1da177e4c
|
1301 |
return -EINVAL; |
a8c4a2522
|
1302 |
cork->length += size; |
e89e9cf53
|
1303 |
|
1da177e4c
|
1304 |
while (size > 0) { |
ab2fb7e32
|
1305 1306 1307 1308 |
/* Check if the remaining data fits into current packet. */ len = mtu - skb->len; if (len < size) len = maxfraglen - skb->len; |
e89e9cf53
|
1309 |
|
1da177e4c
|
1310 1311 |
if (len <= 0) { struct sk_buff *skb_prev; |
1da177e4c
|
1312 1313 1314 |
int alloclen; skb_prev = skb; |
0d0d2bba9
|
1315 |
fraggap = skb_prev->len - maxfraglen; |
1da177e4c
|
1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 |
alloclen = fragheaderlen + hh_len + fraggap + 15; skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); if (unlikely(!skb)) { err = -ENOBUFS; goto error; } /* * Fill in the control structures */ skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; skb_reserve(skb, hh_len); /* * Find where to start putting bytes. */ |
967b05f64
|
1334 |
skb_put(skb, fragheaderlen + fraggap); |
2ca9e6f2c
|
1335 |
skb_reset_network_header(skb); |
b0e380b1d
|
1336 1337 |
skb->transport_header = (skb->network_header + fragheaderlen); |
1da177e4c
|
1338 |
if (fraggap) { |
967b05f64
|
1339 1340 |
skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen, |
9c70220b7
|
1341 |
skb_transport_header(skb), |
8d5930dfb
|
1342 |
fraggap); |
1da177e4c
|
1343 1344 |
skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); |
e9fa4f7bd
|
1345 |
pskb_trim_unique(skb_prev, maxfraglen); |
1da177e4c
|
1346 1347 1348 1349 1350 1351 1352 1353 |
} /* * Put the packet on the pending queue. */ __skb_queue_tail(&sk->sk_write_queue, skb); continue; } |
1da177e4c
|
1354 1355 |
if (len > size) len = size; |
be12a1fe2
|
1356 1357 |
if (skb_append_pagefrags(skb, page, offset, len)) { |
1da177e4c
|
1358 1359 1360 1361 1362 |
err = -EMSGSIZE; goto error; } if (skb->ip_summed == CHECKSUM_NONE) { |
44bb93633
|
1363 |
__wsum csum; |
1da177e4c
|
1364 1365 1366 1367 1368 1369 |
csum = csum_page(page, offset, len); skb->csum = csum_block_add(skb->csum, csum, skb->len); } skb->len += len; skb->data_len += len; |
1e34a11d5
|
1370 |
skb->truesize += len; |
14afee4b6
|
1371 |
refcount_add(len, &sk->sk_wmem_alloc); |
1da177e4c
|
1372 1373 1374 1375 1376 1377 |
offset += len; size -= len; } return 0; error: |
bdc712b4c
|
1378 |
cork->length -= size; |
5e38e2704
|
1379 |
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); |
1da177e4c
|
1380 1381 |
return err; } |
1470ddf7f
|
1382 |
static void ip_cork_release(struct inet_cork *cork) |
429f08e95
|
1383 |
{ |
1470ddf7f
|
1384 1385 1386 1387 1388 |
cork->flags &= ~IPCORK_OPT; kfree(cork->opt); cork->opt = NULL; dst_release(cork->dst); cork->dst = NULL; |
429f08e95
|
1389 |
} |
1da177e4c
|
1390 1391 1392 1393 |
/* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ |
1c32c5ad6
|
1394 |
struct sk_buff *__ip_make_skb(struct sock *sk, |
77968b782
|
1395 |
struct flowi4 *fl4, |
1c32c5ad6
|
1396 1397 |
struct sk_buff_head *queue, struct inet_cork *cork) |
1da177e4c
|
1398 1399 1400 1401 |
{ struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); |
0388b0042
|
1402 |
struct net *net = sock_net(sk); |
1da177e4c
|
1403 |
struct ip_options *opt = NULL; |
1470ddf7f
|
1404 |
struct rtable *rt = (struct rtable *)cork->dst; |
1da177e4c
|
1405 |
struct iphdr *iph; |
76ab608d8
|
1406 |
__be16 df = 0; |
1da177e4c
|
1407 |
__u8 ttl; |
1da177e4c
|
1408 |
|
51456b291
|
1409 1410 |
skb = __skb_dequeue(queue); if (!skb) |
1da177e4c
|
1411 1412 1413 1414 |
goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ |
d56f90a7c
|
1415 |
if (skb->data < skb_network_header(skb)) |
bbe735e42
|
1416 |
__skb_pull(skb, skb_network_offset(skb)); |
1470ddf7f
|
1417 |
while ((tmp_skb = __skb_dequeue(queue)) != NULL) { |
cfe1fc775
|
1418 |
__skb_pull(tmp_skb, skb_network_header_len(skb)); |
1da177e4c
|
1419 1420 1421 1422 1423 |
*tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; |
1da177e4c
|
1424 1425 1426 1427 1428 1429 1430 1431 |
tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ |
60ff74673
|
1432 |
skb->ignore_df = ip_sk_ignore_df(sk); |
1da177e4c
|
1433 1434 |
/* DF bit is set when we want to see DF on outgoing frames. |
60ff74673
|
1435 |
* If ignore_df is set too, we still allow to fragment this frame |
1da177e4c
|
1436 |
* locally. */ |
482fc6094
|
1437 1438 |
if (inet->pmtudisc == IP_PMTUDISC_DO || inet->pmtudisc == IP_PMTUDISC_PROBE || |
d8d1f30b9
|
1439 1440 |
(skb->len <= dst_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) |
1da177e4c
|
1441 |
df = htons(IP_DF); |
1470ddf7f
|
1442 1443 |
if (cork->flags & IPCORK_OPT) opt = cork->opt; |
1da177e4c
|
1444 |
|
aa6615814
|
1445 1446 1447 |
if (cork->ttl != 0) ttl = cork->ttl; else if (rt->rt_type == RTN_MULTICAST) |
1da177e4c
|
1448 1449 |
ttl = inet->mc_ttl; else |
d8d1f30b9
|
1450 |
ttl = ip_select_ttl(inet, &rt->dst); |
1da177e4c
|
1451 |
|
749154aa5
|
1452 |
iph = ip_hdr(skb); |
1da177e4c
|
1453 1454 |
iph->version = 4; iph->ihl = 5; |
aa6615814
|
1455 |
iph->tos = (cork->tos != -1) ? cork->tos : inet->tos; |
1da177e4c
|
1456 |
iph->frag_off = df; |
1da177e4c
|
1457 1458 |
iph->ttl = ttl; iph->protocol = sk->sk_protocol; |
84f9307c5
|
1459 |
ip_copy_addrs(iph, fl4); |
b6a7719ae
|
1460 |
ip_select_ident(net, skb, sk); |
1da177e4c
|
1461 |
|
22f728f8f
|
1462 |
if (opt) { |
5af68891d
|
1463 |
iph->ihl += opt->optlen >> 2; |
22f728f8f
|
1464 1465 |
ip_options_build(skb, opt, cork->addr, rt, 0); } |
aa6615814
|
1466 |
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; |
c6af0c227
|
1467 |
skb->mark = cork->mark; |
bc969a977
|
1468 |
skb->tstamp = cork->transmit_time; |
a21bba945
|
1469 1470 1471 1472 |
/* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount */ |
1470ddf7f
|
1473 |
cork->dst = NULL; |
d8d1f30b9
|
1474 |
skb_dst_set(skb, &rt->dst); |
1da177e4c
|
1475 |
|
96793b482
|
1476 |
if (iph->protocol == IPPROTO_ICMP) |
0388b0042
|
1477 |
icmp_out_count(net, ((struct icmphdr *) |
96793b482
|
1478 |
skb_transport_header(skb))->type); |
1c32c5ad6
|
1479 1480 1481 1482 |
ip_cork_release(cork); out: return skb; } |
b5ec8eeac
|
1483 |
int ip_send_skb(struct net *net, struct sk_buff *skb) |
1c32c5ad6
|
1484 |
{ |
1c32c5ad6
|
1485 |
int err; |
33224b16f
|
1486 |
err = ip_local_out(net, skb->sk, skb); |
1da177e4c
|
1487 1488 |
if (err) { if (err > 0) |
6ce9e7b5f
|
1489 |
err = net_xmit_errno(err); |
1da177e4c
|
1490 |
if (err) |
1c32c5ad6
|
1491 |
IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); |
1da177e4c
|
1492 |
} |
1da177e4c
|
1493 |
return err; |
1da177e4c
|
1494 |
} |
77968b782
|
1495 |
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) |
1470ddf7f
|
1496 |
{ |
1c32c5ad6
|
1497 |
struct sk_buff *skb; |
77968b782
|
1498 |
skb = ip_finish_skb(sk, fl4); |
1c32c5ad6
|
1499 1500 1501 1502 |
if (!skb) return 0; /* Netfilter gets whole the not fragmented skb. */ |
b5ec8eeac
|
1503 |
return ip_send_skb(sock_net(sk), skb); |
1470ddf7f
|
1504 |
} |
1da177e4c
|
1505 1506 1507 |
/* * Throw away all pending data on the socket. */ |
1470ddf7f
|
1508 1509 1510 |
static void __ip_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, struct inet_cork *cork) |
1da177e4c
|
1511 |
{ |
1da177e4c
|
1512 |
struct sk_buff *skb; |
1470ddf7f
|
1513 |
while ((skb = __skb_dequeue_tail(queue)) != NULL) |
1da177e4c
|
1514 |
kfree_skb(skb); |
1470ddf7f
|
1515 1516 1517 1518 1519 |
ip_cork_release(cork); } void ip_flush_pending_frames(struct sock *sk) { |
bdc712b4c
|
1520 |
__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); |
1da177e4c
|
1521 |
} |
1c32c5ad6
|
1522 |
struct sk_buff *ip_make_skb(struct sock *sk, |
77968b782
|
1523 |
struct flowi4 *fl4, |
1c32c5ad6
|
1524 1525 1526 1527 |
int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, |
1cd7884df
|
1528 |
struct inet_cork *cork, unsigned int flags) |
1c32c5ad6
|
1529 |
{ |
1c32c5ad6
|
1530 1531 1532 1533 1534 1535 1536 |
struct sk_buff_head queue; int err; if (flags & MSG_PROBE) return NULL; __skb_queue_head_init(&queue); |
1cd7884df
|
1537 1538 1539 1540 |
cork->flags = 0; cork->addr = 0; cork->opt = NULL; err = ip_setup_cork(sk, cork, ipc, rtp); |
1c32c5ad6
|
1541 1542 |
if (err) return ERR_PTR(err); |
1cd7884df
|
1543 |
err = __ip_append_data(sk, fl4, &queue, cork, |
5640f7685
|
1544 |
¤t->task_frag, getfrag, |
1c32c5ad6
|
1545 1546 |
from, length, transhdrlen, flags); if (err) { |
1cd7884df
|
1547 |
__ip_flush_pending_frames(sk, &queue, cork); |
1c32c5ad6
|
1548 1549 |
return ERR_PTR(err); } |
1cd7884df
|
1550 |
return __ip_make_skb(sk, fl4, &queue, cork); |
1c32c5ad6
|
1551 |
} |
1da177e4c
|
1552 1553 1554 1555 |
/* * Fetch data from kernel space and fill in checksum if needed. */ |
e905a9eda
|
1556 |
static int ip_reply_glue_bits(void *dptr, char *to, int offset, |
1da177e4c
|
1557 1558 |
int len, int odd, struct sk_buff *skb) { |
5084205fa
|
1559 |
__wsum csum; |
1da177e4c
|
1560 |
|
cc44c17ba
|
1561 |
csum = csum_partial_copy_nocheck(dptr+offset, to, len); |
1da177e4c
|
1562 |
skb->csum = csum_block_add(skb->csum, csum, odd); |
e905a9eda
|
1563 |
return 0; |
1da177e4c
|
1564 |
} |
e905a9eda
|
1565 |
/* |
1da177e4c
|
1566 |
* Generic function to send a packet as reply to another packet. |
be9f4a44e
|
1567 |
* Used to send some TCP resets/acks so far. |
1da177e4c
|
1568 |
*/ |
bdbbb8527
|
1569 |
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, |
24a2d43d8
|
1570 1571 1572 |
const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, |
d6fb396cf
|
1573 |
unsigned int len, u64 transmit_time) |
1da177e4c
|
1574 |
{ |
f6d8bd051
|
1575 |
struct ip_options_data replyopts; |
1da177e4c
|
1576 |
struct ipcm_cookie ipc; |
77968b782
|
1577 |
struct flowi4 fl4; |
511c3f92a
|
1578 |
struct rtable *rt = skb_rtable(skb); |
bdbbb8527
|
1579 |
struct net *net = sock_net(sk); |
be9f4a44e
|
1580 |
struct sk_buff *nskb; |
4062090e3
|
1581 |
int err; |
f7ba868b7
|
1582 |
int oif; |
1da177e4c
|
1583 |
|
91ed1e666
|
1584 |
if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) |
1da177e4c
|
1585 |
return; |
351782067
|
1586 |
ipcm_init(&ipc); |
0a5ebb800
|
1587 |
ipc.addr = daddr; |
d6fb396cf
|
1588 |
ipc.sockc.transmit_time = transmit_time; |
1da177e4c
|
1589 |
|
f6d8bd051
|
1590 |
if (replyopts.opt.opt.optlen) { |
1da177e4c
|
1591 |
ipc.opt = &replyopts.opt; |
f6d8bd051
|
1592 1593 |
if (replyopts.opt.opt.srr) daddr = replyopts.opt.opt.faddr; |
1da177e4c
|
1594 |
} |
f7ba868b7
|
1595 |
oif = arg->bound_dev_if; |
9b6c14d51
|
1596 1597 |
if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; |
f7ba868b7
|
1598 1599 |
flowi4_init_output(&fl4, oif, |
004836905
|
1600 |
IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, |
66b13d99d
|
1601 |
RT_TOS(arg->tos), |
be9f4a44e
|
1602 |
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, |
77968b782
|
1603 |
ip_reply_arg_flowi_flags(arg), |
70e734167
|
1604 |
daddr, saddr, |
e2d118a1c
|
1605 1606 |
tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); |
77968b782
|
1607 |
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
be9f4a44e
|
1608 |
rt = ip_route_output_key(net, &fl4); |
77968b782
|
1609 1610 |
if (IS_ERR(rt)) return; |
1da177e4c
|
1611 |
|
ba9e04a7d
|
1612 |
inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; |
1da177e4c
|
1613 |
|
eddc9ec53
|
1614 |
sk->sk_protocol = ip_hdr(skb)->protocol; |
f0e48dbfc
|
1615 |
sk->sk_bound_dev_if = arg->bound_dev_if; |
be9f4a44e
|
1616 |
sk->sk_sndbuf = sysctl_wmem_default; |
0da7536fb
|
1617 |
ipc.sockc.mark = fl4.flowi4_mark; |
4062090e3
|
1618 1619 1620 1621 1622 1623 |
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { ip_flush_pending_frames(sk); goto out; } |
be9f4a44e
|
1624 1625 |
nskb = skb_peek(&sk->sk_write_queue); if (nskb) { |
1da177e4c
|
1626 |
if (arg->csumoffset >= 0) |
be9f4a44e
|
1627 1628 |
*((__sum16 *)skb_transport_header(nskb) + arg->csumoffset) = csum_fold(csum_add(nskb->csum, |
9c70220b7
|
1629 |
arg->csum)); |
be9f4a44e
|
1630 |
nskb->ip_summed = CHECKSUM_NONE; |
77968b782
|
1631 |
ip_push_pending_frames(sk, &fl4); |
1da177e4c
|
1632 |
} |
4062090e3
|
1633 |
out: |
1da177e4c
|
1634 1635 |
ip_rt_put(rt); } |
1da177e4c
|
1636 1637 |
void __init ip_init(void) { |
1da177e4c
|
1638 1639 |
ip_rt_init(); inet_initpeers(); |
72c1d3bdd
|
1640 1641 |
#if defined(CONFIG_IP_MULTICAST) igmp_mc_init(); |
1da177e4c
|
1642 1643 |
#endif } |