Commit 71d8c47fc653711c41bc3282e5b0e605b3727956

Authored by Pablo Neira Ayuso
1 parent ba76738c03

netfilter: conntrack: introduce clash resolution on insertion race

This patch introduces nf_ct_resolve_clash() to resolve the race condition
on conntrack insertions.

This is particularly a problem for connectionless protocols such as
UDP, which have no initial handshake: two or more packets of the same
flow may race to insert the entry, resulting in packet drops.
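As an illustration of the race, here is a hypothetical userspace reproducer
(not part of this patch; the socket options, port and destination address are
assumptions for the sketch): two threads bind separate sockets to the same
local port, so the first datagram from each carries the identical tuple and
both packets may try to create and confirm the same conntrack entry on
different CPUs.

/* Hypothetical reproducer: two sockets share one local UDP port and
 * send the first datagrams of the same flow concurrently, so both
 * packets may race to insert the same conntrack entry.
 * Build with: cc -pthread reproducer.c
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_REUSEPORT
#define SO_REUSEPORT 15	/* Linux value; available since 3.9 */
#endif

static struct sockaddr_in dst;

static void *send_first_packet(void *unused)
{
	struct sockaddr_in src;
	int fd, one = 1;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	memset(&src, 0, sizeof(src));
	src.sin_family = AF_INET;
	src.sin_port = htons(40000);	/* same source port on purpose */
	src.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(fd, (struct sockaddr *)&src, sizeof(src));

	sendto(fd, "x", 1, 0, (struct sockaddr *)&dst, sizeof(dst));
	close(fd);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(53);
	dst.sin_addr.s_addr = inet_addr("192.0.2.1");	/* example address */

	pthread_create(&t1, NULL, send_first_packet, NULL);
	pthread_create(&t2, NULL, send_first_packet, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}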

Another problematic scenario is packets enqueued to userspace via
NFQUEUE after the raw table, which makes it easier to trigger this
race.

To resolve this, the idea is to reset the conntrack entry to the one
that won the race. Packet and byte counters are also merged.

The 'insert_failed' counter still accounts for this situation; after
this patch, the drop counter is bumped only when we actually drop the
packet, so we can watch for unresolved clashes.
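Both are per-CPU counters, exported through /proc/net/stat/nf_conntrack
(one hexadecimal row per CPU, columns named by the header line) and also
reported by conntrack -S. A small watcher written under those assumptions
(a hypothetical helper, not part of this patch) makes unresolved clashes
visible as a rising drop column:

/* Hypothetical watcher: print the summed insert_failed and drop
 * counters from /proc/net/stat/nf_conntrack once per second.
 * Columns are located via the header line; values are hexadecimal.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Return the index of column @name in @header (destructive), or -1. */
static int col_index(char *header, const char *name)
{
	char *tok, *save;
	int i;

	for (i = 0, tok = strtok_r(header, " \t\n", &save); tok;
	     i++, tok = strtok_r(NULL, " \t\n", &save)) {
		if (strcmp(tok, name) == 0)
			return i;
	}
	return -1;
}

/* Sum the hex value found in column @col over all per-CPU rows. */
static unsigned long long sum_col(FILE *f, int col)
{
	unsigned long long total = 0;
	char line[512];

	rewind(f);
	if (!fgets(line, sizeof(line), f))	/* skip the header line */
		return 0;
	while (fgets(line, sizeof(line), f)) {
		char *tok, *save;
		int i;

		for (i = 0, tok = strtok_r(line, " \t\n", &save); tok;
		     i++, tok = strtok_r(NULL, " \t\n", &save)) {
			if (i == col) {
				total += strtoull(tok, NULL, 16);
				break;
			}
		}
	}
	return total;
}

int main(void)
{
	char header[512], tmp[512];
	int failed_col, drop_col;
	FILE *f = fopen("/proc/net/stat/nf_conntrack", "r");

	if (!f || !fgets(header, sizeof(header), f))
		return 1;
	strcpy(tmp, header);
	failed_col = col_index(tmp, "insert_failed");
	strcpy(tmp, header);
	drop_col = col_index(tmp, "drop");
	if (failed_col < 0 || drop_col < 0)
		return 1;

	for (;;) {
		printf("insert_failed=%llu drop=%llu\n",
		       sum_col(f, failed_col), sum_col(f, drop_col));
		sleep(1);
	}
}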

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

Showing 4 changed files with 56 additions and 3 deletions

include/net/netfilter/nf_conntrack_l4proto.h
... ... @@ -23,6 +23,9 @@
23 23 /* L4 Protocol number. */
24 24 u_int8_t l4proto;
25 25  
  26 + /* Resolve clashes on insertion races. */
  27 + bool allow_clash;
  28 +
26 29 /* Try to fill in the third arg: dataoff is offset past network protocol
27 30 hdr. Return true if possible. */
28 31 bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
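A tracker opts in declaratively and nothing else in its definition changes;
mirroring the UDP hunks further down, the whole opt-in amounts to the
following (sketch, remaining callbacks elided):

static struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = {
	.l3proto	= PF_INET,
	.l4proto	= IPPROTO_UDP,
	.name		= "udp",
	.allow_clash	= true,	/* clashing inserts may be merged */
	/* ... pkt_to_tuple() and the remaining callbacks as before ... */
};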
net/netfilter/nf_conntrack_core.c
... ... @@ -617,6 +617,47 @@
617 617 }
618 618 }
619 619  
  620 +static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
  621 + const struct nf_conn *loser_ct)
  622 +{
  623 + struct nf_conn_acct *acct;
  624 +
  625 + acct = nf_conn_acct_find(loser_ct);
  626 + if (acct) {
  627 + struct nf_conn_counter *counter = acct->counter;
  628 + unsigned int bytes;
  629 +
  630 + /* u32 should be fine since we must have seen one packet. */
  631 + bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
  632 + nf_ct_acct_update(ct, ctinfo, bytes);
  633 + }
  634 +}
  635 +
  636 +/* Resolve race on insertion if this protocol allows this. */
  637 +static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
  638 + enum ip_conntrack_info ctinfo,
  639 + struct nf_conntrack_tuple_hash *h)
  640 +{
  641 + /* This is the conntrack entry already in hashes that won the race. */
  642 + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
  643 + struct nf_conntrack_l4proto *l4proto;
  644 +
  645 + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
  646 + if (l4proto->allow_clash &&
  647 + !nf_ct_is_dying(ct) &&
  648 + atomic_inc_not_zero(&ct->ct_general.use)) {
  649 + nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
  650 + nf_conntrack_put(skb->nfct);
  651 + /* Assign conntrack already in hashes to this skbuff. Don't
  652 + * modify skb->nfctinfo to ensure consistent stateful filtering.
  653 + */
  654 + skb->nfct = &ct->ct_general;
  655 + return NF_ACCEPT;
  656 + }
  657 + NF_CT_STAT_INC(net, drop);
  658 + return NF_DROP;
  659 +}
  660 +
620 661 /* Confirm a connection given skb; places it in hash table */
621 662 int
622 663 __nf_conntrack_confirm(struct sk_buff *skb)
... ... @@ -631,6 +672,7 @@
631 672 enum ip_conntrack_info ctinfo;
632 673 struct net *net;
633 674 unsigned int sequence;
  675 + int ret = NF_DROP;
634 676  
635 677 ct = nf_ct_get(skb, &ctinfo);
636 678 net = nf_ct_net(ct);
... ... @@ -673,9 +715,11 @@
673 715 */
674 716 nf_ct_del_from_dying_or_unconfirmed_list(ct);
675 717  
676   - if (unlikely(nf_ct_is_dying(ct)))
677   - goto out;
  718 + if (unlikely(nf_ct_is_dying(ct))) {
  719 + nf_ct_add_to_dying_list(ct);
  720 + goto dying;
  721 + }
678 722  
679 723 /* See if there's one in the list already, including reverse:
680 724 NAT could have grabbed it without realizing, since we're
681 725 not in the hash.  If there is, we lost race. */
... ... @@ -725,10 +769,12 @@
725 769  
726 770 out:
727 771 nf_ct_add_to_dying_list(ct);
  772 + ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
  773 +dying:
728 774 nf_conntrack_double_unlock(hash, reply_hash);
729 775 NF_CT_STAT_INC(net, insert_failed);
730 776 local_bh_enable();
731   - return NF_DROP;
  777 + return ret;
732 778 }
733 779 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
734 780  
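Condensed, the tail of __nf_conntrack_confirm() now distinguishes the two
failure paths (a sketch of the code above, with the hash search and locking
setup elided): an entry that died while unconfirmed is still dropped
outright, while a genuine clash is handed to nf_ct_resolve_clash().

	/* Sketch: outcome of the confirm tail after this patch. */
	if (unlikely(nf_ct_is_dying(ct))) {	/* raced with deletion */
		nf_ct_add_to_dying_list(ct);
		goto dying;			/* ret is still NF_DROP */
	}
	/* ... hash search; on finding a clashing entry ... */
out:
	nf_ct_add_to_dying_list(ct);
	ret = nf_ct_resolve_clash(net, skb, ctinfo, h); /* NF_ACCEPT or NF_DROP */
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);	/* bumped in both cases */
	local_bh_enable();
	return ret;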
net/netfilter/nf_conntrack_proto_udp.c
... ... @@ -309,6 +309,7 @@
309 309 .l3proto = PF_INET,
310 310 .l4proto = IPPROTO_UDP,
311 311 .name = "udp",
  312 + .allow_clash = true,
312 313 .pkt_to_tuple = udp_pkt_to_tuple,
313 314 .invert_tuple = udp_invert_tuple,
314 315 .print_tuple = udp_print_tuple,
... ... @@ -341,6 +342,7 @@
341 342 .l3proto = PF_INET6,
342 343 .l4proto = IPPROTO_UDP,
343 344 .name = "udp",
  345 + .allow_clash = true,
344 346 .pkt_to_tuple = udp_pkt_to_tuple,
345 347 .invert_tuple = udp_invert_tuple,
346 348 .print_tuple = udp_print_tuple,
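Only the connectionless trackers are flagged: UDP here, and UDP-Lite in the
identical hunks below. Trackers that model a state machine (TCP, for
instance) keep allow_clash unset, since merging two half-tracked connections
would bypass their state checks, and their handshakes make this insertion
race a non-issue in practice.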
net/netfilter/nf_conntrack_proto_udplite.c
... ... @@ -274,6 +274,7 @@
274 274 .l3proto = PF_INET,
275 275 .l4proto = IPPROTO_UDPLITE,
276 276 .name = "udplite",
  277 + .allow_clash = true,
277 278 .pkt_to_tuple = udplite_pkt_to_tuple,
278 279 .invert_tuple = udplite_invert_tuple,
279 280 .print_tuple = udplite_print_tuple,
... ... @@ -306,6 +307,7 @@
306 307 .l3proto = PF_INET6,
307 308 .l4proto = IPPROTO_UDPLITE,
308 309 .name = "udplite",
  310 + .allow_clash = true,
309 311 .pkt_to_tuple = udplite_pkt_to_tuple,
310 312 .invert_tuple = udplite_invert_tuple,
311 313 .print_tuple = udplite_print_tuple,
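Taken together with the sketches above: running the reproducer in a loop
while the watcher is active should show insert_failed rising on every lost
race both before and after this patch, but drop now stays flat as long as
the clashes are resolved, and both racing datagrams leave against the merged
entry.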