Commit 90964016e5d34758033e75884e41d68ccb93212e

Authored by Pablo Neira Ayuso
1 parent 0befd061af

netfilter: nf_conntrack: add IPS_OFFLOAD status bit

This new bit tells us that the conntrack entry is owned by the flow
table offload infrastructure.

 # cat /proc/net/nf_conntrack
 ipv4     2 tcp      6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2

Note the [OFFLOAD] tag in the listing.

The timer of such conntrack entries appears to be stopped from userspace.
In practice, to make sure the conntrack entry does not go away, the
conntrack timer is periodically set to an arbitrarily large value that
gets refreshed on every iteration of the garbage collector, so it
never expires. Such entries also display no internal state in the case
of TCP flows. Keeping the timeout refreshed this way saves us a check
for the offload bit in the packet path via nf_ct_is_expired().

Conntrack entries that have been offloaded to the flow table
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
infrastructure is also responsible for releasing this conntrack entry.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

Showing 5 changed files with 50 additions and 6 deletions Side-by-side Diff

include/uapi/linux/netfilter/nf_conntrack_common.h
... ... @@ -101,12 +101,16 @@
101 101 IPS_HELPER_BIT = 13,
102 102 IPS_HELPER = (1 << IPS_HELPER_BIT),
103 103  
  104 + /* Conntrack has been offloaded to flow table. */
  105 + IPS_OFFLOAD_BIT = 14,
  106 + IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
  107 +
104 108 /* Be careful here, modifying these bits can make things messy,
105 109 * so don't let users modify them directly.
106 110 */
107 111 IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
108 112 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
109   - IPS_SEQ_ADJUST | IPS_TEMPLATE),
  113 + IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
110 114  
111 115 __IPS_MAX_BIT = 14,
112 116 };
net/netfilter/nf_conntrack_core.c
... ... @@ -901,6 +901,9 @@
901 901 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
902 902 tmp = nf_ct_tuplehash_to_ctrack(h);
903 903  
  904 + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
  905 + continue;
  906 +
904 907 if (nf_ct_is_expired(tmp)) {
905 908 nf_ct_gc_expired(tmp);
906 909 continue;
... ... @@ -975,6 +978,18 @@
975 978 return false;
976 979 }
977 980  
  981 +#define DAY (86400 * HZ)
  982 +
  983 +/* Set an arbitrary timeout large enough not to ever expire; this saves
  984 + * us a check for the IPS_OFFLOAD_BIT from the packet path via
  985 + * nf_ct_is_expired().
  986 + */
  987 +static void nf_ct_offload_timeout(struct nf_conn *ct)
  988 +{
  989 + if (nf_ct_expires(ct) < DAY / 2)
  990 + ct->timeout = nfct_time_stamp + DAY;
  991 +}
  992 +
978 993 static void gc_worker(struct work_struct *work)
979 994 {
980 995 unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
... ... @@ -1011,6 +1026,11 @@
1011 1026 tmp = nf_ct_tuplehash_to_ctrack(h);
1012 1027  
1013 1028 scanned++;
  1029 + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
  1030 + nf_ct_offload_timeout(tmp);
  1031 + continue;
  1032 + }
  1033 +
1014 1034 if (nf_ct_is_expired(tmp)) {
1015 1035 nf_ct_gc_expired(tmp);
1016 1036 expired_count++;
net/netfilter/nf_conntrack_netlink.c
... ... @@ -1110,6 +1110,14 @@
1110 1110 .len = NF_CT_LABELS_MAX_SIZE },
1111 1111 };
1112 1112  
  1113 +static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
  1114 +{
  1115 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
  1116 + return 0;
  1117 +
  1118 + return ctnetlink_filter_match(ct, data);
  1119 +}
  1120 +
1113 1121 static int ctnetlink_flush_conntrack(struct net *net,
1114 1122 const struct nlattr * const cda[],
1115 1123 u32 portid, int report)
... ... @@ -1122,7 +1130,7 @@
1122 1130 return PTR_ERR(filter);
1123 1131 }
1124 1132  
1125   - nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
  1133 + nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
1126 1134 portid, report);
1127 1135 kfree(filter);
1128 1136  
... ... @@ -1167,6 +1175,11 @@
1167 1175 return -ENOENT;
1168 1176  
1169 1177 ct = nf_ct_tuplehash_to_ctrack(h);
  1178 +
  1179 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
  1180 + nf_ct_put(ct);
  1181 + return -EBUSY;
  1182 + }
1170 1183  
1171 1184 if (cda[CTA_ID]) {
1172 1185 u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
net/netfilter/nf_conntrack_proto_tcp.c
... ... @@ -305,6 +305,9 @@
305 305 /* Print out the private part of the conntrack. */
306 306 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
307 307 {
  308 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
  309 + return;
  310 +
308 311 seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
309 312 }
310 313 #endif
net/netfilter/nf_conntrack_standalone.c
... ... @@ -309,11 +309,13 @@
309 309 WARN_ON(!l4proto);
310 310  
311 311 ret = -ENOSPC;
312   - seq_printf(s, "%-8s %u %-8s %u %ld ",
  312 + seq_printf(s, "%-8s %u %-8s %u ",
313 313 l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
314   - l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
315   - nf_ct_expires(ct) / HZ);
  314 + l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
316 315  
  316 + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
  317 + seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ);
  318 +
317 319 if (l4proto->print_conntrack)
318 320 l4proto->print_conntrack(s, ct);
319 321  
... ... @@ -339,7 +341,9 @@
339 341 if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
340 342 goto release;
341 343  
342   - if (test_bit(IPS_ASSURED_BIT, &ct->status))
  344 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
  345 + seq_puts(s, "[OFFLOAD] ");
  346 + else if (test_bit(IPS_ASSURED_BIT, &ct->status))
343 347 seq_puts(s, "[ASSURED] ");
344 348  
345 349 if (seq_has_overflowed(s))