Commit 99bbc70741903c063b3ccad90a3e06fc55df9245

Authored by Willem de Bruijn
Committed by David S. Miller
1 parent 4a5bddf7ea

rps: selective flow shedding during softnet overflow

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high-rate flow (such as a
spoofed-source DoS) can exceed a single cpu's processing rate and will
degrade the throughput of the other flows hashed onto the same cpu.

This patch adds a finer-grained hashtable. If the netdev backlog is
above a threshold, IRQ cpus track each flow's share of total traffic
(using 4096 buckets, configurable). The share is measured by counting
the number of packets per flow over the last 256 packets seen on the
source cpu. Any flow that occupies a large fraction of this history
(set at 50%) will have its packets dropped for as long as the backlog
stays above the threshold.
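
The accounting can be pictured as a sliding window: a ring of the last
256 packet hashes doubles as a set of per-bucket counters, so each new
packet increments its flow's bucket and the packet it evicts from the
ring decrements the bucket it came from. The stand-alone C sketch below
simulates only that window (the names, the 90/10 traffic mix and the
omission of the backlog-pressure check are illustrative, not part of the
patch); the in-kernel version is skb_flow_limit() in the diff below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HISTORY     256   /* packets of history, must be a power of 2 */
#define NUM_BUCKETS 4096  /* flow hash buckets, must be a power of 2 */

struct flow_limit_sim {
	unsigned int history_head;
	uint16_t history[HISTORY];
	unsigned int buckets[NUM_BUCKETS];
};

/* Returns true if the packet with this receive hash should be dropped. */
static bool flow_limit_drop(struct flow_limit_sim *fl, uint32_t rxhash)
{
	unsigned int new_flow = rxhash & (NUM_BUCKETS - 1);
	unsigned int old_flow = fl->history[fl->history_head];

	/* Overwrite the oldest ring entry with the new flow. */
	fl->history[fl->history_head] = new_flow;
	fl->history_head = (fl->history_head + 1) & (HISTORY - 1);

	/* The evicted packet no longer counts toward its flow. */
	if (fl->buckets[old_flow])
		fl->buckets[old_flow]--;

	/* Drop if this flow now accounts for more than half of the window. */
	return ++fl->buckets[new_flow] > (HISTORY >> 1);
}

int main(void)
{
	struct flow_limit_sim fl = { 0 };
	unsigned int drops = 0;
	int i;

	/* 90% of packets from one elephant flow, 10% spread at random. */
	for (i = 0; i < 100000; i++) {
		uint32_t hash = (i % 10) ? 0xdeadbeefu : (uint32_t)rand();

		if (flow_limit_drop(&fl, hash))
			drops++;
	}
	printf("dropped %u of 100000 packets\n", drops);
	return 0;
}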

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same
situation is reached with RFS and sufficient kernel processing
(iptables, packet socket tap, ..).
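
For reference, a minimal user-space sketch of driving the two knobs this
patch adds (the paths follow from the net_core_table entries below; the
chosen values and the helper are illustrative assumptions, not part of
the patch). The table length is read when a cpu's table is allocated, so
it is written before the cpu bitmap:

#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Table length must be a power of 2; read at enable time. */
	write_sysctl("/proc/sys/net/core/flow_limit_table_len", "8192\n");

	/* Hex cpumask of backlog cpus to protect; "1" means cpu0 only. */
	write_sysctl("/proc/sys/net/core/flow_limit_cpu_bitmap", "1\n");

	return 0;
}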

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 5 changed files with 194 additions and 3 deletions

include/linux/netdevice.h
... ... @@ -1778,6 +1778,19 @@
1778 1778 return register_gifconf(family, NULL);
1779 1779 }
1780 1780  
  1781 +#ifdef CONFIG_NET_FLOW_LIMIT
  1782 +#define FLOW_LIMIT_HISTORY (1 << 8) /* must be ^2 */
  1783 +struct sd_flow_limit {
  1784 + u64 count;
  1785 + unsigned int num_buckets;
  1786 + unsigned int history_head;
  1787 + u16 history[FLOW_LIMIT_HISTORY];
  1788 + u8 buckets[];
  1789 +};
  1790 +
  1791 +extern int netdev_flow_limit_table_len;
  1792 +#endif /* CONFIG_NET_FLOW_LIMIT */
  1793 +
1781 1794 /*
1782 1795 * Incoming packets are placed on per-cpu queues
1783 1796 */
... ... @@ -1807,6 +1820,10 @@
1807 1820 unsigned int dropped;
1808 1821 struct sk_buff_head input_pkt_queue;
1809 1822 struct napi_struct backlog;
  1823 +
  1824 +#ifdef CONFIG_NET_FLOW_LIMIT
  1825 + struct sd_flow_limit *flow_limit;
  1826 +#endif
1810 1827 };
1811 1828  
1812 1829 static inline void input_queue_head_incr(struct softnet_data *sd)
net/Kconfig
... ... @@ -259,6 +259,18 @@
259 259 packet sniffing (libpcap/tcpdump). Note : Admin should enable
260 260 this feature changing /proc/sys/net/core/bpf_jit_enable
261 261  
  262 +config NET_FLOW_LIMIT
  263 + boolean
  264 + depends on RPS
  265 + default y
  266 + ---help---
  267 + The network stack has to drop packets when a receive processing CPU's
  268 + backlog reaches netdev_max_backlog. If a few out of many active flows
  269 + generate the vast majority of load, drop their traffic earlier to
  270 + maintain capacity for the other flows. This feature provides servers
  271 + with many clients some protection against DoS by a single (spoofed)
  272 + flow that greatly exceeds average workload.
  273 +
262 274 menu "Network testing"
263 275  
264 276 config NET_PKTGEN
net/core/dev.c
... ... @@ -3064,6 +3064,46 @@
3064 3064 return 0;
3065 3065 }
3066 3066  
  3067 +#ifdef CONFIG_NET_FLOW_LIMIT
  3068 +int netdev_flow_limit_table_len __read_mostly = (1 << 12);
  3069 +#endif
  3070 +
  3071 +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
  3072 +{
  3073 +#ifdef CONFIG_NET_FLOW_LIMIT
  3074 + struct sd_flow_limit *fl;
  3075 + struct softnet_data *sd;
  3076 + unsigned int old_flow, new_flow;
  3077 +
  3078 + if (qlen < (netdev_max_backlog >> 1))
  3079 + return false;
  3080 +
  3081 + sd = &__get_cpu_var(softnet_data);
  3082 +
  3083 + rcu_read_lock();
  3084 + fl = rcu_dereference(sd->flow_limit);
  3085 + if (fl) {
  3086 + new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
  3087 + old_flow = fl->history[fl->history_head];
  3088 + fl->history[fl->history_head] = new_flow;
  3089 +
  3090 + fl->history_head++;
  3091 + fl->history_head &= FLOW_LIMIT_HISTORY - 1;
  3092 +
  3093 + if (likely(fl->buckets[old_flow]))
  3094 + fl->buckets[old_flow]--;
  3095 +
  3096 + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
  3097 + fl->count++;
  3098 + rcu_read_unlock();
  3099 + return true;
  3100 + }
  3101 + }
  3102 + rcu_read_unlock();
  3103 +#endif
  3104 + return false;
  3105 +}
  3106 +
3067 3107 /*
3068 3108 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3069 3109 * queue (may be a remote CPU queue).
3070 3110  
... ... @@ -3073,13 +3113,15 @@
3073 3113 {
3074 3114 struct softnet_data *sd;
3075 3115 unsigned long flags;
  3116 + unsigned int qlen;
3076 3117  
3077 3118 sd = &per_cpu(softnet_data, cpu);
3078 3119  
3079 3120 local_irq_save(flags);
3080 3121  
3081 3122 rps_lock(sd);
3082   - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
  3123 + qlen = skb_queue_len(&sd->input_pkt_queue);
  3124 + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3083 3125 if (skb_queue_len(&sd->input_pkt_queue)) {
3084 3126 enqueue:
3085 3127 __skb_queue_tail(&sd->input_pkt_queue, skb);
... ... @@ -6269,6 +6311,10 @@
6269 6311 sd->backlog.weight = weight_p;
6270 6312 sd->backlog.gro_list = NULL;
6271 6313 sd->backlog.gro_count = 0;
  6314 +
  6315 +#ifdef CONFIG_NET_FLOW_LIMIT
  6316 + sd->flow_limit = NULL;
  6317 +#endif
6272 6318 }
6273 6319  
6274 6320 dev_boot_phase = 0;
net/core/net-procfs.c
... ... @@ -146,11 +146,23 @@
146 146 static int softnet_seq_show(struct seq_file *seq, void *v)
147 147 {
148 148 struct softnet_data *sd = v;
  149 + unsigned int flow_limit_count = 0;
149 150  
150   - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
  151 +#ifdef CONFIG_NET_FLOW_LIMIT
  152 + struct sd_flow_limit *fl;
  153 +
  154 + rcu_read_lock();
  155 + fl = rcu_dereference(sd->flow_limit);
  156 + if (fl)
  157 + flow_limit_count = fl->count;
  158 + rcu_read_unlock();
  159 +#endif
  160 +
  161 + seq_printf(seq,
  162 + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
151 163 sd->processed, sd->dropped, sd->time_squeeze, 0,
152 164 0, 0, 0, 0, /* was fastroute */
153   - sd->cpu_collision, sd->received_rps);
  165 + sd->cpu_collision, sd->received_rps, flow_limit_count);
154 166 return 0;
155 167 }
156 168  
net/core/sysctl_net_core.c
... ... @@ -87,6 +87,96 @@
87 87 }
88 88 #endif /* CONFIG_RPS */
89 89  
  90 +#ifdef CONFIG_NET_FLOW_LIMIT
  91 +static DEFINE_MUTEX(flow_limit_update_mutex);
  92 +
  93 +static int flow_limit_cpu_sysctl(ctl_table *table, int write,
  94 + void __user *buffer, size_t *lenp,
  95 + loff_t *ppos)
  96 +{
  97 + struct sd_flow_limit *cur;
  98 + struct softnet_data *sd;
  99 + cpumask_var_t mask;
  100 + int i, len, ret = 0;
  101 +
  102 + if (!alloc_cpumask_var(&mask, GFP_KERNEL))
  103 + return -ENOMEM;
  104 +
  105 + if (write) {
  106 + ret = cpumask_parse_user(buffer, *lenp, mask);
  107 + if (ret)
  108 + goto done;
  109 +
  110 + mutex_lock(&flow_limit_update_mutex);
  111 + len = sizeof(*cur) + netdev_flow_limit_table_len;
  112 + for_each_possible_cpu(i) {
  113 + sd = &per_cpu(softnet_data, i);
  114 + cur = rcu_dereference_protected(sd->flow_limit,
  115 + lockdep_is_held(&flow_limit_update_mutex));
  116 + if (cur && !cpumask_test_cpu(i, mask)) {
  117 + RCU_INIT_POINTER(sd->flow_limit, NULL);
  118 + synchronize_rcu();
  119 + kfree(cur);
  120 + } else if (!cur && cpumask_test_cpu(i, mask)) {
  121 + cur = kzalloc(len, GFP_KERNEL);
  122 + if (!cur) {
  123 + /* not unwinding previous changes */
  124 + ret = -ENOMEM;
  125 + goto write_unlock;
  126 + }
  127 + cur->num_buckets = netdev_flow_limit_table_len;
  128 + rcu_assign_pointer(sd->flow_limit, cur);
  129 + }
  130 + }
  131 +write_unlock:
  132 + mutex_unlock(&flow_limit_update_mutex);
  133 + } else {
  134 + if (*ppos || !*lenp) {
  135 + *lenp = 0;
  136 + goto done;
  137 + }
  138 +
  139 + cpumask_clear(mask);
  140 + rcu_read_lock();
  141 + for_each_possible_cpu(i) {
  142 + sd = &per_cpu(softnet_data, i);
  143 + if (rcu_dereference(sd->flow_limit))
  144 + cpumask_set_cpu(i, mask);
  145 + }
  146 + rcu_read_unlock();
  147 +
  148 + len = cpumask_scnprintf(buffer, *lenp, mask);
  149 + *lenp = len + 1;
  150 + *ppos += len + 1;
  151 + }
  152 +
  153 +done:
  154 + free_cpumask_var(mask);
  155 + return ret;
  156 +}
  157 +
  158 +static int flow_limit_table_len_sysctl(ctl_table *table, int write,
  159 + void __user *buffer, size_t *lenp,
  160 + loff_t *ppos)
  161 +{
  162 + unsigned int old, *ptr;
  163 + int ret;
  164 +
  165 + mutex_lock(&flow_limit_update_mutex);
  166 +
  167 + ptr = table->data;
  168 + old = *ptr;
  169 + ret = proc_dointvec(table, write, buffer, lenp, ppos);
  170 + if (!ret && write && !is_power_of_2(*ptr)) {
  171 + *ptr = old;
  172 + ret = -EINVAL;
  173 + }
  174 +
  175 + mutex_unlock(&flow_limit_update_mutex);
  176 + return ret;
  177 +}
  178 +#endif /* CONFIG_NET_FLOW_LIMIT */
  179 +
90 180 static struct ctl_table net_core_table[] = {
91 181 #ifdef CONFIG_NET
92 182 {
... ... @@ -180,6 +270,20 @@
180 270 .proc_handler = rps_sock_flow_sysctl
181 271 },
182 272 #endif
  273 +#ifdef CONFIG_NET_FLOW_LIMIT
  274 + {
  275 + .procname = "flow_limit_cpu_bitmap",
  276 + .mode = 0644,
  277 + .proc_handler = flow_limit_cpu_sysctl
  278 + },
  279 + {
  280 + .procname = "flow_limit_table_len",
  281 + .data = &netdev_flow_limit_table_len,
  282 + .maxlen = sizeof(int),
  283 + .mode = 0644,
  284 + .proc_handler = flow_limit_table_len_sysctl
  285 + },
  286 +#endif /* CONFIG_NET_FLOW_LIMIT */
183 287 #endif /* CONFIG_NET */
184 288 {
185 289 .procname = "netdev_budget",