Blame view

samples/bpf/hbm_out_kern.c 5.52 KB
187d0738f   brakmo   bpf: Sample HBM B...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
  // SPDX-License-Identifier: GPL-2.0
  /* Copyright (c) 2019 Facebook
   *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of version 2 of the GNU General Public
   * License as published by the Free Software Foundation.
   *
   * Sample Host Bandwidth Manager (HBM) BPF program.
   *
   * A cgroup skb BPF egress program to limit cgroup output bandwidth.
   * It uses a modified virtual token bucket queue to limit average
   * egress bandwidth. The implementation uses credits instead of tokens.
   * Negative credits imply that queueing would have happened (this is
   * a virtual queue, so no queueing is done by it). However, queueing may
   * occur at the actual qdisc (which is not used for rate limiting).
   *
   * This implementation uses 3 thresholds, one to start marking packets and
   * the other two to drop packets:
   *                                  CREDIT
   *        - <--------------------------|------------------------> +
   *              |    |          |      0
   *              |  Large pkt    |
   *              |  drop thresh  |
   *   Small pkt drop             Mark threshold
   *       thresh
   *
   * The effect of marking depends on the type of packet:
   * a) If the packet is ECN enabled and it is a TCP packet, then the packet
   *    is ECN marked.
   * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
   *    to reduce the congestion window. The current implementation uses a linear
   *    distribution (0% probability at marking threshold, 100% probability
   *    at drop threshold).
   * c) If the packet is not a TCP packet, then it is dropped.
   *
   * If the credit is below the drop threshold, the packet is dropped. If it
   * is a TCP packet, then it also calls tcp_cwr since packets dropped by
   * a cgroup skb BPF program do not automatically trigger a call to
   * tcp_cwr in the current kernel code.
   *
   * This BPF program actually uses 2 drop thresholds, one threshold
   * for larger packets (>= 120 bytes) and another for smaller packets. This
   * protects smaller packets such as SYNs, ACKs, etc.
   *
   * The default bandwidth limit is set at 1Gbps but this can be changed by
   * a user program through a shared BPF map. In addition, by default this BPF
   * program does not limit connections using loopback. This behavior can be
   * overridden by the user program. There is also an option to calculate
   * some statistics, such as percent of packets marked or dropped, which
   * the user program can access.
   *
   * A later patch provides such a program (hbm.c)
   */
  
  #include "hbm_kern.h"
  
  /*
   * _hbm_out_cg() - cgroup skb egress program that limits output bandwidth.
   * @skb: packet being sent; skb->len is the amount charged to the credit.
   *
   * Charges the packet length against the credit of the virtual queue kept
   * in cgroup local storage (replenished according to elapsed time and
   * qdp->rate, capped at MAX_CREDIT), then uses the resulting credit to decide
   * whether to ECN-mark the packet, request a congestion window reduction,
   * or drop the packet.
   *
   * Return: ALLOW_PKT or DROP_PKT, with the value 2 OR'ed in when a
   * congestion window reduction is requested (cwr_flag).
   */
  SEC("cgroup_skb/egress")
  int _hbm_out_cg(struct __sk_buff *skb)
  {
  	struct hbm_pkt_info pkti;
  	int len = skb->len;
  	unsigned int queue_index = 0;
  	unsigned long long curtime;
  	int credit;
  	signed long long delta = 0, new_credit;
  	bool congestion_flag = false;
  	bool drop_flag = false;
  	bool cwr_flag = false;
  	bool ecn_ce_flag = false;
  	struct hbm_vqueue *qdp;
  	struct hbm_queue_stats *qsp = NULL;
  	int rv = ALLOW_PKT;

  	// Unless the user program enabled loopback limiting, packets on
  	// ifindex 1 are let through without being charged.
  	qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
  	if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
  		return ALLOW_PKT;

  	hbm_get_pkt_info(skb, &pkti);

  	// We may want to account for the length of headers in len
  	// calculation, like ETH header + overhead, especially if it
  	// is a gso packet. But I am not doing it right now.

  	qdp = bpf_get_local_storage(&queue_state, 0);
  	if (!qdp)
  		return ALLOW_PKT;
  	else if (qdp->lasttime == 0)
  		// First packet seen for this queue: set up its initial state.
  		// NOTE(review): 1024 is presumably the default rate; confirm
  		// against hbm_init_vqueue() in hbm_kern.h.
  		hbm_init_vqueue(qdp, 1024);

  	curtime = bpf_ktime_get_ns();

  	// Begin critical section
  	bpf_spin_lock(&qdp->lock);
  	credit = qdp->credit;
  	delta = curtime - qdp->lasttime;
  	/* delta < 0 implies that another process with a curtime greater
  	 * than ours beat us to the critical section and already added
  	 * the new credit, so we should not add it ourselves
  	 */
  	if (delta > 0) {
  		qdp->lasttime = curtime;
  		// Computed in 64 bits so the cap is applied before the value
  		// is narrowed back into the int credit.
  		new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);
  		if (new_credit > MAX_CREDIT)
  			credit = MAX_CREDIT;
  		else
  			credit = new_credit;
  	}
  	credit -= len;
  	qdp->credit = credit;
  	bpf_spin_unlock(&qdp->lock);
  	// End critical section

  	// Check if we should update rate
  	// NOTE(review): qdp->rate is written here outside the spin lock;
  	// the factor 128 presumably converts the user-supplied rate into the
  	// internal unit -- confirm against hbm_kern.h.
  	if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
  		qdp->rate = qsp->rate * 128;
  		bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
  			   (int)qdp->rate,
  			   CREDIT_PER_NS(1000000000, qdp->rate) * 8);
  	}

  	// Set flags (drop, congestion, cwr)
  	// Dropping => we are congested, so ignore congestion flag
  	if (credit < -DROP_THRESH ||
  	    (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {
  		// Very congested, set drop packet
  		drop_flag = true;
  		if (pkti.ecn)
  			congestion_flag = true;
  		else if (pkti.is_tcp)
  			cwr_flag = true;
  	} else if (credit < 0) {
  		// Congested, set congestion flag
  		if (pkti.ecn || pkti.is_tcp) {
  			// ECN/TCP packets are only flagged once the credit
  			// falls below the mark threshold.
  			congestion_flag = (credit < -MARK_THRESH);
  		} else {
  			congestion_flag = true;
  		}
  	}

  	if (congestion_flag) {
  		if (bpf_skb_ecn_set_ce(skb)) {
  			ecn_ce_flag = true;
  		} else {
  			// Packet could not be ECN marked: fall back to cwr for
  			// TCP, or to dropping large non-TCP packets.
  			if (pkti.is_tcp) {
  				unsigned int rand = bpf_get_prandom_u32();

  				// Probability of cwr grows as -credit moves
  				// through the MARK_REGION_SIZE range above
  				// MARK_THRESH.
  				if (-credit >= MARK_THRESH +
  				    (rand % MARK_REGION_SIZE)) {
  					// Do congestion control
  					cwr_flag = true;
  				}
  			} else if (len > LARGE_PKT_THRESH) {
  				// Problem if too many small packets?
  				drop_flag = true;
  			}
  		}
  	}

  	// The user program can suppress congestion notification entirely.
  	if (qsp != NULL && qsp->no_cn)
  		cwr_flag = false;

  	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
  			 cwr_flag, ecn_ce_flag, &pkti, credit);

  	if (drop_flag) {
  		// A dropped packet is not sent, so give its length back to
  		// the credit. Done atomically because the lock is not held.
  		// NOTE(review): this can interleave with the plain store to
  		// qdp->credit inside the critical section on another CPU.
  		__sync_add_and_fetch(&(qdp->credit), len);
  		rv = DROP_PKT;
  	}

  	// NOTE(review): value 2 is presumably the congestion-notification
  	// bit of the cgroup skb egress return code -- confirm.
  	if (cwr_flag)
  		rv |= 2;
  	return rv;
  }
  /* License string placed via SEC("license"); declares this program as GPL.
   * NOTE(review): presumably read by the BPF loader/kernel to permit
   * GPL-only helpers -- confirm.
   */
  char _license[] SEC("license") = "GPL";