net/sched/sch_qfq.c
  /*
   * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
   *
   * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
   * Copyright (c) 2012 Paolo Valente.
   *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public License
   * version 2 as published by the Free Software Foundation.
   */
  
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/bitops.h>
  #include <linux/errno.h>
  #include <linux/netdevice.h>
  #include <linux/pkt_sched.h>
  #include <net/sch_generic.h>
  #include <net/pkt_sched.h>
  #include <net/pkt_cls.h>
  /*  Quick Fair Queueing Plus
      ========================
  
      Sources:
      [1] Paolo Valente,
      "Reducing the Execution Time of Fair-Queueing Schedulers."
      http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
  
      Sources for QFQ:
  
      [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
      Packet Scheduling with Tight Bandwidth Distribution Guarantees."
  
      See also:
      http://retis.sssup.it/~fabio/linux/qfq/
   */
  
  /*
    QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
    classes. Each aggregate is timestamped with a virtual start time S
    and a virtual finish time F, and scheduled according to its
    timestamps. S and F are computed as a function of a system virtual
    time function V. The classes within each aggregate are instead
    scheduled with DRR.
  
    To speed up operations, QFQ+ divides also aggregates into a limited
    number of groups. Which group a class belongs to depends on the
    ratio between the maximum packet length for the class and the weight
    of the class. Groups have their own S and F. In the end, QFQ+
    schedules groups, then aggregates within groups, then classes within
    aggregates. See [1] and [2] for a full description.
    Virtual time computations.
  
    S, F and V are all computed in fixed point arithmetic with
    FRAC_BITS decimal bits.
  
    QFQ_MAX_INDEX is the maximum index allowed for a group. We need
  	one bit per index.
    QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
  
    The layout of the bits is as below:
  
                     [ MTU_SHIFT ][      FRAC_BITS    ]
                     [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
  				 ^.__grp->index = 0
  				 *.__grp->slot_shift
  
    where MIN_SLOT_SHIFT is derived by difference from the others.
  
    The max group index corresponds to Lmax/w_min, where
    Lmax=1<<MTU_SHIFT, w_min = 1 .
    From this, and knowing how many groups (MAX_INDEX) we want,
    we can derive the shift corresponding to each group.
  
    Because we often need to compute
  	F = S + len/w_i  and V = V + len/wsum,
    instead of storing w_i we store the value
  	inv_w = (1<<FRAC_BITS)/w_i
    so we can do F = S + len * inv_w. Likewise, we keep
  	iwsum = (1<<FRAC_BITS)/wsum
    so that V = V + len * iwsum. Storing these inverses also makes it
    easy to move between a static and an adaptive weight sum.
  
    The per-scheduler-instance data contain all the data structures
    for the scheduler: bitmaps and bucket lists.
  
   */
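
  /*
   * Worked example (the numbers are invented for this comment and are
   * not taken from the code): with FRAC_BITS = 30, a class of weight
   * w_i = 4 stores inv_w = ONE_FP/4 = 1<<28, so serving a 1500-byte
   * packet advances its finish time by
   *	len * inv_w = 1500 * (1<<28) = 375 << FRAC_BITS,
   * i.e. exactly len/w_i in fixed point. With wsum = 16, the same
   * packet advances V by len * iwsum = 1500 * (ONE_FP/16), i.e. 93.75
   * in fixed point.
   */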
  
  /*
   * Maximum number of consecutive slots occupied by backlogged classes
   * inside a group.
   */
  #define QFQ_MAX_SLOTS	32
  
  /*
   * Shifts used for aggregate<->group mapping.  We allow class weights that are
   * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
   * group with the smallest index that can support the L_i / r_i configured
   * for the classes in the aggregate.
   *
   * grp->index is the index of the group; and grp->slot_shift
   * is the shift for the corresponding (scaled) sigma_i.
   */
  #define QFQ_MAX_INDEX		24
  #define QFQ_MAX_WSHIFT		10

  #define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
  #define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)
  
  #define FRAC_BITS		30	/* fixed point arithmetic */
  #define ONE_FP			(1UL << FRAC_BITS)

  #define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
  #define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
  
  #define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
  
  /*
   * Possible group states.  These values are used as indexes for the bitmaps
   * array of struct qfq_sched.
   */
  enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
  
  struct qfq_group;
  struct qfq_aggregate;
  struct qfq_class {
  	struct Qdisc_class_common common;
  
  	unsigned int refcnt;
  	unsigned int filter_cnt;
  
  	struct gnet_stats_basic_packed bstats;
  	struct gnet_stats_queue qstats;
  	struct gnet_stats_rate_est64 rate_est;
  	struct Qdisc *qdisc;
  	struct list_head alist;		/* Link for active-classes list. */
  	struct qfq_aggregate *agg;	/* Parent aggregate. */
  	int deficit;			/* DRR deficit counter. */
  };

  struct qfq_aggregate {
  	struct hlist_node next;	/* Link for the slot list. */
  	u64 S, F;		/* flow timestamps (exact) */
  
  	/* group we belong to. In principle we would need the index,
  	 * which is log_2(lmax/weight), but we never reference it
  	 * directly, only the group.
  	 */
  	struct qfq_group *grp;
  
  	/* these are copied from the flowset. */
  	u32	class_weight; /* Weight of each class in this aggregate. */
  	/* Max pkt size for the classes in this aggregate, DRR quantum. */
  	int	lmax;
  
  	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
  	u32	budgetmax;  /* Max budget for this aggregate. */
  	u32	initial_budget, budget;     /* Initial and current budget. */
  
  	int		  num_classes;	/* Number of classes in this aggr. */
  	struct list_head  active;	/* DRR queue of active classes. */
  
  	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
  };
  
  struct qfq_group {
  	u64 S, F;			/* group timestamps (approx). */
  	unsigned int slot_shift;	/* Slot shift. */
  	unsigned int index;		/* Group index. */
  	unsigned int front;		/* Index of the front slot. */
  	unsigned long full_slots;	/* non-empty slots */
  	/* Array of RR lists of active aggregates. */
  	struct hlist_head slots[QFQ_MAX_SLOTS];
  };
  
  struct qfq_sched {
  	struct tcf_proto __rcu *filter_list;
  	struct Qdisc_class_hash clhash;
  	u64			oldV, V;	/* Precise virtual times. */
  	struct qfq_aggregate	*in_serv_agg;   /* Aggregate being served. */
  	u32			num_active_agg; /* Num. of active aggregates */
  	u32			wsum;		/* weight sum */
  	u32			iwsum;		/* inverse weight sum */
  
  	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
  	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
  	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
  
  	u32 max_agg_classes;		/* Max number of classes per aggr. */
  	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
  };
  /*
   * Possible reasons why the timestamps of an aggregate are updated
   * enqueue: the aggregate switches from idle to active and must be
   *	    scheduled for service
   * requeue: the aggregate finishes its budget, so it stops being served and
   *	    must be rescheduled for service
   */
  enum update_reason {enqueue, requeue};
  static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct Qdisc_class_common *clc;
  
  	clc = qdisc_class_find(&q->clhash, classid);
  	if (clc == NULL)
  		return NULL;
  	return container_of(clc, struct qfq_class, common);
  }
  
  static void qfq_purge_queue(struct qfq_class *cl)
  {
  	unsigned int len = cl->qdisc->q.qlen;
  
  	qdisc_reset(cl->qdisc);
  	qdisc_tree_decrease_qlen(cl->qdisc, len);
  }
  
  static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
  	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
  	[TCA_QFQ_LMAX] = { .type = NLA_U32 },
  };
  
  /*
   * Calculate a flow index, given its weight and maximum packet length.
   * index = log_2(maxlen/weight) but we need to apply the scaling.
   * This is used only once at flow creation.
   */
  static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
  {
  	u64 slot_size = (u64)maxlen * inv_w;
  	unsigned long size_map;
  	int index = 0;
  	size_map = slot_size >> min_slot_shift;
  	if (!size_map)
  		goto out;
  
  	index = __fls(size_map) + 1;	/* basically a log_2 */
  	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
  
  	if (index < 0)
  		index = 0;
  out:
  	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
  		 (unsigned long) ONE_FP/inv_w, maxlen, index);
  
  	return index;
  }
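
  /*
   * Hand-worked example (illustrative only, assuming min_slot_shift =
   * FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX = 22, as implied by the
   * bit layout described at the top of this file): for weight 1
   * (inv_w = ONE_FP) and maxlen = 1500, slot_size = 1500 << 30,
   * size_map = slot_size >> 22 = 1500 << 8, __fls(size_map) = 18 and
   * index = 19. Since 1500 is not a power of two, the final correction
   * (which moves exact powers of two down one group) does not apply,
   * and the flow lands in the group whose slot size is
   * 2^(19 + 22 - FRAC_BITS) = 2048 bytes, the smallest that fits it.
   */
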
  static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
  static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
  			     enum update_reason);
  
  static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
  			 u32 lmax, u32 weight)
  {
  	INIT_LIST_HEAD(&agg->active);
  	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
  
  	agg->lmax = lmax;
  	agg->class_weight = weight;
  }
  
  static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
  					  u32 lmax, u32 weight)
  {
  	struct qfq_aggregate *agg;

  	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
  		if (agg->lmax == lmax && agg->class_weight == weight)
  			return agg;
  
  	return NULL;
  }

  /* Update aggregate as a function of the new number of classes. */
  static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
  			   int new_num_classes)
  {
  	u32 new_agg_weight;
  
  	if (new_num_classes == q->max_agg_classes)
  		hlist_del_init(&agg->nonfull_next);
  
  	if (agg->num_classes > new_num_classes &&
  	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
  		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
  	/* The next assignment may let
  	 * agg->initial_budget > agg->budgetmax
  	 * hold, we will take it into account in charge_actual_service().
  	 */
  	agg->budgetmax = new_num_classes * agg->lmax;
  	new_agg_weight = agg->class_weight * new_num_classes;
  	agg->inv_w = ONE_FP/new_agg_weight;
  
  	if (agg->grp == NULL) {
  		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
  				       q->min_slot_shift);
  		agg->grp = &q->groups[i];
  	}
  
  	q->wsum +=
  		(int) agg->class_weight * (new_num_classes - agg->num_classes);
  	q->iwsum = ONE_FP / q->wsum;
  
  	agg->num_classes = new_num_classes;
  }
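
  /*
   * Example with made-up numbers: growing an aggregate whose classes
   * have class_weight = 2 from two to three classes sets
   * budgetmax = 3 * lmax and inv_w = ONE_FP/6 (the aggregate weight
   * becomes 2 * 3), and adds 2 to q->wsum before iwsum is recomputed
   * as ONE_FP / wsum.
   */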
  
  /* Add class to aggregate. */
  static void qfq_add_to_agg(struct qfq_sched *q,
  			   struct qfq_aggregate *agg,
  			   struct qfq_class *cl)
  {
  	cl->agg = agg;
  
  	qfq_update_agg(q, agg, agg->num_classes+1);
  	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
  		list_add_tail(&cl->alist, &agg->active);
  		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
  		    cl && q->in_serv_agg != agg) /* agg was inactive */
  			qfq_activate_agg(q, agg, enqueue); /* schedule agg */
  	}
  }
  static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);

  static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
  {
  	if (!hlist_unhashed(&agg->nonfull_next))
  		hlist_del_init(&agg->nonfull_next);
  	q->wsum -= agg->class_weight;
  	if (q->wsum != 0)
  		q->iwsum = ONE_FP / q->wsum;
  	if (q->in_serv_agg == agg)
  		q->in_serv_agg = qfq_choose_next_agg(q);
  	kfree(agg);
  }

  /* Deschedule class from within its parent aggregate. */
  static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
  {
  	struct qfq_aggregate *agg = cl->agg;


  	list_del(&cl->alist); /* remove from RR queue of the aggregate */
  	if (list_empty(&agg->active)) /* agg is now inactive */
  		qfq_deactivate_agg(q, agg);
  }
  /* Remove class from its parent aggregate. */
  static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
  {
  	struct qfq_aggregate *agg = cl->agg;

  	cl->agg = NULL;
  	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
  		qfq_destroy_agg(q, agg);
  		return;
  	}
  	qfq_update_agg(q, agg, agg->num_classes-1);
  }

  /* Deschedule class and remove it from its parent aggregate. */
  static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
  {
  	if (cl->qdisc->q.qlen > 0) /* class is active */
  		qfq_deactivate_class(q, cl);

  	qfq_rm_from_agg(q, cl);
  }
  /* Move class to a new aggregate, matching the new class weight and/or lmax */
  static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
  			   u32 lmax)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
  
  	if (new_agg == NULL) { /* create new aggregate */
  		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
  		if (new_agg == NULL)
  			return -ENOBUFS;
  		qfq_init_agg(q, new_agg, lmax, weight);
  	}
  	qfq_deact_rm_from_agg(q, cl);
  	qfq_add_to_agg(q, new_agg, cl);
  
  	return 0;
  }

  static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
  			    struct nlattr **tca, unsigned long *arg)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl = (struct qfq_class *)*arg;
  	bool existing = false;
  	struct nlattr *tb[TCA_QFQ_MAX + 1];
  	struct qfq_aggregate *new_agg = NULL;
  	u32 weight, lmax, inv_w;
  	int err;
  	int delta_w;
  
  	if (tca[TCA_OPTIONS] == NULL) {
  		pr_notice("qfq: no options\n");
  		return -EINVAL;
  	}
  
  	err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
  	if (err < 0)
  		return err;
  
  	if (tb[TCA_QFQ_WEIGHT]) {
  		weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
  		if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
  			pr_notice("qfq: invalid weight %u\n", weight);
  			return -EINVAL;
  		}
  	} else
  		weight = 1;
  	if (tb[TCA_QFQ_LMAX]) {
  		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
  		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
  			pr_notice("qfq: invalid max length %u\n", lmax);
  			return -EINVAL;
  		}
  	} else
  		lmax = psched_mtu(qdisc_dev(sch));

  	inv_w = ONE_FP / weight;
  	weight = ONE_FP / inv_w;
  
  	if (cl != NULL &&
  	    lmax == cl->agg->lmax &&
  	    weight == cl->agg->class_weight)
  		return 0; /* nothing to change */
  
  	delta_w = weight - (cl ? cl->agg->class_weight : 0);
  
  	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
  		pr_notice("qfq: total weight out of range (%d + %u)\n",
  			  delta_w, q->wsum);
  		return -EINVAL;
  	}
  
  	if (cl != NULL) { /* modify existing class */
  		if (tca[TCA_RATE]) {
  			err = gen_replace_estimator(&cl->bstats, NULL,
  						    &cl->rate_est,
  						    qdisc_root_sleeping_lock(sch),
  						    tca[TCA_RATE]);
  			if (err)
  				return err;
  		}
  		existing = true;
  		goto set_change_agg;
  	}
  	/* create and init new class */
  	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
  	if (cl == NULL)
  		return -ENOBUFS;
  
  	cl->refcnt = 1;
  	cl->common.classid = classid;
  	cl->deficit = lmax;
  
  	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
  				      &pfifo_qdisc_ops, classid);
  	if (cl->qdisc == NULL)
  		cl->qdisc = &noop_qdisc;
  
  	if (tca[TCA_RATE]) {
  		err = gen_new_estimator(&cl->bstats, NULL,
  					&cl->rate_est,
  					qdisc_root_sleeping_lock(sch),
  					tca[TCA_RATE]);
  		if (err)
  			goto destroy_class;
  	}
  
  	sch_tree_lock(sch);
  	qdisc_class_hash_insert(&q->clhash, &cl->common);
  	sch_tree_unlock(sch);
  
  	qdisc_class_hash_grow(sch, &q->clhash);
  set_change_agg:
  	sch_tree_lock(sch);
  	new_agg = qfq_find_agg(q, lmax, weight);
  	if (new_agg == NULL) { /* create new aggregate */
  		sch_tree_unlock(sch);
  		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
  		if (new_agg == NULL) {
  			err = -ENOBUFS;
  			gen_kill_estimator(&cl->bstats, &cl->rate_est);
  			goto destroy_class;
  		}
  		sch_tree_lock(sch);
  		qfq_init_agg(q, new_agg, lmax, weight);
  	}
  	if (existing)
  		qfq_deact_rm_from_agg(q, cl);
  	qfq_add_to_agg(q, new_agg, cl);
  	sch_tree_unlock(sch);
  	*arg = (unsigned long)cl;
  	return 0;
  
  destroy_class:
  	qdisc_destroy(cl->qdisc);
  	kfree(cl);
  	return err;
  }
  
  static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	qfq_rm_from_agg(q, cl);
  	gen_kill_estimator(&cl->bstats, &cl->rate_est);
  	qdisc_destroy(cl->qdisc);
  	kfree(cl);
  }
  
  static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl = (struct qfq_class *)arg;
  
  	if (cl->filter_cnt > 0)
  		return -EBUSY;
  
  	sch_tree_lock(sch);
  
  	qfq_purge_queue(cl);
  	qdisc_class_hash_remove(&q->clhash, &cl->common);
  
  	BUG_ON(--cl->refcnt == 0);
  	/*
  	 * This shouldn't happen: we "hold" one cops->get() when called
  	 * from tc_ctl_tclass; the destroy method is done from cops->put().
  	 */
  
  	sch_tree_unlock(sch);
  	return 0;
  }
  
  static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
  {
  	struct qfq_class *cl = qfq_find_class(sch, classid);
  
  	if (cl != NULL)
  		cl->refcnt++;
  
  	return (unsigned long)cl;
  }
  
  static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
  {
  	struct qfq_class *cl = (struct qfq_class *)arg;
  
  	if (--cl->refcnt == 0)
  		qfq_destroy_class(sch, cl);
  }
  static struct tcf_proto __rcu **qfq_tcf_chain(struct Qdisc *sch,
  					      unsigned long cl)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  
  	if (cl)
  		return NULL;
  
  	return &q->filter_list;
  }
  
  static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
  				  u32 classid)
  {
  	struct qfq_class *cl = qfq_find_class(sch, classid);
  
  	if (cl != NULL)
  		cl->filter_cnt++;
  
  	return (unsigned long)cl;
  }
  
  static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
  {
  	struct qfq_class *cl = (struct qfq_class *)arg;
  
  	cl->filter_cnt--;
  }
  
  static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
  			   struct Qdisc *new, struct Qdisc **old)
  {
  	struct qfq_class *cl = (struct qfq_class *)arg;
  
  	if (new == NULL) {
  		new = qdisc_create_dflt(sch->dev_queue,
  					&pfifo_qdisc_ops, cl->common.classid);
  		if (new == NULL)
  			new = &noop_qdisc;
  	}
  
  	sch_tree_lock(sch);
  	qfq_purge_queue(cl);
  	*old = cl->qdisc;
  	cl->qdisc = new;
  	sch_tree_unlock(sch);
  	return 0;
  }
  
  static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
  {
  	struct qfq_class *cl = (struct qfq_class *)arg;
  
  	return cl->qdisc;
  }
  
  static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
  			  struct sk_buff *skb, struct tcmsg *tcm)
  {
  	struct qfq_class *cl = (struct qfq_class *)arg;
  	struct nlattr *nest;
  
  	tcm->tcm_parent	= TC_H_ROOT;
  	tcm->tcm_handle	= cl->common.classid;
  	tcm->tcm_info	= cl->qdisc->handle;
  
  	nest = nla_nest_start(skb, TCA_OPTIONS);
  	if (nest == NULL)
  		goto nla_put_failure;
  	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
  	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
  		goto nla_put_failure;
  	return nla_nest_end(skb, nest);
  
  nla_put_failure:
  	nla_nest_cancel(skb, nest);
  	return -EMSGSIZE;
  }
  
  static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
  				struct gnet_dump *d)
  {
  	struct qfq_class *cl = (struct qfq_class *)arg;
  	struct tc_qfq_stats xstats;
  
  	memset(&xstats, 0, sizeof(xstats));

  	xstats.weight = cl->agg->class_weight;
  	xstats.lmax = cl->agg->lmax;

  	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
  	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
  	    gnet_stats_copy_queue(d, NULL,
  				  &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
  		return -1;
  
  	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
  }
  
  static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl;
  	unsigned int i;
  
  	if (arg->stop)
  		return;
  
  	for (i = 0; i < q->clhash.hashsize; i++) {
  		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
  			if (arg->count < arg->skip) {
  				arg->count++;
  				continue;
  			}
  			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
  				arg->stop = 1;
  				return;
  			}
  			arg->count++;
  		}
  	}
  }
  
  static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
  				      int *qerr)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl;
  	struct tcf_result res;
  	struct tcf_proto *fl;
  	int result;
  
  	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
  		pr_debug("qfq_classify: found %d\n", skb->priority);
  		cl = qfq_find_class(sch, skb->priority);
  		if (cl != NULL)
  			return cl;
  	}
  
  	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
  	fl = rcu_dereference_bh(q->filter_list);
  	result = tc_classify(skb, fl, &res);
  	if (result >= 0) {
  #ifdef CONFIG_NET_CLS_ACT
  		switch (result) {
  		case TC_ACT_QUEUED:
  		case TC_ACT_STOLEN:
  			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
  		case TC_ACT_SHOT:
  			return NULL;
  		}
  #endif
  		cl = (struct qfq_class *)res.class;
  		if (cl == NULL)
  			cl = qfq_find_class(sch, res.classid);
  		return cl;
  	}
  
  	return NULL;
  }
  
  /* Generic comparison function, handling wraparound. */
  static inline int qfq_gt(u64 a, u64 b)
  {
  	return (s64)(a - b) > 0;
  }
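
  /*
   * Example (illustrative): qfq_gt(2, ~0ULL) is true, because
   * (s64)(2 - ~0ULL) = 3 > 0; a timestamp that has just wrapped around
   * is still ordered after one close to the top of the 64-bit range.
   */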
  
  /* Round a precise timestamp to its slotted value. */
  static inline u64 qfq_round_down(u64 ts, unsigned int shift)
  {
  	return ts & ~((1ULL << shift) - 1);
  }
  
  /* return the pointer to the group with lowest index in the bitmap */
  static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
  					unsigned long bitmap)
  {
  	int index = __ffs(bitmap);
  	return &q->groups[index];
  }
  /* Calculate a mask to mimic what would be ffs_from(). */
  static inline unsigned long mask_from(unsigned long bitmap, int from)
  {
  	return bitmap & ~((1UL << from) - 1);
  }
  
  /*
   * The state computation relies on ER=0, IR=1, EB=2, IB=3
   * First compute eligibility comparing grp->S, q->V,
   * then check if someone is blocking us and possibly add EB
   */
  static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
  {
  	/* if S > V we are not eligible */
  	unsigned int state = qfq_gt(grp->S, q->V);
  	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
  	struct qfq_group *next;
  
  	if (mask) {
  		next = qfq_ffs(q, mask);
  		if (qfq_gt(grp->F, next->F))
  			state |= EB;
  	}
  
  	return state;
  }
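
  /*
   * Example (illustrative): a group whose S is ahead of V and whose F
   * exceeds the F of the first group already in ER gets state
   * IR | EB = IB, i.e. ineligible and blocked, consistent with the
   * encoding above.
   */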
  
  
  /*
   * In principle
   *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
   *	q->bitmaps[src] &= ~mask;
   * but we should make sure that src != dst
   */
  static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
  				   int src, int dst)
  {
  	q->bitmaps[dst] |= q->bitmaps[src] & mask;
  	q->bitmaps[src] &= ~mask;
  }
  
  static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
  {
  	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
  	struct qfq_group *next;
  
  	if (mask) {
  		next = qfq_ffs(q, mask);
  		if (!qfq_gt(next->F, old_F))
  			return;
  	}
  
  	mask = (1UL << index) - 1;
  	qfq_move_groups(q, mask, EB, ER);
  	qfq_move_groups(q, mask, IB, IR);
  }
  
  /*
   * perhaps
   *
  	old_V ^= q->V;
  	old_V >>= q->min_slot_shift;
  	if (old_V) {
  		...
  	}
   *
   */
  static void qfq_make_eligible(struct qfq_sched *q)
  {
  	unsigned long vslot = q->V >> q->min_slot_shift;
  	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
  
  	if (vslot != old_vslot) {
  		unsigned long mask;
  		int last_flip_pos = fls(vslot ^ old_vslot);
  
  		if (last_flip_pos > 31) /* higher than the number of groups */
  			mask = ~0UL;    /* make all groups eligible */
  		else
  			mask = (1UL << last_flip_pos) - 1;
  		qfq_move_groups(q, mask, IR, ER);
  		qfq_move_groups(q, mask, IB, EB);
  	}
  }
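
  /*
   * Example with invented values: if oldV and V fall in virtual slots 5
   * and 8 respectively, then vslot ^ old_vslot = 13, fls() = 4, and
   * mask = 0xf, so groups 0-3 move from IR to ER and from IB to EB; the
   * higher groups keep their state because their coarser slot boundary
   * has not been crossed.
   */
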
  /*
   * The index of the slot in which the input aggregate agg is to be
   * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
   * and not a '-1' because the start time of the group may be moved
   * backward by one slot after the aggregate has been inserted, and
   * this would cause non-empty slots to be right-shifted by one
   * position.
   *
   * QFQ+ fully satisfies this bound to the slot index if the parameters
   * of the classes are not changed dynamically, and if QFQ+ never
   * happens to postpone the service of agg unjustly, i.e., it never
   * happens that the aggregate becomes backlogged and eligible, or just
   * eligible, while an aggregate with a higher approximated finish time
   * is being served. In particular, in this case QFQ+ guarantees that
   * the timestamps of agg are low enough that the slot index is never
   * higher than 2. Unfortunately, QFQ+ cannot provide the same
   * guarantee if it happens to unjustly postpone the service of agg, or
   * if the parameters of some class are changed.
   *
   * As for the first event, i.e., an out-of-order service, the
   * upper bound to the slot index guaranteed by QFQ+ grows to
   * 2 +
   * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
   * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
   *
   * The following function deals with this problem by backward-shifting
   * the timestamps of agg, if needed, so as to guarantee that the slot
   * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
   * cause the service of other aggregates to be postponed, yet the
   * worst-case guarantees of these aggregates are not violated.  In
   * fact, in case of no out-of-order service, the timestamps of agg
   * would have been even lower than they are after the backward shift,
   * because QFQ+ would have guaranteed a maximum value equal to 2 for
   * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
   * service is postponed because of the backward-shift would have
   * however waited for the service of agg before being served.
   *
   * The other event that may cause the slot index to be higher than 2
   * for agg is a recent change of the parameters of some class. If the
   * weight of a class is increased or the lmax (max_pkt_size) of the
   * class is decreased, then a new aggregate with smaller slot size
   * than the original parent aggregate of the class may happen to be
   * activated. The activation of this aggregate should be properly
   * delayed to when the service of the class has finished in the ideal
   * system tracked by QFQ+. If the activation of the aggregate is not
   * delayed to this reference time instant, then this aggregate may be
   * unjustly served before other aggregates waiting for service. This
   * may cause the above bound to the slot index to be violated for some
   * of these unlucky aggregates.
   *
   * Instead of delaying the activation of the new aggregate, which is
   * quite complex, the above-discussed capping of the slot index is
   * used to handle also the consequences of a change of the parameters
   * of a class.
   */
  static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
  			    u64 roundedS)
  {
  	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
  	unsigned int i; /* slot index in the bucket list */
  
  	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
  		u64 deltaS = roundedS - grp->S -
  			((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
  		agg->S -= deltaS;
  		agg->F -= deltaS;
  		slot = QFQ_MAX_SLOTS - 2;
  	}
  
  	i = (grp->front + slot) % QFQ_MAX_SLOTS;

  	hlist_add_head(&agg->next, &grp->slots[i]);
  	__set_bit(slot, &grp->full_slots);
  }
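
  /*
   * Example (illustrative): with roundedS - grp->S equal to three slot
   * lengths, the aggregate is linked into the bucket three positions
   * after grp->front (modulo QFQ_MAX_SLOTS) and bit 3 of full_slots is
   * set; the backward shift above only kicks in if the computed slot
   * would exceed QFQ_MAX_SLOTS - 2.
   */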
  
  /* Maybe introduce hlist_first_entry?? */
  static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
  {
  	return hlist_entry(grp->slots[grp->front].first,
  			   struct qfq_aggregate, next);
  }
  
  /*
   * remove the entry from the slot
   */
  static void qfq_front_slot_remove(struct qfq_group *grp)
  {
  	struct qfq_aggregate *agg = qfq_slot_head(grp);

  	BUG_ON(!agg);
  	hlist_del(&agg->next);
  	if (hlist_empty(&grp->slots[grp->front]))
  		__clear_bit(0, &grp->full_slots);
  }
  
  /*
   * Returns the first aggregate in the first non-empty bucket of the
   * group. As a side effect, adjusts the bucket list so the first
   * non-empty bucket is at position 0 in full_slots.
   */
  static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
  {
  	unsigned int i;
  
  	pr_debug("qfq slot_scan: grp %u full %#lx\n",
  		 grp->index, grp->full_slots);
  
  	if (grp->full_slots == 0)
  		return NULL;
  
  	i = __ffs(grp->full_slots);  /* zero based */
  	if (i > 0) {
  		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
  		grp->full_slots >>= i;
  	}
  
  	return qfq_slot_head(grp);
  }
  
  /*
   * adjust the bucket list. When the start time of a group decreases,
   * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
   * move the objects. The mask of occupied slots must be shifted
   * because we use ffs() to find the first non-empty slot.
   * This covers decreases in the group's start time, but what about
   * increases of the start time?
   * Here too we should make sure that i is less than QFQ_MAX_SLOTS (32).
   */
  static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
  {
  	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
  
  	grp->full_slots <<= i;
  	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
  }
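
  /*
   * Example (illustrative): if the new roundedS is two slots lower than
   * grp->S, then i = 2, full_slots is shifted left by two and front
   * moves back by two (mod QFQ_MAX_SLOTS), so already-queued aggregates
   * keep their buckets while bucket 0 now corresponds to the earlier
   * start time.
   */
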
  static void qfq_update_eligible(struct qfq_sched *q)
  {
  	struct qfq_group *grp;
  	unsigned long ineligible;
  
  	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
  	if (ineligible) {
  		if (!q->bitmaps[ER]) {
  			grp = qfq_ffs(q, ineligible);
  			if (qfq_gt(grp->S, q->V))
  				q->V = grp->S;
  		}
  		qfq_make_eligible(q);
  	}
  }
  /* Dequeue head packet of the head class in the DRR queue of the aggregate. */
  static void agg_dequeue(struct qfq_aggregate *agg,
  			struct qfq_class *cl, unsigned int len)
  {
  	qdisc_dequeue_peeked(cl->qdisc);

  	cl->deficit -= (int) len;

  	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
  		list_del(&cl->alist);
  	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
  		cl->deficit += agg->lmax;
  		list_move_tail(&cl->alist, &agg->active);
  	}
  }
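
  /*
   * DRR example (illustrative): a class in an aggregate with lmax 1500
   * starts a round with deficit 1500; after dequeueing a 1000-byte
   * packet the deficit is 500, so if its next packet is 1200 bytes the
   * class is moved to the tail of agg->active with deficit
   * 500 + 1500 = 2000 and sends that packet in a later round.
   */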
  
  static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
  					   struct qfq_class **cl,
  					   unsigned int *len)
  {
  	struct sk_buff *skb;

  	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
  	skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
  	if (skb == NULL)
  		WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
  	else
  		*len = qdisc_pkt_len(skb);
  
  	return skb;
  }
  
  /* Update F according to the actual service received by the aggregate. */
  static inline void charge_actual_service(struct qfq_aggregate *agg)
  {
  	/* Compute the service received by the aggregate, taking into
  	 * account that, after decreasing the number of classes in
  	 * agg, it may happen that
  	 * agg->initial_budget - agg->budget > agg->budgetmax
  	 */
  	u32 service_received = min(agg->budgetmax,
  				   agg->initial_budget - agg->budget);
  
  	agg->F = agg->S + (u64)service_received * agg->inv_w;
  }
  /* Assign a reasonable start time for a new aggregate in group i.
   * Admissible values for \hat(F) are multiples of \sigma_i
   * no greater than V+\sigma_i . Larger values mean that
   * we had a wraparound so we consider the timestamp to be stale.
   *
   * If F is not stale and F >= V then we set S = F.
   * Otherwise we should assign S = V, but this may violate
   * the ordering in EB (see [2]). So, if we have groups in ER,
   * set S to the F_j of the first group j which would be blocking us.
   * We are guaranteed not to move S backward because
   * otherwise our group i would still be blocked.
   */
  static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
  {
  	unsigned long mask;
  	u64 limit, roundedF;
  	int slot_shift = agg->grp->slot_shift;
  
  	roundedF = qfq_round_down(agg->F, slot_shift);
  	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
  
  	if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
  		/* timestamp was stale */
  		mask = mask_from(q->bitmaps[ER], agg->grp->index);
  		if (mask) {
  			struct qfq_group *next = qfq_ffs(q, mask);
  			if (qfq_gt(roundedF, next->F)) {
  				if (qfq_gt(limit, next->F))
  					agg->S = next->F;
  				else /* preserve timestamp correctness */
  					agg->S = limit;
  				return;
  			}
  		}
  		agg->S = q->V;
  	} else  /* timestamp is not stale */
  		agg->S = agg->F;
  }
  
  /* Update the timestamps of agg before scheduling/rescheduling it for
   * service.  In particular, assign to agg->F its maximum possible
   * value, i.e., the virtual finish time with which the aggregate
   * should be labeled if it used all its budget once in service.
   */
  static inline void
  qfq_update_agg_ts(struct qfq_sched *q,
  		    struct qfq_aggregate *agg, enum update_reason reason)
  {
  	if (reason != requeue)
  		qfq_update_start(q, agg);
  	else /* just charge agg for the service received */
  		agg->S = agg->F;
  
  	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
  }
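
  /*
   * Example (illustrative): an aggregate holding two classes of weight
   * 2 and lmax 1500 has budgetmax = 3000 and inv_w = ONE_FP/4, so each
   * (re)scheduling sets F = S + 750 in fixed point, the finish time it
   * would get by consuming its whole budget at its reserved rate.
   */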
  
  static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
  static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
  	struct qfq_class *cl;
  	struct sk_buff *skb = NULL;
  	/* next-packet len, 0 means no more active classes in in-service agg */
  	unsigned int len = 0;

  	if (in_serv_agg == NULL)
  		return NULL;
  	if (!list_empty(&in_serv_agg->active))
  		skb = qfq_peek_skb(in_serv_agg, &cl, &len);

  	/*
  	 * If there are no active classes in the in-service aggregate,
  	 * or if the aggregate has not enough budget to serve its next
  	 * class, then choose the next aggregate to serve.
  	 */
  	if (len == 0 || in_serv_agg->budget < len) {
  		charge_actual_service(in_serv_agg);
  
  		/* recharge the budget of the aggregate */
  		in_serv_agg->initial_budget = in_serv_agg->budget =
  			in_serv_agg->budgetmax;
  		if (!list_empty(&in_serv_agg->active)) {
  			/*
  			 * Still active: reschedule for
  			 * service. Possible optimization: if no other
  			 * aggregate is active, then there is no point
  			 * in rescheduling this aggregate, and we can
  			 * just keep it as the in-service one. This
  			 * should be however a corner case, and to
  			 * handle it, we would need to maintain an
  			 * extra num_active_aggs field.
  			*/
  			qfq_update_agg_ts(q, in_serv_agg, requeue);
  			qfq_schedule_agg(q, in_serv_agg);
  		} else if (sch->q.qlen == 0) { /* no aggregate to serve */
  			q->in_serv_agg = NULL;
  			return NULL;
  		}
  
  		/*
  		 * If we get here, there are other aggregates queued:
  		 * choose the new aggregate to serve.
  		 */
  		in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
  		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
  	}
  	if (!skb)
  		return NULL;
  
  	sch->q.qlen--;
  	qdisc_bstats_update(sch, skb);
  	agg_dequeue(in_serv_agg, cl, len);
  	/* If lmax is lowered, through qfq_change_class, for a class
  	 * owning pending packets with larger size than the new value
  	 * of lmax, then the following condition may hold.
  	 */
  	if (unlikely(in_serv_agg->budget < len))
  		in_serv_agg->budget = 0;
  	else
  		in_serv_agg->budget -= len;
  	q->V += (u64)len * q->iwsum;
  	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
  		 len, (unsigned long long) in_serv_agg->F,
  		 (unsigned long long) q->V);

  	return skb;
  }

  static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
  {
  	struct qfq_group *grp;
  	struct qfq_aggregate *agg, *new_front_agg;
  	u64 old_F;

  	qfq_update_eligible(q);
  	q->oldV = q->V;
  
  	if (!q->bitmaps[ER])
  		return NULL;
  
  	grp = qfq_ffs(q, q->bitmaps[ER]);
  	old_F = grp->F;
  
  	agg = qfq_slot_head(grp);

  	/* agg starts to be served, remove it from schedule */
  	qfq_front_slot_remove(grp);
  
  	new_front_agg = qfq_slot_scan(grp);
  
  	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
  		__clear_bit(grp->index, &q->bitmaps[ER]);
  	else {
  		u64 roundedS = qfq_round_down(new_front_agg->S,
  					      grp->slot_shift);
  		unsigned int s;
  
  		if (grp->S == roundedS)
  			return agg;
  		grp->S = roundedS;
  		grp->F = roundedS + (2ULL << grp->slot_shift);
  		__clear_bit(grp->index, &q->bitmaps[ER]);
  		s = qfq_calc_state(q, grp);
  		__set_bit(grp->index, &q->bitmaps[s]);
  	}
  	qfq_unblock_groups(q, grp->index, old_F);

  	return agg;
  }
  static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl;
  	struct qfq_aggregate *agg;
  	int err = 0;
  
  	cl = qfq_classify(skb, sch, &err);
  	if (cl == NULL) {
  		if (err & __NET_XMIT_BYPASS)
  			qdisc_qstats_drop(sch);
  		kfree_skb(skb);
  		return err;
  	}
  	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
  	if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
  		pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
  			 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
  		err = qfq_change_agg(sch, cl, cl->agg->class_weight,
  				     qdisc_pkt_len(skb));
  		if (err)
  			return err;
  	}
  	err = qdisc_enqueue(skb, cl->qdisc);
  	if (unlikely(err != NET_XMIT_SUCCESS)) {
  		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
  		if (net_xmit_drop_count(err)) {
  			cl->qstats.drops++;
  			qdisc_qstats_drop(sch);
  		}
  		return err;
  	}
  
  	bstats_update(&cl->bstats, skb);
  	++sch->q.qlen;
  	agg = cl->agg;
  	/* if the queue was not empty, then done here */
  	if (cl->qdisc->q.qlen != 1) {
  		if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
  		    list_first_entry(&agg->active, struct qfq_class, alist)
  		    == cl && cl->deficit < qdisc_pkt_len(skb))
  			list_move_tail(&cl->alist, &agg->active);
  		return err;
  	}
  
  	/* schedule class for service within the aggregate */
  	cl->deficit = agg->lmax;
  	list_add_tail(&cl->alist, &agg->active);

  	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
  	    q->in_serv_agg == agg)
  		return err; /* non-empty or in service, nothing else to do */

  	qfq_activate_agg(q, agg, enqueue);
  
  	return err;
  }
  
  /*
   * Schedule aggregate according to its timestamps.
   */
  static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
  {
  	struct qfq_group *grp = agg->grp;
  	u64 roundedS;
  	int s;
  	roundedS = qfq_round_down(agg->S, grp->slot_shift);
  
  	/*
  	 * Insert agg in the correct bucket.
  	 * If agg->S >= grp->S we don't need to adjust the
  	 * bucket list and simply go to the insertion phase.
  	 * Otherwise grp->S is decreasing, we must make room
  	 * in the bucket list, and also recompute the group state.
  	 * Finally, if there were no flows in this group and nobody
  	 * was in ER make sure to adjust V.
  	 */
  	if (grp->full_slots) {
  		if (!qfq_gt(grp->S, agg->S))
  			goto skip_update;
  		/* create a slot for this agg->S */
  		qfq_slot_rotate(grp, roundedS);
  		/* group was surely ineligible, remove */
  		__clear_bit(grp->index, &q->bitmaps[IR]);
  		__clear_bit(grp->index, &q->bitmaps[IB]);
  	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
  		   q->in_serv_agg == NULL)
  		q->V = roundedS;
  
  	grp->S = roundedS;
  	grp->F = roundedS + (2ULL << grp->slot_shift);
  	s = qfq_calc_state(q, grp);
  	__set_bit(grp->index, &q->bitmaps[s]);
  
  	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
  		 s, q->bitmaps[s],
  		 (unsigned long long) agg->S,
  		 (unsigned long long) agg->F,
  		 (unsigned long long) q->V);
  
  skip_update:
  	qfq_slot_insert(grp, agg, roundedS);
  }
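
  /*
   * Worked example for the rounding above, with made-up values: if
   * grp->slot_shift = 22 and agg->S = 0x00c12345, qfq_round_down() clears
   * the 22 low-order bits, so roundedS = 0x00c00000; the group finish time
   * then becomes grp->F = roundedS + (2ULL << 22) = 0x01400000, and
   * qfq_slot_insert() places the aggregate in the bucket corresponding
   * to roundedS.
   */
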
  /* Update agg ts and schedule agg for service */
  static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
  			     enum update_reason reason)
  {
  	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
  	qfq_update_agg_ts(q, agg, reason);
  	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
  		q->in_serv_agg = agg; /* start serving this aggregate */
  		 /* update V: to be in service, agg must be eligible */
  		q->oldV = q->V = agg->S;
  	} else if (agg != q->in_serv_agg)
  		qfq_schedule_agg(q, agg);
  }
  static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
  			    struct qfq_aggregate *agg)
  {
  	unsigned int i, offset;
  	u64 roundedS;
  	roundedS = qfq_round_down(agg->S, grp->slot_shift);
  	offset = (roundedS - grp->S) >> grp->slot_shift;

  	i = (grp->front + offset) % QFQ_MAX_SLOTS;
  	hlist_del(&agg->next);
  	if (hlist_empty(&grp->slots[i]))
  		__clear_bit(offset, &grp->full_slots);
  }
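
  /*
   * Numeric sketch of the bucket lookup above (hypothetical values): with
   * grp->S = 0x00800000, agg->S = 0x01000000 and grp->slot_shift = 22,
   * roundedS = 0x01000000 and offset = (0x01000000 - 0x00800000) >> 22 = 2,
   * so with grp->front = 5 the aggregate is unlinked from slot
   * (5 + 2) % QFQ_MAX_SLOTS.
   */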
  
  /*
   * Called to forcibly deschedule an aggregate.  If the aggregate is
   * not in the front bucket, or if the latter has other aggregates in
   * the front bucket, we can simply remove the aggregate with no other
   * side effects.
   * Otherwise we must propagate the event up.
   */
  static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
  {
  	struct qfq_group *grp = agg->grp;
  	unsigned long mask;
  	u64 roundedS;
  	int s;
  	if (agg == q->in_serv_agg) {
  		charge_actual_service(agg);
  		q->in_serv_agg = qfq_choose_next_agg(q);
  		return;
  	}
  
  	agg->F = agg->S;
  	qfq_slot_remove(q, grp, agg);
  
  	if (!grp->full_slots) {
  		__clear_bit(grp->index, &q->bitmaps[IR]);
  		__clear_bit(grp->index, &q->bitmaps[EB]);
  		__clear_bit(grp->index, &q->bitmaps[IB]);
  
  		if (test_bit(grp->index, &q->bitmaps[ER]) &&
  		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
  			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
  			if (mask)
  				mask = ~((1UL << __fls(mask)) - 1);
  			else
  				mask = ~0UL;
  			qfq_move_groups(q, mask, EB, ER);
  			qfq_move_groups(q, mask, IB, IR);
  		}
  		__clear_bit(grp->index, &q->bitmaps[ER]);
  	} else if (hlist_empty(&grp->slots[grp->front])) {
  		agg = qfq_slot_scan(grp);
  		roundedS = qfq_round_down(agg->S, grp->slot_shift);
  		if (grp->S != roundedS) {
  			__clear_bit(grp->index, &q->bitmaps[ER]);
  			__clear_bit(grp->index, &q->bitmaps[IR]);
  			__clear_bit(grp->index, &q->bitmaps[EB]);
  			__clear_bit(grp->index, &q->bitmaps[IB]);
  			grp->S = roundedS;
  			grp->F = roundedS + (2ULL << grp->slot_shift);
  			s = qfq_calc_state(q, grp);
  			__set_bit(grp->index, &q->bitmaps[s]);
  		}
  	}
  }
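
  /*
   * Purely arithmetic illustration of the mask handling above, with a
   * hypothetical bitmap: if mask = q->bitmaps[ER] & ((1UL << grp->index) - 1)
   * comes out as 0x5 (bits 0 and 2 set), then __fls(mask) = 2 and the final
   * mask = ~((1UL << 2) - 1) keeps bit 2 and everything above it, which is
   * the set of groups that the two qfq_move_groups() calls may move.
   */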
  
  static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl = (struct qfq_class *)arg;
  
  	if (cl->qdisc->q.qlen == 0)
  		qfq_deactivate_class(q, cl);
  }
  static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
  				       struct hlist_head *slot)
  {
  	struct qfq_aggregate *agg;
  	struct qfq_class *cl;
  	unsigned int len;
  	hlist_for_each_entry(agg, slot, next) {
  		list_for_each_entry(cl, &agg->active, alist) {
  
  			if (!cl->qdisc->ops->drop)
  				continue;
  
  			len = cl->qdisc->ops->drop(cl->qdisc);
  			if (len > 0) {
  				if (cl->qdisc->q.qlen == 0)
  					qfq_deactivate_class(q, cl);
  
  				return len;
  			}
  		}
  	}
  	return 0;
  }
  static unsigned int qfq_drop(struct Qdisc *sch)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_group *grp;
  	unsigned int i, j, len;
  
  	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
  		grp = &q->groups[i];
  		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
  			len = qfq_drop_from_slot(q, &grp->slots[j]);
  			if (len > 0) {
  				sch->q.qlen--;
  				return len;
  			}
  		}

  	}
  
  	return 0;
  }
  
  static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_group *grp;
  	int i, j, err;
  	u32 max_cl_shift, maxbudg_shift, max_classes;
  
  	err = qdisc_class_hash_init(&q->clhash);
  	if (err < 0)
  		return err;
  	if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
  		max_classes = QFQ_MAX_AGG_CLASSES;
  	else
  		max_classes = qdisc_dev(sch)->tx_queue_len + 1;
  	/* max_cl_shift = floor(log_2(max_classes)) */
  	max_cl_shift = __fls(max_classes);
  	q->max_agg_classes = 1<<max_cl_shift;
  
  	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
  	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
  	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
  	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
  		grp = &q->groups[i];
  		grp->index = i;
  		grp->slot_shift = q->min_slot_shift + i;
  		for (j = 0; j < QFQ_MAX_SLOTS; j++)
  			INIT_HLIST_HEAD(&grp->slots[j]);
  	}
  	INIT_HLIST_HEAD(&q->nonfull_aggs);
  	return 0;
  }
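
  /*
   * Shift arithmetic sketch for qfq_init_qdisc(), using illustrative
   * constants (assumed here for the example: QFQ_MAX_AGG_CLASSES = 8,
   * QFQ_MTU_SHIFT = 16, FRAC_BITS = 30, QFQ_MAX_INDEX = 24) and a device
   * with tx_queue_len = 1000: max_classes is capped at 8, so
   * max_cl_shift = __fls(8) = 3, q->max_agg_classes = 1 << 3 = 8,
   * maxbudg_shift = 16 + 3 = 19 and q->min_slot_shift = 30 + 19 - 24 = 25.
   */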
  
  static void qfq_reset_qdisc(struct Qdisc *sch)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl;
  	unsigned int i;

  	for (i = 0; i < q->clhash.hashsize; i++) {
  		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
  			if (cl->qdisc->q.qlen > 0)
  				qfq_deactivate_class(q, cl);

  			qdisc_reset(cl->qdisc);
  		}
  	}
  	sch->q.qlen = 0;
  }
  
  static void qfq_destroy_qdisc(struct Qdisc *sch)
  {
  	struct qfq_sched *q = qdisc_priv(sch);
  	struct qfq_class *cl;
  	struct hlist_node *next;
  	unsigned int i;
  
  	tcf_destroy_chain(&q->filter_list);
  
  	for (i = 0; i < q->clhash.hashsize; i++) {
  		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
  					  common.hnode) {
  			qfq_destroy_class(sch, cl);
  		}
  	}
  	qdisc_class_hash_destroy(&q->clhash);
  }
  
  static const struct Qdisc_class_ops qfq_class_ops = {
  	.change		= qfq_change_class,
  	.delete		= qfq_delete_class,
  	.get		= qfq_get_class,
  	.put		= qfq_put_class,
  	.tcf_chain	= qfq_tcf_chain,
  	.bind_tcf	= qfq_bind_tcf,
  	.unbind_tcf	= qfq_unbind_tcf,
  	.graft		= qfq_graft_class,
  	.leaf		= qfq_class_leaf,
  	.qlen_notify	= qfq_qlen_notify,
  	.dump		= qfq_dump_class,
  	.dump_stats	= qfq_dump_class_stats,
  	.walk		= qfq_walk,
  };
  
  static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
  	.cl_ops		= &qfq_class_ops,
  	.id		= "qfq",
  	.priv_size	= sizeof(struct qfq_sched),
  	.enqueue	= qfq_enqueue,
  	.dequeue	= qfq_dequeue,
  	.peek		= qdisc_peek_dequeued,
  	.drop		= qfq_drop,
  	.init		= qfq_init_qdisc,
  	.reset		= qfq_reset_qdisc,
  	.destroy	= qfq_destroy_qdisc,
  	.owner		= THIS_MODULE,
  };
  
  static int __init qfq_init(void)
  {
  	return register_qdisc(&qfq_qdisc_ops);
  }
  
  static void __exit qfq_exit(void)
  {
  	unregister_qdisc(&qfq_qdisc_ops);
  }
  
  module_init(qfq_init);
  module_exit(qfq_exit);
  MODULE_LICENSE("GPL");