Commit 02b35081fc98f681411586d3acf9eaad8b8f6e07

Authored by Vivek Goyal
Committed by Jens Axboe
1 parent b6508c1618

cfq-iosched: Do group share accounting in IOPS when slice_idle=0

o Implement another CFQ mode where we charge a group by the number of
  requests it dispatches instead of by the time it consumes. Measuring
  service in terms of time is not possible when we are driving deep queue
  depths and requests from multiple cfq queues sit in the request queue
  at the same time. (A sketch of this charging logic follows the list
  below.)

o This mode currently gets activated when slice_idle is set to 0 and the
  associated disk supports NCQ. The idea is that on an NCQ disk with idling
  disabled, most queues dispatch one or more requests and are then expired,
  leaving us no way to measure the time they used. So start providing
  fairness in terms of IOPS instead.

o Currently IOPS mode works only with cfq group scheduling, as CFQ follows
  different scheduling algorithms for queue and group scheduling. These IOPS
  stats are used only for group scheduling, hence nothing should change in
  non-group mode.

o For CFQ group scheduling one can disable slice idling, so that we do not
  idle on queues and can drive deeper request queue depths (achieving better
  throughput), while group idle stays enabled, so one should still get
  service differentiation among groups.
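
For illustration, here is a minimal user-space sketch of the charging logic
described above. It is a simplified model, not the patch itself: the
struct group/struct queue types, DEFAULT_WEIGHT and charge_group() are
stand-ins invented for this example; in the real code the decision lives in
cfq_group_served() and the weight scaling in cfq_scale_slice().

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical, simplified stand-ins for the kernel structures. */
    struct group {
            unsigned int weight;           /* blkio weight of this group */
            unsigned long long vdisktime;  /* virtual time, the scheduling key */
    };

    struct queue {
            unsigned int slice_used;       /* time consumed (time mode) */
            unsigned int slice_dispatch;   /* requests dispatched (IOPS mode) */
    };

    #define DEFAULT_WEIGHT 500             /* stand-in default blkio weight */

    /* Mirrors iops_mode() below: no queue idling on an NCQ-capable disk. */
    static bool iops_mode(unsigned int slice_idle, bool hw_tag)
    {
            return slice_idle == 0 && hw_tag;
    }

    /*
     * Charge a group for one expired queue. In IOPS mode the charge is the
     * number of requests dispatched; otherwise it is the time used. The
     * charge is scaled inversely by the group's weight, so a heavier group
     * accumulates vdisktime more slowly and thus receives a larger share.
     */
    static void charge_group(struct group *g, const struct queue *q,
                             unsigned int slice_idle, bool hw_tag)
    {
            unsigned int charge = iops_mode(slice_idle, hw_tag) ?
                    q->slice_dispatch : q->slice_used;

            g->vdisktime += (unsigned long long)charge * DEFAULT_WEIGHT
                            / g->weight;
    }

    int main(void)
    {
            struct group heavy = { .weight = 1000 }, light = { .weight = 250 };
            struct queue q = { .slice_used = 8, .slice_dispatch = 32 };

            /* slice_idle=0 on an NCQ disk: both groups are charged 32 IOs, */
            charge_group(&heavy, &q, 0, true);
            charge_group(&light, &q, 0, true);

            /* but the lighter group's vdisktime advances 4x faster. */
            printf("heavy=%llu light=%llu\n", heavy.vdisktime, light.vdisktime);
            return 0;
    }

To exercise the real mode one would set slice_idle to 0 (e.g. via
/sys/block/<dev>/queue/iosched/slice_idle) on an NCQ-capable disk; with
group idle left enabled, groups still see service differentiation while
the device runs at full queue depth.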

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

Showing 1 changed file with 24 additions and 6 deletions

... ... @@ -378,6 +378,21 @@
378 378 &cfqg->service_trees[i][j]: NULL) \
379 379  
380 380  
  381 +static inline bool iops_mode(struct cfq_data *cfqd)
  382 +{
  383 + /*
  384 + * If we are not idling on queues and the drive supports NCQ, requests
  385 + * execute in parallel and measuring time is not possible in most
  386 + * cases, short of driving shallower queue depths, which would itself
  387 + * become a performance bottleneck. In such cases, switch to providing
  388 + * fairness in terms of number of IOs.
  389 + */
  390 + if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
  391 + return true;
  392 + else
  393 + return false;
  394 +}
  395 +
381 396 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
382 397 {
383 398 if (cfq_class_idle(cfqq))
... ... @@ -906,7 +921,6 @@
906 921 slice_used = cfqq->allocated_slice;
907 922 }
908 923  
909   - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
910 924 return slice_used;
911 925 }
912 926  
... ... @@ -914,19 +928,21 @@
914 928 struct cfq_queue *cfqq)
915 929 {
916 930 struct cfq_rb_root *st = &cfqd->grp_service_tree;
917   - unsigned int used_sl, charge_sl;
  931 + unsigned int used_sl, charge;
918 932 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
919 933 - cfqg->service_tree_idle.count;
920 934  
921 935 BUG_ON(nr_sync < 0);
922   - used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
  936 + used_sl = charge = cfq_cfqq_slice_usage(cfqq);
923 937  
924   - if (!cfq_cfqq_sync(cfqq) && !nr_sync)
925   - charge_sl = cfqq->allocated_slice;
  938 + if (iops_mode(cfqd))
  939 + charge = cfqq->slice_dispatch;
  940 + else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
  941 + charge = cfqq->allocated_slice;
926 942  
927 943 /* Can't update vdisktime while group is on service tree */
928 944 cfq_rb_erase(&cfqg->rb_node, st);
929   - cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
  945 + cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
930 946 __cfq_group_service_tree_add(st, cfqg);
931 947  
932 948 /* This group is being expired. Save the context */
... ... @@ -940,6 +956,8 @@
940 956  
941 957 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
942 958 st->min_vdisktime);
  959 + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u",
  960 + used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd));
943 961 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
944 962 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
945 963 }