Commit 8e5fc1a7320baf6076391607515dceb61319b36a

Authored by Stephane Eranian
Committed by Ingo Molnar
1 parent ba0cef3d14

perf_events: Fix transaction recovery in group_sched_in()

The group_sched_in() function uses a transactional approach to schedule
a group of events. In a group, either all events can be scheduled or
none are. To schedule each event in, the function calls event_sched_in().
In case of error, event_sched_out() is called on each event in the group.
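
For reference, the pre-patch flow in group_sched_in() looks roughly like
this (abridged from the old side of the diff below):

	pmu->start_txn(pmu);

	if (event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);
		return -EAGAIN;
	}

	/* Schedule in siblings as one group (if any): */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	if (!pmu->commit_txn(pmu))
		return 0;

group_error:
	/* undo any partial group before returning */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			break;
		event_sched_out(event, cpuctx, ctx);
	}
	event_sched_out(group_event, cpuctx, ctx);

	pmu->cancel_txn(pmu);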

The problem is that event_sched_out() does not completely cancel the
effects of event_sched_in(). Furthermore, event_sched_out() changes the
state of the event as if it had run, which is not true in this
particular case.

Those inconsistencies impact time tracking fields and may lead to events
in a group not all reporting the same time_enabled and time_running values.
This is demonstrated with the example below:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
1946101 unhalted_core_cycles (32.85% scaling, ena=829181, run=556827)
  11423 baclears (32.85% scaling, ena=829181, run=556827)
   7671 baclears (0.00% scaling, ena=556827, run=556827)

2250443 unhalted_core_cycles (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)

Notice that in the first group, the last baclears event does not
report the same timings as its siblings.
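
(For reference, the scaling percentage reported here appears to be
(ena - run)/ena: for the first two events, (829181 - 556827)/829181 is
roughly 32.85%, while the last baclears reports ena == run and hence
0.00% scaling, even though it was scheduled together with its siblings.)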

This issue comes from the fact that tstamp_stopped is updated
by event_sched_out() as if the event had actually run.

To solve the issue, we must ensure that, in case of error, there is
no change in the event state whatsoever. That means timings must
remain as they were when entering group_sched_in().

To do this, we defer updating tstamp_running until we know the
transaction succeeded. Therefore, we have split event_sched_in() into
two parts, separating out the update to tstamp_running.
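
Concretely, the other callers keep the old behavior via a thin wrapper
(taken from the diff below), while group_sched_in() calls
__event_sched_in() directly and only folds tstamp_running in once
pmu->commit_txn() has succeeded (see group_commit_event_sched_in()):

	static inline int
	event_sched_in(struct perf_event *event,
			 struct perf_cpu_context *cpuctx,
			 struct perf_event_context *ctx)
	{
		int ret = __event_sched_in(event, cpuctx, ctx);
		if (ret)
			return ret;
		event->tstamp_running += ctx->time - event->tstamp_stopped;
		return 0;
	}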

Similarly, in case of error, we do not want to update tstamp_stopped.
Therefore, we have split event_sched_out() into two parts, separating
out the update to tstamp_stopped.
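
Likewise, event_sched_out() becomes a wrapper that only touches
tstamp_stopped when __event_sched_out() reports that the event was
actually active; the group_error path calls __event_sched_out()
directly, so the timestamp is left untouched:

	static void
	event_sched_out(struct perf_event *event,
			 struct perf_cpu_context *cpuctx,
			 struct perf_event_context *ctx)
	{
		int ret;

		ret = __event_sched_out(event, cpuctx, ctx);
		if (ret)
			event->tstamp_stopped = ctx->time;
	}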

With this patch, we now get the following output:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
2492050 unhalted_core_cycles (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)

1852746 unhalted_core_cycles (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)

Note that the uneven timing between groups is a side effect of
the process spending most of its time sleeping, i.e., not enough
event rotations (but that's a separate issue).

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cb86b4c.41e9d80a.44e9.3e19@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 1 changed file with 63 additions and 13 deletions

@@ -412,8 +412,8 @@
 	return event->cpu == -1 || event->cpu == smp_processor_id();
 }
 
-static void
-event_sched_out(struct perf_event *event,
+static int
+__event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
@@ -432,14 +432,13 @@
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
-		return;
+		return 0;
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
@@ -448,9 +447,22 @@
 	ctx->nr_active--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
+	return 1;
 }
 
 static void
+event_sched_out(struct perf_event *event,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_event_context *ctx)
+{
+	int ret;
+
+	ret = __event_sched_out(event, cpuctx, ctx);
+	if (ret)
+		event->tstamp_stopped = ctx->time;
+}
+
+static void
 group_sched_out(struct perf_event *group_event,
 		struct perf_cpu_context *cpuctx,
 		struct perf_event_context *ctx)
@@ -647,7 +659,7 @@
 }
 
 static int
-event_sched_in(struct perf_event *event,
+__event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
@@ -667,8 +679,6 @@
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -679,6 +689,35 @@
 	return 0;
 }
 
+static inline int
+event_sched_in(struct perf_event *event,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_event_context *ctx)
+{
+	int ret = __event_sched_in(event, cpuctx, ctx);
+	if (ret)
+		return ret;
+	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	return 0;
+}
+
+static void
+group_commit_event_sched_in(struct perf_event *group_event,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+	u64 now = ctx->time;
+
+	group_event->tstamp_running += now - group_event->tstamp_stopped;
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+		event->tstamp_running += now - event->tstamp_stopped;
+	}
+}
+
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
@@ -692,7 +731,13 @@
 
 	pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx)) {
+	/*
+	 * use __event_sched_in() to delay updating tstamp_running
+	 * until the transaction is committed. In case of failure
+	 * we will keep an unmodified tstamp_running which is a
+	 * requirement to get correct timing information
+	 */
+	if (__event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
@@ -701,26 +746,31 @@
 	 * Schedule in siblings as one group (if any):
 	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx)) {
+		if (__event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
 	}
 
-	if (!pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		/* commit tstamp_running */
+		group_commit_event_sched_in(group_event, cpuctx, ctx);
 		return 0;
-
+	}
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
+	 *
+	 * use __event_sched_out() to avoid updating tstamp_stopped
+	 * because the event never actually ran
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
 			break;
-		event_sched_out(event, cpuctx, ctx);
+		__event_sched_out(event, cpuctx, ctx);
 	}
-	event_sched_out(group_event, cpuctx, ctx);
+	__event_sched_out(group_event, cpuctx, ctx);
 
 	pmu->cancel_txn(pmu);
 