Commit f66c6b2066b44d4ab8e8ac1ee4cae543738fe2ac
Committed by
Ingo Molnar
1 parent
0fd112e41c
Exists in
master
and in
39 other branches
perf_counter: update documentation
Impact: documentation fix This updates the perfcounter documentation to reflect recent changes. Signed-off-by: Paul Mackerras <paulus@samba.org>
Showing 1 changed file with 202 additions and 66 deletions Side-by-side Diff
Documentation/perf_counter/design.txt
... | ... | @@ -11,7 +11,9 @@ |
11 | 11 | |
12 | 12 | The Linux Performance Counter subsystem provides an abstraction of these |
13 | 13 | hardware capabilities. It provides per task and per CPU counters, counter |
14 | -groups, and it provides event capabilities on top of those. | |
14 | +groups, and it provides event capabilities on top of those. It | |
15 | +provides "virtual" 64-bit counters, regardless of the width of the | |
16 | +underlying hardware counters. | |
15 | 17 | |
16 | 18 | Performance counters are accessed via special file descriptors. |
17 | 19 | There's one file descriptor per virtual counter used. |
... | ... | @@ -20,7 +22,8 @@ |
20 | 22 | system call: |
21 | 23 | |
22 | 24 | int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, |
23 | - pid_t pid, int cpu, int group_fd); | |
25 | + pid_t pid, int cpu, int group_fd, | |
26 | + unsigned long flags); | |
24 | 27 | |
25 | 28 | The syscall returns the new fd. The fd can be used via the normal |
26 | 29 | VFS system calls: read() can be used to read the counter, fcntl() |
27 | 30 | |
28 | 31 | |
29 | 32 | |
30 | 33 | |
31 | 34 | |
32 | 35 | |
33 | 36 | |
34 | 37 | |
35 | 38 | |
36 | 39 | |
37 | 40 | |
38 | 41 | |
39 | 42 | |
40 | 43 | |
41 | 44 | |
42 | 45 | |
43 | 46 | |
44 | 47 | |
... | ... | @@ -32,91 +35,181 @@ |
32 | 35 | When creating a new counter fd, 'perf_counter_hw_event' is: |
33 | 36 | |
34 | 37 | /* |
35 | - * Hardware event to monitor via a performance monitoring counter: | |
38 | + * Event to monitor via a performance monitoring counter: | |
36 | 39 | */ |
37 | 40 | struct perf_counter_hw_event { |
38 | - s64 type; | |
41 | + __u64 event_config; | |
39 | 42 | |
40 | - u64 irq_period; | |
41 | - u32 record_type; | |
43 | + __u64 irq_period; | |
44 | + __u64 record_type; | |
45 | + __u64 read_format; | |
42 | 46 | |
43 | - u32 disabled : 1, /* off by default */ | |
44 | - nmi : 1, /* NMI sampling */ | |
45 | - raw : 1, /* raw event type */ | |
46 | - __reserved_1 : 29; | |
47 | + __u64 disabled : 1, /* off by default */ | |
48 | + nmi : 1, /* NMI sampling */ | |
49 | + inherit : 1, /* children inherit it */ | |
50 | + pinned : 1, /* must always be on PMU */ | |
51 | + exclusive : 1, /* only group on PMU */ | |
52 | + exclude_user : 1, /* don't count user */ | |
53 | + exclude_kernel : 1, /* ditto kernel */ | |
54 | + exclude_hv : 1, /* ditto hypervisor */ | |
55 | + exclude_idle : 1, /* don't count when idle */ | |
47 | 56 | |
48 | - u64 __reserved_2; | |
57 | + __reserved_1 : 55; | |
58 | + | |
59 | + __u32 extra_config_len; | |
60 | + | |
61 | + __u32 __reserved_4; | |
62 | + __u64 __reserved_2; | |
63 | + __u64 __reserved_3; | |
49 | 64 | }; |
50 | 65 | |
66 | +The 'event_config' field specifies what the counter should count. It | |
67 | +is divided into 3 bit-fields: | |
68 | + | |
69 | +raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 | |
70 | +type: 7 bits (next most significant) 0x7f00_0000_0000_0000 | |
71 | +event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff | |
72 | + | |
73 | +If 'raw_type' is 1, then the counter will count a hardware event | |
74 | +specified by the remaining 63 bits of event_config. The encoding is | |
75 | +machine-specific. | |
76 | + | |
77 | +If 'raw_type' is 0, then the 'type' field says what kind of counter | |
78 | +this is, with the following encoding: | |
79 | + | |
80 | +enum perf_event_types { | |
81 | + PERF_TYPE_HARDWARE = 0, | |
82 | + PERF_TYPE_SOFTWARE = 1, | |
83 | + PERF_TYPE_TRACEPOINT = 2, | |
84 | +}; | |
85 | + | |
86 | +A counter of PERF_TYPE_HARDWARE will count the hardware event | |
87 | +specified by 'event_id': | |
88 | + | |
51 | 89 | /* |
52 | - * Generalized performance counter event types, used by the hw_event.type | |
90 | + * Generalized performance counter event types, used by the hw_event.event_id | |
53 | 91 | * parameter of the sys_perf_counter_open() syscall: |
54 | 92 | */ |
55 | -enum hw_event_types { | |
93 | +enum hw_event_ids { | |
56 | 94 | /* |
57 | 95 | * Common hardware events, generalized by the kernel: |
58 | 96 | */ |
59 | - PERF_COUNT_CYCLES = 0, | |
60 | - PERF_COUNT_INSTRUCTIONS = 1, | |
61 | - PERF_COUNT_CACHE_REFERENCES = 2, | |
62 | - PERF_COUNT_CACHE_MISSES = 3, | |
63 | - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, | |
64 | - PERF_COUNT_BRANCH_MISSES = 5, | |
65 | - | |
66 | - /* | |
67 | - * Special "software" counters provided by the kernel, even if | |
68 | - * the hardware does not support performance counters. These | |
69 | - * counters measure various physical and sw events of the | |
70 | - * kernel (and allow the profiling of them as well): | |
71 | - */ | |
72 | - PERF_COUNT_CPU_CLOCK = -1, | |
73 | - PERF_COUNT_TASK_CLOCK = -2, | |
74 | - /* | |
75 | - * Future software events: | |
76 | - */ | |
77 | - /* PERF_COUNT_PAGE_FAULTS = -3, | |
78 | - PERF_COUNT_CONTEXT_SWITCHES = -4, */ | |
97 | + PERF_COUNT_CPU_CYCLES = 0, | |
98 | + PERF_COUNT_INSTRUCTIONS = 1, | |
99 | + PERF_COUNT_CACHE_REFERENCES = 2, | |
100 | + PERF_COUNT_CACHE_MISSES = 3, | |
101 | + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, | |
102 | + PERF_COUNT_BRANCH_MISSES = 5, | |
103 | + PERF_COUNT_BUS_CYCLES = 6, | |
79 | 104 | }; |
80 | 105 | |
81 | -These are standardized types of events that work uniformly on all CPUs | |
82 | -that implements Performance Counters support under Linux. If a CPU is | |
83 | -not able to count branch-misses, then the system call will return | |
84 | --EINVAL. | |
106 | +These are standardized types of events that work relatively uniformly | |
107 | +on all CPUs that implement Performance Counters support under Linux, | |
108 | +although there may be variations (e.g., different CPUs might count | |
109 | +cache references and misses at different levels of the cache hierarchy). | |
110 | +If a CPU is not able to count the selected event, then the system call | |
111 | +will return -EINVAL. | |
85 | 112 | |
86 | -More hw_event_types are supported as well, but they are CPU | |
87 | -specific and are enumerated via /sys on a per CPU basis. Raw hw event | |
88 | -types can be passed in under hw_event.type if hw_event.raw is 1. | |
89 | -For example, to count "External bus cycles while bus lock signal asserted" | |
90 | -events on Intel Core CPUs, pass in a 0x4064 event type value and set | |
91 | -hw_event.raw to 1. | |
113 | +More hw_event_ids are supported as well, but they are CPU-specific | |
114 | +and accessed as raw events. For example, to count "External bus | |
115 | +cycles while bus lock signal asserted" events on Intel Core CPUs, pass | |
116 | +in a 0x4064 event_id value and set hw_event.raw_type to 1. | |
92 | 117 | |
93 | -'record_type' is the type of data that a read() will provide for the | |
94 | -counter, and it can be one of: | |
118 | +A counter of type PERF_TYPE_SOFTWARE will count one of the available | |
119 | +software events, selected by 'event_id': | |
95 | 120 | |
96 | 121 | /* |
122 | + * Special "software" counters provided by the kernel, even if the hardware | |
123 | + * does not support performance counters. These counters measure various | |
124 | + * physical and sw events of the kernel (and allow the profiling of them as | |
125 | + * well): | |
126 | + */ | |
127 | +enum sw_event_ids { | |
128 | + PERF_COUNT_CPU_CLOCK = 0, | |
129 | + PERF_COUNT_TASK_CLOCK = 1, | |
130 | + PERF_COUNT_PAGE_FAULTS = 2, | |
131 | + PERF_COUNT_CONTEXT_SWITCHES = 3, | |
132 | + PERF_COUNT_CPU_MIGRATIONS = 4, | |
133 | + PERF_COUNT_PAGE_FAULTS_MIN = 5, | |
134 | + PERF_COUNT_PAGE_FAULTS_MAJ = 6, | |
135 | +}; | |
136 | + | |
137 | +Counters come in two flavours: counting counters and sampling | |
138 | +counters. A "counting" counter is one that is used for counting the | |
139 | +number of events that occur, and is characterised by having | |
140 | +irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a | |
141 | +counting counter simply returns the current value of the counter as | |
142 | +an 8-byte number. | |
143 | + | |
144 | +A "sampling" counter is one that is set up to generate an interrupt | |
145 | +every N events, where N is given by 'irq_period'. A sampling counter | |
146 | +has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The | |
147 | +record_type controls what data is recorded on each interrupt, and the | |
148 | +available values are currently: | |
149 | + | |
150 | +/* | |
97 | 151 | * IRQ-notification data record type: |
98 | 152 | */ |
99 | 153 | enum perf_counter_record_type { |
100 | - PERF_RECORD_SIMPLE = 0, | |
101 | - PERF_RECORD_IRQ = 1, | |
102 | - PERF_RECORD_GROUP = 2, | |
154 | + PERF_RECORD_SIMPLE = 0, | |
155 | + PERF_RECORD_IRQ = 1, | |
156 | + PERF_RECORD_GROUP = 2, | |
103 | 157 | }; |
104 | 158 | |
105 | -a "simple" counter is one that counts hardware events and allows | |
106 | -them to be read out into a u64 count value. (read() returns 8 on | |
107 | -a successful read of a simple counter.) | |
159 | +A record_type value of PERF_RECORD_IRQ will record the instruction | |
160 | +pointer (IP) at which the interrupt occurred. A record_type value of | |
161 | +PERF_RECORD_GROUP will record the event_config and counter value of | |
162 | +all of the other counters in the group, and should only be used on a | |
163 | +group leader (see below). Currently these two values are mutually | |
164 | +exclusive, but record_type will become a bit-mask in future and | |
165 | +support other values. | |
108 | 166 | |
109 | -An "irq" counter is one that will also provide an IRQ context information: | |
110 | -the IP of the interrupted context. In this case read() will return | |
111 | -the 8-byte counter value, plus the Instruction Pointer address of the | |
112 | -interrupted context. | |
167 | +A sampling counter has an event queue, into which an event is placed | |
168 | +on each interrupt. A read() on a sampling counter will read the next | |
169 | +event from the event queue. If the queue is empty, the read() will | |
170 | +either block or return an EAGAIN error, depending on whether the fd | |
171 | +has been set to non-blocking mode or not. | |
113 | 172 | |
114 | -The parameter 'hw_event_period' is the number of events before waking up | |
115 | -a read() that is blocked on a counter fd. Zero value means a non-blocking | |
116 | -counter. | |
173 | +The 'disabled' bit specifies whether the counter starts out disabled | |
174 | +or enabled. If it is initially disabled, it can be enabled by ioctl | |
175 | +or prctl (see below). | |
117 | 176 | |
118 | -The 'pid' parameter allows the counter to be specific to a task: | |
177 | +The 'nmi' bit specifies, for hardware events, whether the counter | |
178 | +should be set up to request non-maskable interrupts (NMIs) or normal | |
179 | +interrupts. This bit is ignored if the user doesn't have | |
180 | +CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't | |
181 | +generate NMIs from hardware counters. | |
119 | 182 | |
183 | +The 'inherit' bit, if set, specifies that this counter should count | |
184 | +events on descendant tasks as well as the task specified. This only | |
185 | +applies to new descendants, not to any existing descendants at the | |
186 | +time the counter is created (nor to any new descendants of existing | |
187 | +descendants). | |
188 | + | |
189 | +The 'pinned' bit, if set, specifies that the counter should always be | |
190 | +on the CPU if at all possible. It only applies to hardware counters | |
191 | +and only to group leaders. If a pinned counter cannot be put onto the | |
192 | +CPU (e.g. because there are not enough hardware counters or because of | |
193 | +a conflict with some other event), then the counter goes into an | |
194 | +'error' state, where reads return end-of-file (i.e. read() returns 0) | |
195 | +until the counter is subsequently enabled or disabled. | |
196 | + | |
197 | +The 'exclusive' bit, if set, specifies that when this counter's group | |
198 | +is on the CPU, it should be the only group using the CPU's counters. | |
199 | +In future, this will allow sophisticated monitoring programs to supply | |
200 | +extra configuration information via 'extra_config_len' to exploit | |
201 | +advanced features of the CPU's Performance Monitor Unit (PMU) that are | |
202 | +not otherwise accessible and that might disrupt other hardware | |
203 | +counters. | |
204 | + | |
205 | +The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a | |
206 | +way to request that counting of events be restricted to times when the | |
207 | +CPU is in user, kernel and/or hypervisor mode. | |
208 | + | |
209 | + | |
210 | +The 'pid' parameter to the perf_counter_open() system call allows the | |
211 | +counter to be specific to a task: | |
212 | + | |
120 | 213 | pid == 0: if the pid parameter is zero, the counter is attached to the |
121 | 214 | current task. |
122 | 215 | |
... | ... | @@ -125,8 +218,7 @@ |
125 | 218 | |
126 | 219 | pid < 0: all tasks are counted (per cpu counters) |
127 | 220 | |
128 | -The 'cpu' parameter allows a counter to be made specific to a full | |
129 | -CPU: | |
221 | +The 'cpu' parameter allows a counter to be made specific to a CPU: | |
130 | 222 | |
131 | 223 | cpu >= 0: the counter is restricted to a specific CPU |
132 | 224 | cpu == -1: the counter counts on all CPUs |
... | ... | @@ -141,7 +233,51 @@ |
141 | 233 | A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts |
142 | 234 | all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. |
143 | 235 | |
144 | -Group counters are created by passing in a group_fd of another counter. | |
145 | -Groups are scheduled at once and can be used with PERF_RECORD_GROUP | |
146 | -to record multi-dimensional timestamps. | |
236 | +The 'flags' parameter is currently unused and must be zero. | |
237 | + | |
238 | +The 'group_fd' parameter allows counter "groups" to be set up. A | |
239 | +counter group has one counter which is the group "leader". The leader | |
240 | +is created first, with group_fd = -1 in the perf_counter_open call | |
241 | +that creates it. The rest of the group members are created | |
242 | +subsequently, with group_fd giving the fd of the group leader. | |
243 | +(A single counter on its own is created with group_fd = -1 and is | |
244 | +considered to be a group with only 1 member.) | |
245 | + | |
246 | +A counter group is scheduled onto the CPU as a unit, that is, it will | |
247 | +only be put onto the CPU if all of the counters in the group can be | |
248 | +put onto the CPU. This means that the values of the member counters | |
249 | +can be meaningfully compared, added, divided (to get ratios), etc., | |
250 | +with each other, since they have counted events for the same set of | |
251 | +executed instructions. | |
252 | + | |
253 | +Counters can be enabled and disabled in two ways: via ioctl and via | |
254 | +prctl. When a counter is disabled, it doesn't count or generate | |
255 | +events but does continue to exist and maintain its count value. | |
256 | + | |
257 | +An individual counter or counter group can be enabled with | |
258 | + | |
259 | + ioctl(fd, PERF_COUNTER_IOC_ENABLE); | |
260 | + | |
261 | +or disabled with | |
262 | + | |
263 | + ioctl(fd, PERF_COUNTER_IOC_DISABLE); | |
264 | + | |
265 | +Enabling or disabling the leader of a group enables or disables the | |
266 | +whole group; that is, while the group leader is disabled, none of the | |
267 | +counters in the group will count. Enabling or disabling a member of a | |
268 | +group other than the leader only affects that counter - disabling an | |
269 | +non-leader stops that counter from counting but doesn't affect any | |
270 | +other counter. | |
271 | + | |
272 | +A process can enable or disable all the counter groups that are | |
273 | +attached to it, using prctl: | |
274 | + | |
275 | + prctl(PR_TASK_PERF_COUNTERS_ENABLE); | |
276 | + | |
277 | + prctl(PR_TASK_PERF_COUNTERS_DISABLE); | |
278 | + | |
279 | +This applies to all counters on the current process, whether created | |
280 | +by this process or by another, and doesn't affect any counters that | |
281 | +this process has created on other processes. It only enables or | |
282 | +disables the group leaders, not any other members in the groups. |