Commit 9e06d3f9f6b14f6e3120923ed215032726246c98
Committed by
Linus Torvalds
1 parent
ad4ecbcba7
Exists in
master
and in
7 other branches
[PATCH] per task delay accounting taskstats interface: documentation fix
Change documentation and example program to reflect the flow control issues being addressed by the cpumask changes. Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 2 changed files with 365 additions and 305 deletions Inline Diff
Documentation/accounting/getdelays.c
1 | /* getdelays.c | 1 | /* getdelays.c |
2 | * | 2 | * |
3 | * Utility to get per-pid and per-tgid delay accounting statistics | 3 | * Utility to get per-pid and per-tgid delay accounting statistics |
4 | * Also illustrates usage of the taskstats interface | 4 | * Also illustrates usage of the taskstats interface |
5 | * | 5 | * |
6 | * Copyright (C) Shailabh Nagar, IBM Corp. 2005 | 6 | * Copyright (C) Shailabh Nagar, IBM Corp. 2005 |
7 | * Copyright (C) Balbir Singh, IBM Corp. 2006 | 7 | * Copyright (C) Balbir Singh, IBM Corp. 2006 |
8 | * Copyright (c) Jay Lan, SGI. 2006 | ||
8 | * | 9 | * |
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <stdio.h> | 12 | #include <stdio.h> |
12 | #include <stdlib.h> | 13 | #include <stdlib.h> |
13 | #include <errno.h> | 14 | #include <errno.h> |
14 | #include <unistd.h> | 15 | #include <unistd.h> |
15 | #include <poll.h> | 16 | #include <poll.h> |
16 | #include <string.h> | 17 | #include <string.h> |
17 | #include <fcntl.h> | 18 | #include <fcntl.h> |
18 | #include <sys/types.h> | 19 | #include <sys/types.h> |
19 | #include <sys/stat.h> | 20 | #include <sys/stat.h> |
20 | #include <sys/socket.h> | 21 | #include <sys/socket.h> |
21 | #include <sys/types.h> | 22 | #include <sys/types.h> |
22 | #include <signal.h> | 23 | #include <signal.h> |
23 | 24 | ||
24 | #include <linux/genetlink.h> | 25 | #include <linux/genetlink.h> |
25 | #include <linux/taskstats.h> | 26 | #include <linux/taskstats.h> |
26 | 27 | ||
27 | /* | 28 | /* |
28 | * Generic macros for dealing with netlink sockets. Might be duplicated | 29 | * Generic macros for dealing with netlink sockets. Might be duplicated |
29 | * elsewhere. It is recommended that commercial grade applications use | 30 | * elsewhere. It is recommended that commercial grade applications use |
30 | * libnl or libnetlink and use the interfaces provided by the library | 31 | * libnl or libnetlink and use the interfaces provided by the library |
31 | */ | 32 | */ |
32 | #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) | 33 | #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) |
33 | #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) | 34 | #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) |
34 | #define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN)) | 35 | #define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN)) |
35 | #define NLA_PAYLOAD(len) (len - NLA_HDRLEN) | 36 | #define NLA_PAYLOAD(len) (len - NLA_HDRLEN) |
36 | 37 | ||
37 | #define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0) | 38 | #define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0) |
38 | int done = 0; | 39 | int done = 0; |
40 | int rcvbufsz=0; | ||
39 | 41 | ||
42 | char name[100]; | ||
43 | int dbg=0, print_delays=0; | ||
44 | __u64 stime, utime; | ||
45 | #define PRINTF(fmt, arg...) { \ | ||
46 | if (dbg) { \ | ||
47 | printf(fmt, ##arg); \ | ||
48 | } \ | ||
49 | } | ||
50 | |||
51 | /* Maximum size of response requested or message sent */ | ||
52 | #define MAX_MSG_SIZE 256 | ||
53 | /* Maximum number of cpus expected to be specified in a cpumask */ | ||
54 | #define MAX_CPUS 32 | ||
55 | /* Maximum length of pathname to log file */ | ||
56 | #define MAX_FILENAME 256 | ||
57 | |||
58 | struct msgtemplate { | ||
59 | struct nlmsghdr n; | ||
60 | struct genlmsghdr g; | ||
61 | char buf[MAX_MSG_SIZE]; | ||
62 | }; | ||
63 | |||
64 | char cpumask[100+6*MAX_CPUS]; | ||
65 | |||
40 | /* | 66 | /* |
41 | * Create a raw netlink socket and bind | 67 | * Create a raw netlink socket and bind |
42 | */ | 68 | */ |
43 | static int create_nl_socket(int protocol, int groups) | 69 | static int create_nl_socket(int protocol) |
44 | { | 70 | { |
45 | socklen_t addr_len; | 71 | int fd; |
46 | int fd; | 72 | struct sockaddr_nl local; |
47 | struct sockaddr_nl local; | ||
48 | 73 | ||
49 | fd = socket(AF_NETLINK, SOCK_RAW, protocol); | 74 | fd = socket(AF_NETLINK, SOCK_RAW, protocol); |
50 | if (fd < 0) | 75 | if (fd < 0) |
51 | return -1; | 76 | return -1; |
52 | 77 | ||
53 | memset(&local, 0, sizeof(local)); | 78 | if (rcvbufsz) |
54 | local.nl_family = AF_NETLINK; | 79 | if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, |
55 | local.nl_groups = groups; | 80 | &rcvbufsz, sizeof(rcvbufsz)) < 0) { |
81 | printf("Unable to set socket rcv buf size to %d\n", | ||
82 | rcvbufsz); | ||
83 | return -1; | ||
84 | } | ||
56 | 85 | ||
57 | if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) | 86 | memset(&local, 0, sizeof(local)); |
58 | goto error; | 87 | local.nl_family = AF_NETLINK; |
59 | 88 | ||
60 | return fd; | 89 | if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) |
61 | error: | 90 | goto error; |
62 | close(fd); | 91 | |
63 | return -1; | 92 | return fd; |
93 | error: | ||
94 | close(fd); | ||
95 | return -1; | ||
64 | } | 96 | } |
65 | 97 | ||
66 | int sendto_fd(int s, const char *buf, int bufLen) | 98 | |
99 | int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, | ||
100 | __u8 genl_cmd, __u16 nla_type, | ||
101 | void *nla_data, int nla_len) | ||
67 | { | 102 | { |
68 | struct sockaddr_nl nladdr; | 103 | struct nlattr *na; |
69 | int r; | 104 | struct sockaddr_nl nladdr; |
105 | int r, buflen; | ||
106 | char *buf; | ||
70 | 107 | ||
71 | memset(&nladdr, 0, sizeof(nladdr)); | 108 | struct msgtemplate msg; |
72 | nladdr.nl_family = AF_NETLINK; | ||
73 | 109 | ||
74 | while ((r = sendto(s, buf, bufLen, 0, (struct sockaddr *) &nladdr, | 110 | msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); |
75 | sizeof(nladdr))) < bufLen) { | 111 | msg.n.nlmsg_type = nlmsg_type; |
76 | if (r > 0) { | 112 | msg.n.nlmsg_flags = NLM_F_REQUEST; |
77 | buf += r; | 113 | msg.n.nlmsg_seq = 0; |
78 | bufLen -= r; | 114 | msg.n.nlmsg_pid = nlmsg_pid; |
79 | } else if (errno != EAGAIN) | 115 | msg.g.cmd = genl_cmd; |
80 | return -1; | 116 | msg.g.version = 0x1; |
81 | } | 117 | na = (struct nlattr *) GENLMSG_DATA(&msg); |
82 | return 0; | 118 | na->nla_type = nla_type; |
119 | na->nla_len = nla_len + 1 + NLA_HDRLEN; | ||
120 | memcpy(NLA_DATA(na), nla_data, nla_len); | ||
121 | msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); | ||
122 | |||
123 | buf = (char *) &msg; | ||
124 | buflen = msg.n.nlmsg_len ; | ||
125 | memset(&nladdr, 0, sizeof(nladdr)); | ||
126 | nladdr.nl_family = AF_NETLINK; | ||
127 | while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, | ||
128 | sizeof(nladdr))) < buflen) { | ||
129 | if (r > 0) { | ||
130 | buf += r; | ||
131 | buflen -= r; | ||
132 | } else if (errno != EAGAIN) | ||
133 | return -1; | ||
134 | } | ||
135 | return 0; | ||
83 | } | 136 | } |
84 | 137 | ||
138 | |||
85 | /* | 139 | /* |
86 | * Probe the controller in genetlink to find the family id | 140 | * Probe the controller in genetlink to find the family id |
87 | * for the TASKSTATS family | 141 | * for the TASKSTATS family |
88 | */ | 142 | */ |
89 | int get_family_id(int sd) | 143 | int get_family_id(int sd) |
90 | { | 144 | { |
91 | struct { | 145 | struct { |
92 | struct nlmsghdr n; | 146 | struct nlmsghdr n; |
93 | struct genlmsghdr g; | 147 | struct genlmsghdr g; |
94 | char buf[256]; | 148 | char buf[256]; |
95 | } family_req; | 149 | } ans; |
96 | struct { | ||
97 | struct nlmsghdr n; | ||
98 | struct genlmsghdr g; | ||
99 | char buf[256]; | ||
100 | } ans; | ||
101 | 150 | ||
102 | int id; | 151 | int id, rc; |
103 | struct nlattr *na; | 152 | struct nlattr *na; |
104 | int rep_len; | 153 | int rep_len; |
105 | 154 | ||
106 | /* Get family name */ | 155 | strcpy(name, TASKSTATS_GENL_NAME); |
107 | family_req.n.nlmsg_type = GENL_ID_CTRL; | 156 | rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, |
108 | family_req.n.nlmsg_flags = NLM_F_REQUEST; | 157 | CTRL_ATTR_FAMILY_NAME, (void *)name, |
109 | family_req.n.nlmsg_seq = 0; | 158 | strlen(TASKSTATS_GENL_NAME)+1); |
110 | family_req.n.nlmsg_pid = getpid(); | ||
111 | family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); | ||
112 | family_req.g.cmd = CTRL_CMD_GETFAMILY; | ||
113 | family_req.g.version = 0x1; | ||
114 | na = (struct nlattr *) GENLMSG_DATA(&family_req); | ||
115 | na->nla_type = CTRL_ATTR_FAMILY_NAME; | ||
116 | na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN; | ||
117 | strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME); | ||
118 | family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); | ||
119 | 159 | ||
120 | if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0) | 160 | rep_len = recv(sd, &ans, sizeof(ans), 0); |
121 | err(1, "error sending message via Netlink\n"); | 161 | if (ans.n.nlmsg_type == NLMSG_ERROR || |
162 | (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) | ||
163 | return 0; | ||
122 | 164 | ||
123 | rep_len = recv(sd, &ans, sizeof(ans), 0); | 165 | na = (struct nlattr *) GENLMSG_DATA(&ans); |
124 | 166 | na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); | |
125 | if (rep_len < 0) | 167 | if (na->nla_type == CTRL_ATTR_FAMILY_ID) { |
126 | err(1, "error receiving reply message via Netlink\n"); | 168 | id = *(__u16 *) NLA_DATA(na); |
127 | 169 | } | |
128 | 170 | return id; | |
129 | /* Validate response message */ | ||
130 | if (!NLMSG_OK((&ans.n), rep_len)) | ||
131 | err(1, "invalid reply message received via Netlink\n"); | ||
132 | |||
133 | if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */ | ||
134 | printf("error received NACK - leaving\n"); | ||
135 | exit(1); | ||
136 | } | ||
137 | |||
138 | |||
139 | na = (struct nlattr *) GENLMSG_DATA(&ans); | ||
140 | na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); | ||
141 | if (na->nla_type == CTRL_ATTR_FAMILY_ID) { | ||
142 | id = *(__u16 *) NLA_DATA(na); | ||
143 | } | ||
144 | return id; | ||
145 | } | 171 | } |
146 | 172 | ||
147 | void print_taskstats(struct taskstats *t) | 173 | void print_delayacct(struct taskstats *t) |
148 | { | 174 | { |
149 | printf("\n\nCPU %15s%15s%15s%15s\n" | 175 | printf("\n\nCPU %15s%15s%15s%15s\n" |
150 | " %15llu%15llu%15llu%15llu\n" | 176 | " %15llu%15llu%15llu%15llu\n" |
151 | "IO %15s%15s\n" | 177 | "IO %15s%15s\n" |
152 | " %15llu%15llu\n" | 178 | " %15llu%15llu\n" |
153 | "MEM %15s%15s\n" | 179 | "MEM %15s%15s\n" |
154 | " %15llu%15llu\n\n", | 180 | " %15llu%15llu\n\n", |
155 | "count", "real total", "virtual total", "delay total", | 181 | "count", "real total", "virtual total", "delay total", |
156 | t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total, | 182 | t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total, |
157 | t->cpu_delay_total, | 183 | t->cpu_delay_total, |
158 | "count", "delay total", | 184 | "count", "delay total", |
159 | t->blkio_count, t->blkio_delay_total, | 185 | t->blkio_count, t->blkio_delay_total, |
160 | "count", "delay total", t->swapin_count, t->swapin_delay_total); | 186 | "count", "delay total", t->swapin_count, t->swapin_delay_total); |
161 | } | 187 | } |
162 | 188 | ||
163 | void sigchld(int sig) | ||
164 | { | ||
165 | done = 1; | ||
166 | } | ||
167 | |||
168 | int main(int argc, char *argv[]) | 189 | int main(int argc, char *argv[]) |
169 | { | 190 | { |
170 | int rc; | 191 | int c, rc, rep_len, aggr_len, len2, cmd_type; |
171 | int sk_nl; | 192 | __u16 id; |
172 | struct nlmsghdr *nlh; | 193 | __u32 mypid; |
173 | struct genlmsghdr *genlhdr; | ||
174 | char *buf; | ||
175 | struct taskstats_cmd_param *param; | ||
176 | __u16 id; | ||
177 | struct nlattr *na; | ||
178 | 194 | ||
179 | /* For receiving */ | 195 | struct nlattr *na; |
180 | struct sockaddr_nl kern_nla, from_nla; | 196 | int nl_sd = -1; |
181 | socklen_t from_nla_len; | 197 | int len = 0; |
182 | int recv_len; | 198 | pid_t tid = 0; |
183 | struct taskstats_reply *reply; | 199 | pid_t rtid = 0; |
184 | 200 | ||
185 | struct { | 201 | int fd = 0; |
186 | struct nlmsghdr n; | 202 | int count = 0; |
187 | struct genlmsghdr g; | 203 | int write_file = 0; |
188 | char buf[256]; | 204 | int maskset = 0; |
189 | } req; | 205 | char logfile[128]; |
206 | int loop = 0; | ||
190 | 207 | ||
191 | struct { | 208 | struct msgtemplate msg; |
192 | struct nlmsghdr n; | ||
193 | struct genlmsghdr g; | ||
194 | char buf[256]; | ||
195 | } ans; | ||
196 | 209 | ||
197 | int nl_sd = -1; | 210 | while (1) { |
198 | int rep_len; | 211 | c = getopt(argc, argv, "dw:r:m:t:p:v:l"); |
199 | int len = 0; | 212 | if (c < 0) |
200 | int aggr_len, len2; | 213 | break; |
201 | struct sockaddr_nl nladdr; | ||
202 | pid_t tid = 0; | ||
203 | pid_t rtid = 0; | ||
204 | int cmd_type = TASKSTATS_TYPE_TGID; | ||
205 | int c, status; | ||
206 | int forking = 0; | ||
207 | struct sigaction act = { | ||
208 | .sa_handler = SIG_IGN, | ||
209 | .sa_mask = SA_NOMASK, | ||
210 | }; | ||
211 | struct sigaction tact ; | ||
212 | 214 | ||
213 | if (argc < 3) { | 215 | switch (c) { |
214 | printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]); | 216 | case 'd': |
215 | exit(-1); | 217 | printf("print delayacct stats ON\n"); |
216 | } | 218 | print_delays = 1; |
219 | break; | ||
220 | case 'w': | ||
221 | strncpy(logfile, optarg, MAX_FILENAME); | ||
222 | printf("write to file %s\n", logfile); | ||
223 | write_file = 1; | ||
224 | break; | ||
225 | case 'r': | ||
226 | rcvbufsz = atoi(optarg); | ||
227 | printf("receive buf size %d\n", rcvbufsz); | ||
228 | if (rcvbufsz < 0) | ||
229 | err(1, "Invalid rcv buf size\n"); | ||
230 | break; | ||
231 | case 'm': | ||
232 | strncpy(cpumask, optarg, sizeof(cpumask)); | ||
233 | maskset = 1; | ||
234 | printf("cpumask %s maskset %d\n", cpumask, maskset); | ||
235 | break; | ||
236 | case 't': | ||
237 | tid = atoi(optarg); | ||
238 | if (!tid) | ||
239 | err(1, "Invalid tgid\n"); | ||
240 | cmd_type = TASKSTATS_CMD_ATTR_TGID; | ||
241 | print_delays = 1; | ||
242 | break; | ||
243 | case 'p': | ||
244 | tid = atoi(optarg); | ||
245 | if (!tid) | ||
246 | err(1, "Invalid pid\n"); | ||
247 | cmd_type = TASKSTATS_CMD_ATTR_PID; | ||
248 | print_delays = 1; | ||
249 | break; | ||
250 | case 'v': | ||
251 | printf("debug on\n"); | ||
252 | dbg = 1; | ||
253 | break; | ||
254 | case 'l': | ||
255 | printf("listen forever\n"); | ||
256 | loop = 1; | ||
257 | break; | ||
258 | default: | ||
259 | printf("Unknown option %d\n", c); | ||
260 | exit(-1); | ||
261 | } | ||
262 | } | ||
217 | 263 | ||
218 | tact.sa_handler = sigchld; | 264 | if (write_file) { |
219 | sigemptyset(&tact.sa_mask); | 265 | fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC, |
220 | if (sigaction(SIGCHLD, &tact, NULL) < 0) | 266 | S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); |
221 | err(1, "sigaction failed for SIGCHLD\n"); | 267 | if (fd == -1) { |
268 | perror("Cannot open output file\n"); | ||
269 | exit(1); | ||
270 | } | ||
271 | } | ||
222 | 272 | ||
223 | while (1) { | 273 | if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0) |
274 | err(1, "error creating Netlink socket\n"); | ||
224 | 275 | ||
225 | c = getopt(argc, argv, "t:p:c:"); | ||
226 | if (c < 0) | ||
227 | break; | ||
228 | 276 | ||
229 | switch (c) { | 277 | mypid = getpid(); |
230 | case 't': | 278 | id = get_family_id(nl_sd); |
231 | tid = atoi(optarg); | 279 | if (!id) { |
232 | if (!tid) | 280 | printf("Error getting family id, errno %d", errno); |
233 | err(1, "Invalid tgid\n"); | 281 | goto err; |
234 | cmd_type = TASKSTATS_CMD_ATTR_TGID; | 282 | } |
235 | break; | 283 | PRINTF("family id %d\n", id); |
236 | case 'p': | ||
237 | tid = atoi(optarg); | ||
238 | if (!tid) | ||
239 | err(1, "Invalid pid\n"); | ||
240 | cmd_type = TASKSTATS_CMD_ATTR_TGID; | ||
241 | break; | ||
242 | case 'c': | ||
243 | opterr = 0; | ||
244 | tid = fork(); | ||
245 | if (tid < 0) | ||
246 | err(1, "fork failed\n"); | ||
247 | 284 | ||
248 | if (tid == 0) { /* child process */ | 285 | if (maskset) { |
249 | if (execvp(argv[optind - 1], &argv[optind - 1]) < 0) { | 286 | rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, |
250 | exit(-1); | 287 | TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, |
288 | &cpumask, sizeof(cpumask)); | ||
289 | PRINTF("Sent register cpumask, retval %d\n", rc); | ||
290 | if (rc < 0) { | ||
291 | printf("error sending register cpumask\n"); | ||
292 | goto err; | ||
251 | } | 293 | } |
252 | } | ||
253 | forking = 1; | ||
254 | break; | ||
255 | default: | ||
256 | printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]); | ||
257 | exit(-1); | ||
258 | break; | ||
259 | } | 294 | } |
260 | if (c == 'c') | ||
261 | break; | ||
262 | } | ||
263 | 295 | ||
264 | /* Construct Netlink request message */ | 296 | if (tid) { |
297 | rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, | ||
298 | cmd_type, &tid, sizeof(__u32)); | ||
299 | PRINTF("Sent pid/tgid, retval %d\n", rc); | ||
300 | if (rc < 0) { | ||
301 | printf("error sending tid/tgid cmd\n"); | ||
302 | goto done; | ||
303 | } | ||
304 | } | ||
265 | 305 | ||
266 | /* Send Netlink request message & get reply */ | 306 | do { |
307 | int i; | ||
267 | 308 | ||
268 | if ((nl_sd = | 309 | rep_len = recv(nl_sd, &msg, sizeof(msg), 0); |
269 | create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0) | 310 | PRINTF("received %d bytes\n", rep_len); |
270 | err(1, "error creating Netlink socket\n"); | ||
271 | 311 | ||
312 | if (rep_len < 0) { | ||
313 | printf("nonfatal reply error: errno %d\n", errno); | ||
314 | continue; | ||
315 | } |
Documentation/accounting/taskstats.txt
1 | Per-task statistics interface | 1 | Per-task statistics interface |
2 | ----------------------------- | 2 | ----------------------------- |
3 | 3 | ||
4 | 4 | ||
5 | Taskstats is a netlink-based interface for sending per-task and | 5 | Taskstats is a netlink-based interface for sending per-task and |
6 | per-process statistics from the kernel to userspace. | 6 | per-process statistics from the kernel to userspace. |
7 | 7 | ||
8 | Taskstats was designed for the following benefits: | 8 | Taskstats was designed for the following benefits: |
9 | 9 | ||
10 | - efficiently provide statistics during lifetime of a task and on its exit | 10 | - efficiently provide statistics during lifetime of a task and on its exit |
11 | - unified interface for multiple accounting subsystems | 11 | - unified interface for multiple accounting subsystems |
12 | - extensibility for use by future accounting patches | 12 | - extensibility for use by future accounting patches |
13 | 13 | ||
14 | Terminology | 14 | Terminology |
15 | ----------- | 15 | ----------- |
16 | 16 | ||
17 | "pid", "tid" and "task" are used interchangeably and refer to the standard | 17 | "pid", "tid" and "task" are used interchangeably and refer to the standard |
18 | Linux task defined by struct task_struct. per-pid stats are the same as | 18 | Linux task defined by struct task_struct. per-pid stats are the same as |
19 | per-task stats. | 19 | per-task stats. |
20 | 20 | ||
21 | "tgid", "process" and "thread group" are used interchangeably and refer to the | 21 | "tgid", "process" and "thread group" are used interchangeably and refer to the |
22 | tasks that share an mm_struct i.e. the traditional Unix process. Despite the | 22 | tasks that share an mm_struct i.e. the traditional Unix process. Despite the |
23 | use of tgid, there is no special treatment for the task that is thread group | 23 | use of tgid, there is no special treatment for the task that is thread group |
24 | leader - a process is deemed alive as long as it has any task belonging to it. | 24 | leader - a process is deemed alive as long as it has any task belonging to it. |
25 | 25 | ||
26 | Usage | 26 | Usage |
27 | ----- | 27 | ----- |
28 | 28 | ||
29 | To get statistics during task's lifetime, userspace opens a unicast netlink | 29 | To get statistics during a task's lifetime, userspace opens a unicast netlink |
30 | socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid. | 30 | socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid. |
31 | The response contains statistics for a task (if pid is specified) or the sum of | 31 | The response contains statistics for a task (if pid is specified) or the sum of |
32 | statistics for all tasks of the process (if tgid is specified). | 32 | statistics for all tasks of the process (if tgid is specified). |
33 | 33 | ||
34 | To obtain statistics for tasks which are exiting, userspace opens a multicast | 34 | To obtain statistics for tasks which are exiting, the userspace listener |
35 | netlink socket. Each time a task exits, its per-pid statistics is always sent | 35 | sends a register command and specifies a cpumask. Whenever a task exits on |
36 | by the kernel to each listener on the multicast socket. In addition, if it is | 36 | one of the cpus in the cpumask, its per-pid statistics are sent to the |
37 | the last thread exiting its thread group, an additional record containing the | 37 | registered listener. Using cpumasks allows the data received by one listener |
38 | per-tgid stats are also sent. The latter contains the sum of per-pid stats for | 38 | to be limited and assists in flow control over the netlink interface and is |
39 | all threads in the thread group, both past and present. | 39 | explained in more detail below. |
40 | 40 | ||
41 | If the exiting task is the last thread exiting its thread group, | ||
42 | an additional record containing the per-tgid stats is also sent to userspace. | ||
43 | The latter contains the sum of per-pid stats for all threads in the thread | ||
44 | group, both past and present. | ||
45 | |||
41 | getdelays.c is a simple utility demonstrating usage of the taskstats interface | 46 | getdelays.c is a simple utility demonstrating usage of the taskstats interface |
42 | for reporting delay accounting statistics. | 47 | for reporting delay accounting statistics. Users can register cpumasks, |
48 | send commands and process responses, listen for per-tid/tgid exit data, | ||
49 | write the data received to a file and do basic flow control by increasing | ||
50 | receive buffer sizes. | ||
43 | 51 | ||
44 | Interface | 52 | Interface |
45 | --------- | 53 | --------- |
46 | 54 | ||
47 | The user-kernel interface is encapsulated in include/linux/taskstats.h | 55 | The user-kernel interface is encapsulated in include/linux/taskstats.h |
48 | 56 | ||
49 | To avoid this documentation becoming obsolete as the interface evolves, only | 57 | To avoid this documentation becoming obsolete as the interface evolves, only |
50 | an outline of the current version is given. taskstats.h always overrides the | 58 | an outline of the current version is given. taskstats.h always overrides the |
51 | description here. | 59 | description here. |
52 | 60 | ||
53 | struct taskstats is the common accounting structure for both per-pid and | 61 | struct taskstats is the common accounting structure for both per-pid and |
54 | per-tgid data. It is versioned and can be extended by each accounting subsystem | 62 | per-tgid data. It is versioned and can be extended by each accounting subsystem |
55 | that is added to the kernel. The fields and their semantics are defined in the | 63 | that is added to the kernel. The fields and their semantics are defined in the |
56 | taskstats.h file. | 64 | taskstats.h file. |
57 | 65 | ||
58 | The data exchanged between user and kernel space is a netlink message belonging | 66 | The data exchanged between user and kernel space is a netlink message belonging |
59 | to the NETLINK_GENERIC family and using the netlink attributes interface. | 67 | to the NETLINK_GENERIC family and using the netlink attributes interface. |
60 | The messages are in the format | 68 | The messages are in the format |
61 | 69 | ||
62 | +----------+- - -+-------------+-------------------+ | 70 | +----------+- - -+-------------+-------------------+ |
63 | | nlmsghdr | Pad | genlmsghdr | taskstats payload | | 71 | | nlmsghdr | Pad | genlmsghdr | taskstats payload | |
64 | +----------+- - -+-------------+-------------------+ | 72 | +----------+- - -+-------------+-------------------+ |
65 | 73 | ||
66 | 74 | ||
67 | The taskstats payload is one of the following three kinds: | 75 | The taskstats payload is one of the following three kinds: |
68 | 76 | ||
69 | 1. Commands: Sent from user to kernel. The payload is one attribute, of type | 77 | 1. Commands: Sent from user to kernel. Commands to get data on |
70 | TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute | 78 | a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID, |
71 | payload. The pid/tgid denotes the task/process for which userspace wants | 79 | containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes |
72 | statistics. | 80 | the task/process for which userspace wants statistics. |
73 | 81 | ||
82 | Commands to register/deregister interest in exit data from a set of cpus | ||
83 | consist of one attribute, of type | ||
84 | TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the | ||
85 | attribute payload. The cpumask is specified as an ascii string of | ||
86 | comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8 | ||
87 | the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest | ||
88 | in cpus before closing the listening socket, the kernel cleans up its interest | ||
89 | set over time. However, for the sake of efficiency, an explicit deregistration | ||
90 | is advisable. | ||
91 | |||
74 | 2. Response for a command: sent from the kernel in response to a userspace | 92 | 2. Response for a command: sent from the kernel in response to a userspace |
75 | command. The payload is a series of three attributes of type: | 93 | command. The payload is a series of three attributes of type: |
76 | 94 | ||
77 | a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates | 95 | a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates |
78 | a pid/tgid will be followed by some stats. | 96 | a pid/tgid will be followed by some stats. |
79 | 97 | ||
80 | b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats | 98 | b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats |
81 | is being returned. | 99 | is being returned. |
82 | 100 | ||
83 | c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The | 101 | c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The |
84 | same structure is used for both per-pid and per-tgid stats. | 102 | same structure is used for both per-pid and per-tgid stats. |
85 | 103 | ||
86 | 3. New message sent by kernel whenever a task exits. The payload consists of a | 104 | 3. New message sent by kernel whenever a task exits. The payload consists of a |
87 | series of attributes of the following type: | 105 | series of attributes of the following type: |
88 | 106 | ||
89 | a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats | 107 | a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats |
90 | b) TASKSTATS_TYPE_PID: contains exiting task's pid | 108 | b) TASKSTATS_TYPE_PID: contains exiting task's pid |
91 | c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats | 109 | c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats |
92 | d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats | 110 | d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats |
93 | e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs | 111 | e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs |
94 | f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process | 112 | f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process |
95 | 113 | ||
96 | 114 | ||
97 | per-tgid stats | 115 | per-tgid stats |
98 | -------------- | 116 | -------------- |
99 | 117 | ||
100 | Taskstats provides per-process stats, in addition to per-task stats, since | 118 | Taskstats provides per-process stats, in addition to per-task stats, since |
101 | resource management is often done at a process granularity and aggregating task | 119 | resource management is often done at a process granularity and aggregating task |
102 | stats in userspace alone is inefficient and potentially inaccurate (due to lack | 120 | stats in userspace alone is inefficient and potentially inaccurate (due to lack |
103 | of atomicity). | 121 | of atomicity). |
104 | 122 | ||
105 | However, maintaining per-process, in addition to per-task stats, within the | 123 | However, maintaining per-process, in addition to per-task stats, within the |
106 | kernel has space and time overheads. To address this, the taskstats code | 124 | kernel has space and time overheads. To address this, the taskstats code |
107 | accumulates each exiting task's statistics into a process-wide data structure. | 125 | accumulates each exiting task's statistics into a process-wide data structure. |
108 | When the last task of a process exits, the process level data accumulated also | 126 | When the last task of a process exits, the process level data accumulated also |
109 | gets sent to userspace (along with the per-task data). | 127 | gets sent to userspace (along with the per-task data). |
110 | 128 | ||
111 | When a user queries to get per-tgid data, the sum of all other live threads in | 129 | When a user queries to get per-tgid data, the sum of all other live threads in |
112 | the group is added up and added to the accumulated total for previously exited | 130 | the group is added up and added to the accumulated total for previously exited |
113 | threads of the same thread group. | 131 | threads of the same thread group. |
114 | 132 | ||
115 | Extending taskstats | 133 | Extending taskstats |
116 | ------------------- | 134 | ------------------- |
117 | 135 | ||
118 | There are two ways to extend the taskstats interface to export more | 136 | There are two ways to extend the taskstats interface to export more |
119 | per-task/process stats as patches to collect them get added to the kernel | 137 | per-task/process stats as patches to collect them get added to the kernel |
120 | in future: | 138 | in future: |
121 | 139 | ||
122 | 1. Adding more fields to the end of the existing struct taskstats. Backward | 140 | 1. Adding more fields to the end of the existing struct taskstats. Backward |
123 | compatibility is ensured by the version number within the | 141 | compatibility is ensured by the version number within the |
124 | structure. Userspace will use only the fields of the struct that correspond | 142 | structure. Userspace will use only the fields of the struct that correspond |
125 | to the version it's using. | 143 | to the version it's using. |
126 | 144 | ||
127 | 2. Defining separate statistic structs and using the netlink attributes | 145 | 2. Defining separate statistic structs and using the netlink attributes |
128 | interface to return them. Since userspace processes each netlink attribute | 146 | interface to return them. Since userspace processes each netlink attribute |
129 | independently, it can always ignore attributes whose type it does not | 147 | independently, it can always ignore attributes whose type it does not |
130 | understand (because it is using an older version of the interface). | 148 | understand (because it is using an older version of the interface). |
131 | 149 | ||
132 | 150 | ||
133 | Choosing between 1. and 2. is a matter of trading off flexibility and | 151 | Choosing between 1. and 2. is a matter of trading off flexibility and |
134 | overhead. If only a few fields need to be added, then 1. is the preferable | 152 | overhead. If only a few fields need to be added, then 1. is the preferable |
135 | path since the kernel and userspace don't need to incur the overhead of | 153 | path since the kernel and userspace don't need to incur the overhead of |
136 | processing new netlink attributes. But if the new fields expand the existing | 154 | processing new netlink attributes. But if the new fields expand the existing |
137 | struct too much, requiring disparate userspace accounting utilities to | 155 | struct too much, requiring disparate userspace accounting utilities to |
138 | unnecessarily receive large structures whose fields are of no interest, then | 156 | unnecessarily receive large structures whose fields are of no interest, then |
139 | extending the attributes structure would be worthwhile. | 157 | extending the attributes structure would be worthwhile. |
158 | |||
159 | Flow control for taskstats | ||
160 | -------------------------- | ||
161 | |||
162 | When the rate of task exits becomes large, a listener may not be able to keep | ||
163 | up with the kernel's rate of sending per-tid/tgid exit data leading to data | ||
164 | loss. This possibility gets compounded when the taskstats structure gets | ||
165 | extended and the number of cpus grows large. | ||
166 | |||
167 | To avoid losing statistics, userspace should do one or more of the following: | ||
168 | |||
169 | - increase the receive buffer sizes for the netlink sockets opened by | ||
170 | listeners to receive exit data. | ||
171 | |||
172 | - create more listeners and reduce the number of cpus being listened to by | ||
173 | each listener. In the extreme case, there could be one listener for each cpu. | ||
174 | Users may also consider setting the cpu affinity of the listener to the subset | ||
175 | of cpus to which it listens, especially if they are listening to just one cpu. | ||
176 | |||
177 | Despite these measures, if the userspace receives ENOBUFS error messages | ||
178 | indicating overflow of receive buffers, it should take measures to handle the | ||
179 | loss of data. | ||
140 | 180 | ||
141 | ---- | 181 | ---- |
142 | 182 |