Commit 9e06d3f9f6b14f6e3120923ed215032726246c98

Authored by Shailabh Nagar
Committed by Linus Torvalds
1 parent ad4ecbcba7

[PATCH] per task delay accounting taskstats interface: documentation fix

Change documentation and example program to reflect the flow control issues
being addressed by the cpumask changes.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 365 additions and 305 deletions Inline Diff

Documentation/accounting/getdelays.c
1 /* getdelays.c 1 /* getdelays.c
2 * 2 *
3 * Utility to get per-pid and per-tgid delay accounting statistics 3 * Utility to get per-pid and per-tgid delay accounting statistics
4 * Also illustrates usage of the taskstats interface 4 * Also illustrates usage of the taskstats interface
5 * 5 *
6 * Copyright (C) Shailabh Nagar, IBM Corp. 2005 6 * Copyright (C) Shailabh Nagar, IBM Corp. 2005
7 * Copyright (C) Balbir Singh, IBM Corp. 2006 7 * Copyright (C) Balbir Singh, IBM Corp. 2006
8 * Copyright (c) Jay Lan, SGI. 2006
8 * 9 *
9 */ 10 */
10 11
11 #include <stdio.h> 12 #include <stdio.h>
12 #include <stdlib.h> 13 #include <stdlib.h>
13 #include <errno.h> 14 #include <errno.h>
14 #include <unistd.h> 15 #include <unistd.h>
15 #include <poll.h> 16 #include <poll.h>
16 #include <string.h> 17 #include <string.h>
17 #include <fcntl.h> 18 #include <fcntl.h>
18 #include <sys/types.h> 19 #include <sys/types.h>
19 #include <sys/stat.h> 20 #include <sys/stat.h>
20 #include <sys/socket.h> 21 #include <sys/socket.h>
21 #include <sys/types.h> 22 #include <sys/types.h>
22 #include <signal.h> 23 #include <signal.h>
23 24
24 #include <linux/genetlink.h> 25 #include <linux/genetlink.h>
25 #include <linux/taskstats.h> 26 #include <linux/taskstats.h>
26 27
27 /* 28 /*
28 * Generic macros for dealing with netlink sockets. Might be duplicated 29 * Generic macros for dealing with netlink sockets. Might be duplicated
29 * elsewhere. It is recommended that commercial grade applications use 30 * elsewhere. It is recommended that commercial grade applications use
30 * libnl or libnetlink and use the interfaces provided by the library 31 * libnl or libnetlink and use the interfaces provided by the library
31 */ 32 */
32 #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) 33 #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
33 #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) 34 #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
34 #define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN)) 35 #define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN))
35 #define NLA_PAYLOAD(len) (len - NLA_HDRLEN) 36 #define NLA_PAYLOAD(len) (len - NLA_HDRLEN)
36 37
37 #define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0) 38 #define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0)
38 int done = 0; 39 int done = 0;
40 int rcvbufsz=0;
39 41
42 char name[100];
43 int dbg=0, print_delays=0;
44 __u64 stime, utime;
45 #define PRINTF(fmt, arg...) { \
46 if (dbg) { \
47 printf(fmt, ##arg); \
48 } \
49 }
50
51 /* Maximum size of response requested or message sent */
52 #define MAX_MSG_SIZE 256
53 /* Maximum number of cpus expected to be specified in a cpumask */
54 #define MAX_CPUS 32
55 /* Maximum length of pathname to log file */
56 #define MAX_FILENAME 256
57
58 struct msgtemplate {
59 struct nlmsghdr n;
60 struct genlmsghdr g;
61 char buf[MAX_MSG_SIZE];
62 };
63
64 char cpumask[100+6*MAX_CPUS];
65
40 /* 66 /*
41 * Create a raw netlink socket and bind 67 * Create a raw netlink socket and bind
42 */ 68 */
43 static int create_nl_socket(int protocol, int groups) 69 static int create_nl_socket(int protocol)
44 { 70 {
45 socklen_t addr_len; 71 int fd;
46 int fd; 72 struct sockaddr_nl local;
47 struct sockaddr_nl local;
48 73
49 fd = socket(AF_NETLINK, SOCK_RAW, protocol); 74 fd = socket(AF_NETLINK, SOCK_RAW, protocol);
50 if (fd < 0) 75 if (fd < 0)
51 return -1; 76 return -1;
52 77
53 memset(&local, 0, sizeof(local)); 78 if (rcvbufsz)
54 local.nl_family = AF_NETLINK; 79 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
55 local.nl_groups = groups; 80 &rcvbufsz, sizeof(rcvbufsz)) < 0) {
81 printf("Unable to set socket rcv buf size to %d\n",
82 rcvbufsz);
83 return -1;
84 }
56 85
57 if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) 86 memset(&local, 0, sizeof(local));
58 goto error; 87 local.nl_family = AF_NETLINK;
59 88
60 return fd; 89 if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
61 error: 90 goto error;
62 close(fd); 91
63 return -1; 92 return fd;
93 error:
94 close(fd);
95 return -1;
64 } 96 }
65 97
66 int sendto_fd(int s, const char *buf, int bufLen) 98
99 int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
100 __u8 genl_cmd, __u16 nla_type,
101 void *nla_data, int nla_len)
67 { 102 {
68 struct sockaddr_nl nladdr; 103 struct nlattr *na;
69 int r; 104 struct sockaddr_nl nladdr;
105 int r, buflen;
106 char *buf;
70 107
71 memset(&nladdr, 0, sizeof(nladdr)); 108 struct msgtemplate msg;
72 nladdr.nl_family = AF_NETLINK;
73 109
74 while ((r = sendto(s, buf, bufLen, 0, (struct sockaddr *) &nladdr, 110 msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
75 sizeof(nladdr))) < bufLen) { 111 msg.n.nlmsg_type = nlmsg_type;
76 if (r > 0) { 112 msg.n.nlmsg_flags = NLM_F_REQUEST;
77 buf += r; 113 msg.n.nlmsg_seq = 0;
78 bufLen -= r; 114 msg.n.nlmsg_pid = nlmsg_pid;
79 } else if (errno != EAGAIN) 115 msg.g.cmd = genl_cmd;
80 return -1; 116 msg.g.version = 0x1;
81 } 117 na = (struct nlattr *) GENLMSG_DATA(&msg);
82 return 0; 118 na->nla_type = nla_type;
119 na->nla_len = nla_len + 1 + NLA_HDRLEN;
120 memcpy(NLA_DATA(na), nla_data, nla_len);
121 msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
122
123 buf = (char *) &msg;
124 buflen = msg.n.nlmsg_len ;
125 memset(&nladdr, 0, sizeof(nladdr));
126 nladdr.nl_family = AF_NETLINK;
127 while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
128 sizeof(nladdr))) < buflen) {
129 if (r > 0) {
130 buf += r;
131 buflen -= r;
132 } else if (errno != EAGAIN)
133 return -1;
134 }
135 return 0;
83 } 136 }
84 137
138
85 /* 139 /*
86 * Probe the controller in genetlink to find the family id 140 * Probe the controller in genetlink to find the family id
87 * for the TASKSTATS family 141 * for the TASKSTATS family
88 */ 142 */
89 int get_family_id(int sd) 143 int get_family_id(int sd)
90 { 144 {
91 struct { 145 struct {
92 struct nlmsghdr n; 146 struct nlmsghdr n;
93 struct genlmsghdr g; 147 struct genlmsghdr g;
94 char buf[256]; 148 char buf[256];
95 } family_req; 149 } ans;
96 struct {
97 struct nlmsghdr n;
98 struct genlmsghdr g;
99 char buf[256];
100 } ans;
101 150
102 int id; 151 int id, rc;
103 struct nlattr *na; 152 struct nlattr *na;
104 int rep_len; 153 int rep_len;
105 154
106 /* Get family name */ 155 strcpy(name, TASKSTATS_GENL_NAME);
107 family_req.n.nlmsg_type = GENL_ID_CTRL; 156 rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
108 family_req.n.nlmsg_flags = NLM_F_REQUEST; 157 CTRL_ATTR_FAMILY_NAME, (void *)name,
109 family_req.n.nlmsg_seq = 0; 158 strlen(TASKSTATS_GENL_NAME)+1);
110 family_req.n.nlmsg_pid = getpid();
111 family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
112 family_req.g.cmd = CTRL_CMD_GETFAMILY;
113 family_req.g.version = 0x1;
114 na = (struct nlattr *) GENLMSG_DATA(&family_req);
115 na->nla_type = CTRL_ATTR_FAMILY_NAME;
116 na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN;
117 strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME);
118 family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
119 159
120 if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0) 160 rep_len = recv(sd, &ans, sizeof(ans), 0);
121 err(1, "error sending message via Netlink\n"); 161 if (ans.n.nlmsg_type == NLMSG_ERROR ||
162 (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
163 return 0;
122 164
123 rep_len = recv(sd, &ans, sizeof(ans), 0); 165 na = (struct nlattr *) GENLMSG_DATA(&ans);
124 166 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
125 if (rep_len < 0) 167 if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
126 err(1, "error receiving reply message via Netlink\n"); 168 id = *(__u16 *) NLA_DATA(na);
127 169 }
128 170 return id;
129 /* Validate response message */
130 if (!NLMSG_OK((&ans.n), rep_len))
131 err(1, "invalid reply message received via Netlink\n");
132
133 if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */
134 printf("error received NACK - leaving\n");
135 exit(1);
136 }
137
138
139 na = (struct nlattr *) GENLMSG_DATA(&ans);
140 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
141 if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
142 id = *(__u16 *) NLA_DATA(na);
143 }
144 return id;
145 } 171 }
146 172
147 void print_taskstats(struct taskstats *t) 173 void print_delayacct(struct taskstats *t)
148 { 174 {
149 printf("\n\nCPU %15s%15s%15s%15s\n" 175 printf("\n\nCPU %15s%15s%15s%15s\n"
150 " %15llu%15llu%15llu%15llu\n" 176 " %15llu%15llu%15llu%15llu\n"
151 "IO %15s%15s\n" 177 "IO %15s%15s\n"
152 " %15llu%15llu\n" 178 " %15llu%15llu\n"
153 "MEM %15s%15s\n" 179 "MEM %15s%15s\n"
154 " %15llu%15llu\n\n", 180 " %15llu%15llu\n\n",
155 "count", "real total", "virtual total", "delay total", 181 "count", "real total", "virtual total", "delay total",
156 t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total, 182 t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
157 t->cpu_delay_total, 183 t->cpu_delay_total,
158 "count", "delay total", 184 "count", "delay total",
159 t->blkio_count, t->blkio_delay_total, 185 t->blkio_count, t->blkio_delay_total,
160 "count", "delay total", t->swapin_count, t->swapin_delay_total); 186 "count", "delay total", t->swapin_count, t->swapin_delay_total);
161 } 187 }
162 188
163 void sigchld(int sig)
164 {
165 done = 1;
166 }
167
168 int main(int argc, char *argv[]) 189 int main(int argc, char *argv[])
169 { 190 {
170 int rc; 191 int c, rc, rep_len, aggr_len, len2, cmd_type;
171 int sk_nl; 192 __u16 id;
172 struct nlmsghdr *nlh; 193 __u32 mypid;
173 struct genlmsghdr *genlhdr;
174 char *buf;
175 struct taskstats_cmd_param *param;
176 __u16 id;
177 struct nlattr *na;
178 194
179 /* For receiving */ 195 struct nlattr *na;
180 struct sockaddr_nl kern_nla, from_nla; 196 int nl_sd = -1;
181 socklen_t from_nla_len; 197 int len = 0;
182 int recv_len; 198 pid_t tid = 0;
183 struct taskstats_reply *reply; 199 pid_t rtid = 0;
184 200
185 struct { 201 int fd = 0;
186 struct nlmsghdr n; 202 int count = 0;
187 struct genlmsghdr g; 203 int write_file = 0;
188 char buf[256]; 204 int maskset = 0;
189 } req; 205 char logfile[128];
206 int loop = 0;
190 207
191 struct { 208 struct msgtemplate msg;
192 struct nlmsghdr n;
193 struct genlmsghdr g;
194 char buf[256];
195 } ans;
196 209
197 int nl_sd = -1; 210 while (1) {
198 int rep_len; 211 c = getopt(argc, argv, "dw:r:m:t:p:v:l");
199 int len = 0; 212 if (c < 0)
200 int aggr_len, len2; 213 break;
201 struct sockaddr_nl nladdr;
202 pid_t tid = 0;
203 pid_t rtid = 0;
204 int cmd_type = TASKSTATS_TYPE_TGID;
205 int c, status;
206 int forking = 0;
207 struct sigaction act = {
208 .sa_handler = SIG_IGN,
209 .sa_mask = SA_NOMASK,
210 };
211 struct sigaction tact ;
212 214
213 if (argc < 3) { 215 switch (c) {
214 printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]); 216 case 'd':
215 exit(-1); 217 printf("print delayacct stats ON\n");
216 } 218 print_delays = 1;
219 break;
220 case 'w':
221 strncpy(logfile, optarg, MAX_FILENAME);
222 printf("write to file %s\n", logfile);
223 write_file = 1;
224 break;
225 case 'r':
226 rcvbufsz = atoi(optarg);
227 printf("receive buf size %d\n", rcvbufsz);
228 if (rcvbufsz < 0)
229 err(1, "Invalid rcv buf size\n");
230 break;
231 case 'm':
232 strncpy(cpumask, optarg, sizeof(cpumask));
233 maskset = 1;
234 printf("cpumask %s maskset %d\n", cpumask, maskset);
235 break;
236 case 't':
237 tid = atoi(optarg);
238 if (!tid)
239 err(1, "Invalid tgid\n");
240 cmd_type = TASKSTATS_CMD_ATTR_TGID;
241 print_delays = 1;
242 break;
243 case 'p':
244 tid = atoi(optarg);
245 if (!tid)
246 err(1, "Invalid pid\n");
247 cmd_type = TASKSTATS_CMD_ATTR_PID;
248 print_delays = 1;
249 break;
250 case 'v':
251 printf("debug on\n");
252 dbg = 1;
253 break;
254 case 'l':
255 printf("listen forever\n");
256 loop = 1;
257 break;
258 default:
259 printf("Unknown option %d\n", c);
260 exit(-1);
261 }
262 }
217 263
218 tact.sa_handler = sigchld; 264 if (write_file) {
219 sigemptyset(&tact.sa_mask); 265 fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC,
220 if (sigaction(SIGCHLD, &tact, NULL) < 0) 266 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
221 err(1, "sigaction failed for SIGCHLD\n"); 267 if (fd == -1) {
268 perror("Cannot open output file\n");
269 exit(1);
270 }
271 }
222 272
223 while (1) { 273 if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0)
274 err(1, "error creating Netlink socket\n");
224 275
225 c = getopt(argc, argv, "t:p:c:");
226 if (c < 0)
227 break;
228 276
229 switch (c) { 277 mypid = getpid();
230 case 't': 278 id = get_family_id(nl_sd);
231 tid = atoi(optarg); 279 if (!id) {
232 if (!tid) 280 printf("Error getting family id, errno %d", errno);
233 err(1, "Invalid tgid\n"); 281 goto err;
234 cmd_type = TASKSTATS_CMD_ATTR_TGID; 282 }
235 break; 283 PRINTF("family id %d\n", id);
236 case 'p':
237 tid = atoi(optarg);
238 if (!tid)
239 err(1, "Invalid pid\n");
240 cmd_type = TASKSTATS_CMD_ATTR_TGID;
241 break;
242 case 'c':
243 opterr = 0;
244 tid = fork();
245 if (tid < 0)
246 err(1, "fork failed\n");
247 284
248 if (tid == 0) { /* child process */ 285 if (maskset) {
249 if (execvp(argv[optind - 1], &argv[optind - 1]) < 0) { 286 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
250 exit(-1); 287 TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
288 &cpumask, sizeof(cpumask));
289 PRINTF("Sent register cpumask, retval %d\n", rc);
290 if (rc < 0) {
291 printf("error sending register cpumask\n");
292 goto err;
251 } 293 }
252 }
253 forking = 1;
254 break;
255 default:
256 printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
257 exit(-1);
258 break;
259 } 294 }
260 if (c == 'c')
261 break;
262 }
263 295
264 /* Construct Netlink request message */ 296 if (tid) {
297 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
298 cmd_type, &tid, sizeof(__u32));
299 PRINTF("Sent pid/tgid, retval %d\n", rc);
300 if (rc < 0) {
301 printf("error sending tid/tgid cmd\n");
302 goto done;
303 }
304 }
265 305
266 /* Send Netlink request message & get reply */ 306 do {
307 int i;
267 308
268 if ((nl_sd = 309 rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
269 create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0) 310 PRINTF("received %d bytes\n", rep_len);
270 err(1, "error creating Netlink socket\n");
271 311
312 if (rep_len < 0) {
313 printf("nonfatal reply error: errno %d\n", errno);
314 continue;
315 }
Documentation/accounting/taskstats.txt
1 Per-task statistics interface 1 Per-task statistics interface
2 ----------------------------- 2 -----------------------------
3 3
4 4
5 Taskstats is a netlink-based interface for sending per-task and 5 Taskstats is a netlink-based interface for sending per-task and
6 per-process statistics from the kernel to userspace. 6 per-process statistics from the kernel to userspace.
7 7
8 Taskstats was designed for the following benefits: 8 Taskstats was designed for the following benefits:
9 9
10 - efficiently provide statistics during lifetime of a task and on its exit 10 - efficiently provide statistics during lifetime of a task and on its exit
11 - unified interface for multiple accounting subsystems 11 - unified interface for multiple accounting subsystems
12 - extensibility for use by future accounting patches 12 - extensibility for use by future accounting patches
13 13
14 Terminology 14 Terminology
15 ----------- 15 -----------
16 16
17 "pid", "tid" and "task" are used interchangeably and refer to the standard 17 "pid", "tid" and "task" are used interchangeably and refer to the standard
18 Linux task defined by struct task_struct. per-pid stats are the same as 18 Linux task defined by struct task_struct. per-pid stats are the same as
19 per-task stats. 19 per-task stats.
20 20
21 "tgid", "process" and "thread group" are used interchangeably and refer to the 21 "tgid", "process" and "thread group" are used interchangeably and refer to the
22 tasks that share an mm_struct i.e. the traditional Unix process. Despite the 22 tasks that share an mm_struct i.e. the traditional Unix process. Despite the
23 use of tgid, there is no special treatment for the task that is thread group 23 use of tgid, there is no special treatment for the task that is thread group
24 leader - a process is deemed alive as long as it has any task belonging to it. 24 leader - a process is deemed alive as long as it has any task belonging to it.
25 25
26 Usage 26 Usage
27 ----- 27 -----
28 28
29 To get statistics during task's lifetime, userspace opens a unicast netlink 29 To get statistics during a task's lifetime, userspace opens a unicast netlink
30 socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid. 30 socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
31 The response contains statistics for a task (if pid is specified) or the sum of 31 The response contains statistics for a task (if pid is specified) or the sum of
32 statistics for all tasks of the process (if tgid is specified). 32 statistics for all tasks of the process (if tgid is specified).
33 33
34 To obtain statistics for tasks which are exiting, userspace opens a multicast 34 To obtain statistics for tasks which are exiting, the userspace listener
35 netlink socket. Each time a task exits, its per-pid statistics is always sent 35 sends a register command and specifies a cpumask. Whenever a task exits on
36 by the kernel to each listener on the multicast socket. In addition, if it is 36 one of the cpus in the cpumask, its per-pid statistics are sent to the
37 the last thread exiting its thread group, an additional record containing the 37 registered listener. Using cpumasks allows the data received by one listener
38 per-tgid stats are also sent. The latter contains the sum of per-pid stats for 38 to be limited and assists in flow control over the netlink interface and is
39 all threads in the thread group, both past and present. 39 explained in more detail below.
40 40
41 If the exiting task is the last thread exiting its thread group,
42 an additional record containing the per-tgid stats is also sent to userspace.
43 The latter contains the sum of per-pid stats for all threads in the thread
44 group, both past and present.
45
41 getdelays.c is a simple utility demonstrating usage of the taskstats interface 46 getdelays.c is a simple utility demonstrating usage of the taskstats interface
42 for reporting delay accounting statistics. 47 for reporting delay accounting statistics. Users can register cpumasks,
48 send commands and process responses, listen for per-tid/tgid exit data,
49 write the data received to a file and do basic flow control by increasing
50 receive buffer sizes.
43 51
44 Interface 52 Interface
45 --------- 53 ---------
46 54
47 The user-kernel interface is encapsulated in include/linux/taskstats.h 55 The user-kernel interface is encapsulated in include/linux/taskstats.h
48 56
49 To avoid this documentation becoming obsolete as the interface evolves, only 57 To avoid this documentation becoming obsolete as the interface evolves, only
50 an outline of the current version is given. taskstats.h always overrides the 58 an outline of the current version is given. taskstats.h always overrides the
51 description here. 59 description here.
52 60
53 struct taskstats is the common accounting structure for both per-pid and 61 struct taskstats is the common accounting structure for both per-pid and
54 per-tgid data. It is versioned and can be extended by each accounting subsystem 62 per-tgid data. It is versioned and can be extended by each accounting subsystem
55 that is added to the kernel. The fields and their semantics are defined in the 63 that is added to the kernel. The fields and their semantics are defined in the
56 taskstats.h file. 64 taskstats.h file.
57 65
58 The data exchanged between user and kernel space is a netlink message belonging 66 The data exchanged between user and kernel space is a netlink message belonging
59 to the NETLINK_GENERIC family and using the netlink attributes interface. 67 to the NETLINK_GENERIC family and using the netlink attributes interface.
60 The messages are in the format 68 The messages are in the format
61 69
62 +----------+- - -+-------------+-------------------+ 70 +----------+- - -+-------------+-------------------+
63 | nlmsghdr | Pad | genlmsghdr | taskstats payload | 71 | nlmsghdr | Pad | genlmsghdr | taskstats payload |
64 +----------+- - -+-------------+-------------------+ 72 +----------+- - -+-------------+-------------------+
65 73
66 74
67 The taskstats payload is one of the following three kinds: 75 The taskstats payload is one of the following three kinds:
68 76
69 1. Commands: Sent from user to kernel. The payload is one attribute, of type 77 1. Commands: Sent from user to kernel. Commands to get data on
70 TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute 78 a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
71 payload. The pid/tgid denotes the task/process for which userspace wants 79 containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
72 statistics. 80 the task/process for which userspace wants statistics.
73 81
82 Commands to register/deregister interest in exit data from a set of cpus
83 consist of one attribute, of type
84 TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
85 attribute payload. The cpumask is specified as an ascii string of
86 comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
87 the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
88 in cpus before closing the listening socket, the kernel cleans up its interest
89 set over time. However, for the sake of efficiency, an explicit deregistration
90 is advisable.
91
74 2. Response for a command: sent from the kernel in response to a userspace 92 2. Response for a command: sent from the kernel in response to a userspace
75 command. The payload is a series of three attributes of type: 93 command. The payload is a series of three attributes of type:
76 94
77 a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates 95 a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates
78 a pid/tgid will be followed by some stats. 96 a pid/tgid will be followed by some stats.
79 97
80 b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats 98 b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats
81 are being returned. 99 are being returned.
82 100
83 c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The 101 c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The
84 same structure is used for both per-pid and per-tgid stats. 102 same structure is used for both per-pid and per-tgid stats.
85 103
86 3. New message sent by kernel whenever a task exits. The payload consists of a 104 3. New message sent by kernel whenever a task exits. The payload consists of a
87 series of attributes of the following type: 105 series of attributes of the following type:
88 106
89 a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats 107 a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats
90 b) TASKSTATS_TYPE_PID: contains exiting task's pid 108 b) TASKSTATS_TYPE_PID: contains exiting task's pid
91 c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats 109 c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats
92 d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats 110 d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats
93 e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs 111 e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs
94 f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process 112 f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process
95 113
96 114
97 per-tgid stats 115 per-tgid stats
98 -------------- 116 --------------
99 117
100 Taskstats provides per-process stats, in addition to per-task stats, since 118 Taskstats provides per-process stats, in addition to per-task stats, since
101 resource management is often done at a process granularity and aggregating task 119 resource management is often done at a process granularity and aggregating task
102 stats in userspace alone is inefficient and potentially inaccurate (due to lack 120 stats in userspace alone is inefficient and potentially inaccurate (due to lack
103 of atomicity). 121 of atomicity).
104 122
105 However, maintaining per-process, in addition to per-task stats, within the 123 However, maintaining per-process, in addition to per-task stats, within the
106 kernel has space and time overheads. To address this, the taskstats code 124 kernel has space and time overheads. To address this, the taskstats code
107 accumulates each exiting task's statistics into a process-wide data structure. 125 accumulates each exiting task's statistics into a process-wide data structure.
108 When the last task of a process exits, the process level data accumulated also 126 When the last task of a process exits, the process level data accumulated also
109 gets sent to userspace (along with the per-task data). 127 gets sent to userspace (along with the per-task data).
110 128
111 When a user queries to get per-tgid data, the sum of all other live threads in 129 When a user queries to get per-tgid data, the sum of all other live threads in
112 the group is added up and added to the accumulated total for previously exited 130 the group is added up and added to the accumulated total for previously exited
113 threads of the same thread group. 131 threads of the same thread group.
114 132
115 Extending taskstats 133 Extending taskstats
116 ------------------- 134 -------------------
117 135
118 There are two ways to extend the taskstats interface to export more 136 There are two ways to extend the taskstats interface to export more
119 per-task/process stats as patches to collect them get added to the kernel 137 per-task/process stats as patches to collect them get added to the kernel
120 in future: 138 in future:
121 139
122 1. Adding more fields to the end of the existing struct taskstats. Backward 140 1. Adding more fields to the end of the existing struct taskstats. Backward
123 compatibility is ensured by the version number within the 141 compatibility is ensured by the version number within the
124 structure. Userspace will use only the fields of the struct that correspond 142 structure. Userspace will use only the fields of the struct that correspond
125 to the version it is using. 143 to the version it is using.
126 144
127 2. Defining separate statistic structs and using the netlink attributes 145 2. Defining separate statistic structs and using the netlink attributes
128 interface to return them. Since userspace processes each netlink attribute 146 interface to return them. Since userspace processes each netlink attribute
129 independently, it can always ignore attributes whose type it does not 147 independently, it can always ignore attributes whose type it does not
130 understand (because it is using an older version of the interface). 148 understand (because it is using an older version of the interface).
131 149
132 150
133 Choosing between 1. and 2. is a matter of trading off flexibility and 151 Choosing between 1. and 2. is a matter of trading off flexibility and
134 overhead. If only a few fields need to be added, then 1. is the preferable 152 overhead. If only a few fields need to be added, then 1. is the preferable
135 path since the kernel and userspace don't need to incur the overhead of 153 path since the kernel and userspace don't need to incur the overhead of
136 processing new netlink attributes. But if the new fields expand the existing 154 processing new netlink attributes. But if the new fields expand the existing
137 struct too much, requiring disparate userspace accounting utilities to 155 struct too much, requiring disparate userspace accounting utilities to
138 unnecessarily receive large structures whose fields are of no interest, then 156 unnecessarily receive large structures whose fields are of no interest, then
139 extending the attributes structure would be worthwhile. 157 extending the attributes structure would be worthwhile.
158
159 Flow control for taskstats
160 --------------------------
161
162 When the rate of task exits becomes large, a listener may not be able to keep
163 up with the kernel's rate of sending per-tid/tgid exit data leading to data
164 loss. This possibility gets compounded when the taskstats structure gets
165 extended and the number of cpus grows large.
166
167 To avoid losing statistics, userspace should do one or more of the following:
168
169 - increase the receive buffer sizes for the netlink sockets opened by
170 listeners to receive exit data.
171
172 - create more listeners and reduce the number of cpus being listened to by
173 each listener. In the extreme case, there could be one listener for each cpu.
174 Users may also consider setting the cpu affinity of the listener to the subset
175 of cpus to which it listens, especially if they are listening to just one cpu.
176
177 Despite these measures, if the userspace receives ENOBUFS error messages
178 indicating overflow of receive buffers, it should take measures to handle the
179 loss of data.
140 180
141 ---- 181 ----
142 182