Commit 9e06d3f9f6b14f6e3120923ed215032726246c98

Authored by Shailabh Nagar
Committed by Linus Torvalds
1 parent ad4ecbcba7

[PATCH] per task delay accounting taskstats interface: documentation fix

Change documentation and example program to reflect the flow control issues
being addressed by the cpumask changes.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 365 additions and 305 deletions Side-by-side Diff

Documentation/accounting/getdelays.c
... ... @@ -5,6 +5,7 @@
5 5 *
6 6 * Copyright (C) Shailabh Nagar, IBM Corp. 2005
7 7 * Copyright (C) Balbir Singh, IBM Corp. 2006
  8 + * Copyright (c) Jay Lan, SGI. 2006
8 9 *
9 10 */
10 11  
11 12  
12 13  
13 14  
14 15  
15 16  
16 17  
17 18  
18 19  
19 20  
20 21  
21 22  
22 23  
23 24  
24 25  
25 26  
26 27  
27 28  
28 29  
29 30  
30 31  
31 32  
32 33  
33 34  
34 35  
35 36  
36 37  
37 38  
38 39  
39 40  
40 41  
41 42  
42 43  
43 44  
44 45  
45 46  
46 47  
47 48  
48 49  
49 50  
50 51  
51 52  
52 53  
53 54  
54 55  
55 56  
... ... @@ -36,342 +37,361 @@
36 37  
37 38 #define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0)
38 39 int done = 0;
  40 +int rcvbufsz=0;
39 41  
  42 + char name[100];
  43 +int dbg=0, print_delays=0;
  44 +__u64 stime, utime;
  45 +#define PRINTF(fmt, arg...) { \
  46 + if (dbg) { \
  47 + printf(fmt, ##arg); \
  48 + } \
  49 + }
  50 +
  51 +/* Maximum size of response requested or message sent */
  52 +#define MAX_MSG_SIZE 256
  53 +/* Maximum number of cpus expected to be specified in a cpumask */
  54 +#define MAX_CPUS 32
  55 +/* Maximum length of pathname to log file */
  56 +#define MAX_FILENAME 256
  57 +
  58 +struct msgtemplate {
  59 + struct nlmsghdr n;
  60 + struct genlmsghdr g;
  61 + char buf[MAX_MSG_SIZE];
  62 +};
  63 +
  64 +char cpumask[100+6*MAX_CPUS];
  65 +
40 66 /*
41 67 * Create a raw netlink socket and bind
42 68 */
43   -static int create_nl_socket(int protocol, int groups)
  69 +static int create_nl_socket(int protocol)
44 70 {
45   - socklen_t addr_len;
46   - int fd;
47   - struct sockaddr_nl local;
  71 + int fd;
  72 + struct sockaddr_nl local;
48 73  
49   - fd = socket(AF_NETLINK, SOCK_RAW, protocol);
50   - if (fd < 0)
51   - return -1;
  74 + fd = socket(AF_NETLINK, SOCK_RAW, protocol);
  75 + if (fd < 0)
  76 + return -1;
52 77  
53   - memset(&local, 0, sizeof(local));
54   - local.nl_family = AF_NETLINK;
55   - local.nl_groups = groups;
  78 + if (rcvbufsz)
  79 + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
  80 + &rcvbufsz, sizeof(rcvbufsz)) < 0) {
  81 + printf("Unable to set socket rcv buf size to %d\n",
  82 + rcvbufsz);
  83 + return -1;
  84 + }
56 85  
57   - if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
58   - goto error;
  86 + memset(&local, 0, sizeof(local));
  87 + local.nl_family = AF_NETLINK;
59 88  
60   - return fd;
61   - error:
62   - close(fd);
63   - return -1;
  89 + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
  90 + goto error;
  91 +
  92 + return fd;
  93 +error:
  94 + close(fd);
  95 + return -1;
64 96 }
65 97  
66   -int sendto_fd(int s, const char *buf, int bufLen)
  98 +
  99 +int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
  100 + __u8 genl_cmd, __u16 nla_type,
  101 + void *nla_data, int nla_len)
67 102 {
68   - struct sockaddr_nl nladdr;
69   - int r;
  103 + struct nlattr *na;
  104 + struct sockaddr_nl nladdr;
  105 + int r, buflen;
  106 + char *buf;
70 107  
71   - memset(&nladdr, 0, sizeof(nladdr));
72   - nladdr.nl_family = AF_NETLINK;
  108 + struct msgtemplate msg;
73 109  
74   - while ((r = sendto(s, buf, bufLen, 0, (struct sockaddr *) &nladdr,
75   - sizeof(nladdr))) < bufLen) {
76   - if (r > 0) {
77   - buf += r;
78   - bufLen -= r;
79   - } else if (errno != EAGAIN)
80   - return -1;
81   - }
82   - return 0;
  110 + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
  111 + msg.n.nlmsg_type = nlmsg_type;
  112 + msg.n.nlmsg_flags = NLM_F_REQUEST;
  113 + msg.n.nlmsg_seq = 0;
  114 + msg.n.nlmsg_pid = nlmsg_pid;
  115 + msg.g.cmd = genl_cmd;
  116 + msg.g.version = 0x1;
  117 + na = (struct nlattr *) GENLMSG_DATA(&msg);
  118 + na->nla_type = nla_type;
  119 + na->nla_len = nla_len + 1 + NLA_HDRLEN;
  120 + memcpy(NLA_DATA(na), nla_data, nla_len);
  121 + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
  122 +
  123 + buf = (char *) &msg;
  124 + buflen = msg.n.nlmsg_len ;
  125 + memset(&nladdr, 0, sizeof(nladdr));
  126 + nladdr.nl_family = AF_NETLINK;
  127 + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
  128 + sizeof(nladdr))) < buflen) {
  129 + if (r > 0) {
  130 + buf += r;
  131 + buflen -= r;
  132 + } else if (errno != EAGAIN)
  133 + return -1;
  134 + }
  135 + return 0;
83 136 }
84 137  
  138 +
85 139 /*
86 140 * Probe the controller in genetlink to find the family id
87 141 * for the TASKSTATS family
88 142 */
89 143 int get_family_id(int sd)
90 144 {
91   - struct {
92   - struct nlmsghdr n;
93   - struct genlmsghdr g;
94   - char buf[256];
95   - } family_req;
96   - struct {
97   - struct nlmsghdr n;
98   - struct genlmsghdr g;
99   - char buf[256];
100   - } ans;
  145 + struct {
  146 + struct nlmsghdr n;
  147 + struct genlmsghdr g;
  148 + char buf[256];
  149 + } ans;
101 150  
102   - int id;
103   - struct nlattr *na;
104   - int rep_len;
  151 + int id, rc;
  152 + struct nlattr *na;
  153 + int rep_len;
105 154  
106   - /* Get family name */
107   - family_req.n.nlmsg_type = GENL_ID_CTRL;
108   - family_req.n.nlmsg_flags = NLM_F_REQUEST;
109   - family_req.n.nlmsg_seq = 0;
110   - family_req.n.nlmsg_pid = getpid();
111   - family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
112   - family_req.g.cmd = CTRL_CMD_GETFAMILY;
113   - family_req.g.version = 0x1;
114   - na = (struct nlattr *) GENLMSG_DATA(&family_req);
115   - na->nla_type = CTRL_ATTR_FAMILY_NAME;
116   - na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN;
117   - strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME);
118   - family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
  155 + strcpy(name, TASKSTATS_GENL_NAME);
  156 + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
  157 + CTRL_ATTR_FAMILY_NAME, (void *)name,
  158 + strlen(TASKSTATS_GENL_NAME)+1);
119 159  
120   - if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0)
121   - err(1, "error sending message via Netlink\n");
  160 + rep_len = recv(sd, &ans, sizeof(ans), 0);
  161 + if (ans.n.nlmsg_type == NLMSG_ERROR ||
  162 + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
  163 + return 0;
122 164  
123   - rep_len = recv(sd, &ans, sizeof(ans), 0);
124   -
125   - if (rep_len < 0)
126   - err(1, "error receiving reply message via Netlink\n");
127   -
128   -
129   - /* Validate response message */
130   - if (!NLMSG_OK((&ans.n), rep_len))
131   - err(1, "invalid reply message received via Netlink\n");
132   -
133   - if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */
134   - printf("error received NACK - leaving\n");
135   - exit(1);
136   - }
137   -
138   -
139   - na = (struct nlattr *) GENLMSG_DATA(&ans);
140   - na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
141   - if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
142   - id = *(__u16 *) NLA_DATA(na);
143   - }
144   - return id;
  165 + na = (struct nlattr *) GENLMSG_DATA(&ans);
  166 + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
  167 + if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
  168 + id = *(__u16 *) NLA_DATA(na);
  169 + }
  170 + return id;
145 171 }
146 172  
147   -void print_taskstats(struct taskstats *t)
  173 +void print_delayacct(struct taskstats *t)
148 174 {
149   - printf("\n\nCPU %15s%15s%15s%15s\n"
150   - " %15llu%15llu%15llu%15llu\n"
151   - "IO %15s%15s\n"
152   - " %15llu%15llu\n"
153   - "MEM %15s%15s\n"
154   - " %15llu%15llu\n\n",
155   - "count", "real total", "virtual total", "delay total",
156   - t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
157   - t->cpu_delay_total,
158   - "count", "delay total",
159   - t->blkio_count, t->blkio_delay_total,
160   - "count", "delay total", t->swapin_count, t->swapin_delay_total);
  175 + printf("\n\nCPU %15s%15s%15s%15s\n"
  176 + " %15llu%15llu%15llu%15llu\n"
  177 + "IO %15s%15s\n"
  178 + " %15llu%15llu\n"
  179 + "MEM %15s%15s\n"
  180 + " %15llu%15llu\n\n",
  181 + "count", "real total", "virtual total", "delay total",
  182 + t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
  183 + t->cpu_delay_total,
  184 + "count", "delay total",
  185 + t->blkio_count, t->blkio_delay_total,
  186 + "count", "delay total", t->swapin_count, t->swapin_delay_total);
161 187 }
162 188  
163   -void sigchld(int sig)
164   -{
165   - done = 1;
166   -}
167   -
168 189 int main(int argc, char *argv[])
169 190 {
170   - int rc;
171   - int sk_nl;
172   - struct nlmsghdr *nlh;
173   - struct genlmsghdr *genlhdr;
174   - char *buf;
175   - struct taskstats_cmd_param *param;
176   - __u16 id;
177   - struct nlattr *na;
  191 + int c, rc, rep_len, aggr_len, len2, cmd_type;
  192 + __u16 id;
  193 + __u32 mypid;
178 194  
179   - /* For receiving */
180   - struct sockaddr_nl kern_nla, from_nla;
181   - socklen_t from_nla_len;
182   - int recv_len;
183   - struct taskstats_reply *reply;
  195 + struct nlattr *na;
  196 + int nl_sd = -1;
  197 + int len = 0;
  198 + pid_t tid = 0;
  199 + pid_t rtid = 0;
184 200  
185   - struct {
186   - struct nlmsghdr n;
187   - struct genlmsghdr g;
188   - char buf[256];
189   - } req;
  201 + int fd = 0;
  202 + int count = 0;
  203 + int write_file = 0;
  204 + int maskset = 0;
  205 + char logfile[128];
  206 + int loop = 0;
190 207  
191   - struct {
192   - struct nlmsghdr n;
193   - struct genlmsghdr g;
194   - char buf[256];
195   - } ans;
  208 + struct msgtemplate msg;
196 209  
197   - int nl_sd = -1;
198   - int rep_len;
199   - int len = 0;
200   - int aggr_len, len2;
201   - struct sockaddr_nl nladdr;
202   - pid_t tid = 0;
203   - pid_t rtid = 0;
204   - int cmd_type = TASKSTATS_TYPE_TGID;
205   - int c, status;
206   - int forking = 0;
207   - struct sigaction act = {
208   - .sa_handler = SIG_IGN,
209   - .sa_mask = SA_NOMASK,
210   - };
211   - struct sigaction tact ;
  210 + while (1) {
  211 + c = getopt(argc, argv, "dw:r:m:t:p:v:l");
  212 + if (c < 0)
  213 + break;
212 214  
213   - if (argc < 3) {
214   - printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
215   - exit(-1);
216   - }
  215 + switch (c) {
  216 + case 'd':
  217 + printf("print delayacct stats ON\n");
  218 + print_delays = 1;
  219 + break;
  220 + case 'w':
  221 + strncpy(logfile, optarg, MAX_FILENAME);
  222 + printf("write to file %s\n", logfile);
  223 + write_file = 1;
  224 + break;
  225 + case 'r':
  226 + rcvbufsz = atoi(optarg);
  227 + printf("receive buf size %d\n", rcvbufsz);
  228 + if (rcvbufsz < 0)
  229 + err(1, "Invalid rcv buf size\n");
  230 + break;
  231 + case 'm':
  232 + strncpy(cpumask, optarg, sizeof(cpumask));
  233 + maskset = 1;
  234 + printf("cpumask %s maskset %d\n", cpumask, maskset);
  235 + break;
  236 + case 't':
  237 + tid = atoi(optarg);
  238 + if (!tid)
  239 + err(1, "Invalid tgid\n");
  240 + cmd_type = TASKSTATS_CMD_ATTR_TGID;
  241 + print_delays = 1;
  242 + break;
  243 + case 'p':
  244 + tid = atoi(optarg);
  245 + if (!tid)
  246 + err(1, "Invalid pid\n");
  247 + cmd_type = TASKSTATS_CMD_ATTR_PID;
  248 + print_delays = 1;
  249 + break;
  250 + case 'v':
  251 + printf("debug on\n");
  252 + dbg = 1;
  253 + break;
  254 + case 'l':
  255 + printf("listen forever\n");
  256 + loop = 1;
  257 + break;
  258 + default:
  259 + printf("Unknown option %d\n", c);
  260 + exit(-1);
  261 + }
  262 + }
217 263  
218   - tact.sa_handler = sigchld;
219   - sigemptyset(&tact.sa_mask);
220   - if (sigaction(SIGCHLD, &tact, NULL) < 0)
221   - err(1, "sigaction failed for SIGCHLD\n");
  264 + if (write_file) {
  265 + fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC,
  266 + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
  267 + if (fd == -1) {
  268 + perror("Cannot open output file\n");
  269 + exit(1);
  270 + }
  271 + }
222 272  
223   - while (1) {
  273 + if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0)
  274 + err(1, "error creating Netlink socket\n");
224 275  
225   - c = getopt(argc, argv, "t:p:c:");
226   - if (c < 0)
227   - break;
228 276  
229   - switch (c) {
230   - case 't':
231   - tid = atoi(optarg);
232   - if (!tid)
233   - err(1, "Invalid tgid\n");
234   - cmd_type = TASKSTATS_CMD_ATTR_TGID;
235   - break;
236   - case 'p':
237   - tid = atoi(optarg);
238   - if (!tid)
239   - err(1, "Invalid pid\n");
240   - cmd_type = TASKSTATS_CMD_ATTR_TGID;
241   - break;
242   - case 'c':
243   - opterr = 0;
244   - tid = fork();
245   - if (tid < 0)
246   - err(1, "fork failed\n");
  277 + mypid = getpid();
  278 + id = get_family_id(nl_sd);
  279 + if (!id) {
  280 + printf("Error getting family id, errno %d", errno);
  281 + goto err;
  282 + }
  283 + PRINTF("family id %d\n", id);
247 284  
248   - if (tid == 0) { /* child process */
249   - if (execvp(argv[optind - 1], &argv[optind - 1]) < 0) {
250   - exit(-1);
  285 + if (maskset) {
  286 + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
  287 + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
  288 + &cpumask, sizeof(cpumask));
  289 + PRINTF("Sent register cpumask, retval %d\n", rc);
  290 + if (rc < 0) {
  291 + printf("error sending register cpumask\n");
  292 + goto err;
251 293 }
252   - }
253   - forking = 1;
254   - break;
255   - default:
256   - printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
257   - exit(-1);
258   - break;
259 294 }
260   - if (c == 'c')
261   - break;
262   - }
263 295  
264   - /* Construct Netlink request message */
  296 + if (tid) {
  297 + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
  298 + cmd_type, &tid, sizeof(__u32));
  299 + PRINTF("Sent pid/tgid, retval %d\n", rc);
  300 + if (rc < 0) {
  301 + printf("error sending tid/tgid cmd\n");
  302 + goto done;
  303 + }
  304 + }
265 305  
266   - /* Send Netlink request message & get reply */
  306 + do {
  307 + int i;
267 308  
268   - if ((nl_sd =
269   - create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0)
270   - err(1, "error creating Netlink socket\n");
  309 + rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
  310 + PRINTF("received %d bytes\n", rep_len);
271 311  
  312 + if (rep_len < 0) {
  313 + printf("nonfatal reply error: errno %d\n", errno);
  314 + continue;
  315 + }
  316 + if (msg.n.nlmsg_type == NLMSG_ERROR ||
  317 + !NLMSG_OK((&msg.n), rep_len)) {
  318 + printf("fatal reply error, errno %d\n", errno);
  319 + goto done;
  320 + }
272 321  
273   - id = get_family_id(nl_sd);
  322 + PRINTF("nlmsghdr size=%d, nlmsg_len=%d, rep_len=%d\n",
  323 + sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);
274 324  
275   - /* Send command needed */
276   - req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
277   - req.n.nlmsg_type = id;
278   - req.n.nlmsg_flags = NLM_F_REQUEST;
279   - req.n.nlmsg_seq = 0;
280   - req.n.nlmsg_pid = tid;
281   - req.g.cmd = TASKSTATS_CMD_GET;
282   - na = (struct nlattr *) GENLMSG_DATA(&req);
283   - na->nla_type = cmd_type;
284   - na->nla_len = sizeof(unsigned int) + NLA_HDRLEN;
285   - *(__u32 *) NLA_DATA(na) = tid;
286   - req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
287 325  
  326 + rep_len = GENLMSG_PAYLOAD(&msg.n);
288 327  
289   - if (!forking && sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
290   - err(1, "error sending message via Netlink\n");
  328 + na = (struct nlattr *) GENLMSG_DATA(&msg);
  329 + len = 0;
  330 + i = 0;
  331 + while (len < rep_len) {
  332 + len += NLA_ALIGN(na->nla_len);
  333 + switch (na->nla_type) {
  334 + case TASKSTATS_TYPE_AGGR_TGID:
  335 + /* Fall through */
  336 + case TASKSTATS_TYPE_AGGR_PID:
  337 + aggr_len = NLA_PAYLOAD(na->nla_len);
  338 + len2 = 0;
  339 + /* For nested attributes, na follows */
  340 + na = (struct nlattr *) NLA_DATA(na);
  341 + done = 0;
  342 + while (len2 < aggr_len) {
  343 + switch (na->nla_type) {
  344 + case TASKSTATS_TYPE_PID:
  345 + rtid = *(int *) NLA_DATA(na);
  346 + if (print_delays)
  347 + printf("PID\t%d\n", rtid);
  348 + break;
  349 + case TASKSTATS_TYPE_TGID:
  350 + rtid = *(int *) NLA_DATA(na);
  351 + if (print_delays)
  352 + printf("TGID\t%d\n", rtid);
  353 + break;
  354 + case TASKSTATS_TYPE_STATS:
  355 + count++;
  356 + if (print_delays)
  357 + print_delayacct((struct taskstats *) NLA_DATA(na));
  358 + if (fd) {
  359 + if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
  360 + err(1,"write error\n");
  361 + }
  362 + }
  363 + if (!loop)
  364 + goto done;
  365 + break;
  366 + default:
  367 + printf("Unknown nested nla_type %d\n", na->nla_type);
  368 + break;
  369 + }
  370 + len2 += NLA_ALIGN(na->nla_len);
  371 + na = (struct nlattr *) ((char *) na + len2);
  372 + }
  373 + break;
291 374  
292   - act.sa_handler = SIG_IGN;
293   - sigemptyset(&act.sa_mask);
294   - if (sigaction(SIGINT, &act, NULL) < 0)
295   - err(1, "sigaction failed for SIGINT\n");
296   -
297   - do {
298   - int i;
299   - struct pollfd pfd;
300   - int pollres;
301   -
302   - pfd.events = 0xffff & ~POLLOUT;
303   - pfd.fd = nl_sd;
304   - pollres = poll(&pfd, 1, 5000);
305   - if (pollres < 0 || done) {
306   - break;
307   - }
308   -
309   - rep_len = recv(nl_sd, &ans, sizeof(ans), 0);
310   - nladdr.nl_family = AF_NETLINK;
311   - nladdr.nl_groups = TASKSTATS_LISTEN_GROUP;
312   -
313   - if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */
314   - printf("error received NACK - leaving\n");
315   - exit(1);
316   - }
317   -
318   - if (rep_len < 0) {
319   - err(1, "error receiving reply message via Netlink\n");
320   - break;
321   - }
322   -
323   - /* Validate response message */
324   - if (!NLMSG_OK((&ans.n), rep_len))
325   - err(1, "invalid reply message received via Netlink\n");
326   -
327   - rep_len = GENLMSG_PAYLOAD(&ans.n);
328   -
329   - na = (struct nlattr *) GENLMSG_DATA(&ans);
330   - len = 0;
331   - i = 0;
332   - while (len < rep_len) {
333   - len += NLA_ALIGN(na->nla_len);
334   - switch (na->nla_type) {
335   - case TASKSTATS_TYPE_AGGR_PID:
336   - /* Fall through */
337   - case TASKSTATS_TYPE_AGGR_TGID:
338   - aggr_len = NLA_PAYLOAD(na->nla_len);
339   - len2 = 0;
340   - /* For nested attributes, na follows */
341   - na = (struct nlattr *) NLA_DATA(na);
342   - done = 0;
343   - while (len2 < aggr_len) {
344   - switch (na->nla_type) {
345   - case TASKSTATS_TYPE_PID:
346   - rtid = *(int *) NLA_DATA(na);
347   - break;
348   - case TASKSTATS_TYPE_TGID:
349   - rtid = *(int *) NLA_DATA(na);
350   - break;
351   - case TASKSTATS_TYPE_STATS:
352   - if (rtid == tid) {
353   - print_taskstats((struct taskstats *)
354   - NLA_DATA(na));
355   - done = 1;
  375 + default:
  376 + printf("Unknown nla_type %d\n", na->nla_type);
  377 + break;
356 378 }
357   - break;
358   - }
359   - len2 += NLA_ALIGN(na->nla_len);
360   - na = (struct nlattr *) ((char *) na + len2);
361   - if (done)
362   - break;
  379 + na = (struct nlattr *) (GENLMSG_DATA(&msg) + len);
363 380 }
364   - }
365   - na = (struct nlattr *) (GENLMSG_DATA(&ans) + len);
366   - if (done)
367   - break;
  381 + } while (loop);
  382 +done:
  383 + if (maskset) {
  384 + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
  385 + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
  386 + &cpumask, sizeof(cpumask));
  387 + printf("Sent deregister mask, retval %d\n", rc);
  388 + if (rc < 0)
  389 + err(rc, "error sending deregister cpumask\n");
368 390 }
369   - if (done)
370   - break;
371   - }
372   - while (1);
373   -
374   - close(nl_sd);
375   - return 0;
  391 +err:
  392 + close(nl_sd);
  393 + if (fd)
  394 + close(fd);
  395 + return 0;
376 396 }
Documentation/accounting/taskstats.txt
... ... @@ -26,20 +26,28 @@
26 26 Usage
27 27 -----
28 28  
29   -To get statistics during task's lifetime, userspace opens a unicast netlink
  29 +To get statistics during a task's lifetime, userspace opens a unicast netlink
30 30 socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
31 31 The response contains statistics for a task (if pid is specified) or the sum of
32 32 statistics for all tasks of the process (if tgid is specified).
33 33  
34   -To obtain statistics for tasks which are exiting, userspace opens a multicast
35   -netlink socket. Each time a task exits, its per-pid statistics is always sent
36   -by the kernel to each listener on the multicast socket. In addition, if it is
37   -the last thread exiting its thread group, an additional record containing the
38   -per-tgid stats are also sent. The latter contains the sum of per-pid stats for
39   -all threads in the thread group, both past and present.
  34 +To obtain statistics for tasks which are exiting, the userspace listener
  35 +sends a register command and specifies a cpumask. Whenever a task exits on
  36 +one of the cpus in the cpumask, its per-pid statistics are sent to the
  37 +registered listener. Using cpumasks allows the data received by one listener
  38 +to be limited and assists in flow control over the netlink interface, as
  39 +explained in more detail below.
40 40  
  41 +If the exiting task is the last thread exiting its thread group,
  42 +an additional record containing the per-tgid stats is also sent to userspace.
  43 +The latter contains the sum of per-pid stats for all threads in the thread
  44 +group, both past and present.
  45 +
41 46 getdelays.c is a simple utility demonstrating usage of the taskstats interface
42   -for reporting delay accounting statistics.
  47 +for reporting delay accounting statistics. Users can register cpumasks,
  48 +send commands and process responses, listen for per-tid/tgid exit data,
  49 +write the data received to a file and do basic flow control by increasing
  50 +receive buffer sizes.
43 51  
44 52 Interface
45 53 ---------
46 54  
... ... @@ -66,11 +74,21 @@
66 74  
67 75 The taskstats payload is one of the following three kinds:
68 76  
69   -1. Commands: Sent from user to kernel. The payload is one attribute, of type
70   -TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute
71   -payload. The pid/tgid denotes the task/process for which userspace wants
72   -statistics.
  77 +1. Commands: Sent from user to kernel. Commands to get data on
  78 +a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
  79 +containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
  80 +the task/process for which userspace wants statistics.
73 81  
  82 +Commands to register/deregister interest in exit data from a set of cpus
  83 +consist of one attribute, of type
  84 +TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
  85 +attribute payload. The cpumask is specified as an ASCII string of
  86 +comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
  87 +the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
  88 +in cpus before closing the listening socket, the kernel cleans up its interest
  89 +set over time. However, for the sake of efficiency, an explicit deregistration
  90 +is advisable.
  91 +
74 92 2. Response for a command: sent from the kernel in response to a userspace
75 93 command. The payload is a series of three attributes of type:
76 94  
... ... @@ -137,6 +155,28 @@
137 155 struct too much, requiring disparate userspace accounting utilities to
138 156 unnecessarily receive large structures whose fields are of no interest, then
139 157 extending the attributes structure would be worthwhile.
  158 +
  159 +Flow control for taskstats
  160 +--------------------------
  161 +
  162 +When the rate of task exits becomes large, a listener may not be able to keep
  163 +up with the kernel's rate of sending per-tid/tgid exit data leading to data
  164 +loss. This possibility gets compounded when the taskstats structure gets
  165 +extended and the number of cpus grows large.
  166 +
  167 +To avoid losing statistics, userspace should do one or more of the following:
  168 +
  169 +- increase the receive buffer sizes for the netlink sockets opened by
  170 +listeners to receive exit data.
  171 +
  172 +- create more listeners and reduce the number of cpus being listened to by
  173 +each listener. In the extreme case, there could be one listener for each cpu.
  174 +Users may also consider setting the cpu affinity of the listener to the subset
  175 +of cpus to which it listens, especially if they are listening to just one cpu.
  176 +
  177 +Despite these measures, if the userspace receives ENOBUFS error messages
  178 +indicating overflow of receive buffers, it should take measures to handle the
  179 +loss of data.
140 180  
141 181 ----