Commit 99f895518368252ba862cc15ce4eb98ebbe1bec6

Authored by Eric W. Biederman
Committed by Linus Torvalds
1 parent 8578cea750

[PATCH] proc: don't lock task_structs indefinitely

Every inode in /proc holds a reference to a struct task_struct.  If a
directory or file is opened and remains open after the task exits, this
pinning continues.  With 8K stacks on a 32bit machine the amount pinned per
file descriptor is about 10K.

Normally I would figure a reasonable per user process limit is about 100
processes.  With 80 processes, each with 1000 file descriptors, I can trigger
the OOM killer on a 32bit kernel, because I have pinned about 800MB of useless
data.

This patch replaces the struct task_struct pointer with a pointer to a struct
task_ref which has a struct task_struct pointer.  This way the pinning of dead
tasks does not happen.

The code now has to contend with the fact that the task may now exit at any
time, which is a little, but not much, more complicated.

With this change it takes about 1000 processes each opening up 1000 file
descriptors before I can trigger the OOM killer.  Much better.

[mlp@google.com: task_mmu small fixes]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Paul Jackson <pj@sgi.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Albert Cahalan <acahalan@gmail.com>
Signed-off-by: Prasanna Meda <mlp@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 7 changed files with 349 additions and 143 deletions Side-by-side Diff

... ... @@ -307,12 +307,15 @@
307 307  
308 308 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
309 309 {
310   - struct task_struct *task = proc_task(inode);
311   - struct files_struct *files;
  310 + struct task_struct *task = get_proc_task(inode);
  311 + struct files_struct *files = NULL;
312 312 struct file *file;
313 313 int fd = proc_fd(inode);
314 314  
315   - files = get_files_struct(task);
  315 + if (task) {
  316 + files = get_files_struct(task);
  317 + put_task_struct(task);
  318 + }
316 319 if (files) {
317 320 /*
318 321 * We are not taking a ref to the file structure, so we must
319 322  
320 323  
... ... @@ -344,10 +347,29 @@
344 347 return fs;
345 348 }
346 349  
  350 +static int get_nr_threads(struct task_struct *tsk)
  351 +{
  352 + /* Must be called with the rcu_read_lock held */
  353 + unsigned long flags;
  354 + int count = 0;
  355 +
  356 + if (lock_task_sighand(tsk, &flags)) {
  357 + count = atomic_read(&tsk->signal->count);
  358 + unlock_task_sighand(tsk, &flags);
  359 + }
  360 + return count;
  361 +}
  362 +
347 363 static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
348 364 {
349   - struct fs_struct *fs = get_fs_struct(proc_task(inode));
  365 + struct task_struct *task = get_proc_task(inode);
  366 + struct fs_struct *fs = NULL;
350 367 int result = -ENOENT;
  368 +
  369 + if (task) {
  370 + fs = get_fs_struct(task);
  371 + put_task_struct(task);
  372 + }
351 373 if (fs) {
352 374 read_lock(&fs->lock);
353 375 *mnt = mntget(fs->pwdmnt);
354 376  
... ... @@ -361,8 +383,14 @@
361 383  
362 384 static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
363 385 {
364   - struct fs_struct *fs = get_fs_struct(proc_task(inode));
  386 + struct task_struct *task = get_proc_task(inode);
  387 + struct fs_struct *fs = NULL;
365 388 int result = -ENOENT;
  389 +
  390 + if (task) {
  391 + fs = get_fs_struct(task);
  392 + put_task_struct(task);
  393 + }
366 394 if (fs) {
367 395 read_lock(&fs->lock);
368 396 *mnt = mntget(fs->rootmnt);
369 397  
... ... @@ -550,16 +578,19 @@
550 578  
551 579 static int mounts_open(struct inode *inode, struct file *file)
552 580 {
553   - struct task_struct *task = proc_task(inode);
554   - struct namespace *namespace;
  581 + struct task_struct *task = get_proc_task(inode);
  582 + struct namespace *namespace = NULL;
555 583 struct proc_mounts *p;
556 584 int ret = -EINVAL;
557 585  
558   - task_lock(task);
559   - namespace = task->namespace;
560   - if (namespace)
561   - get_namespace(namespace);
562   - task_unlock(task);
  586 + if (task) {
  587 + task_lock(task);
  588 + namespace = task->namespace;
  589 + if (namespace)
  590 + get_namespace(namespace);
  591 + task_unlock(task);
  592 + put_task_struct(task);
  593 + }
563 594  
564 595 if (namespace) {
565 596 ret = -ENOMEM;
566 597  
567 598  
... ... @@ -616,18 +647,22 @@
616 647 extern struct seq_operations mountstats_op;
617 648 static int mountstats_open(struct inode *inode, struct file *file)
618 649 {
619   - struct task_struct *task = proc_task(inode);
620 650 int ret = seq_open(file, &mountstats_op);
621 651  
622 652 if (!ret) {
623 653 struct seq_file *m = file->private_data;
624   - struct namespace *namespace;
625   - task_lock(task);
626   - namespace = task->namespace;
627   - if (namespace)
628   - get_namespace(namespace);
629   - task_unlock(task);
  654 + struct namespace *namespace = NULL;
  655 + struct task_struct *task = get_proc_task(inode);
630 656  
  657 + if (task) {
  658 + task_lock(task);
  659 + namespace = task->namespace;
  660 + if (namespace)
  661 + get_namespace(namespace);
  662 + task_unlock(task);
  663 + put_task_struct(task);
  664 + }
  665 +
631 666 if (namespace)
632 667 m->private = namespace;
633 668 else {
634 669  
635 670  
636 671  
637 672  
... ... @@ -653,18 +688,27 @@
653 688 struct inode * inode = file->f_dentry->d_inode;
654 689 unsigned long page;
655 690 ssize_t length;
656   - struct task_struct *task = proc_task(inode);
  691 + struct task_struct *task = get_proc_task(inode);
657 692  
  693 + length = -ESRCH;
  694 + if (!task)
  695 + goto out_no_task;
  696 +
658 697 if (count > PROC_BLOCK_SIZE)
659 698 count = PROC_BLOCK_SIZE;
  699 +
  700 + length = -ENOMEM;
660 701 if (!(page = __get_free_page(GFP_KERNEL)))
661   - return -ENOMEM;
  702 + goto out;
662 703  
663 704 length = PROC_I(inode)->op.proc_read(task, (char*)page);
664 705  
665 706 if (length >= 0)
666 707 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
667 708 free_page(page);
  709 +out:
  710 + put_task_struct(task);
  711 +out_no_task:
668 712 return length;
669 713 }
670 714  
671 715  
... ... @@ -681,12 +725,15 @@
681 725 static ssize_t mem_read(struct file * file, char __user * buf,
682 726 size_t count, loff_t *ppos)
683 727 {
684   - struct task_struct *task = proc_task(file->f_dentry->d_inode);
  728 + struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
685 729 char *page;
686 730 unsigned long src = *ppos;
687 731 int ret = -ESRCH;
688 732 struct mm_struct *mm;
689 733  
  734 + if (!task)
  735 + goto out_no_task;
  736 +
690 737 if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
691 738 goto out;
692 739  
... ... @@ -736,6 +783,8 @@
736 783 out_free:
737 784 free_page((unsigned long) page);
738 785 out:
  786 + put_task_struct(task);
  787 +out_no_task:
739 788 return ret;
740 789 }
741 790  
742 791  
743 792  
744 793  
745 794  
... ... @@ -748,15 +797,20 @@
748 797 {
749 798 int copied = 0;
750 799 char *page;
751   - struct task_struct *task = proc_task(file->f_dentry->d_inode);
  800 + struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
752 801 unsigned long dst = *ppos;
753 802  
  803 + copied = -ESRCH;
  804 + if (!task)
  805 + goto out_no_task;
  806 +
754 807 if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
755   - return -ESRCH;
  808 + goto out;
756 809  
  810 + copied = -ENOMEM;
757 811 page = (char *)__get_free_page(GFP_USER);
758 812 if (!page)
759   - return -ENOMEM;
  813 + goto out;
760 814  
761 815 while (count > 0) {
762 816 int this_len, retval;
... ... @@ -779,6 +833,9 @@
779 833 }
780 834 *ppos = dst;
781 835 free_page((unsigned long) page);
  836 +out:
  837 + put_task_struct(task);
  838 +out_no_task:
782 839 return copied;
783 840 }
784 841 #endif
785 842  
786 843  
... ... @@ -809,12 +866,17 @@
809 866 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
810 867 size_t count, loff_t *ppos)
811 868 {
812   - struct task_struct *task = proc_task(file->f_dentry->d_inode);
  869 + struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
813 870 char buffer[PROC_NUMBUF];
814 871 size_t len;
815   - int oom_adjust = task->oomkilladj;
  872 + int oom_adjust;
816 873 loff_t __ppos = *ppos;
817 874  
  875 + if (!task)
  876 + return -ESRCH;
  877 + oom_adjust = task->oomkilladj;
  878 + put_task_struct(task);
  879 +
818 880 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
819 881 if (__ppos >= len)
820 882 return 0;
... ... @@ -829,7 +891,7 @@
829 891 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
830 892 size_t count, loff_t *ppos)
831 893 {
832   - struct task_struct *task = proc_task(file->f_dentry->d_inode);
  894 + struct task_struct *task;
833 895 char buffer[PROC_NUMBUF], *end;
834 896 int oom_adjust;
835 897  
836 898  
... ... @@ -845,7 +907,11 @@
845 907 return -EINVAL;
846 908 if (*end == '\n')
847 909 end++;
  910 + task = get_proc_task(file->f_dentry->d_inode);
  911 + if (!task)
  912 + return -ESRCH;
848 913 task->oomkilladj = oom_adjust;
  914 + put_task_struct(task);
849 915 if (end - buffer == 0)
850 916 return -EIO;
851 917 return end - buffer;
852 918  
853 919  
... ... @@ -862,12 +928,15 @@
862 928 size_t count, loff_t *ppos)
863 929 {
864 930 struct inode * inode = file->f_dentry->d_inode;
865   - struct task_struct *task = proc_task(inode);
  931 + struct task_struct *task = get_proc_task(inode);
866 932 ssize_t length;
867 933 char tmpbuf[TMPBUFLEN];
868 934  
  935 + if (!task)
  936 + return -ESRCH;
869 937 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
870 938 audit_get_loginuid(task->audit_context));
  939 + put_task_struct(task);
871 940 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
872 941 }
873 942  
874 943  
... ... @@ -877,13 +946,12 @@
877 946 struct inode * inode = file->f_dentry->d_inode;
878 947 char *page, *tmp;
879 948 ssize_t length;
880   - struct task_struct *task = proc_task(inode);
881 949 uid_t loginuid;
882 950  
883 951 if (!capable(CAP_AUDIT_CONTROL))
884 952 return -EPERM;
885 953  
886   - if (current != task)
  954 + if (current != proc_tref(inode)->task)
887 955 return -EPERM;
888 956  
889 957 if (count >= PAGE_SIZE)
... ... @@ -907,7 +975,7 @@
907 975 goto out_free_page;
908 976  
909 977 }
910   - length = audit_set_loginuid(task, loginuid);
  978 + length = audit_set_loginuid(current, loginuid);
911 979 if (likely(length == 0))
912 980 length = count;
913 981  
914 982  
915 983  
... ... @@ -926,13 +994,16 @@
926 994 static ssize_t seccomp_read(struct file *file, char __user *buf,
927 995 size_t count, loff_t *ppos)
928 996 {
929   - struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
  997 + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
930 998 char __buf[20];
931 999 loff_t __ppos = *ppos;
932 1000 size_t len;
933 1001  
  1002 + if (!tsk)
  1003 + return -ESRCH;
934 1004 /* no need to print the trailing zero, so use only len */
935 1005 len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
  1006 + put_task_struct(tsk);
936 1007 if (__ppos >= len)
937 1008 return 0;
938 1009 if (count > len - __ppos)
939 1010  
940 1011  
941 1012  
942 1013  
943 1014  
944 1015  
945 1016  
946 1017  
947 1018  
... ... @@ -946,29 +1017,43 @@
946 1017 static ssize_t seccomp_write(struct file *file, const char __user *buf,
947 1018 size_t count, loff_t *ppos)
948 1019 {
949   - struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
  1020 + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
950 1021 char __buf[20], *end;
951 1022 unsigned int seccomp_mode;
  1023 + ssize_t result;
952 1024  
  1025 + result = -ESRCH;
  1026 + if (!tsk)
  1027 + goto out_no_task;
  1028 +
953 1029 /* can set it only once to be even more secure */
  1030 + result = -EPERM;
954 1031 if (unlikely(tsk->seccomp.mode))
955   - return -EPERM;
  1032 + goto out;
956 1033  
  1034 + result = -EFAULT;
957 1035 memset(__buf, 0, sizeof(__buf));
958 1036 count = min(count, sizeof(__buf) - 1);
959 1037 if (copy_from_user(__buf, buf, count))
960   - return -EFAULT;
  1038 + goto out;
  1039 +
961 1040 seccomp_mode = simple_strtoul(__buf, &end, 0);
962 1041 if (*end == '\n')
963 1042 end++;
  1043 + result = -EINVAL;
964 1044 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
965 1045 tsk->seccomp.mode = seccomp_mode;
966 1046 set_tsk_thread_flag(tsk, TIF_SECCOMP);
967 1047 } else
968   - return -EINVAL;
  1048 + goto out;
  1049 + result = -EIO;
969 1050 if (unlikely(!(end - __buf)))
970   - return -EIO;
971   - return end - __buf;
  1051 + goto out;
  1052 + result = end - __buf;
  1053 +out:
  1054 + put_task_struct(tsk);
  1055 +out_no_task:
  1056 + return result;
972 1057 }
973 1058  
974 1059 static struct file_operations proc_seccomp_operations = {
... ... @@ -995,7 +1080,7 @@
995 1080 /* See if the the two tasks share a commone set of
996 1081 * file descriptors. If so everything is visible.
997 1082 */
998   - task = proc_task(inode);
  1083 + task = get_proc_task(inode);
999 1084 if (!task)
1000 1085 goto out;
1001 1086 files = get_files_struct(current);
... ... @@ -1006,6 +1091,7 @@
1006 1091 put_files_struct(task_files);
1007 1092 if (files)
1008 1093 put_files_struct(files);
  1094 + put_task_struct(task);
1009 1095 if (!error)
1010 1096 goto out;
1011 1097  
... ... @@ -1106,7 +1192,7 @@
1106 1192 {
1107 1193 struct dentry *dentry = filp->f_dentry;
1108 1194 struct inode *inode = dentry->d_inode;
1109   - struct task_struct *p = proc_task(inode);
  1195 + struct task_struct *p = get_proc_task(inode);
1110 1196 unsigned int fd, tid, ino;
1111 1197 int retval;
1112 1198 char buf[PROC_NUMBUF];
... ... @@ -1114,8 +1200,8 @@
1114 1200 struct fdtable *fdt;
1115 1201  
1116 1202 retval = -ENOENT;
1117   - if (!pid_alive(p))
1118   - goto out;
  1203 + if (!p)
  1204 + goto out_no_task;
1119 1205 retval = 0;
1120 1206 tid = p->pid;
1121 1207  
... ... @@ -1164,6 +1250,8 @@
1164 1250 put_files_struct(files);
1165 1251 }
1166 1252 out:
  1253 + put_task_struct(p);
  1254 +out_no_task:
1167 1255 return retval;
1168 1256 }
1169 1257  
1170 1258  
1171 1259  
... ... @@ -1175,16 +1263,18 @@
1175 1263 int pid;
1176 1264 struct dentry *dentry = filp->f_dentry;
1177 1265 struct inode *inode = dentry->d_inode;
  1266 + struct task_struct *task = get_proc_task(inode);
1178 1267 struct pid_entry *p;
1179 1268 ino_t ino;
1180 1269 int ret;
1181 1270  
1182 1271 ret = -ENOENT;
1183   - if (!pid_alive(proc_task(inode)))
  1272 + if (!task)
1184 1273 goto out;
1185 1274  
1186 1275 ret = 0;
1187   - pid = proc_task(inode)->pid;
  1276 + pid = task->pid;
  1277 + put_task_struct(task);
1188 1278 i = filp->f_pos;
1189 1279 switch (i) {
1190 1280 case 0:
1191 1281  
... ... @@ -1270,14 +1360,13 @@
1270 1360 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1271 1361 inode->i_ino = fake_ino(task->pid, ino);
1272 1362  
1273   - if (!pid_alive(task))
1274   - goto out_unlock;
1275   -
1276 1363 /*
1277 1364 * grab the reference to task.
1278 1365 */
1279   - get_task_struct(task);
1280   - ei->task = task;
  1366 + ei->tref = tref_get_by_task(task);
  1367 + if (!tref_task(ei->tref))
  1368 + goto out_unlock;
  1369 +
1281 1370 inode->i_uid = 0;
1282 1371 inode->i_gid = 0;
1283 1372 if (task_dumpable(task)) {
1284 1373  
... ... @@ -1303,13 +1392,21 @@
1303 1392 *
1304 1393 * Rewrite the inode's ownerships here because the owning task may have
1305 1394 * performed a setuid(), etc.
  1395 + *
  1396 + * Before the /proc/pid/status file was created the only way to read
  1397 + * the effective uid of a /process was to stat /proc/pid. Reading
  1398 + * /proc/pid/status is slow enough that procps and other packages
  1399 + * kept stating /proc/pid. To keep the rules in /proc simple I have
  1400 + * made this apply to all per process world readable and executable
  1401 + * directories.
1306 1402 */
1307 1403 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1308 1404 {
1309 1405 struct inode *inode = dentry->d_inode;
1310   - struct task_struct *task = proc_task(inode);
1311   - if (pid_alive(task)) {
1312   - if (task_dumpable(task)) {
  1406 + struct task_struct *task = get_proc_task(inode);
  1407 + if (task) {
  1408 + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
  1409 + task_dumpable(task)) {
1313 1410 inode->i_uid = task->euid;
1314 1411 inode->i_gid = task->egid;
1315 1412 } else {
1316 1413  
1317 1414  
1318 1415  
1319 1416  
1320 1417  
... ... @@ -1317,37 +1414,63 @@
1317 1414 inode->i_gid = 0;
1318 1415 }
1319 1416 security_task_to_inode(task, inode);
  1417 + put_task_struct(task);
1320 1418 return 1;
1321 1419 }
1322 1420 d_drop(dentry);
1323 1421 return 0;
1324 1422 }
1325 1423  
  1424 +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
  1425 +{
  1426 + struct inode *inode = dentry->d_inode;
  1427 + struct task_struct *task;
  1428 + generic_fillattr(inode, stat);
  1429 +
  1430 + rcu_read_lock();
  1431 + stat->uid = 0;
  1432 + stat->gid = 0;
  1433 + task = pid_task(proc_pid(inode), PIDTYPE_PID);
  1434 + if (task) {
  1435 + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
  1436 + task_dumpable(task)) {
  1437 + stat->uid = task->euid;
  1438 + stat->gid = task->egid;
  1439 + }
  1440 + }
  1441 + rcu_read_unlock();
  1442 + return 0;
  1443 +}
  1444 +
1326 1445 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1327 1446 {
1328 1447 struct inode *inode = dentry->d_inode;
1329   - struct task_struct *task = proc_task(inode);
  1448 + struct task_struct *task = get_proc_task(inode);
1330 1449 int fd = proc_fd(inode);
1331 1450 struct files_struct *files;
1332 1451  
1333   - files = get_files_struct(task);
1334   - if (files) {
1335   - rcu_read_lock();
1336   - if (fcheck_files(files, fd)) {
  1452 + if (task) {
  1453 + files = get_files_struct(task);
  1454 + if (files) {
  1455 + rcu_read_lock();
  1456 + if (fcheck_files(files, fd)) {
  1457 + rcu_read_unlock();
  1458 + put_files_struct(files);
  1459 + if (task_dumpable(task)) {
  1460 + inode->i_uid = task->euid;
  1461 + inode->i_gid = task->egid;
  1462 + } else {
  1463 + inode->i_uid = 0;
  1464 + inode->i_gid = 0;
  1465 + }
  1466 + security_task_to_inode(task, inode);
  1467 + put_task_struct(task);
  1468 + return 1;
  1469 + }
1337 1470 rcu_read_unlock();
1338 1471 put_files_struct(files);
1339   - if (task_dumpable(task)) {
1340   - inode->i_uid = task->euid;
1341   - inode->i_gid = task->egid;
1342   - } else {
1343   - inode->i_uid = 0;
1344   - inode->i_gid = 0;
1345   - }
1346   - security_task_to_inode(task, inode);
1347   - return 1;
1348 1472 }
1349   - rcu_read_unlock();
1350   - put_files_struct(files);
  1473 + put_task_struct(task);
1351 1474 }
1352 1475 d_drop(dentry);
1353 1476 return 0;
... ... @@ -1359,7 +1482,7 @@
1359 1482 * If so, then don't put the dentry on the lru list,
1360 1483 * kill it immediately.
1361 1484 */
1362   - return !pid_alive(proc_task(dentry->d_inode));
  1485 + return !proc_tref(dentry->d_inode)->task;
1363 1486 }
1364 1487  
1365 1488 static struct dentry_operations tid_fd_dentry_operations =
... ... @@ -1401,7 +1524,7 @@
1401 1524 /* SMP-safe */
1402 1525 static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
1403 1526 {
1404   - struct task_struct *task = proc_task(dir);
  1527 + struct task_struct *task = get_proc_task(dir);
1405 1528 unsigned fd = name_to_int(dentry);
1406 1529 struct dentry *result = ERR_PTR(-ENOENT);
1407 1530 struct file * file;
1408 1531  
... ... @@ -1409,10 +1532,10 @@
1409 1532 struct inode *inode;
1410 1533 struct proc_inode *ei;
1411 1534  
  1535 + if (!task)
  1536 + goto out_no_task;
1412 1537 if (fd == ~0U)
1413 1538 goto out;
1414   - if (!pid_alive(task))
1415   - goto out;
1416 1539  
1417 1540 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
1418 1541 if (!inode)
... ... @@ -1447,6 +1570,8 @@
1447 1570 if (tid_fd_revalidate(dentry, NULL))
1448 1571 result = NULL;
1449 1572 out:
  1573 + put_task_struct(task);
  1574 +out_no_task:
1450 1575 return result;
1451 1576  
1452 1577 out_unlock2:
1453 1578  
1454 1579  
1455 1580  
... ... @@ -1490,12 +1615,17 @@
1490 1615 struct inode * inode = file->f_dentry->d_inode;
1491 1616 unsigned long page;
1492 1617 ssize_t length;
1493   - struct task_struct *task = proc_task(inode);
  1618 + struct task_struct *task = get_proc_task(inode);
1494 1619  
  1620 + length = -ESRCH;
  1621 + if (!task)
  1622 + goto out_no_task;
  1623 +
1495 1624 if (count > PAGE_SIZE)
1496 1625 count = PAGE_SIZE;
  1626 + length = -ENOMEM;
1497 1627 if (!(page = __get_free_page(GFP_KERNEL)))
1498   - return -ENOMEM;
  1628 + goto out;
1499 1629  
1500 1630 length = security_getprocattr(task,
1501 1631 (char*)file->f_dentry->d_name.name,
... ... @@ -1503,6 +1633,9 @@
1503 1633 if (length >= 0)
1504 1634 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
1505 1635 free_page(page);
  1636 +out:
  1637 + put_task_struct(task);
  1638 +out_no_task:
1506 1639 return length;
1507 1640 }
1508 1641  
1509 1642  
1510 1643  
1511 1644  
1512 1645  
1513 1646  
1514 1647  
... ... @@ -1512,26 +1645,36 @@
1512 1645 struct inode * inode = file->f_dentry->d_inode;
1513 1646 char *page;
1514 1647 ssize_t length;
1515   - struct task_struct *task = proc_task(inode);
  1648 + struct task_struct *task = get_proc_task(inode);
1516 1649  
  1650 + length = -ESRCH;
  1651 + if (!task)
  1652 + goto out_no_task;
1517 1653 if (count > PAGE_SIZE)
1518 1654 count = PAGE_SIZE;
1519   - if (*ppos != 0) {
1520   - /* No partial writes. */
1521   - return -EINVAL;
1522   - }
  1655 +
  1656 + /* No partial writes. */
  1657 + length = -EINVAL;
  1658 + if (*ppos != 0)
  1659 + goto out;
  1660 +
  1661 + length = -ENOMEM;
1523 1662 page = (char*)__get_free_page(GFP_USER);
1524 1663 if (!page)
1525   - return -ENOMEM;
  1664 + goto out;
  1665 +
1526 1666 length = -EFAULT;
1527 1667 if (copy_from_user(page, buf, count))
1528   - goto out;
  1668 + goto out_free;
1529 1669  
1530 1670 length = security_setprocattr(task,
1531 1671 (char*)file->f_dentry->d_name.name,
1532 1672 (void*)page, count);
1533   -out:
  1673 +out_free:
1534 1674 free_page((unsigned long) page);
  1675 +out:
  1676 + put_task_struct(task);
  1677 +out_no_task:
1535 1678 return length;
1536 1679 }
1537 1680  
1538 1681  
... ... @@ -1553,15 +1696,15 @@
1553 1696 {
1554 1697 struct inode *inode;
1555 1698 struct dentry *error;
1556   - struct task_struct *task = proc_task(dir);
  1699 + struct task_struct *task = get_proc_task(dir);
1557 1700 struct pid_entry *p;
1558 1701 struct proc_inode *ei;
1559 1702  
1560 1703 error = ERR_PTR(-ENOENT);
1561 1704 inode = NULL;
1562 1705  
1563   - if (!pid_alive(task))
1564   - goto out;
  1706 + if (!task)
  1707 + goto out_no_task;
1565 1708  
1566 1709 for (p = ents; p->name; p++) {
1567 1710 if (p->len != dentry->d_name.len)
... ... @@ -1748,6 +1891,8 @@
1748 1891 if (pid_revalidate(dentry, NULL))
1749 1892 error = NULL;
1750 1893 out:
  1894 + put_task_struct(task);
  1895 +out_no_task:
1751 1896 return error;
1752 1897 }
1753 1898  
1754 1899  
... ... @@ -1771,10 +1916,12 @@
1771 1916  
1772 1917 static struct inode_operations proc_tgid_base_inode_operations = {
1773 1918 .lookup = proc_tgid_base_lookup,
  1919 + .getattr = pid_getattr,
1774 1920 };
1775 1921  
1776 1922 static struct inode_operations proc_tid_base_inode_operations = {
1777 1923 .lookup = proc_tid_base_lookup,
  1924 + .getattr = pid_getattr,
1778 1925 };
1779 1926  
1780 1927 #ifdef CONFIG_SECURITY
1781 1928  
... ... @@ -1816,10 +1963,12 @@
1816 1963  
1817 1964 static struct inode_operations proc_tgid_attr_inode_operations = {
1818 1965 .lookup = proc_tgid_attr_lookup,
  1966 + .getattr = pid_getattr,
1819 1967 };
1820 1968  
1821 1969 static struct inode_operations proc_tid_attr_inode_operations = {
1822 1970 .lookup = proc_tid_attr_lookup,
  1971 + .getattr = pid_getattr,
1823 1972 };
1824 1973 #endif
1825 1974  
1826 1975  
... ... @@ -1981,10 +2130,13 @@
1981 2130 {
1982 2131 struct dentry *result = ERR_PTR(-ENOENT);
1983 2132 struct task_struct *task;
1984   - struct task_struct *leader = proc_task(dir);
  2133 + struct task_struct *leader = get_proc_task(dir);
1985 2134 struct inode *inode;
1986 2135 unsigned tid;
1987 2136  
  2137 + if (!leader)
  2138 + goto out_no_task;
  2139 +
1988 2140 tid = name_to_int(dentry);
1989 2141 if (tid == ~0U)
1990 2142 goto out;
... ... @@ -2024,6 +2176,8 @@
2024 2176 out_drop_task:
2025 2177 put_task_struct(task);
2026 2178 out:
  2179 + put_task_struct(leader);
  2180 +out_no_task:
2027 2181 return result;
2028 2182 }
2029 2183  
... ... @@ -2163,12 +2317,7 @@
2163 2317  
2164 2318 /* If nr exceeds the number of threads there is nothing todo */
2165 2319 if (nr) {
2166   - int threads = 0;
2167   - task_lock(leader);
2168   - if (leader->signal)
2169   - threads = atomic_read(&leader->signal->count);
2170   - task_unlock(leader);
2171   - if (nr >= threads)
  2320 + if (nr >= get_nr_threads(leader))
2172 2321 goto done;
2173 2322 }
2174 2323  
2175 2324  
... ... @@ -2218,15 +2367,15 @@
2218 2367 char buf[PROC_NUMBUF];
2219 2368 struct dentry *dentry = filp->f_dentry;
2220 2369 struct inode *inode = dentry->d_inode;
2221   - struct task_struct *leader = proc_task(inode);
  2370 + struct task_struct *leader = get_proc_task(inode);
2222 2371 struct task_struct *task;
2223 2372 int retval = -ENOENT;
2224 2373 ino_t ino;
2225 2374 int tid;
2226 2375 unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */
2227 2376  
2228   - if (!pid_alive(leader))
2229   - goto out;
  2377 + if (!leader)
  2378 + goto out_no_task;
2230 2379 retval = 0;
2231 2380  
2232 2381 switch (pos) {
2233 2382  
2234 2383  
... ... @@ -2266,20 +2415,22 @@
2266 2415 }
2267 2416 out:
2268 2417 filp->f_pos = pos;
  2418 + put_task_struct(leader);
  2419 +out_no_task:
2269 2420 return retval;
2270 2421 }
2271 2422  
2272 2423 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
2273 2424 {
2274 2425 struct inode *inode = dentry->d_inode;
2275   - struct task_struct *p = proc_task(inode);
  2426 + struct task_struct *p = get_proc_task(inode);
2276 2427 generic_fillattr(inode, stat);
2277 2428  
2278   - if (pid_alive(p)) {
2279   - task_lock(p);
2280   - if (p->signal)
2281   - stat->nlink += atomic_read(&p->signal->count);
2282   - task_unlock(p);
  2429 + if (p) {
  2430 + rcu_read_lock();
  2431 + stat->nlink += get_nr_threads(p);
  2432 + rcu_read_unlock();
  2433 + put_task_struct(p);
2283 2434 }
2284 2435  
2285 2436 return 0;
... ... @@ -58,14 +58,11 @@
58 58 static void proc_delete_inode(struct inode *inode)
59 59 {
60 60 struct proc_dir_entry *de;
61   - struct task_struct *tsk;
62 61  
63 62 truncate_inode_pages(&inode->i_data, 0);
64 63  
65   - /* Let go of any associated process */
66   - tsk = PROC_I(inode)->task;
67   - if (tsk)
68   - put_task_struct(tsk);
  64 + /* Stop tracking associated processes */
  65 + tref_put(PROC_I(inode)->tref);
69 66  
70 67 /* Let go of any associated proc directory entry */
71 68 de = PROC_I(inode)->pde;
... ... @@ -94,7 +91,7 @@
94 91 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
95 92 if (!ei)
96 93 return NULL;
97   - ei->task = NULL;
  94 + ei->tref = NULL;
98 95 ei->fd = 0;
99 96 ei->op.proc_get_link = NULL;
100 97 ei->pde = NULL;
... ... @@ -10,6 +10,7 @@
10 10 */
11 11  
12 12 #include <linux/proc_fs.h>
  13 +#include <linux/task_ref.h>
13 14  
14 15 struct vmalloc_info {
15 16 unsigned long used;
16 17  
17 18  
... ... @@ -41,13 +42,23 @@
41 42 extern struct file_operations proc_numa_maps_operations;
42 43 extern struct file_operations proc_smaps_operations;
43 44  
  45 +extern struct file_operations proc_maps_operations;
  46 +extern struct file_operations proc_numa_maps_operations;
  47 +extern struct file_operations proc_smaps_operations;
  48 +
  49 +
44 50 void free_proc_entry(struct proc_dir_entry *de);
45 51  
46 52 int proc_init_inodecache(void);
47 53  
48   -static inline struct task_struct *proc_task(struct inode *inode)
  54 +static inline struct task_ref *proc_tref(struct inode *inode)
49 55 {
50   - return PROC_I(inode)->task;
  56 + return PROC_I(inode)->tref;
  57 +}
  58 +
  59 +static inline struct task_struct *get_proc_task(struct inode *inode)
  60 +{
  61 + return get_tref_task(proc_tref(inode));
51 62 }
52 63  
53 64 static inline int proc_fd(struct inode *inode)
... ... @@ -75,9 +75,13 @@
75 75 {
76 76 struct vm_area_struct * vma;
77 77 int result = -ENOENT;
78   - struct task_struct *task = proc_task(inode);
79   - struct mm_struct * mm = get_task_mm(task);
  78 + struct task_struct *task = get_proc_task(inode);
  79 + struct mm_struct * mm = NULL;
80 80  
  81 + if (task) {
  82 + mm = get_task_mm(task);
  83 + put_task_struct(task);
  84 + }
81 85 if (!mm)
82 86 goto out;
83 87 down_read(&mm->mmap_sem);
... ... @@ -120,7 +124,8 @@
120 124  
121 125 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
122 126 {
123   - struct task_struct *task = m->private;
  127 + struct proc_maps_private *priv = m->private;
  128 + struct task_struct *task = priv->task;
124 129 struct vm_area_struct *vma = v;
125 130 struct mm_struct *mm = vma->vm_mm;
126 131 struct file *file = vma->vm_file;
127 132  
128 133  
... ... @@ -295,12 +300,16 @@
295 300  
296 301 static void *m_start(struct seq_file *m, loff_t *pos)
297 302 {
298   - struct task_struct *task = m->private;
  303 + struct proc_maps_private *priv = m->private;
299 304 unsigned long last_addr = m->version;
300 305 struct mm_struct *mm;
301   - struct vm_area_struct *vma, *tail_vma;
  306 + struct vm_area_struct *vma, *tail_vma = NULL;
302 307 loff_t l = *pos;
303 308  
  309 + /* Clear the per syscall fields in priv */
  310 + priv->task = NULL;
  311 + priv->tail_vma = NULL;
  312 +
304 313 /*
305 314 * We remember last_addr rather than next_addr to hit with
306 315 * mmap_cache most of the time. We have zero last_addr at
307 316  
... ... @@ -311,11 +320,15 @@
311 320 if (last_addr == -1UL)
312 321 return NULL;
313 322  
314   - mm = get_task_mm(task);
  323 + priv->task = get_tref_task(priv->tref);
  324 + if (!priv->task)
  325 + return NULL;
  326 +
  327 + mm = get_task_mm(priv->task);
315 328 if (!mm)
316 329 return NULL;
317 330  
318   - tail_vma = get_gate_vma(task);
  331 + priv->tail_vma = tail_vma = get_gate_vma(priv->task);
319 332 down_read(&mm->mmap_sem);
320 333  
321 334 /* Start with last addr hint */
322 335  
... ... @@ -350,11 +363,9 @@
350 363 return tail_vma;
351 364 }
352 365  
353   -static void m_stop(struct seq_file *m, void *v)
  366 +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
354 367 {
355   - struct task_struct *task = m->private;
356   - struct vm_area_struct *vma = v;
357   - if (vma && vma != get_gate_vma(task)) {
  368 + if (vma && vma != priv->tail_vma) {
358 369 struct mm_struct *mm = vma->vm_mm;
359 370 up_read(&mm->mmap_sem);
360 371 mmput(mm);
361 372  
362 373  
363 374  
... ... @@ -363,17 +374,27 @@
363 374  
364 375 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
365 376 {
366   - struct task_struct *task = m->private;
  377 + struct proc_maps_private *priv = m->private;
367 378 struct vm_area_struct *vma = v;
368   - struct vm_area_struct *tail_vma = get_gate_vma(task);
  379 + struct vm_area_struct *tail_vma = priv->tail_vma;
369 380  
370 381 (*pos)++;
371 382 if (vma && (vma != tail_vma) && vma->vm_next)
372 383 return vma->vm_next;
373   - m_stop(m, v);
  384 + vma_stop(priv, vma);
374 385 return (vma != tail_vma)? tail_vma: NULL;
375 386 }
376 387  
  388 +static void m_stop(struct seq_file *m, void *v)
  389 +{
  390 + struct proc_maps_private *priv = m->private;
  391 + struct vm_area_struct *vma = v;
  392 +
  393 + vma_stop(priv, vma);
  394 + if (priv->task)
  395 + put_task_struct(priv->task);
  396 +}
  397 +
377 398 static struct seq_operations proc_pid_maps_op = {
378 399 .start = m_start,
379 400 .next = m_next,
... ... @@ -391,11 +412,18 @@
391 412 static int do_maps_open(struct inode *inode, struct file *file,
392 413 struct seq_operations *ops)
393 414 {
394   - struct task_struct *task = proc_task(inode);
395   - int ret = seq_open(file, ops);
396   - if (!ret) {
397   - struct seq_file *m = file->private_data;
398   - m->private = task;
  415 + struct proc_maps_private *priv;
  416 + int ret = -ENOMEM;
  417 + priv = kzalloc(sizeof(*priv), GFP_KERNEL);
  418 + if (priv) {
  419 + priv->tref = proc_tref(inode);
  420 + ret = seq_open(file, ops);
  421 + if (!ret) {
  422 + struct seq_file *m = file->private_data;
  423 + m->private = priv;
  424 + } else {
  425 + kfree(priv);
  426 + }
399 427 }
400 428 return ret;
401 429 }
... ... @@ -409,7 +437,7 @@
409 437 .open = maps_open,
410 438 .read = seq_read,
411 439 .llseek = seq_lseek,
412   - .release = seq_release,
  440 + .release = seq_release_private,
413 441 };
414 442  
415 443 #ifdef CONFIG_NUMA
... ... @@ -431,7 +459,7 @@
431 459 .open = numa_maps_open,
432 460 .read = seq_read,
433 461 .llseek = seq_lseek,
434   - .release = seq_release,
  462 + .release = seq_release_private,
435 463 };
436 464 #endif
437 465  
... ... @@ -444,6 +472,6 @@
444 472 .open = smaps_open,
445 473 .read = seq_read,
446 474 .llseek = seq_lseek,
447   - .release = seq_release,
  475 + .release = seq_release_private,
448 476 };
include/linux/proc_fs.h
... ... @@ -246,7 +246,7 @@
246 246 #endif
247 247  
248 248 struct proc_inode {
249   - struct task_struct *task;
  249 + struct task_ref *tref;
250 250 int fd;
251 251 union {
252 252 int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
... ... @@ -265,6 +265,12 @@
265 265 {
266 266 return PROC_I(inode)->pde;
267 267 }
  268 +
  269 +struct proc_maps_private {
  270 + struct task_ref *tref;
  271 + struct task_struct *task;
  272 + struct vm_area_struct *tail_vma;
  273 +};
268 274  
269 275 #endif /* _LINUX_PROC_FS_H */
kernel/cpuset.c
... ... @@ -50,6 +50,7 @@
50 50 #include <linux/time.h>
51 51 #include <linux/backing-dev.h>
52 52 #include <linux/sort.h>
  53 +#include <linux/task_ref.h>
53 54  
54 55 #include <asm/uaccess.h>
55 56 #include <asm/atomic.h>
56 57  
57 58  
58 59  
59 60  
60 61  
61 62  
62 63  
63 64  
64 65  
65 66  
... ... @@ -2442,31 +2443,43 @@
2442 2443 */
2443 2444 static int proc_cpuset_show(struct seq_file *m, void *v)
2444 2445 {
  2446 + struct task_ref *tref;
2445 2447 struct task_struct *tsk;
2446 2448 char *buf;
2447   - int retval = 0;
  2449 + int retval;
2448 2450  
  2451 + retval = -ENOMEM;
2449 2452 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2450 2453 if (!buf)
2451   - return -ENOMEM;
  2454 + goto out;
2452 2455  
2453   - tsk = m->private;
  2456 + retval = -ESRCH;
  2457 + tref = m->private;
  2458 + tsk = get_tref_task(tref);
  2459 + if (!tsk)
  2460 + goto out_free;
  2461 +
  2462 + retval = -EINVAL;
2454 2463 mutex_lock(&manage_mutex);
  2464 +
2455 2465 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2456 2466 if (retval < 0)
2457   - goto out;
  2467 + goto out_unlock;
2458 2468 seq_puts(m, buf);
2459 2469 seq_putc(m, '\n');
2460   -out:
  2470 +out_unlock:
2461 2471 mutex_unlock(&manage_mutex);
  2472 + put_task_struct(tsk);
  2473 +out_free:
2462 2474 kfree(buf);
  2475 +out:
2463 2476 return retval;
2464 2477 }
2465 2478  
2466 2479 static int cpuset_open(struct inode *inode, struct file *file)
2467 2480 {
2468   - struct task_struct *tsk = PROC_I(inode)->task;
2469   - return single_open(file, proc_cpuset_show, tsk);
  2481 + struct task_ref *tref = PROC_I(inode)->tref;
  2482 + return single_open(file, proc_cpuset_show, tref);
2470 2483 }
2471 2484  
2472 2485 struct file_operations proc_cpuset_operations = {
mm/mempolicy.c
... ... @@ -1821,7 +1821,7 @@
1821 1821  
1822 1822 int show_numa_map(struct seq_file *m, void *v)
1823 1823 {
1824   - struct task_struct *task = m->private;
  1824 + struct proc_maps_private *priv = m->private;
1825 1825 struct vm_area_struct *vma = v;
1826 1826 struct numa_maps *md;
1827 1827 struct file *file = vma->vm_file;
... ... @@ -1837,7 +1837,7 @@
1837 1837 return 0;
1838 1838  
1839 1839 mpol_to_str(buffer, sizeof(buffer),
1840   - get_vma_policy(task, vma, vma->vm_start));
  1840 + get_vma_policy(priv->task, vma, vma->vm_start));
1841 1841  
1842 1842 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1843 1843  
... ... @@ -1891,7 +1891,7 @@
1891 1891 kfree(md);
1892 1892  
1893 1893 if (m->count < m->size)
1894   - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
  1894 + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1895 1895 return 0;
1896 1896 }