Commit 51f39a1f0cea1cacf8c787f652f26dfee9611874

Authored by David Drysdale
Committed by Linus Torvalds
1 parent c0ef0cc9d2

syscalls: implement execveat() system call

This patchset adds execveat(2) for x86, and is derived from Meredydd
Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).

The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc filesystem,
at least for executables (rather than scripts).  The current glibc version
of fexecve(3) is implemented via /proc, which causes problems in sandboxed
or otherwise restricted environments.

Given the desire for a /proc-free fexecve() implementation, HPA suggested
(https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be
an appropriate generalization.

Also, having a new syscall means that it can take a flags argument without
backward-compatibility concerns.  The current implementation just defines
the AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be
added in the future -- for example, flags for new namespaces (as suggested
at https://lkml.org/lkml/2006/7/11/474).

Related history:
 - https://lkml.org/lkml/2006/12/27/123 is an example of someone
   realizing that fexecve() is likely to fail in a chroot environment.
 - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
   documenting the /proc requirement of fexecve(3) in its manpage, to
   "prevent other people from wasting their time".
 - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
   problem where a process that did setuid() could not fexecve()
   because it no longer had access to /proc/self/fd; this has since
   been fixed.

This patch (of 4):

Add a new execveat(2) system call.  execveat() is to execve() as openat()
is to open(): it takes a file descriptor that refers to a directory, and
resolves the filename relative to that.

In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers.  This
replicates the functionality of fexecve(), which is a system call on other
UNIX systems; on Linux, glibc instead implements it by opening
"/proc/self/fd/<fd>" (and so relies on /proc being mounted).

The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found.  This does, however, mean that
execution of a script in a /proc-less environment won't work, because the
interpreter has to reopen the script through that /dev/fd path (and
/dev/fd is normally a symlink to /proc/self/fd); also, script execution
via an O_CLOEXEC file descriptor fails (as the file will not be accessible
after exec).

Based on patches by Meredydd Luff.

Signed-off-by: David Drysdale <drysdale@google.com>
Cc: Meredydd Luff <meredydd@senatehouse.org>
Cc: Shuah Khan <shuah.kh@samsung.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Rich Felker <dalias@aerifal.cx>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 13 changed files with 145 additions and 15 deletions Side-by-side Diff

... ... @@ -42,6 +42,10 @@
42 42 return -ENOEXEC;
43 43 }
44 44  
  45 + /* Need to be able to load the file after exec */
  46 + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
  47 + return -ENOENT;
  48 +
45 49 allow_write_access(bprm->file);
46 50 fput(bprm->file);
47 51 bprm->file = NULL;
... ... @@ -144,6 +144,10 @@
144 144 if (!fmt)
145 145 goto ret;
146 146  
  147 + /* Need to be able to load the file after exec */
  148 + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
  149 + return -ENOENT;
  150 +
147 151 if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
148 152 retval = remove_arg_zero(bprm);
149 153 if (retval)
... ... @@ -24,6 +24,16 @@
24 24  
25 25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
26 26 return -ENOEXEC;
  27 +
  28 + /*
  29 + * If the script filename will be inaccessible after exec, typically
  30 + * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give
  31 + * up now (on the assumption that the interpreter will want to load
  32 + * this file).
  33 + */
  34 + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
  35 + return -ENOENT;
  36 +
27 37 /*
28 38 * This section does the #! interpretation.
29 39 * Sorta complicated, but hopefully it will work. -TYT
... ... @@ -748,18 +748,25 @@
748 748  
749 749 #endif /* CONFIG_MMU */
750 750  
751   -static struct file *do_open_exec(struct filename *name)
  751 +static struct file *do_open_execat(int fd, struct filename *name, int flags)
752 752 {
753 753 struct file *file;
754 754 int err;
755   - static const struct open_flags open_exec_flags = {
  755 + struct open_flags open_exec_flags = {
756 756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
757 757 .acc_mode = MAY_EXEC | MAY_OPEN,
758 758 .intent = LOOKUP_OPEN,
759 759 .lookup_flags = LOOKUP_FOLLOW,
760 760 };
761 761  
762   - file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
  762 + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
  763 + return ERR_PTR(-EINVAL);
  764 + if (flags & AT_SYMLINK_NOFOLLOW)
  765 + open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
  766 + if (flags & AT_EMPTY_PATH)
  767 + open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
  768 +
  769 + file = do_filp_open(fd, name, &open_exec_flags);
763 770 if (IS_ERR(file))
764 771 goto out;
765 772  
766 773  
... ... @@ -770,12 +777,13 @@
770 777 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
771 778 goto exit;
772 779  
773   - fsnotify_open(file);
774   -
775 780 err = deny_write_access(file);
776 781 if (err)
777 782 goto exit;
778 783  
  784 + if (name->name[0] != '\0')
  785 + fsnotify_open(file);
  786 +
779 787 out:
780 788 return file;
781 789  
... ... @@ -787,7 +795,7 @@
787 795 struct file *open_exec(const char *name)
788 796 {
789 797 struct filename tmp = { .name = name };
790   - return do_open_exec(&tmp);
  798 + return do_open_execat(AT_FDCWD, &tmp, 0);
791 799 }
792 800 EXPORT_SYMBOL(open_exec);
793 801  
794 802  
... ... @@ -1428,10 +1436,12 @@
1428 1436 /*
1429 1437 * sys_execve() executes a new program.
1430 1438 */
1431   -static int do_execve_common(struct filename *filename,
1432   - struct user_arg_ptr argv,
1433   - struct user_arg_ptr envp)
  1439 +static int do_execveat_common(int fd, struct filename *filename,
  1440 + struct user_arg_ptr argv,
  1441 + struct user_arg_ptr envp,
  1442 + int flags)
1434 1443 {
  1444 + char *pathbuf = NULL;
1435 1445 struct linux_binprm *bprm;
1436 1446 struct file *file;
1437 1447 struct files_struct *displaced;
... ... @@ -1472,7 +1482,7 @@
1472 1482 check_unsafe_exec(bprm);
1473 1483 current->in_execve = 1;
1474 1484  
1475   - file = do_open_exec(filename);
  1485 + file = do_open_execat(fd, filename, flags);
1476 1486 retval = PTR_ERR(file);
1477 1487 if (IS_ERR(file))
1478 1488 goto out_unmark;
... ... @@ -1480,7 +1490,28 @@
1480 1490 sched_exec();
1481 1491  
1482 1492 bprm->file = file;
1483   - bprm->filename = bprm->interp = filename->name;
  1493 + if (fd == AT_FDCWD || filename->name[0] == '/') {
  1494 + bprm->filename = filename->name;
  1495 + } else {
  1496 + if (filename->name[0] == '\0')
  1497 + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
  1498 + else
  1499 + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
  1500 + fd, filename->name);
  1501 + if (!pathbuf) {
  1502 + retval = -ENOMEM;
  1503 + goto out_unmark;
  1504 + }
  1505 + /*
  1506 + * Record that a name derived from an O_CLOEXEC fd will be
  1507 + * inaccessible after exec. Relies on having exclusive access to
  1508 + * current->files (due to unshare_files above).
  1509 + */
  1510 + if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
  1511 + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
  1512 + bprm->filename = pathbuf;
  1513 + }
  1514 + bprm->interp = bprm->filename;
1484 1515  
1485 1516 retval = bprm_mm_init(bprm);
1486 1517 if (retval)
... ... @@ -1521,6 +1552,7 @@
1521 1552 acct_update_integrals(current);
1522 1553 task_numa_free(current);
1523 1554 free_bprm(bprm);
  1555 + kfree(pathbuf);
1524 1556 putname(filename);
1525 1557 if (displaced)
1526 1558 put_files_struct(displaced);
... ... @@ -1538,6 +1570,7 @@
1538 1570  
1539 1571 out_free:
1540 1572 free_bprm(bprm);
  1573 + kfree(pathbuf);
1541 1574  
1542 1575 out_files:
1543 1576 if (displaced)
1544 1577  
... ... @@ -1553,9 +1586,20 @@
1553 1586 {
1554 1587 struct user_arg_ptr argv = { .ptr.native = __argv };
1555 1588 struct user_arg_ptr envp = { .ptr.native = __envp };
1556   - return do_execve_common(filename, argv, envp);
  1589 + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1557 1590 }
1558 1591  
  1592 +int do_execveat(int fd, struct filename *filename,
  1593 + const char __user *const __user *__argv,
  1594 + const char __user *const __user *__envp,
  1595 + int flags)
  1596 +{
  1597 + struct user_arg_ptr argv = { .ptr.native = __argv };
  1598 + struct user_arg_ptr envp = { .ptr.native = __envp };
  1599 +
  1600 + return do_execveat_common(fd, filename, argv, envp, flags);
  1601 +}
  1602 +
1559 1603 #ifdef CONFIG_COMPAT
1560 1604 static int compat_do_execve(struct filename *filename,
1561 1605 const compat_uptr_t __user *__argv,
1562 1606  
... ... @@ -1569,8 +1613,24 @@
1569 1613 .is_compat = true,
1570 1614 .ptr.compat = __envp,
1571 1615 };
1572   - return do_execve_common(filename, argv, envp);
  1616 + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1573 1617 }
  1618 +
  1619 +static int compat_do_execveat(int fd, struct filename *filename,
  1620 + const compat_uptr_t __user *__argv,
  1621 + const compat_uptr_t __user *__envp,
  1622 + int flags)
  1623 +{
  1624 + struct user_arg_ptr argv = {
  1625 + .is_compat = true,
  1626 + .ptr.compat = __argv,
  1627 + };
  1628 + struct user_arg_ptr envp = {
  1629 + .is_compat = true,
  1630 + .ptr.compat = __envp,
  1631 + };
  1632 + return do_execveat_common(fd, filename, argv, envp, flags);
  1633 +}
1574 1634 #endif
1575 1635  
1576 1636 void set_binfmt(struct linux_binfmt *new)
1577 1637  
... ... @@ -1609,12 +1669,39 @@
1609 1669 {
1610 1670 return do_execve(getname(filename), argv, envp);
1611 1671 }
  1672 +
  1673 +SYSCALL_DEFINE5(execveat,
  1674 + int, fd, const char __user *, filename,
  1675 + const char __user *const __user *, argv,
  1676 + const char __user *const __user *, envp,
  1677 + int, flags)
  1678 +{
  1679 + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
  1680 +
  1681 + return do_execveat(fd,
  1682 + getname_flags(filename, lookup_flags, NULL),
  1683 + argv, envp, flags);
  1684 +}
  1685 +
1612 1686 #ifdef CONFIG_COMPAT
1613 1687 COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1614 1688 const compat_uptr_t __user *, argv,
1615 1689 const compat_uptr_t __user *, envp)
1616 1690 {
1617 1691 return compat_do_execve(getname(filename), argv, envp);
  1692 +}
  1693 +
  1694 +COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
  1695 + const char __user *, filename,
  1696 + const compat_uptr_t __user *, argv,
  1697 + const compat_uptr_t __user *, envp,
  1698 + int, flags)
  1699 +{
  1700 + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
  1701 +
  1702 + return compat_do_execveat(fd,
  1703 + getname_flags(filename, lookup_flags, NULL),
  1704 + argv, envp, flags);
1618 1705 }
1619 1706 #endif
... ... @@ -130,7 +130,7 @@
130 130  
131 131 #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
132 132  
133   -static struct filename *
  133 +struct filename *
134 134 getname_flags(const char __user *filename, int flags, int *empty)
135 135 {
136 136 struct filename *result, *err;
include/linux/binfmts.h
... ... @@ -53,6 +53,10 @@
53 53 #define BINPRM_FLAGS_EXECFD_BIT 1
54 54 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT)
55 55  
  56 +/* filename of the binary will be inaccessible after exec */
  57 +#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
  58 +#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
  59 +
56 60 /* Function parameter for binfmt->coredump */
57 61 struct coredump_params {
58 62 const siginfo_t *siginfo;
include/linux/compat.h
... ... @@ -357,6 +357,9 @@
357 357  
358 358 asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
359 359 const compat_uptr_t __user *envp);
  360 +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
  361 + const compat_uptr_t __user *argv,
  362 + const compat_uptr_t __user *envp, int flags);
360 363  
361 364 asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
362 365 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
... ... @@ -2096,6 +2096,7 @@
2096 2096 extern struct file * dentry_open(const struct path *, int, const struct cred *);
2097 2097 extern int filp_close(struct file *, fl_owner_t id);
2098 2098  
  2099 +extern struct filename *getname_flags(const char __user *, int, int *);
2099 2100 extern struct filename *getname(const char __user *);
2100 2101 extern struct filename *getname_kernel(const char *);
2101 2102  
include/linux/sched.h
... ... @@ -2485,6 +2485,10 @@
2485 2485 extern int do_execve(struct filename *,
2486 2486 const char __user * const __user *,
2487 2487 const char __user * const __user *);
  2488 +extern int do_execveat(int, struct filename *,
  2489 + const char __user * const __user *,
  2490 + const char __user * const __user *,
  2491 + int);
2488 2492 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
2489 2493 struct task_struct *fork_idle(int);
2490 2494 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
include/linux/syscalls.h
... ... @@ -877,5 +877,10 @@
877 877 asmlinkage long sys_getrandom(char __user *buf, size_t count,
878 878 unsigned int flags);
879 879 asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
  880 +
  881 +asmlinkage long sys_execveat(int dfd, const char __user *filename,
  882 + const char __user *const __user *argv,
  883 + const char __user *const __user *envp, int flags);
  884 +
880 885 #endif
include/uapi/asm-generic/unistd.h
... ... @@ -707,9 +707,11 @@
707 707 __SYSCALL(__NR_memfd_create, sys_memfd_create)
708 708 #define __NR_bpf 280
709 709 __SYSCALL(__NR_bpf, sys_bpf)
  710 +#define __NR_execveat 281
  711 +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
710 712  
711 713 #undef __NR_syscalls
712   -#define __NR_syscalls 281
  714 +#define __NR_syscalls 282
713 715  
714 716 /*
715 717 * All syscalls below here should go away really,
... ... @@ -226,4 +226,7 @@
226 226  
227 227 /* access BPF programs and maps */
228 228 cond_syscall(sys_bpf);
  229 +
  230 +/* execveat */
  231 +cond_syscall(sys_execveat);
... ... @@ -54,6 +54,9 @@
54 54 case __NR_socketcall:
55 55 return 4;
56 56 #endif
  57 +#ifdef __NR_execveat
  58 + case __NR_execveat:
  59 +#endif
57 60 case __NR_execve:
58 61 return 5;
59 62 default: