Commit 51f39a1f0cea1cacf8c787f652f26dfee9611874
Committed by
Linus Torvalds
1 parent
c0ef0cc9d2
Exists in
ti-lsk-linux-4.1.y
and in
10 other branches
syscalls: implement execveat() system call
This patchset adds execveat(2) for x86, and is derived from Meredydd Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).

The primary aim of adding an execveat syscall is to allow an implementation of fexecve(3) that does not rely on the /proc filesystem, at least for executables (rather than scripts). The current glibc version of fexecve(3) is implemented via /proc, which causes problems in sandboxed or otherwise restricted environments.

Given the desire for a /proc-free fexecve() implementation, HPA suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be an appropriate generalization.

Also, having a new syscall means that it can take a flags argument without back-compatibility concerns. The current implementation just defines the AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be added in future -- for example, flags for new namespaces (as suggested at https://lkml.org/lkml/2006/7/11/474).

Related history:
 - https://lkml.org/lkml/2006/12/27/123 is an example of someone realizing that fexecve() is likely to fail in a chroot environment.
 - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered documenting the /proc requirement of fexecve(3) in its manpage, to "prevent other people from wasting their time".
 - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a problem where a process that did setuid() could not fexecve() because it no longer had access to /proc/self/fd; this has since been fixed.

This patch (of 4):

Add a new execveat(2) system call. execveat() is to execve() as openat() is to open(): it takes a file descriptor that refers to a directory, and resolves the filename relative to that.

In addition, if the filename is empty and AT_EMPTY_PATH is specified, execveat() executes the file to which the file descriptor refers. This replicates the functionality of fexecve(), which is a system call in other UNIXen, but in Linux glibc it depends on opening "/proc/self/fd/<fd>" (and so relies on /proc being mounted).

The filename fed to the executed program as argv[0] (or the name of the script fed to a script interpreter) will be of the form "/dev/fd/<fd>" (for an empty filename) or "/dev/fd/<fd>/<filename>", effectively reflecting how the executable was found. This does however mean that execution of a script in a /proc-less environment won't work; also, script execution via an O_CLOEXEC file descriptor fails (as the file will not be accessible after exec).

Based on patches by Meredydd Luff.

Signed-off-by: David Drysdale <drysdale@google.com>
Cc: Meredydd Luff <meredydd@senatehouse.org>
Cc: Shuah Khan <shuah.kh@samsung.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Rich Felker <dalias@aerifal.cx>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 13 changed files with 145 additions and 15 deletions Side-by-side Diff
fs/binfmt_em86.c
fs/binfmt_misc.c
... | ... | @@ -144,6 +144,10 @@ |
144 | 144 | if (!fmt) |
145 | 145 | goto ret; |
146 | 146 | |
147 | + /* Need to be able to load the file after exec */ | |
148 | + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | |
149 | + return -ENOENT; | |
150 | + | |
147 | 151 | if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { |
148 | 152 | retval = remove_arg_zero(bprm); |
149 | 153 | if (retval) |
fs/binfmt_script.c
... | ... | @@ -24,6 +24,16 @@ |
24 | 24 | |
25 | 25 | if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) |
26 | 26 | return -ENOEXEC; |
27 | + | |
28 | + /* | |
29 | + * If the script filename will be inaccessible after exec, typically | |
30 | + * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give | |
31 | + * up now (on the assumption that the interpreter will want to load | |
32 | + * this file). | |
33 | + */ | |
34 | + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | |
35 | + return -ENOENT; | |
36 | + | |
27 | 37 | /* |
28 | 38 | * This section does the #! interpretation. |
29 | 39 | * Sorta complicated, but hopefully it will work. -TYT |
fs/exec.c
... | ... | @@ -748,18 +748,25 @@ |
748 | 748 | |
749 | 749 | #endif /* CONFIG_MMU */ |
750 | 750 | |
751 | -static struct file *do_open_exec(struct filename *name) | |
751 | +static struct file *do_open_execat(int fd, struct filename *name, int flags) | |
752 | 752 | { |
753 | 753 | struct file *file; |
754 | 754 | int err; |
755 | - static const struct open_flags open_exec_flags = { | |
755 | + struct open_flags open_exec_flags = { | |
756 | 756 | .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, |
757 | 757 | .acc_mode = MAY_EXEC | MAY_OPEN, |
758 | 758 | .intent = LOOKUP_OPEN, |
759 | 759 | .lookup_flags = LOOKUP_FOLLOW, |
760 | 760 | }; |
761 | 761 | |
762 | - file = do_filp_open(AT_FDCWD, name, &open_exec_flags); | |
762 | + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) | |
763 | + return ERR_PTR(-EINVAL); | |
764 | + if (flags & AT_SYMLINK_NOFOLLOW) | |
765 | + open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; | |
766 | + if (flags & AT_EMPTY_PATH) | |
767 | + open_exec_flags.lookup_flags |= LOOKUP_EMPTY; | |
768 | + | |
769 | + file = do_filp_open(fd, name, &open_exec_flags); | |
763 | 770 | if (IS_ERR(file)) |
764 | 771 | goto out; |
765 | 772 | |
766 | 773 | |
... | ... | @@ -770,12 +777,13 @@ |
770 | 777 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) |
771 | 778 | goto exit; |
772 | 779 | |
773 | - fsnotify_open(file); | |
774 | - | |
775 | 780 | err = deny_write_access(file); |
776 | 781 | if (err) |
777 | 782 | goto exit; |
778 | 783 | |
784 | + if (name->name[0] != '\0') | |
785 | + fsnotify_open(file); | |
786 | + | |
779 | 787 | out: |
780 | 788 | return file; |
781 | 789 | |
... | ... | @@ -787,7 +795,7 @@ |
787 | 795 | struct file *open_exec(const char *name) |
788 | 796 | { |
789 | 797 | struct filename tmp = { .name = name }; |
790 | - return do_open_exec(&tmp); | |
798 | + return do_open_execat(AT_FDCWD, &tmp, 0); | |
791 | 799 | } |
792 | 800 | EXPORT_SYMBOL(open_exec); |
793 | 801 | |
794 | 802 | |
... | ... | @@ -1428,10 +1436,12 @@ |
1428 | 1436 | /* |
1429 | 1437 | * sys_execve() executes a new program. |
1430 | 1438 | */ |
1431 | -static int do_execve_common(struct filename *filename, | |
1432 | - struct user_arg_ptr argv, | |
1433 | - struct user_arg_ptr envp) | |
1439 | +static int do_execveat_common(int fd, struct filename *filename, | |
1440 | + struct user_arg_ptr argv, | |
1441 | + struct user_arg_ptr envp, | |
1442 | + int flags) | |
1434 | 1443 | { |
1444 | + char *pathbuf = NULL; | |
1435 | 1445 | struct linux_binprm *bprm; |
1436 | 1446 | struct file *file; |
1437 | 1447 | struct files_struct *displaced; |
... | ... | @@ -1472,7 +1482,7 @@ |
1472 | 1482 | check_unsafe_exec(bprm); |
1473 | 1483 | current->in_execve = 1; |
1474 | 1484 | |
1475 | - file = do_open_exec(filename); | |
1485 | + file = do_open_execat(fd, filename, flags); | |
1476 | 1486 | retval = PTR_ERR(file); |
1477 | 1487 | if (IS_ERR(file)) |
1478 | 1488 | goto out_unmark; |
... | ... | @@ -1480,7 +1490,28 @@ |
1480 | 1490 | sched_exec(); |
1481 | 1491 | |
1482 | 1492 | bprm->file = file; |
1483 | - bprm->filename = bprm->interp = filename->name; | |
1493 | + if (fd == AT_FDCWD || filename->name[0] == '/') { | |
1494 | + bprm->filename = filename->name; | |
1495 | + } else { | |
1496 | + if (filename->name[0] == '\0') | |
1497 | + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); | |
1498 | + else | |
1499 | + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", | |
1500 | + fd, filename->name); | |
1501 | + if (!pathbuf) { | |
1502 | + retval = -ENOMEM; | |
1503 | + goto out_unmark; | |
1504 | + } | |
1505 | + /* | |
1506 | + * Record that a name derived from an O_CLOEXEC fd will be | |
1507 | + * inaccessible after exec. Relies on having exclusive access to | |
1508 | + * current->files (due to unshare_files above). | |
1509 | + */ | |
1510 | + if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) | |
1511 | + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; | |
1512 | + bprm->filename = pathbuf; | |
1513 | + } | |
1514 | + bprm->interp = bprm->filename; | |
1484 | 1515 | |
1485 | 1516 | retval = bprm_mm_init(bprm); |
1486 | 1517 | if (retval) |
... | ... | @@ -1521,6 +1552,7 @@ |
1521 | 1552 | acct_update_integrals(current); |
1522 | 1553 | task_numa_free(current); |
1523 | 1554 | free_bprm(bprm); |
1555 | + kfree(pathbuf); | |
1524 | 1556 | putname(filename); |
1525 | 1557 | if (displaced) |
1526 | 1558 | put_files_struct(displaced); |
... | ... | @@ -1538,6 +1570,7 @@ |
1538 | 1570 | |
1539 | 1571 | out_free: |
1540 | 1572 | free_bprm(bprm); |
1573 | + kfree(pathbuf); | |
1541 | 1574 | |
1542 | 1575 | out_files: |
1543 | 1576 | if (displaced) |
1544 | 1577 | |
... | ... | @@ -1553,9 +1586,20 @@ |
1553 | 1586 | { |
1554 | 1587 | struct user_arg_ptr argv = { .ptr.native = __argv }; |
1555 | 1588 | struct user_arg_ptr envp = { .ptr.native = __envp }; |
1556 | - return do_execve_common(filename, argv, envp); | |
1589 | + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); | |
1557 | 1590 | } |
1558 | 1591 | |
1592 | +int do_execveat(int fd, struct filename *filename, | |
1593 | + const char __user *const __user *__argv, | |
1594 | + const char __user *const __user *__envp, | |
1595 | + int flags) | |
1596 | +{ | |
1597 | + struct user_arg_ptr argv = { .ptr.native = __argv }; | |
1598 | + struct user_arg_ptr envp = { .ptr.native = __envp }; | |
1599 | + | |
1600 | + return do_execveat_common(fd, filename, argv, envp, flags); | |
1601 | +} | |
1602 | + | |
1559 | 1603 | #ifdef CONFIG_COMPAT |
1560 | 1604 | static int compat_do_execve(struct filename *filename, |
1561 | 1605 | const compat_uptr_t __user *__argv, |
1562 | 1606 | |
... | ... | @@ -1569,8 +1613,24 @@ |
1569 | 1613 | .is_compat = true, |
1570 | 1614 | .ptr.compat = __envp, |
1571 | 1615 | }; |
1572 | - return do_execve_common(filename, argv, envp); | |
1616 | + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); | |
1573 | 1617 | } |
1618 | + | |
1619 | +static int compat_do_execveat(int fd, struct filename *filename, | |
1620 | + const compat_uptr_t __user *__argv, | |
1621 | + const compat_uptr_t __user *__envp, | |
1622 | + int flags) | |
1623 | +{ | |
1624 | + struct user_arg_ptr argv = { | |
1625 | + .is_compat = true, | |
1626 | + .ptr.compat = __argv, | |
1627 | + }; | |
1628 | + struct user_arg_ptr envp = { | |
1629 | + .is_compat = true, | |
1630 | + .ptr.compat = __envp, | |
1631 | + }; | |
1632 | + return do_execveat_common(fd, filename, argv, envp, flags); | |
1633 | +} | |
1574 | 1634 | #endif |
1575 | 1635 | |
1576 | 1636 | void set_binfmt(struct linux_binfmt *new) |
1577 | 1637 | |
... | ... | @@ -1609,12 +1669,39 @@ |
1609 | 1669 | { |
1610 | 1670 | return do_execve(getname(filename), argv, envp); |
1611 | 1671 | } |
1672 | + | |
1673 | +SYSCALL_DEFINE5(execveat, | |
1674 | + int, fd, const char __user *, filename, | |
1675 | + const char __user *const __user *, argv, | |
1676 | + const char __user *const __user *, envp, | |
1677 | + int, flags) | |
1678 | +{ | |
1679 | + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; | |
1680 | + | |
1681 | + return do_execveat(fd, | |
1682 | + getname_flags(filename, lookup_flags, NULL), | |
1683 | + argv, envp, flags); | |
1684 | +} | |
1685 | + | |
1612 | 1686 | #ifdef CONFIG_COMPAT |
1613 | 1687 | COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, |
1614 | 1688 | const compat_uptr_t __user *, argv, |
1615 | 1689 | const compat_uptr_t __user *, envp) |
1616 | 1690 | { |
1617 | 1691 | return compat_do_execve(getname(filename), argv, envp); |
1692 | +} | |
1693 | + | |
1694 | +COMPAT_SYSCALL_DEFINE5(execveat, int, fd, | |
1695 | + const char __user *, filename, | |
1696 | + const compat_uptr_t __user *, argv, | |
1697 | + const compat_uptr_t __user *, envp, | |
1698 | + int, flags) | |
1699 | +{ | |
1700 | + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; | |
1701 | + | |
1702 | + return compat_do_execveat(fd, | |
1703 | + getname_flags(filename, lookup_flags, NULL), | |
1704 | + argv, envp, flags); | |
1618 | 1705 | } |
1619 | 1706 | #endif |
fs/namei.c
include/linux/binfmts.h
... | ... | @@ -53,6 +53,10 @@ |
53 | 53 | #define BINPRM_FLAGS_EXECFD_BIT 1 |
54 | 54 | #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) |
55 | 55 | |
56 | +/* filename of the binary will be inaccessible after exec */ | |
57 | +#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 | |
58 | +#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT) | |
59 | + | |
56 | 60 | /* Function parameter for binfmt->coredump */ |
57 | 61 | struct coredump_params { |
58 | 62 | const siginfo_t *siginfo; |
include/linux/compat.h
... | ... | @@ -357,6 +357,9 @@ |
357 | 357 | |
358 | 358 | asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, |
359 | 359 | const compat_uptr_t __user *envp); |
360 | +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, | |
361 | + const compat_uptr_t __user *argv, | |
362 | + const compat_uptr_t __user *envp, int flags); | |
360 | 363 | |
361 | 364 | asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, |
362 | 365 | compat_ulong_t __user *outp, compat_ulong_t __user *exp, |
include/linux/fs.h
... | ... | @@ -2096,6 +2096,7 @@ |
2096 | 2096 | extern struct file * dentry_open(const struct path *, int, const struct cred *); |
2097 | 2097 | extern int filp_close(struct file *, fl_owner_t id); |
2098 | 2098 | |
2099 | +extern struct filename *getname_flags(const char __user *, int, int *); | |
2099 | 2100 | extern struct filename *getname(const char __user *); |
2100 | 2101 | extern struct filename *getname_kernel(const char *); |
2101 | 2102 |
include/linux/sched.h
... | ... | @@ -2485,6 +2485,10 @@ |
2485 | 2485 | extern int do_execve(struct filename *, |
2486 | 2486 | const char __user * const __user *, |
2487 | 2487 | const char __user * const __user *); |
2488 | +extern int do_execveat(int, struct filename *, | |
2489 | + const char __user * const __user *, | |
2490 | + const char __user * const __user *, | |
2491 | + int); | |
2488 | 2492 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); |
2489 | 2493 | struct task_struct *fork_idle(int); |
2490 | 2494 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); |
include/linux/syscalls.h
... | ... | @@ -877,5 +877,10 @@ |
877 | 877 | asmlinkage long sys_getrandom(char __user *buf, size_t count, |
878 | 878 | unsigned int flags); |
879 | 879 | asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); |
880 | + | |
881 | +asmlinkage long sys_execveat(int dfd, const char __user *filename, | |
882 | + const char __user *const __user *argv, | |
883 | + const char __user *const __user *envp, int flags); | |
884 | + | |
880 | 885 | #endif |
include/uapi/asm-generic/unistd.h
... | ... | @@ -707,9 +707,11 @@ |
707 | 707 | __SYSCALL(__NR_memfd_create, sys_memfd_create) |
708 | 708 | #define __NR_bpf 280 |
709 | 709 | __SYSCALL(__NR_bpf, sys_bpf) |
710 | +#define __NR_execveat 281 | |
711 | +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) | |
710 | 712 | |
711 | 713 | #undef __NR_syscalls |
712 | -#define __NR_syscalls 281 | |
714 | +#define __NR_syscalls 282 | |
713 | 715 | |
714 | 716 | /* |
715 | 717 | * All syscalls below here should go away really, |
kernel/sys_ni.c
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989
-
mentioned in commit b73989