13 Jun, 2018

1 commit

  • The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
    patch replaces cases of:

    kmalloc(a * b, gfp)

    with:
    kmalloc_array(a, b, gfp)

    as well as handling cases of:

    kmalloc(a * b * c, gfp)

    with:

    kmalloc(array3_size(a, b, c), gfp)

    as it's slightly less ugly than:

    kmalloc_array(array_size(a, b), c, gfp)

    This does, however, attempt to ignore constant size factors like:

    kmalloc(4 * 1024, gfp)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.

    The tools/ directory was manually excluded, since it has its own
    implementation of kmalloc().

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kmalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kmalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kmalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kmalloc
    + kmalloc_array
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kmalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kmalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2 factors products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kmalloc(sizeof(THING) * C2, ...)
    |
    kmalloc(sizeof(TYPE) * C2, ...)
    |
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(C1 * C2, ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

05 Jun, 2018

3 commits

  • Pull userns updates from Eric Biederman:
    "This is the last couple of vfs bits to enable root in a user namespace
    to mount and manipulate a filesystem with backing store (AKA not a
    virtual filesystem like proc, but a filesystem where the unprivileged
    user controls the content). The target filesystem for this work is
    fuse, and Miklos should be sending you the pull request for the fuse
    bits this merge window.

    The two key patches are "evm: Don't update hmacs in user ns mounts"
    and "vfs: Don't allow changing the link count of an inode with an
    invalid uid or gid". Those close small gaps in the vfs that would be a
    problem if an unprivileged fuse filesystem is mounted.

    The rest of the changes are things that are now safe to allow a root
    user in a user namespace to do with a filesystem they have mounted.
    The most interesting development is that remount is now safe"

    * 'userns-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
    fs: Allow CAP_SYS_ADMIN in s_user_ns to freeze and thaw filesystems
    capabilities: Allow privileged user in s_user_ns to set security.* xattrs
    fs: Allow superblock owner to access do_remount_sb()
    fs: Allow superblock owner to replace invalid owners of inodes
    vfs: Allow userns root to call mknod on owned filesystems.
    vfs: Don't allow changing the link count of an inode with an invalid uid or gid
    evm: Don't update hmacs in user ns mounts

    Linus Torvalds
     
  • Pull misc vfs updates from Al Viro:
    "Misc bits and pieces not fitting into anything more specific"

    * 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    vfs: delete unnecessary assignment in vfs_listxattr
    Documentation: filesystems: update filesystem locking documentation
    vfs: namei: use path_equal() in follow_dotdot()
    fs.h: fix outdated comment about file flags
    __inode_security_revalidate() never gets NULL opt_dentry
    make xattr_getsecurity() static
    vfat: simplify checks in vfat_lookup()
    get rid of dead code in d_find_alias()
    it's SB_BORN, not MS_BORN...
    msdos_rmdir(): kill BS comment
    remove rpc_rmdir()
    fs: avoid fdput() after failed fdget() in vfs_dedupe_file_range()

    Linus Torvalds
     
  • Pull rmdir update from Al Viro:
    "More shrink_dcache_parent()-related stuff - killing the main source of
    potentially contended calls of that on large subtrees"

    * 'work.rmdir' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    rmdir(),rename(): do shrink_dcache_parent() only on success

    Linus Torvalds
     

04 Jun, 2018

1 commit

  • This reverts commit cab64df194667dc5d9d786f0a895f647f5501c0d.

    Having vfs_open() in some cases drop the reference to
    struct file combined with

    error = vfs_open(path, f, cred);
    if (error) {
    put_filp(f);
    return ERR_PTR(error);
    }
    return f;

    is flat-out wrong. It used to be

    error = vfs_open(path, f, cred);
    if (!error) {
    /* from now on we need fput() to dispose of f */
    error = open_check_o_direct(f);
    if (error) {
    fput(f);
    f = ERR_PTR(error);
    }
    } else {
    put_filp(f);
    f = ERR_PTR(error);
    }

    and sure, having that open_check_o_direct() boilerplate gotten rid of is
    nice, but not that way...

    Worse, another call chain (via finish_open()) is FUBAR now wrt
    FILE_OPENED handling - in that case we get error returned, with file
    already hit by fput() *AND* FILE_OPENED not set. Guess what happens in
    path_openat(), when it hits

    if (!(opened & FILE_OPENED)) {
    BUG_ON(!error);
    put_filp(file);
    }

    The root cause of all that crap is that the callers of do_dentry_open()
    have no way to tell which way did it fail; while that could be fixed up
    (by passing something like int *opened to do_dentry_open() and have it
    marked if we'd called ->open()), it's probably much too late in the
    cycle to do so right now.

    Signed-off-by: Al Viro
    Signed-off-by: Linus Torvalds

    Al Viro
     

28 May, 2018

1 commit

  • Once upon a time ->rmdir() instances used to check if victim inode
    had more than one (in-core) reference and failed with -EBUSY if it
    had. The reason was race avoidance - emptiness check is worthless
    if somebody could just go and create new objects in the victim
    directory afterwards.

    With introduction of dcache the checks had been replaced with
    checking the refcount of dentry. However, since a cached negative
    lookup leaves a negative child dentry, such a check had led to false
    positives - with empty foo/ doing stat foo/bar before rmdir foo
    ended up with -EBUSY unless the negative dentry of foo/bar happened
    to be evicted by the time of rmdir(2). That had been fixed by
    doing shrink_dcache_parent() just before the refcount check.

    At the same time, ext2_rmdir() has grown a private solution that
    eliminated those -EBUSY - it did something (setting ->i_size to 0)
    which made any subsequent ext2_add_entry() fail.

    Unfortunately, even with shrink_dcache_parent() the check had been
    racy - after all, the victim itself could be found by dcache lookup
    just after we'd checked its refcount. That got fixed by a new
    helper (dentry_unhash()) that did shrink_dcache_parent() and unhashed
    the sucker if its refcount ended up equal to 1. That got called before
    ->rmdir(), turning the checks in ->rmdir() instances into "if not
    unhashed fail with -EBUSY". Which reduced the boilerplate nicely, but
    had an unpleasant side effect - now shrink_dcache_parent() had been
    done before the emptiness checks, leading to easily triggerable calls
    of shrink_dcache_parent() on arbitrary large subtrees, quite possibly
    nested into each other.

    Several years later the ext2-private trick had been generalized -
    (in-core) inodes of dead directories are flagged and calls of
    lookup, readdir and all directory-modifying methods were prevented
    in so marked directories. Remaining boilerplate in ->rmdir() instances
    became redundant and some instances got rid of it.

    In 2011 the call of dentry_unhash() got shifted into ->rmdir() instances
    and then killed off in all of them. That has led to another problem,
    though - in case of successful rmdir we *want* any (negative) child
    dentries dropped and the victim itself made negative. There's no point
    keeping cached negative lookups in foo when we can get the negative
    lookup of foo itself cached. So shrink_dcache_parent() call had been
    restored; unfortunately, it went into the place where dentry_unhash()
    used to be, i.e. before the ->rmdir() call. Note that we don't unhash
    anymore, so any "is it busy" checks would be racy; fortunately, all of
    them are gone.

    We should've done that call right *after* successful ->rmdir(). That
    reduces contention caused by tree-walking in shrink_dcache_parent()
    and, especially, contention caused by evictions in two nested subtrees
    going on in parallel. The same goes for directory-overwriting rename() -
    the story there had been parallel to that of rmdir().

    Signed-off-by: Al Viro

    Al Viro
     

25 May, 2018

2 commits


18 May, 2018

1 commit


10 Apr, 2018

1 commit

  • Pull vfs namei updates from Al Viro:

    - make lookup_one_len() safe with parent locked only shared (incoming
    afs series wants that)

    - fix of getname_kernel() regression from 2015 (-stable fodder, that
    one).

    * 'work.namei' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    getname_kernel() needs to make sure that ->name != ->iname in long case
    make lookup_one_len() safe to use with directory locked shared
    new helper: __lookup_slow()
    merge common parts of lookup_one_len{,_unlocked} into common helper

    Linus Torvalds
     

08 Apr, 2018

1 commit


07 Apr, 2018

5 commits

  • Pull audit updates from Paul Moore:
    "We didn't have anything to send for v4.16, but we're back with a
    little more than usual for v4.17.

    Eleven patches in total, most fall into the small fix category, but
    there are four non-trivial changes worth calling out:

    - the audit entry filter is being removed after deprecating it for
    quite a while (years of no one really using it because it turns out
    to be not very practical)

    - created our own version of "__mutex_owner()" because the locking
    folks were upset we were using theirs

    - improved our handling of kernel command line parameters to make
    them more forgiving

    - we fixed auditing of symlink operations

    Everything passes the audit-testsuite and as of a few minutes ago it
    merges well with your tree"

    * tag 'audit-pr-20180403' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit:
    audit: add refused symlink to audit_names
    audit: remove path param from link denied function
    audit: link denied should not directly generate PATH record
    audit: make ANOM_LINK obey audit_enabled and audit_dummy_context
    audit: do not panic on invalid boot parameter
    audit: track the owner of the command mutex ourselves
    audit: return on memory error to avoid null pointer dereference
    audit: bail before bug check if audit disabled
    audit: deprecate the AUDIT_FILTER_ENTRY filter
    audit: session ID should not set arch quick field pointer
    audit: update bugtracker and source URIs

    Linus Torvalds
     
  • Signed-off-by: Al Viro

    Al Viro
     
  • lookup_slow() sans locking/unlocking the directory

    Signed-off-by: Al Viro

    Al Viro
     
  • Signed-off-by: Al Viro

    Al Viro
     
  • Pull misc vfs updates from Al Viro:
    "Assorted stuff, including Christoph's I_DIRTY patches"

    * 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    fs: move I_DIRTY_INODE to fs.h
    ubifs: fix bogus __mark_inode_dirty(I_DIRTY_SYNC | I_DIRTY_DATASYNC) call
    ntfs: fix bogus __mark_inode_dirty(I_DIRTY_SYNC | I_DIRTY_DATASYNC) call
    gfs2: fix bogus __mark_inode_dirty(I_DIRTY_SYNC | I_DIRTY_DATASYNC) calls
    fs: fold open_check_o_direct into do_dentry_open
    vfs: Replace stray non-ASCII homoglyph characters with their ASCII equivalents
    vfs: make sure struct filename->iname is word-aligned
    get rid of pointless includes of fs_struct.h
    [poll] annotate SAA6588_CMD_POLL users

    Linus Torvalds
     

05 Apr, 2018

1 commit

  • Pull vfs dcache updates from Al Viro:
    "Part of this is what the trylock loop elimination series has turned
    into, part making d_move() preserve the parent (and thus the path) of
    victim, plus some general cleanups"

    * 'work.dcache' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (22 commits)
    d_genocide: move export to definition
    fold dentry_lock_for_move() into its sole caller and clean it up
    make non-exchanging __d_move() copy ->d_parent rather than swap them
    oprofilefs: don't oops on allocation failure
    lustre: get rid of pointless casts to struct dentry *
    debugfs_lookup(): switch to lookup_one_len_unlocked()
    fold lookup_real() into __lookup_hash()
    take out orphan externs (empty_string/slash_string)
    split d_path() and friends into a separate file
    dcache.c: trim includes
    fs/dcache: Avoid a try_lock loop in shrink_dentry_list()
    get rid of trylock loop around dentry_kill()
    handle move to LRU in retain_dentry()
    dput(): consolidate the "do we need to retain it?" into an inlined helper
    split the slow part of lock_parent() off
    now lock_parent() can't run into killed dentry
    get rid of trylock loop in locking dentries on shrink list
    d_delete(): get rid of trylock loop
    fs/dcache: Move dentry_kill() below lock_parent()
    fs/dcache: Remove stale comment from dentry_kill()
    ...

    Linus Torvalds
     

03 Apr, 2018

7 commits

  • Pull removal of in-kernel calls to syscalls from Dominik Brodowski:
    "System calls are interaction points between userspace and the kernel.
    Therefore, system call functions such as sys_xyzzy() or
    compat_sys_xyzzy() should only be called from userspace via the
    syscall table, but not from elsewhere in the kernel.

    At least on 64-bit x86, it will likely be a hard requirement from
    v4.17 onwards to not call system call functions in the kernel: It is
    better to use a different calling convention for system calls
    there, where struct pt_regs is decoded on-the-fly in a syscall wrapper
    which then hands processing over to the actual syscall function. This
    means that only those parameters which are actually needed for a
    specific syscall are passed on during syscall entry, instead of
    filling in six CPU registers with random user space content all the
    time (which may cause serious trouble down the call chain). Those
    x86-specific patches will be pushed through the x86 tree in the near
    future.

    Moreover, rules on how data may be accessed may differ between kernel
    data and user data. This is another reason why calling sys_xyzzy() is
    generally a bad idea, and -- at most -- acceptable in arch-specific
    code.

    This patchset removes all in-kernel calls to syscall functions in the
    kernel with the exception of arch/. On top of this, it cleans up the
    three places where many syscalls are referenced or prototyped, namely
    kernel/sys_ni.c, include/linux/syscalls.h and include/linux/compat.h"

    * 'syscalls-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux: (109 commits)
    bpf: whitelist all syscalls for error injection
    kernel/sys_ni: remove {sys_,sys_compat} from cond_syscall definitions
    kernel/sys_ni: sort cond_syscall() entries
    syscalls/x86: auto-create compat_sys_*() prototypes
    syscalls: sort syscall prototypes in include/linux/compat.h
    net: remove compat_sys_*() prototypes from net/compat.h
    syscalls: sort syscall prototypes in include/linux/syscalls.h
    kexec: move sys_kexec_load() prototype to syscalls.h
    x86/sigreturn: use SYSCALL_DEFINE0
    x86: fix sys_sigreturn() return type to be long, not unsigned long
    x86/ioport: add ksys_ioperm() helper; remove in-kernel calls to sys_ioperm()
    mm: add ksys_readahead() helper; remove in-kernel calls to sys_readahead()
    mm: add ksys_mmap_pgoff() helper; remove in-kernel calls to sys_mmap_pgoff()
    mm: add ksys_fadvise64_64() helper; remove in-kernel call to sys_fadvise64_64()
    fs: add ksys_fallocate() wrapper; remove in-kernel calls to sys_fallocate()
    fs: add ksys_p{read,write}64() helpers; remove in-kernel calls to syscalls
    fs: add ksys_truncate() wrapper; remove in-kernel calls to sys_truncate()
    fs: add ksys_sync_file_range helper(); remove in-kernel calls to syscall
    kernel: add ksys_setsid() helper; remove in-kernel call to sys_setsid()
    kernel: add ksys_unshare() helper; remove in-kernel calls to sys_unshare()
    ...

    Linus Torvalds
     
  • Using the fs-internal do_linkat() helper allows us to get rid of
    fs-internal calls to the sys_linkat() syscall.

    Introducing the ksys_link() wrapper allows us to avoid the in-kernel
    calls to sys_link() syscall. The ksys_ prefix denotes that this function
    is meant as a drop-in replacement for the syscall. In particular, it uses
    the same calling convention as sys_link().

    In the near future, the only fs-external user of ksys_link() should be
    converted to use vfs_link() instead.

    This patch is part of a series which removes in-kernel calls to syscalls.
    On this basis, the syscall entry path can be streamlined. For details, see
    http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

    Cc: Al Viro
    Cc: Andrew Morton
    Signed-off-by: Dominik Brodowski

    Dominik Brodowski
     
  • Using the fs-internal do_mknodat() helper allows us to get rid of
    fs-internal calls to the sys_mknodat() syscall.

    Introducing the ksys_mknod() wrapper allows us to avoid the in-kernel
    calls to sys_mknod() syscall. The ksys_ prefix denotes that this function
    is meant as a drop-in replacement for the syscall. In particular, it uses
    the same calling convention as sys_mknod().

    This patch is part of a series which removes in-kernel calls to syscalls.
    On this basis, the syscall entry path can be streamlined. For details, see
    http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

    Cc: Al Viro
    Cc: Andrew Morton
    Signed-off-by: Dominik Brodowski

    Dominik Brodowski
     
  • Using the fs-internal do_symlinkat() helper allows us to get rid of
    fs-internal calls to the sys_symlinkat() syscall.

    Introducing the ksys_symlink() wrapper allows us to avoid the in-kernel
    calls to the sys_symlink() syscall. The ksys_ prefix denotes that this
    function is meant as a drop-in replacement for the syscall. In particular,
    it uses the same calling convention as sys_symlink().

    This patch is part of a series which removes in-kernel calls to syscalls.
    On this basis, the syscall entry path can be streamlined. For details, see
    http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

    Cc: Al Viro
    Cc: Andrew Morton
    Signed-off-by: Dominik Brodowski

    Dominik Brodowski
     
  • Using the fs-internal do_mkdirat() helper allows us to get rid of
    fs-internal calls to the sys_mkdirat() syscall.

    Introducing the ksys_mkdir() wrapper allows us to avoid the in-kernel calls
    to the sys_mkdir() syscall. The ksys_ prefix denotes that this function is
    meant as a drop-in replacement for the syscall. In particular, it uses the
    same calling convention as sys_mkdir().

    This patch is part of a series which removes in-kernel calls to syscalls.
    On this basis, the syscall entry path can be streamlined. For details, see
    http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

    Cc: Al Viro
    Cc: Andrew Morton
    Signed-off-by: Dominik Brodowski

    Dominik Brodowski
     
  • Using this wrapper allows us to avoid the in-kernel calls to the
    sys_rmdir() syscall. The ksys_ prefix denotes that this function is meant
    as a drop-in replacement for the syscall. In particular, it uses the same
    calling convention as sys_rmdir().

    This patch is part of a series which removes in-kernel calls to syscalls.
    On this basis, the syscall entry path can be streamlined. For details, see
    http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

    Cc: Al Viro
    Cc: Andrew Morton
    Signed-off-by: Dominik Brodowski

    Dominik Brodowski
     
  • Using this helper removes in-kernel calls to the sys_renameat2() syscall.

    This patch is part of a series which removes in-kernel calls to syscalls.
    On this basis, the syscall entry path can be streamlined. For details, see
    http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

    Cc: Alexander Viro
    Signed-off-by: Dominik Brodowski

    Dominik Brodowski
     

30 Mar, 2018

1 commit


28 Mar, 2018

1 commit


21 Mar, 2018

2 commits

  • Audit link denied events for symlinks had duplicate PATH records rather
    than just updating the existing PATH record. Update the symlink's PATH
    record with the current dentry and inode information.

    See: https://github.com/linux-audit/audit-kernel/issues/21

    Signed-off-by: Richard Guy Briggs
    Signed-off-by: Paul Moore

    Richard Guy Briggs
     
  • In commit 45b578fe4c3cade6f4ca1fc934ce199afd857edc
    ("audit: link denied should not directly generate PATH record")
    the need for the struct path *link parameter was removed.
    Remove the now useless struct path argument.

    Signed-off-by: Richard Guy Briggs
    Signed-off-by: Paul Moore

    Richard Guy Briggs
     

19 Mar, 2018

1 commit

  • I noticed that offsetof(struct filename, iname) is actually 28 on 64
    bit platforms, so we always pass an unaligned pointer to
    strncpy_from_user. This is mostly a problem for those 64 bit platforms
    without HAVE_EFFICIENT_UNALIGNED_ACCESS, but even on x86_64, unaligned
    accesses carry a penalty.

    A user-space microbenchmark doing nothing but strncpy_from_user from the
    same (aligned) source string runs about 5% faster when the destination
    is aligned. That number increases to 20% when the string is long
    enough (~32 bytes) that we cross a cache line boundary - that's for
    example the case for about half the files a "git status" in a kernel
    tree ends up stat'ing.

    This won't make any real-life workloads 5%, or even 1%, faster, but path
    lookup is common enough that cutting even a few cycles should be
    worthwhile. So ensure we always pass an aligned destination pointer to
    strncpy_from_user. Instead of explicit padding, simply swap the refcnt
    and aname members, as suggested by Al Viro.

    Signed-off-by: Rasmus Villemoes
    Signed-off-by: Al Viro

    Rasmus Villemoes
     

16 Mar, 2018

1 commit

  • On nfsv2 and nfsv3 the nfs server can export subsets of the same
    filesystem and report the same filesystem identifier, so that the nfs
    client can know they are the same filesystem. The subsets can be from
    disjoint directory trees. The nfsv2 and nfsv3 filesystems provide no
    way to find the common root of all directory trees exported from the
    server with the same filesystem identifier.

    The practical result is that in struct super s_root for nfs s_root is
    not necessarily the root of the filesystem. The nfs mount code sets
    s_root to the root of the first subset of the nfs filesystem that the
    kernel mounts.

    This affects the dcache invalidation code in generic_shutdown_super
    currently called shrink_dcache_for_umount and that code for years
    has gone through an additional list of dentries that might be dentry
    trees that need to be freed to accommodate nfs.

    When I wrote path_connected I did not realize nfs was so special, and
    its heuristic for avoiding calling is_subdir can fail.

    The practical case where this fails is when there is a move of a
    directory from the subtree exposed by one nfs mount to the subtree
    exposed by another nfs mount. This move can happen either locally or
    remotely, with the remote case requiring that the move directory be cached
    before the move and that after the move someone walks the path
    to where the move directory now exists and in so doing causes the
    already cached directory to be moved in the dcache through the magic
    of d_splice_alias.

    If someone whose working directory is in the move directory or a
    subdirectory and now starts calling .. from the initial mount of nfs
    (where s_root == mnt_root), then path_connected as a heuristic will
    not bother with the is_subdir check. As s_root really is not the root
    of the nfs filesystem this heuristic is wrong, and the path may
    actually not be connected and path_connected can fail.

    The is_subdir function might be cheap enough that we can call it
    unconditionally. Verifying that will take some benchmarking and
    the result may not be the same on all kernels this fix needs
    to be backported to. So I am avoiding that for now.

    Filesystems with snapshots such as nilfs and btrfs do something
    similar. But as the directory trees of the snapshots are disjoint
    from one another and from the main directory tree, rename won't move
    things between them and this problem will not occur.

    Cc: stable@vger.kernel.org
    Reported-by: Al Viro
    Fixes: 397d425dc26d ("vfs: Test for and handle paths that are unreachable from their mnt_root")
    Signed-off-by: "Eric W. Biederman"
    Signed-off-by: Al Viro

    Eric W. Biederman
     

01 Feb, 2018

1 commit

  • Pull misc vfs updates from Al Viro:
    "All kinds of misc stuff, without any unifying topic, from various
    people.

    Neil's d_anon patch, several bugfixes, introduction of kvmalloc
    analogue of kmemdup_user(), extending bitfield.h to deal with
    fixed-endians, assorted cleanups all over the place..."

    * 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (28 commits)
    alpha: osf_sys.c: use timespec64 where appropriate
    alpha: osf_sys.c: fix put_tv32 regression
    jffs2: Fix use-after-free bug in jffs2_iget()'s error handling path
    dcache: delete unused d_hash_mask
    dcache: subtract d_hash_shift from 32 in advance
    fs/buffer.c: fold init_buffer() into init_page_buffers()
    fs: fold __inode_permission() into inode_permission()
    fs: add RWF_APPEND
    sctp: use vmemdup_user() rather than badly open-coding memdup_user()
    snd_ctl_elem_init_enum_names(): switch to vmemdup_user()
    replace_user_tlv(): switch to vmemdup_user()
    new primitive: vmemdup_user()
    memdup_user(): switch to GFP_USER
    eventfd: fold eventfd_ctx_get() into eventfd_ctx_fileget()
    eventfd: fold eventfd_ctx_read() into eventfd_read()
    eventfd: convert to use anon_inode_getfd()
    nfs4file: get rid of pointless include of btrfs.h
    uvc_v4l2: clean copyin/copyout up
    vme_user: don't use __copy_..._user()
    usx2y: don't bother with memdup_user() for 16-byte structure
    ...

    Linus Torvalds
     

31 Jan, 2018

2 commits

  • Pull mqueue/bpf vfs cleanups from Al Viro:
    "mqueue and bpf go through rather painful and similar contortions to
    create objects in their dentry trees. Provide a primitive for doing
    that without abusing ->mknod(), switch bpf and mqueue to it.

    Another mqueue-related thing that has ended up in that branch is
    on-demand creation of internal mount (based upon the work of Giuseppe
    Scrivano)"

    * 'work.mqueue' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    mqueue: switch to on-demand creation of internal mount
    tidy do_mq_open() up a bit
    mqueue: clean prepare_open() up
    do_mq_open(): move all work prior to dentry_open() into a helper
    mqueue: fold mq_attr_ok() into mqueue_get_inode()
    move dentry_open() calls up into do_mq_open()
    mqueue: switch to vfs_mkobj(), quit abusing ->d_fsdata
    bpf_obj_do_pin(): switch to vfs_mkobj(), quit abusing ->mknod()
    new primitive: vfs_mkobj()

    Linus Torvalds
     
  • Pull userns updates from Eric Biederman:
    "Between the holidays and other distractions only a small amount of
    namespace work made it into my tree this time.

    Just a final cleanup from a revert several kernels ago and a small
    typo fix from Wolffhardt Schwabe"

    * 'userns-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
    fix typo in assignment of fs default overflow gid
    autofs4: Modify autofs_wait to use current_uid() and current_gid()
    userns: Don't fail follow_automount based on s_user_ns

    Linus Torvalds
     

26 Jan, 2018

1 commit


06 Jan, 2018

1 commit


01 Dec, 2017

1 commit

  • When vfs_submount was added the test to limit automounts from
    filesystems with s_user_ns != &init_user_ns was accidentally left
    in follow_automount. The test was never about any security concerns
    and was always about how do we implement this for filesystems whose
    s_user_ns != &init_user_ns.

    At the moment this check makes no difference as there are no
    filesystems that both set FS_USERNS_MOUNT and implement d_automount.

    Remove this check now while I am thinking about it so there will not
    be odd booby traps for someone who does want to make this combination
    work.

    vfs_submount still needs improvements to allow this combination to work,
    and vfs_submount contains a check that presents a warning.

    The autofs4 filesystem could be modified to set FS_USERNS_MOUNT and it would
    not need to work on this code path, as userspace performs the mounts.

    Fixes: 93faccbbfa95 ("fs: Better permission checking for submounts")
    Fixes: aeaa4a79ff6a ("fs: Call d_automount with the filesystems creds")
    Acked-by: Ian Kent
    Signed-off-by: "Eric W. Biederman"

    Eric W. Biederman
     

30 Nov, 2017

1 commit

  • Commit 42f461482178 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
    allowed the fstatat(2) system call to properly honor the AT_NO_AUTOMOUNT
    flag but introduced a semantic change.

    In order to honor AT_NO_AUTOMOUNT a semantic change was made to the
    negative dentry case for stat family system calls in follow_automount().

    With this change an automount is no longer triggered unconditionally
    in this case; an error is returned instead.

    This has caused more problems than I expected so reverting the change is
    needed.

    In a discussion with Neil Brown it was concluded that the automount(8)
    daemon can implement this change without kernel modifications. So that
    will be done instead and the autofs module documentation updated with a
    description of the problem and what needs to be done by module users for
    this specific case.

    Link: http://lkml.kernel.org/r/151174730120.6162.3848002191530283984.stgit@pluto.themaw.net
    Fixes: 42f4614821 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
    Signed-off-by: Ian Kent
    Cc: Neil Brown
    Cc: Al Viro
    Cc: David Howells
    Cc: Colin Walters
    Cc: Ondrej Holy
    Cc: [4.11+]
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ian Kent
     

18 Nov, 2017

2 commits

  • Pull misc vfs updates from Al Viro:
    "Assorted stuff, really no common topic here"

    * 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    vfs: grab the lock instead of blocking in __fd_install during resizing
    vfs: stop clearing close on exec when closing a fd
    include/linux/fs.h: fix comment about struct address_space
    fs: make fiemap work from compat_ioctl
    coda: fix 'kernel memory exposure attempt' in fsync
    pstore: remove unneeded unlikely()
    vfs: remove unneeded unlikely()
    stubs for mount_bdev() and kill_block_super() in !CONFIG_BLOCK case
    make vfs_ustat() static
    do_handle_open() should be static
    elf_fdpic: fix unused variable warning
    fold destroy_super() into __put_super()
    new helper: destroy_unused_super()
    fix address space warnings in ipc/
    acct.h: get rid of detritus

    Linus Torvalds
     
  • Pull compat and uaccess updates from Al Viro:

    - {get,put}_compat_sigset() series

    - assorted compat ioctl stuff

    - more set_fs() elimination

    - a few more timespec64 conversions

    - several removals of pointless access_ok() in places where it was
    followed only by non-__ variants of primitives

    * 'misc.compat' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (24 commits)
    coredump: call do_unlinkat directly instead of sys_unlink
    fs: expose do_unlinkat for built-in callers
    ext4: take handling of EXT4_IOC_GROUP_ADD into a helper, get rid of set_fs()
    ipmi: get rid of pointless access_ok()
    pi433: sanitize ioctl
    cxlflash: get rid of pointless access_ok()
    mtdchar: get rid of pointless access_ok()
    r128: switch compat ioctls to drm_ioctl_kernel()
    selection: get rid of field-by-field copyin
    VT_RESIZEX: get rid of field-by-field copyin
    i2c compat ioctls: move to ->compat_ioctl()
    sched_rr_get_interval(): move compat to native, get rid of set_fs()
    mips: switch to {get,put}_compat_sigset()
    sparc: switch to {get,put}_compat_sigset()
    s390: switch to {get,put}_compat_sigset()
    ppc: switch to {get,put}_compat_sigset()
    parisc: switch to {get,put}_compat_sigset()
    get_compat_sigset()
    get rid of {get,put}_compat_itimerspec()
    io_getevents: Use timespec64 to represent timeouts
    ...

    Linus Torvalds
     

10 Nov, 2017

1 commit