02 May, 2019

1 commit


18 Aug, 2018

1 commit

  • get_seconds() is deprecated because of the 32-bit overflow and will be
    removed. All callers in ufs also truncate to a 32-bit number, so
    nothing changes during the conversion, but this should be harmless as
    the superblock and cylinder group timestamps are not visible to user
    space, except for checking the fs-dirty state, wich works fine across
    the overflow.

    This moves the call to get_seconds() into a new inline function, with a
    comment explaining the constraints, while converting it to
    ktime_get_real_seconds().

    Link: http://lkml.kernel.org/r/20180718115017.742609-1-arnd@arndb.de
    Signed-off-by: Arnd Bergmann
    Acked-by: Thomas Gleixner
    Cc: Al Viro
    Cc: Thomas Gleixner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Arnd Bergmann
     

13 Jun, 2018

1 commit

  • The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
    patch replaces cases of:

    kmalloc(a * b, gfp)

    with:
    kmalloc_array(a * b, gfp)

    as well as handling cases of:

    kmalloc(a * b * c, gfp)

    with:

    kmalloc(array3_size(a, b, c), gfp)

    as it's slightly less ugly than:

    kmalloc_array(array_size(a, b), c, gfp)

    This does, however, attempt to ignore constant size factors like:

    kmalloc(4 * 1024, gfp)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.

    The tools/ directory was manually excluded, since it has its own
    implementation of kmalloc().

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kmalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kmalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kmalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kmalloc
    + kmalloc_array
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kmalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kmalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2 factors products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kmalloc(sizeof(THING) * C2, ...)
    |
    kmalloc(sizeof(TYPE) * C2, ...)
    |
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(C1 * C2, ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

04 Feb, 2018

1 commit

  • Pull hardened usercopy whitelisting from Kees Cook:
    "Currently, hardened usercopy performs dynamic bounds checking on slab
    cache objects. This is good, but still leaves a lot of kernel memory
    available to be copied to/from userspace in the face of bugs.

    To further restrict what memory is available for copying, this creates
    a way to whitelist specific areas of a given slab cache object for
    copying to/from userspace, allowing much finer granularity of access
    control.

    Slab caches that are never exposed to userspace can declare no
    whitelist for their objects, thereby keeping them unavailable to
    userspace via dynamic copy operations. (Note, an implicit form of
    whitelisting is the use of constant sizes in usercopy operations and
    get_user()/put_user(); these bypass all hardened usercopy checks since
    these sizes cannot change at runtime.)

    This new check is WARN-by-default, so any mistakes can be found over
    the next several releases without breaking anyone's system.

    The series has roughly the following sections:
    - remove %p and improve reporting with offset
    - prepare infrastructure and whitelist kmalloc
    - update VFS subsystem with whitelists
    - update SCSI subsystem with whitelists
    - update network subsystem with whitelists
    - update process memory with whitelists
    - update per-architecture thread_struct with whitelists
    - update KVM with whitelists and fix ioctl bug
    - mark all other allocations as not whitelisted
    - update lkdtm for more sensible test overage"

    * tag 'usercopy-v4.16-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux: (38 commits)
    lkdtm: Update usercopy tests for whitelisting
    usercopy: Restrict non-usercopy caches to size 0
    kvm: x86: fix KVM_XEN_HVM_CONFIG ioctl
    kvm: whitelist struct kvm_vcpu_arch
    arm: Implement thread_struct whitelist for hardened usercopy
    arm64: Implement thread_struct whitelist for hardened usercopy
    x86: Implement thread_struct whitelist for hardened usercopy
    fork: Provide usercopy whitelisting for task_struct
    fork: Define usercopy region in thread_stack slab caches
    fork: Define usercopy region in mm_struct slab caches
    net: Restrict unwhitelisted proto caches to size 0
    sctp: Copy struct sctp_sock.autoclose to userspace using put_user()
    sctp: Define usercopy region in SCTP proto slab cache
    caif: Define usercopy region in caif proto slab cache
    ip: Define usercopy region in IP proto slab cache
    net: Define usercopy region in struct proto slab cache
    scsi: Define usercopy region in scsi_sense_cache slab cache
    cifs: Define usercopy region in cifs_request slab cache
    vxfs: Define usercopy region in vxfs_inode slab cache
    ufs: Define usercopy region in ufs_inode_cache slab cache
    ...

    Linus Torvalds
     

29 Jan, 2018

1 commit


16 Jan, 2018

1 commit

  • The ufs symlink pathnames, stored in struct ufs_inode_info.i_u1.i_symlink
    and therefore contained in the ufs_inode_cache slab cache, need to be
    copied to/from userspace.

    cache object allocation:
    fs/ufs/super.c:
    ufs_alloc_inode(...):
    ...
    ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
    ...
    return &ei->vfs_inode;

    fs/ufs/ufs.h:
    UFS_I(struct inode *inode):
    return container_of(inode, struct ufs_inode_info, vfs_inode);

    fs/ufs/namei.c:
    ufs_symlink(...):
    ...
    inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;

    example usage trace:
    readlink_copy+0x43/0x70
    vfs_readlink+0x62/0x110
    SyS_readlinkat+0x100/0x130

    fs/namei.c:
    readlink_copy(..., link):
    ...
    copy_to_user(..., link, len);

    (inlined in vfs_readlink)
    generic_readlink(dentry, ...):
    struct inode *inode = d_inode(dentry);
    const char *link = inode->i_link;
    ...
    readlink_copy(..., link);

    In support of usercopy hardening, this patch defines a region in the
    ufs_inode_cache slab cache in which userspace copy operations are allowed.

    This region is known as the slab cache's usercopy region. Slab caches
    can now check that each dynamically sized copy operation involving
    cache-managed memory falls entirely within the slab's usercopy region.

    This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
    whitelisting code in the last public patch of grsecurity/PaX based on my
    understanding of the code. Changes or omissions from the original code are
    mine and don't reflect the original grsecurity/PaX code.

    Signed-off-by: David Windsor
    [kees: adjust commit log, provide usage trace]
    Cc: Evgeniy Dushistov
    Signed-off-by: Kees Cook

    David Windsor
     

28 Nov, 2017

1 commit

  • This is a pure automated search-and-replace of the internal kernel
    superblock flags.

    The s_flags are now called SB_*, with the names and the values for the
    moment mirroring the MS_* flags that they're equivalent to.

    Note how the MS_xyz flags are the ones passed to the mount system call,
    while the SB_xyz flags are what we then use in sb->s_flags.

    The script to do this was:

    # places to look in; re security/*: it generally should *not* be
    # touched (that stuff parses mount(2) arguments directly), but
    # there are two places where we really deal with superblock flags.
    FILES="drivers/mtd drivers/staging/lustre fs ipc mm \
    include/linux/fs.h include/uapi/linux/bfs_fs.h \
    security/apparmor/apparmorfs.c security/apparmor/include/lib.h"
    # the list of MS_... constants
    SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \
    DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \
    POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \
    I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \
    ACTIVE NOUSER"

    SED_PROG=
    for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done

    # we want files that contain at least one of MS_...,
    # with fs/namespace.c and fs/pnode.c excluded.
    L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c')

    for f in $L; do sed -i $f $SED_PROG; done

    Requested-by: Al Viro
    Signed-off-by: Linus Torvalds

    Linus Torvalds
     

17 Jul, 2017

1 commit

  • Firstly by applying the following with coccinelle's spatch:

    @@ expression SB; @@
    -SB->s_flags & MS_RDONLY
    +sb_rdonly(SB)

    to effect the conversion to sb_rdonly(sb), then by applying:

    @@ expression A, SB; @@
    (
    -(!sb_rdonly(SB)) && A
    +!sb_rdonly(SB) && A
    |
    -A != (sb_rdonly(SB))
    +A != sb_rdonly(SB)
    |
    -A == (sb_rdonly(SB))
    +A == sb_rdonly(SB)
    |
    -!(sb_rdonly(SB))
    +!sb_rdonly(SB)
    |
    -A && (sb_rdonly(SB))
    +A && sb_rdonly(SB)
    |
    -A || (sb_rdonly(SB))
    +A || sb_rdonly(SB)
    |
    -(sb_rdonly(SB)) != A
    +sb_rdonly(SB) != A
    |
    -(sb_rdonly(SB)) == A
    +sb_rdonly(SB) == A
    |
    -(sb_rdonly(SB)) && A
    +sb_rdonly(SB) && A
    |
    -(sb_rdonly(SB)) || A
    +sb_rdonly(SB) || A
    )

    @@ expression A, B, SB; @@
    (
    -(sb_rdonly(SB)) ? 1 : 0
    +sb_rdonly(SB)
    |
    -(sb_rdonly(SB)) ? A : B
    +sb_rdonly(SB) ? A : B
    )

    to remove left over excess bracketage and finally by applying:

    @@ expression A, SB; @@
    (
    -(A & MS_RDONLY) != sb_rdonly(SB)
    +(bool)(A & MS_RDONLY) != sb_rdonly(SB)
    |
    -(A & MS_RDONLY) == sb_rdonly(SB)
    +(bool)(A & MS_RDONLY) == sb_rdonly(SB)
    )

    to make comparisons against the result of sb_rdonly() (which is a bool)
    work correctly.

    Signed-off-by: David Howells

    David Howells
     

22 Jun, 2017

1 commit

  • Pull more ufs fixes from Al Viro:
    "More UFS fixes, unfortunately including build regression fix for the
    64-bit s_dsize commit. Fixed in this pile:

    - trivial bug in signedness of 32bit timestamps on ufs1

    - ESTALE instead of ufs_error() when doing open-by-fhandle on
    something deleted

    - build regression on 32bit in ufs_new_fragments() - calculating that
    many percents of u64 pulls libgcc stuff on some of those. Mea
    culpa.

    - fix hysteresis loop broken by typo in 2.4.14.7 (right next to the
    location of previous bug).

    - fix the insane limits of said hysteresis loop on filesystems with
    very low percentage of reserved blocks. If it's 5% or less, just
    use the OPTSPACE policy.

    - calculate those limits once and mount time.

    This tree does pass xfstests clean (both ufs1 and ufs2) and it _does_
    survive cross-builds.

    Again, my apologies for missing that, especially since I have noticed
    a related percentage-of-64bit issue in earlier patches (when dealing
    with amount of reserved blocks). Self-LART applied..."

    * 'ufs-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    ufs: fix the logics for tail relocation
    ufs_iget(): fail with -ESTALE on deleted inode
    fix signedness of timestamps on ufs1

    Linus Torvalds
     

18 Jun, 2017

1 commit

  • * original hysteresis loop got broken by typo back in 2002; now
    it never switches out of OPTTIME state. Fixed.
    * critical levels for switching from OPTTIME to OPTSPACE and back
    ought to be calculated once, at mount time.
    * we should use mul_u64_u32_div() for those calculations, now that
    ->s_dsize is 64bit.
    * to quote Kirk McKusick (in 1995 FreeBSD commit message):
    The threshold for switching from time-space and space-time is too small
    when minfree is 5%...so make it stay at space in this case.

    Signed-off-by: Al Viro

    Al Viro
     

17 Jun, 2017

1 commit

  • Pull ufs fixes from Al Viro:
    "Fix assorted ufs bugs: a couple of deadlocks, fs corruption in
    truncate(), oopsen on tail unpacking and truncate when racing with
    vmscan, mild fs corruption (free blocks stats summary buggered, *BSD
    fsck would complain and fix), several instances of broken logics
    around reserved blocks (starting with "check almost never triggers
    when it should" and then there are issues with sufficiently large
    UFS2)"

    [ Note: ufs hasn't gotten any loving in a long time, because nobody
    really seems to use it. These ufs fixes are triggered by people
    actually caring now, not some sudden influx of new bugs. - Linus ]

    * 'ufs-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    ufs_truncate_blocks(): fix the case when size is in the last direct block
    ufs: more deadlock prevention on tail unpacking
    ufs: avoid grabbing ->truncate_mutex if possible
    ufs_get_locked_page(): make sure we have buffer_heads
    ufs: fix s_size/s_dsize users
    ufs: fix reserved blocks check
    ufs: make ufs_freespace() return signed
    ufs: fix logics in "ufs: make fsck -f happy"

    Linus Torvalds
     

15 Jun, 2017

2 commits

  • For UFS2 we need 64bit variants; we even store them in uspi, but
    use 32bit ones instead. One wrinkle is in handling of reserved
    space - recalculating it every time had been stupid all along, but
    now it would become really ugly. Just calculate it once...

    Signed-off-by: Al Viro

    Al Viro
     
  • Storing stats _only_ at new locations is wrong for UFS1; old
    locations should always be kept updated. The check for "has
    been converted to use of new locations" is also wrong - it
    should be "->fs_maxbsize is equal to ->fs_bsize".

    Signed-off-by: Al Viro

    Al Viro
     

11 Jun, 2017

1 commit

  • Pull UFS fixes from Al Viro:
    "This is just the obvious backport fodder; I'm pretty sure that there
    will be more - definitely so wrt performance and quite possibly
    correctness as well"

    * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    ufs: we need to sync inode before freeing it
    excessive checks in ufs_write_failed() and ufs_evict_inode()
    ufs_getfrag_block(): we only grab ->truncate_mutex on block creation path
    ufs_extend_tail(): fix the braino in calling conventions of ufs_new_fragments()
    ufs: set correct ->s_maxsize
    ufs: restore maintaining ->i_blocks
    fix ufs_isblockset()
    ufs: restore proper tail allocation

    Linus Torvalds
     

10 Jun, 2017

1 commit


05 Jun, 2017

1 commit

  • This fixes a problem with reading files larger than 2GB from a UFS-2
    file system:

    https://bugzilla.kernel.org/show_bug.cgi?id=195721

    The incorrect UFS s_maxsize limit became a problem as of commit
    c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()")
    which started using s_maxbytes to avoid a page index overflow in
    do_generic_file_read().

    That caused files to be truncated on UFS-2 file systems because the
    default maximum file size is 2GB (MAX_NON_LFS) and UFS didn't update it.

    Here I simply increase the default to a common value used by other file
    systems.

    Signed-off-by: Richard Narron
    Cc: Al Viro
    Cc: Will B
    Cc: Theodore Ts'o
    Cc: # v4.9 and backports of c2a9737f45e2
    Signed-off-by: Linus Torvalds

    Richard Narron
     

25 Dec, 2016

1 commit


11 Apr, 2016

1 commit


15 Jan, 2016

1 commit

  • Mark those kmem allocations that are known to be easily triggered from
    userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to
    memcg. For the list, see below:

    - threadinfo
    - task_struct
    - task_delay_info
    - pid
    - cred
    - mm_struct
    - vm_area_struct and vm_region (nommu)
    - anon_vma and anon_vma_chain
    - signal_struct
    - sighand_struct
    - fs_struct
    - files_struct
    - fdtable and fdtable->full_fds_bits
    - dentry and external_name
    - inode for all filesystems. This is the most tedious part, because
    most filesystems overwrite the alloc_inode method.

    The list is far from complete, so feel free to add more objects.
    Nevertheless, it should be close to "account everything" approach and
    keep most workloads within bounds. Malevolent users will be able to
    breach the limit, but this was possible even with the former "account
    everything" approach (simply because it did not account everything in
    fact).

    [akpm@linux-foundation.org: coding-style fixes]
    Signed-off-by: Vladimir Davydov
    Acked-by: Johannes Weiner
    Acked-by: Michal Hocko
    Cc: Tejun Heo
    Cc: Greg Thelen
    Cc: Christoph Lameter
    Cc: Pekka Enberg
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vladimir Davydov
     

07 Jul, 2015

2 commits

  • There were 3 remaining users; in two of them we took ->s_lock immediately
    after lock_ufs() and held it until just before unlock_ufs(); the third
    one (statfs) could not be called from itself or from other two (remount
    and sync_fs). Just use ->s_lock in statfs and don't bother with lock_ufs
    at all.

    Signed-off-by: Al Viro

    Al Viro
     
  • * stores to block pointers are under per-inode seqlock (meta_lock) and
    mutex (truncate_mutex)
    * fetches of block pointers are either under truncate_mutex, or wrapped
    into seqretry loop on meta_lock
    * all changes of ->i_size are under truncate_mutex and i_mutex
    * all changes of ->i_lastfrag are under truncate_mutex

    It's similar to what ext2 is doing; the main difference is that unlike
    ext2 we can't rely upon the atomicity of stores into block pointers -
    on UFS2 they are 64bit. So we can't cut the corner when switching
    a pointer from NULL to non-NULL as we could in ext2_splice_branch()
    and need to use meta_lock on all modifications.

    We use seqlock where ext2 uses rwlock; ext2 could probably also benefit
    from such change...

    Another non-trivial difference is that with UFS we *cannot* have reader
    grab truncate_mutex in case of race - it has to keep retrying. That
    might be possible to change, but not until we lift tail unpacking
    several levels up in call chain.

    After that commit we do *NOT* hold fs-wide serialization on accesses
    to block pointers anymore. Moreover, lock_ufs() can become a normal
    mutex now - it's only used on statfs, remount and sync_fs and none
    of those uses are recursive. As the matter of fact, *now* it can be
    collapsed with ->s_lock, and be eventually replaced with saner
    per-cylinder-group spinlocks, but that's a separate story.

    Signed-off-by: Al Viro

    Al Viro
     

05 Jul, 2015

1 commit

  • Pull more vfs updates from Al Viro:
    "Assorted VFS fixes and related cleanups (IMO the most interesting in
    that part are f_path-related things and Eric's descriptor-related
    stuff). UFS regression fixes (it got broken last cycle). 9P fixes.
    fs-cache series, DAX patches, Jan's file_remove_suid() work"

    [ I'd say this is much more than "fixes and related cleanups". The
    file_table locking rule change by Eric Dumazet is a rather big and
    fundamental update even if the patch isn't huge. - Linus ]

    * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (49 commits)
    9p: cope with bogus responses from server in p9_client_{read,write}
    p9_client_write(): avoid double p9_free_req()
    9p: forgetting to cancel request on interrupted zero-copy RPC
    dax: bdev_direct_access() may sleep
    block: Add support for DAX reads/writes to block devices
    dax: Use copy_from_iter_nocache
    dax: Add block size note to documentation
    fs/file.c: __fget() and dup2() atomicity rules
    fs/file.c: don't acquire files->file_lock in fd_install()
    fs:super:get_anon_bdev: fix race condition could cause dev exceed its upper limitation
    vfs: avoid creation of inode number 0 in get_next_ino
    namei: make set_root_rcu() return void
    make simple_positive() public
    ufs: use dir_pages instead of ufs_dir_pages()
    pagemap.h: move dir_pages() over there
    remove the pointless include of lglock.h
    fs: cleanup slight list_entry abuse
    xfs: Correctly lock inode when removing suid and file capabilities
    fs: Call security_ops->inode_killpriv on truncate
    fs: Provide function telling whether file_remove_privs() will do anything
    ...

    Linus Torvalds
     

18 Jun, 2015

1 commit


16 Jun, 2015

1 commit

  • Commit 0244756edc4b98c ("ufs: sb mutex merge + mutex_destroy") generated
    deadlocks in read/write mode on mkdir.

    This patch partially reverts it keeping fixes by Andrew Morton and
    mutex_destroy()

    [AV: fixed a missing bit in ufs_remount()]

    Signed-off-by: Fabian Frederick
    Reported-by: Ian Campbell
    Suggested-by: Jan Kara
    Cc: Ian Campbell
    Cc: Evgeniy Dushistov
    Cc: Alexey Khoroshilov
    Cc: Roger Pau Monne
    Cc: Ian Jackson
    Cc: Al Viro
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Al Viro

    Fabian Frederick
     

02 Jun, 2015

1 commit

  • With the planned cgroup writeback support, backing-dev related
    declarations will be more widely used across block and cgroup;
    unfortunately, including backing-dev.h from include/linux/blkdev.h
    makes cyclic include dependency quite likely.

    This patch separates out backing-dev-defs.h which only has the
    essential definitions and updates blkdev.h to include it. c files
    which need access to more backing-dev details now include
    backing-dev.h directly. This takes backing-dev.h off the common
    include dependency chain making it a lot easier to use it across block
    and cgroup.

    v2: fs/fat build failure fixed.

    Signed-off-by: Tejun Heo
    Reviewed-by: Jan Kara
    Cc: Jens Axboe
    Signed-off-by: Jens Axboe

    Tejun Heo
     

16 Apr, 2015

1 commit


18 Feb, 2015

2 commits

  • Let locking subsystem decide on mutex management. As reported by Andrew
    Morton this patch fixes a bug:

    : lock_ufs() is assuming that on non-preempt uniprocessor, the calling
    : code will run atomically up to the matching unlock_ufs().
    :
    : But that isn't true. The very first site I looked at (ufs_frag_map)
    : does sb_bread() under lock_ufs(). And sb_bread() will call schedule(),
    : very commonly.
    :
    : The ->mutex_owner stuff is a bit hacky but should work OK.

    Signed-off-by: Fabian Frederick
    Cc: Evgeniy Dushistov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Fabian Frederick
     
  • Fix the following coccinelle warning:

    fs/ufs/super.c:1418:7-28: WARNING: casting value returned by memory allocation function to (struct ufs_inode_info *) is useless.

    Signed-off-by: Fabian Frederick
    Cc: Evgeniy Dushistov
    Cc: Joe Perches
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Fabian Frederick
     

09 Aug, 2014

5 commits


07 Jun, 2014

1 commit

  • Commit 788257d6101d ("ufs: remove the BKL") replaced BKL with mutex
    protection using functions lock_ufs, unlock_ufs and struct mutex 'mutex'
    in sb_info.

    Commit b6963327e052 ("ufs: drop lock/unlock super") removed lock/unlock
    super and added struct mutex 's_lock' in sb_info.

    Those 2 mutexes are generally locked/unlocked at the same time except in
    allocation (balloc, ialloc).

    This patch merges the 2 mutexes and propagates first commit solution.
    It also adds mutex destruction before kfree during ufs_fill_super
    failure and ufs_put_super.

    [akpm@linux-foundation.org: avoid ifdefs, return -EROFS not -EINVAL]
    Signed-off-by: Fabian Frederick
    Cc: Evgeniy Dushistov
    Cc: "Chen, Jet"
    Cc: Wu Fengguang
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Fabian Frederick
     

08 Apr, 2014

4 commits


13 Mar, 2014

1 commit

  • Previously, the no-op "mount -o mount /dev/xxx" operation when the
    file system is already mounted read-write causes an implied,
    unconditional syncfs(). This seems pretty stupid, and it's certainly
    documented or guaraunteed to do this, nor is it particularly useful,
    except in the case where the file system was mounted rw and is getting
    remounted read-only.

    However, it's possible that there might be some file systems that are
    actually depending on this behavior. In most file systems, it's
    probably fine to only call sync_filesystem() when transitioning from
    read-write to read-only, and there are some file systems where this is
    not needed at all (for example, for a pseudo-filesystem or something
    like romfs).

    Signed-off-by: "Theodore Ts'o"
    Cc: linux-fsdevel@vger.kernel.org
    Cc: Christoph Hellwig
    Cc: Artem Bityutskiy
    Cc: Adrian Hunter
    Cc: Evgeniy Dushistov
    Cc: Jan Kara
    Cc: OGAWA Hirofumi
    Cc: Anders Larsen
    Cc: Phillip Lougher
    Cc: Kees Cook
    Cc: Mikulas Patocka
    Cc: Petr Vandrovec
    Cc: xfs@oss.sgi.com
    Cc: linux-btrfs@vger.kernel.org
    Cc: linux-cifs@vger.kernel.org
    Cc: samba-technical@lists.samba.org
    Cc: codalist@coda.cs.cmu.edu
    Cc: linux-ext4@vger.kernel.org
    Cc: linux-f2fs-devel@lists.sourceforge.net
    Cc: fuse-devel@lists.sourceforge.net
    Cc: cluster-devel@redhat.com
    Cc: linux-mtd@lists.infradead.org
    Cc: jfs-discussion@lists.sourceforge.net
    Cc: linux-nfs@vger.kernel.org
    Cc: linux-nilfs@vger.kernel.org
    Cc: linux-ntfs-dev@lists.sourceforge.net
    Cc: ocfs2-devel@oss.oracle.com
    Cc: reiserfs-devel@vger.kernel.org

    Theodore Ts'o
     

04 Mar, 2013

1 commit

  • Modify the request_module to prefix the file system type with "fs-"
    and add aliases to all of the filesystems that can be built as modules
    to match.

    A common practice is to build all of the kernel code and leave code
    that is not commonly needed as modules, with the result that many
    users are exposed to any bug anywhere in the kernel.

    Looking for filesystems with a fs- prefix limits the pool of possible
    modules that can be loaded by mount to just filesystems trivially
    making things safer with no real cost.

    Using aliases means user space can control the policy of which
    filesystem modules are auto-loaded by editing /etc/modprobe.d/*.conf
    with blacklist and alias directives. Allowing simple, safe,
    well understood work-arounds to known problematic software.

    This also addresses a rare but unfortunate problem where the filesystem
    name is not the same as it's module name and module auto-loading
    would not work. While writing this patch I saw a handful of such
    cases. The most significant being autofs that lives in the module
    autofs4.

    This is relevant to user namespaces because we can reach the request
    module in get_fs_type() without having any special permissions, and
    people get uncomfortable when a user specified string (in this case
    the filesystem type) goes all of the way to request_module.

    After having looked at this issue I don't think there is any
    particular reason to perform any filtering or permission checks beyond
    making it clear in the module request that we want a filesystem
    module. The common pattern in the kernel is to call request_module()
    without regards to the users permissions. In general all a filesystem
    module does once loaded is call register_filesystem() and go to sleep.
    Which means there is not much attack surface exposed by loading a
    filesytem module unless the filesystem is mounted. In a user
    namespace filesystems are not mounted unless .fs_flags = FS_USERNS_MOUNT,
    which most filesystems do not set today.

    Acked-by: Serge Hallyn
    Acked-by: Kees Cook
    Reported-by: Kees Cook
    Signed-off-by: "Eric W. Biederman"

    Eric W. Biederman