24 Mar, 2019

1 commit

  • commit a0ce2f0aa6ad97c3d4927bf2ca54bcebdf062d55 upstream.

    Before this patch, it was possible for two pipes to affect each other after
    data had been transferred between them with tee():

    ============
    $ cat tee_test.c

    int main(void) {
    int pipe_a[2];
    if (pipe(pipe_a)) err(1, "pipe");
    int pipe_b[2];
    if (pipe(pipe_b)) err(1, "pipe");
    if (write(pipe_a[1], "abcd", 4) != 4) err(1, "write");
    if (tee(pipe_a[0], pipe_b[1], 2, 0) != 2) err(1, "tee");
    if (write(pipe_b[1], "xx", 2) != 2) err(1, "write");

    char buf[5];
    if (read(pipe_a[0], buf, 4) != 4) err(1, "read");
    buf[4] = 0;
    printf("got back: '%s'\n", buf);
    }
    $ gcc -o tee_test tee_test.c
    $ ./tee_test
    got back: 'abxx'
    $
    ============

    As suggested by Al Viro, fix it by creating a separate type for
    non-mergeable pipe buffers, then changing the types of buffers in
    splice_pipe_to_pipe() and link_pipe().

    Cc:
    Fixes: 7c77f0b3f920 ("splice: implement pipe to pipe splicing")
    Fixes: 70524490ee2e ("[PATCH] splice: add support for sys_tee()")
    Suggested-by: Al Viro
    Signed-off-by: Jann Horn
    Signed-off-by: Al Viro
    Signed-off-by: Greg Kroah-Hartman

    Jann Horn
     

16 Jun, 2018

1 commit


13 Jun, 2018

1 commit

  • The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
    patch replaces cases of:

    kmalloc(a * b, gfp)

    with:
    kmalloc_array(a * b, gfp)

    as well as handling cases of:

    kmalloc(a * b * c, gfp)

    with:

    kmalloc(array3_size(a, b, c), gfp)

    as it's slightly less ugly than:

    kmalloc_array(array_size(a, b), c, gfp)

    This does, however, attempt to ignore constant size factors like:

    kmalloc(4 * 1024, gfp)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.

    The tools/ directory was manually excluded, since it has its own
    implementation of kmalloc().

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kmalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kmalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kmalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kmalloc
    + kmalloc_array
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kmalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kmalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2 factors products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kmalloc(sizeof(THING) * C2, ...)
    |
    kmalloc(sizeof(TYPE) * C2, ...)
    |
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(C1 * C2, ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

11 Jun, 2018

1 commit


03 Apr, 2018

1 commit

  • Using the fs-internal do_vmsplice() helper allows us to get rid of the
    fs-internal call to the sys_vmsplice() syscall.

    This patch is part of a series which removes in-kernel calls to syscalls.
    On this basis, the syscall entry path can be streamlined. For details, see
    http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

    Cc: Al Viro
    Cc: Andrew Morton
    Signed-off-by: Dominik Brodowski

    Dominik Brodowski
     

25 Oct, 2017

1 commit

  • …READ_ONCE()/WRITE_ONCE()

    Please do not apply this to mainline directly, instead please re-run the
    coccinelle script shown below and apply its output.

    For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
    preference to ACCESS_ONCE(), and new code is expected to use one of the
    former. So far, there's been no reason to change most existing uses of
    ACCESS_ONCE(), as these aren't harmful, and changing them results in
    churn.

    However, for some features, the read/write distinction is critical to
    correct operation. To distinguish these cases, separate read/write
    accessors must be used. This patch migrates (most) remaining
    ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following
    coccinelle script:

    ----
    // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and
    // WRITE_ONCE()

    // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch

    virtual patch

    @ depends on patch @
    expression E1, E2;
    @@

    - ACCESS_ONCE(E1) = E2
    + WRITE_ONCE(E1, E2)

    @ depends on patch @
    expression E;
    @@

    - ACCESS_ONCE(E)
    + READ_ONCE(E)
    ----

    Signed-off-by: Mark Rutland <mark.rutland@arm.com>
    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Cc: davem@davemloft.net
    Cc: linux-arch@vger.kernel.org
    Cc: mpe@ellerman.id.au
    Cc: shuah@kernel.org
    Cc: snitzer@redhat.com
    Cc: thor.thayer@linux.intel.com
    Cc: tj@kernel.org
    Cc: viro@zeniv.linux.org.uk
    Cc: will.deacon@arm.com
    Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com
    Signed-off-by: Ingo Molnar <mingo@kernel.org>

    Mark Rutland
     

05 Sep, 2017

1 commit


30 Jun, 2017

1 commit


03 May, 2017

1 commit


04 Mar, 2017

1 commit

  • Pull sched.h split-up from Ingo Molnar:
    "The point of these changes is to significantly reduce the
    header footprint, to speed up the kernel build and to
    have a cleaner header structure.

    After these changes the new 's typical preprocessed
    size goes down from a previous ~0.68 MB (~22K lines) to ~0.45 MB (~15K
    lines), which is around 40% faster to build on typical configs.

    Not much changed from the last version (-v2) posted three weeks ago: I
    eliminated quirks, backmerged fixes plus I rebased it to an upstream
    SHA1 from yesterday that includes most changes queued up in -next plus
    all sched.h changes that were pending from Andrew.

    I've re-tested the series both on x86 and on cross-arch defconfigs,
    and did a bisectability test at a number of random points.

    I tried to test as many build configurations as possible, but some
    build breakage is probably still left - but it should be mostly
    limited to architectures that have no cross-compiler binaries
    available on kernel.org, and non-default configurations"

    * 'WIP.sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (146 commits)
    sched/headers: Clean up
    sched/headers: Remove #ifdefs from
    sched/headers: Remove the include from
    sched/headers, hrtimer: Remove the include from
    sched/headers, x86/apic: Remove the header inclusion from
    sched/headers, timers: Remove the include from
    sched/headers: Remove from
    sched/headers: Remove from
    sched/core: Remove unused prefetch_stack()
    sched/headers: Remove from
    sched/headers: Remove the 'init_pid_ns' prototype from
    sched/headers: Remove from
    sched/headers: Remove from
    sched/headers: Remove the runqueue_is_locked() prototype
    sched/headers: Remove from
    sched/headers: Remove from
    sched/headers: Remove from
    sched/headers: Remove from
    sched/headers: Remove the include from
    sched/headers: Remove from
    ...

    Linus Torvalds
     

02 Mar, 2017

2 commits


20 Feb, 2017

1 commit


17 Feb, 2017

1 commit

  • Flags (PIPE_BUF_FLAG_PACKET, PIPE_BUF_FLAG_GIFT) could remain on the
    unused part of the pipe ring buffer. Previously splice_to_pipe() left
    the flags value alone, which could result in incorrect behavior.

    Uninitialized flags appears to have been there from the introduction of
    the splice syscall.

    Signed-off-by: Miklos Szeredi
    Cc: # 2.6.17+
    Signed-off-by: Linus Torvalds

    Miklos Szeredi
     

27 Dec, 2016

3 commits


22 Dec, 2016

1 commit

  • Commit 8924feff66f3 ("splice: lift pipe_lock out of splice_to_pipe()")
    caused a regression when there were no more readers left on a pipe that
    was being spliced into: rather than the expected SIGPIPE and -EPIPE
    return value, the writer would end up waiting forever for space to free
    up (which obviously was not going to happen with no readers around).

    Fixes: 8924feff66f3 ("splice: lift pipe_lock out of splice_to_pipe()")
    Reported-and-tested-by: Andreas Schwab
    Debugged-by: Al Viro
    Cc: stable@kernel.org # v4.9
    Signed-off-by: Linus Torvalds

    Linus Torvalds
     

14 Dec, 2016

1 commit

  • Pull block layer updates from Jens Axboe:
    "This is the main block pull request this series. Contrary to previous
    release, I've kept the core and driver changes in the same branch. We
    always ended up having dependencies between the two for obvious
    reasons, so makes more sense to keep them together. That said, I'll
    probably try and keep more topical branches going forward, especially
    for cycles that end up being as busy as this one.

    The major parts of this pull request is:

    - Improved support for O_DIRECT on block devices, with a small
    private implementation instead of using the pig that is
    fs/direct-io.c. From Christoph.

    - Request completion tracking in a scalable fashion. This is utilized
    by two components in this pull, the new hybrid polling and the
    writeback queue throttling code.

    - Improved support for polling with O_DIRECT, adding a hybrid mode
    that combines pure polling with an initial sleep. From me.

    - Support for automatic throttling of writeback queues on the block
    side. This uses feedback from the device completion latencies to
    scale the queue on the block side up or down. From me.

    - Support from SMR drives in the block layer and for SD. From Hannes
    and Shaun.

    - Multi-connection support for nbd. From Josef.

    - Cleanup of request and bio flags, so we have a clear split between
    which are bio (or rq) private, and which ones are shared. From
    Christoph.

    - A set of patches from Bart, that improve how we handle queue
    stopping and starting in blk-mq.

    - Support for WRITE_ZEROES from Chaitanya.

    - Lightnvm updates from Javier/Matias.

    - Supoort for FC for the nvme-over-fabrics code. From James Smart.

    - A bunch of fixes from a whole slew of people, too many to name
    here"

    * 'for-4.10/block' of git://git.kernel.dk/linux-block: (182 commits)
    blk-stat: fix a few cases of missing batch flushing
    blk-flush: run the queue when inserting blk-mq flush
    elevator: make the rqhash helpers exported
    blk-mq: abstract out blk_mq_dispatch_rq_list() helper
    blk-mq: add blk_mq_start_stopped_hw_queue()
    block: improve handling of the magic discard payload
    blk-wbt: don't throttle discard or write zeroes
    nbd: use dev_err_ratelimited in io path
    nbd: reset the setup task for NBD_CLEAR_SOCK
    nvme-fabrics: Add FC LLDD loopback driver to test FC-NVME
    nvme-fabrics: Add target support for FC transport
    nvme-fabrics: Add host support for FC transport
    nvme-fabrics: Add FC transport LLDD api definitions
    nvme-fabrics: Add FC transport FC-NVME definitions
    nvme-fabrics: Add FC transport error codes to nvme.h
    Add type 0x28 NVME type code to scsi fc headers
    nvme-fabrics: patch target code in prep for FC transport support
    nvme-fabrics: set sqe.command_id in core not transports
    parser: add u64 number parser
    nvme-rdma: align to generic ib_event logging helper
    ...

    Linus Torvalds
     

27 Nov, 2016

1 commit

  • Botched calculation of number of pages. As the result,
    we were dropping pieces when doing splice to pipe from
    e.g. 9p.

    Reported-by: Alexei Starovoitov
    Tested-by: Alexei Starovoitov
    Signed-off-by: Al Viro

    Al Viro
     

11 Nov, 2016

1 commit

  • i_size check is a leftover from the horrors that used to play with
    the page cache in that function. With the switch to ->read_iter(),
    it's neither needed nor correct - for gfs2 it ends up being buggy,
    since i_size is not guaranteed to be correct until later (inside
    ->read_iter()).

    Spotted-by: Abhi Das
    Signed-off-by: Al Viro

    Al Viro
     

01 Nov, 2016

1 commit


11 Oct, 2016

1 commit

  • by making sure we call iov_iter_advance() on original
    iov_iter even if direct_IO (done on its copy) has returned 0.
    It's a no-op for old iov_iter flavours and does the right thing
    (== truncation of the stuff we'd allocated, but not filled) in
    ITER_PIPE case. Failures (e.g. -EIO) get caught and dealt with
    by cleanup in generic_file_read_iter().

    Signed-off-by: Al Viro

    Al Viro
     

06 Oct, 2016

6 commits

  • Signed-off-by: Miklos Szeredi
    Signed-off-by: Al Viro

    Miklos Szeredi
     
  • Signed-off-by: Miklos Szeredi
    Signed-off-by: Al Viro

    Miklos Szeredi
     
  • Signed-off-by: Miklos Szeredi
    Signed-off-by: Al Viro

    Miklos Szeredi
     
  • we only use iov_iter_get_pages_alloc() and iov_iter_advance() -
    pages are filled by kernel_readv() via a kvec array (as we used
    to do all along), so iov_iter here is used only as a way of
    arranging for those pages to be in pipe.

    Signed-off-by: Al Viro

    Al Viro
     
  • ... and kill the ->splice_read() instances that can be switched to it

    Signed-off-by: Al Viro

    Al Viro
     
  • iov_iter variant for passing data into pipe. copy_to_iter()
    copies data into page(s) it has allocated and stuffs them into
    the pipe; copy_page_to_iter() stuffs there a reference to the
    page given to it. Both will try to coalesce if possible.
    iov_iter_zero() is similar to copy_to_iter(); iov_iter_get_pages()
    and friends will do as copy_to_iter() would have and return the
    pages where the data would've been copied. iov_iter_advance()
    will truncate everything past the spot it has advanced to.

    New primitive: iov_iter_pipe(), used for initializing those.
    pipe should be locked all along.

    Running out of space acts as fault would for iovec-backed ones;
    in other words, giving it to ->read_iter() may result in short
    read if the pipe overflows, or -EFAULT if it happens with nothing
    copied there.

    In other words, ->read_iter() on those acts pretty much like
    ->splice_read(). Moreover, all generic_file_splice_read() users,
    as well as many other ->splice_read() instances can be switched
    to that scheme - that'll happen in the next commit.

    Signed-off-by: Al Viro

    Al Viro
     

04 Oct, 2016

4 commits

  • single-buffer analogue of splice_to_pipe(); vmsplice_to_pipe() switched
    to that, leaving splice_to_pipe() only for ->splice_read() instances
    (and that only until they are converted as well).

    Signed-off-by: Al Viro

    Al Viro
     
  • * splice_to_pipe() stops at pipe overflow and does *not* take pipe_lock
    * ->splice_read() instances do the same
    * vmsplice_to_pipe() and do_splice() (ultimate callers of splice_to_pipe())
    arrange for waiting, looping, etc. themselves.

    That should make pipe_lock the outermost one.

    Unfortunately, existing rules for the amount passed by vmsplice_to_pipe()
    and do_splice() are quite ugly _and_ userland code can be easily broken
    by changing those. It's not even "no more than the maximal capacity of
    this pipe" - it's "once we'd fed pipe->nr_buffers pages into the pipe,
    leave instead of waiting".

    Considering how poorly these rules are documented, let's try "wait for some
    space to appear, unless given SPLICE_F_NONBLOCK, then push into pipe
    and if we run into overflow, we are done".

    Signed-off-by: Al Viro

    Al Viro
     
  • Signed-off-by: Al Viro

    Al Viro
     
  • Signed-off-by: Al Viro

    Al Viro
     

11 May, 2016

1 commit


05 Apr, 2016

1 commit

  • PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
    ago with promise that one day it will be possible to implement page
    cache with bigger chunks than PAGE_SIZE.

    This promise never materialized. And unlikely will.

    We have many places where PAGE_CACHE_SIZE assumed to be equal to
    PAGE_SIZE. And it's constant source of confusion on whether
    PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
    especially on the border between fs and mm.

    Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
    breakage to be doable.

    Let's stop pretending that pages in page cache are special. They are
    not.

    The changes are pretty straight-forward:

    - << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> ;

    - >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> ;

    - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};

    - page_cache_get() -> get_page();

    - page_cache_release() -> put_page();

    This patch contains automated changes generated with coccinelle using
    script below. For some reason, coccinelle doesn't patch header files.
    I've called spatch for them manually.

    The only adjustment after coccinelle is revert of changes to
    PAGE_CAHCE_ALIGN definition: we are going to drop it later.

    There are few places in the code where coccinelle didn't reach. I'll
    fix them manually in a separate patch. Comments and documentation also
    will be addressed with the separate patch.

    virtual patch

    @@
    expression E;
    @@
    - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
    + E

    @@
    expression E;
    @@
    - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
    + E

    @@
    @@
    - PAGE_CACHE_SHIFT
    + PAGE_SHIFT

    @@
    @@
    - PAGE_CACHE_SIZE
    + PAGE_SIZE

    @@
    @@
    - PAGE_CACHE_MASK
    + PAGE_MASK

    @@
    expression E;
    @@
    - PAGE_CACHE_ALIGN(E)
    + PAGE_ALIGN(E)

    @@
    expression E;
    @@
    - page_cache_get(E)
    + get_page(E)

    @@
    expression E;
    @@
    - page_cache_release(E)
    + put_page(E)

    Signed-off-by: Kirill A. Shutemov
    Acked-by: Michal Hocko
    Signed-off-by: Linus Torvalds

    Kirill A. Shutemov
     

04 Apr, 2016

1 commit


19 Mar, 2016

2 commits

  • Al Viro
     
  • Running the following command:

    busybox cat /sys/kernel/debug/tracing/trace_pipe > /dev/null

    with any tracing enabled pretty very quickly leads to various NULL
    pointer dereferences and VM BUG_ON()s, such as these:

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000020
    IP: [] generic_pipe_buf_release+0xc/0x40
    Call Trace:
    [] splice_direct_to_actor+0x143/0x1e0
    [] ? generic_pipe_buf_nosteal+0x10/0x10
    [] do_splice_direct+0x8f/0xb0
    [] do_sendfile+0x199/0x380
    [] SyS_sendfile64+0x90/0xa0
    [] entry_SYSCALL_64_fastpath+0x12/0x6d

    page dumped because: VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0)
    kernel BUG at include/linux/mm.h:367!
    invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
    RIP: [] generic_pipe_buf_release+0x3c/0x40
    Call Trace:
    [] splice_direct_to_actor+0x143/0x1e0
    [] ? generic_pipe_buf_nosteal+0x10/0x10
    [] do_splice_direct+0x8f/0xb0
    [] do_sendfile+0x199/0x380
    [] SyS_sendfile64+0x90/0xa0
    [] tracesys_phase2+0x84/0x89

    (busybox's cat uses sendfile(2), unlike the coreutils version)

    This is because tracing_splice_read_pipe() can call splice_to_pipe()
    with spd->nr_pages == 0. spd_pages underflows in splice_to_pipe() and
    we fill the page pointers and the other fields of the pipe_buffers with
    garbage.

    All other callers of splice_to_pipe() avoid calling it when nr_pages ==
    0, and we could make tracing_splice_read_pipe() do that too, but it
    seems reasonable to have splice_to_page() handle this condition
    gracefully.

    Cc: stable@vger.kernel.org
    Signed-off-by: Rabin Vincent
    Reviewed-by: Christoph Hellwig
    Signed-off-by: Al Viro

    Rabin Vincent
     

05 Mar, 2016

1 commit

  • This way we can set kiocb flags also from the sync read/write path for
    the read_iter/write_iter operations. For now there is no way to pass
    flags to plain read/write operations as there is no real need for that,
    and all flags passed are explicitly rejected for these files.

    Signed-off-by: Milosz Tanski
    [hch: rebased on top of my kiocb changes]
    Signed-off-by: Christoph Hellwig
    Reviewed-by: Stephen Bates
    Tested-by: Stephen Bates
    Acked-by: Jeff Moyer
    Signed-off-by: Al Viro

    Christoph Hellwig
     

09 Jan, 2016

1 commit

  • During testing, I discovered that __generic_file_splice_read() returns
    0 (EOF) when aops->readpage fails with AOP_TRUNCATED_PAGE on the first
    page of a single/multi-page splice read operation. This EOF return code
    causes the userspace test to (correctly) report a zero-length read error
    when it was expecting otherwise.

    The current strategy of returning a partial non-zero read when ->readpage
    returns AOP_TRUNCATED_PAGE works only when the failed page is not the
    first of the lot being processed.

    This patch attempts to retry lookup and call ->readpage again on pages
    that had previously failed with AOP_TRUNCATED_PAGE. With this patch, my
    tests pass and I haven't noticed any unwanted side effects.

    This version removes the thrice-retry loop and instead indefinitely
    retries lookups on AOP_TRUNCATED_PAGE errors from ->readpage. This
    behavior is now similar to do_generic_file_read().

    Signed-off-by: Abhi Das
    Reviewed-by: Jan Kara
    Cc: Bob Peterson
    Cc: Al Viro
    Signed-off-by: Al Viro

    Abhi Das