13 Feb, 2019

1 commit

  • [ Upstream commit 216f0efd19b9cc32207934fd1b87a45f2c4c593e ]

    Before this patch, recovery would cause all callbacks to be delayed,
    put on a queue, and afterward they were all queued to the callback
    work queue. This patch does the same thing, but occasionally takes
    a break after 25 of them so it won't swamp the CPU at the expense
    of other RT processes like corosync.

    Signed-off-by: Bob Peterson
    Signed-off-by: David Teigland
    Signed-off-by: Sasha Levin

    Bob Peterson
     

13 Jan, 2019

4 commits


13 Jun, 2018

1 commit

  • The vmalloc() function has no 2-factor argument form, so multiplication
    factors need to be wrapped in array_size(). This patch replaces cases of:

    vmalloc(a * b)

    with:
    vmalloc(array_size(a, b))

    as well as handling cases of:

    vmalloc(a * b * c)

    with:

    vmalloc(array3_size(a, b, c))

    This does, however, attempt to ignore constant size factors like:

    vmalloc(4 * 1024)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    vmalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    vmalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    vmalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    vmalloc(
    - sizeof(TYPE) * (COUNT_ID)
    + array_size(COUNT_ID, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT_ID
    + array_size(COUNT_ID, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * (COUNT_CONST)
    + array_size(COUNT_CONST, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT_CONST
    + array_size(COUNT_CONST, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT_ID)
    + array_size(COUNT_ID, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT_ID
    + array_size(COUNT_ID, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT_CONST)
    + array_size(COUNT_CONST, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT_CONST
    + array_size(COUNT_CONST, sizeof(THING))
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    vmalloc(
    - SIZE * COUNT
    + array_size(COUNT, SIZE)
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    vmalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    vmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    vmalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    vmalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    vmalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    vmalloc(C1 * C2 * C3, ...)
    |
    vmalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2 factors products when they're not all constants.
    @@
    expression E1, E2;
    constant C1, C2;
    @@

    (
    vmalloc(C1 * C2, ...)
    |
    vmalloc(
    - E1 * E2
    + array_size(E1, E2)
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

29 May, 2018

1 commit

  • We should remove O_NONBLOCK flag when calling sock->ops->connect()
    in sctp_connect_to_sock() function.
    Why?
    1. up to now, sctp socket connect() function ignores the flag argument,
    that means O_NONBLOCK flag does not take effect, then we should remove
    it to avoid the confusion (but is not urgent).
    2. for the future, there will be a patch to fix this problem, then the flag
    argument will take effect, the patch has been queued at
    https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/net/sctp?id=644fbdeacf1d3edd366e44b8ba214de9d1dd66a9.
    But, the O_NONBLOCK flag will make sock->ops->connect() directly return
    without any wait time, then the connection will not be established, DLM kernel
    module will call sock->ops->connect() again and again, the bad results are,
    CPU usage is almost 100%, even trigger soft_lockup problem if the related
    configurations are enabled,
    DLM kernel module also prints lots of messages like,
    [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592
    [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592
    [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592
    [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592
    The upper application (e.g. ocfs2 mount command) is hanged at new_lockspace(),
    the whole backtrace is as below,
    tb0307-nd2:~ # cat /proc/2935/stack
    [] new_lockspace+0x957/0xac0 [dlm]
    [] dlm_new_lockspace+0xae/0x140 [dlm]
    [] user_cluster_connect+0xc3/0x3a0 [ocfs2_stack_user]
    [] ocfs2_cluster_connect+0x144/0x220 [ocfs2_stackglue]
    [] ocfs2_dlm_init+0x215/0x440 [ocfs2]
    [] ocfs2_fill_super+0xcb0/0x1290 [ocfs2]
    [] mount_bdev+0x173/0x1b0
    [] mount_fs+0x35/0x150
    [] vfs_kern_mount.part.23+0x54/0x100
    [] do_mount+0x59a/0xc40
    [] SyS_mount+0x80/0xd0
    [] do_syscall_64+0x76/0x140
    [] entry_SYSCALL_64_after_hwframe+0x42/0xb7
    [] 0xffffffffffffffff

    So, I think we should remove the O_NONBLOCK flag here, since the DLM kernel
    module cannot handle non-blocking sockets in connect() properly.

    Signed-off-by: Gang He
    Signed-off-by: David Teigland

    Gang He
     

02 May, 2018

2 commits

  • When the user sets up a two-ring cluster, the DLM kernel module
    will automatically select the SCTP protocol to communicate
    between each node. There will be about 5 minute hang in DLM
    kernel module, in case one ring is broken before switching to
    another ring, this will potentially affect the dependent upper
    applications, e.g. ocfs2, gfs2, clvm and clustered-MD, etc.
    Unfortunately, if the user setup a two-ring cluster, we can not
    specify DLM communication protocol with TCP explicitly, since
    DLM kernel module only supports SCTP protocol for multiple
    ring cluster.
    Based on my investigation, the time is spent in the sock->ops->connect()
    function before it returns the ETIMEDOUT(-110) error. Since the O_NONBLOCK
    argument in the connect() function does not work here, we should
    make the sock->ops->connect() function return within a specified time by
    setting the socket SO_SNDTIMEO attribute.

    Signed-off-by: Gang He
    Signed-off-by: David Teigland

    Gang He
     
  • There is a clerical error in the code that turns off Nagle's algorithm
    in the sctp_connect_to_sock() function; as a result, Nagle's algorithm
    is not actually turned off.
    After this correction, DLM performance is noticeably improved
    when using the SCTP protocol.

    Signed-off-by: Gang He
    Signed-off-by: Michal Kubecek
    Signed-off-by: David Teigland

    Gang He
     

13 Feb, 2018

1 commit

  • Changes since v1:
    Added changes in these files:
    drivers/infiniband/hw/usnic/usnic_transport.c
    drivers/staging/lustre/lnet/lnet/lib-socket.c
    drivers/target/iscsi/iscsi_target_login.c
    drivers/vhost/net.c
    fs/dlm/lowcomms.c
    fs/ocfs2/cluster/tcp.c
    security/tomoyo/network.c

    Before:
    All these functions either return a negative error indicator,
    or store length of sockaddr into "int *socklen" parameter
    and return zero on success.

    "int *socklen" parameter is awkward. For example, if caller does not
    care, it still needs to provide on-stack storage for the value
    it does not need.

    None of the many FOO_getname() functions of various protocols
    ever used old value of *socklen. They always just overwrite it.

    This change drops this parameter, and makes all these functions, on success,
    return length of sockaddr. It's always >= 0 and can be differentiated
    from an error.

    Tests in callers are changed from "if (err)" to "if (err < 0)", where needed.

    rpc_sockname() lost "int buflen" parameter, since its only use was
    to be passed to kernel_getsockname() as &buflen and subsequently
    not used in any way.

    Userspace API is not changed.

    text data bss dec hex filename
    30108430 2633624 873672 33615726 200ef6e vmlinux.before.o
    30108109 2633612 873672 33615393 200ee21 vmlinux.o

    Signed-off-by: Denys Vlasenko
    CC: David S. Miller
    CC: linux-kernel@vger.kernel.org
    CC: netdev@vger.kernel.org
    CC: linux-bluetooth@vger.kernel.org
    CC: linux-decnet-user@lists.sourceforge.net
    CC: linux-wireless@vger.kernel.org
    CC: linux-rdma@vger.kernel.org
    CC: linux-sctp@vger.kernel.org
    CC: linux-nfs@vger.kernel.org
    CC: linux-x25@vger.kernel.org
    Signed-off-by: David S. Miller

    Denys Vlasenko
     

12 Feb, 2018

1 commit

  • This is the mindless scripted replacement of kernel use of POLL*
    variables as described by Al, done by this script:

    for V in IN OUT PRI ERR RDNORM RDBAND WRNORM WRBAND HUP RDHUP NVAL MSG; do
    L=`git grep -l -w POLL$V | grep -v '^t' | grep -v /um/ | grep -v '^sa' | grep -v '/poll.h$'|grep -v '^D'`
    for f in $L; do sed -i "-es/^\([^\"]*\)\(\<POLL$V\>\)/\\1E\\2/" $f; done
    done

    with de-mangling cleanups yet to come.

    NOTE! On almost all architectures, the EPOLL* constants have the same
    values as the POLL* constants do. But the keyword here is "almost".
    For various bad reasons they aren't the same, and epoll() doesn't
    actually work quite correctly in some cases due to this on Sparc et al.

    The next patch from Al will sort out the final differences, and we
    should be all done.

    Scripted-by: Al Viro
    Signed-off-by: Linus Torvalds

    Linus Torvalds
     

31 Jan, 2018

1 commit

  • Pull kern_recvmsg reduction from Al Viro:
    "kernel_recvmsg() is a set_fs()-using wrapper for sock_recvmsg(). In
    all but one case that is not needed - use of ITER_KVEC for ->msg_iter
    takes care of the data and does not care about set_fs(). The only
    exception is svc_udp_recvfrom() where we want cmsg to be store into
    kernel object; everything else can just use sock_recvmsg() and be done
    with that.

    A followup converting svc_udp_recvfrom() away from set_fs() (and
    killing kernel_recvmsg() off) is *NOT* in here - I'd like to hear what
    netdev folks think of the approach proposed in that followup)"

    * 'work.sock_recvmsg' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
    tipc: switch to sock_recvmsg()
    smc: switch to sock_recvmsg()
    ipvs: switch to sock_recvmsg()
    mISDN: switch to sock_recvmsg()
    drbd: switch to sock_recvmsg()
    lustre lnet_sock_read(): switch to sock_recvmsg()
    ocfs2: switch to sock_recvmsg()
    ncpfs: switch to sock_recvmsg()
    dlm: switch to sock_recvmsg()
    svc_recvfrom(): switch to sock_recvmsg()

    Linus Torvalds
     

03 Dec, 2017

1 commit


28 Nov, 2017

1 commit


15 Nov, 2017

2 commits

  • Pull configfs updates from Christoph Hellwig:
    "A couple of configfs cleanups:

    - proper use of the bool type (Thomas Meyer)

    - constification of struct config_item_type (Bhumika Goyal)"

    * tag 'configfs-for-4.15' of git://git.infradead.org/users/hch/configfs:
    RDMA/cma: make config_item_type const
    stm class: make config_item_type const
    ACPI: configfs: make config_item_type const
    nvmet: make config_item_type const
    usb: gadget: configfs: make config_item_type const
    PCI: endpoint: make config_item_type const
    iio: make function argument and some structures const
    usb: gadget: make config_item_type structures const
    dlm: make config_item_type const
    netconsole: make config_item_type const
    nullb: make config_item_type const
    ocfs2/cluster: make config_item_type const
    target: make config_item_type const
    configfs: make ci_type field, some pointers and function arguments const
    configfs: make config_item_type const
    configfs: Fix bool initialization/comparison

    Linus Torvalds
     
  • Pull dlm updates from David Teigland:
    "This set focuses, as usual, on fixes to the comms layer.

    New testing of the dlm with ocfs2 uncovered a number of bugs in the
    TCP connection handling during recovery, starting, and stopping"

    * tag 'dlm-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm:
    dlm: remove dlm_send_rcom_lookup_dump
    dlm: recheck kthread_should_stop() before schedule()
    DLM: fix NULL pointer dereference in send_to_sock()
    DLM: fix to reschedule rwork
    DLM: fix to use sk_callback_lock correctly
    DLM: fix overflow dlm_cb_seq
    DLM: fix memory leak in tcp_accept_from_sock()
    DLM: fix conversion deadlock when DLM_LKF_NODLCKWT flag is set
    DLM: use CF_CLOSE flag to stop dlm_send correctly
    DLM: Reanimate CF_WRITE_PENDING flag
    DLM: fix race condition between dlm_recoverd_stop and dlm_recoverd
    DLM: close othercon at send/receive error
    DLM: retry rcom when dlm_wait_function is timed out.
    DLM: fix to use sock_mutex correctly in xxx_accept_from_sock
    DLM: fix race condition between dlm_send and dlm_recv
    DLM: fix double list_del()
    DLM: fix remove save_cb argument from add_sock()
    DLM: Fix saving of NULL callbacks
    DLM: Eliminate CF_WRITE_PENDING flag
    DLM: Eliminate CF_CONNECT_PENDING flag

    Linus Torvalds
     

02 Nov, 2017

1 commit

  • Many source files in the tree are missing licensing information, which
    makes it harder for compliance tools to determine the correct license.

    By default all files without license information are under the default
    license of the kernel, which is GPL version 2.

    Update the files which contain no license information with the 'GPL-2.0'
    SPDX license identifier. The SPDX identifier is a legally binding
    shorthand, which can be used instead of the full boiler plate text.

    This patch is based on work done by Thomas Gleixner and Kate Stewart and
    Philippe Ombredanne.

    How this work was done:

    Patches were generated and checked against linux-4.14-rc6 for a subset of
    the use cases:
    - file had no licensing information in it.
    - file was a */uapi/* one with no licensing information in it,
    - file was a */uapi/* one with existing licensing information,

    Further patches will be generated in subsequent months to fix up cases
    where non-standard license headers were used, and references to license
    had to be inferred by heuristics based on keywords.

    The analysis to determine which SPDX License Identifier to be applied to
    a file was done in a spreadsheet of side by side results from the
    output of two independent scanners (ScanCode & Windriver) producing SPDX
    tag:value files created by Philippe Ombredanne. Philippe prepared the
    base worksheet, and did an initial spot review of a few 1000 files.

    The 4.13 kernel was the starting point of the analysis with 60,537 files
    assessed. Kate Stewart did a file by file comparison of the scanner
    results in the spreadsheet to determine which SPDX license identifier(s)
    to be applied to the file. She confirmed any determination that was not
    immediately clear with lawyers working with the Linux Foundation.

    Criteria used to select files for SPDX license identifier tagging was:
    - Files considered eligible had to be source code files.
    - Make and config files were included as candidates if they contained >5
    lines of source
    - File already had some variant of a license header in it (even if <5
    lines).
    Reviewed-by: Philippe Ombredanne
    Reviewed-by: Thomas Gleixner
    Signed-off-by: Greg Kroah-Hartman

    Greg Kroah-Hartman
     

19 Oct, 2017

1 commit

  • Make config_item_type structures const as they are either passed to a
    function having the argument as const or stored in the const "ci_type"
    field of a config_item structure.

    Done using Coccinelle.

    Signed-off-by: Bhumika Goyal
    Signed-off-by: Christoph Hellwig

    Bhumika Goyal
     

09 Oct, 2017

1 commit

  • This function was only for debugging. It would be
    called in a condition that should not happen, and
    should probably have been removed from the final
    version of the original commit.

    Remove it because it does mutex lock under spin lock.

    Signed-off-by: David Teigland

    David Teigland
     

26 Sep, 2017

19 commits

  • Call schedule() here could make the thread miss wake
    up from kthread_stop(), so it is better to recheck
    kthread_should_stop() before call schedule(), a symptom
    happened when I run indefinite test (which mostly created
    clustered raid1, assemble it in other nodes, then stop
    them) of clustered raid.

    $ ps aux|grep md|grep D
    root 4211 0.0 0.0 19760 2220 ? Ds 02:58 0:00 mdadm -Ssq
    $ cat /proc/4211/stack
    kthread_stop+0x4d/0x150
    dlm_recoverd_stop+0x15/0x20 [dlm]
    dlm_release_lockspace+0x2ab/0x460 [dlm]
    leave+0xbf/0x150 [md_cluster]
    md_cluster_stop+0x18/0x30 [md_mod]
    bitmap_free+0x12e/0x140 [md_mod]
    bitmap_destroy+0x7f/0x90 [md_mod]
    __md_stop+0x21/0xa0 [md_mod]
    do_md_stop+0x15f/0x5c0 [md_mod]
    md_ioctl+0xa65/0x18a0 [md_mod]
    blkdev_ioctl+0x49e/0x8d0
    block_ioctl+0x41/0x50
    do_vfs_ioctl+0x96/0x5b0
    SyS_ioctl+0x79/0x90
    entry_SYSCALL_64_fastpath+0x1e/0xad

    This may not resolve the issue completely, since the
    KTHREAD_SHOULD_STOP flag could be set between "break"
    and "schedule", but at least the chance of the symptom
    happening is reduced a lot (the indefinite test runs for
    more than 20 hours without problem, whereas it happens easily
    without the change).

    Signed-off-by: Guoqing Jiang
    Signed-off-by: David Teigland

    Guoqing Jiang
     
  • The writequeue and writequeue_lock member of othercon was not initialized.
    If lowcomms_state_change() is called from network layer, othercon->swork
    may be scheduled. In this case, send_to_sock() will generate a NULL pointer
    reference. We avoid this problem by correctly initializing writequeue and
    writequeue_lock member of othercon.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • When an error occurs in kernel_recvmsg or kernel_sendpage and
    close_connection is called and receive work is already scheduled,
    receive work is canceled. In that case, the receive work will not
    be scheduled forever after reconnection, because CF_READ_PENDING
    flag is established.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • In the current implementation, we think that exclusion control between
    processing to set the callback function to the connection structure and
    processing to refer to the connection structure from the callback function
    was not enough. We fix them.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • dlm_cb_seq is 64 bits. If dlm_cb_seq overflows and returns to 0,
    dlm_rem_lkb_callback() will not work properly.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • The sk member of the socket generated by sock_create_kern() is overwritten
    by ops->accept(). So the previous sk will not be released.
    We use kernel_accept() instead of sock_create_kern() and ops->accept().

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • When the DLM_LKF_NODLCKWT flag was set, even if conversion deadlock
    was detected, the caller of can_be_granted() was unknown.
    We change the behavior of can_be_granted() and change it to detect
    conversion deadlock regardless of whether the DLM_LKF_NODLCKWT flag
    is set or not. And depending on whether the DLM_LKF_NODLCKWT flag
    is set or not, we change the behavior at the caller of can_be_granted().

    This fix has no effect except when using DLM_LKF_NODLCKWT flag.
    Currently, ocfs2 uses the DLM_LKF_NODLCKWT flag and does not expect a
    cancel operation from conversion deadlock when calling dlm_lock().
    ocfs2 is implemented to perform a cancel operation by requesting
    BASTs (callback).

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • If reconnection fails while executing dlm_lowcomms_stop,
    dlm_send will not stop.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • CF_WRITE_PENDING flag has been reanimated to make dlm_send stop properly
    when running dlm_lowcomms_stop.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • When dlm_recoverd_stop() is called between kthread_should_stop() and
    set_task_state(TASK_INTERRUPTIBLE), dlm_recoverd will not wake up.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • If an error occurs in the sending / receiving process, if othercon
    exists, sending / receiving processing using othercon may also result
    in an error. We fix to pre-close othercon as well.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • If a node sends a DLM_RCOM_STATUS command and an error occurs on the
    receiving side, the DLM_RCOM_STATUS_REPLY response may not be returned.
    We retransmitted the DLM_RCOM_STATUS command so that we do not wait for
    an infinite response.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • In the current implementation, we think that exclusion control
    for othercon in tcp_accept_from_sock() and sctp_accept_from_sock()
    was not enough. We fix them.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • When kernel_sendpage(in send_to_sock) and kernel_recvmsg
    (in receive_from_sock) return error, close_connection may work at the
    same time. At that time, they may wait for each other by cancel_work_sync.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • dlm_lowcomms_stop() was not functioning properly. Correctly, we have to
    wait until all processing is finished with send_workqueue and
    recv_workqueue.
    This problem causes the following issue. Scenario is

    1. dlm_send thread:
    send_to_sock refers con->writequeue
    2. main thread:
    dlm_lowcomms_stop calls list_del
    3. dlm_send thread:
    send_to_sock calls list_del in writequeue_entry_complete

    [ 1925.770305] dlm: canceled swork for node 4
    [ 1925.772374] general protection fault: 0000 [#1] SMP
    [ 1925.777930] Modules linked in: ocfs2_stack_user ocfs2 ocfs2_nodemanager ocfs2_stackglue dlm fmxnet(O) fmx_api(O) fmx_cu(O) igb(O) kvm_intel kvm irqbypass autofs4
    [ 1925.794131] CPU: 3 PID: 6994 Comm: kworker/u8:0 Tainted: G O 4.4.39 #1
    [ 1925.802684] Hardware name: TOSHIBA OX/OX, BIOS OX-P0015 12/03/2015
    [ 1925.809595] Workqueue: dlm_send process_send_sockets [dlm]
    [ 1925.815714] task: ffff8804398d3c00 ti: ffff88046910c000 task.ti: ffff88046910c000
    [ 1925.824072] RIP: 0010:[] [] process_send_sockets+0xf8/0x280 [dlm]
    [ 1925.834480] RSP: 0018:ffff88046910fde0 EFLAGS: 00010246
    [ 1925.840411] RAX: dead000000000200 RBX: 0000000000000001 RCX: 000000000000000a
    [ 1925.848372] RDX: ffff88046bd980c0 RSI: 0000000000000000 RDI: ffff8804673c5670
    [ 1925.856341] RBP: ffff88046910fe20 R08: 00000000000000c9 R09: 0000000000000010
    [ 1925.864311] R10: ffffffff81e22fc0 R11: 0000000000000000 R12: ffff8804673c56d8
    [ 1925.872281] R13: ffff8804673c5660 R14: ffff88046bd98440 R15: 0000000000000058
    [ 1925.880251] FS: 0000000000000000(0000) GS:ffff88047fd80000(0000) knlGS:0000000000000000
    [ 1925.889280] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
    [ 1925.895694] CR2: 00007fff09eadf58 CR3: 00000004690f5000 CR4: 00000000001006e0
    [ 1925.903663] Stack:
    [ 1925.905903] ffff8804673c5630 ffff8804673c5620 ffff8804673c5670 ffff88007d219b40
    [ 1925.914181] ffff88046f095800 0000000000000100 ffff8800717a1400 ffff8804673c56d8
    [ 1925.922459] ffff88046910fe60 ffffffff81073db2 00ff880400000000 ffff88007d219b40
    [ 1925.930736] Call Trace:
    [ 1925.933468] [] process_one_work+0x162/0x450
    [ 1925.939983] [] worker_thread+0x69/0x4a0
    [ 1925.946109] [] ? rescuer_thread+0x350/0x350
    [ 1925.952622] [] kthread+0xef/0x110
    [ 1925.958165] [] ? kthread_park+0x60/0x60
    [ 1925.964283] [] ret_from_fork+0x3f/0x70
    [ 1925.970312] [] ? kthread_park+0x60/0x60
    [ 1925.976436] Code: 01 00 00 48 8b 7d d0 e8 07 d3 3a e1 45 01 7e 18 45 29 7e 1c 75 ab 41 8b 46 24 85 c0 75 a3 49 8b 16 49 8b 46 08 31 f6 48 89 42 08 89 10 48 b8 00 01 00 00 00 00 ad de 49 8b 7e 10 49 89 06 66
    [ 1925.997791] RIP [] process_send_sockets+0xf8/0x280 [dlm]
    [ 1926.005577] RSP

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • save_cb argument is not used. We remove them.

    Signed-off-by: Tadashi Miyauchi
    Signed-off-by: Tsutomu Owa
    Signed-off-by: David Teigland

    tsutomu.owa@toshiba.co.jp
     
  • In a previous patch I noted that accept() often copies the struct
    sock (sk) which overwrites the sock callbacks. However, in testing
    we discovered that the dlm connection structures (con) are sometimes
    deleted and recreated as connections come and go, and since they're
    zeroed out by kmem_cache_zalloc, the saved callback pointers are
    also initialized to zero. But with today's DLM code, the callbacks
    are only saved when a socket is added.

    During recovery testing, we discovered a common situation in which
    the new con is initialized to zero, then a socket is added after
    accept(). In this case, the sock's saved values are all NULL, but
    the saved values are wiped out, due to accept(). Therefore, we
    don't have a known good copy of the callbacks from which we can
    restore.

    Since the struct sock callbacks are always good after listen(),
    this patch saves the known good values after listen(). These good
    values are then used for subsequent restores.

    Signed-off-by: Bob Peterson
    Reviewed-by: Tadashi Miyauchi
    Signed-off-by: David Teigland

    Bob Peterson
     
  • Signed-off-by: Bob Peterson
    Reviewed-by: Tadashi Miyauchi
    Signed-off-by: David Teigland

    Bob Peterson
     
  • Before this patch, there was a flag in the con structure that was
    used to determine whether or not a connect was needed. The bit was
    set here and there, and cleared here and there, so it left some
    race conditions: the bit was set, work was queued, then the worker
    cleared the bit, allowing someone else to set it while the worker
    ran. For the most part, this worked okay, but we got into trouble
    if connections were lost and it needed to reconnect.

    This patch eliminates the flag in favor of simply checking if we
    actually have a sock pointer while protected by the mutex.

    Signed-off-by: Bob Peterson
    Reviewed-by: Tadashi Miyauchi
    Signed-off-by: David Teigland

    Bob Peterson
     

07 Sep, 2017

1 commit

  • Pull file locking updates from Jeff Layton:
    "This pile just has a few file locking fixes from Ben Coddington. There
    are a couple of cleanup patches + an attempt to bring sanity to the
    l_pid value that is reported back to userland on an F_GETLK request.

    After a few gyrations, he came up with a way for filesystems to
    communicate to the VFS layer code whether the pid should be translated
    according to the namespace or presented as-is to userland"

    * tag 'locks-v4.14-1' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux:
    locks: restore a warn for leaked locks on close
    fs/locks: Remove fl_nspid and use fs-specific l_pid for remote locks
    fs/locks: Use allocation rather than the stack in fcntl_getlk()

    Linus Torvalds
     

08 Aug, 2017

1 commit

  • With commit 0ffdaf5b41cf ("net/sock: add WARN_ON(parent->sk)
    in sock_graft()"), a calltrace happened as follows:

    [ 457.018340] WARNING: CPU: 0 PID: 15623 at ./include/net/sock.h:1703 inet_accept+0x135/0x140
    ...
    [ 457.018381] RIP: 0010:inet_accept+0x135/0x140
    [ 457.018381] RSP: 0018:ffffc90001727d18 EFLAGS: 00010286
    [ 457.018383] RAX: 0000000000000001 RBX: ffff880012413000 RCX: 0000000000000001
    [ 457.018384] RDX: 000000000000018a RSI: 00000000fffffe01 RDI: ffffffff8156fae8
    [ 457.018384] RBP: ffffc90001727d38 R08: 0000000000000000 R09: 0000000000004305
    [ 457.018385] R10: 0000000000000001 R11: 0000000000004304 R12: ffff880035ae7a00
    [ 457.018386] R13: ffff88001282af10 R14: ffff880034e4e200 R15: 0000000000000000
    [ 457.018387] FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000
    [ 457.018388] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [ 457.018389] CR2: 00007fdec22f9000 CR3: 0000000002b5a000 CR4: 00000000000006f0
    [ 457.018395] Call Trace:
    [ 457.018402] tcp_accept_from_sock.part.8+0x12d/0x449 [dlm]
    [ 457.018405] ? vprintk_emit+0x248/0x2d0
    [ 457.018409] tcp_accept_from_sock+0x3f/0x50 [dlm]
    [ 457.018413] process_recv_sockets+0x3b/0x50 [dlm]
    [ 457.018415] process_one_work+0x138/0x370
    [ 457.018417] worker_thread+0x4d/0x3b0
    [ 457.018419] kthread+0x109/0x140
    [ 457.018421] ? rescuer_thread+0x320/0x320
    [ 457.018422] ? kthread_park+0x60/0x60
    [ 457.018424] ret_from_fork+0x25/0x30

    Since newsocket created by sock_create_kern sets its
    sock by the path:

    sock_create_kern -> __sock_create
    ->pf->create => inet_create
    -> sock_init_data

    Then WARN_ON is triggered by "con->sock->ops->accept =>
    inet_accept -> sock_graft", it also means newsock->sk
    is leaked since sock_graft will replace it with a new
    sk.

    To resolve the issue, we need to use sock_create_lite
    instead of sock_create_kern, like commit 0933a578cd55
    ("rds: tcp: use sock_create_lite() to create the accept
    socket") did.

    Reported-by: Zhilong Liu
    Signed-off-by: Guoqing Jiang
    Signed-off-by: David Teigland

    Guoqing Jiang