25 Jul, 2008

27 commits

  • Removes the size parameter from the new epoll_create syscall and renames the
    syscall itself. The updated test program follows.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_epoll_create2
    # ifdef __x86_64__
    # define __NR_epoll_create2 291
    # elif defined __i386__
    # define __NR_epoll_create2 329
    # else
    # error "need __NR_epoll_create2"
    # endif
    #endif

    #define EPOLL_CLOEXEC O_CLOEXEC

    int
    main (void)
    {
    int fd = syscall (__NR_epoll_create2, 0);
    if (fd == -1)
    {
    puts ("epoll_create2(0) failed");
    return 1;
    }
    int coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    puts ("epoll_create2(0) set close-on-exec flag");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_epoll_create2, EPOLL_CLOEXEC);
    if (fd == -1)
    {
    puts ("epoll_create2(EPOLL_CLOEXEC) failed");
    return 1;
    }
    coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds tests that ensure the boundary conditions for the various
    constants introduced in the previous patches are met. No code is generated.

    [akpm@linux-foundation.org: fix alpha]
    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds non-blocking support for inotify_init1. The
    additional changes needed are minimal.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_inotify_init1
    # ifdef __x86_64__
    # define __NR_inotify_init1 294
    # elif defined __i386__
    # define __NR_inotify_init1 332
    # else
    # error "need __NR_inotify_init1"
    # endif
    #endif

    #define IN_NONBLOCK O_NONBLOCK

    int
    main (void)
    {
    int fd = syscall (__NR_inotify_init1, 0);
    if (fd == -1)
    {
    puts ("inotify_init1(0) failed");
    return 1;
    }
    int fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (fl & O_NONBLOCK)
    {
    puts ("inotify_init1(0) set non-blocking mode");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_inotify_init1, IN_NONBLOCK);
    if (fd == -1)
    {
    puts ("inotify_init1(IN_NONBLOCK) failed");
    return 1;
    }
    fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((fl & O_NONBLOCK) == 0)
    {
    puts ("inotify_init1(IN_NONBLOCK) set non-blocking mode");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds O_NONBLOCK support to pipe2. It is minimally more involved
    than the patches for eventfd et al. but still trivial. The interfaces of the
    create_write_pipe and create_read_pipe helper functions were changed, and
    their one other caller was updated as well.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_pipe2
    # ifdef __x86_64__
    # define __NR_pipe2 293
    # elif defined __i386__
    # define __NR_pipe2 331
    # else
    # error "need __NR_pipe2"
    # endif
    #endif

    int
    main (void)
    {
    int fds[2];
    if (syscall (__NR_pipe2, fds, 0) == -1)
    {
    puts ("pipe2(0) failed");
    return 1;
    }
    for (int i = 0; i < 2; ++i)
    {
    int fl = fcntl (fds[i], F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (fl & O_NONBLOCK)
    {
    printf ("pipe2(0) set non-blocking mode for fds[%d]\n", i);
    return 1;
    }
    close (fds[i]);
    }

    if (syscall (__NR_pipe2, fds, O_NONBLOCK) == -1)
    {
    puts ("pipe2(O_NONBLOCK) failed");
    return 1;
    }
    for (int i = 0; i < 2; ++i)
    {
    int fl = fcntl (fds[i], F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((fl & O_NONBLOCK) == 0)
    {
    printf ("pipe2(O_NONBLOCK) does not set non-blocking mode for fds[%d]\n", i);
    return 1;
    }
    close (fds[i]);
    }

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds support for the TFD_NONBLOCK flag to timerfd_create. The
    additional changes needed are minimal.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_timerfd_create
    # ifdef __x86_64__
    # define __NR_timerfd_create 283
    # elif defined __i386__
    # define __NR_timerfd_create 322
    # else
    # error "need __NR_timerfd_create"
    # endif
    #endif

    #define TFD_NONBLOCK O_NONBLOCK

    int
    main (void)
    {
    int fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, 0);
    if (fd == -1)
    {
    puts ("timerfd_create(0) failed");
    return 1;
    }
    int fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (fl & O_NONBLOCK)
    {
    puts ("timerfd_create(0) set non-blocking mode");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, TFD_NONBLOCK);
    if (fd == -1)
    {
    puts ("timerfd_create(TFD_NONBLOCK) failed");
    return 1;
    }
    fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((fl & O_NONBLOCK) == 0)
    {
    puts ("timerfd_create(TFD_NONBLOCK) set non-blocking mode");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds support for the EFD_NONBLOCK flag to eventfd2. The
    additional changes needed are minimal.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_eventfd2
    # ifdef __x86_64__
    # define __NR_eventfd2 290
    # elif defined __i386__
    # define __NR_eventfd2 328
    # else
    # error "need __NR_eventfd2"
    # endif
    #endif

    #define EFD_NONBLOCK O_NONBLOCK

    int
    main (void)
    {
    int fd = syscall (__NR_eventfd2, 1, 0);
    if (fd == -1)
    {
    puts ("eventfd2(0) failed");
    return 1;
    }
    int fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (fl & O_NONBLOCK)
    {
    puts ("eventfd2(0) sets non-blocking mode");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_eventfd2, 1, EFD_NONBLOCK);
    if (fd == -1)
    {
    puts ("eventfd2(EFD_NONBLOCK) failed");
    return 1;
    }
    fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((fl & O_NONBLOCK) == 0)
    {
    puts ("eventfd2(EFD_NONBLOCK) does not set non-blocking mode");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds support for the SFD_NONBLOCK flag to signalfd4. The
    additional changes needed are minimal.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_signalfd4
    # ifdef __x86_64__
    # define __NR_signalfd4 289
    # elif defined __i386__
    # define __NR_signalfd4 327
    # else
    # error "need __NR_signalfd4"
    # endif
    #endif

    #define SFD_NONBLOCK O_NONBLOCK

    int
    main (void)
    {
    sigset_t ss;
    sigemptyset (&ss);
    sigaddset (&ss, SIGUSR1);
    int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0);
    if (fd == -1)
    {
    puts ("signalfd4(0) failed");
    return 1;
    }
    int fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (fl & O_NONBLOCK)
    {
    puts ("signalfd4(0) set non-blocking mode");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_NONBLOCK);
    if (fd == -1)
    {
    puts ("signalfd4(SFD_NONBLOCK) failed");
    return 1;
    }
    fl = fcntl (fd, F_GETFL);
    if (fl == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((fl & O_NONBLOCK) == 0)
    {
    puts ("signalfd4(SFD_NONBLOCK) does not set non-blocking mode");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • Building on the previous change to anon_inode_getfd, this patch introduces
    support for handling of O_NONBLOCK in addition to the already supported
    O_CLOEXEC. Following patches will take advantage of this support. As can be
    seen, the additional code needed to support this functionality is minimal.

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch introduces the new syscall inotify_init1 (note: the 1 stands for
    the one parameter the syscall takes, as opposed to no parameter before). The
    values accepted for this parameter are function-specific and defined in the
    inotify.h header. Here the values must match the O_* flags, though. In this
    patch CLOEXEC support is introduced.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_inotify_init1
    # ifdef __x86_64__
    # define __NR_inotify_init1 294
    # elif defined __i386__
    # define __NR_inotify_init1 332
    # else
    # error "need __NR_inotify_init1"
    # endif
    #endif

    #define IN_CLOEXEC O_CLOEXEC

    int
    main (void)
    {
    int fd;
    fd = syscall (__NR_inotify_init1, 0);
    if (fd == -1)
    {
    puts ("inotify_init1(0) failed");
    return 1;
    }
    int coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    puts ("inotify_init1(0) set close-on-exit");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_inotify_init1, IN_CLOEXEC);
    if (fd == -1)
    {
    puts ("inotify_init1(IN_CLOEXEC) failed");
    return 1;
    }
    coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    puts ("inotify_init1(O_CLOEXEC) does not set close-on-exit");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    [akpm@linux-foundation.org: add sys_ni stub]
    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch introduces the new syscall pipe2 which is like pipe but it also
    takes an additional parameter which takes a flag value. This patch implements
    the handling of O_CLOEXEC for the flag. I did not add support for the new
    syscall for the architectures which have a special sys_pipe implementation. I
    think the maintainers of those archs have the chance to go with the unified
    implementation but that's up to them.

    The implementation introduces do_pipe_flags. I did that instead of changing
    all callers of do_pipe because some of the callers are written in assembler.
    I would probably screw up changing the assembly code. To avoid breaking code
    do_pipe is now a small wrapper around do_pipe_flags. Once all callers are
    changed over to do_pipe_flags the old do_pipe function can be removed.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_pipe2
    # ifdef __x86_64__
    # define __NR_pipe2 293
    # elif defined __i386__
    # define __NR_pipe2 331
    # else
    # error "need __NR_pipe2"
    # endif
    #endif

    int
    main (void)
    {
    int fd[2];
    if (syscall (__NR_pipe2, fd, 0) != 0)
    {
    puts ("pipe2(0) failed");
    return 1;
    }
    for (int i = 0; i < 2; ++i)
    {
    int coe = fcntl (fd[i], F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    printf ("pipe2(0) set close-on-exit for fd[%d]\n", i);
    return 1;
    }
    }
    close (fd[0]);
    close (fd[1]);

    if (syscall (__NR_pipe2, fd, O_CLOEXEC) != 0)
    {
    puts ("pipe2(O_CLOEXEC) failed");
    return 1;
    }
    for (int i = 0; i < 2; ++i)
    {
    int coe = fcntl (fd[i], F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    printf ("pipe2(O_CLOEXEC) does not set close-on-exit for fd[%d]\n", i);
    return 1;
    }
    }
    close (fd[0]);
    close (fd[1]);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds the new dup3 syscall. It extends the old dup2 syscall by one
    parameter which is meant to hold a flag value. Support for the O_CLOEXEC flag
    is added in this patch.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_dup3
    # ifdef __x86_64__
    # define __NR_dup3 292
    # elif defined __i386__
    # define __NR_dup3 330
    # else
    # error "need __NR_dup3"
    # endif
    #endif

    int
    main (void)
    {
    int fd = syscall (__NR_dup3, 1, 4, 0);
    if (fd == -1)
    {
    puts ("dup3(0) failed");
    return 1;
    }
    int coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    puts ("dup3(0) set close-on-exec flag");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_dup3, 1, 4, O_CLOEXEC);
    if (fd == -1)
    {
    puts ("dup3(O_CLOEXEC) failed");
    return 1;
    }
    coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    puts ("dup3(O_CLOEXEC) set close-on-exec flag");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds the new epoll_create2 syscall. It extends the old epoll_create
    syscall by one parameter which is meant to hold a flag value. In this
    patch the only flag supported is EPOLL_CLOEXEC which causes the close-on-exec
    flag for the returned file descriptor to be set.

    A new name EPOLL_CLOEXEC is introduced which in this implementation must
    have the same value as O_CLOEXEC.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_epoll_create2
    # ifdef __x86_64__
    # define __NR_epoll_create2 291
    # elif defined __i386__
    # define __NR_epoll_create2 329
    # else
    # error "need __NR_epoll_create2"
    # endif
    #endif

    #define EPOLL_CLOEXEC O_CLOEXEC

    int
    main (void)
    {
    int fd = syscall (__NR_epoll_create2, 1, 0);
    if (fd == -1)
    {
    puts ("epoll_create2(0) failed");
    return 1;
    }
    int coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    puts ("epoll_create2(0) set close-on-exec flag");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_epoll_create2, 1, EPOLL_CLOEXEC);
    if (fd == -1)
    {
    puts ("epoll_create2(EPOLL_CLOEXEC) failed");
    return 1;
    }
    coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • The timerfd_create syscall already has a flags parameter. It just is
    unused so far. This patch changes this by introducing the TFD_CLOEXEC
    flag to set the close-on-exec flag for the returned file descriptor.

    A new name TFD_CLOEXEC is introduced which in this implementation must
    have the same value as O_CLOEXEC.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_timerfd_create
    # ifdef __x86_64__
    # define __NR_timerfd_create 283
    # elif defined __i386__
    # define __NR_timerfd_create 322
    # else
    # error "need __NR_timerfd_create"
    # endif
    #endif

    #define TFD_CLOEXEC O_CLOEXEC

    int
    main (void)
    {
    int fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, 0);
    if (fd == -1)
    {
    puts ("timerfd_create(0) failed");
    return 1;
    }
    int coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    puts ("timerfd_create(0) set close-on-exec flag");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, TFD_CLOEXEC);
    if (fd == -1)
    {
    puts ("timerfd_create(TFD_CLOEXEC) failed");
    return 1;
    }
    coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    puts ("timerfd_create(TFD_CLOEXEC) set close-on-exec flag");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds the new eventfd2 syscall. It extends the old eventfd
    syscall by one parameter which is meant to hold a flag value. In this
    patch the only flag supported is EFD_CLOEXEC which causes the close-on-exec
    flag for the returned file descriptor to be set.

    A new name EFD_CLOEXEC is introduced which in this implementation must
    have the same value as O_CLOEXEC.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_eventfd2
    # ifdef __x86_64__
    # define __NR_eventfd2 290
    # elif defined __i386__
    # define __NR_eventfd2 328
    # else
    # error "need __NR_eventfd2"
    # endif
    #endif

    #define EFD_CLOEXEC O_CLOEXEC

    int
    main (void)
    {
    int fd = syscall (__NR_eventfd2, 1, 0);
    if (fd == -1)
    {
    puts ("eventfd2(0) failed");
    return 1;
    }
    int coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    puts ("eventfd2(0) sets close-on-exec flag");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_eventfd2, 1, EFD_CLOEXEC);
    if (fd == -1)
    {
    puts ("eventfd2(EFD_CLOEXEC) failed");
    return 1;
    }
    coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    puts ("eventfd2(EFD_CLOEXEC) does not set close-on-exec flag");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    [akpm@linux-foundation.org: add sys_ni stub]
    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch adds the new signalfd4 syscall. It extends the old signalfd
    syscall by one parameter which is meant to hold a flag value. In this
    patch the only flag supported is SFD_CLOEXEC which causes the close-on-exec
    flag for the returned file descriptor to be set.

    A new name SFD_CLOEXEC is introduced which in this implementation must
    have the same value as O_CLOEXEC.

    The following test must be adjusted for architectures other than x86 and
    x86-64 and in case the syscall numbers changed.

    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_signalfd4
    # ifdef __x86_64__
    # define __NR_signalfd4 289
    # elif defined __i386__
    # define __NR_signalfd4 327
    # else
    # error "need __NR_signalfd4"
    # endif
    #endif

    #define SFD_CLOEXEC O_CLOEXEC

    int
    main (void)
    {
    sigset_t ss;
    sigemptyset (&ss);
    sigaddset (&ss, SIGUSR1);
    int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0);
    if (fd == -1)
    {
    puts ("signalfd4(0) failed");
    return 1;
    }
    int coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if (coe & FD_CLOEXEC)
    {
    puts ("signalfd4(0) set close-on-exec flag");
    return 1;
    }
    close (fd);

    fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC);
    if (fd == -1)
    {
    puts ("signalfd4(SFD_CLOEXEC) failed");
    return 1;
    }
    coe = fcntl (fd, F_GETFD);
    if (coe == -1)
    {
    puts ("fcntl failed");
    return 1;
    }
    if ((coe & FD_CLOEXEC) == 0)
    {
    puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag");
    return 1;
    }
    close (fd);

    puts ("OK");

    return 0;
    }
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    [akpm@linux-foundation.org: add sys_ni stub]
    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • This patch just extends the anon_inode_getfd interface to take an additional
    parameter with a flag value. The flag value is passed on to
    get_unused_fd_flags in anticipation of its use with the O_CLOEXEC flag.

    No actual semantic changes here, the changed callers all pass 0 for now.

    [akpm@linux-foundation.org: KVM fix]
    Signed-off-by: Ulrich Drepper
    Acked-by: Davide Libenzi
    Cc: Michael Kerrisk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Ulrich Drepper
     
  • Signed-off-by: Akinobu Mita
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Akinobu Mita
     
  • Adds a check for an overflow in the filesystem size, so that if someone
    calls statfs() on a 16G blocksize hugetlbfs from a 32-bit binary, it
    reports EOVERFLOW instead of a size of 0.

    Acked-by: Nishanth Aravamudan
    Signed-off-by: Jon Tollefson
    Signed-off-by: Nick Piggin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Jon Tollefson
     
  • Add the ability to configure the hugetlb hstate used on a per mount basis.

    - Add a new pagesize= option to the hugetlbfs mount that allows setting
    the page size
    - This option causes the mount code to find the hstate corresponding to the
    specified size, and sets up a pointer to the hstate in the mount's
    superblock.
    - Change the hstate accessors to use this information rather than the
    global_hstate they were using (requires a slight change in mm/memory.c
    so we don't NULL deref in the error-unmap path -- see comments).

    [np: take hstate out of hugetlbfs inode and vma->vm_private_data]

    Acked-by: Adam Litke
    Acked-by: Nishanth Aravamudan
    Signed-off-by: Andi Kleen
    Signed-off-by: Nick Piggin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andi Kleen
     
  • The goal of this patchset is to support multiple hugetlb page sizes. This
    is achieved by introducing a new struct hstate structure, which
    encapsulates the important hugetlb state and constants (eg. huge page
    size, number of huge pages currently allocated, etc).

    The hstate structure is then passed around to the code which requires these
    fields, so that the code does the right thing regardless of the exact hstate
    it is operating on.

    This patch adds the hstate structure, with a single global instance of it
    (default_hstate), and does the basic work of converting hugetlb to use the
    hstate.

    Future patches will add more hstate structures to allow for different
    hugetlbfs mounts to have different page sizes.

    [akpm@linux-foundation.org: coding-style fixes]
    Acked-by: Adam Litke
    Acked-by: Nishanth Aravamudan
    Signed-off-by: Andi Kleen
    Signed-off-by: Nick Piggin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andi Kleen
     
  • Christoph recently added /proc/vmallocinfo file to get information about
    vmalloc allocations.

    This patch adds NUMA specific information, giving number of pages
    allocated on each memory node.

    This should help to check that vmalloc() is able to respect NUMA policies.

    Example of output on a four-node machine (one cpu per node)

    1) network hash tables are evenly spread across the four nodes (OK) (Same
    point for inodes and dentries hash tables)

    2) iptables tables (x_tables) are correctly allocated on each cpu node
    (OK).

    3) sys_swapon() allocates its memory from one node only.

    4) each loaded module is using memory on one node.

    Sysadmins could tune their setup to change points 3) and 4) if necessary.

    grep "pages=" /proc/vmallocinfo
    0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204/0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128
    0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204/0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64
    0xffffc2000031a000-0xffffc2000031d000 12288 alloc_large_system_hash+0x204/0x2c0 pages=2 vmalloc N1=1 N2=1
    0xffffc2000031f000-0xffffc2000032b000 49152 cramfs_uncompress_init+0x2e/0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3
    0xffffc2000033e000-0xffffc20000341000 12288 sys_swapon+0x640/0xac0 pages=2 vmalloc N0=2
    0xffffc20000341000-0xffffc20000344000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N0=2
    0xffffc20000344000-0xffffc20000347000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N1=2
    0xffffc20000347000-0xffffc2000034a000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N2=2
    0xffffc2000034a000-0xffffc2000034d000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N3=2
    0xffffc20004381000-0xffffc20004402000 528384 alloc_large_system_hash+0x204/0x2c0 pages=128 vmalloc N0=32 N1=32 N2=32 N3=32
    0xffffc20004402000-0xffffc20004803000 4198400 alloc_large_system_hash+0x204/0x2c0 pages=1024 vmalloc vpages N0=256 N1=256 N2=256 N3=256
    0xffffc20004803000-0xffffc20004904000 1052672 alloc_large_system_hash+0x204/0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64
    0xffffc20004904000-0xffffc20004bec000 3047424 sys_swapon+0x640/0xac0 pages=743 vmalloc vpages N0=743
    0xffffffffa0000000-0xffffffffa000f000 61440 sys_init_module+0xc27/0x1d00 pages=14 vmalloc N1=14
    0xffffffffa000f000-0xffffffffa0014000 20480 sys_init_module+0xc27/0x1d00 pages=4 vmalloc N0=4
    0xffffffffa0014000-0xffffffffa0017000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N0=2
    0xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 pages=10 vmalloc N1=10
    0xffffffffa0022000-0xffffffffa0028000 24576 sys_init_module+0xc27/0x1d00 pages=5 vmalloc N3=5
    0xffffffffa0028000-0xffffffffa0050000 163840 sys_init_module+0xc27/0x1d00 pages=39 vmalloc N1=39
    0xffffffffa0050000-0xffffffffa0052000 8192 sys_init_module+0xc27/0x1d00 pages=1 vmalloc N1=1
    0xffffffffa0052000-0xffffffffa0056000 16384 sys_init_module+0xc27/0x1d00 pages=3 vmalloc N1=3
    0xffffffffa0056000-0xffffffffa0081000 176128 sys_init_module+0xc27/0x1d00 pages=42 vmalloc N3=42
    0xffffffffa0081000-0xffffffffa00ae000 184320 sys_init_module+0xc27/0x1d00 pages=44 vmalloc N3=44
    0xffffffffa00ae000-0xffffffffa00b1000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2
    0xffffffffa00b1000-0xffffffffa00b9000 32768 sys_init_module+0xc27/0x1d00 pages=7 vmalloc N0=7
    0xffffffffa00b9000-0xffffffffa00c4000 45056 sys_init_module+0xc27/0x1d00 pages=10 vmalloc N3=10
    0xffffffffa00c6000-0xffffffffa00e0000 106496 sys_init_module+0xc27/0x1d00 pages=25 vmalloc N2=25
    0xffffffffa00e0000-0xffffffffa00f1000 69632 sys_init_module+0xc27/0x1d00 pages=16 vmalloc N2=16
    0xffffffffa00f1000-0xffffffffa00f4000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2
    0xffffffffa00f4000-0xffffffffa00f7000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2

    [akpm@linux-foundation.org: fix comment]
    Signed-off-by: Eric Dumazet
    Cc: Christoph Lameter
    Cc: Randy Dunlap
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Eric Dumazet
     
  • [akpm@linux-foundation.org: fix comment text]
    Signed-off-by: Pavel Machek
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Pavel Machek
     
  • …n hugetlbfs will succeed

    After patch 2 in this series, a process that successfully calls mmap() for
    a MAP_PRIVATE mapping will be guaranteed to successfully fault until a
    process calls fork(). At that point, the next write fault from the parent
    could fail due to COW if the child still has a reference.

    We only reserve pages for the parent but a copy must be made to avoid
    leaking data from the parent to the child after fork(). Reserves could be
    taken for both parent and child at fork time to guarantee faults but if
    the mapping is large it is highly likely we will not have sufficient pages
    for the reservation, and it is common to fork only to exec() immediately
    after. A failure here would be very undesirable.

    Note that the current behaviour of mainline with MAP_PRIVATE pages is
    pretty bad. The following situation is allowed to occur today.

    1. Process calls mmap(MAP_PRIVATE)
    2. Process calls mlock() to fault all pages and makes sure it succeeds
    3. Process forks()
    4. Process writes to MAP_PRIVATE mapping while child still exists
    5. If the COW fails at this point, the process gets SIGKILLed even though it
    had taken care to ensure the pages existed

    This patch improves the situation by guaranteeing the reliability of the
    process that successfully calls mmap(). When the parent performs COW, it
    will try to satisfy the allocation without using reserves. If that fails
    the parent will steal the page leaving any children without a page.
    Faults from the child after that point will result in failure. If the
    child COW happens first, an attempt will be made to allocate the page
    without reserves and the child will get SIGKILLed on failure.

    To summarise the new behaviour:

    1. If the original mapper performs COW on a private mapping with multiple
    references, it will attempt to allocate a hugepage from the pool or
    the buddy allocator without using the existing reserves. On fail, VMAs
    mapping the same area are traversed and the page being COW'd is unmapped
    where found. It will then steal the original page as the last mapper in
    the normal way.

    2. The VMAs the pages were unmapped from are flagged to note that pages
    with data no longer exist. Future no-page faults on those VMAs will
    terminate the process as otherwise it would appear that data was corrupted.
    A warning is printed to the console that this situation occurred.

    3. If the child performs COW first, it will attempt to satisfy the COW
    from the pool if there are enough pages or via the buddy allocator if
    overcommit is allowed and the buddy allocator can satisfy the request. If
    it fails, the child will be killed.

    If the pool is large enough, existing applications will not notice that
    the reserves were a factor. Existing applications depending on the
    no-reserves being set are unlikely to exist as for much of the history of
    hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that
    point or failing the mmap().

    [npiggin@suse.de: fix CONFIG_HUGETLB=n build]
    Signed-off-by: Mel Gorman <mel@csn.ul.ie>
    Acked-by: Adam Litke <agl@us.ibm.com>
    Cc: Andy Whitcroft <apw@shadowen.org>
    Cc: William Lee Irwin III <wli@holomorphy.com>
    Cc: Hugh Dickins <hugh@veritas.com>
    Cc: Nick Piggin <npiggin@suse.de>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

    Mel Gorman
     
  • This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in
    a similar manner to the reservations taken for MAP_SHARED mappings. The
    reserve count is accounted both globally and on a per-VMA basis for
    private mappings. This guarantees that a process that successfully calls
    mmap() will successfully fault all pages in the future unless fork() is
    called.

    The characteristics of private mappings of hugetlbfs files after this
    patch are:

    1. The process calling mmap() is guaranteed to succeed all future faults until
    it forks().
    2. On fork(), the parent may die due to SIGKILL on writes to the private
    mapping if enough pages are not available for the COW. For reasonably
    reliable behaviour in the face of a small huge page pool, children of
    hugepage-aware processes should not reference the mappings; such as
    might occur when fork()ing to exec().
    3. On fork(), the child VMAs inherit no reserves. Reads on pages already
    faulted by the parent will succeed. Successful writes will depend on enough
    huge pages being free in the pool.
    4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper
    and at fault time otherwise.

    Before this patch, all reads or writes in the child potentially need page
    allocations that can later lead to the death of the parent. This applies
    to reads and writes of uninstantiated pages as well as COW. After the
    patch it is only a write to an instantiated page that causes problems.

    Signed-off-by: Mel Gorman
    Acked-by: Adam Litke
    Cc: Andy Whitcroft
    Cc: William Lee Irwin III
    Cc: Hugh Dickins
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Mel Gorman
     
  • [Summary]

    Split LRU-list of unused dentries to one per superblock to avoid soft
    lock up during NFS mounts and remounting of any filesystem.

    Previously I posted here:
    http://lkml.org/lkml/2008/3/5/590

    [Descriptions]

    - background

    dentry_unused is a list of dentries which are not referenced.
    dentry_unused grows up when references on directories or files are
    released. This list can be very long if there is huge free memory.

    - the problem

    When shrink_dcache_sb() is called, it scans all dentry_unused linearly
    under spin_lock(), and if dentry->d_sb is different from given
    superblock, scan next dentry. This scan costs very much if there are
    many entries, and very ineffective if there are many superblocks.

    IOW, when we need to shrink the unused dentries of one superblock, we
    scan the unused dentries of all superblocks in the system. For example,
    we scan 500 dentries to unmount a filesystem, but also scan 1,000,000 or
    more unused dentries on other superblocks.

    In our case, when mounting NFS*, shrink_dcache_sb() is called to shrink
    unused dentries on NFS, but scans 100,000,000 unused dentries on
    superblocks in the system such as local ext3 filesystems. I hear NFS
    mounting took 1 min on some system in use.

    * : NFS uses virtual filesystem in rpc layer, so NFS is affected by
    this problem.

    100,000,000 is possible number on large systems.

    Per-superblock LRU of unused dentries can reduce the cost in a
    reasonable manner.

    - How to fix

    I found this problem is solved by David Chinner's "Per-superblock
    unused dentry LRU lists V3"(1), so I rebase it and add some fix to
    reclaim with fairness, which is in Andrew Morton's comments(2).

    1) http://lkml.org/lkml/2006/5/25/318
    2) http://lkml.org/lkml/2006/5/25/320

    Split the LRU list of unused dentries into one per superblock. Then, NFS
    mounting will check dentries under a superblock instead of all. But
    this splitting will break the LRU of dentry_unused. So, I've attempted to
    reclaim unused dentries with fairness by calculating the number of
    dentries to scan on this sb in the following way

    number of dentries to scan on this sb =
    count * (number of dentries on this sb / number of dentries in the machine)

    - ToDo
    - I have to measure performance numbers and do stress tests.

    - When an unmount occurs during prune_dcache() while it is scanning the
    same superblock, it is unable to reach the next superblock because it
    has gone away. We restart scanning superblocks from the first one, which
    causes unfairness in reclaiming unused dentries on the first superblock.
    But I think this happens very rarely.

    - Test Results

    Result on 6GB boxes with excessive unused dentries.

    Without patch:

    $ cat /proc/sys/fs/dentry-state
    10181835 10180203 45 0 0 0
    # mount -t nfs 10.124.60.70:/work/kernel-src nfs
    real 0m1.830s
    user 0m0.001s
    sys 0m1.653s

    With this patch:
    $ cat /proc/sys/fs/dentry-state
    10236610 10234751 45 0 0 0
    # mount -t nfs 10.124.60.70:/work/kernel-src nfs
    real 0m0.106s
    user 0m0.002s
    sys 0m0.032s

    [akpm@linux-foundation.org: fix comments]
    Signed-off-by: Kentaro Makita
    Cc: Neil Brown
    Cc: Trond Myklebust
    Cc: David Chinner
    Cc: "J. Bruce Fields"
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Kentaro Makita
     
  • The double indirection here is not needed anywhere and hence (at least)
    confusing.

    Signed-off-by: Jan Beulich
    Cc: Hugh Dickins
    Cc: Nick Piggin
    Cc: Christoph Lameter
    Cc: Benjamin Herrenschmidt
    Cc: Paul Mackerras
    Cc: "Luck, Tony"
    Cc: Paul Mundt
    Cc: "David S. Miller"
    Acked-by: Jeremy Fitzhardinge
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Jan Beulich
     
  • This patch adds proper extern declarations for five variables in
    include/linux/vmstat.h

    Signed-off-by: Adrian Bunk
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Adrian Bunk
     

23 Jul, 2008

5 commits

  • * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6: (82 commits)
    ipw2200: Call netif_*_queue() interfaces properly.
    netxen: Needs to include linux/vmalloc.h
    [netdrvr] atl1d: fix !CONFIG_PM build
    r6040: rework init_one error handling
    r6040: bump release number to 0.18
    r6040: handle RX fifo full and no descriptor interrupts
    r6040: change the default waiting time
    r6040: use definitions for magic values in descriptor status
    r6040: completely rework the RX path
    r6040: call napi_disable when puting down the interface and set lp->dev accordingly.
    mv643xx_eth: fix NETPOLL build
    r6040: rework the RX buffers allocation routine
    r6040: fix scheduling while atomic in r6040_tx_timeout
    r6040: fix null pointer access and tx timeouts
    r6040: prefix all functions with r6040
    rndis_host: support WM6 devices as modems
    at91_ether: use netstats in net_device structure
    sfc: Create one RX queue and interrupt per CPU package by default
    sfc: Use a separate workqueue for resets
    sfc: I2C adapter initialisation fixes
    ...

    Linus Torvalds
     
  • get_proc_net() can now become static.

    Signed-off-by: Adrian Bunk
    Acked-by: Pavel Emelyanov
    Signed-off-by: David S. Miller

    Adrian Bunk
     
  • * git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core-2.6: (79 commits)
    arm: bus_id -> dev_name() and dev_set_name() conversions
    sparc64: fix up bus_id changes in sparc core code
    3c59x: handle pci_name() being const
    MTD: handle pci_name() being const
    HP iLO driver
    sysdev: Convert the x86 mce tolerant sysdev attribute to generic attribute
    sysdev: Add utility functions for simple int/ulong variable sysdev attributes
    sysdev: Pass the attribute to the low level sysdev show/store function
    driver core: Suppress sysfs warnings for device_rename().
    kobject: Transmit return value of call_usermodehelper() to caller
    sysfs-rules.txt: reword API stability statement
    debugfs: Implement debugfs_remove_recursive()
    HOWTO: change email addresses of James in HOWTO
    always enable FW_LOADER unless EMBEDDED=y
    uio-howto.tmpl: use unique output names
    uio-howto.tmpl: use standard copyright/legal markings
    sysfs: don't call notify_change
    sysdev: fix debugging statements in registration code.
    kobject: should use kobject_put() in kset-example
    kobject: reorder kobject to save space on 64 bit builds
    ...

    Linus Torvalds
     
  • struct pagemap_walk was placed on stack, some hooks are initialized, the
    rest (->pgd_entry, ->pud_entry, ->pte_entry) are valid but junk.

    Reported-by: Eric Sesterhenn
    Signed-off-by: Alexey Dobriyan
    Cc: "Vegard Nossum"
    Cc: [2.6.25.x, 2.6.26.x]
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Alexey Dobriyan
     
  • The Linux kernel puts the filename argument of execve() into the new
    address space. Many developers are surprised to learn this. Those who
    know and could use it, object "But it's not documented."

    Those who want to use it dislike the expression
    (char *)(1+ strlen(env[-1+ n_env]) + env[-1+ n_env])
    because it requires locating the last original environment variable,
    and assumes that the filename follows the characters.

    This patch documents the insertion of the filename, and makes it easier
    to find by adding a new tag AT_EXECFN in the ElfXX_auxv_t; see <elf.h>.

    In many cases readlink("/proc/self/exe",) gives the same answer. But if
    all the original pages get unmapped, then the kernel erases the symlink
    for /proc/self/exe. This can happen when a program decompressor does a
    good job of cleaning up after uncompressing directly to memory, so that
    the address space of the target program looks the same as if compression
    had never happened. One example is http://upx.sourceforge.net .

    One notable use of the underlying concept (what path containED the
    executable) is glibc expanding $ORIGIN in DT_RUNPATH. In practice for
    the near term, it may be a good idea for user-mode code to use both
    /proc/self/exe and AT_EXECFN as fall-back methods for each other.
    /proc/self/exe can fail due to unmapping, AT_EXECFN can fail because it
    won't be present on non-new systems. The auxvec or {AT_EXECFN}.d_val
    also can get overwritten, although in nearly all cases this would be the
    result of a bug.

    The runtime cost is one NEW_AUX_ENT using two words of stack space. The
    underlying value is maintained already as bprm->exec; setup_arg_pages()
    in fs/exec.c slides it for stack_shift, etc.

    Signed-off-by: John Reiser
    Cc: Roland McGrath
    Cc: Jakub Jelinek
    Cc: Ulrich Drepper
    Cc: Benjamin Herrenschmidt
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    John Reiser
     

22 Jul, 2008

5 commits

  • driver core: Suppress sysfs warnings for device_rename().

    Renaming network devices to an already existing name is not
    something we want sysfs to print a scary warning for, since the
    callers can deal with this correctly. So let's introduce
    sysfs_create_link_nowarn() which gets rid of the common warning.

    Signed-off-by: Cornelia Huck
    Signed-off-by: Greg Kroah-Hartman

    Cornelia Huck
     
  • debugfs_remove_recursive() will remove a dentry and all its children.
    Drivers can use this to zap their whole debugfs tree so that they don't
    need to keep track of every single debugfs dentry they created.

    It may fail to remove the whole tree in certain cases:

    sh-3.2# rmmod atmel-mci < /sys/kernel/debug/mmc0/ios/clock
    mmc0: card b368 removed
    atmel_mci atmel_mci.0: Lost dma0chan1, falling back to PIO
    sh-3.2# ls /sys/kernel/debug/mmc0/
    ios

    But I'm not sure if that case can be handled in any sane manner.

    Signed-off-by: Haavard Skinnemoen
    Cc: Pierre Ossman
    Signed-off-by: Greg Kroah-Hartman

    Haavard Skinnemoen
     
  • sysfs_chmod_file() calls notify_change() to change the permission bits
    on a sysfs file. Replace with explicit call to sysfs_setattr() and
    fsnotify_change().

    This is equivalent, except that security_inode_setattr() is not
    called. This function is called by drivers, so the security checks do
    not make any sense.

    Signed-off-by: Miklos Szeredi
    Signed-off-by: Greg Kroah-Hartman

    Miklos Szeredi
     
    Kobjects have not had a limit on name size for a while, so stop
    pretending that they do.

    Signed-off-by: Kay Sievers
    Signed-off-by: Greg Kroah-Hartman

    Kay Sievers
     
  • device_create() is race-prone, so use the race-free
    device_create_drvdata() instead as device_create() is going away.

    Cc: Jan Harkes
    Signed-off-by: Greg Kroah-Hartman

    Greg Kroah-Hartman
     

21 Jul, 2008

3 commits

  • * 'for-2.6.27' of git://linux-nfs.org/~bfields/linux: (51 commits)
    nfsd: nfs4xdr.c do-while is not a compound statement
    nfsd: Use C99 initializers in fs/nfsd/nfs4xdr.c
    lockd: Pass "struct sockaddr *" to new failover-by-IP function
    lockd: get host reference in nlmsvc_create_block() instead of callers
    lockd: minor svclock.c style fixes
    lockd: eliminate duplicate nlmsvc_lookup_host call from nlmsvc_lock
    lockd: eliminate duplicate nlmsvc_lookup_host call from nlmsvc_testlock
    lockd: nlm_release_host() checks for NULL, caller needn't
    file lock: reorder struct file_lock to save space on 64 bit builds
    nfsd: take file and mnt write in nfs4_upgrade_open
    nfsd: document open share bit tracking
    nfsd: tabulate nfs4 xdr encoding functions
    nfsd: dprint operation names
    svcrdma: Change WR context get/put to use the kmem cache
    svcrdma: Create a kmem cache for the WR contexts
    svcrdma: Add flush_scheduled_work to module exit function
    svcrdma: Limit ORD based on client's advertised IRD
    svcrdma: Remove unused wait q from svcrdma_xprt structure
    svcrdma: Remove unneeded spin locks from __svc_rdma_free
    svcrdma: Add dma map count and WARN_ON
    ...

    Linus Torvalds
     
  • * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6: (1232 commits)
    iucv: Fix bad merging.
    net_sched: Add size table for qdiscs
    net_sched: Add accessor function for packet length for qdiscs
    net_sched: Add qdisc_enqueue wrapper
    highmem: Export totalhigh_pages.
    ipv6 mcast: Omit redundant address family checks in ip6_mc_source().
    net: Use standard structures for generic socket address structures.
    ipv6 netns: Make several "global" sysctl variables namespace aware.
    netns: Use net_eq() to compare net-namespaces for optimization.
    ipv6: remove unused macros from net/ipv6.h
    ipv6: remove unused parameter from ip6_ra_control
    tcp: fix kernel panic with listening_get_next
    tcp: Remove redundant checks when setting eff_sacks
    tcp: options clean up
    tcp: Fix MD5 signatures for non-linear skbs
    sctp: Update sctp global memory limit allocations.
    sctp: remove unnecessary byteshifting, calculate directly in big-endian
    sctp: Allow only 1 listening socket with SO_REUSEADDR
    sctp: Do not leak memory on multiple listen() calls
    sctp: Support ipv6only AF_INET6 sockets.
    ...

    Linus Torvalds
     
  • * 'configfs-fixup-ptr-error' of git://oss.oracle.com/git/jlbec/linux-2.6:
    configfs: Allow ->make_item() and ->make_group() to return detailed errors.
    Revert "configfs: Allow ->make_item() and ->make_group() to return detailed errors."

    Linus Torvalds