Commit 13cc56013842a847a0f6ff805d9ed9181e753ef8

Authored by Linus Torvalds

Merge branch 'for-3.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu

Pull per-cpu changes from Tejun Heo:
 "This pull request contains Kent's per-cpu reference counter.  It has
  gone through several iterations since the last time and the dynamic
  allocation is gone.

  The usual usage is relatively straightforward although the async kill
  confirm interface, which is not used in most cases, is somewhat icky.
  There also are some interface concerns - e.g.  I'm not sure about
  passing in the @release callback during init as that becomes funny when
  we later implement synchronous kill_and_drain - but nothing too serious
  and it's quite usable now.

  cgroup_subsys_state refcnting has already been converted and we should
  convert module refcnt (Kent?)"

* 'for-3.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu:
  percpu-refcount: use RCU-sched insted of normal RCU
  percpu-refcount: implement percpu_tryget() along with percpu_ref_kill_and_confirm()
  percpu-refcount: implement percpu_ref_cancel_init()
  percpu-refcount: add __must_check to percpu_ref_init() and don't use ACCESS_ONCE() in percpu_ref_kill_rcu()
  percpu-refcount: cosmetic updates
  percpu-refcount: consistently use plain (non-sched) RCU
  percpu-refcount: Don't use silly cmpxchg()
  percpu: implement generic percpu refcounting
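
To make the API described above concrete, here is a minimal usage sketch of the interface this series adds (percpu_ref_init()/percpu_ref_get()/percpu_ref_put()/percpu_ref_kill()). struct foo and its helpers are hypothetical, not taken from the kernel, and error handling is abbreviated:

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

/* Hypothetical object whose lifetime is managed by a percpu_ref. */
struct foo {
	struct percpu_ref ref;
	/* ... payload ... */
};

/* Called when the refcount hits 0; must not sleep. */
static void foo_release(struct percpu_ref *ref)
{
	kfree(container_of(ref, struct foo, ref));
}

static struct foo *foo_create(void)
{
	struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	if (!foo)
		return NULL;

	/* Starts off with a refcount of 1 - the "initial ref". */
	if (percpu_ref_init(&foo->ref, foo_release)) {
		kfree(foo);
		return NULL;
	}
	return foo;
}

/* Hot path: gets and puts only touch a local percpu counter. */
static void foo_use(struct foo *foo)
{
	percpu_ref_get(&foo->ref);
	/* ... operate on foo ... */
	percpu_ref_put(&foo->ref);
}

/*
 * Shutdown: switch the ref back to atomic mode. In this version the kill
 * path also drops the initial ref from its RCU callback, so there is no
 * extra percpu_ref_put() here.
 */
static void foo_destroy(struct foo *foo)
{
	percpu_ref_kill(&foo->ref);
}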

Showing 3 changed files

include/linux/percpu-refcount.h
  1 +/*
  2 + * Percpu refcounts:
  3 + * (C) 2012 Google, Inc.
  4 + * Author: Kent Overstreet <koverstreet@google.com>
  5 + *
  6 + * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
  7 + * atomic_dec_and_test() - but percpu.
  8 + *
  9 + * There's one important difference between percpu refs and normal atomic_t
  10 + * refcounts; you have to keep track of your initial refcount, and when you
  11 + * start shutting down you call percpu_ref_kill() instead of dropping that
  12 + * initial refcount directly with percpu_ref_put().
  13 + *
  14 + * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
  15 + * than an atomic_t - this is because of the way shutdown works, see
  16 + * percpu_ref_kill()/PCPU_COUNT_BIAS.
  17 + *
  18 + * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
  19 + * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
  20 + * marks the ref as shutting down, and an RCU callback then collects the per cpu
  21 + * counts back into the atomic_t, issuing the appropriate barriers, so that
  22 + * percpu_ref_put() will check for the ref hitting 0. That same callback also
  23 + * drops the initial ref for you.
  24 + *
  25 + * USAGE:
  26 + *
  27 + * See fs/aio.c for some example usage; it's used there for struct kioctx, which
  28 + * is created when userspace calls io_setup(), and destroyed when userspace
  29 + * calls io_destroy() or the process exits.
  30 + *
  31 + * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
  32 + * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
  33 + * the kioctx from the process's list of kioctxs - after that, there can't be
  34 + * any new users of the kioctx (from lookup_ioctx()), and the kioctx goes away
  35 + * once the remaining references are dropped and the release callback runs.
  36 + *
  37 + * Code that does a two stage shutdown like this often needs some kind of
  38 + * explicit synchronization to ensure the initial refcount can only be dropped
  39 + * once - percpu_ref_kill() warns if it is called more than once, but it does
  40 + * not prevent it, so the caller must provide that synchronization itself
  41 + * unless some other mechanism already guarantees that teardown only happens
  42 + * once.
  43 + */
  44 +
  45 +#ifndef _LINUX_PERCPU_REFCOUNT_H
  46 +#define _LINUX_PERCPU_REFCOUNT_H
  47 +
  48 +#include <linux/atomic.h>
  49 +#include <linux/kernel.h>
  50 +#include <linux/percpu.h>
  51 +#include <linux/rcupdate.h>
  52 +
  53 +struct percpu_ref;
  54 +typedef void (percpu_ref_func_t)(struct percpu_ref *);
  55 +
  56 +struct percpu_ref {
  57 + atomic_t count;
  58 + /*
  59 + * The low bit of the pointer indicates whether the ref is in percpu
  60 + * mode; if set, then get/put will manipulate the atomic_t (this is a
  61 + * hack because we need to keep the pointer around for
  62 + * percpu_ref_kill_rcu())
  63 + */
  64 + unsigned __percpu *pcpu_count;
  65 + percpu_ref_func_t *release;
  66 + percpu_ref_func_t *confirm_kill;
  67 + struct rcu_head rcu;
  68 +};
  69 +
  70 +int __must_check percpu_ref_init(struct percpu_ref *ref,
  71 + percpu_ref_func_t *release);
  72 +void percpu_ref_cancel_init(struct percpu_ref *ref);
  73 +void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
  74 + percpu_ref_func_t *confirm_kill);
  75 +
  76 +/**
  77 + * percpu_ref_kill - drop the initial ref
  78 + * @ref: percpu_ref to kill
  79 + *
  80 + * Must be used to drop the initial ref on a percpu refcount; must be called
  81 + * precisely once before shutdown.
  82 + *
  83 + * Puts @ref in non percpu mode, then uses call_rcu_sched() to schedule a
  84 + * callback which gathers up the percpu counters and drops the initial ref.
  85 + */
  86 +static inline void percpu_ref_kill(struct percpu_ref *ref)
  87 +{
  88 + return percpu_ref_kill_and_confirm(ref, NULL);
  89 +}
  90 +
  91 +#define PCPU_STATUS_BITS 2
  92 +#define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1)
  93 +#define PCPU_REF_PTR 0
  94 +#define PCPU_REF_DEAD 1
  95 +
  96 +#define REF_STATUS(count) (((unsigned long) count) & PCPU_STATUS_MASK)
  97 +
  98 +/**
  99 + * percpu_ref_get - increment a percpu refcount
  100 + * @ref: percpu_ref to get
  101 + *
  102 + * Analogous to atomic_inc().
  103 + */
  104 +static inline void percpu_ref_get(struct percpu_ref *ref)
  105 +{
  106 + unsigned __percpu *pcpu_count;
  107 +
  108 + rcu_read_lock_sched();
  109 +
  110 + pcpu_count = ACCESS_ONCE(ref->pcpu_count);
  111 +
  112 + if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
  113 + __this_cpu_inc(*pcpu_count);
  114 + else
  115 + atomic_inc(&ref->count);
  116 +
  117 + rcu_read_unlock_sched();
  118 +}
  119 +
  120 +/**
  121 + * percpu_ref_tryget - try to increment a percpu refcount
  122 + * @ref: percpu_ref to try-get
  123 + *
  124 + * Increment a percpu refcount unless it has already been killed. Returns
  125 + * %true on success; %false on failure.
  126 + *
  127 + * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget
  128 + * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be
  129 + * used. After the confirm_kill callback is invoked, it's guaranteed that
  130 + * no new reference will be given out by percpu_ref_tryget().
  131 + */
  132 +static inline bool percpu_ref_tryget(struct percpu_ref *ref)
  133 +{
  134 + unsigned __percpu *pcpu_count;
  135 + int ret = false;
  136 +
  137 + rcu_read_lock_sched();
  138 +
  139 + pcpu_count = ACCESS_ONCE(ref->pcpu_count);
  140 +
  141 + if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) {
  142 + __this_cpu_inc(*pcpu_count);
  143 + ret = true;
  144 + }
  145 +
  146 + rcu_read_unlock_sched();
  147 +
  148 + return ret;
  149 +}
  150 +
  151 +/**
  152 + * percpu_ref_put - decrement a percpu refcount
  153 + * @ref: percpu_ref to put
  154 + *
  155 + * Decrement the refcount, and if 0, call the release function (which was passed
  156 + * to percpu_ref_init())
  157 + */
  158 +static inline void percpu_ref_put(struct percpu_ref *ref)
  159 +{
  160 + unsigned __percpu *pcpu_count;
  161 +
  162 + rcu_read_lock_sched();
  163 +
  164 + pcpu_count = ACCESS_ONCE(ref->pcpu_count);
  165 +
  166 + if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
  167 + __this_cpu_dec(*pcpu_count);
  168 + else if (unlikely(atomic_dec_and_test(&ref->count)))
  169 + ref->release(ref);
  170 +
  171 + rcu_read_unlock_sched();
  172 +}
  173 +
  174 +#endif
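
As an aside, the two-stage shutdown pattern the header comment above describes for fs/aio.c can be sketched roughly as follows; the names here (struct obj, obj_list, obj_lock, obj_lookup(), obj_shutdown()) are hypothetical, and this is an illustration of the pattern, not the actual aio code:

#include <linux/list.h>
#include <linux/percpu-refcount.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct obj {
	struct percpu_ref ref;
	struct list_head node;
	int id;
};

static LIST_HEAD(obj_list);		/* RCU-protected list of live objects */
static DEFINE_SPINLOCK(obj_lock);	/* serializes list updates and teardown */

static void obj_release(struct percpu_ref *ref)
{
	kfree(container_of(ref, struct obj, ref));
}

/* Publish a new object so obj_lookup() can find it. */
static int obj_install(struct obj *obj)
{
	int err = percpu_ref_init(&obj->ref, obj_release);

	if (err)
		return err;

	spin_lock(&obj_lock);
	list_add_rcu(&obj->node, &obj_list);
	spin_unlock(&obj_lock);
	return 0;
}

/* Lookup path: find an object and take a ref, failing if it is being killed. */
static struct obj *obj_lookup(int id)
{
	struct obj *obj, *ret = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(obj, &obj_list, node) {
		if (obj->id == id && percpu_ref_tryget(&obj->ref)) {
			ret = obj;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

/* Teardown path: unlink so no new lookups can find the object, then kill. */
static void obj_shutdown(struct obj *obj)
{
	spin_lock(&obj_lock);
	list_del_rcu(&obj->node);
	spin_unlock(&obj_lock);

	/* Wait for lookups that could still see the object on the list. */
	synchronize_rcu();

	/* Switch to atomic mode; the initial ref is dropped via RCU. */
	percpu_ref_kill(&obj->ref);
}

Existing references taken by obj_lookup() keep the object alive until the matching percpu_ref_put() calls bring the count to 0, at which point obj_release() frees it.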
lib/Makefile
... ... @@ -13,7 +13,7 @@
13 13 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
14 14 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
15 15 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
16   - earlycpio.o
  16 + earlycpio.o percpu-refcount.o
17 17  
18 18 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
19 19 lib-$(CONFIG_MMU) += ioremap.o
lib/percpu-refcount.c
  1 +#define pr_fmt(fmt) "%s: " fmt "\n", __func__
  2 +
  3 +#include <linux/kernel.h>
  4 +#include <linux/percpu-refcount.h>
  5 +
  6 +/*
  7 + * Initially, a percpu refcount is just a set of percpu counters; to start
  8 + * with, we don't try to detect the ref hitting 0, which means that get/put
  9 + * can just increment or decrement the local counter. Note that the counter
  10 + * on a particular cpu can (and will) wrap - this is fine, because when we go
  11 + * to shut down, the percpu counters will all sum to the correct value.
  12 + *
  13 + * (More precisely: because modular arithmetic is commutative the sum of all the
  14 + * pcpu_count vars will be equal to what it would have been if all the gets and
  15 + * puts were done to a single integer, even if some of the percpu integers
  16 + * overflow or underflow).
  17 + *
  18 + * The real trick to implementing percpu refcounts is shutdown. We can't detect
  19 + * the ref hitting 0 on every put - this would require global synchronization
  20 + * and defeat the whole purpose of using percpu refs.
  21 + *
  22 + * What we do is require the user to keep track of the initial refcount; we know
  23 + * the ref can't hit 0 before the user drops the initial ref, so as long as we
  24 + * convert to non percpu mode before the initial ref is dropped everything
  25 + * works.
  26 + *
  27 + * Converting to non percpu mode is done with some RCUish stuff in
  28 + * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t
  29 + * can't hit 0 before we've added up all the percpu refs.
  30 + */
  31 +
  32 +#define PCPU_COUNT_BIAS (1U << 31)
  33 +
  34 +/**
  35 + * percpu_ref_init - initialize a percpu refcount
  36 + * @ref: percpu_ref to initialize
  37 + * @release: function which will be called when refcount hits 0
  38 + *
  39 + * Initializes @ref in percpu mode with a refcount of 1; analogous to
  40 + * atomic_set(ref, 1).
  41 + *
  42 + * Note that @release must not sleep - it may potentially be called from RCU
  43 + * callback context by percpu_ref_kill().
  44 + */
  45 +int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
  46 +{
  47 + atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS);
  48 +
  49 + ref->pcpu_count = alloc_percpu(unsigned);
  50 + if (!ref->pcpu_count)
  51 + return -ENOMEM;
  52 +
  53 + ref->release = release;
  54 + return 0;
  55 +}
  56 +
  57 +/**
  58 + * percpu_ref_cancel_init - cancel percpu_ref_init()
  59 + * @ref: percpu_ref to cancel init for
  60 + *
  61 + * Once a percpu_ref is initialized, its destruction is initiated by
  62 + * percpu_ref_kill() and completes asynchronously, which can be painful to
  63 + * do when destroying a half-constructed object in init failure path.
  64 + *
  65 + * This function destroys @ref without invoking @ref->release and the
  66 + * memory area containing it can be freed immediately on return. To
  67 + * prevent accidental misuse, it's required that @ref has finished
  68 + * percpu_ref_init(), whether successful or not, but never used.
  69 + *
  70 + * The weird name and usage restriction are to prevent people from using
  71 + * this function by mistake for normal shutdown instead of
  72 + * percpu_ref_kill().
  73 + */
  74 +void percpu_ref_cancel_init(struct percpu_ref *ref)
  75 +{
  76 + unsigned __percpu *pcpu_count = ref->pcpu_count;
  77 + int cpu;
  78 +
  79 + WARN_ON_ONCE(atomic_read(&ref->count) != 1 + PCPU_COUNT_BIAS);
  80 +
  81 + if (pcpu_count) {
  82 + for_each_possible_cpu(cpu)
  83 + WARN_ON_ONCE(*per_cpu_ptr(pcpu_count, cpu));
  84 + free_percpu(ref->pcpu_count);
  85 + }
  86 +}
  87 +
  88 +static void percpu_ref_kill_rcu(struct rcu_head *rcu)
  89 +{
  90 + struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
  91 + unsigned __percpu *pcpu_count = ref->pcpu_count;
  92 + unsigned count = 0;
  93 + int cpu;
  94 +
  95 + /* Mask out PCPU_REF_DEAD */
  96 + pcpu_count = (unsigned __percpu *)
  97 + (((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK);
  98 +
  99 + for_each_possible_cpu(cpu)
  100 + count += *per_cpu_ptr(pcpu_count, cpu);
  101 +
  102 + free_percpu(pcpu_count);
  103 +
  104 + pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count);
  105 +
  106 + /*
  107 + * It's crucial that we sum the percpu counters _before_ adding the sum
  108 + * to &ref->count; since gets could be happening on one cpu while puts
  109 + * happen on another, adding a single cpu's count could cause
  110 + * @ref->count to hit 0 before we've got a consistent value - but the
  111 + * sum of all the counts will be consistent and correct.
  112 + *
  113 + * Subtracting the bias value then has to happen _after_ adding count to
  114 + * &ref->count; we need the bias value to prevent &ref->count from
  115 + * reaching 0 before we add the percpu counts. But doing it at the same
  116 + * time is equivalent and saves us atomic operations:
  117 + */
  118 +
  119 + atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count);
  120 +
  121 + /* @ref is viewed as dead on all CPUs, send out kill confirmation */
  122 + if (ref->confirm_kill)
  123 + ref->confirm_kill(ref);
  124 +
  125 + /*
  126 + * Now we're in single atomic_t mode with a consistent refcount, so it's
  127 + * safe to drop our initial ref:
  128 + */
  129 + percpu_ref_put(ref);
  130 +}
  131 +
  132 +/**
  133 + * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
  134 + * @ref: percpu_ref to kill
  135 + * @confirm_kill: optional confirmation callback
  136 + *
  137 + * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
  138 + * @confirm_kill is not NULL. @confirm_kill, which may not block, will be
  139 + * called after @ref is seen as dead from all CPUs - all further
  140 + * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget()
  141 + * for more details.
  142 + *
  143 + * Due to the way percpu_ref is implemented, @confirm_kill will be called
  144 + * after at least one full RCU grace period has passed but this is an
  145 + * implementation detail and callers must not depend on it.
  146 + */
  147 +void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
  148 + percpu_ref_func_t *confirm_kill)
  149 +{
  150 + WARN_ONCE(REF_STATUS(ref->pcpu_count) == PCPU_REF_DEAD,
  151 + "percpu_ref_kill() called more than once!\n");
  152 +
  153 + ref->pcpu_count = (unsigned __percpu *)
  154 + (((unsigned long) ref->pcpu_count)|PCPU_REF_DEAD);
  155 + ref->confirm_kill = confirm_kill;
  156 +
  157 + call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu);
  158 +}
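
Finally, a hedged sketch of the two newer interfaces in this series: percpu_ref_kill_and_confirm(), whose confirmation callback lets teardown wait until percpu_ref_tryget() is guaranteed to fail, and percpu_ref_cancel_init() for init-failure paths. struct bar, bar_setup_resources() and the other helpers are hypothetical:

#include <linux/completion.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct bar {
	struct percpu_ref ref;
	struct completion confirmed;
};

static void bar_release(struct percpu_ref *ref)
{
	kfree(container_of(ref, struct bar, ref));
}

/* May not block; runs once @ref is seen as dead on all CPUs. */
static void bar_confirm_kill(struct percpu_ref *ref)
{
	struct bar *bar = container_of(ref, struct bar, ref);

	complete(&bar->confirmed);
}

/* Hypothetical second initialization step that might fail. */
static int bar_setup_resources(struct bar *bar)
{
	return 0;
}

static struct bar *bar_create(void)
{
	struct bar *bar = kzalloc(sizeof(*bar), GFP_KERNEL);
	int err;

	if (!bar)
		return NULL;

	init_completion(&bar->confirmed);

	err = percpu_ref_init(&bar->ref, bar_release);
	if (err)
		goto err_free;

	err = bar_setup_resources(bar);
	if (err)
		goto err_cancel;

	return bar;

err_cancel:
	/* The ref was initialized but never used: tear it down synchronously. */
	percpu_ref_cancel_init(&bar->ref);
err_free:
	kfree(bar);
	return NULL;
}

static void bar_shutdown(struct bar *bar)
{
	percpu_ref_kill_and_confirm(&bar->ref, bar_confirm_kill);

	/* Wait for confirmation; afterwards percpu_ref_tryget() must fail. */
	wait_for_completion(&bar->confirmed);
}

Note that bar_confirm_kill() runs before the kill path drops the initial ref, so @bar is still valid when complete() is called; after bar_shutdown() returns, the object is freed by bar_release() once the remaining references are put.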