Commit d8bf4ca9ca9576548628344c9725edd3786e90b1

Authored by Michal Hocko
Committed by Jiri Kosina
1 parent eb032b9837

rcu: treewide: Do not use rcu_read_lock_held when calling rcu_dereference_check

Since ca5ecddf (rcu: define __rcu address space modifier for sparse),
rcu_dereference_check() uses rcu_read_lock_held() as part of its
condition automatically, so callers do not have to pass it as well.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>

Showing 14 changed files with 6 additions and 27 deletions
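
For context, the calling pattern removed across these files looks roughly like the
sketch below. It is not taken from the diff itself; struct foo, gp_ptr and my_mutex
are made-up names used only for illustration.

#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct foo { int data; };

static DEFINE_MUTEX(my_mutex);
static struct foo __rcu *gp_ptr;

/* Old style: the caller listed rcu_read_lock_held() explicitly. */
static struct foo *get_foo_old(void)
{
	return rcu_dereference_check(gp_ptr,
				     rcu_read_lock_held() ||
				     lockdep_is_held(&my_mutex));
}

/* New style: since ca5ecddf, rcu_dereference_check() folds in
 * rcu_read_lock_held() itself, so only the extra condition remains. */
static struct foo *get_foo_new(void)
{
	return rcu_dereference_check(gp_ptr,
				     lockdep_is_held(&my_mutex));
}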

include/linux/cgroup.h
1 #ifndef _LINUX_CGROUP_H 1 #ifndef _LINUX_CGROUP_H
2 #define _LINUX_CGROUP_H 2 #define _LINUX_CGROUP_H
3 /* 3 /*
4 * cgroup interface 4 * cgroup interface
5 * 5 *
6 * Copyright (C) 2003 BULL SA 6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/cpumask.h> 12 #include <linux/cpumask.h>
13 #include <linux/nodemask.h> 13 #include <linux/nodemask.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 #include <linux/cgroupstats.h> 15 #include <linux/cgroupstats.h>
16 #include <linux/prio_heap.h> 16 #include <linux/prio_heap.h>
17 #include <linux/rwsem.h> 17 #include <linux/rwsem.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 19
20 #ifdef CONFIG_CGROUPS 20 #ifdef CONFIG_CGROUPS
21 21
22 struct cgroupfs_root; 22 struct cgroupfs_root;
23 struct cgroup_subsys; 23 struct cgroup_subsys;
24 struct inode; 24 struct inode;
25 struct cgroup; 25 struct cgroup;
26 struct css_id; 26 struct css_id;
27 27
28 extern int cgroup_init_early(void); 28 extern int cgroup_init_early(void);
29 extern int cgroup_init(void); 29 extern int cgroup_init(void);
30 extern void cgroup_lock(void); 30 extern void cgroup_lock(void);
31 extern int cgroup_lock_is_held(void); 31 extern int cgroup_lock_is_held(void);
32 extern bool cgroup_lock_live_group(struct cgroup *cgrp); 32 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
33 extern void cgroup_unlock(void); 33 extern void cgroup_unlock(void);
34 extern void cgroup_fork(struct task_struct *p); 34 extern void cgroup_fork(struct task_struct *p);
35 extern void cgroup_fork_callbacks(struct task_struct *p); 35 extern void cgroup_fork_callbacks(struct task_struct *p);
36 extern void cgroup_post_fork(struct task_struct *p); 36 extern void cgroup_post_fork(struct task_struct *p);
37 extern void cgroup_exit(struct task_struct *p, int run_callbacks); 37 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
38 extern int cgroupstats_build(struct cgroupstats *stats, 38 extern int cgroupstats_build(struct cgroupstats *stats,
39 struct dentry *dentry); 39 struct dentry *dentry);
40 extern int cgroup_load_subsys(struct cgroup_subsys *ss); 40 extern int cgroup_load_subsys(struct cgroup_subsys *ss);
41 extern void cgroup_unload_subsys(struct cgroup_subsys *ss); 41 extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
42 42
43 extern const struct file_operations proc_cgroup_operations; 43 extern const struct file_operations proc_cgroup_operations;
44 44
45 /* Define the enumeration of all builtin cgroup subsystems */ 45 /* Define the enumeration of all builtin cgroup subsystems */
46 #define SUBSYS(_x) _x ## _subsys_id, 46 #define SUBSYS(_x) _x ## _subsys_id,
47 enum cgroup_subsys_id { 47 enum cgroup_subsys_id {
48 #include <linux/cgroup_subsys.h> 48 #include <linux/cgroup_subsys.h>
49 CGROUP_BUILTIN_SUBSYS_COUNT 49 CGROUP_BUILTIN_SUBSYS_COUNT
50 }; 50 };
51 #undef SUBSYS 51 #undef SUBSYS
52 /* 52 /*
53 * This define indicates the maximum number of subsystems that can be loaded 53 * This define indicates the maximum number of subsystems that can be loaded
54 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep 54 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep
55 * track of all of them. 55 * track of all of them.
56 */ 56 */
57 #define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long)) 57 #define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
58 58
59 /* Per-subsystem/per-cgroup state maintained by the system. */ 59 /* Per-subsystem/per-cgroup state maintained by the system. */
60 struct cgroup_subsys_state { 60 struct cgroup_subsys_state {
61 /* 61 /*
62 * The cgroup that this subsystem is attached to. Useful 62 * The cgroup that this subsystem is attached to. Useful
63 * for subsystems that want to know about the cgroup 63 * for subsystems that want to know about the cgroup
64 * hierarchy structure 64 * hierarchy structure
65 */ 65 */
66 struct cgroup *cgroup; 66 struct cgroup *cgroup;
67 67
68 /* 68 /*
69 * State maintained by the cgroup system to allow subsystems 69 * State maintained by the cgroup system to allow subsystems
70 * to be "busy". Should be accessed via css_get(), 70 * to be "busy". Should be accessed via css_get(),
 71 * css_tryget() and css_put(). 71 * css_tryget() and css_put().
72 */ 72 */
73 73
74 atomic_t refcnt; 74 atomic_t refcnt;
75 75
76 unsigned long flags; 76 unsigned long flags;
77 /* ID for this css, if possible */ 77 /* ID for this css, if possible */
78 struct css_id __rcu *id; 78 struct css_id __rcu *id;
79 }; 79 };
80 80
81 /* bits in struct cgroup_subsys_state flags field */ 81 /* bits in struct cgroup_subsys_state flags field */
82 enum { 82 enum {
83 CSS_ROOT, /* This CSS is the root of the subsystem */ 83 CSS_ROOT, /* This CSS is the root of the subsystem */
84 CSS_REMOVED, /* This CSS is dead */ 84 CSS_REMOVED, /* This CSS is dead */
85 }; 85 };
86 86
87 /* Caller must verify that the css is not for root cgroup */ 87 /* Caller must verify that the css is not for root cgroup */
88 static inline void __css_get(struct cgroup_subsys_state *css, int count) 88 static inline void __css_get(struct cgroup_subsys_state *css, int count)
89 { 89 {
90 atomic_add(count, &css->refcnt); 90 atomic_add(count, &css->refcnt);
91 } 91 }
92 92
93 /* 93 /*
94 * Call css_get() to hold a reference on the css; it can be used 94 * Call css_get() to hold a reference on the css; it can be used
95 * for a reference obtained via: 95 * for a reference obtained via:
96 * - an existing ref-counted reference to the css 96 * - an existing ref-counted reference to the css
97 * - task->cgroups for a locked task 97 * - task->cgroups for a locked task
98 */ 98 */
99 99
100 static inline void css_get(struct cgroup_subsys_state *css) 100 static inline void css_get(struct cgroup_subsys_state *css)
101 { 101 {
102 /* We don't need to reference count the root state */ 102 /* We don't need to reference count the root state */
103 if (!test_bit(CSS_ROOT, &css->flags)) 103 if (!test_bit(CSS_ROOT, &css->flags))
104 __css_get(css, 1); 104 __css_get(css, 1);
105 } 105 }
106 106
107 static inline bool css_is_removed(struct cgroup_subsys_state *css) 107 static inline bool css_is_removed(struct cgroup_subsys_state *css)
108 { 108 {
109 return test_bit(CSS_REMOVED, &css->flags); 109 return test_bit(CSS_REMOVED, &css->flags);
110 } 110 }
111 111
112 /* 112 /*
113 * Call css_tryget() to take a reference on a css if your existing 113 * Call css_tryget() to take a reference on a css if your existing
114 * (known-valid) reference isn't already ref-counted. Returns false if 114 * (known-valid) reference isn't already ref-counted. Returns false if
115 * the css has been destroyed. 115 * the css has been destroyed.
116 */ 116 */
117 117
118 static inline bool css_tryget(struct cgroup_subsys_state *css) 118 static inline bool css_tryget(struct cgroup_subsys_state *css)
119 { 119 {
120 if (test_bit(CSS_ROOT, &css->flags)) 120 if (test_bit(CSS_ROOT, &css->flags))
121 return true; 121 return true;
122 while (!atomic_inc_not_zero(&css->refcnt)) { 122 while (!atomic_inc_not_zero(&css->refcnt)) {
123 if (test_bit(CSS_REMOVED, &css->flags)) 123 if (test_bit(CSS_REMOVED, &css->flags))
124 return false; 124 return false;
125 cpu_relax(); 125 cpu_relax();
126 } 126 }
127 return true; 127 return true;
128 } 128 }
129 129
130 /* 130 /*
131 * css_put() should be called to release a reference taken by 131 * css_put() should be called to release a reference taken by
132 * css_get() or css_tryget() 132 * css_get() or css_tryget()
133 */ 133 */
134 134
135 extern void __css_put(struct cgroup_subsys_state *css, int count); 135 extern void __css_put(struct cgroup_subsys_state *css, int count);
136 static inline void css_put(struct cgroup_subsys_state *css) 136 static inline void css_put(struct cgroup_subsys_state *css)
137 { 137 {
138 if (!test_bit(CSS_ROOT, &css->flags)) 138 if (!test_bit(CSS_ROOT, &css->flags))
139 __css_put(css, 1); 139 __css_put(css, 1);
140 } 140 }
141 141
142 /* bits in struct cgroup flags field */ 142 /* bits in struct cgroup flags field */
143 enum { 143 enum {
144 /* Control Group is dead */ 144 /* Control Group is dead */
145 CGRP_REMOVED, 145 CGRP_REMOVED,
146 /* 146 /*
147 * Control Group has previously had a child cgroup or a task, 147 * Control Group has previously had a child cgroup or a task,
148 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 148 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
149 */ 149 */
150 CGRP_RELEASABLE, 150 CGRP_RELEASABLE,
151 /* Control Group requires release notifications to userspace */ 151 /* Control Group requires release notifications to userspace */
152 CGRP_NOTIFY_ON_RELEASE, 152 CGRP_NOTIFY_ON_RELEASE,
153 /* 153 /*
 154 * A thread in rmdir() is waiting for this cgroup. 154 * A thread in rmdir() is waiting for this cgroup.
155 */ 155 */
156 CGRP_WAIT_ON_RMDIR, 156 CGRP_WAIT_ON_RMDIR,
157 /* 157 /*
158 * Clone cgroup values when creating a new child cgroup 158 * Clone cgroup values when creating a new child cgroup
159 */ 159 */
160 CGRP_CLONE_CHILDREN, 160 CGRP_CLONE_CHILDREN,
161 }; 161 };
162 162
163 /* which pidlist file are we talking about? */ 163 /* which pidlist file are we talking about? */
164 enum cgroup_filetype { 164 enum cgroup_filetype {
165 CGROUP_FILE_PROCS, 165 CGROUP_FILE_PROCS,
166 CGROUP_FILE_TASKS, 166 CGROUP_FILE_TASKS,
167 }; 167 };
168 168
169 /* 169 /*
170 * A pidlist is a list of pids that virtually represents the contents of one 170 * A pidlist is a list of pids that virtually represents the contents of one
171 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 171 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
172 * a pair (one each for procs, tasks) for each pid namespace that's relevant 172 * a pair (one each for procs, tasks) for each pid namespace that's relevant
173 * to the cgroup. 173 * to the cgroup.
174 */ 174 */
175 struct cgroup_pidlist { 175 struct cgroup_pidlist {
176 /* 176 /*
177 * used to find which pidlist is wanted. doesn't change as long as 177 * used to find which pidlist is wanted. doesn't change as long as
178 * this particular list stays in the list. 178 * this particular list stays in the list.
179 */ 179 */
180 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 180 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
181 /* array of xids */ 181 /* array of xids */
182 pid_t *list; 182 pid_t *list;
183 /* how many elements the above list has */ 183 /* how many elements the above list has */
184 int length; 184 int length;
185 /* how many files are using the current array */ 185 /* how many files are using the current array */
186 int use_count; 186 int use_count;
187 /* each of these stored in a list by its cgroup */ 187 /* each of these stored in a list by its cgroup */
188 struct list_head links; 188 struct list_head links;
189 /* pointer to the cgroup we belong to, for list removal purposes */ 189 /* pointer to the cgroup we belong to, for list removal purposes */
190 struct cgroup *owner; 190 struct cgroup *owner;
191 /* protects the other fields */ 191 /* protects the other fields */
192 struct rw_semaphore mutex; 192 struct rw_semaphore mutex;
193 }; 193 };
194 194
195 struct cgroup { 195 struct cgroup {
196 unsigned long flags; /* "unsigned long" so bitops work */ 196 unsigned long flags; /* "unsigned long" so bitops work */
197 197
198 /* 198 /*
199 * count users of this cgroup. >0 means busy, but doesn't 199 * count users of this cgroup. >0 means busy, but doesn't
200 * necessarily indicate the number of tasks in the cgroup 200 * necessarily indicate the number of tasks in the cgroup
201 */ 201 */
202 atomic_t count; 202 atomic_t count;
203 203
204 /* 204 /*
205 * We link our 'sibling' struct into our parent's 'children'. 205 * We link our 'sibling' struct into our parent's 'children'.
206 * Our children link their 'sibling' into our 'children'. 206 * Our children link their 'sibling' into our 'children'.
207 */ 207 */
208 struct list_head sibling; /* my parent's children */ 208 struct list_head sibling; /* my parent's children */
209 struct list_head children; /* my children */ 209 struct list_head children; /* my children */
210 210
211 struct cgroup *parent; /* my parent */ 211 struct cgroup *parent; /* my parent */
212 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ 212 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */
213 213
214 /* Private pointers for each registered subsystem */ 214 /* Private pointers for each registered subsystem */
215 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 215 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
216 216
217 struct cgroupfs_root *root; 217 struct cgroupfs_root *root;
218 struct cgroup *top_cgroup; 218 struct cgroup *top_cgroup;
219 219
220 /* 220 /*
221 * List of cg_cgroup_links pointing at css_sets with 221 * List of cg_cgroup_links pointing at css_sets with
222 * tasks in this cgroup. Protected by css_set_lock 222 * tasks in this cgroup. Protected by css_set_lock
223 */ 223 */
224 struct list_head css_sets; 224 struct list_head css_sets;
225 225
226 /* 226 /*
227 * Linked list running through all cgroups that can 227 * Linked list running through all cgroups that can
228 * potentially be reaped by the release agent. Protected by 228 * potentially be reaped by the release agent. Protected by
229 * release_list_lock 229 * release_list_lock
230 */ 230 */
231 struct list_head release_list; 231 struct list_head release_list;
232 232
233 /* 233 /*
234 * list of pidlists, up to two for each namespace (one for procs, one 234 * list of pidlists, up to two for each namespace (one for procs, one
235 * for tasks); created on demand. 235 * for tasks); created on demand.
236 */ 236 */
237 struct list_head pidlists; 237 struct list_head pidlists;
238 struct mutex pidlist_mutex; 238 struct mutex pidlist_mutex;
239 239
240 /* For RCU-protected deletion */ 240 /* For RCU-protected deletion */
241 struct rcu_head rcu_head; 241 struct rcu_head rcu_head;
242 242
243 /* List of events which userspace want to receive */ 243 /* List of events which userspace want to receive */
244 struct list_head event_list; 244 struct list_head event_list;
245 spinlock_t event_list_lock; 245 spinlock_t event_list_lock;
246 }; 246 };
247 247
248 /* 248 /*
249 * A css_set is a structure holding pointers to a set of 249 * A css_set is a structure holding pointers to a set of
250 * cgroup_subsys_state objects. This saves space in the task struct 250 * cgroup_subsys_state objects. This saves space in the task struct
251 * object and speeds up fork()/exit(), since a single inc/dec and a 251 * object and speeds up fork()/exit(), since a single inc/dec and a
252 * list_add()/del() can bump the reference count on the entire cgroup 252 * list_add()/del() can bump the reference count on the entire cgroup
253 * set for a task. 253 * set for a task.
254 */ 254 */
255 255
256 struct css_set { 256 struct css_set {
257 257
258 /* Reference count */ 258 /* Reference count */
259 atomic_t refcount; 259 atomic_t refcount;
260 260
261 /* 261 /*
262 * List running through all cgroup groups in the same hash 262 * List running through all cgroup groups in the same hash
263 * slot. Protected by css_set_lock 263 * slot. Protected by css_set_lock
264 */ 264 */
265 struct hlist_node hlist; 265 struct hlist_node hlist;
266 266
267 /* 267 /*
268 * List running through all tasks using this cgroup 268 * List running through all tasks using this cgroup
269 * group. Protected by css_set_lock 269 * group. Protected by css_set_lock
270 */ 270 */
271 struct list_head tasks; 271 struct list_head tasks;
272 272
273 /* 273 /*
274 * List of cg_cgroup_link objects on link chains from 274 * List of cg_cgroup_link objects on link chains from
275 * cgroups referenced from this css_set. Protected by 275 * cgroups referenced from this css_set. Protected by
276 * css_set_lock 276 * css_set_lock
277 */ 277 */
278 struct list_head cg_links; 278 struct list_head cg_links;
279 279
280 /* 280 /*
281 * Set of subsystem states, one for each subsystem. This array 281 * Set of subsystem states, one for each subsystem. This array
282 * is immutable after creation apart from the init_css_set 282 * is immutable after creation apart from the init_css_set
283 * during subsystem registration (at boot time) and modular subsystem 283 * during subsystem registration (at boot time) and modular subsystem
284 * loading/unloading. 284 * loading/unloading.
285 */ 285 */
286 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 286 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
287 287
288 /* For RCU-protected deletion */ 288 /* For RCU-protected deletion */
289 struct rcu_head rcu_head; 289 struct rcu_head rcu_head;
290 }; 290 };
291 291
292 /* 292 /*
293 * cgroup_map_cb is an abstract callback API for reporting map-valued 293 * cgroup_map_cb is an abstract callback API for reporting map-valued
294 * control files 294 * control files
295 */ 295 */
296 296
297 struct cgroup_map_cb { 297 struct cgroup_map_cb {
298 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); 298 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
299 void *state; 299 void *state;
300 }; 300 };
301 301
302 /* 302 /*
303 * struct cftype: handler definitions for cgroup control files 303 * struct cftype: handler definitions for cgroup control files
304 * 304 *
305 * When reading/writing to a file: 305 * When reading/writing to a file:
306 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 306 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata
307 * - the 'cftype' of the file is file->f_dentry->d_fsdata 307 * - the 'cftype' of the file is file->f_dentry->d_fsdata
308 */ 308 */
309 309
310 #define MAX_CFTYPE_NAME 64 310 #define MAX_CFTYPE_NAME 64
311 struct cftype { 311 struct cftype {
312 /* 312 /*
313 * By convention, the name should begin with the name of the 313 * By convention, the name should begin with the name of the
314 * subsystem, followed by a period 314 * subsystem, followed by a period
315 */ 315 */
316 char name[MAX_CFTYPE_NAME]; 316 char name[MAX_CFTYPE_NAME];
317 int private; 317 int private;
318 /* 318 /*
319 * If not 0, file mode is set to this value, otherwise it will 319 * If not 0, file mode is set to this value, otherwise it will
320 * be figured out automatically 320 * be figured out automatically
321 */ 321 */
322 mode_t mode; 322 mode_t mode;
323 323
324 /* 324 /*
325 * If non-zero, defines the maximum length of string that can 325 * If non-zero, defines the maximum length of string that can
326 * be passed to write_string; defaults to 64 326 * be passed to write_string; defaults to 64
327 */ 327 */
328 size_t max_write_len; 328 size_t max_write_len;
329 329
330 int (*open)(struct inode *inode, struct file *file); 330 int (*open)(struct inode *inode, struct file *file);
331 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 331 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
332 struct file *file, 332 struct file *file,
333 char __user *buf, size_t nbytes, loff_t *ppos); 333 char __user *buf, size_t nbytes, loff_t *ppos);
334 /* 334 /*
335 * read_u64() is a shortcut for the common case of returning a 335 * read_u64() is a shortcut for the common case of returning a
336 * single integer. Use it in place of read() 336 * single integer. Use it in place of read()
337 */ 337 */
338 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); 338 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft);
339 /* 339 /*
340 * read_s64() is a signed version of read_u64() 340 * read_s64() is a signed version of read_u64()
341 */ 341 */
342 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); 342 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft);
343 /* 343 /*
344 * read_map() is used for defining a map of key/value 344 * read_map() is used for defining a map of key/value
345 * pairs. It should call cb->fill(cb, key, value) for each 345 * pairs. It should call cb->fill(cb, key, value) for each
346 * entry. The key/value pairs (and their ordering) should not 346 * entry. The key/value pairs (and their ordering) should not
347 * change between reboots. 347 * change between reboots.
348 */ 348 */
349 int (*read_map)(struct cgroup *cont, struct cftype *cft, 349 int (*read_map)(struct cgroup *cont, struct cftype *cft,
350 struct cgroup_map_cb *cb); 350 struct cgroup_map_cb *cb);
351 /* 351 /*
352 * read_seq_string() is used for outputting a simple sequence 352 * read_seq_string() is used for outputting a simple sequence
353 * using seqfile. 353 * using seqfile.
354 */ 354 */
355 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, 355 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
356 struct seq_file *m); 356 struct seq_file *m);
357 357
358 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, 358 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
359 struct file *file, 359 struct file *file,
360 const char __user *buf, size_t nbytes, loff_t *ppos); 360 const char __user *buf, size_t nbytes, loff_t *ppos);
361 361
362 /* 362 /*
363 * write_u64() is a shortcut for the common case of accepting 363 * write_u64() is a shortcut for the common case of accepting
364 * a single integer (as parsed by simple_strtoull) from 364 * a single integer (as parsed by simple_strtoull) from
365 * userspace. Use in place of write(); return 0 or error. 365 * userspace. Use in place of write(); return 0 or error.
366 */ 366 */
367 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); 367 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val);
368 /* 368 /*
369 * write_s64() is a signed version of write_u64() 369 * write_s64() is a signed version of write_u64()
370 */ 370 */
371 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); 371 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val);
372 372
373 /* 373 /*
374 * write_string() is passed a nul-terminated kernelspace 374 * write_string() is passed a nul-terminated kernelspace
375 * buffer of maximum length determined by max_write_len. 375 * buffer of maximum length determined by max_write_len.
376 * Returns 0 or -ve error code. 376 * Returns 0 or -ve error code.
377 */ 377 */
378 int (*write_string)(struct cgroup *cgrp, struct cftype *cft, 378 int (*write_string)(struct cgroup *cgrp, struct cftype *cft,
379 const char *buffer); 379 const char *buffer);
380 /* 380 /*
381 * trigger() callback can be used to get some kick from the 381 * trigger() callback can be used to get some kick from the
382 * userspace, when the actual string written is not important 382 * userspace, when the actual string written is not important
383 * at all. The private field can be used to determine the 383 * at all. The private field can be used to determine the
384 * kick type for multiplexing. 384 * kick type for multiplexing.
385 */ 385 */
386 int (*trigger)(struct cgroup *cgrp, unsigned int event); 386 int (*trigger)(struct cgroup *cgrp, unsigned int event);
387 387
388 int (*release)(struct inode *inode, struct file *file); 388 int (*release)(struct inode *inode, struct file *file);
389 389
390 /* 390 /*
391 * register_event() callback will be used to add new userspace 391 * register_event() callback will be used to add new userspace
392 * waiter for changes related to the cftype. Implement it if 392 * waiter for changes related to the cftype. Implement it if
393 * you want to provide this functionality. Use eventfd_signal() 393 * you want to provide this functionality. Use eventfd_signal()
394 * on eventfd to send notification to userspace. 394 * on eventfd to send notification to userspace.
395 */ 395 */
396 int (*register_event)(struct cgroup *cgrp, struct cftype *cft, 396 int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
397 struct eventfd_ctx *eventfd, const char *args); 397 struct eventfd_ctx *eventfd, const char *args);
398 /* 398 /*
399 * unregister_event() callback will be called when userspace 399 * unregister_event() callback will be called when userspace
400 * closes the eventfd or on cgroup removing. 400 * closes the eventfd or on cgroup removing.
 401 * This callback must be implemented if you want to provide 401 * This callback must be implemented if you want to provide
402 * notification functionality. 402 * notification functionality.
403 */ 403 */
404 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, 404 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
405 struct eventfd_ctx *eventfd); 405 struct eventfd_ctx *eventfd);
406 }; 406 };
407 407
408 struct cgroup_scanner { 408 struct cgroup_scanner {
409 struct cgroup *cg; 409 struct cgroup *cg;
410 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); 410 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
411 void (*process_task)(struct task_struct *p, 411 void (*process_task)(struct task_struct *p,
412 struct cgroup_scanner *scan); 412 struct cgroup_scanner *scan);
413 struct ptr_heap *heap; 413 struct ptr_heap *heap;
414 void *data; 414 void *data;
415 }; 415 };
416 416
417 /* 417 /*
418 * Add a new file to the given cgroup directory. Should only be 418 * Add a new file to the given cgroup directory. Should only be
419 * called by subsystems from within a populate() method 419 * called by subsystems from within a populate() method
420 */ 420 */
421 int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 421 int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
422 const struct cftype *cft); 422 const struct cftype *cft);
423 423
424 /* 424 /*
425 * Add a set of new files to the given cgroup directory. Should 425 * Add a set of new files to the given cgroup directory. Should
426 * only be called by subsystems from within a populate() method 426 * only be called by subsystems from within a populate() method
427 */ 427 */
428 int cgroup_add_files(struct cgroup *cgrp, 428 int cgroup_add_files(struct cgroup *cgrp,
429 struct cgroup_subsys *subsys, 429 struct cgroup_subsys *subsys,
430 const struct cftype cft[], 430 const struct cftype cft[],
431 int count); 431 int count);
432 432
433 int cgroup_is_removed(const struct cgroup *cgrp); 433 int cgroup_is_removed(const struct cgroup *cgrp);
434 434
435 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); 435 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
436 436
437 int cgroup_task_count(const struct cgroup *cgrp); 437 int cgroup_task_count(const struct cgroup *cgrp);
438 438
439 /* Return true if cgrp is a descendant of the task's cgroup */ 439 /* Return true if cgrp is a descendant of the task's cgroup */
440 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 440 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
441 441
442 /* 442 /*
443 * When the subsys has to access css and may add permanent refcnt to css, 443 * When the subsys has to access css and may add permanent refcnt to css,
444 * it should take care of racy conditions with rmdir(). Following set of 444 * it should take care of racy conditions with rmdir(). Following set of
445 * functions, is for stop/restart rmdir if necessary. 445 * functions, is for stop/restart rmdir if necessary.
446 * Because these will call css_get/put, "css" should be alive css. 446 * Because these will call css_get/put, "css" should be alive css.
447 * 447 *
448 * cgroup_exclude_rmdir(); 448 * cgroup_exclude_rmdir();
449 * ...do some jobs which may access arbitrary empty cgroup 449 * ...do some jobs which may access arbitrary empty cgroup
450 * cgroup_release_and_wakeup_rmdir(); 450 * cgroup_release_and_wakeup_rmdir();
451 * 451 *
452 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, 452 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
453 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. 453 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
454 */ 454 */
455 455
456 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); 456 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
457 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); 457 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
458 458
459 /* 459 /*
460 * Control Group subsystem type. 460 * Control Group subsystem type.
461 * See Documentation/cgroups/cgroups.txt for details 461 * See Documentation/cgroups/cgroups.txt for details
462 */ 462 */
463 463
464 struct cgroup_subsys { 464 struct cgroup_subsys {
465 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, 465 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
466 struct cgroup *cgrp); 466 struct cgroup *cgrp);
467 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 467 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
468 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 468 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
469 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 469 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
470 struct task_struct *tsk); 470 struct task_struct *tsk);
471 int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 471 int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
472 void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 472 void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
473 struct task_struct *tsk); 473 struct task_struct *tsk);
474 void (*pre_attach)(struct cgroup *cgrp); 474 void (*pre_attach)(struct cgroup *cgrp);
475 void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 475 void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
476 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 476 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
477 struct cgroup *old_cgrp, struct task_struct *tsk); 477 struct cgroup *old_cgrp, struct task_struct *tsk);
478 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); 478 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
479 void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, 479 void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
480 struct cgroup *old_cgrp, struct task_struct *task); 480 struct cgroup *old_cgrp, struct task_struct *task);
481 int (*populate)(struct cgroup_subsys *ss, 481 int (*populate)(struct cgroup_subsys *ss,
482 struct cgroup *cgrp); 482 struct cgroup *cgrp);
483 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); 483 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
484 void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); 484 void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
485 485
486 int subsys_id; 486 int subsys_id;
487 int active; 487 int active;
488 int disabled; 488 int disabled;
489 int early_init; 489 int early_init;
490 /* 490 /*
491 * True if this subsys uses ID. ID is not available before cgroup_init() 491 * True if this subsys uses ID. ID is not available before cgroup_init()
492 * (not available in early_init time.) 492 * (not available in early_init time.)
493 */ 493 */
494 bool use_id; 494 bool use_id;
495 #define MAX_CGROUP_TYPE_NAMELEN 32 495 #define MAX_CGROUP_TYPE_NAMELEN 32
496 const char *name; 496 const char *name;
497 497
498 /* 498 /*
499 * Protects sibling/children links of cgroups in this 499 * Protects sibling/children links of cgroups in this
500 * hierarchy, plus protects which hierarchy (or none) the 500 * hierarchy, plus protects which hierarchy (or none) the
501 * subsystem is a part of (i.e. root/sibling). To avoid 501 * subsystem is a part of (i.e. root/sibling). To avoid
502 * potential deadlocks, the following operations should not be 502 * potential deadlocks, the following operations should not be
503 * undertaken while holding any hierarchy_mutex: 503 * undertaken while holding any hierarchy_mutex:
504 * 504 *
505 * - allocating memory 505 * - allocating memory
506 * - initiating hotplug events 506 * - initiating hotplug events
507 */ 507 */
508 struct mutex hierarchy_mutex; 508 struct mutex hierarchy_mutex;
509 struct lock_class_key subsys_key; 509 struct lock_class_key subsys_key;
510 510
511 /* 511 /*
512 * Link to parent, and list entry in parent's children. 512 * Link to parent, and list entry in parent's children.
513 * Protected by this->hierarchy_mutex and cgroup_lock() 513 * Protected by this->hierarchy_mutex and cgroup_lock()
514 */ 514 */
515 struct cgroupfs_root *root; 515 struct cgroupfs_root *root;
516 struct list_head sibling; 516 struct list_head sibling;
517 /* used when use_id == true */ 517 /* used when use_id == true */
518 struct idr idr; 518 struct idr idr;
519 spinlock_t id_lock; 519 spinlock_t id_lock;
520 520
521 /* should be defined only by modular subsystems */ 521 /* should be defined only by modular subsystems */
522 struct module *module; 522 struct module *module;
523 }; 523 };
524 524
525 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 525 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
526 #include <linux/cgroup_subsys.h> 526 #include <linux/cgroup_subsys.h>
527 #undef SUBSYS 527 #undef SUBSYS
528 528
529 static inline struct cgroup_subsys_state *cgroup_subsys_state( 529 static inline struct cgroup_subsys_state *cgroup_subsys_state(
530 struct cgroup *cgrp, int subsys_id) 530 struct cgroup *cgrp, int subsys_id)
531 { 531 {
532 return cgrp->subsys[subsys_id]; 532 return cgrp->subsys[subsys_id];
533 } 533 }
534 534
535 /* 535 /*
536 * function to get the cgroup_subsys_state which allows for extra 536 * function to get the cgroup_subsys_state which allows for extra
537 * rcu_dereference_check() conditions, such as locks used during the 537 * rcu_dereference_check() conditions, such as locks used during the
538 * cgroup_subsys::attach() methods. 538 * cgroup_subsys::attach() methods.
539 */ 539 */
540 #define task_subsys_state_check(task, subsys_id, __c) \ 540 #define task_subsys_state_check(task, subsys_id, __c) \
541 rcu_dereference_check(task->cgroups->subsys[subsys_id], \ 541 rcu_dereference_check(task->cgroups->subsys[subsys_id], \
542 rcu_read_lock_held() || \
543 lockdep_is_held(&task->alloc_lock) || \ 542 lockdep_is_held(&task->alloc_lock) || \
544 cgroup_lock_is_held() || (__c)) 543 cgroup_lock_is_held() || (__c))
545 544
546 static inline struct cgroup_subsys_state * 545 static inline struct cgroup_subsys_state *
547 task_subsys_state(struct task_struct *task, int subsys_id) 546 task_subsys_state(struct task_struct *task, int subsys_id)
548 { 547 {
549 return task_subsys_state_check(task, subsys_id, false); 548 return task_subsys_state_check(task, subsys_id, false);
550 } 549 }
551 550
552 static inline struct cgroup* task_cgroup(struct task_struct *task, 551 static inline struct cgroup* task_cgroup(struct task_struct *task,
553 int subsys_id) 552 int subsys_id)
554 { 553 {
555 return task_subsys_state(task, subsys_id)->cgroup; 554 return task_subsys_state(task, subsys_id)->cgroup;
556 } 555 }
557 556
558 /* A cgroup_iter should be treated as an opaque object */ 557 /* A cgroup_iter should be treated as an opaque object */
559 struct cgroup_iter { 558 struct cgroup_iter {
560 struct list_head *cg_link; 559 struct list_head *cg_link;
561 struct list_head *task; 560 struct list_head *task;
562 }; 561 };
563 562
564 /* 563 /*
565 * To iterate across the tasks in a cgroup: 564 * To iterate across the tasks in a cgroup:
566 * 565 *
567 * 1) call cgroup_iter_start to initialize an iterator 566 * 1) call cgroup_iter_start to initialize an iterator
568 * 567 *
569 * 2) call cgroup_iter_next() to retrieve member tasks until it 568 * 2) call cgroup_iter_next() to retrieve member tasks until it
570 * returns NULL or until you want to end the iteration 569 * returns NULL or until you want to end the iteration
571 * 570 *
572 * 3) call cgroup_iter_end() to destroy the iterator. 571 * 3) call cgroup_iter_end() to destroy the iterator.
573 * 572 *
574 * Or, call cgroup_scan_tasks() to iterate through every task in a 573 * Or, call cgroup_scan_tasks() to iterate through every task in a
575 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling 574 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling
576 * the test_task() callback, but not while calling the process_task() 575 * the test_task() callback, but not while calling the process_task()
577 * callback. 576 * callback.
578 */ 577 */
579 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); 578 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
580 struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 579 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
581 struct cgroup_iter *it); 580 struct cgroup_iter *it);
582 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); 581 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
583 int cgroup_scan_tasks(struct cgroup_scanner *scan); 582 int cgroup_scan_tasks(struct cgroup_scanner *scan);
584 int cgroup_attach_task(struct cgroup *, struct task_struct *); 583 int cgroup_attach_task(struct cgroup *, struct task_struct *);
585 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 584 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
586 585
587 static inline int cgroup_attach_task_current_cg(struct task_struct *tsk) 586 static inline int cgroup_attach_task_current_cg(struct task_struct *tsk)
588 { 587 {
589 return cgroup_attach_task_all(current, tsk); 588 return cgroup_attach_task_all(current, tsk);
590 } 589 }
591 590
592 /* 591 /*
593 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works 592 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
594 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. 593 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
595 * CSS ID is assigned at cgroup allocation (create) automatically 594 * CSS ID is assigned at cgroup allocation (create) automatically
596 * and removed when subsys calls free_css_id() function. This is because 595 * and removed when subsys calls free_css_id() function. This is because
597 * the lifetime of cgroup_subsys_state is subsys's matter. 596 * the lifetime of cgroup_subsys_state is subsys's matter.
598 * 597 *
599 * Looking up and scanning function should be called under rcu_read_lock(). 598 * Looking up and scanning function should be called under rcu_read_lock().
600 * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. 599 * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
601 * But the css returned by this routine can be "not populated yet" or "being 600 * But the css returned by this routine can be "not populated yet" or "being
602 * destroyed". The caller should check css and cgroup's status. 601 * destroyed". The caller should check css and cgroup's status.
603 */ 602 */
604 603
605 /* 604 /*
606 * Typically Called at ->destroy(), or somewhere the subsys frees 605 * Typically Called at ->destroy(), or somewhere the subsys frees
607 * cgroup_subsys_state. 606 * cgroup_subsys_state.
608 */ 607 */
609 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); 608 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
610 609
611 /* Find a cgroup_subsys_state which has given ID */ 610 /* Find a cgroup_subsys_state which has given ID */
612 611
613 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); 612 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
614 613
615 /* 614 /*
616 * Get a cgroup whose id is greater than or equal to id under tree of root. 615 * Get a cgroup whose id is greater than or equal to id under tree of root.
617 * Returning a cgroup_subsys_state or NULL. 616 * Returning a cgroup_subsys_state or NULL.
618 */ 617 */
619 struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, 618 struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
620 struct cgroup_subsys_state *root, int *foundid); 619 struct cgroup_subsys_state *root, int *foundid);
621 620
622 /* Returns true if root is ancestor of cg */ 621 /* Returns true if root is ancestor of cg */
623 bool css_is_ancestor(struct cgroup_subsys_state *cg, 622 bool css_is_ancestor(struct cgroup_subsys_state *cg,
624 const struct cgroup_subsys_state *root); 623 const struct cgroup_subsys_state *root);
625 624
626 /* Get id and depth of css */ 625 /* Get id and depth of css */
627 unsigned short css_id(struct cgroup_subsys_state *css); 626 unsigned short css_id(struct cgroup_subsys_state *css);
628 unsigned short css_depth(struct cgroup_subsys_state *css); 627 unsigned short css_depth(struct cgroup_subsys_state *css);
629 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); 628 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
630 629
631 #else /* !CONFIG_CGROUPS */ 630 #else /* !CONFIG_CGROUPS */
632 631
633 static inline int cgroup_init_early(void) { return 0; } 632 static inline int cgroup_init_early(void) { return 0; }
634 static inline int cgroup_init(void) { return 0; } 633 static inline int cgroup_init(void) { return 0; }
635 static inline void cgroup_fork(struct task_struct *p) {} 634 static inline void cgroup_fork(struct task_struct *p) {}
636 static inline void cgroup_fork_callbacks(struct task_struct *p) {} 635 static inline void cgroup_fork_callbacks(struct task_struct *p) {}
637 static inline void cgroup_post_fork(struct task_struct *p) {} 636 static inline void cgroup_post_fork(struct task_struct *p) {}
638 static inline void cgroup_exit(struct task_struct *p, int callbacks) {} 637 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
639 638
640 static inline void cgroup_lock(void) {} 639 static inline void cgroup_lock(void) {}
641 static inline void cgroup_unlock(void) {} 640 static inline void cgroup_unlock(void) {}
642 static inline int cgroupstats_build(struct cgroupstats *stats, 641 static inline int cgroupstats_build(struct cgroupstats *stats,
643 struct dentry *dentry) 642 struct dentry *dentry)
644 { 643 {
645 return -EINVAL; 644 return -EINVAL;
646 } 645 }
647 646
648 /* No cgroups - nothing to do */ 647 /* No cgroups - nothing to do */
649 static inline int cgroup_attach_task_all(struct task_struct *from, 648 static inline int cgroup_attach_task_all(struct task_struct *from,
650 struct task_struct *t) 649 struct task_struct *t)
651 { 650 {
652 return 0; 651 return 0;
653 } 652 }
654 static inline int cgroup_attach_task_current_cg(struct task_struct *t) 653 static inline int cgroup_attach_task_current_cg(struct task_struct *t)
655 { 654 {
656 return 0; 655 return 0;
657 } 656 }
658 657
659 #endif /* !CONFIG_CGROUPS */ 658 #endif /* !CONFIG_CGROUPS */
660 659
661 #endif /* _LINUX_CGROUP_H */ 660 #endif /* _LINUX_CGROUP_H */
662 661
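
With rcu_read_lock_held() implied, a subsystem that wants an extra lockdep
condition for task_subsys_state_check() only has to name its own lock. A
hypothetical caller is sketched below; my_subsys_id and my_subsys_mutex are
invented for illustration and are not part of this commit.

#include <linux/cgroup.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>

#define my_subsys_id 0	/* placeholder for a real cgroup_subsys_id value */

static DEFINE_MUTEX(my_subsys_mutex);

static struct cgroup_subsys_state *my_subsys_css(struct task_struct *task)
{
	/*
	 * The dereference is valid under rcu_read_lock(), task->alloc_lock,
	 * cgroup_lock(), or my_subsys_mutex; rcu_read_lock_held() no longer
	 * has to be spelled out here.
	 */
	return task_subsys_state_check(task, my_subsys_id,
				       lockdep_is_held(&my_subsys_mutex));
}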
include/linux/cred.h
1 /* Credentials management - see Documentation/security/credentials.txt 1 /* Credentials management - see Documentation/security/credentials.txt
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #ifndef _LINUX_CRED_H 12 #ifndef _LINUX_CRED_H
13 #define _LINUX_CRED_H 13 #define _LINUX_CRED_H
14 14
15 #include <linux/capability.h> 15 #include <linux/capability.h>
16 #include <linux/init.h> 16 #include <linux/init.h>
17 #include <linux/key.h> 17 #include <linux/key.h>
18 #include <linux/selinux.h> 18 #include <linux/selinux.h>
19 #include <asm/atomic.h> 19 #include <asm/atomic.h>
20 20
21 struct user_struct; 21 struct user_struct;
22 struct cred; 22 struct cred;
23 struct inode; 23 struct inode;
24 24
25 /* 25 /*
26 * COW Supplementary groups list 26 * COW Supplementary groups list
27 */ 27 */
28 #define NGROUPS_SMALL 32 28 #define NGROUPS_SMALL 32
29 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) 29 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
30 30
31 struct group_info { 31 struct group_info {
32 atomic_t usage; 32 atomic_t usage;
33 int ngroups; 33 int ngroups;
34 int nblocks; 34 int nblocks;
35 gid_t small_block[NGROUPS_SMALL]; 35 gid_t small_block[NGROUPS_SMALL];
36 gid_t *blocks[0]; 36 gid_t *blocks[0];
37 }; 37 };
38 38
39 /** 39 /**
40 * get_group_info - Get a reference to a group info structure 40 * get_group_info - Get a reference to a group info structure
41 * @group_info: The group info to reference 41 * @group_info: The group info to reference
42 * 42 *
43 * This gets a reference to a set of supplementary groups. 43 * This gets a reference to a set of supplementary groups.
44 * 44 *
45 * If the caller is accessing a task's credentials, they must hold the RCU read 45 * If the caller is accessing a task's credentials, they must hold the RCU read
46 * lock when reading. 46 * lock when reading.
47 */ 47 */
48 static inline struct group_info *get_group_info(struct group_info *gi) 48 static inline struct group_info *get_group_info(struct group_info *gi)
49 { 49 {
50 atomic_inc(&gi->usage); 50 atomic_inc(&gi->usage);
51 return gi; 51 return gi;
52 } 52 }
53 53
54 /** 54 /**
55 * put_group_info - Release a reference to a group info structure 55 * put_group_info - Release a reference to a group info structure
56 * @group_info: The group info to release 56 * @group_info: The group info to release
57 */ 57 */
58 #define put_group_info(group_info) \ 58 #define put_group_info(group_info) \
59 do { \ 59 do { \
60 if (atomic_dec_and_test(&(group_info)->usage)) \ 60 if (atomic_dec_and_test(&(group_info)->usage)) \
61 groups_free(group_info); \ 61 groups_free(group_info); \
62 } while (0) 62 } while (0)
63 63
64 extern struct group_info *groups_alloc(int); 64 extern struct group_info *groups_alloc(int);
65 extern struct group_info init_groups; 65 extern struct group_info init_groups;
66 extern void groups_free(struct group_info *); 66 extern void groups_free(struct group_info *);
67 extern int set_current_groups(struct group_info *); 67 extern int set_current_groups(struct group_info *);
68 extern int set_groups(struct cred *, struct group_info *); 68 extern int set_groups(struct cred *, struct group_info *);
69 extern int groups_search(const struct group_info *, gid_t); 69 extern int groups_search(const struct group_info *, gid_t);
70 70
71 /* access the groups "array" with this macro */ 71 /* access the groups "array" with this macro */
72 #define GROUP_AT(gi, i) \ 72 #define GROUP_AT(gi, i) \
73 ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) 73 ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
74 74
75 extern int in_group_p(gid_t); 75 extern int in_group_p(gid_t);
76 extern int in_egroup_p(gid_t); 76 extern int in_egroup_p(gid_t);
77 77
78 /* 78 /*
79 * The common credentials for a thread group 79 * The common credentials for a thread group
80 * - shared by CLONE_THREAD 80 * - shared by CLONE_THREAD
81 */ 81 */
82 #ifdef CONFIG_KEYS 82 #ifdef CONFIG_KEYS
83 struct thread_group_cred { 83 struct thread_group_cred {
84 atomic_t usage; 84 atomic_t usage;
85 pid_t tgid; /* thread group process ID */ 85 pid_t tgid; /* thread group process ID */
86 spinlock_t lock; 86 spinlock_t lock;
87 struct key __rcu *session_keyring; /* keyring inherited over fork */ 87 struct key __rcu *session_keyring; /* keyring inherited over fork */
88 struct key *process_keyring; /* keyring private to this process */ 88 struct key *process_keyring; /* keyring private to this process */
89 struct rcu_head rcu; /* RCU deletion hook */ 89 struct rcu_head rcu; /* RCU deletion hook */
90 }; 90 };
91 #endif 91 #endif
92 92
93 /* 93 /*
94 * The security context of a task 94 * The security context of a task
95 * 95 *
96 * The parts of the context break down into two categories: 96 * The parts of the context break down into two categories:
97 * 97 *
98 * (1) The objective context of a task. These parts are used when some other 98 * (1) The objective context of a task. These parts are used when some other
99 * task is attempting to affect this one. 99 * task is attempting to affect this one.
100 * 100 *
101 * (2) The subjective context. These details are used when the task is acting 101 * (2) The subjective context. These details are used when the task is acting
102 * upon another object, be that a file, a task, a key or whatever. 102 * upon another object, be that a file, a task, a key or whatever.
103 * 103 *
104 * Note that some members of this structure belong to both categories - the 104 * Note that some members of this structure belong to both categories - the
105 * LSM security pointer for instance. 105 * LSM security pointer for instance.
106 * 106 *
107 * A task has two security pointers. task->real_cred points to the objective 107 * A task has two security pointers. task->real_cred points to the objective
108 * context that defines that task's actual details. The objective part of this 108 * context that defines that task's actual details. The objective part of this
109 * context is used whenever that task is acted upon. 109 * context is used whenever that task is acted upon.
110 * 110 *
111 * task->cred points to the subjective context that defines the details of how 111 * task->cred points to the subjective context that defines the details of how
112 * that task is going to act upon another object. This may be overridden 112 * that task is going to act upon another object. This may be overridden
113 * temporarily to point to another security context, but normally points to the 113 * temporarily to point to another security context, but normally points to the
114 * same context as task->real_cred. 114 * same context as task->real_cred.
115 */ 115 */
116 struct cred { 116 struct cred {
117 atomic_t usage; 117 atomic_t usage;
118 #ifdef CONFIG_DEBUG_CREDENTIALS 118 #ifdef CONFIG_DEBUG_CREDENTIALS
119 atomic_t subscribers; /* number of processes subscribed */ 119 atomic_t subscribers; /* number of processes subscribed */
120 void *put_addr; 120 void *put_addr;
121 unsigned magic; 121 unsigned magic;
122 #define CRED_MAGIC 0x43736564 122 #define CRED_MAGIC 0x43736564
123 #define CRED_MAGIC_DEAD 0x44656144 123 #define CRED_MAGIC_DEAD 0x44656144
124 #endif 124 #endif
125 uid_t uid; /* real UID of the task */ 125 uid_t uid; /* real UID of the task */
126 gid_t gid; /* real GID of the task */ 126 gid_t gid; /* real GID of the task */
127 uid_t suid; /* saved UID of the task */ 127 uid_t suid; /* saved UID of the task */
128 gid_t sgid; /* saved GID of the task */ 128 gid_t sgid; /* saved GID of the task */
129 uid_t euid; /* effective UID of the task */ 129 uid_t euid; /* effective UID of the task */
130 gid_t egid; /* effective GID of the task */ 130 gid_t egid; /* effective GID of the task */
131 uid_t fsuid; /* UID for VFS ops */ 131 uid_t fsuid; /* UID for VFS ops */
132 gid_t fsgid; /* GID for VFS ops */ 132 gid_t fsgid; /* GID for VFS ops */
133 unsigned securebits; /* SUID-less security management */ 133 unsigned securebits; /* SUID-less security management */
134 kernel_cap_t cap_inheritable; /* caps our children can inherit */ 134 kernel_cap_t cap_inheritable; /* caps our children can inherit */
135 kernel_cap_t cap_permitted; /* caps we're permitted */ 135 kernel_cap_t cap_permitted; /* caps we're permitted */
136 kernel_cap_t cap_effective; /* caps we can actually use */ 136 kernel_cap_t cap_effective; /* caps we can actually use */
137 kernel_cap_t cap_bset; /* capability bounding set */ 137 kernel_cap_t cap_bset; /* capability bounding set */
138 #ifdef CONFIG_KEYS 138 #ifdef CONFIG_KEYS
139 unsigned char jit_keyring; /* default keyring to attach requested 139 unsigned char jit_keyring; /* default keyring to attach requested
140 * keys to */ 140 * keys to */
141 struct key *thread_keyring; /* keyring private to this thread */ 141 struct key *thread_keyring; /* keyring private to this thread */
142 struct key *request_key_auth; /* assumed request_key authority */ 142 struct key *request_key_auth; /* assumed request_key authority */
143 struct thread_group_cred *tgcred; /* thread-group shared credentials */ 143 struct thread_group_cred *tgcred; /* thread-group shared credentials */
144 #endif 144 #endif
145 #ifdef CONFIG_SECURITY 145 #ifdef CONFIG_SECURITY
146 void *security; /* subjective LSM security */ 146 void *security; /* subjective LSM security */
147 #endif 147 #endif
148 struct user_struct *user; /* real user ID subscription */ 148 struct user_struct *user; /* real user ID subscription */
149 struct user_namespace *user_ns; /* cached user->user_ns */ 149 struct user_namespace *user_ns; /* cached user->user_ns */
150 struct group_info *group_info; /* supplementary groups for euid/fsgid */ 150 struct group_info *group_info; /* supplementary groups for euid/fsgid */
151 struct rcu_head rcu; /* RCU deletion hook */ 151 struct rcu_head rcu; /* RCU deletion hook */
152 }; 152 };
153 153
154 extern void __put_cred(struct cred *); 154 extern void __put_cred(struct cred *);
155 extern void exit_creds(struct task_struct *); 155 extern void exit_creds(struct task_struct *);
156 extern int copy_creds(struct task_struct *, unsigned long); 156 extern int copy_creds(struct task_struct *, unsigned long);
157 extern const struct cred *get_task_cred(struct task_struct *); 157 extern const struct cred *get_task_cred(struct task_struct *);
158 extern struct cred *cred_alloc_blank(void); 158 extern struct cred *cred_alloc_blank(void);
159 extern struct cred *prepare_creds(void); 159 extern struct cred *prepare_creds(void);
160 extern struct cred *prepare_exec_creds(void); 160 extern struct cred *prepare_exec_creds(void);
161 extern int commit_creds(struct cred *); 161 extern int commit_creds(struct cred *);
162 extern void abort_creds(struct cred *); 162 extern void abort_creds(struct cred *);
163 extern const struct cred *override_creds(const struct cred *); 163 extern const struct cred *override_creds(const struct cred *);
164 extern void revert_creds(const struct cred *); 164 extern void revert_creds(const struct cred *);
165 extern struct cred *prepare_kernel_cred(struct task_struct *); 165 extern struct cred *prepare_kernel_cred(struct task_struct *);
166 extern int change_create_files_as(struct cred *, struct inode *); 166 extern int change_create_files_as(struct cred *, struct inode *);
167 extern int set_security_override(struct cred *, u32); 167 extern int set_security_override(struct cred *, u32);
168 extern int set_security_override_from_ctx(struct cred *, const char *); 168 extern int set_security_override_from_ctx(struct cred *, const char *);
169 extern int set_create_files_as(struct cred *, struct inode *); 169 extern int set_create_files_as(struct cred *, struct inode *);
170 extern void __init cred_init(void); 170 extern void __init cred_init(void);
171 171
172 /* 172 /*
173 * check for validity of credentials 173 * check for validity of credentials
174 */ 174 */
175 #ifdef CONFIG_DEBUG_CREDENTIALS 175 #ifdef CONFIG_DEBUG_CREDENTIALS
176 extern void __invalid_creds(const struct cred *, const char *, unsigned); 176 extern void __invalid_creds(const struct cred *, const char *, unsigned);
177 extern void __validate_process_creds(struct task_struct *, 177 extern void __validate_process_creds(struct task_struct *,
178 const char *, unsigned); 178 const char *, unsigned);
179 179
180 extern bool creds_are_invalid(const struct cred *cred); 180 extern bool creds_are_invalid(const struct cred *cred);
181 181
182 static inline void __validate_creds(const struct cred *cred, 182 static inline void __validate_creds(const struct cred *cred,
183 const char *file, unsigned line) 183 const char *file, unsigned line)
184 { 184 {
185 if (unlikely(creds_are_invalid(cred))) 185 if (unlikely(creds_are_invalid(cred)))
186 __invalid_creds(cred, file, line); 186 __invalid_creds(cred, file, line);
187 } 187 }
188 188
189 #define validate_creds(cred) \ 189 #define validate_creds(cred) \
190 do { \ 190 do { \
191 __validate_creds((cred), __FILE__, __LINE__); \ 191 __validate_creds((cred), __FILE__, __LINE__); \
192 } while(0) 192 } while(0)
193 193
194 #define validate_process_creds() \ 194 #define validate_process_creds() \
195 do { \ 195 do { \
196 __validate_process_creds(current, __FILE__, __LINE__); \ 196 __validate_process_creds(current, __FILE__, __LINE__); \
197 } while(0) 197 } while(0)
198 198
199 extern void validate_creds_for_do_exit(struct task_struct *); 199 extern void validate_creds_for_do_exit(struct task_struct *);
200 #else 200 #else
201 static inline void validate_creds(const struct cred *cred) 201 static inline void validate_creds(const struct cred *cred)
202 { 202 {
203 } 203 }
204 static inline void validate_creds_for_do_exit(struct task_struct *tsk) 204 static inline void validate_creds_for_do_exit(struct task_struct *tsk)
205 { 205 {
206 } 206 }
207 static inline void validate_process_creds(void) 207 static inline void validate_process_creds(void)
208 { 208 {
209 } 209 }
210 #endif 210 #endif
211 211
212 /** 212 /**
213 * get_new_cred - Get a reference on a new set of credentials 213 * get_new_cred - Get a reference on a new set of credentials
214 * @cred: The new credentials to reference 214 * @cred: The new credentials to reference
215 * 215 *
216 * Get a reference on the specified set of new credentials. The caller must 216 * Get a reference on the specified set of new credentials. The caller must
217 * release the reference. 217 * release the reference.
218 */ 218 */
219 static inline struct cred *get_new_cred(struct cred *cred) 219 static inline struct cred *get_new_cred(struct cred *cred)
220 { 220 {
221 atomic_inc(&cred->usage); 221 atomic_inc(&cred->usage);
222 return cred; 222 return cred;
223 } 223 }
224 224
225 /** 225 /**
226 * get_cred - Get a reference on a set of credentials 226 * get_cred - Get a reference on a set of credentials
227 * @cred: The credentials to reference 227 * @cred: The credentials to reference
228 * 228 *
229 * Get a reference on the specified set of credentials. The caller must 229 * Get a reference on the specified set of credentials. The caller must
230 * release the reference. 230 * release the reference.
231 * 231 *
232 * This is used to deal with a committed set of credentials. Although the 232 * This is used to deal with a committed set of credentials. Although the
233 * pointer is const, this will temporarily discard the const and increment the 233 * pointer is const, this will temporarily discard the const and increment the
234 * usage count. The purpose of this is to attempt to catch at compile time the 234 * usage count. The purpose of this is to attempt to catch at compile time the
235 * accidental alteration of a set of credentials that should be considered 235 * accidental alteration of a set of credentials that should be considered
236 * immutable. 236 * immutable.
237 */ 237 */
238 static inline const struct cred *get_cred(const struct cred *cred) 238 static inline const struct cred *get_cred(const struct cred *cred)
239 { 239 {
240 struct cred *nonconst_cred = (struct cred *) cred; 240 struct cred *nonconst_cred = (struct cred *) cred;
241 validate_creds(cred); 241 validate_creds(cred);
242 return get_new_cred(nonconst_cred); 242 return get_new_cred(nonconst_cred);
243 } 243 }
244 244
245 /** 245 /**
246 * put_cred - Release a reference to a set of credentials 246 * put_cred - Release a reference to a set of credentials
247 * @cred: The credentials to release 247 * @cred: The credentials to release
248 * 248 *
249 * Release a reference to a set of credentials, deleting them when the last ref 249 * Release a reference to a set of credentials, deleting them when the last ref
250 * is released. 250 * is released.
251 * 251 *
252 * This takes a const pointer to a set of credentials because the credentials 252 * This takes a const pointer to a set of credentials because the credentials
253 * on task_struct are attached by const pointers to prevent accidental 253 * on task_struct are attached by const pointers to prevent accidental
254 * alteration of otherwise immutable credential sets. 254 * alteration of otherwise immutable credential sets.
255 */ 255 */
256 static inline void put_cred(const struct cred *_cred) 256 static inline void put_cred(const struct cred *_cred)
257 { 257 {
258 struct cred *cred = (struct cred *) _cred; 258 struct cred *cred = (struct cred *) _cred;
259 259
260 validate_creds(cred); 260 validate_creds(cred);
261 if (atomic_dec_and_test(&(cred)->usage)) 261 if (atomic_dec_and_test(&(cred)->usage))
262 __put_cred(cred); 262 __put_cred(cred);
263 } 263 }
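
Editorial note, not part of this diff: a minimal sketch of the reference pattern the get_cred()/put_cred() helpers above imply — pin a credential set while it is held outside any lock, drop it when done. The helper names stash_creds()/drop_stashed_creds() and the stashed_cred pointer are illustrative only.

#include <linux/cred.h>
#include <linux/sched.h>

/* Illustrative only: pin the current task's credentials for later use. */
static const struct cred *stashed_cred;

static void stash_creds(void)
{
	/* get_cred() bumps ->usage, so the cred set cannot be freed under us. */
	stashed_cred = get_cred(current_cred());
}

static void drop_stashed_creds(void)
{
	/* put_cred() drops the reference; the final put calls __put_cred(). */
	put_cred(stashed_cred);
	stashed_cred = NULL;
}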
264 264
265 /** 265 /**
266 * current_cred - Access the current task's subjective credentials 266 * current_cred - Access the current task's subjective credentials
267 * 267 *
268 * Access the subjective credentials of the current task. 268 * Access the subjective credentials of the current task.
269 */ 269 */
270 #define current_cred() \ 270 #define current_cred() \
271 (current->cred) 271 (current->cred)
272 272
273 /** 273 /**
274 * __task_cred - Access a task's objective credentials 274 * __task_cred - Access a task's objective credentials
275 * @task: The task to query 275 * @task: The task to query
276 * 276 *
277 * Access the objective credentials of a task. The caller must hold the RCU 277 * Access the objective credentials of a task. The caller must hold the RCU
278 * read lock or the task must be dead and unable to change its own credentials. 278 * read lock or the task must be dead and unable to change its own credentials.
279 * 279 *
280 * The result of this function should not be passed directly to get_cred(); 280 * The result of this function should not be passed directly to get_cred();
281 * rather get_task_cred() should be used instead. 281 * rather get_task_cred() should be used instead.
282 */ 282 */
283 #define __task_cred(task) \ 283 #define __task_cred(task) \
284 ({ \ 284 ({ \
285 const struct task_struct *__t = (task); \ 285 const struct task_struct *__t = (task); \
286 rcu_dereference_check(__t->real_cred, \ 286 rcu_dereference_check(__t->real_cred, \
287 rcu_read_lock_held() || \
288 task_is_dead(__t)); \ 287 task_is_dead(__t)); \
289 }) 288 })
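
Editorial note, not part of this commit: a minimal sketch of how __task_cred() is normally reached — the caller brackets the dereference with rcu_read_lock()/rcu_read_unlock(), which is exactly the condition rcu_dereference_check() now verifies implicitly. The helper name task_uid_of() is hypothetical; the task_cred_xxx()/task_uid() macros further down expand to essentially the same pattern.

#include <linux/cred.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical helper: read a task's real uid under the RCU read lock. */
static uid_t task_uid_of(struct task_struct *task)
{
	uid_t uid;

	rcu_read_lock();
	/* Safe: we hold the RCU read lock, so the cred set cannot be freed. */
	uid = __task_cred(task)->uid;
	rcu_read_unlock();

	return uid;
}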
290 289
291 /** 290 /**
292 * get_current_cred - Get the current task's subjective credentials 291 * get_current_cred - Get the current task's subjective credentials
293 * 292 *
294 * Get the subjective credentials of the current task, pinning them so that 293 * Get the subjective credentials of the current task, pinning them so that
295 * they can't go away. Accessing the current task's credentials directly is 294 * they can't go away. Accessing the current task's credentials directly is
296 * not permitted. 295 * not permitted.
297 */ 296 */
298 #define get_current_cred() \ 297 #define get_current_cred() \
299 (get_cred(current_cred())) 298 (get_cred(current_cred()))
300 299
301 /** 300 /**
302 * get_current_user - Get the current task's user_struct 301 * get_current_user - Get the current task's user_struct
303 * 302 *
304 * Get the user record of the current task, pinning it so that it can't go 303 * Get the user record of the current task, pinning it so that it can't go
305 * away. 304 * away.
306 */ 305 */
307 #define get_current_user() \ 306 #define get_current_user() \
308 ({ \ 307 ({ \
309 struct user_struct *__u; \ 308 struct user_struct *__u; \
310 struct cred *__cred; \ 309 struct cred *__cred; \
311 __cred = (struct cred *) current_cred(); \ 310 __cred = (struct cred *) current_cred(); \
312 __u = get_uid(__cred->user); \ 311 __u = get_uid(__cred->user); \
313 __u; \ 312 __u; \
314 }) 313 })
315 314
316 /** 315 /**
317 * get_current_groups - Get the current task's supplementary group list 316 * get_current_groups - Get the current task's supplementary group list
318 * 317 *
319 * Get the supplementary group list of the current task, pinning it so that it 318 * Get the supplementary group list of the current task, pinning it so that it
320 * can't go away. 319 * can't go away.
321 */ 320 */
322 #define get_current_groups() \ 321 #define get_current_groups() \
323 ({ \ 322 ({ \
324 struct group_info *__groups; \ 323 struct group_info *__groups; \
325 struct cred *__cred; \ 324 struct cred *__cred; \
326 __cred = (struct cred *) current_cred(); \ 325 __cred = (struct cred *) current_cred(); \
327 __groups = get_group_info(__cred->group_info); \ 326 __groups = get_group_info(__cred->group_info); \
328 __groups; \ 327 __groups; \
329 }) 328 })
330 329
331 #define task_cred_xxx(task, xxx) \ 330 #define task_cred_xxx(task, xxx) \
332 ({ \ 331 ({ \
333 __typeof__(((struct cred *)NULL)->xxx) ___val; \ 332 __typeof__(((struct cred *)NULL)->xxx) ___val; \
334 rcu_read_lock(); \ 333 rcu_read_lock(); \
335 ___val = __task_cred((task))->xxx; \ 334 ___val = __task_cred((task))->xxx; \
336 rcu_read_unlock(); \ 335 rcu_read_unlock(); \
337 ___val; \ 336 ___val; \
338 }) 337 })
339 338
340 #define task_uid(task) (task_cred_xxx((task), uid)) 339 #define task_uid(task) (task_cred_xxx((task), uid))
341 #define task_euid(task) (task_cred_xxx((task), euid)) 340 #define task_euid(task) (task_cred_xxx((task), euid))
342 341
343 #define current_cred_xxx(xxx) \ 342 #define current_cred_xxx(xxx) \
344 ({ \ 343 ({ \
345 current->cred->xxx; \ 344 current->cred->xxx; \
346 }) 345 })
347 346
348 #define current_uid() (current_cred_xxx(uid)) 347 #define current_uid() (current_cred_xxx(uid))
349 #define current_gid() (current_cred_xxx(gid)) 348 #define current_gid() (current_cred_xxx(gid))
350 #define current_euid() (current_cred_xxx(euid)) 349 #define current_euid() (current_cred_xxx(euid))
351 #define current_egid() (current_cred_xxx(egid)) 350 #define current_egid() (current_cred_xxx(egid))
352 #define current_suid() (current_cred_xxx(suid)) 351 #define current_suid() (current_cred_xxx(suid))
353 #define current_sgid() (current_cred_xxx(sgid)) 352 #define current_sgid() (current_cred_xxx(sgid))
354 #define current_fsuid() (current_cred_xxx(fsuid)) 353 #define current_fsuid() (current_cred_xxx(fsuid))
355 #define current_fsgid() (current_cred_xxx(fsgid)) 354 #define current_fsgid() (current_cred_xxx(fsgid))
356 #define current_cap() (current_cred_xxx(cap_effective)) 355 #define current_cap() (current_cred_xxx(cap_effective))
357 #define current_user() (current_cred_xxx(user)) 356 #define current_user() (current_cred_xxx(user))
358 #define current_security() (current_cred_xxx(security)) 357 #define current_security() (current_cred_xxx(security))
359 358
360 #ifdef CONFIG_USER_NS 359 #ifdef CONFIG_USER_NS
361 #define current_user_ns() (current_cred_xxx(user_ns)) 360 #define current_user_ns() (current_cred_xxx(user_ns))
362 #else 361 #else
363 extern struct user_namespace init_user_ns; 362 extern struct user_namespace init_user_ns;
364 #define current_user_ns() (&init_user_ns) 363 #define current_user_ns() (&init_user_ns)
365 #endif 364 #endif
366 365
367 366
368 #define current_uid_gid(_uid, _gid) \ 367 #define current_uid_gid(_uid, _gid) \
369 do { \ 368 do { \
370 const struct cred *__cred; \ 369 const struct cred *__cred; \
371 __cred = current_cred(); \ 370 __cred = current_cred(); \
372 *(_uid) = __cred->uid; \ 371 *(_uid) = __cred->uid; \
373 *(_gid) = __cred->gid; \ 372 *(_gid) = __cred->gid; \
374 } while(0) 373 } while(0)
375 374
376 #define current_euid_egid(_euid, _egid) \ 375 #define current_euid_egid(_euid, _egid) \
377 do { \ 376 do { \
378 const struct cred *__cred; \ 377 const struct cred *__cred; \
379 __cred = current_cred(); \ 378 __cred = current_cred(); \
380 *(_euid) = __cred->euid; \ 379 *(_euid) = __cred->euid; \
381 *(_egid) = __cred->egid; \ 380 *(_egid) = __cred->egid; \
382 } while(0) 381 } while(0)
383 382
384 #define current_fsuid_fsgid(_fsuid, _fsgid) \ 383 #define current_fsuid_fsgid(_fsuid, _fsgid) \
385 do { \ 384 do { \
386 const struct cred *__cred; \ 385 const struct cred *__cred; \
387 __cred = current_cred(); \ 386 __cred = current_cred(); \
388 *(_fsuid) = __cred->fsuid; \ 387 *(_fsuid) = __cred->fsuid; \
389 *(_fsgid) = __cred->fsgid; \ 388 *(_fsgid) = __cred->fsgid; \
390 } while(0) 389 } while(0)
391 390
392 #endif /* _LINUX_CRED_H */ 391 #endif /* _LINUX_CRED_H */
393 392
include/linux/fdtable.h
1 /* 1 /*
2 * descriptor table internals; you almost certainly want file.h instead. 2 * descriptor table internals; you almost certainly want file.h instead.
3 */ 3 */
4 4
5 #ifndef __LINUX_FDTABLE_H 5 #ifndef __LINUX_FDTABLE_H
6 #define __LINUX_FDTABLE_H 6 #define __LINUX_FDTABLE_H
7 7
8 #include <linux/posix_types.h> 8 #include <linux/posix_types.h>
9 #include <linux/compiler.h> 9 #include <linux/compiler.h>
10 #include <linux/spinlock.h> 10 #include <linux/spinlock.h>
11 #include <linux/rcupdate.h> 11 #include <linux/rcupdate.h>
12 #include <linux/types.h> 12 #include <linux/types.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/fs.h> 14 #include <linux/fs.h>
15 15
16 #include <asm/atomic.h> 16 #include <asm/atomic.h>
17 17
18 /* 18 /*
19 * The default fd array needs to be at least BITS_PER_LONG, 19 * The default fd array needs to be at least BITS_PER_LONG,
20 * as this is the granularity returned by copy_fdset(). 20 * as this is the granularity returned by copy_fdset().
21 */ 21 */
22 #define NR_OPEN_DEFAULT BITS_PER_LONG 22 #define NR_OPEN_DEFAULT BITS_PER_LONG
23 23
24 /* 24 /*
25 * The embedded_fd_set is a small fd_set, 25 * The embedded_fd_set is a small fd_set,
26 * suitable for most tasks (which open <= BITS_PER_LONG files) 26 * suitable for most tasks (which open <= BITS_PER_LONG files)
27 */ 27 */
28 struct embedded_fd_set { 28 struct embedded_fd_set {
29 unsigned long fds_bits[1]; 29 unsigned long fds_bits[1];
30 }; 30 };
31 31
32 struct fdtable { 32 struct fdtable {
33 unsigned int max_fds; 33 unsigned int max_fds;
34 struct file __rcu **fd; /* current fd array */ 34 struct file __rcu **fd; /* current fd array */
35 fd_set *close_on_exec; 35 fd_set *close_on_exec;
36 fd_set *open_fds; 36 fd_set *open_fds;
37 struct rcu_head rcu; 37 struct rcu_head rcu;
38 struct fdtable *next; 38 struct fdtable *next;
39 }; 39 };
40 40
41 /* 41 /*
42 * Open file table structure 42 * Open file table structure
43 */ 43 */
44 struct files_struct { 44 struct files_struct {
45 /* 45 /*
46 * read mostly part 46 * read mostly part
47 */ 47 */
48 atomic_t count; 48 atomic_t count;
49 struct fdtable __rcu *fdt; 49 struct fdtable __rcu *fdt;
50 struct fdtable fdtab; 50 struct fdtable fdtab;
51 /* 51 /*
52 * written part on a separate cache line in SMP 52 * written part on a separate cache line in SMP
53 */ 53 */
54 spinlock_t file_lock ____cacheline_aligned_in_smp; 54 spinlock_t file_lock ____cacheline_aligned_in_smp;
55 int next_fd; 55 int next_fd;
56 struct embedded_fd_set close_on_exec_init; 56 struct embedded_fd_set close_on_exec_init;
57 struct embedded_fd_set open_fds_init; 57 struct embedded_fd_set open_fds_init;
58 struct file __rcu * fd_array[NR_OPEN_DEFAULT]; 58 struct file __rcu * fd_array[NR_OPEN_DEFAULT];
59 }; 59 };
60 60
61 #define rcu_dereference_check_fdtable(files, fdtfd) \ 61 #define rcu_dereference_check_fdtable(files, fdtfd) \
62 (rcu_dereference_check((fdtfd), \ 62 (rcu_dereference_check((fdtfd), \
63 rcu_read_lock_held() || \
64 lockdep_is_held(&(files)->file_lock) || \ 63 lockdep_is_held(&(files)->file_lock) || \
65 atomic_read(&(files)->count) == 1 || \ 64 atomic_read(&(files)->count) == 1 || \
66 rcu_my_thread_group_empty())) 65 rcu_my_thread_group_empty()))
67 66
68 #define files_fdtable(files) \ 67 #define files_fdtable(files) \
69 (rcu_dereference_check_fdtable((files), (files)->fdt)) 68 (rcu_dereference_check_fdtable((files), (files)->fdt))
70 69
71 struct file_operations; 70 struct file_operations;
72 struct vfsmount; 71 struct vfsmount;
73 struct dentry; 72 struct dentry;
74 73
75 extern int expand_files(struct files_struct *, int nr); 74 extern int expand_files(struct files_struct *, int nr);
76 extern void free_fdtable_rcu(struct rcu_head *rcu); 75 extern void free_fdtable_rcu(struct rcu_head *rcu);
77 extern void __init files_defer_init(void); 76 extern void __init files_defer_init(void);
78 77
79 static inline void free_fdtable(struct fdtable *fdt) 78 static inline void free_fdtable(struct fdtable *fdt)
80 { 79 {
81 call_rcu(&fdt->rcu, free_fdtable_rcu); 80 call_rcu(&fdt->rcu, free_fdtable_rcu);
82 } 81 }
83 82
84 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) 83 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
85 { 84 {
86 struct file * file = NULL; 85 struct file * file = NULL;
87 struct fdtable *fdt = files_fdtable(files); 86 struct fdtable *fdt = files_fdtable(files);
88 87
89 if (fd < fdt->max_fds) 88 if (fd < fdt->max_fds)
90 file = rcu_dereference_check_fdtable(files, fdt->fd[fd]); 89 file = rcu_dereference_check_fdtable(files, fdt->fd[fd]);
91 return file; 90 return file;
92 } 91 }
93 92
94 /* 93 /*
95 * Check whether the specified fd has an open file. 94 * Check whether the specified fd has an open file.
96 */ 95 */
97 #define fcheck(fd) fcheck_files(current->files, fd) 96 #define fcheck(fd) fcheck_files(current->files, fd)
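
Editorial note, not part of this diff: a minimal sketch of looking up a struct file by descriptor with fcheck(). The lookup must sit inside an RCU read-side critical section (or be covered by files->file_lock), matching the conditions rcu_dereference_check_fdtable() accepts. The helper name peek_file_flags() is hypothetical.

#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical helper: return f_flags of an open fd, or -EBADF. */
static int peek_file_flags(unsigned int fd)
{
	struct file *file;
	int flags = -EBADF;

	rcu_read_lock();
	file = fcheck(fd);		/* RCU-protected lookup in current->files */
	if (file)
		flags = file->f_flags;	/* valid only inside rcu_read_lock() */
	rcu_read_unlock();

	return flags;
}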
98 97
99 struct task_struct; 98 struct task_struct;
100 99
101 struct files_struct *get_files_struct(struct task_struct *); 100 struct files_struct *get_files_struct(struct task_struct *);
102 void put_files_struct(struct files_struct *fs); 101 void put_files_struct(struct files_struct *fs);
103 void reset_files_struct(struct files_struct *); 102 void reset_files_struct(struct files_struct *);
104 int unshare_files(struct files_struct **); 103 int unshare_files(struct files_struct **);
105 struct files_struct *dup_fd(struct files_struct *, int *); 104 struct files_struct *dup_fd(struct files_struct *, int *);
106 105
107 extern struct kmem_cache *files_cachep; 106 extern struct kmem_cache *files_cachep;
108 107
109 #endif /* __LINUX_FDTABLE_H */ 108 #endif /* __LINUX_FDTABLE_H */
110 109
include/linux/rtnetlink.h
1 #ifndef __LINUX_RTNETLINK_H 1 #ifndef __LINUX_RTNETLINK_H
2 #define __LINUX_RTNETLINK_H 2 #define __LINUX_RTNETLINK_H
3 3
4 #include <linux/types.h> 4 #include <linux/types.h>
5 #include <linux/netlink.h> 5 #include <linux/netlink.h>
6 #include <linux/if_link.h> 6 #include <linux/if_link.h>
7 #include <linux/if_addr.h> 7 #include <linux/if_addr.h>
8 #include <linux/neighbour.h> 8 #include <linux/neighbour.h>
9 9
10 /* rtnetlink families. Values up to 127 are reserved for real address 10 /* rtnetlink families. Values up to 127 are reserved for real address
11 * families, values above 128 may be used arbitrarily. 11 * families, values above 128 may be used arbitrarily.
12 */ 12 */
13 #define RTNL_FAMILY_IPMR 128 13 #define RTNL_FAMILY_IPMR 128
14 #define RTNL_FAMILY_IP6MR 129 14 #define RTNL_FAMILY_IP6MR 129
15 #define RTNL_FAMILY_MAX 129 15 #define RTNL_FAMILY_MAX 129
16 16
17 /**** 17 /****
18 * Routing/neighbour discovery messages. 18 * Routing/neighbour discovery messages.
19 ****/ 19 ****/
20 20
21 /* Types of messages */ 21 /* Types of messages */
22 22
23 enum { 23 enum {
24 RTM_BASE = 16, 24 RTM_BASE = 16,
25 #define RTM_BASE RTM_BASE 25 #define RTM_BASE RTM_BASE
26 26
27 RTM_NEWLINK = 16, 27 RTM_NEWLINK = 16,
28 #define RTM_NEWLINK RTM_NEWLINK 28 #define RTM_NEWLINK RTM_NEWLINK
29 RTM_DELLINK, 29 RTM_DELLINK,
30 #define RTM_DELLINK RTM_DELLINK 30 #define RTM_DELLINK RTM_DELLINK
31 RTM_GETLINK, 31 RTM_GETLINK,
32 #define RTM_GETLINK RTM_GETLINK 32 #define RTM_GETLINK RTM_GETLINK
33 RTM_SETLINK, 33 RTM_SETLINK,
34 #define RTM_SETLINK RTM_SETLINK 34 #define RTM_SETLINK RTM_SETLINK
35 35
36 RTM_NEWADDR = 20, 36 RTM_NEWADDR = 20,
37 #define RTM_NEWADDR RTM_NEWADDR 37 #define RTM_NEWADDR RTM_NEWADDR
38 RTM_DELADDR, 38 RTM_DELADDR,
39 #define RTM_DELADDR RTM_DELADDR 39 #define RTM_DELADDR RTM_DELADDR
40 RTM_GETADDR, 40 RTM_GETADDR,
41 #define RTM_GETADDR RTM_GETADDR 41 #define RTM_GETADDR RTM_GETADDR
42 42
43 RTM_NEWROUTE = 24, 43 RTM_NEWROUTE = 24,
44 #define RTM_NEWROUTE RTM_NEWROUTE 44 #define RTM_NEWROUTE RTM_NEWROUTE
45 RTM_DELROUTE, 45 RTM_DELROUTE,
46 #define RTM_DELROUTE RTM_DELROUTE 46 #define RTM_DELROUTE RTM_DELROUTE
47 RTM_GETROUTE, 47 RTM_GETROUTE,
48 #define RTM_GETROUTE RTM_GETROUTE 48 #define RTM_GETROUTE RTM_GETROUTE
49 49
50 RTM_NEWNEIGH = 28, 50 RTM_NEWNEIGH = 28,
51 #define RTM_NEWNEIGH RTM_NEWNEIGH 51 #define RTM_NEWNEIGH RTM_NEWNEIGH
52 RTM_DELNEIGH, 52 RTM_DELNEIGH,
53 #define RTM_DELNEIGH RTM_DELNEIGH 53 #define RTM_DELNEIGH RTM_DELNEIGH
54 RTM_GETNEIGH, 54 RTM_GETNEIGH,
55 #define RTM_GETNEIGH RTM_GETNEIGH 55 #define RTM_GETNEIGH RTM_GETNEIGH
56 56
57 RTM_NEWRULE = 32, 57 RTM_NEWRULE = 32,
58 #define RTM_NEWRULE RTM_NEWRULE 58 #define RTM_NEWRULE RTM_NEWRULE
59 RTM_DELRULE, 59 RTM_DELRULE,
60 #define RTM_DELRULE RTM_DELRULE 60 #define RTM_DELRULE RTM_DELRULE
61 RTM_GETRULE, 61 RTM_GETRULE,
62 #define RTM_GETRULE RTM_GETRULE 62 #define RTM_GETRULE RTM_GETRULE
63 63
64 RTM_NEWQDISC = 36, 64 RTM_NEWQDISC = 36,
65 #define RTM_NEWQDISC RTM_NEWQDISC 65 #define RTM_NEWQDISC RTM_NEWQDISC
66 RTM_DELQDISC, 66 RTM_DELQDISC,
67 #define RTM_DELQDISC RTM_DELQDISC 67 #define RTM_DELQDISC RTM_DELQDISC
68 RTM_GETQDISC, 68 RTM_GETQDISC,
69 #define RTM_GETQDISC RTM_GETQDISC 69 #define RTM_GETQDISC RTM_GETQDISC
70 70
71 RTM_NEWTCLASS = 40, 71 RTM_NEWTCLASS = 40,
72 #define RTM_NEWTCLASS RTM_NEWTCLASS 72 #define RTM_NEWTCLASS RTM_NEWTCLASS
73 RTM_DELTCLASS, 73 RTM_DELTCLASS,
74 #define RTM_DELTCLASS RTM_DELTCLASS 74 #define RTM_DELTCLASS RTM_DELTCLASS
75 RTM_GETTCLASS, 75 RTM_GETTCLASS,
76 #define RTM_GETTCLASS RTM_GETTCLASS 76 #define RTM_GETTCLASS RTM_GETTCLASS
77 77
78 RTM_NEWTFILTER = 44, 78 RTM_NEWTFILTER = 44,
79 #define RTM_NEWTFILTER RTM_NEWTFILTER 79 #define RTM_NEWTFILTER RTM_NEWTFILTER
80 RTM_DELTFILTER, 80 RTM_DELTFILTER,
81 #define RTM_DELTFILTER RTM_DELTFILTER 81 #define RTM_DELTFILTER RTM_DELTFILTER
82 RTM_GETTFILTER, 82 RTM_GETTFILTER,
83 #define RTM_GETTFILTER RTM_GETTFILTER 83 #define RTM_GETTFILTER RTM_GETTFILTER
84 84
85 RTM_NEWACTION = 48, 85 RTM_NEWACTION = 48,
86 #define RTM_NEWACTION RTM_NEWACTION 86 #define RTM_NEWACTION RTM_NEWACTION
87 RTM_DELACTION, 87 RTM_DELACTION,
88 #define RTM_DELACTION RTM_DELACTION 88 #define RTM_DELACTION RTM_DELACTION
89 RTM_GETACTION, 89 RTM_GETACTION,
90 #define RTM_GETACTION RTM_GETACTION 90 #define RTM_GETACTION RTM_GETACTION
91 91
92 RTM_NEWPREFIX = 52, 92 RTM_NEWPREFIX = 52,
93 #define RTM_NEWPREFIX RTM_NEWPREFIX 93 #define RTM_NEWPREFIX RTM_NEWPREFIX
94 94
95 RTM_GETMULTICAST = 58, 95 RTM_GETMULTICAST = 58,
96 #define RTM_GETMULTICAST RTM_GETMULTICAST 96 #define RTM_GETMULTICAST RTM_GETMULTICAST
97 97
98 RTM_GETANYCAST = 62, 98 RTM_GETANYCAST = 62,
99 #define RTM_GETANYCAST RTM_GETANYCAST 99 #define RTM_GETANYCAST RTM_GETANYCAST
100 100
101 RTM_NEWNEIGHTBL = 64, 101 RTM_NEWNEIGHTBL = 64,
102 #define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL 102 #define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL
103 RTM_GETNEIGHTBL = 66, 103 RTM_GETNEIGHTBL = 66,
104 #define RTM_GETNEIGHTBL RTM_GETNEIGHTBL 104 #define RTM_GETNEIGHTBL RTM_GETNEIGHTBL
105 RTM_SETNEIGHTBL, 105 RTM_SETNEIGHTBL,
106 #define RTM_SETNEIGHTBL RTM_SETNEIGHTBL 106 #define RTM_SETNEIGHTBL RTM_SETNEIGHTBL
107 107
108 RTM_NEWNDUSEROPT = 68, 108 RTM_NEWNDUSEROPT = 68,
109 #define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT 109 #define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT
110 110
111 RTM_NEWADDRLABEL = 72, 111 RTM_NEWADDRLABEL = 72,
112 #define RTM_NEWADDRLABEL RTM_NEWADDRLABEL 112 #define RTM_NEWADDRLABEL RTM_NEWADDRLABEL
113 RTM_DELADDRLABEL, 113 RTM_DELADDRLABEL,
114 #define RTM_DELADDRLABEL RTM_DELADDRLABEL 114 #define RTM_DELADDRLABEL RTM_DELADDRLABEL
115 RTM_GETADDRLABEL, 115 RTM_GETADDRLABEL,
116 #define RTM_GETADDRLABEL RTM_GETADDRLABEL 116 #define RTM_GETADDRLABEL RTM_GETADDRLABEL
117 117
118 RTM_GETDCB = 78, 118 RTM_GETDCB = 78,
119 #define RTM_GETDCB RTM_GETDCB 119 #define RTM_GETDCB RTM_GETDCB
120 RTM_SETDCB, 120 RTM_SETDCB,
121 #define RTM_SETDCB RTM_SETDCB 121 #define RTM_SETDCB RTM_SETDCB
122 122
123 __RTM_MAX, 123 __RTM_MAX,
124 #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) 124 #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1)
125 }; 125 };
126 126
127 #define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE) 127 #define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE)
128 #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) 128 #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2)
129 #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) 129 #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2)
130 130
131 /* 131 /*
132 Generic structure for encapsulation of optional route information. 132 Generic structure for encapsulation of optional route information.
133 It is reminiscent of sockaddr, but with sa_family replaced 133 It is reminiscent of sockaddr, but with sa_family replaced
134 with attribute type. 134 with attribute type.
135 */ 135 */
136 136
137 struct rtattr { 137 struct rtattr {
138 unsigned short rta_len; 138 unsigned short rta_len;
139 unsigned short rta_type; 139 unsigned short rta_type;
140 }; 140 };
141 141
142 /* Macros to handle rtattributes */ 142 /* Macros to handle rtattributes */
143 143
144 #define RTA_ALIGNTO 4 144 #define RTA_ALIGNTO 4
145 #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) 145 #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) )
146 #define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \ 146 #define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \
147 (rta)->rta_len >= sizeof(struct rtattr) && \ 147 (rta)->rta_len >= sizeof(struct rtattr) && \
148 (rta)->rta_len <= (len)) 148 (rta)->rta_len <= (len))
149 #define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ 149 #define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \
150 (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) 150 (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len)))
151 #define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len)) 151 #define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len))
152 #define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len)) 152 #define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len))
153 #define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0))) 153 #define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0)))
154 #define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0)) 154 #define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0))
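
Editorial note, not part of this diff: a minimal parsing sketch showing the canonical idiom for walking a buffer of route attributes with RTA_OK()/RTA_NEXT() and reading payloads via RTA_PAYLOAD(). The function dump_rtattrs() and its use of printk() are illustrative only.

#include <linux/kernel.h>
#include <linux/rtnetlink.h>

/* Illustrative only: print the type and length of each attribute in a buffer. */
static void dump_rtattrs(struct rtattr *rta, int len)
{
	/* RTA_OK() checks that a complete, sane header fits in the remaining
	 * length; RTA_NEXT() advances past the aligned attribute and shrinks
	 * 'len' as a side effect. */
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len))
		printk(KERN_DEBUG "rtattr type %d, payload %d bytes\n",
		       rta->rta_type, RTA_PAYLOAD(rta));
}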
155 155
156 156
157 157
158 158
159 /****************************************************************************** 159 /******************************************************************************
160 * Definitions used in routing table administration. 160 * Definitions used in routing table administration.
161 ****/ 161 ****/
162 162
163 struct rtmsg { 163 struct rtmsg {
164 unsigned char rtm_family; 164 unsigned char rtm_family;
165 unsigned char rtm_dst_len; 165 unsigned char rtm_dst_len;
166 unsigned char rtm_src_len; 166 unsigned char rtm_src_len;
167 unsigned char rtm_tos; 167 unsigned char rtm_tos;
168 168
169 unsigned char rtm_table; /* Routing table id */ 169 unsigned char rtm_table; /* Routing table id */
170 unsigned char rtm_protocol; /* Routing protocol; see below */ 170 unsigned char rtm_protocol; /* Routing protocol; see below */
171 unsigned char rtm_scope; /* See below */ 171 unsigned char rtm_scope; /* See below */
172 unsigned char rtm_type; /* See below */ 172 unsigned char rtm_type; /* See below */
173 173
174 unsigned rtm_flags; 174 unsigned rtm_flags;
175 }; 175 };
176 176
177 /* rtm_type */ 177 /* rtm_type */
178 178
179 enum { 179 enum {
180 RTN_UNSPEC, 180 RTN_UNSPEC,
181 RTN_UNICAST, /* Gateway or direct route */ 181 RTN_UNICAST, /* Gateway or direct route */
182 RTN_LOCAL, /* Accept locally */ 182 RTN_LOCAL, /* Accept locally */
183 RTN_BROADCAST, /* Accept locally as broadcast, 183 RTN_BROADCAST, /* Accept locally as broadcast,
184 send as broadcast */ 184 send as broadcast */
185 RTN_ANYCAST, /* Accept locally as broadcast, 185 RTN_ANYCAST, /* Accept locally as broadcast,
186 but send as unicast */ 186 but send as unicast */
187 RTN_MULTICAST, /* Multicast route */ 187 RTN_MULTICAST, /* Multicast route */
188 RTN_BLACKHOLE, /* Drop */ 188 RTN_BLACKHOLE, /* Drop */
189 RTN_UNREACHABLE, /* Destination is unreachable */ 189 RTN_UNREACHABLE, /* Destination is unreachable */
190 RTN_PROHIBIT, /* Administratively prohibited */ 190 RTN_PROHIBIT, /* Administratively prohibited */
191 RTN_THROW, /* Not in this table */ 191 RTN_THROW, /* Not in this table */
192 RTN_NAT, /* Translate this address */ 192 RTN_NAT, /* Translate this address */
193 RTN_XRESOLVE, /* Use external resolver */ 193 RTN_XRESOLVE, /* Use external resolver */
194 __RTN_MAX 194 __RTN_MAX
195 }; 195 };
196 196
197 #define RTN_MAX (__RTN_MAX - 1) 197 #define RTN_MAX (__RTN_MAX - 1)
198 198
199 199
200 /* rtm_protocol */ 200 /* rtm_protocol */
201 201
202 #define RTPROT_UNSPEC 0 202 #define RTPROT_UNSPEC 0
203 #define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; 203 #define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects;
204 not used by current IPv4 */ 204 not used by current IPv4 */
205 #define RTPROT_KERNEL 2 /* Route installed by kernel */ 205 #define RTPROT_KERNEL 2 /* Route installed by kernel */
206 #define RTPROT_BOOT 3 /* Route installed during boot */ 206 #define RTPROT_BOOT 3 /* Route installed during boot */
207 #define RTPROT_STATIC 4 /* Route installed by administrator */ 207 #define RTPROT_STATIC 4 /* Route installed by administrator */
208 208
209 /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; 209 /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel;
210 they are just passed from user and back as is. 210 they are just passed from user and back as is.
211 It will be used by hypothetical multiple routing daemons. 211 It will be used by hypothetical multiple routing daemons.
212 Note that protocol values should be standardized in order to 212 Note that protocol values should be standardized in order to
213 avoid conflicts. 213 avoid conflicts.
214 */ 214 */
215 215
216 #define RTPROT_GATED 8 /* Apparently, GateD */ 216 #define RTPROT_GATED 8 /* Apparently, GateD */
217 #define RTPROT_RA 9 /* RDISC/ND router advertisements */ 217 #define RTPROT_RA 9 /* RDISC/ND router advertisements */
218 #define RTPROT_MRT 10 /* Merit MRT */ 218 #define RTPROT_MRT 10 /* Merit MRT */
219 #define RTPROT_ZEBRA 11 /* Zebra */ 219 #define RTPROT_ZEBRA 11 /* Zebra */
220 #define RTPROT_BIRD 12 /* BIRD */ 220 #define RTPROT_BIRD 12 /* BIRD */
221 #define RTPROT_DNROUTED 13 /* DECnet routing daemon */ 221 #define RTPROT_DNROUTED 13 /* DECnet routing daemon */
222 #define RTPROT_XORP 14 /* XORP */ 222 #define RTPROT_XORP 14 /* XORP */
223 #define RTPROT_NTK 15 /* Netsukuku */ 223 #define RTPROT_NTK 15 /* Netsukuku */
224 #define RTPROT_DHCP 16 /* DHCP client */ 224 #define RTPROT_DHCP 16 /* DHCP client */
225 225
226 /* rtm_scope 226 /* rtm_scope
227 227
228 Really it is not a scope, but a sort of distance to the destination. 228 Really it is not a scope, but a sort of distance to the destination.
229 NOWHERE is reserved for non-existent destinations, HOST means our 229 NOWHERE is reserved for non-existent destinations, HOST means our
230 local addresses, LINK means destinations located on a directly attached 230 local addresses, LINK means destinations located on a directly attached
231 link, and UNIVERSE is everywhere in the Universe. 231 link, and UNIVERSE is everywhere in the Universe.
232 232
233 Intermediate values are also possible, e.g. interior routes 233 Intermediate values are also possible, e.g. interior routes
234 could be assigned a value between UNIVERSE and LINK. 234 could be assigned a value between UNIVERSE and LINK.
235 */ 235 */
236 236
237 enum rt_scope_t { 237 enum rt_scope_t {
238 RT_SCOPE_UNIVERSE=0, 238 RT_SCOPE_UNIVERSE=0,
239 /* User defined values */ 239 /* User defined values */
240 RT_SCOPE_SITE=200, 240 RT_SCOPE_SITE=200,
241 RT_SCOPE_LINK=253, 241 RT_SCOPE_LINK=253,
242 RT_SCOPE_HOST=254, 242 RT_SCOPE_HOST=254,
243 RT_SCOPE_NOWHERE=255 243 RT_SCOPE_NOWHERE=255
244 }; 244 };
245 245
246 /* rtm_flags */ 246 /* rtm_flags */
247 247
248 #define RTM_F_NOTIFY 0x100 /* Notify user of route change */ 248 #define RTM_F_NOTIFY 0x100 /* Notify user of route change */
249 #define RTM_F_CLONED 0x200 /* This route is cloned */ 249 #define RTM_F_CLONED 0x200 /* This route is cloned */
250 #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ 250 #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
251 #define RTM_F_PREFIX 0x800 /* Prefix addresses */ 251 #define RTM_F_PREFIX 0x800 /* Prefix addresses */
252 252
253 /* Reserved table identifiers */ 253 /* Reserved table identifiers */
254 254
255 enum rt_class_t { 255 enum rt_class_t {
256 RT_TABLE_UNSPEC=0, 256 RT_TABLE_UNSPEC=0,
257 /* User defined values */ 257 /* User defined values */
258 RT_TABLE_COMPAT=252, 258 RT_TABLE_COMPAT=252,
259 RT_TABLE_DEFAULT=253, 259 RT_TABLE_DEFAULT=253,
260 RT_TABLE_MAIN=254, 260 RT_TABLE_MAIN=254,
261 RT_TABLE_LOCAL=255, 261 RT_TABLE_LOCAL=255,
262 RT_TABLE_MAX=0xFFFFFFFF 262 RT_TABLE_MAX=0xFFFFFFFF
263 }; 263 };
264 264
265 265
266 /* Routing message attributes */ 266 /* Routing message attributes */
267 267
268 enum rtattr_type_t { 268 enum rtattr_type_t {
269 RTA_UNSPEC, 269 RTA_UNSPEC,
270 RTA_DST, 270 RTA_DST,
271 RTA_SRC, 271 RTA_SRC,
272 RTA_IIF, 272 RTA_IIF,
273 RTA_OIF, 273 RTA_OIF,
274 RTA_GATEWAY, 274 RTA_GATEWAY,
275 RTA_PRIORITY, 275 RTA_PRIORITY,
276 RTA_PREFSRC, 276 RTA_PREFSRC,
277 RTA_METRICS, 277 RTA_METRICS,
278 RTA_MULTIPATH, 278 RTA_MULTIPATH,
279 RTA_PROTOINFO, /* no longer used */ 279 RTA_PROTOINFO, /* no longer used */
280 RTA_FLOW, 280 RTA_FLOW,
281 RTA_CACHEINFO, 281 RTA_CACHEINFO,
282 RTA_SESSION, /* no longer used */ 282 RTA_SESSION, /* no longer used */
283 RTA_MP_ALGO, /* no longer used */ 283 RTA_MP_ALGO, /* no longer used */
284 RTA_TABLE, 284 RTA_TABLE,
285 RTA_MARK, 285 RTA_MARK,
286 __RTA_MAX 286 __RTA_MAX
287 }; 287 };
288 288
289 #define RTA_MAX (__RTA_MAX - 1) 289 #define RTA_MAX (__RTA_MAX - 1)
290 290
291 #define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) 291 #define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg))))
292 #define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg)) 292 #define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg))
293 293
294 /* RTM_MULTIPATH --- array of struct rtnexthop. 294 /* RTM_MULTIPATH --- array of struct rtnexthop.
295 * 295 *
296 * "struct rtnexthop" describes all necessary nexthop information, 296 * "struct rtnexthop" describes all necessary nexthop information,
297 * i.e. parameters of path to a destination via this nexthop. 297 * i.e. parameters of path to a destination via this nexthop.
298 * 298 *
299 * At the moment it is impossible to set different prefsrc, mtu, window 299 * At the moment it is impossible to set different prefsrc, mtu, window
300 * and rtt for different paths from multipath. 300 * and rtt for different paths from multipath.
301 */ 301 */
302 302
303 struct rtnexthop { 303 struct rtnexthop {
304 unsigned short rtnh_len; 304 unsigned short rtnh_len;
305 unsigned char rtnh_flags; 305 unsigned char rtnh_flags;
306 unsigned char rtnh_hops; 306 unsigned char rtnh_hops;
307 int rtnh_ifindex; 307 int rtnh_ifindex;
308 }; 308 };
309 309
310 /* rtnh_flags */ 310 /* rtnh_flags */
311 311
312 #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ 312 #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */
313 #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ 313 #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */
314 #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ 314 #define RTNH_F_ONLINK 4 /* Gateway is forced on link */
315 315
316 /* Macros to handle nexthops */ 316 /* Macros to handle nexthops */
317 317
318 #define RTNH_ALIGNTO 4 318 #define RTNH_ALIGNTO 4
319 #define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) ) 319 #define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) )
320 #define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \ 320 #define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \
321 ((int)(rtnh)->rtnh_len) <= (len)) 321 ((int)(rtnh)->rtnh_len) <= (len))
322 #define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len))) 322 #define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len)))
323 #define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len)) 323 #define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len))
324 #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) 324 #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len))
325 #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) 325 #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0)))
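
Editorial note, not part of this diff: a minimal sketch of iterating the nexthop array carried by an RTA_MULTIPATH attribute with RTNH_OK()/RTNH_NEXT(). Unlike RTA_NEXT(), RTNH_NEXT() does not shrink the remaining length, so the caller decrements it by hand. The helper name count_nexthops() is hypothetical.

#include <linux/rtnetlink.h>

/* Hypothetical helper: count the nexthops packed into an RTA_MULTIPATH payload. */
static int count_nexthops(struct rtattr *mp_attr)
{
	struct rtnexthop *rtnh = RTA_DATA(mp_attr);
	int remaining = RTA_PAYLOAD(mp_attr);
	int n = 0;

	while (RTNH_OK(rtnh, remaining)) {
		n++;
		/* Consume the aligned length of this nexthop, then advance. */
		remaining -= RTNH_ALIGN(rtnh->rtnh_len);
		rtnh = RTNH_NEXT(rtnh);
	}
	return n;
}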
326 326
327 /* RTM_CACHEINFO */ 327 /* RTM_CACHEINFO */
328 328
329 struct rta_cacheinfo { 329 struct rta_cacheinfo {
330 __u32 rta_clntref; 330 __u32 rta_clntref;
331 __u32 rta_lastuse; 331 __u32 rta_lastuse;
332 __s32 rta_expires; 332 __s32 rta_expires;
333 __u32 rta_error; 333 __u32 rta_error;
334 __u32 rta_used; 334 __u32 rta_used;
335 335
336 #define RTNETLINK_HAVE_PEERINFO 1 336 #define RTNETLINK_HAVE_PEERINFO 1
337 __u32 rta_id; 337 __u32 rta_id;
338 __u32 rta_ts; 338 __u32 rta_ts;
339 __u32 rta_tsage; 339 __u32 rta_tsage;
340 }; 340 };
341 341
342 /* RTM_METRICS --- array of struct rtattr with types of RTAX_* */ 342 /* RTM_METRICS --- array of struct rtattr with types of RTAX_* */
343 343
344 enum { 344 enum {
345 RTAX_UNSPEC, 345 RTAX_UNSPEC,
346 #define RTAX_UNSPEC RTAX_UNSPEC 346 #define RTAX_UNSPEC RTAX_UNSPEC
347 RTAX_LOCK, 347 RTAX_LOCK,
348 #define RTAX_LOCK RTAX_LOCK 348 #define RTAX_LOCK RTAX_LOCK
349 RTAX_MTU, 349 RTAX_MTU,
350 #define RTAX_MTU RTAX_MTU 350 #define RTAX_MTU RTAX_MTU
351 RTAX_WINDOW, 351 RTAX_WINDOW,
352 #define RTAX_WINDOW RTAX_WINDOW 352 #define RTAX_WINDOW RTAX_WINDOW
353 RTAX_RTT, 353 RTAX_RTT,
354 #define RTAX_RTT RTAX_RTT 354 #define RTAX_RTT RTAX_RTT
355 RTAX_RTTVAR, 355 RTAX_RTTVAR,
356 #define RTAX_RTTVAR RTAX_RTTVAR 356 #define RTAX_RTTVAR RTAX_RTTVAR
357 RTAX_SSTHRESH, 357 RTAX_SSTHRESH,
358 #define RTAX_SSTHRESH RTAX_SSTHRESH 358 #define RTAX_SSTHRESH RTAX_SSTHRESH
359 RTAX_CWND, 359 RTAX_CWND,
360 #define RTAX_CWND RTAX_CWND 360 #define RTAX_CWND RTAX_CWND
361 RTAX_ADVMSS, 361 RTAX_ADVMSS,
362 #define RTAX_ADVMSS RTAX_ADVMSS 362 #define RTAX_ADVMSS RTAX_ADVMSS
363 RTAX_REORDERING, 363 RTAX_REORDERING,
364 #define RTAX_REORDERING RTAX_REORDERING 364 #define RTAX_REORDERING RTAX_REORDERING
365 RTAX_HOPLIMIT, 365 RTAX_HOPLIMIT,
366 #define RTAX_HOPLIMIT RTAX_HOPLIMIT 366 #define RTAX_HOPLIMIT RTAX_HOPLIMIT
367 RTAX_INITCWND, 367 RTAX_INITCWND,
368 #define RTAX_INITCWND RTAX_INITCWND 368 #define RTAX_INITCWND RTAX_INITCWND
369 RTAX_FEATURES, 369 RTAX_FEATURES,
370 #define RTAX_FEATURES RTAX_FEATURES 370 #define RTAX_FEATURES RTAX_FEATURES
371 RTAX_RTO_MIN, 371 RTAX_RTO_MIN,
372 #define RTAX_RTO_MIN RTAX_RTO_MIN 372 #define RTAX_RTO_MIN RTAX_RTO_MIN
373 RTAX_INITRWND, 373 RTAX_INITRWND,
374 #define RTAX_INITRWND RTAX_INITRWND 374 #define RTAX_INITRWND RTAX_INITRWND
375 __RTAX_MAX 375 __RTAX_MAX
376 }; 376 };
377 377
378 #define RTAX_MAX (__RTAX_MAX - 1) 378 #define RTAX_MAX (__RTAX_MAX - 1)
379 379
380 #define RTAX_FEATURE_ECN 0x00000001 380 #define RTAX_FEATURE_ECN 0x00000001
381 #define RTAX_FEATURE_SACK 0x00000002 381 #define RTAX_FEATURE_SACK 0x00000002
382 #define RTAX_FEATURE_TIMESTAMP 0x00000004 382 #define RTAX_FEATURE_TIMESTAMP 0x00000004
383 #define RTAX_FEATURE_ALLFRAG 0x00000008 383 #define RTAX_FEATURE_ALLFRAG 0x00000008
384 384
385 struct rta_session { 385 struct rta_session {
386 __u8 proto; 386 __u8 proto;
387 __u8 pad1; 387 __u8 pad1;
388 __u16 pad2; 388 __u16 pad2;
389 389
390 union { 390 union {
391 struct { 391 struct {
392 __u16 sport; 392 __u16 sport;
393 __u16 dport; 393 __u16 dport;
394 } ports; 394 } ports;
395 395
396 struct { 396 struct {
397 __u8 type; 397 __u8 type;
398 __u8 code; 398 __u8 code;
399 __u16 ident; 399 __u16 ident;
400 } icmpt; 400 } icmpt;
401 401
402 __u32 spi; 402 __u32 spi;
403 } u; 403 } u;
404 }; 404 };
405 405
406 /**** 406 /****
407 * General form of address family dependent message. 407 * General form of address family dependent message.
408 ****/ 408 ****/
409 409
410 struct rtgenmsg { 410 struct rtgenmsg {
411 unsigned char rtgen_family; 411 unsigned char rtgen_family;
412 }; 412 };
413 413
414 /***************************************************************** 414 /*****************************************************************
415 * Link layer specific messages. 415 * Link layer specific messages.
416 ****/ 416 ****/
417 417
418 /* struct ifinfomsg 418 /* struct ifinfomsg
419 * passes link level specific information, not dependent 419 * passes link level specific information, not dependent
420 * on network protocol. 420 * on network protocol.
421 */ 421 */
422 422
423 struct ifinfomsg { 423 struct ifinfomsg {
424 unsigned char ifi_family; 424 unsigned char ifi_family;
425 unsigned char __ifi_pad; 425 unsigned char __ifi_pad;
426 unsigned short ifi_type; /* ARPHRD_* */ 426 unsigned short ifi_type; /* ARPHRD_* */
427 int ifi_index; /* Link index */ 427 int ifi_index; /* Link index */
428 unsigned ifi_flags; /* IFF_* flags */ 428 unsigned ifi_flags; /* IFF_* flags */
429 unsigned ifi_change; /* IFF_* change mask */ 429 unsigned ifi_change; /* IFF_* change mask */
430 }; 430 };
431 431
432 /******************************************************************** 432 /********************************************************************
433 * prefix information 433 * prefix information
434 ****/ 434 ****/
435 435
436 struct prefixmsg { 436 struct prefixmsg {
437 unsigned char prefix_family; 437 unsigned char prefix_family;
438 unsigned char prefix_pad1; 438 unsigned char prefix_pad1;
439 unsigned short prefix_pad2; 439 unsigned short prefix_pad2;
440 int prefix_ifindex; 440 int prefix_ifindex;
441 unsigned char prefix_type; 441 unsigned char prefix_type;
442 unsigned char prefix_len; 442 unsigned char prefix_len;
443 unsigned char prefix_flags; 443 unsigned char prefix_flags;
444 unsigned char prefix_pad3; 444 unsigned char prefix_pad3;
445 }; 445 };
446 446
447 enum 447 enum
448 { 448 {
449 PREFIX_UNSPEC, 449 PREFIX_UNSPEC,
450 PREFIX_ADDRESS, 450 PREFIX_ADDRESS,
451 PREFIX_CACHEINFO, 451 PREFIX_CACHEINFO,
452 __PREFIX_MAX 452 __PREFIX_MAX
453 }; 453 };
454 454
455 #define PREFIX_MAX (__PREFIX_MAX - 1) 455 #define PREFIX_MAX (__PREFIX_MAX - 1)
456 456
457 struct prefix_cacheinfo { 457 struct prefix_cacheinfo {
458 __u32 preferred_time; 458 __u32 preferred_time;
459 __u32 valid_time; 459 __u32 valid_time;
460 }; 460 };
461 461
462 462
463 /***************************************************************** 463 /*****************************************************************
464 * Traffic control messages. 464 * Traffic control messages.
465 ****/ 465 ****/
466 466
467 struct tcmsg { 467 struct tcmsg {
468 unsigned char tcm_family; 468 unsigned char tcm_family;
469 unsigned char tcm__pad1; 469 unsigned char tcm__pad1;
470 unsigned short tcm__pad2; 470 unsigned short tcm__pad2;
471 int tcm_ifindex; 471 int tcm_ifindex;
472 __u32 tcm_handle; 472 __u32 tcm_handle;
473 __u32 tcm_parent; 473 __u32 tcm_parent;
474 __u32 tcm_info; 474 __u32 tcm_info;
475 }; 475 };
476 476
477 enum { 477 enum {
478 TCA_UNSPEC, 478 TCA_UNSPEC,
479 TCA_KIND, 479 TCA_KIND,
480 TCA_OPTIONS, 480 TCA_OPTIONS,
481 TCA_STATS, 481 TCA_STATS,
482 TCA_XSTATS, 482 TCA_XSTATS,
483 TCA_RATE, 483 TCA_RATE,
484 TCA_FCNT, 484 TCA_FCNT,
485 TCA_STATS2, 485 TCA_STATS2,
486 TCA_STAB, 486 TCA_STAB,
487 __TCA_MAX 487 __TCA_MAX
488 }; 488 };
489 489
490 #define TCA_MAX (__TCA_MAX - 1) 490 #define TCA_MAX (__TCA_MAX - 1)
491 491
492 #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) 492 #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg))))
493 #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) 493 #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg))
494 494
495 /******************************************************************** 495 /********************************************************************
496 * Neighbor Discovery userland options 496 * Neighbor Discovery userland options
497 ****/ 497 ****/
498 498
499 struct nduseroptmsg { 499 struct nduseroptmsg {
500 unsigned char nduseropt_family; 500 unsigned char nduseropt_family;
501 unsigned char nduseropt_pad1; 501 unsigned char nduseropt_pad1;
502 unsigned short nduseropt_opts_len; /* Total length of options */ 502 unsigned short nduseropt_opts_len; /* Total length of options */
503 int nduseropt_ifindex; 503 int nduseropt_ifindex;
504 __u8 nduseropt_icmp_type; 504 __u8 nduseropt_icmp_type;
505 __u8 nduseropt_icmp_code; 505 __u8 nduseropt_icmp_code;
506 unsigned short nduseropt_pad2; 506 unsigned short nduseropt_pad2;
507 unsigned int nduseropt_pad3; 507 unsigned int nduseropt_pad3;
508 /* Followed by one or more ND options */ 508 /* Followed by one or more ND options */
509 }; 509 };
510 510
511 enum { 511 enum {
512 NDUSEROPT_UNSPEC, 512 NDUSEROPT_UNSPEC,
513 NDUSEROPT_SRCADDR, 513 NDUSEROPT_SRCADDR,
514 __NDUSEROPT_MAX 514 __NDUSEROPT_MAX
515 }; 515 };
516 516
517 #define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1) 517 #define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1)
518 518
519 #ifndef __KERNEL__ 519 #ifndef __KERNEL__
520 /* RTnetlink multicast groups - backwards compatibility for userspace */ 520 /* RTnetlink multicast groups - backwards compatibility for userspace */
521 #define RTMGRP_LINK 1 521 #define RTMGRP_LINK 1
522 #define RTMGRP_NOTIFY 2 522 #define RTMGRP_NOTIFY 2
523 #define RTMGRP_NEIGH 4 523 #define RTMGRP_NEIGH 4
524 #define RTMGRP_TC 8 524 #define RTMGRP_TC 8
525 525
526 #define RTMGRP_IPV4_IFADDR 0x10 526 #define RTMGRP_IPV4_IFADDR 0x10
527 #define RTMGRP_IPV4_MROUTE 0x20 527 #define RTMGRP_IPV4_MROUTE 0x20
528 #define RTMGRP_IPV4_ROUTE 0x40 528 #define RTMGRP_IPV4_ROUTE 0x40
529 #define RTMGRP_IPV4_RULE 0x80 529 #define RTMGRP_IPV4_RULE 0x80
530 530
531 #define RTMGRP_IPV6_IFADDR 0x100 531 #define RTMGRP_IPV6_IFADDR 0x100
532 #define RTMGRP_IPV6_MROUTE 0x200 532 #define RTMGRP_IPV6_MROUTE 0x200
533 #define RTMGRP_IPV6_ROUTE 0x400 533 #define RTMGRP_IPV6_ROUTE 0x400
534 #define RTMGRP_IPV6_IFINFO 0x800 534 #define RTMGRP_IPV6_IFINFO 0x800
535 535
536 #define RTMGRP_DECnet_IFADDR 0x1000 536 #define RTMGRP_DECnet_IFADDR 0x1000
537 #define RTMGRP_DECnet_ROUTE 0x4000 537 #define RTMGRP_DECnet_ROUTE 0x4000
538 538
539 #define RTMGRP_IPV6_PREFIX 0x20000 539 #define RTMGRP_IPV6_PREFIX 0x20000
540 #endif 540 #endif
541 541
542 /* RTnetlink multicast groups */ 542 /* RTnetlink multicast groups */
543 enum rtnetlink_groups { 543 enum rtnetlink_groups {
544 RTNLGRP_NONE, 544 RTNLGRP_NONE,
545 #define RTNLGRP_NONE RTNLGRP_NONE 545 #define RTNLGRP_NONE RTNLGRP_NONE
546 RTNLGRP_LINK, 546 RTNLGRP_LINK,
547 #define RTNLGRP_LINK RTNLGRP_LINK 547 #define RTNLGRP_LINK RTNLGRP_LINK
548 RTNLGRP_NOTIFY, 548 RTNLGRP_NOTIFY,
549 #define RTNLGRP_NOTIFY RTNLGRP_NOTIFY 549 #define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
550 RTNLGRP_NEIGH, 550 RTNLGRP_NEIGH,
551 #define RTNLGRP_NEIGH RTNLGRP_NEIGH 551 #define RTNLGRP_NEIGH RTNLGRP_NEIGH
552 RTNLGRP_TC, 552 RTNLGRP_TC,
553 #define RTNLGRP_TC RTNLGRP_TC 553 #define RTNLGRP_TC RTNLGRP_TC
554 RTNLGRP_IPV4_IFADDR, 554 RTNLGRP_IPV4_IFADDR,
555 #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR 555 #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
556 RTNLGRP_IPV4_MROUTE, 556 RTNLGRP_IPV4_MROUTE,
557 #define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE 557 #define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
558 RTNLGRP_IPV4_ROUTE, 558 RTNLGRP_IPV4_ROUTE,
559 #define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE 559 #define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
560 RTNLGRP_IPV4_RULE, 560 RTNLGRP_IPV4_RULE,
561 #define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE 561 #define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
562 RTNLGRP_IPV6_IFADDR, 562 RTNLGRP_IPV6_IFADDR,
563 #define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR 563 #define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
564 RTNLGRP_IPV6_MROUTE, 564 RTNLGRP_IPV6_MROUTE,
565 #define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE 565 #define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
566 RTNLGRP_IPV6_ROUTE, 566 RTNLGRP_IPV6_ROUTE,
567 #define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE 567 #define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
568 RTNLGRP_IPV6_IFINFO, 568 RTNLGRP_IPV6_IFINFO,
569 #define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO 569 #define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
570 RTNLGRP_DECnet_IFADDR, 570 RTNLGRP_DECnet_IFADDR,
571 #define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR 571 #define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
572 RTNLGRP_NOP2, 572 RTNLGRP_NOP2,
573 RTNLGRP_DECnet_ROUTE, 573 RTNLGRP_DECnet_ROUTE,
574 #define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE 574 #define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
575 RTNLGRP_DECnet_RULE, 575 RTNLGRP_DECnet_RULE,
576 #define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE 576 #define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
577 RTNLGRP_NOP4, 577 RTNLGRP_NOP4,
578 RTNLGRP_IPV6_PREFIX, 578 RTNLGRP_IPV6_PREFIX,
579 #define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX 579 #define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
580 RTNLGRP_IPV6_RULE, 580 RTNLGRP_IPV6_RULE,
581 #define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE 581 #define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE
582 RTNLGRP_ND_USEROPT, 582 RTNLGRP_ND_USEROPT,
583 #define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT 583 #define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT
584 RTNLGRP_PHONET_IFADDR, 584 RTNLGRP_PHONET_IFADDR,
585 #define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR 585 #define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR
586 RTNLGRP_PHONET_ROUTE, 586 RTNLGRP_PHONET_ROUTE,
587 #define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE 587 #define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE
588 __RTNLGRP_MAX 588 __RTNLGRP_MAX
589 }; 589 };
590 #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) 590 #define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
591 591
592 /* TC action piece */ 592 /* TC action piece */
593 struct tcamsg { 593 struct tcamsg {
594 unsigned char tca_family; 594 unsigned char tca_family;
595 unsigned char tca__pad1; 595 unsigned char tca__pad1;
596 unsigned short tca__pad2; 596 unsigned short tca__pad2;
597 }; 597 };
598 #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) 598 #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
599 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) 599 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
600 #define TCA_ACT_TAB 1 /* attr type must be >=1 */ 600 #define TCA_ACT_TAB 1 /* attr type must be >=1 */
601 #define TCAA_MAX 1 601 #define TCAA_MAX 1
602 602
603 /* End of information exported to user level */ 603 /* End of information exported to user level */
604 604
605 #ifdef __KERNEL__ 605 #ifdef __KERNEL__
606 606
607 #include <linux/mutex.h> 607 #include <linux/mutex.h>
608 #include <linux/netdevice.h> 608 #include <linux/netdevice.h>
609 609
610 static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str) 610 static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str)
611 { 611 {
612 int len = strlen(str) + 1; 612 int len = strlen(str) + 1;
613 return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len); 613 return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len);
614 } 614 }
615 615
616 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); 616 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
617 extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); 617 extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
618 extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, 618 extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
619 u32 group, struct nlmsghdr *nlh, gfp_t flags); 619 u32 group, struct nlmsghdr *nlh, gfp_t flags);
620 extern void rtnl_set_sk_err(struct net *net, u32 group, int error); 620 extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
621 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); 621 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
622 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, 622 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
623 u32 id, u32 ts, u32 tsage, long expires, 623 u32 id, u32 ts, u32 tsage, long expires,
624 u32 error); 624 u32 error);
625 625
626 extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); 626 extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
627 627
628 #define RTA_PUT(skb, attrtype, attrlen, data) \ 628 #define RTA_PUT(skb, attrtype, attrlen, data) \
629 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ 629 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \
630 goto rtattr_failure; \ 630 goto rtattr_failure; \
631 __rta_fill(skb, attrtype, attrlen, data); }) 631 __rta_fill(skb, attrtype, attrlen, data); })
632 632
633 #define RTA_APPEND(skb, attrlen, data) \ 633 #define RTA_APPEND(skb, attrlen, data) \
634 ({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \ 634 ({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \
635 goto rtattr_failure; \ 635 goto rtattr_failure; \
636 memcpy(skb_put(skb, attrlen), data, attrlen); }) 636 memcpy(skb_put(skb, attrlen), data, attrlen); })
637 637
638 #define RTA_PUT_NOHDR(skb, attrlen, data) \ 638 #define RTA_PUT_NOHDR(skb, attrlen, data) \
639 ({ RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \ 639 ({ RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \
640 memset(skb_tail_pointer(skb) - (RTA_ALIGN(attrlen) - attrlen), 0, \ 640 memset(skb_tail_pointer(skb) - (RTA_ALIGN(attrlen) - attrlen), 0, \
641 RTA_ALIGN(attrlen) - attrlen); }) 641 RTA_ALIGN(attrlen) - attrlen); })
642 642
643 #define RTA_PUT_U8(skb, attrtype, value) \ 643 #define RTA_PUT_U8(skb, attrtype, value) \
644 ({ u8 _tmp = (value); \ 644 ({ u8 _tmp = (value); \
645 RTA_PUT(skb, attrtype, sizeof(u8), &_tmp); }) 645 RTA_PUT(skb, attrtype, sizeof(u8), &_tmp); })
646 646
647 #define RTA_PUT_U16(skb, attrtype, value) \ 647 #define RTA_PUT_U16(skb, attrtype, value) \
648 ({ u16 _tmp = (value); \ 648 ({ u16 _tmp = (value); \
649 RTA_PUT(skb, attrtype, sizeof(u16), &_tmp); }) 649 RTA_PUT(skb, attrtype, sizeof(u16), &_tmp); })
650 650
651 #define RTA_PUT_U32(skb, attrtype, value) \ 651 #define RTA_PUT_U32(skb, attrtype, value) \
652 ({ u32 _tmp = (value); \ 652 ({ u32 _tmp = (value); \
653 RTA_PUT(skb, attrtype, sizeof(u32), &_tmp); }) 653 RTA_PUT(skb, attrtype, sizeof(u32), &_tmp); })
654 654
655 #define RTA_PUT_U64(skb, attrtype, value) \ 655 #define RTA_PUT_U64(skb, attrtype, value) \
656 ({ u64 _tmp = (value); \ 656 ({ u64 _tmp = (value); \
657 RTA_PUT(skb, attrtype, sizeof(u64), &_tmp); }) 657 RTA_PUT(skb, attrtype, sizeof(u64), &_tmp); })
658 658
659 #define RTA_PUT_SECS(skb, attrtype, value) \ 659 #define RTA_PUT_SECS(skb, attrtype, value) \
660 RTA_PUT_U64(skb, attrtype, (value) / HZ) 660 RTA_PUT_U64(skb, attrtype, (value) / HZ)
661 661
662 #define RTA_PUT_MSECS(skb, attrtype, value) \ 662 #define RTA_PUT_MSECS(skb, attrtype, value) \
663 RTA_PUT_U64(skb, attrtype, jiffies_to_msecs(value)) 663 RTA_PUT_U64(skb, attrtype, jiffies_to_msecs(value))
664 664
665 #define RTA_PUT_STRING(skb, attrtype, value) \ 665 #define RTA_PUT_STRING(skb, attrtype, value) \
666 RTA_PUT(skb, attrtype, strlen(value) + 1, value) 666 RTA_PUT(skb, attrtype, strlen(value) + 1, value)
667 667
668 #define RTA_PUT_FLAG(skb, attrtype) \ 668 #define RTA_PUT_FLAG(skb, attrtype) \
669 RTA_PUT(skb, attrtype, 0, NULL); 669 RTA_PUT(skb, attrtype, 0, NULL);
670 670
671 #define RTA_NEST(skb, type) \ 671 #define RTA_NEST(skb, type) \
672 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ 672 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \
673 RTA_PUT(skb, type, 0, NULL); \ 673 RTA_PUT(skb, type, 0, NULL); \
674 __start; }) 674 __start; })
675 675
676 #define RTA_NEST_END(skb, start) \ 676 #define RTA_NEST_END(skb, start) \
677 ({ (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ 677 ({ (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \
678 (skb)->len; }) 678 (skb)->len; })
679 679
680 #define RTA_NEST_COMPAT(skb, type, attrlen, data) \ 680 #define RTA_NEST_COMPAT(skb, type, attrlen, data) \
681 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ 681 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \
682 RTA_PUT(skb, type, attrlen, data); \ 682 RTA_PUT(skb, type, attrlen, data); \
683 RTA_NEST(skb, type); \ 683 RTA_NEST(skb, type); \
684 __start; }) 684 __start; })
685 685
686 #define RTA_NEST_COMPAT_END(skb, start) \ 686 #define RTA_NEST_COMPAT_END(skb, start) \
687 ({ struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \ 687 ({ struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \
688 (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ 688 (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \
689 RTA_NEST_END(skb, __nest); \ 689 RTA_NEST_END(skb, __nest); \
690 (skb)->len; }) 690 (skb)->len; })
691 691
692 #define RTA_NEST_CANCEL(skb, start) \ 692 #define RTA_NEST_CANCEL(skb, start) \
693 ({ if (start) \ 693 ({ if (start) \
694 skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ 694 skb_trim(skb, (unsigned char *) (start) - (skb)->data); \
695 -1; }) 695 -1; })
696 696
697 #define RTA_GET_U8(rta) \ 697 #define RTA_GET_U8(rta) \
698 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u8)) \ 698 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u8)) \
699 goto rtattr_failure; \ 699 goto rtattr_failure; \
700 *(u8 *) RTA_DATA(rta); }) 700 *(u8 *) RTA_DATA(rta); })
701 701
702 #define RTA_GET_U16(rta) \ 702 #define RTA_GET_U16(rta) \
703 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u16)) \ 703 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u16)) \
704 goto rtattr_failure; \ 704 goto rtattr_failure; \
705 *(u16 *) RTA_DATA(rta); }) 705 *(u16 *) RTA_DATA(rta); })
706 706
707 #define RTA_GET_U32(rta) \ 707 #define RTA_GET_U32(rta) \
708 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u32)) \ 708 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u32)) \
709 goto rtattr_failure; \ 709 goto rtattr_failure; \
710 *(u32 *) RTA_DATA(rta); }) 710 *(u32 *) RTA_DATA(rta); })
711 711
712 #define RTA_GET_U64(rta) \ 712 #define RTA_GET_U64(rta) \
713 ({ u64 _tmp; \ 713 ({ u64 _tmp; \
714 if (!rta || RTA_PAYLOAD(rta) < sizeof(u64)) \ 714 if (!rta || RTA_PAYLOAD(rta) < sizeof(u64)) \
715 goto rtattr_failure; \ 715 goto rtattr_failure; \
716 memcpy(&_tmp, RTA_DATA(rta), sizeof(_tmp)); \ 716 memcpy(&_tmp, RTA_DATA(rta), sizeof(_tmp)); \
717 _tmp; }) 717 _tmp; })
718 718
719 #define RTA_GET_FLAG(rta) (!!(rta)) 719 #define RTA_GET_FLAG(rta) (!!(rta))
720 720
721 #define RTA_GET_SECS(rta) ((unsigned long) RTA_GET_U64(rta) * HZ) 721 #define RTA_GET_SECS(rta) ((unsigned long) RTA_GET_U64(rta) * HZ)
722 #define RTA_GET_MSECS(rta) (msecs_to_jiffies((unsigned long) RTA_GET_U64(rta))) 722 #define RTA_GET_MSECS(rta) (msecs_to_jiffies((unsigned long) RTA_GET_U64(rta)))
723 723
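All of the RTA_PUT*/RTA_GET* helpers above bail out through a local rtattr_failure label, so a dump or parse routine is expected to provide one. A minimal sketch of that pattern follows; the function name and the choice of RTA_METRICS/RTAX_MTU are purely illustrative, not taken from this commit.

#include <linux/rtnetlink.h>

/* Illustrative only: shows the goto-based error handling these macros
 * expect from their caller.
 */
static int example_fill_metrics(struct sk_buff *skb, u32 mtu)
{
	struct rtattr *nest = NULL;

	nest = RTA_NEST(skb, RTA_METRICS);	/* open a nested attribute */
	RTA_PUT_U32(skb, RTAX_MTU, mtu);	/* append one u32 attribute */
	RTA_NEST_END(skb, nest);		/* patch in the final nest length */
	return 0;

rtattr_failure:					/* taken when tailroom runs out */
	RTA_NEST_CANCEL(skb, nest);		/* trims anything partially added */
	return -EMSGSIZE;
}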
724 static inline struct rtattr * 724 static inline struct rtattr *
725 __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen) 725 __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen)
726 { 726 {
727 struct rtattr *rta; 727 struct rtattr *rta;
728 int size = RTA_LENGTH(attrlen); 728 int size = RTA_LENGTH(attrlen);
729 729
730 rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); 730 rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
731 rta->rta_type = attrtype; 731 rta->rta_type = attrtype;
732 rta->rta_len = size; 732 rta->rta_len = size;
733 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); 733 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
734 return rta; 734 return rta;
735 } 735 }
736 736
737 #define __RTA_PUT(skb, attrtype, attrlen) \ 737 #define __RTA_PUT(skb, attrtype, attrlen) \
738 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ 738 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \
739 goto rtattr_failure; \ 739 goto rtattr_failure; \
740 __rta_reserve(skb, attrtype, attrlen); }) 740 __rta_reserve(skb, attrtype, attrlen); })
741 741
742 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); 742 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
743 743
744 /* RTNL is used as a global lock for all changes to network configuration */ 744 /* RTNL is used as a global lock for all changes to network configuration */
745 extern void rtnl_lock(void); 745 extern void rtnl_lock(void);
746 extern void rtnl_unlock(void); 746 extern void rtnl_unlock(void);
747 extern int rtnl_trylock(void); 747 extern int rtnl_trylock(void);
748 extern int rtnl_is_locked(void); 748 extern int rtnl_is_locked(void);
749 #ifdef CONFIG_PROVE_LOCKING 749 #ifdef CONFIG_PROVE_LOCKING
750 extern int lockdep_rtnl_is_held(void); 750 extern int lockdep_rtnl_is_held(void);
751 #endif /* #ifdef CONFIG_PROVE_LOCKING */ 751 #endif /* #ifdef CONFIG_PROVE_LOCKING */
752 752
753 /** 753 /**
754 * rcu_dereference_rtnl - rcu_dereference with debug checking 754 * rcu_dereference_rtnl - rcu_dereference with debug checking
755 * @p: The pointer to read, prior to dereferencing 755 * @p: The pointer to read, prior to dereferencing
756 * 756 *
757 * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() 757 * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
758 * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() 758 * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference()
759 */ 759 */
760 #define rcu_dereference_rtnl(p) \ 760 #define rcu_dereference_rtnl(p) \
761 rcu_dereference_check(p, rcu_read_lock_held() || \ 761 rcu_dereference_check(p, lockdep_rtnl_is_held())
762 lockdep_rtnl_is_held())
763 762
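With rcu_dereference_check() now folding rcu_read_lock_held() into its condition, passing only lockdep_rtnl_is_held() still covers both legitimate callers. A minimal sketch, assuming a hypothetical RCU-published pointer example_dev (both functions are made up for illustration):

#include <linux/rtnetlink.h>
#include <linux/netdevice.h>

static struct net_device __rcu *example_dev;	/* hypothetical RCU pointer */

static void example_reader(void)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference_rtnl(example_dev);	/* OK: rcu_read_lock() held */
	if (dev)
		pr_debug("reader sees %s\n", dev->name);
	rcu_read_unlock();
}

static void example_writer_path(void)
{
	struct net_device *dev;

	rtnl_lock();
	dev = rcu_dereference_rtnl(example_dev);	/* OK: RTNL held instead */
	rtnl_unlock();
	(void)dev;
}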
764 /** 763 /**
765 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL 764 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
766 * @p: The pointer to read, prior to dereferencing 765 * @p: The pointer to read, prior to dereferencing
767 * 766 *
768 * Return the value of the specified RCU-protected pointer, but omit 767 * Return the value of the specified RCU-protected pointer, but omit
769 * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because 768 * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because
770 * caller holds RTNL. 769 * caller holds RTNL.
771 */ 770 */
772 #define rtnl_dereference(p) \ 771 #define rtnl_dereference(p) \
773 rcu_dereference_protected(p, lockdep_rtnl_is_held()) 772 rcu_dereference_protected(p, lockdep_rtnl_is_held())
774 773
775 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) 774 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
776 { 775 {
777 return rtnl_dereference(dev->ingress_queue); 776 return rtnl_dereference(dev->ingress_queue);
778 } 777 }
779 778
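rtnl_dereference() is the update-side counterpart: the caller must hold RTNL, which also makes it the natural place to publish a replacement with rcu_assign_pointer(). The dev_ingress_queue() helper above is the in-tree example; the sketch below reuses the hypothetical example_dev pointer from the previous sketch.

static void example_update(struct net_device *new)
{
	struct net_device *old;

	ASSERT_RTNL();					/* caller must hold RTNL */
	old = rtnl_dereference(example_dev);		/* no barriers needed here */
	rcu_assign_pointer(example_dev, new);		/* publish for RCU readers */
	if (old)
		pr_debug("replaced %s\n", old->name);
}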
780 extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); 779 extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
781 780
782 extern void rtnetlink_init(void); 781 extern void rtnetlink_init(void);
783 extern void __rtnl_unlock(void); 782 extern void __rtnl_unlock(void);
784 783
785 #define ASSERT_RTNL() do { \ 784 #define ASSERT_RTNL() do { \
786 if (unlikely(!rtnl_is_locked())) { \ 785 if (unlikely(!rtnl_is_locked())) { \
787 printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \ 786 printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \
788 __FILE__, __LINE__); \ 787 __FILE__, __LINE__); \
789 dump_stack(); \ 788 dump_stack(); \
790 } \ 789 } \
791 } while(0) 790 } while(0)
792 791
793 static inline u32 rtm_get_table(struct rtattr **rta, u8 table) 792 static inline u32 rtm_get_table(struct rtattr **rta, u8 table)
794 { 793 {
795 return RTA_GET_U32(rta[RTA_TABLE-1]); 794 return RTA_GET_U32(rta[RTA_TABLE-1]);
796 rtattr_failure: 795 rtattr_failure:
797 return table; 796 return table;
798 } 797 }
799 798
800 #endif /* __KERNEL__ */ 799 #endif /* __KERNEL__ */
801 800
802 801
803 #endif /* __LINUX_RTNETLINK_H */ 802 #endif /* __LINUX_RTNETLINK_H */
804 803
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Definitions for the AF_INET socket handler. 6 * Definitions for the AF_INET socket handler.
7 * 7 *
8 * Version: @(#)sock.h 1.0.4 05/13/93 8 * Version: @(#)sock.h 1.0.4 05/13/93
9 * 9 *
10 * Authors: Ross Biro 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Corey Minyard <wf-rch!minyard@relay.EU.net> 12 * Corey Minyard <wf-rch!minyard@relay.EU.net>
13 * Florian La Roche <flla@stud.uni-sb.de> 13 * Florian La Roche <flla@stud.uni-sb.de>
14 * 14 *
15 * Fixes: 15 * Fixes:
16 * Alan Cox : Volatiles in skbuff pointers. See 16 * Alan Cox : Volatiles in skbuff pointers. See
17 * skbuff comments. May be overdone, 17 * skbuff comments. May be overdone,
18 * better to prove they can be removed 18 * better to prove they can be removed
19 * than the reverse. 19 * than the reverse.
20 * Alan Cox : Added a zapped field for tcp to note 20 * Alan Cox : Added a zapped field for tcp to note
21 * a socket is reset and must stay shut up 21 * a socket is reset and must stay shut up
22 * Alan Cox : New fields for options 22 * Alan Cox : New fields for options
23 * Pauline Middelink : identd support 23 * Pauline Middelink : identd support
24 * Alan Cox : Eliminate low level recv/recvfrom 24 * Alan Cox : Eliminate low level recv/recvfrom
25 * David S. Miller : New socket lookup architecture. 25 * David S. Miller : New socket lookup architecture.
26 * Steve Whitehouse: Default routines for sock_ops 26 * Steve Whitehouse: Default routines for sock_ops
27 * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made 27 * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made
28 * protinfo be just a void pointer, as the 28 * protinfo be just a void pointer, as the
29 * protocol specific parts were moved to 29 * protocol specific parts were moved to
30 * respective headers and ipv4/v6, etc now 30 * respective headers and ipv4/v6, etc now
31 * use private slabcaches for its socks 31 * use private slabcaches for its socks
32 * Pedro Hortas : New flags field for socket options 32 * Pedro Hortas : New flags field for socket options
33 * 33 *
34 * 34 *
35 * This program is free software; you can redistribute it and/or 35 * This program is free software; you can redistribute it and/or
36 * modify it under the terms of the GNU General Public License 36 * modify it under the terms of the GNU General Public License
37 * as published by the Free Software Foundation; either version 37 * as published by the Free Software Foundation; either version
38 * 2 of the License, or (at your option) any later version. 38 * 2 of the License, or (at your option) any later version.
39 */ 39 */
40 #ifndef _SOCK_H 40 #ifndef _SOCK_H
41 #define _SOCK_H 41 #define _SOCK_H
42 42
43 #include <linux/kernel.h> 43 #include <linux/kernel.h>
44 #include <linux/list.h> 44 #include <linux/list.h>
45 #include <linux/list_nulls.h> 45 #include <linux/list_nulls.h>
46 #include <linux/timer.h> 46 #include <linux/timer.h>
47 #include <linux/cache.h> 47 #include <linux/cache.h>
48 #include <linux/module.h> 48 #include <linux/module.h>
49 #include <linux/lockdep.h> 49 #include <linux/lockdep.h>
50 #include <linux/netdevice.h> 50 #include <linux/netdevice.h>
51 #include <linux/skbuff.h> /* struct sk_buff */ 51 #include <linux/skbuff.h> /* struct sk_buff */
52 #include <linux/mm.h> 52 #include <linux/mm.h>
53 #include <linux/security.h> 53 #include <linux/security.h>
54 #include <linux/slab.h> 54 #include <linux/slab.h>
55 #include <linux/uaccess.h> 55 #include <linux/uaccess.h>
56 56
57 #include <linux/filter.h> 57 #include <linux/filter.h>
58 #include <linux/rculist_nulls.h> 58 #include <linux/rculist_nulls.h>
59 #include <linux/poll.h> 59 #include <linux/poll.h>
60 60
61 #include <linux/atomic.h> 61 #include <linux/atomic.h>
62 #include <net/dst.h> 62 #include <net/dst.h>
63 #include <net/checksum.h> 63 #include <net/checksum.h>
64 64
65 /* 65 /*
66 * This structure really needs to be cleaned up. 66 * This structure really needs to be cleaned up.
67 * Most of it is for TCP, and not used by any of 67 * Most of it is for TCP, and not used by any of
68 * the other protocols. 68 * the other protocols.
69 */ 69 */
70 70
71 /* Define this to get the SOCK_DBG debugging facility. */ 71 /* Define this to get the SOCK_DBG debugging facility. */
72 #define SOCK_DEBUGGING 72 #define SOCK_DEBUGGING
73 #ifdef SOCK_DEBUGGING 73 #ifdef SOCK_DEBUGGING
74 #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ 74 #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
75 printk(KERN_DEBUG msg); } while (0) 75 printk(KERN_DEBUG msg); } while (0)
76 #else 76 #else
77 /* Validate arguments and do nothing */ 77 /* Validate arguments and do nothing */
78 static inline void __attribute__ ((format (printf, 2, 3))) 78 static inline void __attribute__ ((format (printf, 2, 3)))
79 SOCK_DEBUG(struct sock *sk, const char *msg, ...) 79 SOCK_DEBUG(struct sock *sk, const char *msg, ...)
80 { 80 {
81 } 81 }
82 #endif 82 #endif
83 83
84 /* This is the per-socket lock. The spinlock provides a synchronization 84 /* This is the per-socket lock. The spinlock provides a synchronization
85 * between user contexts and software interrupt processing, whereas the 85 * between user contexts and software interrupt processing, whereas the
86 * mini-semaphore synchronizes multiple users amongst themselves. 86 * mini-semaphore synchronizes multiple users amongst themselves.
87 */ 87 */
88 typedef struct { 88 typedef struct {
89 spinlock_t slock; 89 spinlock_t slock;
90 int owned; 90 int owned;
91 wait_queue_head_t wq; 91 wait_queue_head_t wq;
92 /* 92 /*
93 * We express the mutex-alike socket_lock semantics 93 * We express the mutex-alike socket_lock semantics
94 * to the lock validator by explicitly managing 94 * to the lock validator by explicitly managing
95 * the slock as a lock variant (in addition to 95 * the slock as a lock variant (in addition to
96 * the slock itself): 96 * the slock itself):
97 */ 97 */
98 #ifdef CONFIG_DEBUG_LOCK_ALLOC 98 #ifdef CONFIG_DEBUG_LOCK_ALLOC
99 struct lockdep_map dep_map; 99 struct lockdep_map dep_map;
100 #endif 100 #endif
101 } socket_lock_t; 101 } socket_lock_t;
102 102
103 struct sock; 103 struct sock;
104 struct proto; 104 struct proto;
105 struct net; 105 struct net;
106 106
107 /** 107 /**
108 * struct sock_common - minimal network layer representation of sockets 108 * struct sock_common - minimal network layer representation of sockets
109 * @skc_daddr: Foreign IPv4 addr 109 * @skc_daddr: Foreign IPv4 addr
110 * @skc_rcv_saddr: Bound local IPv4 addr 110 * @skc_rcv_saddr: Bound local IPv4 addr
111 * @skc_hash: hash value used with various protocol lookup tables 111 * @skc_hash: hash value used with various protocol lookup tables
112 * @skc_u16hashes: two u16 hash values used by UDP lookup tables 112 * @skc_u16hashes: two u16 hash values used by UDP lookup tables
113 * @skc_family: network address family 113 * @skc_family: network address family
114 * @skc_state: Connection state 114 * @skc_state: Connection state
115 * @skc_reuse: %SO_REUSEADDR setting 115 * @skc_reuse: %SO_REUSEADDR setting
116 * @skc_bound_dev_if: bound device index if != 0 116 * @skc_bound_dev_if: bound device index if != 0
117 * @skc_bind_node: bind hash linkage for various protocol lookup tables 117 * @skc_bind_node: bind hash linkage for various protocol lookup tables
118 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol 118 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
119 * @skc_prot: protocol handlers inside a network family 119 * @skc_prot: protocol handlers inside a network family
120 * @skc_net: reference to the network namespace of this socket 120 * @skc_net: reference to the network namespace of this socket
121 * @skc_node: main hash linkage for various protocol lookup tables 121 * @skc_node: main hash linkage for various protocol lookup tables
122 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol 122 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
123 * @skc_tx_queue_mapping: tx queue number for this connection 123 * @skc_tx_queue_mapping: tx queue number for this connection
124 * @skc_refcnt: reference count 124 * @skc_refcnt: reference count
125 * 125 *
126 * This is the minimal network layer representation of sockets, the header 126 * This is the minimal network layer representation of sockets, the header
127 * for struct sock and struct inet_timewait_sock. 127 * for struct sock and struct inet_timewait_sock.
128 */ 128 */
129 struct sock_common { 129 struct sock_common {
130 /* skc_daddr and skc_rcv_saddr must be grouped : 130 /* skc_daddr and skc_rcv_saddr must be grouped :
131 * cf INET_MATCH() and INET_TW_MATCH() 131 * cf INET_MATCH() and INET_TW_MATCH()
132 */ 132 */
133 __be32 skc_daddr; 133 __be32 skc_daddr;
134 __be32 skc_rcv_saddr; 134 __be32 skc_rcv_saddr;
135 135
136 union { 136 union {
137 unsigned int skc_hash; 137 unsigned int skc_hash;
138 __u16 skc_u16hashes[2]; 138 __u16 skc_u16hashes[2];
139 }; 139 };
140 unsigned short skc_family; 140 unsigned short skc_family;
141 volatile unsigned char skc_state; 141 volatile unsigned char skc_state;
142 unsigned char skc_reuse; 142 unsigned char skc_reuse;
143 int skc_bound_dev_if; 143 int skc_bound_dev_if;
144 union { 144 union {
145 struct hlist_node skc_bind_node; 145 struct hlist_node skc_bind_node;
146 struct hlist_nulls_node skc_portaddr_node; 146 struct hlist_nulls_node skc_portaddr_node;
147 }; 147 };
148 struct proto *skc_prot; 148 struct proto *skc_prot;
149 #ifdef CONFIG_NET_NS 149 #ifdef CONFIG_NET_NS
150 struct net *skc_net; 150 struct net *skc_net;
151 #endif 151 #endif
152 /* 152 /*
153 * fields between dontcopy_begin/dontcopy_end 153 * fields between dontcopy_begin/dontcopy_end
154 * are not copied in sock_copy() 154 * are not copied in sock_copy()
155 */ 155 */
156 /* private: */ 156 /* private: */
157 int skc_dontcopy_begin[0]; 157 int skc_dontcopy_begin[0];
158 /* public: */ 158 /* public: */
159 union { 159 union {
160 struct hlist_node skc_node; 160 struct hlist_node skc_node;
161 struct hlist_nulls_node skc_nulls_node; 161 struct hlist_nulls_node skc_nulls_node;
162 }; 162 };
163 int skc_tx_queue_mapping; 163 int skc_tx_queue_mapping;
164 atomic_t skc_refcnt; 164 atomic_t skc_refcnt;
165 /* private: */ 165 /* private: */
166 int skc_dontcopy_end[0]; 166 int skc_dontcopy_end[0];
167 /* public: */ 167 /* public: */
168 }; 168 };
169 169
170 /** 170 /**
171 * struct sock - network layer representation of sockets 171 * struct sock - network layer representation of sockets
172 * @__sk_common: shared layout with inet_timewait_sock 172 * @__sk_common: shared layout with inet_timewait_sock
173 * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN 173 * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
174 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings 174 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
175 * @sk_lock: synchronizer 175 * @sk_lock: synchronizer
176 * @sk_rcvbuf: size of receive buffer in bytes 176 * @sk_rcvbuf: size of receive buffer in bytes
177 * @sk_wq: sock wait queue and async head 177 * @sk_wq: sock wait queue and async head
178 * @sk_dst_cache: destination cache 178 * @sk_dst_cache: destination cache
179 * @sk_dst_lock: destination cache lock 179 * @sk_dst_lock: destination cache lock
180 * @sk_policy: flow policy 180 * @sk_policy: flow policy
181 * @sk_receive_queue: incoming packets 181 * @sk_receive_queue: incoming packets
182 * @sk_wmem_alloc: transmit queue bytes committed 182 * @sk_wmem_alloc: transmit queue bytes committed
183 * @sk_write_queue: Packet sending queue 183 * @sk_write_queue: Packet sending queue
184 * @sk_async_wait_queue: DMA copied packets 184 * @sk_async_wait_queue: DMA copied packets
185 * @sk_omem_alloc: "o" is "option" or "other" 185 * @sk_omem_alloc: "o" is "option" or "other"
186 * @sk_wmem_queued: persistent queue size 186 * @sk_wmem_queued: persistent queue size
187 * @sk_forward_alloc: space allocated forward 187 * @sk_forward_alloc: space allocated forward
188 * @sk_allocation: allocation mode 188 * @sk_allocation: allocation mode
189 * @sk_sndbuf: size of send buffer in bytes 189 * @sk_sndbuf: size of send buffer in bytes
190 * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, 190 * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
191 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings 191 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
192 * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets 192 * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
193 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) 193 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
194 * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) 194 * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
195 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) 195 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
196 * @sk_gso_max_size: Maximum GSO segment size to build 196 * @sk_gso_max_size: Maximum GSO segment size to build
197 * @sk_lingertime: %SO_LINGER l_linger setting 197 * @sk_lingertime: %SO_LINGER l_linger setting
198 * @sk_backlog: always used with the per-socket spinlock held 198 * @sk_backlog: always used with the per-socket spinlock held
199 * @sk_callback_lock: used with the callbacks in the end of this struct 199 * @sk_callback_lock: used with the callbacks in the end of this struct
200 * @sk_error_queue: rarely used 200 * @sk_error_queue: rarely used
201 * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, 201 * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
202 * IPV6_ADDRFORM for instance) 202 * IPV6_ADDRFORM for instance)
203 * @sk_err: last error 203 * @sk_err: last error
204 * @sk_err_soft: errors that don't cause failure but are the cause of a 204 * @sk_err_soft: errors that don't cause failure but are the cause of a
205 * persistent failure not just 'timed out' 205 * persistent failure not just 'timed out'
206 * @sk_drops: raw/udp drops counter 206 * @sk_drops: raw/udp drops counter
207 * @sk_ack_backlog: current listen backlog 207 * @sk_ack_backlog: current listen backlog
208 * @sk_max_ack_backlog: listen backlog set in listen() 208 * @sk_max_ack_backlog: listen backlog set in listen()
209 * @sk_priority: %SO_PRIORITY setting 209 * @sk_priority: %SO_PRIORITY setting
210 * @sk_type: socket type (%SOCK_STREAM, etc) 210 * @sk_type: socket type (%SOCK_STREAM, etc)
211 * @sk_protocol: which protocol this socket belongs in this network family 211 * @sk_protocol: which protocol this socket belongs in this network family
212 * @sk_peer_pid: &struct pid for this socket's peer 212 * @sk_peer_pid: &struct pid for this socket's peer
213 * @sk_peer_cred: %SO_PEERCRED setting 213 * @sk_peer_cred: %SO_PEERCRED setting
214 * @sk_rcvlowat: %SO_RCVLOWAT setting 214 * @sk_rcvlowat: %SO_RCVLOWAT setting
215 * @sk_rcvtimeo: %SO_RCVTIMEO setting 215 * @sk_rcvtimeo: %SO_RCVTIMEO setting
216 * @sk_sndtimeo: %SO_SNDTIMEO setting 216 * @sk_sndtimeo: %SO_SNDTIMEO setting
217 * @sk_rxhash: flow hash received from netif layer 217 * @sk_rxhash: flow hash received from netif layer
218 * @sk_filter: socket filtering instructions 218 * @sk_filter: socket filtering instructions
219 * @sk_protinfo: private area, net family specific, when not using slab 219 * @sk_protinfo: private area, net family specific, when not using slab
220 * @sk_timer: sock cleanup timer 220 * @sk_timer: sock cleanup timer
221 * @sk_stamp: time stamp of last packet received 221 * @sk_stamp: time stamp of last packet received
222 * @sk_socket: Identd and reporting IO signals 222 * @sk_socket: Identd and reporting IO signals
223 * @sk_user_data: RPC layer private data 223 * @sk_user_data: RPC layer private data
224 * @sk_sndmsg_page: cached page for sendmsg 224 * @sk_sndmsg_page: cached page for sendmsg
225 * @sk_sndmsg_off: cached offset for sendmsg 225 * @sk_sndmsg_off: cached offset for sendmsg
226 * @sk_send_head: front of stuff to transmit 226 * @sk_send_head: front of stuff to transmit
227 * @sk_security: used by security modules 227 * @sk_security: used by security modules
228 * @sk_mark: generic packet mark 228 * @sk_mark: generic packet mark
229 * @sk_classid: this socket's cgroup classid 229 * @sk_classid: this socket's cgroup classid
230 * @sk_write_pending: a write to stream socket waits to start 230 * @sk_write_pending: a write to stream socket waits to start
231 * @sk_state_change: callback to indicate change in the state of the sock 231 * @sk_state_change: callback to indicate change in the state of the sock
232 * @sk_data_ready: callback to indicate there is data to be processed 232 * @sk_data_ready: callback to indicate there is data to be processed
233 * @sk_write_space: callback to indicate there is bf sending space available 233 * @sk_write_space: callback to indicate there is bf sending space available
234 * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) 234 * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
235 * @sk_backlog_rcv: callback to process the backlog 235 * @sk_backlog_rcv: callback to process the backlog
236 * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 236 * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
237 */ 237 */
238 struct sock { 238 struct sock {
239 /* 239 /*
240 * Now struct inet_timewait_sock also uses sock_common, so please just 240 * Now struct inet_timewait_sock also uses sock_common, so please just
241 * don't add nothing before this first member (__sk_common) --acme 241 * don't add nothing before this first member (__sk_common) --acme
242 */ 242 */
243 struct sock_common __sk_common; 243 struct sock_common __sk_common;
244 #define sk_node __sk_common.skc_node 244 #define sk_node __sk_common.skc_node
245 #define sk_nulls_node __sk_common.skc_nulls_node 245 #define sk_nulls_node __sk_common.skc_nulls_node
246 #define sk_refcnt __sk_common.skc_refcnt 246 #define sk_refcnt __sk_common.skc_refcnt
247 #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping 247 #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
248 248
249 #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin 249 #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
250 #define sk_dontcopy_end __sk_common.skc_dontcopy_end 250 #define sk_dontcopy_end __sk_common.skc_dontcopy_end
251 #define sk_hash __sk_common.skc_hash 251 #define sk_hash __sk_common.skc_hash
252 #define sk_family __sk_common.skc_family 252 #define sk_family __sk_common.skc_family
253 #define sk_state __sk_common.skc_state 253 #define sk_state __sk_common.skc_state
254 #define sk_reuse __sk_common.skc_reuse 254 #define sk_reuse __sk_common.skc_reuse
255 #define sk_bound_dev_if __sk_common.skc_bound_dev_if 255 #define sk_bound_dev_if __sk_common.skc_bound_dev_if
256 #define sk_bind_node __sk_common.skc_bind_node 256 #define sk_bind_node __sk_common.skc_bind_node
257 #define sk_prot __sk_common.skc_prot 257 #define sk_prot __sk_common.skc_prot
258 #define sk_net __sk_common.skc_net 258 #define sk_net __sk_common.skc_net
259 socket_lock_t sk_lock; 259 socket_lock_t sk_lock;
260 struct sk_buff_head sk_receive_queue; 260 struct sk_buff_head sk_receive_queue;
261 /* 261 /*
262 * The backlog queue is special, it is always used with 262 * The backlog queue is special, it is always used with
263 * the per-socket spinlock held and requires low latency 263 * the per-socket spinlock held and requires low latency
264 * access. Therefore we special case it's implementation. 264 * access. Therefore we special case it's implementation.
265 * Note : rmem_alloc is in this structure to fill a hole 265 * Note : rmem_alloc is in this structure to fill a hole
266 * on 64bit arches, not because its logically part of 266 * on 64bit arches, not because its logically part of
267 * backlog. 267 * backlog.
268 */ 268 */
269 struct { 269 struct {
270 atomic_t rmem_alloc; 270 atomic_t rmem_alloc;
271 int len; 271 int len;
272 struct sk_buff *head; 272 struct sk_buff *head;
273 struct sk_buff *tail; 273 struct sk_buff *tail;
274 } sk_backlog; 274 } sk_backlog;
275 #define sk_rmem_alloc sk_backlog.rmem_alloc 275 #define sk_rmem_alloc sk_backlog.rmem_alloc
276 int sk_forward_alloc; 276 int sk_forward_alloc;
277 #ifdef CONFIG_RPS 277 #ifdef CONFIG_RPS
278 __u32 sk_rxhash; 278 __u32 sk_rxhash;
279 #endif 279 #endif
280 atomic_t sk_drops; 280 atomic_t sk_drops;
281 int sk_rcvbuf; 281 int sk_rcvbuf;
282 282
283 struct sk_filter __rcu *sk_filter; 283 struct sk_filter __rcu *sk_filter;
284 struct socket_wq __rcu *sk_wq; 284 struct socket_wq __rcu *sk_wq;
285 285
286 #ifdef CONFIG_NET_DMA 286 #ifdef CONFIG_NET_DMA
287 struct sk_buff_head sk_async_wait_queue; 287 struct sk_buff_head sk_async_wait_queue;
288 #endif 288 #endif
289 289
290 #ifdef CONFIG_XFRM 290 #ifdef CONFIG_XFRM
291 struct xfrm_policy *sk_policy[2]; 291 struct xfrm_policy *sk_policy[2];
292 #endif 292 #endif
293 unsigned long sk_flags; 293 unsigned long sk_flags;
294 struct dst_entry *sk_dst_cache; 294 struct dst_entry *sk_dst_cache;
295 spinlock_t sk_dst_lock; 295 spinlock_t sk_dst_lock;
296 atomic_t sk_wmem_alloc; 296 atomic_t sk_wmem_alloc;
297 atomic_t sk_omem_alloc; 297 atomic_t sk_omem_alloc;
298 int sk_sndbuf; 298 int sk_sndbuf;
299 struct sk_buff_head sk_write_queue; 299 struct sk_buff_head sk_write_queue;
300 kmemcheck_bitfield_begin(flags); 300 kmemcheck_bitfield_begin(flags);
301 unsigned int sk_shutdown : 2, 301 unsigned int sk_shutdown : 2,
302 sk_no_check : 2, 302 sk_no_check : 2,
303 sk_userlocks : 4, 303 sk_userlocks : 4,
304 sk_protocol : 8, 304 sk_protocol : 8,
305 sk_type : 16; 305 sk_type : 16;
306 kmemcheck_bitfield_end(flags); 306 kmemcheck_bitfield_end(flags);
307 int sk_wmem_queued; 307 int sk_wmem_queued;
308 gfp_t sk_allocation; 308 gfp_t sk_allocation;
309 int sk_route_caps; 309 int sk_route_caps;
310 int sk_route_nocaps; 310 int sk_route_nocaps;
311 int sk_gso_type; 311 int sk_gso_type;
312 unsigned int sk_gso_max_size; 312 unsigned int sk_gso_max_size;
313 int sk_rcvlowat; 313 int sk_rcvlowat;
314 unsigned long sk_lingertime; 314 unsigned long sk_lingertime;
315 struct sk_buff_head sk_error_queue; 315 struct sk_buff_head sk_error_queue;
316 struct proto *sk_prot_creator; 316 struct proto *sk_prot_creator;
317 rwlock_t sk_callback_lock; 317 rwlock_t sk_callback_lock;
318 int sk_err, 318 int sk_err,
319 sk_err_soft; 319 sk_err_soft;
320 unsigned short sk_ack_backlog; 320 unsigned short sk_ack_backlog;
321 unsigned short sk_max_ack_backlog; 321 unsigned short sk_max_ack_backlog;
322 __u32 sk_priority; 322 __u32 sk_priority;
323 struct pid *sk_peer_pid; 323 struct pid *sk_peer_pid;
324 const struct cred *sk_peer_cred; 324 const struct cred *sk_peer_cred;
325 long sk_rcvtimeo; 325 long sk_rcvtimeo;
326 long sk_sndtimeo; 326 long sk_sndtimeo;
327 void *sk_protinfo; 327 void *sk_protinfo;
328 struct timer_list sk_timer; 328 struct timer_list sk_timer;
329 ktime_t sk_stamp; 329 ktime_t sk_stamp;
330 struct socket *sk_socket; 330 struct socket *sk_socket;
331 void *sk_user_data; 331 void *sk_user_data;
332 struct page *sk_sndmsg_page; 332 struct page *sk_sndmsg_page;
333 struct sk_buff *sk_send_head; 333 struct sk_buff *sk_send_head;
334 __u32 sk_sndmsg_off; 334 __u32 sk_sndmsg_off;
335 int sk_write_pending; 335 int sk_write_pending;
336 #ifdef CONFIG_SECURITY 336 #ifdef CONFIG_SECURITY
337 void *sk_security; 337 void *sk_security;
338 #endif 338 #endif
339 __u32 sk_mark; 339 __u32 sk_mark;
340 u32 sk_classid; 340 u32 sk_classid;
341 void (*sk_state_change)(struct sock *sk); 341 void (*sk_state_change)(struct sock *sk);
342 void (*sk_data_ready)(struct sock *sk, int bytes); 342 void (*sk_data_ready)(struct sock *sk, int bytes);
343 void (*sk_write_space)(struct sock *sk); 343 void (*sk_write_space)(struct sock *sk);
344 void (*sk_error_report)(struct sock *sk); 344 void (*sk_error_report)(struct sock *sk);
345 int (*sk_backlog_rcv)(struct sock *sk, 345 int (*sk_backlog_rcv)(struct sock *sk,
346 struct sk_buff *skb); 346 struct sk_buff *skb);
347 void (*sk_destruct)(struct sock *sk); 347 void (*sk_destruct)(struct sock *sk);
348 }; 348 };
349 349
350 /* 350 /*
351 * Hashed lists helper routines 351 * Hashed lists helper routines
352 */ 352 */
353 static inline struct sock *sk_entry(const struct hlist_node *node) 353 static inline struct sock *sk_entry(const struct hlist_node *node)
354 { 354 {
355 return hlist_entry(node, struct sock, sk_node); 355 return hlist_entry(node, struct sock, sk_node);
356 } 356 }
357 357
358 static inline struct sock *__sk_head(const struct hlist_head *head) 358 static inline struct sock *__sk_head(const struct hlist_head *head)
359 { 359 {
360 return hlist_entry(head->first, struct sock, sk_node); 360 return hlist_entry(head->first, struct sock, sk_node);
361 } 361 }
362 362
363 static inline struct sock *sk_head(const struct hlist_head *head) 363 static inline struct sock *sk_head(const struct hlist_head *head)
364 { 364 {
365 return hlist_empty(head) ? NULL : __sk_head(head); 365 return hlist_empty(head) ? NULL : __sk_head(head);
366 } 366 }
367 367
368 static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) 368 static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
369 { 369 {
370 return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); 370 return hlist_nulls_entry(head->first, struct sock, sk_nulls_node);
371 } 371 }
372 372
373 static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) 373 static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
374 { 374 {
375 return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); 375 return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head);
376 } 376 }
377 377
378 static inline struct sock *sk_next(const struct sock *sk) 378 static inline struct sock *sk_next(const struct sock *sk)
379 { 379 {
380 return sk->sk_node.next ? 380 return sk->sk_node.next ?
381 hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; 381 hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
382 } 382 }
383 383
384 static inline struct sock *sk_nulls_next(const struct sock *sk) 384 static inline struct sock *sk_nulls_next(const struct sock *sk)
385 { 385 {
386 return (!is_a_nulls(sk->sk_nulls_node.next)) ? 386 return (!is_a_nulls(sk->sk_nulls_node.next)) ?
387 hlist_nulls_entry(sk->sk_nulls_node.next, 387 hlist_nulls_entry(sk->sk_nulls_node.next,
388 struct sock, sk_nulls_node) : 388 struct sock, sk_nulls_node) :
389 NULL; 389 NULL;
390 } 390 }
391 391
392 static inline int sk_unhashed(const struct sock *sk) 392 static inline int sk_unhashed(const struct sock *sk)
393 { 393 {
394 return hlist_unhashed(&sk->sk_node); 394 return hlist_unhashed(&sk->sk_node);
395 } 395 }
396 396
397 static inline int sk_hashed(const struct sock *sk) 397 static inline int sk_hashed(const struct sock *sk)
398 { 398 {
399 return !sk_unhashed(sk); 399 return !sk_unhashed(sk);
400 } 400 }
401 401
402 static __inline__ void sk_node_init(struct hlist_node *node) 402 static __inline__ void sk_node_init(struct hlist_node *node)
403 { 403 {
404 node->pprev = NULL; 404 node->pprev = NULL;
405 } 405 }
406 406
407 static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node) 407 static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node)
408 { 408 {
409 node->pprev = NULL; 409 node->pprev = NULL;
410 } 410 }
411 411
412 static __inline__ void __sk_del_node(struct sock *sk) 412 static __inline__ void __sk_del_node(struct sock *sk)
413 { 413 {
414 __hlist_del(&sk->sk_node); 414 __hlist_del(&sk->sk_node);
415 } 415 }
416 416
417 /* NB: equivalent to hlist_del_init_rcu */ 417 /* NB: equivalent to hlist_del_init_rcu */
418 static __inline__ int __sk_del_node_init(struct sock *sk) 418 static __inline__ int __sk_del_node_init(struct sock *sk)
419 { 419 {
420 if (sk_hashed(sk)) { 420 if (sk_hashed(sk)) {
421 __sk_del_node(sk); 421 __sk_del_node(sk);
422 sk_node_init(&sk->sk_node); 422 sk_node_init(&sk->sk_node);
423 return 1; 423 return 1;
424 } 424 }
425 return 0; 425 return 0;
426 } 426 }
427 427
428 /* Grab socket reference count. This operation is valid only 428 /* Grab socket reference count. This operation is valid only
429 when sk is ALREADY grabbed f.e. it is found in hash table 429 when sk is ALREADY grabbed f.e. it is found in hash table
430 or a list and the lookup is made under lock preventing hash table 430 or a list and the lookup is made under lock preventing hash table
431 modifications. 431 modifications.
432 */ 432 */
433 433
434 static inline void sock_hold(struct sock *sk) 434 static inline void sock_hold(struct sock *sk)
435 { 435 {
436 atomic_inc(&sk->sk_refcnt); 436 atomic_inc(&sk->sk_refcnt);
437 } 437 }
438 438
439 /* Ungrab socket in the context, which assumes that socket refcnt 439 /* Ungrab socket in the context, which assumes that socket refcnt
440 cannot hit zero, f.e. it is true in context of any socketcall. 440 cannot hit zero, f.e. it is true in context of any socketcall.
441 */ 441 */
442 static inline void __sock_put(struct sock *sk) 442 static inline void __sock_put(struct sock *sk)
443 { 443 {
444 atomic_dec(&sk->sk_refcnt); 444 atomic_dec(&sk->sk_refcnt);
445 } 445 }
446 446
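Per the comment above, sock_hold() is only legal while something else already pins the socket, e.g. a hash-chain lock during lookup. A hedged sketch of that rule (the function and chain_lock are hypothetical; the reference taken here is later dropped with sock_put(), declared further down in this header):

static struct sock *example_grab(struct sock *sk, spinlock_t *chain_lock)
{
	spin_lock(chain_lock);		/* the hash chain cannot change under us */
	if (sk_hashed(sk))
		sock_hold(sk);		/* safe: sk is still reachable via the chain */
	else
		sk = NULL;
	spin_unlock(chain_lock);
	return sk;			/* caller releases with sock_put() */
}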
447 static __inline__ int sk_del_node_init(struct sock *sk) 447 static __inline__ int sk_del_node_init(struct sock *sk)
448 { 448 {
449 int rc = __sk_del_node_init(sk); 449 int rc = __sk_del_node_init(sk);
450 450
451 if (rc) { 451 if (rc) {
452 /* paranoid for a while -acme */ 452 /* paranoid for a while -acme */
453 WARN_ON(atomic_read(&sk->sk_refcnt) == 1); 453 WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
454 __sock_put(sk); 454 __sock_put(sk);
455 } 455 }
456 return rc; 456 return rc;
457 } 457 }
458 #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) 458 #define sk_del_node_init_rcu(sk) sk_del_node_init(sk)
459 459
460 static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk) 460 static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk)
461 { 461 {
462 if (sk_hashed(sk)) { 462 if (sk_hashed(sk)) {
463 hlist_nulls_del_init_rcu(&sk->sk_nulls_node); 463 hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
464 return 1; 464 return 1;
465 } 465 }
466 return 0; 466 return 0;
467 } 467 }
468 468
469 static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk) 469 static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk)
470 { 470 {
471 int rc = __sk_nulls_del_node_init_rcu(sk); 471 int rc = __sk_nulls_del_node_init_rcu(sk);
472 472
473 if (rc) { 473 if (rc) {
474 /* paranoid for a while -acme */ 474 /* paranoid for a while -acme */
475 WARN_ON(atomic_read(&sk->sk_refcnt) == 1); 475 WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
476 __sock_put(sk); 476 __sock_put(sk);
477 } 477 }
478 return rc; 478 return rc;
479 } 479 }
480 480
481 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) 481 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
482 { 482 {
483 hlist_add_head(&sk->sk_node, list); 483 hlist_add_head(&sk->sk_node, list);
484 } 484 }
485 485
486 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) 486 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
487 { 487 {
488 sock_hold(sk); 488 sock_hold(sk);
489 __sk_add_node(sk, list); 489 __sk_add_node(sk, list);
490 } 490 }
491 491
492 static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) 492 static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
493 { 493 {
494 sock_hold(sk); 494 sock_hold(sk);
495 hlist_add_head_rcu(&sk->sk_node, list); 495 hlist_add_head_rcu(&sk->sk_node, list);
496 } 496 }
497 497
498 static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 498 static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
499 { 499 {
500 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); 500 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
501 } 501 }
502 502
503 static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 503 static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
504 { 504 {
505 sock_hold(sk); 505 sock_hold(sk);
506 __sk_nulls_add_node_rcu(sk, list); 506 __sk_nulls_add_node_rcu(sk, list);
507 } 507 }
508 508
509 static __inline__ void __sk_del_bind_node(struct sock *sk) 509 static __inline__ void __sk_del_bind_node(struct sock *sk)
510 { 510 {
511 __hlist_del(&sk->sk_bind_node); 511 __hlist_del(&sk->sk_bind_node);
512 } 512 }
513 513
514 static __inline__ void sk_add_bind_node(struct sock *sk, 514 static __inline__ void sk_add_bind_node(struct sock *sk,
515 struct hlist_head *list) 515 struct hlist_head *list)
516 { 516 {
517 hlist_add_head(&sk->sk_bind_node, list); 517 hlist_add_head(&sk->sk_bind_node, list);
518 } 518 }
519 519
520 #define sk_for_each(__sk, node, list) \ 520 #define sk_for_each(__sk, node, list) \
521 hlist_for_each_entry(__sk, node, list, sk_node) 521 hlist_for_each_entry(__sk, node, list, sk_node)
522 #define sk_for_each_rcu(__sk, node, list) \ 522 #define sk_for_each_rcu(__sk, node, list) \
523 hlist_for_each_entry_rcu(__sk, node, list, sk_node) 523 hlist_for_each_entry_rcu(__sk, node, list, sk_node)
524 #define sk_nulls_for_each(__sk, node, list) \ 524 #define sk_nulls_for_each(__sk, node, list) \
525 hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) 525 hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
526 #define sk_nulls_for_each_rcu(__sk, node, list) \ 526 #define sk_nulls_for_each_rcu(__sk, node, list) \
527 hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) 527 hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
528 #define sk_for_each_from(__sk, node) \ 528 #define sk_for_each_from(__sk, node) \
529 if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ 529 if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
530 hlist_for_each_entry_from(__sk, node, sk_node) 530 hlist_for_each_entry_from(__sk, node, sk_node)
531 #define sk_nulls_for_each_from(__sk, node) \ 531 #define sk_nulls_for_each_from(__sk, node) \
532 if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ 532 if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
533 hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) 533 hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
534 #define sk_for_each_safe(__sk, node, tmp, list) \ 534 #define sk_for_each_safe(__sk, node, tmp, list) \
535 hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node) 535 hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
536 #define sk_for_each_bound(__sk, node, list) \ 536 #define sk_for_each_bound(__sk, node, list) \
537 hlist_for_each_entry(__sk, node, list, sk_bind_node) 537 hlist_for_each_entry(__sk, node, list, sk_bind_node)
538 538
539 /* Sock flags */ 539 /* Sock flags */
540 enum sock_flags { 540 enum sock_flags {
541 SOCK_DEAD, 541 SOCK_DEAD,
542 SOCK_DONE, 542 SOCK_DONE,
543 SOCK_URGINLINE, 543 SOCK_URGINLINE,
544 SOCK_KEEPOPEN, 544 SOCK_KEEPOPEN,
545 SOCK_LINGER, 545 SOCK_LINGER,
546 SOCK_DESTROY, 546 SOCK_DESTROY,
547 SOCK_BROADCAST, 547 SOCK_BROADCAST,
548 SOCK_TIMESTAMP, 548 SOCK_TIMESTAMP,
549 SOCK_ZAPPED, 549 SOCK_ZAPPED,
550 SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ 550 SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
551 SOCK_DBG, /* %SO_DEBUG setting */ 551 SOCK_DBG, /* %SO_DEBUG setting */
552 SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ 552 SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
553 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ 553 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
554 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ 554 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
555 SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ 555 SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
556 SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ 556 SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */
557 SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ 557 SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */
558 SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ 558 SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */
559 SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ 559 SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */
560 SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */ 560 SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */
561 SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */ 561 SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */
562 SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */ 562 SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */
563 SOCK_FASYNC, /* fasync() active */ 563 SOCK_FASYNC, /* fasync() active */
564 SOCK_RXQ_OVFL, 564 SOCK_RXQ_OVFL,
565 }; 565 };
566 566
567 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) 567 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
568 { 568 {
569 nsk->sk_flags = osk->sk_flags; 569 nsk->sk_flags = osk->sk_flags;
570 } 570 }
571 571
572 static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) 572 static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
573 { 573 {
574 __set_bit(flag, &sk->sk_flags); 574 __set_bit(flag, &sk->sk_flags);
575 } 575 }
576 576
577 static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) 577 static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
578 { 578 {
579 __clear_bit(flag, &sk->sk_flags); 579 __clear_bit(flag, &sk->sk_flags);
580 } 580 }
581 581
582 static inline int sock_flag(struct sock *sk, enum sock_flags flag) 582 static inline int sock_flag(struct sock *sk, enum sock_flags flag)
583 { 583 {
584 return test_bit(flag, &sk->sk_flags); 584 return test_bit(flag, &sk->sk_flags);
585 } 585 }
586 586
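A small sketch of the flag helpers (hypothetical function): SOCK_DBG is the bit behind the SOCK_DEBUG() macro near the top of this header, and setting it here is roughly what the %SO_DEBUG option handler does internally.

static void example_enable_debug(struct sock *sk)
{
	sock_set_flag(sk, SOCK_DBG);		/* mark the socket for debugging */
	if (sock_flag(sk, SOCK_DBG))
		SOCK_DEBUG(sk, "socket debugging enabled\n");
}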
587 static inline void sk_acceptq_removed(struct sock *sk) 587 static inline void sk_acceptq_removed(struct sock *sk)
588 { 588 {
589 sk->sk_ack_backlog--; 589 sk->sk_ack_backlog--;
590 } 590 }
591 591
592 static inline void sk_acceptq_added(struct sock *sk) 592 static inline void sk_acceptq_added(struct sock *sk)
593 { 593 {
594 sk->sk_ack_backlog++; 594 sk->sk_ack_backlog++;
595 } 595 }
596 596
597 static inline int sk_acceptq_is_full(struct sock *sk) 597 static inline int sk_acceptq_is_full(struct sock *sk)
598 { 598 {
599 return sk->sk_ack_backlog > sk->sk_max_ack_backlog; 599 return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
600 } 600 }
601 601
602 /* 602 /*
603 * Compute minimal free write space needed to queue new packets. 603 * Compute minimal free write space needed to queue new packets.
604 */ 604 */
605 static inline int sk_stream_min_wspace(struct sock *sk) 605 static inline int sk_stream_min_wspace(struct sock *sk)
606 { 606 {
607 return sk->sk_wmem_queued >> 1; 607 return sk->sk_wmem_queued >> 1;
608 } 608 }
609 609
610 static inline int sk_stream_wspace(struct sock *sk) 610 static inline int sk_stream_wspace(struct sock *sk)
611 { 611 {
612 return sk->sk_sndbuf - sk->sk_wmem_queued; 612 return sk->sk_sndbuf - sk->sk_wmem_queued;
613 } 613 }
614 614
615 extern void sk_stream_write_space(struct sock *sk); 615 extern void sk_stream_write_space(struct sock *sk);
616 616
617 static inline int sk_stream_memory_free(struct sock *sk) 617 static inline int sk_stream_memory_free(struct sock *sk)
618 { 618 {
619 return sk->sk_wmem_queued < sk->sk_sndbuf; 619 return sk->sk_wmem_queued < sk->sk_sndbuf;
620 } 620 }
621 621
622 /* OOB backlog add */ 622 /* OOB backlog add */
623 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) 623 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
624 { 624 {
625 /* dont let skb dst not refcounted, we are going to leave rcu lock */ 625 /* dont let skb dst not refcounted, we are going to leave rcu lock */
626 skb_dst_force(skb); 626 skb_dst_force(skb);
627 627
628 if (!sk->sk_backlog.tail) 628 if (!sk->sk_backlog.tail)
629 sk->sk_backlog.head = skb; 629 sk->sk_backlog.head = skb;
630 else 630 else
631 sk->sk_backlog.tail->next = skb; 631 sk->sk_backlog.tail->next = skb;
632 632
633 sk->sk_backlog.tail = skb; 633 sk->sk_backlog.tail = skb;
634 skb->next = NULL; 634 skb->next = NULL;
635 } 635 }
636 636
637 /* 637 /*
638 * Take into account size of receive queue and backlog queue 638 * Take into account size of receive queue and backlog queue
639 */ 639 */
640 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) 640 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
641 { 641 {
642 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); 642 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
643 643
644 return qsize + skb->truesize > sk->sk_rcvbuf; 644 return qsize + skb->truesize > sk->sk_rcvbuf;
645 } 645 }
646 646
647 /* The per-socket spinlock must be held here. */ 647 /* The per-socket spinlock must be held here. */
648 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) 648 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb)
649 { 649 {
650 if (sk_rcvqueues_full(sk, skb)) 650 if (sk_rcvqueues_full(sk, skb))
651 return -ENOBUFS; 651 return -ENOBUFS;
652 652
653 __sk_add_backlog(sk, skb); 653 __sk_add_backlog(sk, skb);
654 sk->sk_backlog.len += skb->truesize; 654 sk->sk_backlog.len += skb->truesize;
655 return 0; 655 return 0;
656 } 656 }
657 657
658 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 658 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
659 { 659 {
660 return sk->sk_backlog_rcv(sk, skb); 660 return sk->sk_backlog_rcv(sk, skb);
661 } 661 }
662 662
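sk_add_backlog() is meant for the softirq receive path when a process context currently owns the socket; release_sock() later replays the backlog through sk_backlog_rcv(). A sketch of that usual pattern (the function name is hypothetical, modelled on what protocol ->rcv handlers do):

static int example_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	int rc = 0;

	bh_lock_sock(sk);			/* the per-socket spinlock */
	if (!sock_owned_by_user(sk)) {
		rc = sk_backlog_rcv(sk, skb);	/* process directly */
	} else if (sk_add_backlog(sk, skb)) {
		/* receive queue plus backlog exceeded sk_rcvbuf: drop */
		kfree_skb(skb);
		rc = -ENOBUFS;
	}
	bh_unlock_sock(sk);
	return rc;
}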
663 static inline void sock_rps_record_flow(const struct sock *sk) 663 static inline void sock_rps_record_flow(const struct sock *sk)
664 { 664 {
665 #ifdef CONFIG_RPS 665 #ifdef CONFIG_RPS
666 struct rps_sock_flow_table *sock_flow_table; 666 struct rps_sock_flow_table *sock_flow_table;
667 667
668 rcu_read_lock(); 668 rcu_read_lock();
669 sock_flow_table = rcu_dereference(rps_sock_flow_table); 669 sock_flow_table = rcu_dereference(rps_sock_flow_table);
670 rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); 670 rps_record_sock_flow(sock_flow_table, sk->sk_rxhash);
671 rcu_read_unlock(); 671 rcu_read_unlock();
672 #endif 672 #endif
673 } 673 }
674 674
675 static inline void sock_rps_reset_flow(const struct sock *sk) 675 static inline void sock_rps_reset_flow(const struct sock *sk)
676 { 676 {
677 #ifdef CONFIG_RPS 677 #ifdef CONFIG_RPS
678 struct rps_sock_flow_table *sock_flow_table; 678 struct rps_sock_flow_table *sock_flow_table;
679 679
680 rcu_read_lock(); 680 rcu_read_lock();
681 sock_flow_table = rcu_dereference(rps_sock_flow_table); 681 sock_flow_table = rcu_dereference(rps_sock_flow_table);
682 rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash); 682 rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash);
683 rcu_read_unlock(); 683 rcu_read_unlock();
684 #endif 684 #endif
685 } 685 }
686 686
687 static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash) 687 static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash)
688 { 688 {
689 #ifdef CONFIG_RPS 689 #ifdef CONFIG_RPS
690 if (unlikely(sk->sk_rxhash != rxhash)) { 690 if (unlikely(sk->sk_rxhash != rxhash)) {
691 sock_rps_reset_flow(sk); 691 sock_rps_reset_flow(sk);
692 sk->sk_rxhash = rxhash; 692 sk->sk_rxhash = rxhash;
693 } 693 }
694 #endif 694 #endif
695 } 695 }
696 696
697 #define sk_wait_event(__sk, __timeo, __condition) \ 697 #define sk_wait_event(__sk, __timeo, __condition) \
698 ({ int __rc; \ 698 ({ int __rc; \
699 release_sock(__sk); \ 699 release_sock(__sk); \
700 __rc = __condition; \ 700 __rc = __condition; \
701 if (!__rc) { \ 701 if (!__rc) { \
702 *(__timeo) = schedule_timeout(*(__timeo)); \ 702 *(__timeo) = schedule_timeout(*(__timeo)); \
703 } \ 703 } \
704 lock_sock(__sk); \ 704 lock_sock(__sk); \
705 __rc = __condition; \ 705 __rc = __condition; \
706 __rc; \ 706 __rc; \
707 }) 707 })
708 708
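sk_wait_event() releases the socket lock around the sleep and re-checks the condition after re-acquiring it, so callers wrap it in a prepare_to_wait()/finish_wait() pair on sk_sleep(sk). A sketch in the style of the sk_stream_wait_*() helpers declared just below, assuming the caller already holds the socket lock via lock_sock():

#include <linux/wait.h>

static int example_wait_for_sndbuf(struct sock *sk, long *timeo)
{
	DEFINE_WAIT(wait);
	int done;

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	done = sk_wait_event(sk, timeo, sk_stream_memory_free(sk));
	finish_wait(sk_sleep(sk), &wait);

	return done ? 0 : -EAGAIN;	/* caller decides how to handle timeout */
}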
709 extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); 709 extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
710 extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); 710 extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
711 extern void sk_stream_wait_close(struct sock *sk, long timeo_p); 711 extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
712 extern int sk_stream_error(struct sock *sk, int flags, int err); 712 extern int sk_stream_error(struct sock *sk, int flags, int err);
713 extern void sk_stream_kill_queues(struct sock *sk); 713 extern void sk_stream_kill_queues(struct sock *sk);
714 714
715 extern int sk_wait_data(struct sock *sk, long *timeo); 715 extern int sk_wait_data(struct sock *sk, long *timeo);
716 716
717 struct request_sock_ops; 717 struct request_sock_ops;
718 struct timewait_sock_ops; 718 struct timewait_sock_ops;
719 struct inet_hashinfo; 719 struct inet_hashinfo;
720 struct raw_hashinfo; 720 struct raw_hashinfo;
721 721
722 /* Networking protocol blocks we attach to sockets. 722 /* Networking protocol blocks we attach to sockets.
723 * socket layer -> transport layer interface 723 * socket layer -> transport layer interface
724 * transport -> network interface is defined by struct inet_proto 724 * transport -> network interface is defined by struct inet_proto
725 */ 725 */
726 struct proto { 726 struct proto {
727 void (*close)(struct sock *sk, 727 void (*close)(struct sock *sk,
728 long timeout); 728 long timeout);
729 int (*connect)(struct sock *sk, 729 int (*connect)(struct sock *sk,
730 struct sockaddr *uaddr, 730 struct sockaddr *uaddr,
731 int addr_len); 731 int addr_len);
732 int (*disconnect)(struct sock *sk, int flags); 732 int (*disconnect)(struct sock *sk, int flags);
733 733
734 struct sock * (*accept) (struct sock *sk, int flags, int *err); 734 struct sock * (*accept) (struct sock *sk, int flags, int *err);
735 735
736 int (*ioctl)(struct sock *sk, int cmd, 736 int (*ioctl)(struct sock *sk, int cmd,
737 unsigned long arg); 737 unsigned long arg);
738 int (*init)(struct sock *sk); 738 int (*init)(struct sock *sk);
739 void (*destroy)(struct sock *sk); 739 void (*destroy)(struct sock *sk);
740 void (*shutdown)(struct sock *sk, int how); 740 void (*shutdown)(struct sock *sk, int how);
741 int (*setsockopt)(struct sock *sk, int level, 741 int (*setsockopt)(struct sock *sk, int level,
742 int optname, char __user *optval, 742 int optname, char __user *optval,
743 unsigned int optlen); 743 unsigned int optlen);
744 int (*getsockopt)(struct sock *sk, int level, 744 int (*getsockopt)(struct sock *sk, int level,
745 int optname, char __user *optval, 745 int optname, char __user *optval,
746 int __user *option); 746 int __user *option);
747 #ifdef CONFIG_COMPAT 747 #ifdef CONFIG_COMPAT
748 int (*compat_setsockopt)(struct sock *sk, 748 int (*compat_setsockopt)(struct sock *sk,
749 int level, 749 int level,
750 int optname, char __user *optval, 750 int optname, char __user *optval,
751 unsigned int optlen); 751 unsigned int optlen);
752 int (*compat_getsockopt)(struct sock *sk, 752 int (*compat_getsockopt)(struct sock *sk,
753 int level, 753 int level,
754 int optname, char __user *optval, 754 int optname, char __user *optval,
755 int __user *option); 755 int __user *option);
756 int (*compat_ioctl)(struct sock *sk, 756 int (*compat_ioctl)(struct sock *sk,
757 unsigned int cmd, unsigned long arg); 757 unsigned int cmd, unsigned long arg);
758 #endif 758 #endif
759 int (*sendmsg)(struct kiocb *iocb, struct sock *sk, 759 int (*sendmsg)(struct kiocb *iocb, struct sock *sk,
760 struct msghdr *msg, size_t len); 760 struct msghdr *msg, size_t len);
761 int (*recvmsg)(struct kiocb *iocb, struct sock *sk, 761 int (*recvmsg)(struct kiocb *iocb, struct sock *sk,
762 struct msghdr *msg, 762 struct msghdr *msg,
763 size_t len, int noblock, int flags, 763 size_t len, int noblock, int flags,
764 int *addr_len); 764 int *addr_len);
765 int (*sendpage)(struct sock *sk, struct page *page, 765 int (*sendpage)(struct sock *sk, struct page *page,
766 int offset, size_t size, int flags); 766 int offset, size_t size, int flags);
767 int (*bind)(struct sock *sk, 767 int (*bind)(struct sock *sk,
768 struct sockaddr *uaddr, int addr_len); 768 struct sockaddr *uaddr, int addr_len);
769 769
770 int (*backlog_rcv) (struct sock *sk, 770 int (*backlog_rcv) (struct sock *sk,
771 struct sk_buff *skb); 771 struct sk_buff *skb);
772 772
773 /* Keeping track of sk's, looking them up, and port selection methods. */ 773 /* Keeping track of sk's, looking them up, and port selection methods. */
774 void (*hash)(struct sock *sk); 774 void (*hash)(struct sock *sk);
775 void (*unhash)(struct sock *sk); 775 void (*unhash)(struct sock *sk);
776 void (*rehash)(struct sock *sk); 776 void (*rehash)(struct sock *sk);
777 int (*get_port)(struct sock *sk, unsigned short snum); 777 int (*get_port)(struct sock *sk, unsigned short snum);
778 void (*clear_sk)(struct sock *sk, int size); 778 void (*clear_sk)(struct sock *sk, int size);
779 779
780 /* Keeping track of sockets in use */ 780 /* Keeping track of sockets in use */
781 #ifdef CONFIG_PROC_FS 781 #ifdef CONFIG_PROC_FS
782 unsigned int inuse_idx; 782 unsigned int inuse_idx;
783 #endif 783 #endif
784 784
785 /* Memory pressure */ 785 /* Memory pressure */
786 void (*enter_memory_pressure)(struct sock *sk); 786 void (*enter_memory_pressure)(struct sock *sk);
787 atomic_long_t *memory_allocated; /* Current allocated memory. */ 787 atomic_long_t *memory_allocated; /* Current allocated memory. */
788 struct percpu_counter *sockets_allocated; /* Current number of sockets. */ 788 struct percpu_counter *sockets_allocated; /* Current number of sockets. */
789 /* 789 /*
790 * Pressure flag: try to collapse. 790 * Pressure flag: try to collapse.
791 * Technical note: it is used by multiple contexts non-atomically.          791 * Technical note: it is used by multiple contexts non-atomically.
792 * All of __sk_mem_schedule() is of this nature: accounting          792 * All of __sk_mem_schedule() is of this nature: accounting
793 * is strict, actions are advisory and have some latency. 793 * is strict, actions are advisory and have some latency.
794 */ 794 */
795 int *memory_pressure; 795 int *memory_pressure;
796 long *sysctl_mem; 796 long *sysctl_mem;
797 int *sysctl_wmem; 797 int *sysctl_wmem;
798 int *sysctl_rmem; 798 int *sysctl_rmem;
799 int max_header; 799 int max_header;
800 bool no_autobind; 800 bool no_autobind;
801 801
802 struct kmem_cache *slab; 802 struct kmem_cache *slab;
803 unsigned int obj_size; 803 unsigned int obj_size;
804 int slab_flags; 804 int slab_flags;
805 805
806 struct percpu_counter *orphan_count; 806 struct percpu_counter *orphan_count;
807 807
808 struct request_sock_ops *rsk_prot; 808 struct request_sock_ops *rsk_prot;
809 struct timewait_sock_ops *twsk_prot; 809 struct timewait_sock_ops *twsk_prot;
810 810
811 union { 811 union {
812 struct inet_hashinfo *hashinfo; 812 struct inet_hashinfo *hashinfo;
813 struct udp_table *udp_table; 813 struct udp_table *udp_table;
814 struct raw_hashinfo *raw_hash; 814 struct raw_hashinfo *raw_hash;
815 } h; 815 } h;
816 816
817 struct module *owner; 817 struct module *owner;
818 818
819 char name[32]; 819 char name[32];
820 820
821 struct list_head node; 821 struct list_head node;
822 #ifdef SOCK_REFCNT_DEBUG 822 #ifdef SOCK_REFCNT_DEBUG
823 atomic_t socks; 823 atomic_t socks;
824 #endif 824 #endif
825 }; 825 };
826 826
827 extern int proto_register(struct proto *prot, int alloc_slab); 827 extern int proto_register(struct proto *prot, int alloc_slab);
828 extern void proto_unregister(struct proto *prot); 828 extern void proto_unregister(struct proto *prot);
829 829
830 #ifdef SOCK_REFCNT_DEBUG 830 #ifdef SOCK_REFCNT_DEBUG
831 static inline void sk_refcnt_debug_inc(struct sock *sk) 831 static inline void sk_refcnt_debug_inc(struct sock *sk)
832 { 832 {
833 atomic_inc(&sk->sk_prot->socks); 833 atomic_inc(&sk->sk_prot->socks);
834 } 834 }
835 835
836 static inline void sk_refcnt_debug_dec(struct sock *sk) 836 static inline void sk_refcnt_debug_dec(struct sock *sk)
837 { 837 {
838 atomic_dec(&sk->sk_prot->socks); 838 atomic_dec(&sk->sk_prot->socks);
839 printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", 839 printk(KERN_DEBUG "%s socket %p released, %d are still alive\n",
840 sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); 840 sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks));
841 } 841 }
842 842
843 static inline void sk_refcnt_debug_release(const struct sock *sk) 843 static inline void sk_refcnt_debug_release(const struct sock *sk)
844 { 844 {
845 if (atomic_read(&sk->sk_refcnt) != 1) 845 if (atomic_read(&sk->sk_refcnt) != 1)
846 printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", 846 printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
847 sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); 847 sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt));
848 } 848 }
849 #else /* SOCK_REFCNT_DEBUG */ 849 #else /* SOCK_REFCNT_DEBUG */
850 #define sk_refcnt_debug_inc(sk) do { } while (0) 850 #define sk_refcnt_debug_inc(sk) do { } while (0)
851 #define sk_refcnt_debug_dec(sk) do { } while (0) 851 #define sk_refcnt_debug_dec(sk) do { } while (0)
852 #define sk_refcnt_debug_release(sk) do { } while (0) 852 #define sk_refcnt_debug_release(sk) do { } while (0)
853 #endif /* SOCK_REFCNT_DEBUG */ 853 #endif /* SOCK_REFCNT_DEBUG */
854 854
855 855
856 #ifdef CONFIG_PROC_FS 856 #ifdef CONFIG_PROC_FS
857 /* Called with local bh disabled */ 857 /* Called with local bh disabled */
858 extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); 858 extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
859 extern int sock_prot_inuse_get(struct net *net, struct proto *proto); 859 extern int sock_prot_inuse_get(struct net *net, struct proto *proto);
860 #else 860 #else
861 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,          861 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
862 int inc) 862 int inc)
863 { 863 {
864 } 864 }
865 #endif 865 #endif
866 866
867 867
868 /* With per-bucket locks this operation is not-atomic, so that 868 /* With per-bucket locks this operation is not-atomic, so that
869 * this version is not worse. 869 * this version is not worse.
870 */ 870 */
871 static inline void __sk_prot_rehash(struct sock *sk) 871 static inline void __sk_prot_rehash(struct sock *sk)
872 { 872 {
873 sk->sk_prot->unhash(sk); 873 sk->sk_prot->unhash(sk);
874 sk->sk_prot->hash(sk); 874 sk->sk_prot->hash(sk);
875 } 875 }
876 876
877 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); 877 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
878 878
879 /* About 10 seconds */ 879 /* About 10 seconds */
880 #define SOCK_DESTROY_TIME (10*HZ) 880 #define SOCK_DESTROY_TIME (10*HZ)
881 881
882 /* Sockets 0-1023 can't be bound to unless you are superuser */ 882 /* Sockets 0-1023 can't be bound to unless you are superuser */
883 #define PROT_SOCK 1024 883 #define PROT_SOCK 1024
884 884
885 #define SHUTDOWN_MASK 3 885 #define SHUTDOWN_MASK 3
886 #define RCV_SHUTDOWN 1 886 #define RCV_SHUTDOWN 1
887 #define SEND_SHUTDOWN 2 887 #define SEND_SHUTDOWN 2
888 888
889 #define SOCK_SNDBUF_LOCK 1 889 #define SOCK_SNDBUF_LOCK 1
890 #define SOCK_RCVBUF_LOCK 2 890 #define SOCK_RCVBUF_LOCK 2
891 #define SOCK_BINDADDR_LOCK 4 891 #define SOCK_BINDADDR_LOCK 4
892 #define SOCK_BINDPORT_LOCK 8 892 #define SOCK_BINDPORT_LOCK 8
893 893
894 /* sock_iocb: used to kick off async processing of socket ios */ 894 /* sock_iocb: used to kick off async processing of socket ios */
895 struct sock_iocb { 895 struct sock_iocb {
896 struct list_head list; 896 struct list_head list;
897 897
898 int flags; 898 int flags;
899 int size; 899 int size;
900 struct socket *sock; 900 struct socket *sock;
901 struct sock *sk; 901 struct sock *sk;
902 struct scm_cookie *scm; 902 struct scm_cookie *scm;
903 struct msghdr *msg, async_msg; 903 struct msghdr *msg, async_msg;
904 struct kiocb *kiocb; 904 struct kiocb *kiocb;
905 }; 905 };
906 906
907 static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb) 907 static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb)
908 { 908 {
909 return (struct sock_iocb *)iocb->private; 909 return (struct sock_iocb *)iocb->private;
910 } 910 }
911 911
912 static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si) 912 static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
913 { 913 {
914 return si->kiocb; 914 return si->kiocb;
915 } 915 }
916 916
917 struct socket_alloc { 917 struct socket_alloc {
918 struct socket socket; 918 struct socket socket;
919 struct inode vfs_inode; 919 struct inode vfs_inode;
920 }; 920 };
921 921
922 static inline struct socket *SOCKET_I(struct inode *inode) 922 static inline struct socket *SOCKET_I(struct inode *inode)
923 { 923 {
924 return &container_of(inode, struct socket_alloc, vfs_inode)->socket; 924 return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
925 } 925 }
926 926
927 static inline struct inode *SOCK_INODE(struct socket *socket) 927 static inline struct inode *SOCK_INODE(struct socket *socket)
928 { 928 {
929 return &container_of(socket, struct socket_alloc, socket)->vfs_inode; 929 return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
930 } 930 }
931 931
932 /* 932 /*
933 * Functions for memory accounting 933 * Functions for memory accounting
934 */ 934 */
935 extern int __sk_mem_schedule(struct sock *sk, int size, int kind); 935 extern int __sk_mem_schedule(struct sock *sk, int size, int kind);
936 extern void __sk_mem_reclaim(struct sock *sk); 936 extern void __sk_mem_reclaim(struct sock *sk);
937 937
938 #define SK_MEM_QUANTUM ((int)PAGE_SIZE) 938 #define SK_MEM_QUANTUM ((int)PAGE_SIZE)
939 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) 939 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
940 #define SK_MEM_SEND 0 940 #define SK_MEM_SEND 0
941 #define SK_MEM_RECV 1 941 #define SK_MEM_RECV 1
942 942
943 static inline int sk_mem_pages(int amt) 943 static inline int sk_mem_pages(int amt)
944 { 944 {
945 return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; 945 return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
946 } 946 }
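
As a quick worked example of the rounding above: assuming a 4096-byte PAGE_SIZE (so SK_MEM_QUANTUM_SHIFT is 12), sk_mem_pages(6000) evaluates to (6000 + 4095) >> 12 = 2 quanta, while sk_mem_pages(4096) is exactly 1.
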
947 947
948 static inline int sk_has_account(struct sock *sk) 948 static inline int sk_has_account(struct sock *sk)
949 { 949 {
950 /* return true if protocol supports memory accounting */ 950 /* return true if protocol supports memory accounting */
951 return !!sk->sk_prot->memory_allocated; 951 return !!sk->sk_prot->memory_allocated;
952 } 952 }
953 953
954 static inline int sk_wmem_schedule(struct sock *sk, int size) 954 static inline int sk_wmem_schedule(struct sock *sk, int size)
955 { 955 {
956 if (!sk_has_account(sk)) 956 if (!sk_has_account(sk))
957 return 1; 957 return 1;
958 return size <= sk->sk_forward_alloc || 958 return size <= sk->sk_forward_alloc ||
959 __sk_mem_schedule(sk, size, SK_MEM_SEND); 959 __sk_mem_schedule(sk, size, SK_MEM_SEND);
960 } 960 }
961 961
962 static inline int sk_rmem_schedule(struct sock *sk, int size) 962 static inline int sk_rmem_schedule(struct sock *sk, int size)
963 { 963 {
964 if (!sk_has_account(sk)) 964 if (!sk_has_account(sk))
965 return 1; 965 return 1;
966 return size <= sk->sk_forward_alloc || 966 return size <= sk->sk_forward_alloc ||
967 __sk_mem_schedule(sk, size, SK_MEM_RECV); 967 __sk_mem_schedule(sk, size, SK_MEM_RECV);
968 } 968 }
969 969
970 static inline void sk_mem_reclaim(struct sock *sk) 970 static inline void sk_mem_reclaim(struct sock *sk)
971 { 971 {
972 if (!sk_has_account(sk)) 972 if (!sk_has_account(sk))
973 return; 973 return;
974 if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) 974 if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
975 __sk_mem_reclaim(sk); 975 __sk_mem_reclaim(sk);
976 } 976 }
977 977
978 static inline void sk_mem_reclaim_partial(struct sock *sk) 978 static inline void sk_mem_reclaim_partial(struct sock *sk)
979 { 979 {
980 if (!sk_has_account(sk)) 980 if (!sk_has_account(sk))
981 return; 981 return;
982 if (sk->sk_forward_alloc > SK_MEM_QUANTUM) 982 if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
983 __sk_mem_reclaim(sk); 983 __sk_mem_reclaim(sk);
984 } 984 }
985 985
986 static inline void sk_mem_charge(struct sock *sk, int size) 986 static inline void sk_mem_charge(struct sock *sk, int size)
987 { 987 {
988 if (!sk_has_account(sk)) 988 if (!sk_has_account(sk))
989 return; 989 return;
990 sk->sk_forward_alloc -= size; 990 sk->sk_forward_alloc -= size;
991 } 991 }
992 992
993 static inline void sk_mem_uncharge(struct sock *sk, int size) 993 static inline void sk_mem_uncharge(struct sock *sk, int size)
994 { 994 {
995 if (!sk_has_account(sk)) 995 if (!sk_has_account(sk))
996 return; 996 return;
997 sk->sk_forward_alloc += size; 997 sk->sk_forward_alloc += size;
998 } 998 }
999 999
1000 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) 1000 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
1001 { 1001 {
1002 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); 1002 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1003 sk->sk_wmem_queued -= skb->truesize; 1003 sk->sk_wmem_queued -= skb->truesize;
1004 sk_mem_uncharge(sk, skb->truesize); 1004 sk_mem_uncharge(sk, skb->truesize);
1005 __kfree_skb(skb); 1005 __kfree_skb(skb);
1006 } 1006 }
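
Taken together, the helpers above give the usual send-side accounting lifecycle: reserve forward-allocated space with sk_wmem_schedule(), charge the bytes that are actually queued with sk_mem_charge(), and let sk_wmem_free_skb()/sk_mem_reclaim() return the space when the skb is freed. A hedged sketch (the function name and the -ENOBUFS error choice are illustrative only):

static int charge_send_bytes(struct sock *sk, int len)
{
	/* May fall back to __sk_mem_schedule() to grow sk_forward_alloc. */
	if (!sk_wmem_schedule(sk, len))
		return -ENOBUFS;
	sk->sk_wmem_queued += len;
	sk_mem_charge(sk, len);		/* consumes sk_forward_alloc */
	return 0;
}
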
1007 1007
1008 /* Used by processes to "lock" a socket state, so that 1008 /* Used by processes to "lock" a socket state, so that
1009 * interrupts and bottom half handlers won't change it 1009 * interrupts and bottom half handlers won't change it
1010 * from under us. It essentially blocks any incoming 1010 * from under us. It essentially blocks any incoming
1011 * packets, so that we won't get any new data or any 1011 * packets, so that we won't get any new data or any
1012 * packets that change the state of the socket. 1012 * packets that change the state of the socket.
1013 * 1013 *
1014 * While locked, BH processing will add new packets to 1014 * While locked, BH processing will add new packets to
1015 * the backlog queue. This queue is processed by the 1015 * the backlog queue. This queue is processed by the
1016 * owner of the socket lock right before it is released. 1016 * owner of the socket lock right before it is released.
1017 * 1017 *
1018 * Since ~2.3.5 it is also an exclusive sleep lock serializing          1018 * Since ~2.3.5 it is also an exclusive sleep lock serializing
1019 * accesses from user process context. 1019 * accesses from user process context.
1020 */ 1020 */
1021 #define sock_owned_by_user(sk) ((sk)->sk_lock.owned) 1021 #define sock_owned_by_user(sk) ((sk)->sk_lock.owned)
1022 1022
1023 /* 1023 /*
1024 * Macro so as to not evaluate some arguments when 1024 * Macro so as to not evaluate some arguments when
1025 * lockdep is not enabled. 1025 * lockdep is not enabled.
1026 * 1026 *
1027 * Mark both the sk_lock and the sk_lock.slock as a 1027 * Mark both the sk_lock and the sk_lock.slock as a
1028 * per-address-family lock class. 1028 * per-address-family lock class.
1029 */ 1029 */
1030 #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ 1030 #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \
1031 do { \ 1031 do { \
1032 sk->sk_lock.owned = 0; \ 1032 sk->sk_lock.owned = 0; \
1033 init_waitqueue_head(&sk->sk_lock.wq); \ 1033 init_waitqueue_head(&sk->sk_lock.wq); \
1034 spin_lock_init(&(sk)->sk_lock.slock); \ 1034 spin_lock_init(&(sk)->sk_lock.slock); \
1035 debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ 1035 debug_check_no_locks_freed((void *)&(sk)->sk_lock, \
1036 sizeof((sk)->sk_lock)); \ 1036 sizeof((sk)->sk_lock)); \
1037 lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ 1037 lockdep_set_class_and_name(&(sk)->sk_lock.slock, \
1038 (skey), (sname)); \ 1038 (skey), (sname)); \
1039 lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ 1039 lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
1040 } while (0) 1040 } while (0)
1041 1041
1042 extern void lock_sock_nested(struct sock *sk, int subclass); 1042 extern void lock_sock_nested(struct sock *sk, int subclass);
1043 1043
1044 static inline void lock_sock(struct sock *sk) 1044 static inline void lock_sock(struct sock *sk)
1045 { 1045 {
1046 lock_sock_nested(sk, 0); 1046 lock_sock_nested(sk, 0);
1047 } 1047 }
1048 1048
1049 extern void release_sock(struct sock *sk); 1049 extern void release_sock(struct sock *sk);
1050 1050
1051 /* BH context may only use the following locking interface. */ 1051 /* BH context may only use the following locking interface. */
1052 #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) 1052 #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock))
1053 #define bh_lock_sock_nested(__sk) \ 1053 #define bh_lock_sock_nested(__sk) \
1054 spin_lock_nested(&((__sk)->sk_lock.slock), \ 1054 spin_lock_nested(&((__sk)->sk_lock.slock), \
1055 SINGLE_DEPTH_NESTING) 1055 SINGLE_DEPTH_NESTING)
1056 #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) 1056 #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock))
1057 1057
1058 extern bool lock_sock_fast(struct sock *sk); 1058 extern bool lock_sock_fast(struct sock *sk);
1059 /** 1059 /**
1060 * unlock_sock_fast - complement of lock_sock_fast 1060 * unlock_sock_fast - complement of lock_sock_fast
1061 * @sk: socket 1061 * @sk: socket
1062 * @slow: slow mode 1062 * @slow: slow mode
1063 * 1063 *
1064 * Fast unlock of the socket for user context.          1064 * Fast unlock of the socket for user context.
1065 * If slow mode is on, we call the regular release_sock().          1065 * If slow mode is on, we call the regular release_sock().
1066 */ 1066 */
1067 static inline void unlock_sock_fast(struct sock *sk, bool slow) 1067 static inline void unlock_sock_fast(struct sock *sk, bool slow)
1068 { 1068 {
1069 if (slow) 1069 if (slow)
1070 release_sock(sk); 1070 release_sock(sk);
1071 else 1071 else
1072 spin_unlock_bh(&sk->sk_lock.slock); 1072 spin_unlock_bh(&sk->sk_lock.slock);
1073 } 1073 }
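
The locking helpers above split into a process-context owner lock and a BH spinlock: lock_sock()/release_sock() take the exclusive sleeping lock (and release_sock() also runs the backlog accumulated meanwhile), bh_lock_sock()/bh_unlock_sock() are for bottom-half context only, and lock_sock_fast() lets a short critical section skip the owner lock when nobody holds it. A small illustrative sketch (the function name is hypothetical):

static void touch_socket_state(struct sock *sk)
{
	bool slow;

	/* Process context: exclusive sleeping lock. */
	lock_sock(sk);
	/* ... modify socket state, queue data ... */
	release_sock(sk);	/* also processes the accumulated backlog */

	/* Fast variant: 'slow' is true if the full owner lock had to be taken. */
	slow = lock_sock_fast(sk);
	/* ... short critical section ... */
	unlock_sock_fast(sk, slow);
}
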
1074 1074
1075 1075
1076 extern struct sock *sk_alloc(struct net *net, int family, 1076 extern struct sock *sk_alloc(struct net *net, int family,
1077 gfp_t priority, 1077 gfp_t priority,
1078 struct proto *prot); 1078 struct proto *prot);
1079 extern void sk_free(struct sock *sk); 1079 extern void sk_free(struct sock *sk);
1080 extern void sk_release_kernel(struct sock *sk); 1080 extern void sk_release_kernel(struct sock *sk);
1081 extern struct sock *sk_clone(const struct sock *sk, 1081 extern struct sock *sk_clone(const struct sock *sk,
1082 const gfp_t priority); 1082 const gfp_t priority);
1083 1083
1084 extern struct sk_buff *sock_wmalloc(struct sock *sk, 1084 extern struct sk_buff *sock_wmalloc(struct sock *sk,
1085 unsigned long size, int force, 1085 unsigned long size, int force,
1086 gfp_t priority); 1086 gfp_t priority);
1087 extern struct sk_buff *sock_rmalloc(struct sock *sk, 1087 extern struct sk_buff *sock_rmalloc(struct sock *sk,
1088 unsigned long size, int force, 1088 unsigned long size, int force,
1089 gfp_t priority); 1089 gfp_t priority);
1090 extern void sock_wfree(struct sk_buff *skb); 1090 extern void sock_wfree(struct sk_buff *skb);
1091 extern void sock_rfree(struct sk_buff *skb); 1091 extern void sock_rfree(struct sk_buff *skb);
1092 1092
1093 extern int sock_setsockopt(struct socket *sock, int level, 1093 extern int sock_setsockopt(struct socket *sock, int level,
1094 int op, char __user *optval, 1094 int op, char __user *optval,
1095 unsigned int optlen); 1095 unsigned int optlen);
1096 1096
1097 extern int sock_getsockopt(struct socket *sock, int level, 1097 extern int sock_getsockopt(struct socket *sock, int level,
1098 int op, char __user *optval, 1098 int op, char __user *optval,
1099 int __user *optlen); 1099 int __user *optlen);
1100 extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, 1100 extern struct sk_buff *sock_alloc_send_skb(struct sock *sk,
1101 unsigned long size, 1101 unsigned long size,
1102 int noblock, 1102 int noblock,
1103 int *errcode); 1103 int *errcode);
1104 extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, 1104 extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1105 unsigned long header_len, 1105 unsigned long header_len,
1106 unsigned long data_len, 1106 unsigned long data_len,
1107 int noblock, 1107 int noblock,
1108 int *errcode); 1108 int *errcode);
1109 extern void *sock_kmalloc(struct sock *sk, int size, 1109 extern void *sock_kmalloc(struct sock *sk, int size,
1110 gfp_t priority); 1110 gfp_t priority);
1111 extern void sock_kfree_s(struct sock *sk, void *mem, int size); 1111 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
1112 extern void sk_send_sigurg(struct sock *sk); 1112 extern void sk_send_sigurg(struct sock *sk);
1113 1113
1114 #ifdef CONFIG_CGROUPS 1114 #ifdef CONFIG_CGROUPS
1115 extern void sock_update_classid(struct sock *sk); 1115 extern void sock_update_classid(struct sock *sk);
1116 #else 1116 #else
1117 static inline void sock_update_classid(struct sock *sk) 1117 static inline void sock_update_classid(struct sock *sk)
1118 { 1118 {
1119 } 1119 }
1120 #endif 1120 #endif
1121 1121
1122 /* 1122 /*
1123 * Functions to fill in entries in struct proto_ops when a protocol 1123 * Functions to fill in entries in struct proto_ops when a protocol
1124 * does not implement a particular function. 1124 * does not implement a particular function.
1125 */ 1125 */
1126 extern int sock_no_bind(struct socket *, 1126 extern int sock_no_bind(struct socket *,
1127 struct sockaddr *, int); 1127 struct sockaddr *, int);
1128 extern int sock_no_connect(struct socket *, 1128 extern int sock_no_connect(struct socket *,
1129 struct sockaddr *, int, int); 1129 struct sockaddr *, int, int);
1130 extern int sock_no_socketpair(struct socket *, 1130 extern int sock_no_socketpair(struct socket *,
1131 struct socket *); 1131 struct socket *);
1132 extern int sock_no_accept(struct socket *, 1132 extern int sock_no_accept(struct socket *,
1133 struct socket *, int); 1133 struct socket *, int);
1134 extern int sock_no_getname(struct socket *, 1134 extern int sock_no_getname(struct socket *,
1135 struct sockaddr *, int *, int); 1135 struct sockaddr *, int *, int);
1136 extern unsigned int sock_no_poll(struct file *, struct socket *, 1136 extern unsigned int sock_no_poll(struct file *, struct socket *,
1137 struct poll_table_struct *); 1137 struct poll_table_struct *);
1138 extern int sock_no_ioctl(struct socket *, unsigned int, 1138 extern int sock_no_ioctl(struct socket *, unsigned int,
1139 unsigned long); 1139 unsigned long);
1140 extern int sock_no_listen(struct socket *, int); 1140 extern int sock_no_listen(struct socket *, int);
1141 extern int sock_no_shutdown(struct socket *, int); 1141 extern int sock_no_shutdown(struct socket *, int);
1142 extern int sock_no_getsockopt(struct socket *, int , int, 1142 extern int sock_no_getsockopt(struct socket *, int , int,
1143 char __user *, int __user *); 1143 char __user *, int __user *);
1144 extern int sock_no_setsockopt(struct socket *, int, int, 1144 extern int sock_no_setsockopt(struct socket *, int, int,
1145 char __user *, unsigned int); 1145 char __user *, unsigned int);
1146 extern int sock_no_sendmsg(struct kiocb *, struct socket *, 1146 extern int sock_no_sendmsg(struct kiocb *, struct socket *,
1147 struct msghdr *, size_t); 1147 struct msghdr *, size_t);
1148 extern int sock_no_recvmsg(struct kiocb *, struct socket *, 1148 extern int sock_no_recvmsg(struct kiocb *, struct socket *,
1149 struct msghdr *, size_t, int); 1149 struct msghdr *, size_t, int);
1150 extern int sock_no_mmap(struct file *file, 1150 extern int sock_no_mmap(struct file *file,
1151 struct socket *sock, 1151 struct socket *sock,
1152 struct vm_area_struct *vma); 1152 struct vm_area_struct *vma);
1153 extern ssize_t sock_no_sendpage(struct socket *sock, 1153 extern ssize_t sock_no_sendpage(struct socket *sock,
1154 struct page *page, 1154 struct page *page,
1155 int offset, size_t size, 1155 int offset, size_t size,
1156 int flags); 1156 int flags);
1157 1157
1158 /* 1158 /*
1159 * Functions to fill in entries in struct proto_ops when a protocol 1159 * Functions to fill in entries in struct proto_ops when a protocol
1160 * uses the inet style. 1160 * uses the inet style.
1161 */ 1161 */
1162 extern int sock_common_getsockopt(struct socket *sock, int level, int optname, 1162 extern int sock_common_getsockopt(struct socket *sock, int level, int optname,
1163 char __user *optval, int __user *optlen); 1163 char __user *optval, int __user *optlen);
1164 extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, 1164 extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1165 struct msghdr *msg, size_t size, int flags); 1165 struct msghdr *msg, size_t size, int flags);
1166 extern int sock_common_setsockopt(struct socket *sock, int level, int optname, 1166 extern int sock_common_setsockopt(struct socket *sock, int level, int optname,
1167 char __user *optval, unsigned int optlen); 1167 char __user *optval, unsigned int optlen);
1168 extern int compat_sock_common_getsockopt(struct socket *sock, int level, 1168 extern int compat_sock_common_getsockopt(struct socket *sock, int level,
1169 int optname, char __user *optval, int __user *optlen); 1169 int optname, char __user *optval, int __user *optlen);
1170 extern int compat_sock_common_setsockopt(struct socket *sock, int level, 1170 extern int compat_sock_common_setsockopt(struct socket *sock, int level,
1171 int optname, char __user *optval, unsigned int optlen); 1171 int optname, char __user *optval, unsigned int optlen);
1172 1172
1173 extern void sk_common_release(struct sock *sk); 1173 extern void sk_common_release(struct sock *sk);
1174 1174
1175 /* 1175 /*
1176 * Default socket callbacks and setup code 1176 * Default socket callbacks and setup code
1177 */ 1177 */
1178 1178
1179 /* Initialise core socket variables */ 1179 /* Initialise core socket variables */
1180 extern void sock_init_data(struct socket *sock, struct sock *sk); 1180 extern void sock_init_data(struct socket *sock, struct sock *sk);
1181 1181
1182 extern void sk_filter_release_rcu(struct rcu_head *rcu); 1182 extern void sk_filter_release_rcu(struct rcu_head *rcu);
1183 1183
1184 /** 1184 /**
1185 * sk_filter_release - release a socket filter 1185 * sk_filter_release - release a socket filter
1186 * @fp: filter to remove 1186 * @fp: filter to remove
1187 * 1187 *
1188 * Remove a filter from a socket and release its resources. 1188 * Remove a filter from a socket and release its resources.
1189 */ 1189 */
1190 1190
1191 static inline void sk_filter_release(struct sk_filter *fp) 1191 static inline void sk_filter_release(struct sk_filter *fp)
1192 { 1192 {
1193 if (atomic_dec_and_test(&fp->refcnt)) 1193 if (atomic_dec_and_test(&fp->refcnt))
1194 call_rcu(&fp->rcu, sk_filter_release_rcu); 1194 call_rcu(&fp->rcu, sk_filter_release_rcu);
1195 } 1195 }
1196 1196
1197 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 1197 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1198 { 1198 {
1199 unsigned int size = sk_filter_len(fp); 1199 unsigned int size = sk_filter_len(fp);
1200 1200
1201 atomic_sub(size, &sk->sk_omem_alloc); 1201 atomic_sub(size, &sk->sk_omem_alloc);
1202 sk_filter_release(fp); 1202 sk_filter_release(fp);
1203 } 1203 }
1204 1204
1205 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) 1205 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1206 { 1206 {
1207 atomic_inc(&fp->refcnt); 1207 atomic_inc(&fp->refcnt);
1208 atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc); 1208 atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc);
1209 } 1209 }
1210 1210
1211 /* 1211 /*
1212 * Socket reference counting postulates. 1212 * Socket reference counting postulates.
1213 * 1213 *
1214 * * Each user of socket SHOULD hold a reference count. 1214 * * Each user of socket SHOULD hold a reference count.
1215 * * Each access point to socket (a hash table bucket, reference from a list,          1215 * * Each access point to socket (a hash table bucket, reference from a list,
1216 * running timer, skb in flight) MUST hold a reference count.          1216 * running timer, skb in flight) MUST hold a reference count.
1217 * * When reference count hits 0, it means it will never increase back. 1217 * * When reference count hits 0, it means it will never increase back.
1218 * * When reference count hits 0, it means that no references from 1218 * * When reference count hits 0, it means that no references from
1219 * outside exist to this socket and current process on current CPU 1219 * outside exist to this socket and current process on current CPU
1220 * is last user and may/should destroy this socket. 1220 * is last user and may/should destroy this socket.
1221 * * sk_free is called from any context: process, BH, IRQ. When 1221 * * sk_free is called from any context: process, BH, IRQ. When
1222 * it is called, socket has no references from outside -> sk_free 1222 * it is called, socket has no references from outside -> sk_free
1223 * may release descendant resources allocated by the socket, but 1223 * may release descendant resources allocated by the socket, but
1224 * to the time when it is called, socket is NOT referenced by any 1224 * to the time when it is called, socket is NOT referenced by any
1225 * hash tables, lists etc. 1225 * hash tables, lists etc.
1226 * * Packets, delivered from outside (from network or from another process) 1226 * * Packets, delivered from outside (from network or from another process)
1227 * and enqueued on receive/error queues SHOULD NOT grab reference count, 1227 * and enqueued on receive/error queues SHOULD NOT grab reference count,
1228 * when they sit in queue. Otherwise, packets will leak into a hole when          1228 * when they sit in queue. Otherwise, packets will leak into a hole when
1229 * the socket is looked up by one CPU and unhashing is done by another CPU.          1229 * the socket is looked up by one CPU and unhashing is done by another CPU.
1230 * It is true for udp/raw, netlink (leak to receive and error queues), tcp 1230 * It is true for udp/raw, netlink (leak to receive and error queues), tcp
1231 * (leak to backlog). Packet socket does all the processing inside 1231 * (leak to backlog). Packet socket does all the processing inside
1232 * BR_NETPROTO_LOCK, so that it does not have this race condition. UNIX sockets          1232 * BR_NETPROTO_LOCK, so that it does not have this race condition. UNIX sockets
1233 * use a separate SMP lock, so that they are prone too.          1233 * use a separate SMP lock, so that they are prone too.
1234 */ 1234 */
1235 1235
1236 /* Ungrab socket and destroy it, if it was the last reference. */ 1236 /* Ungrab socket and destroy it, if it was the last reference. */
1237 static inline void sock_put(struct sock *sk) 1237 static inline void sock_put(struct sock *sk)
1238 { 1238 {
1239 if (atomic_dec_and_test(&sk->sk_refcnt)) 1239 if (atomic_dec_and_test(&sk->sk_refcnt))
1240 sk_free(sk); 1240 sk_free(sk);
1241 } 1241 }
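
sock_put() above is the drop side of the postulates listed in the comment block: every asynchronous user of the socket (running timer, hash table entry, skb in flight) pins it with a matching reference, taken with sock_hold() from earlier in this header. A minimal sketch of the pairing (the function name is illustrative):

static void use_sock_asynchronously(struct sock *sk)
{
	sock_hold(sk);		/* pin sk before publishing it elsewhere     */
	/* ... sk is now safe to use from a timer, BH or another CPU ... */
	sock_put(sk);		/* the final put ends up calling sk_free(sk) */
}
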
1242 1242
1243 extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb, 1243 extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
1244 const int nested); 1244 const int nested);
1245 1245
1246 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) 1246 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
1247 { 1247 {
1248 sk->sk_tx_queue_mapping = tx_queue; 1248 sk->sk_tx_queue_mapping = tx_queue;
1249 } 1249 }
1250 1250
1251 static inline void sk_tx_queue_clear(struct sock *sk) 1251 static inline void sk_tx_queue_clear(struct sock *sk)
1252 { 1252 {
1253 sk->sk_tx_queue_mapping = -1; 1253 sk->sk_tx_queue_mapping = -1;
1254 } 1254 }
1255 1255
1256 static inline int sk_tx_queue_get(const struct sock *sk) 1256 static inline int sk_tx_queue_get(const struct sock *sk)
1257 { 1257 {
1258 return sk ? sk->sk_tx_queue_mapping : -1; 1258 return sk ? sk->sk_tx_queue_mapping : -1;
1259 } 1259 }
1260 1260
1261 static inline void sk_set_socket(struct sock *sk, struct socket *sock) 1261 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
1262 { 1262 {
1263 sk_tx_queue_clear(sk); 1263 sk_tx_queue_clear(sk);
1264 sk->sk_socket = sock; 1264 sk->sk_socket = sock;
1265 } 1265 }
1266 1266
1267 static inline wait_queue_head_t *sk_sleep(struct sock *sk) 1267 static inline wait_queue_head_t *sk_sleep(struct sock *sk)
1268 { 1268 {
1269 BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); 1269 BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
1270 return &rcu_dereference_raw(sk->sk_wq)->wait; 1270 return &rcu_dereference_raw(sk->sk_wq)->wait;
1271 } 1271 }
1272 /* Detach socket from process context. 1272 /* Detach socket from process context.
1273 * Announce socket dead, detach it from wait queue and inode. 1273 * Announce socket dead, detach it from wait queue and inode.
1274 * Note that the parent inode holds a reference count on this struct sock;          1274 * Note that the parent inode holds a reference count on this struct sock;
1275 * we do not release it in this function, because the protocol          1275 * we do not release it in this function, because the protocol
1276 * probably wants to do some additional cleanups or even continue          1276 * probably wants to do some additional cleanups or even continue
1277 * working with this socket (TCP).          1277 * working with this socket (TCP).
1278 */ 1278 */
1279 static inline void sock_orphan(struct sock *sk) 1279 static inline void sock_orphan(struct sock *sk)
1280 { 1280 {
1281 write_lock_bh(&sk->sk_callback_lock); 1281 write_lock_bh(&sk->sk_callback_lock);
1282 sock_set_flag(sk, SOCK_DEAD); 1282 sock_set_flag(sk, SOCK_DEAD);
1283 sk_set_socket(sk, NULL); 1283 sk_set_socket(sk, NULL);
1284 sk->sk_wq = NULL; 1284 sk->sk_wq = NULL;
1285 write_unlock_bh(&sk->sk_callback_lock); 1285 write_unlock_bh(&sk->sk_callback_lock);
1286 } 1286 }
1287 1287
1288 static inline void sock_graft(struct sock *sk, struct socket *parent) 1288 static inline void sock_graft(struct sock *sk, struct socket *parent)
1289 { 1289 {
1290 write_lock_bh(&sk->sk_callback_lock); 1290 write_lock_bh(&sk->sk_callback_lock);
1291 sk->sk_wq = parent->wq; 1291 sk->sk_wq = parent->wq;
1292 parent->sk = sk; 1292 parent->sk = sk;
1293 sk_set_socket(sk, parent); 1293 sk_set_socket(sk, parent);
1294 security_sock_graft(sk, parent); 1294 security_sock_graft(sk, parent);
1295 write_unlock_bh(&sk->sk_callback_lock); 1295 write_unlock_bh(&sk->sk_callback_lock);
1296 } 1296 }
1297 1297
1298 extern int sock_i_uid(struct sock *sk); 1298 extern int sock_i_uid(struct sock *sk);
1299 extern unsigned long sock_i_ino(struct sock *sk); 1299 extern unsigned long sock_i_ino(struct sock *sk);
1300 1300
1301 static inline struct dst_entry * 1301 static inline struct dst_entry *
1302 __sk_dst_get(struct sock *sk) 1302 __sk_dst_get(struct sock *sk)
1303 { 1303 {
1304 return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() || 1304 return rcu_dereference_check(sk->sk_dst_cache, sock_owned_by_user(sk) ||
1305 sock_owned_by_user(sk) ||
1306 lockdep_is_held(&sk->sk_lock.slock)); 1305 lockdep_is_held(&sk->sk_lock.slock));
1307 } 1306 }
1308 1307
1309 static inline struct dst_entry * 1308 static inline struct dst_entry *
1310 sk_dst_get(struct sock *sk) 1309 sk_dst_get(struct sock *sk)
1311 { 1310 {
1312 struct dst_entry *dst; 1311 struct dst_entry *dst;
1313 1312
1314 rcu_read_lock(); 1313 rcu_read_lock();
1315 dst = rcu_dereference(sk->sk_dst_cache); 1314 dst = rcu_dereference(sk->sk_dst_cache);
1316 if (dst) 1315 if (dst)
1317 dst_hold(dst); 1316 dst_hold(dst);
1318 rcu_read_unlock(); 1317 rcu_read_unlock();
1319 return dst; 1318 return dst;
1320 } 1319 }
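
The two route-cache accessors above differ only in the protection they expect, which is what their rcu_dereference_check()/rcu_dereference() conditions spell out: __sk_dst_get() is for callers that already own the socket (user lock or slock held) and takes no extra reference, while sk_dst_get() takes its own RCU read lock and a dst reference that the caller must drop. A hedged sketch of both call sites (the peek_route name is illustrative):

static void peek_route(struct sock *sk)
{
	struct dst_entry *dst;

	/* Caller owns the socket: no extra reference is taken. */
	lock_sock(sk);
	dst = __sk_dst_get(sk);
	/* ... use dst while the lock is held ... */
	release_sock(sk);

	/* Lockless caller: sk_dst_get() returns a referenced dst. */
	dst = sk_dst_get(sk);
	if (dst) {
		/* ... use dst ... */
		dst_release(dst);
	}
}
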
1321 1320
1322 extern void sk_reset_txq(struct sock *sk); 1321 extern void sk_reset_txq(struct sock *sk);
1323 1322
1324 static inline void dst_negative_advice(struct sock *sk) 1323 static inline void dst_negative_advice(struct sock *sk)
1325 { 1324 {
1326 struct dst_entry *ndst, *dst = __sk_dst_get(sk); 1325 struct dst_entry *ndst, *dst = __sk_dst_get(sk);
1327 1326
1328 if (dst && dst->ops->negative_advice) { 1327 if (dst && dst->ops->negative_advice) {
1329 ndst = dst->ops->negative_advice(dst); 1328 ndst = dst->ops->negative_advice(dst);
1330 1329
1331 if (ndst != dst) { 1330 if (ndst != dst) {
1332 rcu_assign_pointer(sk->sk_dst_cache, ndst); 1331 rcu_assign_pointer(sk->sk_dst_cache, ndst);
1333 sk_reset_txq(sk); 1332 sk_reset_txq(sk);
1334 } 1333 }
1335 } 1334 }
1336 } 1335 }
1337 1336
1338 static inline void 1337 static inline void
1339 __sk_dst_set(struct sock *sk, struct dst_entry *dst) 1338 __sk_dst_set(struct sock *sk, struct dst_entry *dst)
1340 { 1339 {
1341 struct dst_entry *old_dst; 1340 struct dst_entry *old_dst;
1342 1341
1343 sk_tx_queue_clear(sk); 1342 sk_tx_queue_clear(sk);
1344 /* 1343 /*
1345 * This can be called while sk is owned by the caller only, 1344 * This can be called while sk is owned by the caller only,
1346 * with no state that can be checked in a rcu_dereference_check() cond 1345 * with no state that can be checked in a rcu_dereference_check() cond
1347 */ 1346 */
1348 old_dst = rcu_dereference_raw(sk->sk_dst_cache); 1347 old_dst = rcu_dereference_raw(sk->sk_dst_cache);
1349 rcu_assign_pointer(sk->sk_dst_cache, dst); 1348 rcu_assign_pointer(sk->sk_dst_cache, dst);
1350 dst_release(old_dst); 1349 dst_release(old_dst);
1351 } 1350 }
1352 1351
1353 static inline void 1352 static inline void
1354 sk_dst_set(struct sock *sk, struct dst_entry *dst) 1353 sk_dst_set(struct sock *sk, struct dst_entry *dst)
1355 { 1354 {
1356 spin_lock(&sk->sk_dst_lock); 1355 spin_lock(&sk->sk_dst_lock);
1357 __sk_dst_set(sk, dst); 1356 __sk_dst_set(sk, dst);
1358 spin_unlock(&sk->sk_dst_lock); 1357 spin_unlock(&sk->sk_dst_lock);
1359 } 1358 }
1360 1359
1361 static inline void 1360 static inline void
1362 __sk_dst_reset(struct sock *sk) 1361 __sk_dst_reset(struct sock *sk)
1363 { 1362 {
1364 __sk_dst_set(sk, NULL); 1363 __sk_dst_set(sk, NULL);
1365 } 1364 }
1366 1365
1367 static inline void 1366 static inline void
1368 sk_dst_reset(struct sock *sk) 1367 sk_dst_reset(struct sock *sk)
1369 { 1368 {
1370 spin_lock(&sk->sk_dst_lock); 1369 spin_lock(&sk->sk_dst_lock);
1371 __sk_dst_reset(sk); 1370 __sk_dst_reset(sk);
1372 spin_unlock(&sk->sk_dst_lock); 1371 spin_unlock(&sk->sk_dst_lock);
1373 } 1372 }
1374 1373
1375 extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); 1374 extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
1376 1375
1377 extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); 1376 extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
1378 1377
1379 static inline int sk_can_gso(const struct sock *sk) 1378 static inline int sk_can_gso(const struct sock *sk)
1380 { 1379 {
1381 return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); 1380 return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
1382 } 1381 }
1383 1382
1384 extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); 1383 extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
1385 1384
1386 static inline void sk_nocaps_add(struct sock *sk, int flags) 1385 static inline void sk_nocaps_add(struct sock *sk, int flags)
1387 { 1386 {
1388 sk->sk_route_nocaps |= flags; 1387 sk->sk_route_nocaps |= flags;
1389 sk->sk_route_caps &= ~flags; 1388 sk->sk_route_caps &= ~flags;
1390 } 1389 }
1391 1390
1392 static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, 1391 static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
1393 char __user *from, char *to, 1392 char __user *from, char *to,
1394 int copy, int offset) 1393 int copy, int offset)
1395 { 1394 {
1396 if (skb->ip_summed == CHECKSUM_NONE) { 1395 if (skb->ip_summed == CHECKSUM_NONE) {
1397 int err = 0; 1396 int err = 0;
1398 __wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err); 1397 __wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err);
1399 if (err) 1398 if (err)
1400 return err; 1399 return err;
1401 skb->csum = csum_block_add(skb->csum, csum, offset); 1400 skb->csum = csum_block_add(skb->csum, csum, offset);
1402 } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { 1401 } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
1403 if (!access_ok(VERIFY_READ, from, copy) || 1402 if (!access_ok(VERIFY_READ, from, copy) ||
1404 __copy_from_user_nocache(to, from, copy)) 1403 __copy_from_user_nocache(to, from, copy))
1405 return -EFAULT; 1404 return -EFAULT;
1406 } else if (copy_from_user(to, from, copy)) 1405 } else if (copy_from_user(to, from, copy))
1407 return -EFAULT; 1406 return -EFAULT;
1408 1407
1409 return 0; 1408 return 0;
1410 } 1409 }
1411 1410
1412 static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, 1411 static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
1413 char __user *from, int copy) 1412 char __user *from, int copy)
1414 { 1413 {
1415 int err, offset = skb->len; 1414 int err, offset = skb->len;
1416 1415
1417 err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), 1416 err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy),
1418 copy, offset); 1417 copy, offset);
1419 if (err) 1418 if (err)
1420 __skb_trim(skb, offset); 1419 __skb_trim(skb, offset);
1421 1420
1422 return err; 1421 return err;
1423 } 1422 }
1424 1423
1425 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from, 1424 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from,
1426 struct sk_buff *skb, 1425 struct sk_buff *skb,
1427 struct page *page, 1426 struct page *page,
1428 int off, int copy) 1427 int off, int copy)
1429 { 1428 {
1430 int err; 1429 int err;
1431 1430
1432 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, 1431 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
1433 copy, skb->len); 1432 copy, skb->len);
1434 if (err) 1433 if (err)
1435 return err; 1434 return err;
1436 1435
1437 skb->len += copy; 1436 skb->len += copy;
1438 skb->data_len += copy; 1437 skb->data_len += copy;
1439 skb->truesize += copy; 1438 skb->truesize += copy;
1440 sk->sk_wmem_queued += copy; 1439 sk->sk_wmem_queued += copy;
1441 sk_mem_charge(sk, copy); 1440 sk_mem_charge(sk, copy);
1442 return 0; 1441 return 0;
1443 } 1442 }
1444 1443
1445 static inline int skb_copy_to_page(struct sock *sk, char __user *from, 1444 static inline int skb_copy_to_page(struct sock *sk, char __user *from,
1446 struct sk_buff *skb, struct page *page, 1445 struct sk_buff *skb, struct page *page,
1447 int off, int copy) 1446 int off, int copy)
1448 { 1447 {
1449 if (skb->ip_summed == CHECKSUM_NONE) { 1448 if (skb->ip_summed == CHECKSUM_NONE) {
1450 int err = 0; 1449 int err = 0;
1451 __wsum csum = csum_and_copy_from_user(from, 1450 __wsum csum = csum_and_copy_from_user(from,
1452 page_address(page) + off, 1451 page_address(page) + off,
1453 copy, 0, &err); 1452 copy, 0, &err);
1454 if (err) 1453 if (err)
1455 return err; 1454 return err;
1456 skb->csum = csum_block_add(skb->csum, csum, skb->len); 1455 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1457 } else if (copy_from_user(page_address(page) + off, from, copy)) 1456 } else if (copy_from_user(page_address(page) + off, from, copy))
1458 return -EFAULT; 1457 return -EFAULT;
1459 1458
1460 skb->len += copy; 1459 skb->len += copy;
1461 skb->data_len += copy; 1460 skb->data_len += copy;
1462 skb->truesize += copy; 1461 skb->truesize += copy;
1463 sk->sk_wmem_queued += copy; 1462 sk->sk_wmem_queued += copy;
1464 sk_mem_charge(sk, copy); 1463 sk_mem_charge(sk, copy);
1465 return 0; 1464 return 0;
1466 } 1465 }
1467 1466
1468 /** 1467 /**
1469 * sk_wmem_alloc_get - returns write allocations 1468 * sk_wmem_alloc_get - returns write allocations
1470 * @sk: socket 1469 * @sk: socket
1471 * 1470 *
1472 * Returns sk_wmem_alloc minus initial offset of one 1471 * Returns sk_wmem_alloc minus initial offset of one
1473 */ 1472 */
1474 static inline int sk_wmem_alloc_get(const struct sock *sk) 1473 static inline int sk_wmem_alloc_get(const struct sock *sk)
1475 { 1474 {
1476 return atomic_read(&sk->sk_wmem_alloc) - 1; 1475 return atomic_read(&sk->sk_wmem_alloc) - 1;
1477 } 1476 }
1478 1477
1479 /** 1478 /**
1480 * sk_rmem_alloc_get - returns read allocations 1479 * sk_rmem_alloc_get - returns read allocations
1481 * @sk: socket 1480 * @sk: socket
1482 * 1481 *
1483 * Returns sk_rmem_alloc 1482 * Returns sk_rmem_alloc
1484 */ 1483 */
1485 static inline int sk_rmem_alloc_get(const struct sock *sk) 1484 static inline int sk_rmem_alloc_get(const struct sock *sk)
1486 { 1485 {
1487 return atomic_read(&sk->sk_rmem_alloc); 1486 return atomic_read(&sk->sk_rmem_alloc);
1488 } 1487 }
1489 1488
1490 /** 1489 /**
1491 * sk_has_allocations - check if allocations are outstanding 1490 * sk_has_allocations - check if allocations are outstanding
1492 * @sk: socket 1491 * @sk: socket
1493 * 1492 *
1494 * Returns true if socket has write or read allocations 1493 * Returns true if socket has write or read allocations
1495 */ 1494 */
1496 static inline int sk_has_allocations(const struct sock *sk) 1495 static inline int sk_has_allocations(const struct sock *sk)
1497 { 1496 {
1498 return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); 1497 return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
1499 } 1498 }
1500 1499
1501 /** 1500 /**
1502 * wq_has_sleeper - check if there are any waiting processes 1501 * wq_has_sleeper - check if there are any waiting processes
1503 * @wq: struct socket_wq 1502 * @wq: struct socket_wq
1504 * 1503 *
1505 * Returns true if socket_wq has waiting processes 1504 * Returns true if socket_wq has waiting processes
1506 * 1505 *
1507 * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory 1506 * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
1508 * barrier call. They were added due to the race found within the tcp code. 1507 * barrier call. They were added due to the race found within the tcp code.
1509 * 1508 *
1510 * Consider following tcp code paths: 1509 * Consider following tcp code paths:
1511 * 1510 *
1512 * CPU1 CPU2 1511 * CPU1 CPU2
1513 * 1512 *
1514 * sys_select receive packet 1513 * sys_select receive packet
1515 * ... ... 1514 * ... ...
1516 * __add_wait_queue update tp->rcv_nxt 1515 * __add_wait_queue update tp->rcv_nxt
1517 * ... ... 1516 * ... ...
1518 * tp->rcv_nxt check sock_def_readable 1517 * tp->rcv_nxt check sock_def_readable
1519 * ... { 1518 * ... {
1520 * schedule rcu_read_lock(); 1519 * schedule rcu_read_lock();
1521 * wq = rcu_dereference(sk->sk_wq); 1520 * wq = rcu_dereference(sk->sk_wq);
1522 * if (wq && waitqueue_active(&wq->wait)) 1521 * if (wq && waitqueue_active(&wq->wait))
1523 * wake_up_interruptible(&wq->wait) 1522 * wake_up_interruptible(&wq->wait)
1524 * ... 1523 * ...
1525 * } 1524 * }
1526 * 1525 *
1527 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay 1526 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
1528 * in its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1          1527 * in its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1
1529 * could then end up calling schedule and sleep forever if there are no more          1528 * could then end up calling schedule and sleep forever if there are no more
1530 * data on the socket. 1529 * data on the socket.
1531 * 1530 *
1532 */ 1531 */
1533 static inline bool wq_has_sleeper(struct socket_wq *wq) 1532 static inline bool wq_has_sleeper(struct socket_wq *wq)
1534 { 1533 {
1535 1534
1536 /* 1535 /*
1537 * We need to be sure we are in sync with the 1536 * We need to be sure we are in sync with the
1538 * add_wait_queue modifications to the wait queue. 1537 * add_wait_queue modifications to the wait queue.
1539 * 1538 *
1540 * This memory barrier is paired in the sock_poll_wait. 1539 * This memory barrier is paired in the sock_poll_wait.
1541 */ 1540 */
1542 smp_mb(); 1541 smp_mb();
1543 return wq && waitqueue_active(&wq->wait); 1542 return wq && waitqueue_active(&wq->wait);
1544 } 1543 }
1545 1544
1546 /** 1545 /**
1547 * sock_poll_wait - place memory barrier behind the poll_wait call. 1546 * sock_poll_wait - place memory barrier behind the poll_wait call.
1548 * @filp: file 1547 * @filp: file
1549 * @wait_address: socket wait queue 1548 * @wait_address: socket wait queue
1550 * @p: poll_table 1549 * @p: poll_table
1551 * 1550 *
1552 * See the comments in the wq_has_sleeper function. 1551 * See the comments in the wq_has_sleeper function.
1553 */ 1552 */
1554 static inline void sock_poll_wait(struct file *filp, 1553 static inline void sock_poll_wait(struct file *filp,
1555 wait_queue_head_t *wait_address, poll_table *p) 1554 wait_queue_head_t *wait_address, poll_table *p)
1556 { 1555 {
1557 if (p && wait_address) { 1556 if (p && wait_address) {
1558 poll_wait(filp, wait_address, p); 1557 poll_wait(filp, wait_address, p);
1559 /* 1558 /*
1560 * We need to be sure we are in sync with the 1559 * We need to be sure we are in sync with the
1561 * socket flags modification. 1560 * socket flags modification.
1562 * 1561 *
1563 * This memory barrier is paired in the wq_has_sleeper. 1562 * This memory barrier is paired in the wq_has_sleeper.
1564 */ 1563 */
1565 smp_mb(); 1564 smp_mb();
1566 } 1565 }
1567 } 1566 }
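
In practice the two halves of the barrier pairing described above look roughly like this: the poll side registers on the wait queue through sock_poll_wait() and only then tests socket state, while the wakeup side (a sk_data_ready-style callback) checks for sleepers with wq_has_sleeper() before waking them. Illustrative sketch only; the function names and the POLLIN mask are examples, not definitions from this header:

/* Poll side (a protocol's ->poll() implementation). */
static unsigned int example_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask = 0;

	sock_poll_wait(file, sk_sleep(sk), wait);	/* barrier after poll_wait */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;
	return mask;
}

/* Wakeup side, called when new data has been queued. */
static void example_data_ready(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))				/* barrier before the test */
		wake_up_interruptible(&wq->wait);
	rcu_read_unlock();
}
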
1568 1567
1569 /* 1568 /*
1570 * Queue a received datagram if it will fit. Stream and sequenced 1569 * Queue a received datagram if it will fit. Stream and sequenced
1571 * protocols can't normally use this as they need to fit buffers in 1570 * protocols can't normally use this as they need to fit buffers in
1572 * and play with them. 1571 * and play with them.
1573 * 1572 *
1574 * Inlined as it's very short and called for pretty much every 1573 * Inlined as it's very short and called for pretty much every
1575 * packet ever received. 1574 * packet ever received.
1576 */ 1575 */
1577 1576
1578 static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1577 static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1579 { 1578 {
1580 skb_orphan(skb); 1579 skb_orphan(skb);
1581 skb->sk = sk; 1580 skb->sk = sk;
1582 skb->destructor = sock_wfree; 1581 skb->destructor = sock_wfree;
1583 /* 1582 /*
1584 * We used to take a refcount on sk, but the following operation          1583 * We used to take a refcount on sk, but the following operation
1585 * is enough to guarantee sk_free() won't free this sock until          1584 * is enough to guarantee sk_free() won't free this sock until
1586 * all in-flight packets are completed 1585 * all in-flight packets are completed
1587 */ 1586 */
1588 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 1587 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1589 } 1588 }
1590 1589
1591 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) 1590 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
1592 { 1591 {
1593 skb_orphan(skb); 1592 skb_orphan(skb);
1594 skb->sk = sk; 1593 skb->sk = sk;
1595 skb->destructor = sock_rfree; 1594 skb->destructor = sock_rfree;
1596 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 1595 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
1597 sk_mem_charge(sk, skb->truesize); 1596 sk_mem_charge(sk, skb->truesize);
1598 } 1597 }
1599 1598
1600 extern void sk_reset_timer(struct sock *sk, struct timer_list* timer, 1599 extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1601 unsigned long expires); 1600 unsigned long expires);
1602 1601
1603 extern void sk_stop_timer(struct sock *sk, struct timer_list* timer); 1602 extern void sk_stop_timer(struct sock *sk, struct timer_list* timer);
1604 1603
1605 extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 1604 extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
1606 1605
1607 extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); 1606 extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
1608 1607
1609 /* 1608 /*
1610 * Recover an error report and clear atomically 1609 * Recover an error report and clear atomically
1611 */ 1610 */
1612 1611
1613 static inline int sock_error(struct sock *sk) 1612 static inline int sock_error(struct sock *sk)
1614 { 1613 {
1615 int err; 1614 int err;
1616 if (likely(!sk->sk_err)) 1615 if (likely(!sk->sk_err))
1617 return 0; 1616 return 0;
1618 err = xchg(&sk->sk_err, 0); 1617 err = xchg(&sk->sk_err, 0);
1619 return -err; 1618 return -err;
1620 } 1619 }
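
sock_error() is meant to be consumed exactly once per reported error, so receive and send paths typically drain it before sleeping, along the lines of:

	int err = sock_error(sk);	/* 0, or a negative errno; clears sk_err */
	if (err)
		return err;
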
1621 1620
1622 static inline unsigned long sock_wspace(struct sock *sk) 1621 static inline unsigned long sock_wspace(struct sock *sk)
1623 { 1622 {
1624 int amt = 0; 1623 int amt = 0;
1625 1624
1626 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 1625 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
1627 amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); 1626 amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
1628 if (amt < 0) 1627 if (amt < 0)
1629 amt = 0; 1628 amt = 0;
1630 } 1629 }
1631 return amt; 1630 return amt;
1632 } 1631 }
1633 1632
1634 static inline void sk_wake_async(struct sock *sk, int how, int band) 1633 static inline void sk_wake_async(struct sock *sk, int how, int band)
1635 { 1634 {
1636 if (sock_flag(sk, SOCK_FASYNC)) 1635 if (sock_flag(sk, SOCK_FASYNC))
1637 sock_wake_async(sk->sk_socket, how, band); 1636 sock_wake_async(sk->sk_socket, how, band);
1638 } 1637 }
1639 1638
1640 #define SOCK_MIN_SNDBUF 2048 1639 #define SOCK_MIN_SNDBUF 2048
1641 /* 1640 /*
1642 * Since sk_rmem_alloc sums skb->truesize, even a small frame might need 1641 * Since sk_rmem_alloc sums skb->truesize, even a small frame might need
1643 * sizeof(sk_buff) + MTU + padding, unless the net driver performs copybreak          1642 * sizeof(sk_buff) + MTU + padding, unless the net driver performs copybreak
1644 */ 1643 */
1645 #define SOCK_MIN_RCVBUF (2048 + sizeof(struct sk_buff)) 1644 #define SOCK_MIN_RCVBUF (2048 + sizeof(struct sk_buff))
1646 1645
1647 static inline void sk_stream_moderate_sndbuf(struct sock *sk) 1646 static inline void sk_stream_moderate_sndbuf(struct sock *sk)
1648 { 1647 {
1649 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { 1648 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
1650 sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); 1649 sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
1651 sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF); 1650 sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF);
1652 } 1651 }
1653 } 1652 }
1654 1653
1655 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); 1654 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp);
1656 1655
1657 static inline struct page *sk_stream_alloc_page(struct sock *sk) 1656 static inline struct page *sk_stream_alloc_page(struct sock *sk)
1658 { 1657 {
1659 struct page *page = NULL; 1658 struct page *page = NULL;
1660 1659
1661 page = alloc_pages(sk->sk_allocation, 0); 1660 page = alloc_pages(sk->sk_allocation, 0);
1662 if (!page) { 1661 if (!page) {
1663 sk->sk_prot->enter_memory_pressure(sk); 1662 sk->sk_prot->enter_memory_pressure(sk);
1664 sk_stream_moderate_sndbuf(sk); 1663 sk_stream_moderate_sndbuf(sk);
1665 } 1664 }
1666 return page; 1665 return page;
1667 } 1666 }
1668 1667
1669 /* 1668 /*
1670 * Default write policy as shown to user space via poll/select/SIGIO 1669 * Default write policy as shown to user space via poll/select/SIGIO
1671 */ 1670 */
1672 static inline int sock_writeable(const struct sock *sk) 1671 static inline int sock_writeable(const struct sock *sk)
1673 { 1672 {
1674 return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); 1673 return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
1675 } 1674 }
1676 1675
1677 static inline gfp_t gfp_any(void) 1676 static inline gfp_t gfp_any(void)
1678 { 1677 {
1679 return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; 1678 return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
1680 } 1679 }
1681 1680
1682 static inline long sock_rcvtimeo(const struct sock *sk, int noblock) 1681 static inline long sock_rcvtimeo(const struct sock *sk, int noblock)
1683 { 1682 {
1684 return noblock ? 0 : sk->sk_rcvtimeo; 1683 return noblock ? 0 : sk->sk_rcvtimeo;
1685 } 1684 }
1686 1685
1687 static inline long sock_sndtimeo(const struct sock *sk, int noblock) 1686 static inline long sock_sndtimeo(const struct sock *sk, int noblock)
1688 { 1687 {
1689 return noblock ? 0 : sk->sk_sndtimeo; 1688 return noblock ? 0 : sk->sk_sndtimeo;
1690 } 1689 }
1691 1690
1692 static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) 1691 static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
1693 { 1692 {
1694 return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; 1693 return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1;
1695 } 1694 }
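
sock_rcvlowat() uses the GNU "x ?: y" shorthand, which evaluates to x when x is non-zero and to y otherwise, so the returned receive watermark is never zero. A small sketch with the same logic written out explicitly; rcvlowat_for() is a stand-in name, not the kernel helper.

#include <stdio.h>

#define min_int(a, b) ((a) < (b) ? (a) : (b))

static int rcvlowat_for(int waitall, int rcvlowat, int len)
{
        int v = waitall ? len : min_int(rcvlowat, len);

        return v ? v : 1;   /* same meaning as the GNU "v ?: 1" extension */
}

int main(void)
{
        printf("%d\n", rcvlowat_for(0, 0, 100));   /* 1: never ask for zero bytes */
        printf("%d\n", rcvlowat_for(0, 16, 100));  /* 16 */
        printf("%d\n", rcvlowat_for(1, 16, 100));  /* 100: wait for the full request */
        return 0;
}
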
1696 1695
1697 /* Alas, with timeout socket operations are not restartable. 1696 /* Alas, with timeout socket operations are not restartable.
1698 * Compare this to poll(). 1697 * Compare this to poll().
1699 */ 1698 */
1700 static inline int sock_intr_errno(long timeo) 1699 static inline int sock_intr_errno(long timeo)
1701 { 1700 {
1702 return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; 1701 return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
1703 } 1702 }
1704 1703
1705 extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, 1704 extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
1706 struct sk_buff *skb); 1705 struct sk_buff *skb);
1707 1706
1708 static __inline__ void 1707 static __inline__ void
1709 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) 1708 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
1710 { 1709 {
1711 ktime_t kt = skb->tstamp; 1710 ktime_t kt = skb->tstamp;
1712 struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); 1711 struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
1713 1712
1714 /* 1713 /*
1715 * generate control messages if 1714 * generate control messages if
1716 * - receive time stamping in software requested (SOCK_RCVTSTAMP 1715 * - receive time stamping in software requested (SOCK_RCVTSTAMP
1717 * or SOCK_TIMESTAMPING_RX_SOFTWARE) 1716 * or SOCK_TIMESTAMPING_RX_SOFTWARE)
1718 * - software time stamp available and wanted 1717 * - software time stamp available and wanted
1719 * (SOCK_TIMESTAMPING_SOFTWARE) 1718 * (SOCK_TIMESTAMPING_SOFTWARE)
1720 * - hardware time stamps available and wanted 1719 * - hardware time stamps available and wanted
1721 * (SOCK_TIMESTAMPING_SYS_HARDWARE or 1720 * (SOCK_TIMESTAMPING_SYS_HARDWARE or
1722 * SOCK_TIMESTAMPING_RAW_HARDWARE) 1721 * SOCK_TIMESTAMPING_RAW_HARDWARE)
1723 */ 1722 */
1724 if (sock_flag(sk, SOCK_RCVTSTAMP) || 1723 if (sock_flag(sk, SOCK_RCVTSTAMP) ||
1725 sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) || 1724 sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) ||
1726 (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) || 1725 (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) ||
1727 (hwtstamps->hwtstamp.tv64 && 1726 (hwtstamps->hwtstamp.tv64 &&
1728 sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) || 1727 sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) ||
1729 (hwtstamps->syststamp.tv64 && 1728 (hwtstamps->syststamp.tv64 &&
1730 sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))) 1729 sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)))
1731 __sock_recv_timestamp(msg, sk, skb); 1730 __sock_recv_timestamp(msg, sk, skb);
1732 else 1731 else
1733 sk->sk_stamp = kt; 1732 sk->sk_stamp = kt;
1734 } 1733 }
1735 1734
1736 extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, 1735 extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
1737 struct sk_buff *skb); 1736 struct sk_buff *skb);
1738 1737
1739 static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, 1738 static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
1740 struct sk_buff *skb) 1739 struct sk_buff *skb)
1741 { 1740 {
1742 #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ 1741 #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
1743 (1UL << SOCK_RCVTSTAMP) | \ 1742 (1UL << SOCK_RCVTSTAMP) | \
1744 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ 1743 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
1745 (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \ 1744 (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \
1746 (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \ 1745 (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \
1747 (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE)) 1746 (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE))
1748 1747
1749 if (sk->sk_flags & FLAGS_TS_OR_DROPS) 1748 if (sk->sk_flags & FLAGS_TS_OR_DROPS)
1750 __sock_recv_ts_and_drops(msg, sk, skb); 1749 __sock_recv_ts_and_drops(msg, sk, skb);
1751 else 1750 else
1752 sk->sk_stamp = skb->tstamp; 1751 sk->sk_stamp = skb->tstamp;
1753 } 1752 }
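
sock_recv_ts_and_drops() folds every flag that forces the slow path into the single FLAGS_TS_OR_DROPS mask, so the common case costs one AND instead of several separate flag tests. A hedged userspace sketch of that one-mask dispatch; the flag names below are local to the example.

#include <stdio.h>

enum { FLAG_RXQ_OVFL, FLAG_RCVTSTAMP, FLAG_RX_SOFTWARE, FLAG_SOFTWARE };

#define SLOW_PATH_MASK  ((1UL << FLAG_RXQ_OVFL)    | \
                         (1UL << FLAG_RCVTSTAMP)   | \
                         (1UL << FLAG_RX_SOFTWARE) | \
                         (1UL << FLAG_SOFTWARE))

static void receive(unsigned long sk_flags)
{
        if (sk_flags & SLOW_PATH_MASK)
                printf("slow path: build timestamp/drop control messages\n");
        else
                printf("fast path: just record the packet timestamp\n");
}

int main(void)
{
        receive(0);                        /* fast path */
        receive(1UL << FLAG_RCVTSTAMP);    /* slow path */
        return 0;
}
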
1754 1753
1755 /** 1754 /**
1756 * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped 1755 * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
1757 * @sk: socket sending this packet 1756 * @sk: socket sending this packet
1758 * @tx_flags: filled with instructions for time stamping 1757 * @tx_flags: filled with instructions for time stamping
1759 * 1758 *
1760 * Currently only depends on SOCK_TIMESTAMPING* flags. Returns error code if 1759 * Currently only depends on SOCK_TIMESTAMPING* flags. Returns error code if
1761 * parameters are invalid. 1760 * parameters are invalid.
1762 */ 1761 */
1763 extern int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); 1762 extern int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags);
1764 1763
1765 /** 1764 /**
1766 * sk_eat_skb - Release a skb if it is no longer needed 1765 * sk_eat_skb - Release a skb if it is no longer needed
1767 * @sk: socket to eat this skb from 1766 * @sk: socket to eat this skb from
1768 * @skb: socket buffer to eat 1767 * @skb: socket buffer to eat
1769 * @copied_early: flag indicating whether DMA operations copied this data early 1768 * @copied_early: flag indicating whether DMA operations copied this data early
1770 * 1769 *
1771 * This routine must be called with interrupts disabled or with the socket 1770 * This routine must be called with interrupts disabled or with the socket
1772 * locked so that the sk_buff queue operation is ok. 1771 * locked so that the sk_buff queue operation is ok.
1773 */ 1772 */
1774 #ifdef CONFIG_NET_DMA 1773 #ifdef CONFIG_NET_DMA
1775 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) 1774 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
1776 { 1775 {
1777 __skb_unlink(skb, &sk->sk_receive_queue); 1776 __skb_unlink(skb, &sk->sk_receive_queue);
1778 if (!copied_early) 1777 if (!copied_early)
1779 __kfree_skb(skb); 1778 __kfree_skb(skb);
1780 else 1779 else
1781 __skb_queue_tail(&sk->sk_async_wait_queue, skb); 1780 __skb_queue_tail(&sk->sk_async_wait_queue, skb);
1782 } 1781 }
1783 #else 1782 #else
1784 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) 1783 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
1785 { 1784 {
1786 __skb_unlink(skb, &sk->sk_receive_queue); 1785 __skb_unlink(skb, &sk->sk_receive_queue);
1787 __kfree_skb(skb); 1786 __kfree_skb(skb);
1788 } 1787 }
1789 #endif 1788 #endif
1790 1789
1791 static inline 1790 static inline
1792 struct net *sock_net(const struct sock *sk) 1791 struct net *sock_net(const struct sock *sk)
1793 { 1792 {
1794 return read_pnet(&sk->sk_net); 1793 return read_pnet(&sk->sk_net);
1795 } 1794 }
1796 1795
1797 static inline 1796 static inline
1798 void sock_net_set(struct sock *sk, struct net *net) 1797 void sock_net_set(struct sock *sk, struct net *net)
1799 { 1798 {
1800 write_pnet(&sk->sk_net, net); 1799 write_pnet(&sk->sk_net, net);
1801 } 1800 }
1802 1801
1803 /* 1802 /*
1804 * Kernel sockets, e.g. rtnl or icmp_socket, are a part of a namespace. 1803 * Kernel sockets, e.g. rtnl or icmp_socket, are a part of a namespace.
1805 * They should not hold a reference to a namespace in order to allow 1804 * They should not hold a reference to a namespace in order to allow
1806 * that namespace to be stopped. 1805 * that namespace to be stopped.
1807 * Sockets whose namespace was changed via sk_change_net() should be released using sk_release_kernel(). 1806 * Sockets whose namespace was changed via sk_change_net() should be released using sk_release_kernel().
1808 */ 1807 */
1809 static inline void sk_change_net(struct sock *sk, struct net *net) 1808 static inline void sk_change_net(struct sock *sk, struct net *net)
1810 { 1809 {
1811 put_net(sock_net(sk)); 1810 put_net(sock_net(sk));
1812 sock_net_set(sk, hold_net(net)); 1811 sock_net_set(sk, hold_net(net));
1813 } 1812 }
1814 1813
1815 static inline struct sock *skb_steal_sock(struct sk_buff *skb) 1814 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
1816 { 1815 {
1817 if (unlikely(skb->sk)) { 1816 if (unlikely(skb->sk)) {
1818 struct sock *sk = skb->sk; 1817 struct sock *sk = skb->sk;
1819 1818
1820 skb->destructor = NULL; 1819 skb->destructor = NULL;
1821 skb->sk = NULL; 1820 skb->sk = NULL;
1822 return sk; 1821 return sk;
1823 } 1822 }
1824 return NULL; 1823 return NULL;
1825 } 1824 }
1826 1825
1827 extern void sock_enable_timestamp(struct sock *sk, int flag); 1826 extern void sock_enable_timestamp(struct sock *sk, int flag);
1828 extern int sock_get_timestamp(struct sock *, struct timeval __user *); 1827 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
1829 extern int sock_get_timestampns(struct sock *, struct timespec __user *); 1828 extern int sock_get_timestampns(struct sock *, struct timespec __user *);
1830 1829
1831 /* 1830 /*
1832 * Enable debug/info messages 1831 * Enable debug/info messages
1833 */ 1832 */
1834 extern int net_msg_warn; 1833 extern int net_msg_warn;
1835 #define NETDEBUG(fmt, args...) \ 1834 #define NETDEBUG(fmt, args...) \
1836 do { if (net_msg_warn) printk(fmt,##args); } while (0) 1835 do { if (net_msg_warn) printk(fmt,##args); } while (0)
1837 1836
1838 #define LIMIT_NETDEBUG(fmt, args...) \ 1837 #define LIMIT_NETDEBUG(fmt, args...) \
1839 do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0) 1838 do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0)
1840 1839
1841 extern __u32 sysctl_wmem_max; 1840 extern __u32 sysctl_wmem_max;
1842 extern __u32 sysctl_rmem_max; 1841 extern __u32 sysctl_rmem_max;
1843 1842
1844 extern void sk_init(void); 1843 extern void sk_init(void);
1845 1844
1846 extern int sysctl_optmem_max; 1845 extern int sysctl_optmem_max;
1847 1846
1848 extern __u32 sysctl_wmem_default; 1847 extern __u32 sysctl_wmem_default;
1849 extern __u32 sysctl_rmem_default; 1848 extern __u32 sysctl_rmem_default;
1850 1849
1851 #endif /* _SOCK_H */ 1850 #endif /* _SOCK_H */
1852 1851
kernel/cgroup.c
1 /* 1 /*
2 * Generic process-grouping system. 2 * Generic process-grouping system.
3 * 3 *
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support 7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation 8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov 9 * Author: Kirill A. Shutemov
10 * 10 *
11 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
12 * -------------------------------------------------- 12 * --------------------------------------------------
13 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
14 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 14 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 * 15 *
16 * Portions derived from Patrick Mochel's sysfs code. 16 * Portions derived from Patrick Mochel's sysfs code.
17 * sysfs is Copyright (c) 2001-3 Patrick Mochel 17 * sysfs is Copyright (c) 2001-3 Patrick Mochel
18 * 18 *
19 * 2003-10-10 Written by Simon Derr. 19 * 2003-10-10 Written by Simon Derr.
20 * 2003-10-22 Updates by Stephen Hemminger. 20 * 2003-10-22 Updates by Stephen Hemminger.
21 * 2004 May-July Rework by Paul Jackson. 21 * 2004 May-July Rework by Paul Jackson.
22 * --------------------------------------------------- 22 * ---------------------------------------------------
23 * 23 *
24 * This file is subject to the terms and conditions of the GNU General Public 24 * This file is subject to the terms and conditions of the GNU General Public
25 * License. See the file COPYING in the main directory of the Linux 25 * License. See the file COPYING in the main directory of the Linux
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29 #include <linux/cgroup.h> 29 #include <linux/cgroup.h>
30 #include <linux/ctype.h> 30 #include <linux/ctype.h>
31 #include <linux/errno.h> 31 #include <linux/errno.h>
32 #include <linux/fs.h> 32 #include <linux/fs.h>
33 #include <linux/kernel.h> 33 #include <linux/kernel.h>
34 #include <linux/list.h> 34 #include <linux/list.h>
35 #include <linux/mm.h> 35 #include <linux/mm.h>
36 #include <linux/mutex.h> 36 #include <linux/mutex.h>
37 #include <linux/mount.h> 37 #include <linux/mount.h>
38 #include <linux/pagemap.h> 38 #include <linux/pagemap.h>
39 #include <linux/proc_fs.h> 39 #include <linux/proc_fs.h>
40 #include <linux/rcupdate.h> 40 #include <linux/rcupdate.h>
41 #include <linux/sched.h> 41 #include <linux/sched.h>
42 #include <linux/backing-dev.h> 42 #include <linux/backing-dev.h>
43 #include <linux/seq_file.h> 43 #include <linux/seq_file.h>
44 #include <linux/slab.h> 44 #include <linux/slab.h>
45 #include <linux/magic.h> 45 #include <linux/magic.h>
46 #include <linux/spinlock.h> 46 #include <linux/spinlock.h>
47 #include <linux/string.h> 47 #include <linux/string.h>
48 #include <linux/sort.h> 48 #include <linux/sort.h>
49 #include <linux/kmod.h> 49 #include <linux/kmod.h>
50 #include <linux/module.h> 50 #include <linux/module.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/cgroupstats.h> 52 #include <linux/cgroupstats.h>
53 #include <linux/hash.h> 53 #include <linux/hash.h>
54 #include <linux/namei.h> 54 #include <linux/namei.h>
55 #include <linux/pid_namespace.h> 55 #include <linux/pid_namespace.h>
56 #include <linux/idr.h> 56 #include <linux/idr.h>
57 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58 #include <linux/eventfd.h> 58 #include <linux/eventfd.h>
59 #include <linux/poll.h> 59 #include <linux/poll.h>
60 #include <linux/flex_array.h> /* used in cgroup_attach_proc */ 60 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
61 61
62 #include <asm/atomic.h> 62 #include <asm/atomic.h>
63 63
64 static DEFINE_MUTEX(cgroup_mutex); 64 static DEFINE_MUTEX(cgroup_mutex);
65 65
66 /* 66 /*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is 67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are 68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by 69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex. 70 * cgroup_mutex.
71 */ 71 */
72 #define SUBSYS(_x) &_x ## _subsys, 72 #define SUBSYS(_x) &_x ## _subsys,
73 static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 73 static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
74 #include <linux/cgroup_subsys.h> 74 #include <linux/cgroup_subsys.h>
75 }; 75 };
76 76
77 #define MAX_CGROUP_ROOT_NAMELEN 64 77 #define MAX_CGROUP_ROOT_NAMELEN 64
78 78
79 /* 79 /*
80 * A cgroupfs_root represents the root of a cgroup hierarchy, 80 * A cgroupfs_root represents the root of a cgroup hierarchy,
81 * and may be associated with a superblock to form an active 81 * and may be associated with a superblock to form an active
82 * hierarchy 82 * hierarchy
83 */ 83 */
84 struct cgroupfs_root { 84 struct cgroupfs_root {
85 struct super_block *sb; 85 struct super_block *sb;
86 86
87 /* 87 /*
88 * The bitmask of subsystems intended to be attached to this 88 * The bitmask of subsystems intended to be attached to this
89 * hierarchy 89 * hierarchy
90 */ 90 */
91 unsigned long subsys_bits; 91 unsigned long subsys_bits;
92 92
93 /* Unique id for this hierarchy. */ 93 /* Unique id for this hierarchy. */
94 int hierarchy_id; 94 int hierarchy_id;
95 95
96 /* The bitmask of subsystems currently attached to this hierarchy */ 96 /* The bitmask of subsystems currently attached to this hierarchy */
97 unsigned long actual_subsys_bits; 97 unsigned long actual_subsys_bits;
98 98
99 /* A list running through the attached subsystems */ 99 /* A list running through the attached subsystems */
100 struct list_head subsys_list; 100 struct list_head subsys_list;
101 101
102 /* The root cgroup for this hierarchy */ 102 /* The root cgroup for this hierarchy */
103 struct cgroup top_cgroup; 103 struct cgroup top_cgroup;
104 104
105 /* Tracks how many cgroups are currently defined in this hierarchy. */ 105 /* Tracks how many cgroups are currently defined in this hierarchy. */
106 int number_of_cgroups; 106 int number_of_cgroups;
107 107
108 /* A list running through the active hierarchies */ 108 /* A list running through the active hierarchies */
109 struct list_head root_list; 109 struct list_head root_list;
110 110
111 /* Hierarchy-specific flags */ 111 /* Hierarchy-specific flags */
112 unsigned long flags; 112 unsigned long flags;
113 113
114 /* The path to use for release notifications. */ 114 /* The path to use for release notifications. */
115 char release_agent_path[PATH_MAX]; 115 char release_agent_path[PATH_MAX];
116 116
117 /* The name for this hierarchy - may be empty */ 117 /* The name for this hierarchy - may be empty */
118 char name[MAX_CGROUP_ROOT_NAMELEN]; 118 char name[MAX_CGROUP_ROOT_NAMELEN];
119 }; 119 };
120 120
121 /* 121 /*
122 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 122 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
123 * subsystems that are otherwise unattached - it never has more than a 123 * subsystems that are otherwise unattached - it never has more than a
124 * single cgroup, and all tasks are part of that cgroup. 124 * single cgroup, and all tasks are part of that cgroup.
125 */ 125 */
126 static struct cgroupfs_root rootnode; 126 static struct cgroupfs_root rootnode;
127 127
128 /* 128 /*
129 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 129 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
130 * cgroup_subsys->use_id != 0. 130 * cgroup_subsys->use_id != 0.
131 */ 131 */
132 #define CSS_ID_MAX (65535) 132 #define CSS_ID_MAX (65535)
133 struct css_id { 133 struct css_id {
134 /* 134 /*
135 * The css to which this ID points. This pointer is set to a valid value 135 * The css to which this ID points. This pointer is set to a valid value
136 * after the cgroup is populated. If the cgroup is removed, this will be NULL. 136 * after the cgroup is populated. If the cgroup is removed, this will be NULL.
137 * This pointer is expected to be RCU-safe because destroy() 137 * This pointer is expected to be RCU-safe because destroy()
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 138 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * or css_tryget() should be used to avoid races. 139 * or css_tryget() should be used to avoid races.
140 */ 140 */
141 struct cgroup_subsys_state __rcu *css; 141 struct cgroup_subsys_state __rcu *css;
142 /* 142 /*
143 * ID of this css. 143 * ID of this css.
144 */ 144 */
145 unsigned short id; 145 unsigned short id;
146 /* 146 /*
147 * Depth in the hierarchy to which this ID belongs. 147 * Depth in the hierarchy to which this ID belongs.
148 */ 148 */
149 unsigned short depth; 149 unsigned short depth;
150 /* 150 /*
151 * ID is freed by RCU. (and lookup routine is RCU safe.) 151 * ID is freed by RCU. (and lookup routine is RCU safe.)
152 */ 152 */
153 struct rcu_head rcu_head; 153 struct rcu_head rcu_head;
154 /* 154 /*
155 * Hierarchy this CSS ID belongs to. 155 * Hierarchy this CSS ID belongs to.
156 */ 156 */
157 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
158 }; 158 };
159 159
160 /* 160 /*
161 * cgroup_event represents events which userspace wants to receive. 161 * cgroup_event represents events which userspace wants to receive.
162 */ 162 */
163 struct cgroup_event { 163 struct cgroup_event {
164 /* 164 /*
165 * Cgroup which the event belongs to. 165 * Cgroup which the event belongs to.
166 */ 166 */
167 struct cgroup *cgrp; 167 struct cgroup *cgrp;
168 /* 168 /*
169 * Control file with which the event is associated. 169 * Control file with which the event is associated.
170 */ 170 */
171 struct cftype *cft; 171 struct cftype *cft;
172 /* 172 /*
173 * eventfd to signal userspace about the event. 173 * eventfd to signal userspace about the event.
174 */ 174 */
175 struct eventfd_ctx *eventfd; 175 struct eventfd_ctx *eventfd;
176 /* 176 /*
177 * Each of these is stored in a list by the cgroup. 177 * Each of these is stored in a list by the cgroup.
178 */ 178 */
179 struct list_head list; 179 struct list_head list;
180 /* 180 /*
181 * All fields below are needed to unregister the event when 181 * All fields below are needed to unregister the event when
182 * userspace closes the eventfd. 182 * userspace closes the eventfd.
183 */ 183 */
184 poll_table pt; 184 poll_table pt;
185 wait_queue_head_t *wqh; 185 wait_queue_head_t *wqh;
186 wait_queue_t wait; 186 wait_queue_t wait;
187 struct work_struct remove; 187 struct work_struct remove;
188 }; 188 };
189 189
190 /* The list of hierarchy roots */ 190 /* The list of hierarchy roots */
191 191
192 static LIST_HEAD(roots); 192 static LIST_HEAD(roots);
193 static int root_count; 193 static int root_count;
194 194
195 static DEFINE_IDA(hierarchy_ida); 195 static DEFINE_IDA(hierarchy_ida);
196 static int next_hierarchy_id; 196 static int next_hierarchy_id;
197 static DEFINE_SPINLOCK(hierarchy_id_lock); 197 static DEFINE_SPINLOCK(hierarchy_id_lock);
198 198
199 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 199 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
200 #define dummytop (&rootnode.top_cgroup) 200 #define dummytop (&rootnode.top_cgroup)
201 201
202 /* This flag indicates whether tasks in the fork and exit paths should 202 /* This flag indicates whether tasks in the fork and exit paths should
203 * check for fork/exit handlers to call. This avoids us having to do 203 * check for fork/exit handlers to call. This avoids us having to do
204 * extra work in the fork/exit path if none of the subsystems need to 204 * extra work in the fork/exit path if none of the subsystems need to
205 * be called. 205 * be called.
206 */ 206 */
207 static int need_forkexit_callback __read_mostly; 207 static int need_forkexit_callback __read_mostly;
208 208
209 #ifdef CONFIG_PROVE_LOCKING 209 #ifdef CONFIG_PROVE_LOCKING
210 int cgroup_lock_is_held(void) 210 int cgroup_lock_is_held(void)
211 { 211 {
212 return lockdep_is_held(&cgroup_mutex); 212 return lockdep_is_held(&cgroup_mutex);
213 } 213 }
214 #else /* #ifdef CONFIG_PROVE_LOCKING */ 214 #else /* #ifdef CONFIG_PROVE_LOCKING */
215 int cgroup_lock_is_held(void) 215 int cgroup_lock_is_held(void)
216 { 216 {
217 return mutex_is_locked(&cgroup_mutex); 217 return mutex_is_locked(&cgroup_mutex);
218 } 218 }
219 #endif /* #else #ifdef CONFIG_PROVE_LOCKING */ 219 #endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220 220
221 EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 221 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222 222
223 /* convenient tests for these bits */ 223 /* convenient tests for these bits */
224 inline int cgroup_is_removed(const struct cgroup *cgrp) 224 inline int cgroup_is_removed(const struct cgroup *cgrp)
225 { 225 {
226 return test_bit(CGRP_REMOVED, &cgrp->flags); 226 return test_bit(CGRP_REMOVED, &cgrp->flags);
227 } 227 }
228 228
229 /* bits in struct cgroupfs_root flags field */ 229 /* bits in struct cgroupfs_root flags field */
230 enum { 230 enum {
231 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 231 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
232 }; 232 };
233 233
234 static int cgroup_is_releasable(const struct cgroup *cgrp) 234 static int cgroup_is_releasable(const struct cgroup *cgrp)
235 { 235 {
236 const int bits = 236 const int bits =
237 (1 << CGRP_RELEASABLE) | 237 (1 << CGRP_RELEASABLE) |
238 (1 << CGRP_NOTIFY_ON_RELEASE); 238 (1 << CGRP_NOTIFY_ON_RELEASE);
239 return (cgrp->flags & bits) == bits; 239 return (cgrp->flags & bits) == bits;
240 } 240 }
241 241
242 static int notify_on_release(const struct cgroup *cgrp) 242 static int notify_on_release(const struct cgroup *cgrp)
243 { 243 {
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245 } 245 }
246 246
247 static int clone_children(const struct cgroup *cgrp) 247 static int clone_children(const struct cgroup *cgrp)
248 { 248 {
249 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 249 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
250 } 250 }
251 251
252 /* 252 /*
253 * for_each_subsys() allows you to iterate on each subsystem attached to 253 * for_each_subsys() allows you to iterate on each subsystem attached to
254 * an active hierarchy 254 * an active hierarchy
255 */ 255 */
256 #define for_each_subsys(_root, _ss) \ 256 #define for_each_subsys(_root, _ss) \
257 list_for_each_entry(_ss, &_root->subsys_list, sibling) 257 list_for_each_entry(_ss, &_root->subsys_list, sibling)
258 258
259 /* for_each_active_root() allows you to iterate across the active hierarchies */ 259 /* for_each_active_root() allows you to iterate across the active hierarchies */
260 #define for_each_active_root(_root) \ 260 #define for_each_active_root(_root) \
261 list_for_each_entry(_root, &roots, root_list) 261 list_for_each_entry(_root, &roots, root_list)
262 262
263 /* the list of cgroups eligible for automatic release. Protected by 263 /* the list of cgroups eligible for automatic release. Protected by
264 * release_list_lock */ 264 * release_list_lock */
265 static LIST_HEAD(release_list); 265 static LIST_HEAD(release_list);
266 static DEFINE_SPINLOCK(release_list_lock); 266 static DEFINE_SPINLOCK(release_list_lock);
267 static void cgroup_release_agent(struct work_struct *work); 267 static void cgroup_release_agent(struct work_struct *work);
268 static DECLARE_WORK(release_agent_work, cgroup_release_agent); 268 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
269 static void check_for_release(struct cgroup *cgrp); 269 static void check_for_release(struct cgroup *cgrp);
270 270
271 /* Link structure for associating css_set objects with cgroups */ 271 /* Link structure for associating css_set objects with cgroups */
272 struct cg_cgroup_link { 272 struct cg_cgroup_link {
273 /* 273 /*
274 * List running through cg_cgroup_links associated with a 274 * List running through cg_cgroup_links associated with a
275 * cgroup, anchored on cgroup->css_sets 275 * cgroup, anchored on cgroup->css_sets
276 */ 276 */
277 struct list_head cgrp_link_list; 277 struct list_head cgrp_link_list;
278 struct cgroup *cgrp; 278 struct cgroup *cgrp;
279 /* 279 /*
280 * List running through cg_cgroup_links pointing at a 280 * List running through cg_cgroup_links pointing at a
281 * single css_set object, anchored on css_set->cg_links 281 * single css_set object, anchored on css_set->cg_links
282 */ 282 */
283 struct list_head cg_link_list; 283 struct list_head cg_link_list;
284 struct css_set *cg; 284 struct css_set *cg;
285 }; 285 };
286 286
287 /* The default css_set - used by init and its children prior to any 287 /* The default css_set - used by init and its children prior to any
288 * hierarchies being mounted. It contains a pointer to the root state 288 * hierarchies being mounted. It contains a pointer to the root state
289 * for each subsystem. Also used to anchor the list of css_sets. Not 289 * for each subsystem. Also used to anchor the list of css_sets. Not
290 * reference-counted, to improve performance when child cgroups 290 * reference-counted, to improve performance when child cgroups
291 * haven't been created. 291 * haven't been created.
292 */ 292 */
293 293
294 static struct css_set init_css_set; 294 static struct css_set init_css_set;
295 static struct cg_cgroup_link init_css_set_link; 295 static struct cg_cgroup_link init_css_set_link;
296 296
297 static int cgroup_init_idr(struct cgroup_subsys *ss, 297 static int cgroup_init_idr(struct cgroup_subsys *ss,
298 struct cgroup_subsys_state *css); 298 struct cgroup_subsys_state *css);
299 299
300 /* css_set_lock protects the list of css_set objects, and the 300 /* css_set_lock protects the list of css_set objects, and the
301 * chain of tasks off each css_set. Nests outside task->alloc_lock 301 * chain of tasks off each css_set. Nests outside task->alloc_lock
302 * due to cgroup_iter_start() */ 302 * due to cgroup_iter_start() */
303 static DEFINE_RWLOCK(css_set_lock); 303 static DEFINE_RWLOCK(css_set_lock);
304 static int css_set_count; 304 static int css_set_count;
305 305
306 /* 306 /*
307 * hash table for cgroup groups. This improves the performance of finding 307 * hash table for cgroup groups. This improves the performance of finding
308 * an existing css_set. This hash doesn't (currently) take into 308 * an existing css_set. This hash doesn't (currently) take into
309 * account cgroups in empty hierarchies. 309 * account cgroups in empty hierarchies.
310 */ 310 */
311 #define CSS_SET_HASH_BITS 7 311 #define CSS_SET_HASH_BITS 7
312 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 312 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
313 static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 313 static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
314 314
315 static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 315 static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
316 { 316 {
317 int i; 317 int i;
318 int index; 318 int index;
319 unsigned long tmp = 0UL; 319 unsigned long tmp = 0UL;
320 320
321 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 321 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
322 tmp += (unsigned long)css[i]; 322 tmp += (unsigned long)css[i];
323 tmp = (tmp >> 16) ^ tmp; 323 tmp = (tmp >> 16) ^ tmp;
324 324
325 index = hash_long(tmp, CSS_SET_HASH_BITS); 325 index = hash_long(tmp, CSS_SET_HASH_BITS);
326 326
327 return &css_set_table[index]; 327 return &css_set_table[index];
328 } 328 }
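
css_set_hash() sums the subsystem state pointers, folds the sum, and maps it onto one of the 1 << CSS_SET_HASH_BITS buckets. A self-contained sketch of the same bucket selection follows; the kernel's hash_long() is approximated here with a simple multiplicative hash, so the exact bucket values differ, but the idea is the same.

#include <stdio.h>

#define HASH_BITS   7
#define TABLE_SIZE  (1u << HASH_BITS)

static unsigned int hash_ptrs(void *const ptrs[], int n)
{
        unsigned long tmp = 0;
        int i;

        for (i = 0; i < n; i++)
                tmp += (unsigned long)ptrs[i];
        tmp = (tmp >> 16) ^ tmp;

        /* multiplicative hash on the low 32 bits, keep the top HASH_BITS bits */
        return ((unsigned int)tmp * 0x9e370001u) >> (32 - HASH_BITS);
}

int main(void)
{
        int a, b, c;
        void *state[] = { &a, &b, &c, 0 };   /* stand-ins for css pointers */

        printf("bucket %u of %u\n", hash_ptrs(state, 4), TABLE_SIZE);
        return 0;
}
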
329 329
330 /* We don't maintain the lists running through each css_set to its 330 /* We don't maintain the lists running through each css_set to its
331 * task until after the first call to cgroup_iter_start(). This 331 * task until after the first call to cgroup_iter_start(). This
332 * reduces the fork()/exit() overhead for people who have cgroups 332 * reduces the fork()/exit() overhead for people who have cgroups
333 * compiled into their kernel but not actually in use */ 333 * compiled into their kernel but not actually in use */
334 static int use_task_css_set_links __read_mostly; 334 static int use_task_css_set_links __read_mostly;
335 335
336 static void __put_css_set(struct css_set *cg, int taskexit) 336 static void __put_css_set(struct css_set *cg, int taskexit)
337 { 337 {
338 struct cg_cgroup_link *link; 338 struct cg_cgroup_link *link;
339 struct cg_cgroup_link *saved_link; 339 struct cg_cgroup_link *saved_link;
340 /* 340 /*
341 * Ensure that the refcount doesn't hit zero while any readers 341 * Ensure that the refcount doesn't hit zero while any readers
342 * can see it. Similar to atomic_dec_and_lock(), but for an 342 * can see it. Similar to atomic_dec_and_lock(), but for an
343 * rwlock 343 * rwlock
344 */ 344 */
345 if (atomic_add_unless(&cg->refcount, -1, 1)) 345 if (atomic_add_unless(&cg->refcount, -1, 1))
346 return; 346 return;
347 write_lock(&css_set_lock); 347 write_lock(&css_set_lock);
348 if (!atomic_dec_and_test(&cg->refcount)) { 348 if (!atomic_dec_and_test(&cg->refcount)) {
349 write_unlock(&css_set_lock); 349 write_unlock(&css_set_lock);
350 return; 350 return;
351 } 351 }
352 352
353 /* This css_set is dead. unlink it and release cgroup refcounts */ 353 /* This css_set is dead. unlink it and release cgroup refcounts */
354 hlist_del(&cg->hlist); 354 hlist_del(&cg->hlist);
355 css_set_count--; 355 css_set_count--;
356 356
357 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 357 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
358 cg_link_list) { 358 cg_link_list) {
359 struct cgroup *cgrp = link->cgrp; 359 struct cgroup *cgrp = link->cgrp;
360 list_del(&link->cg_link_list); 360 list_del(&link->cg_link_list);
361 list_del(&link->cgrp_link_list); 361 list_del(&link->cgrp_link_list);
362 if (atomic_dec_and_test(&cgrp->count) && 362 if (atomic_dec_and_test(&cgrp->count) &&
363 notify_on_release(cgrp)) { 363 notify_on_release(cgrp)) {
364 if (taskexit) 364 if (taskexit)
365 set_bit(CGRP_RELEASABLE, &cgrp->flags); 365 set_bit(CGRP_RELEASABLE, &cgrp->flags);
366 check_for_release(cgrp); 366 check_for_release(cgrp);
367 } 367 }
368 368
369 kfree(link); 369 kfree(link);
370 } 370 }
371 371
372 write_unlock(&css_set_lock); 372 write_unlock(&css_set_lock);
373 kfree_rcu(cg, rcu_head); 373 kfree_rcu(cg, rcu_head);
374 } 374 }
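
__put_css_set() follows the atomic_dec_and_lock() idea: a common put decrements the refcount without css_set_lock via atomic_add_unless(), and only a potentially final put takes the lock before the count is allowed to reach zero, so lock holders never observe a half-torn-down css_set. A minimal userspace sketch of that pattern, with C11 atomics and a pthread mutex standing in for css_set_lock; all names are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

struct obj {
        _Atomic int refcount;
};

/* Add delta to *v unless *v already equals "unless"; returns 1 if it changed it. */
static int add_unless(_Atomic int *v, int delta, int unless)
{
        int old = atomic_load(v);

        while (old != unless) {
                if (atomic_compare_exchange_weak(v, &old, old + delta))
                        return 1;
        }
        return 0;
}

static void put_obj(struct obj *o)
{
        /* Fast path: plenty of references left, no lock taken. */
        if (add_unless(&o->refcount, -1, 1))
                return;

        /* Slow path: we may be the last user; take the lock first. */
        pthread_mutex_lock(&table_lock);
        if (atomic_fetch_sub(&o->refcount, 1) - 1 == 0)
                printf("last reference dropped: unhash and free under the lock\n");
        pthread_mutex_unlock(&table_lock);
}

int main(void)
{
        struct obj o = { 2 };

        put_obj(&o);    /* fast path */
        put_obj(&o);    /* slow path, final put */
        return 0;
}
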
375 375
376 /* 376 /*
377 * refcounted get/put for css_set objects 377 * refcounted get/put for css_set objects
378 */ 378 */
379 static inline void get_css_set(struct css_set *cg) 379 static inline void get_css_set(struct css_set *cg)
380 { 380 {
381 atomic_inc(&cg->refcount); 381 atomic_inc(&cg->refcount);
382 } 382 }
383 383
384 static inline void put_css_set(struct css_set *cg) 384 static inline void put_css_set(struct css_set *cg)
385 { 385 {
386 __put_css_set(cg, 0); 386 __put_css_set(cg, 0);
387 } 387 }
388 388
389 static inline void put_css_set_taskexit(struct css_set *cg) 389 static inline void put_css_set_taskexit(struct css_set *cg)
390 { 390 {
391 __put_css_set(cg, 1); 391 __put_css_set(cg, 1);
392 } 392 }
393 393
394 /* 394 /*
395 * compare_css_sets - helper function for find_existing_css_set(). 395 * compare_css_sets - helper function for find_existing_css_set().
396 * @cg: candidate css_set being tested 396 * @cg: candidate css_set being tested
397 * @old_cg: existing css_set for a task 397 * @old_cg: existing css_set for a task
398 * @new_cgrp: cgroup that's being entered by the task 398 * @new_cgrp: cgroup that's being entered by the task
399 * @template: desired set of css pointers in css_set (pre-calculated) 399 * @template: desired set of css pointers in css_set (pre-calculated)
400 * 400 *
401 * Returns true if "cg" matches "old_cg" except for the hierarchy 401 * Returns true if "cg" matches "old_cg" except for the hierarchy
402 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 402 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
403 */ 403 */
404 static bool compare_css_sets(struct css_set *cg, 404 static bool compare_css_sets(struct css_set *cg,
405 struct css_set *old_cg, 405 struct css_set *old_cg,
406 struct cgroup *new_cgrp, 406 struct cgroup *new_cgrp,
407 struct cgroup_subsys_state *template[]) 407 struct cgroup_subsys_state *template[])
408 { 408 {
409 struct list_head *l1, *l2; 409 struct list_head *l1, *l2;
410 410
411 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { 411 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
412 /* Not all subsystems matched */ 412 /* Not all subsystems matched */
413 return false; 413 return false;
414 } 414 }
415 415
416 /* 416 /*
417 * Compare cgroup pointers in order to distinguish between 417 * Compare cgroup pointers in order to distinguish between
418 * different cgroups in hierarchies with no subsystems. We 418 * different cgroups in hierarchies with no subsystems. We
419 * could get by with just this check alone (and skip the 419 * could get by with just this check alone (and skip the
420 * memcmp above) but on most setups the memcmp check will 420 * memcmp above) but on most setups the memcmp check will
421 * avoid the need for this more expensive check on almost all 421 * avoid the need for this more expensive check on almost all
422 * candidates. 422 * candidates.
423 */ 423 */
424 424
425 l1 = &cg->cg_links; 425 l1 = &cg->cg_links;
426 l2 = &old_cg->cg_links; 426 l2 = &old_cg->cg_links;
427 while (1) { 427 while (1) {
428 struct cg_cgroup_link *cgl1, *cgl2; 428 struct cg_cgroup_link *cgl1, *cgl2;
429 struct cgroup *cg1, *cg2; 429 struct cgroup *cg1, *cg2;
430 430
431 l1 = l1->next; 431 l1 = l1->next;
432 l2 = l2->next; 432 l2 = l2->next;
433 /* See if we reached the end - both lists are equal length. */ 433 /* See if we reached the end - both lists are equal length. */
434 if (l1 == &cg->cg_links) { 434 if (l1 == &cg->cg_links) {
435 BUG_ON(l2 != &old_cg->cg_links); 435 BUG_ON(l2 != &old_cg->cg_links);
436 break; 436 break;
437 } else { 437 } else {
438 BUG_ON(l2 == &old_cg->cg_links); 438 BUG_ON(l2 == &old_cg->cg_links);
439 } 439 }
440 /* Locate the cgroups associated with these links. */ 440 /* Locate the cgroups associated with these links. */
441 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); 441 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
442 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); 442 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
443 cg1 = cgl1->cgrp; 443 cg1 = cgl1->cgrp;
444 cg2 = cgl2->cgrp; 444 cg2 = cgl2->cgrp;
445 /* Hierarchies should be linked in the same order. */ 445 /* Hierarchies should be linked in the same order. */
446 BUG_ON(cg1->root != cg2->root); 446 BUG_ON(cg1->root != cg2->root);
447 447
448 /* 448 /*
449 * If this hierarchy is the hierarchy of the cgroup 449 * If this hierarchy is the hierarchy of the cgroup
450 * that's changing, then we need to check that this 450 * that's changing, then we need to check that this
451 * css_set points to the new cgroup; if it's any other 451 * css_set points to the new cgroup; if it's any other
452 * hierarchy, then this css_set should point to the 452 * hierarchy, then this css_set should point to the
453 * same cgroup as the old css_set. 453 * same cgroup as the old css_set.
454 */ 454 */
455 if (cg1->root == new_cgrp->root) { 455 if (cg1->root == new_cgrp->root) {
456 if (cg1 != new_cgrp) 456 if (cg1 != new_cgrp)
457 return false; 457 return false;
458 } else { 458 } else {
459 if (cg1 != cg2) 459 if (cg1 != cg2)
460 return false; 460 return false;
461 } 461 }
462 } 462 }
463 return true; 463 return true;
464 } 464 }
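
compare_css_sets() is a two-stage test: the cheap memcmp() over the per-subsystem pointer array rejects almost every candidate, and only a candidate that passes it pays for the per-hierarchy walk, where the hierarchy being changed must point at the new cgroup and every other hierarchy must match the old css_set. A simplified sketch of the same decision, with the link-list walk reduced to an array purely for illustration.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NSUBSYS 4
#define NROOTS  2

struct set {
        void *subsys[NSUBSYS];  /* one state pointer per subsystem */
        int   cgrp[NROOTS];     /* which cgroup this set uses, per hierarchy */
};

static bool sets_match(const struct set *cand, const struct set *old,
                       int changing_root, int new_cgrp)
{
        int i;

        /* Stage 1: cheap rejection of almost all candidates. */
        if (memcmp(cand->subsys, old->subsys, sizeof(old->subsys)))
                return false;

        /* Stage 2: per-hierarchy check, walking both sets in lockstep. */
        for (i = 0; i < NROOTS; i++) {
                int want = (i == changing_root) ? new_cgrp : old->cgrp[i];

                if (cand->cgrp[i] != want)
                        return false;
        }
        return true;
}

int main(void)
{
        static int a, b;
        struct set old  = { { &a, &b, NULL, NULL }, { 1, 7 } };
        struct set cand = { { &a, &b, NULL, NULL }, { 1, 9 } };

        /* The task is entering cgroup 9 in hierarchy 1; the rest must match. */
        printf("%s\n", sets_match(&cand, &old, 1, 9) ? "match" : "no match");
        return 0;
}
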
465 465
466 /* 466 /*
467 * find_existing_css_set() is a helper for 467 * find_existing_css_set() is a helper for
468 * find_css_set(), and checks to see whether an existing 468 * find_css_set(), and checks to see whether an existing
469 * css_set is suitable. 469 * css_set is suitable.
470 * 470 *
471 * oldcg: the cgroup group that we're using before the cgroup 471 * oldcg: the cgroup group that we're using before the cgroup
472 * transition 472 * transition
473 * 473 *
474 * cgrp: the cgroup that we're moving into 474 * cgrp: the cgroup that we're moving into
475 * 475 *
476 * template: location in which to build the desired set of subsystem 476 * template: location in which to build the desired set of subsystem
477 * state objects for the new cgroup group 477 * state objects for the new cgroup group
478 */ 478 */
479 static struct css_set *find_existing_css_set( 479 static struct css_set *find_existing_css_set(
480 struct css_set *oldcg, 480 struct css_set *oldcg,
481 struct cgroup *cgrp, 481 struct cgroup *cgrp,
482 struct cgroup_subsys_state *template[]) 482 struct cgroup_subsys_state *template[])
483 { 483 {
484 int i; 484 int i;
485 struct cgroupfs_root *root = cgrp->root; 485 struct cgroupfs_root *root = cgrp->root;
486 struct hlist_head *hhead; 486 struct hlist_head *hhead;
487 struct hlist_node *node; 487 struct hlist_node *node;
488 struct css_set *cg; 488 struct css_set *cg;
489 489
490 /* 490 /*
491 * Build the set of subsystem state objects that we want to see in the 491 * Build the set of subsystem state objects that we want to see in the
492 * new css_set. while subsystems can change globally, the entries here 492 * new css_set. while subsystems can change globally, the entries here
493 * won't change, so no need for locking. 493 * won't change, so no need for locking.
494 */ 494 */
495 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 495 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
496 if (root->subsys_bits & (1UL << i)) { 496 if (root->subsys_bits & (1UL << i)) {
497 /* Subsystem is in this hierarchy. So we want 497 /* Subsystem is in this hierarchy. So we want
498 * the subsystem state from the new 498 * the subsystem state from the new
499 * cgroup */ 499 * cgroup */
500 template[i] = cgrp->subsys[i]; 500 template[i] = cgrp->subsys[i];
501 } else { 501 } else {
502 /* Subsystem is not in this hierarchy, so we 502 /* Subsystem is not in this hierarchy, so we
503 * don't want to change the subsystem state */ 503 * don't want to change the subsystem state */
504 template[i] = oldcg->subsys[i]; 504 template[i] = oldcg->subsys[i];
505 } 505 }
506 } 506 }
507 507
508 hhead = css_set_hash(template); 508 hhead = css_set_hash(template);
509 hlist_for_each_entry(cg, node, hhead, hlist) { 509 hlist_for_each_entry(cg, node, hhead, hlist) {
510 if (!compare_css_sets(cg, oldcg, cgrp, template)) 510 if (!compare_css_sets(cg, oldcg, cgrp, template))
511 continue; 511 continue;
512 512
513 /* This css_set matches what we need */ 513 /* This css_set matches what we need */
514 return cg; 514 return cg;
515 } 515 }
516 516
517 /* No existing cgroup group matched */ 517 /* No existing cgroup group matched */
518 return NULL; 518 return NULL;
519 } 519 }
520 520
521 static void free_cg_links(struct list_head *tmp) 521 static void free_cg_links(struct list_head *tmp)
522 { 522 {
523 struct cg_cgroup_link *link; 523 struct cg_cgroup_link *link;
524 struct cg_cgroup_link *saved_link; 524 struct cg_cgroup_link *saved_link;
525 525
526 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { 526 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
527 list_del(&link->cgrp_link_list); 527 list_del(&link->cgrp_link_list);
528 kfree(link); 528 kfree(link);
529 } 529 }
530 } 530 }
531 531
532 /* 532 /*
533 * allocate_cg_links() allocates "count" cg_cgroup_link structures 533 * allocate_cg_links() allocates "count" cg_cgroup_link structures
534 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 534 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
535 * success or a negative error code. 535 * success or a negative error code.
536 */ 536 */
537 static int allocate_cg_links(int count, struct list_head *tmp) 537 static int allocate_cg_links(int count, struct list_head *tmp)
538 { 538 {
539 struct cg_cgroup_link *link; 539 struct cg_cgroup_link *link;
540 int i; 540 int i;
541 INIT_LIST_HEAD(tmp); 541 INIT_LIST_HEAD(tmp);
542 for (i = 0; i < count; i++) { 542 for (i = 0; i < count; i++) {
543 link = kmalloc(sizeof(*link), GFP_KERNEL); 543 link = kmalloc(sizeof(*link), GFP_KERNEL);
544 if (!link) { 544 if (!link) {
545 free_cg_links(tmp); 545 free_cg_links(tmp);
546 return -ENOMEM; 546 return -ENOMEM;
547 } 547 }
548 list_add(&link->cgrp_link_list, tmp); 548 list_add(&link->cgrp_link_list, tmp);
549 } 549 }
550 return 0; 550 return 0;
551 } 551 }
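
allocate_cg_links() reserves every cg_cgroup_link it might need on a private list before any of them is published, and frees the whole batch if one allocation fails, so callers never have to unwind a half-linked css_set. A sketch of that reserve-or-roll-back pattern on a plain singly linked list; nothing here is kernel API.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct link {
        struct link *next;
};

static void free_links(struct link **tmp)
{
        while (*tmp) {
                struct link *l = *tmp;

                *tmp = l->next;
                free(l);
        }
}

static int allocate_links(int count, struct link **tmp)
{
        int i;

        *tmp = NULL;
        for (i = 0; i < count; i++) {
                struct link *l = malloc(sizeof(*l));

                if (!l) {
                        free_links(tmp);        /* roll back everything */
                        return -ENOMEM;
                }
                l->next = *tmp;                 /* chain onto the private list */
                *tmp = l;
        }
        return 0;
}

int main(void)
{
        struct link *tmp;

        if (allocate_links(4, &tmp) == 0) {
                printf("reserved 4 links up front\n");
                free_links(&tmp);               /* normally consumed one by one */
        }
        return 0;
}
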
552 552
553 /** 553 /**
554 * link_css_set - a helper function to link a css_set to a cgroup 554 * link_css_set - a helper function to link a css_set to a cgroup
555 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() 555 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
556 * @cg: the css_set to be linked 556 * @cg: the css_set to be linked
557 * @cgrp: the destination cgroup 557 * @cgrp: the destination cgroup
558 */ 558 */
559 static void link_css_set(struct list_head *tmp_cg_links, 559 static void link_css_set(struct list_head *tmp_cg_links,
560 struct css_set *cg, struct cgroup *cgrp) 560 struct css_set *cg, struct cgroup *cgrp)
561 { 561 {
562 struct cg_cgroup_link *link; 562 struct cg_cgroup_link *link;
563 563
564 BUG_ON(list_empty(tmp_cg_links)); 564 BUG_ON(list_empty(tmp_cg_links));
565 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 565 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
566 cgrp_link_list); 566 cgrp_link_list);
567 link->cg = cg; 567 link->cg = cg;
568 link->cgrp = cgrp; 568 link->cgrp = cgrp;
569 atomic_inc(&cgrp->count); 569 atomic_inc(&cgrp->count);
570 list_move(&link->cgrp_link_list, &cgrp->css_sets); 570 list_move(&link->cgrp_link_list, &cgrp->css_sets);
571 /* 571 /*
572 * Always add links to the tail of the list so that the list 572 * Always add links to the tail of the list so that the list
573 * is sorted by order of hierarchy creation 573 * is sorted by order of hierarchy creation
574 */ 574 */
575 list_add_tail(&link->cg_link_list, &cg->cg_links); 575 list_add_tail(&link->cg_link_list, &cg->cg_links);
576 } 576 }
577 577
578 /* 578 /*
579 * find_css_set() takes an existing cgroup group and a 579 * find_css_set() takes an existing cgroup group and a
580 * cgroup object, and returns a css_set object that's 580 * cgroup object, and returns a css_set object that's
581 * equivalent to the old group, but with the given cgroup 581 * equivalent to the old group, but with the given cgroup
582 * substituted into the appropriate hierarchy. Must be called with 582 * substituted into the appropriate hierarchy. Must be called with
583 * cgroup_mutex held 583 * cgroup_mutex held
584 */ 584 */
585 static struct css_set *find_css_set( 585 static struct css_set *find_css_set(
586 struct css_set *oldcg, struct cgroup *cgrp) 586 struct css_set *oldcg, struct cgroup *cgrp)
587 { 587 {
588 struct css_set *res; 588 struct css_set *res;
589 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 589 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
590 590
591 struct list_head tmp_cg_links; 591 struct list_head tmp_cg_links;
592 592
593 struct hlist_head *hhead; 593 struct hlist_head *hhead;
594 struct cg_cgroup_link *link; 594 struct cg_cgroup_link *link;
595 595
596 /* First see if we already have a cgroup group that matches 596 /* First see if we already have a cgroup group that matches
597 * the desired set */ 597 * the desired set */
598 read_lock(&css_set_lock); 598 read_lock(&css_set_lock);
599 res = find_existing_css_set(oldcg, cgrp, template); 599 res = find_existing_css_set(oldcg, cgrp, template);
600 if (res) 600 if (res)
601 get_css_set(res); 601 get_css_set(res);
602 read_unlock(&css_set_lock); 602 read_unlock(&css_set_lock);
603 603
604 if (res) 604 if (res)
605 return res; 605 return res;
606 606
607 res = kmalloc(sizeof(*res), GFP_KERNEL); 607 res = kmalloc(sizeof(*res), GFP_KERNEL);
608 if (!res) 608 if (!res)
609 return NULL; 609 return NULL;
610 610
611 /* Allocate all the cg_cgroup_link objects that we'll need */ 611 /* Allocate all the cg_cgroup_link objects that we'll need */
612 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 612 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
613 kfree(res); 613 kfree(res);
614 return NULL; 614 return NULL;
615 } 615 }
616 616
617 atomic_set(&res->refcount, 1); 617 atomic_set(&res->refcount, 1);
618 INIT_LIST_HEAD(&res->cg_links); 618 INIT_LIST_HEAD(&res->cg_links);
619 INIT_LIST_HEAD(&res->tasks); 619 INIT_LIST_HEAD(&res->tasks);
620 INIT_HLIST_NODE(&res->hlist); 620 INIT_HLIST_NODE(&res->hlist);
621 621
622 /* Copy the set of subsystem state objects generated in 622 /* Copy the set of subsystem state objects generated in
623 * find_existing_css_set() */ 623 * find_existing_css_set() */
624 memcpy(res->subsys, template, sizeof(res->subsys)); 624 memcpy(res->subsys, template, sizeof(res->subsys));
625 625
626 write_lock(&css_set_lock); 626 write_lock(&css_set_lock);
627 /* Add reference counts and links from the new css_set. */ 627 /* Add reference counts and links from the new css_set. */
628 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 628 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
629 struct cgroup *c = link->cgrp; 629 struct cgroup *c = link->cgrp;
630 if (c->root == cgrp->root) 630 if (c->root == cgrp->root)
631 c = cgrp; 631 c = cgrp;
632 link_css_set(&tmp_cg_links, res, c); 632 link_css_set(&tmp_cg_links, res, c);
633 } 633 }
634 634
635 BUG_ON(!list_empty(&tmp_cg_links)); 635 BUG_ON(!list_empty(&tmp_cg_links));
636 636
637 css_set_count++; 637 css_set_count++;
638 638
639 /* Add this cgroup group to the hash table */ 639 /* Add this cgroup group to the hash table */
640 hhead = css_set_hash(res->subsys); 640 hhead = css_set_hash(res->subsys);
641 hlist_add_head(&res->hlist, hhead); 641 hlist_add_head(&res->hlist, hhead);
642 642
643 write_unlock(&css_set_lock); 643 write_unlock(&css_set_lock);
644 644
645 return res; 645 return res;
646 } 646 }
647 647
648 /* 648 /*
649 * Return the cgroup for "task" from the given hierarchy. Must be 649 * Return the cgroup for "task" from the given hierarchy. Must be
650 * called with cgroup_mutex held. 650 * called with cgroup_mutex held.
651 */ 651 */
652 static struct cgroup *task_cgroup_from_root(struct task_struct *task, 652 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
653 struct cgroupfs_root *root) 653 struct cgroupfs_root *root)
654 { 654 {
655 struct css_set *css; 655 struct css_set *css;
656 struct cgroup *res = NULL; 656 struct cgroup *res = NULL;
657 657
658 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 658 BUG_ON(!mutex_is_locked(&cgroup_mutex));
659 read_lock(&css_set_lock); 659 read_lock(&css_set_lock);
660 /* 660 /*
661 * No need to lock the task - since we hold cgroup_mutex the 661 * No need to lock the task - since we hold cgroup_mutex the
662 * task can't change groups, so the only thing that can happen 662 * task can't change groups, so the only thing that can happen
663 * is that it exits and its css is set back to init_css_set. 663 * is that it exits and its css is set back to init_css_set.
664 */ 664 */
665 css = task->cgroups; 665 css = task->cgroups;
666 if (css == &init_css_set) { 666 if (css == &init_css_set) {
667 res = &root->top_cgroup; 667 res = &root->top_cgroup;
668 } else { 668 } else {
669 struct cg_cgroup_link *link; 669 struct cg_cgroup_link *link;
670 list_for_each_entry(link, &css->cg_links, cg_link_list) { 670 list_for_each_entry(link, &css->cg_links, cg_link_list) {
671 struct cgroup *c = link->cgrp; 671 struct cgroup *c = link->cgrp;
672 if (c->root == root) { 672 if (c->root == root) {
673 res = c; 673 res = c;
674 break; 674 break;
675 } 675 }
676 } 676 }
677 } 677 }
678 read_unlock(&css_set_lock); 678 read_unlock(&css_set_lock);
679 BUG_ON(!res); 679 BUG_ON(!res);
680 return res; 680 return res;
681 } 681 }
682 682
683 /* 683 /*
684 * There is one global cgroup mutex. We also require taking 684 * There is one global cgroup mutex. We also require taking
685 * task_lock() when dereferencing a task's cgroup subsys pointers. 685 * task_lock() when dereferencing a task's cgroup subsys pointers.
686 * See "The task_lock() exception", at the end of this comment. 686 * See "The task_lock() exception", at the end of this comment.
687 * 687 *
688 * A task must hold cgroup_mutex to modify cgroups. 688 * A task must hold cgroup_mutex to modify cgroups.
689 * 689 *
690 * Any task can increment and decrement the count field without lock. 690 * Any task can increment and decrement the count field without lock.
691 * So in general, code holding cgroup_mutex can't rely on the count 691 * So in general, code holding cgroup_mutex can't rely on the count
692 * field not changing. However, if the count goes to zero, then only 692 * field not changing. However, if the count goes to zero, then only
693 * cgroup_attach_task() can increment it again. Because a count of zero 693 * cgroup_attach_task() can increment it again. Because a count of zero
694 * means that no tasks are currently attached, therefore there is no 694 * means that no tasks are currently attached, therefore there is no
695 * way a task attached to that cgroup can fork (the other way to 695 * way a task attached to that cgroup can fork (the other way to
696 * increment the count). So code holding cgroup_mutex can safely 696 * increment the count). So code holding cgroup_mutex can safely
697 * assume that if the count is zero, it will stay zero. Similarly, if 697 * assume that if the count is zero, it will stay zero. Similarly, if
698 * a task holds cgroup_mutex on a cgroup with zero count, it 698 * a task holds cgroup_mutex on a cgroup with zero count, it
699 * knows that the cgroup won't be removed, as cgroup_rmdir() 699 * knows that the cgroup won't be removed, as cgroup_rmdir()
700 * needs that mutex. 700 * needs that mutex.
701 * 701 *
702 * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't 702 * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't
703 * (usually) take cgroup_mutex. These are the two most performance 703 * (usually) take cgroup_mutex. These are the two most performance
704 * critical pieces of code here. The exception occurs on cgroup_exit(), 704 * critical pieces of code here. The exception occurs on cgroup_exit(),
705 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 705 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
706 * is taken, and if the cgroup count is zero, a usermode call is made 706 * is taken, and if the cgroup count is zero, a usermode call is made
707 * to the release agent with the name of the cgroup (path relative to 707 * to the release agent with the name of the cgroup (path relative to
708 * the root of cgroup file system) as the argument. 708 * the root of cgroup file system) as the argument.
709 * 709 *
710 * A cgroup can only be deleted if both its 'count' of using tasks 710 * A cgroup can only be deleted if both its 'count' of using tasks
711 * is zero, and its list of 'children' cgroups is empty. Since all 711 * is zero, and its list of 'children' cgroups is empty. Since all
712 * tasks in the system use _some_ cgroup, and since there is always at 712 * tasks in the system use _some_ cgroup, and since there is always at
713 * least one task in the system (init, pid == 1), therefore, top_cgroup 713 * least one task in the system (init, pid == 1), therefore, top_cgroup
714 * always has either child cgroups and/or using tasks. So we don't 714 * always has either child cgroups and/or using tasks. So we don't
715 * need a special hack to ensure that top_cgroup cannot be deleted. 715 * need a special hack to ensure that top_cgroup cannot be deleted.
716 * 716 *
717 * The task_lock() exception 717 * The task_lock() exception
718 * 718 *
719 * The need for this exception arises from the action of 719 * The need for this exception arises from the action of
720 * cgroup_attach_task(), which overwrites one task's cgroup pointer with 720 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
721 * another. It does so using cgroup_mutex, however there are 721 * another. It does so using cgroup_mutex, however there are
722 * several performance critical places that need to reference 722 * several performance critical places that need to reference
723 * task->cgroup without the expense of grabbing a system global 723 * task->cgroup without the expense of grabbing a system global
724 * mutex. Therefore except as noted below, when dereferencing or, as 724 * mutex. Therefore except as noted below, when dereferencing or, as
725 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use 725 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use
726 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 726 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
727 * the task_struct routinely used for such matters. 727 * the task_struct routinely used for such matters.
728 * 728 *
729 * P.S. One more locking exception. RCU is used to guard the 729 * P.S. One more locking exception. RCU is used to guard the
730 * update of a task's cgroup pointer by cgroup_attach_task(). 730 * update of a task's cgroup pointer by cgroup_attach_task().
731 */ 731 */
732 732
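The comment above is the reader/writer contract the rest of this file relies on. A minimal reader-side sketch, not part of this patch (example_task_css_set_refcount() is a hypothetical helper assuming this file's existing includes): an RCU read-side critical section is enough to follow task->cgroups, because cgroup_attach_task() publishes the replacement pointer under task_lock() and cgroup_mutex and the old css_set is only freed after a grace period.

static int example_task_css_set_refcount(struct task_struct *tsk)
{
	int count;

	rcu_read_lock();
	/*
	 * tsk->cgroups may be swapped by cgroup_attach_task() at any
	 * time; RCU keeps the css_set seen here valid until
	 * rcu_read_unlock().
	 */
	count = atomic_read(&rcu_dereference(tsk->cgroups)->refcount);
	rcu_read_unlock();

	return count;
}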
733 /** 733 /**
734 * cgroup_lock - lock out any changes to cgroup structures 734 * cgroup_lock - lock out any changes to cgroup structures
735 * 735 *
736 */ 736 */
737 void cgroup_lock(void) 737 void cgroup_lock(void)
738 { 738 {
739 mutex_lock(&cgroup_mutex); 739 mutex_lock(&cgroup_mutex);
740 } 740 }
741 EXPORT_SYMBOL_GPL(cgroup_lock); 741 EXPORT_SYMBOL_GPL(cgroup_lock);
742 742
743 /** 743 /**
744 * cgroup_unlock - release lock on cgroup changes 744 * cgroup_unlock - release lock on cgroup changes
745 * 745 *
746 * Undo the lock taken in a previous cgroup_lock() call. 746 * Undo the lock taken in a previous cgroup_lock() call.
747 */ 747 */
748 void cgroup_unlock(void) 748 void cgroup_unlock(void)
749 { 749 {
750 mutex_unlock(&cgroup_mutex); 750 mutex_unlock(&cgroup_mutex);
751 } 751 }
752 EXPORT_SYMBOL_GPL(cgroup_unlock); 752 EXPORT_SYMBOL_GPL(cgroup_unlock);
753 753
754 /* 754 /*
755 * A couple of forward declarations are required, due to a cyclic reference loop: 755 * A couple of forward declarations are required, due to a cyclic reference loop:
756 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 756 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
757 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations 757 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
758 * -> cgroup_mkdir. 758 * -> cgroup_mkdir.
759 */ 759 */
760 760
761 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 761 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
762 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 762 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
763 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 763 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
764 static int cgroup_populate_dir(struct cgroup *cgrp); 764 static int cgroup_populate_dir(struct cgroup *cgrp);
765 static const struct inode_operations cgroup_dir_inode_operations; 765 static const struct inode_operations cgroup_dir_inode_operations;
766 static const struct file_operations proc_cgroupstats_operations; 766 static const struct file_operations proc_cgroupstats_operations;
767 767
768 static struct backing_dev_info cgroup_backing_dev_info = { 768 static struct backing_dev_info cgroup_backing_dev_info = {
769 .name = "cgroup", 769 .name = "cgroup",
770 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 770 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
771 }; 771 };
772 772
773 static int alloc_css_id(struct cgroup_subsys *ss, 773 static int alloc_css_id(struct cgroup_subsys *ss,
774 struct cgroup *parent, struct cgroup *child); 774 struct cgroup *parent, struct cgroup *child);
775 775
776 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 776 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
777 { 777 {
778 struct inode *inode = new_inode(sb); 778 struct inode *inode = new_inode(sb);
779 779
780 if (inode) { 780 if (inode) {
781 inode->i_ino = get_next_ino(); 781 inode->i_ino = get_next_ino();
782 inode->i_mode = mode; 782 inode->i_mode = mode;
783 inode->i_uid = current_fsuid(); 783 inode->i_uid = current_fsuid();
784 inode->i_gid = current_fsgid(); 784 inode->i_gid = current_fsgid();
785 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 785 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
786 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 786 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
787 } 787 }
788 return inode; 788 return inode;
789 } 789 }
790 790
791 /* 791 /*
792 * Call subsys's pre_destroy handler. 792 * Call subsys's pre_destroy handler.
793 * This is called before css refcnt check. 793 * This is called before css refcnt check.
794 */ 794 */
795 static int cgroup_call_pre_destroy(struct cgroup *cgrp) 795 static int cgroup_call_pre_destroy(struct cgroup *cgrp)
796 { 796 {
797 struct cgroup_subsys *ss; 797 struct cgroup_subsys *ss;
798 int ret = 0; 798 int ret = 0;
799 799
800 for_each_subsys(cgrp->root, ss) 800 for_each_subsys(cgrp->root, ss)
801 if (ss->pre_destroy) { 801 if (ss->pre_destroy) {
802 ret = ss->pre_destroy(ss, cgrp); 802 ret = ss->pre_destroy(ss, cgrp);
803 if (ret) 803 if (ret)
804 break; 804 break;
805 } 805 }
806 806
807 return ret; 807 return ret;
808 } 808 }
809 809
810 static void cgroup_diput(struct dentry *dentry, struct inode *inode) 810 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
811 { 811 {
812 /* is dentry a directory ? if so, kfree() associated cgroup */ 812 /* is dentry a directory ? if so, kfree() associated cgroup */
813 if (S_ISDIR(inode->i_mode)) { 813 if (S_ISDIR(inode->i_mode)) {
814 struct cgroup *cgrp = dentry->d_fsdata; 814 struct cgroup *cgrp = dentry->d_fsdata;
815 struct cgroup_subsys *ss; 815 struct cgroup_subsys *ss;
816 BUG_ON(!(cgroup_is_removed(cgrp))); 816 BUG_ON(!(cgroup_is_removed(cgrp)));
817 /* It's possible for external users to be holding css 817 /* It's possible for external users to be holding css
818 * reference counts on a cgroup; css_put() needs to 818 * reference counts on a cgroup; css_put() needs to
819 * be able to access the cgroup after decrementing 819 * be able to access the cgroup after decrementing
820 * the reference count in order to know if it needs to 820 * the reference count in order to know if it needs to
821 * queue the cgroup to be handled by the release 821 * queue the cgroup to be handled by the release
822 * agent */ 822 * agent */
823 synchronize_rcu(); 823 synchronize_rcu();
824 824
825 mutex_lock(&cgroup_mutex); 825 mutex_lock(&cgroup_mutex);
826 /* 826 /*
827 * Release the subsystem state objects. 827 * Release the subsystem state objects.
828 */ 828 */
829 for_each_subsys(cgrp->root, ss) 829 for_each_subsys(cgrp->root, ss)
830 ss->destroy(ss, cgrp); 830 ss->destroy(ss, cgrp);
831 831
832 cgrp->root->number_of_cgroups--; 832 cgrp->root->number_of_cgroups--;
833 mutex_unlock(&cgroup_mutex); 833 mutex_unlock(&cgroup_mutex);
834 834
835 /* 835 /*
836 * Drop the active superblock reference that we took when we 836 * Drop the active superblock reference that we took when we
837 * created the cgroup 837 * created the cgroup
838 */ 838 */
839 deactivate_super(cgrp->root->sb); 839 deactivate_super(cgrp->root->sb);
840 840
841 /* 841 /*
842 * if we're getting rid of the cgroup, refcount should ensure 842 * if we're getting rid of the cgroup, refcount should ensure
843 * that there are no pidlists left. 843 * that there are no pidlists left.
844 */ 844 */
845 BUG_ON(!list_empty(&cgrp->pidlists)); 845 BUG_ON(!list_empty(&cgrp->pidlists));
846 846
847 kfree_rcu(cgrp, rcu_head); 847 kfree_rcu(cgrp, rcu_head);
848 } 848 }
849 iput(inode); 849 iput(inode);
850 } 850 }
851 851
852 static int cgroup_delete(const struct dentry *d) 852 static int cgroup_delete(const struct dentry *d)
853 { 853 {
854 return 1; 854 return 1;
855 } 855 }
856 856
857 static void remove_dir(struct dentry *d) 857 static void remove_dir(struct dentry *d)
858 { 858 {
859 struct dentry *parent = dget(d->d_parent); 859 struct dentry *parent = dget(d->d_parent);
860 860
861 d_delete(d); 861 d_delete(d);
862 simple_rmdir(parent->d_inode, d); 862 simple_rmdir(parent->d_inode, d);
863 dput(parent); 863 dput(parent);
864 } 864 }
865 865
866 static void cgroup_clear_directory(struct dentry *dentry) 866 static void cgroup_clear_directory(struct dentry *dentry)
867 { 867 {
868 struct list_head *node; 868 struct list_head *node;
869 869
870 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 870 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
871 spin_lock(&dentry->d_lock); 871 spin_lock(&dentry->d_lock);
872 node = dentry->d_subdirs.next; 872 node = dentry->d_subdirs.next;
873 while (node != &dentry->d_subdirs) { 873 while (node != &dentry->d_subdirs) {
874 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 874 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
875 875
876 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 876 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
877 list_del_init(node); 877 list_del_init(node);
878 if (d->d_inode) { 878 if (d->d_inode) {
879 /* This should never be called on a cgroup 879 /* This should never be called on a cgroup
880 * directory with child cgroups */ 880 * directory with child cgroups */
881 BUG_ON(d->d_inode->i_mode & S_IFDIR); 881 BUG_ON(d->d_inode->i_mode & S_IFDIR);
882 dget_dlock(d); 882 dget_dlock(d);
883 spin_unlock(&d->d_lock); 883 spin_unlock(&d->d_lock);
884 spin_unlock(&dentry->d_lock); 884 spin_unlock(&dentry->d_lock);
885 d_delete(d); 885 d_delete(d);
886 simple_unlink(dentry->d_inode, d); 886 simple_unlink(dentry->d_inode, d);
887 dput(d); 887 dput(d);
888 spin_lock(&dentry->d_lock); 888 spin_lock(&dentry->d_lock);
889 } else 889 } else
890 spin_unlock(&d->d_lock); 890 spin_unlock(&d->d_lock);
891 node = dentry->d_subdirs.next; 891 node = dentry->d_subdirs.next;
892 } 892 }
893 spin_unlock(&dentry->d_lock); 893 spin_unlock(&dentry->d_lock);
894 } 894 }
895 895
896 /* 896 /*
897 * NOTE : the dentry must have been dget()'ed 897 * NOTE : the dentry must have been dget()'ed
898 */ 898 */
899 static void cgroup_d_remove_dir(struct dentry *dentry) 899 static void cgroup_d_remove_dir(struct dentry *dentry)
900 { 900 {
901 struct dentry *parent; 901 struct dentry *parent;
902 902
903 cgroup_clear_directory(dentry); 903 cgroup_clear_directory(dentry);
904 904
905 parent = dentry->d_parent; 905 parent = dentry->d_parent;
906 spin_lock(&parent->d_lock); 906 spin_lock(&parent->d_lock);
907 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 907 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
908 list_del_init(&dentry->d_u.d_child); 908 list_del_init(&dentry->d_u.d_child);
909 spin_unlock(&dentry->d_lock); 909 spin_unlock(&dentry->d_lock);
910 spin_unlock(&parent->d_lock); 910 spin_unlock(&parent->d_lock);
911 remove_dir(dentry); 911 remove_dir(dentry);
912 } 912 }
913 913
914 /* 914 /*
915 * A queue for waiters wanting to rmdir() a cgroup. A task will sleep when 915 * A queue for waiters wanting to rmdir() a cgroup. A task will sleep when
916 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some 916 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
917 * reference to css->refcnt. In general, this refcnt is expected to go down 917 * reference to css->refcnt. In general, this refcnt is expected to go down
918 * to zero soon. 918 * to zero soon.
919 * 919 *
920 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 920 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
921 */ 921 */
922 DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 922 DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
923 923
924 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 924 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
925 { 925 {
926 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 926 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
927 wake_up_all(&cgroup_rmdir_waitq); 927 wake_up_all(&cgroup_rmdir_waitq);
928 } 928 }
929 929
930 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) 930 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
931 { 931 {
932 css_get(css); 932 css_get(css);
933 } 933 }
934 934
935 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) 935 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
936 { 936 {
937 cgroup_wakeup_rmdir_waiter(css->cgroup); 937 cgroup_wakeup_rmdir_waiter(css->cgroup);
938 css_put(css); 938 css_put(css);
939 } 939 }
940 940
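cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir() above are the pair a controller uses to pin a cgroup across a long-running operation so that a concurrent rmdir() sleeps on cgroup_rmdir_waitq until the temporary css reference goes away (the memory controller is the sort of user these were added for). A hedged sketch; example_slow_operation() is hypothetical:

static void example_slow_operation(struct cgroup_subsys_state *css)
{
	cgroup_exclude_rmdir(css);		/* take a css reference */

	/* ... long work during which rmdir() should wait rather than fail ... */

	cgroup_release_and_wakeup_rmdir(css);	/* drop it, wake any rmdir() waiter */
}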
941 /* 941 /*
942 * Call with cgroup_mutex held. Drops reference counts on modules, including 942 * Call with cgroup_mutex held. Drops reference counts on modules, including
943 * any duplicate ones that parse_cgroupfs_options took. If this function 943 * any duplicate ones that parse_cgroupfs_options took. If this function
944 * returns an error, no reference counts are touched. 944 * returns an error, no reference counts are touched.
945 */ 945 */
946 static int rebind_subsystems(struct cgroupfs_root *root, 946 static int rebind_subsystems(struct cgroupfs_root *root,
947 unsigned long final_bits) 947 unsigned long final_bits)
948 { 948 {
949 unsigned long added_bits, removed_bits; 949 unsigned long added_bits, removed_bits;
950 struct cgroup *cgrp = &root->top_cgroup; 950 struct cgroup *cgrp = &root->top_cgroup;
951 int i; 951 int i;
952 952
953 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 953 BUG_ON(!mutex_is_locked(&cgroup_mutex));
954 954
955 removed_bits = root->actual_subsys_bits & ~final_bits; 955 removed_bits = root->actual_subsys_bits & ~final_bits;
956 added_bits = final_bits & ~root->actual_subsys_bits; 956 added_bits = final_bits & ~root->actual_subsys_bits;
957 /* Check that any added subsystems are currently free */ 957 /* Check that any added subsystems are currently free */
958 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 958 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
959 unsigned long bit = 1UL << i; 959 unsigned long bit = 1UL << i;
960 struct cgroup_subsys *ss = subsys[i]; 960 struct cgroup_subsys *ss = subsys[i];
961 if (!(bit & added_bits)) 961 if (!(bit & added_bits))
962 continue; 962 continue;
963 /* 963 /*
964 * Nobody should tell us to do a subsys that doesn't exist: 964 * Nobody should tell us to do a subsys that doesn't exist:
965 * parse_cgroupfs_options should catch that case and refcounts 965 * parse_cgroupfs_options should catch that case and refcounts
966 * ensure that subsystems won't disappear once selected. 966 * ensure that subsystems won't disappear once selected.
967 */ 967 */
968 BUG_ON(ss == NULL); 968 BUG_ON(ss == NULL);
969 if (ss->root != &rootnode) { 969 if (ss->root != &rootnode) {
970 /* Subsystem isn't free */ 970 /* Subsystem isn't free */
971 return -EBUSY; 971 return -EBUSY;
972 } 972 }
973 } 973 }
974 974
975 /* Currently we don't handle adding/removing subsystems when 975 /* Currently we don't handle adding/removing subsystems when
976 * any child cgroups exist. This is theoretically supportable 976 * any child cgroups exist. This is theoretically supportable
977 * but involves complex error handling, so it's being left until 977 * but involves complex error handling, so it's being left until
978 * later */ 978 * later */
979 if (root->number_of_cgroups > 1) 979 if (root->number_of_cgroups > 1)
980 return -EBUSY; 980 return -EBUSY;
981 981
982 /* Process each subsystem */ 982 /* Process each subsystem */
983 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 983 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
984 struct cgroup_subsys *ss = subsys[i]; 984 struct cgroup_subsys *ss = subsys[i];
985 unsigned long bit = 1UL << i; 985 unsigned long bit = 1UL << i;
986 if (bit & added_bits) { 986 if (bit & added_bits) {
987 /* We're binding this subsystem to this hierarchy */ 987 /* We're binding this subsystem to this hierarchy */
988 BUG_ON(ss == NULL); 988 BUG_ON(ss == NULL);
989 BUG_ON(cgrp->subsys[i]); 989 BUG_ON(cgrp->subsys[i]);
990 BUG_ON(!dummytop->subsys[i]); 990 BUG_ON(!dummytop->subsys[i]);
991 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 991 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
992 mutex_lock(&ss->hierarchy_mutex); 992 mutex_lock(&ss->hierarchy_mutex);
993 cgrp->subsys[i] = dummytop->subsys[i]; 993 cgrp->subsys[i] = dummytop->subsys[i];
994 cgrp->subsys[i]->cgroup = cgrp; 994 cgrp->subsys[i]->cgroup = cgrp;
995 list_move(&ss->sibling, &root->subsys_list); 995 list_move(&ss->sibling, &root->subsys_list);
996 ss->root = root; 996 ss->root = root;
997 if (ss->bind) 997 if (ss->bind)
998 ss->bind(ss, cgrp); 998 ss->bind(ss, cgrp);
999 mutex_unlock(&ss->hierarchy_mutex); 999 mutex_unlock(&ss->hierarchy_mutex);
1000 /* refcount was already taken, and we're keeping it */ 1000 /* refcount was already taken, and we're keeping it */
1001 } else if (bit & removed_bits) { 1001 } else if (bit & removed_bits) {
1002 /* We're removing this subsystem */ 1002 /* We're removing this subsystem */
1003 BUG_ON(ss == NULL); 1003 BUG_ON(ss == NULL);
1004 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1004 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1005 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1005 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1006 mutex_lock(&ss->hierarchy_mutex); 1006 mutex_lock(&ss->hierarchy_mutex);
1007 if (ss->bind) 1007 if (ss->bind)
1008 ss->bind(ss, dummytop); 1008 ss->bind(ss, dummytop);
1009 dummytop->subsys[i]->cgroup = dummytop; 1009 dummytop->subsys[i]->cgroup = dummytop;
1010 cgrp->subsys[i] = NULL; 1010 cgrp->subsys[i] = NULL;
1011 subsys[i]->root = &rootnode; 1011 subsys[i]->root = &rootnode;
1012 list_move(&ss->sibling, &rootnode.subsys_list); 1012 list_move(&ss->sibling, &rootnode.subsys_list);
1013 mutex_unlock(&ss->hierarchy_mutex); 1013 mutex_unlock(&ss->hierarchy_mutex);
1014 /* subsystem is now free - drop reference on module */ 1014 /* subsystem is now free - drop reference on module */
1015 module_put(ss->module); 1015 module_put(ss->module);
1016 } else if (bit & final_bits) { 1016 } else if (bit & final_bits) {
1017 /* Subsystem state should already exist */ 1017 /* Subsystem state should already exist */
1018 BUG_ON(ss == NULL); 1018 BUG_ON(ss == NULL);
1019 BUG_ON(!cgrp->subsys[i]); 1019 BUG_ON(!cgrp->subsys[i]);
1020 /* 1020 /*
1021 * a refcount was taken, but we already had one, so 1021 * a refcount was taken, but we already had one, so
1022 * drop the extra reference. 1022 * drop the extra reference.
1023 */ 1023 */
1024 module_put(ss->module); 1024 module_put(ss->module);
1025 #ifdef CONFIG_MODULE_UNLOAD 1025 #ifdef CONFIG_MODULE_UNLOAD
1026 BUG_ON(ss->module && !module_refcount(ss->module)); 1026 BUG_ON(ss->module && !module_refcount(ss->module));
1027 #endif 1027 #endif
1028 } else { 1028 } else {
1029 /* Subsystem state shouldn't exist */ 1029 /* Subsystem state shouldn't exist */
1030 BUG_ON(cgrp->subsys[i]); 1030 BUG_ON(cgrp->subsys[i]);
1031 } 1031 }
1032 } 1032 }
1033 root->subsys_bits = root->actual_subsys_bits = final_bits; 1033 root->subsys_bits = root->actual_subsys_bits = final_bits;
1034 synchronize_rcu(); 1034 synchronize_rcu();
1035 1035
1036 return 0; 1036 return 0;
1037 } 1037 }
1038 1038
1039 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1039 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 { 1040 {
1041 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1041 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
1042 struct cgroup_subsys *ss; 1042 struct cgroup_subsys *ss;
1043 1043
1044 mutex_lock(&cgroup_mutex); 1044 mutex_lock(&cgroup_mutex);
1045 for_each_subsys(root, ss) 1045 for_each_subsys(root, ss)
1046 seq_printf(seq, ",%s", ss->name); 1046 seq_printf(seq, ",%s", ss->name);
1047 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1047 if (test_bit(ROOT_NOPREFIX, &root->flags))
1048 seq_puts(seq, ",noprefix"); 1048 seq_puts(seq, ",noprefix");
1049 if (strlen(root->release_agent_path)) 1049 if (strlen(root->release_agent_path))
1050 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1050 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1051 if (clone_children(&root->top_cgroup)) 1051 if (clone_children(&root->top_cgroup))
1052 seq_puts(seq, ",clone_children"); 1052 seq_puts(seq, ",clone_children");
1053 if (strlen(root->name)) 1053 if (strlen(root->name))
1054 seq_printf(seq, ",name=%s", root->name); 1054 seq_printf(seq, ",name=%s", root->name);
1055 mutex_unlock(&cgroup_mutex); 1055 mutex_unlock(&cgroup_mutex);
1056 return 0; 1056 return 0;
1057 } 1057 }
1058 1058
1059 struct cgroup_sb_opts { 1059 struct cgroup_sb_opts {
1060 unsigned long subsys_bits; 1060 unsigned long subsys_bits;
1061 unsigned long flags; 1061 unsigned long flags;
1062 char *release_agent; 1062 char *release_agent;
1063 bool clone_children; 1063 bool clone_children;
1064 char *name; 1064 char *name;
1065 /* User explicitly requested empty subsystem */ 1065 /* User explicitly requested empty subsystem */
1066 bool none; 1066 bool none;
1067 1067
1068 struct cgroupfs_root *new_root; 1068 struct cgroupfs_root *new_root;
1069 1069
1070 }; 1070 };
1071 1071
1072 /* 1072 /*
1073 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1073 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
1074 * with cgroup_mutex held to protect the subsys[] array. This function takes 1074 * with cgroup_mutex held to protect the subsys[] array. This function takes
1075 * refcounts on subsystems to be used, unless it returns an error, in which case 1075 * refcounts on subsystems to be used, unless it returns an error, in which case
1076 * no refcounts are taken. 1076 * no refcounts are taken.
1077 */ 1077 */
1078 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1078 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1079 { 1079 {
1080 char *token, *o = data; 1080 char *token, *o = data;
1081 bool all_ss = false, one_ss = false; 1081 bool all_ss = false, one_ss = false;
1082 unsigned long mask = (unsigned long)-1; 1082 unsigned long mask = (unsigned long)-1;
1083 int i; 1083 int i;
1084 bool module_pin_failed = false; 1084 bool module_pin_failed = false;
1085 1085
1086 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1086 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1087 1087
1088 #ifdef CONFIG_CPUSETS 1088 #ifdef CONFIG_CPUSETS
1089 mask = ~(1UL << cpuset_subsys_id); 1089 mask = ~(1UL << cpuset_subsys_id);
1090 #endif 1090 #endif
1091 1091
1092 memset(opts, 0, sizeof(*opts)); 1092 memset(opts, 0, sizeof(*opts));
1093 1093
1094 while ((token = strsep(&o, ",")) != NULL) { 1094 while ((token = strsep(&o, ",")) != NULL) {
1095 if (!*token) 1095 if (!*token)
1096 return -EINVAL; 1096 return -EINVAL;
1097 if (!strcmp(token, "none")) { 1097 if (!strcmp(token, "none")) {
1098 /* Explicitly have no subsystems */ 1098 /* Explicitly have no subsystems */
1099 opts->none = true; 1099 opts->none = true;
1100 continue; 1100 continue;
1101 } 1101 }
1102 if (!strcmp(token, "all")) { 1102 if (!strcmp(token, "all")) {
1103 /* Mutually exclusive option 'all' + subsystem name */ 1103 /* Mutually exclusive option 'all' + subsystem name */
1104 if (one_ss) 1104 if (one_ss)
1105 return -EINVAL; 1105 return -EINVAL;
1106 all_ss = true; 1106 all_ss = true;
1107 continue; 1107 continue;
1108 } 1108 }
1109 if (!strcmp(token, "noprefix")) { 1109 if (!strcmp(token, "noprefix")) {
1110 set_bit(ROOT_NOPREFIX, &opts->flags); 1110 set_bit(ROOT_NOPREFIX, &opts->flags);
1111 continue; 1111 continue;
1112 } 1112 }
1113 if (!strcmp(token, "clone_children")) { 1113 if (!strcmp(token, "clone_children")) {
1114 opts->clone_children = true; 1114 opts->clone_children = true;
1115 continue; 1115 continue;
1116 } 1116 }
1117 if (!strncmp(token, "release_agent=", 14)) { 1117 if (!strncmp(token, "release_agent=", 14)) {
1118 /* Specifying two release agents is forbidden */ 1118 /* Specifying two release agents is forbidden */
1119 if (opts->release_agent) 1119 if (opts->release_agent)
1120 return -EINVAL; 1120 return -EINVAL;
1121 opts->release_agent = 1121 opts->release_agent =
1122 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1122 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1123 if (!opts->release_agent) 1123 if (!opts->release_agent)
1124 return -ENOMEM; 1124 return -ENOMEM;
1125 continue; 1125 continue;
1126 } 1126 }
1127 if (!strncmp(token, "name=", 5)) { 1127 if (!strncmp(token, "name=", 5)) {
1128 const char *name = token + 5; 1128 const char *name = token + 5;
1129 /* Can't specify an empty name */ 1129 /* Can't specify an empty name */
1130 if (!strlen(name)) 1130 if (!strlen(name))
1131 return -EINVAL; 1131 return -EINVAL;
1132 /* Must match [\w.-]+ */ 1132 /* Must match [\w.-]+ */
1133 for (i = 0; i < strlen(name); i++) { 1133 for (i = 0; i < strlen(name); i++) {
1134 char c = name[i]; 1134 char c = name[i];
1135 if (isalnum(c)) 1135 if (isalnum(c))
1136 continue; 1136 continue;
1137 if ((c == '.') || (c == '-') || (c == '_')) 1137 if ((c == '.') || (c == '-') || (c == '_'))
1138 continue; 1138 continue;
1139 return -EINVAL; 1139 return -EINVAL;
1140 } 1140 }
1141 /* Specifying two names is forbidden */ 1141 /* Specifying two names is forbidden */
1142 if (opts->name) 1142 if (opts->name)
1143 return -EINVAL; 1143 return -EINVAL;
1144 opts->name = kstrndup(name, 1144 opts->name = kstrndup(name,
1145 MAX_CGROUP_ROOT_NAMELEN - 1, 1145 MAX_CGROUP_ROOT_NAMELEN - 1,
1146 GFP_KERNEL); 1146 GFP_KERNEL);
1147 if (!opts->name) 1147 if (!opts->name)
1148 return -ENOMEM; 1148 return -ENOMEM;
1149 1149
1150 continue; 1150 continue;
1151 } 1151 }
1152 1152
1153 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1153 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1154 struct cgroup_subsys *ss = subsys[i]; 1154 struct cgroup_subsys *ss = subsys[i];
1155 if (ss == NULL) 1155 if (ss == NULL)
1156 continue; 1156 continue;
1157 if (strcmp(token, ss->name)) 1157 if (strcmp(token, ss->name))
1158 continue; 1158 continue;
1159 if (ss->disabled) 1159 if (ss->disabled)
1160 continue; 1160 continue;
1161 1161
1162 /* Mutually exclusive option 'all' + subsystem name */ 1162 /* Mutually exclusive option 'all' + subsystem name */
1163 if (all_ss) 1163 if (all_ss)
1164 return -EINVAL; 1164 return -EINVAL;
1165 set_bit(i, &opts->subsys_bits); 1165 set_bit(i, &opts->subsys_bits);
1166 one_ss = true; 1166 one_ss = true;
1167 1167
1168 break; 1168 break;
1169 } 1169 }
1170 if (i == CGROUP_SUBSYS_COUNT) 1170 if (i == CGROUP_SUBSYS_COUNT)
1171 return -ENOENT; 1171 return -ENOENT;
1172 } 1172 }
1173 1173
1174 /* 1174 /*
1175 * If the 'all' option was specified, select all the subsystems; 1175 * If the 'all' option was specified, select all the subsystems;
1176 * otherwise, if none of 'all', 'none' or a subsystem name was 1176 * otherwise, if none of 'all', 'none' or a subsystem name was
1177 * specified, default to 'all'. 1177 * specified, default to 'all'.
1178 */ 1178 */
1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1181 struct cgroup_subsys *ss = subsys[i]; 1181 struct cgroup_subsys *ss = subsys[i];
1182 if (ss == NULL) 1182 if (ss == NULL)
1183 continue; 1183 continue;
1184 if (ss->disabled) 1184 if (ss->disabled)
1185 continue; 1185 continue;
1186 set_bit(i, &opts->subsys_bits); 1186 set_bit(i, &opts->subsys_bits);
1187 } 1187 }
1188 } 1188 }
1189 1189
1190 /* Consistency checks */ 1190 /* Consistency checks */
1191 1191
1192 /* 1192 /*
1193 * Option noprefix was introduced just for backward compatibility 1193 * Option noprefix was introduced just for backward compatibility
1194 * with the old cpuset, so we allow noprefix only if mounting just 1194 * with the old cpuset, so we allow noprefix only if mounting just
1195 * the cpuset subsystem. 1195 * the cpuset subsystem.
1196 */ 1196 */
1197 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1197 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1198 (opts->subsys_bits & mask)) 1198 (opts->subsys_bits & mask))
1199 return -EINVAL; 1199 return -EINVAL;
1200 1200
1201 1201
1202 /* Can't specify "none" and some subsystems */ 1202 /* Can't specify "none" and some subsystems */
1203 if (opts->subsys_bits && opts->none) 1203 if (opts->subsys_bits && opts->none)
1204 return -EINVAL; 1204 return -EINVAL;
1205 1205
1206 /* 1206 /*
1207 * We either have to specify by name or by subsystems. (So all 1207 * We either have to specify by name or by subsystems. (So all
1208 * empty hierarchies must have a name). 1208 * empty hierarchies must have a name).
1209 */ 1209 */
1210 if (!opts->subsys_bits && !opts->name) 1210 if (!opts->subsys_bits && !opts->name)
1211 return -EINVAL; 1211 return -EINVAL;
1212 1212
1213 /* 1213 /*
1214 * Grab references on all the modules we'll need, so the subsystems 1214 * Grab references on all the modules we'll need, so the subsystems
1215 * don't dance around before rebind_subsystems attaches them. This may 1215 * don't dance around before rebind_subsystems attaches them. This may
1216 * take duplicate reference counts on a subsystem that's already used, 1216 * take duplicate reference counts on a subsystem that's already used,
1217 * but rebind_subsystems handles this case. 1217 * but rebind_subsystems handles this case.
1218 */ 1218 */
1219 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1219 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1220 unsigned long bit = 1UL << i; 1220 unsigned long bit = 1UL << i;
1221 1221
1222 if (!(bit & opts->subsys_bits)) 1222 if (!(bit & opts->subsys_bits))
1223 continue; 1223 continue;
1224 if (!try_module_get(subsys[i]->module)) { 1224 if (!try_module_get(subsys[i]->module)) {
1225 module_pin_failed = true; 1225 module_pin_failed = true;
1226 break; 1226 break;
1227 } 1227 }
1228 } 1228 }
1229 if (module_pin_failed) { 1229 if (module_pin_failed) {
1230 /* 1230 /*
1231 * oops, one of the modules was going away. this means that we 1231 * oops, one of the modules was going away. this means that we
1232 * raced with a module_delete call, and to the user this is 1232 * raced with a module_delete call, and to the user this is
1233 * essentially a "subsystem doesn't exist" case. 1233 * essentially a "subsystem doesn't exist" case.
1234 */ 1234 */
1235 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { 1235 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1236 /* drop refcounts only on the ones we took */ 1236 /* drop refcounts only on the ones we took */
1237 unsigned long bit = 1UL << i; 1237 unsigned long bit = 1UL << i;
1238 1238
1239 if (!(bit & opts->subsys_bits)) 1239 if (!(bit & opts->subsys_bits))
1240 continue; 1240 continue;
1241 module_put(subsys[i]->module); 1241 module_put(subsys[i]->module);
1242 } 1242 }
1243 return -ENOENT; 1243 return -ENOENT;
1244 } 1244 }
1245 1245
1246 return 0; 1246 return 0;
1247 } 1247 }
1248 1248
1249 static void drop_parsed_module_refcounts(unsigned long subsys_bits) 1249 static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1250 { 1250 {
1251 int i; 1251 int i;
1252 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1252 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1253 unsigned long bit = 1UL << i; 1253 unsigned long bit = 1UL << i;
1254 1254
1255 if (!(bit & subsys_bits)) 1255 if (!(bit & subsys_bits))
1256 continue; 1256 continue;
1257 module_put(subsys[i]->module); 1257 module_put(subsys[i]->module);
1258 } 1258 }
1259 } 1259 }
1260 1260
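As a hedged illustration of the contract of parse_cgroupfs_options() and drop_parsed_module_refcounts() above (example_parse() and the option string are hypothetical; a writable buffer is needed because strsep() modifies it): an option string such as "cpuset,noprefix,name=boot" leaves the cpuset bit set in opts.subsys_bits, ROOT_NOPREFIX set in opts.flags and opts.name == "boot", and the caller owns the kmalloc'ed strings and any module references taken.

static int example_parse(void)
{
	struct cgroup_sb_opts opts;
	char data[] = "cpuset,noprefix,name=boot";	/* writable copy for strsep() */
	int ret;

	mutex_lock(&cgroup_mutex);		/* required by parse_cgroupfs_options() */
	ret = parse_cgroupfs_options(data, &opts);
	mutex_unlock(&cgroup_mutex);

	if (!ret)
		/* we are not keeping any subsystems, so give the refs back */
		drop_parsed_module_refcounts(opts.subsys_bits);
	kfree(opts.release_agent);
	kfree(opts.name);
	return ret;
}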
1261 static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1261 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1262 { 1262 {
1263 int ret = 0; 1263 int ret = 0;
1264 struct cgroupfs_root *root = sb->s_fs_info; 1264 struct cgroupfs_root *root = sb->s_fs_info;
1265 struct cgroup *cgrp = &root->top_cgroup; 1265 struct cgroup *cgrp = &root->top_cgroup;
1266 struct cgroup_sb_opts opts; 1266 struct cgroup_sb_opts opts;
1267 1267
1268 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1269 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1270 1270
1271 /* See what subsystems are wanted */ 1271 /* See what subsystems are wanted */
1272 ret = parse_cgroupfs_options(data, &opts); 1272 ret = parse_cgroupfs_options(data, &opts);
1273 if (ret) 1273 if (ret)
1274 goto out_unlock; 1274 goto out_unlock;
1275 1275
1276 /* Don't allow flags or name to change at remount */ 1276 /* Don't allow flags or name to change at remount */
1277 if (opts.flags != root->flags || 1277 if (opts.flags != root->flags ||
1278 (opts.name && strcmp(opts.name, root->name))) { 1278 (opts.name && strcmp(opts.name, root->name))) {
1279 ret = -EINVAL; 1279 ret = -EINVAL;
1280 drop_parsed_module_refcounts(opts.subsys_bits); 1280 drop_parsed_module_refcounts(opts.subsys_bits);
1281 goto out_unlock; 1281 goto out_unlock;
1282 } 1282 }
1283 1283
1284 ret = rebind_subsystems(root, opts.subsys_bits); 1284 ret = rebind_subsystems(root, opts.subsys_bits);
1285 if (ret) { 1285 if (ret) {
1286 drop_parsed_module_refcounts(opts.subsys_bits); 1286 drop_parsed_module_refcounts(opts.subsys_bits);
1287 goto out_unlock; 1287 goto out_unlock;
1288 } 1288 }
1289 1289
1290 /* (re)populate subsystem files */ 1290 /* (re)populate subsystem files */
1291 cgroup_populate_dir(cgrp); 1291 cgroup_populate_dir(cgrp);
1292 1292
1293 if (opts.release_agent) 1293 if (opts.release_agent)
1294 strcpy(root->release_agent_path, opts.release_agent); 1294 strcpy(root->release_agent_path, opts.release_agent);
1295 out_unlock: 1295 out_unlock:
1296 kfree(opts.release_agent); 1296 kfree(opts.release_agent);
1297 kfree(opts.name); 1297 kfree(opts.name);
1298 mutex_unlock(&cgroup_mutex); 1298 mutex_unlock(&cgroup_mutex);
1299 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1299 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1300 return ret; 1300 return ret;
1301 } 1301 }
1302 1302
1303 static const struct super_operations cgroup_ops = { 1303 static const struct super_operations cgroup_ops = {
1304 .statfs = simple_statfs, 1304 .statfs = simple_statfs,
1305 .drop_inode = generic_delete_inode, 1305 .drop_inode = generic_delete_inode,
1306 .show_options = cgroup_show_options, 1306 .show_options = cgroup_show_options,
1307 .remount_fs = cgroup_remount, 1307 .remount_fs = cgroup_remount,
1308 }; 1308 };
1309 1309
1310 static void init_cgroup_housekeeping(struct cgroup *cgrp) 1310 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1311 { 1311 {
1312 INIT_LIST_HEAD(&cgrp->sibling); 1312 INIT_LIST_HEAD(&cgrp->sibling);
1313 INIT_LIST_HEAD(&cgrp->children); 1313 INIT_LIST_HEAD(&cgrp->children);
1314 INIT_LIST_HEAD(&cgrp->css_sets); 1314 INIT_LIST_HEAD(&cgrp->css_sets);
1315 INIT_LIST_HEAD(&cgrp->release_list); 1315 INIT_LIST_HEAD(&cgrp->release_list);
1316 INIT_LIST_HEAD(&cgrp->pidlists); 1316 INIT_LIST_HEAD(&cgrp->pidlists);
1317 mutex_init(&cgrp->pidlist_mutex); 1317 mutex_init(&cgrp->pidlist_mutex);
1318 INIT_LIST_HEAD(&cgrp->event_list); 1318 INIT_LIST_HEAD(&cgrp->event_list);
1319 spin_lock_init(&cgrp->event_list_lock); 1319 spin_lock_init(&cgrp->event_list_lock);
1320 } 1320 }
1321 1321
1322 static void init_cgroup_root(struct cgroupfs_root *root) 1322 static void init_cgroup_root(struct cgroupfs_root *root)
1323 { 1323 {
1324 struct cgroup *cgrp = &root->top_cgroup; 1324 struct cgroup *cgrp = &root->top_cgroup;
1325 INIT_LIST_HEAD(&root->subsys_list); 1325 INIT_LIST_HEAD(&root->subsys_list);
1326 INIT_LIST_HEAD(&root->root_list); 1326 INIT_LIST_HEAD(&root->root_list);
1327 root->number_of_cgroups = 1; 1327 root->number_of_cgroups = 1;
1328 cgrp->root = root; 1328 cgrp->root = root;
1329 cgrp->top_cgroup = cgrp; 1329 cgrp->top_cgroup = cgrp;
1330 init_cgroup_housekeeping(cgrp); 1330 init_cgroup_housekeeping(cgrp);
1331 } 1331 }
1332 1332
1333 static bool init_root_id(struct cgroupfs_root *root) 1333 static bool init_root_id(struct cgroupfs_root *root)
1334 { 1334 {
1335 int ret = 0; 1335 int ret = 0;
1336 1336
1337 do { 1337 do {
1338 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) 1338 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1339 return false; 1339 return false;
1340 spin_lock(&hierarchy_id_lock); 1340 spin_lock(&hierarchy_id_lock);
1341 /* Try to allocate the next unused ID */ 1341 /* Try to allocate the next unused ID */
1342 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, 1342 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1343 &root->hierarchy_id); 1343 &root->hierarchy_id);
1344 if (ret == -ENOSPC) 1344 if (ret == -ENOSPC)
1345 /* Try again starting from 0 */ 1345 /* Try again starting from 0 */
1346 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); 1346 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1347 if (!ret) { 1347 if (!ret) {
1348 next_hierarchy_id = root->hierarchy_id + 1; 1348 next_hierarchy_id = root->hierarchy_id + 1;
1349 } else if (ret != -EAGAIN) { 1349 } else if (ret != -EAGAIN) {
1350 /* Can only get here if the 31-bit IDR is full ... */ 1350 /* Can only get here if the 31-bit IDR is full ... */
1351 BUG_ON(ret); 1351 BUG_ON(ret);
1352 } 1352 }
1353 spin_unlock(&hierarchy_id_lock); 1353 spin_unlock(&hierarchy_id_lock);
1354 } while (ret); 1354 } while (ret);
1355 return true; 1355 return true;
1356 } 1356 }
1357 1357
1358 static int cgroup_test_super(struct super_block *sb, void *data) 1358 static int cgroup_test_super(struct super_block *sb, void *data)
1359 { 1359 {
1360 struct cgroup_sb_opts *opts = data; 1360 struct cgroup_sb_opts *opts = data;
1361 struct cgroupfs_root *root = sb->s_fs_info; 1361 struct cgroupfs_root *root = sb->s_fs_info;
1362 1362
1363 /* If we asked for a name then it must match */ 1363 /* If we asked for a name then it must match */
1364 if (opts->name && strcmp(opts->name, root->name)) 1364 if (opts->name && strcmp(opts->name, root->name))
1365 return 0; 1365 return 0;
1366 1366
1367 /* 1367 /*
1368 * If we asked for subsystems (or explicitly for no 1368 * If we asked for subsystems (or explicitly for no
1369 * subsystems) then they must match 1369 * subsystems) then they must match
1370 */ 1370 */
1371 if ((opts->subsys_bits || opts->none) 1371 if ((opts->subsys_bits || opts->none)
1372 && (opts->subsys_bits != root->subsys_bits)) 1372 && (opts->subsys_bits != root->subsys_bits))
1373 return 0; 1373 return 0;
1374 1374
1375 return 1; 1375 return 1;
1376 } 1376 }
1377 1377
1378 static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) 1378 static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1379 { 1379 {
1380 struct cgroupfs_root *root; 1380 struct cgroupfs_root *root;
1381 1381
1382 if (!opts->subsys_bits && !opts->none) 1382 if (!opts->subsys_bits && !opts->none)
1383 return NULL; 1383 return NULL;
1384 1384
1385 root = kzalloc(sizeof(*root), GFP_KERNEL); 1385 root = kzalloc(sizeof(*root), GFP_KERNEL);
1386 if (!root) 1386 if (!root)
1387 return ERR_PTR(-ENOMEM); 1387 return ERR_PTR(-ENOMEM);
1388 1388
1389 if (!init_root_id(root)) { 1389 if (!init_root_id(root)) {
1390 kfree(root); 1390 kfree(root);
1391 return ERR_PTR(-ENOMEM); 1391 return ERR_PTR(-ENOMEM);
1392 } 1392 }
1393 init_cgroup_root(root); 1393 init_cgroup_root(root);
1394 1394
1395 root->subsys_bits = opts->subsys_bits; 1395 root->subsys_bits = opts->subsys_bits;
1396 root->flags = opts->flags; 1396 root->flags = opts->flags;
1397 if (opts->release_agent) 1397 if (opts->release_agent)
1398 strcpy(root->release_agent_path, opts->release_agent); 1398 strcpy(root->release_agent_path, opts->release_agent);
1399 if (opts->name) 1399 if (opts->name)
1400 strcpy(root->name, opts->name); 1400 strcpy(root->name, opts->name);
1401 if (opts->clone_children) 1401 if (opts->clone_children)
1402 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1402 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1403 return root; 1403 return root;
1404 } 1404 }
1405 1405
1406 static void cgroup_drop_root(struct cgroupfs_root *root) 1406 static void cgroup_drop_root(struct cgroupfs_root *root)
1407 { 1407 {
1408 if (!root) 1408 if (!root)
1409 return; 1409 return;
1410 1410
1411 BUG_ON(!root->hierarchy_id); 1411 BUG_ON(!root->hierarchy_id);
1412 spin_lock(&hierarchy_id_lock); 1412 spin_lock(&hierarchy_id_lock);
1413 ida_remove(&hierarchy_ida, root->hierarchy_id); 1413 ida_remove(&hierarchy_ida, root->hierarchy_id);
1414 spin_unlock(&hierarchy_id_lock); 1414 spin_unlock(&hierarchy_id_lock);
1415 kfree(root); 1415 kfree(root);
1416 } 1416 }
1417 1417
1418 static int cgroup_set_super(struct super_block *sb, void *data) 1418 static int cgroup_set_super(struct super_block *sb, void *data)
1419 { 1419 {
1420 int ret; 1420 int ret;
1421 struct cgroup_sb_opts *opts = data; 1421 struct cgroup_sb_opts *opts = data;
1422 1422
1423 /* If we don't have a new root, we can't set up a new sb */ 1423 /* If we don't have a new root, we can't set up a new sb */
1424 if (!opts->new_root) 1424 if (!opts->new_root)
1425 return -EINVAL; 1425 return -EINVAL;
1426 1426
1427 BUG_ON(!opts->subsys_bits && !opts->none); 1427 BUG_ON(!opts->subsys_bits && !opts->none);
1428 1428
1429 ret = set_anon_super(sb, NULL); 1429 ret = set_anon_super(sb, NULL);
1430 if (ret) 1430 if (ret)
1431 return ret; 1431 return ret;
1432 1432
1433 sb->s_fs_info = opts->new_root; 1433 sb->s_fs_info = opts->new_root;
1434 opts->new_root->sb = sb; 1434 opts->new_root->sb = sb;
1435 1435
1436 sb->s_blocksize = PAGE_CACHE_SIZE; 1436 sb->s_blocksize = PAGE_CACHE_SIZE;
1437 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1437 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1438 sb->s_magic = CGROUP_SUPER_MAGIC; 1438 sb->s_magic = CGROUP_SUPER_MAGIC;
1439 sb->s_op = &cgroup_ops; 1439 sb->s_op = &cgroup_ops;
1440 1440
1441 return 0; 1441 return 0;
1442 } 1442 }
1443 1443
1444 static int cgroup_get_rootdir(struct super_block *sb) 1444 static int cgroup_get_rootdir(struct super_block *sb)
1445 { 1445 {
1446 static const struct dentry_operations cgroup_dops = { 1446 static const struct dentry_operations cgroup_dops = {
1447 .d_iput = cgroup_diput, 1447 .d_iput = cgroup_diput,
1448 .d_delete = cgroup_delete, 1448 .d_delete = cgroup_delete,
1449 }; 1449 };
1450 1450
1451 struct inode *inode = 1451 struct inode *inode =
1452 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1452 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1453 struct dentry *dentry; 1453 struct dentry *dentry;
1454 1454
1455 if (!inode) 1455 if (!inode)
1456 return -ENOMEM; 1456 return -ENOMEM;
1457 1457
1458 inode->i_fop = &simple_dir_operations; 1458 inode->i_fop = &simple_dir_operations;
1459 inode->i_op = &cgroup_dir_inode_operations; 1459 inode->i_op = &cgroup_dir_inode_operations;
1460 /* directories start off with i_nlink == 2 (for "." entry) */ 1460 /* directories start off with i_nlink == 2 (for "." entry) */
1461 inc_nlink(inode); 1461 inc_nlink(inode);
1462 dentry = d_alloc_root(inode); 1462 dentry = d_alloc_root(inode);
1463 if (!dentry) { 1463 if (!dentry) {
1464 iput(inode); 1464 iput(inode);
1465 return -ENOMEM; 1465 return -ENOMEM;
1466 } 1466 }
1467 sb->s_root = dentry; 1467 sb->s_root = dentry;
1468 /* for everything else we want ->d_op set */ 1468 /* for everything else we want ->d_op set */
1469 sb->s_d_op = &cgroup_dops; 1469 sb->s_d_op = &cgroup_dops;
1470 return 0; 1470 return 0;
1471 } 1471 }
1472 1472
1473 static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1473 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1474 int flags, const char *unused_dev_name, 1474 int flags, const char *unused_dev_name,
1475 void *data) 1475 void *data)
1476 { 1476 {
1477 struct cgroup_sb_opts opts; 1477 struct cgroup_sb_opts opts;
1478 struct cgroupfs_root *root; 1478 struct cgroupfs_root *root;
1479 int ret = 0; 1479 int ret = 0;
1480 struct super_block *sb; 1480 struct super_block *sb;
1481 struct cgroupfs_root *new_root; 1481 struct cgroupfs_root *new_root;
1482 1482
1483 /* First find the desired set of subsystems */ 1483 /* First find the desired set of subsystems */
1484 mutex_lock(&cgroup_mutex); 1484 mutex_lock(&cgroup_mutex);
1485 ret = parse_cgroupfs_options(data, &opts); 1485 ret = parse_cgroupfs_options(data, &opts);
1486 mutex_unlock(&cgroup_mutex); 1486 mutex_unlock(&cgroup_mutex);
1487 if (ret) 1487 if (ret)
1488 goto out_err; 1488 goto out_err;
1489 1489
1490 /* 1490 /*
1491 * Allocate a new cgroup root. We may not need it if we're 1491 * Allocate a new cgroup root. We may not need it if we're
1492 * reusing an existing hierarchy. 1492 * reusing an existing hierarchy.
1493 */ 1493 */
1494 new_root = cgroup_root_from_opts(&opts); 1494 new_root = cgroup_root_from_opts(&opts);
1495 if (IS_ERR(new_root)) { 1495 if (IS_ERR(new_root)) {
1496 ret = PTR_ERR(new_root); 1496 ret = PTR_ERR(new_root);
1497 goto drop_modules; 1497 goto drop_modules;
1498 } 1498 }
1499 opts.new_root = new_root; 1499 opts.new_root = new_root;
1500 1500
1501 /* Locate an existing or new sb for this hierarchy */ 1501 /* Locate an existing or new sb for this hierarchy */
1502 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); 1502 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1503 if (IS_ERR(sb)) { 1503 if (IS_ERR(sb)) {
1504 ret = PTR_ERR(sb); 1504 ret = PTR_ERR(sb);
1505 cgroup_drop_root(opts.new_root); 1505 cgroup_drop_root(opts.new_root);
1506 goto drop_modules; 1506 goto drop_modules;
1507 } 1507 }
1508 1508
1509 root = sb->s_fs_info; 1509 root = sb->s_fs_info;
1510 BUG_ON(!root); 1510 BUG_ON(!root);
1511 if (root == opts.new_root) { 1511 if (root == opts.new_root) {
1512 /* We used the new root structure, so this is a new hierarchy */ 1512 /* We used the new root structure, so this is a new hierarchy */
1513 struct list_head tmp_cg_links; 1513 struct list_head tmp_cg_links;
1514 struct cgroup *root_cgrp = &root->top_cgroup; 1514 struct cgroup *root_cgrp = &root->top_cgroup;
1515 struct inode *inode; 1515 struct inode *inode;
1516 struct cgroupfs_root *existing_root; 1516 struct cgroupfs_root *existing_root;
1517 int i; 1517 int i;
1518 1518
1519 BUG_ON(sb->s_root != NULL); 1519 BUG_ON(sb->s_root != NULL);
1520 1520
1521 ret = cgroup_get_rootdir(sb); 1521 ret = cgroup_get_rootdir(sb);
1522 if (ret) 1522 if (ret)
1523 goto drop_new_super; 1523 goto drop_new_super;
1524 inode = sb->s_root->d_inode; 1524 inode = sb->s_root->d_inode;
1525 1525
1526 mutex_lock(&inode->i_mutex); 1526 mutex_lock(&inode->i_mutex);
1527 mutex_lock(&cgroup_mutex); 1527 mutex_lock(&cgroup_mutex);
1528 1528
1529 if (strlen(root->name)) { 1529 if (strlen(root->name)) {
1530 /* Check for name clashes with existing mounts */ 1530 /* Check for name clashes with existing mounts */
1531 for_each_active_root(existing_root) { 1531 for_each_active_root(existing_root) {
1532 if (!strcmp(existing_root->name, root->name)) { 1532 if (!strcmp(existing_root->name, root->name)) {
1533 ret = -EBUSY; 1533 ret = -EBUSY;
1534 mutex_unlock(&cgroup_mutex); 1534 mutex_unlock(&cgroup_mutex);
1535 mutex_unlock(&inode->i_mutex); 1535 mutex_unlock(&inode->i_mutex);
1536 goto drop_new_super; 1536 goto drop_new_super;
1537 } 1537 }
1538 } 1538 }
1539 } 1539 }
1540 1540
1541 /* 1541 /*
1542 * We're accessing css_set_count without locking 1542 * We're accessing css_set_count without locking
1543 * css_set_lock here, but that's OK - it can only be 1543 * css_set_lock here, but that's OK - it can only be
1544 * increased by someone holding cgroup_lock, and 1544 * increased by someone holding cgroup_lock, and
1545 * that's us. The worst that can happen is that we 1545 * that's us. The worst that can happen is that we
1546 * have some link structures left over 1546 * have some link structures left over
1547 */ 1547 */
1548 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1548 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1549 if (ret) { 1549 if (ret) {
1550 mutex_unlock(&cgroup_mutex); 1550 mutex_unlock(&cgroup_mutex);
1551 mutex_unlock(&inode->i_mutex); 1551 mutex_unlock(&inode->i_mutex);
1552 goto drop_new_super; 1552 goto drop_new_super;
1553 } 1553 }
1554 1554
1555 ret = rebind_subsystems(root, root->subsys_bits); 1555 ret = rebind_subsystems(root, root->subsys_bits);
1556 if (ret == -EBUSY) { 1556 if (ret == -EBUSY) {
1557 mutex_unlock(&cgroup_mutex); 1557 mutex_unlock(&cgroup_mutex);
1558 mutex_unlock(&inode->i_mutex); 1558 mutex_unlock(&inode->i_mutex);
1559 free_cg_links(&tmp_cg_links); 1559 free_cg_links(&tmp_cg_links);
1560 goto drop_new_super; 1560 goto drop_new_super;
1561 } 1561 }
1562 /* 1562 /*
1563 * There must be no failure case after here, since rebinding 1563 * There must be no failure case after here, since rebinding
1564 * takes care of subsystems' refcounts, which are explicitly 1564 * takes care of subsystems' refcounts, which are explicitly
1565 * dropped in the failure exit path. 1565 * dropped in the failure exit path.
1566 */ 1566 */
1567 1567
1568 /* EBUSY should be the only error here */ 1568 /* EBUSY should be the only error here */
1569 BUG_ON(ret); 1569 BUG_ON(ret);
1570 1570
1571 list_add(&root->root_list, &roots); 1571 list_add(&root->root_list, &roots);
1572 root_count++; 1572 root_count++;
1573 1573
1574 sb->s_root->d_fsdata = root_cgrp; 1574 sb->s_root->d_fsdata = root_cgrp;
1575 root->top_cgroup.dentry = sb->s_root; 1575 root->top_cgroup.dentry = sb->s_root;
1576 1576
1577 /* Link the top cgroup in this hierarchy into all 1577 /* Link the top cgroup in this hierarchy into all
1578 * the css_set objects */ 1578 * the css_set objects */
1579 write_lock(&css_set_lock); 1579 write_lock(&css_set_lock);
1580 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1580 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
1581 struct hlist_head *hhead = &css_set_table[i]; 1581 struct hlist_head *hhead = &css_set_table[i];
1582 struct hlist_node *node; 1582 struct hlist_node *node;
1583 struct css_set *cg; 1583 struct css_set *cg;
1584 1584
1585 hlist_for_each_entry(cg, node, hhead, hlist) 1585 hlist_for_each_entry(cg, node, hhead, hlist)
1586 link_css_set(&tmp_cg_links, cg, root_cgrp); 1586 link_css_set(&tmp_cg_links, cg, root_cgrp);
1587 } 1587 }
1588 write_unlock(&css_set_lock); 1588 write_unlock(&css_set_lock);
1589 1589
1590 free_cg_links(&tmp_cg_links); 1590 free_cg_links(&tmp_cg_links);
1591 1591
1592 BUG_ON(!list_empty(&root_cgrp->sibling)); 1592 BUG_ON(!list_empty(&root_cgrp->sibling));
1593 BUG_ON(!list_empty(&root_cgrp->children)); 1593 BUG_ON(!list_empty(&root_cgrp->children));
1594 BUG_ON(root->number_of_cgroups != 1); 1594 BUG_ON(root->number_of_cgroups != 1);
1595 1595
1596 cgroup_populate_dir(root_cgrp); 1596 cgroup_populate_dir(root_cgrp);
1597 mutex_unlock(&cgroup_mutex); 1597 mutex_unlock(&cgroup_mutex);
1598 mutex_unlock(&inode->i_mutex); 1598 mutex_unlock(&inode->i_mutex);
1599 } else { 1599 } else {
1600 /* 1600 /*
1601 * We re-used an existing hierarchy - the new root (if 1601 * We re-used an existing hierarchy - the new root (if
1602 * any) is not needed 1602 * any) is not needed
1603 */ 1603 */
1604 cgroup_drop_root(opts.new_root); 1604 cgroup_drop_root(opts.new_root);
1605 /* no subsys rebinding, so refcounts don't change */ 1605 /* no subsys rebinding, so refcounts don't change */
1606 drop_parsed_module_refcounts(opts.subsys_bits); 1606 drop_parsed_module_refcounts(opts.subsys_bits);
1607 } 1607 }
1608 1608
1609 kfree(opts.release_agent); 1609 kfree(opts.release_agent);
1610 kfree(opts.name); 1610 kfree(opts.name);
1611 return dget(sb->s_root); 1611 return dget(sb->s_root);
1612 1612
1613 drop_new_super: 1613 drop_new_super:
1614 deactivate_locked_super(sb); 1614 deactivate_locked_super(sb);
1615 drop_modules: 1615 drop_modules:
1616 drop_parsed_module_refcounts(opts.subsys_bits); 1616 drop_parsed_module_refcounts(opts.subsys_bits);
1617 out_err: 1617 out_err:
1618 kfree(opts.release_agent); 1618 kfree(opts.release_agent);
1619 kfree(opts.name); 1619 kfree(opts.name);
1620 return ERR_PTR(ret); 1620 return ERR_PTR(ret);
1621 } 1621 }
1622 1622
1623 static void cgroup_kill_sb(struct super_block *sb) { 1623 static void cgroup_kill_sb(struct super_block *sb) {
1624 struct cgroupfs_root *root = sb->s_fs_info; 1624 struct cgroupfs_root *root = sb->s_fs_info;
1625 struct cgroup *cgrp = &root->top_cgroup; 1625 struct cgroup *cgrp = &root->top_cgroup;
1626 int ret; 1626 int ret;
1627 struct cg_cgroup_link *link; 1627 struct cg_cgroup_link *link;
1628 struct cg_cgroup_link *saved_link; 1628 struct cg_cgroup_link *saved_link;
1629 1629
1630 BUG_ON(!root); 1630 BUG_ON(!root);
1631 1631
1632 BUG_ON(root->number_of_cgroups != 1); 1632 BUG_ON(root->number_of_cgroups != 1);
1633 BUG_ON(!list_empty(&cgrp->children)); 1633 BUG_ON(!list_empty(&cgrp->children));
1634 BUG_ON(!list_empty(&cgrp->sibling)); 1634 BUG_ON(!list_empty(&cgrp->sibling));
1635 1635
1636 mutex_lock(&cgroup_mutex); 1636 mutex_lock(&cgroup_mutex);
1637 1637
1638 /* Rebind all subsystems back to the default hierarchy */ 1638 /* Rebind all subsystems back to the default hierarchy */
1639 ret = rebind_subsystems(root, 0); 1639 ret = rebind_subsystems(root, 0);
1640 /* Shouldn't be able to fail ... */ 1640 /* Shouldn't be able to fail ... */
1641 BUG_ON(ret); 1641 BUG_ON(ret);
1642 1642
1643 /* 1643 /*
1644 * Release all the links from css_sets to this hierarchy's 1644 * Release all the links from css_sets to this hierarchy's
1645 * root cgroup 1645 * root cgroup
1646 */ 1646 */
1647 write_lock(&css_set_lock); 1647 write_lock(&css_set_lock);
1648 1648
1649 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1649 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1650 cgrp_link_list) { 1650 cgrp_link_list) {
1651 list_del(&link->cg_link_list); 1651 list_del(&link->cg_link_list);
1652 list_del(&link->cgrp_link_list); 1652 list_del(&link->cgrp_link_list);
1653 kfree(link); 1653 kfree(link);
1654 } 1654 }
1655 write_unlock(&css_set_lock); 1655 write_unlock(&css_set_lock);
1656 1656
1657 if (!list_empty(&root->root_list)) { 1657 if (!list_empty(&root->root_list)) {
1658 list_del(&root->root_list); 1658 list_del(&root->root_list);
1659 root_count--; 1659 root_count--;
1660 } 1660 }
1661 1661
1662 mutex_unlock(&cgroup_mutex); 1662 mutex_unlock(&cgroup_mutex);
1663 1663
1664 kill_litter_super(sb); 1664 kill_litter_super(sb);
1665 cgroup_drop_root(root); 1665 cgroup_drop_root(root);
1666 } 1666 }
1667 1667
1668 static struct file_system_type cgroup_fs_type = { 1668 static struct file_system_type cgroup_fs_type = {
1669 .name = "cgroup", 1669 .name = "cgroup",
1670 .mount = cgroup_mount, 1670 .mount = cgroup_mount,
1671 .kill_sb = cgroup_kill_sb, 1671 .kill_sb = cgroup_kill_sb,
1672 }; 1672 };
1673 1673
1674 static struct kobject *cgroup_kobj; 1674 static struct kobject *cgroup_kobj;
1675 1675
1676 static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1676 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1677 { 1677 {
1678 return dentry->d_fsdata; 1678 return dentry->d_fsdata;
1679 } 1679 }
1680 1680
1681 static inline struct cftype *__d_cft(struct dentry *dentry) 1681 static inline struct cftype *__d_cft(struct dentry *dentry)
1682 { 1682 {
1683 return dentry->d_fsdata; 1683 return dentry->d_fsdata;
1684 } 1684 }
1685 1685
1686 /** 1686 /**
1687 * cgroup_path - generate the path of a cgroup 1687 * cgroup_path - generate the path of a cgroup
1688 * @cgrp: the cgroup in question 1688 * @cgrp: the cgroup in question
1689 * @buf: the buffer to write the path into 1689 * @buf: the buffer to write the path into
1690 * @buflen: the length of the buffer 1690 * @buflen: the length of the buffer
1691 * 1691 *
1692 * Called with cgroup_mutex held or else with an RCU-protected cgroup 1692 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1693 * reference. Writes path of cgroup into buf. Returns 0 on success, 1693 * reference. Writes path of cgroup into buf. Returns 0 on success,
1694 * -errno on error. 1694 * -errno on error.
1695 */ 1695 */
1696 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1696 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1697 { 1697 {
1698 char *start; 1698 char *start;
1699 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1699 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1700 rcu_read_lock_held() ||
1701 cgroup_lock_is_held()); 1700 cgroup_lock_is_held());
1702 1701
1703 if (!dentry || cgrp == dummytop) { 1702 if (!dentry || cgrp == dummytop) {
1704 /* 1703 /*
1705 * Inactive subsystems have no dentry for their root 1704 * Inactive subsystems have no dentry for their root
1706 * cgroup 1705 * cgroup
1707 */ 1706 */
1708 strcpy(buf, "/"); 1707 strcpy(buf, "/");
1709 return 0; 1708 return 0;
1710 } 1709 }
1711 1710
1712 start = buf + buflen; 1711 start = buf + buflen;
1713 1712
1714 *--start = '\0'; 1713 *--start = '\0';
1715 for (;;) { 1714 for (;;) {
1716 int len = dentry->d_name.len; 1715 int len = dentry->d_name.len;
1717 1716
1718 if ((start -= len) < buf) 1717 if ((start -= len) < buf)
1719 return -ENAMETOOLONG; 1718 return -ENAMETOOLONG;
1720 memcpy(start, dentry->d_name.name, len); 1719 memcpy(start, dentry->d_name.name, len);
1721 cgrp = cgrp->parent; 1720 cgrp = cgrp->parent;
1722 if (!cgrp) 1721 if (!cgrp)
1723 break; 1722 break;
1724 1723
1725 dentry = rcu_dereference_check(cgrp->dentry, 1724 dentry = rcu_dereference_check(cgrp->dentry,
1726 rcu_read_lock_held() ||
1727 cgroup_lock_is_held()); 1725 cgroup_lock_is_held());
1728 if (!cgrp->parent) 1726 if (!cgrp->parent)
1729 continue; 1727 continue;
1730 if (--start < buf) 1728 if (--start < buf)
1731 return -ENAMETOOLONG; 1729 return -ENAMETOOLONG;
1732 *start = '/'; 1730 *start = '/';
1733 } 1731 }
1734 memmove(buf, start, buf + buflen - start); 1732 memmove(buf, start, buf + buflen - start);
1735 return 0; 1733 return 0;
1736 } 1734 }
1737 EXPORT_SYMBOL_GPL(cgroup_path); 1735 EXPORT_SYMBOL_GPL(cgroup_path);
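The two hunks in cgroup_path() above are the pattern this commit applies tree-wide: since ca5ecddf, rcu_dereference_check() folds rcu_read_lock_held() into its lockdep condition, so callers only need to name the locks that are specific to them. A minimal sketch of the call-site change (shown out of context, purely for illustration):

	/*
	 * Before (redundant): the caller spelled out the RCU read lock itself.
	 *
	 *	dentry = rcu_dereference_check(cgrp->dentry,
	 *				       rcu_read_lock_held() ||
	 *				       cgroup_lock_is_held());
	 */

	/* After: rcu_read_lock_held() is implied by rcu_dereference_check(),
	 * so only the cgroup-specific condition remains.
	 */
	dentry = rcu_dereference_check(cgrp->dentry,
				       cgroup_lock_is_held());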
1738 1736
1739 /* 1737 /*
1740 * cgroup_task_migrate - move a task from one cgroup to another. 1738 * cgroup_task_migrate - move a task from one cgroup to another.
1741 * 1739 *
1742 * 'guarantee' is set if the caller promises that a new css_set for the task 1740 * 'guarantee' is set if the caller promises that a new css_set for the task
1743 * will already exist. If not set, this function might sleep, and can fail with 1741 * will already exist. If not set, this function might sleep, and can fail with
1744 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1742 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1745 */ 1743 */
1746 static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1744 static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1747 struct task_struct *tsk, bool guarantee) 1745 struct task_struct *tsk, bool guarantee)
1748 { 1746 {
1749 struct css_set *oldcg; 1747 struct css_set *oldcg;
1750 struct css_set *newcg; 1748 struct css_set *newcg;
1751 1749
1752 /* 1750 /*
1753 * get old css_set. we need to take task_lock and refcount it, because 1751 * get old css_set. we need to take task_lock and refcount it, because
1754 * an exiting task can change its css_set to init_css_set and drop its 1752 * an exiting task can change its css_set to init_css_set and drop its
1755 * old one without taking cgroup_mutex. 1753 * old one without taking cgroup_mutex.
1756 */ 1754 */
1757 task_lock(tsk); 1755 task_lock(tsk);
1758 oldcg = tsk->cgroups; 1756 oldcg = tsk->cgroups;
1759 get_css_set(oldcg); 1757 get_css_set(oldcg);
1760 task_unlock(tsk); 1758 task_unlock(tsk);
1761 1759
1762 /* locate or allocate a new css_set for this task. */ 1760 /* locate or allocate a new css_set for this task. */
1763 if (guarantee) { 1761 if (guarantee) {
1764 /* we know the css_set we want already exists. */ 1762 /* we know the css_set we want already exists. */
1765 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 1763 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1766 read_lock(&css_set_lock); 1764 read_lock(&css_set_lock);
1767 newcg = find_existing_css_set(oldcg, cgrp, template); 1765 newcg = find_existing_css_set(oldcg, cgrp, template);
1768 BUG_ON(!newcg); 1766 BUG_ON(!newcg);
1769 get_css_set(newcg); 1767 get_css_set(newcg);
1770 read_unlock(&css_set_lock); 1768 read_unlock(&css_set_lock);
1771 } else { 1769 } else {
1772 might_sleep(); 1770 might_sleep();
1773 /* find_css_set will give us newcg already referenced. */ 1771 /* find_css_set will give us newcg already referenced. */
1774 newcg = find_css_set(oldcg, cgrp); 1772 newcg = find_css_set(oldcg, cgrp);
1775 if (!newcg) { 1773 if (!newcg) {
1776 put_css_set(oldcg); 1774 put_css_set(oldcg);
1777 return -ENOMEM; 1775 return -ENOMEM;
1778 } 1776 }
1779 } 1777 }
1780 put_css_set(oldcg); 1778 put_css_set(oldcg);
1781 1779
1782 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ 1780 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1783 task_lock(tsk); 1781 task_lock(tsk);
1784 if (tsk->flags & PF_EXITING) { 1782 if (tsk->flags & PF_EXITING) {
1785 task_unlock(tsk); 1783 task_unlock(tsk);
1786 put_css_set(newcg); 1784 put_css_set(newcg);
1787 return -ESRCH; 1785 return -ESRCH;
1788 } 1786 }
1789 rcu_assign_pointer(tsk->cgroups, newcg); 1787 rcu_assign_pointer(tsk->cgroups, newcg);
1790 task_unlock(tsk); 1788 task_unlock(tsk);
1791 1789
1792 /* Update the css_set linked lists if we're using them */ 1790 /* Update the css_set linked lists if we're using them */
1793 write_lock(&css_set_lock); 1791 write_lock(&css_set_lock);
1794 if (!list_empty(&tsk->cg_list)) 1792 if (!list_empty(&tsk->cg_list))
1795 list_move(&tsk->cg_list, &newcg->tasks); 1793 list_move(&tsk->cg_list, &newcg->tasks);
1796 write_unlock(&css_set_lock); 1794 write_unlock(&css_set_lock);
1797 1795
1798 /* 1796 /*
1799 * We just gained a reference on oldcg by taking it from the task. As 1797 * We just gained a reference on oldcg by taking it from the task. As
1800 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1798 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1801 * it here; it will be freed under RCU. 1799 * it here; it will be freed under RCU.
1802 */ 1800 */
1803 put_css_set(oldcg); 1801 put_css_set(oldcg);
1804 1802
1805 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1803 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1806 return 0; 1804 return 0;
1807 } 1805 }
1808 1806
1809 /** 1807 /**
1810 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1808 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1811 * @cgrp: the cgroup the task is attaching to 1809 * @cgrp: the cgroup the task is attaching to
1812 * @tsk: the task to be attached 1810 * @tsk: the task to be attached
1813 * 1811 *
1814 * Call holding cgroup_mutex. May take task_lock of 1812 * Call holding cgroup_mutex. May take task_lock of
1815 * the task 'tsk' during call. 1813 * the task 'tsk' during call.
1816 */ 1814 */
1817 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1815 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1818 { 1816 {
1819 int retval; 1817 int retval;
1820 struct cgroup_subsys *ss, *failed_ss = NULL; 1818 struct cgroup_subsys *ss, *failed_ss = NULL;
1821 struct cgroup *oldcgrp; 1819 struct cgroup *oldcgrp;
1822 struct cgroupfs_root *root = cgrp->root; 1820 struct cgroupfs_root *root = cgrp->root;
1823 1821
1824 /* Nothing to do if the task is already in that cgroup */ 1822 /* Nothing to do if the task is already in that cgroup */
1825 oldcgrp = task_cgroup_from_root(tsk, root); 1823 oldcgrp = task_cgroup_from_root(tsk, root);
1826 if (cgrp == oldcgrp) 1824 if (cgrp == oldcgrp)
1827 return 0; 1825 return 0;
1828 1826
1829 for_each_subsys(root, ss) { 1827 for_each_subsys(root, ss) {
1830 if (ss->can_attach) { 1828 if (ss->can_attach) {
1831 retval = ss->can_attach(ss, cgrp, tsk); 1829 retval = ss->can_attach(ss, cgrp, tsk);
1832 if (retval) { 1830 if (retval) {
1833 /* 1831 /*
1834 * Remember on which subsystem the can_attach() 1832 * Remember on which subsystem the can_attach()
1835 * failed, so that we only call cancel_attach() 1833 * failed, so that we only call cancel_attach()
1836 * against the subsystems whose can_attach() 1834 * against the subsystems whose can_attach()
1837 * succeeded. (See below) 1835 * succeeded. (See below)
1838 */ 1836 */
1839 failed_ss = ss; 1837 failed_ss = ss;
1840 goto out; 1838 goto out;
1841 } 1839 }
1842 } 1840 }
1843 if (ss->can_attach_task) { 1841 if (ss->can_attach_task) {
1844 retval = ss->can_attach_task(cgrp, tsk); 1842 retval = ss->can_attach_task(cgrp, tsk);
1845 if (retval) { 1843 if (retval) {
1846 failed_ss = ss; 1844 failed_ss = ss;
1847 goto out; 1845 goto out;
1848 } 1846 }
1849 } 1847 }
1850 } 1848 }
1851 1849
1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1850 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1853 if (retval) 1851 if (retval)
1854 goto out; 1852 goto out;
1855 1853
1856 for_each_subsys(root, ss) { 1854 for_each_subsys(root, ss) {
1857 if (ss->pre_attach) 1855 if (ss->pre_attach)
1858 ss->pre_attach(cgrp); 1856 ss->pre_attach(cgrp);
1859 if (ss->attach_task) 1857 if (ss->attach_task)
1860 ss->attach_task(cgrp, tsk); 1858 ss->attach_task(cgrp, tsk);
1861 if (ss->attach) 1859 if (ss->attach)
1862 ss->attach(ss, cgrp, oldcgrp, tsk); 1860 ss->attach(ss, cgrp, oldcgrp, tsk);
1863 } 1861 }
1864 1862
1865 synchronize_rcu(); 1863 synchronize_rcu();
1866 1864
1867 /* 1865 /*
1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1866 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1869 * is no longer empty. 1867 * is no longer empty.
1870 */ 1868 */
1871 cgroup_wakeup_rmdir_waiter(cgrp); 1869 cgroup_wakeup_rmdir_waiter(cgrp);
1872 out: 1870 out:
1873 if (retval) { 1871 if (retval) {
1874 for_each_subsys(root, ss) { 1872 for_each_subsys(root, ss) {
1875 if (ss == failed_ss) 1873 if (ss == failed_ss)
1876 /* 1874 /*
1877 * This subsystem was the one that failed the 1875 * This subsystem was the one that failed the
1878 * can_attach() check earlier, so we don't need 1876 * can_attach() check earlier, so we don't need
1879 * to call cancel_attach() against it or any 1877 * to call cancel_attach() against it or any
1880 * remaining subsystems. 1878 * remaining subsystems.
1881 */ 1879 */
1882 break; 1880 break;
1883 if (ss->cancel_attach) 1881 if (ss->cancel_attach)
1884 ss->cancel_attach(ss, cgrp, tsk); 1882 ss->cancel_attach(ss, cgrp, tsk);
1885 } 1883 }
1886 } 1884 }
1887 return retval; 1885 return retval;
1888 } 1886 }
1889 1887
1890 /** 1888 /**
1891 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' 1889 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1892 * @from: attach to all cgroups of a given task 1890 * @from: attach to all cgroups of a given task
1893 * @tsk: the task to be attached 1891 * @tsk: the task to be attached
1894 */ 1892 */
1895 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 1893 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1896 { 1894 {
1897 struct cgroupfs_root *root; 1895 struct cgroupfs_root *root;
1898 int retval = 0; 1896 int retval = 0;
1899 1897
1900 cgroup_lock(); 1898 cgroup_lock();
1901 for_each_active_root(root) { 1899 for_each_active_root(root) {
1902 struct cgroup *from_cg = task_cgroup_from_root(from, root); 1900 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1903 1901
1904 retval = cgroup_attach_task(from_cg, tsk); 1902 retval = cgroup_attach_task(from_cg, tsk);
1905 if (retval) 1903 if (retval)
1906 break; 1904 break;
1907 } 1905 }
1908 cgroup_unlock(); 1906 cgroup_unlock();
1909 1907
1910 return retval; 1908 return retval;
1911 } 1909 }
1912 EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1910 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
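cgroup_attach_task_all() simply walks every active hierarchy under cgroup_lock() and reuses cgroup_attach_task() for each root. A hypothetical caller that wants a helper kernel thread to follow the current task's cgroups could use it as sketched below ('worker' is an illustrative task_struct pointer, not something defined in this file):

	/* Illustrative only: place 'worker' in all of current's cgroups. */
	int err = cgroup_attach_task_all(current, worker);

	if (err)
		printk(KERN_WARNING "attaching worker failed: %d\n", err);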
1913 1911
1914 /* 1912 /*
1915 * cgroup_attach_proc works in two stages, the first of which prefetches all 1913 * cgroup_attach_proc works in two stages, the first of which prefetches all
1916 * new css_sets needed (to make sure we have enough memory before committing 1914 * new css_sets needed (to make sure we have enough memory before committing
1917 * to the move) and stores them in a list of entries of the following type. 1915 * to the move) and stores them in a list of entries of the following type.
1918 * TODO: possible optimization: use css_set->rcu_head for chaining instead 1916 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1919 */ 1917 */
1920 struct cg_list_entry { 1918 struct cg_list_entry {
1921 struct css_set *cg; 1919 struct css_set *cg;
1922 struct list_head links; 1920 struct list_head links;
1923 }; 1921 };
1924 1922
1925 static bool css_set_check_fetched(struct cgroup *cgrp, 1923 static bool css_set_check_fetched(struct cgroup *cgrp,
1926 struct task_struct *tsk, struct css_set *cg, 1924 struct task_struct *tsk, struct css_set *cg,
1927 struct list_head *newcg_list) 1925 struct list_head *newcg_list)
1928 { 1926 {
1929 struct css_set *newcg; 1927 struct css_set *newcg;
1930 struct cg_list_entry *cg_entry; 1928 struct cg_list_entry *cg_entry;
1931 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 1929 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1932 1930
1933 read_lock(&css_set_lock); 1931 read_lock(&css_set_lock);
1934 newcg = find_existing_css_set(cg, cgrp, template); 1932 newcg = find_existing_css_set(cg, cgrp, template);
1935 if (newcg) 1933 if (newcg)
1936 get_css_set(newcg); 1934 get_css_set(newcg);
1937 read_unlock(&css_set_lock); 1935 read_unlock(&css_set_lock);
1938 1936
1939 /* doesn't exist at all? */ 1937 /* doesn't exist at all? */
1940 if (!newcg) 1938 if (!newcg)
1941 return false; 1939 return false;
1942 /* see if it's already in the list */ 1940 /* see if it's already in the list */
1943 list_for_each_entry(cg_entry, newcg_list, links) { 1941 list_for_each_entry(cg_entry, newcg_list, links) {
1944 if (cg_entry->cg == newcg) { 1942 if (cg_entry->cg == newcg) {
1945 put_css_set(newcg); 1943 put_css_set(newcg);
1946 return true; 1944 return true;
1947 } 1945 }
1948 } 1946 }
1949 1947
1950 /* not found */ 1948 /* not found */
1951 put_css_set(newcg); 1949 put_css_set(newcg);
1952 return false; 1950 return false;
1953 } 1951 }
1954 1952
1955 /* 1953 /*
1956 * Find the new css_set and store it in the list in preparation for moving the 1954 * Find the new css_set and store it in the list in preparation for moving the
1957 * given task to the given cgroup. Returns 0 or -ENOMEM. 1955 * given task to the given cgroup. Returns 0 or -ENOMEM.
1958 */ 1956 */
1959 static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, 1957 static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1960 struct list_head *newcg_list) 1958 struct list_head *newcg_list)
1961 { 1959 {
1962 struct css_set *newcg; 1960 struct css_set *newcg;
1963 struct cg_list_entry *cg_entry; 1961 struct cg_list_entry *cg_entry;
1964 1962
1965 /* ensure a new css_set will exist for this thread */ 1963 /* ensure a new css_set will exist for this thread */
1966 newcg = find_css_set(cg, cgrp); 1964 newcg = find_css_set(cg, cgrp);
1967 if (!newcg) 1965 if (!newcg)
1968 return -ENOMEM; 1966 return -ENOMEM;
1969 /* add it to the list */ 1967 /* add it to the list */
1970 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); 1968 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1971 if (!cg_entry) { 1969 if (!cg_entry) {
1972 put_css_set(newcg); 1970 put_css_set(newcg);
1973 return -ENOMEM; 1971 return -ENOMEM;
1974 } 1972 }
1975 cg_entry->cg = newcg; 1973 cg_entry->cg = newcg;
1976 list_add(&cg_entry->links, newcg_list); 1974 list_add(&cg_entry->links, newcg_list);
1977 return 0; 1975 return 0;
1978 } 1976 }
1979 1977
1980 /** 1978 /**
1981 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 1979 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1982 * @cgrp: the cgroup to attach to 1980 * @cgrp: the cgroup to attach to
1983 * @leader: the threadgroup leader task_struct of the group to be attached 1981 * @leader: the threadgroup leader task_struct of the group to be attached
1984 * 1982 *
1985 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 1983 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1986 * take task_lock of each thread in leader's threadgroup individually in turn. 1984 * take task_lock of each thread in leader's threadgroup individually in turn.
1987 */ 1985 */
1988 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1986 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1989 { 1987 {
1990 int retval, i, group_size; 1988 int retval, i, group_size;
1991 struct cgroup_subsys *ss, *failed_ss = NULL; 1989 struct cgroup_subsys *ss, *failed_ss = NULL;
1992 bool cancel_failed_ss = false; 1990 bool cancel_failed_ss = false;
1993 /* guaranteed to be initialized later, but the compiler needs this */ 1991 /* guaranteed to be initialized later, but the compiler needs this */
1994 struct cgroup *oldcgrp = NULL; 1992 struct cgroup *oldcgrp = NULL;
1995 struct css_set *oldcg; 1993 struct css_set *oldcg;
1996 struct cgroupfs_root *root = cgrp->root; 1994 struct cgroupfs_root *root = cgrp->root;
1997 /* threadgroup list cursor and array */ 1995 /* threadgroup list cursor and array */
1998 struct task_struct *tsk; 1996 struct task_struct *tsk;
1999 struct flex_array *group; 1997 struct flex_array *group;
2000 /* 1998 /*
2001 * we need to make sure we have css_sets for all the tasks we're 1999 * we need to make sure we have css_sets for all the tasks we're
2002 * going to move -before- we actually start moving them, so that in 2000 * going to move -before- we actually start moving them, so that in
2003 * case we get an ENOMEM we can bail out before making any changes. 2001 * case we get an ENOMEM we can bail out before making any changes.
2004 */ 2002 */
2005 struct list_head newcg_list; 2003 struct list_head newcg_list;
2006 struct cg_list_entry *cg_entry, *temp_nobe; 2004 struct cg_list_entry *cg_entry, *temp_nobe;
2007 2005
2008 /* 2006 /*
2009 * step 0: in order to do expensive, possibly blocking operations for 2007 * step 0: in order to do expensive, possibly blocking operations for
2010 * every thread, we cannot iterate the thread group list, since it needs 2008 * every thread, we cannot iterate the thread group list, since it needs
2011 * rcu or tasklist locked. instead, build an array of all threads in the 2009 * rcu or tasklist locked. instead, build an array of all threads in the
2012 * group - threadgroup_fork_lock prevents new threads from appearing, 2010 * group - threadgroup_fork_lock prevents new threads from appearing,
2013 * and if threads exit, this will just be an over-estimate. 2011 * and if threads exit, this will just be an over-estimate.
2014 */ 2012 */
2015 group_size = get_nr_threads(leader); 2013 group_size = get_nr_threads(leader);
2016 /* flex_array supports very large thread-groups better than kmalloc. */ 2014 /* flex_array supports very large thread-groups better than kmalloc. */
2017 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2015 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018 GFP_KERNEL); 2016 GFP_KERNEL);
2019 if (!group) 2017 if (!group)
2020 return -ENOMEM; 2018 return -ENOMEM;
2021 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2019 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2022 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); 2020 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023 if (retval) 2021 if (retval)
2024 goto out_free_group_list; 2022 goto out_free_group_list;
2025 2023
2026 /* prevent changes to the threadgroup list while we take a snapshot. */ 2024 /* prevent changes to the threadgroup list while we take a snapshot. */
2027 rcu_read_lock(); 2025 rcu_read_lock();
2028 if (!thread_group_leader(leader)) { 2026 if (!thread_group_leader(leader)) {
2029 /* 2027 /*
2030 * a race with de_thread from another thread's exec() may strip 2028 * a race with de_thread from another thread's exec() may strip
2031 * us of our leadership, making while_each_thread unsafe to use 2029 * us of our leadership, making while_each_thread unsafe to use
2032 * on this task. if this happens, there is no choice but to 2030 * on this task. if this happens, there is no choice but to
2033 * throw this task away and try again (from cgroup_procs_write); 2031 * throw this task away and try again (from cgroup_procs_write);
2034 * this is "double-double-toil-and-trouble-check locking". 2032 * this is "double-double-toil-and-trouble-check locking".
2035 */ 2033 */
2036 rcu_read_unlock(); 2034 rcu_read_unlock();
2037 retval = -EAGAIN; 2035 retval = -EAGAIN;
2038 goto out_free_group_list; 2036 goto out_free_group_list;
2039 } 2037 }
2040 /* take a reference on each task in the group to go in the array. */ 2038 /* take a reference on each task in the group to go in the array. */
2041 tsk = leader; 2039 tsk = leader;
2042 i = 0; 2040 i = 0;
2043 do { 2041 do {
2044 /* as per above, nr_threads may decrease, but not increase. */ 2042 /* as per above, nr_threads may decrease, but not increase. */
2045 BUG_ON(i >= group_size); 2043 BUG_ON(i >= group_size);
2046 get_task_struct(tsk); 2044 get_task_struct(tsk);
2047 /* 2045 /*
2048 * saying GFP_ATOMIC has no effect here because we did prealloc 2046 * saying GFP_ATOMIC has no effect here because we did prealloc
2049 * earlier, but it's good form to communicate our expectations. 2047 * earlier, but it's good form to communicate our expectations.
2050 */ 2048 */
2051 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2049 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052 BUG_ON(retval != 0); 2050 BUG_ON(retval != 0);
2053 i++; 2051 i++;
2054 } while_each_thread(leader, tsk); 2052 } while_each_thread(leader, tsk);
2055 /* remember the number of threads in the array for later. */ 2053 /* remember the number of threads in the array for later. */
2056 group_size = i; 2054 group_size = i;
2057 rcu_read_unlock(); 2055 rcu_read_unlock();
2058 2056
2059 /* 2057 /*
2060 * step 1: check that we can legitimately attach to the cgroup. 2058 * step 1: check that we can legitimately attach to the cgroup.
2061 */ 2059 */
2062 for_each_subsys(root, ss) { 2060 for_each_subsys(root, ss) {
2063 if (ss->can_attach) { 2061 if (ss->can_attach) {
2064 retval = ss->can_attach(ss, cgrp, leader); 2062 retval = ss->can_attach(ss, cgrp, leader);
2065 if (retval) { 2063 if (retval) {
2066 failed_ss = ss; 2064 failed_ss = ss;
2067 goto out_cancel_attach; 2065 goto out_cancel_attach;
2068 } 2066 }
2069 } 2067 }
2070 /* a callback to be run on every thread in the threadgroup. */ 2068 /* a callback to be run on every thread in the threadgroup. */
2071 if (ss->can_attach_task) { 2069 if (ss->can_attach_task) {
2072 /* run on each task in the threadgroup. */ 2070 /* run on each task in the threadgroup. */
2073 for (i = 0; i < group_size; i++) { 2071 for (i = 0; i < group_size; i++) {
2074 tsk = flex_array_get_ptr(group, i); 2072 tsk = flex_array_get_ptr(group, i);
2075 retval = ss->can_attach_task(cgrp, tsk); 2073 retval = ss->can_attach_task(cgrp, tsk);
2076 if (retval) { 2074 if (retval) {
2077 failed_ss = ss; 2075 failed_ss = ss;
2078 cancel_failed_ss = true; 2076 cancel_failed_ss = true;
2079 goto out_cancel_attach; 2077 goto out_cancel_attach;
2080 } 2078 }
2081 } 2079 }
2082 } 2080 }
2083 } 2081 }
2084 2082
2085 /* 2083 /*
2086 * step 2: make sure css_sets exist for all threads to be migrated. 2084 * step 2: make sure css_sets exist for all threads to be migrated.
2087 * we use find_css_set, which allocates a new one if necessary. 2085 * we use find_css_set, which allocates a new one if necessary.
2088 */ 2086 */
2089 INIT_LIST_HEAD(&newcg_list); 2087 INIT_LIST_HEAD(&newcg_list);
2090 for (i = 0; i < group_size; i++) { 2088 for (i = 0; i < group_size; i++) {
2091 tsk = flex_array_get_ptr(group, i); 2089 tsk = flex_array_get_ptr(group, i);
2092 /* nothing to do if this task is already in the cgroup */ 2090 /* nothing to do if this task is already in the cgroup */
2093 oldcgrp = task_cgroup_from_root(tsk, root); 2091 oldcgrp = task_cgroup_from_root(tsk, root);
2094 if (cgrp == oldcgrp) 2092 if (cgrp == oldcgrp)
2095 continue; 2093 continue;
2096 /* get old css_set pointer */ 2094 /* get old css_set pointer */
2097 task_lock(tsk); 2095 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) { 2096 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */ 2097 /* ignore this task if it's going away */
2100 task_unlock(tsk); 2098 task_unlock(tsk);
2101 continue; 2099 continue;
2102 } 2100 }
2103 oldcg = tsk->cgroups; 2101 oldcg = tsk->cgroups;
2104 get_css_set(oldcg); 2102 get_css_set(oldcg);
2105 task_unlock(tsk); 2103 task_unlock(tsk);
2106 /* see if the new one for us is already in the list? */ 2104 /* see if the new one for us is already in the list? */
2107 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { 2105 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108 /* was already there, nothing to do. */ 2106 /* was already there, nothing to do. */
2109 put_css_set(oldcg); 2107 put_css_set(oldcg);
2110 } else { 2108 } else {
2111 /* we don't already have it. get new one. */ 2109 /* we don't already have it. get new one. */
2112 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2110 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113 put_css_set(oldcg); 2111 put_css_set(oldcg);
2114 if (retval) 2112 if (retval)
2115 goto out_list_teardown; 2113 goto out_list_teardown;
2116 } 2114 }
2117 } 2115 }
2118 2116
2119 /* 2117 /*
2120 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2118 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2121 * to move all tasks to the new cgroup, calling ss->attach_task for each 2119 * to move all tasks to the new cgroup, calling ss->attach_task for each
2122 * one along the way. there are no failure cases after here, so this is 2120 * one along the way. there are no failure cases after here, so this is
2123 * the commit point. 2121 * the commit point.
2124 */ 2122 */
2125 for_each_subsys(root, ss) { 2123 for_each_subsys(root, ss) {
2126 if (ss->pre_attach) 2124 if (ss->pre_attach)
2127 ss->pre_attach(cgrp); 2125 ss->pre_attach(cgrp);
2128 } 2126 }
2129 for (i = 0; i < group_size; i++) { 2127 for (i = 0; i < group_size; i++) {
2130 tsk = flex_array_get_ptr(group, i); 2128 tsk = flex_array_get_ptr(group, i);
2131 /* leave current thread as it is if it's already there */ 2129 /* leave current thread as it is if it's already there */
2132 oldcgrp = task_cgroup_from_root(tsk, root); 2130 oldcgrp = task_cgroup_from_root(tsk, root);
2133 if (cgrp == oldcgrp) 2131 if (cgrp == oldcgrp)
2134 continue; 2132 continue;
2135 /* attach each task to each subsystem */ 2133 /* attach each task to each subsystem */
2136 for_each_subsys(root, ss) { 2134 for_each_subsys(root, ss) {
2137 if (ss->attach_task) 2135 if (ss->attach_task)
2138 ss->attach_task(cgrp, tsk); 2136 ss->attach_task(cgrp, tsk);
2139 } 2137 }
2140 /* if the thread is PF_EXITING, it can just get skipped. */ 2138 /* if the thread is PF_EXITING, it can just get skipped. */
2141 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); 2139 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142 BUG_ON(retval != 0 && retval != -ESRCH); 2140 BUG_ON(retval != 0 && retval != -ESRCH);
2143 } 2141 }
2144 /* nothing is sensitive to fork() after this point. */ 2142 /* nothing is sensitive to fork() after this point. */
2145 2143
2146 /* 2144 /*
2147 * step 4: do expensive, non-thread-specific subsystem callbacks. 2145 * step 4: do expensive, non-thread-specific subsystem callbacks.
2148 * TODO: if ever a subsystem needs to know the oldcgrp for each task 2146 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2149 * being moved, this call will need to be reworked to communicate that. 2147 * being moved, this call will need to be reworked to communicate that.
2150 */ 2148 */
2151 for_each_subsys(root, ss) { 2149 for_each_subsys(root, ss) {
2152 if (ss->attach) 2150 if (ss->attach)
2153 ss->attach(ss, cgrp, oldcgrp, leader); 2151 ss->attach(ss, cgrp, oldcgrp, leader);
2154 } 2152 }
2155 2153
2156 /* 2154 /*
2157 * step 5: success! and cleanup 2155 * step 5: success! and cleanup
2158 */ 2156 */
2159 synchronize_rcu(); 2157 synchronize_rcu();
2160 cgroup_wakeup_rmdir_waiter(cgrp); 2158 cgroup_wakeup_rmdir_waiter(cgrp);
2161 retval = 0; 2159 retval = 0;
2162 out_list_teardown: 2160 out_list_teardown:
2163 /* clean up the list of prefetched css_sets. */ 2161 /* clean up the list of prefetched css_sets. */
2164 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { 2162 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165 list_del(&cg_entry->links); 2163 list_del(&cg_entry->links);
2166 put_css_set(cg_entry->cg); 2164 put_css_set(cg_entry->cg);
2167 kfree(cg_entry); 2165 kfree(cg_entry);
2168 } 2166 }
2169 out_cancel_attach: 2167 out_cancel_attach:
2170 /* same deal as in cgroup_attach_task */ 2168 /* same deal as in cgroup_attach_task */
2171 if (retval) { 2169 if (retval) {
2172 for_each_subsys(root, ss) { 2170 for_each_subsys(root, ss) {
2173 if (ss == failed_ss) { 2171 if (ss == failed_ss) {
2174 if (cancel_failed_ss && ss->cancel_attach) 2172 if (cancel_failed_ss && ss->cancel_attach)
2175 ss->cancel_attach(ss, cgrp, leader); 2173 ss->cancel_attach(ss, cgrp, leader);
2176 break; 2174 break;
2177 } 2175 }
2178 if (ss->cancel_attach) 2176 if (ss->cancel_attach)
2179 ss->cancel_attach(ss, cgrp, leader); 2177 ss->cancel_attach(ss, cgrp, leader);
2180 } 2178 }
2181 } 2179 }
2182 /* clean up the array of referenced threads in the group. */ 2180 /* clean up the array of referenced threads in the group. */
2183 for (i = 0; i < group_size; i++) { 2181 for (i = 0; i < group_size; i++) {
2184 tsk = flex_array_get_ptr(group, i); 2182 tsk = flex_array_get_ptr(group, i);
2185 put_task_struct(tsk); 2183 put_task_struct(tsk);
2186 } 2184 }
2187 out_free_group_list: 2185 out_free_group_list:
2188 flex_array_free(group); 2186 flex_array_free(group);
2189 return retval; 2187 return retval;
2190 } 2188 }
2191 2189
2192 /* 2190 /*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the 2191 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take 2192 * function to attach either it or all tasks in its threadgroup. Will take
2195 * cgroup_mutex; may take task_lock of task. 2193 * cgroup_mutex; may take task_lock of task.
2196 */ 2194 */
2197 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2195 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2198 { 2196 {
2199 struct task_struct *tsk; 2197 struct task_struct *tsk;
2200 const struct cred *cred = current_cred(), *tcred; 2198 const struct cred *cred = current_cred(), *tcred;
2201 int ret; 2199 int ret;
2202 2200
2203 if (!cgroup_lock_live_group(cgrp)) 2201 if (!cgroup_lock_live_group(cgrp))
2204 return -ENODEV; 2202 return -ENODEV;
2205 2203
2206 if (pid) { 2204 if (pid) {
2207 rcu_read_lock(); 2205 rcu_read_lock();
2208 tsk = find_task_by_vpid(pid); 2206 tsk = find_task_by_vpid(pid);
2209 if (!tsk) { 2207 if (!tsk) {
2210 rcu_read_unlock(); 2208 rcu_read_unlock();
2211 cgroup_unlock(); 2209 cgroup_unlock();
2212 return -ESRCH; 2210 return -ESRCH;
2213 } 2211 }
2214 if (threadgroup) { 2212 if (threadgroup) {
2215 /* 2213 /*
2216 * RCU protects this access, since tsk was found in the 2214 * RCU protects this access, since tsk was found in the
2217 * tid map. a race with de_thread may cause group_leader 2215 * tid map. a race with de_thread may cause group_leader
2218 * to stop being the leader, but cgroup_attach_proc will 2216 * to stop being the leader, but cgroup_attach_proc will
2219 * detect it later. 2217 * detect it later.
2220 */ 2218 */
2221 tsk = tsk->group_leader; 2219 tsk = tsk->group_leader;
2222 } else if (tsk->flags & PF_EXITING) { 2220 } else if (tsk->flags & PF_EXITING) {
2223 /* optimization for the single-task-only case */ 2221 /* optimization for the single-task-only case */
2224 rcu_read_unlock(); 2222 rcu_read_unlock();
2225 cgroup_unlock(); 2223 cgroup_unlock();
2226 return -ESRCH; 2224 return -ESRCH;
2227 } 2225 }
2228 2226
2229 /* 2227 /*
2230 * even if we're attaching all tasks in the thread group, we 2228 * even if we're attaching all tasks in the thread group, we
2231 * only need to check permissions on one of them. 2229 * only need to check permissions on one of them.
2232 */ 2230 */
2233 tcred = __task_cred(tsk); 2231 tcred = __task_cred(tsk);
2234 if (cred->euid && 2232 if (cred->euid &&
2235 cred->euid != tcred->uid && 2233 cred->euid != tcred->uid &&
2236 cred->euid != tcred->suid) { 2234 cred->euid != tcred->suid) {
2237 rcu_read_unlock(); 2235 rcu_read_unlock();
2238 cgroup_unlock(); 2236 cgroup_unlock();
2239 return -EACCES; 2237 return -EACCES;
2240 } 2238 }
2241 get_task_struct(tsk); 2239 get_task_struct(tsk);
2242 rcu_read_unlock(); 2240 rcu_read_unlock();
2243 } else { 2241 } else {
2244 if (threadgroup) 2242 if (threadgroup)
2245 tsk = current->group_leader; 2243 tsk = current->group_leader;
2246 else 2244 else
2247 tsk = current; 2245 tsk = current;
2248 get_task_struct(tsk); 2246 get_task_struct(tsk);
2249 } 2247 }
2250 2248
2251 if (threadgroup) { 2249 if (threadgroup) {
2252 threadgroup_fork_write_lock(tsk); 2250 threadgroup_fork_write_lock(tsk);
2253 ret = cgroup_attach_proc(cgrp, tsk); 2251 ret = cgroup_attach_proc(cgrp, tsk);
2254 threadgroup_fork_write_unlock(tsk); 2252 threadgroup_fork_write_unlock(tsk);
2255 } else { 2253 } else {
2256 ret = cgroup_attach_task(cgrp, tsk); 2254 ret = cgroup_attach_task(cgrp, tsk);
2257 } 2255 }
2258 put_task_struct(tsk); 2256 put_task_struct(tsk);
2259 cgroup_unlock(); 2257 cgroup_unlock();
2260 return ret; 2258 return ret;
2261 } 2259 }
2262 2260
2263 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2261 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2264 { 2262 {
2265 return attach_task_by_pid(cgrp, pid, false); 2263 return attach_task_by_pid(cgrp, pid, false);
2266 } 2264 }
2267 2265
2268 static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2266 static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269 { 2267 {
2270 int ret; 2268 int ret;
2271 do { 2269 do {
2272 /* 2270 /*
2273 * attach_proc fails with -EAGAIN if threadgroup leadership 2271 * attach_proc fails with -EAGAIN if threadgroup leadership
2274 * changes in the middle of the operation, in which case we need 2272 * changes in the middle of the operation, in which case we need
2275 * to find the task_struct for the new leader and start over. 2273 * to find the task_struct for the new leader and start over.
2276 */ 2274 */
2277 ret = attach_task_by_pid(cgrp, tgid, true); 2275 ret = attach_task_by_pid(cgrp, tgid, true);
2278 } while (ret == -EAGAIN); 2276 } while (ret == -EAGAIN);
2279 return ret; 2277 return ret;
2280 } 2278 }
2281 2279
2282 /** 2280 /**
2283 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 2281 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2284 * @cgrp: the cgroup to be checked for liveness 2282 * @cgrp: the cgroup to be checked for liveness
2285 * 2283 *
2286 * On success, returns true; the lock should be later released with 2284 * On success, returns true; the lock should be later released with
2287 * cgroup_unlock(). On failure returns false with no lock held. 2285 * cgroup_unlock(). On failure returns false with no lock held.
2288 */ 2286 */
2289 bool cgroup_lock_live_group(struct cgroup *cgrp) 2287 bool cgroup_lock_live_group(struct cgroup *cgrp)
2290 { 2288 {
2291 mutex_lock(&cgroup_mutex); 2289 mutex_lock(&cgroup_mutex);
2292 if (cgroup_is_removed(cgrp)) { 2290 if (cgroup_is_removed(cgrp)) {
2293 mutex_unlock(&cgroup_mutex); 2291 mutex_unlock(&cgroup_mutex);
2294 return false; 2292 return false;
2295 } 2293 }
2296 return true; 2294 return true;
2297 } 2295 }
2298 EXPORT_SYMBOL_GPL(cgroup_lock_live_group); 2296 EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2299 2297
2300 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2298 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301 const char *buffer) 2299 const char *buffer)
2302 { 2300 {
2303 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2301 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2304 if (strlen(buffer) >= PATH_MAX) 2302 if (strlen(buffer) >= PATH_MAX)
2305 return -EINVAL; 2303 return -EINVAL;
2306 if (!cgroup_lock_live_group(cgrp)) 2304 if (!cgroup_lock_live_group(cgrp))
2307 return -ENODEV; 2305 return -ENODEV;
2308 strcpy(cgrp->root->release_agent_path, buffer); 2306 strcpy(cgrp->root->release_agent_path, buffer);
2309 cgroup_unlock(); 2307 cgroup_unlock();
2310 return 0; 2308 return 0;
2311 } 2309 }
2312 2310
2313 static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2311 static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2314 struct seq_file *seq) 2312 struct seq_file *seq)
2315 { 2313 {
2316 if (!cgroup_lock_live_group(cgrp)) 2314 if (!cgroup_lock_live_group(cgrp))
2317 return -ENODEV; 2315 return -ENODEV;
2318 seq_puts(seq, cgrp->root->release_agent_path); 2316 seq_puts(seq, cgrp->root->release_agent_path);
2319 seq_putc(seq, '\n'); 2317 seq_putc(seq, '\n');
2320 cgroup_unlock(); 2318 cgroup_unlock();
2321 return 0; 2319 return 0;
2322 } 2320 }
2323 2321
2324 /* A buffer size big enough for numbers or short strings */ 2322 /* A buffer size big enough for numbers or short strings */
2325 #define CGROUP_LOCAL_BUFFER_SIZE 64 2323 #define CGROUP_LOCAL_BUFFER_SIZE 64
2326 2324
2327 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2325 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2328 struct file *file, 2326 struct file *file,
2329 const char __user *userbuf, 2327 const char __user *userbuf,
2330 size_t nbytes, loff_t *unused_ppos) 2328 size_t nbytes, loff_t *unused_ppos)
2331 { 2329 {
2332 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2330 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2333 int retval = 0; 2331 int retval = 0;
2334 char *end; 2332 char *end;
2335 2333
2336 if (!nbytes) 2334 if (!nbytes)
2337 return -EINVAL; 2335 return -EINVAL;
2338 if (nbytes >= sizeof(buffer)) 2336 if (nbytes >= sizeof(buffer))
2339 return -E2BIG; 2337 return -E2BIG;
2340 if (copy_from_user(buffer, userbuf, nbytes)) 2338 if (copy_from_user(buffer, userbuf, nbytes))
2341 return -EFAULT; 2339 return -EFAULT;
2342 2340
2343 buffer[nbytes] = 0; /* nul-terminate */ 2341 buffer[nbytes] = 0; /* nul-terminate */
2344 if (cft->write_u64) { 2342 if (cft->write_u64) {
2345 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2343 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2346 if (*end) 2344 if (*end)
2347 return -EINVAL; 2345 return -EINVAL;
2348 retval = cft->write_u64(cgrp, cft, val); 2346 retval = cft->write_u64(cgrp, cft, val);
2349 } else { 2347 } else {
2350 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2348 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2351 if (*end) 2349 if (*end)
2352 return -EINVAL; 2350 return -EINVAL;
2353 retval = cft->write_s64(cgrp, cft, val); 2351 retval = cft->write_s64(cgrp, cft, val);
2354 } 2352 }
2355 if (!retval) 2353 if (!retval)
2356 retval = nbytes; 2354 retval = nbytes;
2357 return retval; 2355 return retval;
2358 } 2356 }
2359 2357
2360 static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2358 static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2361 struct file *file, 2359 struct file *file,
2362 const char __user *userbuf, 2360 const char __user *userbuf,
2363 size_t nbytes, loff_t *unused_ppos) 2361 size_t nbytes, loff_t *unused_ppos)
2364 { 2362 {
2365 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2363 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2366 int retval = 0; 2364 int retval = 0;
2367 size_t max_bytes = cft->max_write_len; 2365 size_t max_bytes = cft->max_write_len;
2368 char *buffer = local_buffer; 2366 char *buffer = local_buffer;
2369 2367
2370 if (!max_bytes) 2368 if (!max_bytes)
2371 max_bytes = sizeof(local_buffer) - 1; 2369 max_bytes = sizeof(local_buffer) - 1;
2372 if (nbytes >= max_bytes) 2370 if (nbytes >= max_bytes)
2373 return -E2BIG; 2371 return -E2BIG;
2374 /* Allocate a dynamic buffer if we need one */ 2372 /* Allocate a dynamic buffer if we need one */
2375 if (nbytes >= sizeof(local_buffer)) { 2373 if (nbytes >= sizeof(local_buffer)) {
2376 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2374 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
2377 if (buffer == NULL) 2375 if (buffer == NULL)
2378 return -ENOMEM; 2376 return -ENOMEM;
2379 } 2377 }
2380 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { 2378 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2381 retval = -EFAULT; 2379 retval = -EFAULT;
2382 goto out; 2380 goto out;
2383 } 2381 }
2384 2382
2385 buffer[nbytes] = 0; /* nul-terminate */ 2383 buffer[nbytes] = 0; /* nul-terminate */
2386 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2384 retval = cft->write_string(cgrp, cft, strstrip(buffer));
2387 if (!retval) 2385 if (!retval)
2388 retval = nbytes; 2386 retval = nbytes;
2389 out: 2387 out:
2390 if (buffer != local_buffer) 2388 if (buffer != local_buffer)
2391 kfree(buffer); 2389 kfree(buffer);
2392 return retval; 2390 return retval;
2393 } 2391 }
2394 2392
2395 static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2393 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2396 size_t nbytes, loff_t *ppos) 2394 size_t nbytes, loff_t *ppos)
2397 { 2395 {
2398 struct cftype *cft = __d_cft(file->f_dentry); 2396 struct cftype *cft = __d_cft(file->f_dentry);
2399 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2397 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2400 2398
2401 if (cgroup_is_removed(cgrp)) 2399 if (cgroup_is_removed(cgrp))
2402 return -ENODEV; 2400 return -ENODEV;
2403 if (cft->write) 2401 if (cft->write)
2404 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2402 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
2405 if (cft->write_u64 || cft->write_s64) 2403 if (cft->write_u64 || cft->write_s64)
2406 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2404 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
2407 if (cft->write_string) 2405 if (cft->write_string)
2408 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2406 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
2409 if (cft->trigger) { 2407 if (cft->trigger) {
2410 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2408 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
2411 return ret ? ret : nbytes; 2409 return ret ? ret : nbytes;
2412 } 2410 }
2413 return -EINVAL; 2411 return -EINVAL;
2414 } 2412 }
2415 2413
2416 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2414 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
2417 struct file *file, 2415 struct file *file,
2418 char __user *buf, size_t nbytes, 2416 char __user *buf, size_t nbytes,
2419 loff_t *ppos) 2417 loff_t *ppos)
2420 { 2418 {
2421 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2419 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2422 u64 val = cft->read_u64(cgrp, cft); 2420 u64 val = cft->read_u64(cgrp, cft);
2423 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2421 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2424 2422
2425 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2423 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2426 } 2424 }
2427 2425
2428 static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2426 static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
2429 struct file *file, 2427 struct file *file,
2430 char __user *buf, size_t nbytes, 2428 char __user *buf, size_t nbytes,
2431 loff_t *ppos) 2429 loff_t *ppos)
2432 { 2430 {
2433 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2431 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2434 s64 val = cft->read_s64(cgrp, cft); 2432 s64 val = cft->read_s64(cgrp, cft);
2435 int len = sprintf(tmp, "%lld\n", (long long) val); 2433 int len = sprintf(tmp, "%lld\n", (long long) val);
2436 2434
2437 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2435 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2438 } 2436 }
2439 2437
2440 static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2438 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2441 size_t nbytes, loff_t *ppos) 2439 size_t nbytes, loff_t *ppos)
2442 { 2440 {
2443 struct cftype *cft = __d_cft(file->f_dentry); 2441 struct cftype *cft = __d_cft(file->f_dentry);
2444 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2442 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2445 2443
2446 if (cgroup_is_removed(cgrp)) 2444 if (cgroup_is_removed(cgrp))
2447 return -ENODEV; 2445 return -ENODEV;
2448 2446
2449 if (cft->read) 2447 if (cft->read)
2450 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2448 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
2451 if (cft->read_u64) 2449 if (cft->read_u64)
2452 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2450 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
2453 if (cft->read_s64) 2451 if (cft->read_s64)
2454 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2452 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
2455 return -EINVAL; 2453 return -EINVAL;
2456 } 2454 }
2457 2455
2458 /* 2456 /*
2459 * seqfile ops/methods for returning structured data. Currently just 2457 * seqfile ops/methods for returning structured data. Currently just
2460 * supports string->u64 maps, but can be extended in future. 2458 * supports string->u64 maps, but can be extended in future.
2461 */ 2459 */
2462 2460
2463 struct cgroup_seqfile_state { 2461 struct cgroup_seqfile_state {
2464 struct cftype *cft; 2462 struct cftype *cft;
2465 struct cgroup *cgroup; 2463 struct cgroup *cgroup;
2466 }; 2464 };
2467 2465
2468 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2466 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2469 { 2467 {
2470 struct seq_file *sf = cb->state; 2468 struct seq_file *sf = cb->state;
2471 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); 2469 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2472 } 2470 }
2473 2471
2474 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2472 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2475 { 2473 {
2476 struct cgroup_seqfile_state *state = m->private; 2474 struct cgroup_seqfile_state *state = m->private;
2477 struct cftype *cft = state->cft; 2475 struct cftype *cft = state->cft;
2478 if (cft->read_map) { 2476 if (cft->read_map) {
2479 struct cgroup_map_cb cb = { 2477 struct cgroup_map_cb cb = {
2480 .fill = cgroup_map_add, 2478 .fill = cgroup_map_add,
2481 .state = m, 2479 .state = m,
2482 }; 2480 };
2483 return cft->read_map(state->cgroup, cft, &cb); 2481 return cft->read_map(state->cgroup, cft, &cb);
2484 } 2482 }
2485 return cft->read_seq_string(state->cgroup, cft, m); 2483 return cft->read_seq_string(state->cgroup, cft, m);
2486 } 2484 }
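cgroup_seqfile_show() above is what backs the read_map interface: the handler is given a cgroup_map_cb whose ->fill() callback (cgroup_map_add() here) emits one "key value" line per call. A sketch of a subsystem-side read_map handler written against that contract (the foo_* name and the values are hypothetical):

	static int foo_stat_read_map(struct cgroup *cgrp, struct cftype *cft,
				     struct cgroup_map_cb *cb)
	{
		/* Each fill() call becomes one "key value" line in the file. */
		cb->fill(cb, "hits", 42);
		cb->fill(cb, "misses", 7);
		return 0;
	}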
2487 2485
2488 static int cgroup_seqfile_release(struct inode *inode, struct file *file) 2486 static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2489 { 2487 {
2490 struct seq_file *seq = file->private_data; 2488 struct seq_file *seq = file->private_data;
2491 kfree(seq->private); 2489 kfree(seq->private);
2492 return single_release(inode, file); 2490 return single_release(inode, file);
2493 } 2491 }
2494 2492
2495 static const struct file_operations cgroup_seqfile_operations = { 2493 static const struct file_operations cgroup_seqfile_operations = {
2496 .read = seq_read, 2494 .read = seq_read,
2497 .write = cgroup_file_write, 2495 .write = cgroup_file_write,
2498 .llseek = seq_lseek, 2496 .llseek = seq_lseek,
2499 .release = cgroup_seqfile_release, 2497 .release = cgroup_seqfile_release,
2500 }; 2498 };
2501 2499
2502 static int cgroup_file_open(struct inode *inode, struct file *file) 2500 static int cgroup_file_open(struct inode *inode, struct file *file)
2503 { 2501 {
2504 int err; 2502 int err;
2505 struct cftype *cft; 2503 struct cftype *cft;
2506 2504
2507 err = generic_file_open(inode, file); 2505 err = generic_file_open(inode, file);
2508 if (err) 2506 if (err)
2509 return err; 2507 return err;
2510 cft = __d_cft(file->f_dentry); 2508 cft = __d_cft(file->f_dentry);
2511 2509
2512 if (cft->read_map || cft->read_seq_string) { 2510 if (cft->read_map || cft->read_seq_string) {
2513 struct cgroup_seqfile_state *state = 2511 struct cgroup_seqfile_state *state =
2514 kzalloc(sizeof(*state), GFP_USER); 2512 kzalloc(sizeof(*state), GFP_USER);
2515 if (!state) 2513 if (!state)
2516 return -ENOMEM; 2514 return -ENOMEM;
2517 state->cft = cft; 2515 state->cft = cft;
2518 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2516 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2519 file->f_op = &cgroup_seqfile_operations; 2517 file->f_op = &cgroup_seqfile_operations;
2520 err = single_open(file, cgroup_seqfile_show, state); 2518 err = single_open(file, cgroup_seqfile_show, state);
2521 if (err < 0) 2519 if (err < 0)
2522 kfree(state); 2520 kfree(state);
2523 } else if (cft->open) 2521 } else if (cft->open)
2524 err = cft->open(inode, file); 2522 err = cft->open(inode, file);
2525 else 2523 else
2526 err = 0; 2524 err = 0;
2527 2525
2528 return err; 2526 return err;
2529 } 2527 }
2530 2528
2531 static int cgroup_file_release(struct inode *inode, struct file *file) 2529 static int cgroup_file_release(struct inode *inode, struct file *file)
2532 { 2530 {
2533 struct cftype *cft = __d_cft(file->f_dentry); 2531 struct cftype *cft = __d_cft(file->f_dentry);
2534 if (cft->release) 2532 if (cft->release)
2535 return cft->release(inode, file); 2533 return cft->release(inode, file);
2536 return 0; 2534 return 0;
2537 } 2535 }
2538 2536
2539 /* 2537 /*
2540 * cgroup_rename - Only allow simple rename of directories in place. 2538 * cgroup_rename - Only allow simple rename of directories in place.
2541 */ 2539 */
2542 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2540 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2543 struct inode *new_dir, struct dentry *new_dentry) 2541 struct inode *new_dir, struct dentry *new_dentry)
2544 { 2542 {
2545 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2543 if (!S_ISDIR(old_dentry->d_inode->i_mode))
2546 return -ENOTDIR; 2544 return -ENOTDIR;
2547 if (new_dentry->d_inode) 2545 if (new_dentry->d_inode)
2548 return -EEXIST; 2546 return -EEXIST;
2549 if (old_dir != new_dir) 2547 if (old_dir != new_dir)
2550 return -EIO; 2548 return -EIO;
2551 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2549 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552 } 2550 }
2553 2551
2554 static const struct file_operations cgroup_file_operations = { 2552 static const struct file_operations cgroup_file_operations = {
2555 .read = cgroup_file_read, 2553 .read = cgroup_file_read,
2556 .write = cgroup_file_write, 2554 .write = cgroup_file_write,
2557 .llseek = generic_file_llseek, 2555 .llseek = generic_file_llseek,
2558 .open = cgroup_file_open, 2556 .open = cgroup_file_open,
2559 .release = cgroup_file_release, 2557 .release = cgroup_file_release,
2560 }; 2558 };
2561 2559
2562 static const struct inode_operations cgroup_dir_inode_operations = { 2560 static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = cgroup_lookup, 2561 .lookup = cgroup_lookup,
2564 .mkdir = cgroup_mkdir, 2562 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir, 2563 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename, 2564 .rename = cgroup_rename,
2567 }; 2565 };
2568 2566
2569 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 2567 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2570 { 2568 {
2571 if (dentry->d_name.len > NAME_MAX) 2569 if (dentry->d_name.len > NAME_MAX)
2572 return ERR_PTR(-ENAMETOOLONG); 2570 return ERR_PTR(-ENAMETOOLONG);
2573 d_add(dentry, NULL); 2571 d_add(dentry, NULL);
2574 return NULL; 2572 return NULL;
2575 } 2573 }
2576 2574
2577 /* 2575 /*
2578 * Check if a file is a control file 2576 * Check if a file is a control file
2579 */ 2577 */
2580 static inline struct cftype *__file_cft(struct file *file) 2578 static inline struct cftype *__file_cft(struct file *file)
2581 { 2579 {
2582 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) 2580 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2583 return ERR_PTR(-EINVAL); 2581 return ERR_PTR(-EINVAL);
2584 return __d_cft(file->f_dentry); 2582 return __d_cft(file->f_dentry);
2585 } 2583 }
2586 2584
2587 static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2585 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2588 struct super_block *sb) 2586 struct super_block *sb)
2589 { 2587 {
2590 struct inode *inode; 2588 struct inode *inode;
2591 2589
2592 if (!dentry) 2590 if (!dentry)
2593 return -ENOENT; 2591 return -ENOENT;
2594 if (dentry->d_inode) 2592 if (dentry->d_inode)
2595 return -EEXIST; 2593 return -EEXIST;
2596 2594
2597 inode = cgroup_new_inode(mode, sb); 2595 inode = cgroup_new_inode(mode, sb);
2598 if (!inode) 2596 if (!inode)
2599 return -ENOMEM; 2597 return -ENOMEM;
2600 2598
2601 if (S_ISDIR(mode)) { 2599 if (S_ISDIR(mode)) {
2602 inode->i_op = &cgroup_dir_inode_operations; 2600 inode->i_op = &cgroup_dir_inode_operations;
2603 inode->i_fop = &simple_dir_operations; 2601 inode->i_fop = &simple_dir_operations;
2604 2602
2605 /* start off with i_nlink == 2 (for "." entry) */ 2603 /* start off with i_nlink == 2 (for "." entry) */
2606 inc_nlink(inode); 2604 inc_nlink(inode);
2607 2605
2608 /* start with the directory inode held, so that we can 2606 /* start with the directory inode held, so that we can
2609 * populate it without racing with another mkdir */ 2607 * populate it without racing with another mkdir */
2610 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2608 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2611 } else if (S_ISREG(mode)) { 2609 } else if (S_ISREG(mode)) {
2612 inode->i_size = 0; 2610 inode->i_size = 0;
2613 inode->i_fop = &cgroup_file_operations; 2611 inode->i_fop = &cgroup_file_operations;
2614 } 2612 }
2615 d_instantiate(dentry, inode); 2613 d_instantiate(dentry, inode);
2616 dget(dentry); /* Extra count - pin the dentry in core */ 2614 dget(dentry); /* Extra count - pin the dentry in core */
2617 return 0; 2615 return 0;
2618 } 2616 }
2619 2617
2620 /* 2618 /*
2621 * cgroup_create_dir - create a directory for an object. 2619 * cgroup_create_dir - create a directory for an object.
2622 * @cgrp: the cgroup we create the directory for. It must have a valid 2620 * @cgrp: the cgroup we create the directory for. It must have a valid
2623 * ->parent field. And we are going to fill its ->dentry field. 2621 * ->parent field. And we are going to fill its ->dentry field.
2624 * @dentry: dentry of the new cgroup 2622 * @dentry: dentry of the new cgroup
2625 * @mode: mode to set on new directory. 2623 * @mode: mode to set on new directory.
2626 */ 2624 */
2627 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2625 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2628 mode_t mode) 2626 mode_t mode)
2629 { 2627 {
2630 struct dentry *parent; 2628 struct dentry *parent;
2631 int error = 0; 2629 int error = 0;
2632 2630
2633 parent = cgrp->parent->dentry; 2631 parent = cgrp->parent->dentry;
2634 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); 2632 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2635 if (!error) { 2633 if (!error) {
2636 dentry->d_fsdata = cgrp; 2634 dentry->d_fsdata = cgrp;
2637 inc_nlink(parent->d_inode); 2635 inc_nlink(parent->d_inode);
2638 rcu_assign_pointer(cgrp->dentry, dentry); 2636 rcu_assign_pointer(cgrp->dentry, dentry);
2639 dget(dentry); 2637 dget(dentry);
2640 } 2638 }
2641 dput(dentry); 2639 dput(dentry);
2642 2640
2643 return error; 2641 return error;
2644 } 2642 }
2645 2643
2646 /** 2644 /**
2647 * cgroup_file_mode - deduce file mode of a control file 2645 * cgroup_file_mode - deduce file mode of a control file
2648 * @cft: the control file in question 2646 * @cft: the control file in question
2649 * 2647 *
2650 * returns cft->mode if ->mode is not 0 2648 * returns cft->mode if ->mode is not 0
2651 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler 2649 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2652 * returns S_IRUGO if it has only a read handler 2650 * returns S_IRUGO if it has only a read handler
2653 * returns S_IWUSR if it has only a write handler 2651 * returns S_IWUSR if it has only a write handler
2654 */ 2652 */
2655 static mode_t cgroup_file_mode(const struct cftype *cft) 2653 static mode_t cgroup_file_mode(const struct cftype *cft)
2656 { 2654 {
2657 mode_t mode = 0; 2655 mode_t mode = 0;
2658 2656
2659 if (cft->mode) 2657 if (cft->mode)
2660 return cft->mode; 2658 return cft->mode;
2661 2659
2662 if (cft->read || cft->read_u64 || cft->read_s64 || 2660 if (cft->read || cft->read_u64 || cft->read_s64 ||
2663 cft->read_map || cft->read_seq_string) 2661 cft->read_map || cft->read_seq_string)
2664 mode |= S_IRUGO; 2662 mode |= S_IRUGO;
2665 2663
2666 if (cft->write || cft->write_u64 || cft->write_s64 || 2664 if (cft->write || cft->write_u64 || cft->write_s64 ||
2667 cft->write_string || cft->trigger) 2665 cft->write_string || cft->trigger)
2668 mode |= S_IWUSR; 2666 mode |= S_IWUSR;
2669 2667
2670 return mode; 2668 return mode;
2671 } 2669 }
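Worked example of the rule above: a control file with only a read handler ends up 0444, only a write handler 0200, and both 0644. A standalone userspace sketch of the same deduction (S_IRUGO is redefined locally because it is a kernel-only macro, and the toy struct merely stands in for struct cftype):

#include <stdio.h>
#include <sys/stat.h>

#define S_IRUGO	(S_IRUSR | S_IRGRP | S_IROTH)	/* 0444, as in the kernel */

struct toy_cft {			/* stand-in: only what the rule inspects */
	mode_t mode;
	int has_read;			/* any read/read_u64/read_s64/... handler */
	int has_write;			/* any write/write_u64/.../trigger handler */
};

static mode_t toy_file_mode(const struct toy_cft *cft)
{
	mode_t mode = 0;

	if (cft->mode)
		return cft->mode;	/* an explicit mode always wins */
	if (cft->has_read)
		mode |= S_IRUGO;
	if (cft->has_write)
		mode |= S_IWUSR;
	return mode;
}

int main(void)
{
	struct toy_cft ro = { 0, 1, 0 }, wo = { 0, 0, 1 }, rw = { 0, 1, 1 };

	printf("%o %o %o\n", (unsigned)toy_file_mode(&ro),
	       (unsigned)toy_file_mode(&wo), (unsigned)toy_file_mode(&rw));
	return 0;			/* prints: 444 200 644 */
}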
2672 2670
2673 int cgroup_add_file(struct cgroup *cgrp, 2671 int cgroup_add_file(struct cgroup *cgrp,
2674 struct cgroup_subsys *subsys, 2672 struct cgroup_subsys *subsys,
2675 const struct cftype *cft) 2673 const struct cftype *cft)
2676 { 2674 {
2677 struct dentry *dir = cgrp->dentry; 2675 struct dentry *dir = cgrp->dentry;
2678 struct dentry *dentry; 2676 struct dentry *dentry;
2679 int error; 2677 int error;
2680 mode_t mode; 2678 mode_t mode;
2681 2679
2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2680 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2681 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2684 strcpy(name, subsys->name); 2682 strcpy(name, subsys->name);
2685 strcat(name, "."); 2683 strcat(name, ".");
2686 } 2684 }
2687 strcat(name, cft->name); 2685 strcat(name, cft->name);
2688 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2686 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2689 dentry = lookup_one_len(name, dir, strlen(name)); 2687 dentry = lookup_one_len(name, dir, strlen(name));
2690 if (!IS_ERR(dentry)) { 2688 if (!IS_ERR(dentry)) {
2691 mode = cgroup_file_mode(cft); 2689 mode = cgroup_file_mode(cft);
2692 error = cgroup_create_file(dentry, mode | S_IFREG, 2690 error = cgroup_create_file(dentry, mode | S_IFREG,
2693 cgrp->root->sb); 2691 cgrp->root->sb);
2694 if (!error) 2692 if (!error)
2695 dentry->d_fsdata = (void *)cft; 2693 dentry->d_fsdata = (void *)cft;
2696 dput(dentry); 2694 dput(dentry);
2697 } else 2695 } else
2698 error = PTR_ERR(dentry); 2696 error = PTR_ERR(dentry);
2699 return error; 2697 return error;
2700 } 2698 }
2701 EXPORT_SYMBOL_GPL(cgroup_add_file); 2699 EXPORT_SYMBOL_GPL(cgroup_add_file);
2702 2700
2703 int cgroup_add_files(struct cgroup *cgrp, 2701 int cgroup_add_files(struct cgroup *cgrp,
2704 struct cgroup_subsys *subsys, 2702 struct cgroup_subsys *subsys,
2705 const struct cftype cft[], 2703 const struct cftype cft[],
2706 int count) 2704 int count)
2707 { 2705 {
2708 int i, err; 2706 int i, err;
2709 for (i = 0; i < count; i++) { 2707 for (i = 0; i < count; i++) {
2710 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2708 err = cgroup_add_file(cgrp, subsys, &cft[i]);
2711 if (err) 2709 if (err)
2712 return err; 2710 return err;
2713 } 2711 }
2714 return 0; 2712 return 0;
2715 } 2713 }
2716 EXPORT_SYMBOL_GPL(cgroup_add_files); 2714 EXPORT_SYMBOL_GPL(cgroup_add_files);
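A minimal sketch of how a controller might use the two helpers above to register its control files, for instance from its populate hook. The "foo" subsystem, foo_weight_read() and foo_populate() are hypothetical names for illustration, not part of this patch; the read_u64 signature matches the common handlers later in this file:

/* hypothetical controller code, not part of this patch */
static u64 foo_weight_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;			/* placeholder value */
}

static const struct cftype foo_files[] = {
	{
		.name = "weight",	/* appears as "foo.weight" unless noprefix */
		.read_u64 = foo_weight_read,
	},
};

static int foo_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	return cgroup_add_files(cgrp, ss, foo_files, ARRAY_SIZE(foo_files));
}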
2717 2715
2718 /** 2716 /**
2719 * cgroup_task_count - count the number of tasks in a cgroup. 2717 * cgroup_task_count - count the number of tasks in a cgroup.
2720 * @cgrp: the cgroup in question 2718 * @cgrp: the cgroup in question
2721 * 2719 *
2722 * Return the number of tasks in the cgroup. 2720 * Return the number of tasks in the cgroup.
2723 */ 2721 */
2724 int cgroup_task_count(const struct cgroup *cgrp) 2722 int cgroup_task_count(const struct cgroup *cgrp)
2725 { 2723 {
2726 int count = 0; 2724 int count = 0;
2727 struct cg_cgroup_link *link; 2725 struct cg_cgroup_link *link;
2728 2726
2729 read_lock(&css_set_lock); 2727 read_lock(&css_set_lock);
2730 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2728 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
2731 count += atomic_read(&link->cg->refcount); 2729 count += atomic_read(&link->cg->refcount);
2732 } 2730 }
2733 read_unlock(&css_set_lock); 2731 read_unlock(&css_set_lock);
2734 return count; 2732 return count;
2735 } 2733 }
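The count works because every css_set linked to the cgroup is shared by refcount tasks, so summing the refcounts over the cgroup's links yields the total. A toy standalone illustration of that sum (userspace C, invented numbers):

#include <stdio.h>

int main(void)
{
	/* three css_sets attached to one cgroup, used by 3, 5 and 1 task(s) */
	int refcount[] = { 3, 5, 1 };
	int i, count = 0;

	for (i = 0; i < 3; i++)
		count += refcount[i];
	printf("cgroup_task_count() would report %d\n", count);	/* 9 */
	return 0;
}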
2736 2734
2737 /* 2735 /*
2738 * Advance a list_head iterator. The iterator should be positioned at 2736 * Advance a list_head iterator. The iterator should be positioned at
2739 * the start of a css_set 2737 * the start of a css_set
2740 */ 2738 */
2741 static void cgroup_advance_iter(struct cgroup *cgrp, 2739 static void cgroup_advance_iter(struct cgroup *cgrp,
2742 struct cgroup_iter *it) 2740 struct cgroup_iter *it)
2743 { 2741 {
2744 struct list_head *l = it->cg_link; 2742 struct list_head *l = it->cg_link;
2745 struct cg_cgroup_link *link; 2743 struct cg_cgroup_link *link;
2746 struct css_set *cg; 2744 struct css_set *cg;
2747 2745
2748 /* Advance to the next non-empty css_set */ 2746 /* Advance to the next non-empty css_set */
2749 do { 2747 do {
2750 l = l->next; 2748 l = l->next;
2751 if (l == &cgrp->css_sets) { 2749 if (l == &cgrp->css_sets) {
2752 it->cg_link = NULL; 2750 it->cg_link = NULL;
2753 return; 2751 return;
2754 } 2752 }
2755 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2753 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
2756 cg = link->cg; 2754 cg = link->cg;
2757 } while (list_empty(&cg->tasks)); 2755 } while (list_empty(&cg->tasks));
2758 it->cg_link = l; 2756 it->cg_link = l;
2759 it->task = cg->tasks.next; 2757 it->task = cg->tasks.next;
2760 } 2758 }
2761 2759
2762 /* 2760 /*
2763 * To reduce the fork() overhead for systems that are not actually 2761 * To reduce the fork() overhead for systems that are not actually
2764 * using their cgroups capability, we don't maintain the lists running 2762 * using their cgroups capability, we don't maintain the lists running
2765 * through each css_set to its tasks until we see the list actually 2763 * through each css_set to its tasks until we see the list actually
2766 * used - in other words after the first call to cgroup_iter_start(). 2764 * used - in other words after the first call to cgroup_iter_start().
2767 * 2765 *
2768 * The tasklist_lock is not held here, as do_each_thread() and 2766 * The tasklist_lock is not held here, as do_each_thread() and
2769 * while_each_thread() are protected by RCU. 2767 * while_each_thread() are protected by RCU.
2770 */ 2768 */
2771 static void cgroup_enable_task_cg_lists(void) 2769 static void cgroup_enable_task_cg_lists(void)
2772 { 2770 {
2773 struct task_struct *p, *g; 2771 struct task_struct *p, *g;
2774 write_lock(&css_set_lock); 2772 write_lock(&css_set_lock);
2775 use_task_css_set_links = 1; 2773 use_task_css_set_links = 1;
2776 do_each_thread(g, p) { 2774 do_each_thread(g, p) {
2777 task_lock(p); 2775 task_lock(p);
2778 /* 2776 /*
2779 * We should check if the process is exiting, otherwise 2777 * We should check if the process is exiting, otherwise
2780 * it will race with cgroup_exit() in that the list 2778 * it will race with cgroup_exit() in that the list
2781 * entry won't be deleted though the process has exited. 2779 * entry won't be deleted though the process has exited.
2782 */ 2780 */
2783 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 2781 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2784 list_add(&p->cg_list, &p->cgroups->tasks); 2782 list_add(&p->cg_list, &p->cgroups->tasks);
2785 task_unlock(p); 2783 task_unlock(p);
2786 } while_each_thread(g, p); 2784 } while_each_thread(g, p);
2787 write_unlock(&css_set_lock); 2785 write_unlock(&css_set_lock);
2788 } 2786 }
2789 2787
2790 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2788 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2791 { 2789 {
2792 /* 2790 /*
2793 * The first time anyone tries to iterate across a cgroup, 2791 * The first time anyone tries to iterate across a cgroup,
2794 * we need to enable the list linking each css_set to its 2792 * we need to enable the list linking each css_set to its
2795 * tasks, and fix up all existing tasks. 2793 * tasks, and fix up all existing tasks.
2796 */ 2794 */
2797 if (!use_task_css_set_links) 2795 if (!use_task_css_set_links)
2798 cgroup_enable_task_cg_lists(); 2796 cgroup_enable_task_cg_lists();
2799 2797
2800 read_lock(&css_set_lock); 2798 read_lock(&css_set_lock);
2801 it->cg_link = &cgrp->css_sets; 2799 it->cg_link = &cgrp->css_sets;
2802 cgroup_advance_iter(cgrp, it); 2800 cgroup_advance_iter(cgrp, it);
2803 } 2801 }
2804 2802
2805 struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 2803 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2806 struct cgroup_iter *it) 2804 struct cgroup_iter *it)
2807 { 2805 {
2808 struct task_struct *res; 2806 struct task_struct *res;
2809 struct list_head *l = it->task; 2807 struct list_head *l = it->task;
2810 struct cg_cgroup_link *link; 2808 struct cg_cgroup_link *link;
2811 2809
2812 /* If the iterator cg is NULL, we have no tasks */ 2810 /* If the iterator cg is NULL, we have no tasks */
2813 if (!it->cg_link) 2811 if (!it->cg_link)
2814 return NULL; 2812 return NULL;
2815 res = list_entry(l, struct task_struct, cg_list); 2813 res = list_entry(l, struct task_struct, cg_list);
2816 /* Advance iterator to find next entry */ 2814 /* Advance iterator to find next entry */
2817 l = l->next; 2815 l = l->next;
2818 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 2816 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
2819 if (l == &link->cg->tasks) { 2817 if (l == &link->cg->tasks) {
2820 /* We reached the end of this task list - move on to 2818 /* We reached the end of this task list - move on to
2821 * the next cg_cgroup_link */ 2819 * the next cg_cgroup_link */
2822 cgroup_advance_iter(cgrp, it); 2820 cgroup_advance_iter(cgrp, it);
2823 } else { 2821 } else {
2824 it->task = l; 2822 it->task = l;
2825 } 2823 }
2826 return res; 2824 return res;
2827 } 2825 }
2828 2826
2829 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2827 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2830 { 2828 {
2831 read_unlock(&css_set_lock); 2829 read_unlock(&css_set_lock);
2832 } 2830 }
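The three functions above form the task-iteration API used by the callers later in this file; the canonical loop is sketched below. Because css_set_lock is read-held between _start and _end, the loop body must not sleep. do_something_with() is a placeholder:

/* illustrative pattern only; do_something_with() is a placeholder */
static void example_walk(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *p;

	cgroup_iter_start(cgrp, &it);
	while ((p = cgroup_iter_next(cgrp, &it)))
		do_something_with(p);		/* must not sleep */
	cgroup_iter_end(cgrp, &it);
}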
2833 2831
2834 static inline int started_after_time(struct task_struct *t1, 2832 static inline int started_after_time(struct task_struct *t1,
2835 struct timespec *time, 2833 struct timespec *time,
2836 struct task_struct *t2) 2834 struct task_struct *t2)
2837 { 2835 {
2838 int start_diff = timespec_compare(&t1->start_time, time); 2836 int start_diff = timespec_compare(&t1->start_time, time);
2839 if (start_diff > 0) { 2837 if (start_diff > 0) {
2840 return 1; 2838 return 1;
2841 } else if (start_diff < 0) { 2839 } else if (start_diff < 0) {
2842 return 0; 2840 return 0;
2843 } else { 2841 } else {
2844 /* 2842 /*
2845 * Arbitrarily, if two processes started at the same 2843 * Arbitrarily, if two processes started at the same
2846 * time, we'll say that the lower pointer value 2844 * time, we'll say that the lower pointer value
2847 * started first. Note that t2 may have exited by now 2845 * started first. Note that t2 may have exited by now
2848 * so this may not be a valid pointer any longer, but 2846 * so this may not be a valid pointer any longer, but
2849 * that's fine - it still serves to distinguish 2847 * that's fine - it still serves to distinguish
2850 * between two tasks started (effectively) simultaneously. 2848 * between two tasks started (effectively) simultaneously.
2851 */ 2849 */
2852 return t1 > t2; 2850 return t1 > t2;
2853 } 2851 }
2854 } 2852 }
2855 2853
2856 /* 2854 /*
2857 * This function is a callback from heap_insert() and is used to order 2855 * This function is a callback from heap_insert() and is used to order
2858 * the heap. 2856 * the heap.
2859 * In this case we order the heap in descending task start time. 2857 * In this case we order the heap in descending task start time.
2860 */ 2858 */
2861 static inline int started_after(void *p1, void *p2) 2859 static inline int started_after(void *p1, void *p2)
2862 { 2860 {
2863 struct task_struct *t1 = p1; 2861 struct task_struct *t1 = p1;
2864 struct task_struct *t2 = p2; 2862 struct task_struct *t2 = p2;
2865 return started_after_time(t1, &t2->start_time, t2); 2863 return started_after_time(t1, &t2->start_time, t2);
2866 } 2864 }
2867 2865
2868 /** 2866 /**
2869 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 2867 * cgroup_scan_tasks - iterate through all the tasks in a cgroup
2870 * @scan: struct cgroup_scanner containing arguments for the scan 2868 * @scan: struct cgroup_scanner containing arguments for the scan
2871 * 2869 *
2872 * Arguments include pointers to callback functions test_task() and 2870 * Arguments include pointers to callback functions test_task() and
2873 * process_task(). 2871 * process_task().
2874 * Iterate through all the tasks in a cgroup, calling test_task() for each, 2872 * Iterate through all the tasks in a cgroup, calling test_task() for each,
2875 * and if it returns true, call process_task() for it also. 2873 * and if it returns true, call process_task() for it also.
2876 * The test_task pointer may be NULL, meaning always true (select all tasks). 2874 * The test_task pointer may be NULL, meaning always true (select all tasks).
2877 * Effectively duplicates cgroup_iter_{start,next,end}() 2875 * Effectively duplicates cgroup_iter_{start,next,end}()
2878 * but does not lock css_set_lock for the call to process_task(). 2876 * but does not lock css_set_lock for the call to process_task().
2879 * The struct cgroup_scanner may be embedded in any structure of the caller's 2877 * The struct cgroup_scanner may be embedded in any structure of the caller's
2880 * creation. 2878 * creation.
2881 * It is guaranteed that process_task() will act on every task that 2879 * It is guaranteed that process_task() will act on every task that
2882 * is a member of the cgroup for the duration of this call. This 2880 * is a member of the cgroup for the duration of this call. This
2883 * function may or may not call process_task() for tasks that exit 2881 * function may or may not call process_task() for tasks that exit
2884 * or move to a different cgroup during the call, or are forked or 2882 * or move to a different cgroup during the call, or are forked or
2885 * move into the cgroup during the call. 2883 * move into the cgroup during the call.
2886 * 2884 *
2887 * Note that test_task() may be called with locks held, and may in some 2885 * Note that test_task() may be called with locks held, and may in some
2888 * situations be called multiple times for the same task, so it should 2886 * situations be called multiple times for the same task, so it should
2889 * be cheap. 2887 * be cheap.
2890 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 2888 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
2891 * pre-allocated and will be used for heap operations (and its "gt" member will 2889 * pre-allocated and will be used for heap operations (and its "gt" member will
2892 * be overwritten), else a temporary heap will be used (allocation of which 2890 * be overwritten), else a temporary heap will be used (allocation of which
2893 * may cause this function to fail). 2891 * may cause this function to fail).
2894 */ 2892 */
2895 int cgroup_scan_tasks(struct cgroup_scanner *scan) 2893 int cgroup_scan_tasks(struct cgroup_scanner *scan)
2896 { 2894 {
2897 int retval, i; 2895 int retval, i;
2898 struct cgroup_iter it; 2896 struct cgroup_iter it;
2899 struct task_struct *p, *dropped; 2897 struct task_struct *p, *dropped;
2900 /* Never dereference latest_task, since it's not refcounted */ 2898 /* Never dereference latest_task, since it's not refcounted */
2901 struct task_struct *latest_task = NULL; 2899 struct task_struct *latest_task = NULL;
2902 struct ptr_heap tmp_heap; 2900 struct ptr_heap tmp_heap;
2903 struct ptr_heap *heap; 2901 struct ptr_heap *heap;
2904 struct timespec latest_time = { 0, 0 }; 2902 struct timespec latest_time = { 0, 0 };
2905 2903
2906 if (scan->heap) { 2904 if (scan->heap) {
2907 /* The caller supplied our heap and pre-allocated its memory */ 2905 /* The caller supplied our heap and pre-allocated its memory */
2908 heap = scan->heap; 2906 heap = scan->heap;
2909 heap->gt = &started_after; 2907 heap->gt = &started_after;
2910 } else { 2908 } else {
2911 /* We need to allocate our own heap memory */ 2909 /* We need to allocate our own heap memory */
2912 heap = &tmp_heap; 2910 heap = &tmp_heap;
2913 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); 2911 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
2914 if (retval) 2912 if (retval)
2915 /* cannot allocate the heap */ 2913 /* cannot allocate the heap */
2916 return retval; 2914 return retval;
2917 } 2915 }
2918 2916
2919 again: 2917 again:
2920 /* 2918 /*
2921 * Scan tasks in the cgroup, using the scanner's "test_task" callback 2919 * Scan tasks in the cgroup, using the scanner's "test_task" callback
2922 * to determine which are of interest, and using the scanner's 2920 * to determine which are of interest, and using the scanner's
2923 * "process_task" callback to process any of them that need an update. 2921 * "process_task" callback to process any of them that need an update.
2924 * Since we don't want to hold any locks during the task updates, 2922 * Since we don't want to hold any locks during the task updates,
2925 * gather tasks to be processed in a heap structure. 2923 * gather tasks to be processed in a heap structure.
2926 * The heap is sorted by descending task start time. 2924 * The heap is sorted by descending task start time.
2927 * If the statically-sized heap fills up, we overflow tasks that 2925 * If the statically-sized heap fills up, we overflow tasks that
2928 * started later, and in future iterations only consider tasks that 2926 * started later, and in future iterations only consider tasks that
2929 * started after the latest task in the previous pass. This 2927 * started after the latest task in the previous pass. This
2930 * guarantees forward progress and that we don't miss any tasks. 2928 * guarantees forward progress and that we don't miss any tasks.
2931 */ 2929 */
2932 heap->size = 0; 2930 heap->size = 0;
2933 cgroup_iter_start(scan->cg, &it); 2931 cgroup_iter_start(scan->cg, &it);
2934 while ((p = cgroup_iter_next(scan->cg, &it))) { 2932 while ((p = cgroup_iter_next(scan->cg, &it))) {
2935 /* 2933 /*
2936 * Only affect tasks that qualify per the caller's callback, 2934 * Only affect tasks that qualify per the caller's callback,
2937 * if he provided one 2935 * if he provided one
2938 */ 2936 */
2939 if (scan->test_task && !scan->test_task(p, scan)) 2937 if (scan->test_task && !scan->test_task(p, scan))
2940 continue; 2938 continue;
2941 /* 2939 /*
2942 * Only process tasks that started after the last task 2940 * Only process tasks that started after the last task
2943 * we processed 2941 * we processed
2944 */ 2942 */
2945 if (!started_after_time(p, &latest_time, latest_task)) 2943 if (!started_after_time(p, &latest_time, latest_task))
2946 continue; 2944 continue;
2947 dropped = heap_insert(heap, p); 2945 dropped = heap_insert(heap, p);
2948 if (dropped == NULL) { 2946 if (dropped == NULL) {
2949 /* 2947 /*
2950 * The new task was inserted; the heap wasn't 2948 * The new task was inserted; the heap wasn't
2951 * previously full 2949 * previously full
2952 */ 2950 */
2953 get_task_struct(p); 2951 get_task_struct(p);
2954 } else if (dropped != p) { 2952 } else if (dropped != p) {
2955 /* 2953 /*
2956 * The new task was inserted, and pushed out a 2954 * The new task was inserted, and pushed out a
2957 * different task 2955 * different task
2958 */ 2956 */
2959 get_task_struct(p); 2957 get_task_struct(p);
2960 put_task_struct(dropped); 2958 put_task_struct(dropped);
2961 } 2959 }
2962 /* 2960 /*
2963 * Else the new task was newer than anything already in 2961 * Else the new task was newer than anything already in
2964 * the heap and wasn't inserted 2962 * the heap and wasn't inserted
2965 */ 2963 */
2966 } 2964 }
2967 cgroup_iter_end(scan->cg, &it); 2965 cgroup_iter_end(scan->cg, &it);
2968 2966
2969 if (heap->size) { 2967 if (heap->size) {
2970 for (i = 0; i < heap->size; i++) { 2968 for (i = 0; i < heap->size; i++) {
2971 struct task_struct *q = heap->ptrs[i]; 2969 struct task_struct *q = heap->ptrs[i];
2972 if (i == 0) { 2970 if (i == 0) {
2973 latest_time = q->start_time; 2971 latest_time = q->start_time;
2974 latest_task = q; 2972 latest_task = q;
2975 } 2973 }
2976 /* Process the task per the caller's callback */ 2974 /* Process the task per the caller's callback */
2977 scan->process_task(q, scan); 2975 scan->process_task(q, scan);
2978 put_task_struct(q); 2976 put_task_struct(q);
2979 } 2977 }
2980 /* 2978 /*
2981 * If we had to process any tasks at all, scan again 2979 * If we had to process any tasks at all, scan again
2982 * in case some of them were in the middle of forking 2980 * in case some of them were in the middle of forking
2983 * children that didn't get processed. 2981 * children that didn't get processed.
2984 * Not the most efficient way to do it, but it avoids 2982 * Not the most efficient way to do it, but it avoids
2985 * having to take callback_mutex in the fork path 2983 * having to take callback_mutex in the fork path
2986 */ 2984 */
2987 goto again; 2985 goto again;
2988 } 2986 }
2989 if (heap == &tmp_heap) 2987 if (heap == &tmp_heap)
2990 heap_free(&tmp_heap); 2988 heap_free(&tmp_heap);
2991 return 0; 2989 return 0;
2992 } 2990 }
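A hedged sketch of a caller of cgroup_scan_tasks(). The callback signatures are inferred from the calls above (test_task() returns nonzero to select a task; process_task() runs without css_set_lock held), and mark_task() is a placeholder:

/* illustrative caller, not part of this patch */
static int example_test(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_EXITING);	/* skip exiting tasks */
}

static void example_process(struct task_struct *p, struct cgroup_scanner *scan)
{
	mark_task(p);				/* placeholder; may take locks */
}

static int example_scan(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= example_test,
		.process_task	= example_process,
		.heap		= NULL,	/* let cgroup_scan_tasks() allocate one */
	};

	return cgroup_scan_tasks(&scan);
}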
2993 2991
2994 /* 2992 /*
2995 * Stuff for reading the 'tasks'/'procs' files. 2993 * Stuff for reading the 'tasks'/'procs' files.
2996 * 2994 *
2997 * Reading this file can return large amounts of data if a cgroup has 2995 * Reading this file can return large amounts of data if a cgroup has
2998 * *lots* of attached tasks. So it may need several calls to read(), 2996 * *lots* of attached tasks. So it may need several calls to read(),
2999 * but we cannot guarantee that the information we produce is correct 2997 * but we cannot guarantee that the information we produce is correct
3000 * unless we produce it entirely atomically. 2998 * unless we produce it entirely atomically.
3001 * 2999 *
3002 */ 3000 */
3003 3001
3004 /* 3002 /*
3005 * The following two functions "fix" the issue where there are more pids 3003 * The following two functions "fix" the issue where there are more pids
3006 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3004 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3007 * TODO: replace with a kernel-wide solution to this problem 3005 * TODO: replace with a kernel-wide solution to this problem
3008 */ 3006 */
3009 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) 3007 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3010 static void *pidlist_allocate(int count) 3008 static void *pidlist_allocate(int count)
3011 { 3009 {
3012 if (PIDLIST_TOO_LARGE(count)) 3010 if (PIDLIST_TOO_LARGE(count))
3013 return vmalloc(count * sizeof(pid_t)); 3011 return vmalloc(count * sizeof(pid_t));
3014 else 3012 else
3015 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3013 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3016 } 3014 }
3017 static void pidlist_free(void *p) 3015 static void pidlist_free(void *p)
3018 { 3016 {
3019 if (is_vmalloc_addr(p)) 3017 if (is_vmalloc_addr(p))
3020 vfree(p); 3018 vfree(p);
3021 else 3019 else
3022 kfree(p); 3020 kfree(p);
3023 } 3021 }
3024 static void *pidlist_resize(void *p, int newcount) 3022 static void *pidlist_resize(void *p, int newcount)
3025 { 3023 {
3026 void *newlist; 3024 void *newlist;
3027 /* note: if new alloc fails, old p will still be valid either way */ 3025 /* note: if new alloc fails, old p will still be valid either way */
3028 if (is_vmalloc_addr(p)) { 3026 if (is_vmalloc_addr(p)) {
3029 newlist = vmalloc(newcount * sizeof(pid_t)); 3027 newlist = vmalloc(newcount * sizeof(pid_t));
3030 if (!newlist) 3028 if (!newlist)
3031 return NULL; 3029 return NULL;
3032 memcpy(newlist, p, newcount * sizeof(pid_t)); 3030 memcpy(newlist, p, newcount * sizeof(pid_t));
3033 vfree(p); 3031 vfree(p);
3034 } else { 3032 } else {
3035 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); 3033 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3036 } 3034 }
3037 return newlist; 3035 return newlist;
3038 } 3036 }
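With the common 4 KiB page size and a 4-byte pid_t (both assumptions of this example, not guarantees), the cutoff above keeps lists of up to 2048 pids on kmalloc() and sends anything larger to vmalloc(). A standalone check of the arithmetic:

#include <stdio.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096				/* assumed for this example */
#endif
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(int) > (PAGE_SIZE * 2))

int main(void)
{
	printf("2048 pids -> %s\n", PIDLIST_TOO_LARGE(2048) ? "vmalloc" : "kmalloc");
	printf("2049 pids -> %s\n", PIDLIST_TOO_LARGE(2049) ? "vmalloc" : "kmalloc");
	return 0;
}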
3039 3037
3040 /* 3038 /*
3041 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3039 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3042 * If the new stripped list is sufficiently smaller and there's enough memory 3040 * If the new stripped list is sufficiently smaller and there's enough memory
3043 * to allocate a new buffer, will let go of the unneeded memory. Returns the 3041 * to allocate a new buffer, will let go of the unneeded memory. Returns the
3044 * number of unique elements. 3042 * number of unique elements.
3045 */ 3043 */
3046 /* is the size difference enough that we should re-allocate the array? */ 3044 /* is the size difference enough that we should re-allocate the array? */
3047 #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) 3045 #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3048 static int pidlist_uniq(pid_t **p, int length) 3046 static int pidlist_uniq(pid_t **p, int length)
3049 { 3047 {
3050 int src, dest = 1; 3048 int src, dest = 1;
3051 pid_t *list = *p; 3049 pid_t *list = *p;
3052 pid_t *newlist; 3050 pid_t *newlist;
3053 3051
3054 /* 3052 /*
3055 * we presume the 0th element is unique, so src starts at 1. trivial 3053 * we presume the 0th element is unique, so src starts at 1. trivial
3056 * edge cases first; no work needs to be done for either 3054 * edge cases first; no work needs to be done for either
3057 */ 3055 */
3058 if (length == 0 || length == 1) 3056 if (length == 0 || length == 1)
3059 return length; 3057 return length;
3060 /* src and dest walk down the list; dest counts unique elements */ 3058 /* src and dest walk down the list; dest counts unique elements */
3061 for (src = 1; src < length; src++) { 3059 for (src = 1; src < length; src++) {
3062 /* find next unique element */ 3060 /* find next unique element */
3063 while (list[src] == list[src-1]) { 3061 while (list[src] == list[src-1]) {
3064 src++; 3062 src++;
3065 if (src == length) 3063 if (src == length)
3066 goto after; 3064 goto after;
3067 } 3065 }
3068 /* dest always points to where the next unique element goes */ 3066 /* dest always points to where the next unique element goes */
3069 list[dest] = list[src]; 3067 list[dest] = list[src];
3070 dest++; 3068 dest++;
3071 } 3069 }
3072 after: 3070 after:
3073 /* 3071 /*
3074 * if the length difference is large enough, we want to allocate a 3072 * if the length difference is large enough, we want to allocate a
3075 * smaller buffer to save memory. if this fails due to out of memory, 3073 * smaller buffer to save memory. if this fails due to out of memory,
3076 * we'll just stay with what we've got. 3074 * we'll just stay with what we've got.
3077 */ 3075 */
3078 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { 3076 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3079 newlist = pidlist_resize(list, dest); 3077 newlist = pidlist_resize(list, dest);
3080 if (newlist) 3078 if (newlist)
3081 *p = newlist; 3079 *p = newlist;
3082 } 3080 }
3083 return dest; 3081 return dest;
3084 } 3082 }
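A standalone demo of the de-duplication pass above (userspace C). The input must already be sorted, which the "procs" path guarantees by sorting before stripping duplicates; the realloc-shrinking step is omitted here:

#include <stdio.h>

static int uniq(int *list, int length)
{
	int src, dest = 1;

	if (length < 2)
		return length;
	for (src = 1; src < length; src++) {
		while (list[src] == list[src - 1]) {
			if (++src == length)
				return dest;
		}
		list[dest++] = list[src];
	}
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 3, 7, 7, 12 };
	int i, n = uniq(pids, 6);

	for (i = 0; i < n; i++)
		printf("%d ", pids[i]);		/* prints: 3 7 12 */
	printf("\n");
	return 0;
}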
3085 3083
3086 static int cmppid(const void *a, const void *b) 3084 static int cmppid(const void *a, const void *b)
3087 { 3085 {
3088 return *(pid_t *)a - *(pid_t *)b; 3086 return *(pid_t *)a - *(pid_t *)b;
3089 } 3087 }
3090 3088
3091 /* 3089 /*
3092 * find the appropriate pidlist for our purpose (given procs vs tasks) 3090 * find the appropriate pidlist for our purpose (given procs vs tasks)
3093 * returns with the lock on that pidlist already held, and takes care 3091 * returns with the lock on that pidlist already held, and takes care
3094 * of the use count, or returns NULL with no locks held if we're out of 3092 * of the use count, or returns NULL with no locks held if we're out of
3095 * memory. 3093 * memory.
3096 */ 3094 */
3097 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3095 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3098 enum cgroup_filetype type) 3096 enum cgroup_filetype type)
3099 { 3097 {
3100 struct cgroup_pidlist *l; 3098 struct cgroup_pidlist *l;
3101 /* don't need task_nsproxy() if we're looking at ourself */ 3099 /* don't need task_nsproxy() if we're looking at ourself */
3102 struct pid_namespace *ns = current->nsproxy->pid_ns; 3100 struct pid_namespace *ns = current->nsproxy->pid_ns;
3103 3101
3104 /* 3102 /*
3105 * We can't drop the pidlist_mutex before taking the l->mutex in case 3103 * We can't drop the pidlist_mutex before taking the l->mutex in case
3106 * the last ref-holder is trying to remove l from the list at the same 3104 * the last ref-holder is trying to remove l from the list at the same
3107 * time. Holding the pidlist_mutex precludes somebody taking whichever 3105 * time. Holding the pidlist_mutex precludes somebody taking whichever
3108 * list we find out from under us - compare release_pid_array(). 3106 * list we find out from under us - compare release_pid_array().
3109 */ 3107 */
3110 mutex_lock(&cgrp->pidlist_mutex); 3108 mutex_lock(&cgrp->pidlist_mutex);
3111 list_for_each_entry(l, &cgrp->pidlists, links) { 3109 list_for_each_entry(l, &cgrp->pidlists, links) {
3112 if (l->key.type == type && l->key.ns == ns) { 3110 if (l->key.type == type && l->key.ns == ns) {
3113 /* make sure l doesn't vanish out from under us */ 3111 /* make sure l doesn't vanish out from under us */
3114 down_write(&l->mutex); 3112 down_write(&l->mutex);
3115 mutex_unlock(&cgrp->pidlist_mutex); 3113 mutex_unlock(&cgrp->pidlist_mutex);
3116 return l; 3114 return l;
3117 } 3115 }
3118 } 3116 }
3119 /* entry not found; create a new one */ 3117 /* entry not found; create a new one */
3120 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3118 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3121 if (!l) { 3119 if (!l) {
3122 mutex_unlock(&cgrp->pidlist_mutex); 3120 mutex_unlock(&cgrp->pidlist_mutex);
3123 return l; 3121 return l;
3124 } 3122 }
3125 init_rwsem(&l->mutex); 3123 init_rwsem(&l->mutex);
3126 down_write(&l->mutex); 3124 down_write(&l->mutex);
3127 l->key.type = type; 3125 l->key.type = type;
3128 l->key.ns = get_pid_ns(ns); 3126 l->key.ns = get_pid_ns(ns);
3129 l->use_count = 0; /* don't increment here */ 3127 l->use_count = 0; /* don't increment here */
3130 l->list = NULL; 3128 l->list = NULL;
3131 l->owner = cgrp; 3129 l->owner = cgrp;
3132 list_add(&l->links, &cgrp->pidlists); 3130 list_add(&l->links, &cgrp->pidlists);
3133 mutex_unlock(&cgrp->pidlist_mutex); 3131 mutex_unlock(&cgrp->pidlist_mutex);
3134 return l; 3132 return l;
3135 } 3133 }
3136 3134
3137 /* 3135 /*
3138 * Load a cgroup's pidarray with either procs' tgids or tasks' pids 3136 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3139 */ 3137 */
3140 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, 3138 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3141 struct cgroup_pidlist **lp) 3139 struct cgroup_pidlist **lp)
3142 { 3140 {
3143 pid_t *array; 3141 pid_t *array;
3144 int length; 3142 int length;
3145 int pid, n = 0; /* used for populating the array */ 3143 int pid, n = 0; /* used for populating the array */
3146 struct cgroup_iter it; 3144 struct cgroup_iter it;
3147 struct task_struct *tsk; 3145 struct task_struct *tsk;
3148 struct cgroup_pidlist *l; 3146 struct cgroup_pidlist *l;
3149 3147
3150 /* 3148 /*
3151 * If cgroup gets more users after we read count, we won't have 3149 * If cgroup gets more users after we read count, we won't have
3152 * enough space - tough. This race is indistinguishable to the 3150 * enough space - tough. This race is indistinguishable to the
3153 * caller from the case that the additional cgroup users didn't 3151 * caller from the case that the additional cgroup users didn't
3154 * show up until sometime later on. 3152 * show up until sometime later on.
3155 */ 3153 */
3156 length = cgroup_task_count(cgrp); 3154 length = cgroup_task_count(cgrp);
3157 array = pidlist_allocate(length); 3155 array = pidlist_allocate(length);
3158 if (!array) 3156 if (!array)
3159 return -ENOMEM; 3157 return -ENOMEM;
3160 /* now, populate the array */ 3158 /* now, populate the array */
3161 cgroup_iter_start(cgrp, &it); 3159 cgroup_iter_start(cgrp, &it);
3162 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3160 while ((tsk = cgroup_iter_next(cgrp, &it))) {
3163 if (unlikely(n == length)) 3161 if (unlikely(n == length))
3164 break; 3162 break;
3165 /* get tgid or pid for procs or tasks file respectively */ 3163 /* get tgid or pid for procs or tasks file respectively */
3166 if (type == CGROUP_FILE_PROCS) 3164 if (type == CGROUP_FILE_PROCS)
3167 pid = task_tgid_vnr(tsk); 3165 pid = task_tgid_vnr(tsk);
3168 else 3166 else
3169 pid = task_pid_vnr(tsk); 3167 pid = task_pid_vnr(tsk);
3170 if (pid > 0) /* make sure to only use valid results */ 3168 if (pid > 0) /* make sure to only use valid results */
3171 array[n++] = pid; 3169 array[n++] = pid;
3172 } 3170 }
3173 cgroup_iter_end(cgrp, &it); 3171 cgroup_iter_end(cgrp, &it);
3174 length = n; 3172 length = n;
3175 /* now sort & (if procs) strip out duplicates */ 3173 /* now sort & (if procs) strip out duplicates */
3176 sort(array, length, sizeof(pid_t), cmppid, NULL); 3174 sort(array, length, sizeof(pid_t), cmppid, NULL);
3177 if (type == CGROUP_FILE_PROCS) 3175 if (type == CGROUP_FILE_PROCS)
3178 length = pidlist_uniq(&array, length); 3176 length = pidlist_uniq(&array, length);
3179 l = cgroup_pidlist_find(cgrp, type); 3177 l = cgroup_pidlist_find(cgrp, type);
3180 if (!l) { 3178 if (!l) {
3181 pidlist_free(array); 3179 pidlist_free(array);
3182 return -ENOMEM; 3180 return -ENOMEM;
3183 } 3181 }
3184 /* store array, freeing old if necessary - lock already held */ 3182 /* store array, freeing old if necessary - lock already held */
3185 pidlist_free(l->list); 3183 pidlist_free(l->list);
3186 l->list = array; 3184 l->list = array;
3187 l->length = length; 3185 l->length = length;
3188 l->use_count++; 3186 l->use_count++;
3189 up_write(&l->mutex); 3187 up_write(&l->mutex);
3190 *lp = l; 3188 *lp = l;
3191 return 0; 3189 return 0;
3192 } 3190 }
3193 3191
3194 /** 3192 /**
3195 * cgroupstats_build - build and fill cgroupstats 3193 * cgroupstats_build - build and fill cgroupstats
3196 * @stats: cgroupstats to fill information into 3194 * @stats: cgroupstats to fill information into
3197 * @dentry: A dentry entry belonging to the cgroup for which stats have 3195 * @dentry: A dentry entry belonging to the cgroup for which stats have
3198 * been requested. 3196 * been requested.
3199 * 3197 *
3200 * Build and fill cgroupstats so that taskstats can export it to user 3198 * Build and fill cgroupstats so that taskstats can export it to user
3201 * space. 3199 * space.
3202 */ 3200 */
3203 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3201 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3204 { 3202 {
3205 int ret = -EINVAL; 3203 int ret = -EINVAL;
3206 struct cgroup *cgrp; 3204 struct cgroup *cgrp;
3207 struct cgroup_iter it; 3205 struct cgroup_iter it;
3208 struct task_struct *tsk; 3206 struct task_struct *tsk;
3209 3207
3210 /* 3208 /*
3211 * Validate dentry by checking the superblock operations, 3209 * Validate dentry by checking the superblock operations,
3212 * and make sure it's a directory. 3210 * and make sure it's a directory.
3213 */ 3211 */
3214 if (dentry->d_sb->s_op != &cgroup_ops || 3212 if (dentry->d_sb->s_op != &cgroup_ops ||
3215 !S_ISDIR(dentry->d_inode->i_mode)) 3213 !S_ISDIR(dentry->d_inode->i_mode))
3216 goto err; 3214 goto err;
3217 3215
3218 ret = 0; 3216 ret = 0;
3219 cgrp = dentry->d_fsdata; 3217 cgrp = dentry->d_fsdata;
3220 3218
3221 cgroup_iter_start(cgrp, &it); 3219 cgroup_iter_start(cgrp, &it);
3222 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3220 while ((tsk = cgroup_iter_next(cgrp, &it))) {
3223 switch (tsk->state) { 3221 switch (tsk->state) {
3224 case TASK_RUNNING: 3222 case TASK_RUNNING:
3225 stats->nr_running++; 3223 stats->nr_running++;
3226 break; 3224 break;
3227 case TASK_INTERRUPTIBLE: 3225 case TASK_INTERRUPTIBLE:
3228 stats->nr_sleeping++; 3226 stats->nr_sleeping++;
3229 break; 3227 break;
3230 case TASK_UNINTERRUPTIBLE: 3228 case TASK_UNINTERRUPTIBLE:
3231 stats->nr_uninterruptible++; 3229 stats->nr_uninterruptible++;
3232 break; 3230 break;
3233 case TASK_STOPPED: 3231 case TASK_STOPPED:
3234 stats->nr_stopped++; 3232 stats->nr_stopped++;
3235 break; 3233 break;
3236 default: 3234 default:
3237 if (delayacct_is_task_waiting_on_io(tsk)) 3235 if (delayacct_is_task_waiting_on_io(tsk))
3238 stats->nr_io_wait++; 3236 stats->nr_io_wait++;
3239 break; 3237 break;
3240 } 3238 }
3241 } 3239 }
3242 cgroup_iter_end(cgrp, &it); 3240 cgroup_iter_end(cgrp, &it);
3243 3241
3244 err: 3242 err:
3245 return ret; 3243 return ret;
3246 } 3244 }
3247 3245
3248 3246
3249 /* 3247 /*
3250 * seq_file methods for the tasks/procs files. The seq_file position is the 3248 * seq_file methods for the tasks/procs files. The seq_file position is the
3251 * next pid to display; the seq_file iterator is a pointer to the pid 3249 * next pid to display; the seq_file iterator is a pointer to the pid
3252 * in the cgroup->l->list array. 3250 * in the cgroup->l->list array.
3253 */ 3251 */
3254 3252
3255 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) 3253 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3256 { 3254 {
3257 /* 3255 /*
3258 * Initially we receive a position value that corresponds to 3256 * Initially we receive a position value that corresponds to
3259 * one more than the last pid shown (or 0 on the first call or 3257 * one more than the last pid shown (or 0 on the first call or
3260 * after a seek to the start). Use a binary-search to find the 3258 * after a seek to the start). Use a binary-search to find the
3261 * next pid to display, if any 3259 * next pid to display, if any
3262 */ 3260 */
3263 struct cgroup_pidlist *l = s->private; 3261 struct cgroup_pidlist *l = s->private;
3264 int index = 0, pid = *pos; 3262 int index = 0, pid = *pos;
3265 int *iter; 3263 int *iter;
3266 3264
3267 down_read(&l->mutex); 3265 down_read(&l->mutex);
3268 if (pid) { 3266 if (pid) {
3269 int end = l->length; 3267 int end = l->length;
3270 3268
3271 while (index < end) { 3269 while (index < end) {
3272 int mid = (index + end) / 2; 3270 int mid = (index + end) / 2;
3273 if (l->list[mid] == pid) { 3271 if (l->list[mid] == pid) {
3274 index = mid; 3272 index = mid;
3275 break; 3273 break;
3276 } else if (l->list[mid] <= pid) 3274 } else if (l->list[mid] <= pid)
3277 index = mid + 1; 3275 index = mid + 1;
3278 else 3276 else
3279 end = mid; 3277 end = mid;
3280 } 3278 }
3281 } 3279 }
3282 /* If we're off the end of the array, we're done */ 3280 /* If we're off the end of the array, we're done */
3283 if (index >= l->length) 3281 if (index >= l->length)
3284 return NULL; 3282 return NULL;
3285 /* Update the abstract position to be the actual pid that we found */ 3283 /* Update the abstract position to be the actual pid that we found */
3286 iter = l->list + index; 3284 iter = l->list + index;
3287 *pos = *iter; 3285 *pos = *iter;
3288 return iter; 3286 return iter;
3289 } 3287 }
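A standalone demo of the resume-after-seek search above: given the saved position (one more than the last pid shown), it returns the index of the next pid to display in the sorted array, or -1 when past the end:

#include <stdio.h>

static int find_next(const int *list, int length, int pid)
{
	int index = 0, end = length;

	while (index < end) {
		int mid = (index + end) / 2;

		if (list[mid] == pid) {
			index = mid;
			break;
		} else if (list[mid] <= pid)
			index = mid + 1;
		else
			end = mid;
	}
	return index < length ? index : -1;
}

int main(void)
{
	int pids[] = { 3, 7, 12, 40 };

	/* the last pid shown was 7, so the saved position is 8 */
	printf("resume at index %d\n", find_next(pids, 4, 8));	/* 2, i.e. pid 12 */
	return 0;
}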
3290 3288
3291 static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3289 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3292 { 3290 {
3293 struct cgroup_pidlist *l = s->private; 3291 struct cgroup_pidlist *l = s->private;
3294 up_read(&l->mutex); 3292 up_read(&l->mutex);
3295 } 3293 }
3296 3294
3297 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3295 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3298 { 3296 {
3299 struct cgroup_pidlist *l = s->private; 3297 struct cgroup_pidlist *l = s->private;
3300 pid_t *p = v; 3298 pid_t *p = v;
3301 pid_t *end = l->list + l->length; 3299 pid_t *end = l->list + l->length;
3302 /* 3300 /*
3303 * Advance to the next pid in the array. If this goes off the 3301 * Advance to the next pid in the array. If this goes off the
3304 * end, we're done 3302 * end, we're done
3305 */ 3303 */
3306 p++; 3304 p++;
3307 if (p >= end) { 3305 if (p >= end) {
3308 return NULL; 3306 return NULL;
3309 } else { 3307 } else {
3310 *pos = *p; 3308 *pos = *p;
3311 return p; 3309 return p;
3312 } 3310 }
3313 } 3311 }
3314 3312
3315 static int cgroup_pidlist_show(struct seq_file *s, void *v) 3313 static int cgroup_pidlist_show(struct seq_file *s, void *v)
3316 { 3314 {
3317 return seq_printf(s, "%d\n", *(int *)v); 3315 return seq_printf(s, "%d\n", *(int *)v);
3318 } 3316 }
3319 3317
3320 /* 3318 /*
3321 * seq_operations functions for iterating on pidlists through seq_file - 3319 * seq_operations functions for iterating on pidlists through seq_file -
3322 * independent of whether it's tasks or procs 3320 * independent of whether it's tasks or procs
3323 */ 3321 */
3324 static const struct seq_operations cgroup_pidlist_seq_operations = { 3322 static const struct seq_operations cgroup_pidlist_seq_operations = {
3325 .start = cgroup_pidlist_start, 3323 .start = cgroup_pidlist_start,
3326 .stop = cgroup_pidlist_stop, 3324 .stop = cgroup_pidlist_stop,
3327 .next = cgroup_pidlist_next, 3325 .next = cgroup_pidlist_next,
3328 .show = cgroup_pidlist_show, 3326 .show = cgroup_pidlist_show,
3329 }; 3327 };
3330 3328
3331 static void cgroup_release_pid_array(struct cgroup_pidlist *l) 3329 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3332 { 3330 {
3333 /* 3331 /*
3334 * the case where we're the last user of this particular pidlist will 3332 * the case where we're the last user of this particular pidlist will
3335 * have us remove it from the cgroup's list, which entails taking the 3333 * have us remove it from the cgroup's list, which entails taking the
3336 * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> 3334 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3337 * pidlist_mutex, we have to take pidlist_mutex first. 3335 * pidlist_mutex, we have to take pidlist_mutex first.
3338 */ 3336 */
3339 mutex_lock(&l->owner->pidlist_mutex); 3337 mutex_lock(&l->owner->pidlist_mutex);
3340 down_write(&l->mutex); 3338 down_write(&l->mutex);
3341 BUG_ON(!l->use_count); 3339 BUG_ON(!l->use_count);
3342 if (!--l->use_count) { 3340 if (!--l->use_count) {
3343 /* we're the last user if refcount is 0; remove and free */ 3341 /* we're the last user if refcount is 0; remove and free */
3344 list_del(&l->links); 3342 list_del(&l->links);
3345 mutex_unlock(&l->owner->pidlist_mutex); 3343 mutex_unlock(&l->owner->pidlist_mutex);
3346 pidlist_free(l->list); 3344 pidlist_free(l->list);
3347 put_pid_ns(l->key.ns); 3345 put_pid_ns(l->key.ns);
3348 up_write(&l->mutex); 3346 up_write(&l->mutex);
3349 kfree(l); 3347 kfree(l);
3350 return; 3348 return;
3351 } 3349 }
3352 mutex_unlock(&l->owner->pidlist_mutex); 3350 mutex_unlock(&l->owner->pidlist_mutex);
3353 up_write(&l->mutex); 3351 up_write(&l->mutex);
3354 } 3352 }
3355 3353
3356 static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3354 static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3357 { 3355 {
3358 struct cgroup_pidlist *l; 3356 struct cgroup_pidlist *l;
3359 if (!(file->f_mode & FMODE_READ)) 3357 if (!(file->f_mode & FMODE_READ))
3360 return 0; 3358 return 0;
3361 /* 3359 /*
3362 * the seq_file will only be initialized if the file was opened for 3360 * the seq_file will only be initialized if the file was opened for
3363 * reading; hence we check if it's not null only in that case. 3361 * reading; hence we check if it's not null only in that case.
3364 */ 3362 */
3365 l = ((struct seq_file *)file->private_data)->private; 3363 l = ((struct seq_file *)file->private_data)->private;
3366 cgroup_release_pid_array(l); 3364 cgroup_release_pid_array(l);
3367 return seq_release(inode, file); 3365 return seq_release(inode, file);
3368 } 3366 }
3369 3367
3370 static const struct file_operations cgroup_pidlist_operations = { 3368 static const struct file_operations cgroup_pidlist_operations = {
3371 .read = seq_read, 3369 .read = seq_read,
3372 .llseek = seq_lseek, 3370 .llseek = seq_lseek,
3373 .write = cgroup_file_write, 3371 .write = cgroup_file_write,
3374 .release = cgroup_pidlist_release, 3372 .release = cgroup_pidlist_release,
3375 }; 3373 };
3376 3374
3377 /* 3375 /*
3378 * The following functions handle opens on a file that displays a pidlist 3376 * The following functions handle opens on a file that displays a pidlist
3379 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's 3377 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3380 * in the cgroup. 3378 * in the cgroup.
3381 */ 3379 */
3382 /* helper function for the two below it */ 3380 /* helper function for the two below it */
3383 static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) 3381 static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3384 { 3382 {
3385 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 3383 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3386 struct cgroup_pidlist *l; 3384 struct cgroup_pidlist *l;
3387 int retval; 3385 int retval;
3388 3386
3389 /* Nothing to do for write-only files */ 3387 /* Nothing to do for write-only files */
3390 if (!(file->f_mode & FMODE_READ)) 3388 if (!(file->f_mode & FMODE_READ))
3391 return 0; 3389 return 0;
3392 3390
3393 /* have the array populated */ 3391 /* have the array populated */
3394 retval = pidlist_array_load(cgrp, type, &l); 3392 retval = pidlist_array_load(cgrp, type, &l);
3395 if (retval) 3393 if (retval)
3396 return retval; 3394 return retval;
3397 /* configure file information */ 3395 /* configure file information */
3398 file->f_op = &cgroup_pidlist_operations; 3396 file->f_op = &cgroup_pidlist_operations;
3399 3397
3400 retval = seq_open(file, &cgroup_pidlist_seq_operations); 3398 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3401 if (retval) { 3399 if (retval) {
3402 cgroup_release_pid_array(l); 3400 cgroup_release_pid_array(l);
3403 return retval; 3401 return retval;
3404 } 3402 }
3405 ((struct seq_file *)file->private_data)->private = l; 3403 ((struct seq_file *)file->private_data)->private = l;
3406 return 0; 3404 return 0;
3407 } 3405 }
3408 static int cgroup_tasks_open(struct inode *unused, struct file *file) 3406 static int cgroup_tasks_open(struct inode *unused, struct file *file)
3409 { 3407 {
3410 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); 3408 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3411 } 3409 }
3412 static int cgroup_procs_open(struct inode *unused, struct file *file) 3410 static int cgroup_procs_open(struct inode *unused, struct file *file)
3413 { 3411 {
3414 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3412 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3415 } 3413 }
3416 3414
3417 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3415 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
3418 struct cftype *cft) 3416 struct cftype *cft)
3419 { 3417 {
3420 return notify_on_release(cgrp); 3418 return notify_on_release(cgrp);
3421 } 3419 }
3422 3420
3423 static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3421 static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3424 struct cftype *cft, 3422 struct cftype *cft,
3425 u64 val) 3423 u64 val)
3426 { 3424 {
3427 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3425 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
3428 if (val) 3426 if (val)
3429 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3427 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3430 else 3428 else
3431 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3429 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3432 return 0; 3430 return 0;
3433 } 3431 }
3434 3432
3435 /* 3433 /*
3436 * Unregister event and free resources. 3434 * Unregister event and free resources.
3437 * 3435 *
3438 * Gets called from workqueue. 3436 * Gets called from workqueue.
3439 */ 3437 */
3440 static void cgroup_event_remove(struct work_struct *work) 3438 static void cgroup_event_remove(struct work_struct *work)
3441 { 3439 {
3442 struct cgroup_event *event = container_of(work, struct cgroup_event, 3440 struct cgroup_event *event = container_of(work, struct cgroup_event,
3443 remove); 3441 remove);
3444 struct cgroup *cgrp = event->cgrp; 3442 struct cgroup *cgrp = event->cgrp;
3445 3443
3446 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3444 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3447 3445
3448 eventfd_ctx_put(event->eventfd); 3446 eventfd_ctx_put(event->eventfd);
3449 kfree(event); 3447 kfree(event);
3450 dput(cgrp->dentry); 3448 dput(cgrp->dentry);
3451 } 3449 }
3452 3450
3453 /* 3451 /*
3454 * Gets called on POLLHUP on eventfd when user closes it. 3452 * Gets called on POLLHUP on eventfd when user closes it.
3455 * 3453 *
3456 * Called with wqh->lock held and interrupts disabled. 3454 * Called with wqh->lock held and interrupts disabled.
3457 */ 3455 */
3458 static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, 3456 static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3459 int sync, void *key) 3457 int sync, void *key)
3460 { 3458 {
3461 struct cgroup_event *event = container_of(wait, 3459 struct cgroup_event *event = container_of(wait,
3462 struct cgroup_event, wait); 3460 struct cgroup_event, wait);
3463 struct cgroup *cgrp = event->cgrp; 3461 struct cgroup *cgrp = event->cgrp;
3464 unsigned long flags = (unsigned long)key; 3462 unsigned long flags = (unsigned long)key;
3465 3463
3466 if (flags & POLLHUP) { 3464 if (flags & POLLHUP) {
3467 __remove_wait_queue(event->wqh, &event->wait); 3465 __remove_wait_queue(event->wqh, &event->wait);
3468 spin_lock(&cgrp->event_list_lock); 3466 spin_lock(&cgrp->event_list_lock);
3469 list_del(&event->list); 3467 list_del(&event->list);
3470 spin_unlock(&cgrp->event_list_lock); 3468 spin_unlock(&cgrp->event_list_lock);
3471 /* 3469 /*
3472 * We are in atomic context, but cgroup_event_remove() may 3470 * We are in atomic context, but cgroup_event_remove() may
3473 * sleep, so we have to call it in workqueue. 3471 * sleep, so we have to call it in workqueue.
3474 */ 3472 */
3475 schedule_work(&event->remove); 3473 schedule_work(&event->remove);
3476 } 3474 }
3477 3475
3478 return 0; 3476 return 0;
3479 } 3477 }
3480 3478
3481 static void cgroup_event_ptable_queue_proc(struct file *file, 3479 static void cgroup_event_ptable_queue_proc(struct file *file,
3482 wait_queue_head_t *wqh, poll_table *pt) 3480 wait_queue_head_t *wqh, poll_table *pt)
3483 { 3481 {
3484 struct cgroup_event *event = container_of(pt, 3482 struct cgroup_event *event = container_of(pt,
3485 struct cgroup_event, pt); 3483 struct cgroup_event, pt);
3486 3484
3487 event->wqh = wqh; 3485 event->wqh = wqh;
3488 add_wait_queue(wqh, &event->wait); 3486 add_wait_queue(wqh, &event->wait);
3489 } 3487 }
3490 3488
3491 /* 3489 /*
3492 * Parse input and register new cgroup event handler. 3490 * Parse input and register new cgroup event handler.
3493 * 3491 *
3494 * Input must be in format '<event_fd> <control_fd> <args>'. 3492 * Input must be in format '<event_fd> <control_fd> <args>'.
3495 * Interpretation of args is defined by control file implementation. 3493 * Interpretation of args is defined by control file implementation.
3496 */ 3494 */
3497 static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 3495 static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3498 const char *buffer) 3496 const char *buffer)
3499 { 3497 {
3500 struct cgroup_event *event = NULL; 3498 struct cgroup_event *event = NULL;
3501 unsigned int efd, cfd; 3499 unsigned int efd, cfd;
3502 struct file *efile = NULL; 3500 struct file *efile = NULL;
3503 struct file *cfile = NULL; 3501 struct file *cfile = NULL;
3504 char *endp; 3502 char *endp;
3505 int ret; 3503 int ret;
3506 3504
3507 efd = simple_strtoul(buffer, &endp, 10); 3505 efd = simple_strtoul(buffer, &endp, 10);
3508 if (*endp != ' ') 3506 if (*endp != ' ')
3509 return -EINVAL; 3507 return -EINVAL;
3510 buffer = endp + 1; 3508 buffer = endp + 1;
3511 3509
3512 cfd = simple_strtoul(buffer, &endp, 10); 3510 cfd = simple_strtoul(buffer, &endp, 10);
3513 if ((*endp != ' ') && (*endp != '\0')) 3511 if ((*endp != ' ') && (*endp != '\0'))
3514 return -EINVAL; 3512 return -EINVAL;
3515 buffer = endp + 1; 3513 buffer = endp + 1;
3516 3514
3517 event = kzalloc(sizeof(*event), GFP_KERNEL); 3515 event = kzalloc(sizeof(*event), GFP_KERNEL);
3518 if (!event) 3516 if (!event)
3519 return -ENOMEM; 3517 return -ENOMEM;
3520 event->cgrp = cgrp; 3518 event->cgrp = cgrp;
3521 INIT_LIST_HEAD(&event->list); 3519 INIT_LIST_HEAD(&event->list);
3522 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 3520 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3523 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 3521 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3524 INIT_WORK(&event->remove, cgroup_event_remove); 3522 INIT_WORK(&event->remove, cgroup_event_remove);
3525 3523
3526 efile = eventfd_fget(efd); 3524 efile = eventfd_fget(efd);
3527 if (IS_ERR(efile)) { 3525 if (IS_ERR(efile)) {
3528 ret = PTR_ERR(efile); 3526 ret = PTR_ERR(efile);
3529 goto fail; 3527 goto fail;
3530 } 3528 }
3531 3529
3532 event->eventfd = eventfd_ctx_fileget(efile); 3530 event->eventfd = eventfd_ctx_fileget(efile);
3533 if (IS_ERR(event->eventfd)) { 3531 if (IS_ERR(event->eventfd)) {
3534 ret = PTR_ERR(event->eventfd); 3532 ret = PTR_ERR(event->eventfd);
3535 goto fail; 3533 goto fail;
3536 } 3534 }
3537 3535
3538 cfile = fget(cfd); 3536 cfile = fget(cfd);
3539 if (!cfile) { 3537 if (!cfile) {
3540 ret = -EBADF; 3538 ret = -EBADF;
3541 goto fail; 3539 goto fail;
3542 } 3540 }
3543 3541
3544 /* the process needs read permission on the control file */ 3542 /* the process needs read permission on the control file */
3545 ret = file_permission(cfile, MAY_READ); 3543 ret = file_permission(cfile, MAY_READ);
3546 if (ret < 0) 3544 if (ret < 0)
3547 goto fail; 3545 goto fail;
3548 3546
3549 event->cft = __file_cft(cfile); 3547 event->cft = __file_cft(cfile);
3550 if (IS_ERR(event->cft)) { 3548 if (IS_ERR(event->cft)) {
3551 ret = PTR_ERR(event->cft); 3549 ret = PTR_ERR(event->cft);
3552 goto fail; 3550 goto fail;
3553 } 3551 }
3554 3552
3555 if (!event->cft->register_event || !event->cft->unregister_event) { 3553 if (!event->cft->register_event || !event->cft->unregister_event) {
3556 ret = -EINVAL; 3554 ret = -EINVAL;
3557 goto fail; 3555 goto fail;
3558 } 3556 }
3559 3557
3560 ret = event->cft->register_event(cgrp, event->cft, 3558 ret = event->cft->register_event(cgrp, event->cft,
3561 event->eventfd, buffer); 3559 event->eventfd, buffer);
3562 if (ret) 3560 if (ret)
3563 goto fail; 3561 goto fail;
3564 3562
3565 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3563 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3566 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3564 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3567 ret = 0; 3565 ret = 0;
3568 goto fail; 3566 goto fail;
3569 } 3567 }
3570 3568
3571 /* 3569 /*
3572 * Events should be removed after rmdir of cgroup directory, but before 3570 * Events should be removed after rmdir of cgroup directory, but before
3573 * destroying subsystem state objects. Let's take reference to cgroup 3571 * destroying subsystem state objects. Let's take reference to cgroup
3574 * directory dentry to do that. 3572 * directory dentry to do that.
3575 */ 3573 */
3576 dget(cgrp->dentry); 3574 dget(cgrp->dentry);
3577 3575
3578 spin_lock(&cgrp->event_list_lock); 3576 spin_lock(&cgrp->event_list_lock);
3579 list_add(&event->list, &cgrp->event_list); 3577 list_add(&event->list, &cgrp->event_list);
3580 spin_unlock(&cgrp->event_list_lock); 3578 spin_unlock(&cgrp->event_list_lock);
3581 3579
3582 fput(cfile); 3580 fput(cfile);
3583 fput(efile); 3581 fput(efile);
3584 3582
3585 return 0; 3583 return 0;
3586 3584
3587 fail: 3585 fail:
3588 if (cfile) 3586 if (cfile)
3589 fput(cfile); 3587 fput(cfile);
3590 3588
3591 if (event && event->eventfd && !IS_ERR(event->eventfd)) 3589 if (event && event->eventfd && !IS_ERR(event->eventfd))
3592 eventfd_ctx_put(event->eventfd); 3590 eventfd_ctx_put(event->eventfd);
3593 3591
3594 if (!IS_ERR_OR_NULL(efile)) 3592 if (!IS_ERR_OR_NULL(efile))
3595 fput(efile); 3593 fput(efile);
3596 3594
3597 kfree(event); 3595 kfree(event);
3598 3596
3599 return ret; 3597 return ret;
3600 } 3598 }
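From userspace, the format parsed above is driven by writing "<event_fd> <control_fd> <args>" into cgroup.event_control (declared in the files[] table below). A sketch follows; the mount point and the choice of a memcg usage threshold as the control file are assumptions for illustration only:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/mygroup";	/* assumed mount */
	char path[256], cmd[64];
	uint64_t hits;
	int efd, cfd, ctl;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	cfd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	ctl = open(path, O_WRONLY);
	if (efd < 0 || cfd < 0 || ctl < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>": notify when usage crosses 1 MiB */
	snprintf(cmd, sizeof(cmd), "%d %d %d", efd, cfd, 1024 * 1024);
	if (write(ctl, cmd, strlen(cmd)) < 0)
		return 1;

	read(efd, &hits, sizeof(hits));		/* blocks until the event fires */
	printf("event fired %llu time(s)\n", (unsigned long long)hits);
	return 0;
}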
3601 3599
3602 static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3600 static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3603 struct cftype *cft) 3601 struct cftype *cft)
3604 { 3602 {
3605 return clone_children(cgrp); 3603 return clone_children(cgrp);
3606 } 3604 }
3607 3605
3608 static int cgroup_clone_children_write(struct cgroup *cgrp, 3606 static int cgroup_clone_children_write(struct cgroup *cgrp,
3609 struct cftype *cft, 3607 struct cftype *cft,
3610 u64 val) 3608 u64 val)
3611 { 3609 {
3612 if (val) 3610 if (val)
3613 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3611 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3614 else 3612 else
3615 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3613 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3616 return 0; 3614 return 0;
3617 } 3615 }
3618 3616
3619 /* 3617 /*
3620 * for the common functions, 'private' gives the type of file 3618 * for the common functions, 'private' gives the type of file
3621 */ 3619 */
3622 /* for hysterical raisins, we can't put this on the older files */ 3620 /* for hysterical raisins, we can't put this on the older files */
3623 #define CGROUP_FILE_GENERIC_PREFIX "cgroup." 3621 #define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3624 static struct cftype files[] = { 3622 static struct cftype files[] = {
3625 { 3623 {
3626 .name = "tasks", 3624 .name = "tasks",
3627 .open = cgroup_tasks_open, 3625 .open = cgroup_tasks_open,
3628 .write_u64 = cgroup_tasks_write, 3626 .write_u64 = cgroup_tasks_write,
3629 .release = cgroup_pidlist_release, 3627 .release = cgroup_pidlist_release,
3630 .mode = S_IRUGO | S_IWUSR, 3628 .mode = S_IRUGO | S_IWUSR,
3631 }, 3629 },
3632 { 3630 {
3633 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3631 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3634 .open = cgroup_procs_open, 3632 .open = cgroup_procs_open,
3635 .write_u64 = cgroup_procs_write, 3633 .write_u64 = cgroup_procs_write,
3636 .release = cgroup_pidlist_release, 3634 .release = cgroup_pidlist_release,
3637 .mode = S_IRUGO | S_IWUSR, 3635 .mode = S_IRUGO | S_IWUSR,
3638 }, 3636 },
3639 { 3637 {
3640 .name = "notify_on_release", 3638 .name = "notify_on_release",
3641 .read_u64 = cgroup_read_notify_on_release, 3639 .read_u64 = cgroup_read_notify_on_release,
3642 .write_u64 = cgroup_write_notify_on_release, 3640 .write_u64 = cgroup_write_notify_on_release,
3643 }, 3641 },
3644 { 3642 {
3645 .name = CGROUP_FILE_GENERIC_PREFIX "event_control", 3643 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3646 .write_string = cgroup_write_event_control, 3644 .write_string = cgroup_write_event_control,
3647 .mode = S_IWUGO, 3645 .mode = S_IWUGO,
3648 }, 3646 },
3649 { 3647 {
3650 .name = "cgroup.clone_children", 3648 .name = "cgroup.clone_children",
3651 .read_u64 = cgroup_clone_children_read, 3649 .read_u64 = cgroup_clone_children_read,
3652 .write_u64 = cgroup_clone_children_write, 3650 .write_u64 = cgroup_clone_children_write,
3653 }, 3651 },
3654 }; 3652 };
3655 3653
3656 static struct cftype cft_release_agent = { 3654 static struct cftype cft_release_agent = {
3657 .name = "release_agent", 3655 .name = "release_agent",
3658 .read_seq_string = cgroup_release_agent_show, 3656 .read_seq_string = cgroup_release_agent_show,
3659 .write_string = cgroup_release_agent_write, 3657 .write_string = cgroup_release_agent_write,
3660 .max_write_len = PATH_MAX, 3658 .max_write_len = PATH_MAX,
3661 }; 3659 };
3662 3660
3663 static int cgroup_populate_dir(struct cgroup *cgrp) 3661 static int cgroup_populate_dir(struct cgroup *cgrp)
3664 { 3662 {
3665 int err; 3663 int err;
3666 struct cgroup_subsys *ss; 3664 struct cgroup_subsys *ss;
3667 3665
3668 /* First clear out any existing files */ 3666 /* First clear out any existing files */
3669 cgroup_clear_directory(cgrp->dentry); 3667 cgroup_clear_directory(cgrp->dentry);
3670 3668
3671 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); 3669 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3672 if (err < 0) 3670 if (err < 0)
3673 return err; 3671 return err;
3674 3672
3675 if (cgrp == cgrp->top_cgroup) { 3673 if (cgrp == cgrp->top_cgroup) {
3676 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) 3674 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3677 return err; 3675 return err;
3678 } 3676 }
3679 3677
3680 for_each_subsys(cgrp->root, ss) { 3678 for_each_subsys(cgrp->root, ss) {
3681 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3679 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3682 return err; 3680 return err;
3683 } 3681 }
3684 /* This cgroup is ready now */ 3682 /* This cgroup is ready now */
3685 for_each_subsys(cgrp->root, ss) { 3683 for_each_subsys(cgrp->root, ss) {
3686 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3684 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3687 /* 3685 /*
3688 * Update id->css pointer and make this css visible from 3686 * Update id->css pointer and make this css visible from
3689 * CSS ID functions. This pointer will be dereferenced 3687 * CSS ID functions. This pointer will be dereferenced
3690 * from RCU-read-side without locks. 3688 * from RCU-read-side without locks.
3691 */ 3689 */
3692 if (css->id) 3690 if (css->id)
3693 rcu_assign_pointer(css->id->css, css); 3691 rcu_assign_pointer(css->id->css, css);
3694 } 3692 }
3695 3693
3696 return 0; 3694 return 0;
3697 } 3695 }
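
Subsystem control files follow the same cftype pattern as the base files table above, and cgroup_populate_dir() asks each mounted subsystem to add its own through ->populate(). A minimal kernel-side sketch under stated assumptions: the "example.threshold" file and its module-global backing store are hypothetical, and a real subsystem would keep per-cgroup state instead.

/*
 * Hypothetical sketch of a subsystem's ->populate() hook.  The file name and
 * the global backing store are assumptions for illustration only.
 */
#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/stat.h>

static u64 example_threshold;

static u64 example_threshold_read(struct cgroup *cgrp, struct cftype *cft)
{
        return example_threshold;
}

static int example_threshold_write(struct cgroup *cgrp, struct cftype *cft,
                                   u64 val)
{
        example_threshold = val;
        return 0;
}

static struct cftype example_files[] = {
        {
                .name = "example.threshold",
                .read_u64 = example_threshold_read,
                .write_u64 = example_threshold_write,
                .mode = S_IRUGO | S_IWUSR,
        },
};

static int example_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        /* Called from cgroup_populate_dir() for every cgroup in the hierarchy. */
        return cgroup_add_files(cgrp, ss, example_files, ARRAY_SIZE(example_files));
}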
3698 3696
3699 static void init_cgroup_css(struct cgroup_subsys_state *css, 3697 static void init_cgroup_css(struct cgroup_subsys_state *css,
3700 struct cgroup_subsys *ss, 3698 struct cgroup_subsys *ss,
3701 struct cgroup *cgrp) 3699 struct cgroup *cgrp)
3702 { 3700 {
3703 css->cgroup = cgrp; 3701 css->cgroup = cgrp;
3704 atomic_set(&css->refcnt, 1); 3702 atomic_set(&css->refcnt, 1);
3705 css->flags = 0; 3703 css->flags = 0;
3706 css->id = NULL; 3704 css->id = NULL;
3707 if (cgrp == dummytop) 3705 if (cgrp == dummytop)
3708 set_bit(CSS_ROOT, &css->flags); 3706 set_bit(CSS_ROOT, &css->flags);
3709 BUG_ON(cgrp->subsys[ss->subsys_id]); 3707 BUG_ON(cgrp->subsys[ss->subsys_id]);
3710 cgrp->subsys[ss->subsys_id] = css; 3708 cgrp->subsys[ss->subsys_id] = css;
3711 } 3709 }
3712 3710
3713 static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3711 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
3714 { 3712 {
3715 /* We need to take each hierarchy_mutex in a consistent order */ 3713 /* We need to take each hierarchy_mutex in a consistent order */
3716 int i; 3714 int i;
3717 3715
3718 /* 3716 /*
3719 * No worry about a race with rebind_subsystems that might mess up the 3717 * No worry about a race with rebind_subsystems that might mess up the
3720 * locking order, since both parties are under cgroup_mutex. 3718 * locking order, since both parties are under cgroup_mutex.
3721 */ 3719 */
3722 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3720 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3723 struct cgroup_subsys *ss = subsys[i]; 3721 struct cgroup_subsys *ss = subsys[i];
3724 if (ss == NULL) 3722 if (ss == NULL)
3725 continue; 3723 continue;
3726 if (ss->root == root) 3724 if (ss->root == root)
3727 mutex_lock(&ss->hierarchy_mutex); 3725 mutex_lock(&ss->hierarchy_mutex);
3728 } 3726 }
3729 } 3727 }
3730 3728
3731 static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) 3729 static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3732 { 3730 {
3733 int i; 3731 int i;
3734 3732
3735 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3733 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3736 struct cgroup_subsys *ss = subsys[i]; 3734 struct cgroup_subsys *ss = subsys[i];
3737 if (ss == NULL) 3735 if (ss == NULL)
3738 continue; 3736 continue;
3739 if (ss->root == root) 3737 if (ss->root == root)
3740 mutex_unlock(&ss->hierarchy_mutex); 3738 mutex_unlock(&ss->hierarchy_mutex);
3741 } 3739 }
3742 } 3740 }
3743 3741
3744 /* 3742 /*
3745 * cgroup_create - create a cgroup 3743 * cgroup_create - create a cgroup
3746 * @parent: cgroup that will be parent of the new cgroup 3744 * @parent: cgroup that will be parent of the new cgroup
3747 * @dentry: dentry of the new cgroup 3745 * @dentry: dentry of the new cgroup
3748 * @mode: mode to set on new inode 3746 * @mode: mode to set on new inode
3749 * 3747 *
3750 * Must be called with the mutex on the parent inode held 3748 * Must be called with the mutex on the parent inode held
3751 */ 3749 */
3752 static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3750 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3753 mode_t mode) 3751 mode_t mode)
3754 { 3752 {
3755 struct cgroup *cgrp; 3753 struct cgroup *cgrp;
3756 struct cgroupfs_root *root = parent->root; 3754 struct cgroupfs_root *root = parent->root;
3757 int err = 0; 3755 int err = 0;
3758 struct cgroup_subsys *ss; 3756 struct cgroup_subsys *ss;
3759 struct super_block *sb = root->sb; 3757 struct super_block *sb = root->sb;
3760 3758
3761 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3759 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3762 if (!cgrp) 3760 if (!cgrp)
3763 return -ENOMEM; 3761 return -ENOMEM;
3764 3762
3765 /* Grab a reference on the superblock so the hierarchy doesn't 3763 /* Grab a reference on the superblock so the hierarchy doesn't
3766 * get deleted on unmount if there are child cgroups. This 3764 * get deleted on unmount if there are child cgroups. This
3767 * can be done outside cgroup_mutex, since the sb can't 3765 * can be done outside cgroup_mutex, since the sb can't
3768 * disappear while someone has an open control file on the 3766 * disappear while someone has an open control file on the
3769 * fs */ 3767 * fs */
3770 atomic_inc(&sb->s_active); 3768 atomic_inc(&sb->s_active);
3771 3769
3772 mutex_lock(&cgroup_mutex); 3770 mutex_lock(&cgroup_mutex);
3773 3771
3774 init_cgroup_housekeeping(cgrp); 3772 init_cgroup_housekeeping(cgrp);
3775 3773
3776 cgrp->parent = parent; 3774 cgrp->parent = parent;
3777 cgrp->root = parent->root; 3775 cgrp->root = parent->root;
3778 cgrp->top_cgroup = parent->top_cgroup; 3776 cgrp->top_cgroup = parent->top_cgroup;
3779 3777
3780 if (notify_on_release(parent)) 3778 if (notify_on_release(parent))
3781 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3779 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3782 3780
3783 if (clone_children(parent)) 3781 if (clone_children(parent))
3784 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3782 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3785 3783
3786 for_each_subsys(root, ss) { 3784 for_each_subsys(root, ss) {
3787 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3785 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3788 3786
3789 if (IS_ERR(css)) { 3787 if (IS_ERR(css)) {
3790 err = PTR_ERR(css); 3788 err = PTR_ERR(css);
3791 goto err_destroy; 3789 goto err_destroy;
3792 } 3790 }
3793 init_cgroup_css(css, ss, cgrp); 3791 init_cgroup_css(css, ss, cgrp);
3794 if (ss->use_id) { 3792 if (ss->use_id) {
3795 err = alloc_css_id(ss, parent, cgrp); 3793 err = alloc_css_id(ss, parent, cgrp);
3796 if (err) 3794 if (err)
3797 goto err_destroy; 3795 goto err_destroy;
3798 } 3796 }
3799 /* At error, ->destroy() callback has to free assigned ID. */ 3797 /* At error, ->destroy() callback has to free assigned ID. */
3800 if (clone_children(parent) && ss->post_clone) 3798 if (clone_children(parent) && ss->post_clone)
3801 ss->post_clone(ss, cgrp); 3799 ss->post_clone(ss, cgrp);
3802 } 3800 }
3803 3801
3804 cgroup_lock_hierarchy(root); 3802 cgroup_lock_hierarchy(root);
3805 list_add(&cgrp->sibling, &cgrp->parent->children); 3803 list_add(&cgrp->sibling, &cgrp->parent->children);
3806 cgroup_unlock_hierarchy(root); 3804 cgroup_unlock_hierarchy(root);
3807 root->number_of_cgroups++; 3805 root->number_of_cgroups++;
3808 3806
3809 err = cgroup_create_dir(cgrp, dentry, mode); 3807 err = cgroup_create_dir(cgrp, dentry, mode);
3810 if (err < 0) 3808 if (err < 0)
3811 goto err_remove; 3809 goto err_remove;
3812 3810
3813 /* The cgroup directory was pre-locked for us */ 3811 /* The cgroup directory was pre-locked for us */
3814 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 3812 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3815 3813
3816 err = cgroup_populate_dir(cgrp); 3814 err = cgroup_populate_dir(cgrp);
3817 /* If err < 0, we have a half-filled directory - oh well ;) */ 3815 /* If err < 0, we have a half-filled directory - oh well ;) */
3818 3816
3819 mutex_unlock(&cgroup_mutex); 3817 mutex_unlock(&cgroup_mutex);
3820 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3818 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
3821 3819
3822 return 0; 3820 return 0;
3823 3821
3824 err_remove: 3822 err_remove:
3825 3823
3826 cgroup_lock_hierarchy(root); 3824 cgroup_lock_hierarchy(root);
3827 list_del(&cgrp->sibling); 3825 list_del(&cgrp->sibling);
3828 cgroup_unlock_hierarchy(root); 3826 cgroup_unlock_hierarchy(root);
3829 root->number_of_cgroups--; 3827 root->number_of_cgroups--;
3830 3828
3831 err_destroy: 3829 err_destroy:
3832 3830
3833 for_each_subsys(root, ss) { 3831 for_each_subsys(root, ss) {
3834 if (cgrp->subsys[ss->subsys_id]) 3832 if (cgrp->subsys[ss->subsys_id])
3835 ss->destroy(ss, cgrp); 3833 ss->destroy(ss, cgrp);
3836 } 3834 }
3837 3835
3838 mutex_unlock(&cgroup_mutex); 3836 mutex_unlock(&cgroup_mutex);
3839 3837
3840 /* Release the reference count that we took on the superblock */ 3838 /* Release the reference count that we took on the superblock */
3841 deactivate_super(sb); 3839 deactivate_super(sb);
3842 3840
3843 kfree(cgrp); 3841 kfree(cgrp);
3844 return err; 3842 return err;
3845 } 3843 }
3846 3844
3847 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3845 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3848 { 3846 {
3849 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3847 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3850 3848
3851 /* the vfs holds inode->i_mutex already */ 3849 /* the vfs holds inode->i_mutex already */
3852 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3850 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3853 } 3851 }
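
The mkdir hook above is how new cgroups come into existence: creating a directory inside a mounted hierarchy lands in cgroup_create(). A minimal userspace sketch, where the mount point and the choice of the cpu controller are assumptions.

/*
 * Hypothetical sketch: create a child cgroup by mkdir()ing inside a mounted
 * hierarchy.  The mount point and the "cpu" controller are assumptions.
 */
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>

int main(void)
{
        /* Mount a cgroup hierarchy with the cpu controller attached. */
        if (mount("cgroup", "/mnt/cgrp", "cgroup", 0, "cpu") < 0) {
                perror("mount");
                return 1;
        }

        /* The VFS routes this into cgroup_mkdir() -> cgroup_create(). */
        if (mkdir("/mnt/cgrp/mygroup", 0755) < 0) {
                perror("mkdir");
                return 1;
        }
        return 0;
}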
3854 3852
3855 static int cgroup_has_css_refs(struct cgroup *cgrp) 3853 static int cgroup_has_css_refs(struct cgroup *cgrp)
3856 { 3854 {
3857 /* Check the reference count on each subsystem. Since we 3855 /* Check the reference count on each subsystem. Since we
3858 * already established that there are no tasks in the 3856 * already established that there are no tasks in the
3859 * cgroup, if the css refcount is also 1, then there should 3857 * cgroup, if the css refcount is also 1, then there should
3860 * be no outstanding references, so the subsystem is safe to 3858 * be no outstanding references, so the subsystem is safe to
3861 * destroy. We scan across all subsystems rather than using 3859 * destroy. We scan across all subsystems rather than using
3862 * the per-hierarchy linked list of mounted subsystems since 3860 * the per-hierarchy linked list of mounted subsystems since
3863 * we can be called via check_for_release() with no 3861 * we can be called via check_for_release() with no
3864 * synchronization other than RCU, and the subsystem linked 3862 * synchronization other than RCU, and the subsystem linked
3865 * list isn't RCU-safe */ 3863 * list isn't RCU-safe */
3866 int i; 3864 int i;
3867 /* 3865 /*
3868 * We won't need to lock the subsys array, because the subsystems 3866 * We won't need to lock the subsys array, because the subsystems
3869 * we're concerned about aren't going anywhere since our cgroup root 3867 * we're concerned about aren't going anywhere since our cgroup root
3870 * has a reference on them. 3868 * has a reference on them.
3871 */ 3869 */
3872 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3870 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3873 struct cgroup_subsys *ss = subsys[i]; 3871 struct cgroup_subsys *ss = subsys[i];
3874 struct cgroup_subsys_state *css; 3872 struct cgroup_subsys_state *css;
3875 /* Skip subsystems not present or not in this hierarchy */ 3873 /* Skip subsystems not present or not in this hierarchy */
3876 if (ss == NULL || ss->root != cgrp->root) 3874 if (ss == NULL || ss->root != cgrp->root)
3877 continue; 3875 continue;
3878 css = cgrp->subsys[ss->subsys_id]; 3876 css = cgrp->subsys[ss->subsys_id];
3879 /* When called from check_for_release() it's possible 3877 /* When called from check_for_release() it's possible
3880 * that by this point the cgroup has been removed 3878 * that by this point the cgroup has been removed
3881 * and the css deleted. But a false-positive doesn't 3879 * and the css deleted. But a false-positive doesn't
3882 * matter, since it can only happen if the cgroup 3880 * matter, since it can only happen if the cgroup
3883 * has been deleted and hence no longer needs the 3881 * has been deleted and hence no longer needs the
3884 * release agent to be called anyway. */ 3882 * release agent to be called anyway. */
3885 if (css && (atomic_read(&css->refcnt) > 1)) 3883 if (css && (atomic_read(&css->refcnt) > 1))
3886 return 1; 3884 return 1;
3887 } 3885 }
3888 return 0; 3886 return 0;
3889 } 3887 }
3890 3888
3891 /* 3889 /*
3892 * Atomically mark all (or else none) of the cgroup's CSS objects as 3890 * Atomically mark all (or else none) of the cgroup's CSS objects as
3893 * CSS_REMOVED. Return true on success, or false if the cgroup has 3891 * CSS_REMOVED. Return true on success, or false if the cgroup has
3894 * busy subsystems. Call with cgroup_mutex held 3892 * busy subsystems. Call with cgroup_mutex held
3895 */ 3893 */
3896 3894
3897 static int cgroup_clear_css_refs(struct cgroup *cgrp) 3895 static int cgroup_clear_css_refs(struct cgroup *cgrp)
3898 { 3896 {
3899 struct cgroup_subsys *ss; 3897 struct cgroup_subsys *ss;
3900 unsigned long flags; 3898 unsigned long flags;
3901 bool failed = false; 3899 bool failed = false;
3902 local_irq_save(flags); 3900 local_irq_save(flags);
3903 for_each_subsys(cgrp->root, ss) { 3901 for_each_subsys(cgrp->root, ss) {
3904 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3902 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3905 int refcnt; 3903 int refcnt;
3906 while (1) { 3904 while (1) {
3907 /* We can only remove a CSS with a refcnt==1 */ 3905 /* We can only remove a CSS with a refcnt==1 */
3908 refcnt = atomic_read(&css->refcnt); 3906 refcnt = atomic_read(&css->refcnt);
3909 if (refcnt > 1) { 3907 if (refcnt > 1) {
3910 failed = true; 3908 failed = true;
3911 goto done; 3909 goto done;
3912 } 3910 }
3913 BUG_ON(!refcnt); 3911 BUG_ON(!refcnt);
3914 /* 3912 /*
3915 * Drop the refcnt to 0 while we check other 3913 * Drop the refcnt to 0 while we check other
3916 * subsystems. This will cause any racing 3914 * subsystems. This will cause any racing
3917 * css_tryget() to spin until we set the 3915 * css_tryget() to spin until we set the
3918 * CSS_REMOVED bits or abort 3916 * CSS_REMOVED bits or abort
3919 */ 3917 */
3920 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) 3918 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3921 break; 3919 break;
3922 cpu_relax(); 3920 cpu_relax();
3923 } 3921 }
3924 } 3922 }
3925 done: 3923 done:
3926 for_each_subsys(cgrp->root, ss) { 3924 for_each_subsys(cgrp->root, ss) {
3927 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3925 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3928 if (failed) { 3926 if (failed) {
3929 /* 3927 /*
3930 * Restore old refcnt if we previously managed 3928 * Restore old refcnt if we previously managed
3931 * to clear it from 1 to 0 3929 * to clear it from 1 to 0
3932 */ 3930 */
3933 if (!atomic_read(&css->refcnt)) 3931 if (!atomic_read(&css->refcnt))
3934 atomic_set(&css->refcnt, 1); 3932 atomic_set(&css->refcnt, 1);
3935 } else { 3933 } else {
3936 /* Commit the fact that the CSS is removed */ 3934 /* Commit the fact that the CSS is removed */
3937 set_bit(CSS_REMOVED, &css->flags); 3935 set_bit(CSS_REMOVED, &css->flags);
3938 } 3936 }
3939 } 3937 }
3940 local_irq_restore(flags); 3938 local_irq_restore(flags);
3941 return !failed; 3939 return !failed;
3942 } 3940 }
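
The loop above is an all-or-nothing commit: each css refcount is tentatively dropped from 1 to 0, and if any css turns out to be busy, everything already dropped is restored. The following is a standalone userspace analogue of that pattern, not kernel code; the object count and values are made up, and the retry-on-racing-tryget spin is omitted for brevity.

/*
 * Userspace analogue of cgroup_clear_css_refs(): tentatively drop each
 * counter from 1 to 0, then either roll back (on failure) or commit.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NOBJ 3

static atomic_int refcnt[NOBJ] = { 1, 1, 2 };   /* the third object is "busy" */
static bool removed[NOBJ];

static bool clear_refs(void)
{
        bool failed = false;
        int i;

        for (i = 0; i < NOBJ; i++) {
                int expected = 1;
                /* Only an idle object (refcount == 1) may be dropped to 0. */
                if (!atomic_compare_exchange_strong(&refcnt[i], &expected, 0)) {
                        failed = true;
                        break;
                }
        }

        for (i = 0; i < NOBJ; i++) {
                if (failed) {
                        /* Roll back any counter we managed to clear. */
                        if (atomic_load(&refcnt[i]) == 0)
                                atomic_store(&refcnt[i], 1);
                } else {
                        removed[i] = true;      /* commit, like setting CSS_REMOVED */
                }
        }
        return !failed;
}

int main(void)
{
        printf("clear_refs: %s\n", clear_refs() ? "committed" : "rolled back");
        return 0;
}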
3943 3941
3944 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 3942 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3945 { 3943 {
3946 struct cgroup *cgrp = dentry->d_fsdata; 3944 struct cgroup *cgrp = dentry->d_fsdata;
3947 struct dentry *d; 3945 struct dentry *d;
3948 struct cgroup *parent; 3946 struct cgroup *parent;
3949 DEFINE_WAIT(wait); 3947 DEFINE_WAIT(wait);
3950 struct cgroup_event *event, *tmp; 3948 struct cgroup_event *event, *tmp;
3951 int ret; 3949 int ret;
3952 3950
3953 /* the vfs holds both inode->i_mutex already */ 3951 /* the vfs holds both inode->i_mutex already */
3954 again: 3952 again:
3955 mutex_lock(&cgroup_mutex); 3953 mutex_lock(&cgroup_mutex);
3956 if (atomic_read(&cgrp->count) != 0) { 3954 if (atomic_read(&cgrp->count) != 0) {
3957 mutex_unlock(&cgroup_mutex); 3955 mutex_unlock(&cgroup_mutex);
3958 return -EBUSY; 3956 return -EBUSY;
3959 } 3957 }
3960 if (!list_empty(&cgrp->children)) { 3958 if (!list_empty(&cgrp->children)) {
3961 mutex_unlock(&cgroup_mutex); 3959 mutex_unlock(&cgroup_mutex);
3962 return -EBUSY; 3960 return -EBUSY;
3963 } 3961 }
3964 mutex_unlock(&cgroup_mutex); 3962 mutex_unlock(&cgroup_mutex);
3965 3963
3966 /* 3964 /*
3967 * In general, subsystem has no css->refcnt after pre_destroy(). But 3965 * In general, subsystem has no css->refcnt after pre_destroy(). But
3968 * in racy cases, subsystem may have to get css->refcnt after 3966 * in racy cases, subsystem may have to get css->refcnt after
3969 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes 3967 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
3970 * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue 3968 * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue
3971 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir 3969 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
3972 * and subsystem's reference count handling. Please see css_get/put 3970 * and subsystem's reference count handling. Please see css_get/put
3973 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. 3971 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
3974 */ 3972 */
3975 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 3973 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3976 3974
3977 /* 3975 /*
3978 * Call pre_destroy handlers of subsys. Notify subsystems 3976 * Call pre_destroy handlers of subsys. Notify subsystems
3979 * that an rmdir() request has arrived. 3977 * that an rmdir() request has arrived.
3980 */ 3978 */
3981 ret = cgroup_call_pre_destroy(cgrp); 3979 ret = cgroup_call_pre_destroy(cgrp);
3982 if (ret) { 3980 if (ret) {
3983 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 3981 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3984 return ret; 3982 return ret;
3985 } 3983 }
3986 3984
3987 mutex_lock(&cgroup_mutex); 3985 mutex_lock(&cgroup_mutex);
3988 parent = cgrp->parent; 3986 parent = cgrp->parent;
3989 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 3987 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
3990 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 3988 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3991 mutex_unlock(&cgroup_mutex); 3989 mutex_unlock(&cgroup_mutex);
3992 return -EBUSY; 3990 return -EBUSY;
3993 } 3991 }
3994 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 3992 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
3995 if (!cgroup_clear_css_refs(cgrp)) { 3993 if (!cgroup_clear_css_refs(cgrp)) {
3996 mutex_unlock(&cgroup_mutex); 3994 mutex_unlock(&cgroup_mutex);
3997 /* 3995 /*
3998 * Because someone may call cgroup_wakeup_rmdir_waiter() before 3996 * Because someone may call cgroup_wakeup_rmdir_waiter() before
3999 * prepare_to_wait(), we need to check this flag. 3997 * prepare_to_wait(), we need to check this flag.
4000 */ 3998 */
4001 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) 3999 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4002 schedule(); 4000 schedule();
4003 finish_wait(&cgroup_rmdir_waitq, &wait); 4001 finish_wait(&cgroup_rmdir_waitq, &wait);
4004 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4002 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4005 if (signal_pending(current)) 4003 if (signal_pending(current))
4006 return -EINTR; 4004 return -EINTR;
4007 goto again; 4005 goto again;
4008 } 4006 }
4009 /* NO css_tryget() can succeed after this point. */ 4007 /* NO css_tryget() can succeed after this point. */
4010 finish_wait(&cgroup_rmdir_waitq, &wait); 4008 finish_wait(&cgroup_rmdir_waitq, &wait);
4011 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4009 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4012 4010
4013 spin_lock(&release_list_lock); 4011 spin_lock(&release_list_lock);
4014 set_bit(CGRP_REMOVED, &cgrp->flags); 4012 set_bit(CGRP_REMOVED, &cgrp->flags);
4015 if (!list_empty(&cgrp->release_list)) 4013 if (!list_empty(&cgrp->release_list))
4016 list_del_init(&cgrp->release_list); 4014 list_del_init(&cgrp->release_list);
4017 spin_unlock(&release_list_lock); 4015 spin_unlock(&release_list_lock);
4018 4016
4019 cgroup_lock_hierarchy(cgrp->root); 4017 cgroup_lock_hierarchy(cgrp->root);
4020 /* delete this cgroup from parent->children */ 4018 /* delete this cgroup from parent->children */
4021 list_del_init(&cgrp->sibling); 4019 list_del_init(&cgrp->sibling);
4022 cgroup_unlock_hierarchy(cgrp->root); 4020 cgroup_unlock_hierarchy(cgrp->root);
4023 4021
4024 d = dget(cgrp->dentry); 4022 d = dget(cgrp->dentry);
4025 4023
4026 cgroup_d_remove_dir(d); 4024 cgroup_d_remove_dir(d);
4027 dput(d); 4025 dput(d);
4028 4026
4029 set_bit(CGRP_RELEASABLE, &parent->flags); 4027 set_bit(CGRP_RELEASABLE, &parent->flags);
4030 check_for_release(parent); 4028 check_for_release(parent);
4031 4029
4032 /* 4030 /*
4033 * Unregister events and notify userspace. 4031 * Unregister events and notify userspace.
4034 * Notify userspace about cgroup removal only after rmdir of the cgroup 4032 * Notify userspace about cgroup removal only after rmdir of the cgroup
4035 * directory, to avoid a race between userspace and kernelspace 4033 * directory, to avoid a race between userspace and kernelspace
4036 */ 4034 */
4037 spin_lock(&cgrp->event_list_lock); 4035 spin_lock(&cgrp->event_list_lock);
4038 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4036 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4039 list_del(&event->list); 4037 list_del(&event->list);
4040 remove_wait_queue(event->wqh, &event->wait); 4038 remove_wait_queue(event->wqh, &event->wait);
4041 eventfd_signal(event->eventfd, 1); 4039 eventfd_signal(event->eventfd, 1);
4042 schedule_work(&event->remove); 4040 schedule_work(&event->remove);
4043 } 4041 }
4044 spin_unlock(&cgrp->event_list_lock); 4042 spin_unlock(&cgrp->event_list_lock);
4045 4043
4046 mutex_unlock(&cgroup_mutex); 4044 mutex_unlock(&cgroup_mutex);
4047 return 0; 4045 return 0;
4048 } 4046 }
4049 4047
4050 static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4048 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4051 { 4049 {
4052 struct cgroup_subsys_state *css; 4050 struct cgroup_subsys_state *css;
4053 4051
4054 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4052 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4055 4053
4056 /* Create the top cgroup state for this subsystem */ 4054 /* Create the top cgroup state for this subsystem */
4057 list_add(&ss->sibling, &rootnode.subsys_list); 4055 list_add(&ss->sibling, &rootnode.subsys_list);
4058 ss->root = &rootnode; 4056 ss->root = &rootnode;
4059 css = ss->create(ss, dummytop); 4057 css = ss->create(ss, dummytop);
4060 /* We don't handle early failures gracefully */ 4058 /* We don't handle early failures gracefully */
4061 BUG_ON(IS_ERR(css)); 4059 BUG_ON(IS_ERR(css));
4062 init_cgroup_css(css, ss, dummytop); 4060 init_cgroup_css(css, ss, dummytop);
4063 4061
4064 /* Update the init_css_set to contain a subsys 4062 /* Update the init_css_set to contain a subsys
4065 * pointer to this state - since the subsystem is 4063 * pointer to this state - since the subsystem is
4066 * newly registered, all tasks and hence the 4064 * newly registered, all tasks and hence the
4067 * init_css_set is in the subsystem's top cgroup. */ 4065 * init_css_set is in the subsystem's top cgroup. */
4068 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4066 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
4069 4067
4070 need_forkexit_callback |= ss->fork || ss->exit; 4068 need_forkexit_callback |= ss->fork || ss->exit;
4071 4069
4072 /* At system boot, before all subsystems have been 4070 /* At system boot, before all subsystems have been
4073 * registered, no tasks have been forked, so we don't 4071 * registered, no tasks have been forked, so we don't
4074 * need to invoke fork callbacks here. */ 4072 * need to invoke fork callbacks here. */
4075 BUG_ON(!list_empty(&init_task.tasks)); 4073 BUG_ON(!list_empty(&init_task.tasks));
4076 4074
4077 mutex_init(&ss->hierarchy_mutex); 4075 mutex_init(&ss->hierarchy_mutex);
4078 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 4076 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4079 ss->active = 1; 4077 ss->active = 1;
4080 4078
4081 /* this function shouldn't be used with modular subsystems, since they 4079 /* this function shouldn't be used with modular subsystems, since they
4082 * need to register a subsys_id, among other things */ 4080 * need to register a subsys_id, among other things */
4083 BUG_ON(ss->module); 4081 BUG_ON(ss->module);
4084 } 4082 }
4085 4083
4086 /** 4084 /**
4087 * cgroup_load_subsys: load and register a modular subsystem at runtime 4085 * cgroup_load_subsys: load and register a modular subsystem at runtime
4088 * @ss: the subsystem to load 4086 * @ss: the subsystem to load
4089 * 4087 *
4090 * This function should be called in a modular subsystem's initcall. If the 4088 * This function should be called in a modular subsystem's initcall. If the
4091 * subsystem is built as a module, it will be assigned a new subsys_id and set 4089 * subsystem is built as a module, it will be assigned a new subsys_id and set
4092 * up for use. If the subsystem is built-in anyway, work is delegated to the 4090 * up for use. If the subsystem is built-in anyway, work is delegated to the
4093 * simpler cgroup_init_subsys. 4091 * simpler cgroup_init_subsys.
4094 */ 4092 */
4095 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4093 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4096 { 4094 {
4097 int i; 4095 int i;
4098 struct cgroup_subsys_state *css; 4096 struct cgroup_subsys_state *css;
4099 4097
4100 /* check name and function validity */ 4098 /* check name and function validity */
4101 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4099 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4102 ss->create == NULL || ss->destroy == NULL) 4100 ss->create == NULL || ss->destroy == NULL)
4103 return -EINVAL; 4101 return -EINVAL;
4104 4102
4105 /* 4103 /*
4106 * we don't support callbacks in modular subsystems. this check is 4104 * we don't support callbacks in modular subsystems. this check is
4107 * before the ss->module check for consistency; a subsystem that could 4105 * before the ss->module check for consistency; a subsystem that could
4108 * be a module should still have no callbacks even if the user isn't 4106 * be a module should still have no callbacks even if the user isn't
4109 * compiling it as one. 4107 * compiling it as one.
4110 */ 4108 */
4111 if (ss->fork || ss->exit) 4109 if (ss->fork || ss->exit)
4112 return -EINVAL; 4110 return -EINVAL;
4113 4111
4114 /* 4112 /*
4115 * an optionally modular subsystem is built-in: we want to do nothing, 4113 * an optionally modular subsystem is built-in: we want to do nothing,
4116 * since cgroup_init_subsys will have already taken care of it. 4114 * since cgroup_init_subsys will have already taken care of it.
4117 */ 4115 */
4118 if (ss->module == NULL) { 4116 if (ss->module == NULL) {
4119 /* a few sanity checks */ 4117 /* a few sanity checks */
4120 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); 4118 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4121 BUG_ON(subsys[ss->subsys_id] != ss); 4119 BUG_ON(subsys[ss->subsys_id] != ss);
4122 return 0; 4120 return 0;
4123 } 4121 }
4124 4122
4125 /* 4123 /*
4126 * need to register a subsys id before anything else - for example, 4124 * need to register a subsys id before anything else - for example,
4127 * init_cgroup_css needs it. 4125 * init_cgroup_css needs it.
4128 */ 4126 */
4129 mutex_lock(&cgroup_mutex); 4127 mutex_lock(&cgroup_mutex);
4130 /* find the first empty slot in the array */ 4128 /* find the first empty slot in the array */
4131 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 4129 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4132 if (subsys[i] == NULL) 4130 if (subsys[i] == NULL)
4133 break; 4131 break;
4134 } 4132 }
4135 if (i == CGROUP_SUBSYS_COUNT) { 4133 if (i == CGROUP_SUBSYS_COUNT) {
4136 /* maximum number of subsystems already registered! */ 4134 /* maximum number of subsystems already registered! */
4137 mutex_unlock(&cgroup_mutex); 4135 mutex_unlock(&cgroup_mutex);
4138 return -EBUSY; 4136 return -EBUSY;
4139 } 4137 }
4140 /* assign ourselves the subsys_id */ 4138 /* assign ourselves the subsys_id */
4141 ss->subsys_id = i; 4139 ss->subsys_id = i;
4142 subsys[i] = ss; 4140 subsys[i] = ss;
4143 4141
4144 /* 4142 /*
4145 * no ss->create seems to need anything important in the ss struct, so 4143 * no ss->create seems to need anything important in the ss struct, so
4146 * this can happen first (i.e. before the rootnode attachment). 4144 * this can happen first (i.e. before the rootnode attachment).
4147 */ 4145 */
4148 css = ss->create(ss, dummytop); 4146 css = ss->create(ss, dummytop);
4149 if (IS_ERR(css)) { 4147 if (IS_ERR(css)) {
4150 /* failure case - need to deassign the subsys[] slot. */ 4148 /* failure case - need to deassign the subsys[] slot. */
4151 subsys[i] = NULL; 4149 subsys[i] = NULL;
4152 mutex_unlock(&cgroup_mutex); 4150 mutex_unlock(&cgroup_mutex);
4153 return PTR_ERR(css); 4151 return PTR_ERR(css);
4154 } 4152 }
4155 4153
4156 list_add(&ss->sibling, &rootnode.subsys_list); 4154 list_add(&ss->sibling, &rootnode.subsys_list);
4157 ss->root = &rootnode; 4155 ss->root = &rootnode;
4158 4156
4159 /* our new subsystem will be attached to the dummy hierarchy. */ 4157 /* our new subsystem will be attached to the dummy hierarchy. */
4160 init_cgroup_css(css, ss, dummytop); 4158 init_cgroup_css(css, ss, dummytop);
4161 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4159 /* init_idr must be after init_cgroup_css because it sets css->id. */
4162 if (ss->use_id) { 4160 if (ss->use_id) {
4163 int ret = cgroup_init_idr(ss, css); 4161 int ret = cgroup_init_idr(ss, css);
4164 if (ret) { 4162 if (ret) {
4165 dummytop->subsys[ss->subsys_id] = NULL; 4163 dummytop->subsys[ss->subsys_id] = NULL;
4166 ss->destroy(ss, dummytop); 4164 ss->destroy(ss, dummytop);
4167 subsys[i] = NULL; 4165 subsys[i] = NULL;
4168 mutex_unlock(&cgroup_mutex); 4166 mutex_unlock(&cgroup_mutex);
4169 return ret; 4167 return ret;
4170 } 4168 }
4171 } 4169 }
4172 4170
4173 /* 4171 /*
4174 * Now we need to entangle the css into the existing css_sets. unlike 4172 * Now we need to entangle the css into the existing css_sets. unlike
4175 * in cgroup_init_subsys, there are now multiple css_sets, so each one 4173 * in cgroup_init_subsys, there are now multiple css_sets, so each one
4176 * will need a new pointer to it; done by iterating the css_set_table. 4174 * will need a new pointer to it; done by iterating the css_set_table.
4177 * furthermore, modifying the existing css_sets will corrupt the hash 4175 * furthermore, modifying the existing css_sets will corrupt the hash
4178 * table state, so each changed css_set will need its hash recomputed. 4176 * table state, so each changed css_set will need its hash recomputed.
4179 * this is all done under the css_set_lock. 4177 * this is all done under the css_set_lock.
4180 */ 4178 */
4181 write_lock(&css_set_lock); 4179 write_lock(&css_set_lock);
4182 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4180 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
4183 struct css_set *cg; 4181 struct css_set *cg;
4184 struct hlist_node *node, *tmp; 4182 struct hlist_node *node, *tmp;
4185 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4183 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
4186 4184
4187 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4185 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
4188 /* skip entries that we already rehashed */ 4186 /* skip entries that we already rehashed */
4189 if (cg->subsys[ss->subsys_id]) 4187 if (cg->subsys[ss->subsys_id])
4190 continue; 4188 continue;
4191 /* remove existing entry */ 4189 /* remove existing entry */
4192 hlist_del(&cg->hlist); 4190 hlist_del(&cg->hlist);
4193 /* set new value */ 4191 /* set new value */
4194 cg->subsys[ss->subsys_id] = css; 4192 cg->subsys[ss->subsys_id] = css;
4195 /* recompute hash and restore entry */ 4193 /* recompute hash and restore entry */
4196 new_bucket = css_set_hash(cg->subsys); 4194 new_bucket = css_set_hash(cg->subsys);
4197 hlist_add_head(&cg->hlist, new_bucket); 4195 hlist_add_head(&cg->hlist, new_bucket);
4198 } 4196 }
4199 } 4197 }
4200 write_unlock(&css_set_lock); 4198 write_unlock(&css_set_lock);
4201 4199
4202 mutex_init(&ss->hierarchy_mutex); 4200 mutex_init(&ss->hierarchy_mutex);
4203 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 4201 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4204 ss->active = 1; 4202 ss->active = 1;
4205 4203
4206 /* success! */ 4204 /* success! */
4207 mutex_unlock(&cgroup_mutex); 4205 mutex_unlock(&cgroup_mutex);
4208 return 0; 4206 return 0;
4209 } 4207 }
4210 EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4208 EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4211 4209
4212 /** 4210 /**
4213 * cgroup_unload_subsys: unload a modular subsystem 4211 * cgroup_unload_subsys: unload a modular subsystem
4214 * @ss: the subsystem to unload 4212 * @ss: the subsystem to unload
4215 * 4213 *
4216 * This function should be called in a modular subsystem's exitcall. When this 4214 * This function should be called in a modular subsystem's exitcall. When this
4217 * function is invoked, the refcount on the subsystem's module will be 0, so 4215 * function is invoked, the refcount on the subsystem's module will be 0, so
4218 * the subsystem will not be attached to any hierarchy. 4216 * the subsystem will not be attached to any hierarchy.
4219 */ 4217 */
4220 void cgroup_unload_subsys(struct cgroup_subsys *ss) 4218 void cgroup_unload_subsys(struct cgroup_subsys *ss)
4221 { 4219 {
4222 struct cg_cgroup_link *link; 4220 struct cg_cgroup_link *link;
4223 struct hlist_head *hhead; 4221 struct hlist_head *hhead;
4224 4222
4225 BUG_ON(ss->module == NULL); 4223 BUG_ON(ss->module == NULL);
4226 4224
4227 /* 4225 /*
4228 * we shouldn't be called if the subsystem is in use, and the use of 4226 * we shouldn't be called if the subsystem is in use, and the use of
4229 * try_module_get in parse_cgroupfs_options should ensure that it 4227 * try_module_get in parse_cgroupfs_options should ensure that it
4230 * doesn't start being used while we're killing it off. 4228 * doesn't start being used while we're killing it off.
4231 */ 4229 */
4232 BUG_ON(ss->root != &rootnode); 4230 BUG_ON(ss->root != &rootnode);
4233 4231
4234 mutex_lock(&cgroup_mutex); 4232 mutex_lock(&cgroup_mutex);
4235 /* deassign the subsys_id */ 4233 /* deassign the subsys_id */
4236 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); 4234 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4237 subsys[ss->subsys_id] = NULL; 4235 subsys[ss->subsys_id] = NULL;
4238 4236
4239 /* remove subsystem from rootnode's list of subsystems */ 4237 /* remove subsystem from rootnode's list of subsystems */
4240 list_del_init(&ss->sibling); 4238 list_del_init(&ss->sibling);
4241 4239
4242 /* 4240 /*
4243 * disentangle the css from all css_sets attached to the dummytop. as 4241 * disentangle the css from all css_sets attached to the dummytop. as
4244 * in loading, we need to pay our respects to the hashtable gods. 4242 * in loading, we need to pay our respects to the hashtable gods.
4245 */ 4243 */
4246 write_lock(&css_set_lock); 4244 write_lock(&css_set_lock);
4247 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4245 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4248 struct css_set *cg = link->cg; 4246 struct css_set *cg = link->cg;
4249 4247
4250 hlist_del(&cg->hlist); 4248 hlist_del(&cg->hlist);
4251 BUG_ON(!cg->subsys[ss->subsys_id]); 4249 BUG_ON(!cg->subsys[ss->subsys_id]);
4252 cg->subsys[ss->subsys_id] = NULL; 4250 cg->subsys[ss->subsys_id] = NULL;
4253 hhead = css_set_hash(cg->subsys); 4251 hhead = css_set_hash(cg->subsys);
4254 hlist_add_head(&cg->hlist, hhead); 4252 hlist_add_head(&cg->hlist, hhead);
4255 } 4253 }
4256 write_unlock(&css_set_lock); 4254 write_unlock(&css_set_lock);
4257 4255
4258 /* 4256 /*
4259 * remove subsystem's css from the dummytop and free it - need to free 4257 * remove subsystem's css from the dummytop and free it - need to free
4260 * before marking as null because ss->destroy needs the cgrp->subsys 4258 * before marking as null because ss->destroy needs the cgrp->subsys
4261 * pointer to find their state. note that this also takes care of 4259 * pointer to find their state. note that this also takes care of
4262 * freeing the css_id. 4260 * freeing the css_id.
4263 */ 4261 */
4264 ss->destroy(ss, dummytop); 4262 ss->destroy(ss, dummytop);
4265 dummytop->subsys[ss->subsys_id] = NULL; 4263 dummytop->subsys[ss->subsys_id] = NULL;
4266 4264
4267 mutex_unlock(&cgroup_mutex); 4265 mutex_unlock(&cgroup_mutex);
4268 } 4266 }
4269 EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4267 EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
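
The two exported helpers above define the whole lifecycle of a modular subsystem: assign a dynamic subsys_id and attach to the dummy hierarchy on load, detach and destroy on unload. Below is a minimal hypothetical module following that contract; every example_* name is an assumption, and as documented above a modular subsystem must not provide ->fork or ->exit.

/*
 * Hypothetical minimal modular subsystem using cgroup_load_subsys() and
 * cgroup_unload_subsys() as documented above.
 */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

struct example_state {
        struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *example_create(struct cgroup_subsys *ss,
                                                  struct cgroup *cgrp)
{
        struct example_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

        if (!st)
                return ERR_PTR(-ENOMEM);
        return &st->css;
}

static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        kfree(container_of(cgrp->subsys[ss->subsys_id],
                           struct example_state, css));
}

static struct cgroup_subsys example_subsys = {
        .name    = "example",
        .create  = example_create,
        .destroy = example_destroy,
        .module  = THIS_MODULE,
        /* no ->fork/->exit: cgroup_load_subsys() returns -EINVAL for those */
};

static int __init example_init(void)
{
        /* Assigns a dynamic subsys_id and attaches us to the dummy hierarchy. */
        return cgroup_load_subsys(&example_subsys);
}
module_init(example_init);

static void __exit example_exit(void)
{
        cgroup_unload_subsys(&example_subsys);
}
module_exit(example_exit);

MODULE_LICENSE("GPL");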
4270 4268
4271 /** 4269 /**
4272 * cgroup_init_early - cgroup initialization at system boot 4270 * cgroup_init_early - cgroup initialization at system boot
4273 * 4271 *
4274 * Initialize cgroups at system boot, and initialize any 4272 * Initialize cgroups at system boot, and initialize any
4275 * subsystems that request early init. 4273 * subsystems that request early init.
4276 */ 4274 */
4277 int __init cgroup_init_early(void) 4275 int __init cgroup_init_early(void)
4278 { 4276 {
4279 int i; 4277 int i;
4280 atomic_set(&init_css_set.refcount, 1); 4278 atomic_set(&init_css_set.refcount, 1);
4281 INIT_LIST_HEAD(&init_css_set.cg_links); 4279 INIT_LIST_HEAD(&init_css_set.cg_links);
4282 INIT_LIST_HEAD(&init_css_set.tasks); 4280 INIT_LIST_HEAD(&init_css_set.tasks);
4283 INIT_HLIST_NODE(&init_css_set.hlist); 4281 INIT_HLIST_NODE(&init_css_set.hlist);
4284 css_set_count = 1; 4282 css_set_count = 1;
4285 init_cgroup_root(&rootnode); 4283 init_cgroup_root(&rootnode);
4286 root_count = 1; 4284 root_count = 1;
4287 init_task.cgroups = &init_css_set; 4285 init_task.cgroups = &init_css_set;
4288 4286
4289 init_css_set_link.cg = &init_css_set; 4287 init_css_set_link.cg = &init_css_set;
4290 init_css_set_link.cgrp = dummytop; 4288 init_css_set_link.cgrp = dummytop;
4291 list_add(&init_css_set_link.cgrp_link_list, 4289 list_add(&init_css_set_link.cgrp_link_list,
4292 &rootnode.top_cgroup.css_sets); 4290 &rootnode.top_cgroup.css_sets);
4293 list_add(&init_css_set_link.cg_link_list, 4291 list_add(&init_css_set_link.cg_link_list,
4294 &init_css_set.cg_links); 4292 &init_css_set.cg_links);
4295 4293
4296 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4294 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4297 INIT_HLIST_HEAD(&css_set_table[i]); 4295 INIT_HLIST_HEAD(&css_set_table[i]);
4298 4296
4299 /* at bootup time, we don't worry about modular subsystems */ 4297 /* at bootup time, we don't worry about modular subsystems */
4300 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4298 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4301 struct cgroup_subsys *ss = subsys[i]; 4299 struct cgroup_subsys *ss = subsys[i];
4302 4300
4303 BUG_ON(!ss->name); 4301 BUG_ON(!ss->name);
4304 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4302 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4305 BUG_ON(!ss->create); 4303 BUG_ON(!ss->create);
4306 BUG_ON(!ss->destroy); 4304 BUG_ON(!ss->destroy);
4307 if (ss->subsys_id != i) { 4305 if (ss->subsys_id != i) {
4308 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4306 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4309 ss->name, ss->subsys_id); 4307 ss->name, ss->subsys_id);
4310 BUG(); 4308 BUG();
4311 } 4309 }
4312 4310
4313 if (ss->early_init) 4311 if (ss->early_init)
4314 cgroup_init_subsys(ss); 4312 cgroup_init_subsys(ss);
4315 } 4313 }
4316 return 0; 4314 return 0;
4317 } 4315 }
4318 4316
4319 /** 4317 /**
4320 * cgroup_init - cgroup initialization 4318 * cgroup_init - cgroup initialization
4321 * 4319 *
4322 * Register cgroup filesystem and /proc file, and initialize 4320 * Register cgroup filesystem and /proc file, and initialize
4323 * any subsystems that didn't request early init. 4321 * any subsystems that didn't request early init.
4324 */ 4322 */
4325 int __init cgroup_init(void) 4323 int __init cgroup_init(void)
4326 { 4324 {
4327 int err; 4325 int err;
4328 int i; 4326 int i;
4329 struct hlist_head *hhead; 4327 struct hlist_head *hhead;
4330 4328
4331 err = bdi_init(&cgroup_backing_dev_info); 4329 err = bdi_init(&cgroup_backing_dev_info);
4332 if (err) 4330 if (err)
4333 return err; 4331 return err;
4334 4332
4335 /* at bootup time, we don't worry about modular subsystems */ 4333 /* at bootup time, we don't worry about modular subsystems */
4336 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4334 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4337 struct cgroup_subsys *ss = subsys[i]; 4335 struct cgroup_subsys *ss = subsys[i];
4338 if (!ss->early_init) 4336 if (!ss->early_init)
4339 cgroup_init_subsys(ss); 4337 cgroup_init_subsys(ss);
4340 if (ss->use_id) 4338 if (ss->use_id)
4341 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4339 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4342 } 4340 }
4343 4341
4344 /* Add init_css_set to the hash table */ 4342 /* Add init_css_set to the hash table */
4345 hhead = css_set_hash(init_css_set.subsys); 4343 hhead = css_set_hash(init_css_set.subsys);
4346 hlist_add_head(&init_css_set.hlist, hhead); 4344 hlist_add_head(&init_css_set.hlist, hhead);
4347 BUG_ON(!init_root_id(&rootnode)); 4345 BUG_ON(!init_root_id(&rootnode));
4348 4346
4349 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4347 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4350 if (!cgroup_kobj) { 4348 if (!cgroup_kobj) {
4351 err = -ENOMEM; 4349 err = -ENOMEM;
4352 goto out; 4350 goto out;
4353 } 4351 }
4354 4352
4355 err = register_filesystem(&cgroup_fs_type); 4353 err = register_filesystem(&cgroup_fs_type);
4356 if (err < 0) { 4354 if (err < 0) {
4357 kobject_put(cgroup_kobj); 4355 kobject_put(cgroup_kobj);
4358 goto out; 4356 goto out;
4359 } 4357 }
4360 4358
4361 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4359 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4362 4360
4363 out: 4361 out:
4364 if (err) 4362 if (err)
4365 bdi_destroy(&cgroup_backing_dev_info); 4363 bdi_destroy(&cgroup_backing_dev_info);
4366 4364
4367 return err; 4365 return err;
4368 } 4366 }
4369 4367
4370 /* 4368 /*
4371 * proc_cgroup_show() 4369 * proc_cgroup_show()
4372 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4370 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4373 * - Used for /proc/<pid>/cgroup. 4371 * - Used for /proc/<pid>/cgroup.
4374 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it 4372 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4375 * doesn't really matter if tsk->cgroup changes after we read it, 4373 * doesn't really matter if tsk->cgroup changes after we read it,
4376 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it 4374 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4377 * anyway. No need to check that tsk->cgroup != NULL, thanks to 4375 * anyway. No need to check that tsk->cgroup != NULL, thanks to
4378 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks 4376 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
4379 * cgroup to top_cgroup. 4377 * cgroup to top_cgroup.
4380 */ 4378 */
4381 4379
4382 /* TODO: Use a proper seq_file iterator */ 4380 /* TODO: Use a proper seq_file iterator */
4383 static int proc_cgroup_show(struct seq_file *m, void *v) 4381 static int proc_cgroup_show(struct seq_file *m, void *v)
4384 { 4382 {
4385 struct pid *pid; 4383 struct pid *pid;
4386 struct task_struct *tsk; 4384 struct task_struct *tsk;
4387 char *buf; 4385 char *buf;
4388 int retval; 4386 int retval;
4389 struct cgroupfs_root *root; 4387 struct cgroupfs_root *root;
4390 4388
4391 retval = -ENOMEM; 4389 retval = -ENOMEM;
4392 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4390 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4393 if (!buf) 4391 if (!buf)
4394 goto out; 4392 goto out;
4395 4393
4396 retval = -ESRCH; 4394 retval = -ESRCH;
4397 pid = m->private; 4395 pid = m->private;
4398 tsk = get_pid_task(pid, PIDTYPE_PID); 4396 tsk = get_pid_task(pid, PIDTYPE_PID);
4399 if (!tsk) 4397 if (!tsk)
4400 goto out_free; 4398 goto out_free;
4401 4399
4402 retval = 0; 4400 retval = 0;
4403 4401
4404 mutex_lock(&cgroup_mutex); 4402 mutex_lock(&cgroup_mutex);
4405 4403
4406 for_each_active_root(root) { 4404 for_each_active_root(root) {
4407 struct cgroup_subsys *ss; 4405 struct cgroup_subsys *ss;
4408 struct cgroup *cgrp; 4406 struct cgroup *cgrp;
4409 int count = 0; 4407 int count = 0;
4410 4408
4411 seq_printf(m, "%d:", root->hierarchy_id); 4409 seq_printf(m, "%d:", root->hierarchy_id);
4412 for_each_subsys(root, ss) 4410 for_each_subsys(root, ss)
4413 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4411 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4414 if (strlen(root->name)) 4412 if (strlen(root->name))
4415 seq_printf(m, "%sname=%s", count ? "," : "", 4413 seq_printf(m, "%sname=%s", count ? "," : "",
4416 root->name); 4414 root->name);
4417 seq_putc(m, ':'); 4415 seq_putc(m, ':');
4418 cgrp = task_cgroup_from_root(tsk, root); 4416 cgrp = task_cgroup_from_root(tsk, root);
4419 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4417 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
4420 if (retval < 0) 4418 if (retval < 0)
4421 goto out_unlock; 4419 goto out_unlock;
4422 seq_puts(m, buf); 4420 seq_puts(m, buf);
4423 seq_putc(m, '\n'); 4421 seq_putc(m, '\n');
4424 } 4422 }
4425 4423
4426 out_unlock: 4424 out_unlock:
4427 mutex_unlock(&cgroup_mutex); 4425 mutex_unlock(&cgroup_mutex);
4428 put_task_struct(tsk); 4426 put_task_struct(tsk);
4429 out_free: 4427 out_free:
4430 kfree(buf); 4428 kfree(buf);
4431 out: 4429 out:
4432 return retval; 4430 return retval;
4433 } 4431 }
4434 4432
4435 static int cgroup_open(struct inode *inode, struct file *file) 4433 static int cgroup_open(struct inode *inode, struct file *file)
4436 { 4434 {
4437 struct pid *pid = PROC_I(inode)->pid; 4435 struct pid *pid = PROC_I(inode)->pid;
4438 return single_open(file, proc_cgroup_show, pid); 4436 return single_open(file, proc_cgroup_show, pid);
4439 } 4437 }
4440 4438
4441 const struct file_operations proc_cgroup_operations = { 4439 const struct file_operations proc_cgroup_operations = {
4442 .open = cgroup_open, 4440 .open = cgroup_open,
4443 .read = seq_read, 4441 .read = seq_read,
4444 .llseek = seq_lseek, 4442 .llseek = seq_lseek,
4445 .release = single_release, 4443 .release = single_release,
4446 }; 4444 };
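
proc_cgroup_show() above emits one line per mounted hierarchy in the form "<hierarchy-id>:<subsys,list[,name=...]>:<path>". A small userspace sketch that reads it back; the parsing approach is an assumption and expects a non-empty controller field.

/*
 * Minimal sketch: read /proc/self/cgroup, formatted by proc_cgroup_show().
 */
#include <stdio.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f)
                return 1;

        while (fgets(line, sizeof(line), f)) {
                int id;
                char subsys[256], path[3584];

                /* Three colon-separated fields; the path may be just "/". */
                if (sscanf(line, "%d:%255[^:]:%3583[^\n]", &id, subsys, path) == 3)
                        printf("hierarchy %d (%s) -> %s\n", id, subsys, path);
        }
        fclose(f);
        return 0;
}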
4447 4445
4448 /* Display information about each subsystem and each hierarchy */ 4446 /* Display information about each subsystem and each hierarchy */
4449 static int proc_cgroupstats_show(struct seq_file *m, void *v) 4447 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4450 { 4448 {
4451 int i; 4449 int i;
4452 4450
4453 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 4451 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4454 /* 4452 /*
4455 * ideally we don't want subsystems moving around while we do this. 4453 * ideally we don't want subsystems moving around while we do this.
4456 * cgroup_mutex is also necessary to guarantee an atomic snapshot of 4454 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4457 * subsys/hierarchy state. 4455 * subsys/hierarchy state.
4458 */ 4456 */
4459 mutex_lock(&cgroup_mutex); 4457 mutex_lock(&cgroup_mutex);
4460 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4458 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4461 struct cgroup_subsys *ss = subsys[i]; 4459 struct cgroup_subsys *ss = subsys[i];
4462 if (ss == NULL) 4460 if (ss == NULL)
4463 continue; 4461 continue;
4464 seq_printf(m, "%s\t%d\t%d\t%d\n", 4462 seq_printf(m, "%s\t%d\t%d\t%d\n",
4465 ss->name, ss->root->hierarchy_id, 4463 ss->name, ss->root->hierarchy_id,
4466 ss->root->number_of_cgroups, !ss->disabled); 4464 ss->root->number_of_cgroups, !ss->disabled);
4467 } 4465 }
4468 mutex_unlock(&cgroup_mutex); 4466 mutex_unlock(&cgroup_mutex);
4469 return 0; 4467 return 0;
4470 } 4468 }
4471 4469
4472 static int cgroupstats_open(struct inode *inode, struct file *file) 4470 static int cgroupstats_open(struct inode *inode, struct file *file)
4473 { 4471 {
4474 return single_open(file, proc_cgroupstats_show, NULL); 4472 return single_open(file, proc_cgroupstats_show, NULL);
4475 } 4473 }
4476 4474
4477 static const struct file_operations proc_cgroupstats_operations = { 4475 static const struct file_operations proc_cgroupstats_operations = {
4478 .open = cgroupstats_open, 4476 .open = cgroupstats_open,
4479 .read = seq_read, 4477 .read = seq_read,
4480 .llseek = seq_lseek, 4478 .llseek = seq_lseek,
4481 .release = single_release, 4479 .release = single_release,
4482 }; 4480 };
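
Likewise, proc_cgroupstats_show() backs /proc/cgroups with one tab-separated row per registered subsystem under the header it prints above. A matching userspace sketch:

/*
 * Minimal sketch: read /proc/cgroups, whose columns follow the
 * "#subsys_name  hierarchy  num_cgroups  enabled" header printed above.
 */
#include <stdio.h>

int main(void)
{
        char header[256], name[64];
        int hier, ncgroups, enabled;
        FILE *f = fopen("/proc/cgroups", "r");

        if (!f)
                return 1;

        /* Skip the "#subsys_name ..." header line. */
        if (!fgets(header, sizeof(header), f)) {
                fclose(f);
                return 1;
        }

        while (fscanf(f, "%63s %d %d %d", name, &hier, &ncgroups, &enabled) == 4)
                printf("%s: hierarchy=%d cgroups=%d enabled=%d\n",
                       name, hier, ncgroups, enabled);
        fclose(f);
        return 0;
}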
4483 4481
4484 /** 4482 /**
4485 * cgroup_fork - attach newly forked task to its parent's cgroup. 4483 * cgroup_fork - attach newly forked task to its parent's cgroup.
4486 * @child: pointer to task_struct of forking parent process. 4484 * @child: pointer to task_struct of forking parent process.
4487 * 4485 *
4488 * Description: A task inherits its parent's cgroup at fork(). 4486 * Description: A task inherits its parent's cgroup at fork().
4489 * 4487 *
4490 * A pointer to the shared css_set was automatically copied in 4488 * A pointer to the shared css_set was automatically copied in
4491 * fork.c by dup_task_struct(). However, we ignore that copy, since 4489 * fork.c by dup_task_struct(). However, we ignore that copy, since
4492 * it was not made under the protection of RCU or cgroup_mutex, so 4490 * it was not made under the protection of RCU or cgroup_mutex, so
4493 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4491 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4494 * have already changed current->cgroups, allowing the previously 4492 * have already changed current->cgroups, allowing the previously
4495 * referenced cgroup group to be removed and freed. 4493 * referenced cgroup group to be removed and freed.
4496 * 4494 *
4497 * At the point that cgroup_fork() is called, 'current' is the parent 4495 * At the point that cgroup_fork() is called, 'current' is the parent
4498 * task, and the passed argument 'child' points to the child task. 4496 * task, and the passed argument 'child' points to the child task.
4499 */ 4497 */
4500 void cgroup_fork(struct task_struct *child) 4498 void cgroup_fork(struct task_struct *child)
4501 { 4499 {
4502 task_lock(current); 4500 task_lock(current);
4503 child->cgroups = current->cgroups; 4501 child->cgroups = current->cgroups;
4504 get_css_set(child->cgroups); 4502 get_css_set(child->cgroups);
4505 task_unlock(current); 4503 task_unlock(current);
4506 INIT_LIST_HEAD(&child->cg_list); 4504 INIT_LIST_HEAD(&child->cg_list);
4507 } 4505 }
4508 4506
4509 /** 4507 /**
4510 * cgroup_fork_callbacks - run fork callbacks 4508 * cgroup_fork_callbacks - run fork callbacks
4511 * @child: the new task 4509 * @child: the new task
4512 * 4510 *
4513 * Called on a new task very soon before adding it to the 4511 * Called on a new task very soon before adding it to the
4514 * tasklist. No need to take any locks since no-one can 4512 * tasklist. No need to take any locks since no-one can
4515 * be operating on this task. 4513 * be operating on this task.
4516 */ 4514 */
4517 void cgroup_fork_callbacks(struct task_struct *child) 4515 void cgroup_fork_callbacks(struct task_struct *child)
4518 { 4516 {
4519 if (need_forkexit_callback) { 4517 if (need_forkexit_callback) {
4520 int i; 4518 int i;
4521 /* 4519 /*
4522 * forkexit callbacks are only supported for builtin 4520 * forkexit callbacks are only supported for builtin
4523 * subsystems, and the builtin section of the subsys array is 4521 * subsystems, and the builtin section of the subsys array is
4524 * immutable, so we don't need to lock the subsys array here. 4522 * immutable, so we don't need to lock the subsys array here.
4525 */ 4523 */
4526 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4524 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4527 struct cgroup_subsys *ss = subsys[i]; 4525 struct cgroup_subsys *ss = subsys[i];
4528 if (ss->fork) 4526 if (ss->fork)
4529 ss->fork(ss, child); 4527 ss->fork(ss, child);
4530 } 4528 }
4531 } 4529 }
4532 } 4530 }
4533 4531
4534 /** 4532 /**
4535 * cgroup_post_fork - called on a new task after adding it to the task list 4533 * cgroup_post_fork - called on a new task after adding it to the task list
4536 * @child: the task in question 4534 * @child: the task in question
4537 * 4535 *
4538 * Adds the task to the list running through its css_set if necessary. 4536 * Adds the task to the list running through its css_set if necessary.
4539 * Has to be after the task is visible on the task list in case we race 4537 * Has to be after the task is visible on the task list in case we race
4540 * with the first call to cgroup_iter_start() - to guarantee that the 4538 * with the first call to cgroup_iter_start() - to guarantee that the
4541 * new task ends up on its list. 4539 * new task ends up on its list.
4542 */ 4540 */
4543 void cgroup_post_fork(struct task_struct *child) 4541 void cgroup_post_fork(struct task_struct *child)
4544 { 4542 {
4545 if (use_task_css_set_links) { 4543 if (use_task_css_set_links) {
4546 write_lock(&css_set_lock); 4544 write_lock(&css_set_lock);
4547 task_lock(child); 4545 task_lock(child);
4548 if (list_empty(&child->cg_list)) 4546 if (list_empty(&child->cg_list))
4549 list_add(&child->cg_list, &child->cgroups->tasks); 4547 list_add(&child->cg_list, &child->cgroups->tasks);
4550 task_unlock(child); 4548 task_unlock(child);
4551 write_unlock(&css_set_lock); 4549 write_unlock(&css_set_lock);
4552 } 4550 }
4553 } 4551 }
4554 /** 4552 /**
4555 * cgroup_exit - detach cgroup from exiting task 4553 * cgroup_exit - detach cgroup from exiting task
4556 * @tsk: pointer to task_struct of exiting process 4554 * @tsk: pointer to task_struct of exiting process
4557 * @run_callbacks: run exit callbacks? 4555 * @run_callbacks: run exit callbacks?
4558 * 4556 *
4559 * Description: Detach cgroup from @tsk and release it. 4557 * Description: Detach cgroup from @tsk and release it.
4560 * 4558 *
4561 * Note that cgroups marked notify_on_release force every task in 4559 * Note that cgroups marked notify_on_release force every task in
4562 * them to take the global cgroup_mutex when exiting. 4560 * them to take the global cgroup_mutex when exiting.
4563 * This could impact scaling on very large systems. Be reluctant to 4561 * This could impact scaling on very large systems. Be reluctant to
4564 * use notify_on_release cgroups where very high task exit scaling 4562 * use notify_on_release cgroups where very high task exit scaling
4565 * is required on large systems. 4563 * is required on large systems.
4566 * 4564 *
4567 * the_top_cgroup_hack: 4565 * the_top_cgroup_hack:
4568 * 4566 *
4569 * Set the exiting task's cgroup to the root cgroup (top_cgroup). 4567 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
4570 * 4568 *
4571 * We call cgroup_exit() while the task is still competent to 4569 * We call cgroup_exit() while the task is still competent to
4572 * handle notify_on_release(), then leave the task attached to the 4570 * handle notify_on_release(), then leave the task attached to the
4573 * root cgroup in each hierarchy for the remainder of its exit. 4571 * root cgroup in each hierarchy for the remainder of its exit.
4574 * 4572 *
4575 * To do this properly, we would increment the reference count on 4573 * To do this properly, we would increment the reference count on
4576 * top_cgroup, and near the very end of the kernel/exit.c do_exit() 4574 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
4577 * code we would add a second cgroup function call, to drop that 4575 * code we would add a second cgroup function call, to drop that
4578 * reference. This would just create an unnecessary hot spot on 4576 * reference. This would just create an unnecessary hot spot on
4579 * the top_cgroup reference count, to no avail. 4577 * the top_cgroup reference count, to no avail.
4580 * 4578 *
4581 * Normally, holding a reference to a cgroup without bumping its 4579 * Normally, holding a reference to a cgroup without bumping its
4582 * count is unsafe. The cgroup could go away, or someone could 4580 * count is unsafe. The cgroup could go away, or someone could
4583 * attach us to a different cgroup, decrementing the count on 4581 * attach us to a different cgroup, decrementing the count on
4584 * the first cgroup that we never incremented. But in this case, 4582 * the first cgroup that we never incremented. But in this case,
4585 * top_cgroup isn't going away, and either task has PF_EXITING set, 4583 * top_cgroup isn't going away, and either task has PF_EXITING set,
4586 * which wards off any cgroup_attach_task() attempts, or task is a failed 4584 * which wards off any cgroup_attach_task() attempts, or task is a failed
4587 * fork, never visible to cgroup_attach_task. 4585 * fork, never visible to cgroup_attach_task.
4588 */ 4586 */
4589 void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4587 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4590 { 4588 {
4591 struct css_set *cg; 4589 struct css_set *cg;
4592 int i; 4590 int i;
4593 4591
4594 /* 4592 /*
4595 * Unlink from the css_set task list if necessary. 4593 * Unlink from the css_set task list if necessary.
4596 * Optimistically check cg_list before taking 4594 * Optimistically check cg_list before taking
4597 * css_set_lock 4595 * css_set_lock
4598 */ 4596 */
4599 if (!list_empty(&tsk->cg_list)) { 4597 if (!list_empty(&tsk->cg_list)) {
4600 write_lock(&css_set_lock); 4598 write_lock(&css_set_lock);
4601 if (!list_empty(&tsk->cg_list)) 4599 if (!list_empty(&tsk->cg_list))
4602 list_del_init(&tsk->cg_list); 4600 list_del_init(&tsk->cg_list);
4603 write_unlock(&css_set_lock); 4601 write_unlock(&css_set_lock);
4604 } 4602 }
4605 4603
4606 /* Reassign the task to the init_css_set. */ 4604 /* Reassign the task to the init_css_set. */
4607 task_lock(tsk); 4605 task_lock(tsk);
4608 cg = tsk->cgroups; 4606 cg = tsk->cgroups;
4609 tsk->cgroups = &init_css_set; 4607 tsk->cgroups = &init_css_set;
4610 4608
4611 if (run_callbacks && need_forkexit_callback) { 4609 if (run_callbacks && need_forkexit_callback) {
4612 /* 4610 /*
4613 * modular subsystems can't use callbacks, so no need to lock 4611 * modular subsystems can't use callbacks, so no need to lock
4614 * the subsys array 4612 * the subsys array
4615 */ 4613 */
4616 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4614 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4617 struct cgroup_subsys *ss = subsys[i]; 4615 struct cgroup_subsys *ss = subsys[i];
4618 if (ss->exit) { 4616 if (ss->exit) {
4619 struct cgroup *old_cgrp = 4617 struct cgroup *old_cgrp =
4620 rcu_dereference_raw(cg->subsys[i])->cgroup; 4618 rcu_dereference_raw(cg->subsys[i])->cgroup;
4621 struct cgroup *cgrp = task_cgroup(tsk, i); 4619 struct cgroup *cgrp = task_cgroup(tsk, i);
4622 ss->exit(ss, cgrp, old_cgrp, tsk); 4620 ss->exit(ss, cgrp, old_cgrp, tsk);
4623 } 4621 }
4624 } 4622 }
4625 } 4623 }
4626 task_unlock(tsk); 4624 task_unlock(tsk);
4627 4625
4628 if (cg) 4626 if (cg)
4629 put_css_set_taskexit(cg); 4627 put_css_set_taskexit(cg);
4630 } 4628 }
4631 4629
4632 /** 4630 /**
4633 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp 4631 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
4634 * @cgrp: the cgroup in question 4632 * @cgrp: the cgroup in question
4635 * @task: the task in question 4633 * @task: the task in question
4636 * 4634 *
4637 * See if @cgrp is a descendant of @task's cgroup in the appropriate 4635 * See if @cgrp is a descendant of @task's cgroup in the appropriate
4638 * hierarchy. 4636 * hierarchy.
4639 * 4637 *
4640 * If we are sending in dummytop, then presumably we are creating 4638 * If we are sending in dummytop, then presumably we are creating
4641 * the top cgroup in the subsystem. 4639 * the top cgroup in the subsystem.
4642 * 4640 *
4643 * Called only by the ns (nsproxy) cgroup. 4641 * Called only by the ns (nsproxy) cgroup.
4644 */ 4642 */
4645 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) 4643 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
4646 { 4644 {
4647 int ret; 4645 int ret;
4648 struct cgroup *target; 4646 struct cgroup *target;
4649 4647
4650 if (cgrp == dummytop) 4648 if (cgrp == dummytop)
4651 return 1; 4649 return 1;
4652 4650
4653 target = task_cgroup_from_root(task, cgrp->root); 4651 target = task_cgroup_from_root(task, cgrp->root);
4654 while (cgrp != target && cgrp!= cgrp->top_cgroup) 4652 while (cgrp != target && cgrp!= cgrp->top_cgroup)
4655 cgrp = cgrp->parent; 4653 cgrp = cgrp->parent;
4656 ret = (cgrp == target); 4654 ret = (cgrp == target);
4657 return ret; 4655 return ret;
4658 } 4656 }
4659 4657
4660 static void check_for_release(struct cgroup *cgrp) 4658 static void check_for_release(struct cgroup *cgrp)
4661 { 4659 {
4662 /* All of these checks rely on RCU to keep the cgroup 4660 /* All of these checks rely on RCU to keep the cgroup
4663 * structure alive */ 4661 * structure alive */
4664 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 4662 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
4665 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 4663 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
4666 * Control Group is currently removable. If it's not 4664 * Control Group is currently removable. If it's not
4667 * already queued for a userspace notification, queue 4665 * already queued for a userspace notification, queue
4668 * it now */ 4666 * it now */
4669 int need_schedule_work = 0; 4667 int need_schedule_work = 0;
4670 spin_lock(&release_list_lock); 4668 spin_lock(&release_list_lock);
4671 if (!cgroup_is_removed(cgrp) && 4669 if (!cgroup_is_removed(cgrp) &&
4672 list_empty(&cgrp->release_list)) { 4670 list_empty(&cgrp->release_list)) {
4673 list_add(&cgrp->release_list, &release_list); 4671 list_add(&cgrp->release_list, &release_list);
4674 need_schedule_work = 1; 4672 need_schedule_work = 1;
4675 } 4673 }
4676 spin_unlock(&release_list_lock); 4674 spin_unlock(&release_list_lock);
4677 if (need_schedule_work) 4675 if (need_schedule_work)
4678 schedule_work(&release_agent_work); 4676 schedule_work(&release_agent_work);
4679 } 4677 }
4680 } 4678 }
4681 4679
4682 /* Caller must verify that the css is not for root cgroup */ 4680 /* Caller must verify that the css is not for root cgroup */
4683 void __css_put(struct cgroup_subsys_state *css, int count) 4681 void __css_put(struct cgroup_subsys_state *css, int count)
4684 { 4682 {
4685 struct cgroup *cgrp = css->cgroup; 4683 struct cgroup *cgrp = css->cgroup;
4686 int val; 4684 int val;
4687 rcu_read_lock(); 4685 rcu_read_lock();
4688 val = atomic_sub_return(count, &css->refcnt); 4686 val = atomic_sub_return(count, &css->refcnt);
4689 if (val == 1) { 4687 if (val == 1) {
4690 if (notify_on_release(cgrp)) { 4688 if (notify_on_release(cgrp)) {
4691 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4689 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4692 check_for_release(cgrp); 4690 check_for_release(cgrp);
4693 } 4691 }
4694 cgroup_wakeup_rmdir_waiter(cgrp); 4692 cgroup_wakeup_rmdir_waiter(cgrp);
4695 } 4693 }
4696 rcu_read_unlock(); 4694 rcu_read_unlock();
4697 WARN_ON_ONCE(val < 1); 4695 WARN_ON_ONCE(val < 1);
4698 } 4696 }
4699 EXPORT_SYMBOL_GPL(__css_put); 4697 EXPORT_SYMBOL_GPL(__css_put);
4700 4698
4701 /* 4699 /*
4702 * Notify userspace when a cgroup is released, by running the 4700 * Notify userspace when a cgroup is released, by running the
4703 * configured release agent with the name of the cgroup (path 4701 * configured release agent with the name of the cgroup (path
4704 * relative to the root of cgroup file system) as the argument. 4702 * relative to the root of cgroup file system) as the argument.
4705 * 4703 *
4706 * Most likely, this user command will try to rmdir this cgroup. 4704 * Most likely, this user command will try to rmdir this cgroup.
4707 * 4705 *
4708 * This races with the possibility that some other task will be 4706 * This races with the possibility that some other task will be
4709 * attached to this cgroup before it is removed, or that some other 4707 * attached to this cgroup before it is removed, or that some other
4710 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 4708 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
4711 * The presumed 'rmdir' will fail quietly if this cgroup is no longer 4709 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
4712 * unused, and this cgroup will be reprieved from its death sentence, 4710 * unused, and this cgroup will be reprieved from its death sentence,
4713 * to continue to serve a useful existence. Next time it's released, 4711 * to continue to serve a useful existence. Next time it's released,
4714 * we will get notified again, if it still has 'notify_on_release' set. 4712 * we will get notified again, if it still has 'notify_on_release' set.
4715 * 4713 *
4716 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 4714 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
4717 * means only wait until the task is successfully execve()'d. The 4715 * means only wait until the task is successfully execve()'d. The
4718 * separate release agent task is forked by call_usermodehelper(), 4716 * separate release agent task is forked by call_usermodehelper(),
4719 * then control in this thread returns here, without waiting for the 4717 * then control in this thread returns here, without waiting for the
4720 * release agent task. We don't bother to wait because the caller of 4718 * release agent task. We don't bother to wait because the caller of
4721 * this routine has no use for the exit status of the release agent 4719 * this routine has no use for the exit status of the release agent
4722 * task, so no sense holding our caller up for that. 4720 * task, so no sense holding our caller up for that.
4723 */ 4721 */
4724 static void cgroup_release_agent(struct work_struct *work) 4722 static void cgroup_release_agent(struct work_struct *work)
4725 { 4723 {
4726 BUG_ON(work != &release_agent_work); 4724 BUG_ON(work != &release_agent_work);
4727 mutex_lock(&cgroup_mutex); 4725 mutex_lock(&cgroup_mutex);
4728 spin_lock(&release_list_lock); 4726 spin_lock(&release_list_lock);
4729 while (!list_empty(&release_list)) { 4727 while (!list_empty(&release_list)) {
4730 char *argv[3], *envp[3]; 4728 char *argv[3], *envp[3];
4731 int i; 4729 int i;
4732 char *pathbuf = NULL, *agentbuf = NULL; 4730 char *pathbuf = NULL, *agentbuf = NULL;
4733 struct cgroup *cgrp = list_entry(release_list.next, 4731 struct cgroup *cgrp = list_entry(release_list.next,
4734 struct cgroup, 4732 struct cgroup,
4735 release_list); 4733 release_list);
4736 list_del_init(&cgrp->release_list); 4734 list_del_init(&cgrp->release_list);
4737 spin_unlock(&release_list_lock); 4735 spin_unlock(&release_list_lock);
4738 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4736 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4739 if (!pathbuf) 4737 if (!pathbuf)
4740 goto continue_free; 4738 goto continue_free;
4741 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 4739 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
4742 goto continue_free; 4740 goto continue_free;
4743 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4741 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
4744 if (!agentbuf) 4742 if (!agentbuf)
4745 goto continue_free; 4743 goto continue_free;
4746 4744
4747 i = 0; 4745 i = 0;
4748 argv[i++] = agentbuf; 4746 argv[i++] = agentbuf;
4749 argv[i++] = pathbuf; 4747 argv[i++] = pathbuf;
4750 argv[i] = NULL; 4748 argv[i] = NULL;
4751 4749
4752 i = 0; 4750 i = 0;
4753 /* minimal command environment */ 4751 /* minimal command environment */
4754 envp[i++] = "HOME=/"; 4752 envp[i++] = "HOME=/";
4755 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 4753 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
4756 envp[i] = NULL; 4754 envp[i] = NULL;
4757 4755
4758 /* Drop the lock while we invoke the usermode helper, 4756 /* Drop the lock while we invoke the usermode helper,
4759 * since the exec could involve hitting disk and hence 4757 * since the exec could involve hitting disk and hence
4760 * be a slow process */ 4758 * be a slow process */
4761 mutex_unlock(&cgroup_mutex); 4759 mutex_unlock(&cgroup_mutex);
4762 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 4760 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
4763 mutex_lock(&cgroup_mutex); 4761 mutex_lock(&cgroup_mutex);
4764 continue_free: 4762 continue_free:
4765 kfree(pathbuf); 4763 kfree(pathbuf);
4766 kfree(agentbuf); 4764 kfree(agentbuf);
4767 spin_lock(&release_list_lock); 4765 spin_lock(&release_list_lock);
4768 } 4766 }
4769 spin_unlock(&release_list_lock); 4767 spin_unlock(&release_list_lock);
4770 mutex_unlock(&cgroup_mutex); 4768 mutex_unlock(&cgroup_mutex);
4771 } 4769 }
4772 4770
4773 static int __init cgroup_disable(char *str) 4771 static int __init cgroup_disable(char *str)
4774 { 4772 {
4775 int i; 4773 int i;
4776 char *token; 4774 char *token;
4777 4775
4778 while ((token = strsep(&str, ",")) != NULL) { 4776 while ((token = strsep(&str, ",")) != NULL) {
4779 if (!*token) 4777 if (!*token)
4780 continue; 4778 continue;
4781 /* 4779 /*
4782 * cgroup_disable, being at boot time, can't know about module 4780 * cgroup_disable, being at boot time, can't know about module
4783 * subsystems, so we don't worry about them. 4781 * subsystems, so we don't worry about them.
4784 */ 4782 */
4785 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4783 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4786 struct cgroup_subsys *ss = subsys[i]; 4784 struct cgroup_subsys *ss = subsys[i];
4787 4785
4788 if (!strcmp(token, ss->name)) { 4786 if (!strcmp(token, ss->name)) {
4789 ss->disabled = 1; 4787 ss->disabled = 1;
4790 printk(KERN_INFO "Disabling %s control group" 4788 printk(KERN_INFO "Disabling %s control group"
4791 " subsystem\n", ss->name); 4789 " subsystem\n", ss->name);
4792 break; 4790 break;
4793 } 4791 }
4794 } 4792 }
4795 } 4793 }
4796 return 1; 4794 return 1;
4797 } 4795 }
4798 __setup("cgroup_disable=", cgroup_disable); 4796 __setup("cgroup_disable=", cgroup_disable);
4799 4797
4800 /* 4798 /*
4801 * Functions for CSS ID. 4799 * Functions for CSS ID.
4802 */ 4800 */
4803 4801
4804 /* 4802 /*
4805 * To get ID other than 0, this should be called when !cgroup_is_removed(). 4803 * To get ID other than 0, this should be called when !cgroup_is_removed().
4806 */ 4804 */
4807 unsigned short css_id(struct cgroup_subsys_state *css) 4805 unsigned short css_id(struct cgroup_subsys_state *css)
4808 { 4806 {
4809 struct css_id *cssid; 4807 struct css_id *cssid;
4810 4808
4811 /* 4809 /*
4812 * This css_id() can return a correct value when someone has a refcnt 4810 * This css_id() can return a correct value when someone has a refcnt
4813 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4811 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4814 * it's unchanged until freed. 4812 * it's unchanged until freed.
4815 */ 4813 */
4816 cssid = rcu_dereference_check(css->id, 4814 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4817 rcu_read_lock_held() || atomic_read(&css->refcnt));
4818 4815
4819 if (cssid) 4816 if (cssid)
4820 return cssid->id; 4817 return cssid->id;
4821 return 0; 4818 return 0;
4822 } 4819 }
4823 EXPORT_SYMBOL_GPL(css_id); 4820 EXPORT_SYMBOL_GPL(css_id);
4824 4821
4825 unsigned short css_depth(struct cgroup_subsys_state *css) 4822 unsigned short css_depth(struct cgroup_subsys_state *css)
4826 { 4823 {
4827 struct css_id *cssid; 4824 struct css_id *cssid;
4828 4825
4829 cssid = rcu_dereference_check(css->id, 4826 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4830 rcu_read_lock_held() || atomic_read(&css->refcnt));
4831 4827
4832 if (cssid) 4828 if (cssid)
4833 return cssid->depth; 4829 return cssid->depth;
4834 return 0; 4830 return 0;
4835 } 4831 }
4836 EXPORT_SYMBOL_GPL(css_depth); 4832 EXPORT_SYMBOL_GPL(css_depth);
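The css_id()/css_depth() hunks above are the pattern this cleanup applies throughout: rcu_dereference_check() already folds in the rcu_read_lock_held() test, so a caller only needs to state the extra condition it really guarantees. A minimal sketch of the resulting idiom, with an invented struct foo and accessor used purely for illustration (not code from this patch):

	#include <linux/rcupdate.h>
	#include <linux/atomic.h>	/* assumption: older trees may need <asm/atomic.h> */

	struct bar;

	struct foo {
		struct bar __rcu *b;	/* RCU-protected pointer */
		atomic_t refcnt;	/* a held reference also keeps *b stable */
	};

	/*
	 * Valid either inside rcu_read_lock()/rcu_read_unlock(), which
	 * rcu_dereference_check() now verifies on its own, or while the
	 * caller holds a reference -- the only condition still spelled out.
	 */
	static struct bar *foo_bar(struct foo *f)
	{
		return rcu_dereference_check(f->b, atomic_read(&f->refcnt));
	}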
4837 4833
4838 /** 4834 /**
4839 * css_is_ancestor - test "root" css is an ancestor of "child" 4835 * css_is_ancestor - test "root" css is an ancestor of "child"
4840 * @child: the css to be tested. 4836 * @child: the css to be tested.
4841 * @root: the css supposed to be an ancestor of the child. 4837 * @root: the css supposed to be an ancestor of the child.
4842 * 4838 *
4843 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 4839 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4844 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock(). 4840 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
4845 * But, considering usual usage, the csses should be valid objects after test. 4841 * But, considering usual usage, the csses should be valid objects after test.
4846 * Assuming that the caller will do some action to the child if this returns 4842 * Assuming that the caller will do some action to the child if this returns
4847 * true, the caller must take "child"'s reference count. 4843 * true, the caller must take "child"'s reference count.
4848 * If "child" is valid object and this returns true, "root" is valid, too. 4844 * If "child" is valid object and this returns true, "root" is valid, too.
4849 */ 4845 */
4850 4846
4851 bool css_is_ancestor(struct cgroup_subsys_state *child, 4847 bool css_is_ancestor(struct cgroup_subsys_state *child,
4852 const struct cgroup_subsys_state *root) 4848 const struct cgroup_subsys_state *root)
4853 { 4849 {
4854 struct css_id *child_id; 4850 struct css_id *child_id;
4855 struct css_id *root_id; 4851 struct css_id *root_id;
4856 bool ret = true; 4852 bool ret = true;
4857 4853
4858 rcu_read_lock(); 4854 rcu_read_lock();
4859 child_id = rcu_dereference(child->id); 4855 child_id = rcu_dereference(child->id);
4860 root_id = rcu_dereference(root->id); 4856 root_id = rcu_dereference(root->id);
4861 if (!child_id 4857 if (!child_id
4862 || !root_id 4858 || !root_id
4863 || (child_id->depth < root_id->depth) 4859 || (child_id->depth < root_id->depth)
4864 || (child_id->stack[root_id->depth] != root_id->id)) 4860 || (child_id->stack[root_id->depth] != root_id->id))
4865 ret = false; 4861 ret = false;
4866 rcu_read_unlock(); 4862 rcu_read_unlock();
4867 return ret; 4863 return ret;
4868 } 4864 }
4869 4865
4870 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4866 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4871 { 4867 {
4872 struct css_id *id = css->id; 4868 struct css_id *id = css->id;
4873 /* When this is called before css_id initialization, id can be NULL */ 4869 /* When this is called before css_id initialization, id can be NULL */
4874 if (!id) 4870 if (!id)
4875 return; 4871 return;
4876 4872
4877 BUG_ON(!ss->use_id); 4873 BUG_ON(!ss->use_id);
4878 4874
4879 rcu_assign_pointer(id->css, NULL); 4875 rcu_assign_pointer(id->css, NULL);
4880 rcu_assign_pointer(css->id, NULL); 4876 rcu_assign_pointer(css->id, NULL);
4881 spin_lock(&ss->id_lock); 4877 spin_lock(&ss->id_lock);
4882 idr_remove(&ss->idr, id->id); 4878 idr_remove(&ss->idr, id->id);
4883 spin_unlock(&ss->id_lock); 4879 spin_unlock(&ss->id_lock);
4884 kfree_rcu(id, rcu_head); 4880 kfree_rcu(id, rcu_head);
4885 } 4881 }
4886 EXPORT_SYMBOL_GPL(free_css_id); 4882 EXPORT_SYMBOL_GPL(free_css_id);
4887 4883
4888 /* 4884 /*
4889 * This is called by init or create(). Then, calls to this function are 4885 * This is called by init or create(). Then, calls to this function are
4890 * always serialized (By cgroup_mutex() at create()). 4886 * always serialized (By cgroup_mutex() at create()).
4891 */ 4887 */
4892 4888
4893 static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 4889 static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4894 { 4890 {
4895 struct css_id *newid; 4891 struct css_id *newid;
4896 int myid, error, size; 4892 int myid, error, size;
4897 4893
4898 BUG_ON(!ss->use_id); 4894 BUG_ON(!ss->use_id);
4899 4895
4900 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); 4896 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
4901 newid = kzalloc(size, GFP_KERNEL); 4897 newid = kzalloc(size, GFP_KERNEL);
4902 if (!newid) 4898 if (!newid)
4903 return ERR_PTR(-ENOMEM); 4899 return ERR_PTR(-ENOMEM);
4904 /* get id */ 4900 /* get id */
4905 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { 4901 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
4906 error = -ENOMEM; 4902 error = -ENOMEM;
4907 goto err_out; 4903 goto err_out;
4908 } 4904 }
4909 spin_lock(&ss->id_lock); 4905 spin_lock(&ss->id_lock);
4910 /* Don't use 0. allocates an ID of 1-65535 */ 4906 /* Don't use 0. allocates an ID of 1-65535 */
4911 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4907 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4912 spin_unlock(&ss->id_lock); 4908 spin_unlock(&ss->id_lock);
4913 4909
4914 /* Returns error when there are no free spaces for new ID.*/ 4910 /* Returns error when there are no free spaces for new ID.*/
4915 if (error) { 4911 if (error) {
4916 error = -ENOSPC; 4912 error = -ENOSPC;
4917 goto err_out; 4913 goto err_out;
4918 } 4914 }
4919 if (myid > CSS_ID_MAX) 4915 if (myid > CSS_ID_MAX)
4920 goto remove_idr; 4916 goto remove_idr;
4921 4917
4922 newid->id = myid; 4918 newid->id = myid;
4923 newid->depth = depth; 4919 newid->depth = depth;
4924 return newid; 4920 return newid;
4925 remove_idr: 4921 remove_idr:
4926 error = -ENOSPC; 4922 error = -ENOSPC;
4927 spin_lock(&ss->id_lock); 4923 spin_lock(&ss->id_lock);
4928 idr_remove(&ss->idr, myid); 4924 idr_remove(&ss->idr, myid);
4929 spin_unlock(&ss->id_lock); 4925 spin_unlock(&ss->id_lock);
4930 err_out: 4926 err_out:
4931 kfree(newid); 4927 kfree(newid);
4932 return ERR_PTR(error); 4928 return ERR_PTR(error);
4933 4929
4934 } 4930 }
4935 4931
4936 static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, 4932 static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4937 struct cgroup_subsys_state *rootcss) 4933 struct cgroup_subsys_state *rootcss)
4938 { 4934 {
4939 struct css_id *newid; 4935 struct css_id *newid;
4940 4936
4941 spin_lock_init(&ss->id_lock); 4937 spin_lock_init(&ss->id_lock);
4942 idr_init(&ss->idr); 4938 idr_init(&ss->idr);
4943 4939
4944 newid = get_new_cssid(ss, 0); 4940 newid = get_new_cssid(ss, 0);
4945 if (IS_ERR(newid)) 4941 if (IS_ERR(newid))
4946 return PTR_ERR(newid); 4942 return PTR_ERR(newid);
4947 4943
4948 newid->stack[0] = newid->id; 4944 newid->stack[0] = newid->id;
4949 newid->css = rootcss; 4945 newid->css = rootcss;
4950 rootcss->id = newid; 4946 rootcss->id = newid;
4951 return 0; 4947 return 0;
4952 } 4948 }
4953 4949
4954 static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 4950 static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
4955 struct cgroup *child) 4951 struct cgroup *child)
4956 { 4952 {
4957 int subsys_id, i, depth = 0; 4953 int subsys_id, i, depth = 0;
4958 struct cgroup_subsys_state *parent_css, *child_css; 4954 struct cgroup_subsys_state *parent_css, *child_css;
4959 struct css_id *child_id, *parent_id; 4955 struct css_id *child_id, *parent_id;
4960 4956
4961 subsys_id = ss->subsys_id; 4957 subsys_id = ss->subsys_id;
4962 parent_css = parent->subsys[subsys_id]; 4958 parent_css = parent->subsys[subsys_id];
4963 child_css = child->subsys[subsys_id]; 4959 child_css = child->subsys[subsys_id];
4964 parent_id = parent_css->id; 4960 parent_id = parent_css->id;
4965 depth = parent_id->depth + 1; 4961 depth = parent_id->depth + 1;
4966 4962
4967 child_id = get_new_cssid(ss, depth); 4963 child_id = get_new_cssid(ss, depth);
4968 if (IS_ERR(child_id)) 4964 if (IS_ERR(child_id))
4969 return PTR_ERR(child_id); 4965 return PTR_ERR(child_id);
4970 4966
4971 for (i = 0; i < depth; i++) 4967 for (i = 0; i < depth; i++)
4972 child_id->stack[i] = parent_id->stack[i]; 4968 child_id->stack[i] = parent_id->stack[i];
4973 child_id->stack[depth] = child_id->id; 4969 child_id->stack[depth] = child_id->id;
4974 /* 4970 /*
4975 * child_id->css pointer will be set after this cgroup is available 4971 * child_id->css pointer will be set after this cgroup is available
4976 * see cgroup_populate_dir() 4972 * see cgroup_populate_dir()
4977 */ 4973 */
4978 rcu_assign_pointer(child_css->id, child_id); 4974 rcu_assign_pointer(child_css->id, child_id);
4979 4975
4980 return 0; 4976 return 0;
4981 } 4977 }
4982 4978
4983 /** 4979 /**
4984 * css_lookup - lookup css by id 4980 * css_lookup - lookup css by id
4985 * @ss: cgroup subsys to be looked into. 4981 * @ss: cgroup subsys to be looked into.
4986 * @id: the id 4982 * @id: the id
4987 * 4983 *
4988 * Returns pointer to cgroup_subsys_state if there is valid one with id. 4984 * Returns pointer to cgroup_subsys_state if there is valid one with id.
4989 * NULL if not. Should be called under rcu_read_lock() 4985 * NULL if not. Should be called under rcu_read_lock()
4990 */ 4986 */
4991 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) 4987 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
4992 { 4988 {
4993 struct css_id *cssid = NULL; 4989 struct css_id *cssid = NULL;
4994 4990
4995 BUG_ON(!ss->use_id); 4991 BUG_ON(!ss->use_id);
4996 cssid = idr_find(&ss->idr, id); 4992 cssid = idr_find(&ss->idr, id);
4997 4993
4998 if (unlikely(!cssid)) 4994 if (unlikely(!cssid))
4999 return NULL; 4995 return NULL;
5000 4996
5001 return rcu_dereference(cssid->css); 4997 return rcu_dereference(cssid->css);
5002 } 4998 }
5003 EXPORT_SYMBOL_GPL(css_lookup); 4999 EXPORT_SYMBOL_GPL(css_lookup);
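As the kernel-doc above notes, css_lookup() must run under rcu_read_lock(): the rcu_dereference() inside it assumes a read-side critical section and, with CONFIG_PROVE_RCU, lockdep will complain otherwise. A hypothetical caller, shown only to illustrate the expected bracketing (my_subsys and id are stand-ins, not names from this patch):

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_lookup(&my_subsys, id);
	if (css) {
		/* Use css here, or css_get(css) before dropping the
		 * read-side critical section if it must outlive it. */
	}
	rcu_read_unlock();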
5004 5000
5005 /** 5001 /**
5006 * css_get_next - lookup next cgroup under specified hierarchy. 5002 * css_get_next - lookup next cgroup under specified hierarchy.
5007 * @ss: pointer to subsystem 5003 * @ss: pointer to subsystem
5008 * @id: current position of iteration. 5004 * @id: current position of iteration.
5009 * @root: pointer to css. search tree under this. 5005 * @root: pointer to css. search tree under this.
5010 * @foundid: position of found object. 5006 * @foundid: position of found object.
5011 * 5007 *
5012 * Search next css under the specified hierarchy of rootid. Calling under 5008 * Search next css under the specified hierarchy of rootid. Calling under
5013 * rcu_read_lock() is necessary. Returns NULL if it reaches the end. 5009 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
5014 */ 5010 */
5015 struct cgroup_subsys_state * 5011 struct cgroup_subsys_state *
5016 css_get_next(struct cgroup_subsys *ss, int id, 5012 css_get_next(struct cgroup_subsys *ss, int id,
5017 struct cgroup_subsys_state *root, int *foundid) 5013 struct cgroup_subsys_state *root, int *foundid)
5018 { 5014 {
5019 struct cgroup_subsys_state *ret = NULL; 5015 struct cgroup_subsys_state *ret = NULL;
5020 struct css_id *tmp; 5016 struct css_id *tmp;
5021 int tmpid; 5017 int tmpid;
5022 int rootid = css_id(root); 5018 int rootid = css_id(root);
5023 int depth = css_depth(root); 5019 int depth = css_depth(root);
5024 5020
5025 if (!rootid) 5021 if (!rootid)
5026 return NULL; 5022 return NULL;
5027 5023
5028 BUG_ON(!ss->use_id); 5024 BUG_ON(!ss->use_id);
5029 /* fill start point for scan */ 5025 /* fill start point for scan */
5030 tmpid = id; 5026 tmpid = id;
5031 while (1) { 5027 while (1) {
5032 /* 5028 /*
5033 * scan next entry from bitmap(tree), tmpid is updated after 5029 * scan next entry from bitmap(tree), tmpid is updated after
5034 * idr_get_next(). 5030 * idr_get_next().
5035 */ 5031 */
5036 spin_lock(&ss->id_lock); 5032 spin_lock(&ss->id_lock);
5037 tmp = idr_get_next(&ss->idr, &tmpid); 5033 tmp = idr_get_next(&ss->idr, &tmpid);
5038 spin_unlock(&ss->id_lock); 5034 spin_unlock(&ss->id_lock);
5039 5035
5040 if (!tmp) 5036 if (!tmp)
5041 break; 5037 break;
5042 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5038 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
5043 ret = rcu_dereference(tmp->css); 5039 ret = rcu_dereference(tmp->css);
5044 if (ret) { 5040 if (ret) {
5045 *foundid = tmpid; 5041 *foundid = tmpid;
5046 break; 5042 break;
5047 } 5043 }
5048 } 5044 }
5049 /* continue to scan from next id */ 5045 /* continue to scan from next id */
5050 tmpid = tmpid + 1; 5046 tmpid = tmpid + 1;
5051 } 5047 }
5052 return ret; 5048 return ret;
5053 } 5049 }
5054 5050
5055 /* 5051 /*
5056 * get corresponding css from file open on cgroupfs directory 5052 * get corresponding css from file open on cgroupfs directory
5057 */ 5053 */
5058 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5054 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5059 { 5055 {
5060 struct cgroup *cgrp; 5056 struct cgroup *cgrp;
5061 struct inode *inode; 5057 struct inode *inode;
5062 struct cgroup_subsys_state *css; 5058 struct cgroup_subsys_state *css;
5063 5059
5064 inode = f->f_dentry->d_inode; 5060 inode = f->f_dentry->d_inode;
5065 /* check in cgroup filesystem dir */ 5061 /* check in cgroup filesystem dir */
5066 if (inode->i_op != &cgroup_dir_inode_operations) 5062 if (inode->i_op != &cgroup_dir_inode_operations)
5067 return ERR_PTR(-EBADF); 5063 return ERR_PTR(-EBADF);
5068 5064
5069 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5065 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
5070 return ERR_PTR(-EINVAL); 5066 return ERR_PTR(-EINVAL);
5071 5067
5072 /* get cgroup */ 5068 /* get cgroup */
5073 cgrp = __d_cgrp(f->f_dentry); 5069 cgrp = __d_cgrp(f->f_dentry);
5074 css = cgrp->subsys[id]; 5070 css = cgrp->subsys[id];
5075 return css ? css : ERR_PTR(-ENOENT); 5071 return css ? css : ERR_PTR(-ENOENT);
5076 } 5072 }
5077 5073
5078 #ifdef CONFIG_CGROUP_DEBUG 5074 #ifdef CONFIG_CGROUP_DEBUG
5079 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5075 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5080 struct cgroup *cont) 5076 struct cgroup *cont)
5081 { 5077 {
5082 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5078 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5083 5079
5084 if (!css) 5080 if (!css)
5085 return ERR_PTR(-ENOMEM); 5081 return ERR_PTR(-ENOMEM);
5086 5082
5087 return css; 5083 return css;
5088 } 5084 }
5089 5085
5090 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 5086 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
5091 { 5087 {
5092 kfree(cont->subsys[debug_subsys_id]); 5088 kfree(cont->subsys[debug_subsys_id]);
5093 } 5089 }
5094 5090
5095 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) 5091 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5096 { 5092 {
5097 return atomic_read(&cont->count); 5093 return atomic_read(&cont->count);
5098 } 5094 }
5099 5095
5100 static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5096 static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
5101 { 5097 {
5102 return cgroup_task_count(cont); 5098 return cgroup_task_count(cont);
5103 } 5099 }
5104 5100
5105 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5101 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
5106 { 5102 {
5107 return (u64)(unsigned long)current->cgroups; 5103 return (u64)(unsigned long)current->cgroups;
5108 } 5104 }
5109 5105
5110 static u64 current_css_set_refcount_read(struct cgroup *cont, 5106 static u64 current_css_set_refcount_read(struct cgroup *cont,
5111 struct cftype *cft) 5107 struct cftype *cft)
5112 { 5108 {
5113 u64 count; 5109 u64 count;
5114 5110
5115 rcu_read_lock(); 5111 rcu_read_lock();
5116 count = atomic_read(&current->cgroups->refcount); 5112 count = atomic_read(&current->cgroups->refcount);
5117 rcu_read_unlock(); 5113 rcu_read_unlock();
5118 return count; 5114 return count;
5119 } 5115 }
5120 5116
5121 static int current_css_set_cg_links_read(struct cgroup *cont, 5117 static int current_css_set_cg_links_read(struct cgroup *cont,
5122 struct cftype *cft, 5118 struct cftype *cft,
5123 struct seq_file *seq) 5119 struct seq_file *seq)
5124 { 5120 {
5125 struct cg_cgroup_link *link; 5121 struct cg_cgroup_link *link;
5126 struct css_set *cg; 5122 struct css_set *cg;
5127 5123
5128 read_lock(&css_set_lock); 5124 read_lock(&css_set_lock);
5129 rcu_read_lock(); 5125 rcu_read_lock();
5130 cg = rcu_dereference(current->cgroups); 5126 cg = rcu_dereference(current->cgroups);
5131 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5127 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
5132 struct cgroup *c = link->cgrp; 5128 struct cgroup *c = link->cgrp;
5133 const char *name; 5129 const char *name;
5134 5130
5135 if (c->dentry) 5131 if (c->dentry)
5136 name = c->dentry->d_name.name; 5132 name = c->dentry->d_name.name;
5137 else 5133 else
5138 name = "?"; 5134 name = "?";
5139 seq_printf(seq, "Root %d group %s\n", 5135 seq_printf(seq, "Root %d group %s\n",
5140 c->root->hierarchy_id, name); 5136 c->root->hierarchy_id, name);
5141 } 5137 }
5142 rcu_read_unlock(); 5138 rcu_read_unlock();
5143 read_unlock(&css_set_lock); 5139 read_unlock(&css_set_lock);
5144 return 0; 5140 return 0;
5145 } 5141 }
5146 5142
5147 #define MAX_TASKS_SHOWN_PER_CSS 25 5143 #define MAX_TASKS_SHOWN_PER_CSS 25
5148 static int cgroup_css_links_read(struct cgroup *cont, 5144 static int cgroup_css_links_read(struct cgroup *cont,
5149 struct cftype *cft, 5145 struct cftype *cft,
5150 struct seq_file *seq) 5146 struct seq_file *seq)
5151 { 5147 {
5152 struct cg_cgroup_link *link; 5148 struct cg_cgroup_link *link;
5153 5149
5154 read_lock(&css_set_lock); 5150 read_lock(&css_set_lock);
5155 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5151 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
5156 struct css_set *cg = link->cg; 5152 struct css_set *cg = link->cg;
5157 struct task_struct *task; 5153 struct task_struct *task;
5158 int count = 0; 5154 int count = 0;
5159 seq_printf(seq, "css_set %p\n", cg); 5155 seq_printf(seq, "css_set %p\n", cg);
5160 list_for_each_entry(task, &cg->tasks, cg_list) { 5156 list_for_each_entry(task, &cg->tasks, cg_list) {
5161 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5157 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5162 seq_puts(seq, " ...\n"); 5158 seq_puts(seq, " ...\n");
5163 break; 5159 break;
5164 } else { 5160 } else {
5165 seq_printf(seq, " task %d\n", 5161 seq_printf(seq, " task %d\n",
5166 task_pid_vnr(task)); 5162 task_pid_vnr(task));
5167 } 5163 }
5168 } 5164 }
5169 } 5165 }
5170 read_unlock(&css_set_lock); 5166 read_unlock(&css_set_lock);
5171 return 0; 5167 return 0;
5172 } 5168 }
5173 5169
5174 static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5170 static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5175 { 5171 {
5176 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5172 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
5177 } 5173 }
5178 5174
5179 static struct cftype debug_files[] = { 5175 static struct cftype debug_files[] = {
5180 { 5176 {
5181 .name = "cgroup_refcount", 5177 .name = "cgroup_refcount",
5182 .read_u64 = cgroup_refcount_read, 5178 .read_u64 = cgroup_refcount_read,
5183 }, 5179 },
5184 { 5180 {
5185 .name = "taskcount", 5181 .name = "taskcount",
5186 .read_u64 = debug_taskcount_read, 5182 .read_u64 = debug_taskcount_read,
5187 }, 5183 },
5188 5184
5189 { 5185 {
5190 .name = "current_css_set", 5186 .name = "current_css_set",
5191 .read_u64 = current_css_set_read, 5187 .read_u64 = current_css_set_read,
5192 }, 5188 },
5193 5189
5194 { 5190 {
5195 .name = "current_css_set_refcount", 5191 .name = "current_css_set_refcount",
5196 .read_u64 = current_css_set_refcount_read, 5192 .read_u64 = current_css_set_refcount_read,
5197 }, 5193 },
5198 5194
5199 { 5195 {
5200 .name = "current_css_set_cg_links", 5196 .name = "current_css_set_cg_links",
5201 .read_seq_string = current_css_set_cg_links_read, 5197 .read_seq_string = current_css_set_cg_links_read,
5202 }, 5198 },
5203 5199
5204 { 5200 {
5205 .name = "cgroup_css_links", 5201 .name = "cgroup_css_links",
5206 .read_seq_string = cgroup_css_links_read, 5202 .read_seq_string = cgroup_css_links_read,
5207 }, 5203 },
5208 5204
5209 { 5205 {
5210 .name = "releasable", 5206 .name = "releasable",
5211 .read_u64 = releasable_read, 5207 .read_u64 = releasable_read,
5212 }, 5208 },
5213 }; 5209 };
5214 5210
5215 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5211 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
5216 { 5212 {
5217 return cgroup_add_files(cont, ss, debug_files, 5213 return cgroup_add_files(cont, ss, debug_files,
5218 ARRAY_SIZE(debug_files)); 5214 ARRAY_SIZE(debug_files));
5219 } 5215 }
5220 5216
5221 struct cgroup_subsys debug_subsys = { 5217 struct cgroup_subsys debug_subsys = {
5222 .name = "debug", 5218 .name = "debug",
5223 .create = debug_create, 5219 .create = debug_create,
5224 .destroy = debug_destroy, 5220 .destroy = debug_destroy,
5225 .populate = debug_populate, 5221 .populate = debug_populate,
5226 .subsys_id = debug_subsys_id, 5222 .subsys_id = debug_subsys_id,
5227 }; 5223 };
5228 #endif /* CONFIG_CGROUP_DEBUG */ 5224 #endif /* CONFIG_CGROUP_DEBUG */
5229 5225
kernel/exit.c
1 /* 1 /*
2 * linux/kernel/exit.c 2 * linux/kernel/exit.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/mm.h> 7 #include <linux/mm.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/interrupt.h> 9 #include <linux/interrupt.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/capability.h> 11 #include <linux/capability.h>
12 #include <linux/completion.h> 12 #include <linux/completion.h>
13 #include <linux/personality.h> 13 #include <linux/personality.h>
14 #include <linux/tty.h> 14 #include <linux/tty.h>
15 #include <linux/iocontext.h> 15 #include <linux/iocontext.h>
16 #include <linux/key.h> 16 #include <linux/key.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/cpu.h> 18 #include <linux/cpu.h>
19 #include <linux/acct.h> 19 #include <linux/acct.h>
20 #include <linux/tsacct_kern.h> 20 #include <linux/tsacct_kern.h>
21 #include <linux/file.h> 21 #include <linux/file.h>
22 #include <linux/fdtable.h> 22 #include <linux/fdtable.h>
23 #include <linux/binfmts.h> 23 #include <linux/binfmts.h>
24 #include <linux/nsproxy.h> 24 #include <linux/nsproxy.h>
25 #include <linux/pid_namespace.h> 25 #include <linux/pid_namespace.h>
26 #include <linux/ptrace.h> 26 #include <linux/ptrace.h>
27 #include <linux/profile.h> 27 #include <linux/profile.h>
28 #include <linux/mount.h> 28 #include <linux/mount.h>
29 #include <linux/proc_fs.h> 29 #include <linux/proc_fs.h>
30 #include <linux/kthread.h> 30 #include <linux/kthread.h>
31 #include <linux/mempolicy.h> 31 #include <linux/mempolicy.h>
32 #include <linux/taskstats_kern.h> 32 #include <linux/taskstats_kern.h>
33 #include <linux/delayacct.h> 33 #include <linux/delayacct.h>
34 #include <linux/freezer.h> 34 #include <linux/freezer.h>
35 #include <linux/cgroup.h> 35 #include <linux/cgroup.h>
36 #include <linux/syscalls.h> 36 #include <linux/syscalls.h>
37 #include <linux/signal.h> 37 #include <linux/signal.h>
38 #include <linux/posix-timers.h> 38 #include <linux/posix-timers.h>
39 #include <linux/cn_proc.h> 39 #include <linux/cn_proc.h>
40 #include <linux/mutex.h> 40 #include <linux/mutex.h>
41 #include <linux/futex.h> 41 #include <linux/futex.h>
42 #include <linux/pipe_fs_i.h> 42 #include <linux/pipe_fs_i.h>
43 #include <linux/audit.h> /* for audit_free() */ 43 #include <linux/audit.h> /* for audit_free() */
44 #include <linux/resource.h> 44 #include <linux/resource.h>
45 #include <linux/blkdev.h> 45 #include <linux/blkdev.h>
46 #include <linux/task_io_accounting_ops.h> 46 #include <linux/task_io_accounting_ops.h>
47 #include <linux/tracehook.h> 47 #include <linux/tracehook.h>
48 #include <linux/fs_struct.h> 48 #include <linux/fs_struct.h>
49 #include <linux/init_task.h> 49 #include <linux/init_task.h>
50 #include <linux/perf_event.h> 50 #include <linux/perf_event.h>
51 #include <trace/events/sched.h> 51 #include <trace/events/sched.h>
52 #include <linux/hw_breakpoint.h> 52 #include <linux/hw_breakpoint.h>
53 #include <linux/oom.h> 53 #include <linux/oom.h>
54 54
55 #include <asm/uaccess.h> 55 #include <asm/uaccess.h>
56 #include <asm/unistd.h> 56 #include <asm/unistd.h>
57 #include <asm/pgtable.h> 57 #include <asm/pgtable.h>
58 #include <asm/mmu_context.h> 58 #include <asm/mmu_context.h>
59 59
60 static void exit_mm(struct task_struct * tsk); 60 static void exit_mm(struct task_struct * tsk);
61 61
62 static void __unhash_process(struct task_struct *p, bool group_dead) 62 static void __unhash_process(struct task_struct *p, bool group_dead)
63 { 63 {
64 nr_threads--; 64 nr_threads--;
65 detach_pid(p, PIDTYPE_PID); 65 detach_pid(p, PIDTYPE_PID);
66 if (group_dead) { 66 if (group_dead) {
67 detach_pid(p, PIDTYPE_PGID); 67 detach_pid(p, PIDTYPE_PGID);
68 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling); 71 list_del_init(&p->sibling);
72 __this_cpu_dec(process_counts); 72 __this_cpu_dec(process_counts);
73 } 73 }
74 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
75 } 75 }
76 76
77 /* 77 /*
78 * This function expects the tasklist_lock write-locked. 78 * This function expects the tasklist_lock write-locked.
79 */ 79 */
80 static void __exit_signal(struct task_struct *tsk) 80 static void __exit_signal(struct task_struct *tsk)
81 { 81 {
82 struct signal_struct *sig = tsk->signal; 82 struct signal_struct *sig = tsk->signal;
83 bool group_dead = thread_group_leader(tsk); 83 bool group_dead = thread_group_leader(tsk);
84 struct sighand_struct *sighand; 84 struct sighand_struct *sighand;
85 struct tty_struct *uninitialized_var(tty); 85 struct tty_struct *uninitialized_var(tty);
86 86
87 sighand = rcu_dereference_check(tsk->sighand, 87 sighand = rcu_dereference_check(tsk->sighand,
88 rcu_read_lock_held() ||
89 lockdep_tasklist_lock_is_held()); 88 lockdep_tasklist_lock_is_held());
90 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
91 90
92 posix_cpu_timers_exit(tsk); 91 posix_cpu_timers_exit(tsk);
93 if (group_dead) { 92 if (group_dead) {
94 posix_cpu_timers_exit_group(tsk); 93 posix_cpu_timers_exit_group(tsk);
95 tty = sig->tty; 94 tty = sig->tty;
96 sig->tty = NULL; 95 sig->tty = NULL;
97 } else { 96 } else {
98 /* 97 /*
99 * This can only happen if the caller is de_thread(). 98 * This can only happen if the caller is de_thread().
100 * FIXME: this is the temporary hack, we should teach 99 * FIXME: this is the temporary hack, we should teach
101 * posix-cpu-timers to handle this case correctly. 100 * posix-cpu-timers to handle this case correctly.
102 */ 101 */
103 if (unlikely(has_group_leader_pid(tsk))) 102 if (unlikely(has_group_leader_pid(tsk)))
104 posix_cpu_timers_exit_group(tsk); 103 posix_cpu_timers_exit_group(tsk);
105 104
106 /* 105 /*
107 * If there is any task waiting for the group exit 106 * If there is any task waiting for the group exit
108 * then notify it: 107 * then notify it:
109 */ 108 */
110 if (sig->notify_count > 0 && !--sig->notify_count) 109 if (sig->notify_count > 0 && !--sig->notify_count)
111 wake_up_process(sig->group_exit_task); 110 wake_up_process(sig->group_exit_task);
112 111
113 if (tsk == sig->curr_target) 112 if (tsk == sig->curr_target)
114 sig->curr_target = next_thread(tsk); 113 sig->curr_target = next_thread(tsk);
115 /* 114 /*
116 * Accumulate here the counters for all threads but the 115 * Accumulate here the counters for all threads but the
117 * group leader as they die, so they can be added into 116 * group leader as they die, so they can be added into
118 * the process-wide totals when those are taken. 117 * the process-wide totals when those are taken.
119 * The group leader stays around as a zombie as long 118 * The group leader stays around as a zombie as long
120 * as there are other threads. When it gets reaped, 119 * as there are other threads. When it gets reaped,
121 * the exit.c code will add its counts into these totals. 120 * the exit.c code will add its counts into these totals.
122 * We won't ever get here for the group leader, since it 121 * We won't ever get here for the group leader, since it
123 * will have been the last reference on the signal_struct. 122 * will have been the last reference on the signal_struct.
124 */ 123 */
125 sig->utime = cputime_add(sig->utime, tsk->utime); 124 sig->utime = cputime_add(sig->utime, tsk->utime);
126 sig->stime = cputime_add(sig->stime, tsk->stime); 125 sig->stime = cputime_add(sig->stime, tsk->stime);
127 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 126 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
128 sig->min_flt += tsk->min_flt; 127 sig->min_flt += tsk->min_flt;
129 sig->maj_flt += tsk->maj_flt; 128 sig->maj_flt += tsk->maj_flt;
130 sig->nvcsw += tsk->nvcsw; 129 sig->nvcsw += tsk->nvcsw;
131 sig->nivcsw += tsk->nivcsw; 130 sig->nivcsw += tsk->nivcsw;
132 sig->inblock += task_io_get_inblock(tsk); 131 sig->inblock += task_io_get_inblock(tsk);
133 sig->oublock += task_io_get_oublock(tsk); 132 sig->oublock += task_io_get_oublock(tsk);
134 task_io_accounting_add(&sig->ioac, &tsk->ioac); 133 task_io_accounting_add(&sig->ioac, &tsk->ioac);
135 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 134 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
136 } 135 }
137 136
138 sig->nr_threads--; 137 sig->nr_threads--;
139 __unhash_process(tsk, group_dead); 138 __unhash_process(tsk, group_dead);
140 139
141 /* 140 /*
142 * Do this under ->siglock, we can race with another thread 141 * Do this under ->siglock, we can race with another thread
143 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 142 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
144 */ 143 */
145 flush_sigqueue(&tsk->pending); 144 flush_sigqueue(&tsk->pending);
146 tsk->sighand = NULL; 145 tsk->sighand = NULL;
147 spin_unlock(&sighand->siglock); 146 spin_unlock(&sighand->siglock);
148 147
149 __cleanup_sighand(sighand); 148 __cleanup_sighand(sighand);
150 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 149 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
151 if (group_dead) { 150 if (group_dead) {
152 flush_sigqueue(&sig->shared_pending); 151 flush_sigqueue(&sig->shared_pending);
153 tty_kref_put(tty); 152 tty_kref_put(tty);
154 } 153 }
155 } 154 }
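The __exit_signal() hunk above makes the same point as the cgroup.c changes: the function is entered with tasklist_lock write-locked, so lockdep_tasklist_lock_is_held() is the only condition worth passing, and the RCU read-side case is covered by rcu_dereference_check() itself. A sketch of a helper expressing that same protection contract (sighand_of() is a hypothetical name, not a kernel function):

	#include <linux/sched.h>
	#include <linux/rcupdate.h>

	/*
	 * Sketch only: ->sighand may be dereferenced under rcu_read_lock()
	 * (checked implicitly by rcu_dereference_check()) or with
	 * tasklist_lock held, which is exactly the condition kept above.
	 */
	static struct sighand_struct *sighand_of(struct task_struct *tsk)
	{
		return rcu_dereference_check(tsk->sighand,
					     lockdep_tasklist_lock_is_held());
	}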
156 155
157 static void delayed_put_task_struct(struct rcu_head *rhp) 156 static void delayed_put_task_struct(struct rcu_head *rhp)
158 { 157 {
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 158 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 159
161 perf_event_delayed_put(tsk); 160 perf_event_delayed_put(tsk);
162 trace_sched_process_free(tsk); 161 trace_sched_process_free(tsk);
163 put_task_struct(tsk); 162 put_task_struct(tsk);
164 } 163 }
165 164
166 165
167 void release_task(struct task_struct * p) 166 void release_task(struct task_struct * p)
168 { 167 {
169 struct task_struct *leader; 168 struct task_struct *leader;
170 int zap_leader; 169 int zap_leader;
171 repeat: 170 repeat:
172 tracehook_prepare_release_task(p); 171 tracehook_prepare_release_task(p);
173 /* don't need to get the RCU readlock here - the process is dead and 172 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials. But shut RCU-lockdep up */ 173 * can't be modifying its own credentials. But shut RCU-lockdep up */
175 rcu_read_lock(); 174 rcu_read_lock();
176 atomic_dec(&__task_cred(p)->user->processes); 175 atomic_dec(&__task_cred(p)->user->processes);
177 rcu_read_unlock(); 176 rcu_read_unlock();
178 177
179 proc_flush_task(p); 178 proc_flush_task(p);
180 179
181 write_lock_irq(&tasklist_lock); 180 write_lock_irq(&tasklist_lock);
182 tracehook_finish_release_task(p); 181 tracehook_finish_release_task(p);
183 __exit_signal(p); 182 __exit_signal(p);
184 183
185 /* 184 /*
186 * If we are the last non-leader member of the thread 185 * If we are the last non-leader member of the thread
187 * group, and the leader is zombie, then notify the 186 * group, and the leader is zombie, then notify the
188 * group leader's parent process. (if it wants notification.) 187 * group leader's parent process. (if it wants notification.)
189 */ 188 */
190 zap_leader = 0; 189 zap_leader = 0;
191 leader = p->group_leader; 190 leader = p->group_leader;
192 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 191 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
193 BUG_ON(task_detached(leader)); 192 BUG_ON(task_detached(leader));
194 do_notify_parent(leader, leader->exit_signal); 193 do_notify_parent(leader, leader->exit_signal);
195 /* 194 /*
196 * If we were the last child thread and the leader has 195 * If we were the last child thread and the leader has
197 * exited already, and the leader's parent ignores SIGCHLD, 196 * exited already, and the leader's parent ignores SIGCHLD,
198 * then we are the one who should release the leader. 197 * then we are the one who should release the leader.
199 * 198 *
200 * do_notify_parent() will have marked it self-reaping in 199 * do_notify_parent() will have marked it self-reaping in
201 * that case. 200 * that case.
202 */ 201 */
203 zap_leader = task_detached(leader); 202 zap_leader = task_detached(leader);
204 203
205 /* 204 /*
206 * This maintains the invariant that release_task() 205 * This maintains the invariant that release_task()
207 * only runs on a task in EXIT_DEAD, just for sanity. 206 * only runs on a task in EXIT_DEAD, just for sanity.
208 */ 207 */
209 if (zap_leader) 208 if (zap_leader)
210 leader->exit_state = EXIT_DEAD; 209 leader->exit_state = EXIT_DEAD;
211 } 210 }
212 211
213 write_unlock_irq(&tasklist_lock); 212 write_unlock_irq(&tasklist_lock);
214 release_thread(p); 213 release_thread(p);
215 call_rcu(&p->rcu, delayed_put_task_struct); 214 call_rcu(&p->rcu, delayed_put_task_struct);
216 215
217 p = leader; 216 p = leader;
218 if (unlikely(zap_leader)) 217 if (unlikely(zap_leader))
219 goto repeat; 218 goto repeat;
220 } 219 }
221 220
222 /* 221 /*
223 * This checks not only the pgrp, but falls back on the pid if no 222 * This checks not only the pgrp, but falls back on the pid if no
224 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 223 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
225 * without this... 224 * without this...
226 * 225 *
227 * The caller must hold rcu lock or the tasklist lock. 226 * The caller must hold rcu lock or the tasklist lock.
228 */ 227 */
229 struct pid *session_of_pgrp(struct pid *pgrp) 228 struct pid *session_of_pgrp(struct pid *pgrp)
230 { 229 {
231 struct task_struct *p; 230 struct task_struct *p;
232 struct pid *sid = NULL; 231 struct pid *sid = NULL;
233 232
234 p = pid_task(pgrp, PIDTYPE_PGID); 233 p = pid_task(pgrp, PIDTYPE_PGID);
235 if (p == NULL) 234 if (p == NULL)
236 p = pid_task(pgrp, PIDTYPE_PID); 235 p = pid_task(pgrp, PIDTYPE_PID);
237 if (p != NULL) 236 if (p != NULL)
238 sid = task_session(p); 237 sid = task_session(p);
239 238
240 return sid; 239 return sid;
241 } 240 }
242 241
243 /* 242 /*
244 * Determine if a process group is "orphaned", according to the POSIX 243 * Determine if a process group is "orphaned", according to the POSIX
245 * definition in 2.2.2.52. Orphaned process groups are not to be affected 244 * definition in 2.2.2.52. Orphaned process groups are not to be affected
246 * by terminal-generated stop signals. Newly orphaned process groups are 245 * by terminal-generated stop signals. Newly orphaned process groups are
247 * to receive a SIGHUP and a SIGCONT. 246 * to receive a SIGHUP and a SIGCONT.
248 * 247 *
249 * "I ask you, have you ever known what it is to be an orphan?" 248 * "I ask you, have you ever known what it is to be an orphan?"
250 */ 249 */
251 static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 250 static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
252 { 251 {
253 struct task_struct *p; 252 struct task_struct *p;
254 253
255 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 254 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
256 if ((p == ignored_task) || 255 if ((p == ignored_task) ||
257 (p->exit_state && thread_group_empty(p)) || 256 (p->exit_state && thread_group_empty(p)) ||
258 is_global_init(p->real_parent)) 257 is_global_init(p->real_parent))
259 continue; 258 continue;
260 259
261 if (task_pgrp(p->real_parent) != pgrp && 260 if (task_pgrp(p->real_parent) != pgrp &&
262 task_session(p->real_parent) == task_session(p)) 261 task_session(p->real_parent) == task_session(p))
263 return 0; 262 return 0;
264 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 263 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
265 264
266 return 1; 265 return 1;
267 } 266 }
268 267
269 int is_current_pgrp_orphaned(void) 268 int is_current_pgrp_orphaned(void)
270 { 269 {
271 int retval; 270 int retval;
272 271
273 read_lock(&tasklist_lock); 272 read_lock(&tasklist_lock);
274 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); 273 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
275 read_unlock(&tasklist_lock); 274 read_unlock(&tasklist_lock);
276 275
277 return retval; 276 return retval;
278 } 277 }
279 278
280 static int has_stopped_jobs(struct pid *pgrp) 279 static int has_stopped_jobs(struct pid *pgrp)
281 { 280 {
282 int retval = 0; 281 int retval = 0;
283 struct task_struct *p; 282 struct task_struct *p;
284 283
285 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 284 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
286 if (!task_is_stopped(p)) 285 if (!task_is_stopped(p))
287 continue; 286 continue;
288 retval = 1; 287 retval = 1;
289 break; 288 break;
290 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 289 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
291 return retval; 290 return retval;
292 } 291 }
293 292
294 /* 293 /*
295 * Check to see if any process groups have become orphaned as 294 * Check to see if any process groups have become orphaned as
296 * a result of our exiting, and if they have any stopped jobs, 295 * a result of our exiting, and if they have any stopped jobs,
297 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 296 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
298 */ 297 */
299 static void 298 static void
300 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) 299 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
301 { 300 {
302 struct pid *pgrp = task_pgrp(tsk); 301 struct pid *pgrp = task_pgrp(tsk);
303 struct task_struct *ignored_task = tsk; 302 struct task_struct *ignored_task = tsk;
304 303
305 if (!parent) 304 if (!parent)
306 /* exit: our father is in a different pgrp than 305 /* exit: our father is in a different pgrp than
307 * we are and we were the only connection outside. 306 * we are and we were the only connection outside.
308 */ 307 */
309 parent = tsk->real_parent; 308 parent = tsk->real_parent;
310 else 309 else
311 /* reparent: our child is in a different pgrp than 310 /* reparent: our child is in a different pgrp than
312 * we are, and it was the only connection outside. 311 * we are, and it was the only connection outside.
313 */ 312 */
314 ignored_task = NULL; 313 ignored_task = NULL;
315 314
316 if (task_pgrp(parent) != pgrp && 315 if (task_pgrp(parent) != pgrp &&
317 task_session(parent) == task_session(tsk) && 316 task_session(parent) == task_session(tsk) &&
318 will_become_orphaned_pgrp(pgrp, ignored_task) && 317 will_become_orphaned_pgrp(pgrp, ignored_task) &&
319 has_stopped_jobs(pgrp)) { 318 has_stopped_jobs(pgrp)) {
320 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); 319 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
321 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); 320 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
322 } 321 }
323 } 322 }
324 323
325 /** 324 /**
326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd 325 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327 * 326 *
328 * If a kernel thread is launched as a result of a system call, or if 327 * If a kernel thread is launched as a result of a system call, or if
329 * it ever exits, it should generally reparent itself to kthreadd so it 328 * it ever exits, it should generally reparent itself to kthreadd so it
330 * isn't in the way of other processes and is correctly cleaned up on exit. 329 * isn't in the way of other processes and is correctly cleaned up on exit.
331 * 330 *
332 * The various task states such as scheduling policy and priority may have 331 * The various task states such as scheduling policy and priority may have
333 * been inherited from a user process, so we reset them to sane values here. 332 * been inherited from a user process, so we reset them to sane values here.
334 * 333 *
335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities. 334 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336 */ 335 */
337 static void reparent_to_kthreadd(void) 336 static void reparent_to_kthreadd(void)
338 { 337 {
339 write_lock_irq(&tasklist_lock); 338 write_lock_irq(&tasklist_lock);
340 339
341 ptrace_unlink(current); 340 ptrace_unlink(current);
342 /* Reparent to init */ 341 /* Reparent to init */
343 current->real_parent = current->parent = kthreadd_task; 342 current->real_parent = current->parent = kthreadd_task;
344 list_move_tail(&current->sibling, &current->real_parent->children); 343 list_move_tail(&current->sibling, &current->real_parent->children);
345 344
346 /* Set the exit signal to SIGCHLD so we signal init on exit */ 345 /* Set the exit signal to SIGCHLD so we signal init on exit */
347 current->exit_signal = SIGCHLD; 346 current->exit_signal = SIGCHLD;
348 347
349 if (task_nice(current) < 0) 348 if (task_nice(current) < 0)
350 set_user_nice(current, 0); 349 set_user_nice(current, 0);
351 /* cpus_allowed? */ 350 /* cpus_allowed? */
352 /* rt_priority? */ 351 /* rt_priority? */
353 /* signals? */ 352 /* signals? */
354 memcpy(current->signal->rlim, init_task.signal->rlim, 353 memcpy(current->signal->rlim, init_task.signal->rlim,
355 sizeof(current->signal->rlim)); 354 sizeof(current->signal->rlim));
356 355
357 atomic_inc(&init_cred.usage); 356 atomic_inc(&init_cred.usage);
358 commit_creds(&init_cred); 357 commit_creds(&init_cred);
359 write_unlock_irq(&tasklist_lock); 358 write_unlock_irq(&tasklist_lock);
360 } 359 }
361 360
362 void __set_special_pids(struct pid *pid) 361 void __set_special_pids(struct pid *pid)
363 { 362 {
364 struct task_struct *curr = current->group_leader; 363 struct task_struct *curr = current->group_leader;
365 364
366 if (task_session(curr) != pid) 365 if (task_session(curr) != pid)
367 change_pid(curr, PIDTYPE_SID, pid); 366 change_pid(curr, PIDTYPE_SID, pid);
368 367
369 if (task_pgrp(curr) != pid) 368 if (task_pgrp(curr) != pid)
370 change_pid(curr, PIDTYPE_PGID, pid); 369 change_pid(curr, PIDTYPE_PGID, pid);
371 } 370 }
372 371
373 static void set_special_pids(struct pid *pid) 372 static void set_special_pids(struct pid *pid)
374 { 373 {
375 write_lock_irq(&tasklist_lock); 374 write_lock_irq(&tasklist_lock);
376 __set_special_pids(pid); 375 __set_special_pids(pid);
377 write_unlock_irq(&tasklist_lock); 376 write_unlock_irq(&tasklist_lock);
378 } 377 }
379 378
380 /* 379 /*
381 * Let kernel threads use this to say that they allow a certain signal. 380 * Let kernel threads use this to say that they allow a certain signal.
382 * Must not be used if kthread was cloned with CLONE_SIGHAND. 381 * Must not be used if kthread was cloned with CLONE_SIGHAND.
383 */ 382 */
384 int allow_signal(int sig) 383 int allow_signal(int sig)
385 { 384 {
386 if (!valid_signal(sig) || sig < 1) 385 if (!valid_signal(sig) || sig < 1)
387 return -EINVAL; 386 return -EINVAL;
388 387
389 spin_lock_irq(&current->sighand->siglock); 388 spin_lock_irq(&current->sighand->siglock);
390 /* This is only needed for daemonize()'ed kthreads */ 389 /* This is only needed for daemonize()'ed kthreads */
391 sigdelset(&current->blocked, sig); 390 sigdelset(&current->blocked, sig);
392 /* 391 /*
393 * Kernel threads handle their own signals. Let the signal code 392 * Kernel threads handle their own signals. Let the signal code
394 * know it'll be handled, so that they don't get converted to 393 * know it'll be handled, so that they don't get converted to
395 * SIGKILL or just silently dropped. 394 * SIGKILL or just silently dropped.
396 */ 395 */
397 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 396 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
398 recalc_sigpending(); 397 recalc_sigpending();
399 spin_unlock_irq(&current->sighand->siglock); 398 spin_unlock_irq(&current->sighand->siglock);
400 return 0; 399 return 0;
401 } 400 }
402 401
403 EXPORT_SYMBOL(allow_signal); 402 EXPORT_SYMBOL(allow_signal);
404 403
405 int disallow_signal(int sig) 404 int disallow_signal(int sig)
406 { 405 {
407 if (!valid_signal(sig) || sig < 1) 406 if (!valid_signal(sig) || sig < 1)
408 return -EINVAL; 407 return -EINVAL;
409 408
410 spin_lock_irq(&current->sighand->siglock); 409 spin_lock_irq(&current->sighand->siglock);
411 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; 410 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
412 recalc_sigpending(); 411 recalc_sigpending();
413 spin_unlock_irq(&current->sighand->siglock); 412 spin_unlock_irq(&current->sighand->siglock);
414 return 0; 413 return 0;
415 } 414 }
416 415
417 EXPORT_SYMBOL(disallow_signal); 416 EXPORT_SYMBOL(disallow_signal);
418 417
419 /* 418 /*
420 * Put all the gunge required to become a kernel thread without 419 * Put all the gunge required to become a kernel thread without
421 * attached user resources in one place where it belongs. 420 * attached user resources in one place where it belongs.
422 */ 421 */
423 422
424 void daemonize(const char *name, ...) 423 void daemonize(const char *name, ...)
425 { 424 {
426 va_list args; 425 va_list args;
427 sigset_t blocked; 426 sigset_t blocked;
428 427
429 va_start(args, name); 428 va_start(args, name);
430 vsnprintf(current->comm, sizeof(current->comm), name, args); 429 vsnprintf(current->comm, sizeof(current->comm), name, args);
431 va_end(args); 430 va_end(args);
432 431
433 /* 432 /*
434 * If we were started as a result of loading a module, close all of the 433 * If we were started as a result of loading a module, close all of the
435 * user space pages. We don't need them, and if we didn't close them 434 * user space pages. We don't need them, and if we didn't close them
436 * they would be locked into memory. 435 * they would be locked into memory.
437 */ 436 */
438 exit_mm(current); 437 exit_mm(current);
439 /* 438 /*
440 * We don't want to have TIF_FREEZE set if the system-wide hibernation 439 * We don't want to have TIF_FREEZE set if the system-wide hibernation
441 * or suspend transition begins right now. 440 * or suspend transition begins right now.
442 */ 441 */
443 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 442 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444 443
445 if (current->nsproxy != &init_nsproxy) { 444 if (current->nsproxy != &init_nsproxy) {
446 get_nsproxy(&init_nsproxy); 445 get_nsproxy(&init_nsproxy);
447 switch_task_namespaces(current, &init_nsproxy); 446 switch_task_namespaces(current, &init_nsproxy);
448 } 447 }
449 set_special_pids(&init_struct_pid); 448 set_special_pids(&init_struct_pid);
450 proc_clear_tty(current); 449 proc_clear_tty(current);
451 450
452 /* Block and flush all signals */ 451 /* Block and flush all signals */
453 sigfillset(&blocked); 452 sigfillset(&blocked);
454 sigprocmask(SIG_BLOCK, &blocked, NULL); 453 sigprocmask(SIG_BLOCK, &blocked, NULL);
455 flush_signals(current); 454 flush_signals(current);
456 455
457 /* Become as one with the init task */ 456 /* Become as one with the init task */
458 457
459 daemonize_fs_struct(); 458 daemonize_fs_struct();
460 exit_files(current); 459 exit_files(current);
461 current->files = init_task.files; 460 current->files = init_task.files;
462 atomic_inc(&current->files->count); 461 atomic_inc(&current->files->count);
463 462
464 reparent_to_kthreadd(); 463 reparent_to_kthreadd();
465 } 464 }
466 465
467 EXPORT_SYMBOL(daemonize); 466 EXPORT_SYMBOL(daemonize);
468 467
469 static void close_files(struct files_struct * files) 468 static void close_files(struct files_struct * files)
470 { 469 {
471 int i, j; 470 int i, j;
472 struct fdtable *fdt; 471 struct fdtable *fdt;
473 472
474 j = 0; 473 j = 0;
475 474
476 /* 475 /*
477 * It is safe to dereference the fd table without RCU or 476 * It is safe to dereference the fd table without RCU or
478 * ->file_lock because this is the last reference to the 477 * ->file_lock because this is the last reference to the
479 * files structure. But use RCU to shut RCU-lockdep up. 478 * files structure. But use RCU to shut RCU-lockdep up.
480 */ 479 */
481 rcu_read_lock(); 480 rcu_read_lock();
482 fdt = files_fdtable(files); 481 fdt = files_fdtable(files);
483 rcu_read_unlock(); 482 rcu_read_unlock();
484 for (;;) { 483 for (;;) {
485 unsigned long set; 484 unsigned long set;
486 i = j * __NFDBITS; 485 i = j * __NFDBITS;
487 if (i >= fdt->max_fds) 486 if (i >= fdt->max_fds)
488 break; 487 break;
489 set = fdt->open_fds->fds_bits[j++]; 488 set = fdt->open_fds->fds_bits[j++];
490 while (set) { 489 while (set) {
491 if (set & 1) { 490 if (set & 1) {
492 struct file * file = xchg(&fdt->fd[i], NULL); 491 struct file * file = xchg(&fdt->fd[i], NULL);
493 if (file) { 492 if (file) {
494 filp_close(file, files); 493 filp_close(file, files);
495 cond_resched(); 494 cond_resched();
496 } 495 }
497 } 496 }
498 i++; 497 i++;
499 set >>= 1; 498 set >>= 1;
500 } 499 }
501 } 500 }
502 } 501 }
503 502
504 struct files_struct *get_files_struct(struct task_struct *task) 503 struct files_struct *get_files_struct(struct task_struct *task)
505 { 504 {
506 struct files_struct *files; 505 struct files_struct *files;
507 506
508 task_lock(task); 507 task_lock(task);
509 files = task->files; 508 files = task->files;
510 if (files) 509 if (files)
511 atomic_inc(&files->count); 510 atomic_inc(&files->count);
512 task_unlock(task); 511 task_unlock(task);
513 512
514 return files; 513 return files;
515 } 514 }
516 515
517 void put_files_struct(struct files_struct *files) 516 void put_files_struct(struct files_struct *files)
518 { 517 {
519 struct fdtable *fdt; 518 struct fdtable *fdt;
520 519
521 if (atomic_dec_and_test(&files->count)) { 520 if (atomic_dec_and_test(&files->count)) {
522 close_files(files); 521 close_files(files);
523 /* 522 /*
524 * Free the fd and fdset arrays if we expanded them. 523 * Free the fd and fdset arrays if we expanded them.
525 * If the fdtable was embedded, pass files for freeing 524 * If the fdtable was embedded, pass files for freeing
526 * at the end of the RCU grace period. Otherwise, 525 * at the end of the RCU grace period. Otherwise,
527 * you can free files immediately. 526 * you can free files immediately.
528 */ 527 */
529 rcu_read_lock(); 528 rcu_read_lock();
530 fdt = files_fdtable(files); 529 fdt = files_fdtable(files);
531 if (fdt != &files->fdtab) 530 if (fdt != &files->fdtab)
532 kmem_cache_free(files_cachep, files); 531 kmem_cache_free(files_cachep, files);
533 free_fdtable(fdt); 532 free_fdtable(fdt);
534 rcu_read_unlock(); 533 rcu_read_unlock();
535 } 534 }
536 } 535 }
537 536
538 void reset_files_struct(struct files_struct *files) 537 void reset_files_struct(struct files_struct *files)
539 { 538 {
540 struct task_struct *tsk = current; 539 struct task_struct *tsk = current;
541 struct files_struct *old; 540 struct files_struct *old;
542 541
543 old = tsk->files; 542 old = tsk->files;
544 task_lock(tsk); 543 task_lock(tsk);
545 tsk->files = files; 544 tsk->files = files;
546 task_unlock(tsk); 545 task_unlock(tsk);
547 put_files_struct(old); 546 put_files_struct(old);
548 } 547 }
549 548
550 void exit_files(struct task_struct *tsk) 549 void exit_files(struct task_struct *tsk)
551 { 550 {
552 struct files_struct * files = tsk->files; 551 struct files_struct * files = tsk->files;
553 552
554 if (files) { 553 if (files) {
555 task_lock(tsk); 554 task_lock(tsk);
556 tsk->files = NULL; 555 tsk->files = NULL;
557 task_unlock(tsk); 556 task_unlock(tsk);
558 put_files_struct(files); 557 put_files_struct(files);
559 } 558 }
560 } 559 }
561 560
562 #ifdef CONFIG_MM_OWNER 561 #ifdef CONFIG_MM_OWNER
563 /* 562 /*
564 * Task p is exiting and it owned mm, let's find a new owner for it 563 * Task p is exiting and it owned mm, let's find a new owner for it
565 */ 564 */
566 static inline int 565 static inline int
567 mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) 566 mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
568 { 567 {
569 /* 568 /*
570 * If there are other users of the mm and the owner (us) is exiting 569 * If there are other users of the mm and the owner (us) is exiting
571 * we need to find a new owner to take on the responsibility. 570 * we need to find a new owner to take on the responsibility.
572 */ 571 */
573 if (atomic_read(&mm->mm_users) <= 1) 572 if (atomic_read(&mm->mm_users) <= 1)
574 return 0; 573 return 0;
575 if (mm->owner != p) 574 if (mm->owner != p)
576 return 0; 575 return 0;
577 return 1; 576 return 1;
578 } 577 }
579 578
580 void mm_update_next_owner(struct mm_struct *mm) 579 void mm_update_next_owner(struct mm_struct *mm)
581 { 580 {
582 struct task_struct *c, *g, *p = current; 581 struct task_struct *c, *g, *p = current;
583 582
584 retry: 583 retry:
585 if (!mm_need_new_owner(mm, p)) 584 if (!mm_need_new_owner(mm, p))
586 return; 585 return;
587 586
588 read_lock(&tasklist_lock); 587 read_lock(&tasklist_lock);
589 /* 588 /*
590 * Search in the children 589 * Search in the children
591 */ 590 */
592 list_for_each_entry(c, &p->children, sibling) { 591 list_for_each_entry(c, &p->children, sibling) {
593 if (c->mm == mm) 592 if (c->mm == mm)
594 goto assign_new_owner; 593 goto assign_new_owner;
595 } 594 }
596 595
597 /* 596 /*
598 * Search in the siblings 597 * Search in the siblings
599 */ 598 */
600 list_for_each_entry(c, &p->real_parent->children, sibling) { 599 list_for_each_entry(c, &p->real_parent->children, sibling) {
601 if (c->mm == mm) 600 if (c->mm == mm)
602 goto assign_new_owner; 601 goto assign_new_owner;
603 } 602 }
604 603
605 /* 604 /*
606 * Search through everything else. We should not get 605 * Search through everything else. We should not get
607 * here often 606 * here often
608 */ 607 */
609 do_each_thread(g, c) { 608 do_each_thread(g, c) {
610 if (c->mm == mm) 609 if (c->mm == mm)
611 goto assign_new_owner; 610 goto assign_new_owner;
612 } while_each_thread(g, c); 611 } while_each_thread(g, c);
613 612
614 read_unlock(&tasklist_lock); 613 read_unlock(&tasklist_lock);
615 /* 614 /*
616 * We found no owner yet mm_users > 1: this implies that we are 615 * We found no owner yet mm_users > 1: this implies that we are
617 * most likely racing with swapoff (try_to_unuse()) or /proc or 616 * most likely racing with swapoff (try_to_unuse()) or /proc or
618 * ptrace or page migration (get_task_mm()). Mark owner as NULL. 617 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
619 */ 618 */
620 mm->owner = NULL; 619 mm->owner = NULL;
621 return; 620 return;
622 621
623 assign_new_owner: 622 assign_new_owner:
624 BUG_ON(c == p); 623 BUG_ON(c == p);
625 get_task_struct(c); 624 get_task_struct(c);
626 /* 625 /*
627 * The task_lock protects c->mm from changing. 626 * The task_lock protects c->mm from changing.
628 * We always want mm->owner->mm == mm 627 * We always want mm->owner->mm == mm
629 */ 628 */
630 task_lock(c); 629 task_lock(c);
631 /* 630 /*
632 * Delay read_unlock() till we have the task_lock() 631 * Delay read_unlock() till we have the task_lock()
633 * to ensure that c does not slip away underneath us 632 * to ensure that c does not slip away underneath us
634 */ 633 */
635 read_unlock(&tasklist_lock); 634 read_unlock(&tasklist_lock);
636 if (c->mm != mm) { 635 if (c->mm != mm) {
637 task_unlock(c); 636 task_unlock(c);
638 put_task_struct(c); 637 put_task_struct(c);
639 goto retry; 638 goto retry;
640 } 639 }
641 mm->owner = c; 640 mm->owner = c;
642 task_unlock(c); 641 task_unlock(c);
643 put_task_struct(c); 642 put_task_struct(c);
644 } 643 }
645 #endif /* CONFIG_MM_OWNER */ 644 #endif /* CONFIG_MM_OWNER */
646 645
647 /* 646 /*
648 * Turn us into a lazy TLB process if we 647 * Turn us into a lazy TLB process if we
649 * aren't already.. 648 * aren't already..
650 */ 649 */
651 static void exit_mm(struct task_struct * tsk) 650 static void exit_mm(struct task_struct * tsk)
652 { 651 {
653 struct mm_struct *mm = tsk->mm; 652 struct mm_struct *mm = tsk->mm;
654 struct core_state *core_state; 653 struct core_state *core_state;
655 654
656 mm_release(tsk, mm); 655 mm_release(tsk, mm);
657 if (!mm) 656 if (!mm)
658 return; 657 return;
659 /* 658 /*
660 * Serialize with any possible pending coredump. 659 * Serialize with any possible pending coredump.
661 * We must hold mmap_sem around checking core_state 660 * We must hold mmap_sem around checking core_state
662 * and clearing tsk->mm. The core-inducing thread 661 * and clearing tsk->mm. The core-inducing thread
663 * will increment ->nr_threads for each thread in the 662 * will increment ->nr_threads for each thread in the
664 * group with ->mm != NULL. 663 * group with ->mm != NULL.
665 */ 664 */
666 down_read(&mm->mmap_sem); 665 down_read(&mm->mmap_sem);
667 core_state = mm->core_state; 666 core_state = mm->core_state;
668 if (core_state) { 667 if (core_state) {
669 struct core_thread self; 668 struct core_thread self;
670 up_read(&mm->mmap_sem); 669 up_read(&mm->mmap_sem);
671 670
672 self.task = tsk; 671 self.task = tsk;
673 self.next = xchg(&core_state->dumper.next, &self); 672 self.next = xchg(&core_state->dumper.next, &self);
674 /* 673 /*
675 * Implies mb(), the result of xchg() must be visible 674 * Implies mb(), the result of xchg() must be visible
676 * to core_state->dumper. 675 * to core_state->dumper.
677 */ 676 */
678 if (atomic_dec_and_test(&core_state->nr_threads)) 677 if (atomic_dec_and_test(&core_state->nr_threads))
679 complete(&core_state->startup); 678 complete(&core_state->startup);
680 679
681 for (;;) { 680 for (;;) {
682 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 681 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
683 if (!self.task) /* see coredump_finish() */ 682 if (!self.task) /* see coredump_finish() */
684 break; 683 break;
685 schedule(); 684 schedule();
686 } 685 }
687 __set_task_state(tsk, TASK_RUNNING); 686 __set_task_state(tsk, TASK_RUNNING);
688 down_read(&mm->mmap_sem); 687 down_read(&mm->mmap_sem);
689 } 688 }
690 atomic_inc(&mm->mm_count); 689 atomic_inc(&mm->mm_count);
691 BUG_ON(mm != tsk->active_mm); 690 BUG_ON(mm != tsk->active_mm);
692 /* more a memory barrier than a real lock */ 691 /* more a memory barrier than a real lock */
693 task_lock(tsk); 692 task_lock(tsk);
694 tsk->mm = NULL; 693 tsk->mm = NULL;
695 up_read(&mm->mmap_sem); 694 up_read(&mm->mmap_sem);
696 enter_lazy_tlb(mm, current); 695 enter_lazy_tlb(mm, current);
697 /* We don't want this task to be frozen prematurely */ 696 /* We don't want this task to be frozen prematurely */
698 clear_freeze_flag(tsk); 697 clear_freeze_flag(tsk);
699 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) 698 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
700 atomic_dec(&mm->oom_disable_count); 699 atomic_dec(&mm->oom_disable_count);
701 task_unlock(tsk); 700 task_unlock(tsk);
702 mm_update_next_owner(mm); 701 mm_update_next_owner(mm);
703 mmput(mm); 702 mmput(mm);
704 } 703 }
705 704
706 /* 705 /*
707 * When we die, we re-parent all our children. 706 * When we die, we re-parent all our children.
708 * Try to give them to another thread in our thread 707 * Try to give them to another thread in our thread
709 * group, and if no such member exists, give it to 708 * group, and if no such member exists, give it to
710 * the child reaper process (ie "init") in our pid 709 * the child reaper process (ie "init") in our pid
711 * space. 710 * space.
712 */ 711 */
713 static struct task_struct *find_new_reaper(struct task_struct *father) 712 static struct task_struct *find_new_reaper(struct task_struct *father)
714 __releases(&tasklist_lock) 713 __releases(&tasklist_lock)
715 __acquires(&tasklist_lock) 714 __acquires(&tasklist_lock)
716 { 715 {
717 struct pid_namespace *pid_ns = task_active_pid_ns(father); 716 struct pid_namespace *pid_ns = task_active_pid_ns(father);
718 struct task_struct *thread; 717 struct task_struct *thread;
719 718
720 thread = father; 719 thread = father;
721 while_each_thread(father, thread) { 720 while_each_thread(father, thread) {
722 if (thread->flags & PF_EXITING) 721 if (thread->flags & PF_EXITING)
723 continue; 722 continue;
724 if (unlikely(pid_ns->child_reaper == father)) 723 if (unlikely(pid_ns->child_reaper == father))
725 pid_ns->child_reaper = thread; 724 pid_ns->child_reaper = thread;
726 return thread; 725 return thread;
727 } 726 }
728 727
729 if (unlikely(pid_ns->child_reaper == father)) { 728 if (unlikely(pid_ns->child_reaper == father)) {
730 write_unlock_irq(&tasklist_lock); 729 write_unlock_irq(&tasklist_lock);
731 if (unlikely(pid_ns == &init_pid_ns)) 730 if (unlikely(pid_ns == &init_pid_ns))
732 panic("Attempted to kill init!"); 731 panic("Attempted to kill init!");
733 732
734 zap_pid_ns_processes(pid_ns); 733 zap_pid_ns_processes(pid_ns);
735 write_lock_irq(&tasklist_lock); 734 write_lock_irq(&tasklist_lock);
736 /* 735 /*
737 * We can not clear ->child_reaper or leave it alone. 736 * We can not clear ->child_reaper or leave it alone.
738 * There may be stealth EXIT_DEAD tasks on ->children, 737 * There may be stealth EXIT_DEAD tasks on ->children,
739 * forget_original_parent() must move them somewhere. 738 * forget_original_parent() must move them somewhere.
740 */ 739 */
741 pid_ns->child_reaper = init_pid_ns.child_reaper; 740 pid_ns->child_reaper = init_pid_ns.child_reaper;
742 } 741 }
743 742
744 return pid_ns->child_reaper; 743 return pid_ns->child_reaper;
745 } 744 }
746 745
747 /* 746 /*
748 * Any that need to be release_task'd are put on the @dead list. 747 * Any that need to be release_task'd are put on the @dead list.
749 */ 748 */
750 static void reparent_leader(struct task_struct *father, struct task_struct *p, 749 static void reparent_leader(struct task_struct *father, struct task_struct *p,
751 struct list_head *dead) 750 struct list_head *dead)
752 { 751 {
753 list_move_tail(&p->sibling, &p->real_parent->children); 752 list_move_tail(&p->sibling, &p->real_parent->children);
754 753
755 if (task_detached(p)) 754 if (task_detached(p))
756 return; 755 return;
757 /* 756 /*
758 * If this is a threaded reparent there is no need to 757 * If this is a threaded reparent there is no need to
759 * notify anyone anything has happened. 758 * notify anyone anything has happened.
760 */ 759 */
761 if (same_thread_group(p->real_parent, father)) 760 if (same_thread_group(p->real_parent, father))
762 return; 761 return;
763 762
764 /* We don't want people slaying init. */ 763 /* We don't want people slaying init. */
765 p->exit_signal = SIGCHLD; 764 p->exit_signal = SIGCHLD;
766 765
767 /* If it has exited notify the new parent about this child's death. */ 766 /* If it has exited notify the new parent about this child's death. */
768 if (!task_ptrace(p) && 767 if (!task_ptrace(p) &&
769 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 768 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
770 do_notify_parent(p, p->exit_signal); 769 do_notify_parent(p, p->exit_signal);
771 if (task_detached(p)) { 770 if (task_detached(p)) {
772 p->exit_state = EXIT_DEAD; 771 p->exit_state = EXIT_DEAD;
773 list_move_tail(&p->sibling, dead); 772 list_move_tail(&p->sibling, dead);
774 } 773 }
775 } 774 }
776 775
777 kill_orphaned_pgrp(p, father); 776 kill_orphaned_pgrp(p, father);
778 } 777 }
779 778
780 static void forget_original_parent(struct task_struct *father) 779 static void forget_original_parent(struct task_struct *father)
781 { 780 {
782 struct task_struct *p, *n, *reaper; 781 struct task_struct *p, *n, *reaper;
783 LIST_HEAD(dead_children); 782 LIST_HEAD(dead_children);
784 783
785 write_lock_irq(&tasklist_lock); 784 write_lock_irq(&tasklist_lock);
786 /* 785 /*
787 * Note that exit_ptrace() and find_new_reaper() might 786 * Note that exit_ptrace() and find_new_reaper() might
788 * drop tasklist_lock and reacquire it. 787 * drop tasklist_lock and reacquire it.
789 */ 788 */
790 exit_ptrace(father); 789 exit_ptrace(father);
791 reaper = find_new_reaper(father); 790 reaper = find_new_reaper(father);
792 791
793 list_for_each_entry_safe(p, n, &father->children, sibling) { 792 list_for_each_entry_safe(p, n, &father->children, sibling) {
794 struct task_struct *t = p; 793 struct task_struct *t = p;
795 do { 794 do {
796 t->real_parent = reaper; 795 t->real_parent = reaper;
797 if (t->parent == father) { 796 if (t->parent == father) {
798 BUG_ON(task_ptrace(t)); 797 BUG_ON(task_ptrace(t));
799 t->parent = t->real_parent; 798 t->parent = t->real_parent;
800 } 799 }
801 if (t->pdeath_signal) 800 if (t->pdeath_signal)
802 group_send_sig_info(t->pdeath_signal, 801 group_send_sig_info(t->pdeath_signal,
803 SEND_SIG_NOINFO, t); 802 SEND_SIG_NOINFO, t);
804 } while_each_thread(p, t); 803 } while_each_thread(p, t);
805 reparent_leader(father, p, &dead_children); 804 reparent_leader(father, p, &dead_children);
806 } 805 }
807 write_unlock_irq(&tasklist_lock); 806 write_unlock_irq(&tasklist_lock);
808 807
809 BUG_ON(!list_empty(&father->children)); 808 BUG_ON(!list_empty(&father->children));
810 809
811 list_for_each_entry_safe(p, n, &dead_children, sibling) { 810 list_for_each_entry_safe(p, n, &dead_children, sibling) {
812 list_del_init(&p->sibling); 811 list_del_init(&p->sibling);
813 release_task(p); 812 release_task(p);
814 } 813 }
815 } 814 }
816 815
817 /* 816 /*
818 * Send signals to all our closest relatives so that they know 817 * Send signals to all our closest relatives so that they know
819 * to properly mourn us.. 818 * to properly mourn us..
820 */ 819 */
821 static void exit_notify(struct task_struct *tsk, int group_dead) 820 static void exit_notify(struct task_struct *tsk, int group_dead)
822 { 821 {
823 int signal; 822 int signal;
824 void *cookie; 823 void *cookie;
825 824
826 /* 825 /*
827 * This does two things: 826 * This does two things:
828 * 827 *
829 * A. Make init inherit all the child processes 828 * A. Make init inherit all the child processes
830 * B. Check to see if any process groups have become orphaned 829 * B. Check to see if any process groups have become orphaned
831 * as a result of our exiting, and if they have any stopped 830 * as a result of our exiting, and if they have any stopped
832 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 831 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
833 */ 832 */
834 forget_original_parent(tsk); 833 forget_original_parent(tsk);
835 exit_task_namespaces(tsk); 834 exit_task_namespaces(tsk);
836 835
837 write_lock_irq(&tasklist_lock); 836 write_lock_irq(&tasklist_lock);
838 if (group_dead) 837 if (group_dead)
839 kill_orphaned_pgrp(tsk->group_leader, NULL); 838 kill_orphaned_pgrp(tsk->group_leader, NULL);
840 839
841 /* Let father know we died 840 /* Let father know we died
842 * 841 *
843 * Thread signals are configurable, but you aren't going to use 842 * Thread signals are configurable, but you aren't going to use
844 * that to send signals to arbitrary processes. 843 * that to send signals to arbitrary processes.
845 * That stops right now. 844 * That stops right now.
846 * 845 *
847 * If the parent exec id doesn't match the exec id we saved 846 * If the parent exec id doesn't match the exec id we saved
848 * when we started then we know the parent has changed security 847 * when we started then we know the parent has changed security
849 * domain. 848 * domain.
850 * 849 *
851 * If our self_exec id doesn't match our parent_exec_id then 850 * If our self_exec id doesn't match our parent_exec_id then
852 * we have changed execution domain as these two values started 851 * we have changed execution domain as these two values started
853 * the same after a fork. 852 * the same after a fork.
854 */ 853 */
855 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 854 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
856 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 855 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
857 tsk->self_exec_id != tsk->parent_exec_id)) 856 tsk->self_exec_id != tsk->parent_exec_id))
858 tsk->exit_signal = SIGCHLD; 857 tsk->exit_signal = SIGCHLD;
859 858
860 signal = tracehook_notify_death(tsk, &cookie, group_dead); 859 signal = tracehook_notify_death(tsk, &cookie, group_dead);
861 if (signal >= 0) 860 if (signal >= 0)
862 signal = do_notify_parent(tsk, signal); 861 signal = do_notify_parent(tsk, signal);
863 862
864 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 863 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
865 864
866 /* mt-exec, de_thread() is waiting for group leader */ 865 /* mt-exec, de_thread() is waiting for group leader */
867 if (unlikely(tsk->signal->notify_count < 0)) 866 if (unlikely(tsk->signal->notify_count < 0))
868 wake_up_process(tsk->signal->group_exit_task); 867 wake_up_process(tsk->signal->group_exit_task);
869 write_unlock_irq(&tasklist_lock); 868 write_unlock_irq(&tasklist_lock);
870 869
871 tracehook_report_death(tsk, signal, cookie, group_dead); 870 tracehook_report_death(tsk, signal, cookie, group_dead);
872 871
873 /* If the process is dead, release it - nobody will wait for it */ 872 /* If the process is dead, release it - nobody will wait for it */
874 if (signal == DEATH_REAP) 873 if (signal == DEATH_REAP)
875 release_task(tsk); 874 release_task(tsk);
876 } 875 }
877 876
878 #ifdef CONFIG_DEBUG_STACK_USAGE 877 #ifdef CONFIG_DEBUG_STACK_USAGE
879 static void check_stack_usage(void) 878 static void check_stack_usage(void)
880 { 879 {
881 static DEFINE_SPINLOCK(low_water_lock); 880 static DEFINE_SPINLOCK(low_water_lock);
882 static int lowest_to_date = THREAD_SIZE; 881 static int lowest_to_date = THREAD_SIZE;
883 unsigned long free; 882 unsigned long free;
884 883
885 free = stack_not_used(current); 884 free = stack_not_used(current);
886 885
887 if (free >= lowest_to_date) 886 if (free >= lowest_to_date)
888 return; 887 return;
889 888
890 spin_lock(&low_water_lock); 889 spin_lock(&low_water_lock);
891 if (free < lowest_to_date) { 890 if (free < lowest_to_date) {
892 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " 891 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
893 "left\n", 892 "left\n",
894 current->comm, free); 893 current->comm, free);
895 lowest_to_date = free; 894 lowest_to_date = free;
896 } 895 }
897 spin_unlock(&low_water_lock); 896 spin_unlock(&low_water_lock);
898 } 897 }
899 #else 898 #else
900 static inline void check_stack_usage(void) {} 899 static inline void check_stack_usage(void) {}
901 #endif 900 #endif
902 901
903 NORET_TYPE void do_exit(long code) 902 NORET_TYPE void do_exit(long code)
904 { 903 {
905 struct task_struct *tsk = current; 904 struct task_struct *tsk = current;
906 int group_dead; 905 int group_dead;
907 906
908 profile_task_exit(tsk); 907 profile_task_exit(tsk);
909 908
910 WARN_ON(atomic_read(&tsk->fs_excl)); 909 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk)); 910 WARN_ON(blk_needs_flush_plug(tsk));
912 911
913 if (unlikely(in_interrupt())) 912 if (unlikely(in_interrupt()))
914 panic("Aiee, killing interrupt handler!"); 913 panic("Aiee, killing interrupt handler!");
915 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
916 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
917 916
918 /* 917 /*
919 * If do_exit is called because this process oopsed, it's possible 918 * If do_exit is called because this process oopsed, it's possible
920 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before 919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
921 * continuing. Amongst other possible reasons, this is to prevent 920 * continuing. Amongst other possible reasons, this is to prevent
922 * mm_release()->clear_child_tid() from writing to a user-controlled 921 * mm_release()->clear_child_tid() from writing to a user-controlled
923 * kernel address. 922 * kernel address.
924 */ 923 */
925 set_fs(USER_DS); 924 set_fs(USER_DS);
926 925
927 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
928 927
929 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
930 929
931 /* 930 /*
932 * We're taking recursive faults here in do_exit. Safest is to just 931 * We're taking recursive faults here in do_exit. Safest is to just
933 * leave this task alone and wait for reboot. 932 * leave this task alone and wait for reboot.
934 */ 933 */
935 if (unlikely(tsk->flags & PF_EXITING)) { 934 if (unlikely(tsk->flags & PF_EXITING)) {
936 printk(KERN_ALERT 935 printk(KERN_ALERT
937 "Fixing recursive fault but reboot is needed!\n"); 936 "Fixing recursive fault but reboot is needed!\n");
938 /* 937 /*
939 * We can do this unlocked here. The futex code uses 938 * We can do this unlocked here. The futex code uses
940 * this flag just to verify whether the pi state 939 * this flag just to verify whether the pi state
941 * cleanup has been done or not. In the worst case it 940 * cleanup has been done or not. In the worst case it
942 * loops once more. We pretend that the cleanup was 941 * loops once more. We pretend that the cleanup was
943 * done as there is no way to return. Either the 942 * done as there is no way to return. Either the
944 * OWNER_DIED bit is set by now or we push the blocked 943 * OWNER_DIED bit is set by now or we push the blocked
945 * task into the wait-forever nirvana as well. 944 * task into the wait-forever nirvana as well.
946 */ 945 */
947 tsk->flags |= PF_EXITPIDONE; 946 tsk->flags |= PF_EXITPIDONE;
948 set_current_state(TASK_UNINTERRUPTIBLE); 947 set_current_state(TASK_UNINTERRUPTIBLE);
949 schedule(); 948 schedule();
950 } 949 }
951 950
952 exit_irq_thread(); 951 exit_irq_thread();
953 952
954 exit_signals(tsk); /* sets PF_EXITING */ 953 exit_signals(tsk); /* sets PF_EXITING */
955 /* 954 /*
956 * tsk->flags are checked in the futex code to protect against 955 * tsk->flags are checked in the futex code to protect against
957 * an exiting task cleaning up the robust pi futexes. 956 * an exiting task cleaning up the robust pi futexes.
958 */ 957 */
959 smp_mb(); 958 smp_mb();
960 raw_spin_unlock_wait(&tsk->pi_lock); 959 raw_spin_unlock_wait(&tsk->pi_lock);
961 960
962 if (unlikely(in_atomic())) 961 if (unlikely(in_atomic()))
963 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 962 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
964 current->comm, task_pid_nr(current), 963 current->comm, task_pid_nr(current),
965 preempt_count()); 964 preempt_count());
966 965
967 acct_update_integrals(tsk); 966 acct_update_integrals(tsk);
968 /* sync mm's RSS info before statistics gathering */ 967 /* sync mm's RSS info before statistics gathering */
969 if (tsk->mm) 968 if (tsk->mm)
970 sync_mm_rss(tsk, tsk->mm); 969 sync_mm_rss(tsk, tsk->mm);
971 group_dead = atomic_dec_and_test(&tsk->signal->live); 970 group_dead = atomic_dec_and_test(&tsk->signal->live);
972 if (group_dead) { 971 if (group_dead) {
973 hrtimer_cancel(&tsk->signal->real_timer); 972 hrtimer_cancel(&tsk->signal->real_timer);
974 exit_itimers(tsk->signal); 973 exit_itimers(tsk->signal);
975 if (tsk->mm) 974 if (tsk->mm)
976 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); 975 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
977 } 976 }
978 acct_collect(code, group_dead); 977 acct_collect(code, group_dead);
979 if (group_dead) 978 if (group_dead)
980 tty_audit_exit(); 979 tty_audit_exit();
981 if (unlikely(tsk->audit_context)) 980 if (unlikely(tsk->audit_context))
982 audit_free(tsk); 981 audit_free(tsk);
983 982
984 tsk->exit_code = code; 983 tsk->exit_code = code;
985 taskstats_exit(tsk, group_dead); 984 taskstats_exit(tsk, group_dead);
986 985
987 exit_mm(tsk); 986 exit_mm(tsk);
988 987
989 if (group_dead) 988 if (group_dead)
990 acct_process(); 989 acct_process();
991 trace_sched_process_exit(tsk); 990 trace_sched_process_exit(tsk);
992 991
993 exit_sem(tsk); 992 exit_sem(tsk);
994 exit_files(tsk); 993 exit_files(tsk);
995 exit_fs(tsk); 994 exit_fs(tsk);
996 check_stack_usage(); 995 check_stack_usage();
997 exit_thread(); 996 exit_thread();
998 997
999 /* 998 /*
1000 * Flush inherited counters to the parent - before the parent 999 * Flush inherited counters to the parent - before the parent
1001 * gets woken up by child-exit notifications. 1000 * gets woken up by child-exit notifications.
1002 * 1001 *
1003 * because of cgroup mode, must be called before cgroup_exit() 1002 * because of cgroup mode, must be called before cgroup_exit()
1004 */ 1003 */
1005 perf_event_exit_task(tsk); 1004 perf_event_exit_task(tsk);
1006 1005
1007 cgroup_exit(tsk, 1); 1006 cgroup_exit(tsk, 1);
1008 1007
1009 if (group_dead) 1008 if (group_dead)
1010 disassociate_ctty(1); 1009 disassociate_ctty(1);
1011 1010
1012 module_put(task_thread_info(tsk)->exec_domain->module); 1011 module_put(task_thread_info(tsk)->exec_domain->module);
1013 1012
1014 proc_exit_connector(tsk); 1013 proc_exit_connector(tsk);
1015 1014
1016 /* 1015 /*
1017 * FIXME: do that only when needed, using sched_exit tracepoint 1016 * FIXME: do that only when needed, using sched_exit tracepoint
1018 */ 1017 */
1019 ptrace_put_breakpoints(tsk); 1018 ptrace_put_breakpoints(tsk);
1020 1019
1021 exit_notify(tsk, group_dead); 1020 exit_notify(tsk, group_dead);
1022 #ifdef CONFIG_NUMA 1021 #ifdef CONFIG_NUMA
1023 task_lock(tsk); 1022 task_lock(tsk);
1024 mpol_put(tsk->mempolicy); 1023 mpol_put(tsk->mempolicy);
1025 tsk->mempolicy = NULL; 1024 tsk->mempolicy = NULL;
1026 task_unlock(tsk); 1025 task_unlock(tsk);
1027 #endif 1026 #endif
1028 #ifdef CONFIG_FUTEX 1027 #ifdef CONFIG_FUTEX
1029 if (unlikely(current->pi_state_cache)) 1028 if (unlikely(current->pi_state_cache))
1030 kfree(current->pi_state_cache); 1029 kfree(current->pi_state_cache);
1031 #endif 1030 #endif
1032 /* 1031 /*
1033 * Make sure we are holding no locks: 1032 * Make sure we are holding no locks:
1034 */ 1033 */
1035 debug_check_no_locks_held(tsk); 1034 debug_check_no_locks_held(tsk);
1036 /* 1035 /*
1037 * We can do this unlocked here. The futex code uses this flag 1036 * We can do this unlocked here. The futex code uses this flag
1038 * just to verify whether the pi state cleanup has been done 1037 * just to verify whether the pi state cleanup has been done
1039 * or not. In the worst case it loops once more. 1038 * or not. In the worst case it loops once more.
1040 */ 1039 */
1041 tsk->flags |= PF_EXITPIDONE; 1040 tsk->flags |= PF_EXITPIDONE;
1042 1041
1043 if (tsk->io_context) 1042 if (tsk->io_context)
1044 exit_io_context(tsk); 1043 exit_io_context(tsk);
1045 1044
1046 if (tsk->splice_pipe) 1045 if (tsk->splice_pipe)
1047 __free_pipe_info(tsk->splice_pipe); 1046 __free_pipe_info(tsk->splice_pipe);
1048 1047
1049 validate_creds_for_do_exit(tsk); 1048 validate_creds_for_do_exit(tsk);
1050 1049
1051 preempt_disable(); 1050 preempt_disable();
1052 exit_rcu(); 1051 exit_rcu();
1053 /* causes final put_task_struct in finish_task_switch(). */ 1052 /* causes final put_task_struct in finish_task_switch(). */
1054 tsk->state = TASK_DEAD; 1053 tsk->state = TASK_DEAD;
1055 schedule(); 1054 schedule();
1056 BUG(); 1055 BUG();
1057 /* Avoid "noreturn function does return". */ 1056 /* Avoid "noreturn function does return". */
1058 for (;;) 1057 for (;;)
1059 cpu_relax(); /* For when BUG is null */ 1058 cpu_relax(); /* For when BUG is null */
1060 } 1059 }
1061 1060
1062 EXPORT_SYMBOL_GPL(do_exit); 1061 EXPORT_SYMBOL_GPL(do_exit);
1063 1062
1064 NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1063 NORET_TYPE void complete_and_exit(struct completion *comp, long code)
1065 { 1064 {
1066 if (comp) 1065 if (comp)
1067 complete(comp); 1066 complete(comp);
1068 1067
1069 do_exit(code); 1068 do_exit(code);
1070 } 1069 }
1071 1070
1072 EXPORT_SYMBOL(complete_and_exit); 1071 EXPORT_SYMBOL(complete_and_exit);
1073 1072
1074 SYSCALL_DEFINE1(exit, int, error_code) 1073 SYSCALL_DEFINE1(exit, int, error_code)
1075 { 1074 {
1076 do_exit((error_code&0xff)<<8); 1075 do_exit((error_code&0xff)<<8);
1077 } 1076 }
1078 1077
1079 /* 1078 /*
1080 * Take down every thread in the group. This is called by fatal signals 1079 * Take down every thread in the group. This is called by fatal signals
1081 * as well as by sys_exit_group (below). 1080 * as well as by sys_exit_group (below).
1082 */ 1081 */
1083 NORET_TYPE void 1082 NORET_TYPE void
1084 do_group_exit(int exit_code) 1083 do_group_exit(int exit_code)
1085 { 1084 {
1086 struct signal_struct *sig = current->signal; 1085 struct signal_struct *sig = current->signal;
1087 1086
1088 BUG_ON(exit_code & 0x80); /* core dumps don't get here */ 1087 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1089 1088
1090 if (signal_group_exit(sig)) 1089 if (signal_group_exit(sig))
1091 exit_code = sig->group_exit_code; 1090 exit_code = sig->group_exit_code;
1092 else if (!thread_group_empty(current)) { 1091 else if (!thread_group_empty(current)) {
1093 struct sighand_struct *const sighand = current->sighand; 1092 struct sighand_struct *const sighand = current->sighand;
1094 spin_lock_irq(&sighand->siglock); 1093 spin_lock_irq(&sighand->siglock);
1095 if (signal_group_exit(sig)) 1094 if (signal_group_exit(sig))
1096 /* Another thread got here before we took the lock. */ 1095 /* Another thread got here before we took the lock. */
1097 exit_code = sig->group_exit_code; 1096 exit_code = sig->group_exit_code;
1098 else { 1097 else {
1099 sig->group_exit_code = exit_code; 1098 sig->group_exit_code = exit_code;
1100 sig->flags = SIGNAL_GROUP_EXIT; 1099 sig->flags = SIGNAL_GROUP_EXIT;
1101 zap_other_threads(current); 1100 zap_other_threads(current);
1102 } 1101 }
1103 spin_unlock_irq(&sighand->siglock); 1102 spin_unlock_irq(&sighand->siglock);
1104 } 1103 }
1105 1104
1106 do_exit(exit_code); 1105 do_exit(exit_code);
1107 /* NOTREACHED */ 1106 /* NOTREACHED */
1108 } 1107 }
1109 1108
1110 /* 1109 /*
1111 * this kills every thread in the thread group. Note that any externally 1110 * this kills every thread in the thread group. Note that any externally
1112 * wait4()-ing process will get the correct exit code - even if this 1111 * wait4()-ing process will get the correct exit code - even if this
1113 * thread is not the thread group leader. 1112 * thread is not the thread group leader.
1114 */ 1113 */
1115 SYSCALL_DEFINE1(exit_group, int, error_code) 1114 SYSCALL_DEFINE1(exit_group, int, error_code)
1116 { 1115 {
1117 do_group_exit((error_code & 0xff) << 8); 1116 do_group_exit((error_code & 0xff) << 8);
1118 /* NOTREACHED */ 1117 /* NOTREACHED */
1119 return 0; 1118 return 0;
1120 } 1119 }
1121 1120
1122 struct wait_opts { 1121 struct wait_opts {
1123 enum pid_type wo_type; 1122 enum pid_type wo_type;
1124 int wo_flags; 1123 int wo_flags;
1125 struct pid *wo_pid; 1124 struct pid *wo_pid;
1126 1125
1127 struct siginfo __user *wo_info; 1126 struct siginfo __user *wo_info;
1128 int __user *wo_stat; 1127 int __user *wo_stat;
1129 struct rusage __user *wo_rusage; 1128 struct rusage __user *wo_rusage;
1130 1129
1131 wait_queue_t child_wait; 1130 wait_queue_t child_wait;
1132 int notask_error; 1131 int notask_error;
1133 }; 1132 };
1134 1133
1135 static inline 1134 static inline
1136 struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1135 struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1137 { 1136 {
1138 if (type != PIDTYPE_PID) 1137 if (type != PIDTYPE_PID)
1139 task = task->group_leader; 1138 task = task->group_leader;
1140 return task->pids[type].pid; 1139 return task->pids[type].pid;
1141 } 1140 }
1142 1141
1143 static int eligible_pid(struct wait_opts *wo, struct task_struct *p) 1142 static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1144 { 1143 {
1145 return wo->wo_type == PIDTYPE_MAX || 1144 return wo->wo_type == PIDTYPE_MAX ||
1146 task_pid_type(p, wo->wo_type) == wo->wo_pid; 1145 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1147 } 1146 }
1148 1147
1149 static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1148 static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1150 { 1149 {
1151 if (!eligible_pid(wo, p)) 1150 if (!eligible_pid(wo, p))
1152 return 0; 1151 return 0;
1153 /* Wait for all children (clone and not) if __WALL is set; 1152 /* Wait for all children (clone and not) if __WALL is set;
1154 * otherwise, wait for clone children *only* if __WCLONE is 1153 * otherwise, wait for clone children *only* if __WCLONE is
1155 * set; otherwise, wait for non-clone children *only*. (Note: 1154 * set; otherwise, wait for non-clone children *only*. (Note:
1156 * A "clone" child here is one that reports to its parent 1155 * A "clone" child here is one that reports to its parent
1157 * using a signal other than SIGCHLD.) */ 1156 * using a signal other than SIGCHLD.) */
1158 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) 1157 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1159 && !(wo->wo_flags & __WALL)) 1158 && !(wo->wo_flags & __WALL))
1160 return 0; 1159 return 0;
1161 1160
1162 return 1; 1161 return 1;
1163 } 1162 }
1164 1163
1165 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, 1164 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1166 pid_t pid, uid_t uid, int why, int status) 1165 pid_t pid, uid_t uid, int why, int status)
1167 { 1166 {
1168 struct siginfo __user *infop; 1167 struct siginfo __user *infop;
1169 int retval = wo->wo_rusage 1168 int retval = wo->wo_rusage
1170 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1169 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1171 1170
1172 put_task_struct(p); 1171 put_task_struct(p);
1173 infop = wo->wo_info; 1172 infop = wo->wo_info;
1174 if (infop) { 1173 if (infop) {
1175 if (!retval) 1174 if (!retval)
1176 retval = put_user(SIGCHLD, &infop->si_signo); 1175 retval = put_user(SIGCHLD, &infop->si_signo);
1177 if (!retval) 1176 if (!retval)
1178 retval = put_user(0, &infop->si_errno); 1177 retval = put_user(0, &infop->si_errno);
1179 if (!retval) 1178 if (!retval)
1180 retval = put_user((short)why, &infop->si_code); 1179 retval = put_user((short)why, &infop->si_code);
1181 if (!retval) 1180 if (!retval)
1182 retval = put_user(pid, &infop->si_pid); 1181 retval = put_user(pid, &infop->si_pid);
1183 if (!retval) 1182 if (!retval)
1184 retval = put_user(uid, &infop->si_uid); 1183 retval = put_user(uid, &infop->si_uid);
1185 if (!retval) 1184 if (!retval)
1186 retval = put_user(status, &infop->si_status); 1185 retval = put_user(status, &infop->si_status);
1187 } 1186 }
1188 if (!retval) 1187 if (!retval)
1189 retval = pid; 1188 retval = pid;
1190 return retval; 1189 return retval;
1191 } 1190 }
1192 1191
1193 /* 1192 /*
1194 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold 1193 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1195 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1194 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1196 * the lock and this task is uninteresting. If we return nonzero, we have 1195 * the lock and this task is uninteresting. If we return nonzero, we have
1197 * released the lock and the system call should return. 1196 * released the lock and the system call should return.
1198 */ 1197 */
1199 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 1198 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1200 { 1199 {
1201 unsigned long state; 1200 unsigned long state;
1202 int retval, status, traced; 1201 int retval, status, traced;
1203 pid_t pid = task_pid_vnr(p); 1202 pid_t pid = task_pid_vnr(p);
1204 uid_t uid = __task_cred(p)->uid; 1203 uid_t uid = __task_cred(p)->uid;
1205 struct siginfo __user *infop; 1204 struct siginfo __user *infop;
1206 1205
1207 if (!likely(wo->wo_flags & WEXITED)) 1206 if (!likely(wo->wo_flags & WEXITED))
1208 return 0; 1207 return 0;
1209 1208
1210 if (unlikely(wo->wo_flags & WNOWAIT)) { 1209 if (unlikely(wo->wo_flags & WNOWAIT)) {
1211 int exit_code = p->exit_code; 1210 int exit_code = p->exit_code;
1212 int why; 1211 int why;
1213 1212
1214 get_task_struct(p); 1213 get_task_struct(p);
1215 read_unlock(&tasklist_lock); 1214 read_unlock(&tasklist_lock);
1216 if ((exit_code & 0x7f) == 0) { 1215 if ((exit_code & 0x7f) == 0) {
1217 why = CLD_EXITED; 1216 why = CLD_EXITED;
1218 status = exit_code >> 8; 1217 status = exit_code >> 8;
1219 } else { 1218 } else {
1220 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1219 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1221 status = exit_code & 0x7f; 1220 status = exit_code & 0x7f;
1222 } 1221 }
1223 return wait_noreap_copyout(wo, p, pid, uid, why, status); 1222 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1224 } 1223 }
1225 1224
1226 /* 1225 /*
1227 * Try to move the task's state to DEAD 1226 * Try to move the task's state to DEAD
1228 * only one thread is allowed to do this: 1227 * only one thread is allowed to do this:
1229 */ 1228 */
1230 state = xchg(&p->exit_state, EXIT_DEAD); 1229 state = xchg(&p->exit_state, EXIT_DEAD);
1231 if (state != EXIT_ZOMBIE) { 1230 if (state != EXIT_ZOMBIE) {
1232 BUG_ON(state != EXIT_DEAD); 1231 BUG_ON(state != EXIT_DEAD);
1233 return 0; 1232 return 0;
1234 } 1233 }
1235 1234
1236 traced = ptrace_reparented(p); 1235 traced = ptrace_reparented(p);
1237 /* 1236 /*
1238 * It can be ptraced but not reparented, check 1237 * It can be ptraced but not reparented, check
1239 * !task_detached() to filter out sub-threads. 1238 * !task_detached() to filter out sub-threads.
1240 */ 1239 */
1241 if (likely(!traced) && likely(!task_detached(p))) { 1240 if (likely(!traced) && likely(!task_detached(p))) {
1242 struct signal_struct *psig; 1241 struct signal_struct *psig;
1243 struct signal_struct *sig; 1242 struct signal_struct *sig;
1244 unsigned long maxrss; 1243 unsigned long maxrss;
1245 cputime_t tgutime, tgstime; 1244 cputime_t tgutime, tgstime;
1246 1245
1247 /* 1246 /*
1248 * The resource counters for the group leader are in its 1247 * The resource counters for the group leader are in its
1249 * own task_struct. Those for dead threads in the group 1248 * own task_struct. Those for dead threads in the group
1250 * are in its signal_struct, as are those for the child 1249 * are in its signal_struct, as are those for the child
1251 * processes it has previously reaped. All these 1250 * processes it has previously reaped. All these
1252 * accumulate in the parent's signal_struct c* fields. 1251 * accumulate in the parent's signal_struct c* fields.
1253 * 1252 *
1254 * We don't bother to take a lock here to protect these 1253 * We don't bother to take a lock here to protect these
1255 * p->signal fields, because they are only touched by 1254 * p->signal fields, because they are only touched by
1256 * __exit_signal, which runs with tasklist_lock 1255 * __exit_signal, which runs with tasklist_lock
1257 * write-locked anyway, and so is excluded here. We do 1256 * write-locked anyway, and so is excluded here. We do
1258 * need to protect the access to parent->signal fields, 1257 * need to protect the access to parent->signal fields,
1259 * as other threads in the parent group can be right 1258 * as other threads in the parent group can be right
1260 * here reaping other children at the same time. 1259 * here reaping other children at the same time.
1261 * 1260 *
1262 * We use thread_group_times() to get times for the thread 1261 * We use thread_group_times() to get times for the thread
1263 * group, which consolidates times for all threads in the 1262 * group, which consolidates times for all threads in the
1264 * group including the group leader. 1263 * group including the group leader.
1265 */ 1264 */
1266 thread_group_times(p, &tgutime, &tgstime); 1265 thread_group_times(p, &tgutime, &tgstime);
1267 spin_lock_irq(&p->real_parent->sighand->siglock); 1266 spin_lock_irq(&p->real_parent->sighand->siglock);
1268 psig = p->real_parent->signal; 1267 psig = p->real_parent->signal;
1269 sig = p->signal; 1268 sig = p->signal;
1270 psig->cutime = 1269 psig->cutime =
1271 cputime_add(psig->cutime, 1270 cputime_add(psig->cutime,
1272 cputime_add(tgutime, 1271 cputime_add(tgutime,
1273 sig->cutime)); 1272 sig->cutime));
1274 psig->cstime = 1273 psig->cstime =
1275 cputime_add(psig->cstime, 1274 cputime_add(psig->cstime,
1276 cputime_add(tgstime, 1275 cputime_add(tgstime,
1277 sig->cstime)); 1276 sig->cstime));
1278 psig->cgtime = 1277 psig->cgtime =
1279 cputime_add(psig->cgtime, 1278 cputime_add(psig->cgtime,
1280 cputime_add(p->gtime, 1279 cputime_add(p->gtime,
1281 cputime_add(sig->gtime, 1280 cputime_add(sig->gtime,
1282 sig->cgtime))); 1281 sig->cgtime)));
1283 psig->cmin_flt += 1282 psig->cmin_flt +=
1284 p->min_flt + sig->min_flt + sig->cmin_flt; 1283 p->min_flt + sig->min_flt + sig->cmin_flt;
1285 psig->cmaj_flt += 1284 psig->cmaj_flt +=
1286 p->maj_flt + sig->maj_flt + sig->cmaj_flt; 1285 p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1287 psig->cnvcsw += 1286 psig->cnvcsw +=
1288 p->nvcsw + sig->nvcsw + sig->cnvcsw; 1287 p->nvcsw + sig->nvcsw + sig->cnvcsw;
1289 psig->cnivcsw += 1288 psig->cnivcsw +=
1290 p->nivcsw + sig->nivcsw + sig->cnivcsw; 1289 p->nivcsw + sig->nivcsw + sig->cnivcsw;
1291 psig->cinblock += 1290 psig->cinblock +=
1292 task_io_get_inblock(p) + 1291 task_io_get_inblock(p) +
1293 sig->inblock + sig->cinblock; 1292 sig->inblock + sig->cinblock;
1294 psig->coublock += 1293 psig->coublock +=
1295 task_io_get_oublock(p) + 1294 task_io_get_oublock(p) +
1296 sig->oublock + sig->coublock; 1295 sig->oublock + sig->coublock;
1297 maxrss = max(sig->maxrss, sig->cmaxrss); 1296 maxrss = max(sig->maxrss, sig->cmaxrss);
1298 if (psig->cmaxrss < maxrss) 1297 if (psig->cmaxrss < maxrss)
1299 psig->cmaxrss = maxrss; 1298 psig->cmaxrss = maxrss;
1300 task_io_accounting_add(&psig->ioac, &p->ioac); 1299 task_io_accounting_add(&psig->ioac, &p->ioac);
1301 task_io_accounting_add(&psig->ioac, &sig->ioac); 1300 task_io_accounting_add(&psig->ioac, &sig->ioac);
1302 spin_unlock_irq(&p->real_parent->sighand->siglock); 1301 spin_unlock_irq(&p->real_parent->sighand->siglock);
1303 } 1302 }
1304 1303
1305 /* 1304 /*
1306 * Now we are sure this task is interesting, and no other 1305 * Now we are sure this task is interesting, and no other
1307 * thread can reap it because we set its state to EXIT_DEAD. 1306 * thread can reap it because we set its state to EXIT_DEAD.
1308 */ 1307 */
1309 read_unlock(&tasklist_lock); 1308 read_unlock(&tasklist_lock);
1310 1309
1311 retval = wo->wo_rusage 1310 retval = wo->wo_rusage
1312 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1311 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1313 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1312 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1314 ? p->signal->group_exit_code : p->exit_code; 1313 ? p->signal->group_exit_code : p->exit_code;
1315 if (!retval && wo->wo_stat) 1314 if (!retval && wo->wo_stat)
1316 retval = put_user(status, wo->wo_stat); 1315 retval = put_user(status, wo->wo_stat);
1317 1316
1318 infop = wo->wo_info; 1317 infop = wo->wo_info;
1319 if (!retval && infop) 1318 if (!retval && infop)
1320 retval = put_user(SIGCHLD, &infop->si_signo); 1319 retval = put_user(SIGCHLD, &infop->si_signo);
1321 if (!retval && infop) 1320 if (!retval && infop)
1322 retval = put_user(0, &infop->si_errno); 1321 retval = put_user(0, &infop->si_errno);
1323 if (!retval && infop) { 1322 if (!retval && infop) {
1324 int why; 1323 int why;
1325 1324
1326 if ((status & 0x7f) == 0) { 1325 if ((status & 0x7f) == 0) {
1327 why = CLD_EXITED; 1326 why = CLD_EXITED;
1328 status >>= 8; 1327 status >>= 8;
1329 } else { 1328 } else {
1330 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; 1329 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1331 status &= 0x7f; 1330 status &= 0x7f;
1332 } 1331 }
1333 retval = put_user((short)why, &infop->si_code); 1332 retval = put_user((short)why, &infop->si_code);
1334 if (!retval) 1333 if (!retval)
1335 retval = put_user(status, &infop->si_status); 1334 retval = put_user(status, &infop->si_status);
1336 } 1335 }
1337 if (!retval && infop) 1336 if (!retval && infop)
1338 retval = put_user(pid, &infop->si_pid); 1337 retval = put_user(pid, &infop->si_pid);
1339 if (!retval && infop) 1338 if (!retval && infop)
1340 retval = put_user(uid, &infop->si_uid); 1339 retval = put_user(uid, &infop->si_uid);
1341 if (!retval) 1340 if (!retval)
1342 retval = pid; 1341 retval = pid;
1343 1342
1344 if (traced) { 1343 if (traced) {
1345 write_lock_irq(&tasklist_lock); 1344 write_lock_irq(&tasklist_lock);
1346 /* We dropped tasklist, ptracer could die and untrace */ 1345 /* We dropped tasklist, ptracer could die and untrace */
1347 ptrace_unlink(p); 1346 ptrace_unlink(p);
1348 /* 1347 /*
1349 * If this is not a detached task, notify the parent. 1348 * If this is not a detached task, notify the parent.
1350 * If it's still not detached after that, don't release 1349 * If it's still not detached after that, don't release
1351 * it now. 1350 * it now.
1352 */ 1351 */
1353 if (!task_detached(p)) { 1352 if (!task_detached(p)) {
1354 do_notify_parent(p, p->exit_signal); 1353 do_notify_parent(p, p->exit_signal);
1355 if (!task_detached(p)) { 1354 if (!task_detached(p)) {
1356 p->exit_state = EXIT_ZOMBIE; 1355 p->exit_state = EXIT_ZOMBIE;
1357 p = NULL; 1356 p = NULL;
1358 } 1357 }
1359 } 1358 }
1360 write_unlock_irq(&tasklist_lock); 1359 write_unlock_irq(&tasklist_lock);
1361 } 1360 }
1362 if (p != NULL) 1361 if (p != NULL)
1363 release_task(p); 1362 release_task(p);
1364 1363
1365 return retval; 1364 return retval;
1366 } 1365 }
1367 1366
1368 static int *task_stopped_code(struct task_struct *p, bool ptrace) 1367 static int *task_stopped_code(struct task_struct *p, bool ptrace)
1369 { 1368 {
1370 if (ptrace) { 1369 if (ptrace) {
1371 if (task_is_stopped_or_traced(p)) 1370 if (task_is_stopped_or_traced(p))
1372 return &p->exit_code; 1371 return &p->exit_code;
1373 } else { 1372 } else {
1374 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1373 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1375 return &p->signal->group_exit_code; 1374 return &p->signal->group_exit_code;
1376 } 1375 }
1377 return NULL; 1376 return NULL;
1378 } 1377 }
1379 1378
1380 /** 1379 /**
1381 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED 1380 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1382 * @wo: wait options 1381 * @wo: wait options
1383 * @ptrace: is the wait for ptrace 1382 * @ptrace: is the wait for ptrace
1384 * @p: task to wait for 1383 * @p: task to wait for
1385 * 1384 *
1386 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. 1385 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1387 * 1386 *
1388 * CONTEXT: 1387 * CONTEXT:
1389 * read_lock(&tasklist_lock), which is released if return value is 1388 * read_lock(&tasklist_lock), which is released if return value is
1390 * non-zero. Also, grabs and releases @p->sighand->siglock. 1389 * non-zero. Also, grabs and releases @p->sighand->siglock.
1391 * 1390 *
1392 * RETURNS: 1391 * RETURNS:
1393 * 0 if wait condition didn't exist and search for other wait conditions 1392 * 0 if wait condition didn't exist and search for other wait conditions
1394 * should continue. Non-zero return, -errno on failure and @p's pid on 1393 * should continue. Non-zero return, -errno on failure and @p's pid on
1395 * success, implies that tasklist_lock is released and wait condition 1394 * success, implies that tasklist_lock is released and wait condition
1396 * search should terminate. 1395 * search should terminate.
1397 */ 1396 */
1398 static int wait_task_stopped(struct wait_opts *wo, 1397 static int wait_task_stopped(struct wait_opts *wo,
1399 int ptrace, struct task_struct *p) 1398 int ptrace, struct task_struct *p)
1400 { 1399 {
1401 struct siginfo __user *infop; 1400 struct siginfo __user *infop;
1402 int retval, exit_code, *p_code, why; 1401 int retval, exit_code, *p_code, why;
1403 uid_t uid = 0; /* unneeded, required by compiler */ 1402 uid_t uid = 0; /* unneeded, required by compiler */
1404 pid_t pid; 1403 pid_t pid;
1405 1404
1406 /* 1405 /*
1407 * Traditionally we see ptrace'd stopped tasks regardless of options. 1406 * Traditionally we see ptrace'd stopped tasks regardless of options.
1408 */ 1407 */
1409 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1408 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1410 return 0; 1409 return 0;
1411 1410
1412 if (!task_stopped_code(p, ptrace)) 1411 if (!task_stopped_code(p, ptrace))
1413 return 0; 1412 return 0;
1414 1413
1415 exit_code = 0; 1414 exit_code = 0;
1416 spin_lock_irq(&p->sighand->siglock); 1415 spin_lock_irq(&p->sighand->siglock);
1417 1416
1418 p_code = task_stopped_code(p, ptrace); 1417 p_code = task_stopped_code(p, ptrace);
1419 if (unlikely(!p_code)) 1418 if (unlikely(!p_code))
1420 goto unlock_sig; 1419 goto unlock_sig;
1421 1420
1422 exit_code = *p_code; 1421 exit_code = *p_code;
1423 if (!exit_code) 1422 if (!exit_code)
1424 goto unlock_sig; 1423 goto unlock_sig;
1425 1424
1426 if (!unlikely(wo->wo_flags & WNOWAIT)) 1425 if (!unlikely(wo->wo_flags & WNOWAIT))
1427 *p_code = 0; 1426 *p_code = 0;
1428 1427
1429 uid = task_uid(p); 1428 uid = task_uid(p);
1430 unlock_sig: 1429 unlock_sig:
1431 spin_unlock_irq(&p->sighand->siglock); 1430 spin_unlock_irq(&p->sighand->siglock);
1432 if (!exit_code) 1431 if (!exit_code)
1433 return 0; 1432 return 0;
1434 1433
1435 /* 1434 /*
1436 * Now we are pretty sure this task is interesting. 1435 * Now we are pretty sure this task is interesting.
1437 * Make sure it doesn't get reaped out from under us while we 1436 * Make sure it doesn't get reaped out from under us while we
1438 * give up the lock and then examine it below. We don't want to 1437 * give up the lock and then examine it below. We don't want to
1439 * keep holding onto the tasklist_lock while we call getrusage and 1438 * keep holding onto the tasklist_lock while we call getrusage and
1440 * possibly take page faults for user memory. 1439 * possibly take page faults for user memory.
1441 */ 1440 */
1442 get_task_struct(p); 1441 get_task_struct(p);
1443 pid = task_pid_vnr(p); 1442 pid = task_pid_vnr(p);
1444 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1443 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1445 read_unlock(&tasklist_lock); 1444 read_unlock(&tasklist_lock);
1446 1445
1447 if (unlikely(wo->wo_flags & WNOWAIT)) 1446 if (unlikely(wo->wo_flags & WNOWAIT))
1448 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1447 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1449 1448
1450 retval = wo->wo_rusage 1449 retval = wo->wo_rusage
1451 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1450 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1452 if (!retval && wo->wo_stat) 1451 if (!retval && wo->wo_stat)
1453 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); 1452 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1454 1453
1455 infop = wo->wo_info; 1454 infop = wo->wo_info;
1456 if (!retval && infop) 1455 if (!retval && infop)
1457 retval = put_user(SIGCHLD, &infop->si_signo); 1456 retval = put_user(SIGCHLD, &infop->si_signo);
1458 if (!retval && infop) 1457 if (!retval && infop)
1459 retval = put_user(0, &infop->si_errno); 1458 retval = put_user(0, &infop->si_errno);
1460 if (!retval && infop) 1459 if (!retval && infop)
1461 retval = put_user((short)why, &infop->si_code); 1460 retval = put_user((short)why, &infop->si_code);
1462 if (!retval && infop) 1461 if (!retval && infop)
1463 retval = put_user(exit_code, &infop->si_status); 1462 retval = put_user(exit_code, &infop->si_status);
1464 if (!retval && infop) 1463 if (!retval && infop)
1465 retval = put_user(pid, &infop->si_pid); 1464 retval = put_user(pid, &infop->si_pid);
1466 if (!retval && infop) 1465 if (!retval && infop)
1467 retval = put_user(uid, &infop->si_uid); 1466 retval = put_user(uid, &infop->si_uid);
1468 if (!retval) 1467 if (!retval)
1469 retval = pid; 1468 retval = pid;
1470 put_task_struct(p); 1469 put_task_struct(p);
1471 1470
1472 BUG_ON(!retval); 1471 BUG_ON(!retval);
1473 return retval; 1472 return retval;
1474 } 1473 }
1475 1474
1476 /* 1475 /*
1477 * Handle do_wait work for one task in a live, non-stopped state. 1476 * Handle do_wait work for one task in a live, non-stopped state.
1478 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1477 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1479 * the lock and this task is uninteresting. If we return nonzero, we have 1478 * the lock and this task is uninteresting. If we return nonzero, we have
1480 * released the lock and the system call should return. 1479 * released the lock and the system call should return.
1481 */ 1480 */
1482 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) 1481 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1483 { 1482 {
1484 int retval; 1483 int retval;
1485 pid_t pid; 1484 pid_t pid;
1486 uid_t uid; 1485 uid_t uid;
1487 1486
1488 if (!unlikely(wo->wo_flags & WCONTINUED)) 1487 if (!unlikely(wo->wo_flags & WCONTINUED))
1489 return 0; 1488 return 0;
1490 1489
1491 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1490 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1492 return 0; 1491 return 0;
1493 1492
1494 spin_lock_irq(&p->sighand->siglock); 1493 spin_lock_irq(&p->sighand->siglock);
1495 /* Re-check with the lock held. */ 1494 /* Re-check with the lock held. */
1496 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { 1495 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1497 spin_unlock_irq(&p->sighand->siglock); 1496 spin_unlock_irq(&p->sighand->siglock);
1498 return 0; 1497 return 0;
1499 } 1498 }
1500 if (!unlikely(wo->wo_flags & WNOWAIT)) 1499 if (!unlikely(wo->wo_flags & WNOWAIT))
1501 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1500 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1502 uid = task_uid(p); 1501 uid = task_uid(p);
1503 spin_unlock_irq(&p->sighand->siglock); 1502 spin_unlock_irq(&p->sighand->siglock);
1504 1503
1505 pid = task_pid_vnr(p); 1504 pid = task_pid_vnr(p);
1506 get_task_struct(p); 1505 get_task_struct(p);
1507 read_unlock(&tasklist_lock); 1506 read_unlock(&tasklist_lock);
1508 1507
1509 if (!wo->wo_info) { 1508 if (!wo->wo_info) {
1510 retval = wo->wo_rusage 1509 retval = wo->wo_rusage
1511 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1510 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1512 put_task_struct(p); 1511 put_task_struct(p);
1513 if (!retval && wo->wo_stat) 1512 if (!retval && wo->wo_stat)
1514 retval = put_user(0xffff, wo->wo_stat); 1513 retval = put_user(0xffff, wo->wo_stat);
1515 if (!retval) 1514 if (!retval)
1516 retval = pid; 1515 retval = pid;
1517 } else { 1516 } else {
1518 retval = wait_noreap_copyout(wo, p, pid, uid, 1517 retval = wait_noreap_copyout(wo, p, pid, uid,
1519 CLD_CONTINUED, SIGCONT); 1518 CLD_CONTINUED, SIGCONT);
1520 BUG_ON(retval == 0); 1519 BUG_ON(retval == 0);
1521 } 1520 }
1522 1521
1523 return retval; 1522 return retval;
1524 } 1523 }
1525 1524
1526 /* 1525 /*
1527 * Consider @p for a wait by @parent. 1526 * Consider @p for a wait by @parent.
1528 * 1527 *
1529 * -ECHILD should be in ->notask_error before the first call. 1528 * -ECHILD should be in ->notask_error before the first call.
1530 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1529 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1531 * Returns zero if the search for a child should continue; 1530 * Returns zero if the search for a child should continue;
1532 * then ->notask_error is 0 if @p is an eligible child, 1531 * then ->notask_error is 0 if @p is an eligible child,
1533 * or another error from security_task_wait(), or still -ECHILD. 1532 * or another error from security_task_wait(), or still -ECHILD.
1534 */ 1533 */
1535 static int wait_consider_task(struct wait_opts *wo, int ptrace, 1534 static int wait_consider_task(struct wait_opts *wo, int ptrace,
1536 struct task_struct *p) 1535 struct task_struct *p)
1537 { 1536 {
1538 int ret = eligible_child(wo, p); 1537 int ret = eligible_child(wo, p);
1539 if (!ret) 1538 if (!ret)
1540 return ret; 1539 return ret;
1541 1540
1542 ret = security_task_wait(p); 1541 ret = security_task_wait(p);
1543 if (unlikely(ret < 0)) { 1542 if (unlikely(ret < 0)) {
1544 /* 1543 /*
1545 * If we have not yet seen any eligible child, 1544 * If we have not yet seen any eligible child,
1546 * then let this error code replace -ECHILD. 1545 * then let this error code replace -ECHILD.
1547 * A permission error will give the user a clue 1546 * A permission error will give the user a clue
1548 * to look for security policy problems, rather 1547 * to look for security policy problems, rather
1549 * than for mysterious wait bugs. 1548 * than for mysterious wait bugs.
1550 */ 1549 */
1551 if (wo->notask_error) 1550 if (wo->notask_error)
1552 wo->notask_error = ret; 1551 wo->notask_error = ret;
1553 return 0; 1552 return 0;
1554 } 1553 }
1555 1554
1556 /* dead body doesn't have much to contribute */ 1555 /* dead body doesn't have much to contribute */
1557 if (p->exit_state == EXIT_DEAD) 1556 if (p->exit_state == EXIT_DEAD)
1558 return 0; 1557 return 0;
1559 1558
1560 /* slay zombie? */ 1559 /* slay zombie? */
1561 if (p->exit_state == EXIT_ZOMBIE) { 1560 if (p->exit_state == EXIT_ZOMBIE) {
1562 /* 1561 /*
1563 * A zombie ptracee is only visible to its ptracer. 1562 * A zombie ptracee is only visible to its ptracer.
1564 * Notification and reaping will be cascaded to the real 1563 * Notification and reaping will be cascaded to the real
1565 * parent when the ptracer detaches. 1564 * parent when the ptracer detaches.
1566 */ 1565 */
1567 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1566 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1568 /* it will become visible, clear notask_error */ 1567 /* it will become visible, clear notask_error */
1569 wo->notask_error = 0; 1568 wo->notask_error = 0;
1570 return 0; 1569 return 0;
1571 } 1570 }
1572 1571
1573 /* we don't reap group leaders with subthreads */ 1572 /* we don't reap group leaders with subthreads */
1574 if (!delay_group_leader(p)) 1573 if (!delay_group_leader(p))
1575 return wait_task_zombie(wo, p); 1574 return wait_task_zombie(wo, p);
1576 1575
1577 /* 1576 /*
1578 * Allow access to stopped/continued state via zombie by 1577 * Allow access to stopped/continued state via zombie by
1579 * falling through. Clearing of notask_error is complex. 1578 * falling through. Clearing of notask_error is complex.
1580 * 1579 *
1581 * When !@ptrace: 1580 * When !@ptrace:
1582 * 1581 *
1583 * If WEXITED is set, notask_error should naturally be 1582 * If WEXITED is set, notask_error should naturally be
1584 * cleared. If not, subset of WSTOPPED|WCONTINUED is set, 1583 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1585 * so, if there are live subthreads, there are events to 1584 * so, if there are live subthreads, there are events to
1586 * wait for. If all subthreads are dead, it's still safe 1585 * wait for. If all subthreads are dead, it's still safe
1587 * to clear - this function will be called again in a finite 1586 * to clear - this function will be called again in a finite
1588 * amount of time once all the subthreads are released and 1587 * amount of time once all the subthreads are released and
1589 * will then return without clearing. 1588 * will then return without clearing.
1590 * 1589 *
1591 * When @ptrace: 1590 * When @ptrace:
1592 * 1591 *
1593 * Stopped state is per-task and thus can't change once the 1592 * Stopped state is per-task and thus can't change once the
1594 * target task dies. Only continued and exited can happen. 1593 * target task dies. Only continued and exited can happen.
1595 * Clear notask_error if WCONTINUED | WEXITED. 1594 * Clear notask_error if WCONTINUED | WEXITED.
1596 */ 1595 */
1597 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) 1596 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1598 wo->notask_error = 0; 1597 wo->notask_error = 0;
1599 } else { 1598 } else {
1600 /* 1599 /*
1601 * If @p is ptraced by a task in its real parent's group, 1600 * If @p is ptraced by a task in its real parent's group,
1602 * hide group stop/continued state when looking at @p as 1601 * hide group stop/continued state when looking at @p as
1603 * the real parent; otherwise, a single stop can be 1602 * the real parent; otherwise, a single stop can be
1604 * reported twice as group and ptrace stops. 1603 * reported twice as group and ptrace stops.
1605 * 1604 *
1606 * If a ptracer wants to distinguish the two events for its 1605 * If a ptracer wants to distinguish the two events for its
1607 * own children, it should create a separate process which 1606 * own children, it should create a separate process which
1608 * takes the role of real parent. 1607 * takes the role of real parent.
1609 */ 1608 */
1610 if (likely(!ptrace) && task_ptrace(p) && 1609 if (likely(!ptrace) && task_ptrace(p) &&
1611 same_thread_group(p->parent, p->real_parent)) 1610 same_thread_group(p->parent, p->real_parent))
1612 return 0; 1611 return 0;
1613 1612
1614 /* 1613 /*
1615 * @p is alive and it's gonna stop, continue or exit, so 1614 * @p is alive and it's gonna stop, continue or exit, so
1616 * there always is something to wait for. 1615 * there always is something to wait for.
1617 */ 1616 */
1618 wo->notask_error = 0; 1617 wo->notask_error = 0;
1619 } 1618 }
1620 1619
1621 /* 1620 /*
1622 * Wait for stopped. Depending on @ptrace, different stopped state 1621 * Wait for stopped. Depending on @ptrace, different stopped state
1623 * is used and the two don't interact with each other. 1622 * is used and the two don't interact with each other.
1624 */ 1623 */
1625 ret = wait_task_stopped(wo, ptrace, p); 1624 ret = wait_task_stopped(wo, ptrace, p);
1626 if (ret) 1625 if (ret)
1627 return ret; 1626 return ret;
1628 1627
1629 /* 1628 /*
1630 * Wait for continued. There's only one continued state and the 1629 * Wait for continued. There's only one continued state and the
1631 * ptracer can consume it which can confuse the real parent. Don't 1630 * ptracer can consume it which can confuse the real parent. Don't
1632 * use WCONTINUED from ptracer. You don't need or want it. 1631 * use WCONTINUED from ptracer. You don't need or want it.
1633 */ 1632 */
1634 return wait_task_continued(wo, p); 1633 return wait_task_continued(wo, p);
1635 } 1634 }
1636 1635
1637 /* 1636 /*
1638 * Do the work of do_wait() for one thread in the group, @tsk. 1637 * Do the work of do_wait() for one thread in the group, @tsk.
1639 * 1638 *
1640 * -ECHILD should be in ->notask_error before the first call. 1639 * -ECHILD should be in ->notask_error before the first call.
1641 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1640 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1642 * Returns zero if the search for a child should continue; then 1641 * Returns zero if the search for a child should continue; then
1643 * ->notask_error is 0 if there were any eligible children, 1642 * ->notask_error is 0 if there were any eligible children,
1644 * or another error from security_task_wait(), or still -ECHILD. 1643 * or another error from security_task_wait(), or still -ECHILD.
1645 */ 1644 */
1646 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) 1645 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1647 { 1646 {
1648 struct task_struct *p; 1647 struct task_struct *p;
1649 1648
1650 list_for_each_entry(p, &tsk->children, sibling) { 1649 list_for_each_entry(p, &tsk->children, sibling) {
1651 int ret = wait_consider_task(wo, 0, p); 1650 int ret = wait_consider_task(wo, 0, p);
1652 if (ret) 1651 if (ret)
1653 return ret; 1652 return ret;
1654 } 1653 }
1655 1654
1656 return 0; 1655 return 0;
1657 } 1656 }
1658 1657
1659 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) 1658 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1660 { 1659 {
1661 struct task_struct *p; 1660 struct task_struct *p;
1662 1661
1663 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1662 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1664 int ret = wait_consider_task(wo, 1, p); 1663 int ret = wait_consider_task(wo, 1, p);
1665 if (ret) 1664 if (ret)
1666 return ret; 1665 return ret;
1667 } 1666 }
1668 1667
1669 return 0; 1668 return 0;
1670 } 1669 }
1671 1670
1672 static int child_wait_callback(wait_queue_t *wait, unsigned mode, 1671 static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1673 int sync, void *key) 1672 int sync, void *key)
1674 { 1673 {
1675 struct wait_opts *wo = container_of(wait, struct wait_opts, 1674 struct wait_opts *wo = container_of(wait, struct wait_opts,
1676 child_wait); 1675 child_wait);
1677 struct task_struct *p = key; 1676 struct task_struct *p = key;
1678 1677
1679 if (!eligible_pid(wo, p)) 1678 if (!eligible_pid(wo, p))
1680 return 0; 1679 return 0;
1681 1680
1682 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) 1681 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1683 return 0; 1682 return 0;
1684 1683
1685 return default_wake_function(wait, mode, sync, key); 1684 return default_wake_function(wait, mode, sync, key);
1686 } 1685 }
1687 1686
1688 void __wake_up_parent(struct task_struct *p, struct task_struct *parent) 1687 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1689 { 1688 {
1690 __wake_up_sync_key(&parent->signal->wait_chldexit, 1689 __wake_up_sync_key(&parent->signal->wait_chldexit,
1691 TASK_INTERRUPTIBLE, 1, p); 1690 TASK_INTERRUPTIBLE, 1, p);
1692 } 1691 }
1693 1692
1694 static long do_wait(struct wait_opts *wo) 1693 static long do_wait(struct wait_opts *wo)
1695 { 1694 {
1696 struct task_struct *tsk; 1695 struct task_struct *tsk;
1697 int retval; 1696 int retval;
1698 1697
1699 trace_sched_process_wait(wo->wo_pid); 1698 trace_sched_process_wait(wo->wo_pid);
1700 1699
1701 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); 1700 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1702 wo->child_wait.private = current; 1701 wo->child_wait.private = current;
1703 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1702 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1704 repeat: 1703 repeat:
1705 /* 1704 /*
1706 * If there is nothing that can match our criteria just get out. 1705 * If there is nothing that can match our criteria just get out.
1707 * We will clear ->notask_error to zero if we see any child that 1706 * We will clear ->notask_error to zero if we see any child that
1708 * might later match our criteria, even if we are not able to reap 1707 * might later match our criteria, even if we are not able to reap
1709 * it yet. 1708 * it yet.
1710 */ 1709 */
1711 wo->notask_error = -ECHILD; 1710 wo->notask_error = -ECHILD;
1712 if ((wo->wo_type < PIDTYPE_MAX) && 1711 if ((wo->wo_type < PIDTYPE_MAX) &&
1713 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) 1712 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1714 goto notask; 1713 goto notask;
1715 1714
1716 set_current_state(TASK_INTERRUPTIBLE); 1715 set_current_state(TASK_INTERRUPTIBLE);
1717 read_lock(&tasklist_lock); 1716 read_lock(&tasklist_lock);
1718 tsk = current; 1717 tsk = current;
1719 do { 1718 do {
1720 retval = do_wait_thread(wo, tsk); 1719 retval = do_wait_thread(wo, tsk);
1721 if (retval) 1720 if (retval)
1722 goto end; 1721 goto end;
1723 1722
1724 retval = ptrace_do_wait(wo, tsk); 1723 retval = ptrace_do_wait(wo, tsk);
1725 if (retval) 1724 if (retval)
1726 goto end; 1725 goto end;
1727 1726
1728 if (wo->wo_flags & __WNOTHREAD) 1727 if (wo->wo_flags & __WNOTHREAD)
1729 break; 1728 break;
1730 } while_each_thread(current, tsk); 1729 } while_each_thread(current, tsk);
1731 read_unlock(&tasklist_lock); 1730 read_unlock(&tasklist_lock);
1732 1731
1733 notask: 1732 notask:
1734 retval = wo->notask_error; 1733 retval = wo->notask_error;
1735 if (!retval && !(wo->wo_flags & WNOHANG)) { 1734 if (!retval && !(wo->wo_flags & WNOHANG)) {
1736 retval = -ERESTARTSYS; 1735 retval = -ERESTARTSYS;
1737 if (!signal_pending(current)) { 1736 if (!signal_pending(current)) {
1738 schedule(); 1737 schedule();
1739 goto repeat; 1738 goto repeat;
1740 } 1739 }
1741 } 1740 }
1742 end: 1741 end:
1743 __set_current_state(TASK_RUNNING); 1742 __set_current_state(TASK_RUNNING);
1744 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1743 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1745 return retval; 1744 return retval;
1746 } 1745 }
1747 1746
1748 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1747 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1749 infop, int, options, struct rusage __user *, ru) 1748 infop, int, options, struct rusage __user *, ru)
1750 { 1749 {
1751 struct wait_opts wo; 1750 struct wait_opts wo;
1752 struct pid *pid = NULL; 1751 struct pid *pid = NULL;
1753 enum pid_type type; 1752 enum pid_type type;
1754 long ret; 1753 long ret;
1755 1754
1756 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) 1755 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1757 return -EINVAL; 1756 return -EINVAL;
1758 if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) 1757 if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1759 return -EINVAL; 1758 return -EINVAL;
1760 1759
1761 switch (which) { 1760 switch (which) {
1762 case P_ALL: 1761 case P_ALL:
1763 type = PIDTYPE_MAX; 1762 type = PIDTYPE_MAX;
1764 break; 1763 break;
1765 case P_PID: 1764 case P_PID:
1766 type = PIDTYPE_PID; 1765 type = PIDTYPE_PID;
1767 if (upid <= 0) 1766 if (upid <= 0)
1768 return -EINVAL; 1767 return -EINVAL;
1769 break; 1768 break;
1770 case P_PGID: 1769 case P_PGID:
1771 type = PIDTYPE_PGID; 1770 type = PIDTYPE_PGID;
1772 if (upid <= 0) 1771 if (upid <= 0)
1773 return -EINVAL; 1772 return -EINVAL;
1774 break; 1773 break;
1775 default: 1774 default:
1776 return -EINVAL; 1775 return -EINVAL;
1777 } 1776 }
1778 1777
1779 if (type < PIDTYPE_MAX) 1778 if (type < PIDTYPE_MAX)
1780 pid = find_get_pid(upid); 1779 pid = find_get_pid(upid);
1781 1780
1782 wo.wo_type = type; 1781 wo.wo_type = type;
1783 wo.wo_pid = pid; 1782 wo.wo_pid = pid;
1784 wo.wo_flags = options; 1783 wo.wo_flags = options;
1785 wo.wo_info = infop; 1784 wo.wo_info = infop;
1786 wo.wo_stat = NULL; 1785 wo.wo_stat = NULL;
1787 wo.wo_rusage = ru; 1786 wo.wo_rusage = ru;
1788 ret = do_wait(&wo); 1787 ret = do_wait(&wo);
1789 1788
1790 if (ret > 0) { 1789 if (ret > 0) {
1791 ret = 0; 1790 ret = 0;
1792 } else if (infop) { 1791 } else if (infop) {
1793 /* 1792 /*
1794 * For a WNOHANG return, clear out all the fields 1793 * For a WNOHANG return, clear out all the fields
1795 * we would set so the user can easily tell the 1794 * we would set so the user can easily tell the
1796 * difference. 1795 * difference.
1797 */ 1796 */
1798 if (!ret) 1797 if (!ret)
1799 ret = put_user(0, &infop->si_signo); 1798 ret = put_user(0, &infop->si_signo);
1800 if (!ret) 1799 if (!ret)
1801 ret = put_user(0, &infop->si_errno); 1800 ret = put_user(0, &infop->si_errno);
1802 if (!ret) 1801 if (!ret)
1803 ret = put_user(0, &infop->si_code); 1802 ret = put_user(0, &infop->si_code);
1804 if (!ret) 1803 if (!ret)
1805 ret = put_user(0, &infop->si_pid); 1804 ret = put_user(0, &infop->si_pid);
1806 if (!ret) 1805 if (!ret)
1807 ret = put_user(0, &infop->si_uid); 1806 ret = put_user(0, &infop->si_uid);
1808 if (!ret) 1807 if (!ret)
1809 ret = put_user(0, &infop->si_status); 1808 ret = put_user(0, &infop->si_status);
1810 } 1809 }
1811 1810
1812 put_pid(pid); 1811 put_pid(pid);
1813 1812
1814 /* avoid REGPARM breakage on x86: */ 1813 /* avoid REGPARM breakage on x86: */
1815 asmlinkage_protect(5, ret, which, upid, infop, options, ru); 1814 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1816 return ret; 1815 return ret;
1817 } 1816 }
1818 1817
1819 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1818 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1820 int, options, struct rusage __user *, ru) 1819 int, options, struct rusage __user *, ru)
1821 { 1820 {
1822 struct wait_opts wo; 1821 struct wait_opts wo;
1823 struct pid *pid = NULL; 1822 struct pid *pid = NULL;
1824 enum pid_type type; 1823 enum pid_type type;
1825 long ret; 1824 long ret;
1826 1825
1827 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| 1826 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1828 __WNOTHREAD|__WCLONE|__WALL)) 1827 __WNOTHREAD|__WCLONE|__WALL))
1829 return -EINVAL; 1828 return -EINVAL;
1830 1829
1831 if (upid == -1) 1830 if (upid == -1)
1832 type = PIDTYPE_MAX; 1831 type = PIDTYPE_MAX;
1833 else if (upid < 0) { 1832 else if (upid < 0) {
1834 type = PIDTYPE_PGID; 1833 type = PIDTYPE_PGID;
1835 pid = find_get_pid(-upid); 1834 pid = find_get_pid(-upid);
1836 } else if (upid == 0) { 1835 } else if (upid == 0) {
1837 type = PIDTYPE_PGID; 1836 type = PIDTYPE_PGID;
1838 pid = get_task_pid(current, PIDTYPE_PGID); 1837 pid = get_task_pid(current, PIDTYPE_PGID);
1839 } else /* upid > 0 */ { 1838 } else /* upid > 0 */ {
1840 type = PIDTYPE_PID; 1839 type = PIDTYPE_PID;
1841 pid = find_get_pid(upid); 1840 pid = find_get_pid(upid);
1842 } 1841 }
1843 1842
1844 wo.wo_type = type; 1843 wo.wo_type = type;
1845 wo.wo_pid = pid; 1844 wo.wo_pid = pid;
1846 wo.wo_flags = options | WEXITED; 1845 wo.wo_flags = options | WEXITED;
1847 wo.wo_info = NULL; 1846 wo.wo_info = NULL;
1848 wo.wo_stat = stat_addr; 1847 wo.wo_stat = stat_addr;
1849 wo.wo_rusage = ru; 1848 wo.wo_rusage = ru;
1850 ret = do_wait(&wo); 1849 ret = do_wait(&wo);
1851 put_pid(pid); 1850 put_pid(pid);
1852 1851
1853 /* avoid REGPARM breakage on x86: */ 1852 /* avoid REGPARM breakage on x86: */
1854 asmlinkage_protect(4, ret, upid, stat_addr, options, ru); 1853 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1855 return ret; 1854 return ret;
1856 } 1855 }
1857 1856
1858 #ifdef __ARCH_WANT_SYS_WAITPID 1857 #ifdef __ARCH_WANT_SYS_WAITPID
1859 1858
1860 /* 1859 /*
1861 * sys_waitpid() remains for compatibility. waitpid() should be 1860 * sys_waitpid() remains for compatibility. waitpid() should be
1862 * implemented by calling sys_wait4() from libc.a. 1861 * implemented by calling sys_wait4() from libc.a.
1863 */ 1862 */
1864 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) 1863 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1865 { 1864 {
1866 return sys_wait4(pid, stat_addr, options, NULL); 1865 return sys_wait4(pid, stat_addr, options, NULL);
1867 } 1866 }
1868 1867
1869 #endif 1868 #endif
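For reference, sys_waitid() above rejects any option bits outside WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED and insists on at least one of WEXITED, WSTOPPED or WCONTINUED. A minimal userspace sketch of the matching calling convention (illustrative only, not part of this commit; report_child() is a made-up helper):

#define _XOPEN_SOURCE 700
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>

/* Reap one state change of 'child' and report how it ended up. */
int report_child(pid_t child)
{
	siginfo_t info;

	/* Mirrors the option mask checked at the top of sys_waitid(). */
	if (waitid(P_PID, child, &info, WEXITED | WSTOPPED | WCONTINUED) < 0)
		return -1;

	switch (info.si_code) {
	case CLD_EXITED:
		printf("exited, status %d\n", info.si_status);
		break;
	case CLD_KILLED:
	case CLD_DUMPED:
		printf("killed by signal %d\n", info.si_status);
		break;
	case CLD_STOPPED:
	case CLD_TRAPPED:
		printf("stopped by signal %d\n", info.si_status);
		break;
	case CLD_CONTINUED:
		printf("continued\n");
		break;
	}
	return 0;
}

The si_code/si_status pair seen by userspace here is exactly what wait_task_zombie(), wait_task_stopped() and wait_task_continued() above copy out through infop.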
1870 1869
kernel/pid.c
1 /* 1 /*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 William Irwin, IBM 4 * (C) 2002-2003 William Irwin, IBM
5 * (C) 2004 William Irwin, Oracle 5 * (C) 2004 William Irwin, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
9 * against. There is very little to them aside from hashing them and 9 * against. There is very little to them aside from hashing them and
10 * parking tasks using given ID's on a list. 10 * parking tasks using given ID's on a list.
11 * 11 *
12 * The hash is always changed with the tasklist_lock write-acquired, 12 * The hash is always changed with the tasklist_lock write-acquired,
13 * and the hash is only accessed with the tasklist_lock at least 13 * and the hash is only accessed with the tasklist_lock at least
14 * read-acquired, so there's no additional SMP locking needed here. 14 * read-acquired, so there's no additional SMP locking needed here.
15 * 15 *
16 * We have a list of bitmap pages, which bitmaps represent the PID space. 16 * We have a list of bitmap pages, which bitmaps represent the PID space.
17 * Allocating and freeing PIDs is completely lockless. The worst-case 17 * Allocating and freeing PIDs is completely lockless. The worst-case
18 * allocation scenario when all but one out of 1 million PIDs possible are 18 * allocation scenario when all but one out of 1 million PIDs possible are
19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
21 * 21 *
22 * Pid namespaces: 22 * Pid namespaces:
23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. 23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM 24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
25 * Many thanks to Oleg Nesterov for comments and help 25 * Many thanks to Oleg Nesterov for comments and help
26 * 26 *
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/rculist.h> 33 #include <linux/rculist.h>
34 #include <linux/bootmem.h> 34 #include <linux/bootmem.h>
35 #include <linux/hash.h> 35 #include <linux/hash.h>
36 #include <linux/pid_namespace.h> 36 #include <linux/pid_namespace.h>
37 #include <linux/init_task.h> 37 #include <linux/init_task.h>
38 #include <linux/syscalls.h> 38 #include <linux/syscalls.h>
39 39
40 #define pid_hashfn(nr, ns) \ 40 #define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
42 static struct hlist_head *pid_hash; 42 static struct hlist_head *pid_hash;
43 static unsigned int pidhash_shift = 4; 43 static unsigned int pidhash_shift = 4;
44 struct pid init_struct_pid = INIT_STRUCT_PID; 44 struct pid init_struct_pid = INIT_STRUCT_PID;
45 45
46 int pid_max = PID_MAX_DEFAULT; 46 int pid_max = PID_MAX_DEFAULT;
47 47
48 #define RESERVED_PIDS 300 48 #define RESERVED_PIDS 300
49 49
50 int pid_max_min = RESERVED_PIDS + 1; 50 int pid_max_min = RESERVED_PIDS + 1;
51 int pid_max_max = PID_MAX_LIMIT; 51 int pid_max_max = PID_MAX_LIMIT;
52 52
53 #define BITS_PER_PAGE (PAGE_SIZE*8) 53 #define BITS_PER_PAGE (PAGE_SIZE*8)
54 #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 54 #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
55 55
56 static inline int mk_pid(struct pid_namespace *pid_ns, 56 static inline int mk_pid(struct pid_namespace *pid_ns,
57 struct pidmap *map, int off) 57 struct pidmap *map, int off)
58 { 58 {
59 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; 59 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
60 } 60 }
61 61
62 #define find_next_offset(map, off) \ 62 #define find_next_offset(map, off) \
63 find_next_zero_bit((map)->page, BITS_PER_PAGE, off) 63 find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
64 64
65 /* 65 /*
66 * PID-map pages start out as NULL, they get allocated upon 66 * PID-map pages start out as NULL, they get allocated upon
67 * first use and are never deallocated. This way a low pid_max 67 * first use and are never deallocated. This way a low pid_max
68 * value does not cause lots of bitmaps to be allocated, but 68 * value does not cause lots of bitmaps to be allocated, but
69 * the scheme scales to up to 4 million PIDs, runtime. 69 * the scheme scales to up to 4 million PIDs, runtime.
70 */ 70 */
71 struct pid_namespace init_pid_ns = { 71 struct pid_namespace init_pid_ns = {
72 .kref = { 72 .kref = {
73 .refcount = ATOMIC_INIT(2), 73 .refcount = ATOMIC_INIT(2),
74 }, 74 },
75 .pidmap = { 75 .pidmap = {
76 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 76 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
77 }, 77 },
78 .last_pid = 0, 78 .last_pid = 0,
79 .level = 0, 79 .level = 0,
80 .child_reaper = &init_task, 80 .child_reaper = &init_task,
81 }; 81 };
82 EXPORT_SYMBOL_GPL(init_pid_ns); 82 EXPORT_SYMBOL_GPL(init_pid_ns);
83 83
84 int is_container_init(struct task_struct *tsk) 84 int is_container_init(struct task_struct *tsk)
85 { 85 {
86 int ret = 0; 86 int ret = 0;
87 struct pid *pid; 87 struct pid *pid;
88 88
89 rcu_read_lock(); 89 rcu_read_lock();
90 pid = task_pid(tsk); 90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1) 91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1; 92 ret = 1;
93 rcu_read_unlock(); 93 rcu_read_unlock();
94 94
95 return ret; 95 return ret;
96 } 96 }
97 EXPORT_SYMBOL(is_container_init); 97 EXPORT_SYMBOL(is_container_init);
98 98
99 /* 99 /*
100 * Note: disable interrupts while the pidmap_lock is held as an 100 * Note: disable interrupts while the pidmap_lock is held as an
101 * interrupt might come in and do read_lock(&tasklist_lock). 101 * interrupt might come in and do read_lock(&tasklist_lock).
102 * 102 *
103 * If we don't disable interrupts there is a nasty deadlock between 103 * If we don't disable interrupts there is a nasty deadlock between
104 * detach_pid()->free_pid() and another cpu that does 104 * detach_pid()->free_pid() and another cpu that does
105 * spin_lock(&pidmap_lock) followed by an interrupt routine that does 105 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
106 * read_lock(&tasklist_lock); 106 * read_lock(&tasklist_lock);
107 * 107 *
108 * After we clean up the tasklist_lock and know there are no 108 * After we clean up the tasklist_lock and know there are no
109 * irq handlers that take it we can leave the interrupts enabled. 109 * irq handlers that take it we can leave the interrupts enabled.
110 * For now it is easier to be safe than to prove it can't happen. 110 * For now it is easier to be safe than to prove it can't happen.
111 */ 111 */
112 112
113 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 113 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
114 114
115 static void free_pidmap(struct upid *upid) 115 static void free_pidmap(struct upid *upid)
116 { 116 {
117 int nr = upid->nr; 117 int nr = upid->nr;
118 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; 118 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
119 int offset = nr & BITS_PER_PAGE_MASK; 119 int offset = nr & BITS_PER_PAGE_MASK;
120 120
121 clear_bit(offset, map->page); 121 clear_bit(offset, map->page);
122 atomic_inc(&map->nr_free); 122 atomic_inc(&map->nr_free);
123 } 123 }
124 124
125 /* 125 /*
126 * If we started walking pids at 'base', is 'a' seen before 'b'? 126 * If we started walking pids at 'base', is 'a' seen before 'b'?
127 */ 127 */
128 static int pid_before(int base, int a, int b) 128 static int pid_before(int base, int a, int b)
129 { 129 {
130 /* 130 /*
131 * This is the same as saying 131 * This is the same as saying
132 * 132 *
133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT 133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
134 * and that mapping orders 'a' and 'b' with respect to 'base'. 134 * and that mapping orders 'a' and 'b' with respect to 'base'.
135 */ 135 */
136 return (unsigned)(a - base) < (unsigned)(b - base); 136 return (unsigned)(a - base) < (unsigned)(b - base);
137 } 137 }
138 138
139 /* 139 /*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid.
141 * We want the winner to have the "later" value, because if the 141 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 142 * "earlier" value prevails, then a pid may get reused immediately.
143 * 143 *
144 * Since pids rollover, it is not sufficient to just pick the bigger 144 * Since pids rollover, it is not sufficient to just pick the bigger
145 * value. We have to consider where we started counting from. 145 * value. We have to consider where we started counting from.
146 * 146 *
147 * 'base' is the value of pid_ns->last_pid that we observed when 147 * 'base' is the value of pid_ns->last_pid that we observed when
148 * we started looking for a pid. 148 * we started looking for a pid.
149 * 149 *
150 * 'pid' is the pid that we eventually found. 150 * 'pid' is the pid that we eventually found.
151 */ 151 */
152 static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) 152 static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
153 { 153 {
154 int prev; 154 int prev;
155 int last_write = base; 155 int last_write = base;
156 do { 156 do {
157 prev = last_write; 157 prev = last_write;
158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid); 158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
159 } while ((prev != last_write) && (pid_before(base, last_write, pid))); 159 } while ((prev != last_write) && (pid_before(base, last_write, pid)));
160 } 160 }
161 161
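The comment above set_last_pid() is the crux of the cmpxchg loop: "later" is defined relative to the base where the scan started, not by numeric value. A standalone sketch of the same pid_before() comparison (hypothetical pid values, assuming the default pid_max of 32768 and RESERVED_PIDS of 300) shows how the unsigned subtraction handles rollover:

#include <assert.h>

/* Same comparison as pid_before() in the hunk above. */
static int pid_before(int base, int a, int b)
{
	return (unsigned)(a - base) < (unsigned)(b - base);
}

int main(void)
{
	/* No rollover: ordinary ordering relative to base. */
	assert(pid_before(100, 150, 200));

	/*
	 * Rollover: a scan that starts at 31990 reaches 32000 before it
	 * wraps back to RESERVED_PIDS and reaches 301, so 32000 counts
	 * as "before" 301 even though it is numerically larger.  A naive
	 * a < b test would invert this and let the earlier pid win.
	 */
	assert(pid_before(31990, 32000, 301));
	assert(!pid_before(31990, 301, 32000));
	return 0;
}

set_last_pid() retries only while its cmpxchg lost the race and the value already stored is still "before" the pid it allocated, so the later of two racing allocations ends up in last_pid.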
162 static int alloc_pidmap(struct pid_namespace *pid_ns) 162 static int alloc_pidmap(struct pid_namespace *pid_ns)
163 { 163 {
164 int i, offset, max_scan, pid, last = pid_ns->last_pid; 164 int i, offset, max_scan, pid, last = pid_ns->last_pid;
165 struct pidmap *map; 165 struct pidmap *map;
166 166
167 pid = last + 1; 167 pid = last + 1;
168 if (pid >= pid_max) 168 if (pid >= pid_max)
169 pid = RESERVED_PIDS; 169 pid = RESERVED_PIDS;
170 offset = pid & BITS_PER_PAGE_MASK; 170 offset = pid & BITS_PER_PAGE_MASK;
171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
172 /* 172 /*
173 * If last_pid points into the middle of the map->page we 173 * If last_pid points into the middle of the map->page we
174 * want to scan this bitmap block twice, the second time 174 * want to scan this bitmap block twice, the second time
175 * we start with offset == 0 (or RESERVED_PIDS). 175 * we start with offset == 0 (or RESERVED_PIDS).
176 */ 176 */
177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; 177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
178 for (i = 0; i <= max_scan; ++i) { 178 for (i = 0; i <= max_scan; ++i) {
179 if (unlikely(!map->page)) { 179 if (unlikely(!map->page)) {
180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
181 /* 181 /*
182 * Free the page if someone raced with us 182 * Free the page if someone raced with us
183 * installing it: 183 * installing it:
184 */ 184 */
185 spin_lock_irq(&pidmap_lock); 185 spin_lock_irq(&pidmap_lock);
186 if (!map->page) { 186 if (!map->page) {
187 map->page = page; 187 map->page = page;
188 page = NULL; 188 page = NULL;
189 } 189 }
190 spin_unlock_irq(&pidmap_lock); 190 spin_unlock_irq(&pidmap_lock);
191 kfree(page); 191 kfree(page);
192 if (unlikely(!map->page)) 192 if (unlikely(!map->page))
193 break; 193 break;
194 } 194 }
195 if (likely(atomic_read(&map->nr_free))) { 195 if (likely(atomic_read(&map->nr_free))) {
196 do { 196 do {
197 if (!test_and_set_bit(offset, map->page)) { 197 if (!test_and_set_bit(offset, map->page)) {
198 atomic_dec(&map->nr_free); 198 atomic_dec(&map->nr_free);
199 set_last_pid(pid_ns, last, pid); 199 set_last_pid(pid_ns, last, pid);
200 return pid; 200 return pid;
201 } 201 }
202 offset = find_next_offset(map, offset); 202 offset = find_next_offset(map, offset);
203 pid = mk_pid(pid_ns, map, offset); 203 pid = mk_pid(pid_ns, map, offset);
204 } while (offset < BITS_PER_PAGE && pid < pid_max); 204 } while (offset < BITS_PER_PAGE && pid < pid_max);
205 } 205 }
206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
207 ++map; 207 ++map;
208 offset = 0; 208 offset = 0;
209 } else { 209 } else {
210 map = &pid_ns->pidmap[0]; 210 map = &pid_ns->pidmap[0];
211 offset = RESERVED_PIDS; 211 offset = RESERVED_PIDS;
212 if (unlikely(last == offset)) 212 if (unlikely(last == offset))
213 break; 213 break;
214 } 214 }
215 pid = mk_pid(pid_ns, map, offset); 215 pid = mk_pid(pid_ns, map, offset);
216 } 216 }
217 return -1; 217 return -1;
218 } 218 }
219 219
220 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) 220 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221 { 221 {
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT) 225 if (last >= PID_MAX_LIMIT)
226 return -1; 226 return -1;
227 227
228 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
230 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
231 for (; map < end; map++, offset = 0) { 231 for (; map < end; map++, offset = 0) {
232 if (unlikely(!map->page)) 232 if (unlikely(!map->page))
233 continue; 233 continue;
234 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 234 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
235 if (offset < BITS_PER_PAGE) 235 if (offset < BITS_PER_PAGE)
236 return mk_pid(pid_ns, map, offset); 236 return mk_pid(pid_ns, map, offset);
237 } 237 }
238 return -1; 238 return -1;
239 } 239 }
240 240
241 void put_pid(struct pid *pid) 241 void put_pid(struct pid *pid)
242 { 242 {
243 struct pid_namespace *ns; 243 struct pid_namespace *ns;
244 244
245 if (!pid) 245 if (!pid)
246 return; 246 return;
247 247
248 ns = pid->numbers[pid->level].ns; 248 ns = pid->numbers[pid->level].ns;
249 if ((atomic_read(&pid->count) == 1) || 249 if ((atomic_read(&pid->count) == 1) ||
250 atomic_dec_and_test(&pid->count)) { 250 atomic_dec_and_test(&pid->count)) {
251 kmem_cache_free(ns->pid_cachep, pid); 251 kmem_cache_free(ns->pid_cachep, pid);
252 put_pid_ns(ns); 252 put_pid_ns(ns);
253 } 253 }
254 } 254 }
255 EXPORT_SYMBOL_GPL(put_pid); 255 EXPORT_SYMBOL_GPL(put_pid);
256 256
257 static void delayed_put_pid(struct rcu_head *rhp) 257 static void delayed_put_pid(struct rcu_head *rhp)
258 { 258 {
259 struct pid *pid = container_of(rhp, struct pid, rcu); 259 struct pid *pid = container_of(rhp, struct pid, rcu);
260 put_pid(pid); 260 put_pid(pid);
261 } 261 }
262 262
263 void free_pid(struct pid *pid) 263 void free_pid(struct pid *pid)
264 { 264 {
265 /* We can be called with write_lock_irq(&tasklist_lock) held */ 265 /* We can be called with write_lock_irq(&tasklist_lock) held */
266 int i; 266 int i;
267 unsigned long flags; 267 unsigned long flags;
268 268
269 spin_lock_irqsave(&pidmap_lock, flags); 269 spin_lock_irqsave(&pidmap_lock, flags);
270 for (i = 0; i <= pid->level; i++) 270 for (i = 0; i <= pid->level; i++)
271 hlist_del_rcu(&pid->numbers[i].pid_chain); 271 hlist_del_rcu(&pid->numbers[i].pid_chain);
272 spin_unlock_irqrestore(&pidmap_lock, flags); 272 spin_unlock_irqrestore(&pidmap_lock, flags);
273 273
274 for (i = 0; i <= pid->level; i++) 274 for (i = 0; i <= pid->level; i++)
275 free_pidmap(pid->numbers + i); 275 free_pidmap(pid->numbers + i);
276 276
277 call_rcu(&pid->rcu, delayed_put_pid); 277 call_rcu(&pid->rcu, delayed_put_pid);
278 } 278 }
279 279
280 struct pid *alloc_pid(struct pid_namespace *ns) 280 struct pid *alloc_pid(struct pid_namespace *ns)
281 { 281 {
282 struct pid *pid; 282 struct pid *pid;
283 enum pid_type type; 283 enum pid_type type;
284 int i, nr; 284 int i, nr;
285 struct pid_namespace *tmp; 285 struct pid_namespace *tmp;
286 struct upid *upid; 286 struct upid *upid;
287 287
288 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); 288 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
289 if (!pid) 289 if (!pid)
290 goto out; 290 goto out;
291 291
292 tmp = ns; 292 tmp = ns;
293 for (i = ns->level; i >= 0; i--) { 293 for (i = ns->level; i >= 0; i--) {
294 nr = alloc_pidmap(tmp); 294 nr = alloc_pidmap(tmp);
295 if (nr < 0) 295 if (nr < 0)
296 goto out_free; 296 goto out_free;
297 297
298 pid->numbers[i].nr = nr; 298 pid->numbers[i].nr = nr;
299 pid->numbers[i].ns = tmp; 299 pid->numbers[i].ns = tmp;
300 tmp = tmp->parent; 300 tmp = tmp->parent;
301 } 301 }
302 302
303 get_pid_ns(ns); 303 get_pid_ns(ns);
304 pid->level = ns->level; 304 pid->level = ns->level;
305 atomic_set(&pid->count, 1); 305 atomic_set(&pid->count, 1);
306 for (type = 0; type < PIDTYPE_MAX; ++type) 306 for (type = 0; type < PIDTYPE_MAX; ++type)
307 INIT_HLIST_HEAD(&pid->tasks[type]); 307 INIT_HLIST_HEAD(&pid->tasks[type]);
308 308
309 upid = pid->numbers + ns->level; 309 upid = pid->numbers + ns->level;
310 spin_lock_irq(&pidmap_lock); 310 spin_lock_irq(&pidmap_lock);
311 for ( ; upid >= pid->numbers; --upid) 311 for ( ; upid >= pid->numbers; --upid)
312 hlist_add_head_rcu(&upid->pid_chain, 312 hlist_add_head_rcu(&upid->pid_chain,
313 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 313 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
314 spin_unlock_irq(&pidmap_lock); 314 spin_unlock_irq(&pidmap_lock);
315 315
316 out: 316 out:
317 return pid; 317 return pid;
318 318
319 out_free: 319 out_free:
320 while (++i <= ns->level) 320 while (++i <= ns->level)
321 free_pidmap(pid->numbers + i); 321 free_pidmap(pid->numbers + i);
322 322
323 kmem_cache_free(ns->pid_cachep, pid); 323 kmem_cache_free(ns->pid_cachep, pid);
324 pid = NULL; 324 pid = NULL;
325 goto out; 325 goto out;
326 } 326 }
327 327
328 struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 328 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
329 { 329 {
330 struct hlist_node *elem; 330 struct hlist_node *elem;
331 struct upid *pnr; 331 struct upid *pnr;
332 332
333 hlist_for_each_entry_rcu(pnr, elem, 333 hlist_for_each_entry_rcu(pnr, elem,
334 &pid_hash[pid_hashfn(nr, ns)], pid_chain) 334 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
335 if (pnr->nr == nr && pnr->ns == ns) 335 if (pnr->nr == nr && pnr->ns == ns)
336 return container_of(pnr, struct pid, 336 return container_of(pnr, struct pid,
337 numbers[ns->level]); 337 numbers[ns->level]);
338 338
339 return NULL; 339 return NULL;
340 } 340 }
341 EXPORT_SYMBOL_GPL(find_pid_ns); 341 EXPORT_SYMBOL_GPL(find_pid_ns);
342 342
343 struct pid *find_vpid(int nr) 343 struct pid *find_vpid(int nr)
344 { 344 {
345 return find_pid_ns(nr, current->nsproxy->pid_ns); 345 return find_pid_ns(nr, current->nsproxy->pid_ns);
346 } 346 }
347 EXPORT_SYMBOL_GPL(find_vpid); 347 EXPORT_SYMBOL_GPL(find_vpid);
348 348
349 /* 349 /*
350 * attach_pid() must be called with the tasklist_lock write-held. 350 * attach_pid() must be called with the tasklist_lock write-held.
351 */ 351 */
352 void attach_pid(struct task_struct *task, enum pid_type type, 352 void attach_pid(struct task_struct *task, enum pid_type type,
353 struct pid *pid) 353 struct pid *pid)
354 { 354 {
355 struct pid_link *link; 355 struct pid_link *link;
356 356
357 link = &task->pids[type]; 357 link = &task->pids[type];
358 link->pid = pid; 358 link->pid = pid;
359 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 359 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
360 } 360 }
361 361
362 static void __change_pid(struct task_struct *task, enum pid_type type, 362 static void __change_pid(struct task_struct *task, enum pid_type type,
363 struct pid *new) 363 struct pid *new)
364 { 364 {
365 struct pid_link *link; 365 struct pid_link *link;
366 struct pid *pid; 366 struct pid *pid;
367 int tmp; 367 int tmp;
368 368
369 link = &task->pids[type]; 369 link = &task->pids[type];
370 pid = link->pid; 370 pid = link->pid;
371 371
372 hlist_del_rcu(&link->node); 372 hlist_del_rcu(&link->node);
373 link->pid = new; 373 link->pid = new;
374 374
375 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 375 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
376 if (!hlist_empty(&pid->tasks[tmp])) 376 if (!hlist_empty(&pid->tasks[tmp]))
377 return; 377 return;
378 378
379 free_pid(pid); 379 free_pid(pid);
380 } 380 }
381 381
382 void detach_pid(struct task_struct *task, enum pid_type type) 382 void detach_pid(struct task_struct *task, enum pid_type type)
383 { 383 {
384 __change_pid(task, type, NULL); 384 __change_pid(task, type, NULL);
385 } 385 }
386 386
387 void change_pid(struct task_struct *task, enum pid_type type, 387 void change_pid(struct task_struct *task, enum pid_type type,
388 struct pid *pid) 388 struct pid *pid)
389 { 389 {
390 __change_pid(task, type, pid); 390 __change_pid(task, type, pid);
391 attach_pid(task, type, pid); 391 attach_pid(task, type, pid);
392 } 392 }
393 393
394 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 394 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
395 void transfer_pid(struct task_struct *old, struct task_struct *new, 395 void transfer_pid(struct task_struct *old, struct task_struct *new,
396 enum pid_type type) 396 enum pid_type type)
397 { 397 {
398 new->pids[type].pid = old->pids[type].pid; 398 new->pids[type].pid = old->pids[type].pid;
399 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); 399 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
400 } 400 }
401 401
402 struct task_struct *pid_task(struct pid *pid, enum pid_type type) 402 struct task_struct *pid_task(struct pid *pid, enum pid_type type)
403 { 403 {
404 struct task_struct *result = NULL; 404 struct task_struct *result = NULL;
405 if (pid) { 405 if (pid) {
406 struct hlist_node *first; 406 struct hlist_node *first;
407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
408 rcu_read_lock_held() ||
409 lockdep_tasklist_lock_is_held()); 408 lockdep_tasklist_lock_is_held());
410 if (first) 409 if (first)
411 result = hlist_entry(first, struct task_struct, pids[(type)].node); 410 result = hlist_entry(first, struct task_struct, pids[(type)].node);
412 } 411 }
413 return result; 412 return result;
414 } 413 }
415 EXPORT_SYMBOL(pid_task); 414 EXPORT_SYMBOL(pid_task);
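
For context on the hunk above: the caller no longer spells out rcu_read_lock_held() and only supplies the additional lockdep condition. A minimal caller-side sketch of that pattern follows; struct foo, gp, my_lock and read_val() are hypothetical names used for illustration only and are not part of this patch.

	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>

	struct foo {
		int val;
	};

	static struct foo __rcu *gp;		/* hypothetical RCU-protected pointer */
	static DEFINE_SPINLOCK(my_lock);	/* hypothetical lock also protecting gp */

	static int read_val(void)
	{
		struct foo *p;
		int val = -1;

		rcu_read_lock();
		/*
		 * rcu_read_lock_held() is implied by rcu_dereference_check()
		 * itself, so only the extra lockdep condition is passed.
		 */
		p = rcu_dereference_check(gp, lockdep_is_held(&my_lock));
		if (p)
			val = p->val;
		rcu_read_unlock();
		return val;
	}
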
416 415
417 /* 416 /*
418 * Must be called under rcu_read_lock(). 417 * Must be called under rcu_read_lock().
419 */ 418 */
420 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 419 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
421 { 420 {
422 rcu_lockdep_assert(rcu_read_lock_held()); 421 rcu_lockdep_assert(rcu_read_lock_held());
423 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
424 } 423 }
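
As the comment above says, find_task_by_pid_ns() relies on the caller's rcu_read_lock(). A usage sketch follows the same pattern as get_pid_task() further down in this file; grab_task_by_pid_ns() is a hypothetical helper, not part of this patch.

	#include <linux/sched.h>
	#include <linux/pid_namespace.h>
	#include <linux/rcupdate.h>

	static struct task_struct *grab_task_by_pid_ns(pid_t nr,
						       struct pid_namespace *ns)
	{
		struct task_struct *tsk;

		rcu_read_lock();
		tsk = find_task_by_pid_ns(nr, ns);
		if (tsk)
			get_task_struct(tsk);	/* pin before leaving the RCU section */
		rcu_read_unlock();
		return tsk;
	}
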
425 424
426 struct task_struct *find_task_by_vpid(pid_t vnr) 425 struct task_struct *find_task_by_vpid(pid_t vnr)
427 { 426 {
428 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 427 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
429 } 428 }
430 429
431 struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 430 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
432 { 431 {
433 struct pid *pid; 432 struct pid *pid;
434 rcu_read_lock(); 433 rcu_read_lock();
435 if (type != PIDTYPE_PID) 434 if (type != PIDTYPE_PID)
436 task = task->group_leader; 435 task = task->group_leader;
437 pid = get_pid(task->pids[type].pid); 436 pid = get_pid(task->pids[type].pid);
438 rcu_read_unlock(); 437 rcu_read_unlock();
439 return pid; 438 return pid;
440 } 439 }
441 EXPORT_SYMBOL_GPL(get_task_pid); 440 EXPORT_SYMBOL_GPL(get_task_pid);
442 441
443 struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 442 struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
444 { 443 {
445 struct task_struct *result; 444 struct task_struct *result;
446 rcu_read_lock(); 445 rcu_read_lock();
447 result = pid_task(pid, type); 446 result = pid_task(pid, type);
448 if (result) 447 if (result)
449 get_task_struct(result); 448 get_task_struct(result);
450 rcu_read_unlock(); 449 rcu_read_unlock();
451 return result; 450 return result;
452 } 451 }
453 EXPORT_SYMBOL_GPL(get_pid_task); 452 EXPORT_SYMBOL_GPL(get_pid_task);
454 453
455 struct pid *find_get_pid(pid_t nr) 454 struct pid *find_get_pid(pid_t nr)
456 { 455 {
457 struct pid *pid; 456 struct pid *pid;
458 457
459 rcu_read_lock(); 458 rcu_read_lock();
460 pid = get_pid(find_vpid(nr)); 459 pid = get_pid(find_vpid(nr));
461 rcu_read_unlock(); 460 rcu_read_unlock();
462 461
463 return pid; 462 return pid;
464 } 463 }
465 EXPORT_SYMBOL_GPL(find_get_pid); 464 EXPORT_SYMBOL_GPL(find_get_pid);
466 465
467 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) 466 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
468 { 467 {
469 struct upid *upid; 468 struct upid *upid;
470 pid_t nr = 0; 469 pid_t nr = 0;
471 470
472 if (pid && ns->level <= pid->level) { 471 if (pid && ns->level <= pid->level) {
473 upid = &pid->numbers[ns->level]; 472 upid = &pid->numbers[ns->level];
474 if (upid->ns == ns) 473 if (upid->ns == ns)
475 nr = upid->nr; 474 nr = upid->nr;
476 } 475 }
477 return nr; 476 return nr;
478 } 477 }
479 478
480 pid_t pid_vnr(struct pid *pid) 479 pid_t pid_vnr(struct pid *pid)
481 { 480 {
482 return pid_nr_ns(pid, current->nsproxy->pid_ns); 481 return pid_nr_ns(pid, current->nsproxy->pid_ns);
483 } 482 }
484 EXPORT_SYMBOL_GPL(pid_vnr); 483 EXPORT_SYMBOL_GPL(pid_vnr);
485 484
486 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, 485 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
487 struct pid_namespace *ns) 486 struct pid_namespace *ns)
488 { 487 {
489 pid_t nr = 0; 488 pid_t nr = 0;
490 489
491 rcu_read_lock(); 490 rcu_read_lock();
492 if (!ns) 491 if (!ns)
493 ns = current->nsproxy->pid_ns; 492 ns = current->nsproxy->pid_ns;
494 if (likely(pid_alive(task))) { 493 if (likely(pid_alive(task))) {
495 if (type != PIDTYPE_PID) 494 if (type != PIDTYPE_PID)
496 task = task->group_leader; 495 task = task->group_leader;
497 nr = pid_nr_ns(task->pids[type].pid, ns); 496 nr = pid_nr_ns(task->pids[type].pid, ns);
498 } 497 }
499 rcu_read_unlock(); 498 rcu_read_unlock();
500 499
501 return nr; 500 return nr;
502 } 501 }
503 EXPORT_SYMBOL(__task_pid_nr_ns); 502 EXPORT_SYMBOL(__task_pid_nr_ns);
504 503
505 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 504 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
506 { 505 {
507 return pid_nr_ns(task_tgid(tsk), ns); 506 return pid_nr_ns(task_tgid(tsk), ns);
508 } 507 }
509 EXPORT_SYMBOL(task_tgid_nr_ns); 508 EXPORT_SYMBOL(task_tgid_nr_ns);
510 509
511 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) 510 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
512 { 511 {
513 return ns_of_pid(task_pid(tsk)); 512 return ns_of_pid(task_pid(tsk));
514 } 513 }
515 EXPORT_SYMBOL_GPL(task_active_pid_ns); 514 EXPORT_SYMBOL_GPL(task_active_pid_ns);
516 515
517 /* 516 /*
518 * Used by proc to find the first pid that is greater than or equal to nr. 517 * Used by proc to find the first pid that is greater than or equal to nr.
519 * 518 *
520 * If there is a pid at nr this function is exactly the same as find_pid_ns. 519 * If there is a pid at nr this function is exactly the same as find_pid_ns.
521 */ 520 */
522 struct pid *find_ge_pid(int nr, struct pid_namespace *ns) 521 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
523 { 522 {
524 struct pid *pid; 523 struct pid *pid;
525 524
526 do { 525 do {
527 pid = find_pid_ns(nr, ns); 526 pid = find_pid_ns(nr, ns);
528 if (pid) 527 if (pid)
529 break; 528 break;
530 nr = next_pidmap(ns, nr); 529 nr = next_pidmap(ns, nr);
531 } while (nr > 0); 530 } while (nr > 0);
532 531
533 return pid; 532 return pid;
534 } 533 }
535 534
536 /* 535 /*
537 * The pid hash table is scaled according to the amount of memory in the 536 * The pid hash table is scaled according to the amount of memory in the
538 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 537 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
539 * more. 538 * more.
540 */ 539 */
541 void __init pidhash_init(void) 540 void __init pidhash_init(void)
542 { 541 {
543 int i, pidhash_size; 542 int i, pidhash_size;
544 543
545 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 544 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
546 HASH_EARLY | HASH_SMALL, 545 HASH_EARLY | HASH_SMALL,
547 &pidhash_shift, NULL, 4096); 546 &pidhash_shift, NULL, 4096);
548 pidhash_size = 1 << pidhash_shift; 547 pidhash_size = 1 << pidhash_shift;
549 548
550 for (i = 0; i < pidhash_size; i++) 549 for (i = 0; i < pidhash_size; i++)
551 INIT_HLIST_HEAD(&pid_hash[i]); 550 INIT_HLIST_HEAD(&pid_hash[i]);
552 } 551 }
553 552
554 void __init pidmap_init(void) 553 void __init pidmap_init(void)
555 { 554 {
556 /* bump default and minimum pid_max based on number of cpus */ 555 /* bump default and minimum pid_max based on number of cpus */
557 pid_max = min(pid_max_max, max_t(int, pid_max, 556 pid_max = min(pid_max_max, max_t(int, pid_max,
558 PIDS_PER_CPU_DEFAULT * num_possible_cpus())); 557 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
559 pid_max_min = max_t(int, pid_max_min, 558 pid_max_min = max_t(int, pid_max_min,
560 PIDS_PER_CPU_MIN * num_possible_cpus()); 559 PIDS_PER_CPU_MIN * num_possible_cpus());
561 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); 560 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
562 561
563 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 562 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
564 /* Reserve PID 0. We never call free_pidmap(0) */ 563 /* Reserve PID 0. We never call free_pidmap(0) */
565 set_bit(0, init_pid_ns.pidmap[0].page); 564 set_bit(0, init_pid_ns.pidmap[0].page);
566 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 565 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
567 566
568 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 567 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
569 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 568 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
570 } 569 }
571 570
kernel/rcutorture.c
1 /* 1 /*
2 * Read-Copy Update module-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
25 #include <linux/types.h> 25 #include <linux/types.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/init.h> 27 #include <linux/init.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/kthread.h> 29 #include <linux/kthread.h>
30 #include <linux/err.h> 30 #include <linux/err.h>
31 #include <linux/spinlock.h> 31 #include <linux/spinlock.h>
32 #include <linux/smp.h> 32 #include <linux/smp.h>
33 #include <linux/rcupdate.h> 33 #include <linux/rcupdate.h>
34 #include <linux/interrupt.h> 34 #include <linux/interrupt.h>
35 #include <linux/sched.h> 35 #include <linux/sched.h>
36 #include <asm/atomic.h> 36 #include <asm/atomic.h>
37 #include <linux/bitops.h> 37 #include <linux/bitops.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/moduleparam.h> 39 #include <linux/moduleparam.h>
40 #include <linux/percpu.h> 40 #include <linux/percpu.h>
41 #include <linux/notifier.h> 41 #include <linux/notifier.h>
42 #include <linux/reboot.h> 42 #include <linux/reboot.h>
43 #include <linux/freezer.h> 43 #include <linux/freezer.h>
44 #include <linux/cpu.h> 44 #include <linux/cpu.h>
45 #include <linux/delay.h> 45 #include <linux/delay.h>
46 #include <linux/stat.h> 46 #include <linux/stat.h>
47 #include <linux/srcu.h> 47 #include <linux/srcu.h>
48 #include <linux/slab.h> 48 #include <linux/slab.h>
49 #include <asm/byteorder.h> 49 #include <asm/byteorder.h>
50 50
51 MODULE_LICENSE("GPL"); 51 MODULE_LICENSE("GPL");
52 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55 static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55 static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56 static int nfakewriters = 4; /* # fake writer threads */ 56 static int nfakewriters = 4; /* # fake writer threads */
57 static int stat_interval; /* Interval between stats, in seconds. */ 57 static int stat_interval; /* Interval between stats, in seconds. */
58 /* Defaults to "only at end of test". */ 58 /* Defaults to "only at end of test". */
59 static int verbose; /* Print more debug info. */ 59 static int verbose; /* Print more debug info. */
60 static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 60 static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61 static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61 static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62 static int stutter = 5; /* Start/stop testing interval (in sec) */ 62 static int stutter = 5; /* Start/stop testing interval (in sec) */
63 static int irqreader = 1; /* RCU readers from irq (timers). */ 63 static int irqreader = 1; /* RCU readers from irq (timers). */
64 static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64 static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65 static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65 static int fqs_holdoff = 0; /* Hold time within burst (us). */
66 static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66 static int fqs_stutter = 3; /* Wait time between bursts (s). */
67 static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 67 static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68 static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 68 static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69 static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 69 static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
70 static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 70 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
71 71
72 module_param(nreaders, int, 0444); 72 module_param(nreaders, int, 0444);
73 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 73 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
74 module_param(nfakewriters, int, 0444); 74 module_param(nfakewriters, int, 0444);
75 MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 75 MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
76 module_param(stat_interval, int, 0444); 76 module_param(stat_interval, int, 0444);
77 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 77 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
78 module_param(verbose, bool, 0444); 78 module_param(verbose, bool, 0444);
79 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 79 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
80 module_param(test_no_idle_hz, bool, 0444); 80 module_param(test_no_idle_hz, bool, 0444);
81 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 81 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
82 module_param(shuffle_interval, int, 0444); 82 module_param(shuffle_interval, int, 0444);
83 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 83 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
84 module_param(stutter, int, 0444); 84 module_param(stutter, int, 0444);
85 MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 85 MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
86 module_param(irqreader, int, 0444); 86 module_param(irqreader, int, 0444);
87 MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 87 MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
88 module_param(fqs_duration, int, 0444); 88 module_param(fqs_duration, int, 0444);
89 MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 89 MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
90 module_param(fqs_holdoff, int, 0444); 90 module_param(fqs_holdoff, int, 0444);
91 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 91 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92 module_param(fqs_stutter, int, 0444); 92 module_param(fqs_stutter, int, 0444);
93 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 93 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
94 module_param(test_boost, int, 0444); 94 module_param(test_boost, int, 0444);
95 MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 95 MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96 module_param(test_boost_interval, int, 0444); 96 module_param(test_boost_interval, int, 0444);
97 MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); 97 MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
98 module_param(test_boost_duration, int, 0444); 98 module_param(test_boost_duration, int, 0444);
99 MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 99 MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
100 module_param(torture_type, charp, 0444); 100 module_param(torture_type, charp, 0444);
101 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 101 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
102 102
103 #define TORTURE_FLAG "-torture:" 103 #define TORTURE_FLAG "-torture:"
104 #define PRINTK_STRING(s) \ 104 #define PRINTK_STRING(s) \
105 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 105 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
106 #define VERBOSE_PRINTK_STRING(s) \ 106 #define VERBOSE_PRINTK_STRING(s) \
107 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 107 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
108 #define VERBOSE_PRINTK_ERRSTRING(s) \ 108 #define VERBOSE_PRINTK_ERRSTRING(s) \
109 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 109 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
110 110
111 static char printk_buf[4096]; 111 static char printk_buf[4096];
112 112
113 static int nrealreaders; 113 static int nrealreaders;
114 static struct task_struct *writer_task; 114 static struct task_struct *writer_task;
115 static struct task_struct **fakewriter_tasks; 115 static struct task_struct **fakewriter_tasks;
116 static struct task_struct **reader_tasks; 116 static struct task_struct **reader_tasks;
117 static struct task_struct *stats_task; 117 static struct task_struct *stats_task;
118 static struct task_struct *shuffler_task; 118 static struct task_struct *shuffler_task;
119 static struct task_struct *stutter_task; 119 static struct task_struct *stutter_task;
120 static struct task_struct *fqs_task; 120 static struct task_struct *fqs_task;
121 static struct task_struct *boost_tasks[NR_CPUS]; 121 static struct task_struct *boost_tasks[NR_CPUS];
122 122
123 #define RCU_TORTURE_PIPE_LEN 10 123 #define RCU_TORTURE_PIPE_LEN 10
124 124
125 struct rcu_torture { 125 struct rcu_torture {
126 struct rcu_head rtort_rcu; 126 struct rcu_head rtort_rcu;
127 int rtort_pipe_count; 127 int rtort_pipe_count;
128 struct list_head rtort_free; 128 struct list_head rtort_free;
129 int rtort_mbtest; 129 int rtort_mbtest;
130 }; 130 };
131 131
132 static LIST_HEAD(rcu_torture_freelist); 132 static LIST_HEAD(rcu_torture_freelist);
133 static struct rcu_torture __rcu *rcu_torture_current; 133 static struct rcu_torture __rcu *rcu_torture_current;
134 static unsigned long rcu_torture_current_version; 134 static unsigned long rcu_torture_current_version;
135 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136 static DEFINE_SPINLOCK(rcu_torture_lock); 136 static DEFINE_SPINLOCK(rcu_torture_lock);
137 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
138 { 0 }; 138 { 0 };
139 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = 139 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
140 { 0 }; 140 { 0 };
141 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; 141 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
142 static atomic_t n_rcu_torture_alloc; 142 static atomic_t n_rcu_torture_alloc;
143 static atomic_t n_rcu_torture_alloc_fail; 143 static atomic_t n_rcu_torture_alloc_fail;
144 static atomic_t n_rcu_torture_free; 144 static atomic_t n_rcu_torture_free;
145 static atomic_t n_rcu_torture_mberror; 145 static atomic_t n_rcu_torture_mberror;
146 static atomic_t n_rcu_torture_error; 146 static atomic_t n_rcu_torture_error;
147 static long n_rcu_torture_boost_ktrerror; 147 static long n_rcu_torture_boost_ktrerror;
148 static long n_rcu_torture_boost_rterror; 148 static long n_rcu_torture_boost_rterror;
149 static long n_rcu_torture_boost_failure; 149 static long n_rcu_torture_boost_failure;
150 static long n_rcu_torture_boosts; 150 static long n_rcu_torture_boosts;
151 static long n_rcu_torture_timers; 151 static long n_rcu_torture_timers;
152 static struct list_head rcu_torture_removed; 152 static struct list_head rcu_torture_removed;
153 static cpumask_var_t shuffle_tmp_mask; 153 static cpumask_var_t shuffle_tmp_mask;
154 154
155 static int stutter_pause_test; 155 static int stutter_pause_test;
156 156
157 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 157 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
158 #define RCUTORTURE_RUNNABLE_INIT 1 158 #define RCUTORTURE_RUNNABLE_INIT 1
159 #else 159 #else
160 #define RCUTORTURE_RUNNABLE_INIT 0 160 #define RCUTORTURE_RUNNABLE_INIT 0
161 #endif 161 #endif
162 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
163 163
164 #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 164 #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165 #define rcu_can_boost() 1 165 #define rcu_can_boost() 1
166 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 166 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
167 #define rcu_can_boost() 0 167 #define rcu_can_boost() 0
168 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 168 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 169
170 static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170 static unsigned long boost_starttime; /* jiffies of next boost test start. */
171 DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171 DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 172 /* and boost task create/destroy. */
173 173
174 /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 174 /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
175 175
176 #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 176 #define FULLSTOP_DONTSTOP 0 /* Normal operation. */
177 #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 177 #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
178 #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 178 #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
179 static int fullstop = FULLSTOP_RMMOD; 179 static int fullstop = FULLSTOP_RMMOD;
180 /* 180 /*
181 * Protect fullstop transitions and spawning of kthreads. 181 * Protect fullstop transitions and spawning of kthreads.
182 */ 182 */
183 static DEFINE_MUTEX(fullstop_mutex); 183 static DEFINE_MUTEX(fullstop_mutex);
184 184
185 /* 185 /*
186 * Detect and respond to a system shutdown. 186 * Detect and respond to a system shutdown.
187 */ 187 */
188 static int 188 static int
189 rcutorture_shutdown_notify(struct notifier_block *unused1, 189 rcutorture_shutdown_notify(struct notifier_block *unused1,
190 unsigned long unused2, void *unused3) 190 unsigned long unused2, void *unused3)
191 { 191 {
192 mutex_lock(&fullstop_mutex); 192 mutex_lock(&fullstop_mutex);
193 if (fullstop == FULLSTOP_DONTSTOP) 193 if (fullstop == FULLSTOP_DONTSTOP)
194 fullstop = FULLSTOP_SHUTDOWN; 194 fullstop = FULLSTOP_SHUTDOWN;
195 else 195 else
196 printk(KERN_WARNING /* but going down anyway, so... */ 196 printk(KERN_WARNING /* but going down anyway, so... */
197 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 197 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
198 mutex_unlock(&fullstop_mutex); 198 mutex_unlock(&fullstop_mutex);
199 return NOTIFY_DONE; 199 return NOTIFY_DONE;
200 } 200 }
201 201
202 /* 202 /*
203 * Absorb kthreads into a kernel function that won't return, so that 203 * Absorb kthreads into a kernel function that won't return, so that
204 * they won't ever access module text or data again. 204 * they won't ever access module text or data again.
205 */ 205 */
206 static void rcutorture_shutdown_absorb(char *title) 206 static void rcutorture_shutdown_absorb(char *title)
207 { 207 {
208 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 208 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
209 printk(KERN_NOTICE 209 printk(KERN_NOTICE
210 "rcutorture thread %s parking due to system shutdown\n", 210 "rcutorture thread %s parking due to system shutdown\n",
211 title); 211 title);
212 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); 212 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
213 } 213 }
214 } 214 }
215 215
216 /* 216 /*
217 * Allocate an element from the rcu_tortures pool. 217 * Allocate an element from the rcu_tortures pool.
218 */ 218 */
219 static struct rcu_torture * 219 static struct rcu_torture *
220 rcu_torture_alloc(void) 220 rcu_torture_alloc(void)
221 { 221 {
222 struct list_head *p; 222 struct list_head *p;
223 223
224 spin_lock_bh(&rcu_torture_lock); 224 spin_lock_bh(&rcu_torture_lock);
225 if (list_empty(&rcu_torture_freelist)) { 225 if (list_empty(&rcu_torture_freelist)) {
226 atomic_inc(&n_rcu_torture_alloc_fail); 226 atomic_inc(&n_rcu_torture_alloc_fail);
227 spin_unlock_bh(&rcu_torture_lock); 227 spin_unlock_bh(&rcu_torture_lock);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_inc(&n_rcu_torture_alloc); 230 atomic_inc(&n_rcu_torture_alloc);
231 p = rcu_torture_freelist.next; 231 p = rcu_torture_freelist.next;
232 list_del_init(p); 232 list_del_init(p);
233 spin_unlock_bh(&rcu_torture_lock); 233 spin_unlock_bh(&rcu_torture_lock);
234 return container_of(p, struct rcu_torture, rtort_free); 234 return container_of(p, struct rcu_torture, rtort_free);
235 } 235 }
236 236
237 /* 237 /*
238 * Free an element to the rcu_tortures pool. 238 * Free an element to the rcu_tortures pool.
239 */ 239 */
240 static void 240 static void
241 rcu_torture_free(struct rcu_torture *p) 241 rcu_torture_free(struct rcu_torture *p)
242 { 242 {
243 atomic_inc(&n_rcu_torture_free); 243 atomic_inc(&n_rcu_torture_free);
244 spin_lock_bh(&rcu_torture_lock); 244 spin_lock_bh(&rcu_torture_lock);
245 list_add_tail(&p->rtort_free, &rcu_torture_freelist); 245 list_add_tail(&p->rtort_free, &rcu_torture_freelist);
246 spin_unlock_bh(&rcu_torture_lock); 246 spin_unlock_bh(&rcu_torture_lock);
247 } 247 }
248 248
249 struct rcu_random_state { 249 struct rcu_random_state {
250 unsigned long rrs_state; 250 unsigned long rrs_state;
251 long rrs_count; 251 long rrs_count;
252 }; 252 };
253 253
254 #define RCU_RANDOM_MULT 39916801 /* prime */ 254 #define RCU_RANDOM_MULT 39916801 /* prime */
255 #define RCU_RANDOM_ADD 479001701 /* prime */ 255 #define RCU_RANDOM_ADD 479001701 /* prime */
256 #define RCU_RANDOM_REFRESH 10000 256 #define RCU_RANDOM_REFRESH 10000
257 257
258 #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } 258 #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
259 259
260 /* 260 /*
261 * Crude but fast random-number generator. Uses a linear congruential 261 * Crude but fast random-number generator. Uses a linear congruential
262 * generator, with occasional help from cpu_clock(). 262 * generator, with occasional help from cpu_clock().
263 */ 263 */
264 static unsigned long 264 static unsigned long
265 rcu_random(struct rcu_random_state *rrsp) 265 rcu_random(struct rcu_random_state *rrsp)
266 { 266 {
267 if (--rrsp->rrs_count < 0) { 267 if (--rrsp->rrs_count < 0) {
268 rrsp->rrs_state += (unsigned long)local_clock(); 268 rrsp->rrs_state += (unsigned long)local_clock();
269 rrsp->rrs_count = RCU_RANDOM_REFRESH; 269 rrsp->rrs_count = RCU_RANDOM_REFRESH;
270 } 270 }
271 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 271 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
272 return swahw32(rrsp->rrs_state); 272 return swahw32(rrsp->rrs_state);
273 } 273 }
274 274
275 static void 275 static void
276 rcu_stutter_wait(char *title) 276 rcu_stutter_wait(char *title)
277 { 277 {
278 while (stutter_pause_test || !rcutorture_runnable) { 278 while (stutter_pause_test || !rcutorture_runnable) {
279 if (rcutorture_runnable) 279 if (rcutorture_runnable)
280 schedule_timeout_interruptible(1); 280 schedule_timeout_interruptible(1);
281 else 281 else
282 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 282 schedule_timeout_interruptible(round_jiffies_relative(HZ));
283 rcutorture_shutdown_absorb(title); 283 rcutorture_shutdown_absorb(title);
284 } 284 }
285 } 285 }
286 286
287 /* 287 /*
288 * Operations vector for selecting different types of tests. 288 * Operations vector for selecting different types of tests.
289 */ 289 */
290 290
291 struct rcu_torture_ops { 291 struct rcu_torture_ops {
292 void (*init)(void); 292 void (*init)(void);
293 void (*cleanup)(void); 293 void (*cleanup)(void);
294 int (*readlock)(void); 294 int (*readlock)(void);
295 void (*read_delay)(struct rcu_random_state *rrsp); 295 void (*read_delay)(struct rcu_random_state *rrsp);
296 void (*readunlock)(int idx); 296 void (*readunlock)(int idx);
297 int (*completed)(void); 297 int (*completed)(void);
298 void (*deferred_free)(struct rcu_torture *p); 298 void (*deferred_free)(struct rcu_torture *p);
299 void (*sync)(void); 299 void (*sync)(void);
300 void (*cb_barrier)(void); 300 void (*cb_barrier)(void);
301 void (*fqs)(void); 301 void (*fqs)(void);
302 int (*stats)(char *page); 302 int (*stats)(char *page);
303 int irq_capable; 303 int irq_capable;
304 int can_boost; 304 int can_boost;
305 char *name; 305 char *name;
306 }; 306 };
307 307
308 static struct rcu_torture_ops *cur_ops; 308 static struct rcu_torture_ops *cur_ops;
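
The ops vector above is what lets one test loop drive the different RCU flavours interchangeably. A minimal sketch of how a reader pass might dispatch through cur_ops, using the definitions above; one_read_pass() is hypothetical and not part of this patch.

	static void one_read_pass(struct rcu_random_state *rrsp)
	{
		int idx;

		idx = cur_ops->readlock();	/* e.g. rcu_read_lock() for "rcu" */
		cur_ops->read_delay(rrsp);	/* flavour-specific reader delay */
		cur_ops->readunlock(idx);	/* matching unlock */
	}
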
309 309
310 /* 310 /*
311 * Definitions for rcu torture testing. 311 * Definitions for rcu torture testing.
312 */ 312 */
313 313
314 static int rcu_torture_read_lock(void) __acquires(RCU) 314 static int rcu_torture_read_lock(void) __acquires(RCU)
315 { 315 {
316 rcu_read_lock(); 316 rcu_read_lock();
317 return 0; 317 return 0;
318 } 318 }
319 319
320 static void rcu_read_delay(struct rcu_random_state *rrsp) 320 static void rcu_read_delay(struct rcu_random_state *rrsp)
321 { 321 {
322 const unsigned long shortdelay_us = 200; 322 const unsigned long shortdelay_us = 200;
323 const unsigned long longdelay_ms = 50; 323 const unsigned long longdelay_ms = 50;
324 324
325 /* We want a short delay sometimes to make a reader delay the grace 325 /* We want a short delay sometimes to make a reader delay the grace
326 * period, and we want a long delay occasionally to trigger 326 * period, and we want a long delay occasionally to trigger
327 * force_quiescent_state. */ 327 * force_quiescent_state. */
328 328
329 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) 329 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
330 mdelay(longdelay_ms); 330 mdelay(longdelay_ms);
331 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 331 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
332 udelay(shortdelay_us); 332 udelay(shortdelay_us);
333 #ifdef CONFIG_PREEMPT 333 #ifdef CONFIG_PREEMPT
334 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) 334 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
335 preempt_schedule(); /* No QS if preempt_disable() in effect */ 335 preempt_schedule(); /* No QS if preempt_disable() in effect */
336 #endif 336 #endif
337 } 337 }
338 338
339 static void rcu_torture_read_unlock(int idx) __releases(RCU) 339 static void rcu_torture_read_unlock(int idx) __releases(RCU)
340 { 340 {
341 rcu_read_unlock(); 341 rcu_read_unlock();
342 } 342 }
343 343
344 static int rcu_torture_completed(void) 344 static int rcu_torture_completed(void)
345 { 345 {
346 return rcu_batches_completed(); 346 return rcu_batches_completed();
347 } 347 }
348 348
349 static void 349 static void
350 rcu_torture_cb(struct rcu_head *p) 350 rcu_torture_cb(struct rcu_head *p)
351 { 351 {
352 int i; 352 int i;
353 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 353 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
354 354
355 if (fullstop != FULLSTOP_DONTSTOP) { 355 if (fullstop != FULLSTOP_DONTSTOP) {
356 /* Test is ending, just drop callbacks on the floor. */ 356 /* Test is ending, just drop callbacks on the floor. */
357 /* The next initialization will pick up the pieces. */ 357 /* The next initialization will pick up the pieces. */
358 return; 358 return;
359 } 359 }
360 i = rp->rtort_pipe_count; 360 i = rp->rtort_pipe_count;
361 if (i > RCU_TORTURE_PIPE_LEN) 361 if (i > RCU_TORTURE_PIPE_LEN)
362 i = RCU_TORTURE_PIPE_LEN; 362 i = RCU_TORTURE_PIPE_LEN;
363 atomic_inc(&rcu_torture_wcount[i]); 363 atomic_inc(&rcu_torture_wcount[i]);
364 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 364 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
365 rp->rtort_mbtest = 0; 365 rp->rtort_mbtest = 0;
366 rcu_torture_free(rp); 366 rcu_torture_free(rp);
367 } else 367 } else
368 cur_ops->deferred_free(rp); 368 cur_ops->deferred_free(rp);
369 } 369 }
370 370
371 static int rcu_no_completed(void) 371 static int rcu_no_completed(void)
372 { 372 {
373 return 0; 373 return 0;
374 } 374 }
375 375
376 static void rcu_torture_deferred_free(struct rcu_torture *p) 376 static void rcu_torture_deferred_free(struct rcu_torture *p)
377 { 377 {
378 call_rcu(&p->rtort_rcu, rcu_torture_cb); 378 call_rcu(&p->rtort_rcu, rcu_torture_cb);
379 } 379 }
380 380
381 static struct rcu_torture_ops rcu_ops = { 381 static struct rcu_torture_ops rcu_ops = {
382 .init = NULL, 382 .init = NULL,
383 .cleanup = NULL, 383 .cleanup = NULL,
384 .readlock = rcu_torture_read_lock, 384 .readlock = rcu_torture_read_lock,
385 .read_delay = rcu_read_delay, 385 .read_delay = rcu_read_delay,
386 .readunlock = rcu_torture_read_unlock, 386 .readunlock = rcu_torture_read_unlock,
387 .completed = rcu_torture_completed, 387 .completed = rcu_torture_completed,
388 .deferred_free = rcu_torture_deferred_free, 388 .deferred_free = rcu_torture_deferred_free,
389 .sync = synchronize_rcu, 389 .sync = synchronize_rcu,
390 .cb_barrier = rcu_barrier, 390 .cb_barrier = rcu_barrier,
391 .fqs = rcu_force_quiescent_state, 391 .fqs = rcu_force_quiescent_state,
392 .stats = NULL, 392 .stats = NULL,
393 .irq_capable = 1, 393 .irq_capable = 1,
394 .can_boost = rcu_can_boost(), 394 .can_boost = rcu_can_boost(),
395 .name = "rcu" 395 .name = "rcu"
396 }; 396 };
397 397
398 static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 398 static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
399 { 399 {
400 int i; 400 int i;
401 struct rcu_torture *rp; 401 struct rcu_torture *rp;
402 struct rcu_torture *rp1; 402 struct rcu_torture *rp1;
403 403
404 cur_ops->sync(); 404 cur_ops->sync();
405 list_add(&p->rtort_free, &rcu_torture_removed); 405 list_add(&p->rtort_free, &rcu_torture_removed);
406 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { 406 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
407 i = rp->rtort_pipe_count; 407 i = rp->rtort_pipe_count;
408 if (i > RCU_TORTURE_PIPE_LEN) 408 if (i > RCU_TORTURE_PIPE_LEN)
409 i = RCU_TORTURE_PIPE_LEN; 409 i = RCU_TORTURE_PIPE_LEN;
410 atomic_inc(&rcu_torture_wcount[i]); 410 atomic_inc(&rcu_torture_wcount[i]);
411 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 411 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
412 rp->rtort_mbtest = 0; 412 rp->rtort_mbtest = 0;
413 list_del(&rp->rtort_free); 413 list_del(&rp->rtort_free);
414 rcu_torture_free(rp); 414 rcu_torture_free(rp);
415 } 415 }
416 } 416 }
417 } 417 }
418 418
419 static void rcu_sync_torture_init(void) 419 static void rcu_sync_torture_init(void)
420 { 420 {
421 INIT_LIST_HEAD(&rcu_torture_removed); 421 INIT_LIST_HEAD(&rcu_torture_removed);
422 } 422 }
423 423
424 static struct rcu_torture_ops rcu_sync_ops = { 424 static struct rcu_torture_ops rcu_sync_ops = {
425 .init = rcu_sync_torture_init, 425 .init = rcu_sync_torture_init,
426 .cleanup = NULL, 426 .cleanup = NULL,
427 .readlock = rcu_torture_read_lock, 427 .readlock = rcu_torture_read_lock,
428 .read_delay = rcu_read_delay, 428 .read_delay = rcu_read_delay,
429 .readunlock = rcu_torture_read_unlock, 429 .readunlock = rcu_torture_read_unlock,
430 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
431 .deferred_free = rcu_sync_torture_deferred_free, 431 .deferred_free = rcu_sync_torture_deferred_free,
432 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .cb_barrier = NULL, 433 .cb_barrier = NULL,
434 .fqs = rcu_force_quiescent_state, 434 .fqs = rcu_force_quiescent_state,
435 .stats = NULL, 435 .stats = NULL,
436 .irq_capable = 1, 436 .irq_capable = 1,
437 .can_boost = rcu_can_boost(), 437 .can_boost = rcu_can_boost(),
438 .name = "rcu_sync" 438 .name = "rcu_sync"
439 }; 439 };
440 440
441 static struct rcu_torture_ops rcu_expedited_ops = { 441 static struct rcu_torture_ops rcu_expedited_ops = {
442 .init = rcu_sync_torture_init, 442 .init = rcu_sync_torture_init,
443 .cleanup = NULL, 443 .cleanup = NULL,
444 .readlock = rcu_torture_read_lock, 444 .readlock = rcu_torture_read_lock,
445 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 445 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
446 .readunlock = rcu_torture_read_unlock, 446 .readunlock = rcu_torture_read_unlock,
447 .completed = rcu_no_completed, 447 .completed = rcu_no_completed,
448 .deferred_free = rcu_sync_torture_deferred_free, 448 .deferred_free = rcu_sync_torture_deferred_free,
449 .sync = synchronize_rcu_expedited, 449 .sync = synchronize_rcu_expedited,
450 .cb_barrier = NULL, 450 .cb_barrier = NULL,
451 .fqs = rcu_force_quiescent_state, 451 .fqs = rcu_force_quiescent_state,
452 .stats = NULL, 452 .stats = NULL,
453 .irq_capable = 1, 453 .irq_capable = 1,
454 .can_boost = rcu_can_boost(), 454 .can_boost = rcu_can_boost(),
455 .name = "rcu_expedited" 455 .name = "rcu_expedited"
456 }; 456 };
457 457
458 /* 458 /*
459 * Definitions for rcu_bh torture testing. 459 * Definitions for rcu_bh torture testing.
460 */ 460 */
461 461
462 static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) 462 static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
463 { 463 {
464 rcu_read_lock_bh(); 464 rcu_read_lock_bh();
465 return 0; 465 return 0;
466 } 466 }
467 467
468 static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) 468 static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
469 { 469 {
470 rcu_read_unlock_bh(); 470 rcu_read_unlock_bh();
471 } 471 }
472 472
473 static int rcu_bh_torture_completed(void) 473 static int rcu_bh_torture_completed(void)
474 { 474 {
475 return rcu_batches_completed_bh(); 475 return rcu_batches_completed_bh();
476 } 476 }
477 477
478 static void rcu_bh_torture_deferred_free(struct rcu_torture *p) 478 static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
479 { 479 {
480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
481 } 481 }
482 482
483 struct rcu_bh_torture_synchronize { 483 struct rcu_bh_torture_synchronize {
484 struct rcu_head head; 484 struct rcu_head head;
485 struct completion completion; 485 struct completion completion;
486 }; 486 };
487 487
488 static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) 488 static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489 { 489 {
490 struct rcu_bh_torture_synchronize *rcu; 490 struct rcu_bh_torture_synchronize *rcu;
491 491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head); 492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion); 493 complete(&rcu->completion);
494 } 494 }
495 495
496 static void rcu_bh_torture_synchronize(void) 496 static void rcu_bh_torture_synchronize(void)
497 { 497 {
498 struct rcu_bh_torture_synchronize rcu; 498 struct rcu_bh_torture_synchronize rcu;
499 499
500 init_rcu_head_on_stack(&rcu.head); 500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion); 501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion); 503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head); 504 destroy_rcu_head_on_stack(&rcu.head);
505 } 505 }
506 506
507 static struct rcu_torture_ops rcu_bh_ops = { 507 static struct rcu_torture_ops rcu_bh_ops = {
508 .init = NULL, 508 .init = NULL,
509 .cleanup = NULL, 509 .cleanup = NULL,
510 .readlock = rcu_bh_torture_read_lock, 510 .readlock = rcu_bh_torture_read_lock,
511 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 511 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
512 .readunlock = rcu_bh_torture_read_unlock, 512 .readunlock = rcu_bh_torture_read_unlock,
513 .completed = rcu_bh_torture_completed, 513 .completed = rcu_bh_torture_completed,
514 .deferred_free = rcu_bh_torture_deferred_free, 514 .deferred_free = rcu_bh_torture_deferred_free,
515 .sync = rcu_bh_torture_synchronize, 515 .sync = rcu_bh_torture_synchronize,
516 .cb_barrier = rcu_barrier_bh, 516 .cb_barrier = rcu_barrier_bh,
517 .fqs = rcu_bh_force_quiescent_state, 517 .fqs = rcu_bh_force_quiescent_state,
518 .stats = NULL, 518 .stats = NULL,
519 .irq_capable = 1, 519 .irq_capable = 1,
520 .name = "rcu_bh" 520 .name = "rcu_bh"
521 }; 521 };
522 522
523 static struct rcu_torture_ops rcu_bh_sync_ops = { 523 static struct rcu_torture_ops rcu_bh_sync_ops = {
524 .init = rcu_sync_torture_init, 524 .init = rcu_sync_torture_init,
525 .cleanup = NULL, 525 .cleanup = NULL,
526 .readlock = rcu_bh_torture_read_lock, 526 .readlock = rcu_bh_torture_read_lock,
527 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 527 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
528 .readunlock = rcu_bh_torture_read_unlock, 528 .readunlock = rcu_bh_torture_read_unlock,
529 .completed = rcu_bh_torture_completed, 529 .completed = rcu_bh_torture_completed,
530 .deferred_free = rcu_sync_torture_deferred_free, 530 .deferred_free = rcu_sync_torture_deferred_free,
531 .sync = rcu_bh_torture_synchronize, 531 .sync = rcu_bh_torture_synchronize,
532 .cb_barrier = NULL, 532 .cb_barrier = NULL,
533 .fqs = rcu_bh_force_quiescent_state, 533 .fqs = rcu_bh_force_quiescent_state,
534 .stats = NULL, 534 .stats = NULL,
535 .irq_capable = 1, 535 .irq_capable = 1,
536 .name = "rcu_bh_sync" 536 .name = "rcu_bh_sync"
537 }; 537 };
538 538
539 /* 539 /*
540 * Definitions for srcu torture testing. 540 * Definitions for srcu torture testing.
541 */ 541 */
542 542
543 static struct srcu_struct srcu_ctl; 543 static struct srcu_struct srcu_ctl;
544 544
545 static void srcu_torture_init(void) 545 static void srcu_torture_init(void)
546 { 546 {
547 init_srcu_struct(&srcu_ctl); 547 init_srcu_struct(&srcu_ctl);
548 rcu_sync_torture_init(); 548 rcu_sync_torture_init();
549 } 549 }
550 550
551 static void srcu_torture_cleanup(void) 551 static void srcu_torture_cleanup(void)
552 { 552 {
553 synchronize_srcu(&srcu_ctl); 553 synchronize_srcu(&srcu_ctl);
554 cleanup_srcu_struct(&srcu_ctl); 554 cleanup_srcu_struct(&srcu_ctl);
555 } 555 }
556 556
557 static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 557 static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
558 { 558 {
559 return srcu_read_lock(&srcu_ctl); 559 return srcu_read_lock(&srcu_ctl);
560 } 560 }
561 561
562 static void srcu_read_delay(struct rcu_random_state *rrsp) 562 static void srcu_read_delay(struct rcu_random_state *rrsp)
563 { 563 {
564 long delay; 564 long delay;
565 const long uspertick = 1000000 / HZ; 565 const long uspertick = 1000000 / HZ;
566 const long longdelay = 10; 566 const long longdelay = 10;
567 567
568 /* We want there to be long-running readers, but not all the time. */ 568 /* We want there to be long-running readers, but not all the time. */
569 569
570 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 570 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
571 if (!delay) 571 if (!delay)
572 schedule_timeout_interruptible(longdelay); 572 schedule_timeout_interruptible(longdelay);
573 else 573 else
574 rcu_read_delay(rrsp); 574 rcu_read_delay(rrsp);
575 } 575 }
576 576
577 static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 577 static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
578 { 578 {
579 srcu_read_unlock(&srcu_ctl, idx); 579 srcu_read_unlock(&srcu_ctl, idx);
580 } 580 }
581 581
582 static int srcu_torture_completed(void) 582 static int srcu_torture_completed(void)
583 { 583 {
584 return srcu_batches_completed(&srcu_ctl); 584 return srcu_batches_completed(&srcu_ctl);
585 } 585 }
586 586
587 static void srcu_torture_synchronize(void) 587 static void srcu_torture_synchronize(void)
588 { 588 {
589 synchronize_srcu(&srcu_ctl); 589 synchronize_srcu(&srcu_ctl);
590 } 590 }
591 591
592 static int srcu_torture_stats(char *page) 592 static int srcu_torture_stats(char *page)
593 { 593 {
594 int cnt = 0; 594 int cnt = 0;
595 int cpu; 595 int cpu;
596 int idx = srcu_ctl.completed & 0x1; 596 int idx = srcu_ctl.completed & 0x1;
597 597
598 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 598 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
599 torture_type, TORTURE_FLAG, idx); 599 torture_type, TORTURE_FLAG, idx);
600 for_each_possible_cpu(cpu) { 600 for_each_possible_cpu(cpu) {
601 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 601 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
602 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 602 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
603 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 603 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
604 } 604 }
605 cnt += sprintf(&page[cnt], "\n"); 605 cnt += sprintf(&page[cnt], "\n");
606 return cnt; 606 return cnt;
607 } 607 }
608 608
609 static struct rcu_torture_ops srcu_ops = { 609 static struct rcu_torture_ops srcu_ops = {
610 .init = srcu_torture_init, 610 .init = srcu_torture_init,
611 .cleanup = srcu_torture_cleanup, 611 .cleanup = srcu_torture_cleanup,
612 .readlock = srcu_torture_read_lock, 612 .readlock = srcu_torture_read_lock,
613 .read_delay = srcu_read_delay, 613 .read_delay = srcu_read_delay,
614 .readunlock = srcu_torture_read_unlock, 614 .readunlock = srcu_torture_read_unlock,
615 .completed = srcu_torture_completed, 615 .completed = srcu_torture_completed,
616 .deferred_free = rcu_sync_torture_deferred_free, 616 .deferred_free = rcu_sync_torture_deferred_free,
617 .sync = srcu_torture_synchronize, 617 .sync = srcu_torture_synchronize,
618 .cb_barrier = NULL, 618 .cb_barrier = NULL,
619 .stats = srcu_torture_stats, 619 .stats = srcu_torture_stats,
620 .name = "srcu" 620 .name = "srcu"
621 }; 621 };
622 622
623 static void srcu_torture_synchronize_expedited(void) 623 static void srcu_torture_synchronize_expedited(void)
624 { 624 {
625 synchronize_srcu_expedited(&srcu_ctl); 625 synchronize_srcu_expedited(&srcu_ctl);
626 } 626 }
627 627
628 static struct rcu_torture_ops srcu_expedited_ops = { 628 static struct rcu_torture_ops srcu_expedited_ops = {
629 .init = srcu_torture_init, 629 .init = srcu_torture_init,
630 .cleanup = srcu_torture_cleanup, 630 .cleanup = srcu_torture_cleanup,
631 .readlock = srcu_torture_read_lock, 631 .readlock = srcu_torture_read_lock,
632 .read_delay = srcu_read_delay, 632 .read_delay = srcu_read_delay,
633 .readunlock = srcu_torture_read_unlock, 633 .readunlock = srcu_torture_read_unlock,
634 .completed = srcu_torture_completed, 634 .completed = srcu_torture_completed,
635 .deferred_free = rcu_sync_torture_deferred_free, 635 .deferred_free = rcu_sync_torture_deferred_free,
636 .sync = srcu_torture_synchronize_expedited, 636 .sync = srcu_torture_synchronize_expedited,
637 .cb_barrier = NULL, 637 .cb_barrier = NULL,
638 .stats = srcu_torture_stats, 638 .stats = srcu_torture_stats,
639 .name = "srcu_expedited" 639 .name = "srcu_expedited"
640 }; 640 };
641 641
642 /* 642 /*
643 * Definitions for sched torture testing. 643 * Definitions for sched torture testing.
644 */ 644 */
645 645
646 static int sched_torture_read_lock(void) 646 static int sched_torture_read_lock(void)
647 { 647 {
648 preempt_disable(); 648 preempt_disable();
649 return 0; 649 return 0;
650 } 650 }
651 651
652 static void sched_torture_read_unlock(int idx) 652 static void sched_torture_read_unlock(int idx)
653 { 653 {
654 preempt_enable(); 654 preempt_enable();
655 } 655 }
656 656
657 static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 657 static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
658 { 658 {
659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
660 } 660 }
661 661
662 static void sched_torture_synchronize(void) 662 static void sched_torture_synchronize(void)
663 { 663 {
664 synchronize_sched(); 664 synchronize_sched();
665 } 665 }
666 666
667 static struct rcu_torture_ops sched_ops = { 667 static struct rcu_torture_ops sched_ops = {
668 .init = rcu_sync_torture_init, 668 .init = rcu_sync_torture_init,
669 .cleanup = NULL, 669 .cleanup = NULL,
670 .readlock = sched_torture_read_lock, 670 .readlock = sched_torture_read_lock,
671 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 671 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
672 .readunlock = sched_torture_read_unlock, 672 .readunlock = sched_torture_read_unlock,
673 .completed = rcu_no_completed, 673 .completed = rcu_no_completed,
674 .deferred_free = rcu_sched_torture_deferred_free, 674 .deferred_free = rcu_sched_torture_deferred_free,
675 .sync = sched_torture_synchronize, 675 .sync = sched_torture_synchronize,
676 .cb_barrier = rcu_barrier_sched, 676 .cb_barrier = rcu_barrier_sched,
677 .fqs = rcu_sched_force_quiescent_state, 677 .fqs = rcu_sched_force_quiescent_state,
678 .stats = NULL, 678 .stats = NULL,
679 .irq_capable = 1, 679 .irq_capable = 1,
680 .name = "sched" 680 .name = "sched"
681 }; 681 };
682 682
683 static struct rcu_torture_ops sched_sync_ops = { 683 static struct rcu_torture_ops sched_sync_ops = {
684 .init = rcu_sync_torture_init, 684 .init = rcu_sync_torture_init,
685 .cleanup = NULL, 685 .cleanup = NULL,
686 .readlock = sched_torture_read_lock, 686 .readlock = sched_torture_read_lock,
687 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 687 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
688 .readunlock = sched_torture_read_unlock, 688 .readunlock = sched_torture_read_unlock,
689 .completed = rcu_no_completed, 689 .completed = rcu_no_completed,
690 .deferred_free = rcu_sync_torture_deferred_free, 690 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = sched_torture_synchronize, 691 .sync = sched_torture_synchronize,
692 .cb_barrier = NULL, 692 .cb_barrier = NULL,
693 .fqs = rcu_sched_force_quiescent_state, 693 .fqs = rcu_sched_force_quiescent_state,
694 .stats = NULL, 694 .stats = NULL,
695 .name = "sched_sync" 695 .name = "sched_sync"
696 }; 696 };
697 697
698 static struct rcu_torture_ops sched_expedited_ops = { 698 static struct rcu_torture_ops sched_expedited_ops = {
699 .init = rcu_sync_torture_init, 699 .init = rcu_sync_torture_init,
700 .cleanup = NULL, 700 .cleanup = NULL,
701 .readlock = sched_torture_read_lock, 701 .readlock = sched_torture_read_lock,
702 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 702 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
703 .readunlock = sched_torture_read_unlock, 703 .readunlock = sched_torture_read_unlock,
704 .completed = rcu_no_completed, 704 .completed = rcu_no_completed,
705 .deferred_free = rcu_sync_torture_deferred_free, 705 .deferred_free = rcu_sync_torture_deferred_free,
706 .sync = synchronize_sched_expedited, 706 .sync = synchronize_sched_expedited,
707 .cb_barrier = NULL, 707 .cb_barrier = NULL,
708 .fqs = rcu_sched_force_quiescent_state, 708 .fqs = rcu_sched_force_quiescent_state,
709 .stats = NULL, 709 .stats = NULL,
710 .irq_capable = 1, 710 .irq_capable = 1,
711 .name = "sched_expedited" 711 .name = "sched_expedited"
712 }; 712 };
713 713
714 /* 714 /*
715 * RCU torture priority-boost testing. Runs one real-time thread per 715 * RCU torture priority-boost testing. Runs one real-time thread per
716 * CPU for moderate bursts, repeatedly registering RCU callbacks and 716 * CPU for moderate bursts, repeatedly registering RCU callbacks and
717 * spinning waiting for them to be invoked. If a given callback takes 717 * spinning waiting for them to be invoked. If a given callback takes
718 * too long to be invoked, we assume that priority inversion has occurred. 718 * too long to be invoked, we assume that priority inversion has occurred.
719 */ 719 */
720 720
721 struct rcu_boost_inflight { 721 struct rcu_boost_inflight {
722 struct rcu_head rcu; 722 struct rcu_head rcu;
723 int inflight; 723 int inflight;
724 }; 724 };
725 725
726 static void rcu_torture_boost_cb(struct rcu_head *head) 726 static void rcu_torture_boost_cb(struct rcu_head *head)
727 { 727 {
728 struct rcu_boost_inflight *rbip = 728 struct rcu_boost_inflight *rbip =
729 container_of(head, struct rcu_boost_inflight, rcu); 729 container_of(head, struct rcu_boost_inflight, rcu);
730 730
731 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ 731 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
732 rbip->inflight = 0; 732 rbip->inflight = 0;
733 } 733 }
734 734
735 static int rcu_torture_boost(void *arg) 735 static int rcu_torture_boost(void *arg)
736 { 736 {
737 unsigned long call_rcu_time; 737 unsigned long call_rcu_time;
738 unsigned long endtime; 738 unsigned long endtime;
739 unsigned long oldstarttime; 739 unsigned long oldstarttime;
740 struct rcu_boost_inflight rbi = { .inflight = 0 }; 740 struct rcu_boost_inflight rbi = { .inflight = 0 };
741 struct sched_param sp; 741 struct sched_param sp;
742 742
743 VERBOSE_PRINTK_STRING("rcu_torture_boost started"); 743 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
744 744
745 /* Set real-time priority. */ 745 /* Set real-time priority. */
746 sp.sched_priority = 1; 746 sp.sched_priority = 1;
747 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { 747 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
748 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); 748 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
749 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
750 } 750 }
751 751
752 init_rcu_head_on_stack(&rbi.rcu); 752 init_rcu_head_on_stack(&rbi.rcu);
753 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
754 do { 754 do {
755 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime; 756 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) { 757 while (jiffies - oldstarttime > ULONG_MAX / 2) {
758 schedule_timeout_uninterruptible(1); 758 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost"); 759 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() || 760 if (kthread_should_stop() ||
761 fullstop != FULLSTOP_DONTSTOP) 761 fullstop != FULLSTOP_DONTSTOP)
762 goto checkwait; 762 goto checkwait;
763 } 763 }
764 764
765 /* Do one boost-test interval. */ 765 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ; 766 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies; 767 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) { 768 while (jiffies - endtime > ULONG_MAX / 2) {
769 /* If we don't have a callback in flight, post one. */ 769 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) { 770 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */ 771 smp_mb(); /* RCU core before ->inflight = 1. */
772 rbi.inflight = 1; 772 rbi.inflight = 1;
773 call_rcu(&rbi.rcu, rcu_torture_boost_cb); 773 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
774 if (jiffies - call_rcu_time > 774 if (jiffies - call_rcu_time >
775 test_boost_duration * HZ - HZ / 2) { 775 test_boost_duration * HZ - HZ / 2) {
776 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); 776 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
777 n_rcu_torture_boost_failure++; 777 n_rcu_torture_boost_failure++;
778 } 778 }
779 call_rcu_time = jiffies; 779 call_rcu_time = jiffies;
780 } 780 }
781 cond_resched(); 781 cond_resched();
782 rcu_stutter_wait("rcu_torture_boost"); 782 rcu_stutter_wait("rcu_torture_boost");
783 if (kthread_should_stop() || 783 if (kthread_should_stop() ||
784 fullstop != FULLSTOP_DONTSTOP) 784 fullstop != FULLSTOP_DONTSTOP)
785 goto checkwait; 785 goto checkwait;
786 } 786 }
787 787
788 /* 788 /*
789 * Set the start time of the next test interval. 789 * Set the start time of the next test interval.
790 * Yes, this is vulnerable to long delays, but such 790 * Yes, this is vulnerable to long delays, but such
791 * delays simply cause a false negative for the next 791 * delays simply cause a false negative for the next
792 * interval. Besides, we are running at RT priority, 792 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare. 793 * so delays should be relatively rare.
794 */ 794 */
795 while (oldstarttime == boost_starttime) { 795 while (oldstarttime == boost_starttime) {
796 if (mutex_trylock(&boost_mutex)) { 796 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies + 797 boost_starttime = jiffies +
798 test_boost_interval * HZ; 798 test_boost_interval * HZ;
799 n_rcu_torture_boosts++; 799 n_rcu_torture_boosts++;
800 mutex_unlock(&boost_mutex); 800 mutex_unlock(&boost_mutex);
801 break; 801 break;
802 } 802 }
803 schedule_timeout_uninterruptible(1); 803 schedule_timeout_uninterruptible(1);
804 } 804 }
805 805
806 /* Go do the stutter. */ 806 /* Go do the stutter. */
807 checkwait: rcu_stutter_wait("rcu_torture_boost"); 807 checkwait: rcu_stutter_wait("rcu_torture_boost");
808 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 808 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
809 809
810 /* Clean up and exit. */ 810 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu); 812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
817 return 0; 817 return 0;
818 } 818 }
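The unsigned comparisons such as "while (jiffies - oldstarttime > ULONG_MAX / 2)" above are open-coded, wraparound-tolerant tests for "has that time not arrived yet?". A roughly equivalent formulation with the stock jiffies helper, shown only for clarity (the file deliberately open-codes the test):

	#include <linux/jiffies.h>

	/* True while 'when' still lies in the future, even across a
	 * jiffies wraparound -- roughly the open-coded comparison above. */
	static inline bool deadline_not_reached(unsigned long when)
	{
		return time_before(jiffies, when);
	}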
819 819
820 /* 820 /*
821 * RCU torture force-quiescent-state kthread. Repeatedly induces 821 * RCU torture force-quiescent-state kthread. Repeatedly induces
822 * bursts of calls to force_quiescent_state(), increasing the probability 822 * bursts of calls to force_quiescent_state(), increasing the probability
823 * of occurrence of some important types of race conditions. 823 * of occurrence of some important types of race conditions.
824 */ 824 */
825 static int 825 static int
826 rcu_torture_fqs(void *arg) 826 rcu_torture_fqs(void *arg)
827 { 827 {
828 unsigned long fqs_resume_time; 828 unsigned long fqs_resume_time;
829 int fqs_burst_remaining; 829 int fqs_burst_remaining;
830 830
831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
832 do { 832 do {
833 fqs_resume_time = jiffies + fqs_stutter * HZ; 833 fqs_resume_time = jiffies + fqs_stutter * HZ;
834 while (jiffies - fqs_resume_time > LONG_MAX) { 834 while (jiffies - fqs_resume_time > LONG_MAX) {
835 schedule_timeout_interruptible(1); 835 schedule_timeout_interruptible(1);
836 } 836 }
837 fqs_burst_remaining = fqs_duration; 837 fqs_burst_remaining = fqs_duration;
838 while (fqs_burst_remaining > 0) { 838 while (fqs_burst_remaining > 0) {
839 cur_ops->fqs(); 839 cur_ops->fqs();
840 udelay(fqs_holdoff); 840 udelay(fqs_holdoff);
841 fqs_burst_remaining -= fqs_holdoff; 841 fqs_burst_remaining -= fqs_holdoff;
842 } 842 }
843 rcu_stutter_wait("rcu_torture_fqs"); 843 rcu_stutter_wait("rcu_torture_fqs");
844 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 844 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
845 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); 845 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
846 rcutorture_shutdown_absorb("rcu_torture_fqs"); 846 rcutorture_shutdown_absorb("rcu_torture_fqs");
847 while (!kthread_should_stop()) 847 while (!kthread_should_stop())
848 schedule_timeout_uninterruptible(1); 848 schedule_timeout_uninterruptible(1);
849 return 0; 849 return 0;
850 } 850 }
851 851
852 /* 852 /*
853 * RCU torture writer kthread. Repeatedly substitutes a new structure 853 * RCU torture writer kthread. Repeatedly substitutes a new structure
854 * for that pointed to by rcu_torture_current, freeing the old structure 854 * for that pointed to by rcu_torture_current, freeing the old structure
855 * after a series of grace periods (the "pipeline"). 855 * after a series of grace periods (the "pipeline").
856 */ 856 */
857 static int 857 static int
858 rcu_torture_writer(void *arg) 858 rcu_torture_writer(void *arg)
859 { 859 {
860 int i; 860 int i;
861 long oldbatch = rcu_batches_completed(); 861 long oldbatch = rcu_batches_completed();
862 struct rcu_torture *rp; 862 struct rcu_torture *rp;
863 struct rcu_torture *old_rp; 863 struct rcu_torture *old_rp;
864 static DEFINE_RCU_RANDOM(rand); 864 static DEFINE_RCU_RANDOM(rand);
865 865
866 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 866 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
867 set_user_nice(current, 19); 867 set_user_nice(current, 19);
868 868
869 do { 869 do {
870 schedule_timeout_uninterruptible(1); 870 schedule_timeout_uninterruptible(1);
871 rp = rcu_torture_alloc(); 871 rp = rcu_torture_alloc();
872 if (rp == NULL) 872 if (rp == NULL)
873 continue; 873 continue;
874 rp->rtort_pipe_count = 0; 874 rp->rtort_pipe_count = 0;
875 udelay(rcu_random(&rand) & 0x3ff); 875 udelay(rcu_random(&rand) & 0x3ff);
876 old_rp = rcu_dereference_check(rcu_torture_current, 876 old_rp = rcu_dereference_check(rcu_torture_current,
877 current == writer_task); 877 current == writer_task);
878 rp->rtort_mbtest = 1; 878 rp->rtort_mbtest = 1;
879 rcu_assign_pointer(rcu_torture_current, rp); 879 rcu_assign_pointer(rcu_torture_current, rp);
880 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 880 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
881 if (old_rp) { 881 if (old_rp) {
882 i = old_rp->rtort_pipe_count; 882 i = old_rp->rtort_pipe_count;
883 if (i > RCU_TORTURE_PIPE_LEN) 883 if (i > RCU_TORTURE_PIPE_LEN)
884 i = RCU_TORTURE_PIPE_LEN; 884 i = RCU_TORTURE_PIPE_LEN;
885 atomic_inc(&rcu_torture_wcount[i]); 885 atomic_inc(&rcu_torture_wcount[i]);
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcutorture_record_progress(++rcu_torture_current_version); 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
893 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 893 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
894 rcutorture_shutdown_absorb("rcu_torture_writer"); 894 rcutorture_shutdown_absorb("rcu_torture_writer");
895 while (!kthread_should_stop()) 895 while (!kthread_should_stop())
896 schedule_timeout_uninterruptible(1); 896 schedule_timeout_uninterruptible(1);
897 return 0; 897 return 0;
898 } 898 }
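The writer exercises the standard RCU publish-then-defer-free sequence: allocate and fill the new element, rcu_assign_pointer() it into place, then hand the displaced element to cur_ops->deferred_free() (typically a call_rcu() wrapper) so readers still holding it can drain. Stripped of the torture bookkeeping, the pattern looks roughly like the following generic sketch (my_data, my_head, my_update and friends are made-up names, not symbols from this file):

	struct my_data {
		struct rcu_head rcu;
		int val;
	};

	static struct my_data __rcu *my_head;

	static void my_free_cb(struct rcu_head *head)
	{
		kfree(container_of(head, struct my_data, rcu));
	}

	/* Caller is assumed to be the sole updater. */
	static void my_update(struct my_data *newp)
	{
		struct my_data *oldp;

		oldp = rcu_dereference_protected(my_head, 1);
		rcu_assign_pointer(my_head, newp);	  /* publish the new element */
		if (oldp)
			call_rcu(&oldp->rcu, my_free_cb); /* free after a grace period */
	}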
899 899
900 /* 900 /*
901 * RCU torture fake writer kthread. Repeatedly calls sync, with a random 901 * RCU torture fake writer kthread. Repeatedly calls sync, with a random
902 * delay between calls. 902 * delay between calls.
903 */ 903 */
904 static int 904 static int
905 rcu_torture_fakewriter(void *arg) 905 rcu_torture_fakewriter(void *arg)
906 { 906 {
907 DEFINE_RCU_RANDOM(rand); 907 DEFINE_RCU_RANDOM(rand);
908 908
909 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 909 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
910 set_user_nice(current, 19); 910 set_user_nice(current, 19);
911 911
912 do { 912 do {
913 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 913 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
914 udelay(rcu_random(&rand) & 0x3ff); 914 udelay(rcu_random(&rand) & 0x3ff);
915 cur_ops->sync(); 915 cur_ops->sync();
916 rcu_stutter_wait("rcu_torture_fakewriter"); 916 rcu_stutter_wait("rcu_torture_fakewriter");
917 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 917 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
918 918
919 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 919 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
920 rcutorture_shutdown_absorb("rcu_torture_fakewriter"); 920 rcutorture_shutdown_absorb("rcu_torture_fakewriter");
921 while (!kthread_should_stop()) 921 while (!kthread_should_stop())
922 schedule_timeout_uninterruptible(1); 922 schedule_timeout_uninterruptible(1);
923 return 0; 923 return 0;
924 } 924 }
925 925
926 /* 926 /*
927 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 927 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
928 * incrementing the corresponding element of the pipeline array. The 928 * incrementing the corresponding element of the pipeline array. The
929 * counter in the element should never be greater than 1, otherwise, the 929 * counter in the element should never be greater than 1, otherwise, the
930 * RCU implementation is broken. 930 * RCU implementation is broken.
931 */ 931 */
932 static void rcu_torture_timer(unsigned long unused) 932 static void rcu_torture_timer(unsigned long unused)
933 { 933 {
934 int idx; 934 int idx;
935 int completed; 935 int completed;
936 static DEFINE_RCU_RANDOM(rand); 936 static DEFINE_RCU_RANDOM(rand);
937 static DEFINE_SPINLOCK(rand_lock); 937 static DEFINE_SPINLOCK(rand_lock);
938 struct rcu_torture *p; 938 struct rcu_torture *p;
939 int pipe_count; 939 int pipe_count;
940 940
941 idx = cur_ops->readlock(); 941 idx = cur_ops->readlock();
942 completed = cur_ops->completed(); 942 completed = cur_ops->completed();
943 p = rcu_dereference_check(rcu_torture_current, 943 p = rcu_dereference_check(rcu_torture_current,
944 rcu_read_lock_held() ||
945 rcu_read_lock_bh_held() || 944 rcu_read_lock_bh_held() ||
946 rcu_read_lock_sched_held() || 945 rcu_read_lock_sched_held() ||
947 srcu_read_lock_held(&srcu_ctl)); 946 srcu_read_lock_held(&srcu_ctl));
948 if (p == NULL) { 947 if (p == NULL) {
949 /* Leave because rcu_torture_writer is not yet underway */ 948 /* Leave because rcu_torture_writer is not yet underway */
950 cur_ops->readunlock(idx); 949 cur_ops->readunlock(idx);
951 return; 950 return;
952 } 951 }
953 if (p->rtort_mbtest == 0) 952 if (p->rtort_mbtest == 0)
954 atomic_inc(&n_rcu_torture_mberror); 953 atomic_inc(&n_rcu_torture_mberror);
955 spin_lock(&rand_lock); 954 spin_lock(&rand_lock);
956 cur_ops->read_delay(&rand); 955 cur_ops->read_delay(&rand);
957 n_rcu_torture_timers++; 956 n_rcu_torture_timers++;
958 spin_unlock(&rand_lock); 957 spin_unlock(&rand_lock);
959 preempt_disable(); 958 preempt_disable();
960 pipe_count = p->rtort_pipe_count; 959 pipe_count = p->rtort_pipe_count;
961 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 960 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
962 /* Should not happen, but... */ 961 /* Should not happen, but... */
963 pipe_count = RCU_TORTURE_PIPE_LEN; 962 pipe_count = RCU_TORTURE_PIPE_LEN;
964 } 963 }
965 __this_cpu_inc(rcu_torture_count[pipe_count]); 964 __this_cpu_inc(rcu_torture_count[pipe_count]);
966 completed = cur_ops->completed() - completed; 965 completed = cur_ops->completed() - completed;
967 if (completed > RCU_TORTURE_PIPE_LEN) { 966 if (completed > RCU_TORTURE_PIPE_LEN) {
968 /* Should not happen, but... */ 967 /* Should not happen, but... */
969 completed = RCU_TORTURE_PIPE_LEN; 968 completed = RCU_TORTURE_PIPE_LEN;
970 } 969 }
971 __this_cpu_inc(rcu_torture_batch[completed]); 970 __this_cpu_inc(rcu_torture_batch[completed]);
972 preempt_enable(); 971 preempt_enable();
973 cur_ops->readunlock(idx); 972 cur_ops->readunlock(idx);
974 } 973 }
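This hunk is the point of the commit: since ca5ecddf, rcu_dereference_check() folds rcu_read_lock_held() into its lockdep condition itself, so repeating it in the caller was redundant and only the additional contexts (the bh, sched and SRCU read sides here) still need listing. In isolation the transformation looks like this (gp and my_lock are made-up names used purely for illustration):

	/* Before (redundant term):
	 *	p = rcu_dereference_check(gp,
	 *				  rcu_read_lock_held() ||
	 *				  lockdep_is_held(&my_lock));
	 *
	 * After:
	 *	p = rcu_dereference_check(gp,
	 *				  lockdep_is_held(&my_lock));
	 */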
975 974
976 /* 975 /*
977 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, 976 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
978 * incrementing the corresponding element of the pipeline array. The 977 * incrementing the corresponding element of the pipeline array. The
979 * counter in the element should never be greater than 1, otherwise, the 978 * counter in the element should never be greater than 1, otherwise, the
980 * RCU implementation is broken. 979 * RCU implementation is broken.
981 */ 980 */
982 static int 981 static int
983 rcu_torture_reader(void *arg) 982 rcu_torture_reader(void *arg)
984 { 983 {
985 int completed; 984 int completed;
986 int idx; 985 int idx;
987 DEFINE_RCU_RANDOM(rand); 986 DEFINE_RCU_RANDOM(rand);
988 struct rcu_torture *p; 987 struct rcu_torture *p;
989 int pipe_count; 988 int pipe_count;
990 struct timer_list t; 989 struct timer_list t;
991 990
992 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 991 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
993 set_user_nice(current, 19); 992 set_user_nice(current, 19);
994 if (irqreader && cur_ops->irq_capable) 993 if (irqreader && cur_ops->irq_capable)
995 setup_timer_on_stack(&t, rcu_torture_timer, 0); 994 setup_timer_on_stack(&t, rcu_torture_timer, 0);
996 995
997 do { 996 do {
998 if (irqreader && cur_ops->irq_capable) { 997 if (irqreader && cur_ops->irq_capable) {
999 if (!timer_pending(&t)) 998 if (!timer_pending(&t))
1000 mod_timer(&t, jiffies + 1); 999 mod_timer(&t, jiffies + 1);
1001 } 1000 }
1002 idx = cur_ops->readlock(); 1001 idx = cur_ops->readlock();
1003 completed = cur_ops->completed(); 1002 completed = cur_ops->completed();
1004 p = rcu_dereference_check(rcu_torture_current, 1003 p = rcu_dereference_check(rcu_torture_current,
1005 rcu_read_lock_held() ||
1006 rcu_read_lock_bh_held() || 1004 rcu_read_lock_bh_held() ||
1007 rcu_read_lock_sched_held() || 1005 rcu_read_lock_sched_held() ||
1008 srcu_read_lock_held(&srcu_ctl)); 1006 srcu_read_lock_held(&srcu_ctl));
1009 if (p == NULL) { 1007 if (p == NULL) {
1010 /* Wait for rcu_torture_writer to get underway */ 1008 /* Wait for rcu_torture_writer to get underway */
1011 cur_ops->readunlock(idx); 1009 cur_ops->readunlock(idx);
1012 schedule_timeout_interruptible(HZ); 1010 schedule_timeout_interruptible(HZ);
1013 continue; 1011 continue;
1014 } 1012 }
1015 if (p->rtort_mbtest == 0) 1013 if (p->rtort_mbtest == 0)
1016 atomic_inc(&n_rcu_torture_mberror); 1014 atomic_inc(&n_rcu_torture_mberror);
1017 cur_ops->read_delay(&rand); 1015 cur_ops->read_delay(&rand);
1018 preempt_disable(); 1016 preempt_disable();
1019 pipe_count = p->rtort_pipe_count; 1017 pipe_count = p->rtort_pipe_count;
1020 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 1018 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
1021 /* Should not happen, but... */ 1019 /* Should not happen, but... */
1022 pipe_count = RCU_TORTURE_PIPE_LEN; 1020 pipe_count = RCU_TORTURE_PIPE_LEN;
1023 } 1021 }
1024 __this_cpu_inc(rcu_torture_count[pipe_count]); 1022 __this_cpu_inc(rcu_torture_count[pipe_count]);
1025 completed = cur_ops->completed() - completed; 1023 completed = cur_ops->completed() - completed;
1026 if (completed > RCU_TORTURE_PIPE_LEN) { 1024 if (completed > RCU_TORTURE_PIPE_LEN) {
1027 /* Should not happen, but... */ 1025 /* Should not happen, but... */
1028 completed = RCU_TORTURE_PIPE_LEN; 1026 completed = RCU_TORTURE_PIPE_LEN;
1029 } 1027 }
1030 __this_cpu_inc(rcu_torture_batch[completed]); 1028 __this_cpu_inc(rcu_torture_batch[completed]);
1031 preempt_enable(); 1029 preempt_enable();
1032 cur_ops->readunlock(idx); 1030 cur_ops->readunlock(idx);
1033 schedule(); 1031 schedule();
1034 rcu_stutter_wait("rcu_torture_reader"); 1032 rcu_stutter_wait("rcu_torture_reader");
1035 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1033 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1036 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 1034 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
1037 rcutorture_shutdown_absorb("rcu_torture_reader"); 1035 rcutorture_shutdown_absorb("rcu_torture_reader");
1038 if (irqreader && cur_ops->irq_capable) 1036 if (irqreader && cur_ops->irq_capable)
1039 del_timer_sync(&t); 1037 del_timer_sync(&t);
1040 while (!kthread_should_stop()) 1038 while (!kthread_should_stop())
1041 schedule_timeout_uninterruptible(1); 1039 schedule_timeout_uninterruptible(1);
1042 return 0; 1040 return 0;
1043 } 1041 }
1044 1042
1045 /* 1043 /*
1046 * Create an RCU-torture statistics message in the specified buffer. 1044 * Create an RCU-torture statistics message in the specified buffer.
1047 */ 1045 */
1048 static int 1046 static int
1049 rcu_torture_printk(char *page) 1047 rcu_torture_printk(char *page)
1050 { 1048 {
1051 int cnt = 0; 1049 int cnt = 0;
1052 int cpu; 1050 int cpu;
1053 int i; 1051 int i;
1054 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1052 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1055 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1053 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1056 1054
1057 for_each_possible_cpu(cpu) { 1055 for_each_possible_cpu(cpu) {
1058 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1056 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1059 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 1057 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
1060 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 1058 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
1061 } 1059 }
1062 } 1060 }
1063 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { 1061 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
1064 if (pipesummary[i] != 0) 1062 if (pipesummary[i] != 0)
1065 break; 1063 break;
1066 } 1064 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1065 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1066 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld " 1068 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1069 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1070 rcu_torture_current,
1073 rcu_torture_current_version, 1071 rcu_torture_current_version,
1074 list_empty(&rcu_torture_freelist), 1072 list_empty(&rcu_torture_freelist),
1075 atomic_read(&n_rcu_torture_alloc), 1073 atomic_read(&n_rcu_torture_alloc),
1076 atomic_read(&n_rcu_torture_alloc_fail), 1074 atomic_read(&n_rcu_torture_alloc_fail),
1077 atomic_read(&n_rcu_torture_free), 1075 atomic_read(&n_rcu_torture_free),
1078 atomic_read(&n_rcu_torture_mberror), 1076 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1077 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1078 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_failure, 1079 n_rcu_torture_boost_failure,
1082 n_rcu_torture_boosts, 1080 n_rcu_torture_boosts,
1083 n_rcu_torture_timers); 1081 n_rcu_torture_timers);
1084 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1082 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1085 n_rcu_torture_boost_ktrerror != 0 || 1083 n_rcu_torture_boost_ktrerror != 0 ||
1086 n_rcu_torture_boost_rterror != 0 || 1084 n_rcu_torture_boost_rterror != 0 ||
1087 n_rcu_torture_boost_failure != 0) 1085 n_rcu_torture_boost_failure != 0)
1088 cnt += sprintf(&page[cnt], " !!!"); 1086 cnt += sprintf(&page[cnt], " !!!");
1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1087 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1090 if (i > 1) { 1088 if (i > 1) {
1091 cnt += sprintf(&page[cnt], "!!! "); 1089 cnt += sprintf(&page[cnt], "!!! ");
1092 atomic_inc(&n_rcu_torture_error); 1090 atomic_inc(&n_rcu_torture_error);
1093 WARN_ON_ONCE(1); 1091 WARN_ON_ONCE(1);
1094 } 1092 }
1095 cnt += sprintf(&page[cnt], "Reader Pipe: "); 1093 cnt += sprintf(&page[cnt], "Reader Pipe: ");
1096 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1094 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1097 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 1095 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
1098 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1096 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1099 cnt += sprintf(&page[cnt], "Reader Batch: "); 1097 cnt += sprintf(&page[cnt], "Reader Batch: ");
1100 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1098 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1101 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 1099 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
1102 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1100 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1103 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 1101 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
1104 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1102 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1105 cnt += sprintf(&page[cnt], " %d", 1103 cnt += sprintf(&page[cnt], " %d",
1106 atomic_read(&rcu_torture_wcount[i])); 1104 atomic_read(&rcu_torture_wcount[i]));
1107 } 1105 }
1108 cnt += sprintf(&page[cnt], "\n"); 1106 cnt += sprintf(&page[cnt], "\n");
1109 if (cur_ops->stats) 1107 if (cur_ops->stats)
1110 cnt += cur_ops->stats(&page[cnt]); 1108 cnt += cur_ops->stats(&page[cnt]);
1111 return cnt; 1109 return cnt;
1112 } 1110 }
1113 1111
1114 /* 1112 /*
1115 * Print torture statistics. Caller must ensure that there is only 1113 * Print torture statistics. Caller must ensure that there is only
1116 * one call to this function at a given time!!! This is normally 1114 * one call to this function at a given time!!! This is normally
1117 * accomplished by relying on the module system to only have one copy 1115 * accomplished by relying on the module system to only have one copy
1118 * of the module loaded, and then by giving the rcu_torture_stats 1116 * of the module loaded, and then by giving the rcu_torture_stats
1119 * kthread full control (or the init/cleanup functions when rcu_torture_stats 1117 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1120 * thread is not running). 1118 * thread is not running).
1121 */ 1119 */
1122 static void 1120 static void
1123 rcu_torture_stats_print(void) 1121 rcu_torture_stats_print(void)
1124 { 1122 {
1125 int cnt; 1123 int cnt;
1126 1124
1127 cnt = rcu_torture_printk(printk_buf); 1125 cnt = rcu_torture_printk(printk_buf);
1128 printk(KERN_ALERT "%s", printk_buf); 1126 printk(KERN_ALERT "%s", printk_buf);
1129 } 1127 }
1130 1128
1131 /* 1129 /*
1132 * Periodically prints torture statistics, if periodic statistics printing 1130 * Periodically prints torture statistics, if periodic statistics printing
1133 * was specified via the stat_interval module parameter. 1131 * was specified via the stat_interval module parameter.
1134 * 1132 *
1135 * No need to worry about fullstop here, since this one doesn't reference 1133 * No need to worry about fullstop here, since this one doesn't reference
1136 * volatile state or register callbacks. 1134 * volatile state or register callbacks.
1137 */ 1135 */
1138 static int 1136 static int
1139 rcu_torture_stats(void *arg) 1137 rcu_torture_stats(void *arg)
1140 { 1138 {
1141 VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); 1139 VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
1142 do { 1140 do {
1143 schedule_timeout_interruptible(stat_interval * HZ); 1141 schedule_timeout_interruptible(stat_interval * HZ);
1144 rcu_torture_stats_print(); 1142 rcu_torture_stats_print();
1145 rcutorture_shutdown_absorb("rcu_torture_stats"); 1143 rcutorture_shutdown_absorb("rcu_torture_stats");
1146 } while (!kthread_should_stop()); 1144 } while (!kthread_should_stop());
1147 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 1145 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
1148 return 0; 1146 return 0;
1149 } 1147 }
1150 1148
1151 static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ 1149 static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
1152 1150
1153 /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case 1151 /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
1154 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. 1152 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
1155 */ 1153 */
1156 static void rcu_torture_shuffle_tasks(void) 1154 static void rcu_torture_shuffle_tasks(void)
1157 { 1155 {
1158 int i; 1156 int i;
1159 1157
1160 cpumask_setall(shuffle_tmp_mask); 1158 cpumask_setall(shuffle_tmp_mask);
1161 get_online_cpus(); 1159 get_online_cpus();
1162 1160
1163 /* No point in shuffling if there is only one online CPU (ex: UP) */ 1161 /* No point in shuffling if there is only one online CPU (ex: UP) */
1164 if (num_online_cpus() == 1) { 1162 if (num_online_cpus() == 1) {
1165 put_online_cpus(); 1163 put_online_cpus();
1166 return; 1164 return;
1167 } 1165 }
1168 1166
1169 if (rcu_idle_cpu != -1) 1167 if (rcu_idle_cpu != -1)
1170 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); 1168 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
1171 1169
1172 set_cpus_allowed_ptr(current, shuffle_tmp_mask); 1170 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
1173 1171
1174 if (reader_tasks) { 1172 if (reader_tasks) {
1175 for (i = 0; i < nrealreaders; i++) 1173 for (i = 0; i < nrealreaders; i++)
1176 if (reader_tasks[i]) 1174 if (reader_tasks[i])
1177 set_cpus_allowed_ptr(reader_tasks[i], 1175 set_cpus_allowed_ptr(reader_tasks[i],
1178 shuffle_tmp_mask); 1176 shuffle_tmp_mask);
1179 } 1177 }
1180 1178
1181 if (fakewriter_tasks) { 1179 if (fakewriter_tasks) {
1182 for (i = 0; i < nfakewriters; i++) 1180 for (i = 0; i < nfakewriters; i++)
1183 if (fakewriter_tasks[i]) 1181 if (fakewriter_tasks[i])
1184 set_cpus_allowed_ptr(fakewriter_tasks[i], 1182 set_cpus_allowed_ptr(fakewriter_tasks[i],
1185 shuffle_tmp_mask); 1183 shuffle_tmp_mask);
1186 } 1184 }
1187 1185
1188 if (writer_task) 1186 if (writer_task)
1189 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); 1187 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1190 1188
1191 if (stats_task) 1189 if (stats_task)
1192 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); 1190 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1193 1191
1194 if (rcu_idle_cpu == -1) 1192 if (rcu_idle_cpu == -1)
1195 rcu_idle_cpu = num_online_cpus() - 1; 1193 rcu_idle_cpu = num_online_cpus() - 1;
1196 else 1194 else
1197 rcu_idle_cpu--; 1195 rcu_idle_cpu--;
1198 1196
1199 put_online_cpus(); 1197 put_online_cpus();
1200 } 1198 }
1201 1199
1202 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 1200 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
1203 * system to become idle at a time and cut off its timer ticks. This is meant 1201 * system to become idle at a time and cut off its timer ticks. This is meant
1204 * to test the support for such tickless idle CPU in RCU. 1202 * to test the support for such tickless idle CPU in RCU.
1205 */ 1203 */
1206 static int 1204 static int
1207 rcu_torture_shuffle(void *arg) 1205 rcu_torture_shuffle(void *arg)
1208 { 1206 {
1209 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); 1207 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
1210 do { 1208 do {
1211 schedule_timeout_interruptible(shuffle_interval * HZ); 1209 schedule_timeout_interruptible(shuffle_interval * HZ);
1212 rcu_torture_shuffle_tasks(); 1210 rcu_torture_shuffle_tasks();
1213 rcutorture_shutdown_absorb("rcu_torture_shuffle"); 1211 rcutorture_shutdown_absorb("rcu_torture_shuffle");
1214 } while (!kthread_should_stop()); 1212 } while (!kthread_should_stop());
1215 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 1213 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
1216 return 0; 1214 return 0;
1217 } 1215 }
1218 1216
1219 /* Cause the rcutorture test to "stutter", starting and stopping all 1217 /* Cause the rcutorture test to "stutter", starting and stopping all
1220 * threads periodically. 1218 * threads periodically.
1221 */ 1219 */
1222 static int 1220 static int
1223 rcu_torture_stutter(void *arg) 1221 rcu_torture_stutter(void *arg)
1224 { 1222 {
1225 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); 1223 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
1226 do { 1224 do {
1227 schedule_timeout_interruptible(stutter * HZ); 1225 schedule_timeout_interruptible(stutter * HZ);
1228 stutter_pause_test = 1; 1226 stutter_pause_test = 1;
1229 if (!kthread_should_stop()) 1227 if (!kthread_should_stop())
1230 schedule_timeout_interruptible(stutter * HZ); 1228 schedule_timeout_interruptible(stutter * HZ);
1231 stutter_pause_test = 0; 1229 stutter_pause_test = 0;
1232 rcutorture_shutdown_absorb("rcu_torture_stutter"); 1230 rcutorture_shutdown_absorb("rcu_torture_stutter");
1233 } while (!kthread_should_stop()); 1231 } while (!kthread_should_stop());
1234 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 1232 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
1235 return 0; 1233 return 0;
1236 } 1234 }
1237 1235
1238 static inline void 1236 static inline void
1239 rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1237 rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1240 { 1238 {
1241 printk(KERN_ALERT "%s" TORTURE_FLAG 1239 printk(KERN_ALERT "%s" TORTURE_FLAG
1242 "--- %s: nreaders=%d nfakewriters=%d " 1240 "--- %s: nreaders=%d nfakewriters=%d "
1243 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1241 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1244 "shuffle_interval=%d stutter=%d irqreader=%d " 1242 "shuffle_interval=%d stutter=%d irqreader=%d "
1245 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1243 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1246 "test_boost=%d/%d test_boost_interval=%d " 1244 "test_boost=%d/%d test_boost_interval=%d "
1247 "test_boost_duration=%d\n", 1245 "test_boost_duration=%d\n",
1248 torture_type, tag, nrealreaders, nfakewriters, 1246 torture_type, tag, nrealreaders, nfakewriters,
1249 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1247 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1250 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1248 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1251 test_boost, cur_ops->can_boost, 1249 test_boost, cur_ops->can_boost,
1252 test_boost_interval, test_boost_duration); 1250 test_boost_interval, test_boost_duration);
1253 } 1251 }
1254 1252
1255 static struct notifier_block rcutorture_shutdown_nb = { 1253 static struct notifier_block rcutorture_shutdown_nb = {
1256 .notifier_call = rcutorture_shutdown_notify, 1254 .notifier_call = rcutorture_shutdown_notify,
1257 }; 1255 };
1258 1256
1259 static void rcutorture_booster_cleanup(int cpu) 1257 static void rcutorture_booster_cleanup(int cpu)
1260 { 1258 {
1261 struct task_struct *t; 1259 struct task_struct *t;
1262 1260
1263 if (boost_tasks[cpu] == NULL) 1261 if (boost_tasks[cpu] == NULL)
1264 return; 1262 return;
1265 mutex_lock(&boost_mutex); 1263 mutex_lock(&boost_mutex);
1266 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); 1264 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1267 t = boost_tasks[cpu]; 1265 t = boost_tasks[cpu];
1268 boost_tasks[cpu] = NULL; 1266 boost_tasks[cpu] = NULL;
1269 mutex_unlock(&boost_mutex); 1267 mutex_unlock(&boost_mutex);
1270 1268
1271 /* This must be outside of the mutex, otherwise deadlock! */ 1269 /* This must be outside of the mutex, otherwise deadlock! */
1272 kthread_stop(t); 1270 kthread_stop(t);
1273 } 1271 }
1274 1272
1275 static int rcutorture_booster_init(int cpu) 1273 static int rcutorture_booster_init(int cpu)
1276 { 1274 {
1277 int retval; 1275 int retval;
1278 1276
1279 if (boost_tasks[cpu] != NULL) 1277 if (boost_tasks[cpu] != NULL)
1280 return 0; /* Already created, nothing more to do. */ 1278 return 0; /* Already created, nothing more to do. */
1281 1279
1282 /* Don't allow time recalculation while creating a new task. */ 1280 /* Don't allow time recalculation while creating a new task. */
1283 mutex_lock(&boost_mutex); 1281 mutex_lock(&boost_mutex);
1284 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1285 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, 1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1286 "rcu_torture_boost"); 1284 "rcu_torture_boost");
1287 if (IS_ERR(boost_tasks[cpu])) { 1285 if (IS_ERR(boost_tasks[cpu])) {
1288 retval = PTR_ERR(boost_tasks[cpu]); 1286 retval = PTR_ERR(boost_tasks[cpu]);
1289 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1290 n_rcu_torture_boost_ktrerror++; 1288 n_rcu_torture_boost_ktrerror++;
1291 boost_tasks[cpu] = NULL; 1289 boost_tasks[cpu] = NULL;
1292 mutex_unlock(&boost_mutex); 1290 mutex_unlock(&boost_mutex);
1293 return retval; 1291 return retval;
1294 } 1292 }
1295 kthread_bind(boost_tasks[cpu], cpu); 1293 kthread_bind(boost_tasks[cpu], cpu);
1296 wake_up_process(boost_tasks[cpu]); 1294 wake_up_process(boost_tasks[cpu]);
1297 mutex_unlock(&boost_mutex); 1295 mutex_unlock(&boost_mutex);
1298 return 0; 1296 return 0;
1299 } 1297 }
1300 1298
1301 static int rcutorture_cpu_notify(struct notifier_block *self, 1299 static int rcutorture_cpu_notify(struct notifier_block *self,
1302 unsigned long action, void *hcpu) 1300 unsigned long action, void *hcpu)
1303 { 1301 {
1304 long cpu = (long)hcpu; 1302 long cpu = (long)hcpu;
1305 1303
1306 switch (action) { 1304 switch (action) {
1307 case CPU_ONLINE: 1305 case CPU_ONLINE:
1308 case CPU_DOWN_FAILED: 1306 case CPU_DOWN_FAILED:
1309 (void)rcutorture_booster_init(cpu); 1307 (void)rcutorture_booster_init(cpu);
1310 break; 1308 break;
1311 case CPU_DOWN_PREPARE: 1309 case CPU_DOWN_PREPARE:
1312 rcutorture_booster_cleanup(cpu); 1310 rcutorture_booster_cleanup(cpu);
1313 break; 1311 break;
1314 default: 1312 default:
1315 break; 1313 break;
1316 } 1314 }
1317 return NOTIFY_OK; 1315 return NOTIFY_OK;
1318 } 1316 }
1319 1317
1320 static struct notifier_block rcutorture_cpu_nb = { 1318 static struct notifier_block rcutorture_cpu_nb = {
1321 .notifier_call = rcutorture_cpu_notify, 1319 .notifier_call = rcutorture_cpu_notify,
1322 }; 1320 };
1323 1321
1324 static void 1322 static void
1325 rcu_torture_cleanup(void) 1323 rcu_torture_cleanup(void)
1326 { 1324 {
1327 int i; 1325 int i;
1328 1326
1329 mutex_lock(&fullstop_mutex); 1327 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition(); 1328 rcutorture_record_test_transition();
1331 if (fullstop == FULLSTOP_SHUTDOWN) { 1329 if (fullstop == FULLSTOP_SHUTDOWN) {
1332 printk(KERN_WARNING /* but going down anyway, so... */ 1330 printk(KERN_WARNING /* but going down anyway, so... */
1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1331 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1334 mutex_unlock(&fullstop_mutex); 1332 mutex_unlock(&fullstop_mutex);
1335 schedule_timeout_uninterruptible(10); 1333 schedule_timeout_uninterruptible(10);
1336 if (cur_ops->cb_barrier != NULL) 1334 if (cur_ops->cb_barrier != NULL)
1337 cur_ops->cb_barrier(); 1335 cur_ops->cb_barrier();
1338 return; 1336 return;
1339 } 1337 }
1340 fullstop = FULLSTOP_RMMOD; 1338 fullstop = FULLSTOP_RMMOD;
1341 mutex_unlock(&fullstop_mutex); 1339 mutex_unlock(&fullstop_mutex);
1342 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1340 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1343 if (stutter_task) { 1341 if (stutter_task) {
1344 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1342 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1345 kthread_stop(stutter_task); 1343 kthread_stop(stutter_task);
1346 } 1344 }
1347 stutter_task = NULL; 1345 stutter_task = NULL;
1348 if (shuffler_task) { 1346 if (shuffler_task) {
1349 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 1347 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1350 kthread_stop(shuffler_task); 1348 kthread_stop(shuffler_task);
1351 free_cpumask_var(shuffle_tmp_mask); 1349 free_cpumask_var(shuffle_tmp_mask);
1352 } 1350 }
1353 shuffler_task = NULL; 1351 shuffler_task = NULL;
1354 1352
1355 if (writer_task) { 1353 if (writer_task) {
1356 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 1354 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
1357 kthread_stop(writer_task); 1355 kthread_stop(writer_task);
1358 } 1356 }
1359 writer_task = NULL; 1357 writer_task = NULL;
1360 1358
1361 if (reader_tasks) { 1359 if (reader_tasks) {
1362 for (i = 0; i < nrealreaders; i++) { 1360 for (i = 0; i < nrealreaders; i++) {
1363 if (reader_tasks[i]) { 1361 if (reader_tasks[i]) {
1364 VERBOSE_PRINTK_STRING( 1362 VERBOSE_PRINTK_STRING(
1365 "Stopping rcu_torture_reader task"); 1363 "Stopping rcu_torture_reader task");
1366 kthread_stop(reader_tasks[i]); 1364 kthread_stop(reader_tasks[i]);
1367 } 1365 }
1368 reader_tasks[i] = NULL; 1366 reader_tasks[i] = NULL;
1369 } 1367 }
1370 kfree(reader_tasks); 1368 kfree(reader_tasks);
1371 reader_tasks = NULL; 1369 reader_tasks = NULL;
1372 } 1370 }
1373 rcu_torture_current = NULL; 1371 rcu_torture_current = NULL;
1374 1372
1375 if (fakewriter_tasks) { 1373 if (fakewriter_tasks) {
1376 for (i = 0; i < nfakewriters; i++) { 1374 for (i = 0; i < nfakewriters; i++) {
1377 if (fakewriter_tasks[i]) { 1375 if (fakewriter_tasks[i]) {
1378 VERBOSE_PRINTK_STRING( 1376 VERBOSE_PRINTK_STRING(
1379 "Stopping rcu_torture_fakewriter task"); 1377 "Stopping rcu_torture_fakewriter task");
1380 kthread_stop(fakewriter_tasks[i]); 1378 kthread_stop(fakewriter_tasks[i]);
1381 } 1379 }
1382 fakewriter_tasks[i] = NULL; 1380 fakewriter_tasks[i] = NULL;
1383 } 1381 }
1384 kfree(fakewriter_tasks); 1382 kfree(fakewriter_tasks);
1385 fakewriter_tasks = NULL; 1383 fakewriter_tasks = NULL;
1386 } 1384 }
1387 1385
1388 if (stats_task) { 1386 if (stats_task) {
1389 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 1387 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
1390 kthread_stop(stats_task); 1388 kthread_stop(stats_task);
1391 } 1389 }
1392 stats_task = NULL; 1390 stats_task = NULL;
1393 1391
1394 if (fqs_task) { 1392 if (fqs_task) {
1395 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); 1393 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1396 kthread_stop(fqs_task); 1394 kthread_stop(fqs_task);
1397 } 1395 }
1398 fqs_task = NULL; 1396 fqs_task = NULL;
1399 if ((test_boost == 1 && cur_ops->can_boost) || 1397 if ((test_boost == 1 && cur_ops->can_boost) ||
1400 test_boost == 2) { 1398 test_boost == 2) {
1401 unregister_cpu_notifier(&rcutorture_cpu_nb); 1399 unregister_cpu_notifier(&rcutorture_cpu_nb);
1402 for_each_possible_cpu(i) 1400 for_each_possible_cpu(i)
1403 rcutorture_booster_cleanup(i); 1401 rcutorture_booster_cleanup(i);
1404 } 1402 }
1405 1403
1406 /* Wait for all RCU callbacks to fire. */ 1404 /* Wait for all RCU callbacks to fire. */
1407 1405
1408 if (cur_ops->cb_barrier != NULL) 1406 if (cur_ops->cb_barrier != NULL)
1409 cur_ops->cb_barrier(); 1407 cur_ops->cb_barrier();
1410 1408
1411 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1409 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1412 1410
1413 if (cur_ops->cleanup) 1411 if (cur_ops->cleanup)
1414 cur_ops->cleanup(); 1412 cur_ops->cleanup();
1415 if (atomic_read(&n_rcu_torture_error)) 1413 if (atomic_read(&n_rcu_torture_error))
1416 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1414 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1417 else 1415 else
1418 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1416 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1419 } 1417 }
1420 1418
1421 static int __init 1419 static int __init
1422 rcu_torture_init(void) 1420 rcu_torture_init(void)
1423 { 1421 {
1424 int i; 1422 int i;
1425 int cpu; 1423 int cpu;
1426 int firsterr = 0; 1424 int firsterr = 0;
1427 static struct rcu_torture_ops *torture_ops[] = 1425 static struct rcu_torture_ops *torture_ops[] =
1428 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1429 &rcu_bh_ops, &rcu_bh_sync_ops, 1427 &rcu_bh_ops, &rcu_bh_sync_ops,
1430 &srcu_ops, &srcu_expedited_ops, 1428 &srcu_ops, &srcu_expedited_ops,
1431 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1432 1430
1433 mutex_lock(&fullstop_mutex); 1431 mutex_lock(&fullstop_mutex);
1434 1432
1435 /* Process args and tell the world that the torturer is on the job. */ 1433 /* Process args and tell the world that the torturer is on the job. */
1436 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1434 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1437 cur_ops = torture_ops[i]; 1435 cur_ops = torture_ops[i];
1438 if (strcmp(torture_type, cur_ops->name) == 0) 1436 if (strcmp(torture_type, cur_ops->name) == 0)
1439 break; 1437 break;
1440 } 1438 }
1441 if (i == ARRAY_SIZE(torture_ops)) { 1439 if (i == ARRAY_SIZE(torture_ops)) {
1442 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", 1440 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1443 torture_type); 1441 torture_type);
1444 printk(KERN_ALERT "rcu-torture types:"); 1442 printk(KERN_ALERT "rcu-torture types:");
1445 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1443 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1446 printk(KERN_ALERT " %s", torture_ops[i]->name); 1444 printk(KERN_ALERT " %s", torture_ops[i]->name);
1447 printk(KERN_ALERT "\n"); 1445 printk(KERN_ALERT "\n");
1448 mutex_unlock(&fullstop_mutex); 1446 mutex_unlock(&fullstop_mutex);
1449 return -EINVAL; 1447 return -EINVAL;
1450 } 1448 }
1451 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1449 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1452 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " 1450 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1453 "fqs_duration, fqs disabled.\n"); 1451 "fqs_duration, fqs disabled.\n");
1454 fqs_duration = 0; 1452 fqs_duration = 0;
1455 } 1453 }
1456 if (cur_ops->init) 1454 if (cur_ops->init)
1457 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1455 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1458 1456
1459 if (nreaders >= 0) 1457 if (nreaders >= 0)
1460 nrealreaders = nreaders; 1458 nrealreaders = nreaders;
1461 else 1459 else
1462 nrealreaders = 2 * num_online_cpus(); 1460 nrealreaders = 2 * num_online_cpus();
1463 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1461 rcu_torture_print_module_parms(cur_ops, "Start of test");
1464 fullstop = FULLSTOP_DONTSTOP; 1462 fullstop = FULLSTOP_DONTSTOP;
1465 1463
1466 /* Set up the freelist. */ 1464 /* Set up the freelist. */
1467 1465
1468 INIT_LIST_HEAD(&rcu_torture_freelist); 1466 INIT_LIST_HEAD(&rcu_torture_freelist);
1469 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { 1467 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
1470 rcu_tortures[i].rtort_mbtest = 0; 1468 rcu_tortures[i].rtort_mbtest = 0;
1471 list_add_tail(&rcu_tortures[i].rtort_free, 1469 list_add_tail(&rcu_tortures[i].rtort_free,
1472 &rcu_torture_freelist); 1470 &rcu_torture_freelist);
1473 } 1471 }
1474 1472
1475 /* Initialize the statistics so that each run gets its own numbers. */ 1473 /* Initialize the statistics so that each run gets its own numbers. */
1476 1474
1477 rcu_torture_current = NULL; 1475 rcu_torture_current = NULL;
1478 rcu_torture_current_version = 0; 1476 rcu_torture_current_version = 0;
1479 atomic_set(&n_rcu_torture_alloc, 0); 1477 atomic_set(&n_rcu_torture_alloc, 0);
1480 atomic_set(&n_rcu_torture_alloc_fail, 0); 1478 atomic_set(&n_rcu_torture_alloc_fail, 0);
1481 atomic_set(&n_rcu_torture_free, 0); 1479 atomic_set(&n_rcu_torture_free, 0);
1482 atomic_set(&n_rcu_torture_mberror, 0); 1480 atomic_set(&n_rcu_torture_mberror, 0);
1483 atomic_set(&n_rcu_torture_error, 0); 1481 atomic_set(&n_rcu_torture_error, 0);
1484 n_rcu_torture_boost_ktrerror = 0; 1482 n_rcu_torture_boost_ktrerror = 0;
1485 n_rcu_torture_boost_rterror = 0; 1483 n_rcu_torture_boost_rterror = 0;
1486 n_rcu_torture_boost_failure = 0; 1484 n_rcu_torture_boost_failure = 0;
1487 n_rcu_torture_boosts = 0; 1485 n_rcu_torture_boosts = 0;
1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1486 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1489 atomic_set(&rcu_torture_wcount[i], 0); 1487 atomic_set(&rcu_torture_wcount[i], 0);
1490 for_each_possible_cpu(cpu) { 1488 for_each_possible_cpu(cpu) {
1491 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1489 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1492 per_cpu(rcu_torture_count, cpu)[i] = 0; 1490 per_cpu(rcu_torture_count, cpu)[i] = 0;
1493 per_cpu(rcu_torture_batch, cpu)[i] = 0; 1491 per_cpu(rcu_torture_batch, cpu)[i] = 0;
1494 } 1492 }
1495 } 1493 }
1496 1494
1497 /* Start up the kthreads. */ 1495 /* Start up the kthreads. */
1498 1496
1499 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 1497 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
1500 writer_task = kthread_run(rcu_torture_writer, NULL, 1498 writer_task = kthread_run(rcu_torture_writer, NULL,
1501 "rcu_torture_writer"); 1499 "rcu_torture_writer");
1502 if (IS_ERR(writer_task)) { 1500 if (IS_ERR(writer_task)) {
1503 firsterr = PTR_ERR(writer_task); 1501 firsterr = PTR_ERR(writer_task);
1504 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 1502 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
1505 writer_task = NULL; 1503 writer_task = NULL;
1506 goto unwind; 1504 goto unwind;
1507 } 1505 }
1508 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1506 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1509 GFP_KERNEL); 1507 GFP_KERNEL);
1510 if (fakewriter_tasks == NULL) { 1508 if (fakewriter_tasks == NULL) {
1511 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1509 VERBOSE_PRINTK_ERRSTRING("out of memory");
1512 firsterr = -ENOMEM; 1510 firsterr = -ENOMEM;
1513 goto unwind; 1511 goto unwind;
1514 } 1512 }
1515 for (i = 0; i < nfakewriters; i++) { 1513 for (i = 0; i < nfakewriters; i++) {
1516 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1514 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1517 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1515 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1518 "rcu_torture_fakewriter"); 1516 "rcu_torture_fakewriter");
1519 if (IS_ERR(fakewriter_tasks[i])) { 1517 if (IS_ERR(fakewriter_tasks[i])) {
1520 firsterr = PTR_ERR(fakewriter_tasks[i]); 1518 firsterr = PTR_ERR(fakewriter_tasks[i]);
1521 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1519 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
1522 fakewriter_tasks[i] = NULL; 1520 fakewriter_tasks[i] = NULL;
1523 goto unwind; 1521 goto unwind;
1524 } 1522 }
1525 } 1523 }
1526 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), 1524 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
1527 GFP_KERNEL); 1525 GFP_KERNEL);
1528 if (reader_tasks == NULL) { 1526 if (reader_tasks == NULL) {
1529 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1527 VERBOSE_PRINTK_ERRSTRING("out of memory");
1530 firsterr = -ENOMEM; 1528 firsterr = -ENOMEM;
1531 goto unwind; 1529 goto unwind;
1532 } 1530 }
1533 for (i = 0; i < nrealreaders; i++) { 1531 for (i = 0; i < nrealreaders; i++) {
1534 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); 1532 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
1535 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, 1533 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
1536 "rcu_torture_reader"); 1534 "rcu_torture_reader");
1537 if (IS_ERR(reader_tasks[i])) { 1535 if (IS_ERR(reader_tasks[i])) {
1538 firsterr = PTR_ERR(reader_tasks[i]); 1536 firsterr = PTR_ERR(reader_tasks[i]);
1539 VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); 1537 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
1540 reader_tasks[i] = NULL; 1538 reader_tasks[i] = NULL;
1541 goto unwind; 1539 goto unwind;
1542 } 1540 }
1543 } 1541 }
1544 if (stat_interval > 0) { 1542 if (stat_interval > 0) {
1545 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); 1543 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
1546 stats_task = kthread_run(rcu_torture_stats, NULL, 1544 stats_task = kthread_run(rcu_torture_stats, NULL,
1547 "rcu_torture_stats"); 1545 "rcu_torture_stats");
1548 if (IS_ERR(stats_task)) { 1546 if (IS_ERR(stats_task)) {
1549 firsterr = PTR_ERR(stats_task); 1547 firsterr = PTR_ERR(stats_task);
1550 VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); 1548 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
1551 stats_task = NULL; 1549 stats_task = NULL;
1552 goto unwind; 1550 goto unwind;
1553 } 1551 }
1554 } 1552 }
1555 if (test_no_idle_hz) { 1553 if (test_no_idle_hz) {
1556 rcu_idle_cpu = num_online_cpus() - 1; 1554 rcu_idle_cpu = num_online_cpus() - 1;
1557 1555
1558 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { 1556 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
1559 firsterr = -ENOMEM; 1557 firsterr = -ENOMEM;
1560 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); 1558 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
1561 goto unwind; 1559 goto unwind;
1562 } 1560 }
1563 1561
1564 /* Create the shuffler thread */ 1562 /* Create the shuffler thread */
1565 shuffler_task = kthread_run(rcu_torture_shuffle, NULL, 1563 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
1566 "rcu_torture_shuffle"); 1564 "rcu_torture_shuffle");
1567 if (IS_ERR(shuffler_task)) { 1565 if (IS_ERR(shuffler_task)) {
1568 free_cpumask_var(shuffle_tmp_mask); 1566 free_cpumask_var(shuffle_tmp_mask);
1569 firsterr = PTR_ERR(shuffler_task); 1567 firsterr = PTR_ERR(shuffler_task);
1570 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); 1568 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
1571 shuffler_task = NULL; 1569 shuffler_task = NULL;
1572 goto unwind; 1570 goto unwind;
1573 } 1571 }
1574 } 1572 }
1575 if (stutter < 0) 1573 if (stutter < 0)
1576 stutter = 0; 1574 stutter = 0;
1577 if (stutter) { 1575 if (stutter) {
1578 /* Create the stutter thread */ 1576 /* Create the stutter thread */
1579 stutter_task = kthread_run(rcu_torture_stutter, NULL, 1577 stutter_task = kthread_run(rcu_torture_stutter, NULL,
1580 "rcu_torture_stutter"); 1578 "rcu_torture_stutter");
1581 if (IS_ERR(stutter_task)) { 1579 if (IS_ERR(stutter_task)) {
1582 firsterr = PTR_ERR(stutter_task); 1580 firsterr = PTR_ERR(stutter_task);
1583 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); 1581 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
1584 stutter_task = NULL; 1582 stutter_task = NULL;
1585 goto unwind; 1583 goto unwind;
1586 } 1584 }
1587 } 1585 }
1588 if (fqs_duration < 0) 1586 if (fqs_duration < 0)
1589 fqs_duration = 0; 1587 fqs_duration = 0;
1590 if (fqs_duration) { 1588 if (fqs_duration) {
1591 		/* Create the fqs thread */ 1589 		/* Create the fqs thread */
1592 fqs_task = kthread_run(rcu_torture_fqs, NULL, 1590 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1593 "rcu_torture_fqs"); 1591 "rcu_torture_fqs");
1594 if (IS_ERR(fqs_task)) { 1592 if (IS_ERR(fqs_task)) {
1595 firsterr = PTR_ERR(fqs_task); 1593 firsterr = PTR_ERR(fqs_task);
1596 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); 1594 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1597 fqs_task = NULL; 1595 fqs_task = NULL;
1598 goto unwind; 1596 goto unwind;
1599 } 1597 }
1600 } 1598 }
1601 if (test_boost_interval < 1) 1599 if (test_boost_interval < 1)
1602 test_boost_interval = 1; 1600 test_boost_interval = 1;
1603 if (test_boost_duration < 2) 1601 if (test_boost_duration < 2)
1604 test_boost_duration = 2; 1602 test_boost_duration = 2;
1605 if ((test_boost == 1 && cur_ops->can_boost) || 1603 if ((test_boost == 1 && cur_ops->can_boost) ||
1606 test_boost == 2) { 1604 test_boost == 2) {
1607 int retval; 1605 int retval;
1608 1606
1609 boost_starttime = jiffies + test_boost_interval * HZ; 1607 boost_starttime = jiffies + test_boost_interval * HZ;
1610 register_cpu_notifier(&rcutorture_cpu_nb); 1608 register_cpu_notifier(&rcutorture_cpu_nb);
1611 for_each_possible_cpu(i) { 1609 for_each_possible_cpu(i) {
1612 if (cpu_is_offline(i)) 1610 if (cpu_is_offline(i))
1613 continue; /* Heuristic: CPU can go offline. */ 1611 continue; /* Heuristic: CPU can go offline. */
1614 retval = rcutorture_booster_init(i); 1612 retval = rcutorture_booster_init(i);
1615 if (retval < 0) { 1613 if (retval < 0) {
1616 firsterr = retval; 1614 firsterr = retval;
1617 goto unwind; 1615 goto unwind;
1618 } 1616 }
1619 } 1617 }
1620 } 1618 }
1621 register_reboot_notifier(&rcutorture_shutdown_nb); 1619 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition(); 1620 rcutorture_record_test_transition();
1623 mutex_unlock(&fullstop_mutex); 1621 mutex_unlock(&fullstop_mutex);
1624 return 0; 1622 return 0;
1625 1623
1626 unwind: 1624 unwind:
1627 mutex_unlock(&fullstop_mutex); 1625 mutex_unlock(&fullstop_mutex);
1628 rcu_torture_cleanup(); 1626 rcu_torture_cleanup();
1629 return firsterr; 1627 return firsterr;
1630 } 1628 }
1631 1629
1632 module_init(rcu_torture_init); 1630 module_init(rcu_torture_init);
1633 module_exit(rcu_torture_cleanup); 1631 module_exit(rcu_torture_cleanup);
1634 1632
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h> 35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h> 36 #include <linux/interrupt.h>
37 #include <linux/capability.h> 37 #include <linux/capability.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h> 39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h> 40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h> 41 #include <linux/perf_event.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h> 57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h> 58 #include <linux/seq_file.h>
59 #include <linux/stop_machine.h> 59 #include <linux/stop_machine.h>
60 #include <linux/sysctl.h> 60 #include <linux/sysctl.h>
61 #include <linux/syscalls.h> 61 #include <linux/syscalls.h>
62 #include <linux/times.h> 62 #include <linux/times.h>
63 #include <linux/tsacct_kern.h> 63 #include <linux/tsacct_kern.h>
64 #include <linux/kprobes.h> 64 #include <linux/kprobes.h>
65 #include <linux/delayacct.h> 65 #include <linux/delayacct.h>
66 #include <linux/unistd.h> 66 #include <linux/unistd.h>
67 #include <linux/pagemap.h> 67 #include <linux/pagemap.h>
68 #include <linux/hrtimer.h> 68 #include <linux/hrtimer.h>
69 #include <linux/tick.h> 69 #include <linux/tick.h>
70 #include <linux/debugfs.h> 70 #include <linux/debugfs.h>
71 #include <linux/ctype.h> 71 #include <linux/ctype.h>
72 #include <linux/ftrace.h> 72 #include <linux/ftrace.h>
73 #include <linux/slab.h> 73 #include <linux/slab.h>
74 74
75 #include <asm/tlb.h> 75 #include <asm/tlb.h>
76 #include <asm/irq_regs.h> 76 #include <asm/irq_regs.h>
77 #include <asm/mutex.h> 77 #include <asm/mutex.h>
78 78
79 #include "sched_cpupri.h" 79 #include "sched_cpupri.h"
80 #include "workqueue_sched.h" 80 #include "workqueue_sched.h"
81 #include "sched_autogroup.h" 81 #include "sched_autogroup.h"
82 82
83 #define CREATE_TRACE_POINTS 83 #define CREATE_TRACE_POINTS
84 #include <trace/events/sched.h> 84 #include <trace/events/sched.h>
85 85
86 /* 86 /*
87 * Convert user-nice values [ -20 ... 0 ... 19 ] 87 * Convert user-nice values [ -20 ... 0 ... 19 ]
88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
89 * and back. 89 * and back.
90 */ 90 */
91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
94 94
95 /* 95 /*
96 * 'User priority' is the nice value converted to something we 96 * 'User priority' is the nice value converted to something we
97 * can work with better when scaling various scheduler parameters, 97 * can work with better when scaling various scheduler parameters,
98 * it's a [ 0 ... 39 ] range. 98 * it's a [ 0 ... 39 ] range.
99 */ 99 */
100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
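As a quick worked example of these conversions (editorial aside, not part of the diff; the numbers assume the usual MAX_RT_PRIO == 100 and MAX_PRIO == 140 of this kernel series):

    /* Illustrative arithmetic only:
     *	NICE_TO_PRIO(0)   == 100 +   0 + 20 == 120  (default static_prio)
     *	NICE_TO_PRIO(-20) == 100 + -20 + 20 == 100  (highest non-RT priority)
     *	NICE_TO_PRIO(19)  == 100 +  19 + 20 == 139  (lowest priority)
     *	PRIO_TO_NICE(120) == 120 - 100 - 20 ==   0
     *	USER_PRIO(120)    == 120 - 100      ==  20, MAX_USER_PRIO == 140 - 100 == 40,
     * so user priorities span the advertised [ 0 ... 39 ] range.
     */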
103 103
104 /* 104 /*
105 * Helpers for converting nanosecond timing to jiffy resolution 105 * Helpers for converting nanosecond timing to jiffy resolution
106 */ 106 */
107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
108 108
109 #define NICE_0_LOAD SCHED_LOAD_SCALE 109 #define NICE_0_LOAD SCHED_LOAD_SCALE
110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
111 111
112 /* 112 /*
113 * These are the 'tuning knobs' of the scheduler: 113 * These are the 'tuning knobs' of the scheduler:
114 * 114 *
115 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 115 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
116 * Timeslices get refilled after they expire. 116 * Timeslices get refilled after they expire.
117 */ 117 */
118 #define DEF_TIMESLICE (100 * HZ / 1000) 118 #define DEF_TIMESLICE (100 * HZ / 1000)
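For concreteness (illustrative arithmetic, not from the diff), the same 100 ms works out to a different jiffy count depending on HZ:

    /* Worked example: DEF_TIMESLICE in jiffies for common HZ values:
     *	HZ == 1000:  100 * 1000 / 1000 == 100 jiffies
     *	HZ ==  250:  100 *  250 / 1000 ==  25 jiffies
     *	HZ ==  100:  100 *  100 / 1000 ==  10 jiffies
     * all corresponding to the 100 ms SCHED_RR timeslice described above.
     */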
119 119
120 /* 120 /*
121 * single value that denotes runtime == period, i.e. unlimited time. 121 * single value that denotes runtime == period, i.e. unlimited time.
122 */ 122 */
123 #define RUNTIME_INF ((u64)~0ULL) 123 #define RUNTIME_INF ((u64)~0ULL)
124 124
125 static inline int rt_policy(int policy) 125 static inline int rt_policy(int policy)
126 { 126 {
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
128 return 1; 128 return 1;
129 return 0; 129 return 0;
130 } 130 }
131 131
132 static inline int task_has_rt_policy(struct task_struct *p) 132 static inline int task_has_rt_policy(struct task_struct *p)
133 { 133 {
134 return rt_policy(p->policy); 134 return rt_policy(p->policy);
135 } 135 }
136 136
137 /* 137 /*
138 * This is the priority-queue data structure of the RT scheduling class: 138 * This is the priority-queue data structure of the RT scheduling class:
139 */ 139 */
140 struct rt_prio_array { 140 struct rt_prio_array {
141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
142 struct list_head queue[MAX_RT_PRIO]; 142 struct list_head queue[MAX_RT_PRIO];
143 }; 143 };
144 144
145 struct rt_bandwidth { 145 struct rt_bandwidth {
146 /* nests inside the rq lock: */ 146 /* nests inside the rq lock: */
147 raw_spinlock_t rt_runtime_lock; 147 raw_spinlock_t rt_runtime_lock;
148 ktime_t rt_period; 148 ktime_t rt_period;
149 u64 rt_runtime; 149 u64 rt_runtime;
150 struct hrtimer rt_period_timer; 150 struct hrtimer rt_period_timer;
151 }; 151 };
152 152
153 static struct rt_bandwidth def_rt_bandwidth; 153 static struct rt_bandwidth def_rt_bandwidth;
154 154
155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
156 156
157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
158 { 158 {
159 struct rt_bandwidth *rt_b = 159 struct rt_bandwidth *rt_b =
160 container_of(timer, struct rt_bandwidth, rt_period_timer); 160 container_of(timer, struct rt_bandwidth, rt_period_timer);
161 ktime_t now; 161 ktime_t now;
162 int overrun; 162 int overrun;
163 int idle = 0; 163 int idle = 0;
164 164
165 for (;;) { 165 for (;;) {
166 now = hrtimer_cb_get_time(timer); 166 now = hrtimer_cb_get_time(timer);
167 overrun = hrtimer_forward(timer, now, rt_b->rt_period); 167 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
168 168
169 if (!overrun) 169 if (!overrun)
170 break; 170 break;
171 171
172 idle = do_sched_rt_period_timer(rt_b, overrun); 172 idle = do_sched_rt_period_timer(rt_b, overrun);
173 } 173 }
174 174
175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
176 } 176 }
177 177
178 static 178 static
179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
180 { 180 {
181 rt_b->rt_period = ns_to_ktime(period); 181 rt_b->rt_period = ns_to_ktime(period);
182 rt_b->rt_runtime = runtime; 182 rt_b->rt_runtime = runtime;
183 183
184 raw_spin_lock_init(&rt_b->rt_runtime_lock); 184 raw_spin_lock_init(&rt_b->rt_runtime_lock);
185 185
186 hrtimer_init(&rt_b->rt_period_timer, 186 hrtimer_init(&rt_b->rt_period_timer,
187 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 187 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
188 rt_b->rt_period_timer.function = sched_rt_period_timer; 188 rt_b->rt_period_timer.function = sched_rt_period_timer;
189 } 189 }
190 190
191 static inline int rt_bandwidth_enabled(void) 191 static inline int rt_bandwidth_enabled(void)
192 { 192 {
193 return sysctl_sched_rt_runtime >= 0; 193 return sysctl_sched_rt_runtime >= 0;
194 } 194 }
195 195
196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
197 { 197 {
198 ktime_t now; 198 ktime_t now;
199 199
200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
201 return; 201 return;
202 202
203 if (hrtimer_active(&rt_b->rt_period_timer)) 203 if (hrtimer_active(&rt_b->rt_period_timer))
204 return; 204 return;
205 205
206 raw_spin_lock(&rt_b->rt_runtime_lock); 206 raw_spin_lock(&rt_b->rt_runtime_lock);
207 for (;;) { 207 for (;;) {
208 unsigned long delta; 208 unsigned long delta;
209 ktime_t soft, hard; 209 ktime_t soft, hard;
210 210
211 if (hrtimer_active(&rt_b->rt_period_timer)) 211 if (hrtimer_active(&rt_b->rt_period_timer))
212 break; 212 break;
213 213
214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
216 216
217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); 217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
218 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 218 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
219 delta = ktime_to_ns(ktime_sub(hard, soft)); 219 delta = ktime_to_ns(ktime_sub(hard, soft));
220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
221 HRTIMER_MODE_ABS_PINNED, 0); 221 HRTIMER_MODE_ABS_PINNED, 0);
222 } 222 }
223 raw_spin_unlock(&rt_b->rt_runtime_lock); 223 raw_spin_unlock(&rt_b->rt_runtime_lock);
224 } 224 }
225 225
226 #ifdef CONFIG_RT_GROUP_SCHED 226 #ifdef CONFIG_RT_GROUP_SCHED
227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
228 { 228 {
229 hrtimer_cancel(&rt_b->rt_period_timer); 229 hrtimer_cancel(&rt_b->rt_period_timer);
230 } 230 }
231 #endif 231 #endif
232 232
233 /* 233 /*
234 * sched_domains_mutex serializes calls to init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237 static DEFINE_MUTEX(sched_domains_mutex); 237 static DEFINE_MUTEX(sched_domains_mutex);
238 238
239 #ifdef CONFIG_CGROUP_SCHED 239 #ifdef CONFIG_CGROUP_SCHED
240 240
241 #include <linux/cgroup.h> 241 #include <linux/cgroup.h>
242 242
243 struct cfs_rq; 243 struct cfs_rq;
244 244
245 static LIST_HEAD(task_groups); 245 static LIST_HEAD(task_groups);
246 246
247 /* task group related information */ 247 /* task group related information */
248 struct task_group { 248 struct task_group {
249 struct cgroup_subsys_state css; 249 struct cgroup_subsys_state css;
250 250
251 #ifdef CONFIG_FAIR_GROUP_SCHED 251 #ifdef CONFIG_FAIR_GROUP_SCHED
252 /* schedulable entities of this group on each cpu */ 252 /* schedulable entities of this group on each cpu */
253 struct sched_entity **se; 253 struct sched_entity **se;
254 /* runqueue "owned" by this group on each cpu */ 254 /* runqueue "owned" by this group on each cpu */
255 struct cfs_rq **cfs_rq; 255 struct cfs_rq **cfs_rq;
256 unsigned long shares; 256 unsigned long shares;
257 257
258 atomic_t load_weight; 258 atomic_t load_weight;
259 #endif 259 #endif
260 260
261 #ifdef CONFIG_RT_GROUP_SCHED 261 #ifdef CONFIG_RT_GROUP_SCHED
262 struct sched_rt_entity **rt_se; 262 struct sched_rt_entity **rt_se;
263 struct rt_rq **rt_rq; 263 struct rt_rq **rt_rq;
264 264
265 struct rt_bandwidth rt_bandwidth; 265 struct rt_bandwidth rt_bandwidth;
266 #endif 266 #endif
267 267
268 struct rcu_head rcu; 268 struct rcu_head rcu;
269 struct list_head list; 269 struct list_head list;
270 270
271 struct task_group *parent; 271 struct task_group *parent;
272 struct list_head siblings; 272 struct list_head siblings;
273 struct list_head children; 273 struct list_head children;
274 274
275 #ifdef CONFIG_SCHED_AUTOGROUP 275 #ifdef CONFIG_SCHED_AUTOGROUP
276 struct autogroup *autogroup; 276 struct autogroup *autogroup;
277 #endif 277 #endif
278 }; 278 };
279 279
280 /* task_group_lock serializes the addition/removal of task groups */ 280 /* task_group_lock serializes the addition/removal of task groups */
281 static DEFINE_SPINLOCK(task_group_lock); 281 static DEFINE_SPINLOCK(task_group_lock);
282 282
283 #ifdef CONFIG_FAIR_GROUP_SCHED 283 #ifdef CONFIG_FAIR_GROUP_SCHED
284 284
285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD 285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
286 286
287 /* 287 /*
288 * A weight of 0 or 1 can cause arithmetic problems. 288 * A weight of 0 or 1 can cause arithmetic problems.
289 * The weight of a cfs_rq is the sum of the weights of the entities 289 * The weight of a cfs_rq is the sum of the weights of the entities
290 * queued on it, so neither the weight of an entity nor the shares 290 * queued on it, so neither the weight of an entity nor the shares
291 * value of a task group should be too large. 291 * value of a task group should be too large.
292 * (The default weight is 1024 - so there's no practical 292 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295 #define MIN_SHARES 2 295 #define MIN_SHARES 2
296 #define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) 296 #define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
297 297
298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
299 #endif 299 #endif
300 300
301 /* Default task group. 301 /* Default task group.
302 * Every task in the system belongs to this group at bootup. 302 * Every task in the system belongs to this group at bootup.
303 */ 303 */
304 struct task_group root_task_group; 304 struct task_group root_task_group;
305 305
306 #endif /* CONFIG_CGROUP_SCHED */ 306 #endif /* CONFIG_CGROUP_SCHED */
307 307
308 /* CFS-related fields in a runqueue */ 308 /* CFS-related fields in a runqueue */
309 struct cfs_rq { 309 struct cfs_rq {
310 struct load_weight load; 310 struct load_weight load;
311 unsigned long nr_running; 311 unsigned long nr_running;
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315 #ifndef CONFIG_64BIT 315 #ifndef CONFIG_64BIT
316 u64 min_vruntime_copy; 316 u64 min_vruntime_copy;
317 #endif 317 #endif
318 318
319 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
320 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
321 321
322 struct list_head tasks; 322 struct list_head tasks;
323 struct list_head *balance_iterator; 323 struct list_head *balance_iterator;
324 324
325 /* 325 /*
326 * 'curr' points to currently running entity on this cfs_rq. 326 * 'curr' points to currently running entity on this cfs_rq.
327 * It is set to NULL otherwise (i.e. when none are currently running). 327 * It is set to NULL otherwise (i.e. when none are currently running).
328 */ 328 */
329 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
330 330
331 #ifdef CONFIG_SCHED_DEBUG 331 #ifdef CONFIG_SCHED_DEBUG
332 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333 #endif 333 #endif
334 334
335 #ifdef CONFIG_FAIR_GROUP_SCHED 335 #ifdef CONFIG_FAIR_GROUP_SCHED
336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
337 337
338 /* 338 /*
339 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 339 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
340 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 340 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
341 * (like users, containers etc.) 341 * (like users, containers etc.)
342 * 342 *
343 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 343 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
344 * list is used during load balance. 344 * list is used during load balance.
345 */ 345 */
346 int on_list; 346 int on_list;
347 struct list_head leaf_cfs_rq_list; 347 struct list_head leaf_cfs_rq_list;
348 struct task_group *tg; /* group that "owns" this runqueue */ 348 struct task_group *tg; /* group that "owns" this runqueue */
349 349
350 #ifdef CONFIG_SMP 350 #ifdef CONFIG_SMP
351 /* 351 /*
352 * the part of load.weight contributed by tasks 352 * the part of load.weight contributed by tasks
353 */ 353 */
354 unsigned long task_weight; 354 unsigned long task_weight;
355 355
356 /* 356 /*
357 * h_load = weight * f(tg) 357 * h_load = weight * f(tg)
358 * 358 *
359 * Where f(tg) is the recursive weight fraction assigned to 359 * Where f(tg) is the recursive weight fraction assigned to
360 * this group. 360 * this group.
361 */ 361 */
362 unsigned long h_load; 362 unsigned long h_load;
363 363
364 /* 364 /*
365 * Maintaining per-cpu shares distribution for group scheduling 365 * Maintaining per-cpu shares distribution for group scheduling
366 * 366 *
367 * load_stamp is the last time we updated the load average 367 * load_stamp is the last time we updated the load average
368 * load_last is the last time we updated the load average and saw load 368 * load_last is the last time we updated the load average and saw load
369 * load_unacc_exec_time is currently unaccounted execution time 369 * load_unacc_exec_time is currently unaccounted execution time
370 */ 370 */
371 u64 load_avg; 371 u64 load_avg;
372 u64 load_period; 372 u64 load_period;
373 u64 load_stamp, load_last, load_unacc_exec_time; 373 u64 load_stamp, load_last, load_unacc_exec_time;
374 374
375 unsigned long load_contribution; 375 unsigned long load_contribution;
376 #endif 376 #endif
377 #endif 377 #endif
378 }; 378 };
379 379
380 /* Real-Time classes' related field in a runqueue: */ 380 /* Real-Time classes' related field in a runqueue: */
381 struct rt_rq { 381 struct rt_rq {
382 struct rt_prio_array active; 382 struct rt_prio_array active;
383 unsigned long rt_nr_running; 383 unsigned long rt_nr_running;
384 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 384 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
385 struct { 385 struct {
386 int curr; /* highest queued rt task prio */ 386 int curr; /* highest queued rt task prio */
387 #ifdef CONFIG_SMP 387 #ifdef CONFIG_SMP
388 int next; /* next highest */ 388 int next; /* next highest */
389 #endif 389 #endif
390 } highest_prio; 390 } highest_prio;
391 #endif 391 #endif
392 #ifdef CONFIG_SMP 392 #ifdef CONFIG_SMP
393 unsigned long rt_nr_migratory; 393 unsigned long rt_nr_migratory;
394 unsigned long rt_nr_total; 394 unsigned long rt_nr_total;
395 int overloaded; 395 int overloaded;
396 struct plist_head pushable_tasks; 396 struct plist_head pushable_tasks;
397 #endif 397 #endif
398 int rt_throttled; 398 int rt_throttled;
399 u64 rt_time; 399 u64 rt_time;
400 u64 rt_runtime; 400 u64 rt_runtime;
401 /* Nests inside the rq lock: */ 401 /* Nests inside the rq lock: */
402 raw_spinlock_t rt_runtime_lock; 402 raw_spinlock_t rt_runtime_lock;
403 403
404 #ifdef CONFIG_RT_GROUP_SCHED 404 #ifdef CONFIG_RT_GROUP_SCHED
405 unsigned long rt_nr_boosted; 405 unsigned long rt_nr_boosted;
406 406
407 struct rq *rq; 407 struct rq *rq;
408 struct list_head leaf_rt_rq_list; 408 struct list_head leaf_rt_rq_list;
409 struct task_group *tg; 409 struct task_group *tg;
410 #endif 410 #endif
411 }; 411 };
412 412
413 #ifdef CONFIG_SMP 413 #ifdef CONFIG_SMP
414 414
415 /* 415 /*
416 * We add the notion of a root-domain which will be used to define per-domain 416 * We add the notion of a root-domain which will be used to define per-domain
417 * variables. Each exclusive cpuset essentially defines an island domain by 417 * variables. Each exclusive cpuset essentially defines an island domain by
418 * fully partitioning the member cpus from any other cpuset. Whenever a new 418 * fully partitioning the member cpus from any other cpuset. Whenever a new
419 * exclusive cpuset is created, we also create and attach a new root-domain 419 * exclusive cpuset is created, we also create and attach a new root-domain
420 * object. 420 * object.
421 * 421 *
422 */ 422 */
423 struct root_domain { 423 struct root_domain {
424 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu; 425 struct rcu_head rcu;
426 cpumask_var_t span; 426 cpumask_var_t span;
427 cpumask_var_t online; 427 cpumask_var_t online;
428 428
429 /* 429 /*
430 * The "RT overload" flag: it gets set if a CPU has more than 430 * The "RT overload" flag: it gets set if a CPU has more than
431 * one runnable RT task. 431 * one runnable RT task.
432 */ 432 */
433 cpumask_var_t rto_mask; 433 cpumask_var_t rto_mask;
434 atomic_t rto_count; 434 atomic_t rto_count;
435 struct cpupri cpupri; 435 struct cpupri cpupri;
436 }; 436 };
437 437
438 /* 438 /*
439 * By default the system creates a single root-domain with all cpus as 439 * By default the system creates a single root-domain with all cpus as
440 * members (mimicking the global state we have today). 440 * members (mimicking the global state we have today).
441 */ 441 */
442 static struct root_domain def_root_domain; 442 static struct root_domain def_root_domain;
443 443
444 #endif /* CONFIG_SMP */ 444 #endif /* CONFIG_SMP */
445 445
446 /* 446 /*
447 * This is the main, per-CPU runqueue data structure. 447 * This is the main, per-CPU runqueue data structure.
448 * 448 *
449 * Locking rule: in code paths that lock multiple runqueues (such as 449 * Locking rule: in code paths that lock multiple runqueues (such as
450 * load balancing or thread migration), the lock acquire operations 450 * load balancing or thread migration), the lock acquire operations
451 * must be ordered by ascending &runqueue. 451 * must be ordered by ascending &runqueue.
452 */ 452 */
453 struct rq { 453 struct rq {
454 /* runqueue lock: */ 454 /* runqueue lock: */
455 raw_spinlock_t lock; 455 raw_spinlock_t lock;
456 456
457 /* 457 /*
458 * nr_running and cpu_load should be in the same cacheline because 458 * nr_running and cpu_load should be in the same cacheline because
459 * remote CPUs use both these fields when doing load calculation. 459 * remote CPUs use both these fields when doing load calculation.
460 */ 460 */
461 unsigned long nr_running; 461 unsigned long nr_running;
462 #define CPU_LOAD_IDX_MAX 5 462 #define CPU_LOAD_IDX_MAX 5
463 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 463 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
464 unsigned long last_load_update_tick; 464 unsigned long last_load_update_tick;
465 #ifdef CONFIG_NO_HZ 465 #ifdef CONFIG_NO_HZ
466 u64 nohz_stamp; 466 u64 nohz_stamp;
467 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
468 #endif 468 #endif
469 int skip_clock_update; 469 int skip_clock_update;
470 470
471 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
472 struct load_weight load; 472 struct load_weight load;
473 unsigned long nr_load_updates; 473 unsigned long nr_load_updates;
474 u64 nr_switches; 474 u64 nr_switches;
475 475
476 struct cfs_rq cfs; 476 struct cfs_rq cfs;
477 struct rt_rq rt; 477 struct rt_rq rt;
478 478
479 #ifdef CONFIG_FAIR_GROUP_SCHED 479 #ifdef CONFIG_FAIR_GROUP_SCHED
480 /* list of leaf cfs_rq on this cpu: */ 480 /* list of leaf cfs_rq on this cpu: */
481 struct list_head leaf_cfs_rq_list; 481 struct list_head leaf_cfs_rq_list;
482 #endif 482 #endif
483 #ifdef CONFIG_RT_GROUP_SCHED 483 #ifdef CONFIG_RT_GROUP_SCHED
484 struct list_head leaf_rt_rq_list; 484 struct list_head leaf_rt_rq_list;
485 #endif 485 #endif
486 486
487 /* 487 /*
488 * This is part of a global counter where only the total sum 488 * This is part of a global counter where only the total sum
489 * over all CPUs matters. A task can increase this counter on 489 * over all CPUs matters. A task can increase this counter on
490 * one CPU and if it got migrated afterwards it may decrease 490 * one CPU and if it got migrated afterwards it may decrease
491 * it on another CPU. Always updated under the runqueue lock: 491 * it on another CPU. Always updated under the runqueue lock:
492 */ 492 */
493 unsigned long nr_uninterruptible; 493 unsigned long nr_uninterruptible;
494 494
495 struct task_struct *curr, *idle, *stop; 495 struct task_struct *curr, *idle, *stop;
496 unsigned long next_balance; 496 unsigned long next_balance;
497 struct mm_struct *prev_mm; 497 struct mm_struct *prev_mm;
498 498
499 u64 clock; 499 u64 clock;
500 u64 clock_task; 500 u64 clock_task;
501 501
502 atomic_t nr_iowait; 502 atomic_t nr_iowait;
503 503
504 #ifdef CONFIG_SMP 504 #ifdef CONFIG_SMP
505 struct root_domain *rd; 505 struct root_domain *rd;
506 struct sched_domain *sd; 506 struct sched_domain *sd;
507 507
508 unsigned long cpu_power; 508 unsigned long cpu_power;
509 509
510 unsigned char idle_at_tick; 510 unsigned char idle_at_tick;
511 /* For active balancing */ 511 /* For active balancing */
512 int post_schedule; 512 int post_schedule;
513 int active_balance; 513 int active_balance;
514 int push_cpu; 514 int push_cpu;
515 struct cpu_stop_work active_balance_work; 515 struct cpu_stop_work active_balance_work;
516 /* cpu of this runqueue: */ 516 /* cpu of this runqueue: */
517 int cpu; 517 int cpu;
518 int online; 518 int online;
519 519
520 unsigned long avg_load_per_task; 520 unsigned long avg_load_per_task;
521 521
522 u64 rt_avg; 522 u64 rt_avg;
523 u64 age_stamp; 523 u64 age_stamp;
524 u64 idle_stamp; 524 u64 idle_stamp;
525 u64 avg_idle; 525 u64 avg_idle;
526 #endif 526 #endif
527 527
528 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 528 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
529 u64 prev_irq_time; 529 u64 prev_irq_time;
530 #endif 530 #endif
531 531
532 /* calc_load related fields */ 532 /* calc_load related fields */
533 unsigned long calc_load_update; 533 unsigned long calc_load_update;
534 long calc_load_active; 534 long calc_load_active;
535 535
536 #ifdef CONFIG_SCHED_HRTICK 536 #ifdef CONFIG_SCHED_HRTICK
537 #ifdef CONFIG_SMP 537 #ifdef CONFIG_SMP
538 int hrtick_csd_pending; 538 int hrtick_csd_pending;
539 struct call_single_data hrtick_csd; 539 struct call_single_data hrtick_csd;
540 #endif 540 #endif
541 struct hrtimer hrtick_timer; 541 struct hrtimer hrtick_timer;
542 #endif 542 #endif
543 543
544 #ifdef CONFIG_SCHEDSTATS 544 #ifdef CONFIG_SCHEDSTATS
545 /* latency stats */ 545 /* latency stats */
546 struct sched_info rq_sched_info; 546 struct sched_info rq_sched_info;
547 unsigned long long rq_cpu_time; 547 unsigned long long rq_cpu_time;
548 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 548 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
549 549
550 /* sys_sched_yield() stats */ 550 /* sys_sched_yield() stats */
551 unsigned int yld_count; 551 unsigned int yld_count;
552 552
553 /* schedule() stats */ 553 /* schedule() stats */
554 unsigned int sched_switch; 554 unsigned int sched_switch;
555 unsigned int sched_count; 555 unsigned int sched_count;
556 unsigned int sched_goidle; 556 unsigned int sched_goidle;
557 557
558 /* try_to_wake_up() stats */ 558 /* try_to_wake_up() stats */
559 unsigned int ttwu_count; 559 unsigned int ttwu_count;
560 unsigned int ttwu_local; 560 unsigned int ttwu_local;
561 #endif 561 #endif
562 562
563 #ifdef CONFIG_SMP 563 #ifdef CONFIG_SMP
564 struct task_struct *wake_list; 564 struct task_struct *wake_list;
565 #endif 565 #endif
566 }; 566 };
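The ascending-&runqueue rule stated above struct rq can be made concrete with a small editorial sketch (the helper name lock_two_rqs is hypothetical and not part of this commit; the scheduler's own double_rq_lock() helper takes the same pointer-comparison approach):

    /* Hypothetical sketch: lock two runqueues in ascending address order so
     * that two CPUs locking the same pair cannot deadlock against each other. */
    static void lock_two_rqs(struct rq *rq1, struct rq *rq2)
    	__acquires(rq1->lock)
    	__acquires(rq2->lock)
    {
    	if (rq1 == rq2) {
    		raw_spin_lock(&rq1->lock);
    		__acquire(rq2->lock);	/* fake the second acquire for sparse */
    	} else if (rq1 < rq2) {
    		raw_spin_lock(&rq1->lock);
    		raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
    	} else {
    		raw_spin_lock(&rq2->lock);
    		raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
    	}
    }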
567 567
568 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
569 569
570 570
571 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 571 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
572 572
573 static inline int cpu_of(struct rq *rq) 573 static inline int cpu_of(struct rq *rq)
574 { 574 {
575 #ifdef CONFIG_SMP 575 #ifdef CONFIG_SMP
576 return rq->cpu; 576 return rq->cpu;
577 #else 577 #else
578 return 0; 578 return 0;
579 #endif 579 #endif
580 } 580 }
581 581
582 #define rcu_dereference_check_sched_domain(p) \ 582 #define rcu_dereference_check_sched_domain(p) \
583 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
584 rcu_read_lock_held() || \
585 lockdep_is_held(&sched_domains_mutex)) 584 lockdep_is_held(&sched_domains_mutex))
586 585
587 /* 586 /*
588 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 587 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
589 * See detach_destroy_domains: synchronize_sched for details. 588 * See detach_destroy_domains: synchronize_sched for details.
590 * 589 *
591 * The domain tree of any CPU may only be accessed from within 590 * The domain tree of any CPU may only be accessed from within
592 * preempt-disabled sections. 591 * preempt-disabled sections.
593 */ 592 */
594 #define for_each_domain(cpu, __sd) \ 593 #define for_each_domain(cpu, __sd) \
595 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 594 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
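A minimal usage sketch (the function count_domains is hypothetical, not from this diff) of why the explicit rcu_read_lock_held() term deleted above is no longer needed: rcu_dereference_check() already folds rcu_read_lock_held() into its condition, so an ordinary RCU read-side critical section satisfies the lockdep check, and holding sched_domains_mutex is the other accepted context.

    /* Hypothetical, illustrative only: walk a CPU's domain tree from an RCU
     * read-side critical section. Either rcu_read_lock() here or holding
     * sched_domains_mutex satisfies rcu_dereference_check_sched_domain(). */
    static int count_domains(int cpu)
    {
    	struct sched_domain *sd;
    	int levels = 0;

    	rcu_read_lock();
    	for_each_domain(cpu, sd)
    		levels++;
    	rcu_read_unlock();

    	return levels;
    }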
596 595
597 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 596 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
598 #define this_rq() (&__get_cpu_var(runqueues)) 597 #define this_rq() (&__get_cpu_var(runqueues))
599 #define task_rq(p) cpu_rq(task_cpu(p)) 598 #define task_rq(p) cpu_rq(task_cpu(p))
600 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 599 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
601 #define raw_rq() (&__raw_get_cpu_var(runqueues)) 600 #define raw_rq() (&__raw_get_cpu_var(runqueues))
602 601
603 #ifdef CONFIG_CGROUP_SCHED 602 #ifdef CONFIG_CGROUP_SCHED
604 603
605 /* 604 /*
606 * Return the group to which this task belongs. 605 * Return the group to which this task belongs.
607 * 606 *
608 * We use task_subsys_state_check() and extend the RCU verification with 607 * We use task_subsys_state_check() and extend the RCU verification with
609 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 608 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
610 * task it moves into the cgroup. Therefore by holding either of those locks, 609 * task it moves into the cgroup. Therefore by holding either of those locks,
611 * we pin the task to the current cgroup. 610 * we pin the task to the current cgroup.
612 */ 611 */
613 static inline struct task_group *task_group(struct task_struct *p) 612 static inline struct task_group *task_group(struct task_struct *p)
614 { 613 {
615 struct task_group *tg; 614 struct task_group *tg;
616 struct cgroup_subsys_state *css; 615 struct cgroup_subsys_state *css;
617 616
618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 617 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
619 lockdep_is_held(&p->pi_lock) || 618 lockdep_is_held(&p->pi_lock) ||
620 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&task_rq(p)->lock));
621 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
622 621
623 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
624 } 623 }
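For illustration of the pinning rule described above (a hypothetical snippet, not part of this diff; it assumes CONFIG_CGROUP_SCHED, as does the surrounding #ifdef block), holding p->pi_lock alone is enough to keep task_group(p) stable:

    /* Hypothetical, illustrative only: snapshot a task's group under pi_lock. */
    static struct task_group *task_group_snapshot(struct task_struct *p)
    {
    	struct task_group *tg;
    	unsigned long flags;

    	raw_spin_lock_irqsave(&p->pi_lock, flags);
    	tg = task_group(p);		/* stable while pi_lock is held */
    	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

    	return tg;	/* only a snapshot once the lock is dropped */
    }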
625 624
626 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 625 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
627 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 626 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
628 { 627 {
629 #ifdef CONFIG_FAIR_GROUP_SCHED 628 #ifdef CONFIG_FAIR_GROUP_SCHED
630 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 629 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
631 p->se.parent = task_group(p)->se[cpu]; 630 p->se.parent = task_group(p)->se[cpu];
632 #endif 631 #endif
633 632
634 #ifdef CONFIG_RT_GROUP_SCHED 633 #ifdef CONFIG_RT_GROUP_SCHED
635 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 634 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
636 p->rt.parent = task_group(p)->rt_se[cpu]; 635 p->rt.parent = task_group(p)->rt_se[cpu];
637 #endif 636 #endif
638 } 637 }
639 638
640 #else /* CONFIG_CGROUP_SCHED */ 639 #else /* CONFIG_CGROUP_SCHED */
641 640
642 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 641 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
643 static inline struct task_group *task_group(struct task_struct *p) 642 static inline struct task_group *task_group(struct task_struct *p)
644 { 643 {
645 return NULL; 644 return NULL;
646 } 645 }
647 646
648 #endif /* CONFIG_CGROUP_SCHED */ 647 #endif /* CONFIG_CGROUP_SCHED */
649 648
650 static void update_rq_clock_task(struct rq *rq, s64 delta); 649 static void update_rq_clock_task(struct rq *rq, s64 delta);
651 650
652 static void update_rq_clock(struct rq *rq) 651 static void update_rq_clock(struct rq *rq)
653 { 652 {
654 s64 delta; 653 s64 delta;
655 654
656 if (rq->skip_clock_update > 0) 655 if (rq->skip_clock_update > 0)
657 return; 656 return;
658 657
659 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
660 rq->clock += delta; 659 rq->clock += delta;
661 update_rq_clock_task(rq, delta); 660 update_rq_clock_task(rq, delta);
662 } 661 }
663 662
664 /* 663 /*
665 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 664 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
666 */ 665 */
667 #ifdef CONFIG_SCHED_DEBUG 666 #ifdef CONFIG_SCHED_DEBUG
668 # define const_debug __read_mostly 667 # define const_debug __read_mostly
669 #else 668 #else
670 # define const_debug static const 669 # define const_debug static const
671 #endif 670 #endif
672 671
673 /** 672 /**
674 * runqueue_is_locked - Returns true if the current cpu runqueue is locked 673 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
675 * @cpu: the processor in question. 674 * @cpu: the processor in question.
676 * 675 *
677 * This interface allows printk to be called with the runqueue lock 676 * This interface allows printk to be called with the runqueue lock
678 * held and know whether or not it is OK to wake up the klogd. 677 * held and know whether or not it is OK to wake up the klogd.
679 */ 678 */
680 int runqueue_is_locked(int cpu) 679 int runqueue_is_locked(int cpu)
681 { 680 {
682 return raw_spin_is_locked(&cpu_rq(cpu)->lock); 681 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
683 } 682 }
684 683
685 /* 684 /*
686 * Debugging: various feature bits 685 * Debugging: various feature bits
687 */ 686 */
688 687
689 #define SCHED_FEAT(name, enabled) \ 688 #define SCHED_FEAT(name, enabled) \
690 __SCHED_FEAT_##name , 689 __SCHED_FEAT_##name ,
691 690
692 enum { 691 enum {
693 #include "sched_features.h" 692 #include "sched_features.h"
694 }; 693 };
695 694
696 #undef SCHED_FEAT 695 #undef SCHED_FEAT
697 696
698 #define SCHED_FEAT(name, enabled) \ 697 #define SCHED_FEAT(name, enabled) \
699 (1UL << __SCHED_FEAT_##name) * enabled | 698 (1UL << __SCHED_FEAT_##name) * enabled |
700 699
701 const_debug unsigned int sysctl_sched_features = 700 const_debug unsigned int sysctl_sched_features =
702 #include "sched_features.h" 701 #include "sched_features.h"
703 0; 702 0;
704 703
705 #undef SCHED_FEAT 704 #undef SCHED_FEAT
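The SCHED_FEAT trickery above is a classic X-macro: sched_features.h is expanded twice with different definitions of SCHED_FEAT(), once to build an enum of bit indices and once to build the default bitmask. A stand-alone sketch of the same pattern (all names below are invented for illustration; this is not kernel code):

    /* Hypothetical, self-contained version of the pattern used above:
     * one feature list, expanded twice with different FEATURE() macros. */
    #define FEATURE_LIST \
    	FEATURE(GENTLE_SLEEPERS, 1) \
    	FEATURE(START_DEBIT, 0)

    #define FEATURE(name, enabled) __FEAT_##name,
    enum { FEATURE_LIST __FEAT_NR };	/* bit indices 0, 1, ... */
    #undef FEATURE

    #define FEATURE(name, enabled) ((1UL << __FEAT_##name) * (enabled)) |
    static const unsigned long default_features = FEATURE_LIST 0;	/* == 0x1 here */
    #undef FEATURE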
706 705
707 #ifdef CONFIG_SCHED_DEBUG 706 #ifdef CONFIG_SCHED_DEBUG
708 #define SCHED_FEAT(name, enabled) \ 707 #define SCHED_FEAT(name, enabled) \
709 #name , 708 #name ,
710 709
711 static __read_mostly char *sched_feat_names[] = { 710 static __read_mostly char *sched_feat_names[] = {
712 #include "sched_features.h" 711 #include "sched_features.h"
713 NULL 712 NULL
714 }; 713 };
715 714
716 #undef SCHED_FEAT 715 #undef SCHED_FEAT
717 716
718 static int sched_feat_show(struct seq_file *m, void *v) 717 static int sched_feat_show(struct seq_file *m, void *v)
719 { 718 {
720 int i; 719 int i;
721 720
722 for (i = 0; sched_feat_names[i]; i++) { 721 for (i = 0; sched_feat_names[i]; i++) {
723 if (!(sysctl_sched_features & (1UL << i))) 722 if (!(sysctl_sched_features & (1UL << i)))
724 seq_puts(m, "NO_"); 723 seq_puts(m, "NO_");
725 seq_printf(m, "%s ", sched_feat_names[i]); 724 seq_printf(m, "%s ", sched_feat_names[i]);
726 } 725 }
727 seq_puts(m, "\n"); 726 seq_puts(m, "\n");
728 727
729 return 0; 728 return 0;
730 } 729 }
731 730
732 static ssize_t 731 static ssize_t
733 sched_feat_write(struct file *filp, const char __user *ubuf, 732 sched_feat_write(struct file *filp, const char __user *ubuf,
734 size_t cnt, loff_t *ppos) 733 size_t cnt, loff_t *ppos)
735 { 734 {
736 char buf[64]; 735 char buf[64];
737 char *cmp; 736 char *cmp;
738 int neg = 0; 737 int neg = 0;
739 int i; 738 int i;
740 739
741 if (cnt > 63) 740 if (cnt > 63)
742 cnt = 63; 741 cnt = 63;
743 742
744 if (copy_from_user(&buf, ubuf, cnt)) 743 if (copy_from_user(&buf, ubuf, cnt))
745 return -EFAULT; 744 return -EFAULT;
746 745
747 buf[cnt] = 0; 746 buf[cnt] = 0;
748 cmp = strstrip(buf); 747 cmp = strstrip(buf);
749 748
750 if (strncmp(cmp, "NO_", 3) == 0) { 749 if (strncmp(cmp, "NO_", 3) == 0) {
751 neg = 1; 750 neg = 1;
752 cmp += 3; 751 cmp += 3;
753 } 752 }
754 753
755 for (i = 0; sched_feat_names[i]; i++) { 754 for (i = 0; sched_feat_names[i]; i++) {
756 if (strcmp(cmp, sched_feat_names[i]) == 0) { 755 if (strcmp(cmp, sched_feat_names[i]) == 0) {
757 if (neg) 756 if (neg)
758 sysctl_sched_features &= ~(1UL << i); 757 sysctl_sched_features &= ~(1UL << i);
759 else 758 else
760 sysctl_sched_features |= (1UL << i); 759 sysctl_sched_features |= (1UL << i);
761 break; 760 break;
762 } 761 }
763 } 762 }
764 763
765 if (!sched_feat_names[i]) 764 if (!sched_feat_names[i])
766 return -EINVAL; 765 return -EINVAL;
767 766
768 *ppos += cnt; 767 *ppos += cnt;
769 768
770 return cnt; 769 return cnt;
771 } 770 }
772 771
773 static int sched_feat_open(struct inode *inode, struct file *filp) 772 static int sched_feat_open(struct inode *inode, struct file *filp)
774 { 773 {
775 return single_open(filp, sched_feat_show, NULL); 774 return single_open(filp, sched_feat_show, NULL);
776 } 775 }
777 776
778 static const struct file_operations sched_feat_fops = { 777 static const struct file_operations sched_feat_fops = {
779 .open = sched_feat_open, 778 .open = sched_feat_open,
780 .write = sched_feat_write, 779 .write = sched_feat_write,
781 .read = seq_read, 780 .read = seq_read,
782 .llseek = seq_lseek, 781 .llseek = seq_lseek,
783 .release = single_release, 782 .release = single_release,
784 }; 783 };
785 784
786 static __init int sched_init_debug(void) 785 static __init int sched_init_debug(void)
787 { 786 {
788 debugfs_create_file("sched_features", 0644, NULL, NULL, 787 debugfs_create_file("sched_features", 0644, NULL, NULL,
789 &sched_feat_fops); 788 &sched_feat_fops);
790 789
791 return 0; 790 return 0;
792 } 791 }
793 late_initcall(sched_init_debug); 792 late_initcall(sched_init_debug);
794 793
795 #endif 794 #endif
796 795
797 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 796 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
798 797
799 /* 798 /*
800 * Number of tasks to iterate in a single balance run. 799 * Number of tasks to iterate in a single balance run.
801 * Limited because this is done with IRQs disabled. 800 * Limited because this is done with IRQs disabled.
802 */ 801 */
803 const_debug unsigned int sysctl_sched_nr_migrate = 32; 802 const_debug unsigned int sysctl_sched_nr_migrate = 32;
804 803
805 /* 804 /*
806 * period over which we average the RT time consumption, measured 805 * period over which we average the RT time consumption, measured
807 * in ms. 806 * in ms.
808 * 807 *
809 * default: 1s 808 * default: 1s
810 */ 809 */
811 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 810 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
812 811
813 /* 812 /*
814 * period over which we measure -rt task cpu usage in us. 813 * period over which we measure -rt task cpu usage in us.
815 * default: 1s 814 * default: 1s
816 */ 815 */
817 unsigned int sysctl_sched_rt_period = 1000000; 816 unsigned int sysctl_sched_rt_period = 1000000;
818 817
819 static __read_mostly int scheduler_running; 818 static __read_mostly int scheduler_running;
820 819
821 /* 820 /*
822 * part of the period that we allow rt tasks to run in us. 821 * part of the period that we allow rt tasks to run in us.
823 * default: 0.95s 822 * default: 0.95s
824 */ 823 */
825 int sysctl_sched_rt_runtime = 950000; 824 int sysctl_sched_rt_runtime = 950000;
826 825
827 static inline u64 global_rt_period(void) 826 static inline u64 global_rt_period(void)
828 { 827 {
829 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 828 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
830 } 829 }
831 830
832 static inline u64 global_rt_runtime(void) 831 static inline u64 global_rt_runtime(void)
833 { 832 {
834 if (sysctl_sched_rt_runtime < 0) 833 if (sysctl_sched_rt_runtime < 0)
835 return RUNTIME_INF; 834 return RUNTIME_INF;
836 835
837 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 836 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
838 } 837 }
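With the defaults above, the arithmetic works out as follows (illustrative only; NSEC_PER_USEC is 1000):

    /* Worked example of the default rt bandwidth settings:
     *	global_rt_period()  == 1000000 * 1000 ns == 1 s
     *	global_rt_runtime() ==  950000 * 1000 ns == 0.95 s
     * i.e. realtime tasks may consume at most 95% of every 1 s period,
     * and a negative sysctl_sched_rt_runtime yields RUNTIME_INF (no limit).
     */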
839 838
840 #ifndef prepare_arch_switch 839 #ifndef prepare_arch_switch
841 # define prepare_arch_switch(next) do { } while (0) 840 # define prepare_arch_switch(next) do { } while (0)
842 #endif 841 #endif
843 #ifndef finish_arch_switch 842 #ifndef finish_arch_switch
844 # define finish_arch_switch(prev) do { } while (0) 843 # define finish_arch_switch(prev) do { } while (0)
845 #endif 844 #endif
846 845
847 static inline int task_current(struct rq *rq, struct task_struct *p) 846 static inline int task_current(struct rq *rq, struct task_struct *p)
848 { 847 {
849 return rq->curr == p; 848 return rq->curr == p;
850 } 849 }
851 850
852 static inline int task_running(struct rq *rq, struct task_struct *p) 851 static inline int task_running(struct rq *rq, struct task_struct *p)
853 { 852 {
854 #ifdef CONFIG_SMP 853 #ifdef CONFIG_SMP
855 return p->on_cpu; 854 return p->on_cpu;
856 #else 855 #else
857 return task_current(rq, p); 856 return task_current(rq, p);
858 #endif 857 #endif
859 } 858 }
860 859
861 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 860 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
862 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
863 { 862 {
864 #ifdef CONFIG_SMP 863 #ifdef CONFIG_SMP
865 /* 864 /*
866 * We can optimise this out completely for !SMP, because the 865 * We can optimise this out completely for !SMP, because the
867 * SMP rebalancing from interrupt is the only thing that cares 866 * SMP rebalancing from interrupt is the only thing that cares
868 * here. 867 * here.
869 */ 868 */
870 next->on_cpu = 1; 869 next->on_cpu = 1;
871 #endif 870 #endif
872 } 871 }
873 872
874 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
875 { 874 {
876 #ifdef CONFIG_SMP 875 #ifdef CONFIG_SMP
877 /* 876 /*
878 * After ->on_cpu is cleared, the task can be moved to a different CPU. 877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
879 * We must ensure this doesn't happen until the switch is completely 878 * We must ensure this doesn't happen until the switch is completely
880 * finished. 879 * finished.
881 */ 880 */
882 smp_wmb(); 881 smp_wmb();
883 prev->on_cpu = 0; 882 prev->on_cpu = 0;
884 #endif 883 #endif
885 #ifdef CONFIG_DEBUG_SPINLOCK 884 #ifdef CONFIG_DEBUG_SPINLOCK
886 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
887 rq->lock.owner = current; 886 rq->lock.owner = current;
888 #endif 887 #endif
889 /* 888 /*
890 * If we are tracking spinlock dependencies then we have to 889 * If we are tracking spinlock dependencies then we have to
891 * fix up the runqueue lock - which gets 'carried over' from 890 * fix up the runqueue lock - which gets 'carried over' from
892 * prev into current: 891 * prev into current:
893 */ 892 */
894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
895 894
896 raw_spin_unlock_irq(&rq->lock); 895 raw_spin_unlock_irq(&rq->lock);
897 } 896 }
898 897
899 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
900 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
901 { 900 {
902 #ifdef CONFIG_SMP 901 #ifdef CONFIG_SMP
903 /* 902 /*
904 * We can optimise this out completely for !SMP, because the 903 * We can optimise this out completely for !SMP, because the
905 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
906 * here. 905 * here.
907 */ 906 */
908 next->on_cpu = 1; 907 next->on_cpu = 1;
909 #endif 908 #endif
910 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
911 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
912 #else 911 #else
913 raw_spin_unlock(&rq->lock); 912 raw_spin_unlock(&rq->lock);
914 #endif 913 #endif
915 } 914 }
916 915
917 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 916 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
918 { 917 {
919 #ifdef CONFIG_SMP 918 #ifdef CONFIG_SMP
920 /* 919 /*
921 * After ->on_cpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
922 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
923 * finished. 922 * finished.
924 */ 923 */
925 smp_wmb(); 924 smp_wmb();
926 prev->on_cpu = 0; 925 prev->on_cpu = 0;
927 #endif 926 #endif
928 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
929 local_irq_enable(); 928 local_irq_enable();
930 #endif 929 #endif
931 } 930 }
932 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
933 932
934 /* 933 /*
935 * __task_rq_lock - lock the rq @p resides on. 934 * __task_rq_lock - lock the rq @p resides on.
936 */ 935 */
937 static inline struct rq *__task_rq_lock(struct task_struct *p) 936 static inline struct rq *__task_rq_lock(struct task_struct *p)
938 __acquires(rq->lock) 937 __acquires(rq->lock)
939 { 938 {
940 struct rq *rq; 939 struct rq *rq;
941 940
942 lockdep_assert_held(&p->pi_lock); 941 lockdep_assert_held(&p->pi_lock);
943 942
944 for (;;) { 943 for (;;) {
945 rq = task_rq(p); 944 rq = task_rq(p);
946 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
947 if (likely(rq == task_rq(p))) 946 if (likely(rq == task_rq(p)))
948 return rq; 947 return rq;
949 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
950 } 949 }
951 } 950 }
952 951
953 /* 952 /*
954 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
955 */ 954 */
956 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 __acquires(p->pi_lock) 956 __acquires(p->pi_lock)
958 __acquires(rq->lock) 957 __acquires(rq->lock)
959 { 958 {
960 struct rq *rq; 959 struct rq *rq;
961 960
962 for (;;) { 961 for (;;) {
963 raw_spin_lock_irqsave(&p->pi_lock, *flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
964 rq = task_rq(p); 963 rq = task_rq(p);
965 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
966 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
967 return rq; 966 return rq;
968 raw_spin_unlock(&rq->lock); 967 raw_spin_unlock(&rq->lock);
969 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
970 } 969 }
971 } 970 }
972 971
973 static void __task_rq_unlock(struct rq *rq) 972 static void __task_rq_unlock(struct rq *rq)
974 __releases(rq->lock) 973 __releases(rq->lock)
975 { 974 {
976 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
977 } 976 }
978 977
979 static inline void 978 static inline void
980 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 979 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
981 __releases(rq->lock) 980 __releases(rq->lock)
982 __releases(p->pi_lock) 981 __releases(p->pi_lock)
983 { 982 {
984 raw_spin_unlock(&rq->lock); 983 raw_spin_unlock(&rq->lock);
985 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
986 } 985 }
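Taken together, a caller pairs these lock/unlock helpers as in the hypothetical snippet below (poke_task is an invented name, illustrative only): between the two calls, p->pi_lock and the task's rq->lock are both held with interrupts disabled, so the task cannot change runqueues underneath the caller.

    /* Hypothetical usage sketch, not part of this diff. */
    static void poke_task(struct task_struct *p)
    {
    	unsigned long flags;
    	struct rq *rq;

    	rq = task_rq_lock(p, &flags);
    	update_rq_clock(rq);		/* safe: rq->lock is held */
    	/* ... inspect or requeue p here ... */
    	task_rq_unlock(rq, p, &flags);
    }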
987 986
988 /* 987 /*
989 * this_rq_lock - lock this runqueue and disable interrupts. 988 * this_rq_lock - lock this runqueue and disable interrupts.
990 */ 989 */
991 static struct rq *this_rq_lock(void) 990 static struct rq *this_rq_lock(void)
992 __acquires(rq->lock) 991 __acquires(rq->lock)
993 { 992 {
994 struct rq *rq; 993 struct rq *rq;
995 994
996 local_irq_disable(); 995 local_irq_disable();
997 rq = this_rq(); 996 rq = this_rq();
998 raw_spin_lock(&rq->lock); 997 raw_spin_lock(&rq->lock);
999 998
1000 return rq; 999 return rq;
1001 } 1000 }
1002 1001
1003 #ifdef CONFIG_SCHED_HRTICK 1002 #ifdef CONFIG_SCHED_HRTICK
1004 /* 1003 /*
1005 * Use HR-timers to deliver accurate preemption points. 1004 * Use HR-timers to deliver accurate preemption points.
1006 * 1005 *
1007 * It's all a bit involved since we cannot program an hrtimer while holding 1006 * It's all a bit involved since we cannot program an hrtimer while holding
1008 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 1007 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
1009 * reschedule event. 1008 * reschedule event.
1010 * 1009 *
1011 * When we get rescheduled we reprogram the hrtick_timer outside of the 1010 * When we get rescheduled we reprogram the hrtick_timer outside of the
1012 * rq->lock. 1011 * rq->lock.
1013 */ 1012 */
1014 1013
1015 /* 1014 /*
1016 * Use hrtick when: 1015 * Use hrtick when:
1017 * - enabled by features 1016 * - enabled by features
1018 * - hrtimer is actually high res 1017 * - hrtimer is actually high res
1019 */ 1018 */
1020 static inline int hrtick_enabled(struct rq *rq) 1019 static inline int hrtick_enabled(struct rq *rq)
1021 { 1020 {
1022 if (!sched_feat(HRTICK)) 1021 if (!sched_feat(HRTICK))
1023 return 0; 1022 return 0;
1024 if (!cpu_active(cpu_of(rq))) 1023 if (!cpu_active(cpu_of(rq)))
1025 return 0; 1024 return 0;
1026 return hrtimer_is_hres_active(&rq->hrtick_timer); 1025 return hrtimer_is_hres_active(&rq->hrtick_timer);
1027 } 1026 }
1028 1027
1029 static void hrtick_clear(struct rq *rq) 1028 static void hrtick_clear(struct rq *rq)
1030 { 1029 {
1031 if (hrtimer_active(&rq->hrtick_timer)) 1030 if (hrtimer_active(&rq->hrtick_timer))
1032 hrtimer_cancel(&rq->hrtick_timer); 1031 hrtimer_cancel(&rq->hrtick_timer);
1033 } 1032 }
1034 1033
1035 /* 1034 /*
1036 * High-resolution timer tick. 1035 * High-resolution timer tick.
1037 * Runs from hardirq context with interrupts disabled. 1036 * Runs from hardirq context with interrupts disabled.
1038 */ 1037 */
1039 static enum hrtimer_restart hrtick(struct hrtimer *timer) 1038 static enum hrtimer_restart hrtick(struct hrtimer *timer)
1040 { 1039 {
1041 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 1040 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1042 1041
1043 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1042 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1044 1043
1045 raw_spin_lock(&rq->lock); 1044 raw_spin_lock(&rq->lock);
1046 update_rq_clock(rq); 1045 update_rq_clock(rq);
1047 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1046 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1048 raw_spin_unlock(&rq->lock); 1047 raw_spin_unlock(&rq->lock);
1049 1048
1050 return HRTIMER_NORESTART; 1049 return HRTIMER_NORESTART;
1051 } 1050 }
1052 1051
1053 #ifdef CONFIG_SMP 1052 #ifdef CONFIG_SMP
1054 /* 1053 /*
1055 * called from hardirq (IPI) context 1054 * called from hardirq (IPI) context
1056 */ 1055 */
1057 static void __hrtick_start(void *arg) 1056 static void __hrtick_start(void *arg)
1058 { 1057 {
1059 struct rq *rq = arg; 1058 struct rq *rq = arg;
1060 1059
1061 raw_spin_lock(&rq->lock); 1060 raw_spin_lock(&rq->lock);
1062 hrtimer_restart(&rq->hrtick_timer); 1061 hrtimer_restart(&rq->hrtick_timer);
1063 rq->hrtick_csd_pending = 0; 1062 rq->hrtick_csd_pending = 0;
1064 raw_spin_unlock(&rq->lock); 1063 raw_spin_unlock(&rq->lock);
1065 } 1064 }
1066 1065
1067 /* 1066 /*
1068 * Called to set the hrtick timer state. 1067 * Called to set the hrtick timer state.
1069 * 1068 *
1070 * called with rq->lock held and irqs disabled 1069 * called with rq->lock held and irqs disabled
1071 */ 1070 */
1072 static void hrtick_start(struct rq *rq, u64 delay) 1071 static void hrtick_start(struct rq *rq, u64 delay)
1073 { 1072 {
1074 struct hrtimer *timer = &rq->hrtick_timer; 1073 struct hrtimer *timer = &rq->hrtick_timer;
1075 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1074 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1076 1075
1077 hrtimer_set_expires(timer, time); 1076 hrtimer_set_expires(timer, time);
1078 1077
1079 if (rq == this_rq()) { 1078 if (rq == this_rq()) {
1080 hrtimer_restart(timer); 1079 hrtimer_restart(timer);
1081 } else if (!rq->hrtick_csd_pending) { 1080 } else if (!rq->hrtick_csd_pending) {
1082 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 1081 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1083 rq->hrtick_csd_pending = 1; 1082 rq->hrtick_csd_pending = 1;
1084 } 1083 }
1085 } 1084 }
1086 1085
1087 static int 1086 static int
1088 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 1087 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1089 { 1088 {
1090 int cpu = (int)(long)hcpu; 1089 int cpu = (int)(long)hcpu;
1091 1090
1092 switch (action) { 1091 switch (action) {
1093 case CPU_UP_CANCELED: 1092 case CPU_UP_CANCELED:
1094 case CPU_UP_CANCELED_FROZEN: 1093 case CPU_UP_CANCELED_FROZEN:
1095 case CPU_DOWN_PREPARE: 1094 case CPU_DOWN_PREPARE:
1096 case CPU_DOWN_PREPARE_FROZEN: 1095 case CPU_DOWN_PREPARE_FROZEN:
1097 case CPU_DEAD: 1096 case CPU_DEAD:
1098 case CPU_DEAD_FROZEN: 1097 case CPU_DEAD_FROZEN:
1099 hrtick_clear(cpu_rq(cpu)); 1098 hrtick_clear(cpu_rq(cpu));
1100 return NOTIFY_OK; 1099 return NOTIFY_OK;
1101 } 1100 }
1102 1101
1103 return NOTIFY_DONE; 1102 return NOTIFY_DONE;
1104 } 1103 }
1105 1104
1106 static __init void init_hrtick(void) 1105 static __init void init_hrtick(void)
1107 { 1106 {
1108 hotcpu_notifier(hotplug_hrtick, 0); 1107 hotcpu_notifier(hotplug_hrtick, 0);
1109 } 1108 }
1110 #else 1109 #else
1111 /* 1110 /*
1112 * Called to set the hrtick timer state. 1111 * Called to set the hrtick timer state.
1113 * 1112 *
1114 * called with rq->lock held and irqs disabled 1113 * called with rq->lock held and irqs disabled
1115 */ 1114 */
1116 static void hrtick_start(struct rq *rq, u64 delay) 1115 static void hrtick_start(struct rq *rq, u64 delay)
1117 { 1116 {
1118 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1117 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1119 HRTIMER_MODE_REL_PINNED, 0); 1118 HRTIMER_MODE_REL_PINNED, 0);
1120 } 1119 }
1121 1120
1122 static inline void init_hrtick(void) 1121 static inline void init_hrtick(void)
1123 { 1122 {
1124 } 1123 }
1125 #endif /* CONFIG_SMP */ 1124 #endif /* CONFIG_SMP */
1126 1125
1127 static void init_rq_hrtick(struct rq *rq) 1126 static void init_rq_hrtick(struct rq *rq)
1128 { 1127 {
1129 #ifdef CONFIG_SMP 1128 #ifdef CONFIG_SMP
1130 rq->hrtick_csd_pending = 0; 1129 rq->hrtick_csd_pending = 0;
1131 1130
1132 rq->hrtick_csd.flags = 0; 1131 rq->hrtick_csd.flags = 0;
1133 rq->hrtick_csd.func = __hrtick_start; 1132 rq->hrtick_csd.func = __hrtick_start;
1134 rq->hrtick_csd.info = rq; 1133 rq->hrtick_csd.info = rq;
1135 #endif 1134 #endif
1136 1135
1137 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1136 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1138 rq->hrtick_timer.function = hrtick; 1137 rq->hrtick_timer.function = hrtick;
1139 } 1138 }
1140 #else /* CONFIG_SCHED_HRTICK */ 1139 #else /* CONFIG_SCHED_HRTICK */
1141 static inline void hrtick_clear(struct rq *rq) 1140 static inline void hrtick_clear(struct rq *rq)
1142 { 1141 {
1143 } 1142 }
1144 1143
1145 static inline void init_rq_hrtick(struct rq *rq) 1144 static inline void init_rq_hrtick(struct rq *rq)
1146 { 1145 {
1147 } 1146 }
1148 1147
1149 static inline void init_hrtick(void) 1148 static inline void init_hrtick(void)
1150 { 1149 {
1151 } 1150 }
1152 #endif /* CONFIG_SCHED_HRTICK */ 1151 #endif /* CONFIG_SCHED_HRTICK */
1153 1152
1154 /* 1153 /*
1155 * resched_task - mark a task 'to be rescheduled now'. 1154 * resched_task - mark a task 'to be rescheduled now'.
1156 * 1155 *
1157 * On UP this means the setting of the need_resched flag, on SMP it 1156 * On UP this means the setting of the need_resched flag, on SMP it
1158 * might also involve a cross-CPU call to trigger the scheduler on 1157 * might also involve a cross-CPU call to trigger the scheduler on
1159 * the target CPU. 1158 * the target CPU.
1160 */ 1159 */
1161 #ifdef CONFIG_SMP 1160 #ifdef CONFIG_SMP
1162 1161
1163 #ifndef tsk_is_polling 1162 #ifndef tsk_is_polling
1164 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1163 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1165 #endif 1164 #endif
1166 1165
1167 static void resched_task(struct task_struct *p) 1166 static void resched_task(struct task_struct *p)
1168 { 1167 {
1169 int cpu; 1168 int cpu;
1170 1169
1171 assert_raw_spin_locked(&task_rq(p)->lock); 1170 assert_raw_spin_locked(&task_rq(p)->lock);
1172 1171
1173 if (test_tsk_need_resched(p)) 1172 if (test_tsk_need_resched(p))
1174 return; 1173 return;
1175 1174
1176 set_tsk_need_resched(p); 1175 set_tsk_need_resched(p);
1177 1176
1178 cpu = task_cpu(p); 1177 cpu = task_cpu(p);
1179 if (cpu == smp_processor_id()) 1178 if (cpu == smp_processor_id())
1180 return; 1179 return;
1181 1180
1182 /* NEED_RESCHED must be visible before we test polling */ 1181 /* NEED_RESCHED must be visible before we test polling */
1183 smp_mb(); 1182 smp_mb();
1184 if (!tsk_is_polling(p)) 1183 if (!tsk_is_polling(p))
1185 smp_send_reschedule(cpu); 1184 smp_send_reschedule(cpu);
1186 } 1185 }
1187 1186
1188 static void resched_cpu(int cpu) 1187 static void resched_cpu(int cpu)
1189 { 1188 {
1190 struct rq *rq = cpu_rq(cpu); 1189 struct rq *rq = cpu_rq(cpu);
1191 unsigned long flags; 1190 unsigned long flags;
1192 1191
1193 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 1192 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1194 return; 1193 return;
1195 resched_task(cpu_curr(cpu)); 1194 resched_task(cpu_curr(cpu));
1196 raw_spin_unlock_irqrestore(&rq->lock, flags); 1195 raw_spin_unlock_irqrestore(&rq->lock, flags);
1197 } 1196 }
1198 1197
1199 #ifdef CONFIG_NO_HZ 1198 #ifdef CONFIG_NO_HZ
1200 /* 1199 /*
1201 * In the semi idle case, use the nearest busy cpu for migrating timers 1200 * In the semi idle case, use the nearest busy cpu for migrating timers
1202 * from an idle cpu. This is good for power-savings. 1201 * from an idle cpu. This is good for power-savings.
1203 * 1202 *
1204 * We don't do a similar optimization for a completely idle system, as 1203 * We don't do a similar optimization for a completely idle system, as
1205 * selecting an idle cpu will add more delays to the timers than intended 1204 * selecting an idle cpu will add more delays to the timers than intended
1206 * (as that cpu's timer base may not be up to date wrt jiffies etc). 1205 * (as that cpu's timer base may not be up to date wrt jiffies etc).
1207 */ 1206 */
1208 int get_nohz_timer_target(void) 1207 int get_nohz_timer_target(void)
1209 { 1208 {
1210 int cpu = smp_processor_id(); 1209 int cpu = smp_processor_id();
1211 int i; 1210 int i;
1212 struct sched_domain *sd; 1211 struct sched_domain *sd;
1213 1212
1214 rcu_read_lock(); 1213 rcu_read_lock();
1215 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1216 for_each_cpu(i, sched_domain_span(sd)) { 1215 for_each_cpu(i, sched_domain_span(sd)) {
1217 if (!idle_cpu(i)) { 1216 if (!idle_cpu(i)) {
1218 cpu = i; 1217 cpu = i;
1219 goto unlock; 1218 goto unlock;
1220 } 1219 }
1221 } 1220 }
1222 } 1221 }
1223 unlock: 1222 unlock:
1224 rcu_read_unlock(); 1223 rcu_read_unlock();
1225 return cpu; 1224 return cpu;
1226 } 1225 }
1227 /* 1226 /*
1228 * When add_timer_on() enqueues a timer into the timer wheel of an 1227 * When add_timer_on() enqueues a timer into the timer wheel of an
1229 * idle CPU then this timer might expire before the next timer event 1228 * idle CPU then this timer might expire before the next timer event
1230 * which is scheduled to wake up that CPU. In case of a completely 1229 * which is scheduled to wake up that CPU. In case of a completely
1231 * idle system the next event might even be infinite time into the 1230 * idle system the next event might even be infinite time into the
1232 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1231 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1233 * leaves the inner idle loop so the newly added timer is taken into 1232 * leaves the inner idle loop so the newly added timer is taken into
1234 * account when the CPU goes back to idle and evaluates the timer 1233 * account when the CPU goes back to idle and evaluates the timer
1235 * wheel for the next timer event. 1234 * wheel for the next timer event.
1236 */ 1235 */
1237 void wake_up_idle_cpu(int cpu) 1236 void wake_up_idle_cpu(int cpu)
1238 { 1237 {
1239 struct rq *rq = cpu_rq(cpu); 1238 struct rq *rq = cpu_rq(cpu);
1240 1239
1241 if (cpu == smp_processor_id()) 1240 if (cpu == smp_processor_id())
1242 return; 1241 return;
1243 1242
1244 /* 1243 /*
1245 * This is safe, as this function is called with the timer 1244 * This is safe, as this function is called with the timer
1246 * wheel base lock of (cpu) held. When the CPU is on the way 1245 * wheel base lock of (cpu) held. When the CPU is on the way
1247 * to idle and has not yet set rq->curr to idle then it will 1246 * to idle and has not yet set rq->curr to idle then it will
1248 * be serialized on the timer wheel base lock and take the new 1247 * be serialized on the timer wheel base lock and take the new
1249 * timer into account automatically. 1248 * timer into account automatically.
1250 */ 1249 */
1251 if (rq->curr != rq->idle) 1250 if (rq->curr != rq->idle)
1252 return; 1251 return;
1253 1252
1254 /* 1253 /*
1255 * We can set TIF_RESCHED on the idle task of the other CPU 1254 * We can set TIF_RESCHED on the idle task of the other CPU
1256 * lockless. The worst case is that the other CPU runs the 1255 * lockless. The worst case is that the other CPU runs the
1257 * idle task through an additional NOOP schedule() 1256 * idle task through an additional NOOP schedule()
1258 */ 1257 */
1259 set_tsk_need_resched(rq->idle); 1258 set_tsk_need_resched(rq->idle);
1260 1259
1261 /* NEED_RESCHED must be visible before we test polling */ 1260 /* NEED_RESCHED must be visible before we test polling */
1262 smp_mb(); 1261 smp_mb();
1263 if (!tsk_is_polling(rq->idle)) 1262 if (!tsk_is_polling(rq->idle))
1264 smp_send_reschedule(cpu); 1263 smp_send_reschedule(cpu);
1265 } 1264 }
1266 1265
1267 #endif /* CONFIG_NO_HZ */ 1266 #endif /* CONFIG_NO_HZ */
1268 1267
1269 static u64 sched_avg_period(void) 1268 static u64 sched_avg_period(void)
1270 { 1269 {
1271 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1270 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1272 } 1271 }
1273 1272
1274 static void sched_avg_update(struct rq *rq) 1273 static void sched_avg_update(struct rq *rq)
1275 { 1274 {
1276 s64 period = sched_avg_period(); 1275 s64 period = sched_avg_period();
1277 1276
1278 while ((s64)(rq->clock - rq->age_stamp) > period) { 1277 while ((s64)(rq->clock - rq->age_stamp) > period) {
1279 /* 1278 /*
1280 * Inline assembly required to prevent the compiler 1279 * Inline assembly required to prevent the compiler
1281 * optimising this loop into a divmod call. 1280 * optimising this loop into a divmod call.
1282 * See __iter_div_u64_rem() for another example of this. 1281 * See __iter_div_u64_rem() for another example of this.
1283 */ 1282 */
1284 asm("" : "+rm" (rq->age_stamp)); 1283 asm("" : "+rm" (rq->age_stamp));
1285 rq->age_stamp += period; 1284 rq->age_stamp += period;
1286 rq->rt_avg /= 2; 1285 rq->rt_avg /= 2;
1287 } 1286 }
1288 } 1287 }
1289 1288
1290 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1289 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1291 { 1290 {
1292 rq->rt_avg += rt_delta; 1291 rq->rt_avg += rt_delta;
1293 sched_avg_update(rq); 1292 sched_avg_update(rq);
1294 } 1293 }
1295 1294
1296 #else /* !CONFIG_SMP */ 1295 #else /* !CONFIG_SMP */
1297 static void resched_task(struct task_struct *p) 1296 static void resched_task(struct task_struct *p)
1298 { 1297 {
1299 assert_raw_spin_locked(&task_rq(p)->lock); 1298 assert_raw_spin_locked(&task_rq(p)->lock);
1300 set_tsk_need_resched(p); 1299 set_tsk_need_resched(p);
1301 } 1300 }
1302 1301
1303 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1302 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1304 { 1303 {
1305 } 1304 }
1306 1305
1307 static void sched_avg_update(struct rq *rq) 1306 static void sched_avg_update(struct rq *rq)
1308 { 1307 {
1309 } 1308 }
1310 #endif /* CONFIG_SMP */ 1309 #endif /* CONFIG_SMP */
1311 1310
1312 #if BITS_PER_LONG == 32 1311 #if BITS_PER_LONG == 32
1313 # define WMULT_CONST (~0UL) 1312 # define WMULT_CONST (~0UL)
1314 #else 1313 #else
1315 # define WMULT_CONST (1UL << 32) 1314 # define WMULT_CONST (1UL << 32)
1316 #endif 1315 #endif
1317 1316
1318 #define WMULT_SHIFT 32 1317 #define WMULT_SHIFT 32
1319 1318
1320 /* 1319 /*
1321 * Shift right and round: 1320 * Shift right and round:
1322 */ 1321 */
1323 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1322 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1324 1323
1325 /* 1324 /*
1326 * delta *= weight / lw 1325 * delta *= weight / lw
1327 */ 1326 */
1328 static unsigned long 1327 static unsigned long
1329 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1328 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1330 struct load_weight *lw) 1329 struct load_weight *lw)
1331 { 1330 {
1332 u64 tmp; 1331 u64 tmp;
1333 1332
1334 /* 1333 /*
1335 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched 1334 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1336 * entities since MIN_SHARES = 2. Treat weight as 1 if less than 1335 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1337 * 2^SCHED_LOAD_RESOLUTION. 1336 * 2^SCHED_LOAD_RESOLUTION.
1338 */ 1337 */
1339 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) 1338 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1340 tmp = (u64)delta_exec * scale_load_down(weight); 1339 tmp = (u64)delta_exec * scale_load_down(weight);
1341 else 1340 else
1342 tmp = (u64)delta_exec; 1341 tmp = (u64)delta_exec;
1343 1342
1344 if (!lw->inv_weight) { 1343 if (!lw->inv_weight) {
1345 unsigned long w = scale_load_down(lw->weight); 1344 unsigned long w = scale_load_down(lw->weight);
1346 1345
1347 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 1346 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1348 lw->inv_weight = 1; 1347 lw->inv_weight = 1;
1349 else if (unlikely(!w)) 1348 else if (unlikely(!w))
1350 lw->inv_weight = WMULT_CONST; 1349 lw->inv_weight = WMULT_CONST;
1351 else 1350 else
1352 lw->inv_weight = WMULT_CONST / w; 1351 lw->inv_weight = WMULT_CONST / w;
1353 } 1352 }
1354 1353
1355 /* 1354 /*
1356 * Check whether we'd overflow the 64-bit multiplication: 1355 * Check whether we'd overflow the 64-bit multiplication:
1357 */ 1356 */
1358 if (unlikely(tmp > WMULT_CONST)) 1357 if (unlikely(tmp > WMULT_CONST))
1359 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1358 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1360 WMULT_SHIFT/2); 1359 WMULT_SHIFT/2);
1361 else 1360 else
1362 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1361 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1363 1362
1364 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1365 } 1364 }
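
For reference, a standalone user-space sketch (not kernel code) of the fixed-point trick used by calc_delta_mine() above: delta_exec * weight / lw->weight is evaluated by multiplying with the precomputed inverse 2^32 / lw->weight and rounding with SRR(). The concrete numbers (4 ms of runtime, a runqueue holding three nice-0 tasks) are invented for illustration, and the SCHED_LOAD_RESOLUTION scaling and the overflow branch are ignored.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the calc_delta_mine() arithmetic: delta_exec * weight / lw_weight
 * computed as a multiplication by the precomputed inverse 2^32 / lw_weight,
 * rounded as SRR() does.
 */
int main(void)
{
        uint64_t delta_exec = 4000000;  /* 4 ms of runtime, in ns (example value) */
        uint64_t weight = 1024;         /* one nice-0 task */
        uint64_t lw_weight = 3072;      /* runqueue load: three nice-0 tasks */

        uint64_t inv = (1ULL << 32) / lw_weight;                /* lw->inv_weight */
        uint64_t tmp = delta_exec * weight;
        uint64_t scaled = (tmp * inv + (1ULL << 31)) >> 32;     /* SRR(tmp * inv, 32) */

        printf("exact : %llu\n", (unsigned long long)(tmp / lw_weight));
        printf("scaled: %llu\n", (unsigned long long)scaled);
        /* both print 1333333, i.e. roughly 1.33 ms */
        return 0;
}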
1366 1365
1367 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1366 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1368 { 1367 {
1369 lw->weight += inc; 1368 lw->weight += inc;
1370 lw->inv_weight = 0; 1369 lw->inv_weight = 0;
1371 } 1370 }
1372 1371
1373 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1372 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1374 { 1373 {
1375 lw->weight -= dec; 1374 lw->weight -= dec;
1376 lw->inv_weight = 0; 1375 lw->inv_weight = 0;
1377 } 1376 }
1378 1377
1379 static inline void update_load_set(struct load_weight *lw, unsigned long w) 1378 static inline void update_load_set(struct load_weight *lw, unsigned long w)
1380 { 1379 {
1381 lw->weight = w; 1380 lw->weight = w;
1382 lw->inv_weight = 0; 1381 lw->inv_weight = 0;
1383 } 1382 }
1384 1383
1385 /* 1384 /*
1386 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1385 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1387 * of tasks with abnormal "nice" values across CPUs, the contribution that 1386 * of tasks with abnormal "nice" values across CPUs, the contribution that
1388 * each task makes to its run queue's load is weighted according to its 1387 * each task makes to its run queue's load is weighted according to its
1389 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1388 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1390 * scaled version of the new time slice allocation that they receive on time 1389 * scaled version of the new time slice allocation that they receive on time
1391 * slice expiry etc. 1390 * slice expiry etc.
1392 */ 1391 */
1393 1392
1394 #define WEIGHT_IDLEPRIO 3 1393 #define WEIGHT_IDLEPRIO 3
1395 #define WMULT_IDLEPRIO 1431655765 1394 #define WMULT_IDLEPRIO 1431655765
1396 1395
1397 /* 1396 /*
1398 * Nice levels are multiplicative, with a gentle 10% change for every 1397 * Nice levels are multiplicative, with a gentle 10% change for every
1399 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1398 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1400 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1399 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1401 * that remained on nice 0. 1400 * that remained on nice 0.
1402 * 1401 *
1403 * The "10% effect" is relative and cumulative: from _any_ nice level, 1402 * The "10% effect" is relative and cumulative: from _any_ nice level,
1404 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1403 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1405 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1404 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1406 * If a task goes up by ~10% and another task goes down by ~10% then 1405 * If a task goes up by ~10% and another task goes down by ~10% then
1407 * the relative distance between them is ~25%.) 1406 * the relative distance between them is ~25%.)
1408 */ 1407 */
1409 static const int prio_to_weight[40] = { 1408 static const int prio_to_weight[40] = {
1410 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1409 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1411 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1410 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1412 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1411 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1413 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1412 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1414 /* 0 */ 1024, 820, 655, 526, 423, 1413 /* 0 */ 1024, 820, 655, 526, 423,
1415 /* 5 */ 335, 272, 215, 172, 137, 1414 /* 5 */ 335, 272, 215, 172, 137,
1416 /* 10 */ 110, 87, 70, 56, 45, 1415 /* 10 */ 110, 87, 70, 56, 45,
1417 /* 15 */ 36, 29, 23, 18, 15, 1416 /* 15 */ 36, 29, 23, 18, 15,
1418 }; 1417 };
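
To make the "10% effect" described above concrete, here is a standalone sketch (not kernel code) that computes the CPU shares of two CPU-bound tasks at nice 0 and nice 1 from the prio_to_weight[] entries 1024 and 820:

#include <stdio.h>

int main(void)
{
        double w_nice0 = 1024.0;        /* prio_to_weight[] entry for nice 0 */
        double w_nice1 = 820.0;         /* prio_to_weight[] entry for nice 1 */
        double total = w_nice0 + w_nice1;

        printf("nice 0 share: %.1f%%\n", 100.0 * w_nice0 / total);     /* ~55.5% */
        printf("nice 1 share: %.1f%%\n", 100.0 * w_nice1 / total);     /* ~44.5% */
        printf("weight ratio: %.2f\n", w_nice0 / w_nice1);             /* ~1.25 */
        return 0;
}

The ~1.25 ratio between adjacent weight entries is what produces the roughly 10-percentage-point shift in CPU time per nice level.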
1419 1418
1420 /* 1419 /*
1421 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1420 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1422 * 1421 *
1423 * In cases where the weight does not change often, we can use the 1422 * In cases where the weight does not change often, we can use the
1424 * precalculated inverse to speed up arithmetics by turning divisions 1423 * precalculated inverse to speed up arithmetics by turning divisions
1425 * into multiplications: 1424 * into multiplications:
1426 */ 1425 */
1427 static const u32 prio_to_wmult[40] = { 1426 static const u32 prio_to_wmult[40] = {
1428 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1427 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1429 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1428 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1430 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1429 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1431 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1430 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1432 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1431 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1433 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1432 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1434 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1433 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1435 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1434 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1436 }; 1435 };
1437 1436
1438 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1437 /* Time spent by the tasks of the cpu accounting group executing in ... */
1439 enum cpuacct_stat_index { 1438 enum cpuacct_stat_index {
1440 CPUACCT_STAT_USER, /* ... user mode */ 1439 CPUACCT_STAT_USER, /* ... user mode */
1441 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1440 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1442 1441
1443 CPUACCT_STAT_NSTATS, 1442 CPUACCT_STAT_NSTATS,
1444 }; 1443 };
1445 1444
1446 #ifdef CONFIG_CGROUP_CPUACCT 1445 #ifdef CONFIG_CGROUP_CPUACCT
1447 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1446 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1448 static void cpuacct_update_stats(struct task_struct *tsk, 1447 static void cpuacct_update_stats(struct task_struct *tsk,
1449 enum cpuacct_stat_index idx, cputime_t val); 1448 enum cpuacct_stat_index idx, cputime_t val);
1450 #else 1449 #else
1451 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1450 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1452 static inline void cpuacct_update_stats(struct task_struct *tsk, 1451 static inline void cpuacct_update_stats(struct task_struct *tsk,
1453 enum cpuacct_stat_index idx, cputime_t val) {} 1452 enum cpuacct_stat_index idx, cputime_t val) {}
1454 #endif 1453 #endif
1455 1454
1456 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1455 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1457 { 1456 {
1458 update_load_add(&rq->load, load); 1457 update_load_add(&rq->load, load);
1459 } 1458 }
1460 1459
1461 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1460 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1462 { 1461 {
1463 update_load_sub(&rq->load, load); 1462 update_load_sub(&rq->load, load);
1464 } 1463 }
1465 1464
1466 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1465 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1467 typedef int (*tg_visitor)(struct task_group *, void *); 1466 typedef int (*tg_visitor)(struct task_group *, void *);
1468 1467
1469 /* 1468 /*
1470 * Iterate the full tree, calling @down when first entering a node and @up when 1469 * Iterate the full tree, calling @down when first entering a node and @up when
1471 * leaving it for the final time. 1470 * leaving it for the final time.
1472 */ 1471 */
1473 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1472 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1474 { 1473 {
1475 struct task_group *parent, *child; 1474 struct task_group *parent, *child;
1476 int ret; 1475 int ret;
1477 1476
1478 rcu_read_lock(); 1477 rcu_read_lock();
1479 parent = &root_task_group; 1478 parent = &root_task_group;
1480 down: 1479 down:
1481 ret = (*down)(parent, data); 1480 ret = (*down)(parent, data);
1482 if (ret) 1481 if (ret)
1483 goto out_unlock; 1482 goto out_unlock;
1484 list_for_each_entry_rcu(child, &parent->children, siblings) { 1483 list_for_each_entry_rcu(child, &parent->children, siblings) {
1485 parent = child; 1484 parent = child;
1486 goto down; 1485 goto down;
1487 1486
1488 up: 1487 up:
1489 continue; 1488 continue;
1490 } 1489 }
1491 ret = (*up)(parent, data); 1490 ret = (*up)(parent, data);
1492 if (ret) 1491 if (ret)
1493 goto out_unlock; 1492 goto out_unlock;
1494 1493
1495 child = parent; 1494 child = parent;
1496 parent = parent->parent; 1495 parent = parent->parent;
1497 if (parent) 1496 if (parent)
1498 goto up; 1497 goto up;
1499 out_unlock: 1498 out_unlock:
1500 rcu_read_unlock(); 1499 rcu_read_unlock();
1501 1500
1502 return ret; 1501 return ret;
1503 } 1502 }
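
The goto-based traversal above is an iterative pre/post-order walk over the task_group tree. A recursive equivalent is sketched below for readability only (walk_tg_tree_sketch is a hypothetical name; the kernel version deliberately avoids recursion, and the rcu_read_lock()/rcu_read_unlock() pair from the original would wrap the top-level call):

/*
 * Recursive sketch equivalent to walk_tg_tree(): call @down when entering
 * a group and @up after all of its children have been visited; a non-zero
 * return value from either visitor aborts the walk and is propagated.
 */
static int walk_tg_tree_sketch(struct task_group *tg,
                               tg_visitor down, tg_visitor up, void *data)
{
        struct task_group *child;
        int ret;

        ret = (*down)(tg, data);
        if (ret)
                return ret;

        list_for_each_entry_rcu(child, &tg->children, siblings) {
                ret = walk_tg_tree_sketch(child, down, up, data);
                if (ret)
                        return ret;
        }

        return (*up)(tg, data);
}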
1504 1503
1505 static int tg_nop(struct task_group *tg, void *data) 1504 static int tg_nop(struct task_group *tg, void *data)
1506 { 1505 {
1507 return 0; 1506 return 0;
1508 } 1507 }
1509 #endif 1508 #endif
1510 1509
1511 #ifdef CONFIG_SMP 1510 #ifdef CONFIG_SMP
1512 /* Used instead of source_load when we know the type == 0 */ 1511 /* Used instead of source_load when we know the type == 0 */
1513 static unsigned long weighted_cpuload(const int cpu) 1512 static unsigned long weighted_cpuload(const int cpu)
1514 { 1513 {
1515 return cpu_rq(cpu)->load.weight; 1514 return cpu_rq(cpu)->load.weight;
1516 } 1515 }
1517 1516
1518 /* 1517 /*
1519 * Return a low guess at the load of a migration-source cpu weighted 1518 * Return a low guess at the load of a migration-source cpu weighted
1520 * according to the scheduling class and "nice" value. 1519 * according to the scheduling class and "nice" value.
1521 * 1520 *
1522 * We want to under-estimate the load of migration sources, to 1521 * We want to under-estimate the load of migration sources, to
1523 * balance conservatively. 1522 * balance conservatively.
1524 */ 1523 */
1525 static unsigned long source_load(int cpu, int type) 1524 static unsigned long source_load(int cpu, int type)
1526 { 1525 {
1527 struct rq *rq = cpu_rq(cpu); 1526 struct rq *rq = cpu_rq(cpu);
1528 unsigned long total = weighted_cpuload(cpu); 1527 unsigned long total = weighted_cpuload(cpu);
1529 1528
1530 if (type == 0 || !sched_feat(LB_BIAS)) 1529 if (type == 0 || !sched_feat(LB_BIAS))
1531 return total; 1530 return total;
1532 1531
1533 return min(rq->cpu_load[type-1], total); 1532 return min(rq->cpu_load[type-1], total);
1534 } 1533 }
1535 1534
1536 /* 1535 /*
1537 * Return a high guess at the load of a migration-target cpu weighted 1536 * Return a high guess at the load of a migration-target cpu weighted
1538 * according to the scheduling class and "nice" value. 1537 * according to the scheduling class and "nice" value.
1539 */ 1538 */
1540 static unsigned long target_load(int cpu, int type) 1539 static unsigned long target_load(int cpu, int type)
1541 { 1540 {
1542 struct rq *rq = cpu_rq(cpu); 1541 struct rq *rq = cpu_rq(cpu);
1543 unsigned long total = weighted_cpuload(cpu); 1542 unsigned long total = weighted_cpuload(cpu);
1544 1543
1545 if (type == 0 || !sched_feat(LB_BIAS)) 1544 if (type == 0 || !sched_feat(LB_BIAS))
1546 return total; 1545 return total;
1547 1546
1548 return max(rq->cpu_load[type-1], total); 1547 return max(rq->cpu_load[type-1], total);
1549 } 1548 }
1550 1549
1551 static unsigned long power_of(int cpu) 1550 static unsigned long power_of(int cpu)
1552 { 1551 {
1553 return cpu_rq(cpu)->cpu_power; 1552 return cpu_rq(cpu)->cpu_power;
1554 } 1553 }
1555 1554
1556 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1555 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1557 1556
1558 static unsigned long cpu_avg_load_per_task(int cpu) 1557 static unsigned long cpu_avg_load_per_task(int cpu)
1559 { 1558 {
1560 struct rq *rq = cpu_rq(cpu); 1559 struct rq *rq = cpu_rq(cpu);
1561 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1560 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1562 1561
1563 if (nr_running) 1562 if (nr_running)
1564 rq->avg_load_per_task = rq->load.weight / nr_running; 1563 rq->avg_load_per_task = rq->load.weight / nr_running;
1565 else 1564 else
1566 rq->avg_load_per_task = 0; 1565 rq->avg_load_per_task = 0;
1567 1566
1568 return rq->avg_load_per_task; 1567 return rq->avg_load_per_task;
1569 } 1568 }
1570 1569
1571 #ifdef CONFIG_FAIR_GROUP_SCHED 1570 #ifdef CONFIG_FAIR_GROUP_SCHED
1572 1571
1573 /* 1572 /*
1574 * Compute the cpu's hierarchical load factor for each task group. 1573 * Compute the cpu's hierarchical load factor for each task group.
1575 * This needs to be done in a top-down fashion because the load of a child 1574 * This needs to be done in a top-down fashion because the load of a child
1576 * group is a fraction of its parent's load. 1575 * group is a fraction of its parent's load.
1577 */ 1576 */
1578 static int tg_load_down(struct task_group *tg, void *data) 1577 static int tg_load_down(struct task_group *tg, void *data)
1579 { 1578 {
1580 unsigned long load; 1579 unsigned long load;
1581 long cpu = (long)data; 1580 long cpu = (long)data;
1582 1581
1583 if (!tg->parent) { 1582 if (!tg->parent) {
1584 load = cpu_rq(cpu)->load.weight; 1583 load = cpu_rq(cpu)->load.weight;
1585 } else { 1584 } else {
1586 load = tg->parent->cfs_rq[cpu]->h_load; 1585 load = tg->parent->cfs_rq[cpu]->h_load;
1587 load *= tg->se[cpu]->load.weight; 1586 load *= tg->se[cpu]->load.weight;
1588 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1587 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1589 } 1588 }
1590 1589
1591 tg->cfs_rq[cpu]->h_load = load; 1590 tg->cfs_rq[cpu]->h_load = load;
1592 1591
1593 return 0; 1592 return 0;
1594 } 1593 }
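
A small worked example of the formula above (numbers invented for illustration): if the parent's cfs_rq on this cpu has h_load = 3072 and load.weight = 3072, and the group's own se on this cpu has load.weight = 1024, then the group's h_load becomes 3072 * 1024 / (3072 + 1) ≈ 1023, i.e. roughly one third of the cpu's hierarchical load is attributed to this group; the "+ 1" merely guards against a zero parent weight.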
1595 1594
1596 static void update_h_load(long cpu) 1595 static void update_h_load(long cpu)
1597 { 1596 {
1598 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1597 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1599 } 1598 }
1600 1599
1601 #endif 1600 #endif
1602 1601
1603 #ifdef CONFIG_PREEMPT 1602 #ifdef CONFIG_PREEMPT
1604 1603
1605 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1604 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1606 1605
1607 /* 1606 /*
1608 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1607 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1609 * way at the expense of forcing extra atomic operations in all 1608 * way at the expense of forcing extra atomic operations in all
1610 * invocations. This assures that the double_lock is acquired using the 1609 * invocations. This assures that the double_lock is acquired using the
1611 * same underlying policy as the spinlock_t on this architecture, which 1610 * same underlying policy as the spinlock_t on this architecture, which
1612 * reduces latency compared to the unfair variant below. However, it 1611 * reduces latency compared to the unfair variant below. However, it
1613 * also adds more overhead and therefore may reduce throughput. 1612 * also adds more overhead and therefore may reduce throughput.
1614 */ 1613 */
1615 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1614 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1616 __releases(this_rq->lock) 1615 __releases(this_rq->lock)
1617 __acquires(busiest->lock) 1616 __acquires(busiest->lock)
1618 __acquires(this_rq->lock) 1617 __acquires(this_rq->lock)
1619 { 1618 {
1620 raw_spin_unlock(&this_rq->lock); 1619 raw_spin_unlock(&this_rq->lock);
1621 double_rq_lock(this_rq, busiest); 1620 double_rq_lock(this_rq, busiest);
1622 1621
1623 return 1; 1622 return 1;
1624 } 1623 }
1625 1624
1626 #else 1625 #else
1627 /* 1626 /*
1628 * Unfair double_lock_balance: Optimizes throughput at the expense of 1627 * Unfair double_lock_balance: Optimizes throughput at the expense of
1629 * latency by eliminating extra atomic operations when the locks are 1628 * latency by eliminating extra atomic operations when the locks are
1630 * already in proper order on entry. This favors lower cpu-ids and will 1629 * already in proper order on entry. This favors lower cpu-ids and will
1631 * grant the double lock to lower cpus over higher ids under contention, 1630 * grant the double lock to lower cpus over higher ids under contention,
1632 * regardless of entry order into the function. 1631 * regardless of entry order into the function.
1633 */ 1632 */
1634 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1633 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1635 __releases(this_rq->lock) 1634 __releases(this_rq->lock)
1636 __acquires(busiest->lock) 1635 __acquires(busiest->lock)
1637 __acquires(this_rq->lock) 1636 __acquires(this_rq->lock)
1638 { 1637 {
1639 int ret = 0; 1638 int ret = 0;
1640 1639
1641 if (unlikely(!raw_spin_trylock(&busiest->lock))) { 1640 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1642 if (busiest < this_rq) { 1641 if (busiest < this_rq) {
1643 raw_spin_unlock(&this_rq->lock); 1642 raw_spin_unlock(&this_rq->lock);
1644 raw_spin_lock(&busiest->lock); 1643 raw_spin_lock(&busiest->lock);
1645 raw_spin_lock_nested(&this_rq->lock, 1644 raw_spin_lock_nested(&this_rq->lock,
1646 SINGLE_DEPTH_NESTING); 1645 SINGLE_DEPTH_NESTING);
1647 ret = 1; 1646 ret = 1;
1648 } else 1647 } else
1649 raw_spin_lock_nested(&busiest->lock, 1648 raw_spin_lock_nested(&busiest->lock,
1650 SINGLE_DEPTH_NESTING); 1649 SINGLE_DEPTH_NESTING);
1651 } 1650 }
1652 return ret; 1651 return ret;
1653 } 1652 }
1654 1653
1655 #endif /* CONFIG_PREEMPT */ 1654 #endif /* CONFIG_PREEMPT */
1656 1655
1657 /* 1656 /*
1658 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1657 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1659 */ 1658 */
1660 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1659 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1661 { 1660 {
1662 if (unlikely(!irqs_disabled())) { 1661 if (unlikely(!irqs_disabled())) {
1663 /* printk() doesn't work well under rq->lock */ 1662 /* printk() doesn't work well under rq->lock */
1664 raw_spin_unlock(&this_rq->lock); 1663 raw_spin_unlock(&this_rq->lock);
1665 BUG_ON(1); 1664 BUG_ON(1);
1666 } 1665 }
1667 1666
1668 return _double_lock_balance(this_rq, busiest); 1667 return _double_lock_balance(this_rq, busiest);
1669 } 1668 }
1670 1669
1671 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1670 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1672 __releases(busiest->lock) 1671 __releases(busiest->lock)
1673 { 1672 {
1674 raw_spin_unlock(&busiest->lock); 1673 raw_spin_unlock(&busiest->lock);
1675 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1674 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1676 } 1675 }
1677 1676
1678 /* 1677 /*
1679 * double_rq_lock - safely lock two runqueues 1678 * double_rq_lock - safely lock two runqueues
1680 * 1679 *
1681 * Note this does not disable interrupts like task_rq_lock, 1680 * Note this does not disable interrupts like task_rq_lock,
1682 * you need to do so manually before calling. 1681 * you need to do so manually before calling.
1683 */ 1682 */
1684 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1683 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1685 __acquires(rq1->lock) 1684 __acquires(rq1->lock)
1686 __acquires(rq2->lock) 1685 __acquires(rq2->lock)
1687 { 1686 {
1688 BUG_ON(!irqs_disabled()); 1687 BUG_ON(!irqs_disabled());
1689 if (rq1 == rq2) { 1688 if (rq1 == rq2) {
1690 raw_spin_lock(&rq1->lock); 1689 raw_spin_lock(&rq1->lock);
1691 __acquire(rq2->lock); /* Fake it out ;) */ 1690 __acquire(rq2->lock); /* Fake it out ;) */
1692 } else { 1691 } else {
1693 if (rq1 < rq2) { 1692 if (rq1 < rq2) {
1694 raw_spin_lock(&rq1->lock); 1693 raw_spin_lock(&rq1->lock);
1695 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 1694 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1696 } else { 1695 } else {
1697 raw_spin_lock(&rq2->lock); 1696 raw_spin_lock(&rq2->lock);
1698 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1697 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1699 } 1698 }
1700 } 1699 }
1701 } 1700 }
1702 1701
1703 /* 1702 /*
1704 * double_rq_unlock - safely unlock two runqueues 1703 * double_rq_unlock - safely unlock two runqueues
1705 * 1704 *
1706 * Note this does not restore interrupts like task_rq_unlock, 1705 * Note this does not restore interrupts like task_rq_unlock,
1707 * you need to do so manually after calling. 1706 * you need to do so manually after calling.
1708 */ 1707 */
1709 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1708 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1710 __releases(rq1->lock) 1709 __releases(rq1->lock)
1711 __releases(rq2->lock) 1710 __releases(rq2->lock)
1712 { 1711 {
1713 raw_spin_unlock(&rq1->lock); 1712 raw_spin_unlock(&rq1->lock);
1714 if (rq1 != rq2) 1713 if (rq1 != rq2)
1715 raw_spin_unlock(&rq2->lock); 1714 raw_spin_unlock(&rq2->lock);
1716 else 1715 else
1717 __release(rq2->lock); 1716 __release(rq2->lock);
1718 } 1717 }
1719 1718
1720 #else /* CONFIG_SMP */ 1719 #else /* CONFIG_SMP */
1721 1720
1722 /* 1721 /*
1723 * double_rq_lock - safely lock two runqueues 1722 * double_rq_lock - safely lock two runqueues
1724 * 1723 *
1725 * Note this does not disable interrupts like task_rq_lock, 1724 * Note this does not disable interrupts like task_rq_lock,
1726 * you need to do so manually before calling. 1725 * you need to do so manually before calling.
1727 */ 1726 */
1728 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1727 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1729 __acquires(rq1->lock) 1728 __acquires(rq1->lock)
1730 __acquires(rq2->lock) 1729 __acquires(rq2->lock)
1731 { 1730 {
1732 BUG_ON(!irqs_disabled()); 1731 BUG_ON(!irqs_disabled());
1733 BUG_ON(rq1 != rq2); 1732 BUG_ON(rq1 != rq2);
1734 raw_spin_lock(&rq1->lock); 1733 raw_spin_lock(&rq1->lock);
1735 __acquire(rq2->lock); /* Fake it out ;) */ 1734 __acquire(rq2->lock); /* Fake it out ;) */
1736 } 1735 }
1737 1736
1738 /* 1737 /*
1739 * double_rq_unlock - safely unlock two runqueues 1738 * double_rq_unlock - safely unlock two runqueues
1740 * 1739 *
1741 * Note this does not restore interrupts like task_rq_unlock, 1740 * Note this does not restore interrupts like task_rq_unlock,
1742 * you need to do so manually after calling. 1741 * you need to do so manually after calling.
1743 */ 1742 */
1744 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1743 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1745 __releases(rq1->lock) 1744 __releases(rq1->lock)
1746 __releases(rq2->lock) 1745 __releases(rq2->lock)
1747 { 1746 {
1748 BUG_ON(rq1 != rq2); 1747 BUG_ON(rq1 != rq2);
1749 raw_spin_unlock(&rq1->lock); 1748 raw_spin_unlock(&rq1->lock);
1750 __release(rq2->lock); 1749 __release(rq2->lock);
1751 } 1750 }
1752 1751
1753 #endif 1752 #endif
1754 1753
1755 static void calc_load_account_idle(struct rq *this_rq); 1754 static void calc_load_account_idle(struct rq *this_rq);
1756 static void update_sysctl(void); 1755 static void update_sysctl(void);
1757 static int get_update_sysctl_factor(void); 1756 static int get_update_sysctl_factor(void);
1758 static void update_cpu_load(struct rq *this_rq); 1757 static void update_cpu_load(struct rq *this_rq);
1759 1758
1760 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1759 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1761 { 1760 {
1762 set_task_rq(p, cpu); 1761 set_task_rq(p, cpu);
1763 #ifdef CONFIG_SMP 1762 #ifdef CONFIG_SMP
1764 /* 1763 /*
1765 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1764 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1766 * successfully executed on another CPU. We must ensure that updates of 1765 * successfully executed on another CPU. We must ensure that updates of
1767 * per-task data have been completed by this moment. 1766 * per-task data have been completed by this moment.
1768 */ 1767 */
1769 smp_wmb(); 1768 smp_wmb();
1770 task_thread_info(p)->cpu = cpu; 1769 task_thread_info(p)->cpu = cpu;
1771 #endif 1770 #endif
1772 } 1771 }
1773 1772
1774 static const struct sched_class rt_sched_class; 1773 static const struct sched_class rt_sched_class;
1775 1774
1776 #define sched_class_highest (&stop_sched_class) 1775 #define sched_class_highest (&stop_sched_class)
1777 #define for_each_class(class) \ 1776 #define for_each_class(class) \
1778 for (class = sched_class_highest; class; class = class->next) 1777 for (class = sched_class_highest; class; class = class->next)
1779 1778
1780 #include "sched_stats.h" 1779 #include "sched_stats.h"
1781 1780
1782 static void inc_nr_running(struct rq *rq) 1781 static void inc_nr_running(struct rq *rq)
1783 { 1782 {
1784 rq->nr_running++; 1783 rq->nr_running++;
1785 } 1784 }
1786 1785
1787 static void dec_nr_running(struct rq *rq) 1786 static void dec_nr_running(struct rq *rq)
1788 { 1787 {
1789 rq->nr_running--; 1788 rq->nr_running--;
1790 } 1789 }
1791 1790
1792 static void set_load_weight(struct task_struct *p) 1791 static void set_load_weight(struct task_struct *p)
1793 { 1792 {
1794 int prio = p->static_prio - MAX_RT_PRIO; 1793 int prio = p->static_prio - MAX_RT_PRIO;
1795 struct load_weight *load = &p->se.load; 1794 struct load_weight *load = &p->se.load;
1796 1795
1797 /* 1796 /*
1798 * SCHED_IDLE tasks get minimal weight: 1797 * SCHED_IDLE tasks get minimal weight:
1799 */ 1798 */
1800 if (p->policy == SCHED_IDLE) { 1799 if (p->policy == SCHED_IDLE) {
1801 load->weight = scale_load(WEIGHT_IDLEPRIO); 1800 load->weight = scale_load(WEIGHT_IDLEPRIO);
1802 load->inv_weight = WMULT_IDLEPRIO; 1801 load->inv_weight = WMULT_IDLEPRIO;
1803 return; 1802 return;
1804 } 1803 }
1805 1804
1806 load->weight = scale_load(prio_to_weight[prio]); 1805 load->weight = scale_load(prio_to_weight[prio]);
1807 load->inv_weight = prio_to_wmult[prio]; 1806 load->inv_weight = prio_to_wmult[prio];
1808 } 1807 }
1809 1808
1810 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1809 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1811 { 1810 {
1812 update_rq_clock(rq); 1811 update_rq_clock(rq);
1813 sched_info_queued(p); 1812 sched_info_queued(p);
1814 p->sched_class->enqueue_task(rq, p, flags); 1813 p->sched_class->enqueue_task(rq, p, flags);
1815 } 1814 }
1816 1815
1817 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1816 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1818 { 1817 {
1819 update_rq_clock(rq); 1818 update_rq_clock(rq);
1820 sched_info_dequeued(p); 1819 sched_info_dequeued(p);
1821 p->sched_class->dequeue_task(rq, p, flags); 1820 p->sched_class->dequeue_task(rq, p, flags);
1822 } 1821 }
1823 1822
1824 /* 1823 /*
1825 * activate_task - move a task to the runqueue. 1824 * activate_task - move a task to the runqueue.
1826 */ 1825 */
1827 static void activate_task(struct rq *rq, struct task_struct *p, int flags) 1826 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1828 { 1827 {
1829 if (task_contributes_to_load(p)) 1828 if (task_contributes_to_load(p))
1830 rq->nr_uninterruptible--; 1829 rq->nr_uninterruptible--;
1831 1830
1832 enqueue_task(rq, p, flags); 1831 enqueue_task(rq, p, flags);
1833 inc_nr_running(rq); 1832 inc_nr_running(rq);
1834 } 1833 }
1835 1834
1836 /* 1835 /*
1837 * deactivate_task - remove a task from the runqueue. 1836 * deactivate_task - remove a task from the runqueue.
1838 */ 1837 */
1839 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 1838 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1840 { 1839 {
1841 if (task_contributes_to_load(p)) 1840 if (task_contributes_to_load(p))
1842 rq->nr_uninterruptible++; 1841 rq->nr_uninterruptible++;
1843 1842
1844 dequeue_task(rq, p, flags); 1843 dequeue_task(rq, p, flags);
1845 dec_nr_running(rq); 1844 dec_nr_running(rq);
1846 } 1845 }
1847 1846
1848 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1847 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1849 1848
1850 /* 1849 /*
1851 * There are no locks covering percpu hardirq/softirq time. 1850 * There are no locks covering percpu hardirq/softirq time.
1852 * They are only modified in account_system_vtime, on the corresponding CPU 1851 * They are only modified in account_system_vtime, on the corresponding CPU
1853 * with interrupts disabled. So, writes are safe. 1852 * with interrupts disabled. So, writes are safe.
1854 * They are read and saved off onto struct rq in update_rq_clock(). 1853 * They are read and saved off onto struct rq in update_rq_clock().
1855 * This may result in another CPU reading this CPU's irq time and can 1854 * This may result in another CPU reading this CPU's irq time and can
1856 * race with irq/account_system_vtime on this CPU. We would either get the old 1855 * race with irq/account_system_vtime on this CPU. We would either get the old
1857 * or the new value with a side effect of accounting a slice of irq time to the 1856 * or the new value with a side effect of accounting a slice of irq time to the
1858 * wrong task when an irq is in progress while we read rq->clock. That is a worthy 1857 * wrong task when an irq is in progress while we read rq->clock. That is a worthy
1859 * compromise in place of having locks on each irq in account_system_time. 1858 * compromise in place of having locks on each irq in account_system_time.
1860 */ 1859 */
1861 static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1860 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1862 static DEFINE_PER_CPU(u64, cpu_softirq_time); 1861 static DEFINE_PER_CPU(u64, cpu_softirq_time);
1863 1862
1864 static DEFINE_PER_CPU(u64, irq_start_time); 1863 static DEFINE_PER_CPU(u64, irq_start_time);
1865 static int sched_clock_irqtime; 1864 static int sched_clock_irqtime;
1866 1865
1867 void enable_sched_clock_irqtime(void) 1866 void enable_sched_clock_irqtime(void)
1868 { 1867 {
1869 sched_clock_irqtime = 1; 1868 sched_clock_irqtime = 1;
1870 } 1869 }
1871 1870
1872 void disable_sched_clock_irqtime(void) 1871 void disable_sched_clock_irqtime(void)
1873 { 1872 {
1874 sched_clock_irqtime = 0; 1873 sched_clock_irqtime = 0;
1875 } 1874 }
1876 1875
1877 #ifndef CONFIG_64BIT 1876 #ifndef CONFIG_64BIT
1878 static DEFINE_PER_CPU(seqcount_t, irq_time_seq); 1877 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1879 1878
1880 static inline void irq_time_write_begin(void) 1879 static inline void irq_time_write_begin(void)
1881 { 1880 {
1882 __this_cpu_inc(irq_time_seq.sequence); 1881 __this_cpu_inc(irq_time_seq.sequence);
1883 smp_wmb(); 1882 smp_wmb();
1884 } 1883 }
1885 1884
1886 static inline void irq_time_write_end(void) 1885 static inline void irq_time_write_end(void)
1887 { 1886 {
1888 smp_wmb(); 1887 smp_wmb();
1889 __this_cpu_inc(irq_time_seq.sequence); 1888 __this_cpu_inc(irq_time_seq.sequence);
1890 } 1889 }
1891 1890
1892 static inline u64 irq_time_read(int cpu) 1891 static inline u64 irq_time_read(int cpu)
1893 { 1892 {
1894 u64 irq_time; 1893 u64 irq_time;
1895 unsigned seq; 1894 unsigned seq;
1896 1895
1897 do { 1896 do {
1898 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 1897 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1899 irq_time = per_cpu(cpu_softirq_time, cpu) + 1898 irq_time = per_cpu(cpu_softirq_time, cpu) +
1900 per_cpu(cpu_hardirq_time, cpu); 1899 per_cpu(cpu_hardirq_time, cpu);
1901 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); 1900 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1902 1901
1903 return irq_time; 1902 return irq_time;
1904 } 1903 }
1905 #else /* CONFIG_64BIT */ 1904 #else /* CONFIG_64BIT */
1906 static inline void irq_time_write_begin(void) 1905 static inline void irq_time_write_begin(void)
1907 { 1906 {
1908 } 1907 }
1909 1908
1910 static inline void irq_time_write_end(void) 1909 static inline void irq_time_write_end(void)
1911 { 1910 {
1912 } 1911 }
1913 1912
1914 static inline u64 irq_time_read(int cpu) 1913 static inline u64 irq_time_read(int cpu)
1915 { 1914 {
1916 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1915 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1917 } 1916 }
1918 #endif /* CONFIG_64BIT */ 1917 #endif /* CONFIG_64BIT */
1919 1918
1920 /* 1919 /*
1921 * Called before incrementing preempt_count on {soft,}irq_enter 1920 * Called before incrementing preempt_count on {soft,}irq_enter
1922 * and before decrementing preempt_count on {soft,}irq_exit. 1921 * and before decrementing preempt_count on {soft,}irq_exit.
1923 */ 1922 */
1924 void account_system_vtime(struct task_struct *curr) 1923 void account_system_vtime(struct task_struct *curr)
1925 { 1924 {
1926 unsigned long flags; 1925 unsigned long flags;
1927 s64 delta; 1926 s64 delta;
1928 int cpu; 1927 int cpu;
1929 1928
1930 if (!sched_clock_irqtime) 1929 if (!sched_clock_irqtime)
1931 return; 1930 return;
1932 1931
1933 local_irq_save(flags); 1932 local_irq_save(flags);
1934 1933
1935 cpu = smp_processor_id(); 1934 cpu = smp_processor_id();
1936 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 1935 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1937 __this_cpu_add(irq_start_time, delta); 1936 __this_cpu_add(irq_start_time, delta);
1938 1937
1939 irq_time_write_begin(); 1938 irq_time_write_begin();
1940 /* 1939 /*
1941 * We do not account for softirq time from ksoftirqd here. 1940 * We do not account for softirq time from ksoftirqd here.
1942 * We want to continue accounting softirq time to the ksoftirqd thread 1941 * We want to continue accounting softirq time to the ksoftirqd thread
1943 * in that case, so as not to confuse the scheduler with a special task 1942 * in that case, so as not to confuse the scheduler with a special task
1944 * that does not consume any time but still wants to run. 1943 * that does not consume any time but still wants to run.
1945 */ 1944 */
1946 if (hardirq_count()) 1945 if (hardirq_count())
1947 __this_cpu_add(cpu_hardirq_time, delta); 1946 __this_cpu_add(cpu_hardirq_time, delta);
1948 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 1947 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1949 __this_cpu_add(cpu_softirq_time, delta); 1948 __this_cpu_add(cpu_softirq_time, delta);
1950 1949
1951 irq_time_write_end(); 1950 irq_time_write_end();
1952 local_irq_restore(flags); 1951 local_irq_restore(flags);
1953 } 1952 }
1954 EXPORT_SYMBOL_GPL(account_system_vtime); 1953 EXPORT_SYMBOL_GPL(account_system_vtime);
1955 1954
1956 static void update_rq_clock_task(struct rq *rq, s64 delta) 1955 static void update_rq_clock_task(struct rq *rq, s64 delta)
1957 { 1956 {
1958 s64 irq_delta; 1957 s64 irq_delta;
1959 1958
1960 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1959 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1961 1960
1962 /* 1961 /*
1963 * Since irq_time is only updated on {soft,}irq_exit, we might run into 1962 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1964 * this case when a previous update_rq_clock() happened inside a 1963 * this case when a previous update_rq_clock() happened inside a
1965 * {soft,}irq region. 1964 * {soft,}irq region.
1966 * 1965 *
1967 * When this happens, we stop ->clock_task and only update the 1966 * When this happens, we stop ->clock_task and only update the
1968 * prev_irq_time stamp to account for the part that fit, so that a next 1967 * prev_irq_time stamp to account for the part that fit, so that a next
1969 * update will consume the rest. This ensures ->clock_task is 1968 * update will consume the rest. This ensures ->clock_task is
1970 * monotonic. 1969 * monotonic.
1971 * 1970 *
1972 * It does however cause some slight misattribution of {soft,}irq 1971 * It does however cause some slight misattribution of {soft,}irq
1973 * time, a more accurate solution would be to update the irq_time using 1972 * time, a more accurate solution would be to update the irq_time using
1974 * the current rq->clock timestamp, except that would require using 1973 * the current rq->clock timestamp, except that would require using
1975 * atomic ops. 1974 * atomic ops.
1976 */ 1975 */
1977 if (irq_delta > delta) 1976 if (irq_delta > delta)
1978 irq_delta = delta; 1977 irq_delta = delta;
1979 1978
1980 rq->prev_irq_time += irq_delta; 1979 rq->prev_irq_time += irq_delta;
1981 delta -= irq_delta; 1980 delta -= irq_delta;
1982 rq->clock_task += delta; 1981 rq->clock_task += delta;
1983 1982
1984 if (irq_delta && sched_feat(NONIRQ_POWER)) 1983 if (irq_delta && sched_feat(NONIRQ_POWER))
1985 sched_rt_avg_update(rq, irq_delta); 1984 sched_rt_avg_update(rq, irq_delta);
1986 } 1985 }
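
To see the clamping above in action (a hypothetical example with invented numbers): if 3 ms of wall-clock delta elapsed but 5 ms of not-yet-accounted irq time accumulated, irq_delta is clamped to 3 ms, ->clock_task does not advance at all for this update, and prev_irq_time only absorbs the 3 ms that fit; the remaining 2 ms of irq time is consumed by a later update, which keeps ->clock_task monotonic.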
1987 1986
1988 static int irqtime_account_hi_update(void) 1987 static int irqtime_account_hi_update(void)
1989 { 1988 {
1990 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 1989 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1991 unsigned long flags; 1990 unsigned long flags;
1992 u64 latest_ns; 1991 u64 latest_ns;
1993 int ret = 0; 1992 int ret = 0;
1994 1993
1995 local_irq_save(flags); 1994 local_irq_save(flags);
1996 latest_ns = this_cpu_read(cpu_hardirq_time); 1995 latest_ns = this_cpu_read(cpu_hardirq_time);
1997 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 1996 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1998 ret = 1; 1997 ret = 1;
1999 local_irq_restore(flags); 1998 local_irq_restore(flags);
2000 return ret; 1999 return ret;
2001 } 2000 }
2002 2001
2003 static int irqtime_account_si_update(void) 2002 static int irqtime_account_si_update(void)
2004 { 2003 {
2005 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2004 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2006 unsigned long flags; 2005 unsigned long flags;
2007 u64 latest_ns; 2006 u64 latest_ns;
2008 int ret = 0; 2007 int ret = 0;
2009 2008
2010 local_irq_save(flags); 2009 local_irq_save(flags);
2011 latest_ns = this_cpu_read(cpu_softirq_time); 2010 latest_ns = this_cpu_read(cpu_softirq_time);
2012 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 2011 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2013 ret = 1; 2012 ret = 1;
2014 local_irq_restore(flags); 2013 local_irq_restore(flags);
2015 return ret; 2014 return ret;
2016 } 2015 }
2017 2016
2018 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 2017 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
2019 2018
2020 #define sched_clock_irqtime (0) 2019 #define sched_clock_irqtime (0)
2021 2020
2022 static void update_rq_clock_task(struct rq *rq, s64 delta) 2021 static void update_rq_clock_task(struct rq *rq, s64 delta)
2023 { 2022 {
2024 rq->clock_task += delta; 2023 rq->clock_task += delta;
2025 } 2024 }
2026 2025
2027 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2026 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2028 2027
2029 #include "sched_idletask.c" 2028 #include "sched_idletask.c"
2030 #include "sched_fair.c" 2029 #include "sched_fair.c"
2031 #include "sched_rt.c" 2030 #include "sched_rt.c"
2032 #include "sched_autogroup.c" 2031 #include "sched_autogroup.c"
2033 #include "sched_stoptask.c" 2032 #include "sched_stoptask.c"
2034 #ifdef CONFIG_SCHED_DEBUG 2033 #ifdef CONFIG_SCHED_DEBUG
2035 # include "sched_debug.c" 2034 # include "sched_debug.c"
2036 #endif 2035 #endif
2037 2036
2038 void sched_set_stop_task(int cpu, struct task_struct *stop) 2037 void sched_set_stop_task(int cpu, struct task_struct *stop)
2039 { 2038 {
2040 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 2039 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2041 struct task_struct *old_stop = cpu_rq(cpu)->stop; 2040 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2042 2041
2043 if (stop) { 2042 if (stop) {
2044 /* 2043 /*
2045 * Make it appear like a SCHED_FIFO task, it's something 2044 * Make it appear like a SCHED_FIFO task, it's something
2046 * userspace knows about and won't get confused about. 2045 * userspace knows about and won't get confused about.
2047 * 2046 *
2048 * Also, it will make PI more or less work without too 2047 * Also, it will make PI more or less work without too
2049 * much confusion -- but then, stop work should not 2048 * much confusion -- but then, stop work should not
2050 * rely on PI working anyway. 2049 * rely on PI working anyway.
2051 */ 2050 */
2052 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 2051 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2053 2052
2054 stop->sched_class = &stop_sched_class; 2053 stop->sched_class = &stop_sched_class;
2055 } 2054 }
2056 2055
2057 cpu_rq(cpu)->stop = stop; 2056 cpu_rq(cpu)->stop = stop;
2058 2057
2059 if (old_stop) { 2058 if (old_stop) {
2060 /* 2059 /*
2061 * Reset it back to a normal scheduling class so that 2060 * Reset it back to a normal scheduling class so that
2062 * it can die in pieces. 2061 * it can die in pieces.
2063 */ 2062 */
2064 old_stop->sched_class = &rt_sched_class; 2063 old_stop->sched_class = &rt_sched_class;
2065 } 2064 }
2066 } 2065 }
2067 2066
2068 /* 2067 /*
2069 * __normal_prio - return the priority that is based on the static prio 2068 * __normal_prio - return the priority that is based on the static prio
2070 */ 2069 */
2071 static inline int __normal_prio(struct task_struct *p) 2070 static inline int __normal_prio(struct task_struct *p)
2072 { 2071 {
2073 return p->static_prio; 2072 return p->static_prio;
2074 } 2073 }
2075 2074
2076 /* 2075 /*
2077 * Calculate the expected normal priority: i.e. priority 2076 * Calculate the expected normal priority: i.e. priority
2078 * without taking RT-inheritance into account. Might be 2077 * without taking RT-inheritance into account. Might be
2079 * boosted by interactivity modifiers. Changes upon fork, 2078 * boosted by interactivity modifiers. Changes upon fork,
2080 * setprio syscalls, and whenever the interactivity 2079 * setprio syscalls, and whenever the interactivity
2081 * estimator recalculates. 2080 * estimator recalculates.
2082 */ 2081 */
2083 static inline int normal_prio(struct task_struct *p) 2082 static inline int normal_prio(struct task_struct *p)
2084 { 2083 {
2085 int prio; 2084 int prio;
2086 2085
2087 if (task_has_rt_policy(p)) 2086 if (task_has_rt_policy(p))
2088 prio = MAX_RT_PRIO-1 - p->rt_priority; 2087 prio = MAX_RT_PRIO-1 - p->rt_priority;
2089 else 2088 else
2090 prio = __normal_prio(p); 2089 prio = __normal_prio(p);
2091 return prio; 2090 return prio;
2092 } 2091 }
2093 2092
2094 /* 2093 /*
2095 * Calculate the current priority, i.e. the priority 2094 * Calculate the current priority, i.e. the priority
2096 * taken into account by the scheduler. This value might 2095 * taken into account by the scheduler. This value might
2097 * be boosted by RT tasks, or might be boosted by 2096 * be boosted by RT tasks, or might be boosted by
2098 * interactivity modifiers. Will be RT if the task got 2097 * interactivity modifiers. Will be RT if the task got
2099 * RT-boosted. If not then it returns p->normal_prio. 2098 * RT-boosted. If not then it returns p->normal_prio.
2100 */ 2099 */
2101 static int effective_prio(struct task_struct *p) 2100 static int effective_prio(struct task_struct *p)
2102 { 2101 {
2103 p->normal_prio = normal_prio(p); 2102 p->normal_prio = normal_prio(p);
2104 /* 2103 /*
2105 * If we are RT tasks or we were boosted to RT priority, 2104 * If we are RT tasks or we were boosted to RT priority,
2106 * keep the priority unchanged. Otherwise, update priority 2105 * keep the priority unchanged. Otherwise, update priority
2107 * to the normal priority: 2106 * to the normal priority:
2108 */ 2107 */
2109 if (!rt_prio(p->prio)) 2108 if (!rt_prio(p->prio))
2110 return p->normal_prio; 2109 return p->normal_prio;
2111 return p->prio; 2110 return p->prio;
2112 } 2111 }
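
The two helpers above fold both scheduling policies onto the kernel's single priority scale: RT tasks map rt_priority 1..99 down to 98..0 (a lower prio value means a more important task), while everything else keeps its nice-derived static_prio. A minimal userspace sketch of that arithmetic, assuming the customary constants (MAX_RT_PRIO of 100, nice levels landing at 120 + nice); struct fake_task and model_normal_prio() are illustrative stand-ins, not kernel API:

#include <stdio.h>

/* Constants assumed to mirror the usual kernel values. */
#define MAX_RT_PRIO             100
#define NICE_TO_PRIO(nice)      ((nice) + 120)

struct fake_task {
        int has_rt_policy;      /* models task_has_rt_policy(p) */
        int rt_priority;        /* 1..99, larger means more urgent */
        int static_prio;        /* 100..139 for SCHED_NORMAL tasks */
};

/* Mirrors normal_prio(): RT maps to 0..98, everything else keeps static_prio. */
static int model_normal_prio(const struct fake_task *p)
{
        if (p->has_rt_policy)
                return MAX_RT_PRIO - 1 - p->rt_priority;
        return p->static_prio;
}

int main(void)
{
        struct fake_task rt50   = { 1, 50, 0 };
        struct fake_task nice0  = { 0, 0, NICE_TO_PRIO(0) };
        struct fake_task nice19 = { 0, 0, NICE_TO_PRIO(19) };

        printf("SCHED_FIFO rt_priority 50 -> prio %d\n", model_normal_prio(&rt50));
        printf("nice 0                    -> prio %d\n", model_normal_prio(&nice0));
        printf("nice 19                   -> prio %d\n", model_normal_prio(&nice19));
        return 0;
}
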
2113 2112
2114 /** 2113 /**
2115 * task_curr - is this task currently executing on a CPU? 2114 * task_curr - is this task currently executing on a CPU?
2116 * @p: the task in question. 2115 * @p: the task in question.
2117 */ 2116 */
2118 inline int task_curr(const struct task_struct *p) 2117 inline int task_curr(const struct task_struct *p)
2119 { 2118 {
2120 return cpu_curr(task_cpu(p)) == p; 2119 return cpu_curr(task_cpu(p)) == p;
2121 } 2120 }
2122 2121
2123 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2122 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2124 const struct sched_class *prev_class, 2123 const struct sched_class *prev_class,
2125 int oldprio) 2124 int oldprio)
2126 { 2125 {
2127 if (prev_class != p->sched_class) { 2126 if (prev_class != p->sched_class) {
2128 if (prev_class->switched_from) 2127 if (prev_class->switched_from)
2129 prev_class->switched_from(rq, p); 2128 prev_class->switched_from(rq, p);
2130 p->sched_class->switched_to(rq, p); 2129 p->sched_class->switched_to(rq, p);
2131 } else if (oldprio != p->prio) 2130 } else if (oldprio != p->prio)
2132 p->sched_class->prio_changed(rq, p, oldprio); 2131 p->sched_class->prio_changed(rq, p, oldprio);
2133 } 2132 }
2134 2133
2135 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2134 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2136 { 2135 {
2137 const struct sched_class *class; 2136 const struct sched_class *class;
2138 2137
2139 if (p->sched_class == rq->curr->sched_class) { 2138 if (p->sched_class == rq->curr->sched_class) {
2140 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 2139 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2141 } else { 2140 } else {
2142 for_each_class(class) { 2141 for_each_class(class) {
2143 if (class == rq->curr->sched_class) 2142 if (class == rq->curr->sched_class)
2144 break; 2143 break;
2145 if (class == p->sched_class) { 2144 if (class == p->sched_class) {
2146 resched_task(rq->curr); 2145 resched_task(rq->curr);
2147 break; 2146 break;
2148 } 2147 }
2149 } 2148 }
2150 } 2149 }
2151 2150
2152 /* 2151 /*
2153 * A queue event has occurred, and we're going to schedule. In 2152 * A queue event has occurred, and we're going to schedule. In
2154 * this case, we can save a useless back to back clock update. 2153 * this case, we can save a useless back to back clock update.
2155 */ 2154 */
2156 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 2155 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2157 rq->skip_clock_update = 1; 2156 rq->skip_clock_update = 1;
2158 } 2157 }
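
When the waking task and the currently running task belong to different scheduling classes, the loop above settles preemption purely by class rank: for_each_class() walks from the highest class downward, and whichever of the two classes turns up first wins. A small standalone model of that rule, assuming the conventional ordering stop, rt, fair, idle; the string-based classes[] table and should_resched() helper are purely illustrative:

#include <stdio.h>
#include <string.h>

/* Assumed class ordering, highest first (models the for_each_class() walk). */
static const char *classes[] = { "stop", "rt", "fair", "idle" };

/* Returns 1 if a wakee of class @wakee should preempt a running @curr task. */
static int should_resched(const char *curr, const char *wakee)
{
        size_t i;

        for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                if (strcmp(classes[i], curr) == 0)
                        return 0;       /* curr's class ranks at least as high */
                if (strcmp(classes[i], wakee) == 0)
                        return 1;       /* wakee's class ranks higher: resched */
        }
        return 0;
}

int main(void)
{
        printf("rt wakes while fair runs -> %d\n", should_resched("fair", "rt"));
        printf("fair wakes while rt runs -> %d\n", should_resched("rt", "fair"));
        return 0;
}
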
2159 2158
2160 #ifdef CONFIG_SMP 2159 #ifdef CONFIG_SMP
2161 /* 2160 /*
2162 * Is this task likely cache-hot: 2161 * Is this task likely cache-hot:
2163 */ 2162 */
2164 static int 2163 static int
2165 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2164 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2166 { 2165 {
2167 s64 delta; 2166 s64 delta;
2168 2167
2169 if (p->sched_class != &fair_sched_class) 2168 if (p->sched_class != &fair_sched_class)
2170 return 0; 2169 return 0;
2171 2170
2172 if (unlikely(p->policy == SCHED_IDLE)) 2171 if (unlikely(p->policy == SCHED_IDLE))
2173 return 0; 2172 return 0;
2174 2173
2175 /* 2174 /*
2176 * Buddy candidates are cache hot: 2175 * Buddy candidates are cache hot:
2177 */ 2176 */
2178 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2177 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2179 (&p->se == cfs_rq_of(&p->se)->next || 2178 (&p->se == cfs_rq_of(&p->se)->next ||
2180 &p->se == cfs_rq_of(&p->se)->last)) 2179 &p->se == cfs_rq_of(&p->se)->last))
2181 return 1; 2180 return 1;
2182 2181
2183 if (sysctl_sched_migration_cost == -1) 2182 if (sysctl_sched_migration_cost == -1)
2184 return 1; 2183 return 1;
2185 if (sysctl_sched_migration_cost == 0) 2184 if (sysctl_sched_migration_cost == 0)
2186 return 0; 2185 return 0;
2187 2186
2188 delta = now - p->se.exec_start; 2187 delta = now - p->se.exec_start;
2189 2188
2190 return delta < (s64)sysctl_sched_migration_cost; 2189 return delta < (s64)sysctl_sched_migration_cost;
2191 } 2190 }
2192 2191
2193 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2192 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2194 { 2193 {
2195 #ifdef CONFIG_SCHED_DEBUG 2194 #ifdef CONFIG_SCHED_DEBUG
2196 /* 2195 /*
2197 * We should never call set_task_cpu() on a blocked task, 2196 * We should never call set_task_cpu() on a blocked task,
2198 * ttwu() will sort out the placement. 2197 * ttwu() will sort out the placement.
2199 */ 2198 */
2200 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2199 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2201 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2202 2201
2203 #ifdef CONFIG_LOCKDEP 2202 #ifdef CONFIG_LOCKDEP
2204 /* 2203 /*
2205 * The caller should hold either p->pi_lock or rq->lock, when changing 2204 * The caller should hold either p->pi_lock or rq->lock, when changing
2206 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 2205 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2207 * 2206 *
2208 * sched_move_task() holds both and thus holding either pins the cgroup, 2207 * sched_move_task() holds both and thus holding either pins the cgroup,
2209 * see set_task_rq(). 2208 * see set_task_rq().
2210 * 2209 *
2211 * Furthermore, all task_rq users should acquire both locks, see 2210 * Furthermore, all task_rq users should acquire both locks, see
2212 * task_rq_lock(). 2211 * task_rq_lock().
2213 */ 2212 */
2214 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2213 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2215 lockdep_is_held(&task_rq(p)->lock))); 2214 lockdep_is_held(&task_rq(p)->lock)));
2216 #endif 2215 #endif
2217 #endif 2216 #endif
2218 2217
2219 trace_sched_migrate_task(p, new_cpu); 2218 trace_sched_migrate_task(p, new_cpu);
2220 2219
2221 if (task_cpu(p) != new_cpu) { 2220 if (task_cpu(p) != new_cpu) {
2222 p->se.nr_migrations++; 2221 p->se.nr_migrations++;
2223 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2222 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2224 } 2223 }
2225 2224
2226 __set_task_cpu(p, new_cpu); 2225 __set_task_cpu(p, new_cpu);
2227 } 2226 }
2228 2227
2229 struct migration_arg { 2228 struct migration_arg {
2230 struct task_struct *task; 2229 struct task_struct *task;
2231 int dest_cpu; 2230 int dest_cpu;
2232 }; 2231 };
2233 2232
2234 static int migration_cpu_stop(void *data); 2233 static int migration_cpu_stop(void *data);
2235 2234
2236 /* 2235 /*
2237 * wait_task_inactive - wait for a thread to unschedule. 2236 * wait_task_inactive - wait for a thread to unschedule.
2238 * 2237 *
2239 * If @match_state is nonzero, it's the @p->state value just checked and 2238 * If @match_state is nonzero, it's the @p->state value just checked and
2240 * not expected to change. If it changes, i.e. @p might have woken up, 2239 * not expected to change. If it changes, i.e. @p might have woken up,
2241 * then return zero. When we succeed in waiting for @p to be off its CPU, 2240 * then return zero. When we succeed in waiting for @p to be off its CPU,
2242 * we return a positive number (its total switch count). If a second call 2241 * we return a positive number (its total switch count). If a second call
2243 * a short while later returns the same number, the caller can be sure that 2242 * a short while later returns the same number, the caller can be sure that
2244 * @p has remained unscheduled the whole time. 2243 * @p has remained unscheduled the whole time.
2245 * 2244 *
2246 * The caller must ensure that the task *will* unschedule sometime soon, 2245 * The caller must ensure that the task *will* unschedule sometime soon,
2247 * else this function might spin for a *long* time. This function can't 2246 * else this function might spin for a *long* time. This function can't
2248 * be called with interrupts off, or it may introduce deadlock with 2247 * be called with interrupts off, or it may introduce deadlock with
2249 * smp_call_function() if an IPI is sent by the same process we are 2248 * smp_call_function() if an IPI is sent by the same process we are
2250 * waiting to become inactive. 2249 * waiting to become inactive.
2251 */ 2250 */
2252 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2251 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2253 { 2252 {
2254 unsigned long flags; 2253 unsigned long flags;
2255 int running, on_rq; 2254 int running, on_rq;
2256 unsigned long ncsw; 2255 unsigned long ncsw;
2257 struct rq *rq; 2256 struct rq *rq;
2258 2257
2259 for (;;) { 2258 for (;;) {
2260 /* 2259 /*
2261 * We do the initial early heuristics without holding 2260 * We do the initial early heuristics without holding
2262 * any task-queue locks at all. We'll only try to get 2261 * any task-queue locks at all. We'll only try to get
2263 * the runqueue lock when things look like they will 2262 * the runqueue lock when things look like they will
2264 * work out! 2263 * work out!
2265 */ 2264 */
2266 rq = task_rq(p); 2265 rq = task_rq(p);
2267 2266
2268 /* 2267 /*
2269 * If the task is actively running on another CPU 2268 * If the task is actively running on another CPU
2270 * still, just relax and busy-wait without holding 2269 * still, just relax and busy-wait without holding
2271 * any locks. 2270 * any locks.
2272 * 2271 *
2273 * NOTE! Since we don't hold any locks, it's not 2272 * NOTE! Since we don't hold any locks, it's not
2274 * even sure that "rq" stays as the right runqueue! 2273 * even sure that "rq" stays as the right runqueue!
2275 * But we don't care, since "task_running()" will 2274 * But we don't care, since "task_running()" will
2276 * return false if the runqueue has changed and p 2275 * return false if the runqueue has changed and p
2277 * is actually now running somewhere else! 2276 * is actually now running somewhere else!
2278 */ 2277 */
2279 while (task_running(rq, p)) { 2278 while (task_running(rq, p)) {
2280 if (match_state && unlikely(p->state != match_state)) 2279 if (match_state && unlikely(p->state != match_state))
2281 return 0; 2280 return 0;
2282 cpu_relax(); 2281 cpu_relax();
2283 } 2282 }
2284 2283
2285 /* 2284 /*
2286 * Ok, time to look more closely! We need the rq 2285 * Ok, time to look more closely! We need the rq
2287 * lock now, to be *sure*. If we're wrong, we'll 2286 * lock now, to be *sure*. If we're wrong, we'll
2288 * just go back and repeat. 2287 * just go back and repeat.
2289 */ 2288 */
2290 rq = task_rq_lock(p, &flags); 2289 rq = task_rq_lock(p, &flags);
2291 trace_sched_wait_task(p); 2290 trace_sched_wait_task(p);
2292 running = task_running(rq, p); 2291 running = task_running(rq, p);
2293 on_rq = p->on_rq; 2292 on_rq = p->on_rq;
2294 ncsw = 0; 2293 ncsw = 0;
2295 if (!match_state || p->state == match_state) 2294 if (!match_state || p->state == match_state)
2296 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2295 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2297 task_rq_unlock(rq, p, &flags); 2296 task_rq_unlock(rq, p, &flags);
2298 2297
2299 /* 2298 /*
2300 * If it changed from the expected state, bail out now. 2299 * If it changed from the expected state, bail out now.
2301 */ 2300 */
2302 if (unlikely(!ncsw)) 2301 if (unlikely(!ncsw))
2303 break; 2302 break;
2304 2303
2305 /* 2304 /*
2306 * Was it really running after all now that we 2305 * Was it really running after all now that we
2307 * checked with the proper locks actually held? 2306 * checked with the proper locks actually held?
2308 * 2307 *
2309 * Oops. Go back and try again.. 2308 * Oops. Go back and try again..
2310 */ 2309 */
2311 if (unlikely(running)) { 2310 if (unlikely(running)) {
2312 cpu_relax(); 2311 cpu_relax();
2313 continue; 2312 continue;
2314 } 2313 }
2315 2314
2316 /* 2315 /*
2317 * It's not enough that it's not actively running, 2316 * It's not enough that it's not actively running,
2318 * it must be off the runqueue _entirely_, and not 2317 * it must be off the runqueue _entirely_, and not
2319 * preempted! 2318 * preempted!
2320 * 2319 *
2321 * So if it was still runnable (but just not actively 2320 * So if it was still runnable (but just not actively
2322 * running right now), it's preempted, and we should 2321 * running right now), it's preempted, and we should
2323 * yield - it could be a while. 2322 * yield - it could be a while.
2324 */ 2323 */
2325 if (unlikely(on_rq)) { 2324 if (unlikely(on_rq)) {
2326 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 2325 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2327 2326
2328 set_current_state(TASK_UNINTERRUPTIBLE); 2327 set_current_state(TASK_UNINTERRUPTIBLE);
2329 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 2328 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2330 continue; 2329 continue;
2331 } 2330 }
2332 2331
2333 /* 2332 /*
2334 * Ahh, all good. It wasn't running, and it wasn't 2333 * Ahh, all good. It wasn't running, and it wasn't
2335 * runnable, which means that it will never become 2334 * runnable, which means that it will never become
2336 * running in the future either. We're all done! 2335 * running in the future either. We're all done!
2337 */ 2336 */
2338 break; 2337 break;
2339 } 2338 }
2340 2339
2341 return ncsw; 2340 return ncsw;
2342 } 2341 }
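
The ncsw value above packs two answers into one return code: 0 means the task's state stopped matching, while a successful sample is the voluntary context-switch count with the sign bit forced on via LONG_MIN, so even a count of zero comes back non-zero and two successive calls can simply be compared for equality, as the comment block before the function describes. A self-contained sketch of just that encoding (encode_ncsw() is a made-up name for illustration):

#include <stdio.h>
#include <limits.h>

/* Models the return-value encoding of wait_task_inactive(). */
static unsigned long encode_ncsw(int state_matched, unsigned long nvcsw)
{
        if (!state_matched)
                return 0;               /* task changed state: report failure */
        return nvcsw | LONG_MIN;        /* force the MSB so a 0 count != failure */
}

int main(void)
{
        unsigned long first  = encode_ncsw(1, 0);
        unsigned long second = encode_ncsw(1, 0);

        /* Even zero voluntary switches yields a non-zero, comparable cookie. */
        printf("cookie %#lx, unchanged across calls: %d\n",
               first, first != 0 && first == second);
        /* A state mismatch is unambiguously 0. */
        printf("state mismatch -> %lu\n", encode_ncsw(0, 42));
        return 0;
}
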
2343 2342
2344 /*** 2343 /***
2345 * kick_process - kick a running thread to enter/exit the kernel 2344 * kick_process - kick a running thread to enter/exit the kernel
2346 * @p: the to-be-kicked thread 2345 * @p: the to-be-kicked thread
2347 * 2346 *
2348 * Cause a process which is running on another CPU to enter 2347 * Cause a process which is running on another CPU to enter
2349 * kernel-mode, without any delay. (to get signals handled.) 2348 * kernel-mode, without any delay. (to get signals handled.)
2350 * 2349 *
2351 * NOTE: this function doesn't have to take the runqueue lock, 2350 * NOTE: this function doesn't have to take the runqueue lock,
2352 * because all it wants to ensure is that the remote task enters 2351 * because all it wants to ensure is that the remote task enters
2353 * the kernel. If the IPI races and the task has been migrated 2352 * the kernel. If the IPI races and the task has been migrated
2354 * to another CPU then no harm is done and the purpose has been 2353 * to another CPU then no harm is done and the purpose has been
2355 * achieved as well. 2354 * achieved as well.
2356 */ 2355 */
2357 void kick_process(struct task_struct *p) 2356 void kick_process(struct task_struct *p)
2358 { 2357 {
2359 int cpu; 2358 int cpu;
2360 2359
2361 preempt_disable(); 2360 preempt_disable();
2362 cpu = task_cpu(p); 2361 cpu = task_cpu(p);
2363 if ((cpu != smp_processor_id()) && task_curr(p)) 2362 if ((cpu != smp_processor_id()) && task_curr(p))
2364 smp_send_reschedule(cpu); 2363 smp_send_reschedule(cpu);
2365 preempt_enable(); 2364 preempt_enable();
2366 } 2365 }
2367 EXPORT_SYMBOL_GPL(kick_process); 2366 EXPORT_SYMBOL_GPL(kick_process);
2368 #endif /* CONFIG_SMP */ 2367 #endif /* CONFIG_SMP */
2369 2368
2370 #ifdef CONFIG_SMP 2369 #ifdef CONFIG_SMP
2371 /* 2370 /*
2372 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 2371 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2373 */ 2372 */
2374 static int select_fallback_rq(int cpu, struct task_struct *p) 2373 static int select_fallback_rq(int cpu, struct task_struct *p)
2375 { 2374 {
2376 int dest_cpu; 2375 int dest_cpu;
2377 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2376 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2378 2377
2379 /* Look for allowed, online CPU in same node. */ 2378 /* Look for allowed, online CPU in same node. */
2380 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2379 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2381 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2380 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2382 return dest_cpu; 2381 return dest_cpu;
2383 2382
2384 /* Any allowed, online CPU? */ 2383 /* Any allowed, online CPU? */
2385 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2384 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2386 if (dest_cpu < nr_cpu_ids) 2385 if (dest_cpu < nr_cpu_ids)
2387 return dest_cpu; 2386 return dest_cpu;
2388 2387
2389 /* No more Mr. Nice Guy. */ 2388 /* No more Mr. Nice Guy. */
2390 dest_cpu = cpuset_cpus_allowed_fallback(p); 2389 dest_cpu = cpuset_cpus_allowed_fallback(p);
2391 /* 2390 /*
2392 * Don't tell them about moving exiting tasks or 2391 * Don't tell them about moving exiting tasks or
2393 * kernel threads (both mm NULL), since they never 2392 * kernel threads (both mm NULL), since they never
2394 * leave the kernel. 2393 * leave the kernel.
2395 */ 2394 */
2396 if (p->mm && printk_ratelimit()) { 2395 if (p->mm && printk_ratelimit()) {
2397 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 2396 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2398 task_pid_nr(p), p->comm, cpu); 2397 task_pid_nr(p), p->comm, cpu);
2399 } 2398 }
2400 2399
2401 return dest_cpu; 2400 return dest_cpu;
2402 } 2401 }
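
select_fallback_rq() widens its search in three steps: an allowed and active CPU in the task's current node, then any allowed and active CPU, and finally whatever cpuset_cpus_allowed_fallback() hands back. A compact userspace model of that ordering with plain 32-bit masks; the masks, node layout and pick_fallback() helper are invented for illustration:

#include <stdio.h>

/* Illustrative masks: each bit is a CPU. */
static unsigned int pick_fallback(unsigned int node_mask,
                                  unsigned int active_mask,
                                  unsigned int allowed_mask,
                                  unsigned int cpuset_fallback)
{
        unsigned int m;
        unsigned int cpu;

        /* 1) Look for an allowed, active CPU in the same node. */
        m = node_mask & active_mask & allowed_mask;
        /* 2) Otherwise any allowed, active CPU. */
        if (!m)
                m = active_mask & allowed_mask;
        if (m) {
                for (cpu = 0; cpu < 32; cpu++)
                        if (m & (1u << cpu))
                                return cpu;
        }
        /* 3) No more Mr. Nice Guy: take what the cpuset layer hands back. */
        return cpuset_fallback;
}

int main(void)
{
        /* Node holds CPUs 0-3, CPUs 2-5 are active, task is allowed on 4-7. */
        printf("fallback cpu = %u\n", pick_fallback(0x0f, 0x3c, 0xf0, 0));
        return 0;
}
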
2403 2402
2404 /* 2403 /*
2405 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 2404 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2406 */ 2405 */
2407 static inline 2406 static inline
2408 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2407 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2409 { 2408 {
2410 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2409 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2411 2410
2412 /* 2411 /*
2413 * In order not to call set_task_cpu() on a blocking task we need 2412 * In order not to call set_task_cpu() on a blocking task we need
2414 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2413 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2415 * cpu. 2414 * cpu.
2416 * 2415 *
2417 * Since this is common to all placement strategies, this lives here. 2416 * Since this is common to all placement strategies, this lives here.
2418 * 2417 *
2419 * [ this allows ->select_task() to simply return task_cpu(p) and 2418 * [ this allows ->select_task() to simply return task_cpu(p) and
2420 * not worry about this generic constraint ] 2419 * not worry about this generic constraint ]
2421 */ 2420 */
2422 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2421 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2423 !cpu_online(cpu))) 2422 !cpu_online(cpu)))
2424 cpu = select_fallback_rq(task_cpu(p), p); 2423 cpu = select_fallback_rq(task_cpu(p), p);
2425 2424
2426 return cpu; 2425 return cpu;
2427 } 2426 }
2428 2427
2429 static void update_avg(u64 *avg, u64 sample) 2428 static void update_avg(u64 *avg, u64 sample)
2430 { 2429 {
2431 s64 diff = sample - *avg; 2430 s64 diff = sample - *avg;
2432 *avg += diff >> 3; 2431 *avg += diff >> 3;
2433 } 2432 }
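
update_avg() is a fixed-point exponential moving average with weight 1/8: each sample pulls the average one eighth of the way toward itself. Further down, ttwu_do_wakeup() feeds it idle-time deltas (clamped to twice sysctl_sched_migration_cost) to maintain rq->avg_idle. A standalone run showing the convergence; only the arithmetic is copied from update_avg(), the harness around it is made up:

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as update_avg(): avg += (sample - avg) / 8. */
static void model_update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = sample - *avg;
        *avg += diff >> 3;
}

int main(void)
{
        uint64_t avg = 0;
        int i;

        /* Feed a constant 1,000,000 ns "idle delta"; the average chases it. */
        for (i = 1; i <= 32; i++) {
                model_update_avg(&avg, 1000000);
                if (i == 1 || i == 8 || i == 32)
                        printf("after %2d samples: avg = %llu\n",
                               i, (unsigned long long)avg);
        }
        return 0;
}
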
2434 #endif 2433 #endif
2435 2434
2436 static void 2435 static void
2437 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 2436 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2438 { 2437 {
2439 #ifdef CONFIG_SCHEDSTATS 2438 #ifdef CONFIG_SCHEDSTATS
2440 struct rq *rq = this_rq(); 2439 struct rq *rq = this_rq();
2441 2440
2442 #ifdef CONFIG_SMP 2441 #ifdef CONFIG_SMP
2443 int this_cpu = smp_processor_id(); 2442 int this_cpu = smp_processor_id();
2444 2443
2445 if (cpu == this_cpu) { 2444 if (cpu == this_cpu) {
2446 schedstat_inc(rq, ttwu_local); 2445 schedstat_inc(rq, ttwu_local);
2447 schedstat_inc(p, se.statistics.nr_wakeups_local); 2446 schedstat_inc(p, se.statistics.nr_wakeups_local);
2448 } else { 2447 } else {
2449 struct sched_domain *sd; 2448 struct sched_domain *sd;
2450 2449
2451 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2450 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2452 rcu_read_lock(); 2451 rcu_read_lock();
2453 for_each_domain(this_cpu, sd) { 2452 for_each_domain(this_cpu, sd) {
2454 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2453 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2455 schedstat_inc(sd, ttwu_wake_remote); 2454 schedstat_inc(sd, ttwu_wake_remote);
2456 break; 2455 break;
2457 } 2456 }
2458 } 2457 }
2459 rcu_read_unlock(); 2458 rcu_read_unlock();
2460 } 2459 }
2461 2460
2462 if (wake_flags & WF_MIGRATED) 2461 if (wake_flags & WF_MIGRATED)
2463 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2462 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2464 2463
2465 #endif /* CONFIG_SMP */ 2464 #endif /* CONFIG_SMP */
2466 2465
2467 schedstat_inc(rq, ttwu_count); 2466 schedstat_inc(rq, ttwu_count);
2468 schedstat_inc(p, se.statistics.nr_wakeups); 2467 schedstat_inc(p, se.statistics.nr_wakeups);
2469 2468
2470 if (wake_flags & WF_SYNC) 2469 if (wake_flags & WF_SYNC)
2471 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2470 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2472 2471
2473 #endif /* CONFIG_SCHEDSTATS */ 2472 #endif /* CONFIG_SCHEDSTATS */
2474 } 2473 }
2475 2474
2476 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 2475 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2477 { 2476 {
2478 activate_task(rq, p, en_flags); 2477 activate_task(rq, p, en_flags);
2479 p->on_rq = 1; 2478 p->on_rq = 1;
2480 2479
2481 /* if a worker is waking up, notify workqueue */ 2480 /* if a worker is waking up, notify workqueue */
2482 if (p->flags & PF_WQ_WORKER) 2481 if (p->flags & PF_WQ_WORKER)
2483 wq_worker_waking_up(p, cpu_of(rq)); 2482 wq_worker_waking_up(p, cpu_of(rq));
2484 } 2483 }
2485 2484
2486 /* 2485 /*
2487 * Mark the task runnable and perform wakeup-preemption. 2486 * Mark the task runnable and perform wakeup-preemption.
2488 */ 2487 */
2489 static void 2488 static void
2490 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 2489 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2491 { 2490 {
2492 trace_sched_wakeup(p, true); 2491 trace_sched_wakeup(p, true);
2493 check_preempt_curr(rq, p, wake_flags); 2492 check_preempt_curr(rq, p, wake_flags);
2494 2493
2495 p->state = TASK_RUNNING; 2494 p->state = TASK_RUNNING;
2496 #ifdef CONFIG_SMP 2495 #ifdef CONFIG_SMP
2497 if (p->sched_class->task_woken) 2496 if (p->sched_class->task_woken)
2498 p->sched_class->task_woken(rq, p); 2497 p->sched_class->task_woken(rq, p);
2499 2498
2500 if (unlikely(rq->idle_stamp)) { 2499 if (unlikely(rq->idle_stamp)) {
2501 u64 delta = rq->clock - rq->idle_stamp; 2500 u64 delta = rq->clock - rq->idle_stamp;
2502 u64 max = 2*sysctl_sched_migration_cost; 2501 u64 max = 2*sysctl_sched_migration_cost;
2503 2502
2504 if (delta > max) 2503 if (delta > max)
2505 rq->avg_idle = max; 2504 rq->avg_idle = max;
2506 else 2505 else
2507 update_avg(&rq->avg_idle, delta); 2506 update_avg(&rq->avg_idle, delta);
2508 rq->idle_stamp = 0; 2507 rq->idle_stamp = 0;
2509 } 2508 }
2510 #endif 2509 #endif
2511 } 2510 }
2512 2511
2513 static void 2512 static void
2514 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 2513 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2515 { 2514 {
2516 #ifdef CONFIG_SMP 2515 #ifdef CONFIG_SMP
2517 if (p->sched_contributes_to_load) 2516 if (p->sched_contributes_to_load)
2518 rq->nr_uninterruptible--; 2517 rq->nr_uninterruptible--;
2519 #endif 2518 #endif
2520 2519
2521 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 2520 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2522 ttwu_do_wakeup(rq, p, wake_flags); 2521 ttwu_do_wakeup(rq, p, wake_flags);
2523 } 2522 }
2524 2523
2525 /* 2524 /*
2526 * Called in case the task @p isn't fully descheduled from its runqueue, 2525 * Called in case the task @p isn't fully descheduled from its runqueue,
2527 * in this case we must do a remote wakeup. It's a 'light' wakeup though, 2526 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2528 * since all we need to do is flip p->state to TASK_RUNNING, as 2527 * since all we need to do is flip p->state to TASK_RUNNING, as
2529 * the task is still ->on_rq. 2528 * the task is still ->on_rq.
2530 */ 2529 */
2531 static int ttwu_remote(struct task_struct *p, int wake_flags) 2530 static int ttwu_remote(struct task_struct *p, int wake_flags)
2532 { 2531 {
2533 struct rq *rq; 2532 struct rq *rq;
2534 int ret = 0; 2533 int ret = 0;
2535 2534
2536 rq = __task_rq_lock(p); 2535 rq = __task_rq_lock(p);
2537 if (p->on_rq) { 2536 if (p->on_rq) {
2538 ttwu_do_wakeup(rq, p, wake_flags); 2537 ttwu_do_wakeup(rq, p, wake_flags);
2539 ret = 1; 2538 ret = 1;
2540 } 2539 }
2541 __task_rq_unlock(rq); 2540 __task_rq_unlock(rq);
2542 2541
2543 return ret; 2542 return ret;
2544 } 2543 }
2545 2544
2546 #ifdef CONFIG_SMP 2545 #ifdef CONFIG_SMP
2547 static void sched_ttwu_pending(void) 2546 static void sched_ttwu_pending(void)
2548 { 2547 {
2549 struct rq *rq = this_rq(); 2548 struct rq *rq = this_rq();
2550 struct task_struct *list = xchg(&rq->wake_list, NULL); 2549 struct task_struct *list = xchg(&rq->wake_list, NULL);
2551 2550
2552 if (!list) 2551 if (!list)
2553 return; 2552 return;
2554 2553
2555 raw_spin_lock(&rq->lock); 2554 raw_spin_lock(&rq->lock);
2556 2555
2557 while (list) { 2556 while (list) {
2558 struct task_struct *p = list; 2557 struct task_struct *p = list;
2559 list = list->wake_entry; 2558 list = list->wake_entry;
2560 ttwu_do_activate(rq, p, 0); 2559 ttwu_do_activate(rq, p, 0);
2561 } 2560 }
2562 2561
2563 raw_spin_unlock(&rq->lock); 2562 raw_spin_unlock(&rq->lock);
2564 } 2563 }
2565 2564
2566 void scheduler_ipi(void) 2565 void scheduler_ipi(void)
2567 { 2566 {
2568 sched_ttwu_pending(); 2567 sched_ttwu_pending();
2569 } 2568 }
2570 2569
2571 static void ttwu_queue_remote(struct task_struct *p, int cpu) 2570 static void ttwu_queue_remote(struct task_struct *p, int cpu)
2572 { 2571 {
2573 struct rq *rq = cpu_rq(cpu); 2572 struct rq *rq = cpu_rq(cpu);
2574 struct task_struct *next = rq->wake_list; 2573 struct task_struct *next = rq->wake_list;
2575 2574
2576 for (;;) { 2575 for (;;) {
2577 struct task_struct *old = next; 2576 struct task_struct *old = next;
2578 2577
2579 p->wake_entry = next; 2578 p->wake_entry = next;
2580 next = cmpxchg(&rq->wake_list, old, p); 2579 next = cmpxchg(&rq->wake_list, old, p);
2581 if (next == old) 2580 if (next == old)
2582 break; 2581 break;
2583 } 2582 }
2584 2583
2585 if (!next) 2584 if (!next)
2586 smp_send_reschedule(cpu); 2585 smp_send_reschedule(cpu);
2587 } 2586 }
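
The cmpxchg() loop above is a lock-free push onto the remote runqueue's wake_list (keep retrying until the head that was read is still the head being replaced), and only the waker that found the list empty sends the reschedule IPI; sched_ttwu_pending() later detaches the whole list with a single xchg(). A small userspace sketch of the same push-and-drain pattern using C11 atomics; the names wake_list, push() and drain() mirror the intent but are not kernel API:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        int id;                 /* stand-in for the queued task */
        struct node *next;      /* models p->wake_entry */
};

static _Atomic(struct node *) wake_list;

/* Push like ttwu_queue_remote(); returns 1 if the list was empty (send IPI). */
static int push(struct node *n)
{
        struct node *old = atomic_load(&wake_list);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak(&wake_list, &old, n));

        return old == NULL;
}

/* Drain like sched_ttwu_pending(): detach the whole list in one exchange. */
static struct node *drain(void)
{
        return atomic_exchange(&wake_list, (struct node *)NULL);
}

int main(void)
{
        struct node a = { 1, NULL }, b = { 2, NULL };
        struct node *p;

        printf("queue a: send IPI = %d\n", push(&a));   /* list was empty: 1 */
        printf("queue b: send IPI = %d\n", push(&b));   /* already pending: 0 */

        for (p = drain(); p; p = p->next)               /* LIFO, like the kernel list */
                printf("process queued task %d\n", p->id);
        return 0;
}

Build with -std=c11; in the kernel the push and the drain run on different CPUs, which is exactly what the atomic operations make safe.
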
2588 2587
2589 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2588 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2590 static int ttwu_activate_remote(struct task_struct *p, int wake_flags) 2589 static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2591 { 2590 {
2592 struct rq *rq; 2591 struct rq *rq;
2593 int ret = 0; 2592 int ret = 0;
2594 2593
2595 rq = __task_rq_lock(p); 2594 rq = __task_rq_lock(p);
2596 if (p->on_cpu) { 2595 if (p->on_cpu) {
2597 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2596 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2598 ttwu_do_wakeup(rq, p, wake_flags); 2597 ttwu_do_wakeup(rq, p, wake_flags);
2599 ret = 1; 2598 ret = 1;
2600 } 2599 }
2601 __task_rq_unlock(rq); 2600 __task_rq_unlock(rq);
2602 2601
2603 return ret; 2602 return ret;
2604 2603
2605 } 2604 }
2606 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2605 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2607 #endif /* CONFIG_SMP */ 2606 #endif /* CONFIG_SMP */
2608 2607
2609 static void ttwu_queue(struct task_struct *p, int cpu) 2608 static void ttwu_queue(struct task_struct *p, int cpu)
2610 { 2609 {
2611 struct rq *rq = cpu_rq(cpu); 2610 struct rq *rq = cpu_rq(cpu);
2612 2611
2613 #if defined(CONFIG_SMP) 2612 #if defined(CONFIG_SMP)
2614 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2613 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2615 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 2614 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2616 ttwu_queue_remote(p, cpu); 2615 ttwu_queue_remote(p, cpu);
2617 return; 2616 return;
2618 } 2617 }
2619 #endif 2618 #endif
2620 2619
2621 raw_spin_lock(&rq->lock); 2620 raw_spin_lock(&rq->lock);
2622 ttwu_do_activate(rq, p, 0); 2621 ttwu_do_activate(rq, p, 0);
2623 raw_spin_unlock(&rq->lock); 2622 raw_spin_unlock(&rq->lock);
2624 } 2623 }
2625 2624
2626 /** 2625 /**
2627 * try_to_wake_up - wake up a thread 2626 * try_to_wake_up - wake up a thread
2628 * @p: the thread to be awakened 2627 * @p: the thread to be awakened
2629 * @state: the mask of task states that can be woken 2628 * @state: the mask of task states that can be woken
2630 * @wake_flags: wake modifier flags (WF_*) 2629 * @wake_flags: wake modifier flags (WF_*)
2631 * 2630 *
2632 * Put it on the run-queue if it's not already there. The "current" 2631 * Put it on the run-queue if it's not already there. The "current"
2633 * thread is always on the run-queue (except when the actual 2632 * thread is always on the run-queue (except when the actual
2634 * re-schedule is in progress), and as such you're allowed to do 2633 * re-schedule is in progress), and as such you're allowed to do
2635 * the simpler "current->state = TASK_RUNNING" to mark yourself 2634 * the simpler "current->state = TASK_RUNNING" to mark yourself
2636 * runnable without the overhead of this. 2635 * runnable without the overhead of this.
2637 * 2636 *
2638 * Returns %true if @p was woken up, %false if it was already running 2637 * Returns %true if @p was woken up, %false if it was already running
2639 * or @state didn't match @p's state. 2638 * or @state didn't match @p's state.
2640 */ 2639 */
2641 static int 2640 static int
2642 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 2641 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2643 { 2642 {
2644 unsigned long flags; 2643 unsigned long flags;
2645 int cpu, success = 0; 2644 int cpu, success = 0;
2646 2645
2647 smp_wmb(); 2646 smp_wmb();
2648 raw_spin_lock_irqsave(&p->pi_lock, flags); 2647 raw_spin_lock_irqsave(&p->pi_lock, flags);
2649 if (!(p->state & state)) 2648 if (!(p->state & state))
2650 goto out; 2649 goto out;
2651 2650
2652 success = 1; /* we're going to change ->state */ 2651 success = 1; /* we're going to change ->state */
2653 cpu = task_cpu(p); 2652 cpu = task_cpu(p);
2654 2653
2655 if (p->on_rq && ttwu_remote(p, wake_flags)) 2654 if (p->on_rq && ttwu_remote(p, wake_flags))
2656 goto stat; 2655 goto stat;
2657 2656
2658 #ifdef CONFIG_SMP 2657 #ifdef CONFIG_SMP
2659 /* 2658 /*
2660 * If the owning (remote) cpu is still in the middle of schedule() with 2659 * If the owning (remote) cpu is still in the middle of schedule() with
2661 * this task as prev, wait until it's done referencing the task. 2660 * this task as prev, wait until it's done referencing the task.
2662 */ 2661 */
2663 while (p->on_cpu) { 2662 while (p->on_cpu) {
2664 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2663 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2665 /* 2664 /*
2666 * In case the architecture enables interrupts in 2665 * In case the architecture enables interrupts in
2667 * context_switch(), we cannot busy wait, since that 2666 * context_switch(), we cannot busy wait, since that
2668 * would lead to deadlocks when an interrupt hits and 2667 * would lead to deadlocks when an interrupt hits and
2669 * tries to wake up @prev. So bail and do a complete 2668 * tries to wake up @prev. So bail and do a complete
2670 * remote wakeup. 2669 * remote wakeup.
2671 */ 2670 */
2672 if (ttwu_activate_remote(p, wake_flags)) 2671 if (ttwu_activate_remote(p, wake_flags))
2673 goto stat; 2672 goto stat;
2674 #else 2673 #else
2675 cpu_relax(); 2674 cpu_relax();
2676 #endif 2675 #endif
2677 } 2676 }
2678 /* 2677 /*
2679 * Pairs with the smp_wmb() in finish_lock_switch(). 2678 * Pairs with the smp_wmb() in finish_lock_switch().
2680 */ 2679 */
2681 smp_rmb(); 2680 smp_rmb();
2682 2681
2683 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2682 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2684 p->state = TASK_WAKING; 2683 p->state = TASK_WAKING;
2685 2684
2686 if (p->sched_class->task_waking) 2685 if (p->sched_class->task_waking)
2687 p->sched_class->task_waking(p); 2686 p->sched_class->task_waking(p);
2688 2687
2689 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2688 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2690 if (task_cpu(p) != cpu) { 2689 if (task_cpu(p) != cpu) {
2691 wake_flags |= WF_MIGRATED; 2690 wake_flags |= WF_MIGRATED;
2692 set_task_cpu(p, cpu); 2691 set_task_cpu(p, cpu);
2693 } 2692 }
2694 #endif /* CONFIG_SMP */ 2693 #endif /* CONFIG_SMP */
2695 2694
2696 ttwu_queue(p, cpu); 2695 ttwu_queue(p, cpu);
2697 stat: 2696 stat:
2698 ttwu_stat(p, cpu, wake_flags); 2697 ttwu_stat(p, cpu, wake_flags);
2699 out: 2698 out:
2700 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2699 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2701 2700
2702 return success; 2701 return success;
2703 } 2702 }
2704 2703
2705 /** 2704 /**
2706 * try_to_wake_up_local - try to wake up a local task with rq lock held 2705 * try_to_wake_up_local - try to wake up a local task with rq lock held
2707 * @p: the thread to be awakened 2706 * @p: the thread to be awakened
2708 * 2707 *
2709 * Put @p on the run-queue if it's not already there. The caller must 2708 * Put @p on the run-queue if it's not already there. The caller must
2710 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2709 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2711 * the current task. 2710 * the current task.
2712 */ 2711 */
2713 static void try_to_wake_up_local(struct task_struct *p) 2712 static void try_to_wake_up_local(struct task_struct *p)
2714 { 2713 {
2715 struct rq *rq = task_rq(p); 2714 struct rq *rq = task_rq(p);
2716 2715
2717 BUG_ON(rq != this_rq()); 2716 BUG_ON(rq != this_rq());
2718 BUG_ON(p == current); 2717 BUG_ON(p == current);
2719 lockdep_assert_held(&rq->lock); 2718 lockdep_assert_held(&rq->lock);
2720 2719
2721 if (!raw_spin_trylock(&p->pi_lock)) { 2720 if (!raw_spin_trylock(&p->pi_lock)) {
2722 raw_spin_unlock(&rq->lock); 2721 raw_spin_unlock(&rq->lock);
2723 raw_spin_lock(&p->pi_lock); 2722 raw_spin_lock(&p->pi_lock);
2724 raw_spin_lock(&rq->lock); 2723 raw_spin_lock(&rq->lock);
2725 } 2724 }
2726 2725
2727 if (!(p->state & TASK_NORMAL)) 2726 if (!(p->state & TASK_NORMAL))
2728 goto out; 2727 goto out;
2729 2728
2730 if (!p->on_rq) 2729 if (!p->on_rq)
2731 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2730 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2732 2731
2733 ttwu_do_wakeup(rq, p, 0); 2732 ttwu_do_wakeup(rq, p, 0);
2734 ttwu_stat(p, smp_processor_id(), 0); 2733 ttwu_stat(p, smp_processor_id(), 0);
2735 out: 2734 out:
2736 raw_spin_unlock(&p->pi_lock); 2735 raw_spin_unlock(&p->pi_lock);
2737 } 2736 }
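
The trylock sequence at the top of try_to_wake_up_local() is the usual way to take a lock that is ordered before the one already held: try it opportunistically, and on contention drop the held rq->lock and retake both in the canonical p->pi_lock then rq->lock order, after which any state sampled earlier has to be re-checked (hence the p->state test that follows). A userspace sketch of the same dance with two pthread mutexes; the lock names are illustrative only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with rq_lock held; returns with both locks held. */
static void lock_pi_then_rq(void)
{
        if (pthread_mutex_trylock(&pi_lock)) {
                /* Contended: back off and retake in the canonical order. */
                pthread_mutex_unlock(&rq_lock);
                pthread_mutex_lock(&pi_lock);
                pthread_mutex_lock(&rq_lock);
                /* Anything sampled under the dropped lock must be re-checked. */
        }
}

int main(void)
{
        pthread_mutex_lock(&rq_lock);
        lock_pi_then_rq();
        printf("both locks held without violating lock order\n");
        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
        return 0;
}

Compile with -pthread.
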
2738 2737
2739 /** 2738 /**
2740 * wake_up_process - Wake up a specific process 2739 * wake_up_process - Wake up a specific process
2741 * @p: The process to be woken up. 2740 * @p: The process to be woken up.
2742 * 2741 *
2743 * Attempt to wake up the nominated process and move it to the set of runnable 2742 * Attempt to wake up the nominated process and move it to the set of runnable
2744 * processes. Returns 1 if the process was woken up, 0 if it was already 2743 * processes. Returns 1 if the process was woken up, 0 if it was already
2745 * running. 2744 * running.
2746 * 2745 *
2747 * It may be assumed that this function implies a write memory barrier before 2746 * It may be assumed that this function implies a write memory barrier before
2748 * changing the task state if and only if any tasks are woken up. 2747 * changing the task state if and only if any tasks are woken up.
2749 */ 2748 */
2750 int wake_up_process(struct task_struct *p) 2749 int wake_up_process(struct task_struct *p)
2751 { 2750 {
2752 return try_to_wake_up(p, TASK_ALL, 0); 2751 return try_to_wake_up(p, TASK_ALL, 0);
2753 } 2752 }
2754 EXPORT_SYMBOL(wake_up_process); 2753 EXPORT_SYMBOL(wake_up_process);
2755 2754
2756 int wake_up_state(struct task_struct *p, unsigned int state) 2755 int wake_up_state(struct task_struct *p, unsigned int state)
2757 { 2756 {
2758 return try_to_wake_up(p, state, 0); 2757 return try_to_wake_up(p, state, 0);
2759 } 2758 }
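
try_to_wake_up() only acts when the sleeper's state intersects the caller's @state mask, which is why wake_up_process() can pass the broad TASK_ALL, wake_up_state() lets the caller narrow it, and a task already in TASK_RUNNING (state 0) never matches. A tiny model of that filter; the state values are assumed to mirror the usual kernel encoding:

#include <stdio.h>

/* Values assumed to mirror the usual kernel state bits. */
#define TASK_RUNNING            0
#define TASK_INTERRUPTIBLE      1
#define TASK_UNINTERRUPTIBLE    2
#define TASK_NORMAL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* Models the "if (!(p->state & state)) goto out;" filter. */
static int would_wake(unsigned int task_state, unsigned int wake_mask)
{
        return (task_state & wake_mask) != 0;
}

int main(void)
{
        printf("interruptible sleeper, TASK_INTERRUPTIBLE mask   -> %d\n",
               would_wake(TASK_INTERRUPTIBLE, TASK_INTERRUPTIBLE));
        printf("uninterruptible sleeper, TASK_INTERRUPTIBLE mask -> %d\n",
               would_wake(TASK_UNINTERRUPTIBLE, TASK_INTERRUPTIBLE));
        printf("uninterruptible sleeper, TASK_NORMAL mask        -> %d\n",
               would_wake(TASK_UNINTERRUPTIBLE, TASK_NORMAL));
        printf("already running (state 0), any mask              -> %d\n",
               would_wake(TASK_RUNNING, TASK_NORMAL));
        return 0;
}
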
2760 2759
2761 /* 2760 /*
2762 * Perform scheduler related setup for a newly forked process p. 2761 * Perform scheduler related setup for a newly forked process p.
2763 * p is forked by current. 2762 * p is forked by current.
2764 * 2763 *
2765 * __sched_fork() is basic setup used by init_idle() too: 2764 * __sched_fork() is basic setup used by init_idle() too:
2766 */ 2765 */
2767 static void __sched_fork(struct task_struct *p) 2766 static void __sched_fork(struct task_struct *p)
2768 { 2767 {
2769 p->on_rq = 0; 2768 p->on_rq = 0;
2770 2769
2771 p->se.on_rq = 0; 2770 p->se.on_rq = 0;
2772 p->se.exec_start = 0; 2771 p->se.exec_start = 0;
2773 p->se.sum_exec_runtime = 0; 2772 p->se.sum_exec_runtime = 0;
2774 p->se.prev_sum_exec_runtime = 0; 2773 p->se.prev_sum_exec_runtime = 0;
2775 p->se.nr_migrations = 0; 2774 p->se.nr_migrations = 0;
2776 p->se.vruntime = 0; 2775 p->se.vruntime = 0;
2777 INIT_LIST_HEAD(&p->se.group_node); 2776 INIT_LIST_HEAD(&p->se.group_node);
2778 2777
2779 #ifdef CONFIG_SCHEDSTATS 2778 #ifdef CONFIG_SCHEDSTATS
2780 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2779 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2781 #endif 2780 #endif
2782 2781
2783 INIT_LIST_HEAD(&p->rt.run_list); 2782 INIT_LIST_HEAD(&p->rt.run_list);
2784 2783
2785 #ifdef CONFIG_PREEMPT_NOTIFIERS 2784 #ifdef CONFIG_PREEMPT_NOTIFIERS
2786 INIT_HLIST_HEAD(&p->preempt_notifiers); 2785 INIT_HLIST_HEAD(&p->preempt_notifiers);
2787 #endif 2786 #endif
2788 } 2787 }
2789 2788
2790 /* 2789 /*
2791 * fork()/clone()-time setup: 2790 * fork()/clone()-time setup:
2792 */ 2791 */
2793 void sched_fork(struct task_struct *p) 2792 void sched_fork(struct task_struct *p)
2794 { 2793 {
2795 unsigned long flags; 2794 unsigned long flags;
2796 int cpu = get_cpu(); 2795 int cpu = get_cpu();
2797 2796
2798 __sched_fork(p); 2797 __sched_fork(p);
2799 /* 2798 /*
2800 * We mark the process as running here. This guarantees that 2799 * We mark the process as running here. This guarantees that
2801 * nobody will actually run it, and a signal or other external 2800 * nobody will actually run it, and a signal or other external
2802 * event cannot wake it up and insert it on the runqueue either. 2801 * event cannot wake it up and insert it on the runqueue either.
2803 */ 2802 */
2804 p->state = TASK_RUNNING; 2803 p->state = TASK_RUNNING;
2805 2804
2806 /* 2805 /*
2807 * Revert to default priority/policy on fork if requested. 2806 * Revert to default priority/policy on fork if requested.
2808 */ 2807 */
2809 if (unlikely(p->sched_reset_on_fork)) { 2808 if (unlikely(p->sched_reset_on_fork)) {
2810 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2809 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2811 p->policy = SCHED_NORMAL; 2810 p->policy = SCHED_NORMAL;
2812 p->normal_prio = p->static_prio; 2811 p->normal_prio = p->static_prio;
2813 } 2812 }
2814 2813
2815 if (PRIO_TO_NICE(p->static_prio) < 0) { 2814 if (PRIO_TO_NICE(p->static_prio) < 0) {
2816 p->static_prio = NICE_TO_PRIO(0); 2815 p->static_prio = NICE_TO_PRIO(0);
2817 p->normal_prio = p->static_prio; 2816 p->normal_prio = p->static_prio;
2818 set_load_weight(p); 2817 set_load_weight(p);
2819 } 2818 }
2820 2819
2821 /* 2820 /*
2822 * We don't need the reset flag anymore after the fork. It has 2821 * We don't need the reset flag anymore after the fork. It has
2823 * fulfilled its duty: 2822 * fulfilled its duty:
2824 */ 2823 */
2825 p->sched_reset_on_fork = 0; 2824 p->sched_reset_on_fork = 0;
2826 } 2825 }
2827 2826
2828 /* 2827 /*
2829 * Make sure we do not leak PI boosting priority to the child. 2828 * Make sure we do not leak PI boosting priority to the child.
2830 */ 2829 */
2831 p->prio = current->normal_prio; 2830 p->prio = current->normal_prio;
2832 2831
2833 if (!rt_prio(p->prio)) 2832 if (!rt_prio(p->prio))
2834 p->sched_class = &fair_sched_class; 2833 p->sched_class = &fair_sched_class;
2835 2834
2836 if (p->sched_class->task_fork) 2835 if (p->sched_class->task_fork)
2837 p->sched_class->task_fork(p); 2836 p->sched_class->task_fork(p);
2838 2837
2839 /* 2838 /*
2840 * The child is not yet in the pid-hash so no cgroup attach races, 2839 * The child is not yet in the pid-hash so no cgroup attach races,
2841 * and the cgroup is pinned to this child because cgroup_fork() 2840 * and the cgroup is pinned to this child because cgroup_fork()
2842 * is run before sched_fork(). 2841 * is run before sched_fork().
2843 * 2842 *
2844 * Silence PROVE_RCU. 2843 * Silence PROVE_RCU.
2845 */ 2844 */
2846 raw_spin_lock_irqsave(&p->pi_lock, flags); 2845 raw_spin_lock_irqsave(&p->pi_lock, flags);
2847 set_task_cpu(p, cpu); 2846 set_task_cpu(p, cpu);
2848 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2847 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2849 2848
2850 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2849 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2851 if (likely(sched_info_on())) 2850 if (likely(sched_info_on()))
2852 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2851 memset(&p->sched_info, 0, sizeof(p->sched_info));
2853 #endif 2852 #endif
2854 #if defined(CONFIG_SMP) 2853 #if defined(CONFIG_SMP)
2855 p->on_cpu = 0; 2854 p->on_cpu = 0;
2856 #endif 2855 #endif
2857 #ifdef CONFIG_PREEMPT 2856 #ifdef CONFIG_PREEMPT
2858 /* Want to start with kernel preemption disabled. */ 2857 /* Want to start with kernel preemption disabled. */
2859 task_thread_info(p)->preempt_count = 1; 2858 task_thread_info(p)->preempt_count = 1;
2860 #endif 2859 #endif
2861 #ifdef CONFIG_SMP 2860 #ifdef CONFIG_SMP
2862 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2861 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2863 #endif 2862 #endif
2864 2863
2865 put_cpu(); 2864 put_cpu();
2866 } 2865 }
2867 2866
2868 /* 2867 /*
2869 * wake_up_new_task - wake up a newly created task for the first time. 2868 * wake_up_new_task - wake up a newly created task for the first time.
2870 * 2869 *
2871 * This function will do some initial scheduler statistics housekeeping 2870 * This function will do some initial scheduler statistics housekeeping
2872 * that must be done for every newly created context, then puts the task 2871 * that must be done for every newly created context, then puts the task
2873 * on the runqueue and wakes it. 2872 * on the runqueue and wakes it.
2874 */ 2873 */
2875 void wake_up_new_task(struct task_struct *p) 2874 void wake_up_new_task(struct task_struct *p)
2876 { 2875 {
2877 unsigned long flags; 2876 unsigned long flags;
2878 struct rq *rq; 2877 struct rq *rq;
2879 2878
2880 raw_spin_lock_irqsave(&p->pi_lock, flags); 2879 raw_spin_lock_irqsave(&p->pi_lock, flags);
2881 #ifdef CONFIG_SMP 2880 #ifdef CONFIG_SMP
2882 /* 2881 /*
2883 * Fork balancing, do it here and not earlier because: 2882 * Fork balancing, do it here and not earlier because:
2884 * - cpus_allowed can change in the fork path 2883 * - cpus_allowed can change in the fork path
2885 * - any previously selected cpu might disappear through hotplug 2884 * - any previously selected cpu might disappear through hotplug
2886 */ 2885 */
2887 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 2886 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2888 #endif 2887 #endif
2889 2888
2890 rq = __task_rq_lock(p); 2889 rq = __task_rq_lock(p);
2891 activate_task(rq, p, 0); 2890 activate_task(rq, p, 0);
2892 p->on_rq = 1; 2891 p->on_rq = 1;
2893 trace_sched_wakeup_new(p, true); 2892 trace_sched_wakeup_new(p, true);
2894 check_preempt_curr(rq, p, WF_FORK); 2893 check_preempt_curr(rq, p, WF_FORK);
2895 #ifdef CONFIG_SMP 2894 #ifdef CONFIG_SMP
2896 if (p->sched_class->task_woken) 2895 if (p->sched_class->task_woken)
2897 p->sched_class->task_woken(rq, p); 2896 p->sched_class->task_woken(rq, p);
2898 #endif 2897 #endif
2899 task_rq_unlock(rq, p, &flags); 2898 task_rq_unlock(rq, p, &flags);
2900 } 2899 }
2901 2900
2902 #ifdef CONFIG_PREEMPT_NOTIFIERS 2901 #ifdef CONFIG_PREEMPT_NOTIFIERS
2903 2902
2904 /** 2903 /**
2905 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2904 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2906 * @notifier: notifier struct to register 2905 * @notifier: notifier struct to register
2907 */ 2906 */
2908 void preempt_notifier_register(struct preempt_notifier *notifier) 2907 void preempt_notifier_register(struct preempt_notifier *notifier)
2909 { 2908 {
2910 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2909 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2911 } 2910 }
2912 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2911 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2913 2912
2914 /** 2913 /**
2915 * preempt_notifier_unregister - no longer interested in preemption notifications 2914 * preempt_notifier_unregister - no longer interested in preemption notifications
2916 * @notifier: notifier struct to unregister 2915 * @notifier: notifier struct to unregister
2917 * 2916 *
2918 * This is safe to call from within a preemption notifier. 2917 * This is safe to call from within a preemption notifier.
2919 */ 2918 */
2920 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2919 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2921 { 2920 {
2922 hlist_del(&notifier->link); 2921 hlist_del(&notifier->link);
2923 } 2922 }
2924 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2923 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2925 2924
2926 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2925 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2927 { 2926 {
2928 struct preempt_notifier *notifier; 2927 struct preempt_notifier *notifier;
2929 struct hlist_node *node; 2928 struct hlist_node *node;
2930 2929
2931 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2930 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2932 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2931 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2933 } 2932 }
2934 2933
2935 static void 2934 static void
2936 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2935 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2937 struct task_struct *next) 2936 struct task_struct *next)
2938 { 2937 {
2939 struct preempt_notifier *notifier; 2938 struct preempt_notifier *notifier;
2940 struct hlist_node *node; 2939 struct hlist_node *node;
2941 2940
2942 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2941 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2943 notifier->ops->sched_out(notifier, next); 2942 notifier->ops->sched_out(notifier, next);
2944 } 2943 }
2945 2944
2946 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2945 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2947 2946
2948 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2947 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2949 { 2948 {
2950 } 2949 }
2951 2950
2952 static void 2951 static void
2953 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2952 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2954 struct task_struct *next) 2953 struct task_struct *next)
2955 { 2954 {
2956 } 2955 }
2957 2956
2958 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2957 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2959 2958
2960 /** 2959 /**
2961 * prepare_task_switch - prepare to switch tasks 2960 * prepare_task_switch - prepare to switch tasks
2962 * @rq: the runqueue preparing to switch 2961 * @rq: the runqueue preparing to switch
2963 * @prev: the current task that is being switched out 2962 * @prev: the current task that is being switched out
2964 * @next: the task we are going to switch to. 2963 * @next: the task we are going to switch to.
2965 * 2964 *
2966 * This is called with the rq lock held and interrupts off. It must 2965 * This is called with the rq lock held and interrupts off. It must
2967 * be paired with a subsequent finish_task_switch after the context 2966 * be paired with a subsequent finish_task_switch after the context
2968 * switch. 2967 * switch.
2969 * 2968 *
2970 * prepare_task_switch sets up locking and calls architecture specific 2969 * prepare_task_switch sets up locking and calls architecture specific
2971 * hooks. 2970 * hooks.
2972 */ 2971 */
2973 static inline void 2972 static inline void
2974 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2973 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2975 struct task_struct *next) 2974 struct task_struct *next)
2976 { 2975 {
2977 sched_info_switch(prev, next); 2976 sched_info_switch(prev, next);
2978 perf_event_task_sched_out(prev, next); 2977 perf_event_task_sched_out(prev, next);
2979 fire_sched_out_preempt_notifiers(prev, next); 2978 fire_sched_out_preempt_notifiers(prev, next);
2980 prepare_lock_switch(rq, next); 2979 prepare_lock_switch(rq, next);
2981 prepare_arch_switch(next); 2980 prepare_arch_switch(next);
2982 trace_sched_switch(prev, next); 2981 trace_sched_switch(prev, next);
2983 } 2982 }
2984 2983
2985 /** 2984 /**
2986 * finish_task_switch - clean up after a task-switch 2985 * finish_task_switch - clean up after a task-switch
2987 * @rq: runqueue associated with task-switch 2986 * @rq: runqueue associated with task-switch
2988 * @prev: the thread we just switched away from. 2987 * @prev: the thread we just switched away from.
2989 * 2988 *
2990 * finish_task_switch must be called after the context switch, paired 2989 * finish_task_switch must be called after the context switch, paired
2991 * with a prepare_task_switch call before the context switch. 2990 * with a prepare_task_switch call before the context switch.
2992 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2991 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2993 * and do any other architecture-specific cleanup actions. 2992 * and do any other architecture-specific cleanup actions.
2994 * 2993 *
2995 * Note that we may have delayed dropping an mm in context_switch(). If 2994 * Note that we may have delayed dropping an mm in context_switch(). If
2996 * so, we finish that here outside of the runqueue lock. (Doing it 2995 * so, we finish that here outside of the runqueue lock. (Doing it
2997 * with the lock held can cause deadlocks; see schedule() for 2996 * with the lock held can cause deadlocks; see schedule() for
2998 * details.) 2997 * details.)
2999 */ 2998 */
3000 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2999 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3001 __releases(rq->lock) 3000 __releases(rq->lock)
3002 { 3001 {
3003 struct mm_struct *mm = rq->prev_mm; 3002 struct mm_struct *mm = rq->prev_mm;
3004 long prev_state; 3003 long prev_state;
3005 3004
3006 rq->prev_mm = NULL; 3005 rq->prev_mm = NULL;
3007 3006
3008 /* 3007 /*
3009 * A task struct has one reference for the use as "current". 3008 * A task struct has one reference for the use as "current".
3010 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 3009 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3011 * schedule one last time. The schedule call will never return, and 3010 * schedule one last time. The schedule call will never return, and
3012 * the scheduled task must drop that reference. 3011 * the scheduled task must drop that reference.
3013 * The test for TASK_DEAD must occur while the runqueue locks are 3012 * The test for TASK_DEAD must occur while the runqueue locks are
3014 * still held, otherwise prev could be scheduled on another cpu, die 3013 * still held, otherwise prev could be scheduled on another cpu, die
3015 * there before we look at prev->state, and then the reference would 3014 * there before we look at prev->state, and then the reference would
3016 * be dropped twice. 3015 * be dropped twice.
3017 * Manfred Spraul <manfred@colorfullife.com> 3016 * Manfred Spraul <manfred@colorfullife.com>
3018 */ 3017 */
3019 prev_state = prev->state; 3018 prev_state = prev->state;
3020 finish_arch_switch(prev); 3019 finish_arch_switch(prev);
3021 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3020 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3022 local_irq_disable(); 3021 local_irq_disable();
3023 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3022 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3024 perf_event_task_sched_in(current); 3023 perf_event_task_sched_in(current);
3025 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3024 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3026 local_irq_enable(); 3025 local_irq_enable();
3027 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3026 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3028 finish_lock_switch(rq, prev); 3027 finish_lock_switch(rq, prev);
3029 3028
3030 fire_sched_in_preempt_notifiers(current); 3029 fire_sched_in_preempt_notifiers(current);
3031 if (mm) 3030 if (mm)
3032 mmdrop(mm); 3031 mmdrop(mm);
3033 if (unlikely(prev_state == TASK_DEAD)) { 3032 if (unlikely(prev_state == TASK_DEAD)) {
3034 /* 3033 /*
3035 * Remove function-return probe instances associated with this 3034 * Remove function-return probe instances associated with this
3036 * task and put them back on the free list. 3035 * task and put them back on the free list.
3037 */ 3036 */
3038 kprobe_flush_task(prev); 3037 kprobe_flush_task(prev);
3039 put_task_struct(prev); 3038 put_task_struct(prev);
3040 } 3039 }
3041 } 3040 }
3042 3041
3043 #ifdef CONFIG_SMP 3042 #ifdef CONFIG_SMP
3044 3043
3045 /* assumes rq->lock is held */ 3044 /* assumes rq->lock is held */
3046 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 3045 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3047 { 3046 {
3048 if (prev->sched_class->pre_schedule) 3047 if (prev->sched_class->pre_schedule)
3049 prev->sched_class->pre_schedule(rq, prev); 3048 prev->sched_class->pre_schedule(rq, prev);
3050 } 3049 }
3051 3050
3052 /* rq->lock is NOT held, but preemption is disabled */ 3051 /* rq->lock is NOT held, but preemption is disabled */
3053 static inline void post_schedule(struct rq *rq) 3052 static inline void post_schedule(struct rq *rq)
3054 { 3053 {
3055 if (rq->post_schedule) { 3054 if (rq->post_schedule) {
3056 unsigned long flags; 3055 unsigned long flags;
3057 3056
3058 raw_spin_lock_irqsave(&rq->lock, flags); 3057 raw_spin_lock_irqsave(&rq->lock, flags);
3059 if (rq->curr->sched_class->post_schedule) 3058 if (rq->curr->sched_class->post_schedule)
3060 rq->curr->sched_class->post_schedule(rq); 3059 rq->curr->sched_class->post_schedule(rq);
3061 raw_spin_unlock_irqrestore(&rq->lock, flags); 3060 raw_spin_unlock_irqrestore(&rq->lock, flags);
3062 3061
3063 rq->post_schedule = 0; 3062 rq->post_schedule = 0;
3064 } 3063 }
3065 } 3064 }
3066 3065
3067 #else 3066 #else
3068 3067
3069 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 3068 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3070 { 3069 {
3071 } 3070 }
3072 3071
3073 static inline void post_schedule(struct rq *rq) 3072 static inline void post_schedule(struct rq *rq)
3074 { 3073 {
3075 } 3074 }
3076 3075
3077 #endif 3076 #endif
3078 3077
3079 /** 3078 /**
3080 * schedule_tail - first thing a freshly forked thread must call. 3079 * schedule_tail - first thing a freshly forked thread must call.
3081 * @prev: the thread we just switched away from. 3080 * @prev: the thread we just switched away from.
3082 */ 3081 */
3083 asmlinkage void schedule_tail(struct task_struct *prev) 3082 asmlinkage void schedule_tail(struct task_struct *prev)
3084 __releases(rq->lock) 3083 __releases(rq->lock)
3085 { 3084 {
3086 struct rq *rq = this_rq(); 3085 struct rq *rq = this_rq();
3087 3086
3088 finish_task_switch(rq, prev); 3087 finish_task_switch(rq, prev);
3089 3088
3090 /* 3089 /*
3091 * FIXME: do we need to worry about rq being invalidated by the 3090 * FIXME: do we need to worry about rq being invalidated by the
3092 * task_switch? 3091 * task_switch?
3093 */ 3092 */
3094 post_schedule(rq); 3093 post_schedule(rq);
3095 3094
3096 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 3095 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
3097 /* In this case, finish_task_switch does not reenable preemption */ 3096 /* In this case, finish_task_switch does not reenable preemption */
3098 preempt_enable(); 3097 preempt_enable();
3099 #endif 3098 #endif
3100 if (current->set_child_tid) 3099 if (current->set_child_tid)
3101 put_user(task_pid_vnr(current), current->set_child_tid); 3100 put_user(task_pid_vnr(current), current->set_child_tid);
3102 } 3101 }
3103 3102
3104 /* 3103 /*
3105 * context_switch - switch to the new MM and the new 3104 * context_switch - switch to the new MM and the new
3106 * thread's register state. 3105 * thread's register state.
3107 */ 3106 */
3108 static inline void 3107 static inline void
3109 context_switch(struct rq *rq, struct task_struct *prev, 3108 context_switch(struct rq *rq, struct task_struct *prev,
3110 struct task_struct *next) 3109 struct task_struct *next)
3111 { 3110 {
3112 struct mm_struct *mm, *oldmm; 3111 struct mm_struct *mm, *oldmm;
3113 3112
3114 prepare_task_switch(rq, prev, next); 3113 prepare_task_switch(rq, prev, next);
3115 3114
3116 mm = next->mm; 3115 mm = next->mm;
3117 oldmm = prev->active_mm; 3116 oldmm = prev->active_mm;
3118 /* 3117 /*
3119 * For paravirt, this is coupled with an exit in switch_to to 3118 * For paravirt, this is coupled with an exit in switch_to to
3120 * combine the page table reload and the switch backend into 3119 * combine the page table reload and the switch backend into
3121 * one hypercall. 3120 * one hypercall.
3122 */ 3121 */
3123 arch_start_context_switch(prev); 3122 arch_start_context_switch(prev);
3124 3123
3125 if (!mm) { 3124 if (!mm) {
3126 next->active_mm = oldmm; 3125 next->active_mm = oldmm;
3127 atomic_inc(&oldmm->mm_count); 3126 atomic_inc(&oldmm->mm_count);
3128 enter_lazy_tlb(oldmm, next); 3127 enter_lazy_tlb(oldmm, next);
3129 } else 3128 } else
3130 switch_mm(oldmm, mm, next); 3129 switch_mm(oldmm, mm, next);
3131 3130
3132 if (!prev->mm) { 3131 if (!prev->mm) {
3133 prev->active_mm = NULL; 3132 prev->active_mm = NULL;
3134 rq->prev_mm = oldmm; 3133 rq->prev_mm = oldmm;
3135 } 3134 }
3136 /* 3135 /*
3137 * Since the runqueue lock will be released by the next 3136 * Since the runqueue lock will be released by the next
3138 * task (which is an invalid locking op but in the case 3137 * task (which is an invalid locking op but in the case
3139 * of the scheduler it's an obvious special-case), we 3138 * of the scheduler it's an obvious special-case), we
3140 * do an early lockdep release here: 3139 * do an early lockdep release here:
3141 */ 3140 */
3142 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 3141 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
3143 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 3142 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3144 #endif 3143 #endif
3145 3144
3146 /* Here we just switch the register state and the stack. */ 3145 /* Here we just switch the register state and the stack. */
3147 switch_to(prev, next, prev); 3146 switch_to(prev, next, prev);
3148 3147
3149 barrier(); 3148 barrier();
3150 /* 3149 /*
3151 * this_rq must be evaluated again because prev may have moved 3150 * this_rq must be evaluated again because prev may have moved
3152 * CPUs since it called schedule(), thus the 'rq' on its stack 3151 * CPUs since it called schedule(), thus the 'rq' on its stack
3153 * frame will be invalid. 3152 * frame will be invalid.
3154 */ 3153 */
3155 finish_task_switch(this_rq(), prev); 3154 finish_task_switch(this_rq(), prev);
3156 } 3155 }
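To make the lazy-mm handling above concrete: a kernel thread has no mm of its own (next->mm == NULL), so it borrows the outgoing task's address space; next->active_mm is pointed at oldmm and mm_count is bumped instead of calling switch_mm(). When we later switch away from that kernel thread (prev->mm == NULL), the borrowed mm is parked in rq->prev_mm so that finish_task_switch() can drop the extra reference with mmdrop() once the switch has completed.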
3157 3156
3158 /* 3157 /*
3159 * nr_running, nr_uninterruptible and nr_context_switches: 3158 * nr_running, nr_uninterruptible and nr_context_switches:
3160 * 3159 *
3161 * externally visible scheduler statistics: current number of runnable 3160 * externally visible scheduler statistics: current number of runnable
3162 * threads, current number of uninterruptible-sleeping threads, total 3161 * threads, current number of uninterruptible-sleeping threads, total
3163 * number of context switches performed since bootup. 3162 * number of context switches performed since bootup.
3164 */ 3163 */
3165 unsigned long nr_running(void) 3164 unsigned long nr_running(void)
3166 { 3165 {
3167 unsigned long i, sum = 0; 3166 unsigned long i, sum = 0;
3168 3167
3169 for_each_online_cpu(i) 3168 for_each_online_cpu(i)
3170 sum += cpu_rq(i)->nr_running; 3169 sum += cpu_rq(i)->nr_running;
3171 3170
3172 return sum; 3171 return sum;
3173 } 3172 }
3174 3173
3175 unsigned long nr_uninterruptible(void) 3174 unsigned long nr_uninterruptible(void)
3176 { 3175 {
3177 unsigned long i, sum = 0; 3176 unsigned long i, sum = 0;
3178 3177
3179 for_each_possible_cpu(i) 3178 for_each_possible_cpu(i)
3180 sum += cpu_rq(i)->nr_uninterruptible; 3179 sum += cpu_rq(i)->nr_uninterruptible;
3181 3180
3182 /* 3181 /*
3183 * Since we read the counters lockless, it might be slightly 3182 * Since we read the counters lockless, it might be slightly
3184 * inaccurate. Do not allow it to go below zero though: 3183 * inaccurate. Do not allow it to go below zero though:
3185 */ 3184 */
3186 if (unlikely((long)sum < 0)) 3185 if (unlikely((long)sum < 0))
3187 sum = 0; 3186 sum = 0;
3188 3187
3189 return sum; 3188 return sum;
3190 } 3189 }
3191 3190
3192 unsigned long long nr_context_switches(void) 3191 unsigned long long nr_context_switches(void)
3193 { 3192 {
3194 int i; 3193 int i;
3195 unsigned long long sum = 0; 3194 unsigned long long sum = 0;
3196 3195
3197 for_each_possible_cpu(i) 3196 for_each_possible_cpu(i)
3198 sum += cpu_rq(i)->nr_switches; 3197 sum += cpu_rq(i)->nr_switches;
3199 3198
3200 return sum; 3199 return sum;
3201 } 3200 }
3202 3201
3203 unsigned long nr_iowait(void) 3202 unsigned long nr_iowait(void)
3204 { 3203 {
3205 unsigned long i, sum = 0; 3204 unsigned long i, sum = 0;
3206 3205
3207 for_each_possible_cpu(i) 3206 for_each_possible_cpu(i)
3208 sum += atomic_read(&cpu_rq(i)->nr_iowait); 3207 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3209 3208
3210 return sum; 3209 return sum;
3211 } 3210 }
3212 3211
3213 unsigned long nr_iowait_cpu(int cpu) 3212 unsigned long nr_iowait_cpu(int cpu)
3214 { 3213 {
3215 struct rq *this = cpu_rq(cpu); 3214 struct rq *this = cpu_rq(cpu);
3216 return atomic_read(&this->nr_iowait); 3215 return atomic_read(&this->nr_iowait);
3217 } 3216 }
3218 3217
3219 unsigned long this_cpu_load(void) 3218 unsigned long this_cpu_load(void)
3220 { 3219 {
3221 struct rq *this = this_rq(); 3220 struct rq *this = this_rq();
3222 return this->cpu_load[0]; 3221 return this->cpu_load[0];
3223 } 3222 }
3224 3223
3225 3224
3226 /* Variables and functions for calc_load */ 3225 /* Variables and functions for calc_load */
3227 static atomic_long_t calc_load_tasks; 3226 static atomic_long_t calc_load_tasks;
3228 static unsigned long calc_load_update; 3227 static unsigned long calc_load_update;
3229 unsigned long avenrun[3]; 3228 unsigned long avenrun[3];
3230 EXPORT_SYMBOL(avenrun); 3229 EXPORT_SYMBOL(avenrun);
3231 3230
3232 static long calc_load_fold_active(struct rq *this_rq) 3231 static long calc_load_fold_active(struct rq *this_rq)
3233 { 3232 {
3234 long nr_active, delta = 0; 3233 long nr_active, delta = 0;
3235 3234
3236 nr_active = this_rq->nr_running; 3235 nr_active = this_rq->nr_running;
3237 nr_active += (long) this_rq->nr_uninterruptible; 3236 nr_active += (long) this_rq->nr_uninterruptible;
3238 3237
3239 if (nr_active != this_rq->calc_load_active) { 3238 if (nr_active != this_rq->calc_load_active) {
3240 delta = nr_active - this_rq->calc_load_active; 3239 delta = nr_active - this_rq->calc_load_active;
3241 this_rq->calc_load_active = nr_active; 3240 this_rq->calc_load_active = nr_active;
3242 } 3241 }
3243 3242
3244 return delta; 3243 return delta;
3245 } 3244 }
3246 3245
3247 static unsigned long 3246 static unsigned long
3248 calc_load(unsigned long load, unsigned long exp, unsigned long active) 3247 calc_load(unsigned long load, unsigned long exp, unsigned long active)
3249 { 3248 {
3250 load *= exp; 3249 load *= exp;
3251 load += active * (FIXED_1 - exp); 3250 load += active * (FIXED_1 - exp);
3252 load += 1UL << (FSHIFT - 1); 3251 load += 1UL << (FSHIFT - 1);
3253 return load >> FSHIFT; 3252 return load >> FSHIFT;
3254 } 3253 }
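As a quick sanity check on the fixed-point math (assuming FSHIFT == 11, FIXED_1 == 2048 and EXP_1 == 1884, the values this era's include/linux/sched.h defines), one LOAD_FREQ sample with a single runnable task moves an initially zero 1-minute average like this:

	/* avenrun[0] = 0, active = 1 * FIXED_1 = 2048 */
	load  = 0 * 1884;              /* 0 */
	load += 2048 * (2048 - 1884);  /* 335872 */
	load += 1UL << 10;             /* rounding: 336896 */
	load >>= 11;                   /* 164, i.e. 164/2048 ~ 0.08 */

so the reported 1-minute load climbs to roughly 0.08 after the first five-second interval.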
3255 3254
3256 #ifdef CONFIG_NO_HZ 3255 #ifdef CONFIG_NO_HZ
3257 /* 3256 /*
3258 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3257 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3259 * 3258 *
3260 * When making the ILB scale, we should try to pull this in as well. 3259 * When making the ILB scale, we should try to pull this in as well.
3261 */ 3260 */
3262 static atomic_long_t calc_load_tasks_idle; 3261 static atomic_long_t calc_load_tasks_idle;
3263 3262
3264 static void calc_load_account_idle(struct rq *this_rq) 3263 static void calc_load_account_idle(struct rq *this_rq)
3265 { 3264 {
3266 long delta; 3265 long delta;
3267 3266
3268 delta = calc_load_fold_active(this_rq); 3267 delta = calc_load_fold_active(this_rq);
3269 if (delta) 3268 if (delta)
3270 atomic_long_add(delta, &calc_load_tasks_idle); 3269 atomic_long_add(delta, &calc_load_tasks_idle);
3271 } 3270 }
3272 3271
3273 static long calc_load_fold_idle(void) 3272 static long calc_load_fold_idle(void)
3274 { 3273 {
3275 long delta = 0; 3274 long delta = 0;
3276 3275
3277 /* 3276 /*
3278 * It's got a race, but we don't care... 3277 * It's got a race, but we don't care...
3279 */ 3278 */
3280 if (atomic_long_read(&calc_load_tasks_idle)) 3279 if (atomic_long_read(&calc_load_tasks_idle))
3281 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 3280 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3282 3281
3283 return delta; 3282 return delta;
3284 } 3283 }
3285 3284
3286 /** 3285 /**
3287 * fixed_power_int - compute: x^n, in O(log n) time 3286 * fixed_power_int - compute: x^n, in O(log n) time
3288 * 3287 *
3289 * @x: base of the power 3288 * @x: base of the power
3290 * @frac_bits: fractional bits of @x 3289 * @frac_bits: fractional bits of @x
3291 * @n: power to raise @x to. 3290 * @n: power to raise @x to.
3292 * 3291 *
3293 * By exploiting the relation between the definition of the natural power 3292 * By exploiting the relation between the definition of the natural power
3294 * function: x^n := x*x*...*x (x multiplied by itself n times), and 3293 * function: x^n := x*x*...*x (x multiplied by itself n times), and
3295 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 3294 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3296 * (where: n_i \elem {0, 1}, the binary vector representing n), 3295 * (where: n_i \elem {0, 1}, the binary vector representing n),
3297 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 3296 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3298 * of course trivially computable in O(log_2 n), the length of our binary 3297 * of course trivially computable in O(log_2 n), the length of our binary
3299 * vector. 3298 * vector.
3300 */ 3299 */
3301 static unsigned long 3300 static unsigned long
3302 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 3301 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3303 { 3302 {
3304 unsigned long result = 1UL << frac_bits; 3303 unsigned long result = 1UL << frac_bits;
3305 3304
3306 if (n) for (;;) { 3305 if (n) for (;;) {
3307 if (n & 1) { 3306 if (n & 1) {
3308 result *= x; 3307 result *= x;
3309 result += 1UL << (frac_bits - 1); 3308 result += 1UL << (frac_bits - 1);
3310 result >>= frac_bits; 3309 result >>= frac_bits;
3311 } 3310 }
3312 n >>= 1; 3311 n >>= 1;
3313 if (!n) 3312 if (!n)
3314 break; 3313 break;
3315 x *= x; 3314 x *= x;
3316 x += 1UL << (frac_bits - 1); 3315 x += 1UL << (frac_bits - 1);
3317 x >>= frac_bits; 3316 x >>= frac_bits;
3318 } 3317 }
3319 3318
3320 return result; 3319 return result;
3321 } 3320 }
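To see the square-and-multiply loop at work, take x = 1024 (0.5 with 11 fractional bits, as used elsewhere in this file) raised to n = 3 (binary 11):

	/* result starts at 1UL << 11 = 2048 (1.0)             */
	/* bit 0 set: result = (2048*1024 + 1024) >> 11 = 1024 */
	/* square:    x      = (1024*1024 + 1024) >> 11 =  512 */
	/* bit 1 set: result = (1024*512  + 1024) >> 11 =  256 */

and 256/2048 == 0.125 == 0.5^3, reached with O(log n) multiplies rather than n - 1 of them.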
3322 3321
3323 /* 3322 /*
3324 * a1 = a0 * e + a * (1 - e) 3323 * a1 = a0 * e + a * (1 - e)
3325 * 3324 *
3326 * a2 = a1 * e + a * (1 - e) 3325 * a2 = a1 * e + a * (1 - e)
3327 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 3326 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3328 * = a0 * e^2 + a * (1 - e) * (1 + e) 3327 * = a0 * e^2 + a * (1 - e) * (1 + e)
3329 * 3328 *
3330 * a3 = a2 * e + a * (1 - e) 3329 * a3 = a2 * e + a * (1 - e)
3331 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 3330 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3332 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 3331 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3333 * 3332 *
3334 * ... 3333 * ...
3335 * 3334 *
3336 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 3335 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3337 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 3336 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3338 * = a0 * e^n + a * (1 - e^n) 3337 * = a0 * e^n + a * (1 - e^n)
3339 * 3338 *
3340 * [1] application of the geometric series: 3339 * [1] application of the geometric series:
3341 * 3340 *
3342 * n 1 - x^(n+1) 3341 * n 1 - x^(n+1)
3343 * S_n := \Sum x^i = ------------- 3342 * S_n := \Sum x^i = -------------
3344 * i=0 1 - x 3343 * i=0 1 - x
3345 */ 3344 */
3346 static unsigned long 3345 static unsigned long
3347 calc_load_n(unsigned long load, unsigned long exp, 3346 calc_load_n(unsigned long load, unsigned long exp,
3348 unsigned long active, unsigned int n) 3347 unsigned long active, unsigned int n)
3349 { 3348 {
3350 3349
3351 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 3350 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3352 } 3351 }
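Continuing the single-task example from calc_load() above: folding two missed periods in one go matches iterating calc_load() twice (up to rounding), because only the decay factor is raised to the power n:

	/* iterate twice, active held at 2048:                      */
	/*   a1 = (0*1884   + 2048*164 + 1024) >> 11 = 164          */
	/*   a2 = (164*1884 + 2048*164 + 1024) >> 11 = 315          */
	/* fold at once: fixed_power_int(1884, 11, 2) = 1733, so    */
	/*   calc_load_n(0, EXP_1, 2048, 2)                         */
	/*       = (0*1733 + 2048*(2048 - 1733) + 1024) >> 11 = 315 */

which is exactly the a_n = a_0 * e^n + a * (1 - e^n) identity derived in the comment.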
3353 3352
3354 /* 3353 /*
3355 * NO_HZ can leave us missing all per-cpu ticks calling 3354 * NO_HZ can leave us missing all per-cpu ticks calling
3356 * calc_load_account_active(), but since an idle CPU folds its delta into 3355 * calc_load_account_active(), but since an idle CPU folds its delta into
3357 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 3356 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3358 * in the pending idle delta if our idle period crossed a load cycle boundary. 3357 * in the pending idle delta if our idle period crossed a load cycle boundary.
3359 * 3358 *
3360 * Once we've updated the global active value, we need to apply the exponential 3359 * Once we've updated the global active value, we need to apply the exponential
3361 * weights adjusted to the number of cycles missed. 3360 * weights adjusted to the number of cycles missed.
3362 */ 3361 */
3363 static void calc_global_nohz(unsigned long ticks) 3362 static void calc_global_nohz(unsigned long ticks)
3364 { 3363 {
3365 long delta, active, n; 3364 long delta, active, n;
3366 3365
3367 if (time_before(jiffies, calc_load_update)) 3366 if (time_before(jiffies, calc_load_update))
3368 return; 3367 return;
3369 3368
3370 /* 3369 /*
3371 * If we crossed a calc_load_update boundary, make sure to fold 3370 * If we crossed a calc_load_update boundary, make sure to fold
3372 * any pending idle changes; the respective CPUs might have 3371 * any pending idle changes; the respective CPUs might have
3373 * missed the tick-driven calc_load_account_active() update 3372 * missed the tick-driven calc_load_account_active() update
3374 * due to NO_HZ. 3373 * due to NO_HZ.
3375 */ 3374 */
3376 delta = calc_load_fold_idle(); 3375 delta = calc_load_fold_idle();
3377 if (delta) 3376 if (delta)
3378 atomic_long_add(delta, &calc_load_tasks); 3377 atomic_long_add(delta, &calc_load_tasks);
3379 3378
3380 /* 3379 /*
3381 * If we were idle for multiple load cycles, apply them. 3380 * If we were idle for multiple load cycles, apply them.
3382 */ 3381 */
3383 if (ticks >= LOAD_FREQ) { 3382 if (ticks >= LOAD_FREQ) {
3384 n = ticks / LOAD_FREQ; 3383 n = ticks / LOAD_FREQ;
3385 3384
3386 active = atomic_long_read(&calc_load_tasks); 3385 active = atomic_long_read(&calc_load_tasks);
3387 active = active > 0 ? active * FIXED_1 : 0; 3386 active = active > 0 ? active * FIXED_1 : 0;
3388 3387
3389 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 3388 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3390 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 3389 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3391 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 3390 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3392 3391
3393 calc_load_update += n * LOAD_FREQ; 3392 calc_load_update += n * LOAD_FREQ;
3394 } 3393 }
3395 3394
3396 /* 3395 /*
3397 * It's possible the remainder of the above division also crosses 3396 * It's possible the remainder of the above division also crosses
3398 * a LOAD_FREQ period; the regular check in calc_global_load(), 3397 * a LOAD_FREQ period; the regular check in calc_global_load(),
3399 * which comes after this, will take care of that. 3398 * which comes after this, will take care of that.
3400 * 3399 *
3401 * Consider us being 11 ticks before a cycle completion, and us 3400 * Consider us being 11 ticks before a cycle completion, and us
3402 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will 3401 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3403 * age us 4 cycles, and the test in calc_global_load() will 3402 * age us 4 cycles, and the test in calc_global_load() will
3404 * pick up the final one. 3403 * pick up the final one.
3405 */ 3404 */
3406 } 3405 }
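Putting numbers on that last example, as the comment frames it (and assuming LOAD_FREQ is still defined as 5*HZ+1, call it L): waking 4*L + 22 ticks later when the next boundary was only 11 ticks away means five boundaries were actually crossed, but

	/* n = (4*L + 22) / L = 4          (integer division)       */
	/* avenrun[] aged by 4 cycles, calc_load_update += 4*L      */
	/* jiffies now sits 11 ticks past calc_load_update, beyond  */
	/* the 10-tick grace in calc_global_load(), so the fifth    */
	/* cycle is folded immediately by the regular path.         */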
3407 #else 3406 #else
3408 static void calc_load_account_idle(struct rq *this_rq) 3407 static void calc_load_account_idle(struct rq *this_rq)
3409 { 3408 {
3410 } 3409 }
3411 3410
3412 static inline long calc_load_fold_idle(void) 3411 static inline long calc_load_fold_idle(void)
3413 { 3412 {
3414 return 0; 3413 return 0;
3415 } 3414 }
3416 3415
3417 static void calc_global_nohz(unsigned long ticks) 3416 static void calc_global_nohz(unsigned long ticks)
3418 { 3417 {
3419 } 3418 }
3420 #endif 3419 #endif
3421 3420
3422 /** 3421 /**
3423 * get_avenrun - get the load average array 3422 * get_avenrun - get the load average array
3424 * @loads: pointer to dest load array 3423 * @loads: pointer to dest load array
3425 * @offset: offset to add 3424 * @offset: offset to add
3426 * @shift: shift count to shift the result left 3425 * @shift: shift count to shift the result left
3427 * 3426 *
3428 * These values are estimates at best, so no need for locking. 3427 * These values are estimates at best, so no need for locking.
3429 */ 3428 */
3430 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3429 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3431 { 3430 {
3432 loads[0] = (avenrun[0] + offset) << shift; 3431 loads[0] = (avenrun[0] + offset) << shift;
3433 loads[1] = (avenrun[1] + offset) << shift; 3432 loads[1] = (avenrun[1] + offset) << shift;
3434 loads[2] = (avenrun[2] + offset) << shift; 3433 loads[2] = (avenrun[2] + offset) << shift;
3435 } 3434 }
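The usual consumer of this helper, fs/proc/loadavg.c, passes (assuming this era's helpers) offset = FIXED_1/200 and shift = 0, then formats each entry with the LOAD_INT()/LOAD_FRAC() macros from include/linux/sched.h:

	/* LOAD_INT(x)  == (x) >> FSHIFT                             */
	/* LOAD_FRAC(x) == LOAD_INT(((x) & (FIXED_1 - 1)) * 100)     */
	/* e.g. avenrun[0] == 164:  164 + 2048/200 == 174            */
	/*      174 >> 11 == 0,  (174 * 100) >> 11 == 8              */
	/*      so /proc/loadavg prints "0.08"                       */

the FIXED_1/200 offset being a half-of-one-hundredth nudge so that truncation behaves like rounding.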
3436 3435
3437 /* 3436 /*
3438 * calc_load - update the avenrun load estimates 10 ticks after the 3437 * calc_load - update the avenrun load estimates 10 ticks after the
3439 * CPUs have updated calc_load_tasks. 3438 * CPUs have updated calc_load_tasks.
3440 */ 3439 */
3441 void calc_global_load(unsigned long ticks) 3440 void calc_global_load(unsigned long ticks)
3442 { 3441 {
3443 long active; 3442 long active;
3444 3443
3445 calc_global_nohz(ticks); 3444 calc_global_nohz(ticks);
3446 3445
3447 if (time_before(jiffies, calc_load_update + 10)) 3446 if (time_before(jiffies, calc_load_update + 10))
3448 return; 3447 return;
3449 3448
3450 active = atomic_long_read(&calc_load_tasks); 3449 active = atomic_long_read(&calc_load_tasks);
3451 active = active > 0 ? active * FIXED_1 : 0; 3450 active = active > 0 ? active * FIXED_1 : 0;
3452 3451
3453 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3452 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3454 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3453 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3455 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3454 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3456 3455
3457 calc_load_update += LOAD_FREQ; 3456 calc_load_update += LOAD_FREQ;
3458 } 3457 }
3459 3458
3460 /* 3459 /*
3461 * Called from update_cpu_load() to periodically update this CPU's 3460 * Called from update_cpu_load() to periodically update this CPU's
3462 * active count. 3461 * active count.
3463 */ 3462 */
3464 static void calc_load_account_active(struct rq *this_rq) 3463 static void calc_load_account_active(struct rq *this_rq)
3465 { 3464 {
3466 long delta; 3465 long delta;
3467 3466
3468 if (time_before(jiffies, this_rq->calc_load_update)) 3467 if (time_before(jiffies, this_rq->calc_load_update))
3469 return; 3468 return;
3470 3469
3471 delta = calc_load_fold_active(this_rq); 3470 delta = calc_load_fold_active(this_rq);
3472 delta += calc_load_fold_idle(); 3471 delta += calc_load_fold_idle();
3473 if (delta) 3472 if (delta)
3474 atomic_long_add(delta, &calc_load_tasks); 3473 atomic_long_add(delta, &calc_load_tasks);
3475 3474
3476 this_rq->calc_load_update += LOAD_FREQ; 3475 this_rq->calc_load_update += LOAD_FREQ;
3477 } 3476 }
3478 3477
3479 /* 3478 /*
3480 * The exact cpuload at various idx values, calculated at every tick would be 3479 * The exact cpuload at various idx values, calculated at every tick would be
3481 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 3480 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3482 * 3481 *
3483 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets called 3482 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets called
3484 * on the nth tick, when the cpu may be busy, then we have: 3483 * on the nth tick, when the cpu may be busy, then we have:
3485 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3484 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3486 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 3485 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3487 * 3486 *
3488 * decay_load_missed() below does efficient calculation of 3487 * decay_load_missed() below does efficient calculation of
3489 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3488 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3490 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 3489 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3491 * 3490 *
3492 * The calculation is approximated on a 128 point scale. 3491 * The calculation is approximated on a 128 point scale.
3493 * degrade_zero_ticks is the number of ticks after which load at any 3492 * degrade_zero_ticks is the number of ticks after which load at any
3494 * particular idx is approximated to be zero. 3493 * particular idx is approximated to be zero.
3495 * degrade_factor is a precomputed table, a row for each load idx. 3494 * degrade_factor is a precomputed table, a row for each load idx.
3496 * Each column corresponds to degradation factor for a power of two ticks, 3495 * Each column corresponds to degradation factor for a power of two ticks,
3497 * based on 128 point scale. 3496 * based on 128 point scale.
3498 * Example: 3497 * Example:
3499 * row 2, col 3 (=12) says that the degradation at load idx 2 after 3498 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3500 * 8 ticks is 12/128 (which is an approximation of the exact factor 3^8/4^8). 3499 * 8 ticks is 12/128 (which is an approximation of the exact factor 3^8/4^8).
3501 * 3500 *
3502 * With this power of 2 load factors, we can degrade the load n times 3501 * With this power of 2 load factors, we can degrade the load n times
3503 * by looking at 1 bits in n and doing as many mult/shift instead of 3502 * by looking at 1 bits in n and doing as many mult/shift instead of
3504 * n mult/shifts needed by the exact degradation. 3503 * n mult/shifts needed by the exact degradation.
3505 */ 3504 */
3506 #define DEGRADE_SHIFT 7 3505 #define DEGRADE_SHIFT 7
3507 static const unsigned char 3506 static const unsigned char
3508 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 3507 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3509 static const unsigned char 3508 static const unsigned char
3510 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 3509 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3511 {0, 0, 0, 0, 0, 0, 0, 0}, 3510 {0, 0, 0, 0, 0, 0, 0, 0},
3512 {64, 32, 8, 0, 0, 0, 0, 0}, 3511 {64, 32, 8, 0, 0, 0, 0, 0},
3513 {96, 72, 40, 12, 1, 0, 0}, 3512 {96, 72, 40, 12, 1, 0, 0},
3514 {112, 98, 75, 43, 15, 1, 0}, 3513 {112, 98, 75, 43, 15, 1, 0},
3515 {120, 112, 98, 76, 45, 16, 2} }; 3514 {120, 112, 98, 76, 45, 16, 2} };
3516 3515
3517 /* 3516 /*
3518 * Update cpu_load for any missed ticks, due to tickless idle. The backlog 3517 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3519 * occurs when the CPU is idle, so we just decay the old load without 3518 * occurs when the CPU is idle, so we just decay the old load without
3520 * adding any new load. 3519 * adding any new load.
3521 */ 3520 */
3522 static unsigned long 3521 static unsigned long
3523 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 3522 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3524 { 3523 {
3525 int j = 0; 3524 int j = 0;
3526 3525
3527 if (!missed_updates) 3526 if (!missed_updates)
3528 return load; 3527 return load;
3529 3528
3530 if (missed_updates >= degrade_zero_ticks[idx]) 3529 if (missed_updates >= degrade_zero_ticks[idx])
3531 return 0; 3530 return 0;
3532 3531
3533 if (idx == 1) 3532 if (idx == 1)
3534 return load >> missed_updates; 3533 return load >> missed_updates;
3535 3534
3536 while (missed_updates) { 3535 while (missed_updates) {
3537 if (missed_updates % 2) 3536 if (missed_updates % 2)
3538 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 3537 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3539 3538
3540 missed_updates >>= 1; 3539 missed_updates >>= 1;
3541 j++; 3540 j++;
3542 } 3541 }
3543 return load; 3542 return load;
3544 } 3543 }
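Tying this back to the example in the comment above the table: a CPU at load index 2 that was idle for 8 ticks enters the loop with missed_updates == 8 (binary 1000), so only the 2^3 column fires:

	/* 8 < degrade_zero_ticks[2] == 32, and idx != 1              */
	/* j = 0, 1, 2: low bits clear, nothing happens               */
	/* j = 3: load = (load * degrade_factor[2][3]) >> 7           */
	/*             = load * 12/128, the advertised (3/4)^8 approx */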
3545 3544
3546 /* 3545 /*
3547 * Update rq->cpu_load[] statistics. This function is usually called every 3546 * Update rq->cpu_load[] statistics. This function is usually called every
3548 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 3547 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3549 * every tick. We fix it up based on jiffies. 3548 * every tick. We fix it up based on jiffies.
3550 */ 3549 */
3551 static void update_cpu_load(struct rq *this_rq) 3550 static void update_cpu_load(struct rq *this_rq)
3552 { 3551 {
3553 unsigned long this_load = this_rq->load.weight; 3552 unsigned long this_load = this_rq->load.weight;
3554 unsigned long curr_jiffies = jiffies; 3553 unsigned long curr_jiffies = jiffies;
3555 unsigned long pending_updates; 3554 unsigned long pending_updates;
3556 int i, scale; 3555 int i, scale;
3557 3556
3558 this_rq->nr_load_updates++; 3557 this_rq->nr_load_updates++;
3559 3558
3560 /* Avoid repeated calls on same jiffy, when moving in and out of idle */ 3559 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3561 if (curr_jiffies == this_rq->last_load_update_tick) 3560 if (curr_jiffies == this_rq->last_load_update_tick)
3562 return; 3561 return;
3563 3562
3564 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 3563 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3565 this_rq->last_load_update_tick = curr_jiffies; 3564 this_rq->last_load_update_tick = curr_jiffies;
3566 3565
3567 /* Update our load: */ 3566 /* Update our load: */
3568 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 3567 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3569 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3568 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3570 unsigned long old_load, new_load; 3569 unsigned long old_load, new_load;
3571 3570
3572 /* scale is effectively 1 << i now, and >> i divides by scale */ 3571 /* scale is effectively 1 << i now, and >> i divides by scale */
3573 3572
3574 old_load = this_rq->cpu_load[i]; 3573 old_load = this_rq->cpu_load[i];
3575 old_load = decay_load_missed(old_load, pending_updates - 1, i); 3574 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3576 new_load = this_load; 3575 new_load = this_load;
3577 /* 3576 /*
3578 * Round up the averaging division if load is increasing. This 3577 * Round up the averaging division if load is increasing. This
3579 * prevents us from getting stuck on 9 if the load is 10, for 3578 * prevents us from getting stuck on 9 if the load is 10, for
3580 * example. 3579 * example.
3581 */ 3580 */
3582 if (new_load > old_load) 3581 if (new_load > old_load)
3583 new_load += scale - 1; 3582 new_load += scale - 1;
3584 3583
3585 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 3584 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3586 } 3585 }
3587 3586
3588 sched_avg_update(this_rq); 3587 sched_avg_update(this_rq);
3589 } 3588 }
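The round-up in the loop matters precisely in the "stuck on 9" situation the comment mentions; with i == 1 (so scale == 2):

	/* old_load = 9, this_load = 10 (increasing)                 */
	/* without the bump: (9*1 + 10) >> 1 = 9, forever            */
	/* with new_load += scale - 1:  (9*1 + 11) >> 1 = 10         */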
3590 3589
3591 static void update_cpu_load_active(struct rq *this_rq) 3590 static void update_cpu_load_active(struct rq *this_rq)
3592 { 3591 {
3593 update_cpu_load(this_rq); 3592 update_cpu_load(this_rq);
3594 3593
3595 calc_load_account_active(this_rq); 3594 calc_load_account_active(this_rq);
3596 } 3595 }
3597 3596
3598 #ifdef CONFIG_SMP 3597 #ifdef CONFIG_SMP
3599 3598
3600 /* 3599 /*
3601 * sched_exec - execve() is a valuable balancing opportunity, because at 3600 * sched_exec - execve() is a valuable balancing opportunity, because at
3602 * this point the task has the smallest effective memory and cache footprint. 3601 * this point the task has the smallest effective memory and cache footprint.
3603 */ 3602 */
3604 void sched_exec(void) 3603 void sched_exec(void)
3605 { 3604 {
3606 struct task_struct *p = current; 3605 struct task_struct *p = current;
3607 unsigned long flags; 3606 unsigned long flags;
3608 int dest_cpu; 3607 int dest_cpu;
3609 3608
3610 raw_spin_lock_irqsave(&p->pi_lock, flags); 3609 raw_spin_lock_irqsave(&p->pi_lock, flags);
3611 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 3610 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3612 if (dest_cpu == smp_processor_id()) 3611 if (dest_cpu == smp_processor_id())
3613 goto unlock; 3612 goto unlock;
3614 3613
3615 if (likely(cpu_active(dest_cpu))) { 3614 if (likely(cpu_active(dest_cpu))) {
3616 struct migration_arg arg = { p, dest_cpu }; 3615 struct migration_arg arg = { p, dest_cpu };
3617 3616
3618 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3617 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3619 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 3618 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3620 return; 3619 return;
3621 } 3620 }
3622 unlock: 3621 unlock:
3623 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3622 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3624 } 3623 }
3625 3624
3626 #endif 3625 #endif
3627 3626
3628 DEFINE_PER_CPU(struct kernel_stat, kstat); 3627 DEFINE_PER_CPU(struct kernel_stat, kstat);
3629 3628
3630 EXPORT_PER_CPU_SYMBOL(kstat); 3629 EXPORT_PER_CPU_SYMBOL(kstat);
3631 3630
3632 /* 3631 /*
3633 * Return any ns on the sched_clock that have not yet been accounted in 3632 * Return any ns on the sched_clock that have not yet been accounted in
3634 * @p in case that task is currently running. 3633 * @p in case that task is currently running.
3635 * 3634 *
3636 * Called with task_rq_lock() held on @rq. 3635 * Called with task_rq_lock() held on @rq.
3637 */ 3636 */
3638 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3637 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3639 { 3638 {
3640 u64 ns = 0; 3639 u64 ns = 0;
3641 3640
3642 if (task_current(rq, p)) { 3641 if (task_current(rq, p)) {
3643 update_rq_clock(rq); 3642 update_rq_clock(rq);
3644 ns = rq->clock_task - p->se.exec_start; 3643 ns = rq->clock_task - p->se.exec_start;
3645 if ((s64)ns < 0) 3644 if ((s64)ns < 0)
3646 ns = 0; 3645 ns = 0;
3647 } 3646 }
3648 3647
3649 return ns; 3648 return ns;
3650 } 3649 }
3651 3650
3652 unsigned long long task_delta_exec(struct task_struct *p) 3651 unsigned long long task_delta_exec(struct task_struct *p)
3653 { 3652 {
3654 unsigned long flags; 3653 unsigned long flags;
3655 struct rq *rq; 3654 struct rq *rq;
3656 u64 ns = 0; 3655 u64 ns = 0;
3657 3656
3658 rq = task_rq_lock(p, &flags); 3657 rq = task_rq_lock(p, &flags);
3659 ns = do_task_delta_exec(p, rq); 3658 ns = do_task_delta_exec(p, rq);
3660 task_rq_unlock(rq, p, &flags); 3659 task_rq_unlock(rq, p, &flags);
3661 3660
3662 return ns; 3661 return ns;
3663 } 3662 }
3664 3663
3665 /* 3664 /*
3666 * Return accounted runtime for the task. 3665 * Return accounted runtime for the task.
3667 * In case the task is currently running, return the runtime plus current's 3666 * In case the task is currently running, return the runtime plus current's
3667 * pending runtime that has not been accounted yet. 3666 * pending runtime that has not been accounted yet.
3669 */ 3668 */
3670 unsigned long long task_sched_runtime(struct task_struct *p) 3669 unsigned long long task_sched_runtime(struct task_struct *p)
3671 { 3670 {
3672 unsigned long flags; 3671 unsigned long flags;
3673 struct rq *rq; 3672 struct rq *rq;
3674 u64 ns = 0; 3673 u64 ns = 0;
3675 3674
3676 rq = task_rq_lock(p, &flags); 3675 rq = task_rq_lock(p, &flags);
3677 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3676 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3678 task_rq_unlock(rq, p, &flags); 3677 task_rq_unlock(rq, p, &flags);
3679 3678
3680 return ns; 3679 return ns;
3681 } 3680 }
3682 3681
3683 /* 3682 /*
3684 * Return sum_exec_runtime for the thread group. 3683 * Return sum_exec_runtime for the thread group.
3685 * In case the task is currently running, return the sum plus current's 3684 * In case the task is currently running, return the sum plus current's
3685 * pending runtime that has not been accounted yet. 3684 * pending runtime that has not been accounted yet.
3687 * 3686 *
3688 * Note that the thread group might have other running tasks as well, 3687 * Note that the thread group might have other running tasks as well,
3689 * so the return value does not include other pending runtime that other 3688 * so the return value does not include other pending runtime that other
3690 * running tasks might have. 3689 * running tasks might have.
3691 */ 3690 */
3692 unsigned long long thread_group_sched_runtime(struct task_struct *p) 3691 unsigned long long thread_group_sched_runtime(struct task_struct *p)
3693 { 3692 {
3694 struct task_cputime totals; 3693 struct task_cputime totals;
3695 unsigned long flags; 3694 unsigned long flags;
3696 struct rq *rq; 3695 struct rq *rq;
3697 u64 ns; 3696 u64 ns;
3698 3697
3699 rq = task_rq_lock(p, &flags); 3698 rq = task_rq_lock(p, &flags);
3700 thread_group_cputime(p, &totals); 3699 thread_group_cputime(p, &totals);
3701 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3700 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3702 task_rq_unlock(rq, p, &flags); 3701 task_rq_unlock(rq, p, &flags);
3703 3702
3704 return ns; 3703 return ns;
3705 } 3704 }
3706 3705
3707 /* 3706 /*
3708 * Account user cpu time to a process. 3707 * Account user cpu time to a process.
3709 * @p: the process that the cpu time gets accounted to 3708 * @p: the process that the cpu time gets accounted to
3710 * @cputime: the cpu time spent in user space since the last update 3709 * @cputime: the cpu time spent in user space since the last update
3711 * @cputime_scaled: cputime scaled by cpu frequency 3710 * @cputime_scaled: cputime scaled by cpu frequency
3712 */ 3711 */
3713 void account_user_time(struct task_struct *p, cputime_t cputime, 3712 void account_user_time(struct task_struct *p, cputime_t cputime,
3714 cputime_t cputime_scaled) 3713 cputime_t cputime_scaled)
3715 { 3714 {
3716 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3715 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3717 cputime64_t tmp; 3716 cputime64_t tmp;
3718 3717
3719 /* Add user time to process. */ 3718 /* Add user time to process. */
3720 p->utime = cputime_add(p->utime, cputime); 3719 p->utime = cputime_add(p->utime, cputime);
3721 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3720 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3722 account_group_user_time(p, cputime); 3721 account_group_user_time(p, cputime);
3723 3722
3724 /* Add user time to cpustat. */ 3723 /* Add user time to cpustat. */
3725 tmp = cputime_to_cputime64(cputime); 3724 tmp = cputime_to_cputime64(cputime);
3726 if (TASK_NICE(p) > 0) 3725 if (TASK_NICE(p) > 0)
3727 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3726 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3728 else 3727 else
3729 cpustat->user = cputime64_add(cpustat->user, tmp); 3728 cpustat->user = cputime64_add(cpustat->user, tmp);
3730 3729
3731 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3730 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3732 /* Account for user time used */ 3731 /* Account for user time used */
3733 acct_update_integrals(p); 3732 acct_update_integrals(p);
3734 } 3733 }
3735 3734
3736 /* 3735 /*
3737 * Account guest cpu time to a process. 3736 * Account guest cpu time to a process.
3738 * @p: the process that the cpu time gets accounted to 3737 * @p: the process that the cpu time gets accounted to
3739 * @cputime: the cpu time spent in virtual machine since the last update 3738 * @cputime: the cpu time spent in virtual machine since the last update
3740 * @cputime_scaled: cputime scaled by cpu frequency 3739 * @cputime_scaled: cputime scaled by cpu frequency
3741 */ 3740 */
3742 static void account_guest_time(struct task_struct *p, cputime_t cputime, 3741 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3743 cputime_t cputime_scaled) 3742 cputime_t cputime_scaled)
3744 { 3743 {
3745 cputime64_t tmp; 3744 cputime64_t tmp;
3746 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3745 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3747 3746
3748 tmp = cputime_to_cputime64(cputime); 3747 tmp = cputime_to_cputime64(cputime);
3749 3748
3750 /* Add guest time to process. */ 3749 /* Add guest time to process. */
3751 p->utime = cputime_add(p->utime, cputime); 3750 p->utime = cputime_add(p->utime, cputime);
3752 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3751 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3753 account_group_user_time(p, cputime); 3752 account_group_user_time(p, cputime);
3754 p->gtime = cputime_add(p->gtime, cputime); 3753 p->gtime = cputime_add(p->gtime, cputime);
3755 3754
3756 /* Add guest time to cpustat. */ 3755 /* Add guest time to cpustat. */
3757 if (TASK_NICE(p) > 0) { 3756 if (TASK_NICE(p) > 0) {
3758 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3757 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3759 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3758 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3760 } else { 3759 } else {
3761 cpustat->user = cputime64_add(cpustat->user, tmp); 3760 cpustat->user = cputime64_add(cpustat->user, tmp);
3762 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3761 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3763 } 3762 }
3764 } 3763 }
3765 3764
3766 /* 3765 /*
3767 * Account system cpu time to a process and desired cpustat field 3766 * Account system cpu time to a process and desired cpustat field
3768 * @p: the process that the cpu time gets accounted to 3767 * @p: the process that the cpu time gets accounted to
3769 * @cputime: the cpu time spent in kernel space since the last update 3768 * @cputime: the cpu time spent in kernel space since the last update
3770 * @cputime_scaled: cputime scaled by cpu frequency 3769 * @cputime_scaled: cputime scaled by cpu frequency
3771 * @target_cputime64: pointer to cpustat field that has to be updated 3770 * @target_cputime64: pointer to cpustat field that has to be updated
3772 */ 3771 */
3773 static inline 3772 static inline
3774 void __account_system_time(struct task_struct *p, cputime_t cputime, 3773 void __account_system_time(struct task_struct *p, cputime_t cputime,
3775 cputime_t cputime_scaled, cputime64_t *target_cputime64) 3774 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3776 { 3775 {
3777 cputime64_t tmp = cputime_to_cputime64(cputime); 3776 cputime64_t tmp = cputime_to_cputime64(cputime);
3778 3777
3779 /* Add system time to process. */ 3778 /* Add system time to process. */
3780 p->stime = cputime_add(p->stime, cputime); 3779 p->stime = cputime_add(p->stime, cputime);
3781 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3780 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3782 account_group_system_time(p, cputime); 3781 account_group_system_time(p, cputime);
3783 3782
3784 /* Add system time to cpustat. */ 3783 /* Add system time to cpustat. */
3785 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 3784 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3786 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3785 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3787 3786
3788 /* Account for system time used */ 3787 /* Account for system time used */
3789 acct_update_integrals(p); 3788 acct_update_integrals(p);
3790 } 3789 }
3791 3790
3792 /* 3791 /*
3793 * Account system cpu time to a process. 3792 * Account system cpu time to a process.
3794 * @p: the process that the cpu time gets accounted to 3793 * @p: the process that the cpu time gets accounted to
3795 * @hardirq_offset: the offset to subtract from hardirq_count() 3794 * @hardirq_offset: the offset to subtract from hardirq_count()
3796 * @cputime: the cpu time spent in kernel space since the last update 3795 * @cputime: the cpu time spent in kernel space since the last update
3797 * @cputime_scaled: cputime scaled by cpu frequency 3796 * @cputime_scaled: cputime scaled by cpu frequency
3798 */ 3797 */
3799 void account_system_time(struct task_struct *p, int hardirq_offset, 3798 void account_system_time(struct task_struct *p, int hardirq_offset,
3800 cputime_t cputime, cputime_t cputime_scaled) 3799 cputime_t cputime, cputime_t cputime_scaled)
3801 { 3800 {
3802 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3801 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3803 cputime64_t *target_cputime64; 3802 cputime64_t *target_cputime64;
3804 3803
3805 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3804 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3806 account_guest_time(p, cputime, cputime_scaled); 3805 account_guest_time(p, cputime, cputime_scaled);
3807 return; 3806 return;
3808 } 3807 }
3809 3808
3810 if (hardirq_count() - hardirq_offset) 3809 if (hardirq_count() - hardirq_offset)
3811 target_cputime64 = &cpustat->irq; 3810 target_cputime64 = &cpustat->irq;
3812 else if (in_serving_softirq()) 3811 else if (in_serving_softirq())
3813 target_cputime64 = &cpustat->softirq; 3812 target_cputime64 = &cpustat->softirq;
3814 else 3813 else
3815 target_cputime64 = &cpustat->system; 3814 target_cputime64 = &cpustat->system;
3816 3815
3817 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 3816 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3818 } 3817 }
3819 3818
3820 /* 3819 /*
3821 * Account for involuntary wait time. 3820 * Account for involuntary wait time.
3822 * @cputime: the cpu time spent in involuntary wait 3821 * @cputime: the cpu time spent in involuntary wait
3823 */ 3822 */
3824 void account_steal_time(cputime_t cputime) 3823 void account_steal_time(cputime_t cputime)
3825 { 3824 {
3826 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3825 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3827 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3826 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3828 3827
3829 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3828 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3830 } 3829 }
3831 3830
3832 /* 3831 /*
3833 * Account for idle time. 3832 * Account for idle time.
3834 * @cputime: the cpu time spent in idle wait 3833 * @cputime: the cpu time spent in idle wait
3835 */ 3834 */
3836 void account_idle_time(cputime_t cputime) 3835 void account_idle_time(cputime_t cputime)
3837 { 3836 {
3838 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3837 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3839 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3838 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3840 struct rq *rq = this_rq(); 3839 struct rq *rq = this_rq();
3841 3840
3842 if (atomic_read(&rq->nr_iowait) > 0) 3841 if (atomic_read(&rq->nr_iowait) > 0)
3843 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3842 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3844 else 3843 else
3845 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3844 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3846 } 3845 }
3847 3846
3848 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 3847 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3849 3848
3850 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 3849 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
3851 /* 3850 /*
3852 * Account a tick to a process and cpustat 3851 * Account a tick to a process and cpustat
3853 * @p: the process that the cpu time gets accounted to 3852 * @p: the process that the cpu time gets accounted to
3854 * @user_tick: is the tick from userspace 3853 * @user_tick: is the tick from userspace
3855 * @rq: the pointer to rq 3854 * @rq: the pointer to rq
3856 * 3855 *
3857 * Tick demultiplexing follows the order 3856 * Tick demultiplexing follows the order
3858 * - pending hardirq update 3857 * - pending hardirq update
3859 * - pending softirq update 3858 * - pending softirq update
3860 * - user_time 3859 * - user_time
3861 * - idle_time 3860 * - idle_time
3862 * - system time 3861 * - system time
3863 * - check for guest_time 3862 * - check for guest_time
3864 * - else account as system_time 3863 * - else account as system_time
3865 * 3864 *
3866 * The check for hardirq is done for both system and user time, as there is 3865 * The check for hardirq is done for both system and user time, as there is
3867 * no timer going off while we are in hardirq, and hence we may never get an 3866 * no timer going off while we are in hardirq, and hence we may never get an
3868 * opportunity to update it solely in system time. 3867 * opportunity to update it solely in system time.
3869 * p->stime and friends are only updated on system time and not on irq or 3868 * p->stime and friends are only updated on system time and not on irq or
3870 * softirq time, as those no longer count in task exec_runtime. 3869 * softirq time, as those no longer count in task exec_runtime.
3871 */ 3870 */
3872 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3871 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3873 struct rq *rq) 3872 struct rq *rq)
3874 { 3873 {
3875 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3874 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3876 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3875 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3877 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3876 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3878 3877
3879 if (irqtime_account_hi_update()) { 3878 if (irqtime_account_hi_update()) {
3880 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3879 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3881 } else if (irqtime_account_si_update()) { 3880 } else if (irqtime_account_si_update()) {
3882 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3881 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3883 } else if (this_cpu_ksoftirqd() == p) { 3882 } else if (this_cpu_ksoftirqd() == p) {
3884 /* 3883 /*
3884 * ksoftirqd time does not get accounted in cpu_softirq_time. 3883 * ksoftirqd time does not get accounted in cpu_softirq_time.
3886 * So, we have to handle it separately here. 3885 * So, we have to handle it separately here.
3887 * Also, p->stime needs to be updated for ksoftirqd. 3886 * Also, p->stime needs to be updated for ksoftirqd.
3888 */ 3887 */
3889 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3888 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3890 &cpustat->softirq); 3889 &cpustat->softirq);
3891 } else if (user_tick) { 3890 } else if (user_tick) {
3892 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3891 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3893 } else if (p == rq->idle) { 3892 } else if (p == rq->idle) {
3894 account_idle_time(cputime_one_jiffy); 3893 account_idle_time(cputime_one_jiffy);
3895 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 3894 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3896 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 3895 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3897 } else { 3896 } else {
3898 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3897 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3899 &cpustat->system); 3898 &cpustat->system);
3900 } 3899 }
3901 } 3900 }
3902 3901
3903 static void irqtime_account_idle_ticks(int ticks) 3902 static void irqtime_account_idle_ticks(int ticks)
3904 { 3903 {
3905 int i; 3904 int i;
3906 struct rq *rq = this_rq(); 3905 struct rq *rq = this_rq();
3907 3906
3908 for (i = 0; i < ticks; i++) 3907 for (i = 0; i < ticks; i++)
3909 irqtime_account_process_tick(current, 0, rq); 3908 irqtime_account_process_tick(current, 0, rq);
3910 } 3909 }
3911 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 3910 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
3912 static void irqtime_account_idle_ticks(int ticks) {} 3911 static void irqtime_account_idle_ticks(int ticks) {}
3913 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3912 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3914 struct rq *rq) {} 3913 struct rq *rq) {}
3915 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 3914 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3916 3915
3917 /* 3916 /*
3918 * Account a single tick of cpu time. 3917 * Account a single tick of cpu time.
3919 * @p: the process that the cpu time gets accounted to 3918 * @p: the process that the cpu time gets accounted to
3920 * @user_tick: indicates if the tick is a user or a system tick 3919 * @user_tick: indicates if the tick is a user or a system tick
3921 */ 3920 */
3922 void account_process_tick(struct task_struct *p, int user_tick) 3921 void account_process_tick(struct task_struct *p, int user_tick)
3923 { 3922 {
3924 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3923 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3925 struct rq *rq = this_rq(); 3924 struct rq *rq = this_rq();
3926 3925
3927 if (sched_clock_irqtime) { 3926 if (sched_clock_irqtime) {
3928 irqtime_account_process_tick(p, user_tick, rq); 3927 irqtime_account_process_tick(p, user_tick, rq);
3929 return; 3928 return;
3930 } 3929 }
3931 3930
3932 if (user_tick) 3931 if (user_tick)
3933 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3932 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3934 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3933 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3935 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3934 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3936 one_jiffy_scaled); 3935 one_jiffy_scaled);
3937 else 3936 else
3938 account_idle_time(cputime_one_jiffy); 3937 account_idle_time(cputime_one_jiffy);
3939 } 3938 }
3940 3939
3941 /* 3940 /*
3942 * Account multiple ticks of steal time. 3941 * Account multiple ticks of steal time.
3944 * @ticks: number of stolen ticks 3943 * @ticks: number of stolen ticks
3945 */ 3944 */
3946 void account_steal_ticks(unsigned long ticks) 3945 void account_steal_ticks(unsigned long ticks)
3947 { 3946 {
3948 account_steal_time(jiffies_to_cputime(ticks)); 3947 account_steal_time(jiffies_to_cputime(ticks));
3949 } 3948 }
3950 3949
3951 /* 3950 /*
3952 * Account multiple ticks of idle time. 3951 * Account multiple ticks of idle time.
3952 * @ticks: number of ticks spent idle 3951 * @ticks: number of ticks spent idle
3954 */ 3953 */
3955 void account_idle_ticks(unsigned long ticks) 3954 void account_idle_ticks(unsigned long ticks)
3956 { 3955 {
3957 3956
3958 if (sched_clock_irqtime) { 3957 if (sched_clock_irqtime) {
3959 irqtime_account_idle_ticks(ticks); 3958 irqtime_account_idle_ticks(ticks);
3960 return; 3959 return;
3961 } 3960 }
3962 3961
3963 account_idle_time(jiffies_to_cputime(ticks)); 3962 account_idle_time(jiffies_to_cputime(ticks));
3964 } 3963 }
3965 3964
3966 #endif 3965 #endif
3967 3966
3968 /* 3967 /*
3969 * Use precise platform statistics if available: 3968 * Use precise platform statistics if available:
3970 */ 3969 */
3971 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 3970 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3972 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3971 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3973 { 3972 {
3974 *ut = p->utime; 3973 *ut = p->utime;
3975 *st = p->stime; 3974 *st = p->stime;
3976 } 3975 }
3977 3976
3978 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3977 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3979 { 3978 {
3980 struct task_cputime cputime; 3979 struct task_cputime cputime;
3981 3980
3982 thread_group_cputime(p, &cputime); 3981 thread_group_cputime(p, &cputime);
3983 3982
3984 *ut = cputime.utime; 3983 *ut = cputime.utime;
3985 *st = cputime.stime; 3984 *st = cputime.stime;
3986 } 3985 }
3987 #else 3986 #else
3988 3987
3989 #ifndef nsecs_to_cputime 3988 #ifndef nsecs_to_cputime
3990 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3989 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3991 #endif 3990 #endif
3992 3991
3993 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3992 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3994 { 3993 {
3995 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3994 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3996 3995
3997 /* 3996 /*
3998 * Use CFS's precise accounting: 3997 * Use CFS's precise accounting:
3999 */ 3998 */
4000 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3999 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4001 4000
4002 if (total) { 4001 if (total) {
4003 u64 temp = rtime; 4002 u64 temp = rtime;
4004 4003
4005 temp *= utime; 4004 temp *= utime;
4006 do_div(temp, total); 4005 do_div(temp, total);
4007 utime = (cputime_t)temp; 4006 utime = (cputime_t)temp;
4008 } else 4007 } else
4009 utime = rtime; 4008 utime = rtime;
4010 4009
4011 /* 4010 /*
4012 * Compare with previous values, to keep monotonicity: 4011 * Compare with previous values, to keep monotonicity:
4013 */ 4012 */
4014 p->prev_utime = max(p->prev_utime, utime); 4013 p->prev_utime = max(p->prev_utime, utime);
4015 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 4014 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
4016 4015
4017 *ut = p->prev_utime; 4016 *ut = p->prev_utime;
4018 *st = p->prev_stime; 4017 *st = p->prev_stime;
4019 } 4018 }
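For reference, the proportional split done by task_times() above can be checked with plain numbers. This is a minimal user-space sketch of the same arithmetic, using made-up tick counts instead of cputime_t; it is not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long long utime = 300, stime = 100;	/* tick-based samples */
	unsigned long long rtime = 500;			/* precise CFS runtime */
	unsigned long long total = utime + stime;
	unsigned long long prev_utime = 0, prev_stime = 0;
	unsigned long long scaled_utime;

	/* Split the precise runtime in the same user/system ratio as the ticks. */
	scaled_utime = total ? rtime * utime / total : rtime;

	/* Keep the reported values monotonic, as task_times() does. */
	if (scaled_utime > prev_utime)
		prev_utime = scaled_utime;
	if (rtime - prev_utime > prev_stime)
		prev_stime = rtime - prev_utime;

	printf("utime=%llu stime=%llu\n", prev_utime, prev_stime);	/* 375 125 */
	return 0;
}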
4020 4019
4021 /* 4020 /*
4022 * Must be called with siglock held. 4021 * Must be called with siglock held.
4023 */ 4022 */
4024 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 4023 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4025 { 4024 {
4026 struct signal_struct *sig = p->signal; 4025 struct signal_struct *sig = p->signal;
4027 struct task_cputime cputime; 4026 struct task_cputime cputime;
4028 cputime_t rtime, utime, total; 4027 cputime_t rtime, utime, total;
4029 4028
4030 thread_group_cputime(p, &cputime); 4029 thread_group_cputime(p, &cputime);
4031 4030
4032 total = cputime_add(cputime.utime, cputime.stime); 4031 total = cputime_add(cputime.utime, cputime.stime);
4033 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 4032 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4034 4033
4035 if (total) { 4034 if (total) {
4036 u64 temp = rtime; 4035 u64 temp = rtime;
4037 4036
4038 temp *= cputime.utime; 4037 temp *= cputime.utime;
4039 do_div(temp, total); 4038 do_div(temp, total);
4040 utime = (cputime_t)temp; 4039 utime = (cputime_t)temp;
4041 } else 4040 } else
4042 utime = rtime; 4041 utime = rtime;
4043 4042
4044 sig->prev_utime = max(sig->prev_utime, utime); 4043 sig->prev_utime = max(sig->prev_utime, utime);
4045 sig->prev_stime = max(sig->prev_stime, 4044 sig->prev_stime = max(sig->prev_stime,
4046 cputime_sub(rtime, sig->prev_utime)); 4045 cputime_sub(rtime, sig->prev_utime));
4047 4046
4048 *ut = sig->prev_utime; 4047 *ut = sig->prev_utime;
4049 *st = sig->prev_stime; 4048 *st = sig->prev_stime;
4050 } 4049 }
4051 #endif 4050 #endif
4052 4051
4053 /* 4052 /*
4054 * This function gets called by the timer code, with HZ frequency. 4053 * This function gets called by the timer code, with HZ frequency.
4055 * We call it with interrupts disabled. 4054 * We call it with interrupts disabled.
4056 */ 4055 */
4057 void scheduler_tick(void) 4056 void scheduler_tick(void)
4058 { 4057 {
4059 int cpu = smp_processor_id(); 4058 int cpu = smp_processor_id();
4060 struct rq *rq = cpu_rq(cpu); 4059 struct rq *rq = cpu_rq(cpu);
4061 struct task_struct *curr = rq->curr; 4060 struct task_struct *curr = rq->curr;
4062 4061
4063 sched_clock_tick(); 4062 sched_clock_tick();
4064 4063
4065 raw_spin_lock(&rq->lock); 4064 raw_spin_lock(&rq->lock);
4066 update_rq_clock(rq); 4065 update_rq_clock(rq);
4067 update_cpu_load_active(rq); 4066 update_cpu_load_active(rq);
4068 curr->sched_class->task_tick(rq, curr, 0); 4067 curr->sched_class->task_tick(rq, curr, 0);
4069 raw_spin_unlock(&rq->lock); 4068 raw_spin_unlock(&rq->lock);
4070 4069
4071 perf_event_task_tick(); 4070 perf_event_task_tick();
4072 4071
4073 #ifdef CONFIG_SMP 4072 #ifdef CONFIG_SMP
4074 rq->idle_at_tick = idle_cpu(cpu); 4073 rq->idle_at_tick = idle_cpu(cpu);
4075 trigger_load_balance(rq, cpu); 4074 trigger_load_balance(rq, cpu);
4076 #endif 4075 #endif
4077 } 4076 }
4078 4077
4079 notrace unsigned long get_parent_ip(unsigned long addr) 4078 notrace unsigned long get_parent_ip(unsigned long addr)
4080 { 4079 {
4081 if (in_lock_functions(addr)) { 4080 if (in_lock_functions(addr)) {
4082 addr = CALLER_ADDR2; 4081 addr = CALLER_ADDR2;
4083 if (in_lock_functions(addr)) 4082 if (in_lock_functions(addr))
4084 addr = CALLER_ADDR3; 4083 addr = CALLER_ADDR3;
4085 } 4084 }
4086 return addr; 4085 return addr;
4087 } 4086 }
4088 4087
4089 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4088 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4090 defined(CONFIG_PREEMPT_TRACER)) 4089 defined(CONFIG_PREEMPT_TRACER))
4091 4090
4092 void __kprobes add_preempt_count(int val) 4091 void __kprobes add_preempt_count(int val)
4093 { 4092 {
4094 #ifdef CONFIG_DEBUG_PREEMPT 4093 #ifdef CONFIG_DEBUG_PREEMPT
4095 /* 4094 /*
4096 * Underflow? 4095 * Underflow?
4097 */ 4096 */
4098 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4097 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4099 return; 4098 return;
4100 #endif 4099 #endif
4101 preempt_count() += val; 4100 preempt_count() += val;
4102 #ifdef CONFIG_DEBUG_PREEMPT 4101 #ifdef CONFIG_DEBUG_PREEMPT
4103 /* 4102 /*
4104 * Spinlock count overflowing soon? 4103 * Spinlock count overflowing soon?
4105 */ 4104 */
4106 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4105 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4107 PREEMPT_MASK - 10); 4106 PREEMPT_MASK - 10);
4108 #endif 4107 #endif
4109 if (preempt_count() == val) 4108 if (preempt_count() == val)
4110 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4109 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4111 } 4110 }
4112 EXPORT_SYMBOL(add_preempt_count); 4111 EXPORT_SYMBOL(add_preempt_count);
4113 4112
4114 void __kprobes sub_preempt_count(int val) 4113 void __kprobes sub_preempt_count(int val)
4115 { 4114 {
4116 #ifdef CONFIG_DEBUG_PREEMPT 4115 #ifdef CONFIG_DEBUG_PREEMPT
4117 /* 4116 /*
4118 * Underflow? 4117 * Underflow?
4119 */ 4118 */
4120 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4119 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4121 return; 4120 return;
4122 /* 4121 /*
4123 * Is the spinlock portion underflowing? 4122 * Is the spinlock portion underflowing?
4124 */ 4123 */
4125 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4124 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4126 !(preempt_count() & PREEMPT_MASK))) 4125 !(preempt_count() & PREEMPT_MASK)))
4127 return; 4126 return;
4128 #endif 4127 #endif
4129 4128
4130 if (preempt_count() == val) 4129 if (preempt_count() == val)
4131 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4130 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4132 preempt_count() -= val; 4131 preempt_count() -= val;
4133 } 4132 }
4134 EXPORT_SYMBOL(sub_preempt_count); 4133 EXPORT_SYMBOL(sub_preempt_count);
4135 4134
4136 #endif 4135 #endif
4137 4136
4138 /* 4137 /*
4139 * Print scheduling while atomic bug: 4138 * Print scheduling while atomic bug:
4140 */ 4139 */
4141 static noinline void __schedule_bug(struct task_struct *prev) 4140 static noinline void __schedule_bug(struct task_struct *prev)
4142 { 4141 {
4143 struct pt_regs *regs = get_irq_regs(); 4142 struct pt_regs *regs = get_irq_regs();
4144 4143
4145 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 4144 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4146 prev->comm, prev->pid, preempt_count()); 4145 prev->comm, prev->pid, preempt_count());
4147 4146
4148 debug_show_held_locks(prev); 4147 debug_show_held_locks(prev);
4149 print_modules(); 4148 print_modules();
4150 if (irqs_disabled()) 4149 if (irqs_disabled())
4151 print_irqtrace_events(prev); 4150 print_irqtrace_events(prev);
4152 4151
4153 if (regs) 4152 if (regs)
4154 show_regs(regs); 4153 show_regs(regs);
4155 else 4154 else
4156 dump_stack(); 4155 dump_stack();
4157 } 4156 }
4158 4157
4159 /* 4158 /*
4160 * Various schedule()-time debugging checks and statistics: 4159 * Various schedule()-time debugging checks and statistics:
4161 */ 4160 */
4162 static inline void schedule_debug(struct task_struct *prev) 4161 static inline void schedule_debug(struct task_struct *prev)
4163 { 4162 {
4164 /* 4163 /*
4165 * Test if we are atomic. Since do_exit() needs to call into 4164 * Test if we are atomic. Since do_exit() needs to call into
4166 * schedule() atomically, we ignore that path for now. 4165 * schedule() atomically, we ignore that path for now.
4167 * Otherwise, whine if we are scheduling when we should not be. 4166 * Otherwise, whine if we are scheduling when we should not be.
4168 */ 4167 */
4169 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4168 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4170 __schedule_bug(prev); 4169 __schedule_bug(prev);
4171 4170
4172 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4171 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4173 4172
4174 schedstat_inc(this_rq(), sched_count); 4173 schedstat_inc(this_rq(), sched_count);
4175 } 4174 }
4176 4175
4177 static void put_prev_task(struct rq *rq, struct task_struct *prev) 4176 static void put_prev_task(struct rq *rq, struct task_struct *prev)
4178 { 4177 {
4179 if (prev->on_rq || rq->skip_clock_update < 0) 4178 if (prev->on_rq || rq->skip_clock_update < 0)
4180 update_rq_clock(rq); 4179 update_rq_clock(rq);
4181 prev->sched_class->put_prev_task(rq, prev); 4180 prev->sched_class->put_prev_task(rq, prev);
4182 } 4181 }
4183 4182
4184 /* 4183 /*
4185 * Pick up the highest-prio task: 4184 * Pick up the highest-prio task:
4186 */ 4185 */
4187 static inline struct task_struct * 4186 static inline struct task_struct *
4188 pick_next_task(struct rq *rq) 4187 pick_next_task(struct rq *rq)
4189 { 4188 {
4190 const struct sched_class *class; 4189 const struct sched_class *class;
4191 struct task_struct *p; 4190 struct task_struct *p;
4192 4191
4193 /* 4192 /*
4194 * Optimization: we know that if all tasks are in 4193 * Optimization: we know that if all tasks are in
4195 * the fair class we can call that function directly: 4194 * the fair class we can call that function directly:
4196 */ 4195 */
4197 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4196 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4198 p = fair_sched_class.pick_next_task(rq); 4197 p = fair_sched_class.pick_next_task(rq);
4199 if (likely(p)) 4198 if (likely(p))
4200 return p; 4199 return p;
4201 } 4200 }
4202 4201
4203 for_each_class(class) { 4202 for_each_class(class) {
4204 p = class->pick_next_task(rq); 4203 p = class->pick_next_task(rq);
4205 if (p) 4204 if (p)
4206 return p; 4205 return p;
4207 } 4206 }
4208 4207
4209 BUG(); /* the idle class will always have a runnable task */ 4208 BUG(); /* the idle class will always have a runnable task */
4210 } 4209 }
4211 4210
4212 /* 4211 /*
4213 * schedule() is the main scheduler function. 4212 * schedule() is the main scheduler function.
4214 */ 4213 */
4215 asmlinkage void __sched schedule(void) 4214 asmlinkage void __sched schedule(void)
4216 { 4215 {
4217 struct task_struct *prev, *next; 4216 struct task_struct *prev, *next;
4218 unsigned long *switch_count; 4217 unsigned long *switch_count;
4219 struct rq *rq; 4218 struct rq *rq;
4220 int cpu; 4219 int cpu;
4221 4220
4222 need_resched: 4221 need_resched:
4223 preempt_disable(); 4222 preempt_disable();
4224 cpu = smp_processor_id(); 4223 cpu = smp_processor_id();
4225 rq = cpu_rq(cpu); 4224 rq = cpu_rq(cpu);
4226 rcu_note_context_switch(cpu); 4225 rcu_note_context_switch(cpu);
4227 prev = rq->curr; 4226 prev = rq->curr;
4228 4227
4229 schedule_debug(prev); 4228 schedule_debug(prev);
4230 4229
4231 if (sched_feat(HRTICK)) 4230 if (sched_feat(HRTICK))
4232 hrtick_clear(rq); 4231 hrtick_clear(rq);
4233 4232
4234 raw_spin_lock_irq(&rq->lock); 4233 raw_spin_lock_irq(&rq->lock);
4235 4234
4236 switch_count = &prev->nivcsw; 4235 switch_count = &prev->nivcsw;
4237 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4236 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4238 if (unlikely(signal_pending_state(prev->state, prev))) { 4237 if (unlikely(signal_pending_state(prev->state, prev))) {
4239 prev->state = TASK_RUNNING; 4238 prev->state = TASK_RUNNING;
4240 } else { 4239 } else {
4241 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4240 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4242 prev->on_rq = 0; 4241 prev->on_rq = 0;
4243 4242
4244 /* 4243 /*
4245 * If a worker went to sleep, notify and ask workqueue 4244 * If a worker went to sleep, notify and ask workqueue
4246 * whether it wants to wake up a task to maintain 4245 * whether it wants to wake up a task to maintain
4247 * concurrency. 4246 * concurrency.
4248 */ 4247 */
4249 if (prev->flags & PF_WQ_WORKER) { 4248 if (prev->flags & PF_WQ_WORKER) {
4250 struct task_struct *to_wakeup; 4249 struct task_struct *to_wakeup;
4251 4250
4252 to_wakeup = wq_worker_sleeping(prev, cpu); 4251 to_wakeup = wq_worker_sleeping(prev, cpu);
4253 if (to_wakeup) 4252 if (to_wakeup)
4254 try_to_wake_up_local(to_wakeup); 4253 try_to_wake_up_local(to_wakeup);
4255 } 4254 }
4256 4255
4257 /* 4256 /*
4258 * If we are going to sleep and we have plugged IO 4257 * If we are going to sleep and we have plugged IO
4259 * queued, make sure to submit it to avoid deadlocks. 4258 * queued, make sure to submit it to avoid deadlocks.
4260 */ 4259 */
4261 if (blk_needs_flush_plug(prev)) { 4260 if (blk_needs_flush_plug(prev)) {
4262 raw_spin_unlock(&rq->lock); 4261 raw_spin_unlock(&rq->lock);
4263 blk_schedule_flush_plug(prev); 4262 blk_schedule_flush_plug(prev);
4264 raw_spin_lock(&rq->lock); 4263 raw_spin_lock(&rq->lock);
4265 } 4264 }
4266 } 4265 }
4267 switch_count = &prev->nvcsw; 4266 switch_count = &prev->nvcsw;
4268 } 4267 }
4269 4268
4270 pre_schedule(rq, prev); 4269 pre_schedule(rq, prev);
4271 4270
4272 if (unlikely(!rq->nr_running)) 4271 if (unlikely(!rq->nr_running))
4273 idle_balance(cpu, rq); 4272 idle_balance(cpu, rq);
4274 4273
4275 put_prev_task(rq, prev); 4274 put_prev_task(rq, prev);
4276 next = pick_next_task(rq); 4275 next = pick_next_task(rq);
4277 clear_tsk_need_resched(prev); 4276 clear_tsk_need_resched(prev);
4278 rq->skip_clock_update = 0; 4277 rq->skip_clock_update = 0;
4279 4278
4280 if (likely(prev != next)) { 4279 if (likely(prev != next)) {
4281 rq->nr_switches++; 4280 rq->nr_switches++;
4282 rq->curr = next; 4281 rq->curr = next;
4283 ++*switch_count; 4282 ++*switch_count;
4284 4283
4285 context_switch(rq, prev, next); /* unlocks the rq */ 4284 context_switch(rq, prev, next); /* unlocks the rq */
4286 /* 4285 /*
4288 * The context switch has flipped the stack from under us 4287 * The context switch has flipped the stack from under us
4288 * and restored the local variables which were saved when 4287 * and restored the local variables which were saved when
4289 * this task called schedule() in the past. prev == current 4288 * this task called schedule() in the past. prev == current
4290 * is still correct, but it can be moved to another cpu/rq. 4289 * is still correct, but it can be moved to another cpu/rq.
4291 */ 4290 */
4292 cpu = smp_processor_id(); 4291 cpu = smp_processor_id();
4293 rq = cpu_rq(cpu); 4292 rq = cpu_rq(cpu);
4294 } else 4293 } else
4295 raw_spin_unlock_irq(&rq->lock); 4294 raw_spin_unlock_irq(&rq->lock);
4296 4295
4297 post_schedule(rq); 4296 post_schedule(rq);
4298 4297
4299 preempt_enable_no_resched(); 4298 preempt_enable_no_resched();
4300 if (need_resched()) 4299 if (need_resched())
4301 goto need_resched; 4300 goto need_resched;
4302 } 4301 }
4303 EXPORT_SYMBOL(schedule); 4302 EXPORT_SYMBOL(schedule);
4304 4303
4305 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4304 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4306 4305
4307 static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4306 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4308 { 4307 {
4309 bool ret = false; 4308 bool ret = false;
4310 4309
4311 rcu_read_lock(); 4310 rcu_read_lock();
4312 if (lock->owner != owner) 4311 if (lock->owner != owner)
4313 goto fail; 4312 goto fail;
4314 4313
4315 /* 4314 /*
4316 * Ensure we emit the owner->on_cpu dereference _after_ checking that 4315 * Ensure we emit the owner->on_cpu dereference _after_ checking that
4317 * lock->owner still matches owner. If that fails, owner might 4316 * lock->owner still matches owner. If that fails, owner might
4318 * point to free()d memory; if it still matches, the rcu_read_lock() 4317 * point to free()d memory; if it still matches, the rcu_read_lock()
4319 * ensures the memory stays valid. 4318 * ensures the memory stays valid.
4320 */ 4319 */
4321 barrier(); 4320 barrier();
4322 4321
4323 ret = owner->on_cpu; 4322 ret = owner->on_cpu;
4324 fail: 4323 fail:
4325 rcu_read_unlock(); 4324 rcu_read_unlock();
4326 4325
4327 return ret; 4326 return ret;
4328 } 4327 }
4329 4328
4330 /* 4329 /*
4331 * Look out! "owner" is an entirely speculative pointer 4330 * Look out! "owner" is an entirely speculative pointer
4332 * access and not reliable. 4331 * access and not reliable.
4333 */ 4332 */
4334 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 4333 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4335 { 4334 {
4336 if (!sched_feat(OWNER_SPIN)) 4335 if (!sched_feat(OWNER_SPIN))
4337 return 0; 4336 return 0;
4338 4337
4339 while (owner_running(lock, owner)) { 4338 while (owner_running(lock, owner)) {
4340 if (need_resched()) 4339 if (need_resched())
4341 return 0; 4340 return 0;
4342 4341
4343 arch_mutex_cpu_relax(); 4342 arch_mutex_cpu_relax();
4344 } 4343 }
4345 4344
4346 /* 4345 /*
4347 * If the owner changed to another task, there is likely 4346 * If the owner changed to another task, there is likely
4348 * heavy contention; stop spinning. 4347 * heavy contention; stop spinning.
4349 */ 4348 */
4350 if (lock->owner) 4349 if (lock->owner)
4351 return 0; 4350 return 0;
4352 4351
4353 return 1; 4352 return 1;
4354 } 4353 }
4355 #endif 4354 #endif
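For context, mutex_spin_on_owner() is meant to be called from a mutex slow path that is spinning optimistically. The following is a simplified sketch of that calling pattern, assuming CONFIG_MUTEX_SPIN_ON_OWNER; try_acquire() is a placeholder for the real cmpxchg-based acquisition, and this is not the actual kernel/mutex.c slow path:

#include <linux/mutex.h>
#include <linux/sched.h>

/* Placeholder for the real cmpxchg-based acquisition attempt. */
static bool try_acquire(struct mutex *lock);

static bool optimistic_spin(struct mutex *lock)
{
	for (;;) {
		struct task_struct *owner;

		/* Snapshot the owner; NULL means the lock may already be free. */
		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner))
			return false;		/* owner went to sleep: block instead */

		if (try_acquire(lock))		/* placeholder, see lead-in */
			return true;

		if (need_resched())
			return false;

		arch_mutex_cpu_relax();
	}
}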
4356 4355
4357 #ifdef CONFIG_PREEMPT 4356 #ifdef CONFIG_PREEMPT
4358 /* 4357 /*
4359 * This is the entry point to schedule() from in-kernel preemption 4358 * This is the entry point to schedule() from in-kernel preemption
4360 * off of preempt_enable. Kernel preemptions off of return-from-interrupt 4359 * off of preempt_enable. Kernel preemptions off of return-from-interrupt
4361 * occur there and call schedule() directly. 4360 * occur there and call schedule() directly.
4362 */ 4361 */
4363 asmlinkage void __sched notrace preempt_schedule(void) 4362 asmlinkage void __sched notrace preempt_schedule(void)
4364 { 4363 {
4365 struct thread_info *ti = current_thread_info(); 4364 struct thread_info *ti = current_thread_info();
4366 4365
4367 /* 4366 /*
4368 * If there is a non-zero preempt_count or interrupts are disabled, 4367 * If there is a non-zero preempt_count or interrupts are disabled,
4369 * we do not want to preempt the current task. Just return. 4368 * we do not want to preempt the current task. Just return.
4370 */ 4369 */
4371 if (likely(ti->preempt_count || irqs_disabled())) 4370 if (likely(ti->preempt_count || irqs_disabled()))
4372 return; 4371 return;
4373 4372
4374 do { 4373 do {
4375 add_preempt_count_notrace(PREEMPT_ACTIVE); 4374 add_preempt_count_notrace(PREEMPT_ACTIVE);
4376 schedule(); 4375 schedule();
4377 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4376 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4378 4377
4379 /* 4378 /*
4380 * Check again in case we missed a preemption opportunity 4379 * Check again in case we missed a preemption opportunity
4381 * between schedule and now. 4380 * between schedule and now.
4382 */ 4381 */
4383 barrier(); 4382 barrier();
4384 } while (need_resched()); 4383 } while (need_resched());
4385 } 4384 }
4386 EXPORT_SYMBOL(preempt_schedule); 4385 EXPORT_SYMBOL(preempt_schedule);
4387 4386
4388 /* 4387 /*
4389 * This is the entry point to schedule() from kernel preemption 4388 * This is the entry point to schedule() from kernel preemption
4390 * off of irq context. 4389 * off of irq context.
4391 * Note that this is called and returns with irqs disabled. This 4390 * Note that this is called and returns with irqs disabled. This
4392 * protects us against recursive calls from irq. 4391 * protects us against recursive calls from irq.
4393 */ 4392 */
4394 asmlinkage void __sched preempt_schedule_irq(void) 4393 asmlinkage void __sched preempt_schedule_irq(void)
4395 { 4394 {
4396 struct thread_info *ti = current_thread_info(); 4395 struct thread_info *ti = current_thread_info();
4397 4396
4398 /* Catch callers which need to be fixed */ 4397 /* Catch callers which need to be fixed */
4399 BUG_ON(ti->preempt_count || !irqs_disabled()); 4398 BUG_ON(ti->preempt_count || !irqs_disabled());
4400 4399
4401 do { 4400 do {
4402 add_preempt_count(PREEMPT_ACTIVE); 4401 add_preempt_count(PREEMPT_ACTIVE);
4403 local_irq_enable(); 4402 local_irq_enable();
4404 schedule(); 4403 schedule();
4405 local_irq_disable(); 4404 local_irq_disable();
4406 sub_preempt_count(PREEMPT_ACTIVE); 4405 sub_preempt_count(PREEMPT_ACTIVE);
4407 4406
4408 /* 4407 /*
4409 * Check again in case we missed a preemption opportunity 4408 * Check again in case we missed a preemption opportunity
4410 * between schedule and now. 4409 * between schedule and now.
4411 */ 4410 */
4412 barrier(); 4411 barrier();
4413 } while (need_resched()); 4412 } while (need_resched());
4414 } 4413 }
4415 4414
4416 #endif /* CONFIG_PREEMPT */ 4415 #endif /* CONFIG_PREEMPT */
4417 4416
4418 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 4417 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4419 void *key) 4418 void *key)
4420 { 4419 {
4421 return try_to_wake_up(curr->private, mode, wake_flags); 4420 return try_to_wake_up(curr->private, mode, wake_flags);
4422 } 4421 }
4423 EXPORT_SYMBOL(default_wake_function); 4422 EXPORT_SYMBOL(default_wake_function);
4424 4423
4425 /* 4424 /*
4426 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4425 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4427 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4426 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4428 * number) then we wake all the non-exclusive tasks and one exclusive task. 4427 * number) then we wake all the non-exclusive tasks and one exclusive task.
4429 * 4428 *
4430 * There are circumstances in which we can try to wake a task which has already 4429 * There are circumstances in which we can try to wake a task which has already
4431 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4430 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4432 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4431 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4433 */ 4432 */
4434 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4433 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4435 int nr_exclusive, int wake_flags, void *key) 4434 int nr_exclusive, int wake_flags, void *key)
4436 { 4435 {
4437 wait_queue_t *curr, *next; 4436 wait_queue_t *curr, *next;
4438 4437
4439 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 4438 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4440 unsigned flags = curr->flags; 4439 unsigned flags = curr->flags;
4441 4440
4442 if (curr->func(curr, mode, wake_flags, key) && 4441 if (curr->func(curr, mode, wake_flags, key) &&
4443 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 4442 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4444 break; 4443 break;
4445 } 4444 }
4446 } 4445 }
4447 4446
4448 /** 4447 /**
4449 * __wake_up - wake up threads blocked on a waitqueue. 4448 * __wake_up - wake up threads blocked on a waitqueue.
4450 * @q: the waitqueue 4449 * @q: the waitqueue
4451 * @mode: which threads 4450 * @mode: which threads
4452 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4451 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4453 * @key: is directly passed to the wakeup function 4452 * @key: is directly passed to the wakeup function
4454 * 4453 *
4455 * It may be assumed that this function implies a write memory barrier before 4454 * It may be assumed that this function implies a write memory barrier before
4456 * changing the task state if and only if any tasks are woken up. 4455 * changing the task state if and only if any tasks are woken up.
4457 */ 4456 */
4458 void __wake_up(wait_queue_head_t *q, unsigned int mode, 4457 void __wake_up(wait_queue_head_t *q, unsigned int mode,
4459 int nr_exclusive, void *key) 4458 int nr_exclusive, void *key)
4460 { 4459 {
4461 unsigned long flags; 4460 unsigned long flags;
4462 4461
4463 spin_lock_irqsave(&q->lock, flags); 4462 spin_lock_irqsave(&q->lock, flags);
4464 __wake_up_common(q, mode, nr_exclusive, 0, key); 4463 __wake_up_common(q, mode, nr_exclusive, 0, key);
4465 spin_unlock_irqrestore(&q->lock, flags); 4464 spin_unlock_irqrestore(&q->lock, flags);
4466 } 4465 }
4467 EXPORT_SYMBOL(__wake_up); 4466 EXPORT_SYMBOL(__wake_up);
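To illustrate the nr_exclusive handling in __wake_up_common(): a waiter queued with prepare_to_wait_exclusive() has WQ_FLAG_EXCLUSIVE set, so a plain wake_up() (nr_exclusive == 1) wakes every non-exclusive waiter plus at most one exclusive waiter. A minimal sketch of that pattern follows; the my_waitq/resource_available names are illustrative and the locking around the condition is elided:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
static int resource_available;		/* real code would protect this */

static void wait_for_resource(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* Queues us with WQ_FLAG_EXCLUSIVE set. */
		prepare_to_wait_exclusive(&my_waitq, &wait, TASK_UNINTERRUPTIBLE);
		if (resource_available)
			break;
		schedule();
	}
	finish_wait(&my_waitq, &wait);
}

static void release_resource(void)
{
	resource_available = 1;
	/* nr_exclusive == 1: all non-exclusive waiters plus one exclusive one. */
	wake_up(&my_waitq);
}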
4468 4467
4469 /* 4468 /*
4470 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4469 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4471 */ 4470 */
4472 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4471 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4473 { 4472 {
4474 __wake_up_common(q, mode, 1, 0, NULL); 4473 __wake_up_common(q, mode, 1, 0, NULL);
4475 } 4474 }
4476 EXPORT_SYMBOL_GPL(__wake_up_locked); 4475 EXPORT_SYMBOL_GPL(__wake_up_locked);
4477 4476
4478 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4477 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4479 { 4478 {
4480 __wake_up_common(q, mode, 1, 0, key); 4479 __wake_up_common(q, mode, 1, 0, key);
4481 } 4480 }
4482 EXPORT_SYMBOL_GPL(__wake_up_locked_key); 4481 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4483 4482
4484 /** 4483 /**
4485 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4484 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4486 * @q: the waitqueue 4485 * @q: the waitqueue
4487 * @mode: which threads 4486 * @mode: which threads
4488 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4487 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4489 * @key: opaque value to be passed to wakeup targets 4488 * @key: opaque value to be passed to wakeup targets
4490 * 4489 *
4491 * The sync wakeup differs in that the waker knows that it will schedule 4490 * The sync wakeup differs in that the waker knows that it will schedule
4492 * away soon, so while the target thread will be woken up, it will not 4491 * away soon, so while the target thread will be woken up, it will not
4493 * be migrated to another CPU - i.e. the two threads are 'synchronized' 4492 * be migrated to another CPU - i.e. the two threads are 'synchronized'
4494 * with each other. This can prevent needless bouncing between CPUs. 4493 * with each other. This can prevent needless bouncing between CPUs.
4495 * 4494 *
4496 * On UP it can prevent extra preemption. 4495 * On UP it can prevent extra preemption.
4497 * 4496 *
4498 * It may be assumed that this function implies a write memory barrier before 4497 * It may be assumed that this function implies a write memory barrier before
4499 * changing the task state if and only if any tasks are woken up. 4498 * changing the task state if and only if any tasks are woken up.
4500 */ 4499 */
4501 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 4500 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4502 int nr_exclusive, void *key) 4501 int nr_exclusive, void *key)
4503 { 4502 {
4504 unsigned long flags; 4503 unsigned long flags;
4505 int wake_flags = WF_SYNC; 4504 int wake_flags = WF_SYNC;
4506 4505
4507 if (unlikely(!q)) 4506 if (unlikely(!q))
4508 return; 4507 return;
4509 4508
4510 if (unlikely(!nr_exclusive)) 4509 if (unlikely(!nr_exclusive))
4511 wake_flags = 0; 4510 wake_flags = 0;
4512 4511
4513 spin_lock_irqsave(&q->lock, flags); 4512 spin_lock_irqsave(&q->lock, flags);
4514 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 4513 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4515 spin_unlock_irqrestore(&q->lock, flags); 4514 spin_unlock_irqrestore(&q->lock, flags);
4516 } 4515 }
4517 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 4516 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4518 4517
4519 /* 4518 /*
4520 * __wake_up_sync - see __wake_up_sync_key() 4519 * __wake_up_sync - see __wake_up_sync_key()
4521 */ 4520 */
4522 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4521 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4523 { 4522 {
4524 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 4523 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4525 } 4524 }
4526 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4525 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4527 4526
4528 /** 4527 /**
4529 * complete: - signals a single thread waiting on this completion 4528 * complete: - signals a single thread waiting on this completion
4530 * @x: holds the state of this particular completion 4529 * @x: holds the state of this particular completion
4531 * 4530 *
4532 * This will wake up a single thread waiting on this completion. Threads will be 4531 * This will wake up a single thread waiting on this completion. Threads will be
4533 * awakened in the same order in which they were queued. 4532 * awakened in the same order in which they were queued.
4534 * 4533 *
4535 * See also complete_all(), wait_for_completion() and related routines. 4534 * See also complete_all(), wait_for_completion() and related routines.
4536 * 4535 *
4537 * It may be assumed that this function implies a write memory barrier before 4536 * It may be assumed that this function implies a write memory barrier before
4538 * changing the task state if and only if any tasks are woken up. 4537 * changing the task state if and only if any tasks are woken up.
4539 */ 4538 */
4540 void complete(struct completion *x) 4539 void complete(struct completion *x)
4541 { 4540 {
4542 unsigned long flags; 4541 unsigned long flags;
4543 4542
4544 spin_lock_irqsave(&x->wait.lock, flags); 4543 spin_lock_irqsave(&x->wait.lock, flags);
4545 x->done++; 4544 x->done++;
4546 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 4545 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4547 spin_unlock_irqrestore(&x->wait.lock, flags); 4546 spin_unlock_irqrestore(&x->wait.lock, flags);
4548 } 4547 }
4549 EXPORT_SYMBOL(complete); 4548 EXPORT_SYMBOL(complete);
4550 4549
4551 /** 4550 /**
4552 * complete_all: - signals all threads waiting on this completion 4551 * complete_all: - signals all threads waiting on this completion
4553 * @x: holds the state of this particular completion 4552 * @x: holds the state of this particular completion
4554 * 4553 *
4555 * This will wake up all threads waiting on this particular completion event. 4554 * This will wake up all threads waiting on this particular completion event.
4556 * 4555 *
4557 * It may be assumed that this function implies a write memory barrier before 4556 * It may be assumed that this function implies a write memory barrier before
4558 * changing the task state if and only if any tasks are woken up. 4557 * changing the task state if and only if any tasks are woken up.
4559 */ 4558 */
4560 void complete_all(struct completion *x) 4559 void complete_all(struct completion *x)
4561 { 4560 {
4562 unsigned long flags; 4561 unsigned long flags;
4563 4562
4564 spin_lock_irqsave(&x->wait.lock, flags); 4563 spin_lock_irqsave(&x->wait.lock, flags);
4565 x->done += UINT_MAX/2; 4564 x->done += UINT_MAX/2;
4566 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 4565 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4567 spin_unlock_irqrestore(&x->wait.lock, flags); 4566 spin_unlock_irqrestore(&x->wait.lock, flags);
4568 } 4567 }
4569 EXPORT_SYMBOL(complete_all); 4568 EXPORT_SYMBOL(complete_all);
4570 4569
4571 static inline long __sched 4570 static inline long __sched
4572 do_wait_for_common(struct completion *x, long timeout, int state) 4571 do_wait_for_common(struct completion *x, long timeout, int state)
4573 { 4572 {
4574 if (!x->done) { 4573 if (!x->done) {
4575 DECLARE_WAITQUEUE(wait, current); 4574 DECLARE_WAITQUEUE(wait, current);
4576 4575
4577 __add_wait_queue_tail_exclusive(&x->wait, &wait); 4576 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4578 do { 4577 do {
4579 if (signal_pending_state(state, current)) { 4578 if (signal_pending_state(state, current)) {
4580 timeout = -ERESTARTSYS; 4579 timeout = -ERESTARTSYS;
4581 break; 4580 break;
4582 } 4581 }
4583 __set_current_state(state); 4582 __set_current_state(state);
4584 spin_unlock_irq(&x->wait.lock); 4583 spin_unlock_irq(&x->wait.lock);
4585 timeout = schedule_timeout(timeout); 4584 timeout = schedule_timeout(timeout);
4586 spin_lock_irq(&x->wait.lock); 4585 spin_lock_irq(&x->wait.lock);
4587 } while (!x->done && timeout); 4586 } while (!x->done && timeout);
4588 __remove_wait_queue(&x->wait, &wait); 4587 __remove_wait_queue(&x->wait, &wait);
4589 if (!x->done) 4588 if (!x->done)
4590 return timeout; 4589 return timeout;
4591 } 4590 }
4592 x->done--; 4591 x->done--;
4593 return timeout ?: 1; 4592 return timeout ?: 1;
4594 } 4593 }
4595 4594
4596 static long __sched 4595 static long __sched
4597 wait_for_common(struct completion *x, long timeout, int state) 4596 wait_for_common(struct completion *x, long timeout, int state)
4598 { 4597 {
4599 might_sleep(); 4598 might_sleep();
4600 4599
4601 spin_lock_irq(&x->wait.lock); 4600 spin_lock_irq(&x->wait.lock);
4602 timeout = do_wait_for_common(x, timeout, state); 4601 timeout = do_wait_for_common(x, timeout, state);
4603 spin_unlock_irq(&x->wait.lock); 4602 spin_unlock_irq(&x->wait.lock);
4604 return timeout; 4603 return timeout;
4605 } 4604 }
4606 4605
4607 /** 4606 /**
4608 * wait_for_completion: - waits for completion of a task 4607 * wait_for_completion: - waits for completion of a task
4609 * @x: holds the state of this particular completion 4608 * @x: holds the state of this particular completion
4610 * 4609 *
4611 * This waits to be signaled for completion of a specific task. It is NOT 4610 * This waits to be signaled for completion of a specific task. It is NOT
4612 * interruptible and there is no timeout. 4611 * interruptible and there is no timeout.
4613 * 4612 *
4614 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 4613 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
4615 * and interrupt capability. Also see complete(). 4614 * and interrupt capability. Also see complete().
4616 */ 4615 */
4617 void __sched wait_for_completion(struct completion *x) 4616 void __sched wait_for_completion(struct completion *x)
4618 { 4617 {
4619 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4618 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4620 } 4619 }
4621 EXPORT_SYMBOL(wait_for_completion); 4620 EXPORT_SYMBOL(wait_for_completion);
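The completion API above is typically used to wait for a one-off event, such as a helper thread finishing its setup. The following is a minimal, hypothetical module sketch built only on the interfaces declared above and in <linux/completion.h>; it is not taken from any in-tree driver:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/err.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *data)
{
	msleep(100);			/* stand-in for real initialisation work */
	complete(&setup_done);		/* wakes exactly one waiter */
	return 0;
}

static int __init completion_demo_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(worker_fn, NULL, "completion-demo");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* Sleeps in TASK_UNINTERRUPTIBLE until worker_fn() calls complete(). */
	wait_for_completion(&setup_done);
	return 0;
}

static void __exit completion_demo_exit(void)
{
}

module_init(completion_demo_init);
module_exit(completion_demo_exit);
MODULE_LICENSE("GPL");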
4622 4621
4623 /** 4622 /**
4624 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4623 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4625 * @x: holds the state of this particular completion 4624 * @x: holds the state of this particular completion
4626 * @timeout: timeout value in jiffies 4625 * @timeout: timeout value in jiffies
4627 * 4626 *
4628 * This waits for either a completion of a specific task to be signaled or for a 4627 * This waits for either a completion of a specific task to be signaled or for a
4629 * specified timeout to expire. The timeout is in jiffies. It is not 4628 * specified timeout to expire. The timeout is in jiffies. It is not
4630 * interruptible. 4629 * interruptible.
4631 */ 4630 */
4632 unsigned long __sched 4631 unsigned long __sched
4633 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4632 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4634 { 4633 {
4635 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4634 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4636 } 4635 }
4637 EXPORT_SYMBOL(wait_for_completion_timeout); 4636 EXPORT_SYMBOL(wait_for_completion_timeout);
4638 4637
4639 /** 4638 /**
4640 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4639 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4641 * @x: holds the state of this particular completion 4640 * @x: holds the state of this particular completion
4642 * 4641 *
4643 * This waits for completion of a specific task to be signaled. It is 4642 * This waits for completion of a specific task to be signaled. It is
4644 * interruptible. 4643 * interruptible.
4645 */ 4644 */
4646 int __sched wait_for_completion_interruptible(struct completion *x) 4645 int __sched wait_for_completion_interruptible(struct completion *x)
4647 { 4646 {
4648 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4647 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4649 if (t == -ERESTARTSYS) 4648 if (t == -ERESTARTSYS)
4650 return t; 4649 return t;
4651 return 0; 4650 return 0;
4652 } 4651 }
4653 EXPORT_SYMBOL(wait_for_completion_interruptible); 4652 EXPORT_SYMBOL(wait_for_completion_interruptible);
4654 4653
4655 /** 4654 /**
4656 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4655 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4657 * @x: holds the state of this particular completion 4656 * @x: holds the state of this particular completion
4658 * @timeout: timeout value in jiffies 4657 * @timeout: timeout value in jiffies
4659 * 4658 *
4660 * This waits for either a completion of a specific task to be signaled or for a 4659 * This waits for either a completion of a specific task to be signaled or for a
4661 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4660 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4662 */ 4661 */
4663 long __sched 4662 long __sched
4664 wait_for_completion_interruptible_timeout(struct completion *x, 4663 wait_for_completion_interruptible_timeout(struct completion *x,
4665 unsigned long timeout) 4664 unsigned long timeout)
4666 { 4665 {
4667 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4666 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4668 } 4667 }
4669 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4668 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4670 4669
4671 /** 4670 /**
4672 * wait_for_completion_killable: - waits for completion of a task (killable) 4671 * wait_for_completion_killable: - waits for completion of a task (killable)
4673 * @x: holds the state of this particular completion 4672 * @x: holds the state of this particular completion
4674 * 4673 *
4675 * This waits to be signaled for completion of a specific task. It can be 4674 * This waits to be signaled for completion of a specific task. It can be
4676 * interrupted by a kill signal. 4675 * interrupted by a kill signal.
4677 */ 4676 */
4678 int __sched wait_for_completion_killable(struct completion *x) 4677 int __sched wait_for_completion_killable(struct completion *x)
4679 { 4678 {
4680 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4679 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4681 if (t == -ERESTARTSYS) 4680 if (t == -ERESTARTSYS)
4682 return t; 4681 return t;
4683 return 0; 4682 return 0;
4684 } 4683 }
4685 EXPORT_SYMBOL(wait_for_completion_killable); 4684 EXPORT_SYMBOL(wait_for_completion_killable);
4686 4685
4687 /** 4686 /**
4688 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 4687 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4689 * @x: holds the state of this particular completion 4688 * @x: holds the state of this particular completion
4690 * @timeout: timeout value in jiffies 4689 * @timeout: timeout value in jiffies
4691 * 4690 *
4692 * This waits for either a completion of a specific task to be 4691 * This waits for either a completion of a specific task to be
4693 * signaled or for a specified timeout to expire. It can be 4692 * signaled or for a specified timeout to expire. It can be
4694 * interrupted by a kill signal. The timeout is in jiffies. 4693 * interrupted by a kill signal. The timeout is in jiffies.
4695 */ 4694 */
4696 long __sched 4695 long __sched
4697 wait_for_completion_killable_timeout(struct completion *x, 4696 wait_for_completion_killable_timeout(struct completion *x,
4698 unsigned long timeout) 4697 unsigned long timeout)
4699 { 4698 {
4700 return wait_for_common(x, timeout, TASK_KILLABLE); 4699 return wait_for_common(x, timeout, TASK_KILLABLE);
4701 } 4700 }
4702 EXPORT_SYMBOL(wait_for_completion_killable_timeout); 4701 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4703 4702
4704 /** 4703 /**
4705 * try_wait_for_completion - try to decrement a completion without blocking 4704 * try_wait_for_completion - try to decrement a completion without blocking
4706 * @x: completion structure 4705 * @x: completion structure
4707 * 4706 *
4708 * Returns: 0 if a decrement cannot be done without blocking 4707 * Returns: 0 if a decrement cannot be done without blocking
4709 * 1 if a decrement succeeded. 4708 * 1 if a decrement succeeded.
4710 * 4709 *
4711 * If a completion is being used as a counting completion, 4710 * If a completion is being used as a counting completion,
4712 * attempt to decrement the counter without blocking. This 4711 * attempt to decrement the counter without blocking. This
4713 * enables us to avoid waiting if the resource the completion 4712 * enables us to avoid waiting if the resource the completion
4714 * is protecting is not available. 4713 * is protecting is not available.
4715 */ 4714 */
4716 bool try_wait_for_completion(struct completion *x) 4715 bool try_wait_for_completion(struct completion *x)
4717 { 4716 {
4718 unsigned long flags; 4717 unsigned long flags;
4719 int ret = 1; 4718 int ret = 1;
4720 4719
4721 spin_lock_irqsave(&x->wait.lock, flags); 4720 spin_lock_irqsave(&x->wait.lock, flags);
4722 if (!x->done) 4721 if (!x->done)
4723 ret = 0; 4722 ret = 0;
4724 else 4723 else
4725 x->done--; 4724 x->done--;
4726 spin_unlock_irqrestore(&x->wait.lock, flags); 4725 spin_unlock_irqrestore(&x->wait.lock, flags);
4727 return ret; 4726 return ret;
4728 } 4727 }
4729 EXPORT_SYMBOL(try_wait_for_completion); 4728 EXPORT_SYMBOL(try_wait_for_completion);
4730 4729
4731 /** 4730 /**
4732 * completion_done - Test to see if a completion has any waiters 4731 * completion_done - Test to see if a completion has any waiters
4733 * @x: completion structure 4732 * @x: completion structure
4734 * 4733 *
4735 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4734 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4736 * 1 if there are no waiters. 4735 * 1 if there are no waiters.
4737 * 4736 *
4738 */ 4737 */
4739 bool completion_done(struct completion *x) 4738 bool completion_done(struct completion *x)
4740 { 4739 {
4741 unsigned long flags; 4740 unsigned long flags;
4742 int ret = 1; 4741 int ret = 1;
4743 4742
4744 spin_lock_irqsave(&x->wait.lock, flags); 4743 spin_lock_irqsave(&x->wait.lock, flags);
4745 if (!x->done) 4744 if (!x->done)
4746 ret = 0; 4745 ret = 0;
4747 spin_unlock_irqrestore(&x->wait.lock, flags); 4746 spin_unlock_irqrestore(&x->wait.lock, flags);
4748 return ret; 4747 return ret;
4749 } 4748 }
4750 EXPORT_SYMBOL(completion_done); 4749 EXPORT_SYMBOL(completion_done);
4751 4750
4752 static long __sched 4751 static long __sched
4753 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4752 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4754 { 4753 {
4755 unsigned long flags; 4754 unsigned long flags;
4756 wait_queue_t wait; 4755 wait_queue_t wait;
4757 4756
4758 init_waitqueue_entry(&wait, current); 4757 init_waitqueue_entry(&wait, current);
4759 4758
4760 __set_current_state(state); 4759 __set_current_state(state);
4761 4760
4762 spin_lock_irqsave(&q->lock, flags); 4761 spin_lock_irqsave(&q->lock, flags);
4763 __add_wait_queue(q, &wait); 4762 __add_wait_queue(q, &wait);
4764 spin_unlock(&q->lock); 4763 spin_unlock(&q->lock);
4765 timeout = schedule_timeout(timeout); 4764 timeout = schedule_timeout(timeout);
4766 spin_lock_irq(&q->lock); 4765 spin_lock_irq(&q->lock);
4767 __remove_wait_queue(q, &wait); 4766 __remove_wait_queue(q, &wait);
4768 spin_unlock_irqrestore(&q->lock, flags); 4767 spin_unlock_irqrestore(&q->lock, flags);
4769 4768
4770 return timeout; 4769 return timeout;
4771 } 4770 }
4772 4771
4773 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4772 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4774 { 4773 {
4775 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4774 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4776 } 4775 }
4777 EXPORT_SYMBOL(interruptible_sleep_on); 4776 EXPORT_SYMBOL(interruptible_sleep_on);
4778 4777
4779 long __sched 4778 long __sched
4780 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4779 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4781 { 4780 {
4782 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4781 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4783 } 4782 }
4784 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4783 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4785 4784
4786 void __sched sleep_on(wait_queue_head_t *q) 4785 void __sched sleep_on(wait_queue_head_t *q)
4787 { 4786 {
4788 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4787 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4789 } 4788 }
4790 EXPORT_SYMBOL(sleep_on); 4789 EXPORT_SYMBOL(sleep_on);
4791 4790
4792 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4791 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4793 { 4792 {
4794 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4793 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4795 } 4794 }
4796 EXPORT_SYMBOL(sleep_on_timeout); 4795 EXPORT_SYMBOL(sleep_on_timeout);
4797 4796
4798 #ifdef CONFIG_RT_MUTEXES 4797 #ifdef CONFIG_RT_MUTEXES
4799 4798
4800 /* 4799 /*
4801 * rt_mutex_setprio - set the current priority of a task 4800 * rt_mutex_setprio - set the current priority of a task
4802 * @p: task 4801 * @p: task
4803 * @prio: prio value (kernel-internal form) 4802 * @prio: prio value (kernel-internal form)
4804 * 4803 *
4805 * This function changes the 'effective' priority of a task. It does 4804 * This function changes the 'effective' priority of a task. It does
4806 * not touch ->normal_prio like __setscheduler(). 4805 * not touch ->normal_prio like __setscheduler().
4807 * 4806 *
4808 * Used by the rt_mutex code to implement priority inheritance logic. 4807 * Used by the rt_mutex code to implement priority inheritance logic.
4809 */ 4808 */
4810 void rt_mutex_setprio(struct task_struct *p, int prio) 4809 void rt_mutex_setprio(struct task_struct *p, int prio)
4811 { 4810 {
4812 int oldprio, on_rq, running; 4811 int oldprio, on_rq, running;
4813 struct rq *rq; 4812 struct rq *rq;
4814 const struct sched_class *prev_class; 4813 const struct sched_class *prev_class;
4815 4814
4816 BUG_ON(prio < 0 || prio > MAX_PRIO); 4815 BUG_ON(prio < 0 || prio > MAX_PRIO);
4817 4816
4818 rq = __task_rq_lock(p); 4817 rq = __task_rq_lock(p);
4819 4818
4820 trace_sched_pi_setprio(p, prio); 4819 trace_sched_pi_setprio(p, prio);
4821 oldprio = p->prio; 4820 oldprio = p->prio;
4822 prev_class = p->sched_class; 4821 prev_class = p->sched_class;
4823 on_rq = p->on_rq; 4822 on_rq = p->on_rq;
4824 running = task_current(rq, p); 4823 running = task_current(rq, p);
4825 if (on_rq) 4824 if (on_rq)
4826 dequeue_task(rq, p, 0); 4825 dequeue_task(rq, p, 0);
4827 if (running) 4826 if (running)
4828 p->sched_class->put_prev_task(rq, p); 4827 p->sched_class->put_prev_task(rq, p);
4829 4828
4830 if (rt_prio(prio)) 4829 if (rt_prio(prio))
4831 p->sched_class = &rt_sched_class; 4830 p->sched_class = &rt_sched_class;
4832 else 4831 else
4833 p->sched_class = &fair_sched_class; 4832 p->sched_class = &fair_sched_class;
4834 4833
4835 p->prio = prio; 4834 p->prio = prio;
4836 4835
4837 if (running) 4836 if (running)
4838 p->sched_class->set_curr_task(rq); 4837 p->sched_class->set_curr_task(rq);
4839 if (on_rq) 4838 if (on_rq)
4840 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4839 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4841 4840
4842 check_class_changed(rq, p, prev_class, oldprio); 4841 check_class_changed(rq, p, prev_class, oldprio);
4843 __task_rq_unlock(rq); 4842 __task_rq_unlock(rq);
4844 } 4843 }
4845 4844
4846 #endif 4845 #endif
4847 4846
4848 void set_user_nice(struct task_struct *p, long nice) 4847 void set_user_nice(struct task_struct *p, long nice)
4849 { 4848 {
4850 int old_prio, delta, on_rq; 4849 int old_prio, delta, on_rq;
4851 unsigned long flags; 4850 unsigned long flags;
4852 struct rq *rq; 4851 struct rq *rq;
4853 4852
4854 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4853 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4855 return; 4854 return;
4856 /* 4855 /*
4857 * We have to be careful, if called from sys_setpriority(), 4856 * We have to be careful, if called from sys_setpriority(),
4858 * the task might be in the middle of scheduling on another CPU. 4857 * the task might be in the middle of scheduling on another CPU.
4859 */ 4858 */
4860 rq = task_rq_lock(p, &flags); 4859 rq = task_rq_lock(p, &flags);
4861 /* 4860 /*
4862 * The RT priorities are set via sched_setscheduler(), but we still 4861 * The RT priorities are set via sched_setscheduler(), but we still
4863 * allow the 'normal' nice value to be set - but as expected 4862 * allow the 'normal' nice value to be set - but as expected
4864 * it won't have any effect on scheduling until the task is 4863 * it won't have any effect on scheduling until the task is
4865 * SCHED_FIFO/SCHED_RR: 4864 * SCHED_FIFO/SCHED_RR:
4866 */ 4865 */
4867 if (task_has_rt_policy(p)) { 4866 if (task_has_rt_policy(p)) {
4868 p->static_prio = NICE_TO_PRIO(nice); 4867 p->static_prio = NICE_TO_PRIO(nice);
4869 goto out_unlock; 4868 goto out_unlock;
4870 } 4869 }
4871 on_rq = p->on_rq; 4870 on_rq = p->on_rq;
4872 if (on_rq) 4871 if (on_rq)
4873 dequeue_task(rq, p, 0); 4872 dequeue_task(rq, p, 0);
4874 4873
4875 p->static_prio = NICE_TO_PRIO(nice); 4874 p->static_prio = NICE_TO_PRIO(nice);
4876 set_load_weight(p); 4875 set_load_weight(p);
4877 old_prio = p->prio; 4876 old_prio = p->prio;
4878 p->prio = effective_prio(p); 4877 p->prio = effective_prio(p);
4879 delta = p->prio - old_prio; 4878 delta = p->prio - old_prio;
4880 4879
4881 if (on_rq) { 4880 if (on_rq) {
4882 enqueue_task(rq, p, 0); 4881 enqueue_task(rq, p, 0);
4883 /* 4882 /*
4884 * If the task increased its priority or is running and 4883 * If the task increased its priority or is running and
4885 * lowered its priority, then reschedule its CPU: 4884 * lowered its priority, then reschedule its CPU:
4886 */ 4885 */
4887 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4886 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4888 resched_task(rq->curr); 4887 resched_task(rq->curr);
4889 } 4888 }
4890 out_unlock: 4889 out_unlock:
4891 task_rq_unlock(rq, p, &flags); 4890 task_rq_unlock(rq, p, &flags);
4892 } 4891 }
4893 EXPORT_SYMBOL(set_user_nice); 4892 EXPORT_SYMBOL(set_user_nice);
4894 4893
4895 /* 4894 /*
4896 * can_nice - check if a task can reduce its nice value 4895 * can_nice - check if a task can reduce its nice value
4897 * @p: task 4896 * @p: task
4898 * @nice: nice value 4897 * @nice: nice value
4899 */ 4898 */
4900 int can_nice(const struct task_struct *p, const int nice) 4899 int can_nice(const struct task_struct *p, const int nice)
4901 { 4900 {
4902 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4901 /* convert nice value [19,-20] to rlimit style value [1,40] */
4903 int nice_rlim = 20 - nice; 4902 int nice_rlim = 20 - nice;
4904 4903
4905 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4904 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4906 capable(CAP_SYS_NICE)); 4905 capable(CAP_SYS_NICE));
4907 } 4906 }
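The [19,-20] to [1,40] mapping used by can_nice() can be checked with ordinary integers: nice 19 maps to 1, nice -20 maps to 40, and a soft RLIMIT_NICE of 25 therefore allows nice values down to -5. A small user-space sketch of that arithmetic, with an assumed limit of 25:

#include <stdio.h>

int main(void)
{
	long rlimit_nice = 25;			/* assumed RLIMIT_NICE soft limit */
	int nice;

	for (nice = -20; nice <= 19; nice++) {
		int nice_rlim = 20 - nice;	/* same mapping as can_nice() */

		if (nice_rlim <= rlimit_nice)
			printf("nice %d allowed (rlimit style %d)\n",
			       nice, nice_rlim);
	}
	return 0;	/* prints nice -5 through 19 */
}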
4908 4907
4909 #ifdef __ARCH_WANT_SYS_NICE 4908 #ifdef __ARCH_WANT_SYS_NICE
4910 4909
4911 /* 4910 /*
4912 * sys_nice - change the priority of the current process. 4911 * sys_nice - change the priority of the current process.
4913 * @increment: priority increment 4912 * @increment: priority increment
4914 * 4913 *
4915 * sys_setpriority is a more generic, but much slower function that 4914 * sys_setpriority is a more generic, but much slower function that
4916 * does similar things. 4915 * does similar things.
4917 */ 4916 */
4918 SYSCALL_DEFINE1(nice, int, increment) 4917 SYSCALL_DEFINE1(nice, int, increment)
4919 { 4918 {
4920 long nice, retval; 4919 long nice, retval;
4921 4920
4922 /* 4921 /*
4923 * Setpriority might change our priority at the same moment. 4922 * Setpriority might change our priority at the same moment.
4924 * We don't have to worry. Conceptually one call occurs first 4923 * We don't have to worry. Conceptually one call occurs first
4925 * and we have a single winner. 4924 * and we have a single winner.
4926 */ 4925 */
4927 if (increment < -40) 4926 if (increment < -40)
4928 increment = -40; 4927 increment = -40;
4929 if (increment > 40) 4928 if (increment > 40)
4930 increment = 40; 4929 increment = 40;
4931 4930
4932 nice = TASK_NICE(current) + increment; 4931 nice = TASK_NICE(current) + increment;
4933 if (nice < -20) 4932 if (nice < -20)
4934 nice = -20; 4933 nice = -20;
4935 if (nice > 19) 4934 if (nice > 19)
4936 nice = 19; 4935 nice = 19;
4937 4936
4938 if (increment < 0 && !can_nice(current, nice)) 4937 if (increment < 0 && !can_nice(current, nice))
4939 return -EPERM; 4938 return -EPERM;
4940 4939
4941 retval = security_task_setnice(current, nice); 4940 retval = security_task_setnice(current, nice);
4942 if (retval) 4941 if (retval)
4943 return retval; 4942 return retval;
4944 4943
4945 set_user_nice(current, nice); 4944 set_user_nice(current, nice);
4946 return 0; 4945 return 0;
4947 } 4946 }
4948 4947
4949 #endif 4948 #endif
4950 4949
4951 /** 4950 /**
4952 * task_prio - return the priority value of a given task. 4951 * task_prio - return the priority value of a given task.
4953 * @p: the task in question. 4952 * @p: the task in question.
4954 * 4953 *
4955 * This is the priority value as seen by users in /proc. 4954 * This is the priority value as seen by users in /proc.
4956 * RT tasks are offset by -200. Normal tasks are centered 4955 * RT tasks are offset by -200. Normal tasks are centered
4957 * around 0, with values going from -16 to +15. 4956 * around 0, with values going from -16 to +15.
4958 */ 4957 */
4959 int task_prio(const struct task_struct *p) 4958 int task_prio(const struct task_struct *p)
4960 { 4959 {
4961 return p->prio - MAX_RT_PRIO; 4960 return p->prio - MAX_RT_PRIO;
4962 } 4961 }
4963 4962
4964 /** 4963 /**
4965 * task_nice - return the nice value of a given task. 4964 * task_nice - return the nice value of a given task.
4966 * @p: the task in question. 4965 * @p: the task in question.
4967 */ 4966 */
4968 int task_nice(const struct task_struct *p) 4967 int task_nice(const struct task_struct *p)
4969 { 4968 {
4970 return TASK_NICE(p); 4969 return TASK_NICE(p);
4971 } 4970 }
4972 EXPORT_SYMBOL(task_nice); 4971 EXPORT_SYMBOL(task_nice);
4973 4972
4974 /** 4973 /**
4975 * idle_cpu - is a given cpu idle currently? 4974 * idle_cpu - is a given cpu idle currently?
4976 * @cpu: the processor in question. 4975 * @cpu: the processor in question.
4977 */ 4976 */
4978 int idle_cpu(int cpu) 4977 int idle_cpu(int cpu)
4979 { 4978 {
4980 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4979 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4981 } 4980 }
4982 4981
4983 /** 4982 /**
4984 * idle_task - return the idle task for a given cpu. 4983 * idle_task - return the idle task for a given cpu.
4985 * @cpu: the processor in question. 4984 * @cpu: the processor in question.
4986 */ 4985 */
4987 struct task_struct *idle_task(int cpu) 4986 struct task_struct *idle_task(int cpu)
4988 { 4987 {
4989 return cpu_rq(cpu)->idle; 4988 return cpu_rq(cpu)->idle;
4990 } 4989 }
4991 4990
4992 /** 4991 /**
4993 * find_process_by_pid - find a process with a matching PID value. 4992 * find_process_by_pid - find a process with a matching PID value.
4994 * @pid: the pid in question. 4993 * @pid: the pid in question.
4995 */ 4994 */
4996 static struct task_struct *find_process_by_pid(pid_t pid) 4995 static struct task_struct *find_process_by_pid(pid_t pid)
4997 { 4996 {
4998 return pid ? find_task_by_vpid(pid) : current; 4997 return pid ? find_task_by_vpid(pid) : current;
4999 } 4998 }
5000 4999
5001 /* Actually do priority change: must hold rq lock. */ 5000 /* Actually do priority change: must hold rq lock. */
5002 static void 5001 static void
5003 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5002 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5004 { 5003 {
5005 p->policy = policy; 5004 p->policy = policy;
5006 p->rt_priority = prio; 5005 p->rt_priority = prio;
5007 p->normal_prio = normal_prio(p); 5006 p->normal_prio = normal_prio(p);
5008 /* we are holding p->pi_lock already */ 5007 /* we are holding p->pi_lock already */
5009 p->prio = rt_mutex_getprio(p); 5008 p->prio = rt_mutex_getprio(p);
5010 if (rt_prio(p->prio)) 5009 if (rt_prio(p->prio))
5011 p->sched_class = &rt_sched_class; 5010 p->sched_class = &rt_sched_class;
5012 else 5011 else
5013 p->sched_class = &fair_sched_class; 5012 p->sched_class = &fair_sched_class;
5014 set_load_weight(p); 5013 set_load_weight(p);
5015 } 5014 }
5016 5015
5017 /* 5016 /*
5018 * check whether the target process has a UID that matches the current process's 5017 * check whether the target process has a UID that matches the current process's
5019 */ 5018 */
5020 static bool check_same_owner(struct task_struct *p) 5019 static bool check_same_owner(struct task_struct *p)
5021 { 5020 {
5022 const struct cred *cred = current_cred(), *pcred; 5021 const struct cred *cred = current_cred(), *pcred;
5023 bool match; 5022 bool match;
5024 5023
5025 rcu_read_lock(); 5024 rcu_read_lock();
5026 pcred = __task_cred(p); 5025 pcred = __task_cred(p);
5027 if (cred->user->user_ns == pcred->user->user_ns) 5026 if (cred->user->user_ns == pcred->user->user_ns)
5028 match = (cred->euid == pcred->euid || 5027 match = (cred->euid == pcred->euid ||
5029 cred->euid == pcred->uid); 5028 cred->euid == pcred->uid);
5030 else 5029 else
5031 match = false; 5030 match = false;
5032 rcu_read_unlock(); 5031 rcu_read_unlock();
5033 return match; 5032 return match;
5034 } 5033 }
5035 5034
5036 static int __sched_setscheduler(struct task_struct *p, int policy, 5035 static int __sched_setscheduler(struct task_struct *p, int policy,
5037 const struct sched_param *param, bool user) 5036 const struct sched_param *param, bool user)
5038 { 5037 {
5039 int retval, oldprio, oldpolicy = -1, on_rq, running; 5038 int retval, oldprio, oldpolicy = -1, on_rq, running;
5040 unsigned long flags; 5039 unsigned long flags;
5041 const struct sched_class *prev_class; 5040 const struct sched_class *prev_class;
5042 struct rq *rq; 5041 struct rq *rq;
5043 int reset_on_fork; 5042 int reset_on_fork;
5044 5043
5045 /* may grab non-irq protected spin_locks */ 5044 /* may grab non-irq protected spin_locks */
5046 BUG_ON(in_interrupt()); 5045 BUG_ON(in_interrupt());
5047 recheck: 5046 recheck:
5048 /* double check policy once rq lock held */ 5047 /* double check policy once rq lock held */
5049 if (policy < 0) { 5048 if (policy < 0) {
5050 reset_on_fork = p->sched_reset_on_fork; 5049 reset_on_fork = p->sched_reset_on_fork;
5051 policy = oldpolicy = p->policy; 5050 policy = oldpolicy = p->policy;
5052 } else { 5051 } else {
5053 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 5052 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
5054 policy &= ~SCHED_RESET_ON_FORK; 5053 policy &= ~SCHED_RESET_ON_FORK;
5055 5054
5056 if (policy != SCHED_FIFO && policy != SCHED_RR && 5055 if (policy != SCHED_FIFO && policy != SCHED_RR &&
5057 policy != SCHED_NORMAL && policy != SCHED_BATCH && 5056 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5058 policy != SCHED_IDLE) 5057 policy != SCHED_IDLE)
5059 return -EINVAL; 5058 return -EINVAL;
5060 } 5059 }
5061 5060
5062 /* 5061 /*
5063 * Valid priorities for SCHED_FIFO and SCHED_RR are 5062 * Valid priorities for SCHED_FIFO and SCHED_RR are
5064 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 5063 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5065 * SCHED_BATCH and SCHED_IDLE is 0. 5064 * SCHED_BATCH and SCHED_IDLE is 0.
5066 */ 5065 */
5067 if (param->sched_priority < 0 || 5066 if (param->sched_priority < 0 ||
5068 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 5067 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5069 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 5068 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5070 return -EINVAL; 5069 return -EINVAL;
5071 if (rt_policy(policy) != (param->sched_priority != 0)) 5070 if (rt_policy(policy) != (param->sched_priority != 0))
5072 return -EINVAL; 5071 return -EINVAL;
5073 5072
5074 /* 5073 /*
5075 * Allow unprivileged RT tasks to decrease priority: 5074 * Allow unprivileged RT tasks to decrease priority:
5076 */ 5075 */
5077 if (user && !capable(CAP_SYS_NICE)) { 5076 if (user && !capable(CAP_SYS_NICE)) {
5078 if (rt_policy(policy)) { 5077 if (rt_policy(policy)) {
5079 unsigned long rlim_rtprio = 5078 unsigned long rlim_rtprio =
5080 task_rlimit(p, RLIMIT_RTPRIO); 5079 task_rlimit(p, RLIMIT_RTPRIO);
5081 5080
5082 /* can't set/change the rt policy */ 5081 /* can't set/change the rt policy */
5083 if (policy != p->policy && !rlim_rtprio) 5082 if (policy != p->policy && !rlim_rtprio)
5084 return -EPERM; 5083 return -EPERM;
5085 5084
5086 /* can't increase priority */ 5085 /* can't increase priority */
5087 if (param->sched_priority > p->rt_priority && 5086 if (param->sched_priority > p->rt_priority &&
5088 param->sched_priority > rlim_rtprio) 5087 param->sched_priority > rlim_rtprio)
5089 return -EPERM; 5088 return -EPERM;
5090 } 5089 }
5091 5090
5092 /* 5091 /*
5093 * Treat SCHED_IDLE as nice 20. Only allow a switch to 5092 * Treat SCHED_IDLE as nice 20. Only allow a switch to
5094 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 5093 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
5095 */ 5094 */
5096 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 5095 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
5097 if (!can_nice(p, TASK_NICE(p))) 5096 if (!can_nice(p, TASK_NICE(p)))
5098 return -EPERM; 5097 return -EPERM;
5099 } 5098 }
5100 5099
5101 /* can't change other user's priorities */ 5100 /* can't change other user's priorities */
5102 if (!check_same_owner(p)) 5101 if (!check_same_owner(p))
5103 return -EPERM; 5102 return -EPERM;
5104 5103
5105 /* Normal users shall not reset the sched_reset_on_fork flag */ 5104 /* Normal users shall not reset the sched_reset_on_fork flag */
5106 if (p->sched_reset_on_fork && !reset_on_fork) 5105 if (p->sched_reset_on_fork && !reset_on_fork)
5107 return -EPERM; 5106 return -EPERM;
5108 } 5107 }
5109 5108
5110 if (user) { 5109 if (user) {
5111 retval = security_task_setscheduler(p); 5110 retval = security_task_setscheduler(p);
5112 if (retval) 5111 if (retval)
5113 return retval; 5112 return retval;
5114 } 5113 }
5115 5114
5116 /* 5115 /*
5117 * make sure no PI-waiters arrive (or leave) while we are 5116 * make sure no PI-waiters arrive (or leave) while we are
5118 * changing the priority of the task: 5117 * changing the priority of the task:
5119 * 5118 *
5120 * To be able to change p->policy safely, the appropriate 5119 * To be able to change p->policy safely, the appropriate
5121 * runqueue lock must be held. 5120 * runqueue lock must be held.
5122 */ 5121 */
5123 rq = task_rq_lock(p, &flags); 5122 rq = task_rq_lock(p, &flags);
5124 5123
5125 /* 5124 /*
5126 * Changing the policy of the stop threads is a very bad idea 5125 * Changing the policy of the stop threads is a very bad idea
5127 */ 5126 */
5128 if (p == rq->stop) { 5127 if (p == rq->stop) {
5129 task_rq_unlock(rq, p, &flags); 5128 task_rq_unlock(rq, p, &flags);
5130 return -EINVAL; 5129 return -EINVAL;
5131 } 5130 }
5132 5131
5133 /* 5132 /*
5134 * If not changing anything there's no need to proceed further: 5133 * If not changing anything there's no need to proceed further:
5135 */ 5134 */
5136 if (unlikely(policy == p->policy && (!rt_policy(policy) || 5135 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5137 param->sched_priority == p->rt_priority))) { 5136 param->sched_priority == p->rt_priority))) {
5138 5137
5139 __task_rq_unlock(rq); 5138 __task_rq_unlock(rq);
5140 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5139 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5141 return 0; 5140 return 0;
5142 } 5141 }
5143 5142
5144 #ifdef CONFIG_RT_GROUP_SCHED 5143 #ifdef CONFIG_RT_GROUP_SCHED
5145 if (user) { 5144 if (user) {
5146 /* 5145 /*
5147 * Do not allow realtime tasks into groups that have no runtime 5146 * Do not allow realtime tasks into groups that have no runtime
5148 * assigned. 5147 * assigned.
5149 */ 5148 */
5150 if (rt_bandwidth_enabled() && rt_policy(policy) && 5149 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5151 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5150 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5152 !task_group_is_autogroup(task_group(p))) { 5151 !task_group_is_autogroup(task_group(p))) {
5153 task_rq_unlock(rq, p, &flags); 5152 task_rq_unlock(rq, p, &flags);
5154 return -EPERM; 5153 return -EPERM;
5155 } 5154 }
5156 } 5155 }
5157 #endif 5156 #endif
5158 5157
5159 /* recheck policy now with rq lock held */ 5158 /* recheck policy now with rq lock held */
5160 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5159 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5161 policy = oldpolicy = -1; 5160 policy = oldpolicy = -1;
5162 task_rq_unlock(rq, p, &flags); 5161 task_rq_unlock(rq, p, &flags);
5163 goto recheck; 5162 goto recheck;
5164 } 5163 }
5165 on_rq = p->on_rq; 5164 on_rq = p->on_rq;
5166 running = task_current(rq, p); 5165 running = task_current(rq, p);
5167 if (on_rq) 5166 if (on_rq)
5168 deactivate_task(rq, p, 0); 5167 deactivate_task(rq, p, 0);
5169 if (running) 5168 if (running)
5170 p->sched_class->put_prev_task(rq, p); 5169 p->sched_class->put_prev_task(rq, p);
5171 5170
5172 p->sched_reset_on_fork = reset_on_fork; 5171 p->sched_reset_on_fork = reset_on_fork;
5173 5172
5174 oldprio = p->prio; 5173 oldprio = p->prio;
5175 prev_class = p->sched_class; 5174 prev_class = p->sched_class;
5176 __setscheduler(rq, p, policy, param->sched_priority); 5175 __setscheduler(rq, p, policy, param->sched_priority);
5177 5176
5178 if (running) 5177 if (running)
5179 p->sched_class->set_curr_task(rq); 5178 p->sched_class->set_curr_task(rq);
5180 if (on_rq) 5179 if (on_rq)
5181 activate_task(rq, p, 0); 5180 activate_task(rq, p, 0);
5182 5181
5183 check_class_changed(rq, p, prev_class, oldprio); 5182 check_class_changed(rq, p, prev_class, oldprio);
5184 task_rq_unlock(rq, p, &flags); 5183 task_rq_unlock(rq, p, &flags);
5185 5184
5186 rt_mutex_adjust_pi(p); 5185 rt_mutex_adjust_pi(p);
5187 5186
5188 return 0; 5187 return 0;
5189 } 5188 }
5190 5189
5191 /** 5190 /**
5192 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5191 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5193 * @p: the task in question. 5192 * @p: the task in question.
5194 * @policy: new policy. 5193 * @policy: new policy.
5195 * @param: structure containing the new RT priority. 5194 * @param: structure containing the new RT priority.
5196 * 5195 *
5197 * NOTE that the task may be already dead. 5196 * NOTE that the task may be already dead.
5198 */ 5197 */
5199 int sched_setscheduler(struct task_struct *p, int policy, 5198 int sched_setscheduler(struct task_struct *p, int policy,
5200 const struct sched_param *param) 5199 const struct sched_param *param)
5201 { 5200 {
5202 return __sched_setscheduler(p, policy, param, true); 5201 return __sched_setscheduler(p, policy, param, true);
5203 } 5202 }
5204 EXPORT_SYMBOL_GPL(sched_setscheduler); 5203 EXPORT_SYMBOL_GPL(sched_setscheduler);
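
/*
 * Illustrative kernel-style sketch (hypothetical caller, not from this
 * file): promoting a kernel thread to SCHED_FIFO from inside the
 * kernel. Permission checks against the calling context still apply
 * here; the _nocheck variant below exists for callers that must skip
 * them.
 */
static void make_thread_rt(struct task_struct *tsk)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };

	if (sched_setscheduler(tsk, SCHED_FIFO, &sp))
		printk(KERN_WARNING "could not switch %s to SCHED_FIFO\n",
		       tsk->comm);
}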
5205 5204
5206 /** 5205 /**
5207 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 5206 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5208 * @p: the task in question. 5207 * @p: the task in question.
5209 * @policy: new policy. 5208 * @policy: new policy.
5210 * @param: structure containing the new RT priority. 5209 * @param: structure containing the new RT priority.
5211 * 5210 *
5212 * Just like sched_setscheduler, only don't bother checking if the 5211 * Just like sched_setscheduler, only don't bother checking if the
5213 * current context has permission. For example, this is needed in 5212 * current context has permission. For example, this is needed in
5214 * stop_machine(): we create temporary high priority worker threads, 5213 * stop_machine(): we create temporary high priority worker threads,
5215 * but our caller might not have that capability. 5214 * but our caller might not have that capability.
5216 */ 5215 */
5217 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5216 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5218 const struct sched_param *param) 5217 const struct sched_param *param)
5219 { 5218 {
5220 return __sched_setscheduler(p, policy, param, false); 5219 return __sched_setscheduler(p, policy, param, false);
5221 } 5220 }
5222 5221
5223 static int 5222 static int
5224 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5223 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5225 { 5224 {
5226 struct sched_param lparam; 5225 struct sched_param lparam;
5227 struct task_struct *p; 5226 struct task_struct *p;
5228 int retval; 5227 int retval;
5229 5228
5230 if (!param || pid < 0) 5229 if (!param || pid < 0)
5231 return -EINVAL; 5230 return -EINVAL;
5232 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 5231 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5233 return -EFAULT; 5232 return -EFAULT;
5234 5233
5235 rcu_read_lock(); 5234 rcu_read_lock();
5236 retval = -ESRCH; 5235 retval = -ESRCH;
5237 p = find_process_by_pid(pid); 5236 p = find_process_by_pid(pid);
5238 if (p != NULL) 5237 if (p != NULL)
5239 retval = sched_setscheduler(p, policy, &lparam); 5238 retval = sched_setscheduler(p, policy, &lparam);
5240 rcu_read_unlock(); 5239 rcu_read_unlock();
5241 5240
5242 return retval; 5241 return retval;
5243 } 5242 }
5244 5243
5245 /** 5244 /**
5246 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 5245 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5247 * @pid: the pid in question. 5246 * @pid: the pid in question.
5248 * @policy: new policy. 5247 * @policy: new policy.
5249 * @param: structure containing the new RT priority. 5248 * @param: structure containing the new RT priority.
5250 */ 5249 */
5251 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 5250 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5252 struct sched_param __user *, param) 5251 struct sched_param __user *, param)
5253 { 5252 {
5254 /* negative values for policy are not valid */ 5253 /* negative values for policy are not valid */
5255 if (policy < 0) 5254 if (policy < 0)
5256 return -EINVAL; 5255 return -EINVAL;
5257 5256
5258 return do_sched_setscheduler(pid, policy, param); 5257 return do_sched_setscheduler(pid, policy, param);
5259 } 5258 }
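
/*
 * Illustrative userspace sketch: switching the calling process to
 * SCHED_FIFO through sched_setscheduler(2). RT policies require a
 * priority in 1..sched_get_priority_max(SCHED_FIFO) and, for
 * unprivileged callers, a sufficient RLIMIT_RTPRIO (see the checks in
 * __sched_setscheduler() above).
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };	/* example value */

	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");
	else
		printf("now SCHED_FIFO, priority %d\n", sp.sched_priority);
	return 0;
}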
5260 5259
5261 /** 5260 /**
5262 * sys_sched_setparam - set/change the RT priority of a thread 5261 * sys_sched_setparam - set/change the RT priority of a thread
5263 * @pid: the pid in question. 5262 * @pid: the pid in question.
5264 * @param: structure containing the new RT priority. 5263 * @param: structure containing the new RT priority.
5265 */ 5264 */
5266 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 5265 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5267 { 5266 {
5268 return do_sched_setscheduler(pid, -1, param); 5267 return do_sched_setscheduler(pid, -1, param);
5269 } 5268 }
5270 5269
5271 /** 5270 /**
5272 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5271 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5273 * @pid: the pid in question. 5272 * @pid: the pid in question.
5274 */ 5273 */
5275 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 5274 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5276 { 5275 {
5277 struct task_struct *p; 5276 struct task_struct *p;
5278 int retval; 5277 int retval;
5279 5278
5280 if (pid < 0) 5279 if (pid < 0)
5281 return -EINVAL; 5280 return -EINVAL;
5282 5281
5283 retval = -ESRCH; 5282 retval = -ESRCH;
5284 rcu_read_lock(); 5283 rcu_read_lock();
5285 p = find_process_by_pid(pid); 5284 p = find_process_by_pid(pid);
5286 if (p) { 5285 if (p) {
5287 retval = security_task_getscheduler(p); 5286 retval = security_task_getscheduler(p);
5288 if (!retval) 5287 if (!retval)
5289 retval = p->policy 5288 retval = p->policy
5290 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 5289 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5291 } 5290 }
5292 rcu_read_unlock(); 5291 rcu_read_unlock();
5293 return retval; 5292 return retval;
5294 } 5293 }
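
/*
 * Illustrative userspace sketch: reading the policy back. As shown
 * above, the returned value may have SCHED_RESET_ON_FORK ORed in, so
 * mask that bit off before comparing against SCHED_FIFO and friends.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK	0x40000000
#endif

int main(void)
{
	int ret = sched_getscheduler(0);

	if (ret < 0) {
		perror("sched_getscheduler");
		return 1;
	}
	printf("policy %d, reset-on-fork %d\n",
	       ret & ~SCHED_RESET_ON_FORK, !!(ret & SCHED_RESET_ON_FORK));
	return 0;
}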
5295 5294
5296 /** 5295 /**
5297 * sys_sched_getparam - get the RT priority of a thread 5296 * sys_sched_getparam - get the RT priority of a thread
5298 * @pid: the pid in question. 5297 * @pid: the pid in question.
5299 * @param: structure containing the RT priority. 5298 * @param: structure containing the RT priority.
5300 */ 5299 */
5301 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 5300 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5302 { 5301 {
5303 struct sched_param lp; 5302 struct sched_param lp;
5304 struct task_struct *p; 5303 struct task_struct *p;
5305 int retval; 5304 int retval;
5306 5305
5307 if (!param || pid < 0) 5306 if (!param || pid < 0)
5308 return -EINVAL; 5307 return -EINVAL;
5309 5308
5310 rcu_read_lock(); 5309 rcu_read_lock();
5311 p = find_process_by_pid(pid); 5310 p = find_process_by_pid(pid);
5312 retval = -ESRCH; 5311 retval = -ESRCH;
5313 if (!p) 5312 if (!p)
5314 goto out_unlock; 5313 goto out_unlock;
5315 5314
5316 retval = security_task_getscheduler(p); 5315 retval = security_task_getscheduler(p);
5317 if (retval) 5316 if (retval)
5318 goto out_unlock; 5317 goto out_unlock;
5319 5318
5320 lp.sched_priority = p->rt_priority; 5319 lp.sched_priority = p->rt_priority;
5321 rcu_read_unlock(); 5320 rcu_read_unlock();
5322 5321
5323 /* 5322 /*
5324 * This one might sleep, we cannot do it with a spinlock held ... 5323 * This one might sleep, we cannot do it with a spinlock held ...
5325 */ 5324 */
5326 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 5325 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5327 5326
5328 return retval; 5327 return retval;
5329 5328
5330 out_unlock: 5329 out_unlock:
5331 rcu_read_unlock(); 5330 rcu_read_unlock();
5332 return retval; 5331 return retval;
5333 } 5332 }
5334 5333
5335 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 5334 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5336 { 5335 {
5337 cpumask_var_t cpus_allowed, new_mask; 5336 cpumask_var_t cpus_allowed, new_mask;
5338 struct task_struct *p; 5337 struct task_struct *p;
5339 int retval; 5338 int retval;
5340 5339
5341 get_online_cpus(); 5340 get_online_cpus();
5342 rcu_read_lock(); 5341 rcu_read_lock();
5343 5342
5344 p = find_process_by_pid(pid); 5343 p = find_process_by_pid(pid);
5345 if (!p) { 5344 if (!p) {
5346 rcu_read_unlock(); 5345 rcu_read_unlock();
5347 put_online_cpus(); 5346 put_online_cpus();
5348 return -ESRCH; 5347 return -ESRCH;
5349 } 5348 }
5350 5349
5351 /* Prevent p going away */ 5350 /* Prevent p going away */
5352 get_task_struct(p); 5351 get_task_struct(p);
5353 rcu_read_unlock(); 5352 rcu_read_unlock();
5354 5353
5355 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 5354 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5356 retval = -ENOMEM; 5355 retval = -ENOMEM;
5357 goto out_put_task; 5356 goto out_put_task;
5358 } 5357 }
5359 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 5358 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5360 retval = -ENOMEM; 5359 retval = -ENOMEM;
5361 goto out_free_cpus_allowed; 5360 goto out_free_cpus_allowed;
5362 } 5361 }
5363 retval = -EPERM; 5362 retval = -EPERM;
5364 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 5363 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5365 goto out_unlock; 5364 goto out_unlock;
5366 5365
5367 retval = security_task_setscheduler(p); 5366 retval = security_task_setscheduler(p);
5368 if (retval) 5367 if (retval)
5369 goto out_unlock; 5368 goto out_unlock;
5370 5369
5371 cpuset_cpus_allowed(p, cpus_allowed); 5370 cpuset_cpus_allowed(p, cpus_allowed);
5372 cpumask_and(new_mask, in_mask, cpus_allowed); 5371 cpumask_and(new_mask, in_mask, cpus_allowed);
5373 again: 5372 again:
5374 retval = set_cpus_allowed_ptr(p, new_mask); 5373 retval = set_cpus_allowed_ptr(p, new_mask);
5375 5374
5376 if (!retval) { 5375 if (!retval) {
5377 cpuset_cpus_allowed(p, cpus_allowed); 5376 cpuset_cpus_allowed(p, cpus_allowed);
5378 if (!cpumask_subset(new_mask, cpus_allowed)) { 5377 if (!cpumask_subset(new_mask, cpus_allowed)) {
5379 /* 5378 /*
5380 * We must have raced with a concurrent cpuset 5379 * We must have raced with a concurrent cpuset
5381 * update. Just reset the cpus_allowed to the 5380 * update. Just reset the cpus_allowed to the
5382 * cpuset's cpus_allowed 5381 * cpuset's cpus_allowed
5383 */ 5382 */
5384 cpumask_copy(new_mask, cpus_allowed); 5383 cpumask_copy(new_mask, cpus_allowed);
5385 goto again; 5384 goto again;
5386 } 5385 }
5387 } 5386 }
5388 out_unlock: 5387 out_unlock:
5389 free_cpumask_var(new_mask); 5388 free_cpumask_var(new_mask);
5390 out_free_cpus_allowed: 5389 out_free_cpus_allowed:
5391 free_cpumask_var(cpus_allowed); 5390 free_cpumask_var(cpus_allowed);
5392 out_put_task: 5391 out_put_task:
5393 put_task_struct(p); 5392 put_task_struct(p);
5394 put_online_cpus(); 5393 put_online_cpus();
5395 return retval; 5394 return retval;
5396 } 5395 }
5397 5396
5398 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5397 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5399 struct cpumask *new_mask) 5398 struct cpumask *new_mask)
5400 { 5399 {
5401 if (len < cpumask_size()) 5400 if (len < cpumask_size())
5402 cpumask_clear(new_mask); 5401 cpumask_clear(new_mask);
5403 else if (len > cpumask_size()) 5402 else if (len > cpumask_size())
5404 len = cpumask_size(); 5403 len = cpumask_size();
5405 5404
5406 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5405 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5407 } 5406 }
5408 5407
5409 /** 5408 /**
5410 * sys_sched_setaffinity - set the cpu affinity of a process 5409 * sys_sched_setaffinity - set the cpu affinity of a process
5411 * @pid: pid of the process 5410 * @pid: pid of the process
5412 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5411 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5413 * @user_mask_ptr: user-space pointer to the new cpu mask 5412 * @user_mask_ptr: user-space pointer to the new cpu mask
5414 */ 5413 */
5415 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 5414 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5416 unsigned long __user *, user_mask_ptr) 5415 unsigned long __user *, user_mask_ptr)
5417 { 5416 {
5418 cpumask_var_t new_mask; 5417 cpumask_var_t new_mask;
5419 int retval; 5418 int retval;
5420 5419
5421 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 5420 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5422 return -ENOMEM; 5421 return -ENOMEM;
5423 5422
5424 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 5423 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5425 if (retval == 0) 5424 if (retval == 0)
5426 retval = sched_setaffinity(pid, new_mask); 5425 retval = sched_setaffinity(pid, new_mask);
5427 free_cpumask_var(new_mask); 5426 free_cpumask_var(new_mask);
5428 return retval; 5427 return retval;
5429 } 5428 }
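
/*
 * Illustrative userspace sketch: pinning the calling thread to CPU 0
 * with sched_setaffinity(2). As in sched_setaffinity() above, the
 * kernel intersects the requested mask with the task's cpuset before
 * applying it.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
	return 0;
}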
5430 5429
5431 long sched_getaffinity(pid_t pid, struct cpumask *mask) 5430 long sched_getaffinity(pid_t pid, struct cpumask *mask)
5432 { 5431 {
5433 struct task_struct *p; 5432 struct task_struct *p;
5434 unsigned long flags; 5433 unsigned long flags;
5435 int retval; 5434 int retval;
5436 5435
5437 get_online_cpus(); 5436 get_online_cpus();
5438 rcu_read_lock(); 5437 rcu_read_lock();
5439 5438
5440 retval = -ESRCH; 5439 retval = -ESRCH;
5441 p = find_process_by_pid(pid); 5440 p = find_process_by_pid(pid);
5442 if (!p) 5441 if (!p)
5443 goto out_unlock; 5442 goto out_unlock;
5444 5443
5445 retval = security_task_getscheduler(p); 5444 retval = security_task_getscheduler(p);
5446 if (retval) 5445 if (retval)
5447 goto out_unlock; 5446 goto out_unlock;
5448 5447
5449 raw_spin_lock_irqsave(&p->pi_lock, flags); 5448 raw_spin_lock_irqsave(&p->pi_lock, flags);
5450 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5449 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5451 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5450 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5452 5451
5453 out_unlock: 5452 out_unlock:
5454 rcu_read_unlock(); 5453 rcu_read_unlock();
5455 put_online_cpus(); 5454 put_online_cpus();
5456 5455
5457 return retval; 5456 return retval;
5458 } 5457 }
5459 5458
5460 /** 5459 /**
5461 * sys_sched_getaffinity - get the cpu affinity of a process 5460 * sys_sched_getaffinity - get the cpu affinity of a process
5462 * @pid: pid of the process 5461 * @pid: pid of the process
5463 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5462 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5464 * @user_mask_ptr: user-space pointer to hold the current cpu mask 5463 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5465 */ 5464 */
5466 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 5465 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5467 unsigned long __user *, user_mask_ptr) 5466 unsigned long __user *, user_mask_ptr)
5468 { 5467 {
5469 int ret; 5468 int ret;
5470 cpumask_var_t mask; 5469 cpumask_var_t mask;
5471 5470
5472 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 5471 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5473 return -EINVAL; 5472 return -EINVAL;
5474 if (len & (sizeof(unsigned long)-1)) 5473 if (len & (sizeof(unsigned long)-1))
5475 return -EINVAL; 5474 return -EINVAL;
5476 5475
5477 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 5476 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5478 return -ENOMEM; 5477 return -ENOMEM;
5479 5478
5480 ret = sched_getaffinity(pid, mask); 5479 ret = sched_getaffinity(pid, mask);
5481 if (ret == 0) { 5480 if (ret == 0) {
5482 size_t retlen = min_t(size_t, len, cpumask_size()); 5481 size_t retlen = min_t(size_t, len, cpumask_size());
5483 5482
5484 if (copy_to_user(user_mask_ptr, mask, retlen)) 5483 if (copy_to_user(user_mask_ptr, mask, retlen))
5485 ret = -EFAULT; 5484 ret = -EFAULT;
5486 else 5485 else
5487 ret = retlen; 5486 ret = retlen;
5488 } 5487 }
5489 free_cpumask_var(mask); 5488 free_cpumask_var(mask);
5490 5489
5491 return ret; 5490 return ret;
5492 } 5491 }
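
/*
 * Illustrative userspace sketch: querying the affinity mask. Note that
 * the raw syscall above returns the number of mask bytes it copied,
 * while the glibc wrapper converts that into the usual 0/-1 result.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0, sizeof(set), &set)) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("runnable on %d cpu(s)\n", CPU_COUNT(&set));
	return 0;
}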
5493 5492
5494 /** 5493 /**
5495 * sys_sched_yield - yield the current processor to other threads. 5494 * sys_sched_yield - yield the current processor to other threads.
5496 * 5495 *
5497 * This function yields the current CPU to other tasks. If there are no 5496 * This function yields the current CPU to other tasks. If there are no
5498 * other threads running on this CPU then this function will return. 5497 * other threads running on this CPU then this function will return.
5499 */ 5498 */
5500 SYSCALL_DEFINE0(sched_yield) 5499 SYSCALL_DEFINE0(sched_yield)
5501 { 5500 {
5502 struct rq *rq = this_rq_lock(); 5501 struct rq *rq = this_rq_lock();
5503 5502
5504 schedstat_inc(rq, yld_count); 5503 schedstat_inc(rq, yld_count);
5505 current->sched_class->yield_task(rq); 5504 current->sched_class->yield_task(rq);
5506 5505
5507 /* 5506 /*
5508 * Since we are going to call schedule() anyway, there's 5507 * Since we are going to call schedule() anyway, there's
5509 * no need to preempt or enable interrupts: 5508 * no need to preempt or enable interrupts:
5510 */ 5509 */
5511 __release(rq->lock); 5510 __release(rq->lock);
5512 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5511 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5513 do_raw_spin_unlock(&rq->lock); 5512 do_raw_spin_unlock(&rq->lock);
5514 preempt_enable_no_resched(); 5513 preempt_enable_no_resched();
5515 5514
5516 schedule(); 5515 schedule();
5517 5516
5518 return 0; 5517 return 0;
5519 } 5518 }
5520 5519
5521 static inline int should_resched(void) 5520 static inline int should_resched(void)
5522 { 5521 {
5523 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 5522 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5524 } 5523 }
5525 5524
5526 static void __cond_resched(void) 5525 static void __cond_resched(void)
5527 { 5526 {
5528 add_preempt_count(PREEMPT_ACTIVE); 5527 add_preempt_count(PREEMPT_ACTIVE);
5529 schedule(); 5528 schedule();
5530 sub_preempt_count(PREEMPT_ACTIVE); 5529 sub_preempt_count(PREEMPT_ACTIVE);
5531 } 5530 }
5532 5531
5533 int __sched _cond_resched(void) 5532 int __sched _cond_resched(void)
5534 { 5533 {
5535 if (should_resched()) { 5534 if (should_resched()) {
5536 __cond_resched(); 5535 __cond_resched();
5537 return 1; 5536 return 1;
5538 } 5537 }
5539 return 0; 5538 return 0;
5540 } 5539 }
5541 EXPORT_SYMBOL(_cond_resched); 5540 EXPORT_SYMBOL(_cond_resched);
5542 5541
5543 /* 5542 /*
5544 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 5543 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
5545 * call schedule, and on return reacquire the lock. 5544 * call schedule, and on return reacquire the lock.
5546 * 5545 *
5547 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 5546 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
5548 * operations here to prevent schedule() from being called twice (once via 5547 * operations here to prevent schedule() from being called twice (once via
5549 * spin_unlock(), once by hand). 5548 * spin_unlock(), once by hand).
5550 */ 5549 */
5551 int __cond_resched_lock(spinlock_t *lock) 5550 int __cond_resched_lock(spinlock_t *lock)
5552 { 5551 {
5553 int resched = should_resched(); 5552 int resched = should_resched();
5554 int ret = 0; 5553 int ret = 0;
5555 5554
5556 lockdep_assert_held(lock); 5555 lockdep_assert_held(lock);
5557 5556
5558 if (spin_needbreak(lock) || resched) { 5557 if (spin_needbreak(lock) || resched) {
5559 spin_unlock(lock); 5558 spin_unlock(lock);
5560 if (resched) 5559 if (resched)
5561 __cond_resched(); 5560 __cond_resched();
5562 else 5561 else
5563 cpu_relax(); 5562 cpu_relax();
5564 ret = 1; 5563 ret = 1;
5565 spin_lock(lock); 5564 spin_lock(lock);
5566 } 5565 }
5567 return ret; 5566 return ret;
5568 } 5567 }
5569 EXPORT_SYMBOL(__cond_resched_lock); 5568 EXPORT_SYMBOL(__cond_resched_lock);
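
/*
 * Illustrative kernel-style sketch (my_cache, my_item and
 * process_item() are hypothetical): the usual pattern is to call
 * cond_resched_lock() from a long loop that holds a spinlock, so the
 * lock is only dropped and schedule() only runs when a reschedule or a
 * lock break is actually pending.
 */
struct my_cache {
	spinlock_t lock;
	struct list_head items;
};

struct my_item {
	struct list_head node;
};

static void process_item(struct my_item *item);

static void drain_all_items(struct my_cache *cache)
{
	spin_lock(&cache->lock);
	while (!list_empty(&cache->items)) {
		struct my_item *item;

		item = list_first_entry(&cache->items, struct my_item, node);
		list_del(&item->node);
		process_item(item);
		/* may drop and retake cache->lock if a resched is due */
		cond_resched_lock(&cache->lock);
	}
	spin_unlock(&cache->lock);
}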
5570 5569
5571 int __sched __cond_resched_softirq(void) 5570 int __sched __cond_resched_softirq(void)
5572 { 5571 {
5573 BUG_ON(!in_softirq()); 5572 BUG_ON(!in_softirq());
5574 5573
5575 if (should_resched()) { 5574 if (should_resched()) {
5576 local_bh_enable(); 5575 local_bh_enable();
5577 __cond_resched(); 5576 __cond_resched();
5578 local_bh_disable(); 5577 local_bh_disable();
5579 return 1; 5578 return 1;
5580 } 5579 }
5581 return 0; 5580 return 0;
5582 } 5581 }
5583 EXPORT_SYMBOL(__cond_resched_softirq); 5582 EXPORT_SYMBOL(__cond_resched_softirq);
5584 5583
5585 /** 5584 /**
5586 * yield - yield the current processor to other threads. 5585 * yield - yield the current processor to other threads.
5587 * 5586 *
5588 * This is a shortcut for kernel-space yielding - it marks the 5587 * This is a shortcut for kernel-space yielding - it marks the
5589 * thread runnable and calls sys_sched_yield(). 5588 * thread runnable and calls sys_sched_yield().
5590 */ 5589 */
5591 void __sched yield(void) 5590 void __sched yield(void)
5592 { 5591 {
5593 set_current_state(TASK_RUNNING); 5592 set_current_state(TASK_RUNNING);
5594 sys_sched_yield(); 5593 sys_sched_yield();
5595 } 5594 }
5596 EXPORT_SYMBOL(yield); 5595 EXPORT_SYMBOL(yield);
5597 5596
5598 /** 5597 /**
5599 * yield_to - yield the current processor to another thread in 5598 * yield_to - yield the current processor to another thread in
5600 * your thread group, or accelerate that thread toward the 5599 * your thread group, or accelerate that thread toward the
5601 * processor it's on. 5600 * processor it's on.
5602 * @p: target task 5601 * @p: target task
5603 * @preempt: whether task preemption is allowed or not 5602 * @preempt: whether task preemption is allowed or not
5604 * 5603 *
5605 * It's the caller's job to ensure that the target task struct 5604 * It's the caller's job to ensure that the target task struct
5606 * can't go away on us before we can do any checks. 5605 * can't go away on us before we can do any checks.
5607 * 5606 *
5608 * Returns true if we indeed boosted the target task. 5607 * Returns true if we indeed boosted the target task.
5609 */ 5608 */
5610 bool __sched yield_to(struct task_struct *p, bool preempt) 5609 bool __sched yield_to(struct task_struct *p, bool preempt)
5611 { 5610 {
5612 struct task_struct *curr = current; 5611 struct task_struct *curr = current;
5613 struct rq *rq, *p_rq; 5612 struct rq *rq, *p_rq;
5614 unsigned long flags; 5613 unsigned long flags;
5615 bool yielded = 0; 5614 bool yielded = 0;
5616 5615
5617 local_irq_save(flags); 5616 local_irq_save(flags);
5618 rq = this_rq(); 5617 rq = this_rq();
5619 5618
5620 again: 5619 again:
5621 p_rq = task_rq(p); 5620 p_rq = task_rq(p);
5622 double_rq_lock(rq, p_rq); 5621 double_rq_lock(rq, p_rq);
5623 while (task_rq(p) != p_rq) { 5622 while (task_rq(p) != p_rq) {
5624 double_rq_unlock(rq, p_rq); 5623 double_rq_unlock(rq, p_rq);
5625 goto again; 5624 goto again;
5626 } 5625 }
5627 5626
5628 if (!curr->sched_class->yield_to_task) 5627 if (!curr->sched_class->yield_to_task)
5629 goto out; 5628 goto out;
5630 5629
5631 if (curr->sched_class != p->sched_class) 5630 if (curr->sched_class != p->sched_class)
5632 goto out; 5631 goto out;
5633 5632
5634 if (task_running(p_rq, p) || p->state) 5633 if (task_running(p_rq, p) || p->state)
5635 goto out; 5634 goto out;
5636 5635
5637 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5636 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5638 if (yielded) { 5637 if (yielded) {
5639 schedstat_inc(rq, yld_count); 5638 schedstat_inc(rq, yld_count);
5640 /* 5639 /*
5641 * Make p's CPU reschedule; pick_next_entity takes care of 5640 * Make p's CPU reschedule; pick_next_entity takes care of
5642 * fairness. 5641 * fairness.
5643 */ 5642 */
5644 if (preempt && rq != p_rq) 5643 if (preempt && rq != p_rq)
5645 resched_task(p_rq->curr); 5644 resched_task(p_rq->curr);
5646 } 5645 }
5647 5646
5648 out: 5647 out:
5649 double_rq_unlock(rq, p_rq); 5648 double_rq_unlock(rq, p_rq);
5650 local_irq_restore(flags); 5649 local_irq_restore(flags);
5651 5650
5652 if (yielded) 5651 if (yielded)
5653 schedule(); 5652 schedule();
5654 5653
5655 return yielded; 5654 return yielded;
5656 } 5655 }
5657 EXPORT_SYMBOL_GPL(yield_to); 5656 EXPORT_SYMBOL_GPL(yield_to);
5658 5657
5659 /* 5658 /*
5660 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5659 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5661 * that process accounting knows that this is a task in IO wait state. 5660 * that process accounting knows that this is a task in IO wait state.
5662 */ 5661 */
5663 void __sched io_schedule(void) 5662 void __sched io_schedule(void)
5664 { 5663 {
5665 struct rq *rq = raw_rq(); 5664 struct rq *rq = raw_rq();
5666 5665
5667 delayacct_blkio_start(); 5666 delayacct_blkio_start();
5668 atomic_inc(&rq->nr_iowait); 5667 atomic_inc(&rq->nr_iowait);
5669 blk_flush_plug(current); 5668 blk_flush_plug(current);
5670 current->in_iowait = 1; 5669 current->in_iowait = 1;
5671 schedule(); 5670 schedule();
5672 current->in_iowait = 0; 5671 current->in_iowait = 0;
5673 atomic_dec(&rq->nr_iowait); 5672 atomic_dec(&rq->nr_iowait);
5674 delayacct_blkio_end(); 5673 delayacct_blkio_end();
5675 } 5674 }
5676 EXPORT_SYMBOL(io_schedule); 5675 EXPORT_SYMBOL(io_schedule);
5677 5676
5678 long __sched io_schedule_timeout(long timeout) 5677 long __sched io_schedule_timeout(long timeout)
5679 { 5678 {
5680 struct rq *rq = raw_rq(); 5679 struct rq *rq = raw_rq();
5681 long ret; 5680 long ret;
5682 5681
5683 delayacct_blkio_start(); 5682 delayacct_blkio_start();
5684 atomic_inc(&rq->nr_iowait); 5683 atomic_inc(&rq->nr_iowait);
5685 blk_flush_plug(current); 5684 blk_flush_plug(current);
5686 current->in_iowait = 1; 5685 current->in_iowait = 1;
5687 ret = schedule_timeout(timeout); 5686 ret = schedule_timeout(timeout);
5688 current->in_iowait = 0; 5687 current->in_iowait = 0;
5689 atomic_dec(&rq->nr_iowait); 5688 atomic_dec(&rq->nr_iowait);
5690 delayacct_blkio_end(); 5689 delayacct_blkio_end();
5691 return ret; 5690 return ret;
5692 } 5691 }
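
/*
 * Illustrative kernel-style sketch (hypothetical helper, not from this
 * file): back off for up to 100ms with the sleep accounted as I/O
 * wait. As with schedule_timeout(), the caller sets the task state
 * before calling io_schedule_timeout().
 */
static long io_backoff(void)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	return io_schedule_timeout(msecs_to_jiffies(100));
}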
5693 5692
5694 /** 5693 /**
5695 * sys_sched_get_priority_max - return maximum RT priority. 5694 * sys_sched_get_priority_max - return maximum RT priority.
5696 * @policy: scheduling class. 5695 * @policy: scheduling class.
5697 * 5696 *
5698 * this syscall returns the maximum rt_priority that can be used 5697 * this syscall returns the maximum rt_priority that can be used
5699 * by a given scheduling class. 5698 * by a given scheduling class.
5700 */ 5699 */
5701 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5700 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5702 { 5701 {
5703 int ret = -EINVAL; 5702 int ret = -EINVAL;
5704 5703
5705 switch (policy) { 5704 switch (policy) {
5706 case SCHED_FIFO: 5705 case SCHED_FIFO:
5707 case SCHED_RR: 5706 case SCHED_RR:
5708 ret = MAX_USER_RT_PRIO-1; 5707 ret = MAX_USER_RT_PRIO-1;
5709 break; 5708 break;
5710 case SCHED_NORMAL: 5709 case SCHED_NORMAL:
5711 case SCHED_BATCH: 5710 case SCHED_BATCH:
5712 case SCHED_IDLE: 5711 case SCHED_IDLE:
5713 ret = 0; 5712 ret = 0;
5714 break; 5713 break;
5715 } 5714 }
5716 return ret; 5715 return ret;
5717 } 5716 }
5718 5717
5719 /** 5718 /**
5720 * sys_sched_get_priority_min - return minimum RT priority. 5719 * sys_sched_get_priority_min - return minimum RT priority.
5721 * @policy: scheduling class. 5720 * @policy: scheduling class.
5722 * 5721 *
5723 * this syscall returns the minimum rt_priority that can be used 5722 * this syscall returns the minimum rt_priority that can be used
5724 * by a given scheduling class. 5723 * by a given scheduling class.
5725 */ 5724 */
5726 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5725 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5727 { 5726 {
5728 int ret = -EINVAL; 5727 int ret = -EINVAL;
5729 5728
5730 switch (policy) { 5729 switch (policy) {
5731 case SCHED_FIFO: 5730 case SCHED_FIFO:
5732 case SCHED_RR: 5731 case SCHED_RR:
5733 ret = 1; 5732 ret = 1;
5734 break; 5733 break;
5735 case SCHED_NORMAL: 5734 case SCHED_NORMAL:
5736 case SCHED_BATCH: 5735 case SCHED_BATCH:
5737 case SCHED_IDLE: 5736 case SCHED_IDLE:
5738 ret = 0; 5737 ret = 0;
5739 } 5738 }
5740 return ret; 5739 return ret;
5741 } 5740 }
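
/*
 * Illustrative userspace sketch: the two syscalls above bound the
 * valid sched_priority range for a policy, e.g. 1..99 for SCHED_FIFO
 * and 0..0 for SCHED_NORMAL on a default build.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	return 0;
}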
5742 5741
5743 /** 5742 /**
5744 * sys_sched_rr_get_interval - return the default timeslice of a process. 5743 * sys_sched_rr_get_interval - return the default timeslice of a process.
5745 * @pid: pid of the process. 5744 * @pid: pid of the process.
5746 * @interval: userspace pointer to the timeslice value. 5745 * @interval: userspace pointer to the timeslice value.
5747 * 5746 *
5748 * this syscall writes the default timeslice value of a given process 5747 * this syscall writes the default timeslice value of a given process
5749 * into the user-space timespec buffer. A value of '0' means infinity. 5748 * into the user-space timespec buffer. A value of '0' means infinity.
5750 */ 5749 */
5751 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5750 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5752 struct timespec __user *, interval) 5751 struct timespec __user *, interval)
5753 { 5752 {
5754 struct task_struct *p; 5753 struct task_struct *p;
5755 unsigned int time_slice; 5754 unsigned int time_slice;
5756 unsigned long flags; 5755 unsigned long flags;
5757 struct rq *rq; 5756 struct rq *rq;
5758 int retval; 5757 int retval;
5759 struct timespec t; 5758 struct timespec t;
5760 5759
5761 if (pid < 0) 5760 if (pid < 0)
5762 return -EINVAL; 5761 return -EINVAL;
5763 5762
5764 retval = -ESRCH; 5763 retval = -ESRCH;
5765 rcu_read_lock(); 5764 rcu_read_lock();
5766 p = find_process_by_pid(pid); 5765 p = find_process_by_pid(pid);
5767 if (!p) 5766 if (!p)
5768 goto out_unlock; 5767 goto out_unlock;
5769 5768
5770 retval = security_task_getscheduler(p); 5769 retval = security_task_getscheduler(p);
5771 if (retval) 5770 if (retval)
5772 goto out_unlock; 5771 goto out_unlock;
5773 5772
5774 rq = task_rq_lock(p, &flags); 5773 rq = task_rq_lock(p, &flags);
5775 time_slice = p->sched_class->get_rr_interval(rq, p); 5774 time_slice = p->sched_class->get_rr_interval(rq, p);
5776 task_rq_unlock(rq, p, &flags); 5775 task_rq_unlock(rq, p, &flags);
5777 5776
5778 rcu_read_unlock(); 5777 rcu_read_unlock();
5779 jiffies_to_timespec(time_slice, &t); 5778 jiffies_to_timespec(time_slice, &t);
5780 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5779 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5781 return retval; 5780 return retval;
5782 5781
5783 out_unlock: 5782 out_unlock:
5784 rcu_read_unlock(); 5783 rcu_read_unlock();
5785 return retval; 5784 return retval;
5786 } 5785 }
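
/*
 * Illustrative userspace sketch: reading the timeslice of the calling
 * process. For non-SCHED_RR tasks the reported value comes from the
 * scheduling class' get_rr_interval() hook, as shown above.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts))
		perror("sched_rr_get_interval");
	else
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}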
5787 5786
5788 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5787 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5789 5788
5790 void sched_show_task(struct task_struct *p) 5789 void sched_show_task(struct task_struct *p)
5791 { 5790 {
5792 unsigned long free = 0; 5791 unsigned long free = 0;
5793 unsigned state; 5792 unsigned state;
5794 5793
5795 state = p->state ? __ffs(p->state) + 1 : 0; 5794 state = p->state ? __ffs(p->state) + 1 : 0;
5796 printk(KERN_INFO "%-15.15s %c", p->comm, 5795 printk(KERN_INFO "%-15.15s %c", p->comm,
5797 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5796 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5798 #if BITS_PER_LONG == 32 5797 #if BITS_PER_LONG == 32
5799 if (state == TASK_RUNNING) 5798 if (state == TASK_RUNNING)
5800 printk(KERN_CONT " running "); 5799 printk(KERN_CONT " running ");
5801 else 5800 else
5802 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5801 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5803 #else 5802 #else
5804 if (state == TASK_RUNNING) 5803 if (state == TASK_RUNNING)
5805 printk(KERN_CONT " running task "); 5804 printk(KERN_CONT " running task ");
5806 else 5805 else
5807 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5806 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5808 #endif 5807 #endif
5809 #ifdef CONFIG_DEBUG_STACK_USAGE 5808 #ifdef CONFIG_DEBUG_STACK_USAGE
5810 free = stack_not_used(p); 5809 free = stack_not_used(p);
5811 #endif 5810 #endif
5812 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5811 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5813 task_pid_nr(p), task_pid_nr(p->real_parent), 5812 task_pid_nr(p), task_pid_nr(p->real_parent),
5814 (unsigned long)task_thread_info(p)->flags); 5813 (unsigned long)task_thread_info(p)->flags);
5815 5814
5816 show_stack(p, NULL); 5815 show_stack(p, NULL);
5817 } 5816 }
5818 5817
5819 void show_state_filter(unsigned long state_filter) 5818 void show_state_filter(unsigned long state_filter)
5820 { 5819 {
5821 struct task_struct *g, *p; 5820 struct task_struct *g, *p;
5822 5821
5823 #if BITS_PER_LONG == 32 5822 #if BITS_PER_LONG == 32
5824 printk(KERN_INFO 5823 printk(KERN_INFO
5825 " task PC stack pid father\n"); 5824 " task PC stack pid father\n");
5826 #else 5825 #else
5827 printk(KERN_INFO 5826 printk(KERN_INFO
5828 " task PC stack pid father\n"); 5827 " task PC stack pid father\n");
5829 #endif 5828 #endif
5830 read_lock(&tasklist_lock); 5829 read_lock(&tasklist_lock);
5831 do_each_thread(g, p) { 5830 do_each_thread(g, p) {
5832 /* 5831 /*
5833 * reset the NMI-timeout, listing all tasks on a slow 5832 * reset the NMI-timeout, listing all tasks on a slow
5834 * console might take a lot of time: 5833 * console might take a lot of time:
5835 */ 5834 */
5836 touch_nmi_watchdog(); 5835 touch_nmi_watchdog();
5837 if (!state_filter || (p->state & state_filter)) 5836 if (!state_filter || (p->state & state_filter))
5838 sched_show_task(p); 5837 sched_show_task(p);
5839 } while_each_thread(g, p); 5838 } while_each_thread(g, p);
5840 5839
5841 touch_all_softlockup_watchdogs(); 5840 touch_all_softlockup_watchdogs();
5842 5841
5843 #ifdef CONFIG_SCHED_DEBUG 5842 #ifdef CONFIG_SCHED_DEBUG
5844 sysrq_sched_debug_show(); 5843 sysrq_sched_debug_show();
5845 #endif 5844 #endif
5846 read_unlock(&tasklist_lock); 5845 read_unlock(&tasklist_lock);
5847 /* 5846 /*
5848 * Only show locks if all tasks are dumped: 5847 * Only show locks if all tasks are dumped:
5849 */ 5848 */
5850 if (!state_filter) 5849 if (!state_filter)
5851 debug_show_all_locks(); 5850 debug_show_all_locks();
5852 } 5851 }
5853 5852
5854 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5853 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5855 { 5854 {
5856 idle->sched_class = &idle_sched_class; 5855 idle->sched_class = &idle_sched_class;
5857 } 5856 }
5858 5857
5859 /** 5858 /**
5860 * init_idle - set up an idle thread for a given CPU 5859 * init_idle - set up an idle thread for a given CPU
5861 * @idle: task in question 5860 * @idle: task in question
5862 * @cpu: cpu the idle task belongs to 5861 * @cpu: cpu the idle task belongs to
5863 * 5862 *
5864 * NOTE: this function does not set the idle thread's NEED_RESCHED 5863 * NOTE: this function does not set the idle thread's NEED_RESCHED
5865 * flag, to make booting more robust. 5864 * flag, to make booting more robust.
5866 */ 5865 */
5867 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5866 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5868 { 5867 {
5869 struct rq *rq = cpu_rq(cpu); 5868 struct rq *rq = cpu_rq(cpu);
5870 unsigned long flags; 5869 unsigned long flags;
5871 5870
5872 raw_spin_lock_irqsave(&rq->lock, flags); 5871 raw_spin_lock_irqsave(&rq->lock, flags);
5873 5872
5874 __sched_fork(idle); 5873 __sched_fork(idle);
5875 idle->state = TASK_RUNNING; 5874 idle->state = TASK_RUNNING;
5876 idle->se.exec_start = sched_clock(); 5875 idle->se.exec_start = sched_clock();
5877 5876
5878 do_set_cpus_allowed(idle, cpumask_of(cpu)); 5877 do_set_cpus_allowed(idle, cpumask_of(cpu));
5879 /* 5878 /*
5880 * We're having a chicken and egg problem: even though we are 5879 * We're having a chicken and egg problem: even though we are
5881 * holding rq->lock, the cpu isn't yet set to this cpu so the 5880 * holding rq->lock, the cpu isn't yet set to this cpu so the
5882 * lockdep check in task_group() will fail. 5881 * lockdep check in task_group() will fail.
5883 * 5882 *
5884 * Similar case to sched_fork(). / Alternatively we could 5883 * Similar case to sched_fork(). / Alternatively we could
5885 * use task_rq_lock() here and obtain the other rq->lock. 5884 * use task_rq_lock() here and obtain the other rq->lock.
5886 * 5885 *
5887 * Silence PROVE_RCU 5886 * Silence PROVE_RCU
5888 */ 5887 */
5889 rcu_read_lock(); 5888 rcu_read_lock();
5890 __set_task_cpu(idle, cpu); 5889 __set_task_cpu(idle, cpu);
5891 rcu_read_unlock(); 5890 rcu_read_unlock();
5892 5891
5893 rq->curr = rq->idle = idle; 5892 rq->curr = rq->idle = idle;
5894 #if defined(CONFIG_SMP) 5893 #if defined(CONFIG_SMP)
5895 idle->on_cpu = 1; 5894 idle->on_cpu = 1;
5896 #endif 5895 #endif
5897 raw_spin_unlock_irqrestore(&rq->lock, flags); 5896 raw_spin_unlock_irqrestore(&rq->lock, flags);
5898 5897
5899 /* Set the preempt count _outside_ the spinlocks! */ 5898 /* Set the preempt count _outside_ the spinlocks! */
5900 task_thread_info(idle)->preempt_count = 0; 5899 task_thread_info(idle)->preempt_count = 0;
5901 5900
5902 /* 5901 /*
5903 * The idle tasks have their own, simple scheduling class: 5902 * The idle tasks have their own, simple scheduling class:
5904 */ 5903 */
5905 idle->sched_class = &idle_sched_class; 5904 idle->sched_class = &idle_sched_class;
5906 ftrace_graph_init_idle_task(idle, cpu); 5905 ftrace_graph_init_idle_task(idle, cpu);
5907 } 5906 }
5908 5907
5909 /* 5908 /*
5910 * In a system that switches off the HZ timer nohz_cpu_mask 5909 * In a system that switches off the HZ timer nohz_cpu_mask
5911 * indicates which cpus entered this state. This is used 5910 * indicates which cpus entered this state. This is used
5912 * in the rcu update to wait only for active cpus. For systems 5911 * in the rcu update to wait only for active cpus. For systems
5913 * which do not switch off the HZ timer nohz_cpu_mask should 5912 * which do not switch off the HZ timer nohz_cpu_mask should
5914 * always be CPU_BITS_NONE. 5913 * always be CPU_BITS_NONE.
5915 */ 5914 */
5916 cpumask_var_t nohz_cpu_mask; 5915 cpumask_var_t nohz_cpu_mask;
5917 5916
5918 /* 5917 /*
5919 * Increase the granularity value when there are more CPUs, 5918 * Increase the granularity value when there are more CPUs,
5920 * because with more CPUs the 'effective latency' as visible 5919 * because with more CPUs the 'effective latency' as visible
5921 * to users decreases. But the relationship is not linear, 5920 * to users decreases. But the relationship is not linear,
5922 * so pick a second-best guess by going with the log2 of the 5921 * so pick a second-best guess by going with the log2 of the
5923 * number of CPUs. 5922 * number of CPUs.
5924 * 5923 *
5925 * This idea comes from the SD scheduler of Con Kolivas: 5924 * This idea comes from the SD scheduler of Con Kolivas:
5926 */ 5925 */
5927 static int get_update_sysctl_factor(void) 5926 static int get_update_sysctl_factor(void)
5928 { 5927 {
5929 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5928 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5930 unsigned int factor; 5929 unsigned int factor;
5931 5930
5932 switch (sysctl_sched_tunable_scaling) { 5931 switch (sysctl_sched_tunable_scaling) {
5933 case SCHED_TUNABLESCALING_NONE: 5932 case SCHED_TUNABLESCALING_NONE:
5934 factor = 1; 5933 factor = 1;
5935 break; 5934 break;
5936 case SCHED_TUNABLESCALING_LINEAR: 5935 case SCHED_TUNABLESCALING_LINEAR:
5937 factor = cpus; 5936 factor = cpus;
5938 break; 5937 break;
5939 case SCHED_TUNABLESCALING_LOG: 5938 case SCHED_TUNABLESCALING_LOG:
5940 default: 5939 default:
5941 factor = 1 + ilog2(cpus); 5940 factor = 1 + ilog2(cpus);
5942 break; 5941 break;
5943 } 5942 }
5944 5943
5945 return factor; 5944 return factor;
5946 } 5945 }
5947 5946
5948 static void update_sysctl(void) 5947 static void update_sysctl(void)
5949 { 5948 {
5950 unsigned int factor = get_update_sysctl_factor(); 5949 unsigned int factor = get_update_sysctl_factor();
5951 5950
5952 #define SET_SYSCTL(name) \ 5951 #define SET_SYSCTL(name) \
5953 (sysctl_##name = (factor) * normalized_sysctl_##name) 5952 (sysctl_##name = (factor) * normalized_sysctl_##name)
5954 SET_SYSCTL(sched_min_granularity); 5953 SET_SYSCTL(sched_min_granularity);
5955 SET_SYSCTL(sched_latency); 5954 SET_SYSCTL(sched_latency);
5956 SET_SYSCTL(sched_wakeup_granularity); 5955 SET_SYSCTL(sched_wakeup_granularity);
5957 #undef SET_SYSCTL 5956 #undef SET_SYSCTL
5958 } 5957 }
5959 5958
5960 static inline void sched_init_granularity(void) 5959 static inline void sched_init_granularity(void)
5961 { 5960 {
5962 update_sysctl(); 5961 update_sysctl();
5963 } 5962 }
5964 5963
5965 #ifdef CONFIG_SMP 5964 #ifdef CONFIG_SMP
5966 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 5965 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5967 { 5966 {
5968 if (p->sched_class && p->sched_class->set_cpus_allowed) 5967 if (p->sched_class && p->sched_class->set_cpus_allowed)
5969 p->sched_class->set_cpus_allowed(p, new_mask); 5968 p->sched_class->set_cpus_allowed(p, new_mask);
5970 else { 5969 else {
5971 cpumask_copy(&p->cpus_allowed, new_mask); 5970 cpumask_copy(&p->cpus_allowed, new_mask);
5972 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5971 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5973 } 5972 }
5974 } 5973 }
5975 5974
5976 /* 5975 /*
5977 * This is how migration works: 5976 * This is how migration works:
5978 * 5977 *
5979 * 1) we invoke migration_cpu_stop() on the target CPU using 5978 * 1) we invoke migration_cpu_stop() on the target CPU using
5980 * stop_one_cpu(). 5979 * stop_one_cpu().
5981 * 2) stopper starts to run (implicitly forcing the migrated thread 5980 * 2) stopper starts to run (implicitly forcing the migrated thread
5982 * off the CPU) 5981 * off the CPU)
5983 * 3) it checks whether the migrated task is still in the wrong runqueue. 5982 * 3) it checks whether the migrated task is still in the wrong runqueue.
5984 * 4) if it's in the wrong runqueue then the migration thread removes 5983 * 4) if it's in the wrong runqueue then the migration thread removes
5985 * it and puts it into the right queue. 5984 * it and puts it into the right queue.
5986 * 5) stopper completes and stop_one_cpu() returns and the migration 5985 * 5) stopper completes and stop_one_cpu() returns and the migration
5987 * is done. 5986 * is done.
5988 */ 5987 */
5989 5988
5990 /* 5989 /*
5991 * Change a given task's CPU affinity. Migrate the thread to a 5990 * Change a given task's CPU affinity. Migrate the thread to a
5992 * proper CPU and schedule it away if the CPU it's executing on 5991 * proper CPU and schedule it away if the CPU it's executing on
5993 * is removed from the allowed bitmask. 5992 * is removed from the allowed bitmask.
5994 * 5993 *
5995 * NOTE: the caller must have a valid reference to the task, the 5994 * NOTE: the caller must have a valid reference to the task, the
5996 * task must not exit() & deallocate itself prematurely. The 5995 * task must not exit() & deallocate itself prematurely. The
5997 * call is not atomic; no spinlocks may be held. 5996 * call is not atomic; no spinlocks may be held.
5998 */ 5997 */
5999 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5998 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6000 { 5999 {
6001 unsigned long flags; 6000 unsigned long flags;
6002 struct rq *rq; 6001 struct rq *rq;
6003 unsigned int dest_cpu; 6002 unsigned int dest_cpu;
6004 int ret = 0; 6003 int ret = 0;
6005 6004
6006 rq = task_rq_lock(p, &flags); 6005 rq = task_rq_lock(p, &flags);
6007 6006
6008 if (cpumask_equal(&p->cpus_allowed, new_mask)) 6007 if (cpumask_equal(&p->cpus_allowed, new_mask))
6009 goto out; 6008 goto out;
6010 6009
6011 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6010 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
6012 ret = -EINVAL; 6011 ret = -EINVAL;
6013 goto out; 6012 goto out;
6014 } 6013 }
6015 6014
6016 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { 6015 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
6017 ret = -EINVAL; 6016 ret = -EINVAL;
6018 goto out; 6017 goto out;
6019 } 6018 }
6020 6019
6021 do_set_cpus_allowed(p, new_mask); 6020 do_set_cpus_allowed(p, new_mask);
6022 6021
6023 /* Can the task run on the task's current CPU? If so, we're done */ 6022 /* Can the task run on the task's current CPU? If so, we're done */
6024 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6023 if (cpumask_test_cpu(task_cpu(p), new_mask))
6025 goto out; 6024 goto out;
6026 6025
6027 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6026 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
6028 if (p->on_rq) { 6027 if (p->on_rq) {
6029 struct migration_arg arg = { p, dest_cpu }; 6028 struct migration_arg arg = { p, dest_cpu };
6030 /* Need help from migration thread: drop lock and wait. */ 6029 /* Need help from migration thread: drop lock and wait. */
6031 task_rq_unlock(rq, p, &flags); 6030 task_rq_unlock(rq, p, &flags);
6032 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6031 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
6033 tlb_migrate_finish(p->mm); 6032 tlb_migrate_finish(p->mm);
6034 return 0; 6033 return 0;
6035 } 6034 }
6036 out: 6035 out:
6037 task_rq_unlock(rq, p, &flags); 6036 task_rq_unlock(rq, p, &flags);
6038 6037
6039 return ret; 6038 return ret;
6040 } 6039 }
6041 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 6040 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
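A minimal usage sketch of the interface above (not part of this diff): the helper name and the CPU numbers are purely illustrative, and the call can fail as described in the comments before set_cpus_allowed_ptr().

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/sched.h>

/* Hypothetical helper: restrict @p to CPUs 0 and 1. */
static int pin_task_to_cpus_0_and_1(struct task_struct *p)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(1, mask);

	/* -EINVAL if no CPU in @mask is active, see the checks above. */
	ret = set_cpus_allowed_ptr(p, mask);

	free_cpumask_var(mask);
	return ret;
}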
6042 6041
6043 /* 6042 /*
6044 * Move (not current) task off this cpu, onto dest cpu. We're doing 6043 * Move (not current) task off this cpu, onto dest cpu. We're doing
6045 * this because either it can't run here any more (set_cpus_allowed() 6044 * this because either it can't run here any more (set_cpus_allowed()
6046 * away from this CPU, or CPU going down), or because we're 6045 * away from this CPU, or CPU going down), or because we're
6047 * attempting to rebalance this task on exec (sched_exec). 6046 * attempting to rebalance this task on exec (sched_exec).
6048 * 6047 *
6049 * So we race with normal scheduler movements, but that's OK, as long 6048 * So we race with normal scheduler movements, but that's OK, as long
6050 * as the task is no longer on this CPU. 6049 * as the task is no longer on this CPU.
6051 * 6050 *
6052 * Returns non-zero if task was successfully migrated. 6051 * Returns non-zero if task was successfully migrated.
6053 */ 6052 */
6054 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 6053 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6055 { 6054 {
6056 struct rq *rq_dest, *rq_src; 6055 struct rq *rq_dest, *rq_src;
6057 int ret = 0; 6056 int ret = 0;
6058 6057
6059 if (unlikely(!cpu_active(dest_cpu))) 6058 if (unlikely(!cpu_active(dest_cpu)))
6060 return ret; 6059 return ret;
6061 6060
6062 rq_src = cpu_rq(src_cpu); 6061 rq_src = cpu_rq(src_cpu);
6063 rq_dest = cpu_rq(dest_cpu); 6062 rq_dest = cpu_rq(dest_cpu);
6064 6063
6065 raw_spin_lock(&p->pi_lock); 6064 raw_spin_lock(&p->pi_lock);
6066 double_rq_lock(rq_src, rq_dest); 6065 double_rq_lock(rq_src, rq_dest);
6067 /* Already moved. */ 6066 /* Already moved. */
6068 if (task_cpu(p) != src_cpu) 6067 if (task_cpu(p) != src_cpu)
6069 goto done; 6068 goto done;
6070 /* Affinity changed (again). */ 6069 /* Affinity changed (again). */
6071 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6070 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6072 goto fail; 6071 goto fail;
6073 6072
6074 /* 6073 /*
6075 * If we're not on a rq, the next wake-up will ensure we're 6074 * If we're not on a rq, the next wake-up will ensure we're
6076 * placed properly. 6075 * placed properly.
6077 */ 6076 */
6078 if (p->on_rq) { 6077 if (p->on_rq) {
6079 deactivate_task(rq_src, p, 0); 6078 deactivate_task(rq_src, p, 0);
6080 set_task_cpu(p, dest_cpu); 6079 set_task_cpu(p, dest_cpu);
6081 activate_task(rq_dest, p, 0); 6080 activate_task(rq_dest, p, 0);
6082 check_preempt_curr(rq_dest, p, 0); 6081 check_preempt_curr(rq_dest, p, 0);
6083 } 6082 }
6084 done: 6083 done:
6085 ret = 1; 6084 ret = 1;
6086 fail: 6085 fail:
6087 double_rq_unlock(rq_src, rq_dest); 6086 double_rq_unlock(rq_src, rq_dest);
6088 raw_spin_unlock(&p->pi_lock); 6087 raw_spin_unlock(&p->pi_lock);
6089 return ret; 6088 return ret;
6090 } 6089 }
6091 6090
6092 /* 6091 /*
6093 * migration_cpu_stop - this will be executed by a highprio stopper thread 6092 * migration_cpu_stop - this will be executed by a highprio stopper thread
6094 * and performs thread migration by bumping thread off CPU then 6093 * and performs thread migration by bumping thread off CPU then
6095 * 'pushing' onto another runqueue. 6094 * 'pushing' onto another runqueue.
6096 */ 6095 */
6097 static int migration_cpu_stop(void *data) 6096 static int migration_cpu_stop(void *data)
6098 { 6097 {
6099 struct migration_arg *arg = data; 6098 struct migration_arg *arg = data;
6100 6099
6101 /* 6100 /*
6102 * The original target cpu might have gone down and we might 6101 * The original target cpu might have gone down and we might
6103 * be on another cpu but it doesn't matter. 6102 * be on another cpu but it doesn't matter.
6104 */ 6103 */
6105 local_irq_disable(); 6104 local_irq_disable();
6106 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 6105 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6107 local_irq_enable(); 6106 local_irq_enable();
6108 return 0; 6107 return 0;
6109 } 6108 }
6110 6109
6111 #ifdef CONFIG_HOTPLUG_CPU 6110 #ifdef CONFIG_HOTPLUG_CPU
6112 6111
6113 /* 6112 /*
6114 * Ensures that the idle task is using init_mm right before its cpu goes 6113 * Ensures that the idle task is using init_mm right before its cpu goes
6115 * offline. 6114 * offline.
6116 */ 6115 */
6117 void idle_task_exit(void) 6116 void idle_task_exit(void)
6118 { 6117 {
6119 struct mm_struct *mm = current->active_mm; 6118 struct mm_struct *mm = current->active_mm;
6120 6119
6121 BUG_ON(cpu_online(smp_processor_id())); 6120 BUG_ON(cpu_online(smp_processor_id()));
6122 6121
6123 if (mm != &init_mm) 6122 if (mm != &init_mm)
6124 switch_mm(mm, &init_mm, current); 6123 switch_mm(mm, &init_mm, current);
6125 mmdrop(mm); 6124 mmdrop(mm);
6126 } 6125 }
6127 6126
6128 /* 6127 /*
6129 * While a dead CPU has no uninterruptible tasks queued at this point, 6128 * While a dead CPU has no uninterruptible tasks queued at this point,
6130 * it might still have a nonzero ->nr_uninterruptible counter, because 6129 * it might still have a nonzero ->nr_uninterruptible counter, because
6131 * for performance reasons the counter is not strictly tracking tasks to 6130 * for performance reasons the counter is not strictly tracking tasks to
6132 * their home CPUs. So we just add the counter to another CPU's counter, 6131 * their home CPUs. So we just add the counter to another CPU's counter,
6133 * to keep the global sum constant after CPU-down: 6132 * to keep the global sum constant after CPU-down:
6134 */ 6133 */
6135 static void migrate_nr_uninterruptible(struct rq *rq_src) 6134 static void migrate_nr_uninterruptible(struct rq *rq_src)
6136 { 6135 {
6137 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6136 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
6138 6137
6139 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6138 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6140 rq_src->nr_uninterruptible = 0; 6139 rq_src->nr_uninterruptible = 0;
6141 } 6140 }
6142 6141
6143 /* 6142 /*
6144 * remove the tasks which were accounted by rq from calc_load_tasks. 6143 * remove the tasks which were accounted by rq from calc_load_tasks.
6145 */ 6144 */
6146 static void calc_global_load_remove(struct rq *rq) 6145 static void calc_global_load_remove(struct rq *rq)
6147 { 6146 {
6148 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 6147 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6149 rq->calc_load_active = 0; 6148 rq->calc_load_active = 0;
6150 } 6149 }
6151 6150
6152 /* 6151 /*
6153 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6152 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6154 * try_to_wake_up()->select_task_rq(). 6153 * try_to_wake_up()->select_task_rq().
6155 * 6154 *
6156 * Called with rq->lock held even though we're in stop_machine() and 6155 * Called with rq->lock held even though we're in stop_machine() and
6157 * there's no concurrency possible, we hold the required locks anyway 6156 * there's no concurrency possible, we hold the required locks anyway
6158 * because of lock validation efforts. 6157 * because of lock validation efforts.
6159 */ 6158 */
6160 static void migrate_tasks(unsigned int dead_cpu) 6159 static void migrate_tasks(unsigned int dead_cpu)
6161 { 6160 {
6162 struct rq *rq = cpu_rq(dead_cpu); 6161 struct rq *rq = cpu_rq(dead_cpu);
6163 struct task_struct *next, *stop = rq->stop; 6162 struct task_struct *next, *stop = rq->stop;
6164 int dest_cpu; 6163 int dest_cpu;
6165 6164
6166 /* 6165 /*
6167 * Fudge the rq selection such that the below task selection loop 6166 * Fudge the rq selection such that the below task selection loop
6168 * doesn't get stuck on the currently eligible stop task. 6167 * doesn't get stuck on the currently eligible stop task.
6169 * 6168 *
6170 * We're currently inside stop_machine() and the rq is either stuck 6169 * We're currently inside stop_machine() and the rq is either stuck
6171 * in the stop_machine_cpu_stop() loop, or we're executing this code, 6170 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6172 * either way we should never end up calling schedule() until we're 6171 * either way we should never end up calling schedule() until we're
6173 * done here. 6172 * done here.
6174 */ 6173 */
6175 rq->stop = NULL; 6174 rq->stop = NULL;
6176 6175
6177 for ( ; ; ) { 6176 for ( ; ; ) {
6178 /* 6177 /*
6179 * There's this thread running, bail when that's the only 6178 * There's this thread running, bail when that's the only
6180 * remaining thread. 6179 * remaining thread.
6181 */ 6180 */
6182 if (rq->nr_running == 1) 6181 if (rq->nr_running == 1)
6183 break; 6182 break;
6184 6183
6185 next = pick_next_task(rq); 6184 next = pick_next_task(rq);
6186 BUG_ON(!next); 6185 BUG_ON(!next);
6187 next->sched_class->put_prev_task(rq, next); 6186 next->sched_class->put_prev_task(rq, next);
6188 6187
6189 /* Find suitable destination for @next, with force if needed. */ 6188 /* Find suitable destination for @next, with force if needed. */
6190 dest_cpu = select_fallback_rq(dead_cpu, next); 6189 dest_cpu = select_fallback_rq(dead_cpu, next);
6191 raw_spin_unlock(&rq->lock); 6190 raw_spin_unlock(&rq->lock);
6192 6191
6193 __migrate_task(next, dead_cpu, dest_cpu); 6192 __migrate_task(next, dead_cpu, dest_cpu);
6194 6193
6195 raw_spin_lock(&rq->lock); 6194 raw_spin_lock(&rq->lock);
6196 } 6195 }
6197 6196
6198 rq->stop = stop; 6197 rq->stop = stop;
6199 } 6198 }
6200 6199
6201 #endif /* CONFIG_HOTPLUG_CPU */ 6200 #endif /* CONFIG_HOTPLUG_CPU */
6202 6201
6203 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6202 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6204 6203
6205 static struct ctl_table sd_ctl_dir[] = { 6204 static struct ctl_table sd_ctl_dir[] = {
6206 { 6205 {
6207 .procname = "sched_domain", 6206 .procname = "sched_domain",
6208 .mode = 0555, 6207 .mode = 0555,
6209 }, 6208 },
6210 {} 6209 {}
6211 }; 6210 };
6212 6211
6213 static struct ctl_table sd_ctl_root[] = { 6212 static struct ctl_table sd_ctl_root[] = {
6214 { 6213 {
6215 .procname = "kernel", 6214 .procname = "kernel",
6216 .mode = 0555, 6215 .mode = 0555,
6217 .child = sd_ctl_dir, 6216 .child = sd_ctl_dir,
6218 }, 6217 },
6219 {} 6218 {}
6220 }; 6219 };
6221 6220
6222 static struct ctl_table *sd_alloc_ctl_entry(int n) 6221 static struct ctl_table *sd_alloc_ctl_entry(int n)
6223 { 6222 {
6224 struct ctl_table *entry = 6223 struct ctl_table *entry =
6225 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 6224 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6226 6225
6227 return entry; 6226 return entry;
6228 } 6227 }
6229 6228
6230 static void sd_free_ctl_entry(struct ctl_table **tablep) 6229 static void sd_free_ctl_entry(struct ctl_table **tablep)
6231 { 6230 {
6232 struct ctl_table *entry; 6231 struct ctl_table *entry;
6233 6232
6234 /* 6233 /*
6235 * In the intermediate directories, both the child directory and 6234 * In the intermediate directories, both the child directory and
6236 * procname are dynamically allocated and could fail but the mode 6235 * procname are dynamically allocated and could fail but the mode
6237 * will always be set. In the lowest directory the names are 6236 * will always be set. In the lowest directory the names are
6238 * static strings and all have proc handlers. 6237 * static strings and all have proc handlers.
6239 */ 6238 */
6240 for (entry = *tablep; entry->mode; entry++) { 6239 for (entry = *tablep; entry->mode; entry++) {
6241 if (entry->child) 6240 if (entry->child)
6242 sd_free_ctl_entry(&entry->child); 6241 sd_free_ctl_entry(&entry->child);
6243 if (entry->proc_handler == NULL) 6242 if (entry->proc_handler == NULL)
6244 kfree(entry->procname); 6243 kfree(entry->procname);
6245 } 6244 }
6246 6245
6247 kfree(*tablep); 6246 kfree(*tablep);
6248 *tablep = NULL; 6247 *tablep = NULL;
6249 } 6248 }
6250 6249
6251 static void 6250 static void
6252 set_table_entry(struct ctl_table *entry, 6251 set_table_entry(struct ctl_table *entry,
6253 const char *procname, void *data, int maxlen, 6252 const char *procname, void *data, int maxlen,
6254 mode_t mode, proc_handler *proc_handler) 6253 mode_t mode, proc_handler *proc_handler)
6255 { 6254 {
6256 entry->procname = procname; 6255 entry->procname = procname;
6257 entry->data = data; 6256 entry->data = data;
6258 entry->maxlen = maxlen; 6257 entry->maxlen = maxlen;
6259 entry->mode = mode; 6258 entry->mode = mode;
6260 entry->proc_handler = proc_handler; 6259 entry->proc_handler = proc_handler;
6261 } 6260 }
6262 6261
6263 static struct ctl_table * 6262 static struct ctl_table *
6264 sd_alloc_ctl_domain_table(struct sched_domain *sd) 6263 sd_alloc_ctl_domain_table(struct sched_domain *sd)
6265 { 6264 {
6266 struct ctl_table *table = sd_alloc_ctl_entry(13); 6265 struct ctl_table *table = sd_alloc_ctl_entry(13);
6267 6266
6268 if (table == NULL) 6267 if (table == NULL)
6269 return NULL; 6268 return NULL;
6270 6269
6271 set_table_entry(&table[0], "min_interval", &sd->min_interval, 6270 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6272 sizeof(long), 0644, proc_doulongvec_minmax); 6271 sizeof(long), 0644, proc_doulongvec_minmax);
6273 set_table_entry(&table[1], "max_interval", &sd->max_interval, 6272 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6274 sizeof(long), 0644, proc_doulongvec_minmax); 6273 sizeof(long), 0644, proc_doulongvec_minmax);
6275 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 6274 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6276 sizeof(int), 0644, proc_dointvec_minmax); 6275 sizeof(int), 0644, proc_dointvec_minmax);
6277 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 6276 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6278 sizeof(int), 0644, proc_dointvec_minmax); 6277 sizeof(int), 0644, proc_dointvec_minmax);
6279 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 6278 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6280 sizeof(int), 0644, proc_dointvec_minmax); 6279 sizeof(int), 0644, proc_dointvec_minmax);
6281 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 6280 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6282 sizeof(int), 0644, proc_dointvec_minmax); 6281 sizeof(int), 0644, proc_dointvec_minmax);
6283 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 6282 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6284 sizeof(int), 0644, proc_dointvec_minmax); 6283 sizeof(int), 0644, proc_dointvec_minmax);
6285 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 6284 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6286 sizeof(int), 0644, proc_dointvec_minmax); 6285 sizeof(int), 0644, proc_dointvec_minmax);
6287 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 6286 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6288 sizeof(int), 0644, proc_dointvec_minmax); 6287 sizeof(int), 0644, proc_dointvec_minmax);
6289 set_table_entry(&table[9], "cache_nice_tries", 6288 set_table_entry(&table[9], "cache_nice_tries",
6290 &sd->cache_nice_tries, 6289 &sd->cache_nice_tries,
6291 sizeof(int), 0644, proc_dointvec_minmax); 6290 sizeof(int), 0644, proc_dointvec_minmax);
6292 set_table_entry(&table[10], "flags", &sd->flags, 6291 set_table_entry(&table[10], "flags", &sd->flags,
6293 sizeof(int), 0644, proc_dointvec_minmax); 6292 sizeof(int), 0644, proc_dointvec_minmax);
6294 set_table_entry(&table[11], "name", sd->name, 6293 set_table_entry(&table[11], "name", sd->name,
6295 CORENAME_MAX_SIZE, 0444, proc_dostring); 6294 CORENAME_MAX_SIZE, 0444, proc_dostring);
6296 /* &table[12] is terminator */ 6295 /* &table[12] is terminator */
6297 6296
6298 return table; 6297 return table;
6299 } 6298 }
6300 6299
6301 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 6300 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6302 { 6301 {
6303 struct ctl_table *entry, *table; 6302 struct ctl_table *entry, *table;
6304 struct sched_domain *sd; 6303 struct sched_domain *sd;
6305 int domain_num = 0, i; 6304 int domain_num = 0, i;
6306 char buf[32]; 6305 char buf[32];
6307 6306
6308 for_each_domain(cpu, sd) 6307 for_each_domain(cpu, sd)
6309 domain_num++; 6308 domain_num++;
6310 entry = table = sd_alloc_ctl_entry(domain_num + 1); 6309 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6311 if (table == NULL) 6310 if (table == NULL)
6312 return NULL; 6311 return NULL;
6313 6312
6314 i = 0; 6313 i = 0;
6315 for_each_domain(cpu, sd) { 6314 for_each_domain(cpu, sd) {
6316 snprintf(buf, 32, "domain%d", i); 6315 snprintf(buf, 32, "domain%d", i);
6317 entry->procname = kstrdup(buf, GFP_KERNEL); 6316 entry->procname = kstrdup(buf, GFP_KERNEL);
6318 entry->mode = 0555; 6317 entry->mode = 0555;
6319 entry->child = sd_alloc_ctl_domain_table(sd); 6318 entry->child = sd_alloc_ctl_domain_table(sd);
6320 entry++; 6319 entry++;
6321 i++; 6320 i++;
6322 } 6321 }
6323 return table; 6322 return table;
6324 } 6323 }
6325 6324
6326 static struct ctl_table_header *sd_sysctl_header; 6325 static struct ctl_table_header *sd_sysctl_header;
6327 static void register_sched_domain_sysctl(void) 6326 static void register_sched_domain_sysctl(void)
6328 { 6327 {
6329 int i, cpu_num = num_possible_cpus(); 6328 int i, cpu_num = num_possible_cpus();
6330 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 6329 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6331 char buf[32]; 6330 char buf[32];
6332 6331
6333 WARN_ON(sd_ctl_dir[0].child); 6332 WARN_ON(sd_ctl_dir[0].child);
6334 sd_ctl_dir[0].child = entry; 6333 sd_ctl_dir[0].child = entry;
6335 6334
6336 if (entry == NULL) 6335 if (entry == NULL)
6337 return; 6336 return;
6338 6337
6339 for_each_possible_cpu(i) { 6338 for_each_possible_cpu(i) {
6340 snprintf(buf, 32, "cpu%d", i); 6339 snprintf(buf, 32, "cpu%d", i);
6341 entry->procname = kstrdup(buf, GFP_KERNEL); 6340 entry->procname = kstrdup(buf, GFP_KERNEL);
6342 entry->mode = 0555; 6341 entry->mode = 0555;
6343 entry->child = sd_alloc_ctl_cpu_table(i); 6342 entry->child = sd_alloc_ctl_cpu_table(i);
6344 entry++; 6343 entry++;
6345 } 6344 }
6346 6345
6347 WARN_ON(sd_sysctl_header); 6346 WARN_ON(sd_sysctl_header);
6348 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 6347 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6349 } 6348 }
6350 6349
6351 /* may be called multiple times per register */ 6350 /* may be called multiple times per register */
6352 static void unregister_sched_domain_sysctl(void) 6351 static void unregister_sched_domain_sysctl(void)
6353 { 6352 {
6354 if (sd_sysctl_header) 6353 if (sd_sysctl_header)
6355 unregister_sysctl_table(sd_sysctl_header); 6354 unregister_sysctl_table(sd_sysctl_header);
6356 sd_sysctl_header = NULL; 6355 sd_sysctl_header = NULL;
6357 if (sd_ctl_dir[0].child) 6356 if (sd_ctl_dir[0].child)
6358 sd_free_ctl_entry(&sd_ctl_dir[0].child); 6357 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6359 } 6358 }
6360 #else 6359 #else
6361 static void register_sched_domain_sysctl(void) 6360 static void register_sched_domain_sysctl(void)
6362 { 6361 {
6363 } 6362 }
6364 static void unregister_sched_domain_sysctl(void) 6363 static void unregister_sched_domain_sysctl(void)
6365 { 6364 {
6366 } 6365 }
6367 #endif 6366 #endif
6368 6367
6369 static void set_rq_online(struct rq *rq) 6368 static void set_rq_online(struct rq *rq)
6370 { 6369 {
6371 if (!rq->online) { 6370 if (!rq->online) {
6372 const struct sched_class *class; 6371 const struct sched_class *class;
6373 6372
6374 cpumask_set_cpu(rq->cpu, rq->rd->online); 6373 cpumask_set_cpu(rq->cpu, rq->rd->online);
6375 rq->online = 1; 6374 rq->online = 1;
6376 6375
6377 for_each_class(class) { 6376 for_each_class(class) {
6378 if (class->rq_online) 6377 if (class->rq_online)
6379 class->rq_online(rq); 6378 class->rq_online(rq);
6380 } 6379 }
6381 } 6380 }
6382 } 6381 }
6383 6382
6384 static void set_rq_offline(struct rq *rq) 6383 static void set_rq_offline(struct rq *rq)
6385 { 6384 {
6386 if (rq->online) { 6385 if (rq->online) {
6387 const struct sched_class *class; 6386 const struct sched_class *class;
6388 6387
6389 for_each_class(class) { 6388 for_each_class(class) {
6390 if (class->rq_offline) 6389 if (class->rq_offline)
6391 class->rq_offline(rq); 6390 class->rq_offline(rq);
6392 } 6391 }
6393 6392
6394 cpumask_clear_cpu(rq->cpu, rq->rd->online); 6393 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6395 rq->online = 0; 6394 rq->online = 0;
6396 } 6395 }
6397 } 6396 }
6398 6397
6399 /* 6398 /*
6400 * migration_call - callback that gets triggered when a CPU is added. 6399 * migration_call - callback that gets triggered when a CPU is added.
6401 * Here we can start up the necessary migration thread for the new CPU. 6400 * Here we can start up the necessary migration thread for the new CPU.
6402 */ 6401 */
6403 static int __cpuinit 6402 static int __cpuinit
6404 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6403 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6405 { 6404 {
6406 int cpu = (long)hcpu; 6405 int cpu = (long)hcpu;
6407 unsigned long flags; 6406 unsigned long flags;
6408 struct rq *rq = cpu_rq(cpu); 6407 struct rq *rq = cpu_rq(cpu);
6409 6408
6410 switch (action & ~CPU_TASKS_FROZEN) { 6409 switch (action & ~CPU_TASKS_FROZEN) {
6411 6410
6412 case CPU_UP_PREPARE: 6411 case CPU_UP_PREPARE:
6413 rq->calc_load_update = calc_load_update; 6412 rq->calc_load_update = calc_load_update;
6414 break; 6413 break;
6415 6414
6416 case CPU_ONLINE: 6415 case CPU_ONLINE:
6417 /* Update our root-domain */ 6416 /* Update our root-domain */
6418 raw_spin_lock_irqsave(&rq->lock, flags); 6417 raw_spin_lock_irqsave(&rq->lock, flags);
6419 if (rq->rd) { 6418 if (rq->rd) {
6420 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6419 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6421 6420
6422 set_rq_online(rq); 6421 set_rq_online(rq);
6423 } 6422 }
6424 raw_spin_unlock_irqrestore(&rq->lock, flags); 6423 raw_spin_unlock_irqrestore(&rq->lock, flags);
6425 break; 6424 break;
6426 6425
6427 #ifdef CONFIG_HOTPLUG_CPU 6426 #ifdef CONFIG_HOTPLUG_CPU
6428 case CPU_DYING: 6427 case CPU_DYING:
6429 sched_ttwu_pending(); 6428 sched_ttwu_pending();
6430 /* Update our root-domain */ 6429 /* Update our root-domain */
6431 raw_spin_lock_irqsave(&rq->lock, flags); 6430 raw_spin_lock_irqsave(&rq->lock, flags);
6432 if (rq->rd) { 6431 if (rq->rd) {
6433 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6432 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6434 set_rq_offline(rq); 6433 set_rq_offline(rq);
6435 } 6434 }
6436 migrate_tasks(cpu); 6435 migrate_tasks(cpu);
6437 BUG_ON(rq->nr_running != 1); /* the migration thread */ 6436 BUG_ON(rq->nr_running != 1); /* the migration thread */
6438 raw_spin_unlock_irqrestore(&rq->lock, flags); 6437 raw_spin_unlock_irqrestore(&rq->lock, flags);
6439 6438
6440 migrate_nr_uninterruptible(rq); 6439 migrate_nr_uninterruptible(rq);
6441 calc_global_load_remove(rq); 6440 calc_global_load_remove(rq);
6442 break; 6441 break;
6443 #endif 6442 #endif
6444 } 6443 }
6445 6444
6446 update_max_interval(); 6445 update_max_interval();
6447 6446
6448 return NOTIFY_OK; 6447 return NOTIFY_OK;
6449 } 6448 }
6450 6449
6451 /* 6450 /*
6452 * Register at high priority so that task migration (migrate_all_tasks) 6451 * Register at high priority so that task migration (migrate_all_tasks)
6453 * happens before everything else. This has to be lower priority than 6452 * happens before everything else. This has to be lower priority than
6454 * the notifier in the perf_event subsystem, though. 6453 * the notifier in the perf_event subsystem, though.
6455 */ 6454 */
6456 static struct notifier_block __cpuinitdata migration_notifier = { 6455 static struct notifier_block __cpuinitdata migration_notifier = {
6457 .notifier_call = migration_call, 6456 .notifier_call = migration_call,
6458 .priority = CPU_PRI_MIGRATION, 6457 .priority = CPU_PRI_MIGRATION,
6459 }; 6458 };
6460 6459
6461 static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 6460 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6462 unsigned long action, void *hcpu) 6461 unsigned long action, void *hcpu)
6463 { 6462 {
6464 switch (action & ~CPU_TASKS_FROZEN) { 6463 switch (action & ~CPU_TASKS_FROZEN) {
6465 case CPU_ONLINE: 6464 case CPU_ONLINE:
6466 case CPU_DOWN_FAILED: 6465 case CPU_DOWN_FAILED:
6467 set_cpu_active((long)hcpu, true); 6466 set_cpu_active((long)hcpu, true);
6468 return NOTIFY_OK; 6467 return NOTIFY_OK;
6469 default: 6468 default:
6470 return NOTIFY_DONE; 6469 return NOTIFY_DONE;
6471 } 6470 }
6472 } 6471 }
6473 6472
6474 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 6473 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6475 unsigned long action, void *hcpu) 6474 unsigned long action, void *hcpu)
6476 { 6475 {
6477 switch (action & ~CPU_TASKS_FROZEN) { 6476 switch (action & ~CPU_TASKS_FROZEN) {
6478 case CPU_DOWN_PREPARE: 6477 case CPU_DOWN_PREPARE:
6479 set_cpu_active((long)hcpu, false); 6478 set_cpu_active((long)hcpu, false);
6480 return NOTIFY_OK; 6479 return NOTIFY_OK;
6481 default: 6480 default:
6482 return NOTIFY_DONE; 6481 return NOTIFY_DONE;
6483 } 6482 }
6484 } 6483 }
6485 6484
6486 static int __init migration_init(void) 6485 static int __init migration_init(void)
6487 { 6486 {
6488 void *cpu = (void *)(long)smp_processor_id(); 6487 void *cpu = (void *)(long)smp_processor_id();
6489 int err; 6488 int err;
6490 6489
6491 /* Initialize migration for the boot CPU */ 6490 /* Initialize migration for the boot CPU */
6492 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6491 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6493 BUG_ON(err == NOTIFY_BAD); 6492 BUG_ON(err == NOTIFY_BAD);
6494 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6493 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6495 register_cpu_notifier(&migration_notifier); 6494 register_cpu_notifier(&migration_notifier);
6496 6495
6497 /* Register cpu active notifiers */ 6496 /* Register cpu active notifiers */
6498 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 6497 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6499 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 6498 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6500 6499
6501 return 0; 6500 return 0;
6502 } 6501 }
6503 early_initcall(migration_init); 6502 early_initcall(migration_init);
6504 #endif 6503 #endif
6505 6504
6506 #ifdef CONFIG_SMP 6505 #ifdef CONFIG_SMP
6507 6506
6508 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 6507 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6509 6508
6510 #ifdef CONFIG_SCHED_DEBUG 6509 #ifdef CONFIG_SCHED_DEBUG
6511 6510
6512 static __read_mostly int sched_domain_debug_enabled; 6511 static __read_mostly int sched_domain_debug_enabled;
6513 6512
6514 static int __init sched_domain_debug_setup(char *str) 6513 static int __init sched_domain_debug_setup(char *str)
6515 { 6514 {
6516 sched_domain_debug_enabled = 1; 6515 sched_domain_debug_enabled = 1;
6517 6516
6518 return 0; 6517 return 0;
6519 } 6518 }
6520 early_param("sched_debug", sched_domain_debug_setup); 6519 early_param("sched_debug", sched_domain_debug_setup);
6521 6520
6522 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6521 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6523 struct cpumask *groupmask) 6522 struct cpumask *groupmask)
6524 { 6523 {
6525 struct sched_group *group = sd->groups; 6524 struct sched_group *group = sd->groups;
6526 char str[256]; 6525 char str[256];
6527 6526
6528 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6527 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6529 cpumask_clear(groupmask); 6528 cpumask_clear(groupmask);
6530 6529
6531 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6530 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6532 6531
6533 if (!(sd->flags & SD_LOAD_BALANCE)) { 6532 if (!(sd->flags & SD_LOAD_BALANCE)) {
6534 printk("does not load-balance\n"); 6533 printk("does not load-balance\n");
6535 if (sd->parent) 6534 if (sd->parent)
6536 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6535 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6537 " has parent"); 6536 " has parent");
6538 return -1; 6537 return -1;
6539 } 6538 }
6540 6539
6541 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6540 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6542 6541
6543 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6542 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6544 printk(KERN_ERR "ERROR: domain->span does not contain " 6543 printk(KERN_ERR "ERROR: domain->span does not contain "
6545 "CPU%d\n", cpu); 6544 "CPU%d\n", cpu);
6546 } 6545 }
6547 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6546 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6548 printk(KERN_ERR "ERROR: domain->groups does not contain" 6547 printk(KERN_ERR "ERROR: domain->groups does not contain"
6549 " CPU%d\n", cpu); 6548 " CPU%d\n", cpu);
6550 } 6549 }
6551 6550
6552 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6551 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6553 do { 6552 do {
6554 if (!group) { 6553 if (!group) {
6555 printk("\n"); 6554 printk("\n");
6556 printk(KERN_ERR "ERROR: group is NULL\n"); 6555 printk(KERN_ERR "ERROR: group is NULL\n");
6557 break; 6556 break;
6558 } 6557 }
6559 6558
6560 if (!group->cpu_power) { 6559 if (!group->cpu_power) {
6561 printk(KERN_CONT "\n"); 6560 printk(KERN_CONT "\n");
6562 printk(KERN_ERR "ERROR: domain->cpu_power not " 6561 printk(KERN_ERR "ERROR: domain->cpu_power not "
6563 "set\n"); 6562 "set\n");
6564 break; 6563 break;
6565 } 6564 }
6566 6565
6567 if (!cpumask_weight(sched_group_cpus(group))) { 6566 if (!cpumask_weight(sched_group_cpus(group))) {
6568 printk(KERN_CONT "\n"); 6567 printk(KERN_CONT "\n");
6569 printk(KERN_ERR "ERROR: empty group\n"); 6568 printk(KERN_ERR "ERROR: empty group\n");
6570 break; 6569 break;
6571 } 6570 }
6572 6571
6573 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6572 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6574 printk(KERN_CONT "\n"); 6573 printk(KERN_CONT "\n");
6575 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6574 printk(KERN_ERR "ERROR: repeated CPUs\n");
6576 break; 6575 break;
6577 } 6576 }
6578 6577
6579 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6578 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6580 6579
6581 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6580 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6582 6581
6583 printk(KERN_CONT " %s", str); 6582 printk(KERN_CONT " %s", str);
6584 if (group->cpu_power != SCHED_POWER_SCALE) { 6583 if (group->cpu_power != SCHED_POWER_SCALE) {
6585 printk(KERN_CONT " (cpu_power = %d)", 6584 printk(KERN_CONT " (cpu_power = %d)",
6586 group->cpu_power); 6585 group->cpu_power);
6587 } 6586 }
6588 6587
6589 group = group->next; 6588 group = group->next;
6590 } while (group != sd->groups); 6589 } while (group != sd->groups);
6591 printk(KERN_CONT "\n"); 6590 printk(KERN_CONT "\n");
6592 6591
6593 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6592 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6594 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6593 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6595 6594
6596 if (sd->parent && 6595 if (sd->parent &&
6597 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6596 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6598 printk(KERN_ERR "ERROR: parent span is not a superset " 6597 printk(KERN_ERR "ERROR: parent span is not a superset "
6599 "of domain->span\n"); 6598 "of domain->span\n");
6600 return 0; 6599 return 0;
6601 } 6600 }
6602 6601
6603 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6602 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6604 { 6603 {
6605 int level = 0; 6604 int level = 0;
6606 6605
6607 if (!sched_domain_debug_enabled) 6606 if (!sched_domain_debug_enabled)
6608 return; 6607 return;
6609 6608
6610 if (!sd) { 6609 if (!sd) {
6611 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6610 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6612 return; 6611 return;
6613 } 6612 }
6614 6613
6615 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6614 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6616 6615
6617 for (;;) { 6616 for (;;) {
6618 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 6617 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6619 break; 6618 break;
6620 level++; 6619 level++;
6621 sd = sd->parent; 6620 sd = sd->parent;
6622 if (!sd) 6621 if (!sd)
6623 break; 6622 break;
6624 } 6623 }
6625 } 6624 }
6626 #else /* !CONFIG_SCHED_DEBUG */ 6625 #else /* !CONFIG_SCHED_DEBUG */
6627 # define sched_domain_debug(sd, cpu) do { } while (0) 6626 # define sched_domain_debug(sd, cpu) do { } while (0)
6628 #endif /* CONFIG_SCHED_DEBUG */ 6627 #endif /* CONFIG_SCHED_DEBUG */
6629 6628
6630 static int sd_degenerate(struct sched_domain *sd) 6629 static int sd_degenerate(struct sched_domain *sd)
6631 { 6630 {
6632 if (cpumask_weight(sched_domain_span(sd)) == 1) 6631 if (cpumask_weight(sched_domain_span(sd)) == 1)
6633 return 1; 6632 return 1;
6634 6633
6635 /* Following flags need at least 2 groups */ 6634 /* Following flags need at least 2 groups */
6636 if (sd->flags & (SD_LOAD_BALANCE | 6635 if (sd->flags & (SD_LOAD_BALANCE |
6637 SD_BALANCE_NEWIDLE | 6636 SD_BALANCE_NEWIDLE |
6638 SD_BALANCE_FORK | 6637 SD_BALANCE_FORK |
6639 SD_BALANCE_EXEC | 6638 SD_BALANCE_EXEC |
6640 SD_SHARE_CPUPOWER | 6639 SD_SHARE_CPUPOWER |
6641 SD_SHARE_PKG_RESOURCES)) { 6640 SD_SHARE_PKG_RESOURCES)) {
6642 if (sd->groups != sd->groups->next) 6641 if (sd->groups != sd->groups->next)
6643 return 0; 6642 return 0;
6644 } 6643 }
6645 6644
6646 /* Following flags don't use groups */ 6645 /* Following flags don't use groups */
6647 if (sd->flags & (SD_WAKE_AFFINE)) 6646 if (sd->flags & (SD_WAKE_AFFINE))
6648 return 0; 6647 return 0;
6649 6648
6650 return 1; 6649 return 1;
6651 } 6650 }
6652 6651
6653 static int 6652 static int
6654 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6653 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6655 { 6654 {
6656 unsigned long cflags = sd->flags, pflags = parent->flags; 6655 unsigned long cflags = sd->flags, pflags = parent->flags;
6657 6656
6658 if (sd_degenerate(parent)) 6657 if (sd_degenerate(parent))
6659 return 1; 6658 return 1;
6660 6659
6661 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6660 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6662 return 0; 6661 return 0;
6663 6662
6664 /* Flags needing groups don't count if only 1 group in parent */ 6663 /* Flags needing groups don't count if only 1 group in parent */
6665 if (parent->groups == parent->groups->next) { 6664 if (parent->groups == parent->groups->next) {
6666 pflags &= ~(SD_LOAD_BALANCE | 6665 pflags &= ~(SD_LOAD_BALANCE |
6667 SD_BALANCE_NEWIDLE | 6666 SD_BALANCE_NEWIDLE |
6668 SD_BALANCE_FORK | 6667 SD_BALANCE_FORK |
6669 SD_BALANCE_EXEC | 6668 SD_BALANCE_EXEC |
6670 SD_SHARE_CPUPOWER | 6669 SD_SHARE_CPUPOWER |
6671 SD_SHARE_PKG_RESOURCES); 6670 SD_SHARE_PKG_RESOURCES);
6672 if (nr_node_ids == 1) 6671 if (nr_node_ids == 1)
6673 pflags &= ~SD_SERIALIZE; 6672 pflags &= ~SD_SERIALIZE;
6674 } 6673 }
6675 if (~cflags & pflags) 6674 if (~cflags & pflags)
6676 return 0; 6675 return 0;
6677 6676
6678 return 1; 6677 return 1;
6679 } 6678 }
6680 6679
6681 static void free_rootdomain(struct rcu_head *rcu) 6680 static void free_rootdomain(struct rcu_head *rcu)
6682 { 6681 {
6683 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 6682 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6684 6683
6685 cpupri_cleanup(&rd->cpupri); 6684 cpupri_cleanup(&rd->cpupri);
6686 free_cpumask_var(rd->rto_mask); 6685 free_cpumask_var(rd->rto_mask);
6687 free_cpumask_var(rd->online); 6686 free_cpumask_var(rd->online);
6688 free_cpumask_var(rd->span); 6687 free_cpumask_var(rd->span);
6689 kfree(rd); 6688 kfree(rd);
6690 } 6689 }
6691 6690
6692 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6691 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6693 { 6692 {
6694 struct root_domain *old_rd = NULL; 6693 struct root_domain *old_rd = NULL;
6695 unsigned long flags; 6694 unsigned long flags;
6696 6695
6697 raw_spin_lock_irqsave(&rq->lock, flags); 6696 raw_spin_lock_irqsave(&rq->lock, flags);
6698 6697
6699 if (rq->rd) { 6698 if (rq->rd) {
6700 old_rd = rq->rd; 6699 old_rd = rq->rd;
6701 6700
6702 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6701 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6703 set_rq_offline(rq); 6702 set_rq_offline(rq);
6704 6703
6705 cpumask_clear_cpu(rq->cpu, old_rd->span); 6704 cpumask_clear_cpu(rq->cpu, old_rd->span);
6706 6705
6707 /* 6706 /*
6708 * If we don't want to free the old_rd yet then 6707 * If we don't want to free the old_rd yet then
6709 * set old_rd to NULL to skip the freeing later 6708 * set old_rd to NULL to skip the freeing later
6710 * in this function: 6709 * in this function:
6711 */ 6710 */
6712 if (!atomic_dec_and_test(&old_rd->refcount)) 6711 if (!atomic_dec_and_test(&old_rd->refcount))
6713 old_rd = NULL; 6712 old_rd = NULL;
6714 } 6713 }
6715 6714
6716 atomic_inc(&rd->refcount); 6715 atomic_inc(&rd->refcount);
6717 rq->rd = rd; 6716 rq->rd = rd;
6718 6717
6719 cpumask_set_cpu(rq->cpu, rd->span); 6718 cpumask_set_cpu(rq->cpu, rd->span);
6720 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6719 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6721 set_rq_online(rq); 6720 set_rq_online(rq);
6722 6721
6723 raw_spin_unlock_irqrestore(&rq->lock, flags); 6722 raw_spin_unlock_irqrestore(&rq->lock, flags);
6724 6723
6725 if (old_rd) 6724 if (old_rd)
6726 call_rcu_sched(&old_rd->rcu, free_rootdomain); 6725 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6727 } 6726 }
6728 6727
6729 static int init_rootdomain(struct root_domain *rd) 6728 static int init_rootdomain(struct root_domain *rd)
6730 { 6729 {
6731 memset(rd, 0, sizeof(*rd)); 6730 memset(rd, 0, sizeof(*rd));
6732 6731
6733 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 6732 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6734 goto out; 6733 goto out;
6735 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 6734 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6736 goto free_span; 6735 goto free_span;
6737 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 6736 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6738 goto free_online; 6737 goto free_online;
6739 6738
6740 if (cpupri_init(&rd->cpupri) != 0) 6739 if (cpupri_init(&rd->cpupri) != 0)
6741 goto free_rto_mask; 6740 goto free_rto_mask;
6742 return 0; 6741 return 0;
6743 6742
6744 free_rto_mask: 6743 free_rto_mask:
6745 free_cpumask_var(rd->rto_mask); 6744 free_cpumask_var(rd->rto_mask);
6746 free_online: 6745 free_online:
6747 free_cpumask_var(rd->online); 6746 free_cpumask_var(rd->online);
6748 free_span: 6747 free_span:
6749 free_cpumask_var(rd->span); 6748 free_cpumask_var(rd->span);
6750 out: 6749 out:
6751 return -ENOMEM; 6750 return -ENOMEM;
6752 } 6751 }
6753 6752
6754 static void init_defrootdomain(void) 6753 static void init_defrootdomain(void)
6755 { 6754 {
6756 init_rootdomain(&def_root_domain); 6755 init_rootdomain(&def_root_domain);
6757 6756
6758 atomic_set(&def_root_domain.refcount, 1); 6757 atomic_set(&def_root_domain.refcount, 1);
6759 } 6758 }
6760 6759
6761 static struct root_domain *alloc_rootdomain(void) 6760 static struct root_domain *alloc_rootdomain(void)
6762 { 6761 {
6763 struct root_domain *rd; 6762 struct root_domain *rd;
6764 6763
6765 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6764 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6766 if (!rd) 6765 if (!rd)
6767 return NULL; 6766 return NULL;
6768 6767
6769 if (init_rootdomain(rd) != 0) { 6768 if (init_rootdomain(rd) != 0) {
6770 kfree(rd); 6769 kfree(rd);
6771 return NULL; 6770 return NULL;
6772 } 6771 }
6773 6772
6774 return rd; 6773 return rd;
6775 } 6774 }
6776 6775
6777 static void free_sched_domain(struct rcu_head *rcu) 6776 static void free_sched_domain(struct rcu_head *rcu)
6778 { 6777 {
6779 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 6778 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6780 if (atomic_dec_and_test(&sd->groups->ref)) 6779 if (atomic_dec_and_test(&sd->groups->ref))
6781 kfree(sd->groups); 6780 kfree(sd->groups);
6782 kfree(sd); 6781 kfree(sd);
6783 } 6782 }
6784 6783
6785 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 6784 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6786 { 6785 {
6787 call_rcu(&sd->rcu, free_sched_domain); 6786 call_rcu(&sd->rcu, free_sched_domain);
6788 } 6787 }
6789 6788
6790 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 6789 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6791 { 6790 {
6792 for (; sd; sd = sd->parent) 6791 for (; sd; sd = sd->parent)
6793 destroy_sched_domain(sd, cpu); 6792 destroy_sched_domain(sd, cpu);
6794 } 6793 }
6795 6794
6796 /* 6795 /*
6797 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6796 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6798 * hold the hotplug lock. 6797 * hold the hotplug lock.
6799 */ 6798 */
6800 static void 6799 static void
6801 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6800 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6802 { 6801 {
6803 struct rq *rq = cpu_rq(cpu); 6802 struct rq *rq = cpu_rq(cpu);
6804 struct sched_domain *tmp; 6803 struct sched_domain *tmp;
6805 6804
6806 /* Remove the sched domains which do not contribute to scheduling. */ 6805 /* Remove the sched domains which do not contribute to scheduling. */
6807 for (tmp = sd; tmp; ) { 6806 for (tmp = sd; tmp; ) {
6808 struct sched_domain *parent = tmp->parent; 6807 struct sched_domain *parent = tmp->parent;
6809 if (!parent) 6808 if (!parent)
6810 break; 6809 break;
6811 6810
6812 if (sd_parent_degenerate(tmp, parent)) { 6811 if (sd_parent_degenerate(tmp, parent)) {
6813 tmp->parent = parent->parent; 6812 tmp->parent = parent->parent;
6814 if (parent->parent) 6813 if (parent->parent)
6815 parent->parent->child = tmp; 6814 parent->parent->child = tmp;
6816 destroy_sched_domain(parent, cpu); 6815 destroy_sched_domain(parent, cpu);
6817 } else 6816 } else
6818 tmp = tmp->parent; 6817 tmp = tmp->parent;
6819 } 6818 }
6820 6819
6821 if (sd && sd_degenerate(sd)) { 6820 if (sd && sd_degenerate(sd)) {
6822 tmp = sd; 6821 tmp = sd;
6823 sd = sd->parent; 6822 sd = sd->parent;
6824 destroy_sched_domain(tmp, cpu); 6823 destroy_sched_domain(tmp, cpu);
6825 if (sd) 6824 if (sd)
6826 sd->child = NULL; 6825 sd->child = NULL;
6827 } 6826 }
6828 6827
6829 sched_domain_debug(sd, cpu); 6828 sched_domain_debug(sd, cpu);
6830 6829
6831 rq_attach_root(rq, rd); 6830 rq_attach_root(rq, rd);
6832 tmp = rq->sd; 6831 tmp = rq->sd;
6833 rcu_assign_pointer(rq->sd, sd); 6832 rcu_assign_pointer(rq->sd, sd);
6834 destroy_sched_domains(tmp, cpu); 6833 destroy_sched_domains(tmp, cpu);
6835 } 6834 }
6836 6835
6837 /* cpus with isolated domains */ 6836 /* cpus with isolated domains */
6838 static cpumask_var_t cpu_isolated_map; 6837 static cpumask_var_t cpu_isolated_map;
6839 6838
6840 /* Setup the mask of cpus configured for isolated domains */ 6839 /* Setup the mask of cpus configured for isolated domains */
6841 static int __init isolated_cpu_setup(char *str) 6840 static int __init isolated_cpu_setup(char *str)
6842 { 6841 {
6843 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6842 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6844 cpulist_parse(str, cpu_isolated_map); 6843 cpulist_parse(str, cpu_isolated_map);
6845 return 1; 6844 return 1;
6846 } 6845 }
6847 6846
6848 __setup("isolcpus=", isolated_cpu_setup); 6847 __setup("isolcpus=", isolated_cpu_setup);
6849 6848
6850 #define SD_NODES_PER_DOMAIN 16 6849 #define SD_NODES_PER_DOMAIN 16
6851 6850
6852 #ifdef CONFIG_NUMA 6851 #ifdef CONFIG_NUMA
6853 6852
6854 /** 6853 /**
6855 * find_next_best_node - find the next node to include in a sched_domain 6854 * find_next_best_node - find the next node to include in a sched_domain
6856 * @node: node whose sched_domain we're building 6855 * @node: node whose sched_domain we're building
6857 * @used_nodes: nodes already in the sched_domain 6856 * @used_nodes: nodes already in the sched_domain
6858 * 6857 *
6859 * Find the next node to include in a given scheduling domain. Simply 6858 * Find the next node to include in a given scheduling domain. Simply
6860 * finds the closest node not already in the @used_nodes map. 6859 * finds the closest node not already in the @used_nodes map.
6861 * 6860 *
6862 * Should use nodemask_t. 6861 * Should use nodemask_t.
6863 */ 6862 */
6864 static int find_next_best_node(int node, nodemask_t *used_nodes) 6863 static int find_next_best_node(int node, nodemask_t *used_nodes)
6865 { 6864 {
6866 int i, n, val, min_val, best_node = -1; 6865 int i, n, val, min_val, best_node = -1;
6867 6866
6868 min_val = INT_MAX; 6867 min_val = INT_MAX;
6869 6868
6870 for (i = 0; i < nr_node_ids; i++) { 6869 for (i = 0; i < nr_node_ids; i++) {
6871 /* Start at @node */ 6870 /* Start at @node */
6872 n = (node + i) % nr_node_ids; 6871 n = (node + i) % nr_node_ids;
6873 6872
6874 if (!nr_cpus_node(n)) 6873 if (!nr_cpus_node(n))
6875 continue; 6874 continue;
6876 6875
6877 /* Skip already used nodes */ 6876 /* Skip already used nodes */
6878 if (node_isset(n, *used_nodes)) 6877 if (node_isset(n, *used_nodes))
6879 continue; 6878 continue;
6880 6879
6881 /* Simple min distance search */ 6880 /* Simple min distance search */
6882 val = node_distance(node, n); 6881 val = node_distance(node, n);
6883 6882
6884 if (val < min_val) { 6883 if (val < min_val) {
6885 min_val = val; 6884 min_val = val;
6886 best_node = n; 6885 best_node = n;
6887 } 6886 }
6888 } 6887 }
6889 6888
6890 if (best_node != -1) 6889 if (best_node != -1)
6891 node_set(best_node, *used_nodes); 6890 node_set(best_node, *used_nodes);
6892 return best_node; 6891 return best_node;
6893 } 6892 }
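For illustration only, a self-contained userspace rendering of the same greedy nearest-node search: the 4x4 distance table is made up and the nr_cpus_node() check is dropped.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical symmetric node-distance table, smaller means closer. */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 40 },
	{ 20, 10, 40, 20 },
	{ 20, 40, 10, 20 },
	{ 40, 20, 20, 10 },
};

static int next_best_node(int node, bool used[NR_NODES])
{
	int i, n, min_val = INT_MAX, best = -1;

	for (i = 0; i < NR_NODES; i++) {
		n = (node + i) % NR_NODES;	/* start the scan at @node */
		if (used[n])
			continue;		/* skip already used nodes */
		if (node_distance[node][n] < min_val) {
			min_val = node_distance[node][n];
			best = n;
		}
	}
	if (best != -1)
		used[best] = true;
	return best;
}

int main(void)
{
	bool used[NR_NODES] = { [0] = true };	/* node 0 is the base node */
	int n;

	/* Expected order for node 0: 1, 2, 3 (nearest first, ties by scan order). */
	while ((n = next_best_node(0, used)) != -1)
		printf("next best node: %d\n", n);
	return 0;
}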
6894 6893
6895 /** 6894 /**
6896 * sched_domain_node_span - get a cpumask for a node's sched_domain 6895 * sched_domain_node_span - get a cpumask for a node's sched_domain
6897 * @node: node whose cpumask we're constructing 6896 * @node: node whose cpumask we're constructing
6898 * @span: resulting cpumask 6897 * @span: resulting cpumask
6899 * 6898 *
6900 * Given a node, construct a good cpumask for its sched_domain to span. It 6899 * Given a node, construct a good cpumask for its sched_domain to span. It
6901 * should be one that prevents unnecessary balancing, but also spreads tasks 6900 * should be one that prevents unnecessary balancing, but also spreads tasks
6902 * out optimally. 6901 * out optimally.
6903 */ 6902 */
6904 static void sched_domain_node_span(int node, struct cpumask *span) 6903 static void sched_domain_node_span(int node, struct cpumask *span)
6905 { 6904 {
6906 nodemask_t used_nodes; 6905 nodemask_t used_nodes;
6907 int i; 6906 int i;
6908 6907
6909 cpumask_clear(span); 6908 cpumask_clear(span);
6910 nodes_clear(used_nodes); 6909 nodes_clear(used_nodes);
6911 6910
6912 cpumask_or(span, span, cpumask_of_node(node)); 6911 cpumask_or(span, span, cpumask_of_node(node));
6913 node_set(node, used_nodes); 6912 node_set(node, used_nodes);
6914 6913
6915 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6914 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6916 int next_node = find_next_best_node(node, &used_nodes); 6915 int next_node = find_next_best_node(node, &used_nodes);
6917 if (next_node < 0) 6916 if (next_node < 0)
6918 break; 6917 break;
6919 cpumask_or(span, span, cpumask_of_node(next_node)); 6918 cpumask_or(span, span, cpumask_of_node(next_node));
6920 } 6919 }
6921 } 6920 }
6922 6921
6923 static const struct cpumask *cpu_node_mask(int cpu) 6922 static const struct cpumask *cpu_node_mask(int cpu)
6924 { 6923 {
6925 lockdep_assert_held(&sched_domains_mutex); 6924 lockdep_assert_held(&sched_domains_mutex);
6926 6925
6927 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); 6926 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6928 6927
6929 return sched_domains_tmpmask; 6928 return sched_domains_tmpmask;
6930 } 6929 }
6931 6930
6932 static const struct cpumask *cpu_allnodes_mask(int cpu) 6931 static const struct cpumask *cpu_allnodes_mask(int cpu)
6933 { 6932 {
6934 return cpu_possible_mask; 6933 return cpu_possible_mask;
6935 } 6934 }
6936 #endif /* CONFIG_NUMA */ 6935 #endif /* CONFIG_NUMA */
6937 6936
6938 static const struct cpumask *cpu_cpu_mask(int cpu) 6937 static const struct cpumask *cpu_cpu_mask(int cpu)
6939 { 6938 {
6940 return cpumask_of_node(cpu_to_node(cpu)); 6939 return cpumask_of_node(cpu_to_node(cpu));
6941 } 6940 }
6942 6941
6943 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6942 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6944 6943
6945 struct sd_data { 6944 struct sd_data {
6946 struct sched_domain **__percpu sd; 6945 struct sched_domain **__percpu sd;
6947 struct sched_group **__percpu sg; 6946 struct sched_group **__percpu sg;
6948 }; 6947 };
6949 6948
6950 struct s_data { 6949 struct s_data {
6951 struct sched_domain ** __percpu sd; 6950 struct sched_domain ** __percpu sd;
6952 struct root_domain *rd; 6951 struct root_domain *rd;
6953 }; 6952 };
6954 6953
6955 enum s_alloc { 6954 enum s_alloc {
6956 sa_rootdomain, 6955 sa_rootdomain,
6957 sa_sd, 6956 sa_sd,
6958 sa_sd_storage, 6957 sa_sd_storage,
6959 sa_none, 6958 sa_none,
6960 }; 6959 };
6961 6960
6962 struct sched_domain_topology_level; 6961 struct sched_domain_topology_level;
6963 6962
6964 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 6963 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6965 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 6964 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6966 6965
6967 struct sched_domain_topology_level { 6966 struct sched_domain_topology_level {
6968 sched_domain_init_f init; 6967 sched_domain_init_f init;
6969 sched_domain_mask_f mask; 6968 sched_domain_mask_f mask;
6970 struct sd_data data; 6969 struct sd_data data;
6971 }; 6970 };
6972 6971
6973 /* 6972 /*
6974 * Assumes the sched_domain tree is fully constructed 6973 * Assumes the sched_domain tree is fully constructed
6975 */ 6974 */
6976 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6975 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6977 { 6976 {
6978 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6977 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6979 struct sched_domain *child = sd->child; 6978 struct sched_domain *child = sd->child;
6980 6979
6981 if (child) 6980 if (child)
6982 cpu = cpumask_first(sched_domain_span(child)); 6981 cpu = cpumask_first(sched_domain_span(child));
6983 6982
6984 if (sg) 6983 if (sg)
6985 *sg = *per_cpu_ptr(sdd->sg, cpu); 6984 *sg = *per_cpu_ptr(sdd->sg, cpu);
6986 6985
6987 return cpu; 6986 return cpu;
6988 } 6987 }
6989 6988
6990 /* 6989 /*
6991 * build_sched_groups takes the cpumask we wish to span, and a pointer 6990 * build_sched_groups takes the cpumask we wish to span, and a pointer
6992 * to a function which identifies what group (along with sched group) a CPU 6991 * to a function which identifies what group (along with sched group) a CPU
6993 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 6992 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6994 * (due to the fact that we keep track of groups covered with a struct cpumask). 6993 * (due to the fact that we keep track of groups covered with a struct cpumask).
6995 * 6994 *
6996 * build_sched_groups will build a circular linked list of the groups 6995 * build_sched_groups will build a circular linked list of the groups
6997 * covered by the given span, and will set each group's ->cpumask correctly, 6996 * covered by the given span, and will set each group's ->cpumask correctly,
6998 * and ->cpu_power to 0. 6997 * and ->cpu_power to 0.
6999 */ 6998 */
7000 static void 6999 static void
7001 build_sched_groups(struct sched_domain *sd) 7000 build_sched_groups(struct sched_domain *sd)
7002 { 7001 {
7003 struct sched_group *first = NULL, *last = NULL; 7002 struct sched_group *first = NULL, *last = NULL;
7004 struct sd_data *sdd = sd->private; 7003 struct sd_data *sdd = sd->private;
7005 const struct cpumask *span = sched_domain_span(sd); 7004 const struct cpumask *span = sched_domain_span(sd);
7006 struct cpumask *covered; 7005 struct cpumask *covered;
7007 int i; 7006 int i;
7008 7007
7009 lockdep_assert_held(&sched_domains_mutex); 7008 lockdep_assert_held(&sched_domains_mutex);
7010 covered = sched_domains_tmpmask; 7009 covered = sched_domains_tmpmask;
7011 7010
7012 cpumask_clear(covered); 7011 cpumask_clear(covered);
7013 7012
7014 for_each_cpu(i, span) { 7013 for_each_cpu(i, span) {
7015 struct sched_group *sg; 7014 struct sched_group *sg;
7016 int group = get_group(i, sdd, &sg); 7015 int group = get_group(i, sdd, &sg);
7017 int j; 7016 int j;
7018 7017
7019 if (cpumask_test_cpu(i, covered)) 7018 if (cpumask_test_cpu(i, covered))
7020 continue; 7019 continue;
7021 7020
7022 cpumask_clear(sched_group_cpus(sg)); 7021 cpumask_clear(sched_group_cpus(sg));
7023 sg->cpu_power = 0; 7022 sg->cpu_power = 0;
7024 7023
7025 for_each_cpu(j, span) { 7024 for_each_cpu(j, span) {
7026 if (get_group(j, sdd, NULL) != group) 7025 if (get_group(j, sdd, NULL) != group)
7027 continue; 7026 continue;
7028 7027
7029 cpumask_set_cpu(j, covered); 7028 cpumask_set_cpu(j, covered);
7030 cpumask_set_cpu(j, sched_group_cpus(sg)); 7029 cpumask_set_cpu(j, sched_group_cpus(sg));
7031 } 7030 }
7032 7031
7033 if (!first) 7032 if (!first)
7034 first = sg; 7033 first = sg;
7035 if (last) 7034 if (last)
7036 last->next = sg; 7035 last->next = sg;
7037 last = sg; 7036 last = sg;
7038 } 7037 }
7039 last->next = first; 7038 last->next = first;
7040 } 7039 }
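The circular list built above is typically consumed with a do/while walk, as sched_domain_debug_one() earlier in this file does; a hypothetical helper makes the pattern explicit.

#include <linux/sched.h>

/* Sketch: visit every group exactly once; last->next points back to the first. */
static void for_each_group_in_domain(struct sched_domain *sd,
				     void (*fn)(struct sched_group *sg))
{
	struct sched_group *sg = sd->groups;

	do {
		fn(sg);
		sg = sg->next;
	} while (sg != sd->groups);
}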
7041 7040
7042 /* 7041 /*
7043 * Initialize sched groups cpu_power. 7042 * Initialize sched groups cpu_power.
7044 * 7043 *
7045 * cpu_power indicates the capacity of sched group, which is used while 7044 * cpu_power indicates the capacity of sched group, which is used while
7046 * distributing the load between different sched groups in a sched domain. 7045 * distributing the load between different sched groups in a sched domain.
7047 * Typically cpu_power for all the groups in a sched domain will be the same unless 7046 * Typically cpu_power for all the groups in a sched domain will be the same unless
7048 * there are asymmetries in the topology. If there are asymmetries, group 7047 * there are asymmetries in the topology. If there are asymmetries, group
7049 * having more cpu_power will pick up more load compared to the group having 7048 * having more cpu_power will pick up more load compared to the group having
7050 * less cpu_power. 7049 * less cpu_power.
7051 */ 7050 */
7052 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7051 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7053 { 7052 {
7054 WARN_ON(!sd || !sd->groups); 7053 WARN_ON(!sd || !sd->groups);
7055 7054
7056 if (cpu != group_first_cpu(sd->groups)) 7055 if (cpu != group_first_cpu(sd->groups))
7057 return; 7056 return;
7058 7057
7059 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7058 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7060 7059
7061 update_group_power(sd, cpu); 7060 update_group_power(sd, cpu);
7062 } 7061 }
7063 7062
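To make the proportionality in the comment above concrete (a worked sketch, not code from this file): a group's intended share of the domain load scales with its cpu_power relative to the domain total, so with SCHED_POWER_SCALE == 1024 a group advertising 2048 should attract roughly twice the load of a sibling advertising 1024.

	/* sketch: proportional split of 'load' between groups of a domain */
	static unsigned long group_share(unsigned long load,
					 unsigned long group_power,
					 unsigned long domain_power)
	{
		return load * group_power / domain_power;
	}
	/* group_share(300, 2048, 3072) == 200, group_share(300, 1024, 3072) == 100 */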
7064 /* 7063 /*
7065 * Initializers for schedule domains 7064 * Initializers for schedule domains
7066 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7065 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7067 */ 7066 */
7068 7067
7069 #ifdef CONFIG_SCHED_DEBUG 7068 #ifdef CONFIG_SCHED_DEBUG
7070 # define SD_INIT_NAME(sd, type) sd->name = #type 7069 # define SD_INIT_NAME(sd, type) sd->name = #type
7071 #else 7070 #else
7072 # define SD_INIT_NAME(sd, type) do { } while (0) 7071 # define SD_INIT_NAME(sd, type) do { } while (0)
7073 #endif 7072 #endif
7074 7073
7075 #define SD_INIT_FUNC(type) \ 7074 #define SD_INIT_FUNC(type) \
7076 static noinline struct sched_domain * \ 7075 static noinline struct sched_domain * \
7077 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 7076 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7078 { \ 7077 { \
7079 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 7078 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7080 *sd = SD_##type##_INIT; \ 7079 *sd = SD_##type##_INIT; \
7081 SD_INIT_NAME(sd, type); \ 7080 SD_INIT_NAME(sd, type); \
7082 sd->private = &tl->data; \ 7081 sd->private = &tl->data; \
7083 return sd; \ 7082 return sd; \
7084 } 7083 }
7085 7084
7086 SD_INIT_FUNC(CPU) 7085 SD_INIT_FUNC(CPU)
7087 #ifdef CONFIG_NUMA 7086 #ifdef CONFIG_NUMA
7088 SD_INIT_FUNC(ALLNODES) 7087 SD_INIT_FUNC(ALLNODES)
7089 SD_INIT_FUNC(NODE) 7088 SD_INIT_FUNC(NODE)
7090 #endif 7089 #endif
7091 #ifdef CONFIG_SCHED_SMT 7090 #ifdef CONFIG_SCHED_SMT
7092 SD_INIT_FUNC(SIBLING) 7091 SD_INIT_FUNC(SIBLING)
7093 #endif 7092 #endif
7094 #ifdef CONFIG_SCHED_MC 7093 #ifdef CONFIG_SCHED_MC
7095 SD_INIT_FUNC(MC) 7094 SD_INIT_FUNC(MC)
7096 #endif 7095 #endif
7097 #ifdef CONFIG_SCHED_BOOK 7096 #ifdef CONFIG_SCHED_BOOK
7098 SD_INIT_FUNC(BOOK) 7097 SD_INIT_FUNC(BOOK)
7099 #endif 7098 #endif
7100 7099
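For readability, this is what the SD_INIT_FUNC(CPU) instantiation above expands to (with CONFIG_SCHED_DEBUG enabled, so SD_INIT_NAME() becomes an assignment of the stringified type):

	static noinline struct sched_domain *
	sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
	{
		struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

		*sd = SD_CPU_INIT;		/* per-level initializer template */
		sd->name = "CPU";		/* SD_INIT_NAME(sd, CPU) */
		sd->private = &tl->data;
		return sd;
	}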
7101 static int default_relax_domain_level = -1; 7100 static int default_relax_domain_level = -1;
7102 int sched_domain_level_max; 7101 int sched_domain_level_max;
7103 7102
7104 static int __init setup_relax_domain_level(char *str) 7103 static int __init setup_relax_domain_level(char *str)
7105 { 7104 {
7106 unsigned long val; 7105 unsigned long val;
7107 7106
7108 val = simple_strtoul(str, NULL, 0); 7107 val = simple_strtoul(str, NULL, 0);
7109 if (val < sched_domain_level_max) 7108 if (val < sched_domain_level_max)
7110 default_relax_domain_level = val; 7109 default_relax_domain_level = val;
7111 7110
7112 return 1; 7111 return 1;
7113 } 7112 }
7114 __setup("relax_domain_level=", setup_relax_domain_level); 7113 __setup("relax_domain_level=", setup_relax_domain_level);
7115 7114
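Usage note (an assumption about how the hook above is exercised, not part of this diff): the __setup() registration makes the default settable from the kernel command line, e.g. booting with

	relax_domain_level=1

while per-domain overrides arrive through the sched_domain_attr path handled by set_domain_attribute() below.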
7116 static void set_domain_attribute(struct sched_domain *sd, 7115 static void set_domain_attribute(struct sched_domain *sd,
7117 struct sched_domain_attr *attr) 7116 struct sched_domain_attr *attr)
7118 { 7117 {
7119 int request; 7118 int request;
7120 7119
7121 if (!attr || attr->relax_domain_level < 0) { 7120 if (!attr || attr->relax_domain_level < 0) {
7122 if (default_relax_domain_level < 0) 7121 if (default_relax_domain_level < 0)
7123 return; 7122 return;
7124 else 7123 else
7125 request = default_relax_domain_level; 7124 request = default_relax_domain_level;
7126 } else 7125 } else
7127 request = attr->relax_domain_level; 7126 request = attr->relax_domain_level;
7128 if (request < sd->level) { 7127 if (request < sd->level) {
7129 /* turn off idle balance on this domain */ 7128 /* turn off idle balance on this domain */
7130 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7129 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7131 } else { 7130 } else {
7132 /* turn on idle balance on this domain */ 7131 /* turn on idle balance on this domain */
7133 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7132 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7134 } 7133 }
7135 } 7134 }
7136 7135
7137 static void __sdt_free(const struct cpumask *cpu_map); 7136 static void __sdt_free(const struct cpumask *cpu_map);
7138 static int __sdt_alloc(const struct cpumask *cpu_map); 7137 static int __sdt_alloc(const struct cpumask *cpu_map);
7139 7138
7140 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7139 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7141 const struct cpumask *cpu_map) 7140 const struct cpumask *cpu_map)
7142 { 7141 {
7143 switch (what) { 7142 switch (what) {
7144 case sa_rootdomain: 7143 case sa_rootdomain:
7145 if (!atomic_read(&d->rd->refcount)) 7144 if (!atomic_read(&d->rd->refcount))
7146 free_rootdomain(&d->rd->rcu); /* fall through */ 7145 free_rootdomain(&d->rd->rcu); /* fall through */
7147 case sa_sd: 7146 case sa_sd:
7148 free_percpu(d->sd); /* fall through */ 7147 free_percpu(d->sd); /* fall through */
7149 case sa_sd_storage: 7148 case sa_sd_storage:
7150 __sdt_free(cpu_map); /* fall through */ 7149 __sdt_free(cpu_map); /* fall through */
7151 case sa_none: 7150 case sa_none:
7152 break; 7151 break;
7153 } 7152 }
7154 } 7153 }
7155 7154
7156 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7155 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7157 const struct cpumask *cpu_map) 7156 const struct cpumask *cpu_map)
7158 { 7157 {
7159 memset(d, 0, sizeof(*d)); 7158 memset(d, 0, sizeof(*d));
7160 7159
7161 if (__sdt_alloc(cpu_map)) 7160 if (__sdt_alloc(cpu_map))
7162 return sa_sd_storage; 7161 return sa_sd_storage;
7163 d->sd = alloc_percpu(struct sched_domain *); 7162 d->sd = alloc_percpu(struct sched_domain *);
7164 if (!d->sd) 7163 if (!d->sd)
7165 return sa_sd_storage; 7164 return sa_sd_storage;
7166 d->rd = alloc_rootdomain(); 7165 d->rd = alloc_rootdomain();
7167 if (!d->rd) 7166 if (!d->rd)
7168 return sa_sd; 7167 return sa_sd;
7169 return sa_rootdomain; 7168 return sa_rootdomain;
7170 } 7169 }
7171 7170
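The pair above uses a staged-allocation idiom: __visit_domain_allocation_hell() reports how far the allocations got, and __free_domain_allocs() switches on that stage with deliberate fall-through so teardown releases exactly what was set up, in reverse order. A minimal standalone sketch of the same idiom (hypothetical names, plain C, not kernel code):

	#include <stdlib.h>

	enum stage { st_none, st_a, st_b, st_all };

	struct ctx { void *a, *b; };

	static enum stage ctx_setup(struct ctx *c)
	{
		c->a = malloc(16);
		if (!c->a)
			return st_none;
		c->b = malloc(16);
		if (!c->b)
			return st_a;
		return st_all;
	}

	static void ctx_undo(struct ctx *c, enum stage reached)
	{
		switch (reached) {
		case st_all:
		case st_b:
			free(c->b);	/* fall through */
		case st_a:
			free(c->a);	/* fall through */
		case st_none:
			break;
		}
	}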
7172 /* 7171 /*
7173 * NULL the sd_data elements we've used to build the sched_domain and 7172 * NULL the sd_data elements we've used to build the sched_domain and
7174 * sched_group structure so that the subsequent __free_domain_allocs() 7173 * sched_group structure so that the subsequent __free_domain_allocs()
7175 * will not free the data we're using. 7174 * will not free the data we're using.
7176 */ 7175 */
7177 static void claim_allocations(int cpu, struct sched_domain *sd) 7176 static void claim_allocations(int cpu, struct sched_domain *sd)
7178 { 7177 {
7179 struct sd_data *sdd = sd->private; 7178 struct sd_data *sdd = sd->private;
7180 struct sched_group *sg = sd->groups; 7179 struct sched_group *sg = sd->groups;
7181 7180
7182 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 7181 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7183 *per_cpu_ptr(sdd->sd, cpu) = NULL; 7182 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7184 7183
7185 if (cpu == cpumask_first(sched_group_cpus(sg))) { 7184 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7186 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); 7185 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7187 *per_cpu_ptr(sdd->sg, cpu) = NULL; 7186 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7188 } 7187 }
7189 } 7188 }
7190 7189
7191 #ifdef CONFIG_SCHED_SMT 7190 #ifdef CONFIG_SCHED_SMT
7192 static const struct cpumask *cpu_smt_mask(int cpu) 7191 static const struct cpumask *cpu_smt_mask(int cpu)
7193 { 7192 {
7194 return topology_thread_cpumask(cpu); 7193 return topology_thread_cpumask(cpu);
7195 } 7194 }
7196 #endif 7195 #endif
7197 7196
7198 /* 7197 /*
7199 * Topology list, bottom-up. 7198 * Topology list, bottom-up.
7200 */ 7199 */
7201 static struct sched_domain_topology_level default_topology[] = { 7200 static struct sched_domain_topology_level default_topology[] = {
7202 #ifdef CONFIG_SCHED_SMT 7201 #ifdef CONFIG_SCHED_SMT
7203 { sd_init_SIBLING, cpu_smt_mask, }, 7202 { sd_init_SIBLING, cpu_smt_mask, },
7204 #endif 7203 #endif
7205 #ifdef CONFIG_SCHED_MC 7204 #ifdef CONFIG_SCHED_MC
7206 { sd_init_MC, cpu_coregroup_mask, }, 7205 { sd_init_MC, cpu_coregroup_mask, },
7207 #endif 7206 #endif
7208 #ifdef CONFIG_SCHED_BOOK 7207 #ifdef CONFIG_SCHED_BOOK
7209 { sd_init_BOOK, cpu_book_mask, }, 7208 { sd_init_BOOK, cpu_book_mask, },
7210 #endif 7209 #endif
7211 { sd_init_CPU, cpu_cpu_mask, }, 7210 { sd_init_CPU, cpu_cpu_mask, },
7212 #ifdef CONFIG_NUMA 7211 #ifdef CONFIG_NUMA
7213 { sd_init_NODE, cpu_node_mask, }, 7212 { sd_init_NODE, cpu_node_mask, },
7214 { sd_init_ALLNODES, cpu_allnodes_mask, }, 7213 { sd_init_ALLNODES, cpu_allnodes_mask, },
7215 #endif 7214 #endif
7216 { NULL, }, 7215 { NULL, },
7217 }; 7216 };
7218 7217
7219 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 7218 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7220 7219
7221 static int __sdt_alloc(const struct cpumask *cpu_map) 7220 static int __sdt_alloc(const struct cpumask *cpu_map)
7222 { 7221 {
7223 struct sched_domain_topology_level *tl; 7222 struct sched_domain_topology_level *tl;
7224 int j; 7223 int j;
7225 7224
7226 for (tl = sched_domain_topology; tl->init; tl++) { 7225 for (tl = sched_domain_topology; tl->init; tl++) {
7227 struct sd_data *sdd = &tl->data; 7226 struct sd_data *sdd = &tl->data;
7228 7227
7229 sdd->sd = alloc_percpu(struct sched_domain *); 7228 sdd->sd = alloc_percpu(struct sched_domain *);
7230 if (!sdd->sd) 7229 if (!sdd->sd)
7231 return -ENOMEM; 7230 return -ENOMEM;
7232 7231
7233 sdd->sg = alloc_percpu(struct sched_group *); 7232 sdd->sg = alloc_percpu(struct sched_group *);
7234 if (!sdd->sg) 7233 if (!sdd->sg)
7235 return -ENOMEM; 7234 return -ENOMEM;
7236 7235
7237 for_each_cpu(j, cpu_map) { 7236 for_each_cpu(j, cpu_map) {
7238 struct sched_domain *sd; 7237 struct sched_domain *sd;
7239 struct sched_group *sg; 7238 struct sched_group *sg;
7240 7239
7241 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 7240 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7242 GFP_KERNEL, cpu_to_node(j)); 7241 GFP_KERNEL, cpu_to_node(j));
7243 if (!sd) 7242 if (!sd)
7244 return -ENOMEM; 7243 return -ENOMEM;
7245 7244
7246 *per_cpu_ptr(sdd->sd, j) = sd; 7245 *per_cpu_ptr(sdd->sd, j) = sd;
7247 7246
7248 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 7247 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7249 GFP_KERNEL, cpu_to_node(j)); 7248 GFP_KERNEL, cpu_to_node(j));
7250 if (!sg) 7249 if (!sg)
7251 return -ENOMEM; 7250 return -ENOMEM;
7252 7251
7253 *per_cpu_ptr(sdd->sg, j) = sg; 7252 *per_cpu_ptr(sdd->sg, j) = sg;
7254 } 7253 }
7255 } 7254 }
7256 7255
7257 return 0; 7256 return 0;
7258 } 7257 }
7259 7258
7260 static void __sdt_free(const struct cpumask *cpu_map) 7259 static void __sdt_free(const struct cpumask *cpu_map)
7261 { 7260 {
7262 struct sched_domain_topology_level *tl; 7261 struct sched_domain_topology_level *tl;
7263 int j; 7262 int j;
7264 7263
7265 for (tl = sched_domain_topology; tl->init; tl++) { 7264 for (tl = sched_domain_topology; tl->init; tl++) {
7266 struct sd_data *sdd = &tl->data; 7265 struct sd_data *sdd = &tl->data;
7267 7266
7268 for_each_cpu(j, cpu_map) { 7267 for_each_cpu(j, cpu_map) {
7269 kfree(*per_cpu_ptr(sdd->sd, j)); 7268 kfree(*per_cpu_ptr(sdd->sd, j));
7270 kfree(*per_cpu_ptr(sdd->sg, j)); 7269 kfree(*per_cpu_ptr(sdd->sg, j));
7271 } 7270 }
7272 free_percpu(sdd->sd); 7271 free_percpu(sdd->sd);
7273 free_percpu(sdd->sg); 7272 free_percpu(sdd->sg);
7274 } 7273 }
7275 } 7274 }
7276 7275
7277 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 7276 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7278 struct s_data *d, const struct cpumask *cpu_map, 7277 struct s_data *d, const struct cpumask *cpu_map,
7279 struct sched_domain_attr *attr, struct sched_domain *child, 7278 struct sched_domain_attr *attr, struct sched_domain *child,
7280 int cpu) 7279 int cpu)
7281 { 7280 {
7282 struct sched_domain *sd = tl->init(tl, cpu); 7281 struct sched_domain *sd = tl->init(tl, cpu);
7283 if (!sd) 7282 if (!sd)
7284 return child; 7283 return child;
7285 7284
7286 set_domain_attribute(sd, attr); 7285 set_domain_attribute(sd, attr);
7287 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 7286 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7288 if (child) { 7287 if (child) {
7289 sd->level = child->level + 1; 7288 sd->level = child->level + 1;
7290 sched_domain_level_max = max(sched_domain_level_max, sd->level); 7289 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7291 child->parent = sd; 7290 child->parent = sd;
7292 } 7291 }
7293 sd->child = child; 7292 sd->child = child;
7294 7293
7295 return sd; 7294 return sd;
7296 } 7295 }
7297 7296
7298 /* 7297 /*
7299 * Build sched domains for a given set of cpus and attach the sched domains 7298 * Build sched domains for a given set of cpus and attach the sched domains
7300 * to the individual cpus 7299 * to the individual cpus
7301 */ 7300 */
7302 static int build_sched_domains(const struct cpumask *cpu_map, 7301 static int build_sched_domains(const struct cpumask *cpu_map,
7303 struct sched_domain_attr *attr) 7302 struct sched_domain_attr *attr)
7304 { 7303 {
7305 enum s_alloc alloc_state = sa_none; 7304 enum s_alloc alloc_state = sa_none;
7306 struct sched_domain *sd; 7305 struct sched_domain *sd;
7307 struct s_data d; 7306 struct s_data d;
7308 int i, ret = -ENOMEM; 7307 int i, ret = -ENOMEM;
7309 7308
7310 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7309 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7311 if (alloc_state != sa_rootdomain) 7310 if (alloc_state != sa_rootdomain)
7312 goto error; 7311 goto error;
7313 7312
7314 /* Set up domains for cpus specified by the cpu_map. */ 7313 /* Set up domains for cpus specified by the cpu_map. */
7315 for_each_cpu(i, cpu_map) { 7314 for_each_cpu(i, cpu_map) {
7316 struct sched_domain_topology_level *tl; 7315 struct sched_domain_topology_level *tl;
7317 7316
7318 sd = NULL; 7317 sd = NULL;
7319 for (tl = sched_domain_topology; tl->init; tl++) 7318 for (tl = sched_domain_topology; tl->init; tl++)
7320 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 7319 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7321 7320
7322 while (sd->child) 7321 while (sd->child)
7323 sd = sd->child; 7322 sd = sd->child;
7324 7323
7325 *per_cpu_ptr(d.sd, i) = sd; 7324 *per_cpu_ptr(d.sd, i) = sd;
7326 } 7325 }
7327 7326
7328 /* Build the groups for the domains */ 7327 /* Build the groups for the domains */
7329 for_each_cpu(i, cpu_map) { 7328 for_each_cpu(i, cpu_map) {
7330 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7329 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7331 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 7330 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7332 get_group(i, sd->private, &sd->groups); 7331 get_group(i, sd->private, &sd->groups);
7333 atomic_inc(&sd->groups->ref); 7332 atomic_inc(&sd->groups->ref);
7334 7333
7335 if (i != cpumask_first(sched_domain_span(sd))) 7334 if (i != cpumask_first(sched_domain_span(sd)))
7336 continue; 7335 continue;
7337 7336
7338 build_sched_groups(sd); 7337 build_sched_groups(sd);
7339 } 7338 }
7340 } 7339 }
7341 7340
7342 /* Calculate CPU power for physical packages and nodes */ 7341 /* Calculate CPU power for physical packages and nodes */
7343 for (i = nr_cpumask_bits-1; i >= 0; i--) { 7342 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7344 if (!cpumask_test_cpu(i, cpu_map)) 7343 if (!cpumask_test_cpu(i, cpu_map))
7345 continue; 7344 continue;
7346 7345
7347 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7346 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7348 claim_allocations(i, sd); 7347 claim_allocations(i, sd);
7349 init_sched_groups_power(i, sd); 7348 init_sched_groups_power(i, sd);
7350 } 7349 }
7351 } 7350 }
7352 7351
7353 /* Attach the domains */ 7352 /* Attach the domains */
7354 rcu_read_lock(); 7353 rcu_read_lock();
7355 for_each_cpu(i, cpu_map) { 7354 for_each_cpu(i, cpu_map) {
7356 sd = *per_cpu_ptr(d.sd, i); 7355 sd = *per_cpu_ptr(d.sd, i);
7357 cpu_attach_domain(sd, d.rd, i); 7356 cpu_attach_domain(sd, d.rd, i);
7358 } 7357 }
7359 rcu_read_unlock(); 7358 rcu_read_unlock();
7360 7359
7361 ret = 0; 7360 ret = 0;
7362 error: 7361 error:
7363 __free_domain_allocs(&d, alloc_state, cpu_map); 7362 __free_domain_allocs(&d, alloc_state, cpu_map);
7364 return ret; 7363 return ret;
7365 } 7364 }
7366 7365
7367 static cpumask_var_t *doms_cur; /* current sched domains */ 7366 static cpumask_var_t *doms_cur; /* current sched domains */
7368 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7367 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7369 static struct sched_domain_attr *dattr_cur; 7368 static struct sched_domain_attr *dattr_cur;
7370 /* attributes of custom domains in 'doms_cur' */ 7369 /* attributes of custom domains in 'doms_cur' */
7371 7370
7372 /* 7371 /*
7373 * Special case: If a kmalloc of a doms_cur partition (array of 7372 * Special case: If a kmalloc of a doms_cur partition (array of
7374 * cpumask) fails, then fallback to a single sched domain, 7373 * cpumask) fails, then fallback to a single sched domain,
7375 * as determined by the single cpumask fallback_doms. 7374 * as determined by the single cpumask fallback_doms.
7376 */ 7375 */
7377 static cpumask_var_t fallback_doms; 7376 static cpumask_var_t fallback_doms;
7378 7377
7379 /* 7378 /*
7380 * arch_update_cpu_topology lets virtualized architectures update the 7379 * arch_update_cpu_topology lets virtualized architectures update the
7381 * cpu core maps. It is supposed to return 1 if the topology changed 7380 * cpu core maps. It is supposed to return 1 if the topology changed
7382 * or 0 if it stayed the same. 7381 * or 0 if it stayed the same.
7383 */ 7382 */
7384 int __attribute__((weak)) arch_update_cpu_topology(void) 7383 int __attribute__((weak)) arch_update_cpu_topology(void)
7385 { 7384 {
7386 return 0; 7385 return 0;
7387 } 7386 }
7388 7387
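Since arch_update_cpu_topology() above is declared __attribute__((weak)), an architecture (or a virtualization layer) overrides it simply by providing a strong definition; a hedged sketch of such an override, with topology_changed and rebuild_core_maps() standing in for whatever state and helpers the architecture actually has:

	/* hypothetical arch override of the weak default above */
	int arch_update_cpu_topology(void)
	{
		int changed = topology_changed;	/* assumed arch-private flag */

		if (changed)
			rebuild_core_maps();	/* hypothetical helper */
		topology_changed = 0;
		return changed;			/* 1 if the cpu core maps changed */
	}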
7389 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7388 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7390 { 7389 {
7391 int i; 7390 int i;
7392 cpumask_var_t *doms; 7391 cpumask_var_t *doms;
7393 7392
7394 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7393 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7395 if (!doms) 7394 if (!doms)
7396 return NULL; 7395 return NULL;
7397 for (i = 0; i < ndoms; i++) { 7396 for (i = 0; i < ndoms; i++) {
7398 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7397 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7399 free_sched_domains(doms, i); 7398 free_sched_domains(doms, i);
7400 return NULL; 7399 return NULL;
7401 } 7400 }
7402 } 7401 }
7403 return doms; 7402 return doms;
7404 } 7403 }
7405 7404
7406 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7405 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7407 { 7406 {
7408 unsigned int i; 7407 unsigned int i;
7409 for (i = 0; i < ndoms; i++) 7408 for (i = 0; i < ndoms; i++)
7410 free_cpumask_var(doms[i]); 7409 free_cpumask_var(doms[i]);
7411 kfree(doms); 7410 kfree(doms);
7412 } 7411 }
7413 7412
7414 /* 7413 /*
7415 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7414 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7416 * For now this just excludes isolated cpus, but could be used to 7415 * For now this just excludes isolated cpus, but could be used to
7417 * exclude other special cases in the future. 7416 * exclude other special cases in the future.
7418 */ 7417 */
7419 static int init_sched_domains(const struct cpumask *cpu_map) 7418 static int init_sched_domains(const struct cpumask *cpu_map)
7420 { 7419 {
7421 int err; 7420 int err;
7422 7421
7423 arch_update_cpu_topology(); 7422 arch_update_cpu_topology();
7424 ndoms_cur = 1; 7423 ndoms_cur = 1;
7425 doms_cur = alloc_sched_domains(ndoms_cur); 7424 doms_cur = alloc_sched_domains(ndoms_cur);
7426 if (!doms_cur) 7425 if (!doms_cur)
7427 doms_cur = &fallback_doms; 7426 doms_cur = &fallback_doms;
7428 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7427 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7429 dattr_cur = NULL; 7428 dattr_cur = NULL;
7430 err = build_sched_domains(doms_cur[0], NULL); 7429 err = build_sched_domains(doms_cur[0], NULL);
7431 register_sched_domain_sysctl(); 7430 register_sched_domain_sysctl();
7432 7431
7433 return err; 7432 return err;
7434 } 7433 }
7435 7434
7436 /* 7435 /*
7437 * Detach sched domains from a group of cpus specified in cpu_map 7436 * Detach sched domains from a group of cpus specified in cpu_map
7438 * These cpus will now be attached to the NULL domain 7437 * These cpus will now be attached to the NULL domain
7439 */ 7438 */
7440 static void detach_destroy_domains(const struct cpumask *cpu_map) 7439 static void detach_destroy_domains(const struct cpumask *cpu_map)
7441 { 7440 {
7442 int i; 7441 int i;
7443 7442
7444 rcu_read_lock(); 7443 rcu_read_lock();
7445 for_each_cpu(i, cpu_map) 7444 for_each_cpu(i, cpu_map)
7446 cpu_attach_domain(NULL, &def_root_domain, i); 7445 cpu_attach_domain(NULL, &def_root_domain, i);
7447 rcu_read_unlock(); 7446 rcu_read_unlock();
7448 } 7447 }
7449 7448
7450 /* handle null as "default" */ 7449 /* handle null as "default" */
7451 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7450 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7452 struct sched_domain_attr *new, int idx_new) 7451 struct sched_domain_attr *new, int idx_new)
7453 { 7452 {
7454 struct sched_domain_attr tmp; 7453 struct sched_domain_attr tmp;
7455 7454
7456 /* fast path */ 7455 /* fast path */
7457 if (!new && !cur) 7456 if (!new && !cur)
7458 return 1; 7457 return 1;
7459 7458
7460 tmp = SD_ATTR_INIT; 7459 tmp = SD_ATTR_INIT;
7461 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7460 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7462 new ? (new + idx_new) : &tmp, 7461 new ? (new + idx_new) : &tmp,
7463 sizeof(struct sched_domain_attr)); 7462 sizeof(struct sched_domain_attr));
7464 } 7463 }
7465 7464
7466 /* 7465 /*
7467 * Partition sched domains as specified by the 'ndoms_new' 7466 * Partition sched domains as specified by the 'ndoms_new'
7468 * cpumasks in the array doms_new[]. This compares 7467 * cpumasks in the array doms_new[]. This compares
7469 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7468 * doms_new[] to the current sched domain partitioning, doms_cur[].
7470 * It destroys each deleted domain and builds each new domain. 7469 * It destroys each deleted domain and builds each new domain.
7471 * 7470 *
7472 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7471 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7473 * The masks don't intersect (don't overlap). We should set up one 7472 * The masks don't intersect (don't overlap). We should set up one
7474 * sched domain for each mask. CPUs not in any of the cpumasks will 7473 * sched domain for each mask. CPUs not in any of the cpumasks will
7475 * not be load balanced. If the same cpumask appears both in the 7474 * not be load balanced. If the same cpumask appears both in the
7476 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7475 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7477 * it as it is. 7476 * it as it is.
7478 * 7477 *
7479 * The passed in 'doms_new' should be allocated using 7478 * The passed in 'doms_new' should be allocated using
7480 * alloc_sched_domains. This routine takes ownership of it and will 7479 * alloc_sched_domains. This routine takes ownership of it and will
7481 * free_sched_domains it when done with it. If the caller failed the 7480 * free_sched_domains it when done with it. If the caller failed the
7482 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7481 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7483 * and partition_sched_domains() will fall back to the single partition 7482 * and partition_sched_domains() will fall back to the single partition
7484 * 'fallback_doms'; this also forces the domains to be rebuilt. 7483 * 'fallback_doms'; this also forces the domains to be rebuilt.
7485 * 7484 *
7486 * If doms_new == NULL it will be replaced with cpu_online_mask. 7485 * If doms_new == NULL it will be replaced with cpu_online_mask.
7487 * ndoms_new == 0 is a special case for destroying existing domains, 7486 * ndoms_new == 0 is a special case for destroying existing domains,
7488 * and it will not create the default domain. 7487 * and it will not create the default domain.
7489 * 7488 *
7490 * Call with hotplug lock held 7489 * Call with hotplug lock held
7491 */ 7490 */
7492 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7491 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7493 struct sched_domain_attr *dattr_new) 7492 struct sched_domain_attr *dattr_new)
7494 { 7493 {
7495 int i, j, n; 7494 int i, j, n;
7496 int new_topology; 7495 int new_topology;
7497 7496
7498 mutex_lock(&sched_domains_mutex); 7497 mutex_lock(&sched_domains_mutex);
7499 7498
7500 /* always unregister in case we don't destroy any domains */ 7499 /* always unregister in case we don't destroy any domains */
7501 unregister_sched_domain_sysctl(); 7500 unregister_sched_domain_sysctl();
7502 7501
7503 /* Let architecture update cpu core mappings. */ 7502 /* Let architecture update cpu core mappings. */
7504 new_topology = arch_update_cpu_topology(); 7503 new_topology = arch_update_cpu_topology();
7505 7504
7506 n = doms_new ? ndoms_new : 0; 7505 n = doms_new ? ndoms_new : 0;
7507 7506
7508 /* Destroy deleted domains */ 7507 /* Destroy deleted domains */
7509 for (i = 0; i < ndoms_cur; i++) { 7508 for (i = 0; i < ndoms_cur; i++) {
7510 for (j = 0; j < n && !new_topology; j++) { 7509 for (j = 0; j < n && !new_topology; j++) {
7511 if (cpumask_equal(doms_cur[i], doms_new[j]) 7510 if (cpumask_equal(doms_cur[i], doms_new[j])
7512 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7511 && dattrs_equal(dattr_cur, i, dattr_new, j))
7513 goto match1; 7512 goto match1;
7514 } 7513 }
7515 /* no match - a current sched domain not in new doms_new[] */ 7514 /* no match - a current sched domain not in new doms_new[] */
7516 detach_destroy_domains(doms_cur[i]); 7515 detach_destroy_domains(doms_cur[i]);
7517 match1: 7516 match1:
7518 ; 7517 ;
7519 } 7518 }
7520 7519
7521 if (doms_new == NULL) { 7520 if (doms_new == NULL) {
7522 ndoms_cur = 0; 7521 ndoms_cur = 0;
7523 doms_new = &fallback_doms; 7522 doms_new = &fallback_doms;
7524 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7523 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7525 WARN_ON_ONCE(dattr_new); 7524 WARN_ON_ONCE(dattr_new);
7526 } 7525 }
7527 7526
7528 /* Build new domains */ 7527 /* Build new domains */
7529 for (i = 0; i < ndoms_new; i++) { 7528 for (i = 0; i < ndoms_new; i++) {
7530 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7529 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7531 if (cpumask_equal(doms_new[i], doms_cur[j]) 7530 if (cpumask_equal(doms_new[i], doms_cur[j])
7532 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7531 && dattrs_equal(dattr_new, i, dattr_cur, j))
7533 goto match2; 7532 goto match2;
7534 } 7533 }
7535 /* no match - add a new doms_new */ 7534 /* no match - add a new doms_new */
7536 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 7535 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7537 match2: 7536 match2:
7538 ; 7537 ;
7539 } 7538 }
7540 7539
7541 /* Remember the new sched domains */ 7540 /* Remember the new sched domains */
7542 if (doms_cur != &fallback_doms) 7541 if (doms_cur != &fallback_doms)
7543 free_sched_domains(doms_cur, ndoms_cur); 7542 free_sched_domains(doms_cur, ndoms_cur);
7544 kfree(dattr_cur); /* kfree(NULL) is safe */ 7543 kfree(dattr_cur); /* kfree(NULL) is safe */
7545 doms_cur = doms_new; 7544 doms_cur = doms_new;
7546 dattr_cur = dattr_new; 7545 dattr_cur = dattr_new;
7547 ndoms_cur = ndoms_new; 7546 ndoms_cur = ndoms_new;
7548 7547
7549 register_sched_domain_sysctl(); 7548 register_sched_domain_sysctl();
7550 7549
7551 mutex_unlock(&sched_domains_mutex); 7550 mutex_unlock(&sched_domains_mutex);
7552 } 7551 }
7553 7552
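To make the ownership rules in the block comment above concrete, here is a hedged sketch of a caller (hypothetical, loosely modelled on how cpusets drive this interface): allocate the array with alloc_sched_domains(), fill in the masks, and hand it over; partition_sched_domains() frees the previous generation itself, and NULL/1 is the documented fallback when the allocation fails.

	/* sketch: rebuild the domains as two disjoint partitions */
	static void rebuild_two_partitions(const struct cpumask *set_a,
					   const struct cpumask *set_b)
	{
		cpumask_var_t *doms = alloc_sched_domains(2);

		get_online_cpus();		/* "Call with hotplug lock held" */
		if (!doms) {
			/* fall back to the single 'fallback_doms' partition */
			partition_sched_domains(1, NULL, NULL);
		} else {
			cpumask_copy(doms[0], set_a);
			cpumask_copy(doms[1], set_b);
			/* ownership of 'doms' passes to the scheduler here */
			partition_sched_domains(2, doms, NULL);
		}
		put_online_cpus();
	}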
7554 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7553 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7555 static void reinit_sched_domains(void) 7554 static void reinit_sched_domains(void)
7556 { 7555 {
7557 get_online_cpus(); 7556 get_online_cpus();
7558 7557
7559 /* Destroy domains first to force the rebuild */ 7558 /* Destroy domains first to force the rebuild */
7560 partition_sched_domains(0, NULL, NULL); 7559 partition_sched_domains(0, NULL, NULL);
7561 7560
7562 rebuild_sched_domains(); 7561 rebuild_sched_domains();
7563 put_online_cpus(); 7562 put_online_cpus();
7564 } 7563 }
7565 7564
7566 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7565 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7567 { 7566 {
7568 unsigned int level = 0; 7567 unsigned int level = 0;
7569 7568
7570 if (sscanf(buf, "%u", &level) != 1) 7569 if (sscanf(buf, "%u", &level) != 1)
7571 return -EINVAL; 7570 return -EINVAL;
7572 7571
7573 /* 7572 /*
7574 * level is always positive, so don't check for 7573 * level is always positive, so don't check for
7575 * level < POWERSAVINGS_BALANCE_NONE, which is 0. 7574 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
7576 * What happens on a 0 or 1 byte write? 7575 * What happens on a 0 or 1 byte write?
7577 * Do we need to check count as well? 7576 * Do we need to check count as well?
7578 */ 7577 */
7579 7578
7580 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7579 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7581 return -EINVAL; 7580 return -EINVAL;
7582 7581
7583 if (smt) 7582 if (smt)
7584 sched_smt_power_savings = level; 7583 sched_smt_power_savings = level;
7585 else 7584 else
7586 sched_mc_power_savings = level; 7585 sched_mc_power_savings = level;
7587 7586
7588 reinit_sched_domains(); 7587 reinit_sched_domains();
7589 7588
7590 return count; 7589 return count;
7591 } 7590 }
7592 7591
7593 #ifdef CONFIG_SCHED_MC 7592 #ifdef CONFIG_SCHED_MC
7594 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7593 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7595 struct sysdev_class_attribute *attr, 7594 struct sysdev_class_attribute *attr,
7596 char *page) 7595 char *page)
7597 { 7596 {
7598 return sprintf(page, "%u\n", sched_mc_power_savings); 7597 return sprintf(page, "%u\n", sched_mc_power_savings);
7599 } 7598 }
7600 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7599 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7601 struct sysdev_class_attribute *attr, 7600 struct sysdev_class_attribute *attr,
7602 const char *buf, size_t count) 7601 const char *buf, size_t count)
7603 { 7602 {
7604 return sched_power_savings_store(buf, count, 0); 7603 return sched_power_savings_store(buf, count, 0);
7605 } 7604 }
7606 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7605 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7607 sched_mc_power_savings_show, 7606 sched_mc_power_savings_show,
7608 sched_mc_power_savings_store); 7607 sched_mc_power_savings_store);
7609 #endif 7608 #endif
7610 7609
7611 #ifdef CONFIG_SCHED_SMT 7610 #ifdef CONFIG_SCHED_SMT
7612 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7611 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7613 struct sysdev_class_attribute *attr, 7612 struct sysdev_class_attribute *attr,
7614 char *page) 7613 char *page)
7615 { 7614 {
7616 return sprintf(page, "%u\n", sched_smt_power_savings); 7615 return sprintf(page, "%u\n", sched_smt_power_savings);
7617 } 7616 }
7618 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7617 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7619 struct sysdev_class_attribute *attr, 7618 struct sysdev_class_attribute *attr,
7620 const char *buf, size_t count) 7619 const char *buf, size_t count)
7621 { 7620 {
7622 return sched_power_savings_store(buf, count, 1); 7621 return sched_power_savings_store(buf, count, 1);
7623 } 7622 }
7624 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7623 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7625 sched_smt_power_savings_show, 7624 sched_smt_power_savings_show,
7626 sched_smt_power_savings_store); 7625 sched_smt_power_savings_store);
7627 #endif 7626 #endif
7628 7627
7629 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7628 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7630 { 7629 {
7631 int err = 0; 7630 int err = 0;
7632 7631
7633 #ifdef CONFIG_SCHED_SMT 7632 #ifdef CONFIG_SCHED_SMT
7634 if (smt_capable()) 7633 if (smt_capable())
7635 err = sysfs_create_file(&cls->kset.kobj, 7634 err = sysfs_create_file(&cls->kset.kobj,
7636 &attr_sched_smt_power_savings.attr); 7635 &attr_sched_smt_power_savings.attr);
7637 #endif 7636 #endif
7638 #ifdef CONFIG_SCHED_MC 7637 #ifdef CONFIG_SCHED_MC
7639 if (!err && mc_capable()) 7638 if (!err && mc_capable())
7640 err = sysfs_create_file(&cls->kset.kobj, 7639 err = sysfs_create_file(&cls->kset.kobj,
7641 &attr_sched_mc_power_savings.attr); 7640 &attr_sched_mc_power_savings.attr);
7642 #endif 7641 #endif
7643 return err; 7642 return err;
7644 } 7643 }
7645 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7644 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7646 7645
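Usage note (an assumption about the resulting sysfs layout, not something this diff changes): the SYSDEV_CLASS_ATTR() definitions above surface the two knobs as files of the cpu sysdev class, so on kernels of this vintage a value in the range 0..MAX_POWERSAVINGS_BALANCE_LEVELS-1 can typically be read or written from userspace via paths such as /sys/devices/system/cpu/sched_mc_power_savings and /sys/devices/system/cpu/sched_smt_power_savings; each write lands in sched_power_savings_store() and triggers reinit_sched_domains().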
7647 /* 7646 /*
7648 * Update cpusets according to cpu_active mask. If cpusets are 7647 * Update cpusets according to cpu_active mask. If cpusets are
7649 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7648 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7650 * around partition_sched_domains(). 7649 * around partition_sched_domains().
7651 */ 7650 */
7652 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7651 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7653 void *hcpu) 7652 void *hcpu)
7654 { 7653 {
7655 switch (action & ~CPU_TASKS_FROZEN) { 7654 switch (action & ~CPU_TASKS_FROZEN) {
7656 case CPU_ONLINE: 7655 case CPU_ONLINE:
7657 case CPU_DOWN_FAILED: 7656 case CPU_DOWN_FAILED:
7658 cpuset_update_active_cpus(); 7657 cpuset_update_active_cpus();
7659 return NOTIFY_OK; 7658 return NOTIFY_OK;
7660 default: 7659 default:
7661 return NOTIFY_DONE; 7660 return NOTIFY_DONE;
7662 } 7661 }
7663 } 7662 }
7664 7663
7665 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7664 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7666 void *hcpu) 7665 void *hcpu)
7667 { 7666 {
7668 switch (action & ~CPU_TASKS_FROZEN) { 7667 switch (action & ~CPU_TASKS_FROZEN) {
7669 case CPU_DOWN_PREPARE: 7668 case CPU_DOWN_PREPARE:
7670 cpuset_update_active_cpus(); 7669 cpuset_update_active_cpus();
7671 return NOTIFY_OK; 7670 return NOTIFY_OK;
7672 default: 7671 default:
7673 return NOTIFY_DONE; 7672 return NOTIFY_DONE;
7674 } 7673 }
7675 } 7674 }
7676 7675
7677 static int update_runtime(struct notifier_block *nfb, 7676 static int update_runtime(struct notifier_block *nfb,
7678 unsigned long action, void *hcpu) 7677 unsigned long action, void *hcpu)
7679 { 7678 {
7680 int cpu = (int)(long)hcpu; 7679 int cpu = (int)(long)hcpu;
7681 7680
7682 switch (action) { 7681 switch (action) {
7683 case CPU_DOWN_PREPARE: 7682 case CPU_DOWN_PREPARE:
7684 case CPU_DOWN_PREPARE_FROZEN: 7683 case CPU_DOWN_PREPARE_FROZEN:
7685 disable_runtime(cpu_rq(cpu)); 7684 disable_runtime(cpu_rq(cpu));
7686 return NOTIFY_OK; 7685 return NOTIFY_OK;
7687 7686
7688 case CPU_DOWN_FAILED: 7687 case CPU_DOWN_FAILED:
7689 case CPU_DOWN_FAILED_FROZEN: 7688 case CPU_DOWN_FAILED_FROZEN:
7690 case CPU_ONLINE: 7689 case CPU_ONLINE:
7691 case CPU_ONLINE_FROZEN: 7690 case CPU_ONLINE_FROZEN:
7692 enable_runtime(cpu_rq(cpu)); 7691 enable_runtime(cpu_rq(cpu));
7693 return NOTIFY_OK; 7692 return NOTIFY_OK;
7694 7693
7695 default: 7694 default:
7696 return NOTIFY_DONE; 7695 return NOTIFY_DONE;
7697 } 7696 }
7698 } 7697 }
7699 7698
7700 void __init sched_init_smp(void) 7699 void __init sched_init_smp(void)
7701 { 7700 {
7702 cpumask_var_t non_isolated_cpus; 7701 cpumask_var_t non_isolated_cpus;
7703 7702
7704 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7703 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7705 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7704 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7706 7705
7707 get_online_cpus(); 7706 get_online_cpus();
7708 mutex_lock(&sched_domains_mutex); 7707 mutex_lock(&sched_domains_mutex);
7709 init_sched_domains(cpu_active_mask); 7708 init_sched_domains(cpu_active_mask);
7710 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7709 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7711 if (cpumask_empty(non_isolated_cpus)) 7710 if (cpumask_empty(non_isolated_cpus))
7712 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7711 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7713 mutex_unlock(&sched_domains_mutex); 7712 mutex_unlock(&sched_domains_mutex);
7714 put_online_cpus(); 7713 put_online_cpus();
7715 7714
7716 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7715 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7717 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7716 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7718 7717
7719 /* RT runtime code needs to handle some hotplug events */ 7718 /* RT runtime code needs to handle some hotplug events */
7720 hotcpu_notifier(update_runtime, 0); 7719 hotcpu_notifier(update_runtime, 0);
7721 7720
7722 init_hrtick(); 7721 init_hrtick();
7723 7722
7724 /* Move init over to a non-isolated CPU */ 7723 /* Move init over to a non-isolated CPU */
7725 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7724 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7726 BUG(); 7725 BUG();
7727 sched_init_granularity(); 7726 sched_init_granularity();
7728 free_cpumask_var(non_isolated_cpus); 7727 free_cpumask_var(non_isolated_cpus);
7729 7728
7730 init_sched_rt_class(); 7729 init_sched_rt_class();
7731 } 7730 }
7732 #else 7731 #else
7733 void __init sched_init_smp(void) 7732 void __init sched_init_smp(void)
7734 { 7733 {
7735 sched_init_granularity(); 7734 sched_init_granularity();
7736 } 7735 }
7737 #endif /* CONFIG_SMP */ 7736 #endif /* CONFIG_SMP */
7738 7737
7739 const_debug unsigned int sysctl_timer_migration = 1; 7738 const_debug unsigned int sysctl_timer_migration = 1;
7740 7739
7741 int in_sched_functions(unsigned long addr) 7740 int in_sched_functions(unsigned long addr)
7742 { 7741 {
7743 return in_lock_functions(addr) || 7742 return in_lock_functions(addr) ||
7744 (addr >= (unsigned long)__sched_text_start 7743 (addr >= (unsigned long)__sched_text_start
7745 && addr < (unsigned long)__sched_text_end); 7744 && addr < (unsigned long)__sched_text_end);
7746 } 7745 }
7747 7746
7748 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7747 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7749 { 7748 {
7750 cfs_rq->tasks_timeline = RB_ROOT; 7749 cfs_rq->tasks_timeline = RB_ROOT;
7751 INIT_LIST_HEAD(&cfs_rq->tasks); 7750 INIT_LIST_HEAD(&cfs_rq->tasks);
7752 #ifdef CONFIG_FAIR_GROUP_SCHED 7751 #ifdef CONFIG_FAIR_GROUP_SCHED
7753 cfs_rq->rq = rq; 7752 cfs_rq->rq = rq;
7754 /* allow initial update_cfs_load() to truncate */ 7753 /* allow initial update_cfs_load() to truncate */
7755 #ifdef CONFIG_SMP 7754 #ifdef CONFIG_SMP
7756 cfs_rq->load_stamp = 1; 7755 cfs_rq->load_stamp = 1;
7757 #endif 7756 #endif
7758 #endif 7757 #endif
7759 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7758 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7760 } 7759 }
7761 7760
7762 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7761 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7763 { 7762 {
7764 struct rt_prio_array *array; 7763 struct rt_prio_array *array;
7765 int i; 7764 int i;
7766 7765
7767 array = &rt_rq->active; 7766 array = &rt_rq->active;
7768 for (i = 0; i < MAX_RT_PRIO; i++) { 7767 for (i = 0; i < MAX_RT_PRIO; i++) {
7769 INIT_LIST_HEAD(array->queue + i); 7768 INIT_LIST_HEAD(array->queue + i);
7770 __clear_bit(i, array->bitmap); 7769 __clear_bit(i, array->bitmap);
7771 } 7770 }
7772 /* delimiter for bitsearch: */ 7771 /* delimiter for bitsearch: */
7773 __set_bit(MAX_RT_PRIO, array->bitmap); 7772 __set_bit(MAX_RT_PRIO, array->bitmap);
7774 7773
7775 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7774 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7776 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7775 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7777 #ifdef CONFIG_SMP 7776 #ifdef CONFIG_SMP
7778 rt_rq->highest_prio.next = MAX_RT_PRIO; 7777 rt_rq->highest_prio.next = MAX_RT_PRIO;
7779 #endif 7778 #endif
7780 #endif 7779 #endif
7781 #ifdef CONFIG_SMP 7780 #ifdef CONFIG_SMP
7782 rt_rq->rt_nr_migratory = 0; 7781 rt_rq->rt_nr_migratory = 0;
7783 rt_rq->overloaded = 0; 7782 rt_rq->overloaded = 0;
7784 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7783 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
7785 #endif 7784 #endif
7786 7785
7787 rt_rq->rt_time = 0; 7786 rt_rq->rt_time = 0;
7788 rt_rq->rt_throttled = 0; 7787 rt_rq->rt_throttled = 0;
7789 rt_rq->rt_runtime = 0; 7788 rt_rq->rt_runtime = 0;
7790 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7789 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7791 7790
7792 #ifdef CONFIG_RT_GROUP_SCHED 7791 #ifdef CONFIG_RT_GROUP_SCHED
7793 rt_rq->rt_nr_boosted = 0; 7792 rt_rq->rt_nr_boosted = 0;
7794 rt_rq->rq = rq; 7793 rt_rq->rq = rq;
7795 #endif 7794 #endif
7796 } 7795 }
7797 7796
7798 #ifdef CONFIG_FAIR_GROUP_SCHED 7797 #ifdef CONFIG_FAIR_GROUP_SCHED
7799 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7798 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7800 struct sched_entity *se, int cpu, 7799 struct sched_entity *se, int cpu,
7801 struct sched_entity *parent) 7800 struct sched_entity *parent)
7802 { 7801 {
7803 struct rq *rq = cpu_rq(cpu); 7802 struct rq *rq = cpu_rq(cpu);
7804 tg->cfs_rq[cpu] = cfs_rq; 7803 tg->cfs_rq[cpu] = cfs_rq;
7805 init_cfs_rq(cfs_rq, rq); 7804 init_cfs_rq(cfs_rq, rq);
7806 cfs_rq->tg = tg; 7805 cfs_rq->tg = tg;
7807 7806
7808 tg->se[cpu] = se; 7807 tg->se[cpu] = se;
7809 /* se could be NULL for root_task_group */ 7808 /* se could be NULL for root_task_group */
7810 if (!se) 7809 if (!se)
7811 return; 7810 return;
7812 7811
7813 if (!parent) 7812 if (!parent)
7814 se->cfs_rq = &rq->cfs; 7813 se->cfs_rq = &rq->cfs;
7815 else 7814 else
7816 se->cfs_rq = parent->my_q; 7815 se->cfs_rq = parent->my_q;
7817 7816
7818 se->my_q = cfs_rq; 7817 se->my_q = cfs_rq;
7819 update_load_set(&se->load, 0); 7818 update_load_set(&se->load, 0);
7820 se->parent = parent; 7819 se->parent = parent;
7821 } 7820 }
7822 #endif 7821 #endif
7823 7822
7824 #ifdef CONFIG_RT_GROUP_SCHED 7823 #ifdef CONFIG_RT_GROUP_SCHED
7825 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7824 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7826 struct sched_rt_entity *rt_se, int cpu, 7825 struct sched_rt_entity *rt_se, int cpu,
7827 struct sched_rt_entity *parent) 7826 struct sched_rt_entity *parent)
7828 { 7827 {
7829 struct rq *rq = cpu_rq(cpu); 7828 struct rq *rq = cpu_rq(cpu);
7830 7829
7831 tg->rt_rq[cpu] = rt_rq; 7830 tg->rt_rq[cpu] = rt_rq;
7832 init_rt_rq(rt_rq, rq); 7831 init_rt_rq(rt_rq, rq);
7833 rt_rq->tg = tg; 7832 rt_rq->tg = tg;
7834 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7833 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7835 7834
7836 tg->rt_se[cpu] = rt_se; 7835 tg->rt_se[cpu] = rt_se;
7837 if (!rt_se) 7836 if (!rt_se)
7838 return; 7837 return;
7839 7838
7840 if (!parent) 7839 if (!parent)
7841 rt_se->rt_rq = &rq->rt; 7840 rt_se->rt_rq = &rq->rt;
7842 else 7841 else
7843 rt_se->rt_rq = parent->my_q; 7842 rt_se->rt_rq = parent->my_q;
7844 7843
7845 rt_se->my_q = rt_rq; 7844 rt_se->my_q = rt_rq;
7846 rt_se->parent = parent; 7845 rt_se->parent = parent;
7847 INIT_LIST_HEAD(&rt_se->run_list); 7846 INIT_LIST_HEAD(&rt_se->run_list);
7848 } 7847 }
7849 #endif 7848 #endif
7850 7849
7851 void __init sched_init(void) 7850 void __init sched_init(void)
7852 { 7851 {
7853 int i, j; 7852 int i, j;
7854 unsigned long alloc_size = 0, ptr; 7853 unsigned long alloc_size = 0, ptr;
7855 7854
7856 #ifdef CONFIG_FAIR_GROUP_SCHED 7855 #ifdef CONFIG_FAIR_GROUP_SCHED
7857 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7856 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7858 #endif 7857 #endif
7859 #ifdef CONFIG_RT_GROUP_SCHED 7858 #ifdef CONFIG_RT_GROUP_SCHED
7860 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7859 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7861 #endif 7860 #endif
7862 #ifdef CONFIG_CPUMASK_OFFSTACK 7861 #ifdef CONFIG_CPUMASK_OFFSTACK
7863 alloc_size += num_possible_cpus() * cpumask_size(); 7862 alloc_size += num_possible_cpus() * cpumask_size();
7864 #endif 7863 #endif
7865 if (alloc_size) { 7864 if (alloc_size) {
7866 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7865 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7867 7866
7868 #ifdef CONFIG_FAIR_GROUP_SCHED 7867 #ifdef CONFIG_FAIR_GROUP_SCHED
7869 root_task_group.se = (struct sched_entity **)ptr; 7868 root_task_group.se = (struct sched_entity **)ptr;
7870 ptr += nr_cpu_ids * sizeof(void **); 7869 ptr += nr_cpu_ids * sizeof(void **);
7871 7870
7872 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7871 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7873 ptr += nr_cpu_ids * sizeof(void **); 7872 ptr += nr_cpu_ids * sizeof(void **);
7874 7873
7875 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7874 #endif /* CONFIG_FAIR_GROUP_SCHED */
7876 #ifdef CONFIG_RT_GROUP_SCHED 7875 #ifdef CONFIG_RT_GROUP_SCHED
7877 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 7876 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7878 ptr += nr_cpu_ids * sizeof(void **); 7877 ptr += nr_cpu_ids * sizeof(void **);
7879 7878
7880 root_task_group.rt_rq = (struct rt_rq **)ptr; 7879 root_task_group.rt_rq = (struct rt_rq **)ptr;
7881 ptr += nr_cpu_ids * sizeof(void **); 7880 ptr += nr_cpu_ids * sizeof(void **);
7882 7881
7883 #endif /* CONFIG_RT_GROUP_SCHED */ 7882 #endif /* CONFIG_RT_GROUP_SCHED */
7884 #ifdef CONFIG_CPUMASK_OFFSTACK 7883 #ifdef CONFIG_CPUMASK_OFFSTACK
7885 for_each_possible_cpu(i) { 7884 for_each_possible_cpu(i) {
7886 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 7885 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7887 ptr += cpumask_size(); 7886 ptr += cpumask_size();
7888 } 7887 }
7889 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7888 #endif /* CONFIG_CPUMASK_OFFSTACK */
7890 } 7889 }
7891 7890
7892 #ifdef CONFIG_SMP 7891 #ifdef CONFIG_SMP
7893 init_defrootdomain(); 7892 init_defrootdomain();
7894 #endif 7893 #endif
7895 7894
7896 init_rt_bandwidth(&def_rt_bandwidth, 7895 init_rt_bandwidth(&def_rt_bandwidth,
7897 global_rt_period(), global_rt_runtime()); 7896 global_rt_period(), global_rt_runtime());
7898 7897
7899 #ifdef CONFIG_RT_GROUP_SCHED 7898 #ifdef CONFIG_RT_GROUP_SCHED
7900 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7899 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7901 global_rt_period(), global_rt_runtime()); 7900 global_rt_period(), global_rt_runtime());
7902 #endif /* CONFIG_RT_GROUP_SCHED */ 7901 #endif /* CONFIG_RT_GROUP_SCHED */
7903 7902
7904 #ifdef CONFIG_CGROUP_SCHED 7903 #ifdef CONFIG_CGROUP_SCHED
7905 list_add(&root_task_group.list, &task_groups); 7904 list_add(&root_task_group.list, &task_groups);
7906 INIT_LIST_HEAD(&root_task_group.children); 7905 INIT_LIST_HEAD(&root_task_group.children);
7907 autogroup_init(&init_task); 7906 autogroup_init(&init_task);
7908 #endif /* CONFIG_CGROUP_SCHED */ 7907 #endif /* CONFIG_CGROUP_SCHED */
7909 7908
7910 for_each_possible_cpu(i) { 7909 for_each_possible_cpu(i) {
7911 struct rq *rq; 7910 struct rq *rq;
7912 7911
7913 rq = cpu_rq(i); 7912 rq = cpu_rq(i);
7914 raw_spin_lock_init(&rq->lock); 7913 raw_spin_lock_init(&rq->lock);
7915 rq->nr_running = 0; 7914 rq->nr_running = 0;
7916 rq->calc_load_active = 0; 7915 rq->calc_load_active = 0;
7917 rq->calc_load_update = jiffies + LOAD_FREQ; 7916 rq->calc_load_update = jiffies + LOAD_FREQ;
7918 init_cfs_rq(&rq->cfs, rq); 7917 init_cfs_rq(&rq->cfs, rq);
7919 init_rt_rq(&rq->rt, rq); 7918 init_rt_rq(&rq->rt, rq);
7920 #ifdef CONFIG_FAIR_GROUP_SCHED 7919 #ifdef CONFIG_FAIR_GROUP_SCHED
7921 root_task_group.shares = root_task_group_load; 7920 root_task_group.shares = root_task_group_load;
7922 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7921 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7923 /* 7922 /*
7924 * How much cpu bandwidth does root_task_group get? 7923 * How much cpu bandwidth does root_task_group get?
7925 * 7924 *
7926 * In case of task-groups formed through the cgroup filesystem, it 7925 * In case of task-groups formed through the cgroup filesystem, it
7927 * gets 100% of the cpu resources in the system. This overall 7926 * gets 100% of the cpu resources in the system. This overall
7928 * system cpu resource is divided among the tasks of 7927 * system cpu resource is divided among the tasks of
7929 * root_task_group and its child task-groups in a fair manner, 7928 * root_task_group and its child task-groups in a fair manner,
7930 * based on each entity's (task or task-group's) weight 7929 * based on each entity's (task or task-group's) weight
7931 * (se->load.weight). 7930 * (se->load.weight).
7932 * 7931 *
7933 * In other words, if root_task_group has 10 tasks (of weight 7932 * In other words, if root_task_group has 10 tasks (of weight
7934 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7933 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7935 * then A0's share of the cpu resource is: 7934 * then A0's share of the cpu resource is:
7936 * 7935 *
7937 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7936 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7938 * 7937 *
7939 * We achieve this by letting root_task_group's tasks sit 7938 * We achieve this by letting root_task_group's tasks sit
7940 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 7939 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7941 */ 7940 */
7942 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 7941 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7943 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7942 #endif /* CONFIG_FAIR_GROUP_SCHED */
7944 7943
7945 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7944 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7946 #ifdef CONFIG_RT_GROUP_SCHED 7945 #ifdef CONFIG_RT_GROUP_SCHED
7947 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7946 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7948 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 7947 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7949 #endif 7948 #endif
7950 7949
7951 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7950 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7952 rq->cpu_load[j] = 0; 7951 rq->cpu_load[j] = 0;
7953 7952
7954 rq->last_load_update_tick = jiffies; 7953 rq->last_load_update_tick = jiffies;
7955 7954
7956 #ifdef CONFIG_SMP 7955 #ifdef CONFIG_SMP
7957 rq->sd = NULL; 7956 rq->sd = NULL;
7958 rq->rd = NULL; 7957 rq->rd = NULL;
7959 rq->cpu_power = SCHED_POWER_SCALE; 7958 rq->cpu_power = SCHED_POWER_SCALE;
7960 rq->post_schedule = 0; 7959 rq->post_schedule = 0;
7961 rq->active_balance = 0; 7960 rq->active_balance = 0;
7962 rq->next_balance = jiffies; 7961 rq->next_balance = jiffies;
7963 rq->push_cpu = 0; 7962 rq->push_cpu = 0;
7964 rq->cpu = i; 7963 rq->cpu = i;
7965 rq->online = 0; 7964 rq->online = 0;
7966 rq->idle_stamp = 0; 7965 rq->idle_stamp = 0;
7967 rq->avg_idle = 2*sysctl_sched_migration_cost; 7966 rq->avg_idle = 2*sysctl_sched_migration_cost;
7968 rq_attach_root(rq, &def_root_domain); 7967 rq_attach_root(rq, &def_root_domain);
7969 #ifdef CONFIG_NO_HZ 7968 #ifdef CONFIG_NO_HZ
7970 rq->nohz_balance_kick = 0; 7969 rq->nohz_balance_kick = 0;
7971 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); 7970 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7972 #endif 7971 #endif
7973 #endif 7972 #endif
7974 init_rq_hrtick(rq); 7973 init_rq_hrtick(rq);
7975 atomic_set(&rq->nr_iowait, 0); 7974 atomic_set(&rq->nr_iowait, 0);
7976 } 7975 }
7977 7976
7978 set_load_weight(&init_task); 7977 set_load_weight(&init_task);
7979 7978
7980 #ifdef CONFIG_PREEMPT_NOTIFIERS 7979 #ifdef CONFIG_PREEMPT_NOTIFIERS
7981 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7980 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7982 #endif 7981 #endif
7983 7982
7984 #ifdef CONFIG_SMP 7983 #ifdef CONFIG_SMP
7985 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 7984 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7986 #endif 7985 #endif
7987 7986
7988 #ifdef CONFIG_RT_MUTEXES 7987 #ifdef CONFIG_RT_MUTEXES
7989 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 7988 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
7990 #endif 7989 #endif
7991 7990
7992 /* 7991 /*
7993 * The boot idle thread does lazy MMU switching as well: 7992 * The boot idle thread does lazy MMU switching as well:
7994 */ 7993 */
7995 atomic_inc(&init_mm.mm_count); 7994 atomic_inc(&init_mm.mm_count);
7996 enter_lazy_tlb(&init_mm, current); 7995 enter_lazy_tlb(&init_mm, current);
7997 7996
7998 /* 7997 /*
7999 * Make us the idle thread. Technically, schedule() should not be 7998 * Make us the idle thread. Technically, schedule() should not be
8000 * called from this thread; however, somewhere below it might be, 7999 * called from this thread; however, somewhere below it might be,
8001 * but because we are the idle thread, we just pick up running again 8000 * but because we are the idle thread, we just pick up running again
8002 * when this runqueue becomes "idle". 8001 * when this runqueue becomes "idle".
8003 */ 8002 */
8004 init_idle(current, smp_processor_id()); 8003 init_idle(current, smp_processor_id());
8005 8004
8006 calc_load_update = jiffies + LOAD_FREQ; 8005 calc_load_update = jiffies + LOAD_FREQ;
8007 8006
8008 /* 8007 /*
8009 * During early bootup we pretend to be a normal task: 8008 * During early bootup we pretend to be a normal task:
8010 */ 8009 */
8011 current->sched_class = &fair_sched_class; 8010 current->sched_class = &fair_sched_class;
8012 8011
8013 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8012 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8014 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8013 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8015 #ifdef CONFIG_SMP 8014 #ifdef CONFIG_SMP
8016 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 8015 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8017 #ifdef CONFIG_NO_HZ 8016 #ifdef CONFIG_NO_HZ
8018 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8017 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8019 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8018 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8020 atomic_set(&nohz.load_balancer, nr_cpu_ids); 8019 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8021 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 8020 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8022 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 8021 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8023 #endif 8022 #endif
8024 /* May be allocated at isolcpus cmdline parse time */ 8023 /* May be allocated at isolcpus cmdline parse time */
8025 if (cpu_isolated_map == NULL) 8024 if (cpu_isolated_map == NULL)
8026 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8025 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8027 #endif /* SMP */ 8026 #endif /* SMP */
8028 8027
8029 scheduler_running = 1; 8028 scheduler_running = 1;
8030 } 8029 }
8031 8030
8032 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8031 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8033 static inline int preempt_count_equals(int preempt_offset) 8032 static inline int preempt_count_equals(int preempt_offset)
8034 { 8033 {
8035 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8034 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8036 8035
8037 return (nested == preempt_offset); 8036 return (nested == preempt_offset);
8038 } 8037 }
8039 8038
8040 void __might_sleep(const char *file, int line, int preempt_offset) 8039 void __might_sleep(const char *file, int line, int preempt_offset)
8041 { 8040 {
8042 #ifdef in_atomic 8041 #ifdef in_atomic
8043 static unsigned long prev_jiffy; /* ratelimiting */ 8042 static unsigned long prev_jiffy; /* ratelimiting */
8044 8043
8045 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8044 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8046 system_state != SYSTEM_RUNNING || oops_in_progress) 8045 system_state != SYSTEM_RUNNING || oops_in_progress)
8047 return; 8046 return;
8048 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8047 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8049 return; 8048 return;
8050 prev_jiffy = jiffies; 8049 prev_jiffy = jiffies;
8051 8050
8052 printk(KERN_ERR 8051 printk(KERN_ERR
8053 "BUG: sleeping function called from invalid context at %s:%d\n", 8052 "BUG: sleeping function called from invalid context at %s:%d\n",
8054 file, line); 8053 file, line);
8055 printk(KERN_ERR 8054 printk(KERN_ERR
8056 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 8055 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8057 in_atomic(), irqs_disabled(), 8056 in_atomic(), irqs_disabled(),
8058 current->pid, current->comm); 8057 current->pid, current->comm);
8059 8058
8060 debug_show_held_locks(current); 8059 debug_show_held_locks(current);
8061 if (irqs_disabled()) 8060 if (irqs_disabled())
8062 print_irqtrace_events(current); 8061 print_irqtrace_events(current);
8063 dump_stack(); 8062 dump_stack();
8064 #endif 8063 #endif
8065 } 8064 }
8066 EXPORT_SYMBOL(__might_sleep); 8065 EXPORT_SYMBOL(__might_sleep);
8067 #endif 8066 #endif
8068 8067
8069 #ifdef CONFIG_MAGIC_SYSRQ 8068 #ifdef CONFIG_MAGIC_SYSRQ
8070 static void normalize_task(struct rq *rq, struct task_struct *p) 8069 static void normalize_task(struct rq *rq, struct task_struct *p)
8071 { 8070 {
8072 const struct sched_class *prev_class = p->sched_class; 8071 const struct sched_class *prev_class = p->sched_class;
8073 int old_prio = p->prio; 8072 int old_prio = p->prio;
8074 int on_rq; 8073 int on_rq;
8075 8074
8076 on_rq = p->on_rq; 8075 on_rq = p->on_rq;
8077 if (on_rq) 8076 if (on_rq)
8078 deactivate_task(rq, p, 0); 8077 deactivate_task(rq, p, 0);
8079 __setscheduler(rq, p, SCHED_NORMAL, 0); 8078 __setscheduler(rq, p, SCHED_NORMAL, 0);
8080 if (on_rq) { 8079 if (on_rq) {
8081 activate_task(rq, p, 0); 8080 activate_task(rq, p, 0);
8082 resched_task(rq->curr); 8081 resched_task(rq->curr);
8083 } 8082 }
8084 8083
8085 check_class_changed(rq, p, prev_class, old_prio); 8084 check_class_changed(rq, p, prev_class, old_prio);
8086 } 8085 }
8087 8086
8088 void normalize_rt_tasks(void) 8087 void normalize_rt_tasks(void)
8089 { 8088 {
8090 struct task_struct *g, *p; 8089 struct task_struct *g, *p;
8091 unsigned long flags; 8090 unsigned long flags;
8092 struct rq *rq; 8091 struct rq *rq;
8093 8092
8094 read_lock_irqsave(&tasklist_lock, flags); 8093 read_lock_irqsave(&tasklist_lock, flags);
8095 do_each_thread(g, p) { 8094 do_each_thread(g, p) {
8096 /* 8095 /*
8097 * Only normalize user tasks: 8096 * Only normalize user tasks:
8098 */ 8097 */
8099 if (!p->mm) 8098 if (!p->mm)
8100 continue; 8099 continue;
8101 8100
8102 p->se.exec_start = 0; 8101 p->se.exec_start = 0;
8103 #ifdef CONFIG_SCHEDSTATS 8102 #ifdef CONFIG_SCHEDSTATS
8104 p->se.statistics.wait_start = 0; 8103 p->se.statistics.wait_start = 0;
8105 p->se.statistics.sleep_start = 0; 8104 p->se.statistics.sleep_start = 0;
8106 p->se.statistics.block_start = 0; 8105 p->se.statistics.block_start = 0;
8107 #endif 8106 #endif
8108 8107
8109 if (!rt_task(p)) { 8108 if (!rt_task(p)) {
8110 /* 8109 /*
8111 * Renice negative nice level userspace 8110 * Renice negative nice level userspace
8112 * tasks back to 0: 8111 * tasks back to 0:
8113 */ 8112 */
8114 if (TASK_NICE(p) < 0 && p->mm) 8113 if (TASK_NICE(p) < 0 && p->mm)
8115 set_user_nice(p, 0); 8114 set_user_nice(p, 0);
8116 continue; 8115 continue;
8117 } 8116 }
8118 8117
8119 raw_spin_lock(&p->pi_lock); 8118 raw_spin_lock(&p->pi_lock);
8120 rq = __task_rq_lock(p); 8119 rq = __task_rq_lock(p);
8121 8120
8122 normalize_task(rq, p); 8121 normalize_task(rq, p);
8123 8122
8124 __task_rq_unlock(rq); 8123 __task_rq_unlock(rq);
8125 raw_spin_unlock(&p->pi_lock); 8124 raw_spin_unlock(&p->pi_lock);
8126 } while_each_thread(g, p); 8125 } while_each_thread(g, p);
8127 8126
8128 read_unlock_irqrestore(&tasklist_lock, flags); 8127 read_unlock_irqrestore(&tasklist_lock, flags);
8129 } 8128 }
8130 8129
8131 #endif /* CONFIG_MAGIC_SYSRQ */ 8130 #endif /* CONFIG_MAGIC_SYSRQ */
8132 8131
8133 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 8132 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8134 /* 8133 /*
8135 * These functions are only useful for IA64 MCA handling or kdb. 8134 * These functions are only useful for IA64 MCA handling or kdb.
8136 * 8135 *
8137 * They can only be called when the whole system has been 8136 * They can only be called when the whole system has been
8138 * stopped - every CPU needs to be quiescent, and no scheduling 8137 * stopped - every CPU needs to be quiescent, and no scheduling
8139 * activity can take place. Using them for anything else would 8138 * activity can take place. Using them for anything else would
8140 * be a serious bug, and as a result, they aren't even visible 8139 * be a serious bug, and as a result, they aren't even visible
8141 * under any other configuration. 8140 * under any other configuration.
8142 */ 8141 */
8143 8142
8144 /** 8143 /**
8145 * curr_task - return the current task for a given cpu. 8144 * curr_task - return the current task for a given cpu.
8146 * @cpu: the processor in question. 8145 * @cpu: the processor in question.
8147 * 8146 *
8148 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8147 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8149 */ 8148 */
8150 struct task_struct *curr_task(int cpu) 8149 struct task_struct *curr_task(int cpu)
8151 { 8150 {
8152 return cpu_curr(cpu); 8151 return cpu_curr(cpu);
8153 } 8152 }
8154 8153
8155 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 8154 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8156 8155
8157 #ifdef CONFIG_IA64 8156 #ifdef CONFIG_IA64
8158 /** 8157 /**
8159 * set_curr_task - set the current task for a given cpu. 8158 * set_curr_task - set the current task for a given cpu.
8160 * @cpu: the processor in question. 8159 * @cpu: the processor in question.
8161 * @p: the task pointer to set. 8160 * @p: the task pointer to set.
8162 * 8161 *
8163 * Description: This function must only be used when non-maskable interrupts 8162 * Description: This function must only be used when non-maskable interrupts
8164 * are serviced on a separate stack. It allows the architecture to switch the 8163 * are serviced on a separate stack. It allows the architecture to switch the
8165 * notion of the current task on a cpu in a non-blocking manner. This function 8164 * notion of the current task on a cpu in a non-blocking manner. This function
8166 * must be called with all CPUs synchronized and interrupts disabled, and 8165 * must be called with all CPUs synchronized and interrupts disabled, and
8167 * the caller must save the original value of the current task (see 8166 * the caller must save the original value of the current task (see
8168 * curr_task() above) and restore that value before reenabling interrupts and 8167 * curr_task() above) and restore that value before reenabling interrupts and
8169 * re-starting the system. 8168 * re-starting the system.
8170 * 8169 *
8171 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8170 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8172 */ 8171 */
8173 void set_curr_task(int cpu, struct task_struct *p) 8172 void set_curr_task(int cpu, struct task_struct *p)
8174 { 8173 {
8175 cpu_curr(cpu) = p; 8174 cpu_curr(cpu) = p;
8176 } 8175 }
8177 8176
8178 #endif 8177 #endif
8179 8178
8180 #ifdef CONFIG_FAIR_GROUP_SCHED 8179 #ifdef CONFIG_FAIR_GROUP_SCHED
8181 static void free_fair_sched_group(struct task_group *tg) 8180 static void free_fair_sched_group(struct task_group *tg)
8182 { 8181 {
8183 int i; 8182 int i;
8184 8183
8185 for_each_possible_cpu(i) { 8184 for_each_possible_cpu(i) {
8186 if (tg->cfs_rq) 8185 if (tg->cfs_rq)
8187 kfree(tg->cfs_rq[i]); 8186 kfree(tg->cfs_rq[i]);
8188 if (tg->se) 8187 if (tg->se)
8189 kfree(tg->se[i]); 8188 kfree(tg->se[i]);
8190 } 8189 }
8191 8190
8192 kfree(tg->cfs_rq); 8191 kfree(tg->cfs_rq);
8193 kfree(tg->se); 8192 kfree(tg->se);
8194 } 8193 }
8195 8194
8196 static 8195 static
8197 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8196 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8198 { 8197 {
8199 struct cfs_rq *cfs_rq; 8198 struct cfs_rq *cfs_rq;
8200 struct sched_entity *se; 8199 struct sched_entity *se;
8201 int i; 8200 int i;
8202 8201
8203 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8202 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8204 if (!tg->cfs_rq) 8203 if (!tg->cfs_rq)
8205 goto err; 8204 goto err;
8206 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 8205 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8207 if (!tg->se) 8206 if (!tg->se)
8208 goto err; 8207 goto err;
8209 8208
8210 tg->shares = NICE_0_LOAD; 8209 tg->shares = NICE_0_LOAD;
8211 8210
8212 for_each_possible_cpu(i) { 8211 for_each_possible_cpu(i) {
8213 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8212 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8214 GFP_KERNEL, cpu_to_node(i)); 8213 GFP_KERNEL, cpu_to_node(i));
8215 if (!cfs_rq) 8214 if (!cfs_rq)
8216 goto err; 8215 goto err;
8217 8216
8218 se = kzalloc_node(sizeof(struct sched_entity), 8217 se = kzalloc_node(sizeof(struct sched_entity),
8219 GFP_KERNEL, cpu_to_node(i)); 8218 GFP_KERNEL, cpu_to_node(i));
8220 if (!se) 8219 if (!se)
8221 goto err_free_rq; 8220 goto err_free_rq;
8222 8221
8223 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8222 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8224 } 8223 }
8225 8224
8226 return 1; 8225 return 1;
8227 8226
8228 err_free_rq: 8227 err_free_rq:
8229 kfree(cfs_rq); 8228 kfree(cfs_rq);
8230 err: 8229 err:
8231 return 0; 8230 return 0;
8232 } 8231 }
8233 8232
8234 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8233 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8235 { 8234 {
8236 struct rq *rq = cpu_rq(cpu); 8235 struct rq *rq = cpu_rq(cpu);
8237 unsigned long flags; 8236 unsigned long flags;
8238 8237
8239 /* 8238 /*
8240 * Only empty task groups can be destroyed; so we can speculatively 8239 * Only empty task groups can be destroyed; so we can speculatively
8241 * check on_list without danger of it being re-added. 8240 * check on_list without danger of it being re-added.
8242 */ 8241 */
8243 if (!tg->cfs_rq[cpu]->on_list) 8242 if (!tg->cfs_rq[cpu]->on_list)
8244 return; 8243 return;
8245 8244
8246 raw_spin_lock_irqsave(&rq->lock, flags); 8245 raw_spin_lock_irqsave(&rq->lock, flags);
8247 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8246 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8248 raw_spin_unlock_irqrestore(&rq->lock, flags); 8247 raw_spin_unlock_irqrestore(&rq->lock, flags);
8249 } 8248 }
8250 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8249 #else /* !CONFIG_FAIR_GROUP_SCHED */
8251 static inline void free_fair_sched_group(struct task_group *tg) 8250 static inline void free_fair_sched_group(struct task_group *tg)
8252 { 8251 {
8253 } 8252 }
8254 8253
8255 static inline 8254 static inline
8256 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8255 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8257 { 8256 {
8258 return 1; 8257 return 1;
8259 } 8258 }
8260 8259
8261 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8260 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8262 { 8261 {
8263 } 8262 }
8264 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8263 #endif /* CONFIG_FAIR_GROUP_SCHED */
8265 8264
8266 #ifdef CONFIG_RT_GROUP_SCHED 8265 #ifdef CONFIG_RT_GROUP_SCHED
8267 static void free_rt_sched_group(struct task_group *tg) 8266 static void free_rt_sched_group(struct task_group *tg)
8268 { 8267 {
8269 int i; 8268 int i;
8270 8269
8271 destroy_rt_bandwidth(&tg->rt_bandwidth); 8270 destroy_rt_bandwidth(&tg->rt_bandwidth);
8272 8271
8273 for_each_possible_cpu(i) { 8272 for_each_possible_cpu(i) {
8274 if (tg->rt_rq) 8273 if (tg->rt_rq)
8275 kfree(tg->rt_rq[i]); 8274 kfree(tg->rt_rq[i]);
8276 if (tg->rt_se) 8275 if (tg->rt_se)
8277 kfree(tg->rt_se[i]); 8276 kfree(tg->rt_se[i]);
8278 } 8277 }
8279 8278
8280 kfree(tg->rt_rq); 8279 kfree(tg->rt_rq);
8281 kfree(tg->rt_se); 8280 kfree(tg->rt_se);
8282 } 8281 }
8283 8282
8284 static 8283 static
8285 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8284 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8286 { 8285 {
8287 struct rt_rq *rt_rq; 8286 struct rt_rq *rt_rq;
8288 struct sched_rt_entity *rt_se; 8287 struct sched_rt_entity *rt_se;
8289 int i; 8288 int i;
8290 8289
8291 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8290 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8292 if (!tg->rt_rq) 8291 if (!tg->rt_rq)
8293 goto err; 8292 goto err;
8294 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8293 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8295 if (!tg->rt_se) 8294 if (!tg->rt_se)
8296 goto err; 8295 goto err;
8297 8296
8298 init_rt_bandwidth(&tg->rt_bandwidth, 8297 init_rt_bandwidth(&tg->rt_bandwidth,
8299 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8298 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8300 8299
8301 for_each_possible_cpu(i) { 8300 for_each_possible_cpu(i) {
8302 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8301 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8303 GFP_KERNEL, cpu_to_node(i)); 8302 GFP_KERNEL, cpu_to_node(i));
8304 if (!rt_rq) 8303 if (!rt_rq)
8305 goto err; 8304 goto err;
8306 8305
8307 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8306 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8308 GFP_KERNEL, cpu_to_node(i)); 8307 GFP_KERNEL, cpu_to_node(i));
8309 if (!rt_se) 8308 if (!rt_se)
8310 goto err_free_rq; 8309 goto err_free_rq;
8311 8310
8312 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8311 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8313 } 8312 }
8314 8313
8315 return 1; 8314 return 1;
8316 8315
8317 err_free_rq: 8316 err_free_rq:
8318 kfree(rt_rq); 8317 kfree(rt_rq);
8319 err: 8318 err:
8320 return 0; 8319 return 0;
8321 } 8320 }
8322 #else /* !CONFIG_RT_GROUP_SCHED */ 8321 #else /* !CONFIG_RT_GROUP_SCHED */
8323 static inline void free_rt_sched_group(struct task_group *tg) 8322 static inline void free_rt_sched_group(struct task_group *tg)
8324 { 8323 {
8325 } 8324 }
8326 8325
8327 static inline 8326 static inline
8328 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8327 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8329 { 8328 {
8330 return 1; 8329 return 1;
8331 } 8330 }
8332 #endif /* CONFIG_RT_GROUP_SCHED */ 8331 #endif /* CONFIG_RT_GROUP_SCHED */
8333 8332
8334 #ifdef CONFIG_CGROUP_SCHED 8333 #ifdef CONFIG_CGROUP_SCHED
8335 static void free_sched_group(struct task_group *tg) 8334 static void free_sched_group(struct task_group *tg)
8336 { 8335 {
8337 free_fair_sched_group(tg); 8336 free_fair_sched_group(tg);
8338 free_rt_sched_group(tg); 8337 free_rt_sched_group(tg);
8339 autogroup_free(tg); 8338 autogroup_free(tg);
8340 kfree(tg); 8339 kfree(tg);
8341 } 8340 }
8342 8341
8343 /* allocate runqueue etc for a new task group */ 8342 /* allocate runqueue etc for a new task group */
8344 struct task_group *sched_create_group(struct task_group *parent) 8343 struct task_group *sched_create_group(struct task_group *parent)
8345 { 8344 {
8346 struct task_group *tg; 8345 struct task_group *tg;
8347 unsigned long flags; 8346 unsigned long flags;
8348 8347
8349 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8348 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8350 if (!tg) 8349 if (!tg)
8351 return ERR_PTR(-ENOMEM); 8350 return ERR_PTR(-ENOMEM);
8352 8351
8353 if (!alloc_fair_sched_group(tg, parent)) 8352 if (!alloc_fair_sched_group(tg, parent))
8354 goto err; 8353 goto err;
8355 8354
8356 if (!alloc_rt_sched_group(tg, parent)) 8355 if (!alloc_rt_sched_group(tg, parent))
8357 goto err; 8356 goto err;
8358 8357
8359 spin_lock_irqsave(&task_group_lock, flags); 8358 spin_lock_irqsave(&task_group_lock, flags);
8360 list_add_rcu(&tg->list, &task_groups); 8359 list_add_rcu(&tg->list, &task_groups);
8361 8360
8362 WARN_ON(!parent); /* root should already exist */ 8361 WARN_ON(!parent); /* root should already exist */
8363 8362
8364 tg->parent = parent; 8363 tg->parent = parent;
8365 INIT_LIST_HEAD(&tg->children); 8364 INIT_LIST_HEAD(&tg->children);
8366 list_add_rcu(&tg->siblings, &parent->children); 8365 list_add_rcu(&tg->siblings, &parent->children);
8367 spin_unlock_irqrestore(&task_group_lock, flags); 8366 spin_unlock_irqrestore(&task_group_lock, flags);
8368 8367
8369 return tg; 8368 return tg;
8370 8369
8371 err: 8370 err:
8372 free_sched_group(tg); 8371 free_sched_group(tg);
8373 return ERR_PTR(-ENOMEM); 8372 return ERR_PTR(-ENOMEM);
8374 } 8373 }
8375 8374
8376 /* rcu callback to free various structures associated with a task group */ 8375 /* rcu callback to free various structures associated with a task group */
8377 static void free_sched_group_rcu(struct rcu_head *rhp) 8376 static void free_sched_group_rcu(struct rcu_head *rhp)
8378 { 8377 {
8379 /* now it should be safe to free those cfs_rqs */ 8378 /* now it should be safe to free those cfs_rqs */
8380 free_sched_group(container_of(rhp, struct task_group, rcu)); 8379 free_sched_group(container_of(rhp, struct task_group, rcu));
8381 } 8380 }
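The callback above only ever receives a pointer to the rcu_head embedded in the task_group, and container_of() recovers the enclosing structure from it. A minimal userspace sketch of that recovery trick, with invented struct names standing in for task_group and rcu_head:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head {			/* plays the role of rcu_head here */
	void (*func)(struct callback_head *head);
};

struct group {				/* plays the role of task_group here */
	int id;
	struct callback_head cb;	/* embedded, like task_group.rcu */
};

static void free_group_cb(struct callback_head *head)
{
	/* Recover the enclosing structure from the embedded member. */
	struct group *g = container_of(head, struct group, cb);

	printf("freeing group %d\n", g->id);
}

int main(void)
{
	struct group g = { .id = 42, .cb = { .func = free_group_cb } };

	/* The grace-period machinery would invoke this later; call it directly. */
	g.cb.func(&g.cb);
	return 0;
}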
8382 8381
8383 /* Destroy runqueue etc associated with a task group */ 8382 /* Destroy runqueue etc associated with a task group */
8384 void sched_destroy_group(struct task_group *tg) 8383 void sched_destroy_group(struct task_group *tg)
8385 { 8384 {
8386 unsigned long flags; 8385 unsigned long flags;
8387 int i; 8386 int i;
8388 8387
8389 /* end participation in shares distribution */ 8388 /* end participation in shares distribution */
8390 for_each_possible_cpu(i) 8389 for_each_possible_cpu(i)
8391 unregister_fair_sched_group(tg, i); 8390 unregister_fair_sched_group(tg, i);
8392 8391
8393 spin_lock_irqsave(&task_group_lock, flags); 8392 spin_lock_irqsave(&task_group_lock, flags);
8394 list_del_rcu(&tg->list); 8393 list_del_rcu(&tg->list);
8395 list_del_rcu(&tg->siblings); 8394 list_del_rcu(&tg->siblings);
8396 spin_unlock_irqrestore(&task_group_lock, flags); 8395 spin_unlock_irqrestore(&task_group_lock, flags);
8397 8396
8398 /* wait for possible concurrent references to cfs_rqs to complete */ 8397 /* wait for possible concurrent references to cfs_rqs to complete */
8399 call_rcu(&tg->rcu, free_sched_group_rcu); 8398 call_rcu(&tg->rcu, free_sched_group_rcu);
8400 } 8399 }
8401 8400
8402 /* Change a task's runqueue when it moves between groups. 8401 /* Change a task's runqueue when it moves between groups.
8403 * The caller of this function should have put the task in its new group 8402 * The caller of this function should have put the task in its new group
8404 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8403 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8405 * reflect its new group. 8404 * reflect its new group.
8406 */ 8405 */
8407 void sched_move_task(struct task_struct *tsk) 8406 void sched_move_task(struct task_struct *tsk)
8408 { 8407 {
8409 int on_rq, running; 8408 int on_rq, running;
8410 unsigned long flags; 8409 unsigned long flags;
8411 struct rq *rq; 8410 struct rq *rq;
8412 8411
8413 rq = task_rq_lock(tsk, &flags); 8412 rq = task_rq_lock(tsk, &flags);
8414 8413
8415 running = task_current(rq, tsk); 8414 running = task_current(rq, tsk);
8416 on_rq = tsk->on_rq; 8415 on_rq = tsk->on_rq;
8417 8416
8418 if (on_rq) 8417 if (on_rq)
8419 dequeue_task(rq, tsk, 0); 8418 dequeue_task(rq, tsk, 0);
8420 if (unlikely(running)) 8419 if (unlikely(running))
8421 tsk->sched_class->put_prev_task(rq, tsk); 8420 tsk->sched_class->put_prev_task(rq, tsk);
8422 8421
8423 #ifdef CONFIG_FAIR_GROUP_SCHED 8422 #ifdef CONFIG_FAIR_GROUP_SCHED
8424 if (tsk->sched_class->task_move_group) 8423 if (tsk->sched_class->task_move_group)
8425 tsk->sched_class->task_move_group(tsk, on_rq); 8424 tsk->sched_class->task_move_group(tsk, on_rq);
8426 else 8425 else
8427 #endif 8426 #endif
8428 set_task_rq(tsk, task_cpu(tsk)); 8427 set_task_rq(tsk, task_cpu(tsk));
8429 8428
8430 if (unlikely(running)) 8429 if (unlikely(running))
8431 tsk->sched_class->set_curr_task(rq); 8430 tsk->sched_class->set_curr_task(rq);
8432 if (on_rq) 8431 if (on_rq)
8433 enqueue_task(rq, tsk, 0); 8432 enqueue_task(rq, tsk, 0);
8434 8433
8435 task_rq_unlock(rq, tsk, &flags); 8434 task_rq_unlock(rq, tsk, &flags);
8436 } 8435 }
8437 #endif /* CONFIG_CGROUP_SCHED */ 8436 #endif /* CONFIG_CGROUP_SCHED */
8438 8437
8439 #ifdef CONFIG_FAIR_GROUP_SCHED 8438 #ifdef CONFIG_FAIR_GROUP_SCHED
8440 static DEFINE_MUTEX(shares_mutex); 8439 static DEFINE_MUTEX(shares_mutex);
8441 8440
8442 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8441 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8443 { 8442 {
8444 int i; 8443 int i;
8445 unsigned long flags; 8444 unsigned long flags;
8446 8445
8447 /* 8446 /*
8448 * We can't change the weight of the root cgroup. 8447 * We can't change the weight of the root cgroup.
8449 */ 8448 */
8450 if (!tg->se[0]) 8449 if (!tg->se[0])
8451 return -EINVAL; 8450 return -EINVAL;
8452 8451
8453 if (shares < MIN_SHARES) 8452 if (shares < MIN_SHARES)
8454 shares = MIN_SHARES; 8453 shares = MIN_SHARES;
8455 else if (shares > MAX_SHARES) 8454 else if (shares > MAX_SHARES)
8456 shares = MAX_SHARES; 8455 shares = MAX_SHARES;
8457 8456
8458 mutex_lock(&shares_mutex); 8457 mutex_lock(&shares_mutex);
8459 if (tg->shares == shares) 8458 if (tg->shares == shares)
8460 goto done; 8459 goto done;
8461 8460
8462 tg->shares = shares; 8461 tg->shares = shares;
8463 for_each_possible_cpu(i) { 8462 for_each_possible_cpu(i) {
8464 struct rq *rq = cpu_rq(i); 8463 struct rq *rq = cpu_rq(i);
8465 struct sched_entity *se; 8464 struct sched_entity *se;
8466 8465
8467 se = tg->se[i]; 8466 se = tg->se[i];
8468 /* Propagate contribution to hierarchy */ 8467 /* Propagate contribution to hierarchy */
8469 raw_spin_lock_irqsave(&rq->lock, flags); 8468 raw_spin_lock_irqsave(&rq->lock, flags);
8470 for_each_sched_entity(se) 8469 for_each_sched_entity(se)
8471 update_cfs_shares(group_cfs_rq(se)); 8470 update_cfs_shares(group_cfs_rq(se));
8472 raw_spin_unlock_irqrestore(&rq->lock, flags); 8471 raw_spin_unlock_irqrestore(&rq->lock, flags);
8473 } 8472 }
8474 8473
8475 done: 8474 done:
8476 mutex_unlock(&shares_mutex); 8475 mutex_unlock(&shares_mutex);
8477 return 0; 8476 return 0;
8478 } 8477 }
8479 8478
8480 unsigned long sched_group_shares(struct task_group *tg) 8479 unsigned long sched_group_shares(struct task_group *tg)
8481 { 8480 {
8482 return tg->shares; 8481 return tg->shares;
8483 } 8482 }
8484 #endif 8483 #endif
8485 8484
8486 #ifdef CONFIG_RT_GROUP_SCHED 8485 #ifdef CONFIG_RT_GROUP_SCHED
8487 /* 8486 /*
8488 * Ensure that the real time constraints are schedulable. 8487 * Ensure that the real time constraints are schedulable.
8489 */ 8488 */
8490 static DEFINE_MUTEX(rt_constraints_mutex); 8489 static DEFINE_MUTEX(rt_constraints_mutex);
8491 8490
8492 static unsigned long to_ratio(u64 period, u64 runtime) 8491 static unsigned long to_ratio(u64 period, u64 runtime)
8493 { 8492 {
8494 if (runtime == RUNTIME_INF) 8493 if (runtime == RUNTIME_INF)
8495 return 1ULL << 20; 8494 return 1ULL << 20;
8496 8495
8497 return div64_u64(runtime << 20, period); 8496 return div64_u64(runtime << 20, period);
8498 } 8497 }
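to_ratio() turns runtime/period into a fixed-point fraction with 20 fractional bits so that bandwidths can be summed and compared with integer math; RUNTIME_INF maps to a full 1 << 20. A standalone sketch of the same arithmetic, using 950000/1000000 only as illustrative values (they merely mirror the usual rt_runtime/rt_period defaults):

#include <stdint.h>
#include <stdio.h>

/* Userspace analogue of to_ratio(): runtime/period in 1/2^20 units. */
static uint64_t ratio(uint64_t period_us, uint64_t runtime_us)
{
	return (runtime_us << 20) / period_us;
}

int main(void)
{
	/* Illustrative values: 950ms of RT runtime per 1s period. */
	uint64_t r = ratio(1000000, 950000);

	/* Prints 996147 out of 1048576, i.e. roughly 95% of the CPU. */
	printf("%llu of %d\n", (unsigned long long)r, 1 << 20);
	return 0;
}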
8499 8498
8500 /* Must be called with tasklist_lock held */ 8499 /* Must be called with tasklist_lock held */
8501 static inline int tg_has_rt_tasks(struct task_group *tg) 8500 static inline int tg_has_rt_tasks(struct task_group *tg)
8502 { 8501 {
8503 struct task_struct *g, *p; 8502 struct task_struct *g, *p;
8504 8503
8505 do_each_thread(g, p) { 8504 do_each_thread(g, p) {
8506 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8505 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8507 return 1; 8506 return 1;
8508 } while_each_thread(g, p); 8507 } while_each_thread(g, p);
8509 8508
8510 return 0; 8509 return 0;
8511 } 8510 }
8512 8511
8513 struct rt_schedulable_data { 8512 struct rt_schedulable_data {
8514 struct task_group *tg; 8513 struct task_group *tg;
8515 u64 rt_period; 8514 u64 rt_period;
8516 u64 rt_runtime; 8515 u64 rt_runtime;
8517 }; 8516 };
8518 8517
8519 static int tg_schedulable(struct task_group *tg, void *data) 8518 static int tg_schedulable(struct task_group *tg, void *data)
8520 { 8519 {
8521 struct rt_schedulable_data *d = data; 8520 struct rt_schedulable_data *d = data;
8522 struct task_group *child; 8521 struct task_group *child;
8523 unsigned long total, sum = 0; 8522 unsigned long total, sum = 0;
8524 u64 period, runtime; 8523 u64 period, runtime;
8525 8524
8526 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8525 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8527 runtime = tg->rt_bandwidth.rt_runtime; 8526 runtime = tg->rt_bandwidth.rt_runtime;
8528 8527
8529 if (tg == d->tg) { 8528 if (tg == d->tg) {
8530 period = d->rt_period; 8529 period = d->rt_period;
8531 runtime = d->rt_runtime; 8530 runtime = d->rt_runtime;
8532 } 8531 }
8533 8532
8534 /* 8533 /*
8535 * Cannot have more runtime than the period. 8534 * Cannot have more runtime than the period.
8536 */ 8535 */
8537 if (runtime > period && runtime != RUNTIME_INF) 8536 if (runtime > period && runtime != RUNTIME_INF)
8538 return -EINVAL; 8537 return -EINVAL;
8539 8538
8540 /* 8539 /*
8541 * Ensure we don't starve existing RT tasks. 8540 * Ensure we don't starve existing RT tasks.
8542 */ 8541 */
8543 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8542 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8544 return -EBUSY; 8543 return -EBUSY;
8545 8544
8546 total = to_ratio(period, runtime); 8545 total = to_ratio(period, runtime);
8547 8546
8548 /* 8547 /*
8549 * Nobody can have more than the global setting allows. 8548 * Nobody can have more than the global setting allows.
8550 */ 8549 */
8551 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8550 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8552 return -EINVAL; 8551 return -EINVAL;
8553 8552
8554 /* 8553 /*
8555 * The sum of our children's runtime should not exceed our own. 8554 * The sum of our children's runtime should not exceed our own.
8556 */ 8555 */
8557 list_for_each_entry_rcu(child, &tg->children, siblings) { 8556 list_for_each_entry_rcu(child, &tg->children, siblings) {
8558 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8557 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8559 runtime = child->rt_bandwidth.rt_runtime; 8558 runtime = child->rt_bandwidth.rt_runtime;
8560 8559
8561 if (child == d->tg) { 8560 if (child == d->tg) {
8562 period = d->rt_period; 8561 period = d->rt_period;
8563 runtime = d->rt_runtime; 8562 runtime = d->rt_runtime;
8564 } 8563 }
8565 8564
8566 sum += to_ratio(period, runtime); 8565 sum += to_ratio(period, runtime);
8567 } 8566 }
8568 8567
8569 if (sum > total) 8568 if (sum > total)
8570 return -EINVAL; 8569 return -EINVAL;
8571 8570
8572 return 0; 8571 return 0;
8573 } 8572 }
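Taken together, the checks in tg_schedulable() form a hierarchical admission test: a group's own ratio must fit under the global limit, and its children's ratios must sum to no more than its own. A toy sketch of that final comparison with hypothetical ratio values, not real task_group state:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical bandwidths in 1/2^20 units (see the sketch above). */
	uint64_t parent = 996147;              /* ~95% */
	uint64_t child[] = { 524288, 262144 }; /* 50% + 25% */
	uint64_t sum = 0;

	for (size_t i = 0; i < sizeof(child) / sizeof(child[0]); i++)
		sum += child[i];

	/* Mirrors the final "sum > total" rejection in tg_schedulable(). */
	printf("%s\n", sum > parent ? "-EINVAL" : "schedulable");
	return 0;
}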
8574 8573
8575 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8574 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8576 { 8575 {
8577 struct rt_schedulable_data data = { 8576 struct rt_schedulable_data data = {
8578 .tg = tg, 8577 .tg = tg,
8579 .rt_period = period, 8578 .rt_period = period,
8580 .rt_runtime = runtime, 8579 .rt_runtime = runtime,
8581 }; 8580 };
8582 8581
8583 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8582 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8584 } 8583 }
8585 8584
8586 static int tg_set_bandwidth(struct task_group *tg, 8585 static int tg_set_bandwidth(struct task_group *tg,
8587 u64 rt_period, u64 rt_runtime) 8586 u64 rt_period, u64 rt_runtime)
8588 { 8587 {
8589 int i, err = 0; 8588 int i, err = 0;
8590 8589
8591 mutex_lock(&rt_constraints_mutex); 8590 mutex_lock(&rt_constraints_mutex);
8592 read_lock(&tasklist_lock); 8591 read_lock(&tasklist_lock);
8593 err = __rt_schedulable(tg, rt_period, rt_runtime); 8592 err = __rt_schedulable(tg, rt_period, rt_runtime);
8594 if (err) 8593 if (err)
8595 goto unlock; 8594 goto unlock;
8596 8595
8597 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8596 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8598 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8597 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8599 tg->rt_bandwidth.rt_runtime = rt_runtime; 8598 tg->rt_bandwidth.rt_runtime = rt_runtime;
8600 8599
8601 for_each_possible_cpu(i) { 8600 for_each_possible_cpu(i) {
8602 struct rt_rq *rt_rq = tg->rt_rq[i]; 8601 struct rt_rq *rt_rq = tg->rt_rq[i];
8603 8602
8604 raw_spin_lock(&rt_rq->rt_runtime_lock); 8603 raw_spin_lock(&rt_rq->rt_runtime_lock);
8605 rt_rq->rt_runtime = rt_runtime; 8604 rt_rq->rt_runtime = rt_runtime;
8606 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8605 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8607 } 8606 }
8608 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8607 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8609 unlock: 8608 unlock:
8610 read_unlock(&tasklist_lock); 8609 read_unlock(&tasklist_lock);
8611 mutex_unlock(&rt_constraints_mutex); 8610 mutex_unlock(&rt_constraints_mutex);
8612 8611
8613 return err; 8612 return err;
8614 } 8613 }
8615 8614
8616 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8615 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8617 { 8616 {
8618 u64 rt_runtime, rt_period; 8617 u64 rt_runtime, rt_period;
8619 8618
8620 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8619 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8621 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8620 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8622 if (rt_runtime_us < 0) 8621 if (rt_runtime_us < 0)
8623 rt_runtime = RUNTIME_INF; 8622 rt_runtime = RUNTIME_INF;
8624 8623
8625 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8624 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8626 } 8625 }
8627 8626
8628 long sched_group_rt_runtime(struct task_group *tg) 8627 long sched_group_rt_runtime(struct task_group *tg)
8629 { 8628 {
8630 u64 rt_runtime_us; 8629 u64 rt_runtime_us;
8631 8630
8632 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8631 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8633 return -1; 8632 return -1;
8634 8633
8635 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8634 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8636 do_div(rt_runtime_us, NSEC_PER_USEC); 8635 do_div(rt_runtime_us, NSEC_PER_USEC);
8637 return rt_runtime_us; 8636 return rt_runtime_us;
8638 } 8637 }
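The cgroup files speak microseconds with -1 meaning unlimited, while the core stores nanoseconds and RUNTIME_INF; the two helpers above convert between the representations. A hedged userspace sketch of that round trip (RUNTIME_INF is taken as ~0ULL here, assumed to mirror the kernel's definition):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define RUNTIME_INF   (~0ULL)	/* assumed to mirror the kernel's definition */

static uint64_t us_to_runtime(long long us)
{
	return us < 0 ? RUNTIME_INF : (uint64_t)us * NSEC_PER_USEC;
}

static long long runtime_to_us(uint64_t runtime)
{
	return runtime == RUNTIME_INF ? -1 : (long long)(runtime / NSEC_PER_USEC);
}

int main(void)
{
	/* -1 at the cgroup-file level means "no limit". */
	printf("%lld\n", runtime_to_us(us_to_runtime(-1)));	/* -1 */
	printf("%lld\n", runtime_to_us(us_to_runtime(950000)));	/* 950000 */
	return 0;
}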
8639 8638
8640 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8639 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8641 { 8640 {
8642 u64 rt_runtime, rt_period; 8641 u64 rt_runtime, rt_period;
8643 8642
8644 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8643 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8645 rt_runtime = tg->rt_bandwidth.rt_runtime; 8644 rt_runtime = tg->rt_bandwidth.rt_runtime;
8646 8645
8647 if (rt_period == 0) 8646 if (rt_period == 0)
8648 return -EINVAL; 8647 return -EINVAL;
8649 8648
8650 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8649 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8651 } 8650 }
8652 8651
8653 long sched_group_rt_period(struct task_group *tg) 8652 long sched_group_rt_period(struct task_group *tg)
8654 { 8653 {
8655 u64 rt_period_us; 8654 u64 rt_period_us;
8656 8655
8657 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8656 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8658 do_div(rt_period_us, NSEC_PER_USEC); 8657 do_div(rt_period_us, NSEC_PER_USEC);
8659 return rt_period_us; 8658 return rt_period_us;
8660 } 8659 }
8661 8660
8662 static int sched_rt_global_constraints(void) 8661 static int sched_rt_global_constraints(void)
8663 { 8662 {
8664 u64 runtime, period; 8663 u64 runtime, period;
8665 int ret = 0; 8664 int ret = 0;
8666 8665
8667 if (sysctl_sched_rt_period <= 0) 8666 if (sysctl_sched_rt_period <= 0)
8668 return -EINVAL; 8667 return -EINVAL;
8669 8668
8670 runtime = global_rt_runtime(); 8669 runtime = global_rt_runtime();
8671 period = global_rt_period(); 8670 period = global_rt_period();
8672 8671
8673 /* 8672 /*
8674 * Sanity check on the sysctl variables. 8673 * Sanity check on the sysctl variables.
8675 */ 8674 */
8676 if (runtime > period && runtime != RUNTIME_INF) 8675 if (runtime > period && runtime != RUNTIME_INF)
8677 return -EINVAL; 8676 return -EINVAL;
8678 8677
8679 mutex_lock(&rt_constraints_mutex); 8678 mutex_lock(&rt_constraints_mutex);
8680 read_lock(&tasklist_lock); 8679 read_lock(&tasklist_lock);
8681 ret = __rt_schedulable(NULL, 0, 0); 8680 ret = __rt_schedulable(NULL, 0, 0);
8682 read_unlock(&tasklist_lock); 8681 read_unlock(&tasklist_lock);
8683 mutex_unlock(&rt_constraints_mutex); 8682 mutex_unlock(&rt_constraints_mutex);
8684 8683
8685 return ret; 8684 return ret;
8686 } 8685 }
8687 8686
8688 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 8687 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8689 { 8688 {
8690 /* Don't accept realtime tasks when there is no way for them to run */ 8689 /* Don't accept realtime tasks when there is no way for them to run */
8691 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 8690 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8692 return 0; 8691 return 0;
8693 8692
8694 return 1; 8693 return 1;
8695 } 8694 }
8696 8695
8697 #else /* !CONFIG_RT_GROUP_SCHED */ 8696 #else /* !CONFIG_RT_GROUP_SCHED */
8698 static int sched_rt_global_constraints(void) 8697 static int sched_rt_global_constraints(void)
8699 { 8698 {
8700 unsigned long flags; 8699 unsigned long flags;
8701 int i; 8700 int i;
8702 8701
8703 if (sysctl_sched_rt_period <= 0) 8702 if (sysctl_sched_rt_period <= 0)
8704 return -EINVAL; 8703 return -EINVAL;
8705 8704
8706 /* 8705 /*
8707 * There are always some RT tasks in the root group 8706 * There are always some RT tasks in the root group
8708 * -- migration, kstopmachine, etc. 8707 * -- migration, kstopmachine, etc.
8709 */ 8708 */
8710 if (sysctl_sched_rt_runtime == 0) 8709 if (sysctl_sched_rt_runtime == 0)
8711 return -EBUSY; 8710 return -EBUSY;
8712 8711
8713 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8712 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8714 for_each_possible_cpu(i) { 8713 for_each_possible_cpu(i) {
8715 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8714 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8716 8715
8717 raw_spin_lock(&rt_rq->rt_runtime_lock); 8716 raw_spin_lock(&rt_rq->rt_runtime_lock);
8718 rt_rq->rt_runtime = global_rt_runtime(); 8717 rt_rq->rt_runtime = global_rt_runtime();
8719 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8718 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8720 } 8719 }
8721 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8720 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8722 8721
8723 return 0; 8722 return 0;
8724 } 8723 }
8725 #endif /* CONFIG_RT_GROUP_SCHED */ 8724 #endif /* CONFIG_RT_GROUP_SCHED */
8726 8725
8727 int sched_rt_handler(struct ctl_table *table, int write, 8726 int sched_rt_handler(struct ctl_table *table, int write,
8728 void __user *buffer, size_t *lenp, 8727 void __user *buffer, size_t *lenp,
8729 loff_t *ppos) 8728 loff_t *ppos)
8730 { 8729 {
8731 int ret; 8730 int ret;
8732 int old_period, old_runtime; 8731 int old_period, old_runtime;
8733 static DEFINE_MUTEX(mutex); 8732 static DEFINE_MUTEX(mutex);
8734 8733
8735 mutex_lock(&mutex); 8734 mutex_lock(&mutex);
8736 old_period = sysctl_sched_rt_period; 8735 old_period = sysctl_sched_rt_period;
8737 old_runtime = sysctl_sched_rt_runtime; 8736 old_runtime = sysctl_sched_rt_runtime;
8738 8737
8739 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8738 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8740 8739
8741 if (!ret && write) { 8740 if (!ret && write) {
8742 ret = sched_rt_global_constraints(); 8741 ret = sched_rt_global_constraints();
8743 if (ret) { 8742 if (ret) {
8744 sysctl_sched_rt_period = old_period; 8743 sysctl_sched_rt_period = old_period;
8745 sysctl_sched_rt_runtime = old_runtime; 8744 sysctl_sched_rt_runtime = old_runtime;
8746 } else { 8745 } else {
8747 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 8746 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8748 def_rt_bandwidth.rt_period = 8747 def_rt_bandwidth.rt_period =
8749 ns_to_ktime(global_rt_period()); 8748 ns_to_ktime(global_rt_period());
8750 } 8749 }
8751 } 8750 }
8752 mutex_unlock(&mutex); 8751 mutex_unlock(&mutex);
8753 8752
8754 return ret; 8753 return ret;
8755 } 8754 }
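sched_rt_handler() snapshots the old sysctl values, lets proc_dointvec() store the new ones, and restores the snapshot if the combined settings fail validation. A generic sketch of that save/validate/rollback pattern (validate() below is a stand-in, not a kernel function):

#include <stdio.h>

static int period = 1000000;	/* illustrative stand-ins for the sysctls */
static int runtime = 950000;

static int validate(int p, int r)
{
	/* Stand-in constraint: runtime must fit inside the period. */
	return (p > 0 && r <= p) ? 0 : -1;
}

static int write_settings(int new_period, int new_runtime)
{
	int old_period = period, old_runtime = runtime;

	/* Tentatively apply, as proc_dointvec() does for the real sysctls. */
	period = new_period;
	runtime = new_runtime;

	if (validate(period, runtime)) {
		/* Roll back to the snapshot on failure. */
		period = old_period;
		runtime = old_runtime;
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", write_settings(1000000, 2000000));	/* rejected: -1 */
	printf("%d %d\n", period, runtime);			/* unchanged */
	return 0;
}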
8756 8755
8757 #ifdef CONFIG_CGROUP_SCHED 8756 #ifdef CONFIG_CGROUP_SCHED
8758 8757
8759 /* return corresponding task_group object of a cgroup */ 8758 /* return corresponding task_group object of a cgroup */
8760 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 8759 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
8761 { 8760 {
8762 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 8761 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8763 struct task_group, css); 8762 struct task_group, css);
8764 } 8763 }
8765 8764
8766 static struct cgroup_subsys_state * 8765 static struct cgroup_subsys_state *
8767 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 8766 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8768 { 8767 {
8769 struct task_group *tg, *parent; 8768 struct task_group *tg, *parent;
8770 8769
8771 if (!cgrp->parent) { 8770 if (!cgrp->parent) {
8772 /* This is early initialization for the top cgroup */ 8771 /* This is early initialization for the top cgroup */
8773 return &root_task_group.css; 8772 return &root_task_group.css;
8774 } 8773 }
8775 8774
8776 parent = cgroup_tg(cgrp->parent); 8775 parent = cgroup_tg(cgrp->parent);
8777 tg = sched_create_group(parent); 8776 tg = sched_create_group(parent);
8778 if (IS_ERR(tg)) 8777 if (IS_ERR(tg))
8779 return ERR_PTR(-ENOMEM); 8778 return ERR_PTR(-ENOMEM);
8780 8779
8781 return &tg->css; 8780 return &tg->css;
8782 } 8781 }
8783 8782
8784 static void 8783 static void
8785 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8784 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8786 { 8785 {
8787 struct task_group *tg = cgroup_tg(cgrp); 8786 struct task_group *tg = cgroup_tg(cgrp);
8788 8787
8789 sched_destroy_group(tg); 8788 sched_destroy_group(tg);
8790 } 8789 }
8791 8790
8792 static int 8791 static int
8793 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8792 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8794 { 8793 {
8795 #ifdef CONFIG_RT_GROUP_SCHED 8794 #ifdef CONFIG_RT_GROUP_SCHED
8796 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 8795 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
8797 return -EINVAL; 8796 return -EINVAL;
8798 #else 8797 #else
8799 /* We don't support RT-tasks being in separate groups */ 8798 /* We don't support RT-tasks being in separate groups */
8800 if (tsk->sched_class != &fair_sched_class) 8799 if (tsk->sched_class != &fair_sched_class)
8801 return -EINVAL; 8800 return -EINVAL;
8802 #endif 8801 #endif
8803 return 0; 8802 return 0;
8804 } 8803 }
8805 8804
8806 static void 8805 static void
8807 cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8806 cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8808 { 8807 {
8809 sched_move_task(tsk); 8808 sched_move_task(tsk);
8810 } 8809 }
8811 8810
8812 static void 8811 static void
8813 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 8812 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8814 struct cgroup *old_cgrp, struct task_struct *task) 8813 struct cgroup *old_cgrp, struct task_struct *task)
8815 { 8814 {
8816 /* 8815 /*
8817 * cgroup_exit() is called in the copy_process() failure path. 8816 * cgroup_exit() is called in the copy_process() failure path.
8818 * Ignore this case since the task hasn't run yet; this avoids 8817 * Ignore this case since the task hasn't run yet; this avoids
8819 * trying to poke a half-freed task state from generic code. 8818 * trying to poke a half-freed task state from generic code.
8820 */ 8819 */
8821 if (!(task->flags & PF_EXITING)) 8820 if (!(task->flags & PF_EXITING))
8822 return; 8821 return;
8823 8822
8824 sched_move_task(task); 8823 sched_move_task(task);
8825 } 8824 }
8826 8825
8827 #ifdef CONFIG_FAIR_GROUP_SCHED 8826 #ifdef CONFIG_FAIR_GROUP_SCHED
8828 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8827 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8829 u64 shareval) 8828 u64 shareval)
8830 { 8829 {
8831 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 8830 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8832 } 8831 }
8833 8832
8834 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8833 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8835 { 8834 {
8836 struct task_group *tg = cgroup_tg(cgrp); 8835 struct task_group *tg = cgroup_tg(cgrp);
8837 8836
8838 return (u64) scale_load_down(tg->shares); 8837 return (u64) scale_load_down(tg->shares);
8839 } 8838 }
8840 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8839 #endif /* CONFIG_FAIR_GROUP_SCHED */
8841 8840
8842 #ifdef CONFIG_RT_GROUP_SCHED 8841 #ifdef CONFIG_RT_GROUP_SCHED
8843 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8842 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8844 s64 val) 8843 s64 val)
8845 { 8844 {
8846 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 8845 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8847 } 8846 }
8848 8847
8849 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 8848 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8850 { 8849 {
8851 return sched_group_rt_runtime(cgroup_tg(cgrp)); 8850 return sched_group_rt_runtime(cgroup_tg(cgrp));
8852 } 8851 }
8853 8852
8854 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8853 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8855 u64 rt_period_us) 8854 u64 rt_period_us)
8856 { 8855 {
8857 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 8856 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8858 } 8857 }
8859 8858
8860 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 8859 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8861 { 8860 {
8862 return sched_group_rt_period(cgroup_tg(cgrp)); 8861 return sched_group_rt_period(cgroup_tg(cgrp));
8863 } 8862 }
8864 #endif /* CONFIG_RT_GROUP_SCHED */ 8863 #endif /* CONFIG_RT_GROUP_SCHED */
8865 8864
8866 static struct cftype cpu_files[] = { 8865 static struct cftype cpu_files[] = {
8867 #ifdef CONFIG_FAIR_GROUP_SCHED 8866 #ifdef CONFIG_FAIR_GROUP_SCHED
8868 { 8867 {
8869 .name = "shares", 8868 .name = "shares",
8870 .read_u64 = cpu_shares_read_u64, 8869 .read_u64 = cpu_shares_read_u64,
8871 .write_u64 = cpu_shares_write_u64, 8870 .write_u64 = cpu_shares_write_u64,
8872 }, 8871 },
8873 #endif 8872 #endif
8874 #ifdef CONFIG_RT_GROUP_SCHED 8873 #ifdef CONFIG_RT_GROUP_SCHED
8875 { 8874 {
8876 .name = "rt_runtime_us", 8875 .name = "rt_runtime_us",
8877 .read_s64 = cpu_rt_runtime_read, 8876 .read_s64 = cpu_rt_runtime_read,
8878 .write_s64 = cpu_rt_runtime_write, 8877 .write_s64 = cpu_rt_runtime_write,
8879 }, 8878 },
8880 { 8879 {
8881 .name = "rt_period_us", 8880 .name = "rt_period_us",
8882 .read_u64 = cpu_rt_period_read_uint, 8881 .read_u64 = cpu_rt_period_read_uint,
8883 .write_u64 = cpu_rt_period_write_uint, 8882 .write_u64 = cpu_rt_period_write_uint,
8884 }, 8883 },
8885 #endif 8884 #endif
8886 }; 8885 };
8887 8886
8888 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8887 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8889 { 8888 {
8890 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 8889 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8891 } 8890 }
8892 8891
8893 struct cgroup_subsys cpu_cgroup_subsys = { 8892 struct cgroup_subsys cpu_cgroup_subsys = {
8894 .name = "cpu", 8893 .name = "cpu",
8895 .create = cpu_cgroup_create, 8894 .create = cpu_cgroup_create,
8896 .destroy = cpu_cgroup_destroy, 8895 .destroy = cpu_cgroup_destroy,
8897 .can_attach_task = cpu_cgroup_can_attach_task, 8896 .can_attach_task = cpu_cgroup_can_attach_task,
8898 .attach_task = cpu_cgroup_attach_task, 8897 .attach_task = cpu_cgroup_attach_task,
8899 .exit = cpu_cgroup_exit, 8898 .exit = cpu_cgroup_exit,
8900 .populate = cpu_cgroup_populate, 8899 .populate = cpu_cgroup_populate,
8901 .subsys_id = cpu_cgroup_subsys_id, 8900 .subsys_id = cpu_cgroup_subsys_id,
8902 .early_init = 1, 8901 .early_init = 1,
8903 }; 8902 };
8904 8903
8905 #endif /* CONFIG_CGROUP_SCHED */ 8904 #endif /* CONFIG_CGROUP_SCHED */
8906 8905
8907 #ifdef CONFIG_CGROUP_CPUACCT 8906 #ifdef CONFIG_CGROUP_CPUACCT
8908 8907
8909 /* 8908 /*
8910 * CPU accounting code for task groups. 8909 * CPU accounting code for task groups.
8911 * 8910 *
8912 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 8911 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8913 * (balbir@in.ibm.com). 8912 * (balbir@in.ibm.com).
8914 */ 8913 */
8915 8914
8916 /* track cpu usage of a group of tasks and its child groups */ 8915 /* track cpu usage of a group of tasks and its child groups */
8917 struct cpuacct { 8916 struct cpuacct {
8918 struct cgroup_subsys_state css; 8917 struct cgroup_subsys_state css;
8919 /* cpuusage holds a pointer to a u64-type object on every cpu */ 8918 /* cpuusage holds a pointer to a u64-type object on every cpu */
8920 u64 __percpu *cpuusage; 8919 u64 __percpu *cpuusage;
8921 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8920 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
8922 struct cpuacct *parent; 8921 struct cpuacct *parent;
8923 }; 8922 };
8924 8923
8925 struct cgroup_subsys cpuacct_subsys; 8924 struct cgroup_subsys cpuacct_subsys;
8926 8925
8927 /* return cpu accounting group corresponding to this container */ 8926 /* return cpu accounting group corresponding to this container */
8928 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 8927 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
8929 { 8928 {
8930 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 8929 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
8931 struct cpuacct, css); 8930 struct cpuacct, css);
8932 } 8931 }
8933 8932
8934 /* return cpu accounting group to which this task belongs */ 8933 /* return cpu accounting group to which this task belongs */
8935 static inline struct cpuacct *task_ca(struct task_struct *tsk) 8934 static inline struct cpuacct *task_ca(struct task_struct *tsk)
8936 { 8935 {
8937 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 8936 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
8938 struct cpuacct, css); 8937 struct cpuacct, css);
8939 } 8938 }
8940 8939
8941 /* create a new cpu accounting group */ 8940 /* create a new cpu accounting group */
8942 static struct cgroup_subsys_state *cpuacct_create( 8941 static struct cgroup_subsys_state *cpuacct_create(
8943 struct cgroup_subsys *ss, struct cgroup *cgrp) 8942 struct cgroup_subsys *ss, struct cgroup *cgrp)
8944 { 8943 {
8945 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 8944 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8946 int i; 8945 int i;
8947 8946
8948 if (!ca) 8947 if (!ca)
8949 goto out; 8948 goto out;
8950 8949
8951 ca->cpuusage = alloc_percpu(u64); 8950 ca->cpuusage = alloc_percpu(u64);
8952 if (!ca->cpuusage) 8951 if (!ca->cpuusage)
8953 goto out_free_ca; 8952 goto out_free_ca;
8954 8953
8955 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8954 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8956 if (percpu_counter_init(&ca->cpustat[i], 0)) 8955 if (percpu_counter_init(&ca->cpustat[i], 0))
8957 goto out_free_counters; 8956 goto out_free_counters;
8958 8957
8959 if (cgrp->parent) 8958 if (cgrp->parent)
8960 ca->parent = cgroup_ca(cgrp->parent); 8959 ca->parent = cgroup_ca(cgrp->parent);
8961 8960
8962 return &ca->css; 8961 return &ca->css;
8963 8962
8964 out_free_counters: 8963 out_free_counters:
8965 while (--i >= 0) 8964 while (--i >= 0)
8966 percpu_counter_destroy(&ca->cpustat[i]); 8965 percpu_counter_destroy(&ca->cpustat[i]);
8967 free_percpu(ca->cpuusage); 8966 free_percpu(ca->cpuusage);
8968 out_free_ca: 8967 out_free_ca:
8969 kfree(ca); 8968 kfree(ca);
8970 out: 8969 out:
8971 return ERR_PTR(-ENOMEM); 8970 return ERR_PTR(-ENOMEM);
8972 } 8971 }
8973 8972
8974 /* destroy an existing cpu accounting group */ 8973 /* destroy an existing cpu accounting group */
8975 static void 8974 static void
8976 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8975 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8977 { 8976 {
8978 struct cpuacct *ca = cgroup_ca(cgrp); 8977 struct cpuacct *ca = cgroup_ca(cgrp);
8979 int i; 8978 int i;
8980 8979
8981 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8980 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8982 percpu_counter_destroy(&ca->cpustat[i]); 8981 percpu_counter_destroy(&ca->cpustat[i]);
8983 free_percpu(ca->cpuusage); 8982 free_percpu(ca->cpuusage);
8984 kfree(ca); 8983 kfree(ca);
8985 } 8984 }
8986 8985
8987 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 8986 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8988 { 8987 {
8989 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8988 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8990 u64 data; 8989 u64 data;
8991 8990
8992 #ifndef CONFIG_64BIT 8991 #ifndef CONFIG_64BIT
8993 /* 8992 /*
8994 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8993 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8995 */ 8994 */
8996 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8995 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8997 data = *cpuusage; 8996 data = *cpuusage;
8998 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8997 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8999 #else 8998 #else
9000 data = *cpuusage; 8999 data = *cpuusage;
9001 #endif 9000 #endif
9002 9001
9003 return data; 9002 return data;
9004 } 9003 }
9005 9004
9006 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9005 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9007 { 9006 {
9008 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9007 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9009 9008
9010 #ifndef CONFIG_64BIT 9009 #ifndef CONFIG_64BIT
9011 /* 9010 /*
9012 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 9011 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9013 */ 9012 */
9014 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 9013 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9015 *cpuusage = val; 9014 *cpuusage = val;
9016 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 9015 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9017 #else 9016 #else
9018 *cpuusage = val; 9017 *cpuusage = val;
9019 #endif 9018 #endif
9020 } 9019 }
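On 32-bit machines a u64 cannot be loaded or stored in one instruction, which is why the two helpers above take rq->lock around the access. A rough userspace analogue of the same guard, with a pthread mutex standing in for the runqueue lock and an invented struct for the per-cpu slot:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for one per-cpu usage slot plus its lock. */
struct usage_slot {
	pthread_mutex_t lock;
	uint64_t cpuusage;
};

static uint64_t slot_read(struct usage_slot *s)
{
	uint64_t val;

	/* Lock so a concurrent writer cannot leave us with a torn value. */
	pthread_mutex_lock(&s->lock);
	val = s->cpuusage;
	pthread_mutex_unlock(&s->lock);
	return val;
}

static void slot_write(struct usage_slot *s, uint64_t val)
{
	pthread_mutex_lock(&s->lock);
	s->cpuusage = val;
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct usage_slot s = { PTHREAD_MUTEX_INITIALIZER, 0 };

	slot_write(&s, 123456789ULL);
	printf("%llu\n", (unsigned long long)slot_read(&s));
	return 0;
}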
9021 9020
9022 /* return total cpu usage (in nanoseconds) of a group */ 9021 /* return total cpu usage (in nanoseconds) of a group */
9023 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9022 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9024 { 9023 {
9025 struct cpuacct *ca = cgroup_ca(cgrp); 9024 struct cpuacct *ca = cgroup_ca(cgrp);
9026 u64 totalcpuusage = 0; 9025 u64 totalcpuusage = 0;
9027 int i; 9026 int i;
9028 9027
9029 for_each_present_cpu(i) 9028 for_each_present_cpu(i)
9030 totalcpuusage += cpuacct_cpuusage_read(ca, i); 9029 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9031 9030
9032 return totalcpuusage; 9031 return totalcpuusage;
9033 } 9032 }
9034 9033
9035 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 9034 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9036 u64 reset) 9035 u64 reset)
9037 { 9036 {
9038 struct cpuacct *ca = cgroup_ca(cgrp); 9037 struct cpuacct *ca = cgroup_ca(cgrp);
9039 int err = 0; 9038 int err = 0;
9040 int i; 9039 int i;
9041 9040
9042 if (reset) { 9041 if (reset) {
9043 err = -EINVAL; 9042 err = -EINVAL;
9044 goto out; 9043 goto out;
9045 } 9044 }
9046 9045
9047 for_each_present_cpu(i) 9046 for_each_present_cpu(i)
9048 cpuacct_cpuusage_write(ca, i, 0); 9047 cpuacct_cpuusage_write(ca, i, 0);
9049 9048
9050 out: 9049 out:
9051 return err; 9050 return err;
9052 } 9051 }
9053 9052
9054 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 9053 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9055 struct seq_file *m) 9054 struct seq_file *m)
9056 { 9055 {
9057 struct cpuacct *ca = cgroup_ca(cgroup); 9056 struct cpuacct *ca = cgroup_ca(cgroup);
9058 u64 percpu; 9057 u64 percpu;
9059 int i; 9058 int i;
9060 9059
9061 for_each_present_cpu(i) { 9060 for_each_present_cpu(i) {
9062 percpu = cpuacct_cpuusage_read(ca, i); 9061 percpu = cpuacct_cpuusage_read(ca, i);
9063 seq_printf(m, "%llu ", (unsigned long long) percpu); 9062 seq_printf(m, "%llu ", (unsigned long long) percpu);
9064 } 9063 }
9065 seq_printf(m, "\n"); 9064 seq_printf(m, "\n");
9066 return 0; 9065 return 0;
9067 } 9066 }
9068 9067
9069 static const char *cpuacct_stat_desc[] = { 9068 static const char *cpuacct_stat_desc[] = {
9070 [CPUACCT_STAT_USER] = "user", 9069 [CPUACCT_STAT_USER] = "user",
9071 [CPUACCT_STAT_SYSTEM] = "system", 9070 [CPUACCT_STAT_SYSTEM] = "system",
9072 }; 9071 };
9073 9072
9074 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 9073 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9075 struct cgroup_map_cb *cb) 9074 struct cgroup_map_cb *cb)
9076 { 9075 {
9077 struct cpuacct *ca = cgroup_ca(cgrp); 9076 struct cpuacct *ca = cgroup_ca(cgrp);
9078 int i; 9077 int i;
9079 9078
9080 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 9079 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9081 s64 val = percpu_counter_read(&ca->cpustat[i]); 9080 s64 val = percpu_counter_read(&ca->cpustat[i]);
9082 val = cputime64_to_clock_t(val); 9081 val = cputime64_to_clock_t(val);
9083 cb->fill(cb, cpuacct_stat_desc[i], val); 9082 cb->fill(cb, cpuacct_stat_desc[i], val);
9084 } 9083 }
9085 return 0; 9084 return 0;
9086 } 9085 }
9087 9086
9088 static struct cftype files[] = { 9087 static struct cftype files[] = {
9089 { 9088 {
9090 .name = "usage", 9089 .name = "usage",
9091 .read_u64 = cpuusage_read, 9090 .read_u64 = cpuusage_read,
9092 .write_u64 = cpuusage_write, 9091 .write_u64 = cpuusage_write,
9093 }, 9092 },
9094 { 9093 {
9095 .name = "usage_percpu", 9094 .name = "usage_percpu",
9096 .read_seq_string = cpuacct_percpu_seq_read, 9095 .read_seq_string = cpuacct_percpu_seq_read,
9097 }, 9096 },
9098 { 9097 {
9099 .name = "stat", 9098 .name = "stat",
9100 .read_map = cpuacct_stats_show, 9099 .read_map = cpuacct_stats_show,
9101 }, 9100 },
9102 }; 9101 };
9103 9102
9104 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9103 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9105 { 9104 {
9106 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 9105 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9107 } 9106 }
9108 9107
9109 /* 9108 /*
9110 * charge this task's execution time to its accounting group. 9109 * charge this task's execution time to its accounting group.
9111 * 9110 *
9112 * called with rq->lock held. 9111 * called with rq->lock held.
9113 */ 9112 */
9114 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9113 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9115 { 9114 {
9116 struct cpuacct *ca; 9115 struct cpuacct *ca;
9117 int cpu; 9116 int cpu;
9118 9117
9119 if (unlikely(!cpuacct_subsys.active)) 9118 if (unlikely(!cpuacct_subsys.active))
9120 return; 9119 return;
9121 9120
9122 cpu = task_cpu(tsk); 9121 cpu = task_cpu(tsk);
9123 9122
9124 rcu_read_lock(); 9123 rcu_read_lock();
9125 9124
9126 ca = task_ca(tsk); 9125 ca = task_ca(tsk);
9127 9126
9128 for (; ca; ca = ca->parent) { 9127 for (; ca; ca = ca->parent) {
9129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9128 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9130 *cpuusage += cputime; 9129 *cpuusage += cputime;
9131 } 9130 }
9132 9131
9133 rcu_read_unlock(); 9132 rcu_read_unlock();
9134 } 9133 }
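
cpuacct_charge() above walks ca->parent so the cputime lands in the task's group and in every ancestor; usage reported for a parent cgroup therefore includes its children. A small, self-contained illustration of that hierarchical walk (the struct and values below are hypothetical, not the kernel's types):

#include <stdio.h>

struct acct {
        const char *name;
        unsigned long long usage;
        struct acct *parent;
};

static void charge(struct acct *ca, unsigned long long cputime)
{
        for (; ca; ca = ca->parent)     /* walk up to the root, charging each level */
                ca->usage += cputime;
}

int main(void)
{
        struct acct root  = { "root",  0, NULL  };
        struct acct child = { "child", 0, &root };

        charge(&child, 1000);           /* hypothetical cputime units */
        printf("%s=%llu %s=%llu\n", child.name, child.usage,
               root.name, root.usage);
        return 0;
}
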
9135 9134
9136 /* 9135 /*
9137 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large 9136 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9138 * in cputime_t units. As a result, cpuacct_update_stats calls 9137 * in cputime_t units. As a result, cpuacct_update_stats calls
9139 * percpu_counter_add with values large enough to always overflow the 9138 * percpu_counter_add with values large enough to always overflow the
9140 * per cpu batch limit causing bad SMP scalability. 9139 * per cpu batch limit causing bad SMP scalability.
9141 * 9140 *
9142 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we 9141 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9143 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled 9142 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9144 * and enabled. We cap it at INT_MAX which is the largest allowed batch value. 9143 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9145 */ 9144 */
9146 #ifdef CONFIG_SMP 9145 #ifdef CONFIG_SMP
9147 #define CPUACCT_BATCH \ 9146 #define CPUACCT_BATCH \
9148 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) 9147 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9149 #else 9148 #else
9150 #define CPUACCT_BATCH 0 9149 #define CPUACCT_BATCH 0
9151 #endif 9150 #endif
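
The comment above argues that the batch must scale with cputime_one_jiffy and still stay below INT_MAX. A minimal sketch of that capping arithmetic, using hypothetical input values rather than the kernel's real percpu_counter_batch and cputime_one_jiffy:

#include <limits.h>
#include <stdio.h>

/* Hypothetical inputs for illustration only; the real values depend on the
 * kernel configuration (HZ, CONFIG_VIRT_CPU_ACCOUNTING, number of CPUs). */
static int capped_batch(long long percpu_counter_batch, long long cputime_one_jiffy)
{
        long long scaled = percpu_counter_batch * cputime_one_jiffy;

        /* Same idea as min_t(long, ..., INT_MAX): never exceed the largest
         * batch value the percpu counter API accepts. */
        return scaled < INT_MAX ? (int)scaled : INT_MAX;
}

int main(void)
{
        printf("small jiffy: batch = %d\n", capped_batch(32, 10000));
        printf("huge jiffy:  batch = %d\n", capped_batch(32, 1000000000));
        return 0;
}
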
9152 9151
9153 /* 9152 /*
9154 * Charge the system/user time to the task's accounting group. 9153 * Charge the system/user time to the task's accounting group.
9155 */ 9154 */
9156 static void cpuacct_update_stats(struct task_struct *tsk, 9155 static void cpuacct_update_stats(struct task_struct *tsk,
9157 enum cpuacct_stat_index idx, cputime_t val) 9156 enum cpuacct_stat_index idx, cputime_t val)
9158 { 9157 {
9159 struct cpuacct *ca; 9158 struct cpuacct *ca;
9160 int batch = CPUACCT_BATCH; 9159 int batch = CPUACCT_BATCH;
9161 9160
9162 if (unlikely(!cpuacct_subsys.active)) 9161 if (unlikely(!cpuacct_subsys.active))
9163 return; 9162 return;
9164 9163
9165 rcu_read_lock(); 9164 rcu_read_lock();
9166 ca = task_ca(tsk); 9165 ca = task_ca(tsk);
9167 9166
9168 do { 9167 do {
9169 __percpu_counter_add(&ca->cpustat[idx], val, batch); 9168 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9170 ca = ca->parent; 9169 ca = ca->parent;
9171 } while (ca); 9170 } while (ca);
9172 rcu_read_unlock(); 9171 rcu_read_unlock();
9173 } 9172 }
9174 9173
9175 struct cgroup_subsys cpuacct_subsys = { 9174 struct cgroup_subsys cpuacct_subsys = {
9176 .name = "cpuacct", 9175 .name = "cpuacct",
9177 .create = cpuacct_create, 9176 .create = cpuacct_create,
9178 .destroy = cpuacct_destroy, 9177 .destroy = cpuacct_destroy,
9179 .populate = cpuacct_populate, 9178 .populate = cpuacct_populate,
9180 .subsys_id = cpuacct_subsys_id, 9179 .subsys_id = cpuacct_subsys_id,
9181 }; 9180 };
9182 #endif /* CONFIG_CGROUP_CPUACCT */ 9181 #endif /* CONFIG_CGROUP_CPUACCT */
9183 9182
9184 9183
net/mac80211/sta_info.c
1 /* 1 /*
2 * Copyright 2002-2005, Instant802 Networks, Inc. 2 * Copyright 2002-2005, Instant802 Networks, Inc.
3 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 3 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 */ 8 */
9 9
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/netdevice.h> 12 #include <linux/netdevice.h>
13 #include <linux/types.h> 13 #include <linux/types.h>
14 #include <linux/slab.h> 14 #include <linux/slab.h>
15 #include <linux/skbuff.h> 15 #include <linux/skbuff.h>
16 #include <linux/if_arp.h> 16 #include <linux/if_arp.h>
17 #include <linux/timer.h> 17 #include <linux/timer.h>
18 #include <linux/rtnetlink.h> 18 #include <linux/rtnetlink.h>
19 19
20 #include <net/mac80211.h> 20 #include <net/mac80211.h>
21 #include "ieee80211_i.h" 21 #include "ieee80211_i.h"
22 #include "driver-ops.h" 22 #include "driver-ops.h"
23 #include "rate.h" 23 #include "rate.h"
24 #include "sta_info.h" 24 #include "sta_info.h"
25 #include "debugfs_sta.h" 25 #include "debugfs_sta.h"
26 #include "mesh.h" 26 #include "mesh.h"
27 27
28 /** 28 /**
29 * DOC: STA information lifetime rules 29 * DOC: STA information lifetime rules
30 * 30 *
31 * STA info structures (&struct sta_info) are managed in a hash table 31 * STA info structures (&struct sta_info) are managed in a hash table
32 * for faster lookup and a list for iteration. They are managed using 32 * for faster lookup and a list for iteration. They are managed using
33 * RCU, i.e. access to the list and hash table is protected by RCU. 33 * RCU, i.e. access to the list and hash table is protected by RCU.
34 * 34 *
35 * Upon allocating a STA info structure with sta_info_alloc(), the caller 35 * Upon allocating a STA info structure with sta_info_alloc(), the caller
36 * owns that structure. It must then insert it into the hash table using 36 * owns that structure. It must then insert it into the hash table using
37 * either sta_info_insert() or sta_info_insert_rcu(); only in the latter 37 * either sta_info_insert() or sta_info_insert_rcu(); only in the latter
38 * case (which acquires an rcu read section but must not be called from 38 * case (which acquires an rcu read section but must not be called from
39 * within one) will the pointer still be valid after the call. Note that 39 * within one) will the pointer still be valid after the call. Note that
40 * the caller may not do much with the STA info before inserting it, in 40 * the caller may not do much with the STA info before inserting it, in
41 * particular, it may not start any mesh peer link management or add 41 * particular, it may not start any mesh peer link management or add
42 * encryption keys. 42 * encryption keys.
43 * 43 *
44 * When the insertion fails (sta_info_insert() returns non-zero), the 44 * When the insertion fails (sta_info_insert() returns non-zero), the
45 * structure will have been freed by sta_info_insert()! 45 * structure will have been freed by sta_info_insert()!
46 * 46 *
47 * Station entries are added by mac80211 when you establish a link with a 47 * Station entries are added by mac80211 when you establish a link with a
48 * peer. This means different things for the different type of interfaces 48 * peer. This means different things for the different type of interfaces
49 * we support. For a regular station this means we add the AP sta when we 49 * we support. For a regular station this means we add the AP sta when we
50 * receive an association response from the AP. For IBSS this occurs when 50 * receive an association response from the AP. For IBSS this occurs when
51 * we get to know about a peer on the same IBSS. For WDS we add the sta for 51 * we get to know about a peer on the same IBSS. For WDS we add the sta for
52 * the peer immediately upon device open. When using AP mode we add each 52 * the peer immediately upon device open. When using AP mode we add each
53 * station upon request from userspace through nl80211. 53 * station upon request from userspace through nl80211.
54 * 54 *
55 * In order to remove a STA info structure, various sta_info_destroy_*() 55 * In order to remove a STA info structure, various sta_info_destroy_*()
56 * calls are available. 56 * calls are available.
57 * 57 *
58 * There is no concept of ownership on a STA entry, each structure is 58 * There is no concept of ownership on a STA entry, each structure is
59 * owned by the global hash table/list until it is removed. All users of 59 * owned by the global hash table/list until it is removed. All users of
60 * the structure need to be RCU protected so that the structure won't be 60 * the structure need to be RCU protected so that the structure won't be
61 * freed before they are done using it. 61 * freed before they are done using it.
62 */ 62 */
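
A minimal sketch of the alloc/insert/lookup flow described by the lifetime rules above, assuming mac80211 context, with error handling trimmed and sdata/addr supplied by the caller:

/* Sketch only: mac80211 context assumed, error handling trimmed. */
struct sta_info *sta;
int err;

sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
if (!sta)
        return -ENOMEM;

/* On error the structure has already been freed by sta_info_insert(). */
err = sta_info_insert(sta);
if (err)
        return err;

/* Readers look the entry up again under RCU; it stays valid only while
 * the read-side critical section is held. */
rcu_read_lock();
sta = sta_info_get(sdata, addr);
rcu_read_unlock();
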
63 63
64 /* Caller must hold local->sta_lock */ 64 /* Caller must hold local->sta_lock */
65 static int sta_info_hash_del(struct ieee80211_local *local, 65 static int sta_info_hash_del(struct ieee80211_local *local,
66 struct sta_info *sta) 66 struct sta_info *sta)
67 { 67 {
68 struct sta_info *s; 68 struct sta_info *s;
69 69
70 s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)], 70 s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)],
71 lockdep_is_held(&local->sta_lock)); 71 lockdep_is_held(&local->sta_lock));
72 if (!s) 72 if (!s)
73 return -ENOENT; 73 return -ENOENT;
74 if (s == sta) { 74 if (s == sta) {
75 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], 75 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)],
76 s->hnext); 76 s->hnext);
77 return 0; 77 return 0;
78 } 78 }
79 79
80 while (rcu_access_pointer(s->hnext) && 80 while (rcu_access_pointer(s->hnext) &&
81 rcu_access_pointer(s->hnext) != sta) 81 rcu_access_pointer(s->hnext) != sta)
82 s = rcu_dereference_protected(s->hnext, 82 s = rcu_dereference_protected(s->hnext,
83 lockdep_is_held(&local->sta_lock)); 83 lockdep_is_held(&local->sta_lock));
84 if (rcu_access_pointer(s->hnext)) { 84 if (rcu_access_pointer(s->hnext)) {
85 rcu_assign_pointer(s->hnext, sta->hnext); 85 rcu_assign_pointer(s->hnext, sta->hnext);
86 return 0; 86 return 0;
87 } 87 }
88 88
89 return -ENOENT; 89 return -ENOENT;
90 } 90 }
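
sta_info_hash_del() runs entirely on the update side: it dereferences with rcu_dereference_protected() because sta_lock is held, peeks at pointer values with rcu_access_pointer(), and republishes with rcu_assign_pointer(). A hedged, generic sketch of that split (kernel context assumed; head, victim, p and my_lock are hypothetical names):

/* Sketch only (kernel context assumed). "head" is a hypothetical __rcu
 * pointer, "victim" the entry being unlinked, "my_lock" its update lock. */
spin_lock(&my_lock);

/* The update-side lock is held, so no RCU read-side section is needed. */
p = rcu_dereference_protected(head, lockdep_is_held(&my_lock));

/* Only the pointer value is compared here, hence rcu_access_pointer(). */
if (rcu_access_pointer(head) == victim)
        rcu_assign_pointer(head, p->next);

spin_unlock(&my_lock);
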
91 91
92 /* protected by RCU */ 92 /* protected by RCU */
93 struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, 93 struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
94 const u8 *addr) 94 const u8 *addr)
95 { 95 {
96 struct ieee80211_local *local = sdata->local; 96 struct ieee80211_local *local = sdata->local;
97 struct sta_info *sta; 97 struct sta_info *sta;
98 98
99 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], 99 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
100 rcu_read_lock_held() ||
101 lockdep_is_held(&local->sta_lock) || 100 lockdep_is_held(&local->sta_lock) ||
102 lockdep_is_held(&local->sta_mtx)); 101 lockdep_is_held(&local->sta_mtx));
103 while (sta) { 102 while (sta) {
104 if (sta->sdata == sdata && 103 if (sta->sdata == sdata &&
105 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) 104 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
106 break; 105 break;
107 sta = rcu_dereference_check(sta->hnext, 106 sta = rcu_dereference_check(sta->hnext,
108 rcu_read_lock_held() ||
109 lockdep_is_held(&local->sta_lock) || 107 lockdep_is_held(&local->sta_lock) ||
110 lockdep_is_held(&local->sta_mtx)); 108 lockdep_is_held(&local->sta_mtx));
111 } 109 }
112 return sta; 110 return sta;
113 } 111 }
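
sta_info_get() above passes the lockdep lock checks as the condition to rcu_dereference_check(). A minimal sketch of that calling convention (kernel context assumed; my_table, my_lock, idx and struct item are hypothetical stand-ins):

/* Sketch only: names are hypothetical, kernel context assumed. */
struct item *it;

rcu_read_lock();
it = rcu_dereference_check(my_table[idx],
                           lockdep_is_held(&my_lock));
if (it)
        pr_debug("found %p\n", it);
rcu_read_unlock();
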
114 112
115 /* 113 /*
116 * Get sta info either from the specified interface 114 * Get sta info either from the specified interface
117 * or from one of its vlans 115 * or from one of its vlans
118 */ 116 */
119 struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, 117 struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
120 const u8 *addr) 118 const u8 *addr)
121 { 119 {
122 struct ieee80211_local *local = sdata->local; 120 struct ieee80211_local *local = sdata->local;
123 struct sta_info *sta; 121 struct sta_info *sta;
124 122
125 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], 123 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
126 rcu_read_lock_held() ||
127 lockdep_is_held(&local->sta_lock) || 124 lockdep_is_held(&local->sta_lock) ||
128 lockdep_is_held(&local->sta_mtx)); 125 lockdep_is_held(&local->sta_mtx));
129 while (sta) { 126 while (sta) {
130 if ((sta->sdata == sdata || 127 if ((sta->sdata == sdata ||
131 (sta->sdata->bss && sta->sdata->bss == sdata->bss)) && 128 (sta->sdata->bss && sta->sdata->bss == sdata->bss)) &&
132 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) 129 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
133 break; 130 break;
134 sta = rcu_dereference_check(sta->hnext, 131 sta = rcu_dereference_check(sta->hnext,
135 rcu_read_lock_held() ||
136 lockdep_is_held(&local->sta_lock) || 132 lockdep_is_held(&local->sta_lock) ||
137 lockdep_is_held(&local->sta_mtx)); 133 lockdep_is_held(&local->sta_mtx));
138 } 134 }
139 return sta; 135 return sta;
140 } 136 }
141 137
142 struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, 138 struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
143 int idx) 139 int idx)
144 { 140 {
145 struct ieee80211_local *local = sdata->local; 141 struct ieee80211_local *local = sdata->local;
146 struct sta_info *sta; 142 struct sta_info *sta;
147 int i = 0; 143 int i = 0;
148 144
149 list_for_each_entry_rcu(sta, &local->sta_list, list) { 145 list_for_each_entry_rcu(sta, &local->sta_list, list) {
150 if (sdata != sta->sdata) 146 if (sdata != sta->sdata)
151 continue; 147 continue;
152 if (i < idx) { 148 if (i < idx) {
153 ++i; 149 ++i;
154 continue; 150 continue;
155 } 151 }
156 return sta; 152 return sta;
157 } 153 }
158 154
159 return NULL; 155 return NULL;
160 } 156 }
161 157
162 /** 158 /**
163 * __sta_info_free - internal STA free helper 159 * __sta_info_free - internal STA free helper
164 * 160 *
165 * @local: pointer to the global information 161 * @local: pointer to the global information
166 * @sta: STA info to free 162 * @sta: STA info to free
167 * 163 *
168 * This function must undo everything done by sta_info_alloc() 164 * This function must undo everything done by sta_info_alloc()
169 * that may happen before sta_info_insert(). 165 * that may happen before sta_info_insert().
170 */ 166 */
171 static void __sta_info_free(struct ieee80211_local *local, 167 static void __sta_info_free(struct ieee80211_local *local,
172 struct sta_info *sta) 168 struct sta_info *sta)
173 { 169 {
174 if (sta->rate_ctrl) { 170 if (sta->rate_ctrl) {
175 rate_control_free_sta(sta); 171 rate_control_free_sta(sta);
176 rate_control_put(sta->rate_ctrl); 172 rate_control_put(sta->rate_ctrl);
177 } 173 }
178 174
179 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 175 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
180 wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr); 176 wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr);
181 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 177 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
182 178
183 kfree(sta); 179 kfree(sta);
184 } 180 }
185 181
186 /* Caller must hold local->sta_lock */ 182 /* Caller must hold local->sta_lock */
187 static void sta_info_hash_add(struct ieee80211_local *local, 183 static void sta_info_hash_add(struct ieee80211_local *local,
188 struct sta_info *sta) 184 struct sta_info *sta)
189 { 185 {
190 sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)]; 186 sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)];
191 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta); 187 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta);
192 } 188 }
193 189
194 static void sta_unblock(struct work_struct *wk) 190 static void sta_unblock(struct work_struct *wk)
195 { 191 {
196 struct sta_info *sta; 192 struct sta_info *sta;
197 193
198 sta = container_of(wk, struct sta_info, drv_unblock_wk); 194 sta = container_of(wk, struct sta_info, drv_unblock_wk);
199 195
200 if (sta->dead) 196 if (sta->dead)
201 return; 197 return;
202 198
203 if (!test_sta_flags(sta, WLAN_STA_PS_STA)) 199 if (!test_sta_flags(sta, WLAN_STA_PS_STA))
204 ieee80211_sta_ps_deliver_wakeup(sta); 200 ieee80211_sta_ps_deliver_wakeup(sta);
205 else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) { 201 else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) {
206 clear_sta_flags(sta, WLAN_STA_PS_DRIVER); 202 clear_sta_flags(sta, WLAN_STA_PS_DRIVER);
207 ieee80211_sta_ps_deliver_poll_response(sta); 203 ieee80211_sta_ps_deliver_poll_response(sta);
208 } else 204 } else
209 clear_sta_flags(sta, WLAN_STA_PS_DRIVER); 205 clear_sta_flags(sta, WLAN_STA_PS_DRIVER);
210 } 206 }
211 207
212 static int sta_prepare_rate_control(struct ieee80211_local *local, 208 static int sta_prepare_rate_control(struct ieee80211_local *local,
213 struct sta_info *sta, gfp_t gfp) 209 struct sta_info *sta, gfp_t gfp)
214 { 210 {
215 if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) 211 if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
216 return 0; 212 return 0;
217 213
218 sta->rate_ctrl = rate_control_get(local->rate_ctrl); 214 sta->rate_ctrl = rate_control_get(local->rate_ctrl);
219 sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, 215 sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl,
220 &sta->sta, gfp); 216 &sta->sta, gfp);
221 if (!sta->rate_ctrl_priv) { 217 if (!sta->rate_ctrl_priv) {
222 rate_control_put(sta->rate_ctrl); 218 rate_control_put(sta->rate_ctrl);
223 return -ENOMEM; 219 return -ENOMEM;
224 } 220 }
225 221
226 return 0; 222 return 0;
227 } 223 }
228 224
229 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, 225 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
230 u8 *addr, gfp_t gfp) 226 u8 *addr, gfp_t gfp)
231 { 227 {
232 struct ieee80211_local *local = sdata->local; 228 struct ieee80211_local *local = sdata->local;
233 struct sta_info *sta; 229 struct sta_info *sta;
234 struct timespec uptime; 230 struct timespec uptime;
235 int i; 231 int i;
236 232
237 sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp); 233 sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp);
238 if (!sta) 234 if (!sta)
239 return NULL; 235 return NULL;
240 236
241 spin_lock_init(&sta->lock); 237 spin_lock_init(&sta->lock);
242 spin_lock_init(&sta->flaglock); 238 spin_lock_init(&sta->flaglock);
243 INIT_WORK(&sta->drv_unblock_wk, sta_unblock); 239 INIT_WORK(&sta->drv_unblock_wk, sta_unblock);
244 INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); 240 INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
245 mutex_init(&sta->ampdu_mlme.mtx); 241 mutex_init(&sta->ampdu_mlme.mtx);
246 242
247 memcpy(sta->sta.addr, addr, ETH_ALEN); 243 memcpy(sta->sta.addr, addr, ETH_ALEN);
248 sta->local = local; 244 sta->local = local;
249 sta->sdata = sdata; 245 sta->sdata = sdata;
250 sta->last_rx = jiffies; 246 sta->last_rx = jiffies;
251 247
252 do_posix_clock_monotonic_gettime(&uptime); 248 do_posix_clock_monotonic_gettime(&uptime);
253 sta->last_connected = uptime.tv_sec; 249 sta->last_connected = uptime.tv_sec;
254 ewma_init(&sta->avg_signal, 1024, 8); 250 ewma_init(&sta->avg_signal, 1024, 8);
255 251
256 if (sta_prepare_rate_control(local, sta, gfp)) { 252 if (sta_prepare_rate_control(local, sta, gfp)) {
257 kfree(sta); 253 kfree(sta);
258 return NULL; 254 return NULL;
259 } 255 }
260 256
261 for (i = 0; i < STA_TID_NUM; i++) { 257 for (i = 0; i < STA_TID_NUM; i++) {
262 /* 258 /*
263 * timer_to_tid must be initialized with identity mapping 259 * timer_to_tid must be initialized with identity mapping
264 * to enable session_timer's data differentiation. See 260 * to enable session_timer's data differentiation. See
265 * sta_rx_agg_session_timer_expired for usage. 261 * sta_rx_agg_session_timer_expired for usage.
266 */ 262 */
267 sta->timer_to_tid[i] = i; 263 sta->timer_to_tid[i] = i;
268 } 264 }
269 skb_queue_head_init(&sta->ps_tx_buf); 265 skb_queue_head_init(&sta->ps_tx_buf);
270 skb_queue_head_init(&sta->tx_filtered); 266 skb_queue_head_init(&sta->tx_filtered);
271 267
272 for (i = 0; i < NUM_RX_DATA_QUEUES; i++) 268 for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
273 sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); 269 sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
274 270
275 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 271 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
276 wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr); 272 wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr);
277 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 273 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
278 274
279 #ifdef CONFIG_MAC80211_MESH 275 #ifdef CONFIG_MAC80211_MESH
280 sta->plink_state = NL80211_PLINK_LISTEN; 276 sta->plink_state = NL80211_PLINK_LISTEN;
281 init_timer(&sta->plink_timer); 277 init_timer(&sta->plink_timer);
282 #endif 278 #endif
283 279
284 return sta; 280 return sta;
285 } 281 }
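
sta_info_alloc() reserves local->hw.sta_data_size extra bytes behind the struct so the driver's per-station private data shares the allocation. A small, self-contained illustration of that trailing-area idiom (names and sizes below are hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct example_sta {
        char addr[6];
        /* driver-private area follows the struct in the same allocation */
        char drv_priv[];
};

int main(void)
{
        size_t drv_size = 32;                      /* hypothetical per-driver size */
        struct example_sta *sta;

        sta = calloc(1, sizeof(*sta) + drv_size);  /* kzalloc-like: one zeroed block */
        if (!sta)
                return 1;

        memset(sta->drv_priv, 0xab, drv_size);     /* driver scribbles here */
        printf("struct %zu + priv %zu bytes in one block\n",
               sizeof(*sta), drv_size);
        free(sta);
        return 0;
}
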
286 282
287 static int sta_info_finish_insert(struct sta_info *sta, bool async) 283 static int sta_info_finish_insert(struct sta_info *sta, bool async)
288 { 284 {
289 struct ieee80211_local *local = sta->local; 285 struct ieee80211_local *local = sta->local;
290 struct ieee80211_sub_if_data *sdata = sta->sdata; 286 struct ieee80211_sub_if_data *sdata = sta->sdata;
291 struct station_info sinfo; 287 struct station_info sinfo;
292 unsigned long flags; 288 unsigned long flags;
293 int err = 0; 289 int err = 0;
294 290
295 lockdep_assert_held(&local->sta_mtx); 291 lockdep_assert_held(&local->sta_mtx);
296 292
297 /* notify driver */ 293 /* notify driver */
298 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 294 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
299 sdata = container_of(sdata->bss, 295 sdata = container_of(sdata->bss,
300 struct ieee80211_sub_if_data, 296 struct ieee80211_sub_if_data,
301 u.ap); 297 u.ap);
302 err = drv_sta_add(local, sdata, &sta->sta); 298 err = drv_sta_add(local, sdata, &sta->sta);
303 if (err) { 299 if (err) {
304 if (!async) 300 if (!async)
305 return err; 301 return err;
306 printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)" 302 printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)"
307 " - keeping it anyway.\n", 303 " - keeping it anyway.\n",
308 sdata->name, sta->sta.addr, err); 304 sdata->name, sta->sta.addr, err);
309 } else { 305 } else {
310 sta->uploaded = true; 306 sta->uploaded = true;
311 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 307 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
312 if (async) 308 if (async)
313 wiphy_debug(local->hw.wiphy, 309 wiphy_debug(local->hw.wiphy,
314 "Finished adding IBSS STA %pM\n", 310 "Finished adding IBSS STA %pM\n",
315 sta->sta.addr); 311 sta->sta.addr);
316 #endif 312 #endif
317 } 313 }
318 314
319 sdata = sta->sdata; 315 sdata = sta->sdata;
320 316
321 if (!async) { 317 if (!async) {
322 local->num_sta++; 318 local->num_sta++;
323 local->sta_generation++; 319 local->sta_generation++;
324 smp_mb(); 320 smp_mb();
325 321
326 /* make the station visible */ 322 /* make the station visible */
327 spin_lock_irqsave(&local->sta_lock, flags); 323 spin_lock_irqsave(&local->sta_lock, flags);
328 sta_info_hash_add(local, sta); 324 sta_info_hash_add(local, sta);
329 spin_unlock_irqrestore(&local->sta_lock, flags); 325 spin_unlock_irqrestore(&local->sta_lock, flags);
330 } 326 }
331 327
332 list_add(&sta->list, &local->sta_list); 328 list_add(&sta->list, &local->sta_list);
333 329
334 ieee80211_sta_debugfs_add(sta); 330 ieee80211_sta_debugfs_add(sta);
335 rate_control_add_sta_debugfs(sta); 331 rate_control_add_sta_debugfs(sta);
336 332
337 sinfo.filled = 0; 333 sinfo.filled = 0;
338 sinfo.generation = local->sta_generation; 334 sinfo.generation = local->sta_generation;
339 cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL); 335 cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
340 336
341 337
342 return 0; 338 return 0;
343 } 339 }
344 340
345 static void sta_info_finish_pending(struct ieee80211_local *local) 341 static void sta_info_finish_pending(struct ieee80211_local *local)
346 { 342 {
347 struct sta_info *sta; 343 struct sta_info *sta;
348 unsigned long flags; 344 unsigned long flags;
349 345
350 spin_lock_irqsave(&local->sta_lock, flags); 346 spin_lock_irqsave(&local->sta_lock, flags);
351 while (!list_empty(&local->sta_pending_list)) { 347 while (!list_empty(&local->sta_pending_list)) {
352 sta = list_first_entry(&local->sta_pending_list, 348 sta = list_first_entry(&local->sta_pending_list,
353 struct sta_info, list); 349 struct sta_info, list);
354 list_del(&sta->list); 350 list_del(&sta->list);
355 spin_unlock_irqrestore(&local->sta_lock, flags); 351 spin_unlock_irqrestore(&local->sta_lock, flags);
356 352
357 sta_info_finish_insert(sta, true); 353 sta_info_finish_insert(sta, true);
358 354
359 spin_lock_irqsave(&local->sta_lock, flags); 355 spin_lock_irqsave(&local->sta_lock, flags);
360 } 356 }
361 spin_unlock_irqrestore(&local->sta_lock, flags); 357 spin_unlock_irqrestore(&local->sta_lock, flags);
362 } 358 }
363 359
364 static void sta_info_finish_work(struct work_struct *work) 360 static void sta_info_finish_work(struct work_struct *work)
365 { 361 {
366 struct ieee80211_local *local = 362 struct ieee80211_local *local =
367 container_of(work, struct ieee80211_local, sta_finish_work); 363 container_of(work, struct ieee80211_local, sta_finish_work);
368 364
369 mutex_lock(&local->sta_mtx); 365 mutex_lock(&local->sta_mtx);
370 sta_info_finish_pending(local); 366 sta_info_finish_pending(local);
371 mutex_unlock(&local->sta_mtx); 367 mutex_unlock(&local->sta_mtx);
372 } 368 }
373 369
374 int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) 370 int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
375 { 371 {
376 struct ieee80211_local *local = sta->local; 372 struct ieee80211_local *local = sta->local;
377 struct ieee80211_sub_if_data *sdata = sta->sdata; 373 struct ieee80211_sub_if_data *sdata = sta->sdata;
378 unsigned long flags; 374 unsigned long flags;
379 int err = 0; 375 int err = 0;
380 376
381 /* 377 /*
382 * Can't be a WARN_ON because it can be triggered through a race: 378 * Can't be a WARN_ON because it can be triggered through a race:
383 * something inserts a STA (on one CPU) without holding the RTNL 379 * something inserts a STA (on one CPU) without holding the RTNL
384 * and another CPU turns off the net device. 380 * and another CPU turns off the net device.
385 */ 381 */
386 if (unlikely(!ieee80211_sdata_running(sdata))) { 382 if (unlikely(!ieee80211_sdata_running(sdata))) {
387 err = -ENETDOWN; 383 err = -ENETDOWN;
388 rcu_read_lock(); 384 rcu_read_lock();
389 goto out_free; 385 goto out_free;
390 } 386 }
391 387
392 if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 || 388 if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 ||
393 is_multicast_ether_addr(sta->sta.addr))) { 389 is_multicast_ether_addr(sta->sta.addr))) {
394 err = -EINVAL; 390 err = -EINVAL;
395 rcu_read_lock(); 391 rcu_read_lock();
396 goto out_free; 392 goto out_free;
397 } 393 }
398 394
399 /* 395 /*
400 * In ad-hoc mode, we sometimes need to insert stations 396 * In ad-hoc mode, we sometimes need to insert stations
401 * from tasklet context from the RX path. To avoid races, 397 * from tasklet context from the RX path. To avoid races,
402 * always do so in that case -- see the comment below. 398 * always do so in that case -- see the comment below.
403 */ 399 */
404 if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { 400 if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
405 spin_lock_irqsave(&local->sta_lock, flags); 401 spin_lock_irqsave(&local->sta_lock, flags);
406 /* check if STA exists already */ 402 /* check if STA exists already */
407 if (sta_info_get_bss(sdata, sta->sta.addr)) { 403 if (sta_info_get_bss(sdata, sta->sta.addr)) {
408 spin_unlock_irqrestore(&local->sta_lock, flags); 404 spin_unlock_irqrestore(&local->sta_lock, flags);
409 rcu_read_lock(); 405 rcu_read_lock();
410 err = -EEXIST; 406 err = -EEXIST;
411 goto out_free; 407 goto out_free;
412 } 408 }
413 409
414 local->num_sta++; 410 local->num_sta++;
415 local->sta_generation++; 411 local->sta_generation++;
416 smp_mb(); 412 smp_mb();
417 sta_info_hash_add(local, sta); 413 sta_info_hash_add(local, sta);
418 414
419 list_add_tail(&sta->list, &local->sta_pending_list); 415 list_add_tail(&sta->list, &local->sta_pending_list);
420 416
421 rcu_read_lock(); 417 rcu_read_lock();
422 spin_unlock_irqrestore(&local->sta_lock, flags); 418 spin_unlock_irqrestore(&local->sta_lock, flags);
423 419
424 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 420 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
425 wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n", 421 wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n",
426 sta->sta.addr); 422 sta->sta.addr);
427 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 423 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
428 424
429 ieee80211_queue_work(&local->hw, &local->sta_finish_work); 425 ieee80211_queue_work(&local->hw, &local->sta_finish_work);
430 426
431 return 0; 427 return 0;
432 } 428 }
433 429
434 /* 430 /*
435 * On first glance, this will look racy, because the code 431 * On first glance, this will look racy, because the code
436 * below this point, which inserts a station with sleeping, 432 * below this point, which inserts a station with sleeping,
437 * unlocks the sta_lock between checking existence in the 433 * unlocks the sta_lock between checking existence in the
438 * hash table and inserting into it. 434 * hash table and inserting into it.
439 * 435 *
440 * However, it is not racy against itself because it keeps 436 * However, it is not racy against itself because it keeps
441 * the mutex locked. It still seems to race against the 437 * the mutex locked. It still seems to race against the
442 * above code that atomically inserts the station... That, 438 * above code that atomically inserts the station... That,
443 * however, is not true because the above code can only 439 * however, is not true because the above code can only
444 * be invoked for IBSS interfaces, and the below code will 440 * be invoked for IBSS interfaces, and the below code will
445 * not be -- and the two do not race against each other as 441 * not be -- and the two do not race against each other as
446 * the hash table also keys off the interface. 442 * the hash table also keys off the interface.
447 */ 443 */
448 444
449 might_sleep(); 445 might_sleep();
450 446
451 mutex_lock(&local->sta_mtx); 447 mutex_lock(&local->sta_mtx);
452 448
453 spin_lock_irqsave(&local->sta_lock, flags); 449 spin_lock_irqsave(&local->sta_lock, flags);
454 /* check if STA exists already */ 450 /* check if STA exists already */
455 if (sta_info_get_bss(sdata, sta->sta.addr)) { 451 if (sta_info_get_bss(sdata, sta->sta.addr)) {
456 spin_unlock_irqrestore(&local->sta_lock, flags); 452 spin_unlock_irqrestore(&local->sta_lock, flags);
457 mutex_unlock(&local->sta_mtx); 453 mutex_unlock(&local->sta_mtx);
458 rcu_read_lock(); 454 rcu_read_lock();
459 err = -EEXIST; 455 err = -EEXIST;
460 goto out_free; 456 goto out_free;
461 } 457 }
462 458
463 spin_unlock_irqrestore(&local->sta_lock, flags); 459 spin_unlock_irqrestore(&local->sta_lock, flags);
464 460
465 err = sta_info_finish_insert(sta, false); 461 err = sta_info_finish_insert(sta, false);
466 if (err) { 462 if (err) {
467 mutex_unlock(&local->sta_mtx); 463 mutex_unlock(&local->sta_mtx);
468 rcu_read_lock(); 464 rcu_read_lock();
469 goto out_free; 465 goto out_free;
470 } 466 }
471 467
472 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 468 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
473 wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr); 469 wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr);
474 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 470 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
475 471
476 /* move reference to rcu-protected */ 472 /* move reference to rcu-protected */
477 rcu_read_lock(); 473 rcu_read_lock();
478 mutex_unlock(&local->sta_mtx); 474 mutex_unlock(&local->sta_mtx);
479 475
480 if (ieee80211_vif_is_mesh(&sdata->vif)) 476 if (ieee80211_vif_is_mesh(&sdata->vif))
481 mesh_accept_plinks_update(sdata); 477 mesh_accept_plinks_update(sdata);
482 478
483 return 0; 479 return 0;
484 out_free: 480 out_free:
485 BUG_ON(!err); 481 BUG_ON(!err);
486 __sta_info_free(local, sta); 482 __sta_info_free(local, sta);
487 return err; 483 return err;
488 } 484 }
489 485
490 int sta_info_insert(struct sta_info *sta) 486 int sta_info_insert(struct sta_info *sta)
491 { 487 {
492 int err = sta_info_insert_rcu(sta); 488 int err = sta_info_insert_rcu(sta);
493 489
494 rcu_read_unlock(); 490 rcu_read_unlock();
495 491
496 return err; 492 return err;
497 } 493 }
498 494
499 static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid) 495 static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid)
500 { 496 {
501 /* 497 /*
502 * This format has been mandated by the IEEE specifications, 498 * This format has been mandated by the IEEE specifications,
503 * so this line may not be changed to use the __set_bit() format. 499 * so this line may not be changed to use the __set_bit() format.
504 */ 500 */
505 bss->tim[aid / 8] |= (1 << (aid % 8)); 501 bss->tim[aid / 8] |= (1 << (aid % 8));
506 } 502 }
507 503
508 static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid) 504 static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid)
509 { 505 {
510 /* 506 /*
511 * This format has been mandated by the IEEE specifications, 507 * This format has been mandated by the IEEE specifications,
512 * so this line may not be changed to use the __clear_bit() format. 508 * so this line may not be changed to use the __clear_bit() format.
513 */ 509 */
514 bss->tim[aid / 8] &= ~(1 << (aid % 8)); 510 bss->tim[aid / 8] &= ~(1 << (aid % 8));
515 } 511 }
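
The two helpers above hard-code the TIM layout: AID n is bit n % 8 of byte n / 8 of the partial virtual bitmap. A small, self-contained example of the set and clear operations with a hypothetical AID:

#include <stdio.h>

int main(void)
{
        unsigned char tim[32] = { 0 };  /* illustration only, not the real bitmap size */
        unsigned int aid = 21;          /* hypothetical association ID */

        tim[aid / 8] |= (unsigned char)(1 << (aid % 8));   /* set: byte 2, bit 5 */
        printf("set:   tim[%u] = 0x%02x\n", aid / 8, tim[aid / 8]);

        tim[aid / 8] &= (unsigned char)~(1 << (aid % 8));  /* clear it again */
        printf("clear: tim[%u] = 0x%02x\n", aid / 8, tim[aid / 8]);
        return 0;
}
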
516 512
517 static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss, 513 static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss,
518 struct sta_info *sta) 514 struct sta_info *sta)
519 { 515 {
520 BUG_ON(!bss); 516 BUG_ON(!bss);
521 517
522 __bss_tim_set(bss, sta->sta.aid); 518 __bss_tim_set(bss, sta->sta.aid);
523 519
524 if (sta->local->ops->set_tim) { 520 if (sta->local->ops->set_tim) {
525 sta->local->tim_in_locked_section = true; 521 sta->local->tim_in_locked_section = true;
526 drv_set_tim(sta->local, &sta->sta, true); 522 drv_set_tim(sta->local, &sta->sta, true);
527 sta->local->tim_in_locked_section = false; 523 sta->local->tim_in_locked_section = false;
528 } 524 }
529 } 525 }
530 526
531 void sta_info_set_tim_bit(struct sta_info *sta) 527 void sta_info_set_tim_bit(struct sta_info *sta)
532 { 528 {
533 unsigned long flags; 529 unsigned long flags;
534 530
535 BUG_ON(!sta->sdata->bss); 531 BUG_ON(!sta->sdata->bss);
536 532
537 spin_lock_irqsave(&sta->local->sta_lock, flags); 533 spin_lock_irqsave(&sta->local->sta_lock, flags);
538 __sta_info_set_tim_bit(sta->sdata->bss, sta); 534 __sta_info_set_tim_bit(sta->sdata->bss, sta);
539 spin_unlock_irqrestore(&sta->local->sta_lock, flags); 535 spin_unlock_irqrestore(&sta->local->sta_lock, flags);
540 } 536 }
541 537
542 static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss, 538 static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss,
543 struct sta_info *sta) 539 struct sta_info *sta)
544 { 540 {
545 BUG_ON(!bss); 541 BUG_ON(!bss);
546 542
547 __bss_tim_clear(bss, sta->sta.aid); 543 __bss_tim_clear(bss, sta->sta.aid);
548 544
549 if (sta->local->ops->set_tim) { 545 if (sta->local->ops->set_tim) {
550 sta->local->tim_in_locked_section = true; 546 sta->local->tim_in_locked_section = true;
551 drv_set_tim(sta->local, &sta->sta, false); 547 drv_set_tim(sta->local, &sta->sta, false);
552 sta->local->tim_in_locked_section = false; 548 sta->local->tim_in_locked_section = false;
553 } 549 }
554 } 550 }
555 551
556 void sta_info_clear_tim_bit(struct sta_info *sta) 552 void sta_info_clear_tim_bit(struct sta_info *sta)
557 { 553 {
558 unsigned long flags; 554 unsigned long flags;
559 555
560 BUG_ON(!sta->sdata->bss); 556 BUG_ON(!sta->sdata->bss);
561 557
562 spin_lock_irqsave(&sta->local->sta_lock, flags); 558 spin_lock_irqsave(&sta->local->sta_lock, flags);
563 __sta_info_clear_tim_bit(sta->sdata->bss, sta); 559 __sta_info_clear_tim_bit(sta->sdata->bss, sta);
564 spin_unlock_irqrestore(&sta->local->sta_lock, flags); 560 spin_unlock_irqrestore(&sta->local->sta_lock, flags);
565 } 561 }
566 562
567 static int sta_info_buffer_expired(struct sta_info *sta, 563 static int sta_info_buffer_expired(struct sta_info *sta,
568 struct sk_buff *skb) 564 struct sk_buff *skb)
569 { 565 {
570 struct ieee80211_tx_info *info; 566 struct ieee80211_tx_info *info;
571 int timeout; 567 int timeout;
572 568
573 if (!skb) 569 if (!skb)
574 return 0; 570 return 0;
575 571
576 info = IEEE80211_SKB_CB(skb); 572 info = IEEE80211_SKB_CB(skb);
577 573
578 /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ 574 /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */
579 timeout = (sta->listen_interval * 575 timeout = (sta->listen_interval *
580 sta->sdata->vif.bss_conf.beacon_int * 576 sta->sdata->vif.bss_conf.beacon_int *
581 32 / 15625) * HZ; 577 32 / 15625) * HZ;
582 if (timeout < STA_TX_BUFFER_EXPIRE) 578 if (timeout < STA_TX_BUFFER_EXPIRE)
583 timeout = STA_TX_BUFFER_EXPIRE; 579 timeout = STA_TX_BUFFER_EXPIRE;
584 return time_after(jiffies, info->control.jiffies + timeout); 580 return time_after(jiffies, info->control.jiffies + timeout);
585 } 581 }
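
The comment's (2 * listen_interval * beacon_int * 1024 / 1000000) seconds and the code's 32 / 15625 factor are the same quantity, since 2 * 1024 / 1000000 reduces to 32 / 15625. A quick check with hypothetical values:

#include <stdio.h>

int main(void)
{
        long listen_interval = 10;   /* hypothetical */
        long beacon_int = 100;       /* hypothetical, in TU of 1024 us */

        /* Two equivalent ways of writing the timeout in seconds. */
        double secs_a = 2.0 * listen_interval * beacon_int * 1024.0 / 1000000.0;
        double secs_b = (double)listen_interval * beacon_int * 32.0 / 15625.0;

        printf("%.6f s == %.6f s\n", secs_a, secs_b);
        return 0;
}
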
586 582
587 583
588 static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, 584 static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local,
589 struct sta_info *sta) 585 struct sta_info *sta)
590 { 586 {
591 unsigned long flags; 587 unsigned long flags;
592 struct sk_buff *skb; 588 struct sk_buff *skb;
593 589
594 if (skb_queue_empty(&sta->ps_tx_buf)) 590 if (skb_queue_empty(&sta->ps_tx_buf))
595 return false; 591 return false;
596 592
597 for (;;) { 593 for (;;) {
598 spin_lock_irqsave(&sta->ps_tx_buf.lock, flags); 594 spin_lock_irqsave(&sta->ps_tx_buf.lock, flags);
599 skb = skb_peek(&sta->ps_tx_buf); 595 skb = skb_peek(&sta->ps_tx_buf);
600 if (sta_info_buffer_expired(sta, skb)) 596 if (sta_info_buffer_expired(sta, skb))
601 skb = __skb_dequeue(&sta->ps_tx_buf); 597 skb = __skb_dequeue(&sta->ps_tx_buf);
602 else 598 else
603 skb = NULL; 599 skb = NULL;
604 spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags); 600 spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags);
605 601
606 if (!skb) 602 if (!skb)
607 break; 603 break;
608 604
609 local->total_ps_buffered--; 605 local->total_ps_buffered--;
610 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 606 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
611 printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n", 607 printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n",
612 sta->sta.addr); 608 sta->sta.addr);
613 #endif 609 #endif
614 dev_kfree_skb(skb); 610 dev_kfree_skb(skb);
615 611
616 if (skb_queue_empty(&sta->ps_tx_buf) && 612 if (skb_queue_empty(&sta->ps_tx_buf) &&
617 !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF)) 613 !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF))
618 sta_info_clear_tim_bit(sta); 614 sta_info_clear_tim_bit(sta);
619 } 615 }
620 616
621 return true; 617 return true;
622 } 618 }
623 619
624 static int __must_check __sta_info_destroy(struct sta_info *sta) 620 static int __must_check __sta_info_destroy(struct sta_info *sta)
625 { 621 {
626 struct ieee80211_local *local; 622 struct ieee80211_local *local;
627 struct ieee80211_sub_if_data *sdata; 623 struct ieee80211_sub_if_data *sdata;
628 struct sk_buff *skb; 624 struct sk_buff *skb;
629 unsigned long flags; 625 unsigned long flags;
630 int ret, i; 626 int ret, i;
631 627
632 might_sleep(); 628 might_sleep();
633 629
634 if (!sta) 630 if (!sta)
635 return -ENOENT; 631 return -ENOENT;
636 632
637 local = sta->local; 633 local = sta->local;
638 sdata = sta->sdata; 634 sdata = sta->sdata;
639 635
640 /* 636 /*
641 * Before removing the station from the driver and 637 * Before removing the station from the driver and
642 * rate control, it might still start new aggregation 638 * rate control, it might still start new aggregation
643 * sessions -- block that to make sure the tear-down 639 * sessions -- block that to make sure the tear-down
644 * will be sufficient. 640 * will be sufficient.
645 */ 641 */
646 set_sta_flags(sta, WLAN_STA_BLOCK_BA); 642 set_sta_flags(sta, WLAN_STA_BLOCK_BA);
647 ieee80211_sta_tear_down_BA_sessions(sta, true); 643 ieee80211_sta_tear_down_BA_sessions(sta, true);
648 644
649 spin_lock_irqsave(&local->sta_lock, flags); 645 spin_lock_irqsave(&local->sta_lock, flags);
650 ret = sta_info_hash_del(local, sta); 646 ret = sta_info_hash_del(local, sta);
651 /* this might still be the pending list ... which is fine */ 647 /* this might still be the pending list ... which is fine */
652 if (!ret) 648 if (!ret)
653 list_del(&sta->list); 649 list_del(&sta->list);
654 spin_unlock_irqrestore(&local->sta_lock, flags); 650 spin_unlock_irqrestore(&local->sta_lock, flags);
655 if (ret) 651 if (ret)
656 return ret; 652 return ret;
657 653
658 mutex_lock(&local->key_mtx); 654 mutex_lock(&local->key_mtx);
659 for (i = 0; i < NUM_DEFAULT_KEYS; i++) 655 for (i = 0; i < NUM_DEFAULT_KEYS; i++)
660 __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i])); 656 __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i]));
661 if (sta->ptk) 657 if (sta->ptk)
662 __ieee80211_key_free(key_mtx_dereference(local, sta->ptk)); 658 __ieee80211_key_free(key_mtx_dereference(local, sta->ptk));
663 mutex_unlock(&local->key_mtx); 659 mutex_unlock(&local->key_mtx);
664 660
665 sta->dead = true; 661 sta->dead = true;
666 662
667 if (test_and_clear_sta_flags(sta, 663 if (test_and_clear_sta_flags(sta,
668 WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) { 664 WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) {
669 BUG_ON(!sdata->bss); 665 BUG_ON(!sdata->bss);
670 666
671 atomic_dec(&sdata->bss->num_sta_ps); 667 atomic_dec(&sdata->bss->num_sta_ps);
672 __sta_info_clear_tim_bit(sdata->bss, sta); 668 __sta_info_clear_tim_bit(sdata->bss, sta);
673 } 669 }
674 670
675 local->num_sta--; 671 local->num_sta--;
676 local->sta_generation++; 672 local->sta_generation++;
677 673
678 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 674 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
679 rcu_assign_pointer(sdata->u.vlan.sta, NULL); 675 rcu_assign_pointer(sdata->u.vlan.sta, NULL);
680 676
681 if (sta->uploaded) { 677 if (sta->uploaded) {
682 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 678 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
683 sdata = container_of(sdata->bss, 679 sdata = container_of(sdata->bss,
684 struct ieee80211_sub_if_data, 680 struct ieee80211_sub_if_data,
685 u.ap); 681 u.ap);
686 drv_sta_remove(local, sdata, &sta->sta); 682 drv_sta_remove(local, sdata, &sta->sta);
687 sdata = sta->sdata; 683 sdata = sta->sdata;
688 } 684 }
689 685
690 /* 686 /*
691 * At this point, after we wait for an RCU grace period, 687 * At this point, after we wait for an RCU grace period,
692 * neither mac80211 nor the driver can reference this 688 * neither mac80211 nor the driver can reference this
693 * sta struct any more except by still existing timers 689 * sta struct any more except by still existing timers
694 * associated with this station that we clean up below. 690 * associated with this station that we clean up below.
695 */ 691 */
696 synchronize_rcu(); 692 synchronize_rcu();
697 693
698 #ifdef CONFIG_MAC80211_MESH 694 #ifdef CONFIG_MAC80211_MESH
699 if (ieee80211_vif_is_mesh(&sdata->vif)) 695 if (ieee80211_vif_is_mesh(&sdata->vif))
700 mesh_accept_plinks_update(sdata); 696 mesh_accept_plinks_update(sdata);
701 #endif 697 #endif
702 698
703 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 699 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
704 wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr); 700 wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr);
705 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 701 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
706 cancel_work_sync(&sta->drv_unblock_wk); 702 cancel_work_sync(&sta->drv_unblock_wk);
707 703
708 cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL); 704 cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL);
709 705
710 rate_control_remove_sta_debugfs(sta); 706 rate_control_remove_sta_debugfs(sta);
711 ieee80211_sta_debugfs_remove(sta); 707 ieee80211_sta_debugfs_remove(sta);
712 708
713 #ifdef CONFIG_MAC80211_MESH 709 #ifdef CONFIG_MAC80211_MESH
714 if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { 710 if (ieee80211_vif_is_mesh(&sta->sdata->vif)) {
715 mesh_plink_deactivate(sta); 711 mesh_plink_deactivate(sta);
716 del_timer_sync(&sta->plink_timer); 712 del_timer_sync(&sta->plink_timer);
717 } 713 }
718 #endif 714 #endif
719 715
720 while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) { 716 while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) {
721 local->total_ps_buffered--; 717 local->total_ps_buffered--;
722 dev_kfree_skb_any(skb); 718 dev_kfree_skb_any(skb);
723 } 719 }
724 720
725 while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) 721 while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL)
726 dev_kfree_skb_any(skb); 722 dev_kfree_skb_any(skb);
727 723
728 __sta_info_free(local, sta); 724 __sta_info_free(local, sta);
729 725
730 return 0; 726 return 0;
731 } 727 }
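
__sta_info_destroy() follows the RCU teardown ordering the comment above describes: make the entry unreachable, wait out a grace period with synchronize_rcu(), and only then free what readers might still have been using. A hedged, generic sketch of that ordering (kernel context assumed; obj and my_list_lock are hypothetical):

/* Sketch only (kernel context assumed): obj and my_list_lock are hypothetical. */
spin_lock(&my_list_lock);
list_del_rcu(&obj->list);      /* existing readers may still walk to obj */
spin_unlock(&my_list_lock);

synchronize_rcu();             /* wait for every pre-existing read-side section */

kfree(obj);                    /* no reader can reach obj any more */
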
732 728
733 int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) 729 int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr)
734 { 730 {
735 struct sta_info *sta; 731 struct sta_info *sta;
736 int ret; 732 int ret;
737 733
738 mutex_lock(&sdata->local->sta_mtx); 734 mutex_lock(&sdata->local->sta_mtx);
739 sta = sta_info_get(sdata, addr); 735 sta = sta_info_get(sdata, addr);
740 ret = __sta_info_destroy(sta); 736 ret = __sta_info_destroy(sta);
741 mutex_unlock(&sdata->local->sta_mtx); 737 mutex_unlock(&sdata->local->sta_mtx);
742 738
743 return ret; 739 return ret;
744 } 740 }
745 741
746 int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, 742 int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
747 const u8 *addr) 743 const u8 *addr)
748 { 744 {
749 struct sta_info *sta; 745 struct sta_info *sta;
750 int ret; 746 int ret;
751 747
752 mutex_lock(&sdata->local->sta_mtx); 748 mutex_lock(&sdata->local->sta_mtx);
753 sta = sta_info_get_bss(sdata, addr); 749 sta = sta_info_get_bss(sdata, addr);
754 ret = __sta_info_destroy(sta); 750 ret = __sta_info_destroy(sta);
755 mutex_unlock(&sdata->local->sta_mtx); 751 mutex_unlock(&sdata->local->sta_mtx);
756 752
757 return ret; 753 return ret;
758 } 754 }
759 755
760 static void sta_info_cleanup(unsigned long data) 756 static void sta_info_cleanup(unsigned long data)
761 { 757 {
762 struct ieee80211_local *local = (struct ieee80211_local *) data; 758 struct ieee80211_local *local = (struct ieee80211_local *) data;
763 struct sta_info *sta; 759 struct sta_info *sta;
764 bool timer_needed = false; 760 bool timer_needed = false;
765 761
766 rcu_read_lock(); 762 rcu_read_lock();
767 list_for_each_entry_rcu(sta, &local->sta_list, list) 763 list_for_each_entry_rcu(sta, &local->sta_list, list)
768 if (sta_info_cleanup_expire_buffered(local, sta)) 764 if (sta_info_cleanup_expire_buffered(local, sta))
769 timer_needed = true; 765 timer_needed = true;
770 rcu_read_unlock(); 766 rcu_read_unlock();
771 767
772 if (local->quiescing) 768 if (local->quiescing)
773 return; 769 return;
774 770
775 if (!timer_needed) 771 if (!timer_needed)
776 return; 772 return;
777 773
778 mod_timer(&local->sta_cleanup, 774 mod_timer(&local->sta_cleanup,
779 round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); 775 round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
780 } 776 }
781 777
782 void sta_info_init(struct ieee80211_local *local) 778 void sta_info_init(struct ieee80211_local *local)
783 { 779 {
784 spin_lock_init(&local->sta_lock); 780 spin_lock_init(&local->sta_lock);
785 mutex_init(&local->sta_mtx); 781 mutex_init(&local->sta_mtx);
786 INIT_LIST_HEAD(&local->sta_list); 782 INIT_LIST_HEAD(&local->sta_list);
787 INIT_LIST_HEAD(&local->sta_pending_list); 783 INIT_LIST_HEAD(&local->sta_pending_list);
788 INIT_WORK(&local->sta_finish_work, sta_info_finish_work); 784 INIT_WORK(&local->sta_finish_work, sta_info_finish_work);
789 785
790 setup_timer(&local->sta_cleanup, sta_info_cleanup, 786 setup_timer(&local->sta_cleanup, sta_info_cleanup,
791 (unsigned long)local); 787 (unsigned long)local);
792 } 788 }
793 789
794 void sta_info_stop(struct ieee80211_local *local) 790 void sta_info_stop(struct ieee80211_local *local)
795 { 791 {
796 del_timer(&local->sta_cleanup); 792 del_timer(&local->sta_cleanup);
797 sta_info_flush(local, NULL); 793 sta_info_flush(local, NULL);
798 } 794 }
799 795
800 /** 796 /**
801 * sta_info_flush - flush matching STA entries from the STA table 797 * sta_info_flush - flush matching STA entries from the STA table
802 * 798 *
803 * Returns the number of removed STA entries. 799 * Returns the number of removed STA entries.
804 * 800 *
805 * @local: local interface data 801 * @local: local interface data
806 * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs 802 * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs
807 */ 803 */
808 int sta_info_flush(struct ieee80211_local *local, 804 int sta_info_flush(struct ieee80211_local *local,
809 struct ieee80211_sub_if_data *sdata) 805 struct ieee80211_sub_if_data *sdata)
810 { 806 {
811 struct sta_info *sta, *tmp; 807 struct sta_info *sta, *tmp;
812 int ret = 0; 808 int ret = 0;
813 809
814 might_sleep(); 810 might_sleep();
815 811
816 mutex_lock(&local->sta_mtx); 812 mutex_lock(&local->sta_mtx);
817 813
818 sta_info_finish_pending(local); 814 sta_info_finish_pending(local);
819 815
820 list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { 816 list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
821 if (!sdata || sdata == sta->sdata) 817 if (!sdata || sdata == sta->sdata)
822 WARN_ON(__sta_info_destroy(sta)); 818 WARN_ON(__sta_info_destroy(sta));
823 } 819 }
824 mutex_unlock(&local->sta_mtx); 820 mutex_unlock(&local->sta_mtx);
825 821
826 return ret; 822 return ret;
827 } 823 }
828 824
829 void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, 825 void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
830 unsigned long exp_time) 826 unsigned long exp_time)
831 { 827 {
832 struct ieee80211_local *local = sdata->local; 828 struct ieee80211_local *local = sdata->local;
833 struct sta_info *sta, *tmp; 829 struct sta_info *sta, *tmp;
834 830
835 mutex_lock(&local->sta_mtx); 831 mutex_lock(&local->sta_mtx);
836 list_for_each_entry_safe(sta, tmp, &local->sta_list, list) 832 list_for_each_entry_safe(sta, tmp, &local->sta_list, list)
837 if (time_after(jiffies, sta->last_rx + exp_time)) { 833 if (time_after(jiffies, sta->last_rx + exp_time)) {
838 #ifdef CONFIG_MAC80211_IBSS_DEBUG 834 #ifdef CONFIG_MAC80211_IBSS_DEBUG
839 printk(KERN_DEBUG "%s: expiring inactive STA %pM\n", 835 printk(KERN_DEBUG "%s: expiring inactive STA %pM\n",
840 sdata->name, sta->sta.addr); 836 sdata->name, sta->sta.addr);
841 #endif 837 #endif
842 WARN_ON(__sta_info_destroy(sta)); 838 WARN_ON(__sta_info_destroy(sta));
843 } 839 }
844 mutex_unlock(&local->sta_mtx); 840 mutex_unlock(&local->sta_mtx);
845 } 841 }
846 842
847 struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, 843 struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
848 const u8 *addr, 844 const u8 *addr,
849 const u8 *localaddr) 845 const u8 *localaddr)
850 { 846 {
851 struct sta_info *sta, *nxt; 847 struct sta_info *sta, *nxt;
852 848
853 /* 849 /*
854 * Just return a random station if localaddr is NULL 850 * Just return a random station if localaddr is NULL
855 * ... first in list. 851 * ... first in list.
856 */ 852 */
857 for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { 853 for_each_sta_info(hw_to_local(hw), addr, sta, nxt) {
858 if (localaddr && 854 if (localaddr &&
859 compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0) 855 compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0)
860 continue; 856 continue;
861 if (!sta->uploaded) 857 if (!sta->uploaded)
862 return NULL; 858 return NULL;
863 return &sta->sta; 859 return &sta->sta;
864 } 860 }
865 861
866 return NULL; 862 return NULL;
867 } 863 }
868 EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); 864 EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);
869 865
870 struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, 866 struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
871 const u8 *addr) 867 const u8 *addr)
872 { 868 {
873 struct sta_info *sta; 869 struct sta_info *sta;
874 870
875 if (!vif) 871 if (!vif)
876 return NULL; 872 return NULL;
877 873
878 sta = sta_info_get_bss(vif_to_sdata(vif), addr); 874 sta = sta_info_get_bss(vif_to_sdata(vif), addr);
879 if (!sta) 875 if (!sta)
880 return NULL; 876 return NULL;
881 877
882 if (!sta->uploaded) 878 if (!sta->uploaded)
883 return NULL; 879 return NULL;
884 880
885 return &sta->sta; 881 return &sta->sta;
886 } 882 }
887 EXPORT_SYMBOL(ieee80211_find_sta); 883 EXPORT_SYMBOL(ieee80211_find_sta);
888 884
889 static void clear_sta_ps_flags(void *_sta) 885 static void clear_sta_ps_flags(void *_sta)
890 { 886 {
891 struct sta_info *sta = _sta; 887 struct sta_info *sta = _sta;
892 888
893 clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA); 889 clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA);
894 } 890 }
895 891
896 /* powersave support code */ 892 /* powersave support code */
897 void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) 893 void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
898 { 894 {
899 struct ieee80211_sub_if_data *sdata = sta->sdata; 895 struct ieee80211_sub_if_data *sdata = sta->sdata;
900 struct ieee80211_local *local = sdata->local; 896 struct ieee80211_local *local = sdata->local;
901 int sent, buffered; 897 int sent, buffered;
902 898
903 clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); 899 clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF);
904 if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) 900 if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
905 drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); 901 drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
906 902
907 if (!skb_queue_empty(&sta->ps_tx_buf)) 903 if (!skb_queue_empty(&sta->ps_tx_buf))
908 sta_info_clear_tim_bit(sta); 904 sta_info_clear_tim_bit(sta);
909 905
910 /* Send all buffered frames to the station */ 906 /* Send all buffered frames to the station */
911 sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered); 907 sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered);
912 buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf, 908 buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf,
913 clear_sta_ps_flags, sta); 909 clear_sta_ps_flags, sta);
914 sent += buffered; 910 sent += buffered;
915 local->total_ps_buffered -= buffered; 911 local->total_ps_buffered -= buffered;
916 912
917 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 913 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
918 printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames " 914 printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames "
919 "since STA not sleeping anymore\n", sdata->name, 915 "since STA not sleeping anymore\n", sdata->name,
920 sta->sta.addr, sta->sta.aid, sent - buffered, buffered); 916 sta->sta.addr, sta->sta.aid, sent - buffered, buffered);
921 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ 917 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
922 } 918 }
923 919
924 void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) 920 void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta)
925 { 921 {
926 struct ieee80211_sub_if_data *sdata = sta->sdata; 922 struct ieee80211_sub_if_data *sdata = sta->sdata;
927 struct ieee80211_local *local = sdata->local; 923 struct ieee80211_local *local = sdata->local;
928 struct sk_buff *skb; 924 struct sk_buff *skb;
929 int no_pending_pkts; 925 int no_pending_pkts;
930 926
931 skb = skb_dequeue(&sta->tx_filtered); 927 skb = skb_dequeue(&sta->tx_filtered);
932 if (!skb) { 928 if (!skb) {
933 skb = skb_dequeue(&sta->ps_tx_buf); 929 skb = skb_dequeue(&sta->ps_tx_buf);
934 if (skb) 930 if (skb)
935 local->total_ps_buffered--; 931 local->total_ps_buffered--;
936 } 932 }
937 no_pending_pkts = skb_queue_empty(&sta->tx_filtered) && 933 no_pending_pkts = skb_queue_empty(&sta->tx_filtered) &&
938 skb_queue_empty(&sta->ps_tx_buf); 934 skb_queue_empty(&sta->ps_tx_buf);
939 935
940 if (skb) { 936 if (skb) {
941 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); 937 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
942 struct ieee80211_hdr *hdr = 938 struct ieee80211_hdr *hdr =
943 (struct ieee80211_hdr *) skb->data; 939 (struct ieee80211_hdr *) skb->data;
944 940
945 /* 941 /*
946 * Tell TX path to send this frame even though the STA may 942 * Tell TX path to send this frame even though the STA may
947 * still remain in PS mode after this frame exchange. 943 * still remain in PS mode after this frame exchange.
948 */ 944 */
949 info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE; 945 info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE;
950 946
951 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 947 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
952 printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n", 948 printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n",
953 sta->sta.addr, sta->sta.aid, 949 sta->sta.addr, sta->sta.aid,
954 skb_queue_len(&sta->ps_tx_buf)); 950 skb_queue_len(&sta->ps_tx_buf));
955 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ 951 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
956 952
957 /* Use MoreData flag to indicate whether there are more 953 /* Use MoreData flag to indicate whether there are more
958 * buffered frames for this STA */ 954 * buffered frames for this STA */
959 if (no_pending_pkts) 955 if (no_pending_pkts)
960 hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); 956 hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA);
961 else 957 else
962 hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); 958 hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA);
963 959
964 ieee80211_add_pending_skb(local, skb); 960 ieee80211_add_pending_skb(local, skb);
965 961
966 if (no_pending_pkts) 962 if (no_pending_pkts)
967 sta_info_clear_tim_bit(sta); 963 sta_info_clear_tim_bit(sta);
968 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 964 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
969 } else { 965 } else {
970 /* 966 /*
971 * FIXME: This can be the result of a race condition between 967 * FIXME: This can be the result of a race condition between
972 * us expiring a frame and the station polling for it. 968 * us expiring a frame and the station polling for it.
973 * Should we send it a null-func frame indicating we 969 * Should we send it a null-func frame indicating we
974 * have nothing buffered for it? 970 * have nothing buffered for it?
975 */ 971 */
976 printk(KERN_DEBUG "%s: STA %pM sent PS Poll even " 972 printk(KERN_DEBUG "%s: STA %pM sent PS Poll even "
977 "though there are no buffered frames for it\n", 973 "though there are no buffered frames for it\n",
978 sdata->name, sta->sta.addr); 974 sdata->name, sta->sta.addr);
979 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ 975 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
980 } 976 }
981 } 977 }
982 978
983 void ieee80211_sta_block_awake(struct ieee80211_hw *hw, 979 void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
984 struct ieee80211_sta *pubsta, bool block) 980 struct ieee80211_sta *pubsta, bool block)
985 { 981 {
986 struct sta_info *sta = container_of(pubsta, struct sta_info, sta); 982 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
987 983
988 trace_api_sta_block_awake(sta->local, pubsta, block); 984 trace_api_sta_block_awake(sta->local, pubsta, block);
989 985
990 if (block) 986 if (block)
991 set_sta_flags(sta, WLAN_STA_PS_DRIVER); 987 set_sta_flags(sta, WLAN_STA_PS_DRIVER);
992 else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER)) 988 else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER))
993 ieee80211_queue_work(hw, &sta->drv_unblock_wk); 989 ieee80211_queue_work(hw, &sta->drv_unblock_wk);
994 } 990 }
995 EXPORT_SYMBOL(ieee80211_sta_block_awake); 991 EXPORT_SYMBOL(ieee80211_sta_block_awake);
996 992
997 void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta) 993 void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta)
998 { 994 {
999 struct sta_info *sta = container_of(pubsta, struct sta_info, sta); 995 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
1000 996
1001 set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); 997 set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF);
1002 sta_info_set_tim_bit(sta); 998 sta_info_set_tim_bit(sta);
1003 } 999 }
1004 EXPORT_SYMBOL(ieee80211_sta_set_tim); 1000 EXPORT_SYMBOL(ieee80211_sta_set_tim);
1005 1001
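For reference, a minimal usage sketch for the station lookup helper exported above. The driver function and its logging are hypothetical and not part of this patch; the point is that ieee80211_find_sta() walks RCU-protected station state, so the lookup and any use of the returned pointer are kept inside an RCU read-side critical section.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <net/mac80211.h>

/* Hypothetical driver helper: look up a station and log it.  The returned
 * pointer is only valid while rcu_read_lock() is held. */
static void example_log_sta(struct ieee80211_vif *vif, const u8 *addr)
{
	struct ieee80211_sta *sta;

	rcu_read_lock();
	sta = ieee80211_find_sta(vif, addr);
	if (sta)
		printk(KERN_DEBUG "found STA %pM (aid %d)\n",
		       sta->addr, sta->aid);
	rcu_read_unlock();
}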
net/netlabel/netlabel_domainhash.c
1 /* 1 /*
2 * NetLabel Domain Hash Table 2 * NetLabel Domain Hash Table
3 * 3 *
4 * This file manages the domain hash table that NetLabel uses to determine 4 * This file manages the domain hash table that NetLabel uses to determine
5 * which network labeling protocol to use for a given domain. The NetLabel 5 * which network labeling protocol to use for a given domain. The NetLabel
6 * system manages static and dynamic label mappings for network protocols such 6 * system manages static and dynamic label mappings for network protocols such
7 * as CIPSO and RIPSO. 7 * as CIPSO and RIPSO.
8 * 8 *
9 * Author: Paul Moore <paul.moore@hp.com> 9 * Author: Paul Moore <paul.moore@hp.com>
10 * 10 *
11 */ 11 */
12 12
13 /* 13 /*
14 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 14 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
15 * 15 *
16 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
18 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
19 * (at your option) any later version. 19 * (at your option) any later version.
20 * 20 *
21 * This program is distributed in the hope that it will be useful, 21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of 22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
24 * the GNU General Public License for more details. 24 * the GNU General Public License for more details.
25 * 25 *
26 * You should have received a copy of the GNU General Public License 26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software 27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 * 29 *
30 */ 30 */
31 31
32 #include <linux/types.h> 32 #include <linux/types.h>
33 #include <linux/rculist.h> 33 #include <linux/rculist.h>
34 #include <linux/skbuff.h> 34 #include <linux/skbuff.h>
35 #include <linux/spinlock.h> 35 #include <linux/spinlock.h>
36 #include <linux/string.h> 36 #include <linux/string.h>
37 #include <linux/audit.h> 37 #include <linux/audit.h>
38 #include <linux/slab.h> 38 #include <linux/slab.h>
39 #include <net/netlabel.h> 39 #include <net/netlabel.h>
40 #include <net/cipso_ipv4.h> 40 #include <net/cipso_ipv4.h>
41 #include <asm/bug.h> 41 #include <asm/bug.h>
42 42
43 #include "netlabel_mgmt.h" 43 #include "netlabel_mgmt.h"
44 #include "netlabel_addrlist.h" 44 #include "netlabel_addrlist.h"
45 #include "netlabel_domainhash.h" 45 #include "netlabel_domainhash.h"
46 #include "netlabel_user.h" 46 #include "netlabel_user.h"
47 47
48 struct netlbl_domhsh_tbl { 48 struct netlbl_domhsh_tbl {
49 struct list_head *tbl; 49 struct list_head *tbl;
50 u32 size; 50 u32 size;
51 }; 51 };
52 52
53 /* Domain hash table */ 53 /* Domain hash table */
54 /* updates should be so rare that having one spinlock for the entire hash table 54 /* updates should be so rare that having one spinlock for the entire hash table
55 * should be okay */ 55 * should be okay */
56 static DEFINE_SPINLOCK(netlbl_domhsh_lock); 56 static DEFINE_SPINLOCK(netlbl_domhsh_lock);
57 #define netlbl_domhsh_rcu_deref(p) \ 57 #define netlbl_domhsh_rcu_deref(p) \
58 rcu_dereference_check(p, rcu_read_lock_held() || \ 58 rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
59 lockdep_is_held(&netlbl_domhsh_lock))
60 static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; 59 static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
61 static struct netlbl_dom_map *netlbl_domhsh_def = NULL; 60 static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
62 61
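To make the simplified check concrete, here is a stand-alone sketch of the same pattern using hypothetical names (example_lock, example_ptr, example_deref); it is illustration only, not part of the patch. The condition passed to rcu_dereference_check() now names only the update-side lock, and the same simplification is applied to netlbl_unlhsh_rcu_deref() in netlabel_unlabeled.c further down in this diff.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct example_tbl;				/* opaque payload type */

static DEFINE_SPINLOCK(example_lock);		/* update-side lock */
static struct example_tbl __rcu *example_ptr;	/* RCU-protected pointer */

/* Readers run under rcu_read_lock(), writers hold example_lock; only the
 * lock has to be named in the lockdep condition. */
#define example_deref(p) \
	rcu_dereference_check(p, lockdep_is_held(&example_lock))

/* Update side: publish a new table while holding the spinlock, mirroring
 * the spin_lock()/rcu_assign_pointer() pattern used by netlbl_domhsh_init()
 * below. */
static void example_publish(struct example_tbl *new_tbl)
{
	spin_lock(&example_lock);
	rcu_assign_pointer(example_ptr, new_tbl);
	spin_unlock(&example_lock);
}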
63 /* 62 /*
64 * Domain Hash Table Helper Functions 63 * Domain Hash Table Helper Functions
65 */ 64 */
66 65
67 /** 66 /**
68 * netlbl_domhsh_free_entry - Frees a domain hash table entry 67 * netlbl_domhsh_free_entry - Frees a domain hash table entry
69 * @entry: the entry's RCU field 68 * @entry: the entry's RCU field
70 * 69 *
71 * Description: 70 * Description:
72 * This function is designed to be used as a callback to the call_rcu() 71 * This function is designed to be used as a callback to the call_rcu()
73 * function so that the memory allocated to a hash table entry can be released 72 * function so that the memory allocated to a hash table entry can be released
74 * safely. 73 * safely.
75 * 74 *
76 */ 75 */
77 static void netlbl_domhsh_free_entry(struct rcu_head *entry) 76 static void netlbl_domhsh_free_entry(struct rcu_head *entry)
78 { 77 {
79 struct netlbl_dom_map *ptr; 78 struct netlbl_dom_map *ptr;
80 struct netlbl_af4list *iter4; 79 struct netlbl_af4list *iter4;
81 struct netlbl_af4list *tmp4; 80 struct netlbl_af4list *tmp4;
82 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 81 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
83 struct netlbl_af6list *iter6; 82 struct netlbl_af6list *iter6;
84 struct netlbl_af6list *tmp6; 83 struct netlbl_af6list *tmp6;
85 #endif /* IPv6 */ 84 #endif /* IPv6 */
86 85
87 ptr = container_of(entry, struct netlbl_dom_map, rcu); 86 ptr = container_of(entry, struct netlbl_dom_map, rcu);
88 if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) { 87 if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) {
89 netlbl_af4list_foreach_safe(iter4, tmp4, 88 netlbl_af4list_foreach_safe(iter4, tmp4,
90 &ptr->type_def.addrsel->list4) { 89 &ptr->type_def.addrsel->list4) {
91 netlbl_af4list_remove_entry(iter4); 90 netlbl_af4list_remove_entry(iter4);
92 kfree(netlbl_domhsh_addr4_entry(iter4)); 91 kfree(netlbl_domhsh_addr4_entry(iter4));
93 } 92 }
94 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 93 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
95 netlbl_af6list_foreach_safe(iter6, tmp6, 94 netlbl_af6list_foreach_safe(iter6, tmp6,
96 &ptr->type_def.addrsel->list6) { 95 &ptr->type_def.addrsel->list6) {
97 netlbl_af6list_remove_entry(iter6); 96 netlbl_af6list_remove_entry(iter6);
98 kfree(netlbl_domhsh_addr6_entry(iter6)); 97 kfree(netlbl_domhsh_addr6_entry(iter6));
99 } 98 }
100 #endif /* IPv6 */ 99 #endif /* IPv6 */
101 } 100 }
102 kfree(ptr->domain); 101 kfree(ptr->domain);
103 kfree(ptr); 102 kfree(ptr);
104 } 103 }
105 104
106 /** 105 /**
107 * netlbl_domhsh_hash - Hashing function for the domain hash table 106 * netlbl_domhsh_hash - Hashing function for the domain hash table
108 * @domain: the domain name to hash 107 * @domain: the domain name to hash
109 * 108 *
110 * Description: 109 * Description:
111 * This is the hashing function for the domain hash table, it returns the 110 * This is the hashing function for the domain hash table, it returns the
112 * correct bucket number for the domain. The caller is responsible for 111 * correct bucket number for the domain. The caller is responsible for
113 * ensuring that the hash table is protected with either a RCU read lock or the 112 * ensuring that the hash table is protected with either a RCU read lock or the
114 * hash table lock. 113 * hash table lock.
115 * 114 *
116 */ 115 */
117 static u32 netlbl_domhsh_hash(const char *key) 116 static u32 netlbl_domhsh_hash(const char *key)
118 { 117 {
119 u32 iter; 118 u32 iter;
120 u32 val; 119 u32 val;
121 u32 len; 120 u32 len;
122 121
123 /* This is taken (with slight modification) from 122 /* This is taken (with slight modification) from
124 * security/selinux/ss/symtab.c:symhash() */ 123 * security/selinux/ss/symtab.c:symhash() */
125 124
126 for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) 125 for (iter = 0, val = 0, len = strlen(key); iter < len; iter++)
127 val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; 126 val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter];
128 return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); 127 return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
129 } 128 }
130 129
131 /** 130 /**
132 * netlbl_domhsh_search - Search for a domain entry 131 * netlbl_domhsh_search - Search for a domain entry
133 * @domain: the domain 132 * @domain: the domain
134 * 133 *
135 * Description: 134 * Description:
136 * Searches the domain hash table and returns a pointer to the hash table 135 * Searches the domain hash table and returns a pointer to the hash table
137 * entry if found, otherwise NULL is returned. The caller is responsible for 136 * entry if found, otherwise NULL is returned. The caller is responsible for
138 * ensuring that the hash table is protected with either a RCU read lock or the 137 * ensuring that the hash table is protected with either a RCU read lock or the
139 * hash table lock. 138 * hash table lock.
140 * 139 *
141 */ 140 */
142 static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) 141 static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
143 { 142 {
144 u32 bkt; 143 u32 bkt;
145 struct list_head *bkt_list; 144 struct list_head *bkt_list;
146 struct netlbl_dom_map *iter; 145 struct netlbl_dom_map *iter;
147 146
148 if (domain != NULL) { 147 if (domain != NULL) {
149 bkt = netlbl_domhsh_hash(domain); 148 bkt = netlbl_domhsh_hash(domain);
150 bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; 149 bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
151 list_for_each_entry_rcu(iter, bkt_list, list) 150 list_for_each_entry_rcu(iter, bkt_list, list)
152 if (iter->valid && strcmp(iter->domain, domain) == 0) 151 if (iter->valid && strcmp(iter->domain, domain) == 0)
153 return iter; 152 return iter;
154 } 153 }
155 154
156 return NULL; 155 return NULL;
157 } 156 }
158 157
159 /** 158 /**
160 * netlbl_domhsh_search_def - Search for a domain entry 159 * netlbl_domhsh_search_def - Search for a domain entry
161 * @domain: the domain 160 * @domain: the domain
162 * @def: return default if no match is found 161 * @def: return default if no match is found
163 * 162 *
164 * Description: 163 * Description:
165 * Searches the domain hash table and returns a pointer to the hash table 164 * Searches the domain hash table and returns a pointer to the hash table
166 * entry if an exact match is found, if an exact match is not present in the 165 * entry if an exact match is found, if an exact match is not present in the
167 * hash table then the default entry is returned if valid otherwise NULL is 166 * hash table then the default entry is returned if valid otherwise NULL is
168 * returned. The caller is responsible for ensuring that the hash table is 167 * returned. The caller is responsible for ensuring that the hash table is
169 * protected with either a RCU read lock or the hash table lock. 168 * protected with either a RCU read lock or the hash table lock.
170 * 169 *
171 */ 170 */
172 static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) 171 static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
173 { 172 {
174 struct netlbl_dom_map *entry; 173 struct netlbl_dom_map *entry;
175 174
176 entry = netlbl_domhsh_search(domain); 175 entry = netlbl_domhsh_search(domain);
177 if (entry == NULL) { 176 if (entry == NULL) {
178 entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def); 177 entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def);
179 if (entry != NULL && !entry->valid) 178 if (entry != NULL && !entry->valid)
180 entry = NULL; 179 entry = NULL;
181 } 180 }
182 181
183 return entry; 182 return entry;
184 } 183 }
185 184
186 /** 185 /**
187 * netlbl_domhsh_audit_add - Generate an audit entry for an add event 186 * netlbl_domhsh_audit_add - Generate an audit entry for an add event
188 * @entry: the entry being added 187 * @entry: the entry being added
189 * @addr4: the IPv4 address information 188 * @addr4: the IPv4 address information
190 * @addr6: the IPv6 address information 189 * @addr6: the IPv6 address information
191 * @result: the result code 190 * @result: the result code
192 * @audit_info: NetLabel audit information 191 * @audit_info: NetLabel audit information
193 * 192 *
194 * Description: 193 * Description:
195 * Generate an audit record for adding a new NetLabel/LSM mapping entry with 194 * Generate an audit record for adding a new NetLabel/LSM mapping entry with
196 * the given information. Caller is responsible for holding the necessary 195 * the given information. Caller is responsible for holding the necessary
197 * locks. 196 * locks.
198 * 197 *
199 */ 198 */
200 static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, 199 static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
201 struct netlbl_af4list *addr4, 200 struct netlbl_af4list *addr4,
202 struct netlbl_af6list *addr6, 201 struct netlbl_af6list *addr6,
203 int result, 202 int result,
204 struct netlbl_audit *audit_info) 203 struct netlbl_audit *audit_info)
205 { 204 {
206 struct audit_buffer *audit_buf; 205 struct audit_buffer *audit_buf;
207 struct cipso_v4_doi *cipsov4 = NULL; 206 struct cipso_v4_doi *cipsov4 = NULL;
208 u32 type; 207 u32 type;
209 208
210 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); 209 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
211 if (audit_buf != NULL) { 210 if (audit_buf != NULL) {
212 audit_log_format(audit_buf, " nlbl_domain=%s", 211 audit_log_format(audit_buf, " nlbl_domain=%s",
213 entry->domain ? entry->domain : "(default)"); 212 entry->domain ? entry->domain : "(default)");
214 if (addr4 != NULL) { 213 if (addr4 != NULL) {
215 struct netlbl_domaddr4_map *map4; 214 struct netlbl_domaddr4_map *map4;
216 map4 = netlbl_domhsh_addr4_entry(addr4); 215 map4 = netlbl_domhsh_addr4_entry(addr4);
217 type = map4->type; 216 type = map4->type;
218 cipsov4 = map4->type_def.cipsov4; 217 cipsov4 = map4->type_def.cipsov4;
219 netlbl_af4list_audit_addr(audit_buf, 0, NULL, 218 netlbl_af4list_audit_addr(audit_buf, 0, NULL,
220 addr4->addr, addr4->mask); 219 addr4->addr, addr4->mask);
221 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 220 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
222 } else if (addr6 != NULL) { 221 } else if (addr6 != NULL) {
223 struct netlbl_domaddr6_map *map6; 222 struct netlbl_domaddr6_map *map6;
224 map6 = netlbl_domhsh_addr6_entry(addr6); 223 map6 = netlbl_domhsh_addr6_entry(addr6);
225 type = map6->type; 224 type = map6->type;
226 netlbl_af6list_audit_addr(audit_buf, 0, NULL, 225 netlbl_af6list_audit_addr(audit_buf, 0, NULL,
227 &addr6->addr, &addr6->mask); 226 &addr6->addr, &addr6->mask);
228 #endif /* IPv6 */ 227 #endif /* IPv6 */
229 } else { 228 } else {
230 type = entry->type; 229 type = entry->type;
231 cipsov4 = entry->type_def.cipsov4; 230 cipsov4 = entry->type_def.cipsov4;
232 } 231 }
233 switch (type) { 232 switch (type) {
234 case NETLBL_NLTYPE_UNLABELED: 233 case NETLBL_NLTYPE_UNLABELED:
235 audit_log_format(audit_buf, " nlbl_protocol=unlbl"); 234 audit_log_format(audit_buf, " nlbl_protocol=unlbl");
236 break; 235 break;
237 case NETLBL_NLTYPE_CIPSOV4: 236 case NETLBL_NLTYPE_CIPSOV4:
238 BUG_ON(cipsov4 == NULL); 237 BUG_ON(cipsov4 == NULL);
239 audit_log_format(audit_buf, 238 audit_log_format(audit_buf,
240 " nlbl_protocol=cipsov4 cipso_doi=%u", 239 " nlbl_protocol=cipsov4 cipso_doi=%u",
241 cipsov4->doi); 240 cipsov4->doi);
242 break; 241 break;
243 } 242 }
244 audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); 243 audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0);
245 audit_log_end(audit_buf); 244 audit_log_end(audit_buf);
246 } 245 }
247 } 246 }
248 247
249 /* 248 /*
250 * Domain Hash Table Functions 249 * Domain Hash Table Functions
251 */ 250 */
252 251
253 /** 252 /**
254 * netlbl_domhsh_init - Init for the domain hash 253 * netlbl_domhsh_init - Init for the domain hash
255 * @size: the number of bits to use for the hash buckets 254 * @size: the number of bits to use for the hash buckets
256 * 255 *
257 * Description: 256 * Description:
258 * Initializes the domain hash table, should be called only by 257 * Initializes the domain hash table, should be called only by
259 * netlbl_user_init() during initialization. Returns zero on success, non-zero 258 * netlbl_user_init() during initialization. Returns zero on success, non-zero
260 * values on error. 259 * values on error.
261 * 260 *
262 */ 261 */
263 int __init netlbl_domhsh_init(u32 size) 262 int __init netlbl_domhsh_init(u32 size)
264 { 263 {
265 u32 iter; 264 u32 iter;
266 struct netlbl_domhsh_tbl *hsh_tbl; 265 struct netlbl_domhsh_tbl *hsh_tbl;
267 266
268 if (size == 0) 267 if (size == 0)
269 return -EINVAL; 268 return -EINVAL;
270 269
271 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); 270 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
272 if (hsh_tbl == NULL) 271 if (hsh_tbl == NULL)
273 return -ENOMEM; 272 return -ENOMEM;
274 hsh_tbl->size = 1 << size; 273 hsh_tbl->size = 1 << size;
275 hsh_tbl->tbl = kcalloc(hsh_tbl->size, 274 hsh_tbl->tbl = kcalloc(hsh_tbl->size,
276 sizeof(struct list_head), 275 sizeof(struct list_head),
277 GFP_KERNEL); 276 GFP_KERNEL);
278 if (hsh_tbl->tbl == NULL) { 277 if (hsh_tbl->tbl == NULL) {
279 kfree(hsh_tbl); 278 kfree(hsh_tbl);
280 return -ENOMEM; 279 return -ENOMEM;
281 } 280 }
282 for (iter = 0; iter < hsh_tbl->size; iter++) 281 for (iter = 0; iter < hsh_tbl->size; iter++)
283 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); 282 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
284 283
285 spin_lock(&netlbl_domhsh_lock); 284 spin_lock(&netlbl_domhsh_lock);
286 rcu_assign_pointer(netlbl_domhsh, hsh_tbl); 285 rcu_assign_pointer(netlbl_domhsh, hsh_tbl);
287 spin_unlock(&netlbl_domhsh_lock); 286 spin_unlock(&netlbl_domhsh_lock);
288 287
289 return 0; 288 return 0;
290 } 289 }
291 290
292 /** 291 /**
293 * netlbl_domhsh_add - Adds an entry to the domain hash table 292 * netlbl_domhsh_add - Adds an entry to the domain hash table
294 * @entry: the entry to add 293 * @entry: the entry to add
295 * @audit_info: NetLabel audit information 294 * @audit_info: NetLabel audit information
296 * 295 *
297 * Description: 296 * Description:
298 * Adds a new entry to the domain hash table and handles any updates to the 297 * Adds a new entry to the domain hash table and handles any updates to the
299 * lower level protocol handler (i.e. CIPSO). Returns zero on success, 298 * lower level protocol handler (i.e. CIPSO). Returns zero on success,
300 * negative on failure. 299 * negative on failure.
301 * 300 *
302 */ 301 */
303 int netlbl_domhsh_add(struct netlbl_dom_map *entry, 302 int netlbl_domhsh_add(struct netlbl_dom_map *entry,
304 struct netlbl_audit *audit_info) 303 struct netlbl_audit *audit_info)
305 { 304 {
306 int ret_val = 0; 305 int ret_val = 0;
307 struct netlbl_dom_map *entry_old; 306 struct netlbl_dom_map *entry_old;
308 struct netlbl_af4list *iter4; 307 struct netlbl_af4list *iter4;
309 struct netlbl_af4list *tmp4; 308 struct netlbl_af4list *tmp4;
310 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 309 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
311 struct netlbl_af6list *iter6; 310 struct netlbl_af6list *iter6;
312 struct netlbl_af6list *tmp6; 311 struct netlbl_af6list *tmp6;
313 #endif /* IPv6 */ 312 #endif /* IPv6 */
314 313
315 /* XXX - we can remove this RCU read lock as the spinlock protects the 314 /* XXX - we can remove this RCU read lock as the spinlock protects the
316 * entire function, but before we do we need to fixup the 315 * entire function, but before we do we need to fixup the
317 * netlbl_af[4,6]list RCU functions to do "the right thing" with 316 * netlbl_af[4,6]list RCU functions to do "the right thing" with
318 * respect to rcu_dereference() when only a spinlock is held. */ 317 * respect to rcu_dereference() when only a spinlock is held. */
319 rcu_read_lock(); 318 rcu_read_lock();
320 spin_lock(&netlbl_domhsh_lock); 319 spin_lock(&netlbl_domhsh_lock);
321 if (entry->domain != NULL) 320 if (entry->domain != NULL)
322 entry_old = netlbl_domhsh_search(entry->domain); 321 entry_old = netlbl_domhsh_search(entry->domain);
323 else 322 else
324 entry_old = netlbl_domhsh_search_def(entry->domain); 323 entry_old = netlbl_domhsh_search_def(entry->domain);
325 if (entry_old == NULL) { 324 if (entry_old == NULL) {
326 entry->valid = 1; 325 entry->valid = 1;
327 326
328 if (entry->domain != NULL) { 327 if (entry->domain != NULL) {
329 u32 bkt = netlbl_domhsh_hash(entry->domain); 328 u32 bkt = netlbl_domhsh_hash(entry->domain);
330 list_add_tail_rcu(&entry->list, 329 list_add_tail_rcu(&entry->list,
331 &rcu_dereference(netlbl_domhsh)->tbl[bkt]); 330 &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
332 } else { 331 } else {
333 INIT_LIST_HEAD(&entry->list); 332 INIT_LIST_HEAD(&entry->list);
334 rcu_assign_pointer(netlbl_domhsh_def, entry); 333 rcu_assign_pointer(netlbl_domhsh_def, entry);
335 } 334 }
336 335
337 if (entry->type == NETLBL_NLTYPE_ADDRSELECT) { 336 if (entry->type == NETLBL_NLTYPE_ADDRSELECT) {
338 netlbl_af4list_foreach_rcu(iter4, 337 netlbl_af4list_foreach_rcu(iter4,
339 &entry->type_def.addrsel->list4) 338 &entry->type_def.addrsel->list4)
340 netlbl_domhsh_audit_add(entry, iter4, NULL, 339 netlbl_domhsh_audit_add(entry, iter4, NULL,
341 ret_val, audit_info); 340 ret_val, audit_info);
342 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 341 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
343 netlbl_af6list_foreach_rcu(iter6, 342 netlbl_af6list_foreach_rcu(iter6,
344 &entry->type_def.addrsel->list6) 343 &entry->type_def.addrsel->list6)
345 netlbl_domhsh_audit_add(entry, NULL, iter6, 344 netlbl_domhsh_audit_add(entry, NULL, iter6,
346 ret_val, audit_info); 345 ret_val, audit_info);
347 #endif /* IPv6 */ 346 #endif /* IPv6 */
348 } else 347 } else
349 netlbl_domhsh_audit_add(entry, NULL, NULL, 348 netlbl_domhsh_audit_add(entry, NULL, NULL,
350 ret_val, audit_info); 349 ret_val, audit_info);
351 } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT && 350 } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT &&
352 entry->type == NETLBL_NLTYPE_ADDRSELECT) { 351 entry->type == NETLBL_NLTYPE_ADDRSELECT) {
353 struct list_head *old_list4; 352 struct list_head *old_list4;
354 struct list_head *old_list6; 353 struct list_head *old_list6;
355 354
356 old_list4 = &entry_old->type_def.addrsel->list4; 355 old_list4 = &entry_old->type_def.addrsel->list4;
357 old_list6 = &entry_old->type_def.addrsel->list6; 356 old_list6 = &entry_old->type_def.addrsel->list6;
358 357
359 /* we only allow the addition of address selectors if all of 358 /* we only allow the addition of address selectors if all of
360 * the selectors do not exist in the existing domain map */ 359 * the selectors do not exist in the existing domain map */
361 netlbl_af4list_foreach_rcu(iter4, 360 netlbl_af4list_foreach_rcu(iter4,
362 &entry->type_def.addrsel->list4) 361 &entry->type_def.addrsel->list4)
363 if (netlbl_af4list_search_exact(iter4->addr, 362 if (netlbl_af4list_search_exact(iter4->addr,
364 iter4->mask, 363 iter4->mask,
365 old_list4)) { 364 old_list4)) {
366 ret_val = -EEXIST; 365 ret_val = -EEXIST;
367 goto add_return; 366 goto add_return;
368 } 367 }
369 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 368 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
370 netlbl_af6list_foreach_rcu(iter6, 369 netlbl_af6list_foreach_rcu(iter6,
371 &entry->type_def.addrsel->list6) 370 &entry->type_def.addrsel->list6)
372 if (netlbl_af6list_search_exact(&iter6->addr, 371 if (netlbl_af6list_search_exact(&iter6->addr,
373 &iter6->mask, 372 &iter6->mask,
374 old_list6)) { 373 old_list6)) {
375 ret_val = -EEXIST; 374 ret_val = -EEXIST;
376 goto add_return; 375 goto add_return;
377 } 376 }
378 #endif /* IPv6 */ 377 #endif /* IPv6 */
379 378
380 netlbl_af4list_foreach_safe(iter4, tmp4, 379 netlbl_af4list_foreach_safe(iter4, tmp4,
381 &entry->type_def.addrsel->list4) { 380 &entry->type_def.addrsel->list4) {
382 netlbl_af4list_remove_entry(iter4); 381 netlbl_af4list_remove_entry(iter4);
383 iter4->valid = 1; 382 iter4->valid = 1;
384 ret_val = netlbl_af4list_add(iter4, old_list4); 383 ret_val = netlbl_af4list_add(iter4, old_list4);
385 netlbl_domhsh_audit_add(entry_old, iter4, NULL, 384 netlbl_domhsh_audit_add(entry_old, iter4, NULL,
386 ret_val, audit_info); 385 ret_val, audit_info);
387 if (ret_val != 0) 386 if (ret_val != 0)
388 goto add_return; 387 goto add_return;
389 } 388 }
390 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 389 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
391 netlbl_af6list_foreach_safe(iter6, tmp6, 390 netlbl_af6list_foreach_safe(iter6, tmp6,
392 &entry->type_def.addrsel->list6) { 391 &entry->type_def.addrsel->list6) {
393 netlbl_af6list_remove_entry(iter6); 392 netlbl_af6list_remove_entry(iter6);
394 iter6->valid = 1; 393 iter6->valid = 1;
395 ret_val = netlbl_af6list_add(iter6, old_list6); 394 ret_val = netlbl_af6list_add(iter6, old_list6);
396 netlbl_domhsh_audit_add(entry_old, NULL, iter6, 395 netlbl_domhsh_audit_add(entry_old, NULL, iter6,
397 ret_val, audit_info); 396 ret_val, audit_info);
398 if (ret_val != 0) 397 if (ret_val != 0)
399 goto add_return; 398 goto add_return;
400 } 399 }
401 #endif /* IPv6 */ 400 #endif /* IPv6 */
402 } else 401 } else
403 ret_val = -EINVAL; 402 ret_val = -EINVAL;
404 403
405 add_return: 404 add_return:
406 spin_unlock(&netlbl_domhsh_lock); 405 spin_unlock(&netlbl_domhsh_lock);
407 rcu_read_unlock(); 406 rcu_read_unlock();
408 return ret_val; 407 return ret_val;
409 } 408 }
410 409
411 /** 410 /**
412 * netlbl_domhsh_add_default - Adds the default entry to the domain hash table 411 * netlbl_domhsh_add_default - Adds the default entry to the domain hash table
413 * @entry: the entry to add 412 * @entry: the entry to add
414 * @audit_info: NetLabel audit information 413 * @audit_info: NetLabel audit information
415 * 414 *
416 * Description: 415 * Description:
417 * Adds a new default entry to the domain hash table and handles any updates 416 * Adds a new default entry to the domain hash table and handles any updates
418 * to the lower level protocol handler (i.e. CIPSO). Returns zero on success, 417 * to the lower level protocol handler (i.e. CIPSO). Returns zero on success,
419 * negative on failure. 418 * negative on failure.
420 * 419 *
421 */ 420 */
422 int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, 421 int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
423 struct netlbl_audit *audit_info) 422 struct netlbl_audit *audit_info)
424 { 423 {
425 return netlbl_domhsh_add(entry, audit_info); 424 return netlbl_domhsh_add(entry, audit_info);
426 } 425 }
427 426
428 /** 427 /**
429 * netlbl_domhsh_remove_entry - Removes a given entry from the domain table 428 * netlbl_domhsh_remove_entry - Removes a given entry from the domain table
430 * @entry: the entry to remove 429 * @entry: the entry to remove
431 * @audit_info: NetLabel audit information 430 * @audit_info: NetLabel audit information
432 * 431 *
433 * Description: 432 * Description:
434 * Removes an entry from the domain hash table and handles any updates to the 433 * Removes an entry from the domain hash table and handles any updates to the
435 * lower level protocol handler (i.e. CIPSO). Caller is responsible for 434 * lower level protocol handler (i.e. CIPSO). Caller is responsible for
436 * ensuring that the RCU read lock is held. Returns zero on success, negative 435 * ensuring that the RCU read lock is held. Returns zero on success, negative
437 * on failure. 436 * on failure.
438 * 437 *
439 */ 438 */
440 int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, 439 int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
441 struct netlbl_audit *audit_info) 440 struct netlbl_audit *audit_info)
442 { 441 {
443 int ret_val = 0; 442 int ret_val = 0;
444 struct audit_buffer *audit_buf; 443 struct audit_buffer *audit_buf;
445 444
446 if (entry == NULL) 445 if (entry == NULL)
447 return -ENOENT; 446 return -ENOENT;
448 447
449 spin_lock(&netlbl_domhsh_lock); 448 spin_lock(&netlbl_domhsh_lock);
450 if (entry->valid) { 449 if (entry->valid) {
451 entry->valid = 0; 450 entry->valid = 0;
452 if (entry != rcu_dereference(netlbl_domhsh_def)) 451 if (entry != rcu_dereference(netlbl_domhsh_def))
453 list_del_rcu(&entry->list); 452 list_del_rcu(&entry->list);
454 else 453 else
455 rcu_assign_pointer(netlbl_domhsh_def, NULL); 454 rcu_assign_pointer(netlbl_domhsh_def, NULL);
456 } else 455 } else
457 ret_val = -ENOENT; 456 ret_val = -ENOENT;
458 spin_unlock(&netlbl_domhsh_lock); 457 spin_unlock(&netlbl_domhsh_lock);
459 458
460 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); 459 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
461 if (audit_buf != NULL) { 460 if (audit_buf != NULL) {
462 audit_log_format(audit_buf, 461 audit_log_format(audit_buf,
463 " nlbl_domain=%s res=%u", 462 " nlbl_domain=%s res=%u",
464 entry->domain ? entry->domain : "(default)", 463 entry->domain ? entry->domain : "(default)",
465 ret_val == 0 ? 1 : 0); 464 ret_val == 0 ? 1 : 0);
466 audit_log_end(audit_buf); 465 audit_log_end(audit_buf);
467 } 466 }
468 467
469 if (ret_val == 0) { 468 if (ret_val == 0) {
470 struct netlbl_af4list *iter4; 469 struct netlbl_af4list *iter4;
471 struct netlbl_domaddr4_map *map4; 470 struct netlbl_domaddr4_map *map4;
472 471
473 switch (entry->type) { 472 switch (entry->type) {
474 case NETLBL_NLTYPE_ADDRSELECT: 473 case NETLBL_NLTYPE_ADDRSELECT:
475 netlbl_af4list_foreach_rcu(iter4, 474 netlbl_af4list_foreach_rcu(iter4,
476 &entry->type_def.addrsel->list4) { 475 &entry->type_def.addrsel->list4) {
477 map4 = netlbl_domhsh_addr4_entry(iter4); 476 map4 = netlbl_domhsh_addr4_entry(iter4);
478 cipso_v4_doi_putdef(map4->type_def.cipsov4); 477 cipso_v4_doi_putdef(map4->type_def.cipsov4);
479 } 478 }
480 /* no need to check the IPv6 list since we currently 479 /* no need to check the IPv6 list since we currently
481 * support only unlabeled protocols for IPv6 */ 480 * support only unlabeled protocols for IPv6 */
482 break; 481 break;
483 case NETLBL_NLTYPE_CIPSOV4: 482 case NETLBL_NLTYPE_CIPSOV4:
484 cipso_v4_doi_putdef(entry->type_def.cipsov4); 483 cipso_v4_doi_putdef(entry->type_def.cipsov4);
485 break; 484 break;
486 } 485 }
487 call_rcu(&entry->rcu, netlbl_domhsh_free_entry); 486 call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
488 } 487 }
489 488
490 return ret_val; 489 return ret_val;
491 } 490 }
492 491
493 /** 492 /**
494 * netlbl_domhsh_remove_af4 - Removes an address selector entry 493 * netlbl_domhsh_remove_af4 - Removes an address selector entry
495 * @domain: the domain 494 * @domain: the domain
496 * @addr: IPv4 address 495 * @addr: IPv4 address
497 * @mask: IPv4 address mask 496 * @mask: IPv4 address mask
498 * @audit_info: NetLabel audit information 497 * @audit_info: NetLabel audit information
499 * 498 *
500 * Description: 499 * Description:
501 * Removes an individual address selector from a domain mapping and potentially 500 * Removes an individual address selector from a domain mapping and potentially
502 * the entire mapping if it is empty. Returns zero on success, negative values 501 * the entire mapping if it is empty. Returns zero on success, negative values
503 * on failure. 502 * on failure.
504 * 503 *
505 */ 504 */
506 int netlbl_domhsh_remove_af4(const char *domain, 505 int netlbl_domhsh_remove_af4(const char *domain,
507 const struct in_addr *addr, 506 const struct in_addr *addr,
508 const struct in_addr *mask, 507 const struct in_addr *mask,
509 struct netlbl_audit *audit_info) 508 struct netlbl_audit *audit_info)
510 { 509 {
511 struct netlbl_dom_map *entry_map; 510 struct netlbl_dom_map *entry_map;
512 struct netlbl_af4list *entry_addr; 511 struct netlbl_af4list *entry_addr;
513 struct netlbl_af4list *iter4; 512 struct netlbl_af4list *iter4;
514 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 513 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
515 struct netlbl_af6list *iter6; 514 struct netlbl_af6list *iter6;
516 #endif /* IPv6 */ 515 #endif /* IPv6 */
517 struct netlbl_domaddr4_map *entry; 516 struct netlbl_domaddr4_map *entry;
518 517
519 rcu_read_lock(); 518 rcu_read_lock();
520 519
521 if (domain) 520 if (domain)
522 entry_map = netlbl_domhsh_search(domain); 521 entry_map = netlbl_domhsh_search(domain);
523 else 522 else
524 entry_map = netlbl_domhsh_search_def(domain); 523 entry_map = netlbl_domhsh_search_def(domain);
525 if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT) 524 if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT)
526 goto remove_af4_failure; 525 goto remove_af4_failure;
527 526
528 spin_lock(&netlbl_domhsh_lock); 527 spin_lock(&netlbl_domhsh_lock);
529 entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr, 528 entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
530 &entry_map->type_def.addrsel->list4); 529 &entry_map->type_def.addrsel->list4);
531 spin_unlock(&netlbl_domhsh_lock); 530 spin_unlock(&netlbl_domhsh_lock);
532 531
533 if (entry_addr == NULL) 532 if (entry_addr == NULL)
534 goto remove_af4_failure; 533 goto remove_af4_failure;
535 netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4) 534 netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4)
536 goto remove_af4_single_addr; 535 goto remove_af4_single_addr;
537 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 536 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
538 netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6) 537 netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6)
539 goto remove_af4_single_addr; 538 goto remove_af4_single_addr;
540 #endif /* IPv6 */ 539 #endif /* IPv6 */
541 /* the domain mapping is empty so remove it from the mapping table */ 540 /* the domain mapping is empty so remove it from the mapping table */
542 netlbl_domhsh_remove_entry(entry_map, audit_info); 541 netlbl_domhsh_remove_entry(entry_map, audit_info);
543 542
544 remove_af4_single_addr: 543 remove_af4_single_addr:
545 rcu_read_unlock(); 544 rcu_read_unlock();
546 /* yick, we can't use call_rcu here because we don't have a rcu head 545 /* yick, we can't use call_rcu here because we don't have a rcu head
547 * pointer but hopefully this should be a rare case so the pause 546 * pointer but hopefully this should be a rare case so the pause
548 * shouldn't be a problem */ 547 * shouldn't be a problem */
549 synchronize_rcu(); 548 synchronize_rcu();
550 entry = netlbl_domhsh_addr4_entry(entry_addr); 549 entry = netlbl_domhsh_addr4_entry(entry_addr);
551 cipso_v4_doi_putdef(entry->type_def.cipsov4); 550 cipso_v4_doi_putdef(entry->type_def.cipsov4);
552 kfree(entry); 551 kfree(entry);
553 return 0; 552 return 0;
554 553
555 remove_af4_failure: 554 remove_af4_failure:
556 rcu_read_unlock(); 555 rcu_read_unlock();
557 return -ENOENT; 556 return -ENOENT;
558 } 557 }
559 558
560 /** 559 /**
561 * netlbl_domhsh_remove - Removes an entry from the domain hash table 560 * netlbl_domhsh_remove - Removes an entry from the domain hash table
562 * @domain: the domain to remove 561 * @domain: the domain to remove
563 * @audit_info: NetLabel audit information 562 * @audit_info: NetLabel audit information
564 * 563 *
565 * Description: 564 * Description:
566 * Removes an entry from the domain hash table and handles any updates to the 565 * Removes an entry from the domain hash table and handles any updates to the
567 * lower level protocol handler (i.e. CIPSO). Returns zero on success, 566 * lower level protocol handler (i.e. CIPSO). Returns zero on success,
568 * negative on failure. 567 * negative on failure.
569 * 568 *
570 */ 569 */
571 int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) 570 int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
572 { 571 {
573 int ret_val; 572 int ret_val;
574 struct netlbl_dom_map *entry; 573 struct netlbl_dom_map *entry;
575 574
576 rcu_read_lock(); 575 rcu_read_lock();
577 if (domain) 576 if (domain)
578 entry = netlbl_domhsh_search(domain); 577 entry = netlbl_domhsh_search(domain);
579 else 578 else
580 entry = netlbl_domhsh_search_def(domain); 579 entry = netlbl_domhsh_search_def(domain);
581 ret_val = netlbl_domhsh_remove_entry(entry, audit_info); 580 ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
582 rcu_read_unlock(); 581 rcu_read_unlock();
583 582
584 return ret_val; 583 return ret_val;
585 } 584 }
586 585
587 /** 586 /**
588 * netlbl_domhsh_remove_default - Removes the default entry from the table 587 * netlbl_domhsh_remove_default - Removes the default entry from the table
589 * @audit_info: NetLabel audit information 588 * @audit_info: NetLabel audit information
590 * 589 *
591 * Description: 590 * Description:
592 * Removes/resets the default entry for the domain hash table and handles any 591 * Removes/resets the default entry for the domain hash table and handles any
593 * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on 592 * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on
594 * success, non-zero on failure. 593 * success, non-zero on failure.
595 * 594 *
596 */ 595 */
597 int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) 596 int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
598 { 597 {
599 return netlbl_domhsh_remove(NULL, audit_info); 598 return netlbl_domhsh_remove(NULL, audit_info);
600 } 599 }
601 600
602 /** 601 /**
603 * netlbl_domhsh_getentry - Get an entry from the domain hash table 602 * netlbl_domhsh_getentry - Get an entry from the domain hash table
604 * @domain: the domain name to search for 603 * @domain: the domain name to search for
605 * 604 *
606 * Description: 605 * Description:
607 * Look through the domain hash table searching for an entry to match @domain, 606 * Look through the domain hash table searching for an entry to match @domain,
608 * return a pointer to a copy of the entry or NULL. The caller is responsible 607 * return a pointer to a copy of the entry or NULL. The caller is responsible
609 * for ensuring that rcu_read_[un]lock() is called. 608 * for ensuring that rcu_read_[un]lock() is called.
610 * 609 *
611 */ 610 */
612 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) 611 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
613 { 612 {
614 return netlbl_domhsh_search_def(domain); 613 return netlbl_domhsh_search_def(domain);
615 } 614 }
616 615
617 /** 616 /**
618 * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table 617 * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table
619 * @domain: the domain name to search for 618 * @domain: the domain name to search for
620 * @addr: the IP address to search for 619 * @addr: the IP address to search for
621 * 620 *
622 * Description: 621 * Description:
623 * Look through the domain hash table searching for an entry to match @domain 622 * Look through the domain hash table searching for an entry to match @domain
624 * and @addr, return a pointer to a copy of the entry or NULL. The caller is 623 * and @addr, return a pointer to a copy of the entry or NULL. The caller is
625 * responsible for ensuring that rcu_read_[un]lock() is called. 624 * responsible for ensuring that rcu_read_[un]lock() is called.
626 * 625 *
627 */ 626 */
628 struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, 627 struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain,
629 __be32 addr) 628 __be32 addr)
630 { 629 {
631 struct netlbl_dom_map *dom_iter; 630 struct netlbl_dom_map *dom_iter;
632 struct netlbl_af4list *addr_iter; 631 struct netlbl_af4list *addr_iter;
633 632
634 dom_iter = netlbl_domhsh_search_def(domain); 633 dom_iter = netlbl_domhsh_search_def(domain);
635 if (dom_iter == NULL) 634 if (dom_iter == NULL)
636 return NULL; 635 return NULL;
637 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) 636 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT)
638 return NULL; 637 return NULL;
639 638
640 addr_iter = netlbl_af4list_search(addr, 639 addr_iter = netlbl_af4list_search(addr,
641 &dom_iter->type_def.addrsel->list4); 640 &dom_iter->type_def.addrsel->list4);
642 if (addr_iter == NULL) 641 if (addr_iter == NULL)
643 return NULL; 642 return NULL;
644 643
645 return netlbl_domhsh_addr4_entry(addr_iter); 644 return netlbl_domhsh_addr4_entry(addr_iter);
646 } 645 }
647 646
648 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 647 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
649 /** 648 /**
650 * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table 649 * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table
651 * @domain: the domain name to search for 650 * @domain: the domain name to search for
652 * @addr: the IP address to search for 651 * @addr: the IP address to search for
653 * 652 *
654 * Description: 653 * Description:
655 * Look through the domain hash table searching for an entry to match @domain 654 * Look through the domain hash table searching for an entry to match @domain
656 * and @addr, return a pointer to a copy of the entry or NULL. The caller is 655 * and @addr, return a pointer to a copy of the entry or NULL. The caller is
657 * responsible for ensuring that rcu_read_[un]lock() is called. 656 * responsible for ensuring that rcu_read_[un]lock() is called.
658 * 657 *
659 */ 658 */
660 struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, 659 struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain,
661 const struct in6_addr *addr) 660 const struct in6_addr *addr)
662 { 661 {
663 struct netlbl_dom_map *dom_iter; 662 struct netlbl_dom_map *dom_iter;
664 struct netlbl_af6list *addr_iter; 663 struct netlbl_af6list *addr_iter;
665 664
666 dom_iter = netlbl_domhsh_search_def(domain); 665 dom_iter = netlbl_domhsh_search_def(domain);
667 if (dom_iter == NULL) 666 if (dom_iter == NULL)
668 return NULL; 667 return NULL;
669 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) 668 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT)
670 return NULL; 669 return NULL;
671 670
672 addr_iter = netlbl_af6list_search(addr, 671 addr_iter = netlbl_af6list_search(addr,
673 &dom_iter->type_def.addrsel->list6); 672 &dom_iter->type_def.addrsel->list6);
674 if (addr_iter == NULL) 673 if (addr_iter == NULL)
675 return NULL; 674 return NULL;
676 675
677 return netlbl_domhsh_addr6_entry(addr_iter); 676 return netlbl_domhsh_addr6_entry(addr_iter);
678 } 677 }
679 #endif /* IPv6 */ 678 #endif /* IPv6 */
680 679
681 /** 680 /**
682 * netlbl_domhsh_walk - Iterate through the domain mapping hash table 681 * netlbl_domhsh_walk - Iterate through the domain mapping hash table
683 * @skip_bkt: the number of buckets to skip at the start 682 * @skip_bkt: the number of buckets to skip at the start
684 * @skip_chain: the number of entries to skip in the first iterated bucket 683 * @skip_chain: the number of entries to skip in the first iterated bucket
685 * @callback: callback for each entry 684 * @callback: callback for each entry
686 * @cb_arg: argument for the callback function 685 * @cb_arg: argument for the callback function
687 * 686 *
688 * Description: 687 * Description:
689 * Iterate over the domain mapping hash table, skipping the first @skip_bkt 688 * Iterate over the domain mapping hash table, skipping the first @skip_bkt
690 * buckets and @skip_chain entries. For each entry in the table call 689 * buckets and @skip_chain entries. For each entry in the table call
691 * @callback, if @callback returns a negative value stop 'walking' through the 690 * @callback, if @callback returns a negative value stop 'walking' through the
692 * table and return. Updates the values in @skip_bkt and @skip_chain on 691 * table and return. Updates the values in @skip_bkt and @skip_chain on
693 * return. Returns zero on success, negative values on failure. 692 * return. Returns zero on success, negative values on failure.
694 * 693 *
695 */ 694 */
696 int netlbl_domhsh_walk(u32 *skip_bkt, 695 int netlbl_domhsh_walk(u32 *skip_bkt,
697 u32 *skip_chain, 696 u32 *skip_chain,
698 int (*callback) (struct netlbl_dom_map *entry, void *arg), 697 int (*callback) (struct netlbl_dom_map *entry, void *arg),
699 void *cb_arg) 698 void *cb_arg)
700 { 699 {
701 int ret_val = -ENOENT; 700 int ret_val = -ENOENT;
702 u32 iter_bkt; 701 u32 iter_bkt;
703 struct list_head *iter_list; 702 struct list_head *iter_list;
704 struct netlbl_dom_map *iter_entry; 703 struct netlbl_dom_map *iter_entry;
705 u32 chain_cnt = 0; 704 u32 chain_cnt = 0;
706 705
707 rcu_read_lock(); 706 rcu_read_lock();
708 for (iter_bkt = *skip_bkt; 707 for (iter_bkt = *skip_bkt;
709 iter_bkt < rcu_dereference(netlbl_domhsh)->size; 708 iter_bkt < rcu_dereference(netlbl_domhsh)->size;
710 iter_bkt++, chain_cnt = 0) { 709 iter_bkt++, chain_cnt = 0) {
711 iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt]; 710 iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt];
712 list_for_each_entry_rcu(iter_entry, iter_list, list) 711 list_for_each_entry_rcu(iter_entry, iter_list, list)
713 if (iter_entry->valid) { 712 if (iter_entry->valid) {
714 if (chain_cnt++ < *skip_chain) 713 if (chain_cnt++ < *skip_chain)
715 continue; 714 continue;
716 ret_val = callback(iter_entry, cb_arg); 715 ret_val = callback(iter_entry, cb_arg);
717 if (ret_val < 0) { 716 if (ret_val < 0) {
718 chain_cnt--; 717 chain_cnt--;
719 goto walk_return; 718 goto walk_return;
720 } 719 }
721 } 720 }
722 } 721 }
723 722
724 walk_return: 723 walk_return:
725 rcu_read_unlock(); 724 rcu_read_unlock();
726 *skip_bkt = iter_bkt; 725 *skip_bkt = iter_bkt;
727 *skip_chain = chain_cnt; 726 *skip_chain = chain_cnt;
728 return ret_val; 727 return ret_val;
729 } 728 }
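A short usage sketch for the walk interface above; the callback and the caller are hypothetical and not part of this patch. The @skip_bkt/@skip_chain pair acts as a resumable cursor, a negative callback return stops the walk, and no extra locking is needed because netlbl_domhsh_walk() takes rcu_read_lock() itself.

#include <linux/kernel.h>
#include "netlabel_domainhash.h"

/* Hypothetical callback: log every valid mapping; a negative return value
 * here would stop the walk early. */
static int example_domhsh_cb(struct netlbl_dom_map *entry, void *arg)
{
	printk(KERN_DEBUG "netlabel: domain=%s\n",
	       entry->domain ? entry->domain : "(default)");
	return 0;
}

/* Hypothetical caller: walk the whole table from the beginning. */
static void example_domhsh_dump(void)
{
	u32 skip_bkt = 0;
	u32 skip_chain = 0;

	netlbl_domhsh_walk(&skip_bkt, &skip_chain, example_domhsh_cb, NULL);
}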
730 729
net/netlabel/netlabel_unlabeled.c
1 /* 1 /*
2 * NetLabel Unlabeled Support 2 * NetLabel Unlabeled Support
3 * 3 *
4 * This file defines functions for dealing with unlabeled packets for the 4 * This file defines functions for dealing with unlabeled packets for the
5 * NetLabel system. The NetLabel system manages static and dynamic label 5 * NetLabel system. The NetLabel system manages static and dynamic label
6 * mappings for network protocols such as CIPSO and RIPSO. 6 * mappings for network protocols such as CIPSO and RIPSO.
7 * 7 *
8 * Author: Paul Moore <paul.moore@hp.com> 8 * Author: Paul Moore <paul.moore@hp.com>
9 * 9 *
10 */ 10 */
11 11
12 /* 12 /*
13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008
14 * 14 *
15 * This program is free software; you can redistribute it and/or modify 15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by 16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or 17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version. 18 * (at your option) any later version.
19 * 19 *
20 * This program is distributed in the hope that it will be useful, 20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
23 * the GNU General Public License for more details. 23 * the GNU General Public License for more details.
24 * 24 *
25 * You should have received a copy of the GNU General Public License 25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software 26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 * 28 *
29 */ 29 */
30 30
31 #include <linux/types.h> 31 #include <linux/types.h>
32 #include <linux/rcupdate.h> 32 #include <linux/rcupdate.h>
33 #include <linux/list.h> 33 #include <linux/list.h>
34 #include <linux/spinlock.h> 34 #include <linux/spinlock.h>
35 #include <linux/socket.h> 35 #include <linux/socket.h>
36 #include <linux/string.h> 36 #include <linux/string.h>
37 #include <linux/skbuff.h> 37 #include <linux/skbuff.h>
38 #include <linux/audit.h> 38 #include <linux/audit.h>
39 #include <linux/in.h> 39 #include <linux/in.h>
40 #include <linux/in6.h> 40 #include <linux/in6.h>
41 #include <linux/ip.h> 41 #include <linux/ip.h>
42 #include <linux/ipv6.h> 42 #include <linux/ipv6.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/netdevice.h> 44 #include <linux/netdevice.h>
45 #include <linux/security.h> 45 #include <linux/security.h>
46 #include <linux/slab.h> 46 #include <linux/slab.h>
47 #include <net/sock.h> 47 #include <net/sock.h>
48 #include <net/netlink.h> 48 #include <net/netlink.h>
49 #include <net/genetlink.h> 49 #include <net/genetlink.h>
50 #include <net/ip.h> 50 #include <net/ip.h>
51 #include <net/ipv6.h> 51 #include <net/ipv6.h>
52 #include <net/net_namespace.h> 52 #include <net/net_namespace.h>
53 #include <net/netlabel.h> 53 #include <net/netlabel.h>
54 #include <asm/bug.h> 54 #include <asm/bug.h>
55 #include <asm/atomic.h> 55 #include <asm/atomic.h>
56 56
57 #include "netlabel_user.h" 57 #include "netlabel_user.h"
58 #include "netlabel_addrlist.h" 58 #include "netlabel_addrlist.h"
59 #include "netlabel_domainhash.h" 59 #include "netlabel_domainhash.h"
60 #include "netlabel_unlabeled.h" 60 #include "netlabel_unlabeled.h"
61 #include "netlabel_mgmt.h" 61 #include "netlabel_mgmt.h"
62 62
63 /* NOTE: at present we always use init's network namespace since we don't 63 /* NOTE: at present we always use init's network namespace since we don't
64 * presently support different namespaces even though the majority of 64 * presently support different namespaces even though the majority of
65 * the functions in this file are "namespace safe" */ 65 * the functions in this file are "namespace safe" */
66 66
67 /* The unlabeled connection hash table which we use to map network interfaces 67 /* The unlabeled connection hash table which we use to map network interfaces
68 * and addresses of unlabeled packets to a user specified secid value for the 68 * and addresses of unlabeled packets to a user specified secid value for the
69 * LSM. The hash table is used to lookup the network interface entry 69 * LSM. The hash table is used to lookup the network interface entry
70 * (struct netlbl_unlhsh_iface) and then the interface entry is used to 70 * (struct netlbl_unlhsh_iface) and then the interface entry is used to
71 * lookup an IP address match from an ordered list. If a network interface 71 * lookup an IP address match from an ordered list. If a network interface
72 * match can not be found in the hash table then the default entry 72 * match can not be found in the hash table then the default entry
73 * (netlbl_unlhsh_def) is used. The IP address entry list 73 * (netlbl_unlhsh_def) is used. The IP address entry list
74 * (struct netlbl_unlhsh_addr) is ordered such that the entries with a 74 * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
75 * larger netmask come first. 75 * larger netmask come first.
76 */ 76 */
77 struct netlbl_unlhsh_tbl { 77 struct netlbl_unlhsh_tbl {
78 struct list_head *tbl; 78 struct list_head *tbl;
79 u32 size; 79 u32 size;
80 }; 80 };
81 #define netlbl_unlhsh_addr4_entry(iter) \ 81 #define netlbl_unlhsh_addr4_entry(iter) \
82 container_of(iter, struct netlbl_unlhsh_addr4, list) 82 container_of(iter, struct netlbl_unlhsh_addr4, list)
83 struct netlbl_unlhsh_addr4 { 83 struct netlbl_unlhsh_addr4 {
84 u32 secid; 84 u32 secid;
85 85
86 struct netlbl_af4list list; 86 struct netlbl_af4list list;
87 struct rcu_head rcu; 87 struct rcu_head rcu;
88 }; 88 };
89 #define netlbl_unlhsh_addr6_entry(iter) \ 89 #define netlbl_unlhsh_addr6_entry(iter) \
90 container_of(iter, struct netlbl_unlhsh_addr6, list) 90 container_of(iter, struct netlbl_unlhsh_addr6, list)
91 struct netlbl_unlhsh_addr6 { 91 struct netlbl_unlhsh_addr6 {
92 u32 secid; 92 u32 secid;
93 93
94 struct netlbl_af6list list; 94 struct netlbl_af6list list;
95 struct rcu_head rcu; 95 struct rcu_head rcu;
96 }; 96 };
97 struct netlbl_unlhsh_iface { 97 struct netlbl_unlhsh_iface {
98 int ifindex; 98 int ifindex;
99 struct list_head addr4_list; 99 struct list_head addr4_list;
100 struct list_head addr6_list; 100 struct list_head addr6_list;
101 101
102 u32 valid; 102 u32 valid;
103 struct list_head list; 103 struct list_head list;
104 struct rcu_head rcu; 104 struct rcu_head rcu;
105 }; 105 };
106 106
107 /* Argument struct for netlbl_unlhsh_walk() */ 107 /* Argument struct for netlbl_unlhsh_walk() */
108 struct netlbl_unlhsh_walk_arg { 108 struct netlbl_unlhsh_walk_arg {
109 struct netlink_callback *nl_cb; 109 struct netlink_callback *nl_cb;
110 struct sk_buff *skb; 110 struct sk_buff *skb;
111 u32 seq; 111 u32 seq;
112 }; 112 };
113 113
114 /* Unlabeled connection hash table */ 114 /* Unlabeled connection hash table */
115 /* updates should be so rare that having one spinlock for the entire 115 /* updates should be so rare that having one spinlock for the entire
116 * hash table should be okay */ 116 * hash table should be okay */
117 static DEFINE_SPINLOCK(netlbl_unlhsh_lock); 117 static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
118 #define netlbl_unlhsh_rcu_deref(p) \ 118 #define netlbl_unlhsh_rcu_deref(p) \
119 rcu_dereference_check(p, rcu_read_lock_held() || \ 119 rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
120 lockdep_is_held(&netlbl_unlhsh_lock))
121 static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; 120 static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
122 static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; 121 static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
123 122
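The netlbl_unlhsh_rcu_deref() hunk above is the only functional change in this file. Below is a hedged, self-contained sketch of how a pointer guarded by this kind of checked dereference is typically read and updated; every identifier (my_data, my_lock, my_ptr, my_deref, my_read, my_publish) is illustrative and not part of this patch.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct my_data {
	int val;
	struct rcu_head rcu;
};

static DEFINE_SPINLOCK(my_lock);
static struct my_data *my_ptr;

/* update-side lock is the only explicit condition, matching the new
 * form of netlbl_unlhsh_rcu_deref() above */
#define my_deref(p) \
	rcu_dereference_check(p, lockdep_is_held(&my_lock))

static int my_read(void)
{
	struct my_data *p;
	int val = 0;

	rcu_read_lock();			/* reader side */
	p = my_deref(my_ptr);
	if (p != NULL)
		val = p->val;
	rcu_read_unlock();
	return val;
}

static void my_publish(struct my_data *new)
{
	struct my_data *old;

	spin_lock(&my_lock);			/* writer side */
	old = my_deref(my_ptr);
	rcu_assign_pointer(my_ptr, new);
	spin_unlock(&my_lock);
	if (old != NULL)
		kfree_rcu(old, rcu);		/* free after a grace period */
}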
124 /* Accept unlabeled packets flag */ 123 /* Accept unlabeled packets flag */
125 static u8 netlabel_unlabel_acceptflg = 0; 124 static u8 netlabel_unlabel_acceptflg = 0;
126 125
127 /* NetLabel Generic NETLINK unlabeled family */ 126 /* NetLabel Generic NETLINK unlabeled family */
128 static struct genl_family netlbl_unlabel_gnl_family = { 127 static struct genl_family netlbl_unlabel_gnl_family = {
129 .id = GENL_ID_GENERATE, 128 .id = GENL_ID_GENERATE,
130 .hdrsize = 0, 129 .hdrsize = 0,
131 .name = NETLBL_NLTYPE_UNLABELED_NAME, 130 .name = NETLBL_NLTYPE_UNLABELED_NAME,
132 .version = NETLBL_PROTO_VERSION, 131 .version = NETLBL_PROTO_VERSION,
133 .maxattr = NLBL_UNLABEL_A_MAX, 132 .maxattr = NLBL_UNLABEL_A_MAX,
134 }; 133 };
135 134
136 /* NetLabel Netlink attribute policy */ 135 /* NetLabel Netlink attribute policy */
137 static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { 136 static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
138 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, 137 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
139 [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, 138 [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
140 .len = sizeof(struct in6_addr) }, 139 .len = sizeof(struct in6_addr) },
141 [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, 140 [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
142 .len = sizeof(struct in6_addr) }, 141 .len = sizeof(struct in6_addr) },
143 [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, 142 [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
144 .len = sizeof(struct in_addr) }, 143 .len = sizeof(struct in_addr) },
145 [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, 144 [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
146 .len = sizeof(struct in_addr) }, 145 .len = sizeof(struct in_addr) },
147 [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, 146 [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
148 .len = IFNAMSIZ - 1 }, 147 .len = IFNAMSIZ - 1 },
149 [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } 148 [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
150 }; 149 };
151 150
152 /* 151 /*
153 * Unlabeled Connection Hash Table Functions 152 * Unlabeled Connection Hash Table Functions
154 */ 153 */
155 154
156 /** 155 /**
157 * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table 156 * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
158 * @entry: the entry's RCU field 157 * @entry: the entry's RCU field
159 * 158 *
160 * Description: 159 * Description:
161 * This function is designed to be used as a callback to the call_rcu() 160 * This function is designed to be used as a callback to the call_rcu()
162 * function so that memory allocated to a hash table interface entry can be 161 * function so that memory allocated to a hash table interface entry can be
163 * released safely. It is important to note that this function does not free 162 * released safely. It is important to note that this function does not free
164 * the IPv4 and IPv6 address lists contained as part of an interface entry. It 163 * the IPv4 and IPv6 address lists contained as part of an interface entry. It
165 * is up to the rest of the code to make sure an interface entry is only freed 164 * is up to the rest of the code to make sure an interface entry is only freed
166 * once its address lists are empty. 165 * once its address lists are empty.
167 * 166 *
168 */ 167 */
169 static void netlbl_unlhsh_free_iface(struct rcu_head *entry) 168 static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
170 { 169 {
171 struct netlbl_unlhsh_iface *iface; 170 struct netlbl_unlhsh_iface *iface;
172 struct netlbl_af4list *iter4; 171 struct netlbl_af4list *iter4;
173 struct netlbl_af4list *tmp4; 172 struct netlbl_af4list *tmp4;
174 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 173 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
175 struct netlbl_af6list *iter6; 174 struct netlbl_af6list *iter6;
176 struct netlbl_af6list *tmp6; 175 struct netlbl_af6list *tmp6;
177 #endif /* IPv6 */ 176 #endif /* IPv6 */
178 177
179 iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); 178 iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
180 179
181 /* no need for locks here since we are the only one with access to this 180 /* no need for locks here since we are the only one with access to this
182 * structure */ 181 * structure */
183 182
184 netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { 183 netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) {
185 netlbl_af4list_remove_entry(iter4); 184 netlbl_af4list_remove_entry(iter4);
186 kfree(netlbl_unlhsh_addr4_entry(iter4)); 185 kfree(netlbl_unlhsh_addr4_entry(iter4));
187 } 186 }
188 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 187 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
189 netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { 188 netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) {
190 netlbl_af6list_remove_entry(iter6); 189 netlbl_af6list_remove_entry(iter6);
191 kfree(netlbl_unlhsh_addr6_entry(iter6)); 190 kfree(netlbl_unlhsh_addr6_entry(iter6));
192 } 191 }
193 #endif /* IPv6 */ 192 #endif /* IPv6 */
194 kfree(iface); 193 kfree(iface);
195 } 194 }
196 195
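netlbl_unlhsh_free_iface() follows the standard call_rcu() callback shape: recover the enclosing object with container_of() and free it once the grace period has elapsed. A minimal illustrative sketch of that shape; example_node, example_free, and example_retire are made-up names, not from this file.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_node {
	int payload;
	struct rcu_head rcu;
};

/* runs after a grace period; recovers the object from its rcu_head */
static void example_free(struct rcu_head *entry)
{
	struct example_node *node;

	node = container_of(entry, struct example_node, rcu);
	kfree(node);
}

/* the caller unlinks the node from all RCU-visible structures first,
 * then defers the actual free */
static void example_retire(struct example_node *node)
{
	call_rcu(&node->rcu, example_free);
}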
197 /** 196 /**
198 * netlbl_unlhsh_hash - Hashing function for the hash table 197 * netlbl_unlhsh_hash - Hashing function for the hash table
199 * @ifindex: the network interface/device to hash 198 * @ifindex: the network interface/device to hash
200 * 199 *
201 * Description: 200 * Description:
202 * This is the hashing function for the unlabeled hash table, it returns the 201 * This is the hashing function for the unlabeled hash table, it returns the
203 * bucket number for the given device/interface. The caller is responsible for 202 * bucket number for the given device/interface. The caller is responsible for
204 * ensuring that the hash table is protected with either a RCU read lock or 203 * ensuring that the hash table is protected with either a RCU read lock or
205 * the hash table lock. 204 * the hash table lock.
206 * 205 *
207 */ 206 */
208 static u32 netlbl_unlhsh_hash(int ifindex) 207 static u32 netlbl_unlhsh_hash(int ifindex)
209 { 208 {
210 return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); 209 return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1);
211 } 210 }
212 211
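netlbl_unlhsh_hash() folds the interface index into a bucket with a bitmask rather than a modulo, which is only equivalent when the table size is a power of two; that sizing is an assumption here, since the table allocation is not shown in this hunk. A small stand-alone userspace check of the equivalence:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int size = 32;		/* hypothetical power-of-two bucket count */
	unsigned int ifindex;

	/* for a power-of-two size, masking equals a modulo reduction */
	for (ifindex = 1; ifindex <= 1000; ifindex++)
		assert((ifindex & (size - 1)) == (ifindex % size));

	printf("ifindex 70 maps to bucket %u\n", 70 & (size - 1));	/* prints 6 */
	return 0;
}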
213 /** 212 /**
214 * netlbl_unlhsh_search_iface - Search for a matching interface entry 213 * netlbl_unlhsh_search_iface - Search for a matching interface entry
215 * @ifindex: the network interface 214 * @ifindex: the network interface
216 * 215 *
217 * Description: 216 * Description:
218 * Searches the unlabeled connection hash table and returns a pointer to the 217 * Searches the unlabeled connection hash table and returns a pointer to the
219 * interface entry which matches @ifindex, otherwise NULL is returned. The 218 * interface entry which matches @ifindex, otherwise NULL is returned. The
220 * caller is responsible for ensuring that the hash table is protected with 219 * caller is responsible for ensuring that the hash table is protected with
221 * either a RCU read lock or the hash table lock. 220 * either a RCU read lock or the hash table lock.
222 * 221 *
223 */ 222 */
224 static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) 223 static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
225 { 224 {
226 u32 bkt; 225 u32 bkt;
227 struct list_head *bkt_list; 226 struct list_head *bkt_list;
228 struct netlbl_unlhsh_iface *iter; 227 struct netlbl_unlhsh_iface *iter;
229 228
230 bkt = netlbl_unlhsh_hash(ifindex); 229 bkt = netlbl_unlhsh_hash(ifindex);
231 bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; 230 bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt];
232 list_for_each_entry_rcu(iter, bkt_list, list) 231 list_for_each_entry_rcu(iter, bkt_list, list)
233 if (iter->valid && iter->ifindex == ifindex) 232 if (iter->valid && iter->ifindex == ifindex)
234 return iter; 233 return iter;
235 234
236 return NULL; 235 return NULL;
237 } 236 }
238 237
239 /** 238 /**
240 * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table 239 * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
241 * @iface: the associated interface entry 240 * @iface: the associated interface entry
242 * @addr: IPv4 address in network byte order 241 * @addr: IPv4 address in network byte order
243 * @mask: IPv4 address mask in network byte order 242 * @mask: IPv4 address mask in network byte order
244 * @secid: LSM secid value for entry 243 * @secid: LSM secid value for entry
245 * 244 *
246 * Description: 245 * Description:
247 * Add a new address entry into the unlabeled connection hash table using the 246 * Add a new address entry into the unlabeled connection hash table using the
248 * interface entry specified by @iface. On success zero is returned, otherwise 247 * interface entry specified by @iface. On success zero is returned, otherwise
249 * a negative value is returned. 248 * a negative value is returned.
250 * 249 *
251 */ 250 */
252 static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, 251 static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
253 const struct in_addr *addr, 252 const struct in_addr *addr,
254 const struct in_addr *mask, 253 const struct in_addr *mask,
255 u32 secid) 254 u32 secid)
256 { 255 {
257 int ret_val; 256 int ret_val;
258 struct netlbl_unlhsh_addr4 *entry; 257 struct netlbl_unlhsh_addr4 *entry;
259 258
260 entry = kzalloc(sizeof(*entry), GFP_ATOMIC); 259 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
261 if (entry == NULL) 260 if (entry == NULL)
262 return -ENOMEM; 261 return -ENOMEM;
263 262
264 entry->list.addr = addr->s_addr & mask->s_addr; 263 entry->list.addr = addr->s_addr & mask->s_addr;
265 entry->list.mask = mask->s_addr; 264 entry->list.mask = mask->s_addr;
266 entry->list.valid = 1; 265 entry->list.valid = 1;
267 entry->secid = secid; 266 entry->secid = secid;
268 267
269 spin_lock(&netlbl_unlhsh_lock); 268 spin_lock(&netlbl_unlhsh_lock);
270 ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); 269 ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list);
271 spin_unlock(&netlbl_unlhsh_lock); 270 spin_unlock(&netlbl_unlhsh_lock);
272 271
273 if (ret_val != 0) 272 if (ret_val != 0)
274 kfree(entry); 273 kfree(entry);
275 return ret_val; 274 return ret_val;
276 } 275 }
277 276
278 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 277 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
279 /** 278 /**
280 * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table 279 * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
281 * @iface: the associated interface entry 280 * @iface: the associated interface entry
282 * @addr: IPv6 address in network byte order 281 * @addr: IPv6 address in network byte order
283 * @mask: IPv6 address mask in network byte order 282 * @mask: IPv6 address mask in network byte order
284 * @secid: LSM secid value for entry 283 * @secid: LSM secid value for entry
285 * 284 *
286 * Description: 285 * Description:
287 * Add a new address entry into the unlabeled connection hash table using the 286 * Add a new address entry into the unlabeled connection hash table using the
288 * interface entry specified by @iface. On success zero is returned, otherwise 287 * interface entry specified by @iface. On success zero is returned, otherwise
289 * a negative value is returned. 288 * a negative value is returned.
290 * 289 *
291 */ 290 */
292 static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, 291 static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
293 const struct in6_addr *addr, 292 const struct in6_addr *addr,
294 const struct in6_addr *mask, 293 const struct in6_addr *mask,
295 u32 secid) 294 u32 secid)
296 { 295 {
297 int ret_val; 296 int ret_val;
298 struct netlbl_unlhsh_addr6 *entry; 297 struct netlbl_unlhsh_addr6 *entry;
299 298
300 entry = kzalloc(sizeof(*entry), GFP_ATOMIC); 299 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
301 if (entry == NULL) 300 if (entry == NULL)
302 return -ENOMEM; 301 return -ENOMEM;
303 302
304 ipv6_addr_copy(&entry->list.addr, addr); 303 ipv6_addr_copy(&entry->list.addr, addr);
305 entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; 304 entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
306 entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; 305 entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
307 entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; 306 entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
308 entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; 307 entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
309 ipv6_addr_copy(&entry->list.mask, mask); 308 ipv6_addr_copy(&entry->list.mask, mask);
310 entry->list.valid = 1; 309 entry->list.valid = 1;
311 entry->secid = secid; 310 entry->secid = secid;
312 311
313 spin_lock(&netlbl_unlhsh_lock); 312 spin_lock(&netlbl_unlhsh_lock);
314 ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); 313 ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list);
315 spin_unlock(&netlbl_unlhsh_lock); 314 spin_unlock(&netlbl_unlhsh_lock);
316 315
317 if (ret_val != 0) 316 if (ret_val != 0)
318 kfree(entry); 317 kfree(entry);
319 return 0; 318 return 0;
320 } 319 }
321 #endif /* IPv6 */ 320 #endif /* IPv6 */
322 321
323 /** 322 /**
324 * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table 323 * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
325 * @ifindex: network interface 324 * @ifindex: network interface
326 * 325 *
327 * Description: 326 * Description:
328 * Add a new, empty, interface entry into the unlabeled connection hash table. 327 * Add a new, empty, interface entry into the unlabeled connection hash table.
329 * On success a pointer to the new interface entry is returned, on failure NULL 328 * On success a pointer to the new interface entry is returned, on failure NULL
330 * is returned. 329 * is returned.
331 * 330 *
332 */ 331 */
333 static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) 332 static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
334 { 333 {
335 u32 bkt; 334 u32 bkt;
336 struct netlbl_unlhsh_iface *iface; 335 struct netlbl_unlhsh_iface *iface;
337 336
338 iface = kzalloc(sizeof(*iface), GFP_ATOMIC); 337 iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
339 if (iface == NULL) 338 if (iface == NULL)
340 return NULL; 339 return NULL;
341 340
342 iface->ifindex = ifindex; 341 iface->ifindex = ifindex;
343 INIT_LIST_HEAD(&iface->addr4_list); 342 INIT_LIST_HEAD(&iface->addr4_list);
344 INIT_LIST_HEAD(&iface->addr6_list); 343 INIT_LIST_HEAD(&iface->addr6_list);
345 iface->valid = 1; 344 iface->valid = 1;
346 345
347 spin_lock(&netlbl_unlhsh_lock); 346 spin_lock(&netlbl_unlhsh_lock);
348 if (ifindex > 0) { 347 if (ifindex > 0) {
349 bkt = netlbl_unlhsh_hash(ifindex); 348 bkt = netlbl_unlhsh_hash(ifindex);
350 if (netlbl_unlhsh_search_iface(ifindex) != NULL) 349 if (netlbl_unlhsh_search_iface(ifindex) != NULL)
351 goto add_iface_failure; 350 goto add_iface_failure;
352 list_add_tail_rcu(&iface->list, 351 list_add_tail_rcu(&iface->list,
353 &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); 352 &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]);
354 } else { 353 } else {
355 INIT_LIST_HEAD(&iface->list); 354 INIT_LIST_HEAD(&iface->list);
356 if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) 355 if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL)
357 goto add_iface_failure; 356 goto add_iface_failure;
358 rcu_assign_pointer(netlbl_unlhsh_def, iface); 357 rcu_assign_pointer(netlbl_unlhsh_def, iface);
359 } 358 }
360 spin_unlock(&netlbl_unlhsh_lock); 359 spin_unlock(&netlbl_unlhsh_lock);
361 360
362 return iface; 361 return iface;
363 362
364 add_iface_failure: 363 add_iface_failure:
365 spin_unlock(&netlbl_unlhsh_lock); 364 spin_unlock(&netlbl_unlhsh_lock);
366 kfree(iface); 365 kfree(iface);
367 return NULL; 366 return NULL;
368 } 367 }
369 368
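netlbl_unlhsh_add_iface() initializes the new entry completely before taking netlbl_unlhsh_lock, re-checks for a duplicate under the lock, and only then publishes it with list_add_tail_rcu() (or rcu_assign_pointer() for the default entry). A hedged sketch of the same search/add pairing; example_iface, example_bucket, example_lock and the helpers are illustrative names only.

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct example_iface {
	int ifindex;
	struct list_head list;
};

static LIST_HEAD(example_bucket);
static DEFINE_SPINLOCK(example_lock);

/* reader-style walk; also safe for a writer holding example_lock */
static struct example_iface *example_search(int ifindex)
{
	struct example_iface *iter;

	list_for_each_entry_rcu(iter, &example_bucket, list)
		if (iter->ifindex == ifindex)
			return iter;
	return NULL;
}

/* writer: fully initialize, then publish under the update-side lock */
static struct example_iface *example_add(int ifindex)
{
	struct example_iface *iface;

	iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
	if (iface == NULL)
		return NULL;
	iface->ifindex = ifindex;

	spin_lock(&example_lock);
	if (example_search(ifindex) != NULL) {
		spin_unlock(&example_lock);
		kfree(iface);
		return NULL;
	}
	list_add_tail_rcu(&iface->list, &example_bucket);
	spin_unlock(&example_lock);

	return iface;
}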
370 /** 369 /**
371 * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table 370 * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
372 * @net: network namespace 371 * @net: network namespace
373 * @dev_name: interface name 372 * @dev_name: interface name
374 * @addr: IP address in network byte order 373 * @addr: IP address in network byte order
375 * @mask: address mask in network byte order 374 * @mask: address mask in network byte order
376 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) 375 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
377 * @secid: LSM secid value for the entry 376 * @secid: LSM secid value for the entry
378 * @audit_info: NetLabel audit information 377 * @audit_info: NetLabel audit information
379 * 378 *
380 * Description: 379 * Description:
381 * Adds a new entry to the unlabeled connection hash table. Returns zero on 380 * Adds a new entry to the unlabeled connection hash table. Returns zero on
382 * success, negative values on failure. 381 * success, negative values on failure.
383 * 382 *
384 */ 383 */
385 int netlbl_unlhsh_add(struct net *net, 384 int netlbl_unlhsh_add(struct net *net,
386 const char *dev_name, 385 const char *dev_name,
387 const void *addr, 386 const void *addr,
388 const void *mask, 387 const void *mask,
389 u32 addr_len, 388 u32 addr_len,
390 u32 secid, 389 u32 secid,
391 struct netlbl_audit *audit_info) 390 struct netlbl_audit *audit_info)
392 { 391 {
393 int ret_val; 392 int ret_val;
394 int ifindex; 393 int ifindex;
395 struct net_device *dev; 394 struct net_device *dev;
396 struct netlbl_unlhsh_iface *iface; 395 struct netlbl_unlhsh_iface *iface;
397 struct audit_buffer *audit_buf = NULL; 396 struct audit_buffer *audit_buf = NULL;
398 char *secctx = NULL; 397 char *secctx = NULL;
399 u32 secctx_len; 398 u32 secctx_len;
400 399
401 if (addr_len != sizeof(struct in_addr) && 400 if (addr_len != sizeof(struct in_addr) &&
402 addr_len != sizeof(struct in6_addr)) 401 addr_len != sizeof(struct in6_addr))
403 return -EINVAL; 402 return -EINVAL;
404 403
405 rcu_read_lock(); 404 rcu_read_lock();
406 if (dev_name != NULL) { 405 if (dev_name != NULL) {
407 dev = dev_get_by_name_rcu(net, dev_name); 406 dev = dev_get_by_name_rcu(net, dev_name);
408 if (dev == NULL) { 407 if (dev == NULL) {
409 ret_val = -ENODEV; 408 ret_val = -ENODEV;
410 goto unlhsh_add_return; 409 goto unlhsh_add_return;
411 } 410 }
412 ifindex = dev->ifindex; 411 ifindex = dev->ifindex;
413 iface = netlbl_unlhsh_search_iface(ifindex); 412 iface = netlbl_unlhsh_search_iface(ifindex);
414 } else { 413 } else {
415 ifindex = 0; 414 ifindex = 0;
416 iface = rcu_dereference(netlbl_unlhsh_def); 415 iface = rcu_dereference(netlbl_unlhsh_def);
417 } 416 }
418 if (iface == NULL) { 417 if (iface == NULL) {
419 iface = netlbl_unlhsh_add_iface(ifindex); 418 iface = netlbl_unlhsh_add_iface(ifindex);
420 if (iface == NULL) { 419 if (iface == NULL) {
421 ret_val = -ENOMEM; 420 ret_val = -ENOMEM;
422 goto unlhsh_add_return; 421 goto unlhsh_add_return;
423 } 422 }
424 } 423 }
425 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, 424 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
426 audit_info); 425 audit_info);
427 switch (addr_len) { 426 switch (addr_len) {
428 case sizeof(struct in_addr): { 427 case sizeof(struct in_addr): {
429 struct in_addr *addr4, *mask4; 428 struct in_addr *addr4, *mask4;
430 429
431 addr4 = (struct in_addr *)addr; 430 addr4 = (struct in_addr *)addr;
432 mask4 = (struct in_addr *)mask; 431 mask4 = (struct in_addr *)mask;
433 ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); 432 ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
434 if (audit_buf != NULL) 433 if (audit_buf != NULL)
435 netlbl_af4list_audit_addr(audit_buf, 1, 434 netlbl_af4list_audit_addr(audit_buf, 1,
436 dev_name, 435 dev_name,
437 addr4->s_addr, 436 addr4->s_addr,
438 mask4->s_addr); 437 mask4->s_addr);
439 break; 438 break;
440 } 439 }
441 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 440 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
442 case sizeof(struct in6_addr): { 441 case sizeof(struct in6_addr): {
443 struct in6_addr *addr6, *mask6; 442 struct in6_addr *addr6, *mask6;
444 443
445 addr6 = (struct in6_addr *)addr; 444 addr6 = (struct in6_addr *)addr;
446 mask6 = (struct in6_addr *)mask; 445 mask6 = (struct in6_addr *)mask;
447 ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); 446 ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
448 if (audit_buf != NULL) 447 if (audit_buf != NULL)
449 netlbl_af6list_audit_addr(audit_buf, 1, 448 netlbl_af6list_audit_addr(audit_buf, 1,
450 dev_name, 449 dev_name,
451 addr6, mask6); 450 addr6, mask6);
452 break; 451 break;
453 } 452 }
454 #endif /* IPv6 */ 453 #endif /* IPv6 */
455 default: 454 default:
456 ret_val = -EINVAL; 455 ret_val = -EINVAL;
457 } 456 }
458 if (ret_val == 0) 457 if (ret_val == 0)
459 atomic_inc(&netlabel_mgmt_protocount); 458 atomic_inc(&netlabel_mgmt_protocount);
460 459
461 unlhsh_add_return: 460 unlhsh_add_return:
462 rcu_read_unlock(); 461 rcu_read_unlock();
463 if (audit_buf != NULL) { 462 if (audit_buf != NULL) {
464 if (security_secid_to_secctx(secid, 463 if (security_secid_to_secctx(secid,
465 &secctx, 464 &secctx,
466 &secctx_len) == 0) { 465 &secctx_len) == 0) {
467 audit_log_format(audit_buf, " sec_obj=%s", secctx); 466 audit_log_format(audit_buf, " sec_obj=%s", secctx);
468 security_release_secctx(secctx, secctx_len); 467 security_release_secctx(secctx, secctx_len);
469 } 468 }
470 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); 469 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
471 audit_log_end(audit_buf); 470 audit_log_end(audit_buf);
472 } 471 }
473 return ret_val; 472 return ret_val;
474 } 473 }
475 474
476 /** 475 /**
477 * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry 476 * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
478 * @net: network namespace 477 * @net: network namespace
479 * @iface: interface entry 478 * @iface: interface entry
480 * @addr: IP address 479 * @addr: IP address
481 * @mask: IP address mask 480 * @mask: IP address mask
482 * @audit_info: NetLabel audit information 481 * @audit_info: NetLabel audit information
483 * 482 *
484 * Description: 483 * Description:
485 * Remove an IP address entry from the unlabeled connection hash table. 484 * Remove an IP address entry from the unlabeled connection hash table.
486 * Returns zero on success, negative values on failure. 485 * Returns zero on success, negative values on failure.
487 * 486 *
488 */ 487 */
489 static int netlbl_unlhsh_remove_addr4(struct net *net, 488 static int netlbl_unlhsh_remove_addr4(struct net *net,
490 struct netlbl_unlhsh_iface *iface, 489 struct netlbl_unlhsh_iface *iface,
491 const struct in_addr *addr, 490 const struct in_addr *addr,
492 const struct in_addr *mask, 491 const struct in_addr *mask,
493 struct netlbl_audit *audit_info) 492 struct netlbl_audit *audit_info)
494 { 493 {
495 struct netlbl_af4list *list_entry; 494 struct netlbl_af4list *list_entry;
496 struct netlbl_unlhsh_addr4 *entry; 495 struct netlbl_unlhsh_addr4 *entry;
497 struct audit_buffer *audit_buf; 496 struct audit_buffer *audit_buf;
498 struct net_device *dev; 497 struct net_device *dev;
499 char *secctx; 498 char *secctx;
500 u32 secctx_len; 499 u32 secctx_len;
501 500
502 spin_lock(&netlbl_unlhsh_lock); 501 spin_lock(&netlbl_unlhsh_lock);
503 list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, 502 list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
504 &iface->addr4_list); 503 &iface->addr4_list);
505 spin_unlock(&netlbl_unlhsh_lock); 504 spin_unlock(&netlbl_unlhsh_lock);
506 if (list_entry != NULL) 505 if (list_entry != NULL)
507 entry = netlbl_unlhsh_addr4_entry(list_entry); 506 entry = netlbl_unlhsh_addr4_entry(list_entry);
508 else 507 else
509 entry = NULL; 508 entry = NULL;
510 509
511 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, 510 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
512 audit_info); 511 audit_info);
513 if (audit_buf != NULL) { 512 if (audit_buf != NULL) {
514 dev = dev_get_by_index(net, iface->ifindex); 513 dev = dev_get_by_index(net, iface->ifindex);
515 netlbl_af4list_audit_addr(audit_buf, 1, 514 netlbl_af4list_audit_addr(audit_buf, 1,
516 (dev != NULL ? dev->name : NULL), 515 (dev != NULL ? dev->name : NULL),
517 addr->s_addr, mask->s_addr); 516 addr->s_addr, mask->s_addr);
518 if (dev != NULL) 517 if (dev != NULL)
519 dev_put(dev); 518 dev_put(dev);
520 if (entry != NULL && 519 if (entry != NULL &&
521 security_secid_to_secctx(entry->secid, 520 security_secid_to_secctx(entry->secid,
522 &secctx, &secctx_len) == 0) { 521 &secctx, &secctx_len) == 0) {
523 audit_log_format(audit_buf, " sec_obj=%s", secctx); 522 audit_log_format(audit_buf, " sec_obj=%s", secctx);
524 security_release_secctx(secctx, secctx_len); 523 security_release_secctx(secctx, secctx_len);
525 } 524 }
526 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); 525 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
527 audit_log_end(audit_buf); 526 audit_log_end(audit_buf);
528 } 527 }
529 528
530 if (entry == NULL) 529 if (entry == NULL)
531 return -ENOENT; 530 return -ENOENT;
532 531
533 kfree_rcu(entry, rcu); 532 kfree_rcu(entry, rcu);
534 return 0; 533 return 0;
535 } 534 }
536 535
537 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 536 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
538 /** 537 /**
539 * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry 538 * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
540 * @net: network namespace 539 * @net: network namespace
541 * @iface: interface entry 540 * @iface: interface entry
542 * @addr: IP address 541 * @addr: IP address
543 * @mask: IP address mask 542 * @mask: IP address mask
544 * @audit_info: NetLabel audit information 543 * @audit_info: NetLabel audit information
545 * 544 *
546 * Description: 545 * Description:
547 * Remove an IP address entry from the unlabeled connection hash table. 546 * Remove an IP address entry from the unlabeled connection hash table.
548 * Returns zero on success, negative values on failure. 547 * Returns zero on success, negative values on failure.
549 * 548 *
550 */ 549 */
551 static int netlbl_unlhsh_remove_addr6(struct net *net, 550 static int netlbl_unlhsh_remove_addr6(struct net *net,
552 struct netlbl_unlhsh_iface *iface, 551 struct netlbl_unlhsh_iface *iface,
553 const struct in6_addr *addr, 552 const struct in6_addr *addr,
554 const struct in6_addr *mask, 553 const struct in6_addr *mask,
555 struct netlbl_audit *audit_info) 554 struct netlbl_audit *audit_info)
556 { 555 {
557 struct netlbl_af6list *list_entry; 556 struct netlbl_af6list *list_entry;
558 struct netlbl_unlhsh_addr6 *entry; 557 struct netlbl_unlhsh_addr6 *entry;
559 struct audit_buffer *audit_buf; 558 struct audit_buffer *audit_buf;
560 struct net_device *dev; 559 struct net_device *dev;
561 char *secctx; 560 char *secctx;
562 u32 secctx_len; 561 u32 secctx_len;
563 562
564 spin_lock(&netlbl_unlhsh_lock); 563 spin_lock(&netlbl_unlhsh_lock);
565 list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); 564 list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list);
566 spin_unlock(&netlbl_unlhsh_lock); 565 spin_unlock(&netlbl_unlhsh_lock);
567 if (list_entry != NULL) 566 if (list_entry != NULL)
568 entry = netlbl_unlhsh_addr6_entry(list_entry); 567 entry = netlbl_unlhsh_addr6_entry(list_entry);
569 else 568 else
570 entry = NULL; 569 entry = NULL;
571 570
572 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, 571 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
573 audit_info); 572 audit_info);
574 if (audit_buf != NULL) { 573 if (audit_buf != NULL) {
575 dev = dev_get_by_index(net, iface->ifindex); 574 dev = dev_get_by_index(net, iface->ifindex);
576 netlbl_af6list_audit_addr(audit_buf, 1, 575 netlbl_af6list_audit_addr(audit_buf, 1,
577 (dev != NULL ? dev->name : NULL), 576 (dev != NULL ? dev->name : NULL),
578 addr, mask); 577 addr, mask);
579 if (dev != NULL) 578 if (dev != NULL)
580 dev_put(dev); 579 dev_put(dev);
581 if (entry != NULL && 580 if (entry != NULL &&
582 security_secid_to_secctx(entry->secid, 581 security_secid_to_secctx(entry->secid,
583 &secctx, &secctx_len) == 0) { 582 &secctx, &secctx_len) == 0) {
584 audit_log_format(audit_buf, " sec_obj=%s", secctx); 583 audit_log_format(audit_buf, " sec_obj=%s", secctx);
585 security_release_secctx(secctx, secctx_len); 584 security_release_secctx(secctx, secctx_len);
586 } 585 }
587 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); 586 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
588 audit_log_end(audit_buf); 587 audit_log_end(audit_buf);
589 } 588 }
590 589
591 if (entry == NULL) 590 if (entry == NULL)
592 return -ENOENT; 591 return -ENOENT;
593 592
594 kfree_rcu(entry, rcu); 593 kfree_rcu(entry, rcu);
595 return 0; 594 return 0;
596 } 595 }
597 #endif /* IPv6 */ 596 #endif /* IPv6 */
598 597
599 /** 598 /**
600 * netlbl_unlhsh_condremove_iface - Remove an interface entry 599 * netlbl_unlhsh_condremove_iface - Remove an interface entry
601 * @iface: the interface entry 600 * @iface: the interface entry
602 * 601 *
603 * Description: 602 * Description:
604 * Remove an interface entry from the unlabeled connection hash table if it is 603 * Remove an interface entry from the unlabeled connection hash table if it is
605 * empty. An interface entry is considered to be empty if there are no 604 * empty. An interface entry is considered to be empty if there are no
606 * address entries assigned to it. 605 * address entries assigned to it.
607 * 606 *
608 */ 607 */
609 static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) 608 static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
610 { 609 {
611 struct netlbl_af4list *iter4; 610 struct netlbl_af4list *iter4;
612 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 611 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
613 struct netlbl_af6list *iter6; 612 struct netlbl_af6list *iter6;
614 #endif /* IPv6 */ 613 #endif /* IPv6 */
615 614
616 spin_lock(&netlbl_unlhsh_lock); 615 spin_lock(&netlbl_unlhsh_lock);
617 netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) 616 netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list)
618 goto unlhsh_condremove_failure; 617 goto unlhsh_condremove_failure;
619 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 618 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
620 netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) 619 netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list)
621 goto unlhsh_condremove_failure; 620 goto unlhsh_condremove_failure;
622 #endif /* IPv6 */ 621 #endif /* IPv6 */
623 iface->valid = 0; 622 iface->valid = 0;
624 if (iface->ifindex > 0) 623 if (iface->ifindex > 0)
625 list_del_rcu(&iface->list); 624 list_del_rcu(&iface->list);
626 else 625 else
627 rcu_assign_pointer(netlbl_unlhsh_def, NULL); 626 rcu_assign_pointer(netlbl_unlhsh_def, NULL);
628 spin_unlock(&netlbl_unlhsh_lock); 627 spin_unlock(&netlbl_unlhsh_lock);
629 628
630 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); 629 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
631 return; 630 return;
632 631
633 unlhsh_condremove_failure: 632 unlhsh_condremove_failure:
634 spin_unlock(&netlbl_unlhsh_lock); 633 spin_unlock(&netlbl_unlhsh_lock);
635 } 634 }
636 635
637 /** 636 /**
638 * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table 637 * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
639 * @net: network namespace 638 * @net: network namespace
640 * @dev_name: interface name 639 * @dev_name: interface name
641 * @addr: IP address in network byte order 640 * @addr: IP address in network byte order
642 * @mask: address mask in network byte order 641 * @mask: address mask in network byte order
643 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) 642 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
644 * @audit_info: NetLabel audit information 643 * @audit_info: NetLabel audit information
645 * 644 *
646 * Description: 645 * Description:
647 * Removes an existing entry from the unlabeled connection hash table. 646 * Removes an existing entry from the unlabeled connection hash table.
648 * Returns zero on success, negative values on failure. 647 * Returns zero on success, negative values on failure.
649 * 648 *
650 */ 649 */
651 int netlbl_unlhsh_remove(struct net *net, 650 int netlbl_unlhsh_remove(struct net *net,
652 const char *dev_name, 651 const char *dev_name,
653 const void *addr, 652 const void *addr,
654 const void *mask, 653 const void *mask,
655 u32 addr_len, 654 u32 addr_len,
656 struct netlbl_audit *audit_info) 655 struct netlbl_audit *audit_info)
657 { 656 {
658 int ret_val; 657 int ret_val;
659 struct net_device *dev; 658 struct net_device *dev;
660 struct netlbl_unlhsh_iface *iface; 659 struct netlbl_unlhsh_iface *iface;
661 660
662 if (addr_len != sizeof(struct in_addr) && 661 if (addr_len != sizeof(struct in_addr) &&
663 addr_len != sizeof(struct in6_addr)) 662 addr_len != sizeof(struct in6_addr))
664 return -EINVAL; 663 return -EINVAL;
665 664
666 rcu_read_lock(); 665 rcu_read_lock();
667 if (dev_name != NULL) { 666 if (dev_name != NULL) {
668 dev = dev_get_by_name_rcu(net, dev_name); 667 dev = dev_get_by_name_rcu(net, dev_name);
669 if (dev == NULL) { 668 if (dev == NULL) {
670 ret_val = -ENODEV; 669 ret_val = -ENODEV;
671 goto unlhsh_remove_return; 670 goto unlhsh_remove_return;
672 } 671 }
673 iface = netlbl_unlhsh_search_iface(dev->ifindex); 672 iface = netlbl_unlhsh_search_iface(dev->ifindex);
674 } else 673 } else
675 iface = rcu_dereference(netlbl_unlhsh_def); 674 iface = rcu_dereference(netlbl_unlhsh_def);
676 if (iface == NULL) { 675 if (iface == NULL) {
677 ret_val = -ENOENT; 676 ret_val = -ENOENT;
678 goto unlhsh_remove_return; 677 goto unlhsh_remove_return;
679 } 678 }
680 switch (addr_len) { 679 switch (addr_len) {
681 case sizeof(struct in_addr): 680 case sizeof(struct in_addr):
682 ret_val = netlbl_unlhsh_remove_addr4(net, 681 ret_val = netlbl_unlhsh_remove_addr4(net,
683 iface, addr, mask, 682 iface, addr, mask,
684 audit_info); 683 audit_info);
685 break; 684 break;
686 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 685 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
687 case sizeof(struct in6_addr): 686 case sizeof(struct in6_addr):
688 ret_val = netlbl_unlhsh_remove_addr6(net, 687 ret_val = netlbl_unlhsh_remove_addr6(net,
689 iface, addr, mask, 688 iface, addr, mask,
690 audit_info); 689 audit_info);
691 break; 690 break;
692 #endif /* IPv6 */ 691 #endif /* IPv6 */
693 default: 692 default:
694 ret_val = -EINVAL; 693 ret_val = -EINVAL;
695 } 694 }
696 if (ret_val == 0) { 695 if (ret_val == 0) {
697 netlbl_unlhsh_condremove_iface(iface); 696 netlbl_unlhsh_condremove_iface(iface);
698 atomic_dec(&netlabel_mgmt_protocount); 697 atomic_dec(&netlabel_mgmt_protocount);
699 } 698 }
700 699
701 unlhsh_remove_return: 700 unlhsh_remove_return:
702 rcu_read_unlock(); 701 rcu_read_unlock();
703 return ret_val; 702 return ret_val;
704 } 703 }
705 704
706 /* 705 /*
707 * General Helper Functions 706 * General Helper Functions
708 */ 707 */
709 708
710 /** 709 /**
711 * netlbl_unlhsh_netdev_handler - Network device notification handler 710 * netlbl_unlhsh_netdev_handler - Network device notification handler
712 * @this: notifier block 711 * @this: notifier block
713 * @event: the event 712 * @event: the event
714 * @ptr: the network device (cast to void) 713 * @ptr: the network device (cast to void)
715 * 714 *
716 * Description: 715 * Description:
717 * Handle network device events, although at present all we care about is a 716 * Handle network device events, although at present all we care about is a
718 * network device going away. In the case of a device going away we clear any 717 * network device going away. In the case of a device going away we clear any
719 * related entries from the unlabeled connection hash table. 718 * related entries from the unlabeled connection hash table.
720 * 719 *
721 */ 720 */
722 static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, 721 static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
723 unsigned long event, 722 unsigned long event,
724 void *ptr) 723 void *ptr)
725 { 724 {
726 struct net_device *dev = ptr; 725 struct net_device *dev = ptr;
727 struct netlbl_unlhsh_iface *iface = NULL; 726 struct netlbl_unlhsh_iface *iface = NULL;
728 727
729 if (!net_eq(dev_net(dev), &init_net)) 728 if (!net_eq(dev_net(dev), &init_net))
730 return NOTIFY_DONE; 729 return NOTIFY_DONE;
731 730
732 /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ 731 /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
733 if (event == NETDEV_DOWN) { 732 if (event == NETDEV_DOWN) {
734 spin_lock(&netlbl_unlhsh_lock); 733 spin_lock(&netlbl_unlhsh_lock);
735 iface = netlbl_unlhsh_search_iface(dev->ifindex); 734 iface = netlbl_unlhsh_search_iface(dev->ifindex);
736 if (iface != NULL && iface->valid) { 735 if (iface != NULL && iface->valid) {
737 iface->valid = 0; 736 iface->valid = 0;
738 list_del_rcu(&iface->list); 737 list_del_rcu(&iface->list);
739 } else 738 } else
740 iface = NULL; 739 iface = NULL;
741 spin_unlock(&netlbl_unlhsh_lock); 740 spin_unlock(&netlbl_unlhsh_lock);
742 } 741 }
743 742
744 if (iface != NULL) 743 if (iface != NULL)
745 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); 744 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
746 745
747 return NOTIFY_DONE; 746 return NOTIFY_DONE;
748 } 747 }
749 748
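netlbl_unlhsh_netdev_handler() is presumably hooked into the netdevice notifier chain via register_netdevice_notifier() elsewhere in this file; the registration is not visible in this hunk. A minimal illustrative notifier of the same shape; the example_* names and the printed message are made up.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

static int example_netdev_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	/* this kernel passes the net_device directly, as in the handler above */
	struct net_device *dev = ptr;

	if (event == NETDEV_DOWN)
		pr_info("%s: device going down\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* registration, e.g. from a subsystem init path */
static int __init example_init(void)
{
	return register_netdevice_notifier(&example_netdev_notifier);
}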
750 /** 749 /**
751 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag 750 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
752 * @value: desired value 751 * @value: desired value
753 * @audit_info: NetLabel audit information 752 * @audit_info: NetLabel audit information
754 * 753 *
755 * Description: 754 * Description:
756 * Set the value of the unlabeled accept flag to @value. 755 * Set the value of the unlabeled accept flag to @value.
757 * 756 *
758 */ 757 */
759 static void netlbl_unlabel_acceptflg_set(u8 value, 758 static void netlbl_unlabel_acceptflg_set(u8 value,
760 struct netlbl_audit *audit_info) 759 struct netlbl_audit *audit_info)
761 { 760 {
762 struct audit_buffer *audit_buf; 761 struct audit_buffer *audit_buf;
763 u8 old_val; 762 u8 old_val;
764 763
765 old_val = netlabel_unlabel_acceptflg; 764 old_val = netlabel_unlabel_acceptflg;
766 netlabel_unlabel_acceptflg = value; 765 netlabel_unlabel_acceptflg = value;
767 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, 766 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
768 audit_info); 767 audit_info);
769 if (audit_buf != NULL) { 768 if (audit_buf != NULL) {
770 audit_log_format(audit_buf, 769 audit_log_format(audit_buf,
771 " unlbl_accept=%u old=%u", value, old_val); 770 " unlbl_accept=%u old=%u", value, old_val);
772 audit_log_end(audit_buf); 771 audit_log_end(audit_buf);
773 } 772 }
774 } 773 }
775 774
776 /** 775 /**
777 * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information 776 * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
778 * @info: the Generic NETLINK info block 777 * @info: the Generic NETLINK info block
779 * @addr: the IP address 778 * @addr: the IP address
780 * @mask: the IP address mask 779 * @mask: the IP address mask
781 * @len: the address length 780 * @len: the address length
782 * 781 *
783 * Description: 782 * Description:
784 * Examine the Generic NETLINK message and extract the IP address information. 783 * Examine the Generic NETLINK message and extract the IP address information.
785 * Returns zero on success, negative values on failure. 784 * Returns zero on success, negative values on failure.
786 * 785 *
787 */ 786 */
788 static int netlbl_unlabel_addrinfo_get(struct genl_info *info, 787 static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
789 void **addr, 788 void **addr,
790 void **mask, 789 void **mask,
791 u32 *len) 790 u32 *len)
792 { 791 {
793 u32 addr_len; 792 u32 addr_len;
794 793
795 if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { 794 if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
796 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); 795 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
797 if (addr_len != sizeof(struct in_addr) && 796 if (addr_len != sizeof(struct in_addr) &&
798 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) 797 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
799 return -EINVAL; 798 return -EINVAL;
800 *len = addr_len; 799 *len = addr_len;
801 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); 800 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
802 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); 801 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
803 return 0; 802 return 0;
804 } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { 803 } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
805 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); 804 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
806 if (addr_len != sizeof(struct in6_addr) && 805 if (addr_len != sizeof(struct in6_addr) &&
807 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) 806 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
808 return -EINVAL; 807 return -EINVAL;
809 *len = addr_len; 808 *len = addr_len;
810 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); 809 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
811 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); 810 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
812 return 0; 811 return 0;
813 } 812 }
814 813
815 return -EINVAL; 814 return -EINVAL;
816 } 815 }
817 816
818 /* 817 /*
819 * NetLabel Command Handlers 818 * NetLabel Command Handlers
820 */ 819 */
821 820
822 /** 821 /**
823 * netlbl_unlabel_accept - Handle an ACCEPT message 822 * netlbl_unlabel_accept - Handle an ACCEPT message
824 * @skb: the NETLINK buffer 823 * @skb: the NETLINK buffer
825 * @info: the Generic NETLINK info block 824 * @info: the Generic NETLINK info block
826 * 825 *
827 * Description: 826 * Description:
828 * Process a user generated ACCEPT message and set the accept flag accordingly. 827 * Process a user generated ACCEPT message and set the accept flag accordingly.
829 * Returns zero on success, negative values on failure. 828 * Returns zero on success, negative values on failure.
830 * 829 *
831 */ 830 */
832 static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) 831 static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info)
833 { 832 {
834 u8 value; 833 u8 value;
835 struct netlbl_audit audit_info; 834 struct netlbl_audit audit_info;
836 835
837 if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { 836 if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) {
838 value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); 837 value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]);
839 if (value == 1 || value == 0) { 838 if (value == 1 || value == 0) {
840 netlbl_netlink_auditinfo(skb, &audit_info); 839 netlbl_netlink_auditinfo(skb, &audit_info);
841 netlbl_unlabel_acceptflg_set(value, &audit_info); 840 netlbl_unlabel_acceptflg_set(value, &audit_info);
842 return 0; 841 return 0;
843 } 842 }
844 } 843 }
845 844
846 return -EINVAL; 845 return -EINVAL;
847 } 846 }
848 847
849 /** 848 /**
850 * netlbl_unlabel_list - Handle a LIST message 849 * netlbl_unlabel_list - Handle a LIST message
851 * @skb: the NETLINK buffer 850 * @skb: the NETLINK buffer
852 * @info: the Generic NETLINK info block 851 * @info: the Generic NETLINK info block
853 * 852 *
854 * Description: 853 * Description:
855 * Process a user generated LIST message and respond with the current status. 854 * Process a user generated LIST message and respond with the current status.
856 * Returns zero on success, negative values on failure. 855 * Returns zero on success, negative values on failure.
857 * 856 *
858 */ 857 */
859 static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) 858 static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
860 { 859 {
861 int ret_val = -EINVAL; 860 int ret_val = -EINVAL;
862 struct sk_buff *ans_skb; 861 struct sk_buff *ans_skb;
863 void *data; 862 void *data;
864 863
865 ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 864 ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
866 if (ans_skb == NULL) 865 if (ans_skb == NULL)
867 goto list_failure; 866 goto list_failure;
868 data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, 867 data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family,
869 0, NLBL_UNLABEL_C_LIST); 868 0, NLBL_UNLABEL_C_LIST);
870 if (data == NULL) { 869 if (data == NULL) {
871 ret_val = -ENOMEM; 870 ret_val = -ENOMEM;
872 goto list_failure; 871 goto list_failure;
873 } 872 }
874 873
875 ret_val = nla_put_u8(ans_skb, 874 ret_val = nla_put_u8(ans_skb,
876 NLBL_UNLABEL_A_ACPTFLG, 875 NLBL_UNLABEL_A_ACPTFLG,
877 netlabel_unlabel_acceptflg); 876 netlabel_unlabel_acceptflg);
878 if (ret_val != 0) 877 if (ret_val != 0)
879 goto list_failure; 878 goto list_failure;
880 879
881 genlmsg_end(ans_skb, data); 880 genlmsg_end(ans_skb, data);
882 return genlmsg_reply(ans_skb, info); 881 return genlmsg_reply(ans_skb, info);
883 882
884 list_failure: 883 list_failure:
885 kfree_skb(ans_skb); 884 kfree_skb(ans_skb);
886 return ret_val; 885 return ret_val;
887 } 886 }
888 887
889 /** 888 /**
890 * netlbl_unlabel_staticadd - Handle a STATICADD message 889 * netlbl_unlabel_staticadd - Handle a STATICADD message
891 * @skb: the NETLINK buffer 890 * @skb: the NETLINK buffer
892 * @info: the Generic NETLINK info block 891 * @info: the Generic NETLINK info block
893 * 892 *
894 * Description: 893 * Description:
895 * Process a user generated STATICADD message and add a new unlabeled 894 * Process a user generated STATICADD message and add a new unlabeled
896 * connection entry to the hash table. Returns zero on success, negative 895 * connection entry to the hash table. Returns zero on success, negative
897 * values on failure. 896 * values on failure.
898 * 897 *
899 */ 898 */
900 static int netlbl_unlabel_staticadd(struct sk_buff *skb, 899 static int netlbl_unlabel_staticadd(struct sk_buff *skb,
901 struct genl_info *info) 900 struct genl_info *info)
902 { 901 {
903 int ret_val; 902 int ret_val;
904 char *dev_name; 903 char *dev_name;
905 void *addr; 904 void *addr;
906 void *mask; 905 void *mask;
907 u32 addr_len; 906 u32 addr_len;
908 u32 secid; 907 u32 secid;
909 struct netlbl_audit audit_info; 908 struct netlbl_audit audit_info;
910 909
911 /* Don't allow users to add both IPv4 and IPv6 addresses for a 910 /* Don't allow users to add both IPv4 and IPv6 addresses for a
912 * single entry. However, allow users to create two entries, one each 911 * single entry. However, allow users to create two entries, one each
913 * for IPv4 and IPv6, with the same LSM security context which should 912 * for IPv4 and IPv6, with the same LSM security context which should
914 * achieve the same result. */ 913 * achieve the same result. */
915 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || 914 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
916 !info->attrs[NLBL_UNLABEL_A_IFACE] || 915 !info->attrs[NLBL_UNLABEL_A_IFACE] ||
917 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 916 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
918 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 917 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
919 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 918 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
920 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 919 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
921 return -EINVAL; 920 return -EINVAL;
922 921
923 netlbl_netlink_auditinfo(skb, &audit_info); 922 netlbl_netlink_auditinfo(skb, &audit_info);
924 923
925 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 924 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
926 if (ret_val != 0) 925 if (ret_val != 0)
927 return ret_val; 926 return ret_val;
928 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); 927 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
929 ret_val = security_secctx_to_secid( 928 ret_val = security_secctx_to_secid(
930 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), 929 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
931 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), 930 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
932 &secid); 931 &secid);
933 if (ret_val != 0) 932 if (ret_val != 0)
934 return ret_val; 933 return ret_val;
935 934
936 return netlbl_unlhsh_add(&init_net, 935 return netlbl_unlhsh_add(&init_net,
937 dev_name, addr, mask, addr_len, secid, 936 dev_name, addr, mask, addr_len, secid,
938 &audit_info); 937 &audit_info);
939 } 938 }
940 939
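The attribute guard at the top of netlbl_unlabel_staticadd() packs its "only one address family per entry" rule into a single XOR expression. A small stand-alone model of that expression covering the four clean cases; addrs_valid and the flag names are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Models the XOR condition above: true when exactly one of the two
 * "family is incompletely specified" predicates holds, which lets
 * exactly one fully specified family through in the cases below. */
static bool addrs_valid(bool a4, bool m4, bool a6, bool m6)
{
	return (!a4 || !m4) ^ (!a6 || !m6);
}

int main(void)
{
	printf("v4 only : %d\n", addrs_valid(true, true, false, false));	/* 1 */
	printf("v6 only : %d\n", addrs_valid(false, false, true, true));	/* 1 */
	printf("both    : %d\n", addrs_valid(true, true, true, true));		/* 0 */
	printf("neither : %d\n", addrs_valid(false, false, false, false));	/* 0 */
	return 0;
}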
941 /** 940 /**
942 * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message 941 * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
943 * @skb: the NETLINK buffer 942 * @skb: the NETLINK buffer
944 * @info: the Generic NETLINK info block 943 * @info: the Generic NETLINK info block
945 * 944 *
946 * Description: 945 * Description:
947 * Process a user generated STATICADDDEF message and add a new default 946 * Process a user generated STATICADDDEF message and add a new default
948 * unlabeled connection entry. Returns zero on success, negative values on 947 * unlabeled connection entry. Returns zero on success, negative values on
949 * failure. 948 * failure.
950 * 949 *
951 */ 950 */
952 static int netlbl_unlabel_staticadddef(struct sk_buff *skb, 951 static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
953 struct genl_info *info) 952 struct genl_info *info)
954 { 953 {
955 int ret_val; 954 int ret_val;
956 void *addr; 955 void *addr;
957 void *mask; 956 void *mask;
958 u32 addr_len; 957 u32 addr_len;
959 u32 secid; 958 u32 secid;
960 struct netlbl_audit audit_info; 959 struct netlbl_audit audit_info;
961 960
962 /* Don't allow users to add both IPv4 and IPv6 addresses for a 961 /* Don't allow users to add both IPv4 and IPv6 addresses for a
963 * single entry. However, allow users to create two entries, one each 962 * single entry. However, allow users to create two entries, one each
964 * for IPv4 and IPv6, with the same LSM security context which should 963 * for IPv4 and IPv6, with the same LSM security context which should
965 * achieve the same result. */ 964 * achieve the same result. */
966 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || 965 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
967 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 966 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
968 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 967 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
969 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 968 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
970 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 969 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
971 return -EINVAL; 970 return -EINVAL;
972 971
973 netlbl_netlink_auditinfo(skb, &audit_info); 972 netlbl_netlink_auditinfo(skb, &audit_info);
974 973
975 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 974 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
976 if (ret_val != 0) 975 if (ret_val != 0)
977 return ret_val; 976 return ret_val;
978 ret_val = security_secctx_to_secid( 977 ret_val = security_secctx_to_secid(
979 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), 978 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
980 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), 979 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
981 &secid); 980 &secid);
982 if (ret_val != 0) 981 if (ret_val != 0)
983 return ret_val; 982 return ret_val;
984 983
985 return netlbl_unlhsh_add(&init_net, 984 return netlbl_unlhsh_add(&init_net,
986 NULL, addr, mask, addr_len, secid, 985 NULL, addr, mask, addr_len, secid,
987 &audit_info); 986 &audit_info);
988 } 987 }
989 988
990 /** 989 /**
991 * netlbl_unlabel_staticremove - Handle a STATICREMOVE message 990 * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
992 * @skb: the NETLINK buffer 991 * @skb: the NETLINK buffer
993 * @info: the Generic NETLINK info block 992 * @info: the Generic NETLINK info block
994 * 993 *
995 * Description: 994 * Description:
996 * Process a user generated STATICREMOVE message and remove the specified 995 * Process a user generated STATICREMOVE message and remove the specified
997 * unlabeled connection entry. Returns zero on success, negative values on 996 * unlabeled connection entry. Returns zero on success, negative values on
998 * failure. 997 * failure.
999 * 998 *
1000 */ 999 */
1001 static int netlbl_unlabel_staticremove(struct sk_buff *skb, 1000 static int netlbl_unlabel_staticremove(struct sk_buff *skb,
1002 struct genl_info *info) 1001 struct genl_info *info)
1003 { 1002 {
1004 int ret_val; 1003 int ret_val;
1005 char *dev_name; 1004 char *dev_name;
1006 void *addr; 1005 void *addr;
1007 void *mask; 1006 void *mask;
1008 u32 addr_len; 1007 u32 addr_len;
1009 struct netlbl_audit audit_info; 1008 struct netlbl_audit audit_info;
1010 1009
1011 /* See the note in netlbl_unlabel_staticadd() about not allowing both 1010 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1012 * IPv4 and IPv6 in the same entry. */ 1011 * IPv4 and IPv6 in the same entry. */
1013 if (!info->attrs[NLBL_UNLABEL_A_IFACE] || 1012 if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
1014 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 1013 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1015 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 1014 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1016 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 1015 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1017 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 1016 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1018 return -EINVAL; 1017 return -EINVAL;
1019 1018
1020 netlbl_netlink_auditinfo(skb, &audit_info); 1019 netlbl_netlink_auditinfo(skb, &audit_info);
1021 1020
1022 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 1021 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1023 if (ret_val != 0) 1022 if (ret_val != 0)
1024 return ret_val; 1023 return ret_val;
1025 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); 1024 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
1026 1025
1027 return netlbl_unlhsh_remove(&init_net, 1026 return netlbl_unlhsh_remove(&init_net,
1028 dev_name, addr, mask, addr_len, 1027 dev_name, addr, mask, addr_len,
1029 &audit_info); 1028 &audit_info);
1030 } 1029 }
1031 1030
1032 /** 1031 /**
1033 * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message 1032 * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
1034 * @skb: the NETLINK buffer 1033 * @skb: the NETLINK buffer
1035 * @info: the Generic NETLINK info block 1034 * @info: the Generic NETLINK info block
1036 * 1035 *
1037 * Description: 1036 * Description:
1038 * Process a user generated STATICREMOVEDEF message and remove the default 1037 * Process a user generated STATICREMOVEDEF message and remove the default
1039 * unlabeled connection entry. Returns zero on success, negative values on 1038 * unlabeled connection entry. Returns zero on success, negative values on
1040 * failure. 1039 * failure.
1041 * 1040 *
1042 */ 1041 */
1043 static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, 1042 static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
1044 struct genl_info *info) 1043 struct genl_info *info)
1045 { 1044 {
1046 int ret_val; 1045 int ret_val;
1047 void *addr; 1046 void *addr;
1048 void *mask; 1047 void *mask;
1049 u32 addr_len; 1048 u32 addr_len;
1050 struct netlbl_audit audit_info; 1049 struct netlbl_audit audit_info;
1051 1050
1052 /* See the note in netlbl_unlabel_staticadd() about not allowing both 1051 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1053 * IPv4 and IPv6 in the same entry. */ 1052 * IPv4 and IPv6 in the same entry. */
1054 if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 1053 if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1055 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 1054 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1056 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 1055 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1057 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 1056 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1058 return -EINVAL; 1057 return -EINVAL;
1059 1058
1060 netlbl_netlink_auditinfo(skb, &audit_info); 1059 netlbl_netlink_auditinfo(skb, &audit_info);
1061 1060
1062 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 1061 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1063 if (ret_val != 0) 1062 if (ret_val != 0)
1064 return ret_val; 1063 return ret_val;
1065 1064
1066 return netlbl_unlhsh_remove(&init_net, 1065 return netlbl_unlhsh_remove(&init_net,
1067 NULL, addr, mask, addr_len, 1066 NULL, addr, mask, addr_len,
1068 &audit_info); 1067 &audit_info);
1069 } 1068 }
1070 1069
1071 1070
1072 /** 1071 /**
1073 * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] 1072 * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
1074 * @cmd: command/message 1073 * @cmd: command/message
1075 * @iface: the interface entry 1074 * @iface: the interface entry
1076 * @addr4: the IPv4 address entry 1075 * @addr4: the IPv4 address entry
1077 * @addr6: the IPv6 address entry 1076 * @addr6: the IPv6 address entry
1078 * @arg: the netlbl_unlhsh_walk_arg structure 1077 * @arg: the netlbl_unlhsh_walk_arg structure
1079 * 1078 *
1080 * Description: 1079 * Description:
1081 * This function is designed to be used to generate a response for a 1080 * This function is designed to be used to generate a response for a
 1082 * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6 1081 * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6
 1083 * can be specified (not both); the unspecified entry should be set to 1082 * can be specified (not both); the unspecified entry should be set to
1084 * NULL by the caller. Returns the size of the message on success, negative 1083 * NULL by the caller. Returns the size of the message on success, negative
1085 * values on failure. 1084 * values on failure.
1086 * 1085 *
1087 */ 1086 */
1088 static int netlbl_unlabel_staticlist_gen(u32 cmd, 1087 static int netlbl_unlabel_staticlist_gen(u32 cmd,
1089 const struct netlbl_unlhsh_iface *iface, 1088 const struct netlbl_unlhsh_iface *iface,
1090 const struct netlbl_unlhsh_addr4 *addr4, 1089 const struct netlbl_unlhsh_addr4 *addr4,
1091 const struct netlbl_unlhsh_addr6 *addr6, 1090 const struct netlbl_unlhsh_addr6 *addr6,
1092 void *arg) 1091 void *arg)
1093 { 1092 {
1094 int ret_val = -ENOMEM; 1093 int ret_val = -ENOMEM;
1095 struct netlbl_unlhsh_walk_arg *cb_arg = arg; 1094 struct netlbl_unlhsh_walk_arg *cb_arg = arg;
1096 struct net_device *dev; 1095 struct net_device *dev;
1097 void *data; 1096 void *data;
1098 u32 secid; 1097 u32 secid;
1099 char *secctx; 1098 char *secctx;
1100 u32 secctx_len; 1099 u32 secctx_len;
1101 1100
1102 data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, 1101 data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
1103 cb_arg->seq, &netlbl_unlabel_gnl_family, 1102 cb_arg->seq, &netlbl_unlabel_gnl_family,
1104 NLM_F_MULTI, cmd); 1103 NLM_F_MULTI, cmd);
1105 if (data == NULL) 1104 if (data == NULL)
1106 goto list_cb_failure; 1105 goto list_cb_failure;
1107 1106
1108 if (iface->ifindex > 0) { 1107 if (iface->ifindex > 0) {
1109 dev = dev_get_by_index(&init_net, iface->ifindex); 1108 dev = dev_get_by_index(&init_net, iface->ifindex);
1110 if (!dev) { 1109 if (!dev) {
1111 ret_val = -ENODEV; 1110 ret_val = -ENODEV;
1112 goto list_cb_failure; 1111 goto list_cb_failure;
1113 } 1112 }
1114 ret_val = nla_put_string(cb_arg->skb, 1113 ret_val = nla_put_string(cb_arg->skb,
1115 NLBL_UNLABEL_A_IFACE, dev->name); 1114 NLBL_UNLABEL_A_IFACE, dev->name);
1116 dev_put(dev); 1115 dev_put(dev);
1117 if (ret_val != 0) 1116 if (ret_val != 0)
1118 goto list_cb_failure; 1117 goto list_cb_failure;
1119 } 1118 }
1120 1119
1121 if (addr4) { 1120 if (addr4) {
1122 struct in_addr addr_struct; 1121 struct in_addr addr_struct;
1123 1122
1124 addr_struct.s_addr = addr4->list.addr; 1123 addr_struct.s_addr = addr4->list.addr;
1125 ret_val = nla_put(cb_arg->skb, 1124 ret_val = nla_put(cb_arg->skb,
1126 NLBL_UNLABEL_A_IPV4ADDR, 1125 NLBL_UNLABEL_A_IPV4ADDR,
1127 sizeof(struct in_addr), 1126 sizeof(struct in_addr),
1128 &addr_struct); 1127 &addr_struct);
1129 if (ret_val != 0) 1128 if (ret_val != 0)
1130 goto list_cb_failure; 1129 goto list_cb_failure;
1131 1130
1132 addr_struct.s_addr = addr4->list.mask; 1131 addr_struct.s_addr = addr4->list.mask;
1133 ret_val = nla_put(cb_arg->skb, 1132 ret_val = nla_put(cb_arg->skb,
1134 NLBL_UNLABEL_A_IPV4MASK, 1133 NLBL_UNLABEL_A_IPV4MASK,
1135 sizeof(struct in_addr), 1134 sizeof(struct in_addr),
1136 &addr_struct); 1135 &addr_struct);
1137 if (ret_val != 0) 1136 if (ret_val != 0)
1138 goto list_cb_failure; 1137 goto list_cb_failure;
1139 1138
1140 secid = addr4->secid; 1139 secid = addr4->secid;
1141 } else { 1140 } else {
1142 ret_val = nla_put(cb_arg->skb, 1141 ret_val = nla_put(cb_arg->skb,
1143 NLBL_UNLABEL_A_IPV6ADDR, 1142 NLBL_UNLABEL_A_IPV6ADDR,
1144 sizeof(struct in6_addr), 1143 sizeof(struct in6_addr),
1145 &addr6->list.addr); 1144 &addr6->list.addr);
1146 if (ret_val != 0) 1145 if (ret_val != 0)
1147 goto list_cb_failure; 1146 goto list_cb_failure;
1148 1147
1149 ret_val = nla_put(cb_arg->skb, 1148 ret_val = nla_put(cb_arg->skb,
1150 NLBL_UNLABEL_A_IPV6MASK, 1149 NLBL_UNLABEL_A_IPV6MASK,
1151 sizeof(struct in6_addr), 1150 sizeof(struct in6_addr),
1152 &addr6->list.mask); 1151 &addr6->list.mask);
1153 if (ret_val != 0) 1152 if (ret_val != 0)
1154 goto list_cb_failure; 1153 goto list_cb_failure;
1155 1154
1156 secid = addr6->secid; 1155 secid = addr6->secid;
1157 } 1156 }
1158 1157
1159 ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); 1158 ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
1160 if (ret_val != 0) 1159 if (ret_val != 0)
1161 goto list_cb_failure; 1160 goto list_cb_failure;
1162 ret_val = nla_put(cb_arg->skb, 1161 ret_val = nla_put(cb_arg->skb,
1163 NLBL_UNLABEL_A_SECCTX, 1162 NLBL_UNLABEL_A_SECCTX,
1164 secctx_len, 1163 secctx_len,
1165 secctx); 1164 secctx);
1166 security_release_secctx(secctx, secctx_len); 1165 security_release_secctx(secctx, secctx_len);
1167 if (ret_val != 0) 1166 if (ret_val != 0)
1168 goto list_cb_failure; 1167 goto list_cb_failure;
1169 1168
1170 cb_arg->seq++; 1169 cb_arg->seq++;
1171 return genlmsg_end(cb_arg->skb, data); 1170 return genlmsg_end(cb_arg->skb, data);
1172 1171
1173 list_cb_failure: 1172 list_cb_failure:
1174 genlmsg_cancel(cb_arg->skb, data); 1173 genlmsg_cancel(cb_arg->skb, data);
1175 return ret_val; 1174 return ret_val;
1176 } 1175 }
1177 1176
1178 /** 1177 /**
1179 * netlbl_unlabel_staticlist - Handle a STATICLIST message 1178 * netlbl_unlabel_staticlist - Handle a STATICLIST message
1180 * @skb: the NETLINK buffer 1179 * @skb: the NETLINK buffer
1181 * @cb: the NETLINK callback 1180 * @cb: the NETLINK callback
1182 * 1181 *
1183 * Description: 1182 * Description:
1184 * Process a user generated STATICLIST message and dump the unlabeled 1183 * Process a user generated STATICLIST message and dump the unlabeled
1185 * connection hash table in a form suitable for use in a kernel generated 1184 * connection hash table in a form suitable for use in a kernel generated
1186 * STATICLIST message. Returns the length of @skb. 1185 * STATICLIST message. Returns the length of @skb.
1187 * 1186 *
1188 */ 1187 */
1189 static int netlbl_unlabel_staticlist(struct sk_buff *skb, 1188 static int netlbl_unlabel_staticlist(struct sk_buff *skb,
1190 struct netlink_callback *cb) 1189 struct netlink_callback *cb)
1191 { 1190 {
1192 struct netlbl_unlhsh_walk_arg cb_arg; 1191 struct netlbl_unlhsh_walk_arg cb_arg;
1193 u32 skip_bkt = cb->args[0]; 1192 u32 skip_bkt = cb->args[0];
1194 u32 skip_chain = cb->args[1]; 1193 u32 skip_chain = cb->args[1];
1195 u32 skip_addr4 = cb->args[2]; 1194 u32 skip_addr4 = cb->args[2];
1196 u32 skip_addr6 = cb->args[3]; 1195 u32 skip_addr6 = cb->args[3];
1197 u32 iter_bkt; 1196 u32 iter_bkt;
1198 u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; 1197 u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
1199 struct netlbl_unlhsh_iface *iface; 1198 struct netlbl_unlhsh_iface *iface;
1200 struct list_head *iter_list; 1199 struct list_head *iter_list;
1201 struct netlbl_af4list *addr4; 1200 struct netlbl_af4list *addr4;
1202 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1201 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1203 struct netlbl_af6list *addr6; 1202 struct netlbl_af6list *addr6;
1204 #endif 1203 #endif
1205 1204
1206 cb_arg.nl_cb = cb; 1205 cb_arg.nl_cb = cb;
1207 cb_arg.skb = skb; 1206 cb_arg.skb = skb;
1208 cb_arg.seq = cb->nlh->nlmsg_seq; 1207 cb_arg.seq = cb->nlh->nlmsg_seq;
1209 1208
1210 rcu_read_lock(); 1209 rcu_read_lock();
1211 for (iter_bkt = skip_bkt; 1210 for (iter_bkt = skip_bkt;
1212 iter_bkt < rcu_dereference(netlbl_unlhsh)->size; 1211 iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
1213 iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { 1212 iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
1214 iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; 1213 iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt];
1215 list_for_each_entry_rcu(iface, iter_list, list) { 1214 list_for_each_entry_rcu(iface, iter_list, list) {
1216 if (!iface->valid || 1215 if (!iface->valid ||
1217 iter_chain++ < skip_chain) 1216 iter_chain++ < skip_chain)
1218 continue; 1217 continue;
1219 netlbl_af4list_foreach_rcu(addr4, 1218 netlbl_af4list_foreach_rcu(addr4,
1220 &iface->addr4_list) { 1219 &iface->addr4_list) {
1221 if (iter_addr4++ < skip_addr4) 1220 if (iter_addr4++ < skip_addr4)
1222 continue; 1221 continue;
1223 if (netlbl_unlabel_staticlist_gen( 1222 if (netlbl_unlabel_staticlist_gen(
1224 NLBL_UNLABEL_C_STATICLIST, 1223 NLBL_UNLABEL_C_STATICLIST,
1225 iface, 1224 iface,
1226 netlbl_unlhsh_addr4_entry(addr4), 1225 netlbl_unlhsh_addr4_entry(addr4),
1227 NULL, 1226 NULL,
1228 &cb_arg) < 0) { 1227 &cb_arg) < 0) {
1229 iter_addr4--; 1228 iter_addr4--;
1230 iter_chain--; 1229 iter_chain--;
1231 goto unlabel_staticlist_return; 1230 goto unlabel_staticlist_return;
1232 } 1231 }
1233 } 1232 }
1234 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1233 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1235 netlbl_af6list_foreach_rcu(addr6, 1234 netlbl_af6list_foreach_rcu(addr6,
1236 &iface->addr6_list) { 1235 &iface->addr6_list) {
1237 if (iter_addr6++ < skip_addr6) 1236 if (iter_addr6++ < skip_addr6)
1238 continue; 1237 continue;
1239 if (netlbl_unlabel_staticlist_gen( 1238 if (netlbl_unlabel_staticlist_gen(
1240 NLBL_UNLABEL_C_STATICLIST, 1239 NLBL_UNLABEL_C_STATICLIST,
1241 iface, 1240 iface,
1242 NULL, 1241 NULL,
1243 netlbl_unlhsh_addr6_entry(addr6), 1242 netlbl_unlhsh_addr6_entry(addr6),
1244 &cb_arg) < 0) { 1243 &cb_arg) < 0) {
1245 iter_addr6--; 1244 iter_addr6--;
1246 iter_chain--; 1245 iter_chain--;
1247 goto unlabel_staticlist_return; 1246 goto unlabel_staticlist_return;
1248 } 1247 }
1249 } 1248 }
1250 #endif /* IPv6 */ 1249 #endif /* IPv6 */
1251 } 1250 }
1252 } 1251 }
1253 1252
1254 unlabel_staticlist_return: 1253 unlabel_staticlist_return:
1255 rcu_read_unlock(); 1254 rcu_read_unlock();
1256 cb->args[0] = skip_bkt; 1255 cb->args[0] = skip_bkt;
1257 cb->args[1] = skip_chain; 1256 cb->args[1] = skip_chain;
1258 cb->args[2] = skip_addr4; 1257 cb->args[2] = skip_addr4;
1259 cb->args[3] = skip_addr6; 1258 cb->args[3] = skip_addr6;
1260 return skb->len; 1259 return skb->len;
1261 } 1260 }
1262 1261
1263 /** 1262 /**
1264 * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message 1263 * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
1265 * @skb: the NETLINK buffer 1264 * @skb: the NETLINK buffer
1266 * @cb: the NETLINK callback 1265 * @cb: the NETLINK callback
1267 * 1266 *
1268 * Description: 1267 * Description:
1269 * Process a user generated STATICLISTDEF message and dump the default 1268 * Process a user generated STATICLISTDEF message and dump the default
1270 * unlabeled connection entry in a form suitable for use in a kernel generated 1269 * unlabeled connection entry in a form suitable for use in a kernel generated
1271 * STATICLISTDEF message. Returns the length of @skb. 1270 * STATICLISTDEF message. Returns the length of @skb.
1272 * 1271 *
1273 */ 1272 */
1274 static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, 1273 static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
1275 struct netlink_callback *cb) 1274 struct netlink_callback *cb)
1276 { 1275 {
1277 struct netlbl_unlhsh_walk_arg cb_arg; 1276 struct netlbl_unlhsh_walk_arg cb_arg;
1278 struct netlbl_unlhsh_iface *iface; 1277 struct netlbl_unlhsh_iface *iface;
1279 u32 skip_addr4 = cb->args[0]; 1278 u32 skip_addr4 = cb->args[0];
1280 u32 skip_addr6 = cb->args[1]; 1279 u32 skip_addr6 = cb->args[1];
1281 u32 iter_addr4 = 0; 1280 u32 iter_addr4 = 0;
1282 struct netlbl_af4list *addr4; 1281 struct netlbl_af4list *addr4;
1283 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1282 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1284 u32 iter_addr6 = 0; 1283 u32 iter_addr6 = 0;
1285 struct netlbl_af6list *addr6; 1284 struct netlbl_af6list *addr6;
1286 #endif 1285 #endif
1287 1286
1288 cb_arg.nl_cb = cb; 1287 cb_arg.nl_cb = cb;
1289 cb_arg.skb = skb; 1288 cb_arg.skb = skb;
1290 cb_arg.seq = cb->nlh->nlmsg_seq; 1289 cb_arg.seq = cb->nlh->nlmsg_seq;
1291 1290
1292 rcu_read_lock(); 1291 rcu_read_lock();
1293 iface = rcu_dereference(netlbl_unlhsh_def); 1292 iface = rcu_dereference(netlbl_unlhsh_def);
1294 if (iface == NULL || !iface->valid) 1293 if (iface == NULL || !iface->valid)
1295 goto unlabel_staticlistdef_return; 1294 goto unlabel_staticlistdef_return;
1296 1295
1297 netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { 1296 netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
1298 if (iter_addr4++ < skip_addr4) 1297 if (iter_addr4++ < skip_addr4)
1299 continue; 1298 continue;
1300 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, 1299 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1301 iface, 1300 iface,
1302 netlbl_unlhsh_addr4_entry(addr4), 1301 netlbl_unlhsh_addr4_entry(addr4),
1303 NULL, 1302 NULL,
1304 &cb_arg) < 0) { 1303 &cb_arg) < 0) {
1305 iter_addr4--; 1304 iter_addr4--;
1306 goto unlabel_staticlistdef_return; 1305 goto unlabel_staticlistdef_return;
1307 } 1306 }
1308 } 1307 }
1309 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1308 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1310 netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { 1309 netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
1311 if (iter_addr6++ < skip_addr6) 1310 if (iter_addr6++ < skip_addr6)
1312 continue; 1311 continue;
1313 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, 1312 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1314 iface, 1313 iface,
1315 NULL, 1314 NULL,
1316 netlbl_unlhsh_addr6_entry(addr6), 1315 netlbl_unlhsh_addr6_entry(addr6),
1317 &cb_arg) < 0) { 1316 &cb_arg) < 0) {
1318 iter_addr6--; 1317 iter_addr6--;
1319 goto unlabel_staticlistdef_return; 1318 goto unlabel_staticlistdef_return;
1320 } 1319 }
1321 } 1320 }
1322 #endif /* IPv6 */ 1321 #endif /* IPv6 */
1323 1322
1324 unlabel_staticlistdef_return: 1323 unlabel_staticlistdef_return:
1325 rcu_read_unlock(); 1324 rcu_read_unlock();
1326 cb->args[0] = skip_addr4; 1325 cb->args[0] = skip_addr4;
1327 cb->args[1] = skip_addr6; 1326 cb->args[1] = skip_addr6;
1328 return skb->len; 1327 return skb->len;
1329 } 1328 }
1330 1329
1331 /* 1330 /*
1332 * NetLabel Generic NETLINK Command Definitions 1331 * NetLabel Generic NETLINK Command Definitions
1333 */ 1332 */
1334 1333
1335 static struct genl_ops netlbl_unlabel_genl_ops[] = { 1334 static struct genl_ops netlbl_unlabel_genl_ops[] = {
1336 { 1335 {
1337 .cmd = NLBL_UNLABEL_C_STATICADD, 1336 .cmd = NLBL_UNLABEL_C_STATICADD,
1338 .flags = GENL_ADMIN_PERM, 1337 .flags = GENL_ADMIN_PERM,
1339 .policy = netlbl_unlabel_genl_policy, 1338 .policy = netlbl_unlabel_genl_policy,
1340 .doit = netlbl_unlabel_staticadd, 1339 .doit = netlbl_unlabel_staticadd,
1341 .dumpit = NULL, 1340 .dumpit = NULL,
1342 }, 1341 },
1343 { 1342 {
1344 .cmd = NLBL_UNLABEL_C_STATICREMOVE, 1343 .cmd = NLBL_UNLABEL_C_STATICREMOVE,
1345 .flags = GENL_ADMIN_PERM, 1344 .flags = GENL_ADMIN_PERM,
1346 .policy = netlbl_unlabel_genl_policy, 1345 .policy = netlbl_unlabel_genl_policy,
1347 .doit = netlbl_unlabel_staticremove, 1346 .doit = netlbl_unlabel_staticremove,
1348 .dumpit = NULL, 1347 .dumpit = NULL,
1349 }, 1348 },
1350 { 1349 {
1351 .cmd = NLBL_UNLABEL_C_STATICLIST, 1350 .cmd = NLBL_UNLABEL_C_STATICLIST,
1352 .flags = 0, 1351 .flags = 0,
1353 .policy = netlbl_unlabel_genl_policy, 1352 .policy = netlbl_unlabel_genl_policy,
1354 .doit = NULL, 1353 .doit = NULL,
1355 .dumpit = netlbl_unlabel_staticlist, 1354 .dumpit = netlbl_unlabel_staticlist,
1356 }, 1355 },
1357 { 1356 {
1358 .cmd = NLBL_UNLABEL_C_STATICADDDEF, 1357 .cmd = NLBL_UNLABEL_C_STATICADDDEF,
1359 .flags = GENL_ADMIN_PERM, 1358 .flags = GENL_ADMIN_PERM,
1360 .policy = netlbl_unlabel_genl_policy, 1359 .policy = netlbl_unlabel_genl_policy,
1361 .doit = netlbl_unlabel_staticadddef, 1360 .doit = netlbl_unlabel_staticadddef,
1362 .dumpit = NULL, 1361 .dumpit = NULL,
1363 }, 1362 },
1364 { 1363 {
1365 .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, 1364 .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
1366 .flags = GENL_ADMIN_PERM, 1365 .flags = GENL_ADMIN_PERM,
1367 .policy = netlbl_unlabel_genl_policy, 1366 .policy = netlbl_unlabel_genl_policy,
1368 .doit = netlbl_unlabel_staticremovedef, 1367 .doit = netlbl_unlabel_staticremovedef,
1369 .dumpit = NULL, 1368 .dumpit = NULL,
1370 }, 1369 },
1371 { 1370 {
1372 .cmd = NLBL_UNLABEL_C_STATICLISTDEF, 1371 .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
1373 .flags = 0, 1372 .flags = 0,
1374 .policy = netlbl_unlabel_genl_policy, 1373 .policy = netlbl_unlabel_genl_policy,
1375 .doit = NULL, 1374 .doit = NULL,
1376 .dumpit = netlbl_unlabel_staticlistdef, 1375 .dumpit = netlbl_unlabel_staticlistdef,
1377 }, 1376 },
1378 { 1377 {
1379 .cmd = NLBL_UNLABEL_C_ACCEPT, 1378 .cmd = NLBL_UNLABEL_C_ACCEPT,
1380 .flags = GENL_ADMIN_PERM, 1379 .flags = GENL_ADMIN_PERM,
1381 .policy = netlbl_unlabel_genl_policy, 1380 .policy = netlbl_unlabel_genl_policy,
1382 .doit = netlbl_unlabel_accept, 1381 .doit = netlbl_unlabel_accept,
1383 .dumpit = NULL, 1382 .dumpit = NULL,
1384 }, 1383 },
1385 { 1384 {
1386 .cmd = NLBL_UNLABEL_C_LIST, 1385 .cmd = NLBL_UNLABEL_C_LIST,
1387 .flags = 0, 1386 .flags = 0,
1388 .policy = netlbl_unlabel_genl_policy, 1387 .policy = netlbl_unlabel_genl_policy,
1389 .doit = netlbl_unlabel_list, 1388 .doit = netlbl_unlabel_list,
1390 .dumpit = NULL, 1389 .dumpit = NULL,
1391 }, 1390 },
1392 }; 1391 };
1393 1392
1394 /* 1393 /*
1395 * NetLabel Generic NETLINK Protocol Functions 1394 * NetLabel Generic NETLINK Protocol Functions
1396 */ 1395 */
1397 1396
1398 /** 1397 /**
1399 * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component 1398 * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component
1400 * 1399 *
1401 * Description: 1400 * Description:
1402 * Register the unlabeled packet NetLabel component with the Generic NETLINK 1401 * Register the unlabeled packet NetLabel component with the Generic NETLINK
1403 * mechanism. Returns zero on success, negative values on failure. 1402 * mechanism. Returns zero on success, negative values on failure.
1404 * 1403 *
1405 */ 1404 */
1406 int __init netlbl_unlabel_genl_init(void) 1405 int __init netlbl_unlabel_genl_init(void)
1407 { 1406 {
1408 return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, 1407 return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
1409 netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops)); 1408 netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops));
1410 } 1409 }
1411 1410
1412 /* 1411 /*
1413 * NetLabel KAPI Hooks 1412 * NetLabel KAPI Hooks
1414 */ 1413 */
1415 1414
1416 static struct notifier_block netlbl_unlhsh_netdev_notifier = { 1415 static struct notifier_block netlbl_unlhsh_netdev_notifier = {
1417 .notifier_call = netlbl_unlhsh_netdev_handler, 1416 .notifier_call = netlbl_unlhsh_netdev_handler,
1418 }; 1417 };
1419 1418
1420 /** 1419 /**
1421 * netlbl_unlabel_init - Initialize the unlabeled connection hash table 1420 * netlbl_unlabel_init - Initialize the unlabeled connection hash table
1422 * @size: the number of bits to use for the hash buckets 1421 * @size: the number of bits to use for the hash buckets
1423 * 1422 *
1424 * Description: 1423 * Description:
1425 * Initializes the unlabeled connection hash table and registers a network 1424 * Initializes the unlabeled connection hash table and registers a network
1426 * device notification handler. This function should only be called by the 1425 * device notification handler. This function should only be called by the
1427 * NetLabel subsystem itself during initialization. Returns zero on success, 1426 * NetLabel subsystem itself during initialization. Returns zero on success,
1428 * non-zero values on error. 1427 * non-zero values on error.
1429 * 1428 *
1430 */ 1429 */
1431 int __init netlbl_unlabel_init(u32 size) 1430 int __init netlbl_unlabel_init(u32 size)
1432 { 1431 {
1433 u32 iter; 1432 u32 iter;
1434 struct netlbl_unlhsh_tbl *hsh_tbl; 1433 struct netlbl_unlhsh_tbl *hsh_tbl;
1435 1434
1436 if (size == 0) 1435 if (size == 0)
1437 return -EINVAL; 1436 return -EINVAL;
1438 1437
1439 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); 1438 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
1440 if (hsh_tbl == NULL) 1439 if (hsh_tbl == NULL)
1441 return -ENOMEM; 1440 return -ENOMEM;
1442 hsh_tbl->size = 1 << size; 1441 hsh_tbl->size = 1 << size;
1443 hsh_tbl->tbl = kcalloc(hsh_tbl->size, 1442 hsh_tbl->tbl = kcalloc(hsh_tbl->size,
1444 sizeof(struct list_head), 1443 sizeof(struct list_head),
1445 GFP_KERNEL); 1444 GFP_KERNEL);
1446 if (hsh_tbl->tbl == NULL) { 1445 if (hsh_tbl->tbl == NULL) {
1447 kfree(hsh_tbl); 1446 kfree(hsh_tbl);
1448 return -ENOMEM; 1447 return -ENOMEM;
1449 } 1448 }
1450 for (iter = 0; iter < hsh_tbl->size; iter++) 1449 for (iter = 0; iter < hsh_tbl->size; iter++)
1451 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); 1450 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
1452 1451
1453 rcu_read_lock(); 1452 rcu_read_lock();
1454 spin_lock(&netlbl_unlhsh_lock); 1453 spin_lock(&netlbl_unlhsh_lock);
1455 rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); 1454 rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
1456 spin_unlock(&netlbl_unlhsh_lock); 1455 spin_unlock(&netlbl_unlhsh_lock);
1457 rcu_read_unlock(); 1456 rcu_read_unlock();
1458 1457
1459 register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); 1458 register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
1460 1459
1461 return 0; 1460 return 0;
1462 } 1461 }
1463 1462
1464 /** 1463 /**
 1465 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet 1464 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
1466 * @skb: the packet 1465 * @skb: the packet
1467 * @family: protocol family 1466 * @family: protocol family
1468 * @secattr: the security attributes 1467 * @secattr: the security attributes
1469 * 1468 *
1470 * Description: 1469 * Description:
 1471 * Determine the security attributes, if any, for an unlabeled packet and return 1470 * Determine the security attributes, if any, for an unlabeled packet and return
1472 * them in @secattr. Returns zero on success and negative values on failure. 1471 * them in @secattr. Returns zero on success and negative values on failure.
1473 * 1472 *
1474 */ 1473 */
1475 int netlbl_unlabel_getattr(const struct sk_buff *skb, 1474 int netlbl_unlabel_getattr(const struct sk_buff *skb,
1476 u16 family, 1475 u16 family,
1477 struct netlbl_lsm_secattr *secattr) 1476 struct netlbl_lsm_secattr *secattr)
1478 { 1477 {
1479 struct netlbl_unlhsh_iface *iface; 1478 struct netlbl_unlhsh_iface *iface;
1480 1479
1481 rcu_read_lock(); 1480 rcu_read_lock();
1482 iface = netlbl_unlhsh_search_iface(skb->skb_iif); 1481 iface = netlbl_unlhsh_search_iface(skb->skb_iif);
1483 if (iface == NULL) 1482 if (iface == NULL)
1484 iface = rcu_dereference(netlbl_unlhsh_def); 1483 iface = rcu_dereference(netlbl_unlhsh_def);
1485 if (iface == NULL || !iface->valid) 1484 if (iface == NULL || !iface->valid)
1486 goto unlabel_getattr_nolabel; 1485 goto unlabel_getattr_nolabel;
1487 switch (family) { 1486 switch (family) {
1488 case PF_INET: { 1487 case PF_INET: {
1489 struct iphdr *hdr4; 1488 struct iphdr *hdr4;
1490 struct netlbl_af4list *addr4; 1489 struct netlbl_af4list *addr4;
1491 1490
1492 hdr4 = ip_hdr(skb); 1491 hdr4 = ip_hdr(skb);
1493 addr4 = netlbl_af4list_search(hdr4->saddr, 1492 addr4 = netlbl_af4list_search(hdr4->saddr,
1494 &iface->addr4_list); 1493 &iface->addr4_list);
1495 if (addr4 == NULL) 1494 if (addr4 == NULL)
1496 goto unlabel_getattr_nolabel; 1495 goto unlabel_getattr_nolabel;
1497 secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; 1496 secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid;
1498 break; 1497 break;
1499 } 1498 }
1500 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1499 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1501 case PF_INET6: { 1500 case PF_INET6: {
1502 struct ipv6hdr *hdr6; 1501 struct ipv6hdr *hdr6;
1503 struct netlbl_af6list *addr6; 1502 struct netlbl_af6list *addr6;
1504 1503
1505 hdr6 = ipv6_hdr(skb); 1504 hdr6 = ipv6_hdr(skb);
1506 addr6 = netlbl_af6list_search(&hdr6->saddr, 1505 addr6 = netlbl_af6list_search(&hdr6->saddr,
1507 &iface->addr6_list); 1506 &iface->addr6_list);
1508 if (addr6 == NULL) 1507 if (addr6 == NULL)
1509 goto unlabel_getattr_nolabel; 1508 goto unlabel_getattr_nolabel;
1510 secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; 1509 secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid;
1511 break; 1510 break;
1512 } 1511 }
1513 #endif /* IPv6 */ 1512 #endif /* IPv6 */
1514 default: 1513 default:
1515 goto unlabel_getattr_nolabel; 1514 goto unlabel_getattr_nolabel;
1516 } 1515 }
1517 rcu_read_unlock(); 1516 rcu_read_unlock();
1518 1517
1519 secattr->flags |= NETLBL_SECATTR_SECID; 1518 secattr->flags |= NETLBL_SECATTR_SECID;
1520 secattr->type = NETLBL_NLTYPE_UNLABELED; 1519 secattr->type = NETLBL_NLTYPE_UNLABELED;
1521 return 0; 1520 return 0;
1522 1521
1523 unlabel_getattr_nolabel: 1522 unlabel_getattr_nolabel:
1524 rcu_read_unlock(); 1523 rcu_read_unlock();
1525 if (netlabel_unlabel_acceptflg == 0) 1524 if (netlabel_unlabel_acceptflg == 0)
1526 return -ENOMSG; 1525 return -ENOMSG;
1527 secattr->type = NETLBL_NLTYPE_UNLABELED; 1526 secattr->type = NETLBL_NLTYPE_UNLABELED;
1528 return 0; 1527 return 0;
1529 } 1528 }
1530 1529
1531 /** 1530 /**
1532 * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets 1531 * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets
1533 * 1532 *
1534 * Description: 1533 * Description:
1535 * Set the default NetLabel configuration to allow incoming unlabeled packets 1534 * Set the default NetLabel configuration to allow incoming unlabeled packets
1536 * and to send unlabeled network traffic by default. 1535 * and to send unlabeled network traffic by default.
1537 * 1536 *
1538 */ 1537 */
1539 int __init netlbl_unlabel_defconf(void) 1538 int __init netlbl_unlabel_defconf(void)
1540 { 1539 {
1541 int ret_val; 1540 int ret_val;
1542 struct netlbl_dom_map *entry; 1541 struct netlbl_dom_map *entry;
1543 struct netlbl_audit audit_info; 1542 struct netlbl_audit audit_info;
1544 1543
1545 /* Only the kernel is allowed to call this function and the only time 1544 /* Only the kernel is allowed to call this function and the only time
1546 * it is called is at bootup before the audit subsystem is reporting 1545 * it is called is at bootup before the audit subsystem is reporting
 1547 * messages, so don't worry too much about these values. */ 1546 * messages, so don't worry too much about these values. */
1548 security_task_getsecid(current, &audit_info.secid); 1547 security_task_getsecid(current, &audit_info.secid);
1549 audit_info.loginuid = 0; 1548 audit_info.loginuid = 0;
1550 audit_info.sessionid = 0; 1549 audit_info.sessionid = 0;
1551 1550
1552 entry = kzalloc(sizeof(*entry), GFP_KERNEL); 1551 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1553 if (entry == NULL) 1552 if (entry == NULL)
1554 return -ENOMEM; 1553 return -ENOMEM;
1555 entry->type = NETLBL_NLTYPE_UNLABELED; 1554 entry->type = NETLBL_NLTYPE_UNLABELED;
1556 ret_val = netlbl_domhsh_add_default(entry, &audit_info); 1555 ret_val = netlbl_domhsh_add_default(entry, &audit_info);
1557 if (ret_val != 0) 1556 if (ret_val != 0)
1558 return ret_val; 1557 return ret_val;
1559 1558
1560 netlbl_unlabel_acceptflg_set(1, &audit_info); 1559 netlbl_unlabel_acceptflg_set(1, &audit_info);
1561 1560
1562 return 0; 1561 return 0;
1563 } 1562 }
1564 1563
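
As context for the RCU usage shown above (none of which is changed by this commit): netlbl_unlabel_init() publishes the hash table with rcu_assign_pointer() while holding netlbl_unlhsh_lock, and readers such as netlbl_unlabel_staticlist() fetch it with rcu_dereference() inside rcu_read_lock()/rcu_read_unlock(). A minimal sketch of that publish/lookup pattern follows; the names (my_tbl, my_tbl_publish, my_tbl_size) are hypothetical and only stand in for the NetLabel structures.

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

/* Hypothetical RCU-published table; not the NetLabel code itself. */
struct my_tbl {
        u32 size;
};

static struct my_tbl __rcu *my_tbl_ptr;
static DEFINE_SPINLOCK(my_tbl_lock);

/* Replacement of an already-published table is omitted for brevity. */
static int my_tbl_publish(u32 size)
{
        struct my_tbl *tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);

        if (tbl == NULL)
                return -ENOMEM;
        tbl->size = size;

        spin_lock(&my_tbl_lock);                /* serialise updaters only */
        rcu_assign_pointer(my_tbl_ptr, tbl);    /* publish to readers */
        spin_unlock(&my_tbl_lock);
        return 0;
}

static u32 my_tbl_size(void)
{
        struct my_tbl *tbl;
        u32 size = 0;

        rcu_read_lock();                        /* read-side critical section */
        tbl = rcu_dereference(my_tbl_ptr);
        if (tbl != NULL)
                size = tbl->size;
        rcu_read_unlock();
        return size;
}

The spinlock only serialises updaters; readers rely entirely on the RCU read-side critical section, which is exactly the condition rcu_dereference_check() now verifies on its own.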
security/keys/keyring.c
1 /* Keyring handling 1 /* Keyring handling
2 * 2 *
3 * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License 7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/sched.h> 14 #include <linux/sched.h>
15 #include <linux/slab.h> 15 #include <linux/slab.h>
16 #include <linux/security.h> 16 #include <linux/security.h>
17 #include <linux/seq_file.h> 17 #include <linux/seq_file.h>
18 #include <linux/err.h> 18 #include <linux/err.h>
19 #include <keys/keyring-type.h> 19 #include <keys/keyring-type.h>
20 #include <linux/uaccess.h> 20 #include <linux/uaccess.h>
21 #include "internal.h" 21 #include "internal.h"
22 22
23 #define rcu_dereference_locked_keyring(keyring) \ 23 #define rcu_dereference_locked_keyring(keyring) \
24 (rcu_dereference_protected( \ 24 (rcu_dereference_protected( \
25 (keyring)->payload.subscriptions, \ 25 (keyring)->payload.subscriptions, \
26 rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) 26 rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem)))
27 27
28 #define KEY_LINK_FIXQUOTA 1UL 28 #define KEY_LINK_FIXQUOTA 1UL
29 29
30 /* 30 /*
31 * When plumbing the depths of the key tree, this sets a hard limit 31 * When plumbing the depths of the key tree, this sets a hard limit
 32 * on how deep we're willing to go. 32 * on how deep we're willing to go.
33 */ 33 */
34 #define KEYRING_SEARCH_MAX_DEPTH 6 34 #define KEYRING_SEARCH_MAX_DEPTH 6
35 35
36 /* 36 /*
37 * We keep all named keyrings in a hash to speed looking them up. 37 * We keep all named keyrings in a hash to speed looking them up.
38 */ 38 */
39 #define KEYRING_NAME_HASH_SIZE (1 << 5) 39 #define KEYRING_NAME_HASH_SIZE (1 << 5)
40 40
41 static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE]; 41 static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE];
42 static DEFINE_RWLOCK(keyring_name_lock); 42 static DEFINE_RWLOCK(keyring_name_lock);
43 43
44 static inline unsigned keyring_hash(const char *desc) 44 static inline unsigned keyring_hash(const char *desc)
45 { 45 {
46 unsigned bucket = 0; 46 unsigned bucket = 0;
47 47
48 for (; *desc; desc++) 48 for (; *desc; desc++)
49 bucket += (unsigned char)*desc; 49 bucket += (unsigned char)*desc;
50 50
51 return bucket & (KEYRING_NAME_HASH_SIZE - 1); 51 return bucket & (KEYRING_NAME_HASH_SIZE - 1);
52 } 52 }
53 53
54 /* 54 /*
55 * The keyring key type definition. Keyrings are simply keys of this type and 55 * The keyring key type definition. Keyrings are simply keys of this type and
56 * can be treated as ordinary keys in addition to having their own special 56 * can be treated as ordinary keys in addition to having their own special
57 * operations. 57 * operations.
58 */ 58 */
59 static int keyring_instantiate(struct key *keyring, 59 static int keyring_instantiate(struct key *keyring,
60 const void *data, size_t datalen); 60 const void *data, size_t datalen);
61 static int keyring_match(const struct key *keyring, const void *criterion); 61 static int keyring_match(const struct key *keyring, const void *criterion);
62 static void keyring_revoke(struct key *keyring); 62 static void keyring_revoke(struct key *keyring);
63 static void keyring_destroy(struct key *keyring); 63 static void keyring_destroy(struct key *keyring);
64 static void keyring_describe(const struct key *keyring, struct seq_file *m); 64 static void keyring_describe(const struct key *keyring, struct seq_file *m);
65 static long keyring_read(const struct key *keyring, 65 static long keyring_read(const struct key *keyring,
66 char __user *buffer, size_t buflen); 66 char __user *buffer, size_t buflen);
67 67
68 struct key_type key_type_keyring = { 68 struct key_type key_type_keyring = {
69 .name = "keyring", 69 .name = "keyring",
70 .def_datalen = sizeof(struct keyring_list), 70 .def_datalen = sizeof(struct keyring_list),
71 .instantiate = keyring_instantiate, 71 .instantiate = keyring_instantiate,
72 .match = keyring_match, 72 .match = keyring_match,
73 .revoke = keyring_revoke, 73 .revoke = keyring_revoke,
74 .destroy = keyring_destroy, 74 .destroy = keyring_destroy,
75 .describe = keyring_describe, 75 .describe = keyring_describe,
76 .read = keyring_read, 76 .read = keyring_read,
77 }; 77 };
78 EXPORT_SYMBOL(key_type_keyring); 78 EXPORT_SYMBOL(key_type_keyring);
79 79
80 /* 80 /*
81 * Semaphore to serialise link/link calls to prevent two link calls in parallel 81 * Semaphore to serialise link/link calls to prevent two link calls in parallel
82 * introducing a cycle. 82 * introducing a cycle.
83 */ 83 */
84 static DECLARE_RWSEM(keyring_serialise_link_sem); 84 static DECLARE_RWSEM(keyring_serialise_link_sem);
85 85
86 /* 86 /*
87 * Publish the name of a keyring so that it can be found by name (if it has 87 * Publish the name of a keyring so that it can be found by name (if it has
88 * one). 88 * one).
89 */ 89 */
90 static void keyring_publish_name(struct key *keyring) 90 static void keyring_publish_name(struct key *keyring)
91 { 91 {
92 int bucket; 92 int bucket;
93 93
94 if (keyring->description) { 94 if (keyring->description) {
95 bucket = keyring_hash(keyring->description); 95 bucket = keyring_hash(keyring->description);
96 96
97 write_lock(&keyring_name_lock); 97 write_lock(&keyring_name_lock);
98 98
99 if (!keyring_name_hash[bucket].next) 99 if (!keyring_name_hash[bucket].next)
100 INIT_LIST_HEAD(&keyring_name_hash[bucket]); 100 INIT_LIST_HEAD(&keyring_name_hash[bucket]);
101 101
102 list_add_tail(&keyring->type_data.link, 102 list_add_tail(&keyring->type_data.link,
103 &keyring_name_hash[bucket]); 103 &keyring_name_hash[bucket]);
104 104
105 write_unlock(&keyring_name_lock); 105 write_unlock(&keyring_name_lock);
106 } 106 }
107 } 107 }
108 108
109 /* 109 /*
110 * Initialise a keyring. 110 * Initialise a keyring.
111 * 111 *
112 * Returns 0 on success, -EINVAL if given any data. 112 * Returns 0 on success, -EINVAL if given any data.
113 */ 113 */
114 static int keyring_instantiate(struct key *keyring, 114 static int keyring_instantiate(struct key *keyring,
115 const void *data, size_t datalen) 115 const void *data, size_t datalen)
116 { 116 {
117 int ret; 117 int ret;
118 118
119 ret = -EINVAL; 119 ret = -EINVAL;
120 if (datalen == 0) { 120 if (datalen == 0) {
121 /* make the keyring available by name if it has one */ 121 /* make the keyring available by name if it has one */
122 keyring_publish_name(keyring); 122 keyring_publish_name(keyring);
123 ret = 0; 123 ret = 0;
124 } 124 }
125 125
126 return ret; 126 return ret;
127 } 127 }
128 128
129 /* 129 /*
130 * Match keyrings on their name 130 * Match keyrings on their name
131 */ 131 */
132 static int keyring_match(const struct key *keyring, const void *description) 132 static int keyring_match(const struct key *keyring, const void *description)
133 { 133 {
134 return keyring->description && 134 return keyring->description &&
135 strcmp(keyring->description, description) == 0; 135 strcmp(keyring->description, description) == 0;
136 } 136 }
137 137
138 /* 138 /*
139 * Clean up a keyring when it is destroyed. Unpublish its name if it had one 139 * Clean up a keyring when it is destroyed. Unpublish its name if it had one
140 * and dispose of its data. 140 * and dispose of its data.
141 */ 141 */
142 static void keyring_destroy(struct key *keyring) 142 static void keyring_destroy(struct key *keyring)
143 { 143 {
144 struct keyring_list *klist; 144 struct keyring_list *klist;
145 int loop; 145 int loop;
146 146
147 if (keyring->description) { 147 if (keyring->description) {
148 write_lock(&keyring_name_lock); 148 write_lock(&keyring_name_lock);
149 149
150 if (keyring->type_data.link.next != NULL && 150 if (keyring->type_data.link.next != NULL &&
151 !list_empty(&keyring->type_data.link)) 151 !list_empty(&keyring->type_data.link))
152 list_del(&keyring->type_data.link); 152 list_del(&keyring->type_data.link);
153 153
154 write_unlock(&keyring_name_lock); 154 write_unlock(&keyring_name_lock);
155 } 155 }
156 156
157 klist = rcu_dereference_check(keyring->payload.subscriptions, 157 klist = rcu_dereference_check(keyring->payload.subscriptions,
158 rcu_read_lock_held() ||
159 atomic_read(&keyring->usage) == 0); 158 atomic_read(&keyring->usage) == 0);
160 if (klist) { 159 if (klist) {
161 for (loop = klist->nkeys - 1; loop >= 0; loop--) 160 for (loop = klist->nkeys - 1; loop >= 0; loop--)
162 key_put(klist->keys[loop]); 161 key_put(klist->keys[loop]);
163 kfree(klist); 162 kfree(klist);
164 } 163 }
165 } 164 }
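
The hunk above is the only functional change in this file: the rcu_read_lock_held() term is dropped from the rcu_dereference_check() condition because, since commit ca5ecddf, rcu_dereference_check() includes that check itself. A hedged before/after sketch of the pattern, using hypothetical names (my_obj, my_payload, my_obj_payload) rather than the key code:

#include <linux/rcupdate.h>
#include <linux/atomic.h>

struct my_payload;

struct my_obj {
        atomic_t usage;
        struct my_payload __rcu *payload;
};

static struct my_payload *my_obj_payload(struct my_obj *obj)
{
        /*
         * Before ca5ecddf the caller spelled out the read-lock check:
         *
         *      rcu_dereference_check(obj->payload,
         *                            rcu_read_lock_held() ||
         *                            atomic_read(&obj->usage) == 0);
         *
         * rcu_dereference_check() now tests rcu_read_lock_held() on its
         * own, so only the extra "last reference is gone" condition is
         * passed in, the same shape as the keyring_destroy() change above.
         */
        return rcu_dereference_check(obj->payload,
                                     atomic_read(&obj->usage) == 0);
}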
166 165
167 /* 166 /*
168 * Describe a keyring for /proc. 167 * Describe a keyring for /proc.
169 */ 168 */
170 static void keyring_describe(const struct key *keyring, struct seq_file *m) 169 static void keyring_describe(const struct key *keyring, struct seq_file *m)
171 { 170 {
172 struct keyring_list *klist; 171 struct keyring_list *klist;
173 172
174 if (keyring->description) 173 if (keyring->description)
175 seq_puts(m, keyring->description); 174 seq_puts(m, keyring->description);
176 else 175 else
177 seq_puts(m, "[anon]"); 176 seq_puts(m, "[anon]");
178 177
179 if (key_is_instantiated(keyring)) { 178 if (key_is_instantiated(keyring)) {
180 rcu_read_lock(); 179 rcu_read_lock();
181 klist = rcu_dereference(keyring->payload.subscriptions); 180 klist = rcu_dereference(keyring->payload.subscriptions);
182 if (klist) 181 if (klist)
183 seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys); 182 seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys);
184 else 183 else
185 seq_puts(m, ": empty"); 184 seq_puts(m, ": empty");
186 rcu_read_unlock(); 185 rcu_read_unlock();
187 } 186 }
188 } 187 }
189 188
190 /* 189 /*
191 * Read a list of key IDs from the keyring's contents in binary form 190 * Read a list of key IDs from the keyring's contents in binary form
192 * 191 *
193 * The keyring's semaphore is read-locked by the caller. 192 * The keyring's semaphore is read-locked by the caller.
194 */ 193 */
195 static long keyring_read(const struct key *keyring, 194 static long keyring_read(const struct key *keyring,
196 char __user *buffer, size_t buflen) 195 char __user *buffer, size_t buflen)
197 { 196 {
198 struct keyring_list *klist; 197 struct keyring_list *klist;
199 struct key *key; 198 struct key *key;
200 size_t qty, tmp; 199 size_t qty, tmp;
201 int loop, ret; 200 int loop, ret;
202 201
203 ret = 0; 202 ret = 0;
204 klist = rcu_dereference_locked_keyring(keyring); 203 klist = rcu_dereference_locked_keyring(keyring);
205 if (klist) { 204 if (klist) {
206 /* calculate how much data we could return */ 205 /* calculate how much data we could return */
207 qty = klist->nkeys * sizeof(key_serial_t); 206 qty = klist->nkeys * sizeof(key_serial_t);
208 207
209 if (buffer && buflen > 0) { 208 if (buffer && buflen > 0) {
210 if (buflen > qty) 209 if (buflen > qty)
211 buflen = qty; 210 buflen = qty;
212 211
213 /* copy the IDs of the subscribed keys into the 212 /* copy the IDs of the subscribed keys into the
214 * buffer */ 213 * buffer */
215 ret = -EFAULT; 214 ret = -EFAULT;
216 215
217 for (loop = 0; loop < klist->nkeys; loop++) { 216 for (loop = 0; loop < klist->nkeys; loop++) {
218 key = klist->keys[loop]; 217 key = klist->keys[loop];
219 218
220 tmp = sizeof(key_serial_t); 219 tmp = sizeof(key_serial_t);
221 if (tmp > buflen) 220 if (tmp > buflen)
222 tmp = buflen; 221 tmp = buflen;
223 222
224 if (copy_to_user(buffer, 223 if (copy_to_user(buffer,
225 &key->serial, 224 &key->serial,
226 tmp) != 0) 225 tmp) != 0)
227 goto error; 226 goto error;
228 227
229 buflen -= tmp; 228 buflen -= tmp;
230 if (buflen == 0) 229 if (buflen == 0)
231 break; 230 break;
232 buffer += tmp; 231 buffer += tmp;
233 } 232 }
234 } 233 }
235 234
236 ret = qty; 235 ret = qty;
237 } 236 }
238 237
239 error: 238 error:
240 return ret; 239 return ret;
241 } 240 }
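
keyring_read() runs with the keyring semaphore held, so it goes through the rcu_dereference_locked_keyring() wrapper defined near the top of this file, which builds on rcu_dereference_protected(): no RCU read-side critical section is required, and lockdep only verifies the stated lock condition. A minimal sketch of that protected-dereference pattern, with hypothetical names (my_cfg, my_cfg_sem, my_cfg_locked):

#include <linux/rcupdate.h>
#include <linux/rwsem.h>

struct my_cfg;

static DECLARE_RWSEM(my_cfg_sem);
static struct my_cfg __rcu *my_cfg_ptr;

/* Caller must hold my_cfg_sem (read or write); no rcu_read_lock() needed. */
static struct my_cfg *my_cfg_locked(void)
{
        return rcu_dereference_protected(my_cfg_ptr,
                                         rwsem_is_locked(&my_cfg_sem));
}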
242 241
243 /* 242 /*
244 * Allocate a keyring and link into the destination keyring. 243 * Allocate a keyring and link into the destination keyring.
245 */ 244 */
246 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, 245 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
247 const struct cred *cred, unsigned long flags, 246 const struct cred *cred, unsigned long flags,
248 struct key *dest) 247 struct key *dest)
249 { 248 {
250 struct key *keyring; 249 struct key *keyring;
251 int ret; 250 int ret;
252 251
253 keyring = key_alloc(&key_type_keyring, description, 252 keyring = key_alloc(&key_type_keyring, description,
254 uid, gid, cred, 253 uid, gid, cred,
255 (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, 254 (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
256 flags); 255 flags);
257 256
258 if (!IS_ERR(keyring)) { 257 if (!IS_ERR(keyring)) {
259 ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); 258 ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
260 if (ret < 0) { 259 if (ret < 0) {
261 key_put(keyring); 260 key_put(keyring);
262 keyring = ERR_PTR(ret); 261 keyring = ERR_PTR(ret);
263 } 262 }
264 } 263 }
265 264
266 return keyring; 265 return keyring;
267 } 266 }
268 267
269 /** 268 /**
270 * keyring_search_aux - Search a keyring tree for a key matching some criteria 269 * keyring_search_aux - Search a keyring tree for a key matching some criteria
271 * @keyring_ref: A pointer to the keyring with possession indicator. 270 * @keyring_ref: A pointer to the keyring with possession indicator.
272 * @cred: The credentials to use for permissions checks. 271 * @cred: The credentials to use for permissions checks.
273 * @type: The type of key to search for. 272 * @type: The type of key to search for.
274 * @description: Parameter for @match. 273 * @description: Parameter for @match.
275 * @match: Function to rule on whether or not a key is the one required. 274 * @match: Function to rule on whether or not a key is the one required.
276 * @no_state_check: Don't check if a matching key is bad 275 * @no_state_check: Don't check if a matching key is bad
277 * 276 *
278 * Search the supplied keyring tree for a key that matches the criteria given. 277 * Search the supplied keyring tree for a key that matches the criteria given.
279 * The root keyring and any linked keyrings must grant Search permission to the 278 * The root keyring and any linked keyrings must grant Search permission to the
280 * caller to be searchable and keys can only be found if they too grant Search 279 * caller to be searchable and keys can only be found if they too grant Search
281 * to the caller. The possession flag on the root keyring pointer controls use 280 * to the caller. The possession flag on the root keyring pointer controls use
282 * of the possessor bits in permissions checking of the entire tree. In 281 * of the possessor bits in permissions checking of the entire tree. In
283 * addition, the LSM gets to forbid keyring searches and key matches. 282 * addition, the LSM gets to forbid keyring searches and key matches.
284 * 283 *
285 * The search is performed as a breadth-then-depth search up to the prescribed 284 * The search is performed as a breadth-then-depth search up to the prescribed
286 * limit (KEYRING_SEARCH_MAX_DEPTH). 285 * limit (KEYRING_SEARCH_MAX_DEPTH).
287 * 286 *
288 * Keys are matched to the type provided and are then filtered by the match 287 * Keys are matched to the type provided and are then filtered by the match
289 * function, which is given the description to use in any way it sees fit. The 288 * function, which is given the description to use in any way it sees fit. The
 290 * match function may use any attributes of a key that it wishes to 289 * match function may use any attributes of a key that it wishes to
291 * determine the match. Normally the match function from the key type would be 290 * determine the match. Normally the match function from the key type would be
292 * used. 291 * used.
293 * 292 *
294 * RCU is used to prevent the keyring key lists from disappearing without the 293 * RCU is used to prevent the keyring key lists from disappearing without the
295 * need to take lots of locks. 294 * need to take lots of locks.
296 * 295 *
297 * Returns a pointer to the found key and increments the key usage count if 296 * Returns a pointer to the found key and increments the key usage count if
298 * successful; -EAGAIN if no matching keys were found, or if expired or revoked 297 * successful; -EAGAIN if no matching keys were found, or if expired or revoked
299 * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the 298 * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the
300 * specified keyring wasn't a keyring. 299 * specified keyring wasn't a keyring.
301 * 300 *
302 * In the case of a successful return, the possession attribute from 301 * In the case of a successful return, the possession attribute from
303 * @keyring_ref is propagated to the returned key reference. 302 * @keyring_ref is propagated to the returned key reference.
304 */ 303 */
305 key_ref_t keyring_search_aux(key_ref_t keyring_ref, 304 key_ref_t keyring_search_aux(key_ref_t keyring_ref,
306 const struct cred *cred, 305 const struct cred *cred,
307 struct key_type *type, 306 struct key_type *type,
308 const void *description, 307 const void *description,
309 key_match_func_t match, 308 key_match_func_t match,
310 bool no_state_check) 309 bool no_state_check)
311 { 310 {
312 struct { 311 struct {
313 struct keyring_list *keylist; 312 struct keyring_list *keylist;
314 int kix; 313 int kix;
315 } stack[KEYRING_SEARCH_MAX_DEPTH]; 314 } stack[KEYRING_SEARCH_MAX_DEPTH];
316 315
317 struct keyring_list *keylist; 316 struct keyring_list *keylist;
318 struct timespec now; 317 struct timespec now;
319 unsigned long possessed, kflags; 318 unsigned long possessed, kflags;
320 struct key *keyring, *key; 319 struct key *keyring, *key;
321 key_ref_t key_ref; 320 key_ref_t key_ref;
322 long err; 321 long err;
323 int sp, kix; 322 int sp, kix;
324 323
325 keyring = key_ref_to_ptr(keyring_ref); 324 keyring = key_ref_to_ptr(keyring_ref);
326 possessed = is_key_possessed(keyring_ref); 325 possessed = is_key_possessed(keyring_ref);
327 key_check(keyring); 326 key_check(keyring);
328 327
329 /* top keyring must have search permission to begin the search */ 328 /* top keyring must have search permission to begin the search */
330 err = key_task_permission(keyring_ref, cred, KEY_SEARCH); 329 err = key_task_permission(keyring_ref, cred, KEY_SEARCH);
331 if (err < 0) { 330 if (err < 0) {
332 key_ref = ERR_PTR(err); 331 key_ref = ERR_PTR(err);
333 goto error; 332 goto error;
334 } 333 }
335 334
336 key_ref = ERR_PTR(-ENOTDIR); 335 key_ref = ERR_PTR(-ENOTDIR);
337 if (keyring->type != &key_type_keyring) 336 if (keyring->type != &key_type_keyring)
338 goto error; 337 goto error;
339 338
340 rcu_read_lock(); 339 rcu_read_lock();
341 340
342 now = current_kernel_time(); 341 now = current_kernel_time();
343 err = -EAGAIN; 342 err = -EAGAIN;
344 sp = 0; 343 sp = 0;
345 344
346 /* firstly we should check to see if this top-level keyring is what we 345 /* firstly we should check to see if this top-level keyring is what we
347 * are looking for */ 346 * are looking for */
348 key_ref = ERR_PTR(-EAGAIN); 347 key_ref = ERR_PTR(-EAGAIN);
349 kflags = keyring->flags; 348 kflags = keyring->flags;
350 if (keyring->type == type && match(keyring, description)) { 349 if (keyring->type == type && match(keyring, description)) {
351 key = keyring; 350 key = keyring;
352 if (no_state_check) 351 if (no_state_check)
353 goto found; 352 goto found;
354 353
355 /* check it isn't negative and hasn't expired or been 354 /* check it isn't negative and hasn't expired or been
356 * revoked */ 355 * revoked */
357 if (kflags & (1 << KEY_FLAG_REVOKED)) 356 if (kflags & (1 << KEY_FLAG_REVOKED))
358 goto error_2; 357 goto error_2;
359 if (key->expiry && now.tv_sec >= key->expiry) 358 if (key->expiry && now.tv_sec >= key->expiry)
360 goto error_2; 359 goto error_2;
361 key_ref = ERR_PTR(key->type_data.reject_error); 360 key_ref = ERR_PTR(key->type_data.reject_error);
362 if (kflags & (1 << KEY_FLAG_NEGATIVE)) 361 if (kflags & (1 << KEY_FLAG_NEGATIVE))
363 goto error_2; 362 goto error_2;
364 goto found; 363 goto found;
365 } 364 }
366 365
367 /* otherwise, the top keyring must not be revoked, expired, or 366 /* otherwise, the top keyring must not be revoked, expired, or
368 * negatively instantiated if we are to search it */ 367 * negatively instantiated if we are to search it */
369 key_ref = ERR_PTR(-EAGAIN); 368 key_ref = ERR_PTR(-EAGAIN);
370 if (kflags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_NEGATIVE)) || 369 if (kflags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_NEGATIVE)) ||
371 (keyring->expiry && now.tv_sec >= keyring->expiry)) 370 (keyring->expiry && now.tv_sec >= keyring->expiry))
372 goto error_2; 371 goto error_2;
373 372
374 /* start processing a new keyring */ 373 /* start processing a new keyring */
375 descend: 374 descend:
376 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) 375 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
377 goto not_this_keyring; 376 goto not_this_keyring;
378 377
379 keylist = rcu_dereference(keyring->payload.subscriptions); 378 keylist = rcu_dereference(keyring->payload.subscriptions);
380 if (!keylist) 379 if (!keylist)
381 goto not_this_keyring; 380 goto not_this_keyring;
382 381
383 /* iterate through the keys in this keyring first */ 382 /* iterate through the keys in this keyring first */
384 for (kix = 0; kix < keylist->nkeys; kix++) { 383 for (kix = 0; kix < keylist->nkeys; kix++) {
385 key = keylist->keys[kix]; 384 key = keylist->keys[kix];
386 kflags = key->flags; 385 kflags = key->flags;
387 386
388 /* ignore keys not of this type */ 387 /* ignore keys not of this type */
389 if (key->type != type) 388 if (key->type != type)
390 continue; 389 continue;
391 390
392 /* skip revoked keys and expired keys */ 391 /* skip revoked keys and expired keys */
393 if (!no_state_check) { 392 if (!no_state_check) {
394 if (kflags & (1 << KEY_FLAG_REVOKED)) 393 if (kflags & (1 << KEY_FLAG_REVOKED))
395 continue; 394 continue;
396 395
397 if (key->expiry && now.tv_sec >= key->expiry) 396 if (key->expiry && now.tv_sec >= key->expiry)
398 continue; 397 continue;
399 } 398 }
400 399
401 /* keys that don't match */ 400 /* keys that don't match */
402 if (!match(key, description)) 401 if (!match(key, description))
403 continue; 402 continue;
404 403
405 /* key must have search permissions */ 404 /* key must have search permissions */
406 if (key_task_permission(make_key_ref(key, possessed), 405 if (key_task_permission(make_key_ref(key, possessed),
407 cred, KEY_SEARCH) < 0) 406 cred, KEY_SEARCH) < 0)
408 continue; 407 continue;
409 408
410 if (no_state_check) 409 if (no_state_check)
411 goto found; 410 goto found;
412 411
413 /* we set a different error code if we pass a negative key */ 412 /* we set a different error code if we pass a negative key */
414 if (kflags & (1 << KEY_FLAG_NEGATIVE)) { 413 if (kflags & (1 << KEY_FLAG_NEGATIVE)) {
415 err = key->type_data.reject_error; 414 err = key->type_data.reject_error;
416 continue; 415 continue;
417 } 416 }
418 417
419 goto found; 418 goto found;
420 } 419 }
421 420
422 /* search through the keyrings nested in this one */ 421 /* search through the keyrings nested in this one */
423 kix = 0; 422 kix = 0;
424 ascend: 423 ascend:
425 for (; kix < keylist->nkeys; kix++) { 424 for (; kix < keylist->nkeys; kix++) {
426 key = keylist->keys[kix]; 425 key = keylist->keys[kix];
427 if (key->type != &key_type_keyring) 426 if (key->type != &key_type_keyring)
428 continue; 427 continue;
429 428
430 /* recursively search nested keyrings 429 /* recursively search nested keyrings
431 * - only search keyrings for which we have search permission 430 * - only search keyrings for which we have search permission
432 */ 431 */
433 if (sp >= KEYRING_SEARCH_MAX_DEPTH) 432 if (sp >= KEYRING_SEARCH_MAX_DEPTH)
434 continue; 433 continue;
435 434
436 if (key_task_permission(make_key_ref(key, possessed), 435 if (key_task_permission(make_key_ref(key, possessed),
437 cred, KEY_SEARCH) < 0) 436 cred, KEY_SEARCH) < 0)
438 continue; 437 continue;
439 438
440 /* stack the current position */ 439 /* stack the current position */
441 stack[sp].keylist = keylist; 440 stack[sp].keylist = keylist;
442 stack[sp].kix = kix; 441 stack[sp].kix = kix;
443 sp++; 442 sp++;
444 443
445 /* begin again with the new keyring */ 444 /* begin again with the new keyring */
446 keyring = key; 445 keyring = key;
447 goto descend; 446 goto descend;
448 } 447 }
449 448
450 /* the keyring we're looking at was disqualified or didn't contain a 449 /* the keyring we're looking at was disqualified or didn't contain a
451 * matching key */ 450 * matching key */
452 not_this_keyring: 451 not_this_keyring:
453 if (sp > 0) { 452 if (sp > 0) {
454 /* resume the processing of a keyring higher up in the tree */ 453 /* resume the processing of a keyring higher up in the tree */
455 sp--; 454 sp--;
456 keylist = stack[sp].keylist; 455 keylist = stack[sp].keylist;
457 kix = stack[sp].kix + 1; 456 kix = stack[sp].kix + 1;
458 goto ascend; 457 goto ascend;
459 } 458 }
460 459
461 key_ref = ERR_PTR(err); 460 key_ref = ERR_PTR(err);
462 goto error_2; 461 goto error_2;
463 462
464 /* we found a viable match */ 463 /* we found a viable match */
465 found: 464 found:
466 atomic_inc(&key->usage); 465 atomic_inc(&key->usage);
467 key_check(key); 466 key_check(key);
468 key_ref = make_key_ref(key, possessed); 467 key_ref = make_key_ref(key, possessed);
469 error_2: 468 error_2:
470 rcu_read_unlock(); 469 rcu_read_unlock();
471 error: 470 error:
472 return key_ref; 471 return key_ref;
473 } 472 }
474 473
475 /** 474 /**
476 * keyring_search - Search the supplied keyring tree for a matching key 475 * keyring_search - Search the supplied keyring tree for a matching key
477 * @keyring: The root of the keyring tree to be searched. 476 * @keyring: The root of the keyring tree to be searched.
478 * @type: The type of key we want to find. 477 * @type: The type of key we want to find.
479 * @description: The name of the key we want to find. 478 * @description: The name of the key we want to find.
480 * 479 *
481 * As keyring_search_aux() above, but using the current task's credentials and 480 * As keyring_search_aux() above, but using the current task's credentials and
482 * type's default matching function. 481 * type's default matching function.
483 */ 482 */
484 key_ref_t keyring_search(key_ref_t keyring, 483 key_ref_t keyring_search(key_ref_t keyring,
485 struct key_type *type, 484 struct key_type *type,
486 const char *description) 485 const char *description)
487 { 486 {
488 if (!type->match) 487 if (!type->match)
489 return ERR_PTR(-ENOKEY); 488 return ERR_PTR(-ENOKEY);
490 489
491 return keyring_search_aux(keyring, current->cred, 490 return keyring_search_aux(keyring, current->cred,
492 type, description, type->match, false); 491 type, description, type->match, false);
493 } 492 }
494 EXPORT_SYMBOL(keyring_search); 493 EXPORT_SYMBOL(keyring_search);
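
For reference, a minimal sketch of how an in-kernel caller might use the exported keyring_search() interface; the helper name example_lookup, the "start" keyring parameter and the choice of key_type_user are illustrative assumptions and not part of this file.

/* Illustrative sketch only - not part of keyring.c. */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/key.h>
#include <keys/user-type.h>	/* key_type_user, assumed for convenience */

static int example_lookup(struct key *start, const char *desc)
{
	key_ref_t kref;
	struct key *key;

	/* treat the starting keyring as possessed by the caller */
	kref = keyring_search(make_key_ref(start, 1), &key_type_user, desc);
	if (IS_ERR(kref))
		return PTR_ERR(kref);		/* typically -ENOKEY */

	key = key_ref_to_ptr(kref);		/* usage count was already bumped */
	pr_info("found key %d\n", key_serial(key));
	key_ref_put(kref);			/* drop the reference when done */
	return 0;
}

The possession bit passed to make_key_ref() is propagated into the permission checks performed during the search, which is why the sketch marks the starting keyring as possessed.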
495 494
496 /* 495 /*
497 * Search the given keyring only (no recursion). 496 * Search the given keyring only (no recursion).
498 * 497 *
499 * The caller must guarantee that the keyring is a keyring and that search 498 * The caller must guarantee that the keyring is a keyring and that search
500 * permission has been granted on it, as no check is made here. 499 * permission has been granted on it, as no check is made here.
501 * 500 *
502 * RCU is used to make it unnecessary to lock the keyring key list here. 501 * RCU is used to make it unnecessary to lock the keyring key list here.
503 * 502 *
504 * Returns a pointer to the found key with usage count incremented if 503 * Returns a pointer to the found key with usage count incremented if
505 * successful and returns -ENOKEY if not found. Revoked keys and keys not 504 * successful and returns -ENOKEY if not found. Revoked keys and keys not
506 * providing the requested permission are skipped over. 505 * providing the requested permission are skipped over.
507 * 506 *
508 * If successful, the possession indicator is propagated from the keyring ref 507 * If successful, the possession indicator is propagated from the keyring ref
509 * to the returned key reference. 508 * to the returned key reference.
510 */ 509 */
511 key_ref_t __keyring_search_one(key_ref_t keyring_ref, 510 key_ref_t __keyring_search_one(key_ref_t keyring_ref,
512 const struct key_type *ktype, 511 const struct key_type *ktype,
513 const char *description, 512 const char *description,
514 key_perm_t perm) 513 key_perm_t perm)
515 { 514 {
516 struct keyring_list *klist; 515 struct keyring_list *klist;
517 unsigned long possessed; 516 unsigned long possessed;
518 struct key *keyring, *key; 517 struct key *keyring, *key;
519 int loop; 518 int loop;
520 519
521 keyring = key_ref_to_ptr(keyring_ref); 520 keyring = key_ref_to_ptr(keyring_ref);
522 possessed = is_key_possessed(keyring_ref); 521 possessed = is_key_possessed(keyring_ref);
523 522
524 rcu_read_lock(); 523 rcu_read_lock();
525 524
526 klist = rcu_dereference(keyring->payload.subscriptions); 525 klist = rcu_dereference(keyring->payload.subscriptions);
527 if (klist) { 526 if (klist) {
528 for (loop = 0; loop < klist->nkeys; loop++) { 527 for (loop = 0; loop < klist->nkeys; loop++) {
529 key = klist->keys[loop]; 528 key = klist->keys[loop];
530 529
531 if (key->type == ktype && 530 if (key->type == ktype &&
532 (!key->type->match || 531 (!key->type->match ||
533 key->type->match(key, description)) && 532 key->type->match(key, description)) &&
534 key_permission(make_key_ref(key, possessed), 533 key_permission(make_key_ref(key, possessed),
535 perm) == 0 && 534 perm) == 0 &&
536 !test_bit(KEY_FLAG_REVOKED, &key->flags) 535 !test_bit(KEY_FLAG_REVOKED, &key->flags)
537 ) 536 )
538 goto found; 537 goto found;
539 } 538 }
540 } 539 }
541 540
542 rcu_read_unlock(); 541 rcu_read_unlock();
543 return ERR_PTR(-ENOKEY); 542 return ERR_PTR(-ENOKEY);
544 543
545 found: 544 found:
546 atomic_inc(&key->usage); 545 atomic_inc(&key->usage);
547 rcu_read_unlock(); 546 rcu_read_unlock();
548 return make_key_ref(key, possessed); 547 return make_key_ref(key, possessed);
549 } 548 }
550 549
551 /* 550 /*
552 * Find a keyring with the specified name. 551 * Find a keyring with the specified name.
553 * 552 *
554 * All named keyrings in the current user namespace are searched, provided they 553 * All named keyrings in the current user namespace are searched, provided they
555 * grant Search permission directly to the caller (unless this check is 554 * grant Search permission directly to the caller (unless this check is
556 * skipped). Keyrings whose usage counts have reached zero or that have been 555 * skipped). Keyrings whose usage counts have reached zero or that have been
557 * revoked are skipped. 556 * revoked are skipped.
558 * 557 *
559 * Returns a pointer to the keyring with the keyring's refcount having been 558 * Returns a pointer to the keyring with the keyring's refcount having been
560 * incremented on success. -ENOKEY is returned if a key could not be found. 559 * incremented on success. -ENOKEY is returned if a key could not be found.
561 */ 560 */
562 struct key *find_keyring_by_name(const char *name, bool skip_perm_check) 561 struct key *find_keyring_by_name(const char *name, bool skip_perm_check)
563 { 562 {
564 struct key *keyring; 563 struct key *keyring;
565 int bucket; 564 int bucket;
566 565
567 if (!name) 566 if (!name)
568 return ERR_PTR(-EINVAL); 567 return ERR_PTR(-EINVAL);
569 568
570 bucket = keyring_hash(name); 569 bucket = keyring_hash(name);
571 570
572 read_lock(&keyring_name_lock); 571 read_lock(&keyring_name_lock);
573 572
574 if (keyring_name_hash[bucket].next) { 573 if (keyring_name_hash[bucket].next) {
575 /* search this hash bucket for a keyring with a matching name 574 /* search this hash bucket for a keyring with a matching name
576 * that's readable and that hasn't been revoked */ 575 * that's readable and that hasn't been revoked */
577 list_for_each_entry(keyring, 576 list_for_each_entry(keyring,
578 &keyring_name_hash[bucket], 577 &keyring_name_hash[bucket],
579 type_data.link 578 type_data.link
580 ) { 579 ) {
581 if (keyring->user->user_ns != current_user_ns()) 580 if (keyring->user->user_ns != current_user_ns())
582 continue; 581 continue;
583 582
584 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) 583 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
585 continue; 584 continue;
586 585
587 if (strcmp(keyring->description, name) != 0) 586 if (strcmp(keyring->description, name) != 0)
588 continue; 587 continue;
589 588
590 if (!skip_perm_check && 589 if (!skip_perm_check &&
591 key_permission(make_key_ref(keyring, 0), 590 key_permission(make_key_ref(keyring, 0),
592 KEY_SEARCH) < 0) 591 KEY_SEARCH) < 0)
593 continue; 592 continue;
594 593
595 /* we've got a match but we might end up racing with 594 /* we've got a match but we might end up racing with
596 * key_cleanup() if the keyring is currently 'dead' 595 * key_cleanup() if the keyring is currently 'dead'
597 * (ie. it has a zero usage count) */ 596 * (ie. it has a zero usage count) */
598 if (!atomic_inc_not_zero(&keyring->usage)) 597 if (!atomic_inc_not_zero(&keyring->usage))
599 continue; 598 continue;
600 goto out; 599 goto out;
601 } 600 }
602 } 601 }
603 602
604 keyring = ERR_PTR(-ENOKEY); 603 keyring = ERR_PTR(-ENOKEY);
605 out: 604 out:
606 read_unlock(&keyring_name_lock); 605 read_unlock(&keyring_name_lock);
607 return keyring; 606 return keyring;
608 } 607 }
609 608
610 /* 609 /*
611 * See if a cycle will be created by inserting acyclic tree B in acyclic 610 * See if a cycle will be created by inserting acyclic tree B in acyclic
612 * tree A at the topmost level (ie: as a direct child of A). 611 * tree A at the topmost level (ie: as a direct child of A).
613 * 612 *
614 * Since we are adding B to A at the top level, checking for cycles should just 613 * Since we are adding B to A at the top level, checking for cycles should just
615 * be a matter of seeing if node A is somewhere in tree B. 614 * be a matter of seeing if node A is somewhere in tree B.
616 */ 615 */
617 static int keyring_detect_cycle(struct key *A, struct key *B) 616 static int keyring_detect_cycle(struct key *A, struct key *B)
618 { 617 {
619 struct { 618 struct {
620 struct keyring_list *keylist; 619 struct keyring_list *keylist;
621 int kix; 620 int kix;
622 } stack[KEYRING_SEARCH_MAX_DEPTH]; 621 } stack[KEYRING_SEARCH_MAX_DEPTH];
623 622
624 struct keyring_list *keylist; 623 struct keyring_list *keylist;
625 struct key *subtree, *key; 624 struct key *subtree, *key;
626 int sp, kix, ret; 625 int sp, kix, ret;
627 626
628 rcu_read_lock(); 627 rcu_read_lock();
629 628
630 ret = -EDEADLK; 629 ret = -EDEADLK;
631 if (A == B) 630 if (A == B)
632 goto cycle_detected; 631 goto cycle_detected;
633 632
634 subtree = B; 633 subtree = B;
635 sp = 0; 634 sp = 0;
636 635
637 /* start processing a new keyring */ 636 /* start processing a new keyring */
638 descend: 637 descend:
639 if (test_bit(KEY_FLAG_REVOKED, &subtree->flags)) 638 if (test_bit(KEY_FLAG_REVOKED, &subtree->flags))
640 goto not_this_keyring; 639 goto not_this_keyring;
641 640
642 keylist = rcu_dereference(subtree->payload.subscriptions); 641 keylist = rcu_dereference(subtree->payload.subscriptions);
643 if (!keylist) 642 if (!keylist)
644 goto not_this_keyring; 643 goto not_this_keyring;
645 kix = 0; 644 kix = 0;
646 645
647 ascend: 646 ascend:
648 /* iterate through the remaining keys in this keyring */ 647 /* iterate through the remaining keys in this keyring */
649 for (; kix < keylist->nkeys; kix++) { 648 for (; kix < keylist->nkeys; kix++) {
650 key = keylist->keys[kix]; 649 key = keylist->keys[kix];
651 650
652 if (key == A) 651 if (key == A)
653 goto cycle_detected; 652 goto cycle_detected;
654 653
655 /* recursively check nested keyrings */ 654 /* recursively check nested keyrings */
656 if (key->type == &key_type_keyring) { 655 if (key->type == &key_type_keyring) {
657 if (sp >= KEYRING_SEARCH_MAX_DEPTH) 656 if (sp >= KEYRING_SEARCH_MAX_DEPTH)
658 goto too_deep; 657 goto too_deep;
659 658
660 /* stack the current position */ 659 /* stack the current position */
661 stack[sp].keylist = keylist; 660 stack[sp].keylist = keylist;
662 stack[sp].kix = kix; 661 stack[sp].kix = kix;
663 sp++; 662 sp++;
664 663
665 /* begin again with the new keyring */ 664 /* begin again with the new keyring */
666 subtree = key; 665 subtree = key;
667 goto descend; 666 goto descend;
668 } 667 }
669 } 668 }
670 669
671 /* the keyring we're looking at was disqualified or didn't contain a 670 /* the keyring we're looking at was disqualified or didn't contain a
672 * matching key */ 671 * matching key */
673 not_this_keyring: 672 not_this_keyring:
674 if (sp > 0) { 673 if (sp > 0) {
675 /* resume the checking of a keyring higher up in the tree */ 674 /* resume the checking of a keyring higher up in the tree */
676 sp--; 675 sp--;
677 keylist = stack[sp].keylist; 676 keylist = stack[sp].keylist;
678 kix = stack[sp].kix + 1; 677 kix = stack[sp].kix + 1;
679 goto ascend; 678 goto ascend;
680 } 679 }
681 680
682 ret = 0; /* no cycles detected */ 681 ret = 0; /* no cycles detected */
683 682
684 error: 683 error:
685 rcu_read_unlock(); 684 rcu_read_unlock();
686 return ret; 685 return ret;
687 686
688 too_deep: 687 too_deep:
689 ret = -ELOOP; 688 ret = -ELOOP;
690 goto error; 689 goto error;
691 690
692 cycle_detected: 691 cycle_detected:
693 ret = -EDEADLK; 692 ret = -EDEADLK;
694 goto error; 693 goto error;
695 } 694 }
696 695
697 /* 696 /*
698 * Dispose of a keyring list after the RCU grace period, freeing the unlinked 697 * Dispose of a keyring list after the RCU grace period, freeing the unlinked
699 * key 698 * key
700 */ 699 */
701 static void keyring_unlink_rcu_disposal(struct rcu_head *rcu) 700 static void keyring_unlink_rcu_disposal(struct rcu_head *rcu)
702 { 701 {
703 struct keyring_list *klist = 702 struct keyring_list *klist =
704 container_of(rcu, struct keyring_list, rcu); 703 container_of(rcu, struct keyring_list, rcu);
705 704
706 if (klist->delkey != USHRT_MAX) 705 if (klist->delkey != USHRT_MAX)
707 key_put(klist->keys[klist->delkey]); 706 key_put(klist->keys[klist->delkey]);
708 kfree(klist); 707 kfree(klist);
709 } 708 }
710 709
711 /* 710 /*
712 * Preallocate memory so that a key can be linked into a keyring. 711 * Preallocate memory so that a key can be linked into a keyring.
713 */ 712 */
714 int __key_link_begin(struct key *keyring, const struct key_type *type, 713 int __key_link_begin(struct key *keyring, const struct key_type *type,
715 const char *description, unsigned long *_prealloc) 714 const char *description, unsigned long *_prealloc)
716 __acquires(&keyring->sem) 715 __acquires(&keyring->sem)
717 { 716 {
718 struct keyring_list *klist, *nklist; 717 struct keyring_list *klist, *nklist;
719 unsigned long prealloc; 718 unsigned long prealloc;
720 unsigned max; 719 unsigned max;
721 size_t size; 720 size_t size;
722 int loop, ret; 721 int loop, ret;
723 722
724 kenter("%d,%s,%s,", key_serial(keyring), type->name, description); 723 kenter("%d,%s,%s,", key_serial(keyring), type->name, description);
725 724
726 if (keyring->type != &key_type_keyring) 725 if (keyring->type != &key_type_keyring)
727 return -ENOTDIR; 726 return -ENOTDIR;
728 727
729 down_write(&keyring->sem); 728 down_write(&keyring->sem);
730 729
731 ret = -EKEYREVOKED; 730 ret = -EKEYREVOKED;
732 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) 731 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
733 goto error_krsem; 732 goto error_krsem;
734 733
735 /* serialise link/link calls to prevent parallel calls causing a cycle 734 /* serialise link/link calls to prevent parallel calls causing a cycle
736 * when linking two keyrings in opposite orders */ 735 * when linking two keyrings in opposite orders */
737 if (type == &key_type_keyring) 736 if (type == &key_type_keyring)
738 down_write(&keyring_serialise_link_sem); 737 down_write(&keyring_serialise_link_sem);
739 738
740 klist = rcu_dereference_locked_keyring(keyring); 739 klist = rcu_dereference_locked_keyring(keyring);
741 740
742 /* see if there's a matching key we can displace */ 741 /* see if there's a matching key we can displace */
743 if (klist && klist->nkeys > 0) { 742 if (klist && klist->nkeys > 0) {
744 for (loop = klist->nkeys - 1; loop >= 0; loop--) { 743 for (loop = klist->nkeys - 1; loop >= 0; loop--) {
745 if (klist->keys[loop]->type == type && 744 if (klist->keys[loop]->type == type &&
746 strcmp(klist->keys[loop]->description, 745 strcmp(klist->keys[loop]->description,
747 description) == 0 746 description) == 0
748 ) { 747 ) {
749 /* found a match - we'll replace this one with 748 /* found a match - we'll replace this one with
750 * the new key */ 749 * the new key */
751 size = sizeof(struct key *) * klist->maxkeys; 750 size = sizeof(struct key *) * klist->maxkeys;
752 size += sizeof(*klist); 751 size += sizeof(*klist);
753 BUG_ON(size > PAGE_SIZE); 752 BUG_ON(size > PAGE_SIZE);
754 753
755 ret = -ENOMEM; 754 ret = -ENOMEM;
756 nklist = kmemdup(klist, size, GFP_KERNEL); 755 nklist = kmemdup(klist, size, GFP_KERNEL);
757 if (!nklist) 756 if (!nklist)
758 goto error_sem; 757 goto error_sem;
759 758
760 /* note replacement slot */ 759 /* note replacement slot */
761 klist->delkey = nklist->delkey = loop; 760 klist->delkey = nklist->delkey = loop;
762 prealloc = (unsigned long)nklist; 761 prealloc = (unsigned long)nklist;
763 goto done; 762 goto done;
764 } 763 }
765 } 764 }
766 } 765 }
767 766
768 /* check that we aren't going to overrun the user's quota */ 767 /* check that we aren't going to overrun the user's quota */
769 ret = key_payload_reserve(keyring, 768 ret = key_payload_reserve(keyring,
770 keyring->datalen + KEYQUOTA_LINK_BYTES); 769 keyring->datalen + KEYQUOTA_LINK_BYTES);
771 if (ret < 0) 770 if (ret < 0)
772 goto error_sem; 771 goto error_sem;
773 772
774 if (klist && klist->nkeys < klist->maxkeys) { 773 if (klist && klist->nkeys < klist->maxkeys) {
775 /* there's sufficient slack space to append directly */ 774 /* there's sufficient slack space to append directly */
776 nklist = NULL; 775 nklist = NULL;
777 prealloc = KEY_LINK_FIXQUOTA; 776 prealloc = KEY_LINK_FIXQUOTA;
778 } else { 777 } else {
779 /* grow the key list */ 778 /* grow the key list */
780 max = 4; 779 max = 4;
781 if (klist) 780 if (klist)
782 max += klist->maxkeys; 781 max += klist->maxkeys;
783 782
784 ret = -ENFILE; 783 ret = -ENFILE;
785 if (max > USHRT_MAX - 1) 784 if (max > USHRT_MAX - 1)
786 goto error_quota; 785 goto error_quota;
787 size = sizeof(*klist) + sizeof(struct key *) * max; 786 size = sizeof(*klist) + sizeof(struct key *) * max;
788 if (size > PAGE_SIZE) 787 if (size > PAGE_SIZE)
789 goto error_quota; 788 goto error_quota;
790 789
791 ret = -ENOMEM; 790 ret = -ENOMEM;
792 nklist = kmalloc(size, GFP_KERNEL); 791 nklist = kmalloc(size, GFP_KERNEL);
793 if (!nklist) 792 if (!nklist)
794 goto error_quota; 793 goto error_quota;
795 794
796 nklist->maxkeys = max; 795 nklist->maxkeys = max;
797 if (klist) { 796 if (klist) {
798 memcpy(nklist->keys, klist->keys, 797 memcpy(nklist->keys, klist->keys,
799 sizeof(struct key *) * klist->nkeys); 798 sizeof(struct key *) * klist->nkeys);
800 nklist->delkey = klist->nkeys; 799 nklist->delkey = klist->nkeys;
801 nklist->nkeys = klist->nkeys + 1; 800 nklist->nkeys = klist->nkeys + 1;
802 klist->delkey = USHRT_MAX; 801 klist->delkey = USHRT_MAX;
803 } else { 802 } else {
804 nklist->nkeys = 1; 803 nklist->nkeys = 1;
805 nklist->delkey = 0; 804 nklist->delkey = 0;
806 } 805 }
807 806
808 /* add the key into the new space */ 807 /* add the key into the new space */
809 nklist->keys[nklist->delkey] = NULL; 808 nklist->keys[nklist->delkey] = NULL;
810 } 809 }
811 810
812 prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA; 811 prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA;
813 done: 812 done:
814 *_prealloc = prealloc; 813 *_prealloc = prealloc;
815 kleave(" = 0"); 814 kleave(" = 0");
816 return 0; 815 return 0;
817 816
818 error_quota: 817 error_quota:
819 /* undo the quota changes */ 818 /* undo the quota changes */
820 key_payload_reserve(keyring, 819 key_payload_reserve(keyring,
821 keyring->datalen - KEYQUOTA_LINK_BYTES); 820 keyring->datalen - KEYQUOTA_LINK_BYTES);
822 error_sem: 821 error_sem:
823 if (type == &key_type_keyring) 822 if (type == &key_type_keyring)
824 up_write(&keyring_serialise_link_sem); 823 up_write(&keyring_serialise_link_sem);
825 error_krsem: 824 error_krsem:
826 up_write(&keyring->sem); 825 up_write(&keyring->sem);
827 kleave(" = %d", ret); 826 kleave(" = %d", ret);
828 return ret; 827 return ret;
829 } 828 }
830 829
831 /* 830 /*
832 * Check already instantiated keys aren't going to be a problem. 831 * Check already instantiated keys aren't going to be a problem.
833 * 832 *
834 * The caller must have called __key_link_begin(). Don't need to call this for 833 * The caller must have called __key_link_begin(). Don't need to call this for
835 * keys that were created since __key_link_begin() was called. 834 * keys that were created since __key_link_begin() was called.
836 */ 835 */
837 int __key_link_check_live_key(struct key *keyring, struct key *key) 836 int __key_link_check_live_key(struct key *keyring, struct key *key)
838 { 837 {
839 if (key->type == &key_type_keyring) 838 if (key->type == &key_type_keyring)
840 /* check that we aren't going to create a cycle by linking one 839 /* check that we aren't going to create a cycle by linking one
841 * keyring to another */ 840 * keyring to another */
842 return keyring_detect_cycle(keyring, key); 841 return keyring_detect_cycle(keyring, key);
843 return 0; 842 return 0;
844 } 843 }
845 844
846 /* 845 /*
847 * Link a key into a keyring. 846 * Link a key into a keyring.
848 * 847 *
849 * Must be called with __key_link_begin() having been called. Discards any 848 * Must be called with __key_link_begin() having been called. Discards any
850 * already extant link to a matching key if there is one, so that each keyring 849 * already extant link to a matching key if there is one, so that each keyring
851 * holds at most one link to any given key of a particular type+description 850 * holds at most one link to any given key of a particular type+description
852 * combination. 851 * combination.
853 */ 852 */
854 void __key_link(struct key *keyring, struct key *key, 853 void __key_link(struct key *keyring, struct key *key,
855 unsigned long *_prealloc) 854 unsigned long *_prealloc)
856 { 855 {
857 struct keyring_list *klist, *nklist; 856 struct keyring_list *klist, *nklist;
858 857
859 nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA); 858 nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA);
860 *_prealloc = 0; 859 *_prealloc = 0;
861 860
862 kenter("%d,%d,%p", keyring->serial, key->serial, nklist); 861 kenter("%d,%d,%p", keyring->serial, key->serial, nklist);
863 862
864 klist = rcu_dereference_protected(keyring->payload.subscriptions, 863 klist = rcu_dereference_protected(keyring->payload.subscriptions,
865 rwsem_is_locked(&keyring->sem)); 864 rwsem_is_locked(&keyring->sem));
866 865
867 atomic_inc(&key->usage); 866 atomic_inc(&key->usage);
868 867
869 /* there's a matching key we can displace or an empty slot in a newly 868 /* there's a matching key we can displace or an empty slot in a newly
870 * allocated list we can fill */ 869 * allocated list we can fill */
871 if (nklist) { 870 if (nklist) {
872 kdebug("replace %hu/%hu/%hu", 871 kdebug("replace %hu/%hu/%hu",
873 nklist->delkey, nklist->nkeys, nklist->maxkeys); 872 nklist->delkey, nklist->nkeys, nklist->maxkeys);
874 873
875 nklist->keys[nklist->delkey] = key; 874 nklist->keys[nklist->delkey] = key;
876 875
877 rcu_assign_pointer(keyring->payload.subscriptions, nklist); 876 rcu_assign_pointer(keyring->payload.subscriptions, nklist);
878 877
879 /* dispose of the old keyring list and, if there was one, the 878 /* dispose of the old keyring list and, if there was one, the
880 * displaced key */ 879 * displaced key */
881 if (klist) { 880 if (klist) {
882 kdebug("dispose %hu/%hu/%hu", 881 kdebug("dispose %hu/%hu/%hu",
883 klist->delkey, klist->nkeys, klist->maxkeys); 882 klist->delkey, klist->nkeys, klist->maxkeys);
884 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); 883 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
885 } 884 }
886 } else { 885 } else {
887 /* there's sufficient slack space to append directly */ 886 /* there's sufficient slack space to append directly */
888 klist->keys[klist->nkeys] = key; 887 klist->keys[klist->nkeys] = key;
889 smp_wmb(); 888 smp_wmb();
890 klist->nkeys++; 889 klist->nkeys++;
891 } 890 }
892 } 891 }
893 892
894 /* 893 /*
895 * Finish linking a key into a keyring. 894 * Finish linking a key into a keyring.
896 * 895 *
897 * Must be called with __key_link_begin() having been called. 896 * Must be called with __key_link_begin() having been called.
898 */ 897 */
899 void __key_link_end(struct key *keyring, struct key_type *type, 898 void __key_link_end(struct key *keyring, struct key_type *type,
900 unsigned long prealloc) 899 unsigned long prealloc)
901 __releases(&keyring->sem) 900 __releases(&keyring->sem)
902 { 901 {
903 BUG_ON(type == NULL); 902 BUG_ON(type == NULL);
904 BUG_ON(type->name == NULL); 903 BUG_ON(type->name == NULL);
905 kenter("%d,%s,%lx", keyring->serial, type->name, prealloc); 904 kenter("%d,%s,%lx", keyring->serial, type->name, prealloc);
906 905
907 if (type == &key_type_keyring) 906 if (type == &key_type_keyring)
908 up_write(&keyring_serialise_link_sem); 907 up_write(&keyring_serialise_link_sem);
909 908
910 if (prealloc) { 909 if (prealloc) {
911 if (prealloc & KEY_LINK_FIXQUOTA) 910 if (prealloc & KEY_LINK_FIXQUOTA)
912 key_payload_reserve(keyring, 911 key_payload_reserve(keyring,
913 keyring->datalen - 912 keyring->datalen -
914 KEYQUOTA_LINK_BYTES); 913 KEYQUOTA_LINK_BYTES);
915 kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA)); 914 kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA));
916 } 915 }
917 up_write(&keyring->sem); 916 up_write(&keyring->sem);
918 } 917 }
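
Taken together, __key_link_begin(), __key_link_check_live_key(), __key_link() and __key_link_end() form a begin/check/link/end protocol. The sketch below only illustrates the expected calling order from inside the keys subsystem and mirrors what key_link() immediately below does; the helper name example_link is an assumption.

/* Illustrative sketch of the link protocol; key_link() below is the
 * canonical user of these helpers. */
static int example_link(struct key *keyring, struct key *key)
{
	unsigned long prealloc;
	int ret;

	ret = __key_link_begin(keyring, key->type, key->description, &prealloc);
	if (ret < 0)
		return ret;		/* revoked keyring, quota or OOM */

	ret = __key_link_check_live_key(keyring, key);
	if (ret == 0)
		__key_link(keyring, key, &prealloc);

	/* always called, whether or not the link was made; unlocks and
	 * discards any unused preallocation */
	__key_link_end(keyring, key->type, prealloc);
	return ret;
}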
919 918
920 /** 919 /**
921 * key_link - Link a key to a keyring 920 * key_link - Link a key to a keyring
922 * @keyring: The keyring to make the link in. 921 * @keyring: The keyring to make the link in.
923 * @key: The key to link to. 922 * @key: The key to link to.
924 * 923 *
925 * Make a link in a keyring to a key, such that the keyring holds a reference 924 * Make a link in a keyring to a key, such that the keyring holds a reference
926 * on that key and the key can potentially be found by searching that keyring. 925 * on that key and the key can potentially be found by searching that keyring.
927 * 926 *
928 * This function will write-lock the keyring's semaphore and will consume some 927 * This function will write-lock the keyring's semaphore and will consume some
929 * of the user's key data quota to hold the link. 928 * of the user's key data quota to hold the link.
930 * 929 *
931 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, 930 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring,
932 * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is 931 * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is
933 * full, -EDQUOT if there is insufficient key data quota remaining to add 932 * full, -EDQUOT if there is insufficient key data quota remaining to add
934 * another link or -ENOMEM if there's insufficient memory. 933 * another link or -ENOMEM if there's insufficient memory.
935 * 934 *
936 * It is assumed that the caller has checked that it is permitted for a link to 935 * It is assumed that the caller has checked that it is permitted for a link to
937 * be made (the keyring should have Write permission and the key Link 936 * be made (the keyring should have Write permission and the key Link
938 * permission). 937 * permission).
939 */ 938 */
940 int key_link(struct key *keyring, struct key *key) 939 int key_link(struct key *keyring, struct key *key)
941 { 940 {
942 unsigned long prealloc; 941 unsigned long prealloc;
943 int ret; 942 int ret;
944 943
945 key_check(keyring); 944 key_check(keyring);
946 key_check(key); 945 key_check(key);
947 946
948 ret = __key_link_begin(keyring, key->type, key->description, &prealloc); 947 ret = __key_link_begin(keyring, key->type, key->description, &prealloc);
949 if (ret == 0) { 948 if (ret == 0) {
950 ret = __key_link_check_live_key(keyring, key); 949 ret = __key_link_check_live_key(keyring, key);
951 if (ret == 0) 950 if (ret == 0)
952 __key_link(keyring, key, &prealloc); 951 __key_link(keyring, key, &prealloc);
953 __key_link_end(keyring, key->type, prealloc); 952 __key_link_end(keyring, key->type, prealloc);
954 } 953 }
955 954
956 return ret; 955 return ret;
957 } 956 }
958 EXPORT_SYMBOL(key_link); 957 EXPORT_SYMBOL(key_link);
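
As a usage note, a hedged sketch of how a caller holding references to a destination keyring (with Write permission) and a key (with Link permission) might invoke key_link(); the helper name and the pr_warn() reporting are illustrative assumptions.

/* Illustrative sketch only. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/key.h>

static int example_add_to_keyring(struct key *dest_keyring, struct key *key)
{
	int ret = key_link(dest_keyring, key);

	if (ret == -EDEADLK || ret == -ELOOP)
		pr_warn("linking would nest keyrings illegally: %d\n", ret);
	else if (ret < 0)
		pr_warn("key_link failed: %d\n", ret);
	return ret;
}

The -EDEADLK and -ELOOP cases come from keyring_detect_cycle() above when the key being linked is itself a keyring.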
959 958
960 /** 959 /**
961 * key_unlink - Unlink the first link to a key from a keyring. 960 * key_unlink - Unlink the first link to a key from a keyring.
962 * @keyring: The keyring to remove the link from. 961 * @keyring: The keyring to remove the link from.
963 * @key: The key the link is to. 962 * @key: The key the link is to.
964 * 963 *
965 * Remove a link from a keyring to a key. 964 * Remove a link from a keyring to a key.
966 * 965 *
967 * This function will write-lock the keyring's semaphore. 966 * This function will write-lock the keyring's semaphore.
968 * 967 *
969 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if 968 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if
970 * the key isn't linked to by the keyring or -ENOMEM if there's insufficient 969 * the key isn't linked to by the keyring or -ENOMEM if there's insufficient
971 * memory. 970 * memory.
972 * 971 *
973 * It is assumed that the caller has checked that it is permitted for a link to 972 * It is assumed that the caller has checked that it is permitted for a link to
974 * be removed (the keyring should have Write permission; no permissions are 973 * be removed (the keyring should have Write permission; no permissions are
975 * required on the key). 974 * required on the key).
976 */ 975 */
977 int key_unlink(struct key *keyring, struct key *key) 976 int key_unlink(struct key *keyring, struct key *key)
978 { 977 {
979 struct keyring_list *klist, *nklist; 978 struct keyring_list *klist, *nklist;
980 int loop, ret; 979 int loop, ret;
981 980
982 key_check(keyring); 981 key_check(keyring);
983 key_check(key); 982 key_check(key);
984 983
985 ret = -ENOTDIR; 984 ret = -ENOTDIR;
986 if (keyring->type != &key_type_keyring) 985 if (keyring->type != &key_type_keyring)
987 goto error; 986 goto error;
988 987
989 down_write(&keyring->sem); 988 down_write(&keyring->sem);
990 989
991 klist = rcu_dereference_locked_keyring(keyring); 990 klist = rcu_dereference_locked_keyring(keyring);
992 if (klist) { 991 if (klist) {
993 /* search the keyring for the key */ 992 /* search the keyring for the key */
994 for (loop = 0; loop < klist->nkeys; loop++) 993 for (loop = 0; loop < klist->nkeys; loop++)
995 if (klist->keys[loop] == key) 994 if (klist->keys[loop] == key)
996 goto key_is_present; 995 goto key_is_present;
997 } 996 }
998 997
999 up_write(&keyring->sem); 998 up_write(&keyring->sem);
1000 ret = -ENOENT; 999 ret = -ENOENT;
1001 goto error; 1000 goto error;
1002 1001
1003 key_is_present: 1002 key_is_present:
1004 /* we need to copy the key list for RCU purposes */ 1003 /* we need to copy the key list for RCU purposes */
1005 nklist = kmalloc(sizeof(*klist) + 1004 nklist = kmalloc(sizeof(*klist) +
1006 sizeof(struct key *) * klist->maxkeys, 1005 sizeof(struct key *) * klist->maxkeys,
1007 GFP_KERNEL); 1006 GFP_KERNEL);
1008 if (!nklist) 1007 if (!nklist)
1009 goto nomem; 1008 goto nomem;
1010 nklist->maxkeys = klist->maxkeys; 1009 nklist->maxkeys = klist->maxkeys;
1011 nklist->nkeys = klist->nkeys - 1; 1010 nklist->nkeys = klist->nkeys - 1;
1012 1011
1013 if (loop > 0) 1012 if (loop > 0)
1014 memcpy(&nklist->keys[0], 1013 memcpy(&nklist->keys[0],
1015 &klist->keys[0], 1014 &klist->keys[0],
1016 loop * sizeof(struct key *)); 1015 loop * sizeof(struct key *));
1017 1016
1018 if (loop < nklist->nkeys) 1017 if (loop < nklist->nkeys)
1019 memcpy(&nklist->keys[loop], 1018 memcpy(&nklist->keys[loop],
1020 &klist->keys[loop + 1], 1019 &klist->keys[loop + 1],
1021 (nklist->nkeys - loop) * sizeof(struct key *)); 1020 (nklist->nkeys - loop) * sizeof(struct key *));
1022 1021
1023 /* adjust the user's quota */ 1022 /* adjust the user's quota */
1024 key_payload_reserve(keyring, 1023 key_payload_reserve(keyring,
1025 keyring->datalen - KEYQUOTA_LINK_BYTES); 1024 keyring->datalen - KEYQUOTA_LINK_BYTES);
1026 1025
1027 rcu_assign_pointer(keyring->payload.subscriptions, nklist); 1026 rcu_assign_pointer(keyring->payload.subscriptions, nklist);
1028 1027
1029 up_write(&keyring->sem); 1028 up_write(&keyring->sem);
1030 1029
1031 /* schedule for later cleanup */ 1030 /* schedule for later cleanup */
1032 klist->delkey = loop; 1031 klist->delkey = loop;
1033 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); 1032 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
1034 1033
1035 ret = 0; 1034 ret = 0;
1036 1035
1037 error: 1036 error:
1038 return ret; 1037 return ret;
1039 nomem: 1038 nomem:
1040 ret = -ENOMEM; 1039 ret = -ENOMEM;
1041 up_write(&keyring->sem); 1040 up_write(&keyring->sem);
1042 goto error; 1041 goto error;
1043 } 1042 }
1044 EXPORT_SYMBOL(key_unlink); 1043 EXPORT_SYMBOL(key_unlink);
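
Similarly, a minimal, assumption-laden sketch of the unlink side; only the first link to the key is removed, and -ENOENT simply means there was no such link to begin with.

/* Illustrative sketch only. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/key.h>

static void example_remove_from_keyring(struct key *keyring, struct key *key)
{
	int ret = key_unlink(keyring, key);

	if (ret && ret != -ENOENT)
		pr_warn("key_unlink failed: %d\n", ret);
}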
1045 1044
1046 /* 1045 /*
1047 * Dispose of a keyring list after the RCU grace period, releasing the keys it 1046 * Dispose of a keyring list after the RCU grace period, releasing the keys it
1048 * links to. 1047 * links to.
1049 */ 1048 */
1050 static void keyring_clear_rcu_disposal(struct rcu_head *rcu) 1049 static void keyring_clear_rcu_disposal(struct rcu_head *rcu)
1051 { 1050 {
1052 struct keyring_list *klist; 1051 struct keyring_list *klist;
1053 int loop; 1052 int loop;
1054 1053
1055 klist = container_of(rcu, struct keyring_list, rcu); 1054 klist = container_of(rcu, struct keyring_list, rcu);
1056 1055
1057 for (loop = klist->nkeys - 1; loop >= 0; loop--) 1056 for (loop = klist->nkeys - 1; loop >= 0; loop--)
1058 key_put(klist->keys[loop]); 1057 key_put(klist->keys[loop]);
1059 1058
1060 kfree(klist); 1059 kfree(klist);
1061 } 1060 }
1062 1061
1063 /** 1062 /**
1064 * keyring_clear - Clear a keyring 1063 * keyring_clear - Clear a keyring
1065 * @keyring: The keyring to clear. 1064 * @keyring: The keyring to clear.
1066 * 1065 *
1067 * Clear the contents of the specified keyring. 1066 * Clear the contents of the specified keyring.
1068 * 1067 *
1069 * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. 1068 * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring.
1070 */ 1069 */
1071 int keyring_clear(struct key *keyring) 1070 int keyring_clear(struct key *keyring)
1072 { 1071 {
1073 struct keyring_list *klist; 1072 struct keyring_list *klist;
1074 int ret; 1073 int ret;
1075 1074
1076 ret = -ENOTDIR; 1075 ret = -ENOTDIR;
1077 if (keyring->type == &key_type_keyring) { 1076 if (keyring->type == &key_type_keyring) {
1078 /* detach the pointer block with the locks held */ 1077 /* detach the pointer block with the locks held */
1079 down_write(&keyring->sem); 1078 down_write(&keyring->sem);
1080 1079
1081 klist = rcu_dereference_locked_keyring(keyring); 1080 klist = rcu_dereference_locked_keyring(keyring);
1082 if (klist) { 1081 if (klist) {
1083 /* adjust the quota */ 1082 /* adjust the quota */
1084 key_payload_reserve(keyring, 1083 key_payload_reserve(keyring,
1085 sizeof(struct keyring_list)); 1084 sizeof(struct keyring_list));
1086 1085
1087 rcu_assign_pointer(keyring->payload.subscriptions, 1086 rcu_assign_pointer(keyring->payload.subscriptions,
1088 NULL); 1087 NULL);
1089 } 1088 }
1090 1089
1091 up_write(&keyring->sem); 1090 up_write(&keyring->sem);
1092 1091
1093 /* free the keys after the locks have been dropped */ 1092 /* free the keys after the locks have been dropped */
1094 if (klist) 1093 if (klist)
1095 call_rcu(&klist->rcu, keyring_clear_rcu_disposal); 1094 call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
1096 1095
1097 ret = 0; 1096 ret = 0;
1098 } 1097 }
1099 1098
1100 return ret; 1099 return ret;
1101 } 1100 }
1102 EXPORT_SYMBOL(keyring_clear); 1101 EXPORT_SYMBOL(keyring_clear);
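
Finally, a hedged sketch of clearing a keyring; as the code above shows, the detached key list (and the keys it pinned) is only released after the RCU grace period via keyring_clear_rcu_disposal(). The helper name is an assumption.

/* Illustrative sketch only. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/key.h>

static int example_empty_keyring(struct key *keyring)
{
	int ret = keyring_clear(keyring);

	if (ret == -ENOTDIR)
		pr_warn("key %d is not a keyring\n", key_serial(keyring));
	return ret;
}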
1103 1102
1104 /* 1103 /*
1105 * Dispose of the links from a revoked keyring. 1104 * Dispose of the links from a revoked keyring.
1106 * 1105 *
1107 * This is called with the key sem write-locked. 1106 * This is called with the key sem write-locked.
1108 */ 1107 */
1109 static void keyring_revoke(struct key *keyring) 1108 static void keyring_revoke(struct key *keyring)
1110 { 1109 {
1111 struct keyring_list *klist; 1110 struct keyring_list *klist;
1112 1111
1113 klist = rcu_dereference_locked_keyring(keyring); 1112 klist = rcu_dereference_locked_keyring(keyring);
1114 1113
1115 /* adjust the quota */ 1114 /* adjust the quota */
1116 key_payload_reserve(keyring, 0); 1115 key_payload_reserve(keyring, 0);
1117 1116
1118 if (klist) { 1117 if (klist) {
1119 rcu_assign_pointer(keyring->payload.subscriptions, NULL); 1118 rcu_assign_pointer(keyring->payload.subscriptions, NULL);
1120 call_rcu(&klist->rcu, keyring_clear_rcu_disposal); 1119 call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
1121 } 1120 }
1122 } 1121 }
1123 1122
1124 /* 1123 /*
1125 * Determine whether a key is dead. 1124 * Determine whether a key is dead.
1126 */ 1125 */
1127 static bool key_is_dead(struct key *key, time_t limit) 1126 static bool key_is_dead(struct key *key, time_t limit)
1128 { 1127 {
1129 return test_bit(KEY_FLAG_DEAD, &key->flags) || 1128 return test_bit(KEY_FLAG_DEAD, &key->flags) ||
1130 (key->expiry > 0 && key->expiry <= limit); 1129 (key->expiry > 0 && key->expiry <= limit);
1131 } 1130 }
1132 1131
1133 /* 1132 /*
1134 * Collect garbage from the contents of a keyring, replacing the old list with 1133 * Collect garbage from the contents of a keyring, replacing the old list with
1135 * a new one with the pointers all shuffled down. 1134 * a new one with the pointers all shuffled down.
1136 * 1135 *
1137 * Dead keys are classed as ones that are flagged as being dead or are revoked, 1136 * Dead keys are classed as ones that are flagged as being dead or are revoked,
1138 * expired or negative keys that were revoked or expired before the specified 1137 * expired or negative keys that were revoked or expired before the specified
1139 * limit. 1138 * limit.
1140 */ 1139 */
1141 void keyring_gc(struct key *keyring, time_t limit) 1140 void keyring_gc(struct key *keyring, time_t limit)
1142 { 1141 {
1143 struct keyring_list *klist, *new; 1142 struct keyring_list *klist, *new;
1144 struct key *key; 1143 struct key *key;
1145 int loop, keep, max; 1144 int loop, keep, max;
1146 1145
1147 kenter("{%x,%s}", key_serial(keyring), keyring->description); 1146 kenter("{%x,%s}", key_serial(keyring), keyring->description);
1148 1147
1149 down_write(&keyring->sem); 1148 down_write(&keyring->sem);
1150 1149
1151 klist = rcu_dereference_locked_keyring(keyring); 1150 klist = rcu_dereference_locked_keyring(keyring);
1152 if (!klist) 1151 if (!klist)
1153 goto no_klist; 1152 goto no_klist;
1154 1153
1155 /* work out how many subscriptions we're keeping */ 1154 /* work out how many subscriptions we're keeping */
1156 keep = 0; 1155 keep = 0;
1157 for (loop = klist->nkeys - 1; loop >= 0; loop--) 1156 for (loop = klist->nkeys - 1; loop >= 0; loop--)
1158 if (!key_is_dead(klist->keys[loop], limit)) 1157 if (!key_is_dead(klist->keys[loop], limit))
1159 keep++; 1158 keep++;
1160 1159
1161 if (keep == klist->nkeys) 1160 if (keep == klist->nkeys)
1162 goto just_return; 1161 goto just_return;
1163 1162
1164 /* allocate a new keyring payload */ 1163 /* allocate a new keyring payload */
1165 max = roundup(keep, 4); 1164 max = roundup(keep, 4);
1166 new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *), 1165 new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *),
1167 GFP_KERNEL); 1166 GFP_KERNEL);
1168 if (!new) 1167 if (!new)
1169 goto nomem; 1168 goto nomem;
1170 new->maxkeys = max; 1169 new->maxkeys = max;
1171 new->nkeys = 0; 1170 new->nkeys = 0;
1172 new->delkey = 0; 1171 new->delkey = 0;
1173 1172
1174 /* install the live keys 1173 /* install the live keys
1175 * - must take care as expired keys may be updated back to life 1174 * - must take care as expired keys may be updated back to life
1176 */ 1175 */
1177 keep = 0; 1176 keep = 0;
1178 for (loop = klist->nkeys - 1; loop >= 0; loop--) { 1177 for (loop = klist->nkeys - 1; loop >= 0; loop--) {
1179 key = klist->keys[loop]; 1178 key = klist->keys[loop];
1180 if (!key_is_dead(key, limit)) { 1179 if (!key_is_dead(key, limit)) {
1181 if (keep >= max) 1180 if (keep >= max)
1182 goto discard_new; 1181 goto discard_new;
1183 new->keys[keep++] = key_get(key); 1182 new->keys[keep++] = key_get(key);
1184 } 1183 }
1185 } 1184 }
1186 new->nkeys = keep; 1185 new->nkeys = keep;
1187 1186
1188 /* adjust the quota */ 1187 /* adjust the quota */
1189 key_payload_reserve(keyring, 1188 key_payload_reserve(keyring,
1190 sizeof(struct keyring_list) + 1189 sizeof(struct keyring_list) +
1191 KEYQUOTA_LINK_BYTES * keep); 1190 KEYQUOTA_LINK_BYTES * keep);
1192 1191
1193 if (keep == 0) { 1192 if (keep == 0) {
1194 rcu_assign_pointer(keyring->payload.subscriptions, NULL); 1193 rcu_assign_pointer(keyring->payload.subscriptions, NULL);
1195 kfree(new); 1194 kfree(new);
1196 } else { 1195 } else {
1197 rcu_assign_pointer(keyring->payload.subscriptions, new); 1196 rcu_assign_pointer(keyring->payload.subscriptions, new);
1198 } 1197 }
1199 1198
1200 up_write(&keyring->sem); 1199 up_write(&keyring->sem);
1201 1200
1202 call_rcu(&klist->rcu, keyring_clear_rcu_disposal); 1201 call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
1203 kleave(" [yes]"); 1202 kleave(" [yes]");
1204 return; 1203 return;
1205 1204
1206 discard_new: 1205 discard_new:
1207 new->nkeys = keep; 1206 new->nkeys = keep;
1208 keyring_clear_rcu_disposal(&new->rcu); 1207 keyring_clear_rcu_disposal(&new->rcu);
1209 up_write(&keyring->sem); 1208 up_write(&keyring->sem);
1210 kleave(" [discard]"); 1209 kleave(" [discard]");
1211 return; 1210 return;
1212 1211
1213 just_return: 1212 just_return:
1214 up_write(&keyring->sem); 1213 up_write(&keyring->sem);
1215 kleave(" [no dead]"); 1214 kleave(" [no dead]");
1216 return; 1215 return;
1217 1216
1218 no_klist: 1217 no_klist:
1219 up_write(&keyring->sem); 1218 up_write(&keyring->sem);
1220 kleave(" [no_klist]"); 1219 kleave(" [no_klist]");
1221 return; 1220 return;
1222 1221
1223 nomem: 1222 nomem:
1224 up_write(&keyring->sem); 1223 up_write(&keyring->sem);
1225 kleave(" [oom]"); 1224 kleave(" [oom]");
1226 } 1225 }
1227 1226