Commit 8207649c41bf5c28a987be47d66545fa9d2994d8

Authored by Linus Torvalds

Merge branch 'akpm' (fixes from Andrew Morton)

Merge fixes from Andrew Morton:
 "9 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: softdirty: keep bit when zapping file pte
  fs/cachefiles: add missing \n to kerror conversions
  genalloc: fix device node resource counter
  drivers/rtc/rtc-efi.c: add missing module alias
  mm, slab: initialize object alignment on cache creation
  mm: softdirty: addresses before VMAs in PTE holes aren't softdirty
  ocfs2/dlm: do not get resource spinlock if lockres is new
  nilfs2: fix data loss with mmap()
  ocfs2: free vol_label in ocfs2_delete_osb()

Showing 14 changed files. Lines removed by the patches are prefixed with "-" and lines added with "+"; unchanged context is shown unmarked.

drivers/rtc/rtc-efi.c
/*
 * rtc-efi: RTC Class Driver for EFI-based systems
 *
 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
 *
 * Author: dann frazier <dannf@hp.com>
 * Based on efirtc.c by Stephane Eranian
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/time.h>
#include <linux/platform_device.h>
#include <linux/rtc.h>
#include <linux/efi.h>

#define EFI_ISDST (EFI_TIME_ADJUST_DAYLIGHT|EFI_TIME_IN_DAYLIGHT)
/*
 * EFI Epoch is 1/1/1998
 */
#define EFI_RTC_EPOCH 1998

/*
 * returns day of the year [0-365]
 */
static inline int
compute_yday(efi_time_t *eft)
{
	/* efi_time_t.month is in the [1-12] so, we need -1 */
	return rtc_year_days(eft->day, eft->month - 1, eft->year);
}
/*
 * returns day of the week [0-6] 0=Sunday
 *
 * Don't try to provide a year that's before 1998, please !
 */
static int
compute_wday(efi_time_t *eft)
{
	int y;
	int ndays = 0;

	if (eft->year < EFI_RTC_EPOCH) {
		pr_err("EFI year < " __stringify(EFI_RTC_EPOCH) ", invalid date\n");
		return -1;
	}

	for (y = EFI_RTC_EPOCH; y < eft->year; y++)
		ndays += 365 + (is_leap_year(y) ? 1 : 0);

	ndays += compute_yday(eft);

	/*
	 * 4=1/1/1998 was a Thursday
	 */
	return (ndays + 4) % 7;
}

static void
convert_to_efi_time(struct rtc_time *wtime, efi_time_t *eft)
{
	eft->year = wtime->tm_year + 1900;
	eft->month = wtime->tm_mon + 1;
	eft->day = wtime->tm_mday;
	eft->hour = wtime->tm_hour;
	eft->minute = wtime->tm_min;
	eft->second = wtime->tm_sec;
	eft->nanosecond = 0;
	eft->daylight = wtime->tm_isdst ? EFI_ISDST : 0;
	eft->timezone = EFI_UNSPECIFIED_TIMEZONE;
}

static bool
convert_from_efi_time(efi_time_t *eft, struct rtc_time *wtime)
{
	memset(wtime, 0, sizeof(*wtime));

	if (eft->second >= 60)
		return false;
	wtime->tm_sec = eft->second;

	if (eft->minute >= 60)
		return false;
	wtime->tm_min = eft->minute;

	if (eft->hour >= 24)
		return false;
	wtime->tm_hour = eft->hour;

	if (!eft->day || eft->day > 31)
		return false;
	wtime->tm_mday = eft->day;

	if (!eft->month || eft->month > 12)
		return false;
	wtime->tm_mon = eft->month - 1;
	wtime->tm_year = eft->year - 1900;

	/* day of the week [0-6], Sunday=0 */
	wtime->tm_wday = compute_wday(eft);
	if (wtime->tm_wday < 0)
		return false;

	/* day in the year [1-365]*/
	wtime->tm_yday = compute_yday(eft);


	switch (eft->daylight & EFI_ISDST) {
	case EFI_ISDST:
		wtime->tm_isdst = 1;
		break;
	case EFI_TIME_ADJUST_DAYLIGHT:
		wtime->tm_isdst = 0;
		break;
	default:
		wtime->tm_isdst = -1;
	}

	return true;
}

static int efi_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
{
	efi_time_t eft;
	efi_status_t status;

	/*
	 * As of EFI v1.10, this call always returns an unsupported status
	 */
	status = efi.get_wakeup_time((efi_bool_t *)&wkalrm->enabled,
				     (efi_bool_t *)&wkalrm->pending, &eft);

	if (status != EFI_SUCCESS)
		return -EINVAL;

	if (!convert_from_efi_time(&eft, &wkalrm->time))
		return -EIO;

	return rtc_valid_tm(&wkalrm->time);
}

static int efi_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
{
	efi_time_t eft;
	efi_status_t status;

	convert_to_efi_time(&wkalrm->time, &eft);

	/*
	 * XXX Fixme:
	 * As of EFI 0.92 with the firmware I have on my
	 * machine this call does not seem to work quite
	 * right
	 *
	 * As of v1.10, this call always returns an unsupported status
	 */
	status = efi.set_wakeup_time((efi_bool_t)wkalrm->enabled, &eft);

	dev_warn(dev, "write status is %d\n", (int)status);

	return status == EFI_SUCCESS ? 0 : -EINVAL;
}

static int efi_read_time(struct device *dev, struct rtc_time *tm)
{
	efi_status_t status;
	efi_time_t eft;
	efi_time_cap_t cap;

	status = efi.get_time(&eft, &cap);

	if (status != EFI_SUCCESS) {
		/* should never happen */
		dev_err(dev, "can't read time\n");
		return -EINVAL;
	}

	if (!convert_from_efi_time(&eft, tm))
		return -EIO;

	return rtc_valid_tm(tm);
}

static int efi_set_time(struct device *dev, struct rtc_time *tm)
{
	efi_status_t status;
	efi_time_t eft;

	convert_to_efi_time(tm, &eft);

	status = efi.set_time(&eft);

	return status == EFI_SUCCESS ? 0 : -EINVAL;
}

static const struct rtc_class_ops efi_rtc_ops = {
	.read_time = efi_read_time,
	.set_time = efi_set_time,
	.read_alarm = efi_read_alarm,
	.set_alarm = efi_set_alarm,
};

static int __init efi_rtc_probe(struct platform_device *dev)
{
	struct rtc_device *rtc;

	rtc = devm_rtc_device_register(&dev->dev, "rtc-efi", &efi_rtc_ops,
				       THIS_MODULE);
	if (IS_ERR(rtc))
		return PTR_ERR(rtc);

	platform_set_drvdata(dev, rtc);

	return 0;
}

static struct platform_driver efi_rtc_driver = {
	.driver = {
		.name = "rtc-efi",
		.owner = THIS_MODULE,
	},
};

module_platform_driver_probe(efi_rtc_driver, efi_rtc_probe);

+MODULE_ALIAS("platform:rtc-efi");
MODULE_AUTHOR("dann frazier <dannf@hp.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("EFI RTC driver");
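The only change above is the added MODULE_ALIAS() line. When rtc-efi is built as a module, the platform bus describes its device to userspace as MODALIAS=platform:rtc-efi; without a matching alias compiled into the module, modprobe cannot resolve that string and the driver is never loaded automatically. The sketch below shows the device side of that match under stated assumptions: the actual registration site for the "rtc-efi" platform device is not part of this diff, and the function name is illustrative only.

#include <linux/init.h>
#include <linux/efi.h>
#include <linux/platform_device.h>

/*
 * Illustrative only: a platform device whose name, "rtc-efi", is what the
 * platform bus turns into the MODALIAS=platform:rtc-efi uevent that the new
 * MODULE_ALIAS("platform:rtc-efi") line satisfies.
 */
static struct platform_device rtc_efi_dev = {
	.name = "rtc-efi",
	.id = -1,
};

static int __init rtc_efi_dev_init(void)
{
	/* Only meaningful when EFI runtime services are available. */
	if (!efi_enabled(EFI_RUNTIME_SERVICES))
		return -ENODEV;
	return platform_device_register(&rtc_efi_dev);
}
device_initcall(rtc_efi_dev_init);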
fs/cachefiles/bind.c
/* Bind and unbind a cache from the filesystem backing it
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public Licence
 * as published by the Free Software Foundation; either version
 * 2 of the Licence, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/statfs.h>
#include <linux/ctype.h>
#include "internal.h"

static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);

/*
 * bind a directory as a cache
 */
int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
{
	_enter("{%u,%u,%u,%u,%u,%u},%s",
	       cache->frun_percent,
	       cache->fcull_percent,
	       cache->fstop_percent,
	       cache->brun_percent,
	       cache->bcull_percent,
	       cache->bstop_percent,
	       args);

	/* start by checking things over */
	ASSERT(cache->fstop_percent >= 0 &&
	       cache->fstop_percent < cache->fcull_percent &&
	       cache->fcull_percent < cache->frun_percent &&
	       cache->frun_percent < 100);

	ASSERT(cache->bstop_percent >= 0 &&
	       cache->bstop_percent < cache->bcull_percent &&
	       cache->bcull_percent < cache->brun_percent &&
	       cache->brun_percent < 100);

	if (*args) {
-		pr_err("'bind' command doesn't take an argument");
+		pr_err("'bind' command doesn't take an argument\n");
		return -EINVAL;
	}

	if (!cache->rootdirname) {
-		pr_err("No cache directory specified");
+		pr_err("No cache directory specified\n");
		return -EINVAL;
	}

	/* don't permit already bound caches to be re-bound */
	if (test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("Cache already bound");
+		pr_err("Cache already bound\n");
		return -EBUSY;
	}

	/* make sure we have copies of the tag and dirname strings */
	if (!cache->tag) {
		/* the tag string is released by the fops->release()
		 * function, so we don't release it on error here */
		cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
		if (!cache->tag)
			return -ENOMEM;
	}

	/* add the cache */
	return cachefiles_daemon_add_cache(cache);
}

/*
 * add a cache
 */
static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
{
	struct cachefiles_object *fsdef;
	struct path path;
	struct kstatfs stats;
	struct dentry *graveyard, *cachedir, *root;
	const struct cred *saved_cred;
	int ret;

	_enter("");

	/* we want to work under the module's security ID */
	ret = cachefiles_get_security_ID(cache);
	if (ret < 0)
		return ret;

	cachefiles_begin_secure(cache, &saved_cred);

	/* allocate the root index object */
	ret = -ENOMEM;

	fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
	if (!fsdef)
		goto error_root_object;

	ASSERTCMP(fsdef->backer, ==, NULL);

	atomic_set(&fsdef->usage, 1);
	fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;

	_debug("- fsdef %p", fsdef);

	/* look up the directory at the root of the cache */
	ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
	if (ret < 0)
		goto error_open_root;

	cache->mnt = path.mnt;
	root = path.dentry;

	/* check parameters */
	ret = -EOPNOTSUPP;
	if (!root->d_inode ||
	    !root->d_inode->i_op->lookup ||
	    !root->d_inode->i_op->mkdir ||
	    !root->d_inode->i_op->setxattr ||
	    !root->d_inode->i_op->getxattr ||
	    !root->d_sb->s_op->statfs ||
	    !root->d_sb->s_op->sync_fs)
		goto error_unsupported;

	ret = -EROFS;
	if (root->d_sb->s_flags & MS_RDONLY)
		goto error_unsupported;

	/* determine the security of the on-disk cache as this governs
	 * security ID of files we create */
	ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
	if (ret < 0)
		goto error_unsupported;

	/* get the cache size and blocksize */
	ret = vfs_statfs(&path, &stats);
	if (ret < 0)
		goto error_unsupported;

	ret = -ERANGE;
	if (stats.f_bsize <= 0)
		goto error_unsupported;

	ret = -EOPNOTSUPP;
	if (stats.f_bsize > PAGE_SIZE)
		goto error_unsupported;

	cache->bsize = stats.f_bsize;
	cache->bshift = 0;
	if (stats.f_bsize < PAGE_SIZE)
		cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);

	_debug("blksize %u (shift %u)",
	       cache->bsize, cache->bshift);

	_debug("size %llu, avail %llu",
	       (unsigned long long) stats.f_blocks,
	       (unsigned long long) stats.f_bavail);

	/* set up caching limits */
	do_div(stats.f_files, 100);
	cache->fstop = stats.f_files * cache->fstop_percent;
	cache->fcull = stats.f_files * cache->fcull_percent;
	cache->frun = stats.f_files * cache->frun_percent;

	_debug("limits {%llu,%llu,%llu} files",
	       (unsigned long long) cache->frun,
	       (unsigned long long) cache->fcull,
	       (unsigned long long) cache->fstop);

	stats.f_blocks >>= cache->bshift;
	do_div(stats.f_blocks, 100);
	cache->bstop = stats.f_blocks * cache->bstop_percent;
	cache->bcull = stats.f_blocks * cache->bcull_percent;
	cache->brun = stats.f_blocks * cache->brun_percent;

	_debug("limits {%llu,%llu,%llu} blocks",
	       (unsigned long long) cache->brun,
	       (unsigned long long) cache->bcull,
	       (unsigned long long) cache->bstop);

	/* get the cache directory and check its type */
	cachedir = cachefiles_get_directory(cache, root, "cache");
	if (IS_ERR(cachedir)) {
		ret = PTR_ERR(cachedir);
		goto error_unsupported;
	}

	fsdef->dentry = cachedir;
	fsdef->fscache.cookie = NULL;

	ret = cachefiles_check_object_type(fsdef);
	if (ret < 0)
		goto error_unsupported;

	/* get the graveyard directory */
	graveyard = cachefiles_get_directory(cache, root, "graveyard");
	if (IS_ERR(graveyard)) {
		ret = PTR_ERR(graveyard);
		goto error_unsupported;
	}

	cache->graveyard = graveyard;

	/* publish the cache */
	fscache_init_cache(&cache->cache,
			   &cachefiles_cache_ops,
			   "%s",
			   fsdef->dentry->d_sb->s_id);

	fscache_object_init(&fsdef->fscache, NULL, &cache->cache);

	ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
	if (ret < 0)
		goto error_add_cache;

	/* done */
	set_bit(CACHEFILES_READY, &cache->flags);
	dput(root);

	pr_info("File cache on %s registered\n", cache->cache.identifier);

	/* check how much space the cache has */
	cachefiles_has_space(cache, 0, 0);
	cachefiles_end_secure(cache, saved_cred);
	return 0;

error_add_cache:
	dput(cache->graveyard);
	cache->graveyard = NULL;
error_unsupported:
	mntput(cache->mnt);
	cache->mnt = NULL;
	dput(fsdef->dentry);
	fsdef->dentry = NULL;
	dput(root);
error_open_root:
	kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
	cachefiles_end_secure(cache, saved_cred);
-	pr_err("Failed to register: %d", ret);
+	pr_err("Failed to register: %d\n", ret);
	return ret;
}

/*
 * unbind a cache on fd release
 */
void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
{
	_enter("");

	if (test_bit(CACHEFILES_READY, &cache->flags)) {
		pr_info("File cache on %s unregistering\n",
			cache->cache.identifier);

		fscache_withdraw_cache(&cache->cache);
	}

	dput(cache->graveyard);
	mntput(cache->mnt);

	kfree(cache->rootdirname);
	kfree(cache->secctx);
	kfree(cache->tag);

	_leave("");
}
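Every change in this file, and in daemon.c below, restores the trailing "\n" that was dropped when the subsystem's old kerror() wrapper (which appended its own newline) was converted to pr_err(). Without the newline, printk does not terminate the log line and the next message can end up appended to it. A minimal sketch of the convention these fixes restore, assuming the "CacheFiles: " prefix that the subsystem sets via pr_fmt() in its internal.h; the helper name below is made up for illustration:

/* Assumed prefix; in the real code it comes from "internal.h". */
#define pr_fmt(fmt) "CacheFiles: " fmt

#include <linux/printk.h>

static void cachefiles_report_bind_error(void)
{
	/*
	 * Logs "CacheFiles: Cache already bound" as one complete line;
	 * the explicit \n is what these patches put back.
	 */
	pr_err("Cache already bound\n");
}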
fs/cachefiles/daemon.c
/* Daemon interface
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public Licence
 * as published by the Free Software Foundation; either version
 * 2 of the Licence, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/mount.h>
#include <linux/statfs.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/fs_struct.h>
#include "internal.h"

static int cachefiles_daemon_open(struct inode *, struct file *);
static int cachefiles_daemon_release(struct inode *, struct file *);
static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
				      loff_t *);
static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
				       size_t, loff_t *);
static unsigned int cachefiles_daemon_poll(struct file *,
					   struct poll_table_struct *);
static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);

static unsigned long cachefiles_open;

const struct file_operations cachefiles_daemon_fops = {
	.owner = THIS_MODULE,
	.open = cachefiles_daemon_open,
	.release = cachefiles_daemon_release,
	.read = cachefiles_daemon_read,
	.write = cachefiles_daemon_write,
	.poll = cachefiles_daemon_poll,
	.llseek = noop_llseek,
};

struct cachefiles_daemon_cmd {
	char name[8];
	int (*handler)(struct cachefiles_cache *cache, char *args);
};

static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
	{ "bind", cachefiles_daemon_bind },
	{ "brun", cachefiles_daemon_brun },
	{ "bcull", cachefiles_daemon_bcull },
	{ "bstop", cachefiles_daemon_bstop },
	{ "cull", cachefiles_daemon_cull },
	{ "debug", cachefiles_daemon_debug },
	{ "dir", cachefiles_daemon_dir },
	{ "frun", cachefiles_daemon_frun },
	{ "fcull", cachefiles_daemon_fcull },
	{ "fstop", cachefiles_daemon_fstop },
	{ "inuse", cachefiles_daemon_inuse },
	{ "secctx", cachefiles_daemon_secctx },
	{ "tag", cachefiles_daemon_tag },
	{ "", NULL }
};


/*
 * do various checks
 */
static int cachefiles_daemon_open(struct inode *inode, struct file *file)
{
	struct cachefiles_cache *cache;

	_enter("");

	/* only the superuser may do this */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* the cachefiles device may only be open once at a time */
	if (xchg(&cachefiles_open, 1) == 1)
		return -EBUSY;

	/* allocate a cache record */
	cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
	if (!cache) {
		cachefiles_open = 0;
		return -ENOMEM;
	}

	mutex_init(&cache->daemon_mutex);
	cache->active_nodes = RB_ROOT;
	rwlock_init(&cache->active_lock);
	init_waitqueue_head(&cache->daemon_pollwq);

	/* set default caching limits
	 * - limit at 1% free space and/or free files
	 * - cull below 5% free space and/or free files
	 * - cease culling above 7% free space and/or free files
	 */
	cache->frun_percent = 7;
	cache->fcull_percent = 5;
	cache->fstop_percent = 1;
	cache->brun_percent = 7;
	cache->bcull_percent = 5;
	cache->bstop_percent = 1;

	file->private_data = cache;
	cache->cachefilesd = file;
	return 0;
}

/*
 * release a cache
 */
static int cachefiles_daemon_release(struct inode *inode, struct file *file)
{
	struct cachefiles_cache *cache = file->private_data;

	_enter("");

	ASSERT(cache);

	set_bit(CACHEFILES_DEAD, &cache->flags);

	cachefiles_daemon_unbind(cache);

	ASSERT(!cache->active_nodes.rb_node);

	/* clean up the control file interface */
	cache->cachefilesd = NULL;
	file->private_data = NULL;
	cachefiles_open = 0;

	kfree(cache);

	_leave("");
	return 0;
}

/*
 * read the cache state
 */
static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
				      size_t buflen, loff_t *pos)
{
	struct cachefiles_cache *cache = file->private_data;
	char buffer[256];
	int n;

	//_enter(",,%zu,", buflen);

	if (!test_bit(CACHEFILES_READY, &cache->flags))
		return 0;

	/* check how much space the cache has */
	cachefiles_has_space(cache, 0, 0);

	/* summarise */
	clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);

	n = snprintf(buffer, sizeof(buffer),
		     "cull=%c"
		     " frun=%llx"
		     " fcull=%llx"
		     " fstop=%llx"
		     " brun=%llx"
		     " bcull=%llx"
		     " bstop=%llx",
		     test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
		     (unsigned long long) cache->frun,
		     (unsigned long long) cache->fcull,
		     (unsigned long long) cache->fstop,
		     (unsigned long long) cache->brun,
		     (unsigned long long) cache->bcull,
		     (unsigned long long) cache->bstop
		     );

	if (n > buflen)
		return -EMSGSIZE;

	if (copy_to_user(_buffer, buffer, n) != 0)
		return -EFAULT;

	return n;
}

/*
 * command the cache
 */
static ssize_t cachefiles_daemon_write(struct file *file,
				       const char __user *_data,
				       size_t datalen,
				       loff_t *pos)
{
	const struct cachefiles_daemon_cmd *cmd;
	struct cachefiles_cache *cache = file->private_data;
	ssize_t ret;
	char *data, *args, *cp;

	//_enter(",,%zu,", datalen);

	ASSERT(cache);

	if (test_bit(CACHEFILES_DEAD, &cache->flags))
		return -EIO;

	if (datalen < 0 || datalen > PAGE_SIZE - 1)
		return -EOPNOTSUPP;

	/* drag the command string into the kernel so we can parse it */
	data = kmalloc(datalen + 1, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(data, _data, datalen) != 0)
		goto error;

	data[datalen] = '\0';

	ret = -EINVAL;
	if (memchr(data, '\0', datalen))
		goto error;

	/* strip any newline */
	cp = memchr(data, '\n', datalen);
	if (cp) {
		if (cp == data)
			goto error;

		*cp = '\0';
	}

	/* parse the command */
	ret = -EOPNOTSUPP;

	for (args = data; *args; args++)
		if (isspace(*args))
			break;
	if (*args) {
		if (args == data)
			goto error;
		*args = '\0';
		args = skip_spaces(++args);
	}

	/* run the appropriate command handler */
	for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
		if (strcmp(cmd->name, data) == 0)
			goto found_command;

error:
	kfree(data);
	//_leave(" = %zd", ret);
	return ret;

found_command:
	mutex_lock(&cache->daemon_mutex);

	ret = -EIO;
	if (!test_bit(CACHEFILES_DEAD, &cache->flags))
		ret = cmd->handler(cache, args);

	mutex_unlock(&cache->daemon_mutex);

	if (ret == 0)
		ret = datalen;
	goto error;
}

/*
 * poll for culling state
 * - use POLLOUT to indicate culling state
 */
static unsigned int cachefiles_daemon_poll(struct file *file,
					   struct poll_table_struct *poll)
{
	struct cachefiles_cache *cache = file->private_data;
	unsigned int mask;

	poll_wait(file, &cache->daemon_pollwq, poll);
	mask = 0;

	if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
		mask |= POLLIN;

	if (test_bit(CACHEFILES_CULLING, &cache->flags))
		mask |= POLLOUT;

	return mask;
}

/*
 * give a range error for cache space constraints
 * - can be tail-called
 */
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
					 char *args)
{
-	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
+	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");

	return -EINVAL;
}

/*
 * set the percentage of files at which to stop culling
 * - command: "frun <N>%"
 */
static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
{
	unsigned long frun;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	frun = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (frun <= cache->fcull_percent || frun >= 100)
		return cachefiles_daemon_range_error(cache, args);

	cache->frun_percent = frun;
	return 0;
}

/*
 * set the percentage of files at which to start culling
 * - command: "fcull <N>%"
 */
static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
{
	unsigned long fcull;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	fcull = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->fcull_percent = fcull;
	return 0;
}

/*
 * set the percentage of files at which to stop allocating
 * - command: "fstop <N>%"
 */
static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
{
	unsigned long fstop;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	fstop = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (fstop < 0 || fstop >= cache->fcull_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->fstop_percent = fstop;
	return 0;
}

/*
 * set the percentage of blocks at which to stop culling
 * - command: "brun <N>%"
 */
static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
{
	unsigned long brun;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	brun = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (brun <= cache->bcull_percent || brun >= 100)
		return cachefiles_daemon_range_error(cache, args);

	cache->brun_percent = brun;
	return 0;
}

/*
 * set the percentage of blocks at which to start culling
 * - command: "bcull <N>%"
 */
static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
{
	unsigned long bcull;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	bcull = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->bcull_percent = bcull;
	return 0;
}

/*
 * set the percentage of blocks at which to stop allocating
 * - command: "bstop <N>%"
 */
static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
{
	unsigned long bstop;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	bstop = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (bstop < 0 || bstop >= cache->bcull_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->bstop_percent = bstop;
	return 0;
}

/*
 * set the cache directory
 * - command: "dir <name>"
 */
static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
{
	char *dir;

	_enter(",%s", args);

	if (!*args) {
-		pr_err("Empty directory specified");
+		pr_err("Empty directory specified\n");
		return -EINVAL;
	}

	if (cache->rootdirname) {
-		pr_err("Second cache directory specified");
+		pr_err("Second cache directory specified\n");
		return -EEXIST;
	}

	dir = kstrdup(args, GFP_KERNEL);
	if (!dir)
		return -ENOMEM;

	cache->rootdirname = dir;
	return 0;
}

/*
 * set the cache security context
 * - command: "secctx <ctx>"
 */
static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
{
	char *secctx;

	_enter(",%s", args);

	if (!*args) {
-		pr_err("Empty security context specified");
+		pr_err("Empty security context specified\n");
		return -EINVAL;
	}

	if (cache->secctx) {
-		pr_err("Second security context specified");
+		pr_err("Second security context specified\n");
		return -EINVAL;
	}

	secctx = kstrdup(args, GFP_KERNEL);
	if (!secctx)
		return -ENOMEM;

	cache->secctx = secctx;
	return 0;
}

/*
 * set the cache tag
 * - command: "tag <name>"
 */
static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
{
	char *tag;

	_enter(",%s", args);

	if (!*args) {
-		pr_err("Empty tag specified");
+		pr_err("Empty tag specified\n");
		return -EINVAL;
	}

	if (cache->tag)
		return -EEXIST;

	tag = kstrdup(args, GFP_KERNEL);
	if (!tag)
		return -ENOMEM;

	cache->tag = tag;
	return 0;
}

/*
 * request a node in the cache be culled from the current working directory
 * - command: "cull <name>"
 */
static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
{
	struct path path;
	const struct cred *saved_cred;
	int ret;

	_enter(",%s", args);

	if (strchr(args, '/'))
		goto inval;

	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("cull applied to unready cache");
+		pr_err("cull applied to unready cache\n");
		return -EIO;
	}

	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
-		pr_err("cull applied to dead cache");
+		pr_err("cull applied to dead cache\n");
		return -EIO;
	}

	/* extract the directory dentry from the cwd */
	get_fs_pwd(current->fs, &path);

	if (!S_ISDIR(path.dentry->d_inode->i_mode))
		goto notdir;

	cachefiles_begin_secure(cache, &saved_cred);
	ret = cachefiles_cull(cache, path.dentry, args);
	cachefiles_end_secure(cache, saved_cred);

	path_put(&path);
	_leave(" = %d", ret);
	return ret;

notdir:
	path_put(&path);
-	pr_err("cull command requires dirfd to be a directory");
+	pr_err("cull command requires dirfd to be a directory\n");
	return -ENOTDIR;

inval:
-	pr_err("cull command requires dirfd and filename");
+	pr_err("cull command requires dirfd and filename\n");
	return -EINVAL;
}

/*
 * set debugging mode
 * - command: "debug <mask>"
 */
static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
{
	unsigned long mask;

	_enter(",%s", args);

	mask = simple_strtoul(args, &args, 0);
	if (args[0] != '\0')
		goto inval;

	cachefiles_debug = mask;
	_leave(" = 0");
	return 0;

inval:
-	pr_err("debug command requires mask");
+	pr_err("debug command requires mask\n");
	return -EINVAL;
}

/*
 * find out whether an object in the current working directory is in use or not
 * - command: "inuse <name>"
 */
static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
{
	struct path path;
	const struct cred *saved_cred;
	int ret;

	//_enter(",%s", args);

	if (strchr(args, '/'))
		goto inval;

	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("inuse applied to unready cache");
+		pr_err("inuse applied to unready cache\n");
		return -EIO;
	}

	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
-		pr_err("inuse applied to dead cache");
+		pr_err("inuse applied to dead cache\n");
		return -EIO;
	}

	/* extract the directory dentry from the cwd */
	get_fs_pwd(current->fs, &path);

	if (!S_ISDIR(path.dentry->d_inode->i_mode))
		goto notdir;

	cachefiles_begin_secure(cache, &saved_cred);
	ret = cachefiles_check_in_use(cache, path.dentry, args);
	cachefiles_end_secure(cache, saved_cred);

	path_put(&path);
	//_leave(" = %d", ret);
	return ret;

notdir:
	path_put(&path);
662 pr_err("inuse command requires dirfd to be a directory"); 662 pr_err("inuse command requires dirfd to be a directory\n");
663 return -ENOTDIR; 663 return -ENOTDIR;
664 664
665 inval: 665 inval:
666 pr_err("inuse command requires dirfd and filename"); 666 pr_err("inuse command requires dirfd and filename\n");
667 return -EINVAL; 667 return -EINVAL;
668 } 668 }
669 669
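The five command handlers above (secctx, tag, cull, debug, inuse) all follow the same shape: validate a single argument, then update the cache state, with their pr_err() strings now carrying an explicit "\n". They are driven from user space, normally by the cachefilesd daemon, which writes command strings to the cachefiles character device registered in main.c further down. The sketch below is illustrative only: the /dev/cachefiles path follows from that misc device name, but the cache directory, the object name and the one-command-per-write assumption are example choices, not taken from this patch.

/*
 * Illustrative user-space sketch, not part of this patch: how a daemon such
 * as cachefilesd might drive the handlers above.  The cache directory and
 * object name below are invented for the example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int cachefiles_command(int fd, const char *cmd)
{
        if (write(fd, cmd, strlen(cmd)) < 0) {   /* assume one command per write */
                perror(cmd);
                return -1;
        }
        return 0;
}

int main(void)
{
        int fd = open("/dev/cachefiles", O_RDWR);

        if (fd < 0) {
                perror("open /dev/cachefiles");
                return 1;
        }

        /* cull and inuse resolve the name against the writer's cwd */
        if (chdir("/var/cache/fscache/cache") < 0) {   /* example path */
                perror("chdir");
                return 1;
        }

        cachefiles_command(fd, "debug 7");                 /* KENTER|KLEAVE|KDEBUG */
        cachefiles_command(fd, "inuse Es0g00og0_Nd_XCYe"); /* made-up name */
        cachefiles_command(fd, "cull Es0g00og0_Nd_XCYe");

        close(fd);
        return 0;
}

The bare filenames work because, as cachefiles_daemon_cull() and cachefiles_daemon_inuse() show, the target is looked up under the caller's current working directory and any '/' in the argument is rejected outright.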
670 /* 670 /*
671 * see if we have space for a number of pages and/or a number of files in the 671 * see if we have space for a number of pages and/or a number of files in the
672 * cache 672 * cache
673 */ 673 */
674 int cachefiles_has_space(struct cachefiles_cache *cache, 674 int cachefiles_has_space(struct cachefiles_cache *cache,
675 unsigned fnr, unsigned bnr) 675 unsigned fnr, unsigned bnr)
676 { 676 {
677 struct kstatfs stats; 677 struct kstatfs stats;
678 struct path path = { 678 struct path path = {
679 .mnt = cache->mnt, 679 .mnt = cache->mnt,
680 .dentry = cache->mnt->mnt_root, 680 .dentry = cache->mnt->mnt_root,
681 }; 681 };
682 int ret; 682 int ret;
683 683
684 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", 684 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
685 // (unsigned long long) cache->frun, 685 // (unsigned long long) cache->frun,
686 // (unsigned long long) cache->fcull, 686 // (unsigned long long) cache->fcull,
687 // (unsigned long long) cache->fstop, 687 // (unsigned long long) cache->fstop,
688 // (unsigned long long) cache->brun, 688 // (unsigned long long) cache->brun,
689 // (unsigned long long) cache->bcull, 689 // (unsigned long long) cache->bcull,
690 // (unsigned long long) cache->bstop, 690 // (unsigned long long) cache->bstop,
691 // fnr, bnr); 691 // fnr, bnr);
692 692
693 /* find out how many pages of blockdev are available */ 693 /* find out how many pages of blockdev are available */
694 memset(&stats, 0, sizeof(stats)); 694 memset(&stats, 0, sizeof(stats));
695 695
696 ret = vfs_statfs(&path, &stats); 696 ret = vfs_statfs(&path, &stats);
697 if (ret < 0) { 697 if (ret < 0) {
698 if (ret == -EIO) 698 if (ret == -EIO)
699 cachefiles_io_error(cache, "statfs failed"); 699 cachefiles_io_error(cache, "statfs failed");
700 _leave(" = %d", ret); 700 _leave(" = %d", ret);
701 return ret; 701 return ret;
702 } 702 }
703 703
704 stats.f_bavail >>= cache->bshift; 704 stats.f_bavail >>= cache->bshift;
705 705
706 //_debug("avail %llu,%llu", 706 //_debug("avail %llu,%llu",
707 // (unsigned long long) stats.f_ffree, 707 // (unsigned long long) stats.f_ffree,
708 // (unsigned long long) stats.f_bavail); 708 // (unsigned long long) stats.f_bavail);
709 709
710 /* see if there is sufficient space */ 710 /* see if there is sufficient space */
711 if (stats.f_ffree > fnr) 711 if (stats.f_ffree > fnr)
712 stats.f_ffree -= fnr; 712 stats.f_ffree -= fnr;
713 else 713 else
714 stats.f_ffree = 0; 714 stats.f_ffree = 0;
715 715
716 if (stats.f_bavail > bnr) 716 if (stats.f_bavail > bnr)
717 stats.f_bavail -= bnr; 717 stats.f_bavail -= bnr;
718 else 718 else
719 stats.f_bavail = 0; 719 stats.f_bavail = 0;
720 720
721 ret = -ENOBUFS; 721 ret = -ENOBUFS;
722 if (stats.f_ffree < cache->fstop || 722 if (stats.f_ffree < cache->fstop ||
723 stats.f_bavail < cache->bstop) 723 stats.f_bavail < cache->bstop)
724 goto begin_cull; 724 goto begin_cull;
725 725
726 ret = 0; 726 ret = 0;
727 if (stats.f_ffree < cache->fcull || 727 if (stats.f_ffree < cache->fcull ||
728 stats.f_bavail < cache->bcull) 728 stats.f_bavail < cache->bcull)
729 goto begin_cull; 729 goto begin_cull;
730 730
731 if (test_bit(CACHEFILES_CULLING, &cache->flags) && 731 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
732 stats.f_ffree >= cache->frun && 732 stats.f_ffree >= cache->frun &&
733 stats.f_bavail >= cache->brun && 733 stats.f_bavail >= cache->brun &&
734 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) 734 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
735 ) { 735 ) {
736 _debug("cease culling"); 736 _debug("cease culling");
737 cachefiles_state_changed(cache); 737 cachefiles_state_changed(cache);
738 } 738 }
739 739
740 //_leave(" = 0"); 740 //_leave(" = 0");
741 return 0; 741 return 0;
742 742
743 begin_cull: 743 begin_cull:
744 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { 744 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
745 _debug("### CULL CACHE ###"); 745 _debug("### CULL CACHE ###");
746 cachefiles_state_changed(cache); 746 cachefiles_state_changed(cache);
747 } 747 }
748 748
749 _leave(" = %d", ret); 749 _leave(" = %d", ret);
750 return ret; 750 return ret;
751 } 751 }
752 752
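cachefiles_has_space() above applies three pairs of thresholds to the post-allocation free counts obtained from vfs_statfs(): below fstop/bstop the request fails with -ENOBUFS and culling is triggered, below fcull/bcull the request still succeeds but culling is triggered, and once both counts are back at or above frun/brun an engaged cull is ceased. A user-space restatement of that decision for a single resource, with a plain bool standing in for the CACHEFILES_CULLING bit, might look like this (illustration only, not kernel code):

/*
 * Simplified restatement of the decision in cachefiles_has_space() for a
 * single resource (files); blocks are handled the same way.
 */
#include <errno.h>
#include <stdbool.h>

struct space_limits {
        unsigned long long frun;   /* stop culling at or above this */
        unsigned long long fcull;  /* start culling below this */
        unsigned long long fstop;  /* refuse new allocations below this */
};

int has_space(const struct space_limits *lim, bool *culling,
              unsigned long long free_now, unsigned long long wanted)
{
        unsigned long long left = free_now > wanted ? free_now - wanted : 0;

        if (left < lim->fstop) {            /* hard stop: -ENOBUFS, begin culling */
                *culling = true;
                return -ENOBUFS;
        }
        if (left < lim->fcull) {            /* allocation ok, but begin culling */
                *culling = true;
                return 0;
        }
        if (*culling && left >= lim->frun)  /* recovered: cease culling */
                *culling = false;
        return 0;
}

In the kernel function the file and block checks are combined with '||', so either resource crossing its stop or cull threshold is enough to take the corresponding branch, and culling only ceases once both f_ffree >= frun and f_bavail >= brun.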
fs/cachefiles/internal.h
1 /* General netfs cache on cache files internal defs 1 /* General netfs cache on cache files internal defs
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #ifdef pr_fmt 12 #ifdef pr_fmt
13 #undef pr_fmt 13 #undef pr_fmt
14 #endif 14 #endif
15 15
16 #define pr_fmt(fmt) "CacheFiles: " fmt 16 #define pr_fmt(fmt) "CacheFiles: " fmt
17 17
18 18
19 #include <linux/fscache-cache.h> 19 #include <linux/fscache-cache.h>
20 #include <linux/timer.h> 20 #include <linux/timer.h>
21 #include <linux/wait.h> 21 #include <linux/wait.h>
22 #include <linux/workqueue.h> 22 #include <linux/workqueue.h>
23 #include <linux/security.h> 23 #include <linux/security.h>
24 24
25 struct cachefiles_cache; 25 struct cachefiles_cache;
26 struct cachefiles_object; 26 struct cachefiles_object;
27 27
28 extern unsigned cachefiles_debug; 28 extern unsigned cachefiles_debug;
29 #define CACHEFILES_DEBUG_KENTER 1 29 #define CACHEFILES_DEBUG_KENTER 1
30 #define CACHEFILES_DEBUG_KLEAVE 2 30 #define CACHEFILES_DEBUG_KLEAVE 2
31 #define CACHEFILES_DEBUG_KDEBUG 4 31 #define CACHEFILES_DEBUG_KDEBUG 4
32 32
33 #define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) 33 #define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
34 34
35 /* 35 /*
36 * node records 36 * node records
37 */ 37 */
38 struct cachefiles_object { 38 struct cachefiles_object {
39 struct fscache_object fscache; /* fscache handle */ 39 struct fscache_object fscache; /* fscache handle */
40 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ 40 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
41 struct dentry *dentry; /* the file/dir representing this object */ 41 struct dentry *dentry; /* the file/dir representing this object */
42 struct dentry *backer; /* backing file */ 42 struct dentry *backer; /* backing file */
43 loff_t i_size; /* object size */ 43 loff_t i_size; /* object size */
44 unsigned long flags; 44 unsigned long flags;
45 #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ 45 #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
46 #define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ 46 #define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
47 atomic_t usage; /* object usage count */ 47 atomic_t usage; /* object usage count */
48 uint8_t type; /* object type */ 48 uint8_t type; /* object type */
49 uint8_t new; /* T if object new */ 49 uint8_t new; /* T if object new */
50 spinlock_t work_lock; 50 spinlock_t work_lock;
51 struct rb_node active_node; /* link in active tree (dentry is key) */ 51 struct rb_node active_node; /* link in active tree (dentry is key) */
52 }; 52 };
53 53
54 extern struct kmem_cache *cachefiles_object_jar; 54 extern struct kmem_cache *cachefiles_object_jar;
55 55
56 /* 56 /*
57 * Cache files cache definition 57 * Cache files cache definition
58 */ 58 */
59 struct cachefiles_cache { 59 struct cachefiles_cache {
60 struct fscache_cache cache; /* FS-Cache record */ 60 struct fscache_cache cache; /* FS-Cache record */
61 struct vfsmount *mnt; /* mountpoint holding the cache */ 61 struct vfsmount *mnt; /* mountpoint holding the cache */
62 struct dentry *graveyard; /* directory into which dead objects go */ 62 struct dentry *graveyard; /* directory into which dead objects go */
63 struct file *cachefilesd; /* manager daemon handle */ 63 struct file *cachefilesd; /* manager daemon handle */
64 const struct cred *cache_cred; /* security override for accessing cache */ 64 const struct cred *cache_cred; /* security override for accessing cache */
65 struct mutex daemon_mutex; /* command serialisation mutex */ 65 struct mutex daemon_mutex; /* command serialisation mutex */
66 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ 66 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
67 struct rb_root active_nodes; /* active nodes (can't be culled) */ 67 struct rb_root active_nodes; /* active nodes (can't be culled) */
68 rwlock_t active_lock; /* lock for active_nodes */ 68 rwlock_t active_lock; /* lock for active_nodes */
69 atomic_t gravecounter; /* graveyard uniquifier */ 69 atomic_t gravecounter; /* graveyard uniquifier */
70 unsigned frun_percent; /* when to stop culling (% files) */ 70 unsigned frun_percent; /* when to stop culling (% files) */
71 unsigned fcull_percent; /* when to start culling (% files) */ 71 unsigned fcull_percent; /* when to start culling (% files) */
72 unsigned fstop_percent; /* when to stop allocating (% files) */ 72 unsigned fstop_percent; /* when to stop allocating (% files) */
73 unsigned brun_percent; /* when to stop culling (% blocks) */ 73 unsigned brun_percent; /* when to stop culling (% blocks) */
74 unsigned bcull_percent; /* when to start culling (% blocks) */ 74 unsigned bcull_percent; /* when to start culling (% blocks) */
75 unsigned bstop_percent; /* when to stop allocating (% blocks) */ 75 unsigned bstop_percent; /* when to stop allocating (% blocks) */
76 unsigned bsize; /* cache's block size */ 76 unsigned bsize; /* cache's block size */
77 unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ 77 unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */
78 uint64_t frun; /* when to stop culling */ 78 uint64_t frun; /* when to stop culling */
79 uint64_t fcull; /* when to start culling */ 79 uint64_t fcull; /* when to start culling */
80 uint64_t fstop; /* when to stop allocating */ 80 uint64_t fstop; /* when to stop allocating */
81 sector_t brun; /* when to stop culling */ 81 sector_t brun; /* when to stop culling */
82 sector_t bcull; /* when to start culling */ 82 sector_t bcull; /* when to start culling */
83 sector_t bstop; /* when to stop allocating */ 83 sector_t bstop; /* when to stop allocating */
84 unsigned long flags; 84 unsigned long flags;
85 #define CACHEFILES_READY 0 /* T if cache prepared */ 85 #define CACHEFILES_READY 0 /* T if cache prepared */
86 #define CACHEFILES_DEAD 1 /* T if cache dead */ 86 #define CACHEFILES_DEAD 1 /* T if cache dead */
87 #define CACHEFILES_CULLING 2 /* T if cull engaged */ 87 #define CACHEFILES_CULLING 2 /* T if cull engaged */
88 #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ 88 #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
89 char *rootdirname; /* name of cache root directory */ 89 char *rootdirname; /* name of cache root directory */
90 char *secctx; /* LSM security context */ 90 char *secctx; /* LSM security context */
91 char *tag; /* cache binding tag */ 91 char *tag; /* cache binding tag */
92 }; 92 };
93 93
94 /* 94 /*
95 * backing file read tracking 95 * backing file read tracking
96 */ 96 */
97 struct cachefiles_one_read { 97 struct cachefiles_one_read {
98 wait_queue_t monitor; /* link into monitored waitqueue */ 98 wait_queue_t monitor; /* link into monitored waitqueue */
99 struct page *back_page; /* backing file page we're waiting for */ 99 struct page *back_page; /* backing file page we're waiting for */
100 struct page *netfs_page; /* netfs page we're going to fill */ 100 struct page *netfs_page; /* netfs page we're going to fill */
101 struct fscache_retrieval *op; /* retrieval op covering this */ 101 struct fscache_retrieval *op; /* retrieval op covering this */
102 struct list_head op_link; /* link in op's todo list */ 102 struct list_head op_link; /* link in op's todo list */
103 }; 103 };
104 104
105 /* 105 /*
106 * backing file write tracking 106 * backing file write tracking
107 */ 107 */
108 struct cachefiles_one_write { 108 struct cachefiles_one_write {
109 struct page *netfs_page; /* netfs page to copy */ 109 struct page *netfs_page; /* netfs page to copy */
110 struct cachefiles_object *object; 110 struct cachefiles_object *object;
111 struct list_head obj_link; /* link in object's lists */ 111 struct list_head obj_link; /* link in object's lists */
112 fscache_rw_complete_t end_io_func; 112 fscache_rw_complete_t end_io_func;
113 void *context; 113 void *context;
114 }; 114 };
115 115
116 /* 116 /*
117 * auxiliary data xattr buffer 117 * auxiliary data xattr buffer
118 */ 118 */
119 struct cachefiles_xattr { 119 struct cachefiles_xattr {
120 uint16_t len; 120 uint16_t len;
121 uint8_t type; 121 uint8_t type;
122 uint8_t data[]; 122 uint8_t data[];
123 }; 123 };
124 124
125 /* 125 /*
126 * note change of state for daemon 126 * note change of state for daemon
127 */ 127 */
128 static inline void cachefiles_state_changed(struct cachefiles_cache *cache) 128 static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
129 { 129 {
130 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); 130 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
131 wake_up_all(&cache->daemon_pollwq); 131 wake_up_all(&cache->daemon_pollwq);
132 } 132 }
133 133
134 /* 134 /*
135 * bind.c 135 * bind.c
136 */ 136 */
137 extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); 137 extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
138 extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); 138 extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
139 139
140 /* 140 /*
141 * daemon.c 141 * daemon.c
142 */ 142 */
143 extern const struct file_operations cachefiles_daemon_fops; 143 extern const struct file_operations cachefiles_daemon_fops;
144 144
145 extern int cachefiles_has_space(struct cachefiles_cache *cache, 145 extern int cachefiles_has_space(struct cachefiles_cache *cache,
146 unsigned fnr, unsigned bnr); 146 unsigned fnr, unsigned bnr);
147 147
148 /* 148 /*
149 * interface.c 149 * interface.c
150 */ 150 */
151 extern const struct fscache_cache_ops cachefiles_cache_ops; 151 extern const struct fscache_cache_ops cachefiles_cache_ops;
152 152
153 /* 153 /*
154 * key.c 154 * key.c
155 */ 155 */
156 extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); 156 extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
157 157
158 /* 158 /*
159 * namei.c 159 * namei.c
160 */ 160 */
161 extern int cachefiles_delete_object(struct cachefiles_cache *cache, 161 extern int cachefiles_delete_object(struct cachefiles_cache *cache,
162 struct cachefiles_object *object); 162 struct cachefiles_object *object);
163 extern int cachefiles_walk_to_object(struct cachefiles_object *parent, 163 extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
164 struct cachefiles_object *object, 164 struct cachefiles_object *object,
165 const char *key, 165 const char *key,
166 struct cachefiles_xattr *auxdata); 166 struct cachefiles_xattr *auxdata);
167 extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, 167 extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
168 struct dentry *dir, 168 struct dentry *dir,
169 const char *name); 169 const char *name);
170 170
171 extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, 171 extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
172 char *filename); 172 char *filename);
173 173
174 extern int cachefiles_check_in_use(struct cachefiles_cache *cache, 174 extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
175 struct dentry *dir, char *filename); 175 struct dentry *dir, char *filename);
176 176
177 /* 177 /*
178 * proc.c 178 * proc.c
179 */ 179 */
180 #ifdef CONFIG_CACHEFILES_HISTOGRAM 180 #ifdef CONFIG_CACHEFILES_HISTOGRAM
181 extern atomic_t cachefiles_lookup_histogram[HZ]; 181 extern atomic_t cachefiles_lookup_histogram[HZ];
182 extern atomic_t cachefiles_mkdir_histogram[HZ]; 182 extern atomic_t cachefiles_mkdir_histogram[HZ];
183 extern atomic_t cachefiles_create_histogram[HZ]; 183 extern atomic_t cachefiles_create_histogram[HZ];
184 184
185 extern int __init cachefiles_proc_init(void); 185 extern int __init cachefiles_proc_init(void);
186 extern void cachefiles_proc_cleanup(void); 186 extern void cachefiles_proc_cleanup(void);
187 static inline 187 static inline
188 void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) 188 void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
189 { 189 {
190 unsigned long jif = jiffies - start_jif; 190 unsigned long jif = jiffies - start_jif;
191 if (jif >= HZ) 191 if (jif >= HZ)
192 jif = HZ - 1; 192 jif = HZ - 1;
193 atomic_inc(&histogram[jif]); 193 atomic_inc(&histogram[jif]);
194 } 194 }
195 195
196 #else 196 #else
197 #define cachefiles_proc_init() (0) 197 #define cachefiles_proc_init() (0)
198 #define cachefiles_proc_cleanup() do {} while (0) 198 #define cachefiles_proc_cleanup() do {} while (0)
199 #define cachefiles_hist(hist, start_jif) do {} while (0) 199 #define cachefiles_hist(hist, start_jif) do {} while (0)
200 #endif 200 #endif
201 201
202 /* 202 /*
203 * rdwr.c 203 * rdwr.c
204 */ 204 */
205 extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, 205 extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
206 struct page *, gfp_t); 206 struct page *, gfp_t);
207 extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, 207 extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
208 struct list_head *, unsigned *, 208 struct list_head *, unsigned *,
209 gfp_t); 209 gfp_t);
210 extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, 210 extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
211 gfp_t); 211 gfp_t);
212 extern int cachefiles_allocate_pages(struct fscache_retrieval *, 212 extern int cachefiles_allocate_pages(struct fscache_retrieval *,
213 struct list_head *, unsigned *, gfp_t); 213 struct list_head *, unsigned *, gfp_t);
214 extern int cachefiles_write_page(struct fscache_storage *, struct page *); 214 extern int cachefiles_write_page(struct fscache_storage *, struct page *);
215 extern void cachefiles_uncache_page(struct fscache_object *, struct page *); 215 extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
216 216
217 /* 217 /*
218 * security.c 218 * security.c
219 */ 219 */
220 extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); 220 extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
221 extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, 221 extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
222 struct dentry *root, 222 struct dentry *root,
223 const struct cred **_saved_cred); 223 const struct cred **_saved_cred);
224 224
225 static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, 225 static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
226 const struct cred **_saved_cred) 226 const struct cred **_saved_cred)
227 { 227 {
228 *_saved_cred = override_creds(cache->cache_cred); 228 *_saved_cred = override_creds(cache->cache_cred);
229 } 229 }
230 230
231 static inline void cachefiles_end_secure(struct cachefiles_cache *cache, 231 static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
232 const struct cred *saved_cred) 232 const struct cred *saved_cred)
233 { 233 {
234 revert_creds(saved_cred); 234 revert_creds(saved_cred);
235 } 235 }
236 236
237 /* 237 /*
238 * xattr.c 238 * xattr.c
239 */ 239 */
240 extern int cachefiles_check_object_type(struct cachefiles_object *object); 240 extern int cachefiles_check_object_type(struct cachefiles_object *object);
241 extern int cachefiles_set_object_xattr(struct cachefiles_object *object, 241 extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
242 struct cachefiles_xattr *auxdata); 242 struct cachefiles_xattr *auxdata);
243 extern int cachefiles_update_object_xattr(struct cachefiles_object *object, 243 extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
244 struct cachefiles_xattr *auxdata); 244 struct cachefiles_xattr *auxdata);
245 extern int cachefiles_check_auxdata(struct cachefiles_object *object); 245 extern int cachefiles_check_auxdata(struct cachefiles_object *object);
246 extern int cachefiles_check_object_xattr(struct cachefiles_object *object, 246 extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
247 struct cachefiles_xattr *auxdata); 247 struct cachefiles_xattr *auxdata);
248 extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, 248 extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
249 struct dentry *dentry); 249 struct dentry *dentry);
250 250
251 251
252 /* 252 /*
253 * error handling 253 * error handling
254 */ 254 */
255 255
256 #define cachefiles_io_error(___cache, FMT, ...) \ 256 #define cachefiles_io_error(___cache, FMT, ...) \
257 do { \ 257 do { \
258 pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ 258 pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
259 fscache_io_error(&(___cache)->cache); \ 259 fscache_io_error(&(___cache)->cache); \
260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ 260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
261 } while (0) 261 } while (0)
262 262
263 #define cachefiles_io_error_obj(object, FMT, ...) \ 263 #define cachefiles_io_error_obj(object, FMT, ...) \
264 do { \ 264 do { \
265 struct cachefiles_cache *___cache; \ 265 struct cachefiles_cache *___cache; \
266 \ 266 \
267 ___cache = container_of((object)->fscache.cache, \ 267 ___cache = container_of((object)->fscache.cache, \
268 struct cachefiles_cache, cache); \ 268 struct cachefiles_cache, cache); \
269 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ 269 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
270 } while (0) 270 } while (0)
271 271
272 272
273 /* 273 /*
274 * debug tracing 274 * debug tracing
275 */ 275 */
276 #define dbgprintk(FMT, ...) \ 276 #define dbgprintk(FMT, ...) \
277 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 277 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
278 278
279 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 279 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
280 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 280 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
281 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 281 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
282 282
283 283
284 #if defined(__KDEBUG) 284 #if defined(__KDEBUG)
285 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) 285 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
286 #define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) 286 #define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
287 #define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) 287 #define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
288 288
289 #elif defined(CONFIG_CACHEFILES_DEBUG) 289 #elif defined(CONFIG_CACHEFILES_DEBUG)
290 #define _enter(FMT, ...) \ 290 #define _enter(FMT, ...) \
291 do { \ 291 do { \
292 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ 292 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
293 kenter(FMT, ##__VA_ARGS__); \ 293 kenter(FMT, ##__VA_ARGS__); \
294 } while (0) 294 } while (0)
295 295
296 #define _leave(FMT, ...) \ 296 #define _leave(FMT, ...) \
297 do { \ 297 do { \
298 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ 298 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
299 kleave(FMT, ##__VA_ARGS__); \ 299 kleave(FMT, ##__VA_ARGS__); \
300 } while (0) 300 } while (0)
301 301
302 #define _debug(FMT, ...) \ 302 #define _debug(FMT, ...) \
303 do { \ 303 do { \
304 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ 304 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
305 kdebug(FMT, ##__VA_ARGS__); \ 305 kdebug(FMT, ##__VA_ARGS__); \
306 } while (0) 306 } while (0)
307 307
308 #else 308 #else
309 #define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) 309 #define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
310 #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 310 #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
311 #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) 311 #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
312 #endif 312 #endif
313 313
314 #if 1 /* defined(__KDEBUGALL) */ 314 #if 1 /* defined(__KDEBUGALL) */
315 315
316 #define ASSERT(X) \ 316 #define ASSERT(X) \
317 do { \ 317 do { \
318 if (unlikely(!(X))) { \ 318 if (unlikely(!(X))) { \
319 pr_err("\n"); \ 319 pr_err("\n"); \
320 pr_err("Assertion failed\n"); \ 320 pr_err("Assertion failed\n"); \
321 BUG(); \ 321 BUG(); \
322 } \ 322 } \
323 } while (0) 323 } while (0)
324 324
325 #define ASSERTCMP(X, OP, Y) \ 325 #define ASSERTCMP(X, OP, Y) \
326 do { \ 326 do { \
327 if (unlikely(!((X) OP (Y)))) { \ 327 if (unlikely(!((X) OP (Y)))) { \
328 pr_err("\n"); \ 328 pr_err("\n"); \
329 pr_err("Assertion failed\n"); \ 329 pr_err("Assertion failed\n"); \
330 pr_err("%lx " #OP " %lx is false\n", \ 330 pr_err("%lx " #OP " %lx is false\n", \
331 (unsigned long)(X), (unsigned long)(Y)); \ 331 (unsigned long)(X), (unsigned long)(Y)); \
332 BUG(); \ 332 BUG(); \
333 } \ 333 } \
334 } while (0) 334 } while (0)
335 335
336 #define ASSERTIF(C, X) \ 336 #define ASSERTIF(C, X) \
337 do { \ 337 do { \
338 if (unlikely((C) && !(X))) { \ 338 if (unlikely((C) && !(X))) { \
339 pr_err("\n"); \ 339 pr_err("\n"); \
340 pr_err("Assertion failed\n"); \ 340 pr_err("Assertion failed\n"); \
341 BUG(); \ 341 BUG(); \
342 } \ 342 } \
343 } while (0) 343 } while (0)
344 344
345 #define ASSERTIFCMP(C, X, OP, Y) \ 345 #define ASSERTIFCMP(C, X, OP, Y) \
346 do { \ 346 do { \
347 if (unlikely((C) && !((X) OP (Y)))) { \ 347 if (unlikely((C) && !((X) OP (Y)))) { \
348 pr_err("\n"); \ 348 pr_err("\n"); \
349 pr_err("Assertion failed\n"); \ 349 pr_err("Assertion failed\n"); \
350 pr_err("%lx " #OP " %lx is false\n", \ 350 pr_err("%lx " #OP " %lx is false\n", \
351 (unsigned long)(X), (unsigned long)(Y)); \ 351 (unsigned long)(X), (unsigned long)(Y)); \
352 BUG(); \ 352 BUG(); \
353 } \ 353 } \
354 } while (0) 354 } while (0)
355 355
356 #else 356 #else
357 357
358 #define ASSERT(X) do {} while (0) 358 #define ASSERT(X) do {} while (0)
359 #define ASSERTCMP(X, OP, Y) do {} while (0) 359 #define ASSERTCMP(X, OP, Y) do {} while (0)
360 #define ASSERTIF(C, X) do {} while (0) 360 #define ASSERTIF(C, X) do {} while (0)
361 #define ASSERTIFCMP(C, X, OP, Y) do {} while (0) 361 #define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
362 362
363 #endif 363 #endif
364 364
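The cachefiles_io_error() change above is where this patch handles the missing newlines centrally: the macro now appends "\n" to the caller's format, so the many call sites in daemon.c and namei.c that pass strings such as "statfs failed" or "Unlink failed" need no edit. Combined with the "CacheFiles: " pr_fmt prefix defined at the top of this header, a call roughly expands as follows after preprocessing (an illustration of the expansion, not verbatim preprocessor output):

/* call site in daemon.c, unchanged by this patch: */
cachefiles_io_error(cache, "statfs failed");

/* approximate expansion via pr_err() and the pr_fmt() defined above: */
do {
        printk(KERN_ERR "CacheFiles: " "I/O Error: " "statfs failed" "\n");
        fscache_io_error(&(cache)->cache);
        set_bit(CACHEFILES_DEAD, &(cache)->flags);
} while (0);

Without that trailing "\n" the log line is left open and output from another context can end up appended to it, which is the behaviour the "\n" additions throughout this patch avoid.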
fs/cachefiles/main.c
1 /* Network filesystem caching backend to use cache files on a premounted 1 /* Network filesystem caching backend to use cache files on a premounted
2 * filesystem 2 * filesystem
3 * 3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence 8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version 9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version. 10 * 2 of the Licence, or (at your option) any later version.
11 */ 11 */
12 12
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/completion.h> 16 #include <linux/completion.h>
17 #include <linux/slab.h> 17 #include <linux/slab.h>
18 #include <linux/fs.h> 18 #include <linux/fs.h>
19 #include <linux/file.h> 19 #include <linux/file.h>
20 #include <linux/namei.h> 20 #include <linux/namei.h>
21 #include <linux/mount.h> 21 #include <linux/mount.h>
22 #include <linux/statfs.h> 22 #include <linux/statfs.h>
23 #include <linux/sysctl.h> 23 #include <linux/sysctl.h>
24 #include <linux/miscdevice.h> 24 #include <linux/miscdevice.h>
25 #include "internal.h" 25 #include "internal.h"
26 26
27 unsigned cachefiles_debug; 27 unsigned cachefiles_debug;
28 module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); 28 module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29 MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); 29 MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask");
30 30
31 MODULE_DESCRIPTION("Mounted-filesystem based cache"); 31 MODULE_DESCRIPTION("Mounted-filesystem based cache");
32 MODULE_AUTHOR("Red Hat, Inc."); 32 MODULE_AUTHOR("Red Hat, Inc.");
33 MODULE_LICENSE("GPL"); 33 MODULE_LICENSE("GPL");
34 34
35 struct kmem_cache *cachefiles_object_jar; 35 struct kmem_cache *cachefiles_object_jar;
36 36
37 static struct miscdevice cachefiles_dev = { 37 static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR, 38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles", 39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops, 40 .fops = &cachefiles_daemon_fops,
41 }; 41 };
42 42
43 static void cachefiles_object_init_once(void *_object) 43 static void cachefiles_object_init_once(void *_object)
44 { 44 {
45 struct cachefiles_object *object = _object; 45 struct cachefiles_object *object = _object;
46 46
47 memset(object, 0, sizeof(*object)); 47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock); 48 spin_lock_init(&object->work_lock);
49 } 49 }
50 50
51 /* 51 /*
52 * initialise the fs caching module 52 * initialise the fs caching module
53 */ 53 */
54 static int __init cachefiles_init(void) 54 static int __init cachefiles_init(void)
55 { 55 {
56 int ret; 56 int ret;
57 57
58 ret = misc_register(&cachefiles_dev); 58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0) 59 if (ret < 0)
60 goto error_dev; 60 goto error_dev;
61 61
62 /* create an object jar */ 62 /* create an object jar */
63 ret = -ENOMEM; 63 ret = -ENOMEM;
64 cachefiles_object_jar = 64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar", 65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object), 66 sizeof(struct cachefiles_object),
67 0, 67 0,
68 SLAB_HWCACHE_ALIGN, 68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once); 69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) { 70 if (!cachefiles_object_jar) {
71 pr_notice("Failed to allocate an object jar\n"); 71 pr_notice("Failed to allocate an object jar\n");
72 goto error_object_jar; 72 goto error_object_jar;
73 } 73 }
74 74
75 ret = cachefiles_proc_init(); 75 ret = cachefiles_proc_init();
76 if (ret < 0) 76 if (ret < 0)
77 goto error_proc; 77 goto error_proc;
78 78
79 pr_info("Loaded\n"); 79 pr_info("Loaded\n");
80 return 0; 80 return 0;
81 81
82 error_proc: 82 error_proc:
83 kmem_cache_destroy(cachefiles_object_jar); 83 kmem_cache_destroy(cachefiles_object_jar);
84 error_object_jar: 84 error_object_jar:
85 misc_deregister(&cachefiles_dev); 85 misc_deregister(&cachefiles_dev);
86 error_dev: 86 error_dev:
87 pr_err("failed to register: %d", ret); 87 pr_err("failed to register: %d\n", ret);
88 return ret; 88 return ret;
89 } 89 }
90 90
91 fs_initcall(cachefiles_init); 91 fs_initcall(cachefiles_init);
92 92
93 /* 93 /*
94 * clean up on module removal 94 * clean up on module removal
95 */ 95 */
96 static void __exit cachefiles_exit(void) 96 static void __exit cachefiles_exit(void)
97 { 97 {
98 pr_info("Unloading\n"); 98 pr_info("Unloading\n");
99 99
100 cachefiles_proc_cleanup(); 100 cachefiles_proc_cleanup();
101 kmem_cache_destroy(cachefiles_object_jar); 101 kmem_cache_destroy(cachefiles_object_jar);
102 misc_deregister(&cachefiles_dev); 102 misc_deregister(&cachefiles_dev);
103 } 103 }
104 104
105 module_exit(cachefiles_exit); 105 module_exit(cachefiles_exit);
106 106
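cachefiles_init() above uses the usual staged-unwind idiom: each step that can fail jumps to a label that tears down only the steps that already succeeded, in reverse order, before the final "failed to register" report (which also gains its "\n" in this patch). A generic user-space restatement of the idiom, with placeholder step names rather than kernel interfaces:

/*
 * Generic restatement of the unwind pattern in cachefiles_init().  The
 * setup/undo functions are placeholders, not kernel interfaces.
 */
#include <stdio.h>

static int setup_device(void)     { return 0; }   /* e.g. register a device */
static int setup_object_jar(void) { return -1; }  /* e.g. create an object cache */
static void undo_device(void)     { puts("device deregistered"); }

int init(void)
{
        int ret;

        ret = setup_device();
        if (ret < 0)
                goto error_dev;

        ret = setup_object_jar();
        if (ret < 0)
                goto error_object_jar;

        puts("loaded");
        return 0;

error_object_jar:
        undo_device();          /* tear down only what already succeeded */
error_dev:
        fprintf(stderr, "failed to register: %d\n", ret);
        return ret;
}

int main(void) { return init() ? 1 : 0; }

Because the debug mask is exported with module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO), it can also be adjusted at run time through the module's parameters directory in sysfs, in addition to the daemon's "debug <mask>" command shown earlier.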
fs/cachefiles/namei.c
1 /* CacheFiles path walking and related routines 1 /* CacheFiles path walking and related routines
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/file.h> 14 #include <linux/file.h>
15 #include <linux/fs.h> 15 #include <linux/fs.h>
16 #include <linux/fsnotify.h> 16 #include <linux/fsnotify.h>
17 #include <linux/quotaops.h> 17 #include <linux/quotaops.h>
18 #include <linux/xattr.h> 18 #include <linux/xattr.h>
19 #include <linux/mount.h> 19 #include <linux/mount.h>
20 #include <linux/namei.h> 20 #include <linux/namei.h>
21 #include <linux/security.h> 21 #include <linux/security.h>
22 #include <linux/slab.h> 22 #include <linux/slab.h>
23 #include "internal.h" 23 #include "internal.h"
24 24
25 #define CACHEFILES_KEYBUF_SIZE 512 25 #define CACHEFILES_KEYBUF_SIZE 512
26 26
27 /* 27 /*
28 * dump debugging info about an object 28 * dump debugging info about an object
29 */ 29 */
30 static noinline 30 static noinline
31 void __cachefiles_printk_object(struct cachefiles_object *object, 31 void __cachefiles_printk_object(struct cachefiles_object *object,
32 const char *prefix, 32 const char *prefix,
33 u8 *keybuf) 33 u8 *keybuf)
34 { 34 {
35 struct fscache_cookie *cookie; 35 struct fscache_cookie *cookie;
36 unsigned keylen, loop; 36 unsigned keylen, loop;
37 37
38 pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); 38 pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
39 pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", 39 pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
40 prefix, object->fscache.state->name, 40 prefix, object->fscache.state->name,
41 object->fscache.flags, work_busy(&object->fscache.work), 41 object->fscache.flags, work_busy(&object->fscache.work),
42 object->fscache.events, object->fscache.event_mask); 42 object->fscache.events, object->fscache.event_mask);
43 pr_err("%sops=%u inp=%u exc=%u\n", 43 pr_err("%sops=%u inp=%u exc=%u\n",
44 prefix, object->fscache.n_ops, object->fscache.n_in_progress, 44 prefix, object->fscache.n_ops, object->fscache.n_in_progress,
45 object->fscache.n_exclusive); 45 object->fscache.n_exclusive);
46 pr_err("%sparent=%p\n", 46 pr_err("%sparent=%p\n",
47 prefix, object->fscache.parent); 47 prefix, object->fscache.parent);
48 48
49 spin_lock(&object->fscache.lock); 49 spin_lock(&object->fscache.lock);
50 cookie = object->fscache.cookie; 50 cookie = object->fscache.cookie;
51 if (cookie) { 51 if (cookie) {
52 pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", 52 pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n",
53 prefix, 53 prefix,
54 object->fscache.cookie, 54 object->fscache.cookie,
55 object->fscache.cookie->parent, 55 object->fscache.cookie->parent,
56 object->fscache.cookie->netfs_data, 56 object->fscache.cookie->netfs_data,
57 object->fscache.cookie->flags); 57 object->fscache.cookie->flags);
58 if (keybuf && cookie->def) 58 if (keybuf && cookie->def)
59 keylen = cookie->def->get_key(cookie->netfs_data, keybuf, 59 keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
60 CACHEFILES_KEYBUF_SIZE); 60 CACHEFILES_KEYBUF_SIZE);
61 else 61 else
62 keylen = 0; 62 keylen = 0;
63 } else { 63 } else {
64 pr_err("%scookie=NULL\n", prefix); 64 pr_err("%scookie=NULL\n", prefix);
65 keylen = 0; 65 keylen = 0;
66 } 66 }
67 spin_unlock(&object->fscache.lock); 67 spin_unlock(&object->fscache.lock);
68 68
69 if (keylen) { 69 if (keylen) {
70 pr_err("%skey=[%u] '", prefix, keylen); 70 pr_err("%skey=[%u] '", prefix, keylen);
71 for (loop = 0; loop < keylen; loop++) 71 for (loop = 0; loop < keylen; loop++)
72 pr_cont("%02x", keybuf[loop]); 72 pr_cont("%02x", keybuf[loop]);
73 pr_cont("'\n"); 73 pr_cont("'\n");
74 } 74 }
75 } 75 }
76 76
77 /* 77 /*
78 * dump debugging info about a pair of objects 78 * dump debugging info about a pair of objects
79 */ 79 */
80 static noinline void cachefiles_printk_object(struct cachefiles_object *object, 80 static noinline void cachefiles_printk_object(struct cachefiles_object *object,
81 struct cachefiles_object *xobject) 81 struct cachefiles_object *xobject)
82 { 82 {
83 u8 *keybuf; 83 u8 *keybuf;
84 84
85 keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); 85 keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
86 if (object) 86 if (object)
87 __cachefiles_printk_object(object, "", keybuf); 87 __cachefiles_printk_object(object, "", keybuf);
88 if (xobject) 88 if (xobject)
89 __cachefiles_printk_object(xobject, "x", keybuf); 89 __cachefiles_printk_object(xobject, "x", keybuf);
90 kfree(keybuf); 90 kfree(keybuf);
91 } 91 }
92 92
93 /* 93 /*
94 * mark the owner of a dentry, if there is one, to indicate that that dentry 94 * mark the owner of a dentry, if there is one, to indicate that that dentry
95 * has been preemptively deleted 95 * has been preemptively deleted
96 * - the caller must hold the i_mutex on the dentry's parent as required to 96 * - the caller must hold the i_mutex on the dentry's parent as required to
97 * call vfs_unlink(), vfs_rmdir() or vfs_rename() 97 * call vfs_unlink(), vfs_rmdir() or vfs_rename()
98 */ 98 */
99 static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, 99 static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
100 struct dentry *dentry) 100 struct dentry *dentry)
101 { 101 {
102 struct cachefiles_object *object; 102 struct cachefiles_object *object;
103 struct rb_node *p; 103 struct rb_node *p;
104 104
105 _enter(",'%*.*s'", 105 _enter(",'%*.*s'",
106 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); 106 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
107 107
108 write_lock(&cache->active_lock); 108 write_lock(&cache->active_lock);
109 109
110 p = cache->active_nodes.rb_node; 110 p = cache->active_nodes.rb_node;
111 while (p) { 111 while (p) {
112 object = rb_entry(p, struct cachefiles_object, active_node); 112 object = rb_entry(p, struct cachefiles_object, active_node);
113 if (object->dentry > dentry) 113 if (object->dentry > dentry)
114 p = p->rb_left; 114 p = p->rb_left;
115 else if (object->dentry < dentry) 115 else if (object->dentry < dentry)
116 p = p->rb_right; 116 p = p->rb_right;
117 else 117 else
118 goto found_dentry; 118 goto found_dentry;
119 } 119 }
120 120
121 write_unlock(&cache->active_lock); 121 write_unlock(&cache->active_lock);
122 _leave(" [no owner]"); 122 _leave(" [no owner]");
123 return; 123 return;
124 124
125 /* found the dentry for */ 125 /* found the dentry for */
126 found_dentry: 126 found_dentry:
127 kdebug("preemptive burial: OBJ%x [%s] %p", 127 kdebug("preemptive burial: OBJ%x [%s] %p",
128 object->fscache.debug_id, 128 object->fscache.debug_id,
129 object->fscache.state->name, 129 object->fscache.state->name,
130 dentry); 130 dentry);
131 131
132 if (fscache_object_is_live(&object->fscache)) { 132 if (fscache_object_is_live(&object->fscache)) {
133 pr_err("\n"); 133 pr_err("\n");
134 pr_err("Error: Can't preemptively bury live object\n"); 134 pr_err("Error: Can't preemptively bury live object\n");
135 cachefiles_printk_object(object, NULL); 135 cachefiles_printk_object(object, NULL);
136 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { 136 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
137 pr_err("Error: Object already preemptively buried\n"); 137 pr_err("Error: Object already preemptively buried\n");
138 } 138 }
139 139
140 write_unlock(&cache->active_lock); 140 write_unlock(&cache->active_lock);
141 _leave(" [owner marked]"); 141 _leave(" [owner marked]");
142 } 142 }
143 143
144 /* 144 /*
145 * record the fact that an object is now active 145 * record the fact that an object is now active
146 */ 146 */
147 static int cachefiles_mark_object_active(struct cachefiles_cache *cache, 147 static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
148 struct cachefiles_object *object) 148 struct cachefiles_object *object)
149 { 149 {
150 struct cachefiles_object *xobject; 150 struct cachefiles_object *xobject;
151 struct rb_node **_p, *_parent = NULL; 151 struct rb_node **_p, *_parent = NULL;
152 struct dentry *dentry; 152 struct dentry *dentry;
153 153
154 _enter(",%p", object); 154 _enter(",%p", object);
155 155
156 try_again: 156 try_again:
157 write_lock(&cache->active_lock); 157 write_lock(&cache->active_lock);
158 158
159 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { 159 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
160 pr_err("Error: Object already active\n"); 160 pr_err("Error: Object already active\n");
161 cachefiles_printk_object(object, NULL); 161 cachefiles_printk_object(object, NULL);
162 BUG(); 162 BUG();
163 } 163 }
164 164
165 dentry = object->dentry; 165 dentry = object->dentry;
166 _p = &cache->active_nodes.rb_node; 166 _p = &cache->active_nodes.rb_node;
167 while (*_p) { 167 while (*_p) {
168 _parent = *_p; 168 _parent = *_p;
169 xobject = rb_entry(_parent, 169 xobject = rb_entry(_parent,
170 struct cachefiles_object, active_node); 170 struct cachefiles_object, active_node);
171 171
172 ASSERT(xobject != object); 172 ASSERT(xobject != object);
173 173
174 if (xobject->dentry > dentry) 174 if (xobject->dentry > dentry)
175 _p = &(*_p)->rb_left; 175 _p = &(*_p)->rb_left;
176 else if (xobject->dentry < dentry) 176 else if (xobject->dentry < dentry)
177 _p = &(*_p)->rb_right; 177 _p = &(*_p)->rb_right;
178 else 178 else
179 goto wait_for_old_object; 179 goto wait_for_old_object;
180 } 180 }
181 181
182 rb_link_node(&object->active_node, _parent, _p); 182 rb_link_node(&object->active_node, _parent, _p);
183 rb_insert_color(&object->active_node, &cache->active_nodes); 183 rb_insert_color(&object->active_node, &cache->active_nodes);
184 184
185 write_unlock(&cache->active_lock); 185 write_unlock(&cache->active_lock);
186 _leave(" = 0"); 186 _leave(" = 0");
187 return 0; 187 return 0;
188 188
189 /* an old object from a previous incarnation is hogging the slot - we 189 /* an old object from a previous incarnation is hogging the slot - we
190 * need to wait for it to be destroyed */ 190 * need to wait for it to be destroyed */
191 wait_for_old_object: 191 wait_for_old_object:
192 if (fscache_object_is_live(&object->fscache)) { 192 if (fscache_object_is_live(&object->fscache)) {
193 pr_err("\n"); 193 pr_err("\n");
194 pr_err("Error: Unexpected object collision\n"); 194 pr_err("Error: Unexpected object collision\n");
195 cachefiles_printk_object(object, xobject); 195 cachefiles_printk_object(object, xobject);
196 BUG(); 196 BUG();
197 } 197 }
198 atomic_inc(&xobject->usage); 198 atomic_inc(&xobject->usage);
199 write_unlock(&cache->active_lock); 199 write_unlock(&cache->active_lock);
200 200
201 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { 201 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
202 wait_queue_head_t *wq; 202 wait_queue_head_t *wq;
203 203
204 signed long timeout = 60 * HZ; 204 signed long timeout = 60 * HZ;
205 wait_queue_t wait; 205 wait_queue_t wait;
206 bool requeue; 206 bool requeue;
207 207
208 /* if the object we're waiting for is queued for processing, 208 /* if the object we're waiting for is queued for processing,
209 * then just put ourselves on the queue behind it */ 209 * then just put ourselves on the queue behind it */
210 if (work_pending(&xobject->fscache.work)) { 210 if (work_pending(&xobject->fscache.work)) {
211 _debug("queue OBJ%x behind OBJ%x immediately", 211 _debug("queue OBJ%x behind OBJ%x immediately",
212 object->fscache.debug_id, 212 object->fscache.debug_id,
213 xobject->fscache.debug_id); 213 xobject->fscache.debug_id);
214 goto requeue; 214 goto requeue;
215 } 215 }
216 216
217 /* otherwise we sleep until either the object we're waiting for 217 /* otherwise we sleep until either the object we're waiting for
218 * is done, or the fscache_object is congested */ 218 * is done, or the fscache_object is congested */
219 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); 219 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
220 init_wait(&wait); 220 init_wait(&wait);
221 requeue = false; 221 requeue = false;
222 do { 222 do {
223 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 223 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
224 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) 224 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
225 break; 225 break;
226 226
227 requeue = fscache_object_sleep_till_congested(&timeout); 227 requeue = fscache_object_sleep_till_congested(&timeout);
228 } while (timeout > 0 && !requeue); 228 } while (timeout > 0 && !requeue);
229 finish_wait(wq, &wait); 229 finish_wait(wq, &wait);
230 230
231 if (requeue && 231 if (requeue &&
232 test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { 232 test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
233 _debug("queue OBJ%x behind OBJ%x after wait", 233 _debug("queue OBJ%x behind OBJ%x after wait",
234 object->fscache.debug_id, 234 object->fscache.debug_id,
235 xobject->fscache.debug_id); 235 xobject->fscache.debug_id);
236 goto requeue; 236 goto requeue;
237 } 237 }
238 238
239 if (timeout <= 0) { 239 if (timeout <= 0) {
240 pr_err("\n"); 240 pr_err("\n");
241 pr_err("Error: Overlong wait for old active object to go away\n"); 241 pr_err("Error: Overlong wait for old active object to go away\n");
242 cachefiles_printk_object(object, xobject); 242 cachefiles_printk_object(object, xobject);
243 goto requeue; 243 goto requeue;
244 } 244 }
245 } 245 }
246 246
247 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); 247 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
248 248
249 cache->cache.ops->put_object(&xobject->fscache); 249 cache->cache.ops->put_object(&xobject->fscache);
250 goto try_again; 250 goto try_again;
251 251
252 requeue: 252 requeue:
253 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); 253 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
254 cache->cache.ops->put_object(&xobject->fscache); 254 cache->cache.ops->put_object(&xobject->fscache);
255 _leave(" = -ETIMEDOUT"); 255 _leave(" = -ETIMEDOUT");
256 return -ETIMEDOUT; 256 return -ETIMEDOUT;
257 } 257 }
258 258
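cachefiles_mark_object_active() above keys cache->active_nodes purely on the dentry pointer value: the rbtree walk compares xobject->dentry against the new object's dentry, and an exact match means an object from a previous incarnation still owns the slot, which is what the wait_for_old_object path handles. A toy user-space sketch of that pointer-keyed insertion (a plain unbalanced tree for illustration, not the kernel rbtree API):

/*
 * Nodes are keyed on pointer identity alone, so equality means "same
 * backing dentry".  No rebalancing is done here; the kernel uses an rbtree.
 */
#include <stddef.h>
#include <stdint.h>

struct active_node {
        const void *dentry;                     /* key: pointer identity */
        struct active_node *left, *right;
};

/* returns NULL on success, or the clashing node if the key already exists */
struct active_node *active_insert(struct active_node **root,
                                  struct active_node *node)
{
        struct active_node **p = root;

        while (*p) {
                if ((uintptr_t)(*p)->dentry > (uintptr_t)node->dentry)
                        p = &(*p)->left;
                else if ((uintptr_t)(*p)->dentry < (uintptr_t)node->dentry)
                        p = &(*p)->right;
                else
                        return *p;              /* old object owns the slot */
        }
        node->left = node->right = NULL;
        *p = node;
        return NULL;
}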
259 /* 259 /*
260 * delete an object representation from the cache 260 * delete an object representation from the cache
261 * - file backed objects are unlinked 261 * - file backed objects are unlinked
262 * - directory backed objects are stuffed into the graveyard for userspace to 262 * - directory backed objects are stuffed into the graveyard for userspace to
263 * delete 263 * delete
264 * - unlocks the directory mutex 264 * - unlocks the directory mutex
265 */ 265 */
266 static int cachefiles_bury_object(struct cachefiles_cache *cache, 266 static int cachefiles_bury_object(struct cachefiles_cache *cache,
267 struct dentry *dir, 267 struct dentry *dir,
268 struct dentry *rep, 268 struct dentry *rep,
269 bool preemptive) 269 bool preemptive)
270 { 270 {
271 struct dentry *grave, *trap; 271 struct dentry *grave, *trap;
272 struct path path, path_to_graveyard; 272 struct path path, path_to_graveyard;
273 char nbuffer[8 + 8 + 1]; 273 char nbuffer[8 + 8 + 1];
274 int ret; 274 int ret;
275 275
276 _enter(",'%*.*s','%*.*s'", 276 _enter(",'%*.*s','%*.*s'",
277 dir->d_name.len, dir->d_name.len, dir->d_name.name, 277 dir->d_name.len, dir->d_name.len, dir->d_name.name,
278 rep->d_name.len, rep->d_name.len, rep->d_name.name); 278 rep->d_name.len, rep->d_name.len, rep->d_name.name);
279 279
280 _debug("remove %p from %p", rep, dir); 280 _debug("remove %p from %p", rep, dir);
281 281
282 /* non-directories can just be unlinked */ 282 /* non-directories can just be unlinked */
283 if (!S_ISDIR(rep->d_inode->i_mode)) { 283 if (!S_ISDIR(rep->d_inode->i_mode)) {
284 _debug("unlink stale object"); 284 _debug("unlink stale object");
285 285
286 path.mnt = cache->mnt; 286 path.mnt = cache->mnt;
287 path.dentry = dir; 287 path.dentry = dir;
288 ret = security_path_unlink(&path, rep); 288 ret = security_path_unlink(&path, rep);
289 if (ret < 0) { 289 if (ret < 0) {
290 cachefiles_io_error(cache, "Unlink security error"); 290 cachefiles_io_error(cache, "Unlink security error");
291 } else { 291 } else {
292 ret = vfs_unlink(dir->d_inode, rep, NULL); 292 ret = vfs_unlink(dir->d_inode, rep, NULL);
293 293
294 if (preemptive) 294 if (preemptive)
295 cachefiles_mark_object_buried(cache, rep); 295 cachefiles_mark_object_buried(cache, rep);
296 } 296 }
297 297
298 mutex_unlock(&dir->d_inode->i_mutex); 298 mutex_unlock(&dir->d_inode->i_mutex);
299 299
300 if (ret == -EIO) 300 if (ret == -EIO)
301 cachefiles_io_error(cache, "Unlink failed"); 301 cachefiles_io_error(cache, "Unlink failed");
302 302
303 _leave(" = %d", ret); 303 _leave(" = %d", ret);
304 return ret; 304 return ret;
305 } 305 }
306 306
307 /* directories have to be moved to the graveyard */ 307 /* directories have to be moved to the graveyard */
308 _debug("move stale object to graveyard"); 308 _debug("move stale object to graveyard");
309 mutex_unlock(&dir->d_inode->i_mutex); 309 mutex_unlock(&dir->d_inode->i_mutex);
310 310
311 try_again: 311 try_again:
312 /* first step is to make up a grave dentry in the graveyard */ 312 /* first step is to make up a grave dentry in the graveyard */
313 sprintf(nbuffer, "%08x%08x", 313 sprintf(nbuffer, "%08x%08x",
314 (uint32_t) get_seconds(), 314 (uint32_t) get_seconds(),
315 (uint32_t) atomic_inc_return(&cache->gravecounter)); 315 (uint32_t) atomic_inc_return(&cache->gravecounter));
316 316
317 /* do the multiway lock magic */ 317 /* do the multiway lock magic */
318 trap = lock_rename(cache->graveyard, dir); 318 trap = lock_rename(cache->graveyard, dir);
319 319
320 /* do some checks before getting the grave dentry */ 320 /* do some checks before getting the grave dentry */
321 if (rep->d_parent != dir) { 321 if (rep->d_parent != dir) {
322 /* the entry was probably culled when we dropped the parent dir 322 /* the entry was probably culled when we dropped the parent dir
323 * lock */ 323 * lock */
324 unlock_rename(cache->graveyard, dir); 324 unlock_rename(cache->graveyard, dir);
325 _leave(" = 0 [culled?]"); 325 _leave(" = 0 [culled?]");
326 return 0; 326 return 0;
327 } 327 }
328 328
329 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { 329 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
330 unlock_rename(cache->graveyard, dir); 330 unlock_rename(cache->graveyard, dir);
331 cachefiles_io_error(cache, "Graveyard no longer a directory"); 331 cachefiles_io_error(cache, "Graveyard no longer a directory");
332 return -EIO; 332 return -EIO;
333 } 333 }
334 334
335 if (trap == rep) { 335 if (trap == rep) {
336 unlock_rename(cache->graveyard, dir); 336 unlock_rename(cache->graveyard, dir);
337 cachefiles_io_error(cache, "May not make directory loop"); 337 cachefiles_io_error(cache, "May not make directory loop");
338 return -EIO; 338 return -EIO;
339 } 339 }
340 340
341 if (d_mountpoint(rep)) { 341 if (d_mountpoint(rep)) {
342 unlock_rename(cache->graveyard, dir); 342 unlock_rename(cache->graveyard, dir);
343 cachefiles_io_error(cache, "Mountpoint in cache"); 343 cachefiles_io_error(cache, "Mountpoint in cache");
344 return -EIO; 344 return -EIO;
345 } 345 }
346 346
347 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); 347 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
348 if (IS_ERR(grave)) { 348 if (IS_ERR(grave)) {
349 unlock_rename(cache->graveyard, dir); 349 unlock_rename(cache->graveyard, dir);
350 350
351 if (PTR_ERR(grave) == -ENOMEM) { 351 if (PTR_ERR(grave) == -ENOMEM) {
352 _leave(" = -ENOMEM"); 352 _leave(" = -ENOMEM");
353 return -ENOMEM; 353 return -ENOMEM;
354 } 354 }
355 355
356 cachefiles_io_error(cache, "Lookup error %ld", 356 cachefiles_io_error(cache, "Lookup error %ld",
357 PTR_ERR(grave)); 357 PTR_ERR(grave));
358 return -EIO; 358 return -EIO;
359 } 359 }
360 360
361 if (grave->d_inode) { 361 if (grave->d_inode) {
362 unlock_rename(cache->graveyard, dir); 362 unlock_rename(cache->graveyard, dir);
363 dput(grave); 363 dput(grave);
364 grave = NULL; 364 grave = NULL;
365 cond_resched(); 365 cond_resched();
366 goto try_again; 366 goto try_again;
367 } 367 }
368 368
369 if (d_mountpoint(grave)) { 369 if (d_mountpoint(grave)) {
370 unlock_rename(cache->graveyard, dir); 370 unlock_rename(cache->graveyard, dir);
371 dput(grave); 371 dput(grave);
372 cachefiles_io_error(cache, "Mountpoint in graveyard"); 372 cachefiles_io_error(cache, "Mountpoint in graveyard");
373 return -EIO; 373 return -EIO;
374 } 374 }
375 375
376 /* target should not be an ancestor of source */ 376 /* target should not be an ancestor of source */
377 if (trap == grave) { 377 if (trap == grave) {
378 unlock_rename(cache->graveyard, dir); 378 unlock_rename(cache->graveyard, dir);
379 dput(grave); 379 dput(grave);
380 cachefiles_io_error(cache, "May not make directory loop"); 380 cachefiles_io_error(cache, "May not make directory loop");
381 return -EIO; 381 return -EIO;
382 } 382 }
383 383
384 /* attempt the rename */ 384 /* attempt the rename */
385 path.mnt = cache->mnt; 385 path.mnt = cache->mnt;
386 path.dentry = dir; 386 path.dentry = dir;
387 path_to_graveyard.mnt = cache->mnt; 387 path_to_graveyard.mnt = cache->mnt;
388 path_to_graveyard.dentry = cache->graveyard; 388 path_to_graveyard.dentry = cache->graveyard;
389 ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); 389 ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
390 if (ret < 0) { 390 if (ret < 0) {
391 cachefiles_io_error(cache, "Rename security error %d", ret); 391 cachefiles_io_error(cache, "Rename security error %d", ret);
392 } else { 392 } else {
393 ret = vfs_rename(dir->d_inode, rep, 393 ret = vfs_rename(dir->d_inode, rep,
394 cache->graveyard->d_inode, grave, NULL, 0); 394 cache->graveyard->d_inode, grave, NULL, 0);
395 if (ret != 0 && ret != -ENOMEM) 395 if (ret != 0 && ret != -ENOMEM)
396 cachefiles_io_error(cache, 396 cachefiles_io_error(cache,
397 "Rename failed with error %d", ret); 397 "Rename failed with error %d", ret);
398 398
399 if (preemptive) 399 if (preemptive)
400 cachefiles_mark_object_buried(cache, rep); 400 cachefiles_mark_object_buried(cache, rep);
401 } 401 }
402 402
403 unlock_rename(cache->graveyard, dir); 403 unlock_rename(cache->graveyard, dir);
404 dput(grave); 404 dput(grave);
405 _leave(" = 0"); 405 _leave(" = 0");
406 return 0; 406 return 0;
407 } 407 }
408 408
409 /* 409 /*
410 * delete an object representation from the cache 410 * delete an object representation from the cache
411 */ 411 */
412 int cachefiles_delete_object(struct cachefiles_cache *cache, 412 int cachefiles_delete_object(struct cachefiles_cache *cache,
413 struct cachefiles_object *object) 413 struct cachefiles_object *object)
414 { 414 {
415 struct dentry *dir; 415 struct dentry *dir;
416 int ret; 416 int ret;
417 417
418 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); 418 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
419 419
420 ASSERT(object->dentry); 420 ASSERT(object->dentry);
421 ASSERT(object->dentry->d_inode); 421 ASSERT(object->dentry->d_inode);
422 ASSERT(object->dentry->d_parent); 422 ASSERT(object->dentry->d_parent);
423 423
424 dir = dget_parent(object->dentry); 424 dir = dget_parent(object->dentry);
425 425
426 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 426 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
427 427
428 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { 428 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
429 /* object allocation for the same key preemptively deleted this 429 /* object allocation for the same key preemptively deleted this
430 * object's file so that it could create its own file */ 430 * object's file so that it could create its own file */
431 _debug("object preemptively buried"); 431 _debug("object preemptively buried");
432 mutex_unlock(&dir->d_inode->i_mutex); 432 mutex_unlock(&dir->d_inode->i_mutex);
433 ret = 0; 433 ret = 0;
434 } else { 434 } else {
435 /* we need to check that our parent is _still_ our parent - it 435 /* we need to check that our parent is _still_ our parent - it
436 * may have been renamed */ 436 * may have been renamed */
437 if (dir == object->dentry->d_parent) { 437 if (dir == object->dentry->d_parent) {
438 ret = cachefiles_bury_object(cache, dir, 438 ret = cachefiles_bury_object(cache, dir,
439 object->dentry, false); 439 object->dentry, false);
440 } else { 440 } else {
441 /* it got moved, presumably by cachefilesd culling it, 441 /* it got moved, presumably by cachefilesd culling it,
442 * so it's no longer in the key path and we can ignore 442 * so it's no longer in the key path and we can ignore
443 * it */ 443 * it */
444 mutex_unlock(&dir->d_inode->i_mutex); 444 mutex_unlock(&dir->d_inode->i_mutex);
445 ret = 0; 445 ret = 0;
446 } 446 }
447 } 447 }
448 448
449 dput(dir); 449 dput(dir);
450 _leave(" = %d", ret); 450 _leave(" = %d", ret);
451 return ret; 451 return ret;
452 } 452 }
453 453
454 /* 454 /*
455 * walk from the parent object to the child object through the backing 455 * walk from the parent object to the child object through the backing
456 * filesystem, creating directories as we go 456 * filesystem, creating directories as we go
457 */ 457 */
458 int cachefiles_walk_to_object(struct cachefiles_object *parent, 458 int cachefiles_walk_to_object(struct cachefiles_object *parent,
459 struct cachefiles_object *object, 459 struct cachefiles_object *object,
460 const char *key, 460 const char *key,
461 struct cachefiles_xattr *auxdata) 461 struct cachefiles_xattr *auxdata)
462 { 462 {
463 struct cachefiles_cache *cache; 463 struct cachefiles_cache *cache;
464 struct dentry *dir, *next = NULL; 464 struct dentry *dir, *next = NULL;
465 struct path path; 465 struct path path;
466 unsigned long start; 466 unsigned long start;
467 const char *name; 467 const char *name;
468 int ret, nlen; 468 int ret, nlen;
469 469
470 _enter("OBJ%x{%p},OBJ%x,%s,", 470 _enter("OBJ%x{%p},OBJ%x,%s,",
471 parent->fscache.debug_id, parent->dentry, 471 parent->fscache.debug_id, parent->dentry,
472 object->fscache.debug_id, key); 472 object->fscache.debug_id, key);
473 473
474 cache = container_of(parent->fscache.cache, 474 cache = container_of(parent->fscache.cache,
475 struct cachefiles_cache, cache); 475 struct cachefiles_cache, cache);
476 path.mnt = cache->mnt; 476 path.mnt = cache->mnt;
477 477
478 ASSERT(parent->dentry); 478 ASSERT(parent->dentry);
479 ASSERT(parent->dentry->d_inode); 479 ASSERT(parent->dentry->d_inode);
480 480
481 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { 481 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
482 // TODO: convert file to dir 482 // TODO: convert file to dir
483 _leave("looking up in none directory"); 483 _leave("looking up in none directory");
484 return -ENOBUFS; 484 return -ENOBUFS;
485 } 485 }
486 486
487 dir = dget(parent->dentry); 487 dir = dget(parent->dentry);
488 488
489 advance: 489 advance:
490 /* attempt to transit the first directory component */ 490 /* attempt to transit the first directory component */
491 name = key; 491 name = key;
492 nlen = strlen(key); 492 nlen = strlen(key);
493 493
494 /* key ends in a double NUL */ 494 /* key ends in a double NUL */
495 key = key + nlen + 1; 495 key = key + nlen + 1;
496 if (!*key) 496 if (!*key)
497 key = NULL; 497 key = NULL;
498 498
499 lookup_again: 499 lookup_again:
500 /* search the current directory for the element name */ 500 /* search the current directory for the element name */
501 _debug("lookup '%s'", name); 501 _debug("lookup '%s'", name);
502 502
503 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 503 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
504 504
505 start = jiffies; 505 start = jiffies;
506 next = lookup_one_len(name, dir, nlen); 506 next = lookup_one_len(name, dir, nlen);
507 cachefiles_hist(cachefiles_lookup_histogram, start); 507 cachefiles_hist(cachefiles_lookup_histogram, start);
508 if (IS_ERR(next)) 508 if (IS_ERR(next))
509 goto lookup_error; 509 goto lookup_error;
510 510
511 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); 511 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
512 512
513 if (!key) 513 if (!key)
514 object->new = !next->d_inode; 514 object->new = !next->d_inode;
515 515
516 /* if this element of the path doesn't exist, then the lookup phase 516 /* if this element of the path doesn't exist, then the lookup phase
517 * failed, and we can release any readers in the certain knowledge that 517 * failed, and we can release any readers in the certain knowledge that
518 * there's nothing for them to actually read */ 518 * there's nothing for them to actually read */
519 if (!next->d_inode) 519 if (!next->d_inode)
520 fscache_object_lookup_negative(&object->fscache); 520 fscache_object_lookup_negative(&object->fscache);
521 521
522 /* we need to create the object if it's negative */ 522 /* we need to create the object if it's negative */
523 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { 523 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
524 /* index objects and intervening tree levels must be subdirs */ 524 /* index objects and intervening tree levels must be subdirs */
525 if (!next->d_inode) { 525 if (!next->d_inode) {
526 ret = cachefiles_has_space(cache, 1, 0); 526 ret = cachefiles_has_space(cache, 1, 0);
527 if (ret < 0) 527 if (ret < 0)
528 goto create_error; 528 goto create_error;
529 529
530 path.dentry = dir; 530 path.dentry = dir;
531 ret = security_path_mkdir(&path, next, 0); 531 ret = security_path_mkdir(&path, next, 0);
532 if (ret < 0) 532 if (ret < 0)
533 goto create_error; 533 goto create_error;
534 start = jiffies; 534 start = jiffies;
535 ret = vfs_mkdir(dir->d_inode, next, 0); 535 ret = vfs_mkdir(dir->d_inode, next, 0);
536 cachefiles_hist(cachefiles_mkdir_histogram, start); 536 cachefiles_hist(cachefiles_mkdir_histogram, start);
537 if (ret < 0) 537 if (ret < 0)
538 goto create_error; 538 goto create_error;
539 539
540 ASSERT(next->d_inode); 540 ASSERT(next->d_inode);
541 541
542 _debug("mkdir -> %p{%p{ino=%lu}}", 542 _debug("mkdir -> %p{%p{ino=%lu}}",
543 next, next->d_inode, next->d_inode->i_ino); 543 next, next->d_inode, next->d_inode->i_ino);
544 544
545 } else if (!S_ISDIR(next->d_inode->i_mode)) { 545 } else if (!S_ISDIR(next->d_inode->i_mode)) {
546 pr_err("inode %lu is not a directory", 546 pr_err("inode %lu is not a directory\n",
547 next->d_inode->i_ino); 547 next->d_inode->i_ino);
548 ret = -ENOBUFS; 548 ret = -ENOBUFS;
549 goto error; 549 goto error;
550 } 550 }
551 551
552 } else { 552 } else {
553 /* non-index objects start out life as files */ 553 /* non-index objects start out life as files */
554 if (!next->d_inode) { 554 if (!next->d_inode) {
555 ret = cachefiles_has_space(cache, 1, 0); 555 ret = cachefiles_has_space(cache, 1, 0);
556 if (ret < 0) 556 if (ret < 0)
557 goto create_error; 557 goto create_error;
558 558
559 path.dentry = dir; 559 path.dentry = dir;
560 ret = security_path_mknod(&path, next, S_IFREG, 0); 560 ret = security_path_mknod(&path, next, S_IFREG, 0);
561 if (ret < 0) 561 if (ret < 0)
562 goto create_error; 562 goto create_error;
563 start = jiffies; 563 start = jiffies;
564 ret = vfs_create(dir->d_inode, next, S_IFREG, true); 564 ret = vfs_create(dir->d_inode, next, S_IFREG, true);
565 cachefiles_hist(cachefiles_create_histogram, start); 565 cachefiles_hist(cachefiles_create_histogram, start);
566 if (ret < 0) 566 if (ret < 0)
567 goto create_error; 567 goto create_error;
568 568
569 ASSERT(next->d_inode); 569 ASSERT(next->d_inode);
570 570
571 _debug("create -> %p{%p{ino=%lu}}", 571 _debug("create -> %p{%p{ino=%lu}}",
572 next, next->d_inode, next->d_inode->i_ino); 572 next, next->d_inode, next->d_inode->i_ino);
573 573
574 } else if (!S_ISDIR(next->d_inode->i_mode) && 574 } else if (!S_ISDIR(next->d_inode->i_mode) &&
575 !S_ISREG(next->d_inode->i_mode) 575 !S_ISREG(next->d_inode->i_mode)
576 ) { 576 ) {
577 pr_err("inode %lu is not a file or directory", 577 pr_err("inode %lu is not a file or directory\n",
578 next->d_inode->i_ino); 578 next->d_inode->i_ino);
579 ret = -ENOBUFS; 579 ret = -ENOBUFS;
580 goto error; 580 goto error;
581 } 581 }
582 } 582 }
583 583
584 /* process the next component */ 584 /* process the next component */
585 if (key) { 585 if (key) {
586 _debug("advance"); 586 _debug("advance");
587 mutex_unlock(&dir->d_inode->i_mutex); 587 mutex_unlock(&dir->d_inode->i_mutex);
588 dput(dir); 588 dput(dir);
589 dir = next; 589 dir = next;
590 next = NULL; 590 next = NULL;
591 goto advance; 591 goto advance;
592 } 592 }
593 593
594 /* we've found the object we were looking for */ 594 /* we've found the object we were looking for */
595 object->dentry = next; 595 object->dentry = next;
596 596
597 /* if we've found that the terminal object exists, then we need to 597 /* if we've found that the terminal object exists, then we need to
598 * check its attributes and delete it if it's out of date */ 598 * check its attributes and delete it if it's out of date */
599 if (!object->new) { 599 if (!object->new) {
600 _debug("validate '%*.*s'", 600 _debug("validate '%*.*s'",
601 next->d_name.len, next->d_name.len, next->d_name.name); 601 next->d_name.len, next->d_name.len, next->d_name.name);
602 602
603 ret = cachefiles_check_object_xattr(object, auxdata); 603 ret = cachefiles_check_object_xattr(object, auxdata);
604 if (ret == -ESTALE) { 604 if (ret == -ESTALE) {
605 /* delete the object (the deleter drops the directory 605 /* delete the object (the deleter drops the directory
606 * mutex) */ 606 * mutex) */
607 object->dentry = NULL; 607 object->dentry = NULL;
608 608
609 ret = cachefiles_bury_object(cache, dir, next, true); 609 ret = cachefiles_bury_object(cache, dir, next, true);
610 dput(next); 610 dput(next);
611 next = NULL; 611 next = NULL;
612 612
613 if (ret < 0) 613 if (ret < 0)
614 goto delete_error; 614 goto delete_error;
615 615
616 _debug("redo lookup"); 616 _debug("redo lookup");
617 goto lookup_again; 617 goto lookup_again;
618 } 618 }
619 } 619 }
620 620
621 /* note that we're now using this object */ 621 /* note that we're now using this object */
622 ret = cachefiles_mark_object_active(cache, object); 622 ret = cachefiles_mark_object_active(cache, object);
623 623
624 mutex_unlock(&dir->d_inode->i_mutex); 624 mutex_unlock(&dir->d_inode->i_mutex);
625 dput(dir); 625 dput(dir);
626 dir = NULL; 626 dir = NULL;
627 627
628 if (ret == -ETIMEDOUT) 628 if (ret == -ETIMEDOUT)
629 goto mark_active_timed_out; 629 goto mark_active_timed_out;
630 630
631 _debug("=== OBTAINED_OBJECT ==="); 631 _debug("=== OBTAINED_OBJECT ===");
632 632
633 if (object->new) { 633 if (object->new) {
634 /* attach data to a newly constructed terminal object */ 634 /* attach data to a newly constructed terminal object */
635 ret = cachefiles_set_object_xattr(object, auxdata); 635 ret = cachefiles_set_object_xattr(object, auxdata);
636 if (ret < 0) 636 if (ret < 0)
637 goto check_error; 637 goto check_error;
638 } else { 638 } else {
639 /* always update the atime on an object we've just looked up 639 /* always update the atime on an object we've just looked up
640 * (this is used to keep track of culling, and atimes are only 640 * (this is used to keep track of culling, and atimes are only
641 * updated by read, write and readdir but not lookup or 641 * updated by read, write and readdir but not lookup or
642 * open) */ 642 * open) */
643 path.dentry = next; 643 path.dentry = next;
644 touch_atime(&path); 644 touch_atime(&path);
645 } 645 }
646 646
647 /* open a file interface onto a data file */ 647 /* open a file interface onto a data file */
648 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { 648 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
649 if (S_ISREG(object->dentry->d_inode->i_mode)) { 649 if (S_ISREG(object->dentry->d_inode->i_mode)) {
650 const struct address_space_operations *aops; 650 const struct address_space_operations *aops;
651 651
652 ret = -EPERM; 652 ret = -EPERM;
653 aops = object->dentry->d_inode->i_mapping->a_ops; 653 aops = object->dentry->d_inode->i_mapping->a_ops;
654 if (!aops->bmap) 654 if (!aops->bmap)
655 goto check_error; 655 goto check_error;
656 656
657 object->backer = object->dentry; 657 object->backer = object->dentry;
658 } else { 658 } else {
659 BUG(); // TODO: open file in data-class subdir 659 BUG(); // TODO: open file in data-class subdir
660 } 660 }
661 } 661 }
662 662
663 object->new = 0; 663 object->new = 0;
664 fscache_obtained_object(&object->fscache); 664 fscache_obtained_object(&object->fscache);
665 665
666 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); 666 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
667 return 0; 667 return 0;
668 668
669 create_error: 669 create_error:
670 _debug("create error %d", ret); 670 _debug("create error %d", ret);
671 if (ret == -EIO) 671 if (ret == -EIO)
672 cachefiles_io_error(cache, "Create/mkdir failed"); 672 cachefiles_io_error(cache, "Create/mkdir failed");
673 goto error; 673 goto error;
674 674
675 mark_active_timed_out: 675 mark_active_timed_out:
676 _debug("mark active timed out"); 676 _debug("mark active timed out");
677 goto release_dentry; 677 goto release_dentry;
678 678
679 check_error: 679 check_error:
680 _debug("check error %d", ret); 680 _debug("check error %d", ret);
681 write_lock(&cache->active_lock); 681 write_lock(&cache->active_lock);
682 rb_erase(&object->active_node, &cache->active_nodes); 682 rb_erase(&object->active_node, &cache->active_nodes);
683 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); 683 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
684 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); 684 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
685 write_unlock(&cache->active_lock); 685 write_unlock(&cache->active_lock);
686 release_dentry: 686 release_dentry:
687 dput(object->dentry); 687 dput(object->dentry);
688 object->dentry = NULL; 688 object->dentry = NULL;
689 goto error_out; 689 goto error_out;
690 690
691 delete_error: 691 delete_error:
692 _debug("delete error %d", ret); 692 _debug("delete error %d", ret);
693 goto error_out2; 693 goto error_out2;
694 694
695 lookup_error: 695 lookup_error:
696 _debug("lookup error %ld", PTR_ERR(next)); 696 _debug("lookup error %ld", PTR_ERR(next));
697 ret = PTR_ERR(next); 697 ret = PTR_ERR(next);
698 if (ret == -EIO) 698 if (ret == -EIO)
699 cachefiles_io_error(cache, "Lookup failed"); 699 cachefiles_io_error(cache, "Lookup failed");
700 next = NULL; 700 next = NULL;
701 error: 701 error:
702 mutex_unlock(&dir->d_inode->i_mutex); 702 mutex_unlock(&dir->d_inode->i_mutex);
703 dput(next); 703 dput(next);
704 error_out2: 704 error_out2:
705 dput(dir); 705 dput(dir);
706 error_out: 706 error_out:
707 _leave(" = error %d", -ret); 707 _leave(" = error %d", -ret);
708 return ret; 708 return ret;
709 } 709 }
710 710
711 /* 711 /*
712 * get a subdirectory 712 * get a subdirectory
713 */ 713 */
714 struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, 714 struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
715 struct dentry *dir, 715 struct dentry *dir,
716 const char *dirname) 716 const char *dirname)
717 { 717 {
718 struct dentry *subdir; 718 struct dentry *subdir;
719 unsigned long start; 719 unsigned long start;
720 struct path path; 720 struct path path;
721 int ret; 721 int ret;
722 722
723 _enter(",,%s", dirname); 723 _enter(",,%s", dirname);
724 724
725 /* search the current directory for the element name */ 725 /* search the current directory for the element name */
726 mutex_lock(&dir->d_inode->i_mutex); 726 mutex_lock(&dir->d_inode->i_mutex);
727 727
728 start = jiffies; 728 start = jiffies;
729 subdir = lookup_one_len(dirname, dir, strlen(dirname)); 729 subdir = lookup_one_len(dirname, dir, strlen(dirname));
730 cachefiles_hist(cachefiles_lookup_histogram, start); 730 cachefiles_hist(cachefiles_lookup_histogram, start);
731 if (IS_ERR(subdir)) { 731 if (IS_ERR(subdir)) {
732 if (PTR_ERR(subdir) == -ENOMEM) 732 if (PTR_ERR(subdir) == -ENOMEM)
733 goto nomem_d_alloc; 733 goto nomem_d_alloc;
734 goto lookup_error; 734 goto lookup_error;
735 } 735 }
736 736
737 _debug("subdir -> %p %s", 737 _debug("subdir -> %p %s",
738 subdir, subdir->d_inode ? "positive" : "negative"); 738 subdir, subdir->d_inode ? "positive" : "negative");
739 739
740 /* we need to create the subdir if it doesn't exist yet */ 740 /* we need to create the subdir if it doesn't exist yet */
741 if (!subdir->d_inode) { 741 if (!subdir->d_inode) {
742 ret = cachefiles_has_space(cache, 1, 0); 742 ret = cachefiles_has_space(cache, 1, 0);
743 if (ret < 0) 743 if (ret < 0)
744 goto mkdir_error; 744 goto mkdir_error;
745 745
746 _debug("attempt mkdir"); 746 _debug("attempt mkdir");
747 747
748 path.mnt = cache->mnt; 748 path.mnt = cache->mnt;
749 path.dentry = dir; 749 path.dentry = dir;
750 ret = security_path_mkdir(&path, subdir, 0700); 750 ret = security_path_mkdir(&path, subdir, 0700);
751 if (ret < 0) 751 if (ret < 0)
752 goto mkdir_error; 752 goto mkdir_error;
753 ret = vfs_mkdir(dir->d_inode, subdir, 0700); 753 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
754 if (ret < 0) 754 if (ret < 0)
755 goto mkdir_error; 755 goto mkdir_error;
756 756
757 ASSERT(subdir->d_inode); 757 ASSERT(subdir->d_inode);
758 758
759 _debug("mkdir -> %p{%p{ino=%lu}}", 759 _debug("mkdir -> %p{%p{ino=%lu}}",
760 subdir, 760 subdir,
761 subdir->d_inode, 761 subdir->d_inode,
762 subdir->d_inode->i_ino); 762 subdir->d_inode->i_ino);
763 } 763 }
764 764
765 mutex_unlock(&dir->d_inode->i_mutex); 765 mutex_unlock(&dir->d_inode->i_mutex);
766 766
767 /* we need to make sure the subdir is a directory */ 767 /* we need to make sure the subdir is a directory */
768 ASSERT(subdir->d_inode); 768 ASSERT(subdir->d_inode);
769 769
770 if (!S_ISDIR(subdir->d_inode->i_mode)) { 770 if (!S_ISDIR(subdir->d_inode->i_mode)) {
771 pr_err("%s is not a directory", dirname); 771 pr_err("%s is not a directory\n", dirname);
772 ret = -EIO; 772 ret = -EIO;
773 goto check_error; 773 goto check_error;
774 } 774 }
775 775
776 ret = -EPERM; 776 ret = -EPERM;
777 if (!subdir->d_inode->i_op->setxattr || 777 if (!subdir->d_inode->i_op->setxattr ||
778 !subdir->d_inode->i_op->getxattr || 778 !subdir->d_inode->i_op->getxattr ||
779 !subdir->d_inode->i_op->lookup || 779 !subdir->d_inode->i_op->lookup ||
780 !subdir->d_inode->i_op->mkdir || 780 !subdir->d_inode->i_op->mkdir ||
781 !subdir->d_inode->i_op->create || 781 !subdir->d_inode->i_op->create ||
782 (!subdir->d_inode->i_op->rename && 782 (!subdir->d_inode->i_op->rename &&
783 !subdir->d_inode->i_op->rename2) || 783 !subdir->d_inode->i_op->rename2) ||
784 !subdir->d_inode->i_op->rmdir || 784 !subdir->d_inode->i_op->rmdir ||
785 !subdir->d_inode->i_op->unlink) 785 !subdir->d_inode->i_op->unlink)
786 goto check_error; 786 goto check_error;
787 787
788 _leave(" = [%lu]", subdir->d_inode->i_ino); 788 _leave(" = [%lu]", subdir->d_inode->i_ino);
789 return subdir; 789 return subdir;
790 790
791 check_error: 791 check_error:
792 dput(subdir); 792 dput(subdir);
793 _leave(" = %d [check]", ret); 793 _leave(" = %d [check]", ret);
794 return ERR_PTR(ret); 794 return ERR_PTR(ret);
795 795
796 mkdir_error: 796 mkdir_error:
797 mutex_unlock(&dir->d_inode->i_mutex); 797 mutex_unlock(&dir->d_inode->i_mutex);
798 dput(subdir); 798 dput(subdir);
799 pr_err("mkdir %s failed with error %d", dirname, ret); 799 pr_err("mkdir %s failed with error %d\n", dirname, ret);
800 return ERR_PTR(ret); 800 return ERR_PTR(ret);
801 801
802 lookup_error: 802 lookup_error:
803 mutex_unlock(&dir->d_inode->i_mutex); 803 mutex_unlock(&dir->d_inode->i_mutex);
804 ret = PTR_ERR(subdir); 804 ret = PTR_ERR(subdir);
805 pr_err("Lookup %s failed with error %d", dirname, ret); 805 pr_err("Lookup %s failed with error %d\n", dirname, ret);
806 return ERR_PTR(ret); 806 return ERR_PTR(ret);
807 807
808 nomem_d_alloc: 808 nomem_d_alloc:
809 mutex_unlock(&dir->d_inode->i_mutex); 809 mutex_unlock(&dir->d_inode->i_mutex);
810 _leave(" = -ENOMEM"); 810 _leave(" = -ENOMEM");
811 return ERR_PTR(-ENOMEM); 811 return ERR_PTR(-ENOMEM);
812 } 812 }
813 813
814 /* 814 /*
815 * find out if an object is in use or not 815 * find out if an object is in use or not
816 * - if finds object and it's not in use: 816 * - if finds object and it's not in use:
817 * - returns a pointer to the object and a reference on it 817 * - returns a pointer to the object and a reference on it
818 * - returns with the directory locked 818 * - returns with the directory locked
819 */ 819 */
820 static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, 820 static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
821 struct dentry *dir, 821 struct dentry *dir,
822 char *filename) 822 char *filename)
823 { 823 {
824 struct cachefiles_object *object; 824 struct cachefiles_object *object;
825 struct rb_node *_n; 825 struct rb_node *_n;
826 struct dentry *victim; 826 struct dentry *victim;
827 unsigned long start; 827 unsigned long start;
828 int ret; 828 int ret;
829 829
830 //_enter(",%*.*s/,%s", 830 //_enter(",%*.*s/,%s",
831 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 831 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
832 832
833 /* look up the victim */ 833 /* look up the victim */
834 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 834 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
835 835
836 start = jiffies; 836 start = jiffies;
837 victim = lookup_one_len(filename, dir, strlen(filename)); 837 victim = lookup_one_len(filename, dir, strlen(filename));
838 cachefiles_hist(cachefiles_lookup_histogram, start); 838 cachefiles_hist(cachefiles_lookup_histogram, start);
839 if (IS_ERR(victim)) 839 if (IS_ERR(victim))
840 goto lookup_error; 840 goto lookup_error;
841 841
842 //_debug("victim -> %p %s", 842 //_debug("victim -> %p %s",
843 // victim, victim->d_inode ? "positive" : "negative"); 843 // victim, victim->d_inode ? "positive" : "negative");
844 844
845 /* if the object is no longer there then we probably retired the object 845 /* if the object is no longer there then we probably retired the object
846 * at the netfs's request whilst the cull was in progress 846 * at the netfs's request whilst the cull was in progress
847 */ 847 */
848 if (!victim->d_inode) { 848 if (!victim->d_inode) {
849 mutex_unlock(&dir->d_inode->i_mutex); 849 mutex_unlock(&dir->d_inode->i_mutex);
850 dput(victim); 850 dput(victim);
851 _leave(" = -ENOENT [absent]"); 851 _leave(" = -ENOENT [absent]");
852 return ERR_PTR(-ENOENT); 852 return ERR_PTR(-ENOENT);
853 } 853 }
854 854
855 /* check to see if we're using this object */ 855 /* check to see if we're using this object */
856 read_lock(&cache->active_lock); 856 read_lock(&cache->active_lock);
857 857
858 _n = cache->active_nodes.rb_node; 858 _n = cache->active_nodes.rb_node;
859 859
860 while (_n) { 860 while (_n) {
861 object = rb_entry(_n, struct cachefiles_object, active_node); 861 object = rb_entry(_n, struct cachefiles_object, active_node);
862 862
863 if (object->dentry > victim) 863 if (object->dentry > victim)
864 _n = _n->rb_left; 864 _n = _n->rb_left;
865 else if (object->dentry < victim) 865 else if (object->dentry < victim)
866 _n = _n->rb_right; 866 _n = _n->rb_right;
867 else 867 else
868 goto object_in_use; 868 goto object_in_use;
869 } 869 }
870 870
871 read_unlock(&cache->active_lock); 871 read_unlock(&cache->active_lock);
872 872
873 //_leave(" = %p", victim); 873 //_leave(" = %p", victim);
874 return victim; 874 return victim;
875 875
876 object_in_use: 876 object_in_use:
877 read_unlock(&cache->active_lock); 877 read_unlock(&cache->active_lock);
878 mutex_unlock(&dir->d_inode->i_mutex); 878 mutex_unlock(&dir->d_inode->i_mutex);
879 dput(victim); 879 dput(victim);
880 //_leave(" = -EBUSY [in use]"); 880 //_leave(" = -EBUSY [in use]");
881 return ERR_PTR(-EBUSY); 881 return ERR_PTR(-EBUSY);
882 882
883 lookup_error: 883 lookup_error:
884 mutex_unlock(&dir->d_inode->i_mutex); 884 mutex_unlock(&dir->d_inode->i_mutex);
885 ret = PTR_ERR(victim); 885 ret = PTR_ERR(victim);
886 if (ret == -ENOENT) { 886 if (ret == -ENOENT) {
887 /* file or dir now absent - probably retired by netfs */ 887 /* file or dir now absent - probably retired by netfs */
888 _leave(" = -ESTALE [absent]"); 888 _leave(" = -ESTALE [absent]");
889 return ERR_PTR(-ESTALE); 889 return ERR_PTR(-ESTALE);
890 } 890 }
891 891
892 if (ret == -EIO) { 892 if (ret == -EIO) {
893 cachefiles_io_error(cache, "Lookup failed"); 893 cachefiles_io_error(cache, "Lookup failed");
894 } else if (ret != -ENOMEM) { 894 } else if (ret != -ENOMEM) {
895 pr_err("Internal error: %d", ret); 895 pr_err("Internal error: %d\n", ret);
896 ret = -EIO; 896 ret = -EIO;
897 } 897 }
898 898
899 _leave(" = %d", ret); 899 _leave(" = %d", ret);
900 return ERR_PTR(ret); 900 return ERR_PTR(ret);
901 } 901 }
902 902
903 /* 903 /*
904 * cull an object if it's not in use 904 * cull an object if it's not in use
905 * - called only by cache manager daemon 905 * - called only by cache manager daemon
906 */ 906 */
907 int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, 907 int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
908 char *filename) 908 char *filename)
909 { 909 {
910 struct dentry *victim; 910 struct dentry *victim;
911 int ret; 911 int ret;
912 912
913 _enter(",%*.*s/,%s", 913 _enter(",%*.*s/,%s",
914 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 914 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
915 915
916 victim = cachefiles_check_active(cache, dir, filename); 916 victim = cachefiles_check_active(cache, dir, filename);
917 if (IS_ERR(victim)) 917 if (IS_ERR(victim))
918 return PTR_ERR(victim); 918 return PTR_ERR(victim);
919 919
920 _debug("victim -> %p %s", 920 _debug("victim -> %p %s",
921 victim, victim->d_inode ? "positive" : "negative"); 921 victim, victim->d_inode ? "positive" : "negative");
922 922
923 /* okay... the victim is not being used so we can cull it 923 /* okay... the victim is not being used so we can cull it
924 * - start by marking it as stale 924 * - start by marking it as stale
925 */ 925 */
926 _debug("victim is cullable"); 926 _debug("victim is cullable");
927 927
928 ret = cachefiles_remove_object_xattr(cache, victim); 928 ret = cachefiles_remove_object_xattr(cache, victim);
929 if (ret < 0) 929 if (ret < 0)
930 goto error_unlock; 930 goto error_unlock;
931 931
932 /* actually remove the victim (drops the dir mutex) */ 932 /* actually remove the victim (drops the dir mutex) */
933 _debug("bury"); 933 _debug("bury");
934 934
935 ret = cachefiles_bury_object(cache, dir, victim, false); 935 ret = cachefiles_bury_object(cache, dir, victim, false);
936 if (ret < 0) 936 if (ret < 0)
937 goto error; 937 goto error;
938 938
939 dput(victim); 939 dput(victim);
940 _leave(" = 0"); 940 _leave(" = 0");
941 return 0; 941 return 0;
942 942
943 error_unlock: 943 error_unlock:
944 mutex_unlock(&dir->d_inode->i_mutex); 944 mutex_unlock(&dir->d_inode->i_mutex);
945 error: 945 error:
946 dput(victim); 946 dput(victim);
947 if (ret == -ENOENT) { 947 if (ret == -ENOENT) {
948 /* file or dir now absent - probably retired by netfs */ 948 /* file or dir now absent - probably retired by netfs */
949 _leave(" = -ESTALE [absent]"); 949 _leave(" = -ESTALE [absent]");
950 return -ESTALE; 950 return -ESTALE;
951 } 951 }
952 952
953 if (ret != -ENOMEM) { 953 if (ret != -ENOMEM) {
954 pr_err("Internal error: %d", ret); 954 pr_err("Internal error: %d\n", ret);
955 ret = -EIO; 955 ret = -EIO;
956 } 956 }
957 957
958 _leave(" = %d", ret); 958 _leave(" = %d", ret);
959 return ret; 959 return ret;
960 } 960 }
961 961
962 /* 962 /*
963 * find out if an object is in use or not 963 * find out if an object is in use or not
964 * - called only by cache manager daemon 964 * - called only by cache manager daemon
965 * - returns -EBUSY or 0 to indicate whether an object is in use or not 965 * - returns -EBUSY or 0 to indicate whether an object is in use or not
966 */ 966 */
967 int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, 967 int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
968 char *filename) 968 char *filename)
969 { 969 {
970 struct dentry *victim; 970 struct dentry *victim;
971 971
972 //_enter(",%*.*s/,%s", 972 //_enter(",%*.*s/,%s",
973 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 973 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
974 974
975 victim = cachefiles_check_active(cache, dir, filename); 975 victim = cachefiles_check_active(cache, dir, filename);
976 if (IS_ERR(victim)) 976 if (IS_ERR(victim))
977 return PTR_ERR(victim); 977 return PTR_ERR(victim);
978 978
979 mutex_unlock(&dir->d_inode->i_mutex); 979 mutex_unlock(&dir->d_inode->i_mutex);
980 dput(victim); 980 dput(victim);
981 //_leave(" = 0"); 981 //_leave(" = 0");
982 return 0; 982 return 0;
983 } 983 }
984 984
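Nearly every cachefiles hunk in this commit only appends a "\n" to a pr_err() format string. printk() treats an unterminated message as a candidate for continuation, so a missing newline can leave the next, unrelated message glued onto the same log line. A minimal, hypothetical sketch of the pattern being fixed (the function name and module context are illustrative, not part of the patch):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

/* Illustrative sketch only: contrasts the pre- and post-patch calls. */
static void log_example(unsigned long ino)
{
	/* before: no terminator, so a later printk may be appended here */
	pr_err("inode %lu is not a directory", ino);

	/* after: explicitly terminated, as in the conversions above */
	pr_err("inode %lu is not a directory\n", ino);
}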
fs/cachefiles/xattr.c
1 /* CacheFiles extended attribute management 1 /* CacheFiles extended attribute management
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/file.h> 14 #include <linux/file.h>
15 #include <linux/fs.h> 15 #include <linux/fs.h>
16 #include <linux/fsnotify.h> 16 #include <linux/fsnotify.h>
17 #include <linux/quotaops.h> 17 #include <linux/quotaops.h>
18 #include <linux/xattr.h> 18 #include <linux/xattr.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include "internal.h" 20 #include "internal.h"
21 21
22 static const char cachefiles_xattr_cache[] = 22 static const char cachefiles_xattr_cache[] =
23 XATTR_USER_PREFIX "CacheFiles.cache"; 23 XATTR_USER_PREFIX "CacheFiles.cache";
24 24
25 /* 25 /*
26 * check the type label on an object 26 * check the type label on an object
27 * - done using xattrs 27 * - done using xattrs
28 */ 28 */
29 int cachefiles_check_object_type(struct cachefiles_object *object) 29 int cachefiles_check_object_type(struct cachefiles_object *object)
30 { 30 {
31 struct dentry *dentry = object->dentry; 31 struct dentry *dentry = object->dentry;
32 char type[3], xtype[3]; 32 char type[3], xtype[3];
33 int ret; 33 int ret;
34 34
35 ASSERT(dentry); 35 ASSERT(dentry);
36 ASSERT(dentry->d_inode); 36 ASSERT(dentry->d_inode);
37 37
38 if (!object->fscache.cookie) 38 if (!object->fscache.cookie)
39 strcpy(type, "C3"); 39 strcpy(type, "C3");
40 else 40 else
41 snprintf(type, 3, "%02x", object->fscache.cookie->def->type); 41 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
42 42
43 _enter("%p{%s}", object, type); 43 _enter("%p{%s}", object, type);
44 44
45 /* attempt to install a type label directly */ 45 /* attempt to install a type label directly */
46 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, 46 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
47 XATTR_CREATE); 47 XATTR_CREATE);
48 if (ret == 0) { 48 if (ret == 0) {
49 _debug("SET"); /* we succeeded */ 49 _debug("SET"); /* we succeeded */
50 goto error; 50 goto error;
51 } 51 }
52 52
53 if (ret != -EEXIST) { 53 if (ret != -EEXIST) {
54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)", 54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
55 dentry->d_name.len, dentry->d_name.len, 55 dentry->d_name.len, dentry->d_name.len,
56 dentry->d_name.name, dentry->d_inode->i_ino, 56 dentry->d_name.name, dentry->d_inode->i_ino,
57 -ret); 57 -ret);
58 goto error; 58 goto error;
59 } 59 }
60 60
61 /* read the current type label */ 61 /* read the current type label */
62 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); 62 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
63 if (ret < 0) { 63 if (ret < 0) {
64 if (ret == -ERANGE) 64 if (ret == -ERANGE)
65 goto bad_type_length; 65 goto bad_type_length;
66 66
67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)", 67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
68 dentry->d_name.len, dentry->d_name.len, 68 dentry->d_name.len, dentry->d_name.len,
69 dentry->d_name.name, dentry->d_inode->i_ino, 69 dentry->d_name.name, dentry->d_inode->i_ino,
70 -ret); 70 -ret);
71 goto error; 71 goto error;
72 } 72 }
73 73
74 /* check the type is what we're expecting */ 74 /* check the type is what we're expecting */
75 if (ret != 2) 75 if (ret != 2)
76 goto bad_type_length; 76 goto bad_type_length;
77 77
78 if (xtype[0] != type[0] || xtype[1] != type[1]) 78 if (xtype[0] != type[0] || xtype[1] != type[1])
79 goto bad_type; 79 goto bad_type;
80 80
81 ret = 0; 81 ret = 0;
82 82
83 error: 83 error:
84 _leave(" = %d", ret); 84 _leave(" = %d", ret);
85 return ret; 85 return ret;
86 86
87 bad_type_length: 87 bad_type_length:
88 pr_err("Cache object %lu type xattr length incorrect", 88 pr_err("Cache object %lu type xattr length incorrect\n",
89 dentry->d_inode->i_ino); 89 dentry->d_inode->i_ino);
90 ret = -EIO; 90 ret = -EIO;
91 goto error; 91 goto error;
92 92
93 bad_type: 93 bad_type:
94 xtype[2] = 0; 94 xtype[2] = 0;
95 pr_err("Cache object %*.*s [%lu] type %s not %s", 95 pr_err("Cache object %*.*s [%lu] type %s not %s\n",
96 dentry->d_name.len, dentry->d_name.len, 96 dentry->d_name.len, dentry->d_name.len,
97 dentry->d_name.name, dentry->d_inode->i_ino, 97 dentry->d_name.name, dentry->d_inode->i_ino,
98 xtype, type); 98 xtype, type);
99 ret = -EIO; 99 ret = -EIO;
100 goto error; 100 goto error;
101 } 101 }
102 102
103 /* 103 /*
104 * set the state xattr on a cache file 104 * set the state xattr on a cache file
105 */ 105 */
106 int cachefiles_set_object_xattr(struct cachefiles_object *object, 106 int cachefiles_set_object_xattr(struct cachefiles_object *object,
107 struct cachefiles_xattr *auxdata) 107 struct cachefiles_xattr *auxdata)
108 { 108 {
109 struct dentry *dentry = object->dentry; 109 struct dentry *dentry = object->dentry;
110 int ret; 110 int ret;
111 111
112 ASSERT(dentry); 112 ASSERT(dentry);
113 113
114 _enter("%p,#%d", object, auxdata->len); 114 _enter("%p,#%d", object, auxdata->len);
115 115
116 /* attempt to install the cache metadata directly */ 116 /* attempt to install the cache metadata directly */
117 _debug("SET #%u", auxdata->len); 117 _debug("SET #%u", auxdata->len);
118 118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len, 120 &auxdata->type, auxdata->len,
121 XATTR_CREATE); 121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM) 122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj( 123 cachefiles_io_error_obj(
124 object, 124 object,
125 "Failed to set xattr with error %d", ret); 125 "Failed to set xattr with error %d", ret);
126 126
127 _leave(" = %d", ret); 127 _leave(" = %d", ret);
128 return ret; 128 return ret;
129 } 129 }
130 130
131 /* 131 /*
132 * update the state xattr on a cache file 132 * update the state xattr on a cache file
133 */ 133 */
134 int cachefiles_update_object_xattr(struct cachefiles_object *object, 134 int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata) 135 struct cachefiles_xattr *auxdata)
136 { 136 {
137 struct dentry *dentry = object->dentry; 137 struct dentry *dentry = object->dentry;
138 int ret; 138 int ret;
139 139
140 ASSERT(dentry); 140 ASSERT(dentry);
141 141
142 _enter("%p,#%d", object, auxdata->len); 142 _enter("%p,#%d", object, auxdata->len);
143 143
144 /* attempt to install the cache metadata directly */ 144 /* attempt to install the cache metadata directly */
145 _debug("SET #%u", auxdata->len); 145 _debug("SET #%u", auxdata->len);
146 146
147 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 147 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
148 &auxdata->type, auxdata->len, 148 &auxdata->type, auxdata->len,
149 XATTR_REPLACE); 149 XATTR_REPLACE);
150 if (ret < 0 && ret != -ENOMEM) 150 if (ret < 0 && ret != -ENOMEM)
151 cachefiles_io_error_obj( 151 cachefiles_io_error_obj(
152 object, 152 object,
153 "Failed to update xattr with error %d", ret); 153 "Failed to update xattr with error %d", ret);
154 154
155 _leave(" = %d", ret); 155 _leave(" = %d", ret);
156 return ret; 156 return ret;
157 } 157 }
158 158
159 /* 159 /*
160 * check the consistency between the backing cache and the FS-Cache cookie 160 * check the consistency between the backing cache and the FS-Cache cookie
161 */ 161 */
162 int cachefiles_check_auxdata(struct cachefiles_object *object) 162 int cachefiles_check_auxdata(struct cachefiles_object *object)
163 { 163 {
164 struct cachefiles_xattr *auxbuf; 164 struct cachefiles_xattr *auxbuf;
165 enum fscache_checkaux validity; 165 enum fscache_checkaux validity;
166 struct dentry *dentry = object->dentry; 166 struct dentry *dentry = object->dentry;
167 ssize_t xlen; 167 ssize_t xlen;
168 int ret; 168 int ret;
169 169
170 ASSERT(dentry); 170 ASSERT(dentry);
171 ASSERT(dentry->d_inode); 171 ASSERT(dentry->d_inode);
172 ASSERT(object->fscache.cookie->def->check_aux); 172 ASSERT(object->fscache.cookie->def->check_aux);
173 173
174 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); 174 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
175 if (!auxbuf) 175 if (!auxbuf)
176 return -ENOMEM; 176 return -ENOMEM;
177 177
178 xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, 178 xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
179 &auxbuf->type, 512 + 1); 179 &auxbuf->type, 512 + 1);
180 ret = -ESTALE; 180 ret = -ESTALE;
181 if (xlen < 1 || 181 if (xlen < 1 ||
182 auxbuf->type != object->fscache.cookie->def->type) 182 auxbuf->type != object->fscache.cookie->def->type)
183 goto error; 183 goto error;
184 184
185 xlen--; 185 xlen--;
186 validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); 186 validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
187 if (validity != FSCACHE_CHECKAUX_OKAY) 187 if (validity != FSCACHE_CHECKAUX_OKAY)
188 goto error; 188 goto error;
189 189
190 ret = 0; 190 ret = 0;
191 error: 191 error:
192 kfree(auxbuf); 192 kfree(auxbuf);
193 return ret; 193 return ret;
194 } 194 }
195 195
196 /* 196 /*
197 * check the state xattr on a cache file 197 * check the state xattr on a cache file
198 * - return -ESTALE if the object should be deleted 198 * - return -ESTALE if the object should be deleted
199 */ 199 */
200 int cachefiles_check_object_xattr(struct cachefiles_object *object, 200 int cachefiles_check_object_xattr(struct cachefiles_object *object,
201 struct cachefiles_xattr *auxdata) 201 struct cachefiles_xattr *auxdata)
202 { 202 {
203 struct cachefiles_xattr *auxbuf; 203 struct cachefiles_xattr *auxbuf;
204 struct dentry *dentry = object->dentry; 204 struct dentry *dentry = object->dentry;
205 int ret; 205 int ret;
206 206
207 _enter("%p,#%d", object, auxdata->len); 207 _enter("%p,#%d", object, auxdata->len);
208 208
209 ASSERT(dentry); 209 ASSERT(dentry);
210 ASSERT(dentry->d_inode); 210 ASSERT(dentry->d_inode);
211 211
212 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); 212 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
213 if (!auxbuf) { 213 if (!auxbuf) {
214 _leave(" = -ENOMEM"); 214 _leave(" = -ENOMEM");
215 return -ENOMEM; 215 return -ENOMEM;
216 } 216 }
217 217
218 /* read the current type label */ 218 /* read the current type label */
219 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, 219 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
220 &auxbuf->type, 512 + 1); 220 &auxbuf->type, 512 + 1);
221 if (ret < 0) { 221 if (ret < 0) {
222 if (ret == -ENODATA) 222 if (ret == -ENODATA)
223 goto stale; /* no attribute - power went off 223 goto stale; /* no attribute - power went off
224 * mid-cull? */ 224 * mid-cull? */
225 225
226 if (ret == -ERANGE) 226 if (ret == -ERANGE)
227 goto bad_type_length; 227 goto bad_type_length;
228 228
229 cachefiles_io_error_obj(object, 229 cachefiles_io_error_obj(object,
230 "Can't read xattr on %lu (err %d)", 230 "Can't read xattr on %lu (err %d)",
231 dentry->d_inode->i_ino, -ret); 231 dentry->d_inode->i_ino, -ret);
232 goto error; 232 goto error;
233 } 233 }
234 234
235 /* check the on-disk object */ 235 /* check the on-disk object */
236 if (ret < 1) 236 if (ret < 1)
237 goto bad_type_length; 237 goto bad_type_length;
238 238
239 if (auxbuf->type != auxdata->type) 239 if (auxbuf->type != auxdata->type)
240 goto stale; 240 goto stale;
241 241
242 auxbuf->len = ret; 242 auxbuf->len = ret;
243 243
244 /* consult the netfs */ 244 /* consult the netfs */
245 if (object->fscache.cookie->def->check_aux) { 245 if (object->fscache.cookie->def->check_aux) {
246 enum fscache_checkaux result; 246 enum fscache_checkaux result;
247 unsigned int dlen; 247 unsigned int dlen;
248 248
249 dlen = auxbuf->len - 1; 249 dlen = auxbuf->len - 1;
250 250
251 _debug("checkaux %s #%u", 251 _debug("checkaux %s #%u",
252 object->fscache.cookie->def->name, dlen); 252 object->fscache.cookie->def->name, dlen);
253 253
254 result = fscache_check_aux(&object->fscache, 254 result = fscache_check_aux(&object->fscache,
255 &auxbuf->data, dlen); 255 &auxbuf->data, dlen);
256 256
257 switch (result) { 257 switch (result) {
258 /* entry okay as is */ 258 /* entry okay as is */
259 case FSCACHE_CHECKAUX_OKAY: 259 case FSCACHE_CHECKAUX_OKAY:
260 goto okay; 260 goto okay;
261 261
262 /* entry requires update */ 262 /* entry requires update */
263 case FSCACHE_CHECKAUX_NEEDS_UPDATE: 263 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
264 break; 264 break;
265 265
266 /* entry requires deletion */ 266 /* entry requires deletion */
267 case FSCACHE_CHECKAUX_OBSOLETE: 267 case FSCACHE_CHECKAUX_OBSOLETE:
268 goto stale; 268 goto stale;
269 269
270 default: 270 default:
271 BUG(); 271 BUG();
272 } 272 }
273 273
274 /* update the current label */ 274 /* update the current label */
275 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 275 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
276 &auxdata->type, auxdata->len, 276 &auxdata->type, auxdata->len,
277 XATTR_REPLACE); 277 XATTR_REPLACE);
278 if (ret < 0) { 278 if (ret < 0) {
279 cachefiles_io_error_obj(object, 279 cachefiles_io_error_obj(object,
280 "Can't update xattr on %lu" 280 "Can't update xattr on %lu"
281 " (error %d)", 281 " (error %d)",
282 dentry->d_inode->i_ino, -ret); 282 dentry->d_inode->i_ino, -ret);
283 goto error; 283 goto error;
284 } 284 }
285 } 285 }
286 286
287 okay: 287 okay:
288 ret = 0; 288 ret = 0;
289 289
290 error: 290 error:
291 kfree(auxbuf); 291 kfree(auxbuf);
292 _leave(" = %d", ret); 292 _leave(" = %d", ret);
293 return ret; 293 return ret;
294 294
295 bad_type_length: 295 bad_type_length:
296 pr_err("Cache object %lu xattr length incorrect", 296 pr_err("Cache object %lu xattr length incorrect\n",
297 dentry->d_inode->i_ino); 297 dentry->d_inode->i_ino);
298 ret = -EIO; 298 ret = -EIO;
299 goto error; 299 goto error;
300 300
301 stale: 301 stale:
302 ret = -ESTALE; 302 ret = -ESTALE;
303 goto error; 303 goto error;
304 } 304 }
305 305
306 /* 306 /*
307 * remove the object's xattr to mark it stale 307 * remove the object's xattr to mark it stale
308 */ 308 */
309 int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, 309 int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
310 struct dentry *dentry) 310 struct dentry *dentry)
311 { 311 {
312 int ret; 312 int ret;
313 313
314 ret = vfs_removexattr(dentry, cachefiles_xattr_cache); 314 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
315 if (ret < 0) { 315 if (ret < 0) {
316 if (ret == -ENOENT || ret == -ENODATA) 316 if (ret == -ENOENT || ret == -ENODATA)
317 ret = 0; 317 ret = 0;
318 else if (ret != -ENOMEM) 318 else if (ret != -ENOMEM)
319 cachefiles_io_error(cache, 319 cachefiles_io_error(cache,
320 "Can't remove xattr from %lu" 320 "Can't remove xattr from %lu"
321 " (error %d)", 321 " (error %d)",
322 dentry->d_inode->i_ino, -ret); 322 dentry->d_inode->i_ino, -ret);
323 } 323 }
324 324
325 _leave(" = %d", ret); 325 _leave(" = %d", ret);
326 return ret; 326 return ret;
327 } 327 }
328 328
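The helpers above read the whole "user.CacheFiles.cache" value into a buffer starting at &auxbuf->type with a 512 + 1 byte limit, then treat byte 0 as the cookie type and the rest as netfs auxiliary data (hence dlen = len - 1); only the type byte plus the data are stored on disk, while len is in-memory bookkeeping. A hedged sketch of the implied layout, using an illustrative struct name rather than the kernel's own cachefiles_xattr definition:

#include <linux/types.h>

/* Illustrative layout only; the real type is struct cachefiles_xattr. */
struct cachefiles_xattr_layout {
	uint16_t	len;	/* in-memory only: bytes read/written (type + data) */
	uint8_t		type;	/* FS-Cache cookie type label */
	uint8_t		data[];	/* netfs-supplied auxiliary data */
};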
fs/nilfs2/inode.c
1 /* 1 /*
2 * inode.c - NILFS inode operations. 2 * inode.c - NILFS inode operations.
3 * 3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or 8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version. 9 * (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net> 20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/buffer_head.h> 24 #include <linux/buffer_head.h>
25 #include <linux/gfp.h> 25 #include <linux/gfp.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/pagemap.h>
27 #include <linux/writeback.h> 28 #include <linux/writeback.h>
28 #include <linux/aio.h> 29 #include <linux/aio.h>
29 #include "nilfs.h" 30 #include "nilfs.h"
30 #include "btnode.h" 31 #include "btnode.h"
31 #include "segment.h" 32 #include "segment.h"
32 #include "page.h" 33 #include "page.h"
33 #include "mdt.h" 34 #include "mdt.h"
34 #include "cpfile.h" 35 #include "cpfile.h"
35 #include "ifile.h" 36 #include "ifile.h"
36 37
37 /** 38 /**
38 * struct nilfs_iget_args - arguments used during comparison between inodes 39 * struct nilfs_iget_args - arguments used during comparison between inodes
39 * @ino: inode number 40 * @ino: inode number
40 * @cno: checkpoint number 41 * @cno: checkpoint number
41 * @root: pointer on NILFS root object (mounted checkpoint) 42 * @root: pointer on NILFS root object (mounted checkpoint)
42 * @for_gc: inode for GC flag 43 * @for_gc: inode for GC flag
43 */ 44 */
44 struct nilfs_iget_args { 45 struct nilfs_iget_args {
45 u64 ino; 46 u64 ino;
46 __u64 cno; 47 __u64 cno;
47 struct nilfs_root *root; 48 struct nilfs_root *root;
48 int for_gc; 49 int for_gc;
49 }; 50 };
50 51
51 void nilfs_inode_add_blocks(struct inode *inode, int n) 52 void nilfs_inode_add_blocks(struct inode *inode, int n)
52 { 53 {
53 struct nilfs_root *root = NILFS_I(inode)->i_root; 54 struct nilfs_root *root = NILFS_I(inode)->i_root;
54 55
55 inode_add_bytes(inode, (1 << inode->i_blkbits) * n); 56 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
56 if (root) 57 if (root)
57 atomic64_add(n, &root->blocks_count); 58 atomic64_add(n, &root->blocks_count);
58 } 59 }
59 60
60 void nilfs_inode_sub_blocks(struct inode *inode, int n) 61 void nilfs_inode_sub_blocks(struct inode *inode, int n)
61 { 62 {
62 struct nilfs_root *root = NILFS_I(inode)->i_root; 63 struct nilfs_root *root = NILFS_I(inode)->i_root;
63 64
64 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); 65 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
65 if (root) 66 if (root)
66 atomic64_sub(n, &root->blocks_count); 67 atomic64_sub(n, &root->blocks_count);
67 } 68 }
68 69
69 /** 70 /**
70 * nilfs_get_block() - get a file block on the filesystem (callback function) 71 * nilfs_get_block() - get a file block on the filesystem (callback function)
71 * @inode - inode struct of the target file 72 * @inode - inode struct of the target file
72 * @blkoff - file block number 73 * @blkoff - file block number
73 * @bh_result - buffer head to be mapped on 74 * @bh_result - buffer head to be mapped on
74 * @create - indicate whether allocating the block or not when it has not 75 * @create - indicate whether allocating the block or not when it has not
75 * been allocated yet. 76 * been allocated yet.
76 * 77 *
77 * This function does not issue actual read request of the specified data 78 * This function does not issue actual read request of the specified data
78 * block. It is done by VFS. 79 * block. It is done by VFS.
79 */ 80 */
80 int nilfs_get_block(struct inode *inode, sector_t blkoff, 81 int nilfs_get_block(struct inode *inode, sector_t blkoff,
81 struct buffer_head *bh_result, int create) 82 struct buffer_head *bh_result, int create)
82 { 83 {
83 struct nilfs_inode_info *ii = NILFS_I(inode); 84 struct nilfs_inode_info *ii = NILFS_I(inode);
84 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 85 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
85 __u64 blknum = 0; 86 __u64 blknum = 0;
86 int err = 0, ret; 87 int err = 0, ret;
87 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 88 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
88 89
89 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 90 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
90 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); 91 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
91 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 92 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
92 if (ret >= 0) { /* found */ 93 if (ret >= 0) { /* found */
93 map_bh(bh_result, inode->i_sb, blknum); 94 map_bh(bh_result, inode->i_sb, blknum);
94 if (ret > 0) 95 if (ret > 0)
95 bh_result->b_size = (ret << inode->i_blkbits); 96 bh_result->b_size = (ret << inode->i_blkbits);
96 goto out; 97 goto out;
97 } 98 }
98 /* data block was not found */ 99 /* data block was not found */
99 if (ret == -ENOENT && create) { 100 if (ret == -ENOENT && create) {
100 struct nilfs_transaction_info ti; 101 struct nilfs_transaction_info ti;
101 102
102 bh_result->b_blocknr = 0; 103 bh_result->b_blocknr = 0;
103 err = nilfs_transaction_begin(inode->i_sb, &ti, 1); 104 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
104 if (unlikely(err)) 105 if (unlikely(err))
105 goto out; 106 goto out;
106 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, 107 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
107 (unsigned long)bh_result); 108 (unsigned long)bh_result);
108 if (unlikely(err != 0)) { 109 if (unlikely(err != 0)) {
109 if (err == -EEXIST) { 110 if (err == -EEXIST) {
110 /* 111 /*
111 * The get_block() function could be called 112 * The get_block() function could be called
112 * from multiple callers for an inode. 113 * from multiple callers for an inode.
113 * However, the page having this block must 114 * However, the page having this block must
114 * be locked in this case. 115 * be locked in this case.
115 */ 116 */
116 printk(KERN_WARNING 117 printk(KERN_WARNING
117 "nilfs_get_block: a race condition " 118 "nilfs_get_block: a race condition "
118 "while inserting a data block. " 119 "while inserting a data block. "
119 "(inode number=%lu, file block " 120 "(inode number=%lu, file block "
120 "offset=%llu)\n", 121 "offset=%llu)\n",
121 inode->i_ino, 122 inode->i_ino,
122 (unsigned long long)blkoff); 123 (unsigned long long)blkoff);
123 err = 0; 124 err = 0;
124 } 125 }
125 nilfs_transaction_abort(inode->i_sb); 126 nilfs_transaction_abort(inode->i_sb);
126 goto out; 127 goto out;
127 } 128 }
128 nilfs_mark_inode_dirty(inode); 129 nilfs_mark_inode_dirty(inode);
129 nilfs_transaction_commit(inode->i_sb); /* never fails */ 130 nilfs_transaction_commit(inode->i_sb); /* never fails */
130 /* Error handling should be detailed */ 131 /* Error handling should be detailed */
131 set_buffer_new(bh_result); 132 set_buffer_new(bh_result);
132 set_buffer_delay(bh_result); 133 set_buffer_delay(bh_result);
133 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 134 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
134 to proper value */ 135 to proper value */
135 } else if (ret == -ENOENT) { 136 } else if (ret == -ENOENT) {
136 /* not found is not error (e.g. hole); must return without 137 /* not found is not error (e.g. hole); must return without
137 the mapped state flag. */ 138 the mapped state flag. */
138 ; 139 ;
139 } else { 140 } else {
140 err = ret; 141 err = ret;
141 } 142 }
142 143
143 out: 144 out:
144 return err; 145 return err;
145 } 146 }
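nilfs_get_block() follows the usual get_block_t contract: on entry, bh_result->b_size says how many bytes the caller would like mapped, which bounds the bmap lookup (maxblocks); when several contiguous blocks are found, b_size is shrunk back to what was actually mapped, and a freshly allocated block is flagged new and delayed because its disk address is not final until the segment constructor writes it. A minimal standalone sketch of that size arithmetic, assuming an illustrative 4 KiB block size and a 64 KiB request (neither value comes from this diff):

#include <stdio.h>

/* Illustrative geometry only: 4 KiB filesystem blocks and a 64 KiB request,
 * such as the mpage read path might pass in through bh_result->b_size. */
#define BLKBITS 12

int main(void)
{
	unsigned long b_size = 64 * 1024;           /* caller's requested mapping size */
	unsigned maxblocks = b_size >> BLKBITS;     /* lookup bounded to 16 blocks */
	int ret = 5;                                /* pretend the bmap found 5 contiguous blocks */

	if (ret > 0)
		b_size = (unsigned long)ret << BLKBITS; /* trimmed to the 20 KiB actually mapped */

	printf("maxblocks=%u, mapped b_size=%lu bytes\n", maxblocks, b_size);
	return 0;
}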
146 147
147 /** 148 /**
148 * nilfs_readpage() - implement readpage() method of nilfs_aops {} 149 * nilfs_readpage() - implement readpage() method of nilfs_aops {}
149 * address_space_operations. 150 * address_space_operations.
150 * @file - file struct of the file to be read 151 * @file - file struct of the file to be read
151 * @page - the page to be read 152 * @page - the page to be read
152 */ 153 */
153 static int nilfs_readpage(struct file *file, struct page *page) 154 static int nilfs_readpage(struct file *file, struct page *page)
154 { 155 {
155 return mpage_readpage(page, nilfs_get_block); 156 return mpage_readpage(page, nilfs_get_block);
156 } 157 }
157 158
158 /** 159 /**
159 * nilfs_readpages() - implement readpages() method of nilfs_aops {} 160 * nilfs_readpages() - implement readpages() method of nilfs_aops {}
160 * address_space_operations. 161 * address_space_operations.
161 * @file - file struct of the file to be read 162 * @file - file struct of the file to be read
162 * @mapping - address_space struct used for reading multiple pages 163 * @mapping - address_space struct used for reading multiple pages
163 * @pages - the pages to be read 164 * @pages - the pages to be read
164 * @nr_pages - number of pages to be read 165 * @nr_pages - number of pages to be read
165 */ 166 */
166 static int nilfs_readpages(struct file *file, struct address_space *mapping, 167 static int nilfs_readpages(struct file *file, struct address_space *mapping,
167 struct list_head *pages, unsigned nr_pages) 168 struct list_head *pages, unsigned nr_pages)
168 { 169 {
169 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); 170 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
170 } 171 }
171 172
172 static int nilfs_writepages(struct address_space *mapping, 173 static int nilfs_writepages(struct address_space *mapping,
173 struct writeback_control *wbc) 174 struct writeback_control *wbc)
174 { 175 {
175 struct inode *inode = mapping->host; 176 struct inode *inode = mapping->host;
176 int err = 0; 177 int err = 0;
177 178
178 if (inode->i_sb->s_flags & MS_RDONLY) { 179 if (inode->i_sb->s_flags & MS_RDONLY) {
179 nilfs_clear_dirty_pages(mapping, false); 180 nilfs_clear_dirty_pages(mapping, false);
180 return -EROFS; 181 return -EROFS;
181 } 182 }
182 183
183 if (wbc->sync_mode == WB_SYNC_ALL) 184 if (wbc->sync_mode == WB_SYNC_ALL)
184 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 185 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
185 wbc->range_start, 186 wbc->range_start,
186 wbc->range_end); 187 wbc->range_end);
187 return err; 188 return err;
188 } 189 }
189 190
190 static int nilfs_writepage(struct page *page, struct writeback_control *wbc) 191 static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
191 { 192 {
192 struct inode *inode = page->mapping->host; 193 struct inode *inode = page->mapping->host;
193 int err; 194 int err;
194 195
195 if (inode->i_sb->s_flags & MS_RDONLY) { 196 if (inode->i_sb->s_flags & MS_RDONLY) {
196 /* 197 /*
197 * The filesystem was remounted read-only because of an 198 * The filesystem was remounted read-only because of an
198 * error or metadata corruption, but dirty pages are still 199 * error or metadata corruption, but dirty pages are still
199 * being flushed in the background, so simply discard 200 * being flushed in the background, so simply discard
200 * this dirty page here. 201 * this dirty page here.
201 */ 202 */
202 nilfs_clear_dirty_page(page, false); 203 nilfs_clear_dirty_page(page, false);
203 unlock_page(page); 204 unlock_page(page);
204 return -EROFS; 205 return -EROFS;
205 } 206 }
206 207
207 redirty_page_for_writepage(wbc, page); 208 redirty_page_for_writepage(wbc, page);
208 unlock_page(page); 209 unlock_page(page);
209 210
210 if (wbc->sync_mode == WB_SYNC_ALL) { 211 if (wbc->sync_mode == WB_SYNC_ALL) {
211 err = nilfs_construct_segment(inode->i_sb); 212 err = nilfs_construct_segment(inode->i_sb);
212 if (unlikely(err)) 213 if (unlikely(err))
213 return err; 214 return err;
214 } else if (wbc->for_reclaim) 215 } else if (wbc->for_reclaim)
215 nilfs_flush_segment(inode->i_sb, inode->i_ino); 216 nilfs_flush_segment(inode->i_sb, inode->i_ino);
216 217
217 return 0; 218 return 0;
218 } 219 }
219 220
220 static int nilfs_set_page_dirty(struct page *page) 221 static int nilfs_set_page_dirty(struct page *page)
221 { 222 {
223 struct inode *inode = page->mapping->host;
222 int ret = __set_page_dirty_nobuffers(page); 224 int ret = __set_page_dirty_nobuffers(page);
223 225
224 if (page_has_buffers(page)) { 226 if (page_has_buffers(page)) {
225 struct inode *inode = page->mapping->host;
226 unsigned nr_dirty = 0; 227 unsigned nr_dirty = 0;
227 struct buffer_head *bh, *head; 228 struct buffer_head *bh, *head;
228 229
229 /* 230 /*
230 * This page is locked by callers, and no other thread 231 * This page is locked by callers, and no other thread
231 * concurrently marks its buffers dirty since they are 232 * concurrently marks its buffers dirty since they are
232 * only dirtied through routines in fs/buffer.c in 233 * only dirtied through routines in fs/buffer.c in
233 * which call sites of mark_buffer_dirty are protected 234 * which call sites of mark_buffer_dirty are protected
234 * by page lock. 235 * by page lock.
235 */ 236 */
236 bh = head = page_buffers(page); 237 bh = head = page_buffers(page);
237 do { 238 do {
238 /* Do not mark hole blocks dirty */ 239 /* Do not mark hole blocks dirty */
239 if (buffer_dirty(bh) || !buffer_mapped(bh)) 240 if (buffer_dirty(bh) || !buffer_mapped(bh))
240 continue; 241 continue;
241 242
242 set_buffer_dirty(bh); 243 set_buffer_dirty(bh);
243 nr_dirty++; 244 nr_dirty++;
244 } while (bh = bh->b_this_page, bh != head); 245 } while (bh = bh->b_this_page, bh != head);
245 246
246 if (nr_dirty) 247 if (nr_dirty)
247 nilfs_set_file_dirty(inode, nr_dirty); 248 nilfs_set_file_dirty(inode, nr_dirty);
249 } else if (ret) {
250 unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
251
252 nilfs_set_file_dirty(inode, nr_dirty);
248 } 253 }
249 return ret; 254 return ret;
250 } 255 }
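The new else-if branch above handles pages dirtied through the mmap write-fault path, which may have no buffer heads attached: previously nothing called nilfs_set_file_dirty() for such a page, the inode was never queued on the dirty-files list, and the segment constructor could skip the page, losing the written data. With the change, when __set_page_dirty_nobuffers() reports that the page was newly dirtied and the page has no buffers, a full page worth of blocks is accounted. A trivial standalone check of that per-page block count, assuming illustrative 4 KiB page-cache pages and 1 KiB filesystem blocks (the constants below are not taken from the diff):

#include <stdio.h>

/* Illustrative values only: 4 KiB page-cache pages, 1 KiB filesystem blocks. */
#define PAGE_CACHE_SHIFT 12
#define BLKBITS          10

int main(void)
{
	/* Number of filesystem blocks covered by one page-cache page. */
	unsigned nr_dirty = 1u << (PAGE_CACHE_SHIFT - BLKBITS);

	printf("blocks accounted per mmap-dirtied page = %u\n", nr_dirty); /* 4 */
	return 0;
}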
251 256
252 void nilfs_write_failed(struct address_space *mapping, loff_t to) 257 void nilfs_write_failed(struct address_space *mapping, loff_t to)
253 { 258 {
254 struct inode *inode = mapping->host; 259 struct inode *inode = mapping->host;
255 260
256 if (to > inode->i_size) { 261 if (to > inode->i_size) {
257 truncate_pagecache(inode, inode->i_size); 262 truncate_pagecache(inode, inode->i_size);
258 nilfs_truncate(inode); 263 nilfs_truncate(inode);
259 } 264 }
260 } 265 }
261 266
262 static int nilfs_write_begin(struct file *file, struct address_space *mapping, 267 static int nilfs_write_begin(struct file *file, struct address_space *mapping,
263 loff_t pos, unsigned len, unsigned flags, 268 loff_t pos, unsigned len, unsigned flags,
264 struct page **pagep, void **fsdata) 269 struct page **pagep, void **fsdata)
265 270
266 { 271 {
267 struct inode *inode = mapping->host; 272 struct inode *inode = mapping->host;
268 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); 273 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
269 274
270 if (unlikely(err)) 275 if (unlikely(err))
271 return err; 276 return err;
272 277
273 err = block_write_begin(mapping, pos, len, flags, pagep, 278 err = block_write_begin(mapping, pos, len, flags, pagep,
274 nilfs_get_block); 279 nilfs_get_block);
275 if (unlikely(err)) { 280 if (unlikely(err)) {
276 nilfs_write_failed(mapping, pos + len); 281 nilfs_write_failed(mapping, pos + len);
277 nilfs_transaction_abort(inode->i_sb); 282 nilfs_transaction_abort(inode->i_sb);
278 } 283 }
279 return err; 284 return err;
280 } 285 }
281 286
282 static int nilfs_write_end(struct file *file, struct address_space *mapping, 287 static int nilfs_write_end(struct file *file, struct address_space *mapping,
283 loff_t pos, unsigned len, unsigned copied, 288 loff_t pos, unsigned len, unsigned copied,
284 struct page *page, void *fsdata) 289 struct page *page, void *fsdata)
285 { 290 {
286 struct inode *inode = mapping->host; 291 struct inode *inode = mapping->host;
287 unsigned start = pos & (PAGE_CACHE_SIZE - 1); 292 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
288 unsigned nr_dirty; 293 unsigned nr_dirty;
289 int err; 294 int err;
290 295
291 nr_dirty = nilfs_page_count_clean_buffers(page, start, 296 nr_dirty = nilfs_page_count_clean_buffers(page, start,
292 start + copied); 297 start + copied);
293 copied = generic_write_end(file, mapping, pos, len, copied, page, 298 copied = generic_write_end(file, mapping, pos, len, copied, page,
294 fsdata); 299 fsdata);
295 nilfs_set_file_dirty(inode, nr_dirty); 300 nilfs_set_file_dirty(inode, nr_dirty);
296 err = nilfs_transaction_commit(inode->i_sb); 301 err = nilfs_transaction_commit(inode->i_sb);
297 return err ? : copied; 302 return err ? : copied;
298 } 303 }
299 304
300 static ssize_t 305 static ssize_t
301 nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, 306 nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
302 loff_t offset) 307 loff_t offset)
303 { 308 {
304 struct file *file = iocb->ki_filp; 309 struct file *file = iocb->ki_filp;
305 struct address_space *mapping = file->f_mapping; 310 struct address_space *mapping = file->f_mapping;
306 struct inode *inode = file->f_mapping->host; 311 struct inode *inode = file->f_mapping->host;
307 size_t count = iov_iter_count(iter); 312 size_t count = iov_iter_count(iter);
308 ssize_t size; 313 ssize_t size;
309 314
310 if (rw == WRITE) 315 if (rw == WRITE)
311 return 0; 316 return 0;
312 317
313 /* Needs synchronization with the cleaner */ 318 /* Needs synchronization with the cleaner */
314 size = blockdev_direct_IO(rw, iocb, inode, iter, offset, 319 size = blockdev_direct_IO(rw, iocb, inode, iter, offset,
315 nilfs_get_block); 320 nilfs_get_block);
316 321
317 /* 322 /*
318 * In case of error extending write may have instantiated a few 323 * In case of error extending write may have instantiated a few
319 * blocks outside i_size. Trim these off again. 324 * blocks outside i_size. Trim these off again.
320 */ 325 */
321 if (unlikely((rw & WRITE) && size < 0)) { 326 if (unlikely((rw & WRITE) && size < 0)) {
322 loff_t isize = i_size_read(inode); 327 loff_t isize = i_size_read(inode);
323 loff_t end = offset + count; 328 loff_t end = offset + count;
324 329
325 if (end > isize) 330 if (end > isize)
326 nilfs_write_failed(mapping, end); 331 nilfs_write_failed(mapping, end);
327 } 332 }
328 333
329 return size; 334 return size;
330 } 335 }
331 336
332 const struct address_space_operations nilfs_aops = { 337 const struct address_space_operations nilfs_aops = {
333 .writepage = nilfs_writepage, 338 .writepage = nilfs_writepage,
334 .readpage = nilfs_readpage, 339 .readpage = nilfs_readpage,
335 .writepages = nilfs_writepages, 340 .writepages = nilfs_writepages,
336 .set_page_dirty = nilfs_set_page_dirty, 341 .set_page_dirty = nilfs_set_page_dirty,
337 .readpages = nilfs_readpages, 342 .readpages = nilfs_readpages,
338 .write_begin = nilfs_write_begin, 343 .write_begin = nilfs_write_begin,
339 .write_end = nilfs_write_end, 344 .write_end = nilfs_write_end,
340 /* .releasepage = nilfs_releasepage, */ 345 /* .releasepage = nilfs_releasepage, */
341 .invalidatepage = block_invalidatepage, 346 .invalidatepage = block_invalidatepage,
342 .direct_IO = nilfs_direct_IO, 347 .direct_IO = nilfs_direct_IO,
343 .is_partially_uptodate = block_is_partially_uptodate, 348 .is_partially_uptodate = block_is_partially_uptodate,
344 }; 349 };
345 350
346 struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) 351 struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
347 { 352 {
348 struct super_block *sb = dir->i_sb; 353 struct super_block *sb = dir->i_sb;
349 struct the_nilfs *nilfs = sb->s_fs_info; 354 struct the_nilfs *nilfs = sb->s_fs_info;
350 struct inode *inode; 355 struct inode *inode;
351 struct nilfs_inode_info *ii; 356 struct nilfs_inode_info *ii;
352 struct nilfs_root *root; 357 struct nilfs_root *root;
353 int err = -ENOMEM; 358 int err = -ENOMEM;
354 ino_t ino; 359 ino_t ino;
355 360
356 inode = new_inode(sb); 361 inode = new_inode(sb);
357 if (unlikely(!inode)) 362 if (unlikely(!inode))
358 goto failed; 363 goto failed;
359 364
360 mapping_set_gfp_mask(inode->i_mapping, 365 mapping_set_gfp_mask(inode->i_mapping,
361 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 366 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
362 367
363 root = NILFS_I(dir)->i_root; 368 root = NILFS_I(dir)->i_root;
364 ii = NILFS_I(inode); 369 ii = NILFS_I(inode);
365 ii->i_state = 1 << NILFS_I_NEW; 370 ii->i_state = 1 << NILFS_I_NEW;
366 ii->i_root = root; 371 ii->i_root = root;
367 372
368 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); 373 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
369 if (unlikely(err)) 374 if (unlikely(err))
370 goto failed_ifile_create_inode; 375 goto failed_ifile_create_inode;
371 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 376 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
372 377
373 atomic64_inc(&root->inodes_count); 378 atomic64_inc(&root->inodes_count);
374 inode_init_owner(inode, dir, mode); 379 inode_init_owner(inode, dir, mode);
375 inode->i_ino = ino; 380 inode->i_ino = ino;
376 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 381 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
377 382
378 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 383 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
379 err = nilfs_bmap_read(ii->i_bmap, NULL); 384 err = nilfs_bmap_read(ii->i_bmap, NULL);
380 if (err < 0) 385 if (err < 0)
381 goto failed_bmap; 386 goto failed_bmap;
382 387
383 set_bit(NILFS_I_BMAP, &ii->i_state); 388 set_bit(NILFS_I_BMAP, &ii->i_state);
384 /* No lock is needed; iget() ensures it. */ 389 /* No lock is needed; iget() ensures it. */
385 } 390 }
386 391
387 ii->i_flags = nilfs_mask_flags( 392 ii->i_flags = nilfs_mask_flags(
388 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); 393 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
389 394
390 /* ii->i_file_acl = 0; */ 395 /* ii->i_file_acl = 0; */
391 /* ii->i_dir_acl = 0; */ 396 /* ii->i_dir_acl = 0; */
392 ii->i_dir_start_lookup = 0; 397 ii->i_dir_start_lookup = 0;
393 nilfs_set_inode_flags(inode); 398 nilfs_set_inode_flags(inode);
394 spin_lock(&nilfs->ns_next_gen_lock); 399 spin_lock(&nilfs->ns_next_gen_lock);
395 inode->i_generation = nilfs->ns_next_generation++; 400 inode->i_generation = nilfs->ns_next_generation++;
396 spin_unlock(&nilfs->ns_next_gen_lock); 401 spin_unlock(&nilfs->ns_next_gen_lock);
397 insert_inode_hash(inode); 402 insert_inode_hash(inode);
398 403
399 err = nilfs_init_acl(inode, dir); 404 err = nilfs_init_acl(inode, dir);
400 if (unlikely(err)) 405 if (unlikely(err))
401 goto failed_acl; /* never occur. When supporting 406 goto failed_acl; /* never occur. When supporting
402 nilfs_init_acl(), proper cancellation of 407 nilfs_init_acl(), proper cancellation of
403 above jobs should be considered */ 408 above jobs should be considered */
404 409
405 return inode; 410 return inode;
406 411
407 failed_acl: 412 failed_acl:
408 failed_bmap: 413 failed_bmap:
409 clear_nlink(inode); 414 clear_nlink(inode);
410 iput(inode); /* raw_inode will be deleted through 415 iput(inode); /* raw_inode will be deleted through
411 generic_delete_inode() */ 416 generic_delete_inode() */
412 goto failed; 417 goto failed;
413 418
414 failed_ifile_create_inode: 419 failed_ifile_create_inode:
415 make_bad_inode(inode); 420 make_bad_inode(inode);
416 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be 421 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
417 called */ 422 called */
418 failed: 423 failed:
419 return ERR_PTR(err); 424 return ERR_PTR(err);
420 } 425 }
421 426
422 void nilfs_set_inode_flags(struct inode *inode) 427 void nilfs_set_inode_flags(struct inode *inode)
423 { 428 {
424 unsigned int flags = NILFS_I(inode)->i_flags; 429 unsigned int flags = NILFS_I(inode)->i_flags;
425 430
426 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 431 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
427 S_DIRSYNC); 432 S_DIRSYNC);
428 if (flags & FS_SYNC_FL) 433 if (flags & FS_SYNC_FL)
429 inode->i_flags |= S_SYNC; 434 inode->i_flags |= S_SYNC;
430 if (flags & FS_APPEND_FL) 435 if (flags & FS_APPEND_FL)
431 inode->i_flags |= S_APPEND; 436 inode->i_flags |= S_APPEND;
432 if (flags & FS_IMMUTABLE_FL) 437 if (flags & FS_IMMUTABLE_FL)
433 inode->i_flags |= S_IMMUTABLE; 438 inode->i_flags |= S_IMMUTABLE;
434 if (flags & FS_NOATIME_FL) 439 if (flags & FS_NOATIME_FL)
435 inode->i_flags |= S_NOATIME; 440 inode->i_flags |= S_NOATIME;
436 if (flags & FS_DIRSYNC_FL) 441 if (flags & FS_DIRSYNC_FL)
437 inode->i_flags |= S_DIRSYNC; 442 inode->i_flags |= S_DIRSYNC;
438 mapping_set_gfp_mask(inode->i_mapping, 443 mapping_set_gfp_mask(inode->i_mapping,
439 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 444 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
440 } 445 }
441 446
442 int nilfs_read_inode_common(struct inode *inode, 447 int nilfs_read_inode_common(struct inode *inode,
443 struct nilfs_inode *raw_inode) 448 struct nilfs_inode *raw_inode)
444 { 449 {
445 struct nilfs_inode_info *ii = NILFS_I(inode); 450 struct nilfs_inode_info *ii = NILFS_I(inode);
446 int err; 451 int err;
447 452
448 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 453 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
449 i_uid_write(inode, le32_to_cpu(raw_inode->i_uid)); 454 i_uid_write(inode, le32_to_cpu(raw_inode->i_uid));
450 i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); 455 i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
451 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 456 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
452 inode->i_size = le64_to_cpu(raw_inode->i_size); 457 inode->i_size = le64_to_cpu(raw_inode->i_size);
453 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 458 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
454 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 459 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
455 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 460 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
456 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 461 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
457 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 462 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
458 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 463 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
459 if (inode->i_nlink == 0 && inode->i_mode == 0) 464 if (inode->i_nlink == 0 && inode->i_mode == 0)
460 return -EINVAL; /* this inode is deleted */ 465 return -EINVAL; /* this inode is deleted */
461 466
462 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); 467 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
463 ii->i_flags = le32_to_cpu(raw_inode->i_flags); 468 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
464 #if 0 469 #if 0
465 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 470 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
466 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 471 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
467 0 : le32_to_cpu(raw_inode->i_dir_acl); 472 0 : le32_to_cpu(raw_inode->i_dir_acl);
468 #endif 473 #endif
469 ii->i_dir_start_lookup = 0; 474 ii->i_dir_start_lookup = 0;
470 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 475 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
471 476
472 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 477 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
473 S_ISLNK(inode->i_mode)) { 478 S_ISLNK(inode->i_mode)) {
474 err = nilfs_bmap_read(ii->i_bmap, raw_inode); 479 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
475 if (err < 0) 480 if (err < 0)
476 return err; 481 return err;
477 set_bit(NILFS_I_BMAP, &ii->i_state); 482 set_bit(NILFS_I_BMAP, &ii->i_state);
478 /* No lock is needed; iget() ensures it. */ 483 /* No lock is needed; iget() ensures it. */
479 } 484 }
480 return 0; 485 return 0;
481 } 486 }
482 487
483 static int __nilfs_read_inode(struct super_block *sb, 488 static int __nilfs_read_inode(struct super_block *sb,
484 struct nilfs_root *root, unsigned long ino, 489 struct nilfs_root *root, unsigned long ino,
485 struct inode *inode) 490 struct inode *inode)
486 { 491 {
487 struct the_nilfs *nilfs = sb->s_fs_info; 492 struct the_nilfs *nilfs = sb->s_fs_info;
488 struct buffer_head *bh; 493 struct buffer_head *bh;
489 struct nilfs_inode *raw_inode; 494 struct nilfs_inode *raw_inode;
490 int err; 495 int err;
491 496
492 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 497 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
493 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 498 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
494 if (unlikely(err)) 499 if (unlikely(err))
495 goto bad_inode; 500 goto bad_inode;
496 501
497 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); 502 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
498 503
499 err = nilfs_read_inode_common(inode, raw_inode); 504 err = nilfs_read_inode_common(inode, raw_inode);
500 if (err) 505 if (err)
501 goto failed_unmap; 506 goto failed_unmap;
502 507
503 if (S_ISREG(inode->i_mode)) { 508 if (S_ISREG(inode->i_mode)) {
504 inode->i_op = &nilfs_file_inode_operations; 509 inode->i_op = &nilfs_file_inode_operations;
505 inode->i_fop = &nilfs_file_operations; 510 inode->i_fop = &nilfs_file_operations;
506 inode->i_mapping->a_ops = &nilfs_aops; 511 inode->i_mapping->a_ops = &nilfs_aops;
507 } else if (S_ISDIR(inode->i_mode)) { 512 } else if (S_ISDIR(inode->i_mode)) {
508 inode->i_op = &nilfs_dir_inode_operations; 513 inode->i_op = &nilfs_dir_inode_operations;
509 inode->i_fop = &nilfs_dir_operations; 514 inode->i_fop = &nilfs_dir_operations;
510 inode->i_mapping->a_ops = &nilfs_aops; 515 inode->i_mapping->a_ops = &nilfs_aops;
511 } else if (S_ISLNK(inode->i_mode)) { 516 } else if (S_ISLNK(inode->i_mode)) {
512 inode->i_op = &nilfs_symlink_inode_operations; 517 inode->i_op = &nilfs_symlink_inode_operations;
513 inode->i_mapping->a_ops = &nilfs_aops; 518 inode->i_mapping->a_ops = &nilfs_aops;
514 } else { 519 } else {
515 inode->i_op = &nilfs_special_inode_operations; 520 inode->i_op = &nilfs_special_inode_operations;
516 init_special_inode( 521 init_special_inode(
517 inode, inode->i_mode, 522 inode, inode->i_mode,
518 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 523 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
519 } 524 }
520 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 525 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
521 brelse(bh); 526 brelse(bh);
522 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 527 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
523 nilfs_set_inode_flags(inode); 528 nilfs_set_inode_flags(inode);
524 return 0; 529 return 0;
525 530
526 failed_unmap: 531 failed_unmap:
527 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 532 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
528 brelse(bh); 533 brelse(bh);
529 534
530 bad_inode: 535 bad_inode:
531 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 536 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
532 return err; 537 return err;
533 } 538 }
534 539
535 static int nilfs_iget_test(struct inode *inode, void *opaque) 540 static int nilfs_iget_test(struct inode *inode, void *opaque)
536 { 541 {
537 struct nilfs_iget_args *args = opaque; 542 struct nilfs_iget_args *args = opaque;
538 struct nilfs_inode_info *ii; 543 struct nilfs_inode_info *ii;
539 544
540 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) 545 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
541 return 0; 546 return 0;
542 547
543 ii = NILFS_I(inode); 548 ii = NILFS_I(inode);
544 if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) 549 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
545 return !args->for_gc; 550 return !args->for_gc;
546 551
547 return args->for_gc && args->cno == ii->i_cno; 552 return args->for_gc && args->cno == ii->i_cno;
548 } 553 }
549 554
550 static int nilfs_iget_set(struct inode *inode, void *opaque) 555 static int nilfs_iget_set(struct inode *inode, void *opaque)
551 { 556 {
552 struct nilfs_iget_args *args = opaque; 557 struct nilfs_iget_args *args = opaque;
553 558
554 inode->i_ino = args->ino; 559 inode->i_ino = args->ino;
555 if (args->for_gc) { 560 if (args->for_gc) {
556 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; 561 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
557 NILFS_I(inode)->i_cno = args->cno; 562 NILFS_I(inode)->i_cno = args->cno;
558 NILFS_I(inode)->i_root = NULL; 563 NILFS_I(inode)->i_root = NULL;
559 } else { 564 } else {
560 if (args->root && args->ino == NILFS_ROOT_INO) 565 if (args->root && args->ino == NILFS_ROOT_INO)
561 nilfs_get_root(args->root); 566 nilfs_get_root(args->root);
562 NILFS_I(inode)->i_root = args->root; 567 NILFS_I(inode)->i_root = args->root;
563 } 568 }
564 return 0; 569 return 0;
565 } 570 }
566 571
567 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, 572 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
568 unsigned long ino) 573 unsigned long ino)
569 { 574 {
570 struct nilfs_iget_args args = { 575 struct nilfs_iget_args args = {
571 .ino = ino, .root = root, .cno = 0, .for_gc = 0 576 .ino = ino, .root = root, .cno = 0, .for_gc = 0
572 }; 577 };
573 578
574 return ilookup5(sb, ino, nilfs_iget_test, &args); 579 return ilookup5(sb, ino, nilfs_iget_test, &args);
575 } 580 }
576 581
577 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, 582 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
578 unsigned long ino) 583 unsigned long ino)
579 { 584 {
580 struct nilfs_iget_args args = { 585 struct nilfs_iget_args args = {
581 .ino = ino, .root = root, .cno = 0, .for_gc = 0 586 .ino = ino, .root = root, .cno = 0, .for_gc = 0
582 }; 587 };
583 588
584 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 589 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
585 } 590 }
586 591
587 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, 592 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
588 unsigned long ino) 593 unsigned long ino)
589 { 594 {
590 struct inode *inode; 595 struct inode *inode;
591 int err; 596 int err;
592 597
593 inode = nilfs_iget_locked(sb, root, ino); 598 inode = nilfs_iget_locked(sb, root, ino);
594 if (unlikely(!inode)) 599 if (unlikely(!inode))
595 return ERR_PTR(-ENOMEM); 600 return ERR_PTR(-ENOMEM);
596 if (!(inode->i_state & I_NEW)) 601 if (!(inode->i_state & I_NEW))
597 return inode; 602 return inode;
598 603
599 err = __nilfs_read_inode(sb, root, ino, inode); 604 err = __nilfs_read_inode(sb, root, ino, inode);
600 if (unlikely(err)) { 605 if (unlikely(err)) {
601 iget_failed(inode); 606 iget_failed(inode);
602 return ERR_PTR(err); 607 return ERR_PTR(err);
603 } 608 }
604 unlock_new_inode(inode); 609 unlock_new_inode(inode);
605 return inode; 610 return inode;
606 } 611 }
607 612
608 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, 613 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
609 __u64 cno) 614 __u64 cno)
610 { 615 {
611 struct nilfs_iget_args args = { 616 struct nilfs_iget_args args = {
612 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 617 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
613 }; 618 };
614 struct inode *inode; 619 struct inode *inode;
615 int err; 620 int err;
616 621
617 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 622 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
618 if (unlikely(!inode)) 623 if (unlikely(!inode))
619 return ERR_PTR(-ENOMEM); 624 return ERR_PTR(-ENOMEM);
620 if (!(inode->i_state & I_NEW)) 625 if (!(inode->i_state & I_NEW))
621 return inode; 626 return inode;
622 627
623 err = nilfs_init_gcinode(inode); 628 err = nilfs_init_gcinode(inode);
624 if (unlikely(err)) { 629 if (unlikely(err)) {
625 iget_failed(inode); 630 iget_failed(inode);
626 return ERR_PTR(err); 631 return ERR_PTR(err);
627 } 632 }
628 unlock_new_inode(inode); 633 unlock_new_inode(inode);
629 return inode; 634 return inode;
630 } 635 }
631 636
632 void nilfs_write_inode_common(struct inode *inode, 637 void nilfs_write_inode_common(struct inode *inode,
633 struct nilfs_inode *raw_inode, int has_bmap) 638 struct nilfs_inode *raw_inode, int has_bmap)
634 { 639 {
635 struct nilfs_inode_info *ii = NILFS_I(inode); 640 struct nilfs_inode_info *ii = NILFS_I(inode);
636 641
637 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 642 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
638 raw_inode->i_uid = cpu_to_le32(i_uid_read(inode)); 643 raw_inode->i_uid = cpu_to_le32(i_uid_read(inode));
639 raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); 644 raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
640 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 645 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
641 raw_inode->i_size = cpu_to_le64(inode->i_size); 646 raw_inode->i_size = cpu_to_le64(inode->i_size);
642 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 647 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
643 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 648 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
644 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 649 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
645 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 650 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
646 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); 651 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
647 652
648 raw_inode->i_flags = cpu_to_le32(ii->i_flags); 653 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
649 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 654 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
650 655
651 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { 656 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
652 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 657 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
653 658
654 /* zero-fill unused portion in the case of super root block */ 659 /* zero-fill unused portion in the case of super root block */
655 raw_inode->i_xattr = 0; 660 raw_inode->i_xattr = 0;
656 raw_inode->i_pad = 0; 661 raw_inode->i_pad = 0;
657 memset((void *)raw_inode + sizeof(*raw_inode), 0, 662 memset((void *)raw_inode + sizeof(*raw_inode), 0,
658 nilfs->ns_inode_size - sizeof(*raw_inode)); 663 nilfs->ns_inode_size - sizeof(*raw_inode));
659 } 664 }
660 665
661 if (has_bmap) 666 if (has_bmap)
662 nilfs_bmap_write(ii->i_bmap, raw_inode); 667 nilfs_bmap_write(ii->i_bmap, raw_inode);
663 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 668 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
664 raw_inode->i_device_code = 669 raw_inode->i_device_code =
665 cpu_to_le64(huge_encode_dev(inode->i_rdev)); 670 cpu_to_le64(huge_encode_dev(inode->i_rdev));
666 /* When extending inode, nilfs->ns_inode_size should be checked 671 /* When extending inode, nilfs->ns_inode_size should be checked
667 for substitutions of appended fields */ 672 for substitutions of appended fields */
668 } 673 }
669 674
670 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) 675 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
671 { 676 {
672 ino_t ino = inode->i_ino; 677 ino_t ino = inode->i_ino;
673 struct nilfs_inode_info *ii = NILFS_I(inode); 678 struct nilfs_inode_info *ii = NILFS_I(inode);
674 struct inode *ifile = ii->i_root->ifile; 679 struct inode *ifile = ii->i_root->ifile;
675 struct nilfs_inode *raw_inode; 680 struct nilfs_inode *raw_inode;
676 681
677 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); 682 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
678 683
679 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 684 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
680 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); 685 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
681 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 686 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
682 687
683 nilfs_write_inode_common(inode, raw_inode, 0); 688 nilfs_write_inode_common(inode, raw_inode, 0);
684 /* XXX: call with has_bmap = 0 is a workaround to avoid 689 /* XXX: call with has_bmap = 0 is a workaround to avoid
685 deadlock of bmap. This delays update of i_bmap to just 690 deadlock of bmap. This delays update of i_bmap to just
686 before writing */ 691 before writing */
687 nilfs_ifile_unmap_inode(ifile, ino, ibh); 692 nilfs_ifile_unmap_inode(ifile, ino, ibh);
688 } 693 }
689 694
690 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 695 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
691 696
692 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, 697 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
693 unsigned long from) 698 unsigned long from)
694 { 699 {
695 unsigned long b; 700 unsigned long b;
696 int ret; 701 int ret;
697 702
698 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 703 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
699 return; 704 return;
700 repeat: 705 repeat:
701 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 706 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
702 if (ret == -ENOENT) 707 if (ret == -ENOENT)
703 return; 708 return;
704 else if (ret < 0) 709 else if (ret < 0)
705 goto failed; 710 goto failed;
706 711
707 if (b < from) 712 if (b < from)
708 return; 713 return;
709 714
710 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); 715 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
711 ret = nilfs_bmap_truncate(ii->i_bmap, b); 716 ret = nilfs_bmap_truncate(ii->i_bmap, b);
712 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); 717 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
713 if (!ret || (ret == -ENOMEM && 718 if (!ret || (ret == -ENOMEM &&
714 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 719 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
715 goto repeat; 720 goto repeat;
716 721
717 failed: 722 failed:
718 nilfs_warning(ii->vfs_inode.i_sb, __func__, 723 nilfs_warning(ii->vfs_inode.i_sb, __func__,
719 "failed to truncate bmap (ino=%lu, err=%d)", 724 "failed to truncate bmap (ino=%lu, err=%d)",
720 ii->vfs_inode.i_ino, ret); 725 ii->vfs_inode.i_ino, ret);
721 } 726 }
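nilfs_truncate_bmap() walks the bmap down toward the requested starting block in bounded passes rather than in a single call: each pass drops at most NILFS_MAX_TRUNCATE_BLOCKS keys, nilfs_relax_pressure_in_lock() then lets the segment constructor relieve memory pressure, and an -ENOMEM failure is retried once before the warning is emitted. As the comment on the define above notes, the cap corresponds to 64 MB per pass with 4 KB blocks; a trivial standalone check of that arithmetic (the block size is an assumption, not read from the diff):

#include <stdio.h>

#define NILFS_MAX_TRUNCATE_BLOCKS 16384UL /* per-pass cap, as defined above */
#define BLOCK_SIZE                 4096UL /* illustrative 4 KiB block size */

int main(void)
{
	unsigned long per_pass = NILFS_MAX_TRUNCATE_BLOCKS * BLOCK_SIZE;

	printf("at most %lu MiB truncated per pass\n", per_pass >> 20); /* 64 */
	return 0;
}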
722 727
723 void nilfs_truncate(struct inode *inode) 728 void nilfs_truncate(struct inode *inode)
724 { 729 {
725 unsigned long blkoff; 730 unsigned long blkoff;
726 unsigned int blocksize; 731 unsigned int blocksize;
727 struct nilfs_transaction_info ti; 732 struct nilfs_transaction_info ti;
728 struct super_block *sb = inode->i_sb; 733 struct super_block *sb = inode->i_sb;
729 struct nilfs_inode_info *ii = NILFS_I(inode); 734 struct nilfs_inode_info *ii = NILFS_I(inode);
730 735
731 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 736 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
732 return; 737 return;
733 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 738 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
734 return; 739 return;
735 740
736 blocksize = sb->s_blocksize; 741 blocksize = sb->s_blocksize;
737 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; 742 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
738 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 743 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
739 744
740 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); 745 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
741 746
742 nilfs_truncate_bmap(ii, blkoff); 747 nilfs_truncate_bmap(ii, blkoff);
743 748
744 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 749 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
745 if (IS_SYNC(inode)) 750 if (IS_SYNC(inode))
746 nilfs_set_transaction_flag(NILFS_TI_SYNC); 751 nilfs_set_transaction_flag(NILFS_TI_SYNC);
747 752
748 nilfs_mark_inode_dirty(inode); 753 nilfs_mark_inode_dirty(inode);
749 nilfs_set_file_dirty(inode, 0); 754 nilfs_set_file_dirty(inode, 0);
750 nilfs_transaction_commit(sb); 755 nilfs_transaction_commit(sb);
751 /* May construct a logical segment and may fail in sync mode. 756 /* May construct a logical segment and may fail in sync mode.
752 But truncate has no return value. */ 757 But truncate has no return value. */
753 } 758 }
754 759
755 static void nilfs_clear_inode(struct inode *inode) 760 static void nilfs_clear_inode(struct inode *inode)
756 { 761 {
757 struct nilfs_inode_info *ii = NILFS_I(inode); 762 struct nilfs_inode_info *ii = NILFS_I(inode);
758 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 763 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
759 764
760 /* 765 /*
761 * Free resources allocated in nilfs_read_inode(), here. 766 * Free resources allocated in nilfs_read_inode(), here.
762 */ 767 */
763 BUG_ON(!list_empty(&ii->i_dirty)); 768 BUG_ON(!list_empty(&ii->i_dirty));
764 brelse(ii->i_bh); 769 brelse(ii->i_bh);
765 ii->i_bh = NULL; 770 ii->i_bh = NULL;
766 771
767 if (mdi && mdi->mi_palloc_cache) 772 if (mdi && mdi->mi_palloc_cache)
768 nilfs_palloc_destroy_cache(inode); 773 nilfs_palloc_destroy_cache(inode);
769 774
770 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 775 if (test_bit(NILFS_I_BMAP, &ii->i_state))
771 nilfs_bmap_clear(ii->i_bmap); 776 nilfs_bmap_clear(ii->i_bmap);
772 777
773 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 778 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
774 779
775 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) 780 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
776 nilfs_put_root(ii->i_root); 781 nilfs_put_root(ii->i_root);
777 } 782 }
778 783
779 void nilfs_evict_inode(struct inode *inode) 784 void nilfs_evict_inode(struct inode *inode)
780 { 785 {
781 struct nilfs_transaction_info ti; 786 struct nilfs_transaction_info ti;
782 struct super_block *sb = inode->i_sb; 787 struct super_block *sb = inode->i_sb;
783 struct nilfs_inode_info *ii = NILFS_I(inode); 788 struct nilfs_inode_info *ii = NILFS_I(inode);
784 int ret; 789 int ret;
785 790
786 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 791 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
787 truncate_inode_pages_final(&inode->i_data); 792 truncate_inode_pages_final(&inode->i_data);
788 clear_inode(inode); 793 clear_inode(inode);
789 nilfs_clear_inode(inode); 794 nilfs_clear_inode(inode);
790 return; 795 return;
791 } 796 }
792 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 797 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
793 798
794 truncate_inode_pages_final(&inode->i_data); 799 truncate_inode_pages_final(&inode->i_data);
795 800
796 /* TODO: some of the following operations may fail. */ 801 /* TODO: some of the following operations may fail. */
797 nilfs_truncate_bmap(ii, 0); 802 nilfs_truncate_bmap(ii, 0);
798 nilfs_mark_inode_dirty(inode); 803 nilfs_mark_inode_dirty(inode);
799 clear_inode(inode); 804 clear_inode(inode);
800 805
801 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 806 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
802 if (!ret) 807 if (!ret)
803 atomic64_dec(&ii->i_root->inodes_count); 808 atomic64_dec(&ii->i_root->inodes_count);
804 809
805 nilfs_clear_inode(inode); 810 nilfs_clear_inode(inode);
806 811
807 if (IS_SYNC(inode)) 812 if (IS_SYNC(inode))
808 nilfs_set_transaction_flag(NILFS_TI_SYNC); 813 nilfs_set_transaction_flag(NILFS_TI_SYNC);
809 nilfs_transaction_commit(sb); 814 nilfs_transaction_commit(sb);
810 /* May construct a logical segment and may fail in sync mode. 815 /* May construct a logical segment and may fail in sync mode.
811 But delete_inode has no return value. */ 816 But delete_inode has no return value. */
812 } 817 }
813 818
814 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) 819 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
815 { 820 {
816 struct nilfs_transaction_info ti; 821 struct nilfs_transaction_info ti;
817 struct inode *inode = dentry->d_inode; 822 struct inode *inode = dentry->d_inode;
818 struct super_block *sb = inode->i_sb; 823 struct super_block *sb = inode->i_sb;
819 int err; 824 int err;
820 825
821 err = inode_change_ok(inode, iattr); 826 err = inode_change_ok(inode, iattr);
822 if (err) 827 if (err)
823 return err; 828 return err;
824 829
825 err = nilfs_transaction_begin(sb, &ti, 0); 830 err = nilfs_transaction_begin(sb, &ti, 0);
826 if (unlikely(err)) 831 if (unlikely(err))
827 return err; 832 return err;
828 833
829 if ((iattr->ia_valid & ATTR_SIZE) && 834 if ((iattr->ia_valid & ATTR_SIZE) &&
830 iattr->ia_size != i_size_read(inode)) { 835 iattr->ia_size != i_size_read(inode)) {
831 inode_dio_wait(inode); 836 inode_dio_wait(inode);
832 truncate_setsize(inode, iattr->ia_size); 837 truncate_setsize(inode, iattr->ia_size);
833 nilfs_truncate(inode); 838 nilfs_truncate(inode);
834 } 839 }
835 840
836 setattr_copy(inode, iattr); 841 setattr_copy(inode, iattr);
837 mark_inode_dirty(inode); 842 mark_inode_dirty(inode);
838 843
839 if (iattr->ia_valid & ATTR_MODE) { 844 if (iattr->ia_valid & ATTR_MODE) {
840 err = nilfs_acl_chmod(inode); 845 err = nilfs_acl_chmod(inode);
841 if (unlikely(err)) 846 if (unlikely(err))
842 goto out_err; 847 goto out_err;
843 } 848 }
844 849
845 return nilfs_transaction_commit(sb); 850 return nilfs_transaction_commit(sb);
846 851
847 out_err: 852 out_err:
848 nilfs_transaction_abort(sb); 853 nilfs_transaction_abort(sb);
849 return err; 854 return err;
850 } 855 }
851 856
852 int nilfs_permission(struct inode *inode, int mask) 857 int nilfs_permission(struct inode *inode, int mask)
853 { 858 {
854 struct nilfs_root *root = NILFS_I(inode)->i_root; 859 struct nilfs_root *root = NILFS_I(inode)->i_root;
855 if ((mask & MAY_WRITE) && root && 860 if ((mask & MAY_WRITE) && root &&
856 root->cno != NILFS_CPTREE_CURRENT_CNO) 861 root->cno != NILFS_CPTREE_CURRENT_CNO)
857 return -EROFS; /* snapshot is not writable */ 862 return -EROFS; /* snapshot is not writable */
858 863
859 return generic_permission(inode, mask); 864 return generic_permission(inode, mask);
860 } 865 }
861 866
862 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 867 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
863 { 868 {
864 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 869 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
865 struct nilfs_inode_info *ii = NILFS_I(inode); 870 struct nilfs_inode_info *ii = NILFS_I(inode);
866 int err; 871 int err;
867 872
868 spin_lock(&nilfs->ns_inode_lock); 873 spin_lock(&nilfs->ns_inode_lock);
869 if (ii->i_bh == NULL) { 874 if (ii->i_bh == NULL) {
870 spin_unlock(&nilfs->ns_inode_lock); 875 spin_unlock(&nilfs->ns_inode_lock);
871 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 876 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
872 inode->i_ino, pbh); 877 inode->i_ino, pbh);
873 if (unlikely(err)) 878 if (unlikely(err))
874 return err; 879 return err;
875 spin_lock(&nilfs->ns_inode_lock); 880 spin_lock(&nilfs->ns_inode_lock);
876 if (ii->i_bh == NULL) 881 if (ii->i_bh == NULL)
877 ii->i_bh = *pbh; 882 ii->i_bh = *pbh;
878 else { 883 else {
879 brelse(*pbh); 884 brelse(*pbh);
880 *pbh = ii->i_bh; 885 *pbh = ii->i_bh;
881 } 886 }
882 } else 887 } else
883 *pbh = ii->i_bh; 888 *pbh = ii->i_bh;
884 889
885 get_bh(*pbh); 890 get_bh(*pbh);
886 spin_unlock(&nilfs->ns_inode_lock); 891 spin_unlock(&nilfs->ns_inode_lock);
887 return 0; 892 return 0;
888 } 893 }
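nilfs_load_inode_block() caches the inode's ifile buffer head in ii->i_bh with a check, unlock, recheck pattern: the blocking nilfs_ifile_get_inode_block() call must not run under the ns_inode_lock spinlock, so the lock is dropped for the read and retaken afterwards, and if another task installed a buffer in the meantime the freshly read one is released with brelse() and the winner's buffer is used instead. The sketch below shows the same idiom in plain userspace C; the names (slot_get, load_object, put_object) and the pthread mutex standing in for the spinlock are hypothetical, so treat it as an illustration of the pattern rather than nilfs code:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical cache slot guarded by a lock.  load_object() stands in for the
 * blocking nilfs_ifile_get_inode_block() call, put_object() for brelse(). */
struct slot {
	pthread_mutex_t lock;
	void *cached;
};

static void *load_object(void)      /* may block; never call it under the lock */
{
	return malloc(1);
}

static void put_object(void *obj)   /* drop the reference we no longer need */
{
	free(obj);
}

static void *slot_get(struct slot *s)
{
	void *obj;

	pthread_mutex_lock(&s->lock);
	if (!s->cached) {
		pthread_mutex_unlock(&s->lock);
		obj = load_object();            /* blocking work outside the lock */
		pthread_mutex_lock(&s->lock);
		if (!s->cached) {
			s->cached = obj;        /* we won the race: install ours */
		} else {
			put_object(obj);        /* lost the race: discard ours */
			obj = s->cached;
		}
	} else {
		obj = s->cached;
	}
	pthread_mutex_unlock(&s->lock);
	return obj;
}

int main(void)
{
	struct slot s = { PTHREAD_MUTEX_INITIALIZER, NULL };

	printf("cached object at %p\n", slot_get(&s));
	free(s.cached);
	return 0;
}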
889 894
890 int nilfs_inode_dirty(struct inode *inode) 895 int nilfs_inode_dirty(struct inode *inode)
891 { 896 {
892 struct nilfs_inode_info *ii = NILFS_I(inode); 897 struct nilfs_inode_info *ii = NILFS_I(inode);
893 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 898 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
894 int ret = 0; 899 int ret = 0;
895 900
896 if (!list_empty(&ii->i_dirty)) { 901 if (!list_empty(&ii->i_dirty)) {
897 spin_lock(&nilfs->ns_inode_lock); 902 spin_lock(&nilfs->ns_inode_lock);
898 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 903 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
899 test_bit(NILFS_I_BUSY, &ii->i_state); 904 test_bit(NILFS_I_BUSY, &ii->i_state);
900 spin_unlock(&nilfs->ns_inode_lock); 905 spin_unlock(&nilfs->ns_inode_lock);
901 } 906 }
902 return ret; 907 return ret;
903 } 908 }
904 909
905 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 910 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
906 { 911 {
907 struct nilfs_inode_info *ii = NILFS_I(inode); 912 struct nilfs_inode_info *ii = NILFS_I(inode);
908 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 913 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
909 914
910 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); 915 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
911 916
912 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 917 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
913 return 0; 918 return 0;
914 919
915 spin_lock(&nilfs->ns_inode_lock); 920 spin_lock(&nilfs->ns_inode_lock);
916 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 921 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
917 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 922 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
918 /* Because this routine may race with nilfs_dispose_list(), 923 /* Because this routine may race with nilfs_dispose_list(),
919 we have to check NILFS_I_QUEUED here, too. */ 924 we have to check NILFS_I_QUEUED here, too. */
920 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 925 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
921 /* This will happen when somebody is freeing 926 /* This will happen when somebody is freeing
922 this inode. */ 927 this inode. */
923 nilfs_warning(inode->i_sb, __func__, 928 nilfs_warning(inode->i_sb, __func__,
924 "cannot get inode (ino=%lu)\n", 929 "cannot get inode (ino=%lu)\n",
925 inode->i_ino); 930 inode->i_ino);
926 spin_unlock(&nilfs->ns_inode_lock); 931 spin_unlock(&nilfs->ns_inode_lock);
927 return -EINVAL; /* NILFS_I_DIRTY may remain for 932 return -EINVAL; /* NILFS_I_DIRTY may remain for
928 freeing inode */ 933 freeing inode */
929 } 934 }
930 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); 935 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
931 set_bit(NILFS_I_QUEUED, &ii->i_state); 936 set_bit(NILFS_I_QUEUED, &ii->i_state);
932 } 937 }
933 spin_unlock(&nilfs->ns_inode_lock); 938 spin_unlock(&nilfs->ns_inode_lock);
934 return 0; 939 return 0;
935 } 940 }
936 941
937 int nilfs_mark_inode_dirty(struct inode *inode) 942 int nilfs_mark_inode_dirty(struct inode *inode)
938 { 943 {
939 struct buffer_head *ibh; 944 struct buffer_head *ibh;
940 int err; 945 int err;
941 946
942 err = nilfs_load_inode_block(inode, &ibh); 947 err = nilfs_load_inode_block(inode, &ibh);
943 if (unlikely(err)) { 948 if (unlikely(err)) {
944 nilfs_warning(inode->i_sb, __func__, 949 nilfs_warning(inode->i_sb, __func__,
945 "failed to reget inode block.\n"); 950 "failed to reget inode block.\n");
946 return err; 951 return err;
947 } 952 }
948 nilfs_update_inode(inode, ibh); 953 nilfs_update_inode(inode, ibh);
949 mark_buffer_dirty(ibh); 954 mark_buffer_dirty(ibh);
950 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); 955 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
951 brelse(ibh); 956 brelse(ibh);
952 return 0; 957 return 0;
953 } 958 }
954 959
955 /** 960 /**
956 * nilfs_dirty_inode - reflect changes on given inode to an inode block. 961 * nilfs_dirty_inode - reflect changes on given inode to an inode block.
957 * @inode: inode of the file to be registered. 962 * @inode: inode of the file to be registered.
958 * 963 *
959 * nilfs_dirty_inode() loads an inode block containing the specified 964 * nilfs_dirty_inode() loads an inode block containing the specified
960 * @inode and copies data from a nilfs_inode to a corresponding inode 965 * @inode and copies data from a nilfs_inode to a corresponding inode
961 * entry in the inode block. This operation is excluded from the segment 966 * entry in the inode block. This operation is excluded from the segment
962 * construction. This function can be called both as a single operation 967 * construction. This function can be called both as a single operation
963 * and as a part of indivisible file operations. 968 * and as a part of indivisible file operations.
964 */ 969 */
965 void nilfs_dirty_inode(struct inode *inode, int flags) 970 void nilfs_dirty_inode(struct inode *inode, int flags)
966 { 971 {
967 struct nilfs_transaction_info ti; 972 struct nilfs_transaction_info ti;
968 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 973 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
969 974
970 if (is_bad_inode(inode)) { 975 if (is_bad_inode(inode)) {
971 nilfs_warning(inode->i_sb, __func__, 976 nilfs_warning(inode->i_sb, __func__,
972 "tried to mark bad_inode dirty. ignored.\n"); 977 "tried to mark bad_inode dirty. ignored.\n");
973 dump_stack(); 978 dump_stack();
974 return; 979 return;
975 } 980 }
976 if (mdi) { 981 if (mdi) {
977 nilfs_mdt_mark_dirty(inode); 982 nilfs_mdt_mark_dirty(inode);
978 return; 983 return;
979 } 984 }
980 nilfs_transaction_begin(inode->i_sb, &ti, 0); 985 nilfs_transaction_begin(inode->i_sb, &ti, 0);
981 nilfs_mark_inode_dirty(inode); 986 nilfs_mark_inode_dirty(inode);
982 nilfs_transaction_commit(inode->i_sb); /* never fails */ 987 nilfs_transaction_commit(inode->i_sb); /* never fails */
983 } 988 }
984 989
985 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 990 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
986 __u64 start, __u64 len) 991 __u64 start, __u64 len)
987 { 992 {
988 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 993 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
989 __u64 logical = 0, phys = 0, size = 0; 994 __u64 logical = 0, phys = 0, size = 0;
990 __u32 flags = 0; 995 __u32 flags = 0;
991 loff_t isize; 996 loff_t isize;
992 sector_t blkoff, end_blkoff; 997 sector_t blkoff, end_blkoff;
993 sector_t delalloc_blkoff; 998 sector_t delalloc_blkoff;
994 unsigned long delalloc_blklen; 999 unsigned long delalloc_blklen;
995 unsigned int blkbits = inode->i_blkbits; 1000 unsigned int blkbits = inode->i_blkbits;
996 int ret, n; 1001 int ret, n;
997 1002
998 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); 1003 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
999 if (ret) 1004 if (ret)
1000 return ret; 1005 return ret;
1001 1006
1002 mutex_lock(&inode->i_mutex); 1007 mutex_lock(&inode->i_mutex);
1003 1008
1004 isize = i_size_read(inode); 1009 isize = i_size_read(inode);
1005 1010
1006 blkoff = start >> blkbits; 1011 blkoff = start >> blkbits;
1007 end_blkoff = (start + len - 1) >> blkbits; 1012 end_blkoff = (start + len - 1) >> blkbits;
1008 1013
1009 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, 1014 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
1010 &delalloc_blkoff); 1015 &delalloc_blkoff);
1011 1016
1012 do { 1017 do {
1013 __u64 blkphy; 1018 __u64 blkphy;
1014 unsigned int maxblocks; 1019 unsigned int maxblocks;
1015 1020
1016 if (delalloc_blklen && blkoff == delalloc_blkoff) { 1021 if (delalloc_blklen && blkoff == delalloc_blkoff) {
1017 if (size) { 1022 if (size) {
1018 /* End of the current extent */ 1023 /* End of the current extent */
1019 ret = fiemap_fill_next_extent( 1024 ret = fiemap_fill_next_extent(
1020 fieinfo, logical, phys, size, flags); 1025 fieinfo, logical, phys, size, flags);
1021 if (ret) 1026 if (ret)
1022 break; 1027 break;
1023 } 1028 }
1024 if (blkoff > end_blkoff) 1029 if (blkoff > end_blkoff)
1025 break; 1030 break;
1026 1031
1027 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; 1032 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
1028 logical = blkoff << blkbits; 1033 logical = blkoff << blkbits;
1029 phys = 0; 1034 phys = 0;
1030 size = delalloc_blklen << blkbits; 1035 size = delalloc_blklen << blkbits;
1031 1036
1032 blkoff = delalloc_blkoff + delalloc_blklen; 1037 blkoff = delalloc_blkoff + delalloc_blklen;
1033 delalloc_blklen = nilfs_find_uncommitted_extent( 1038 delalloc_blklen = nilfs_find_uncommitted_extent(
1034 inode, blkoff, &delalloc_blkoff); 1039 inode, blkoff, &delalloc_blkoff);
1035 continue; 1040 continue;
1036 } 1041 }
1037 1042
1038 /* 1043 /*
1039 * Limit the number of blocks that we look up so as 1044 * Limit the number of blocks that we look up so as
1040 * not to get into the next delayed allocation extent. 1045 * not to get into the next delayed allocation extent.
1041 */ 1046 */
1042 maxblocks = INT_MAX; 1047 maxblocks = INT_MAX;
1043 if (delalloc_blklen) 1048 if (delalloc_blklen)
1044 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, 1049 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
1045 maxblocks); 1050 maxblocks);
1046 blkphy = 0; 1051 blkphy = 0;
1047 1052
1048 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1053 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
1049 n = nilfs_bmap_lookup_contig( 1054 n = nilfs_bmap_lookup_contig(
1050 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); 1055 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
1051 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1056 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
1052 1057
1053 if (n < 0) { 1058 if (n < 0) {
1054 int past_eof; 1059 int past_eof;
1055 1060
1056 if (unlikely(n != -ENOENT)) 1061 if (unlikely(n != -ENOENT))
1057 break; /* error */ 1062 break; /* error */
1058 1063
1059 /* HOLE */ 1064 /* HOLE */
1060 blkoff++; 1065 blkoff++;
1061 past_eof = ((blkoff << blkbits) >= isize); 1066 past_eof = ((blkoff << blkbits) >= isize);
1062 1067
1063 if (size) { 1068 if (size) {
1064 /* End of the current extent */ 1069 /* End of the current extent */
1065 1070
1066 if (past_eof) 1071 if (past_eof)
1067 flags |= FIEMAP_EXTENT_LAST; 1072 flags |= FIEMAP_EXTENT_LAST;
1068 1073
1069 ret = fiemap_fill_next_extent( 1074 ret = fiemap_fill_next_extent(
1070 fieinfo, logical, phys, size, flags); 1075 fieinfo, logical, phys, size, flags);
1071 if (ret) 1076 if (ret)
1072 break; 1077 break;
1073 size = 0; 1078 size = 0;
1074 } 1079 }
1075 if (blkoff > end_blkoff || past_eof) 1080 if (blkoff > end_blkoff || past_eof)
1076 break; 1081 break;
1077 } else { 1082 } else {
1078 if (size) { 1083 if (size) {
1079 if (phys && blkphy << blkbits == phys + size) { 1084 if (phys && blkphy << blkbits == phys + size) {
1080 /* The current extent goes on */ 1085 /* The current extent goes on */
1081 size += n << blkbits; 1086 size += n << blkbits;
1082 } else { 1087 } else {
1083 /* Terminate the current extent */ 1088 /* Terminate the current extent */
1084 ret = fiemap_fill_next_extent( 1089 ret = fiemap_fill_next_extent(
1085 fieinfo, logical, phys, size, 1090 fieinfo, logical, phys, size,
1086 flags); 1091 flags);
1087 if (ret || blkoff > end_blkoff) 1092 if (ret || blkoff > end_blkoff)
1088 break; 1093 break;
1089 1094
1090 /* Start another extent */ 1095 /* Start another extent */
1091 flags = FIEMAP_EXTENT_MERGED; 1096 flags = FIEMAP_EXTENT_MERGED;
1092 logical = blkoff << blkbits; 1097 logical = blkoff << blkbits;
1093 phys = blkphy << blkbits; 1098 phys = blkphy << blkbits;
1094 size = n << blkbits; 1099 size = n << blkbits;
1095 } 1100 }
1096 } else { 1101 } else {
1097 /* Start a new extent */ 1102 /* Start a new extent */
1098 flags = FIEMAP_EXTENT_MERGED; 1103 flags = FIEMAP_EXTENT_MERGED;
1099 logical = blkoff << blkbits; 1104 logical = blkoff << blkbits;
1100 phys = blkphy << blkbits; 1105 phys = blkphy << blkbits;
1101 size = n << blkbits; 1106 size = n << blkbits;
1102 } 1107 }
1103 blkoff += n; 1108 blkoff += n;
1104 } 1109 }
1105 cond_resched(); 1110 cond_resched();
1106 } while (true); 1111 } while (true);
1107 1112
1108 /* If ret is 1 then we just hit the end of the extent array */ 1113 /* If ret is 1 then we just hit the end of the extent array */
1109 if (ret == 1) 1114 if (ret == 1)
1110 ret = 0; 1115 ret = 0;
1111 1116
1112 mutex_unlock(&inode->i_mutex); 1117 mutex_unlock(&inode->i_mutex);
1113 return ret; 1118 return ret;
1114 } 1119 }
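For context only (not part of this commit): the nilfs_fiemap loop above reports delayed-allocation ranges it finds via nilfs_find_uncommitted_extent() with FIEMAP_EXTENT_DELALLOC, and merged on-disk runs with FIEMAP_EXTENT_MERGED. Below is a minimal user-space sketch that exercises this path through the generic FS_IOC_FIEMAP ioctl and prints the extents it gets back; the 32-extent buffer and the particular flags printed are arbitrary choices for illustration, not anything taken from the diff.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_FIEMAP */
#include <linux/fiemap.h>	/* struct fiemap, FIEMAP_* flags */

int main(int argc, char **argv)
{
	struct fiemap *fm;
	int fd, i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Room for up to 32 extents per call (arbitrary for this sketch). */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm) {
		close(fd);
		return 1;
	}
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
	} else {
		for (i = 0; i < (int)fm->fm_mapped_extents; i++) {
			struct fiemap_extent *fe = &fm->fm_extents[i];

			printf("logical %llu phys %llu len %llu%s%s%s\n",
			       (unsigned long long)fe->fe_logical,
			       (unsigned long long)fe->fe_physical,
			       (unsigned long long)fe->fe_length,
			       (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " DELALLOC" : "",
			       (fe->fe_flags & FIEMAP_EXTENT_MERGED) ? " MERGED" : "",
			       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " LAST" : "");
		}
	}
	free(fm);
	close(fd);
	return 0;
}

On a file with dirty mmap'd pages that have not yet been written back, extents flagged DELALLOC here should correspond to the uncommitted ranges the loop above discovers before falling through to nilfs_bmap_lookup_contig().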
fs/ocfs2/dlm/dlmmaster.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * dlmmod.c 4 * dlmmod.c
5 * 5 *
6 * standalone DLM module 6 * standalone DLM module
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 * 24 *
25 */ 25 */
26 26
27 27
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/fs.h> 29 #include <linux/fs.h>
30 #include <linux/types.h> 30 #include <linux/types.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/highmem.h> 32 #include <linux/highmem.h>
33 #include <linux/init.h> 33 #include <linux/init.h>
34 #include <linux/sysctl.h> 34 #include <linux/sysctl.h>
35 #include <linux/random.h> 35 #include <linux/random.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/socket.h> 37 #include <linux/socket.h>
38 #include <linux/inet.h> 38 #include <linux/inet.h>
39 #include <linux/spinlock.h> 39 #include <linux/spinlock.h>
40 #include <linux/delay.h> 40 #include <linux/delay.h>
41 41
42 42
43 #include "cluster/heartbeat.h" 43 #include "cluster/heartbeat.h"
44 #include "cluster/nodemanager.h" 44 #include "cluster/nodemanager.h"
45 #include "cluster/tcp.h" 45 #include "cluster/tcp.h"
46 46
47 #include "dlmapi.h" 47 #include "dlmapi.h"
48 #include "dlmcommon.h" 48 #include "dlmcommon.h"
49 #include "dlmdomain.h" 49 #include "dlmdomain.h"
50 #include "dlmdebug.h" 50 #include "dlmdebug.h"
51 51
52 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 52 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
53 #include "cluster/masklog.h" 53 #include "cluster/masklog.h"
54 54
55 static void dlm_mle_node_down(struct dlm_ctxt *dlm, 55 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
56 struct dlm_master_list_entry *mle, 56 struct dlm_master_list_entry *mle,
57 struct o2nm_node *node, 57 struct o2nm_node *node,
58 int idx); 58 int idx);
59 static void dlm_mle_node_up(struct dlm_ctxt *dlm, 59 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
60 struct dlm_master_list_entry *mle, 60 struct dlm_master_list_entry *mle,
61 struct o2nm_node *node, 61 struct o2nm_node *node,
62 int idx); 62 int idx);
63 63
64 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); 64 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
65 static int dlm_do_assert_master(struct dlm_ctxt *dlm, 65 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res, 66 struct dlm_lock_resource *res,
67 void *nodemap, u32 flags); 67 void *nodemap, u32 flags);
68 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); 68 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
69 69
70 static inline int dlm_mle_equal(struct dlm_ctxt *dlm, 70 static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
71 struct dlm_master_list_entry *mle, 71 struct dlm_master_list_entry *mle,
72 const char *name, 72 const char *name,
73 unsigned int namelen) 73 unsigned int namelen)
74 { 74 {
75 if (dlm != mle->dlm) 75 if (dlm != mle->dlm)
76 return 0; 76 return 0;
77 77
78 if (namelen != mle->mnamelen || 78 if (namelen != mle->mnamelen ||
79 memcmp(name, mle->mname, namelen) != 0) 79 memcmp(name, mle->mname, namelen) != 0)
80 return 0; 80 return 0;
81 81
82 return 1; 82 return 1;
83 } 83 }
84 84
85 static struct kmem_cache *dlm_lockres_cache; 85 static struct kmem_cache *dlm_lockres_cache;
86 static struct kmem_cache *dlm_lockname_cache; 86 static struct kmem_cache *dlm_lockname_cache;
87 static struct kmem_cache *dlm_mle_cache; 87 static struct kmem_cache *dlm_mle_cache;
88 88
89 static void dlm_mle_release(struct kref *kref); 89 static void dlm_mle_release(struct kref *kref);
90 static void dlm_init_mle(struct dlm_master_list_entry *mle, 90 static void dlm_init_mle(struct dlm_master_list_entry *mle,
91 enum dlm_mle_type type, 91 enum dlm_mle_type type,
92 struct dlm_ctxt *dlm, 92 struct dlm_ctxt *dlm,
93 struct dlm_lock_resource *res, 93 struct dlm_lock_resource *res,
94 const char *name, 94 const char *name,
95 unsigned int namelen); 95 unsigned int namelen);
96 static void dlm_put_mle(struct dlm_master_list_entry *mle); 96 static void dlm_put_mle(struct dlm_master_list_entry *mle);
97 static void __dlm_put_mle(struct dlm_master_list_entry *mle); 97 static void __dlm_put_mle(struct dlm_master_list_entry *mle);
98 static int dlm_find_mle(struct dlm_ctxt *dlm, 98 static int dlm_find_mle(struct dlm_ctxt *dlm,
99 struct dlm_master_list_entry **mle, 99 struct dlm_master_list_entry **mle,
100 char *name, unsigned int namelen); 100 char *name, unsigned int namelen);
101 101
102 static int dlm_do_master_request(struct dlm_lock_resource *res, 102 static int dlm_do_master_request(struct dlm_lock_resource *res,
103 struct dlm_master_list_entry *mle, int to); 103 struct dlm_master_list_entry *mle, int to);
104 104
105 105
106 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 106 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
107 struct dlm_lock_resource *res, 107 struct dlm_lock_resource *res,
108 struct dlm_master_list_entry *mle, 108 struct dlm_master_list_entry *mle,
109 int *blocked); 109 int *blocked);
110 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 110 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
111 struct dlm_lock_resource *res, 111 struct dlm_lock_resource *res,
112 struct dlm_master_list_entry *mle, 112 struct dlm_master_list_entry *mle,
113 int blocked); 113 int blocked);
114 static int dlm_add_migration_mle(struct dlm_ctxt *dlm, 114 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
115 struct dlm_lock_resource *res, 115 struct dlm_lock_resource *res,
116 struct dlm_master_list_entry *mle, 116 struct dlm_master_list_entry *mle,
117 struct dlm_master_list_entry **oldmle, 117 struct dlm_master_list_entry **oldmle,
118 const char *name, unsigned int namelen, 118 const char *name, unsigned int namelen,
119 u8 new_master, u8 master); 119 u8 new_master, u8 master);
120 120
121 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 121 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
122 struct dlm_lock_resource *res); 122 struct dlm_lock_resource *res);
123 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 123 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
124 struct dlm_lock_resource *res); 124 struct dlm_lock_resource *res);
125 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, 125 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
126 struct dlm_lock_resource *res, 126 struct dlm_lock_resource *res,
127 u8 target); 127 u8 target);
128 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 128 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
129 struct dlm_lock_resource *res); 129 struct dlm_lock_resource *res);
130 130
131 131
132 int dlm_is_host_down(int errno) 132 int dlm_is_host_down(int errno)
133 { 133 {
134 switch (errno) { 134 switch (errno) {
135 case -EBADF: 135 case -EBADF:
136 case -ECONNREFUSED: 136 case -ECONNREFUSED:
137 case -ENOTCONN: 137 case -ENOTCONN:
138 case -ECONNRESET: 138 case -ECONNRESET:
139 case -EPIPE: 139 case -EPIPE:
140 case -EHOSTDOWN: 140 case -EHOSTDOWN:
141 case -EHOSTUNREACH: 141 case -EHOSTUNREACH:
142 case -ETIMEDOUT: 142 case -ETIMEDOUT:
143 case -ECONNABORTED: 143 case -ECONNABORTED:
144 case -ENETDOWN: 144 case -ENETDOWN:
145 case -ENETUNREACH: 145 case -ENETUNREACH:
146 case -ENETRESET: 146 case -ENETRESET:
147 case -ESHUTDOWN: 147 case -ESHUTDOWN:
148 case -ENOPROTOOPT: 148 case -ENOPROTOOPT:
149 case -EINVAL: /* if returned from our tcp code, 149 case -EINVAL: /* if returned from our tcp code,
150 this means there is no socket */ 150 this means there is no socket */
151 return 1; 151 return 1;
152 } 152 }
153 return 0; 153 return 0;
154 } 154 }
155 155
156 156
157 /* 157 /*
158 * MASTER LIST FUNCTIONS 158 * MASTER LIST FUNCTIONS
159 */ 159 */
160 160
161 161
162 /* 162 /*
163 * regarding master list entries and heartbeat callbacks: 163 * regarding master list entries and heartbeat callbacks:
164 * 164 *
165 * in order to avoid sleeping and allocation that occurs in 165 * in order to avoid sleeping and allocation that occurs in
166 * heartbeat, master list entries are simply attached to the 166 * heartbeat, master list entries are simply attached to the
167 * dlm's established heartbeat callbacks. the mle is attached 167 * dlm's established heartbeat callbacks. the mle is attached
168 * when it is created, and since the dlm->spinlock is held at 168 * when it is created, and since the dlm->spinlock is held at
169 * that time, any heartbeat event will be properly discovered 169 * that time, any heartbeat event will be properly discovered
170 * by the mle. the mle needs to be detached from the 170 * by the mle. the mle needs to be detached from the
171 * dlm->mle_hb_events list as soon as heartbeat events are no 171 * dlm->mle_hb_events list as soon as heartbeat events are no
172 * longer useful to the mle, and before the mle is freed. 172 * longer useful to the mle, and before the mle is freed.
173 * 173 *
174 * as a general rule, heartbeat events are no longer needed by 174 * as a general rule, heartbeat events are no longer needed by
175 * the mle once an "answer" regarding the lock master has been 175 * the mle once an "answer" regarding the lock master has been
176 * received. 176 * received.
177 */ 177 */
178 static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, 178 static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
179 struct dlm_master_list_entry *mle) 179 struct dlm_master_list_entry *mle)
180 { 180 {
181 assert_spin_locked(&dlm->spinlock); 181 assert_spin_locked(&dlm->spinlock);
182 182
183 list_add_tail(&mle->hb_events, &dlm->mle_hb_events); 183 list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
184 } 184 }
185 185
186 186
187 static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 187 static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
188 struct dlm_master_list_entry *mle) 188 struct dlm_master_list_entry *mle)
189 { 189 {
190 if (!list_empty(&mle->hb_events)) 190 if (!list_empty(&mle->hb_events))
191 list_del_init(&mle->hb_events); 191 list_del_init(&mle->hb_events);
192 } 192 }
193 193
194 194
195 static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 195 static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
196 struct dlm_master_list_entry *mle) 196 struct dlm_master_list_entry *mle)
197 { 197 {
198 spin_lock(&dlm->spinlock); 198 spin_lock(&dlm->spinlock);
199 __dlm_mle_detach_hb_events(dlm, mle); 199 __dlm_mle_detach_hb_events(dlm, mle);
200 spin_unlock(&dlm->spinlock); 200 spin_unlock(&dlm->spinlock);
201 } 201 }
202 202
203 static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) 203 static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
204 { 204 {
205 struct dlm_ctxt *dlm; 205 struct dlm_ctxt *dlm;
206 dlm = mle->dlm; 206 dlm = mle->dlm;
207 207
208 assert_spin_locked(&dlm->spinlock); 208 assert_spin_locked(&dlm->spinlock);
209 assert_spin_locked(&dlm->master_lock); 209 assert_spin_locked(&dlm->master_lock);
210 mle->inuse++; 210 mle->inuse++;
211 kref_get(&mle->mle_refs); 211 kref_get(&mle->mle_refs);
212 } 212 }
213 213
214 static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) 214 static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
215 { 215 {
216 struct dlm_ctxt *dlm; 216 struct dlm_ctxt *dlm;
217 dlm = mle->dlm; 217 dlm = mle->dlm;
218 218
219 spin_lock(&dlm->spinlock); 219 spin_lock(&dlm->spinlock);
220 spin_lock(&dlm->master_lock); 220 spin_lock(&dlm->master_lock);
221 mle->inuse--; 221 mle->inuse--;
222 __dlm_put_mle(mle); 222 __dlm_put_mle(mle);
223 spin_unlock(&dlm->master_lock); 223 spin_unlock(&dlm->master_lock);
224 spin_unlock(&dlm->spinlock); 224 spin_unlock(&dlm->spinlock);
225 225
226 } 226 }
227 227
228 /* remove from list and free */ 228 /* remove from list and free */
229 static void __dlm_put_mle(struct dlm_master_list_entry *mle) 229 static void __dlm_put_mle(struct dlm_master_list_entry *mle)
230 { 230 {
231 struct dlm_ctxt *dlm; 231 struct dlm_ctxt *dlm;
232 dlm = mle->dlm; 232 dlm = mle->dlm;
233 233
234 assert_spin_locked(&dlm->spinlock); 234 assert_spin_locked(&dlm->spinlock);
235 assert_spin_locked(&dlm->master_lock); 235 assert_spin_locked(&dlm->master_lock);
236 if (!atomic_read(&mle->mle_refs.refcount)) { 236 if (!atomic_read(&mle->mle_refs.refcount)) {
237 /* this may or may not crash, but who cares. 237 /* this may or may not crash, but who cares.
238 * it's a BUG. */ 238 * it's a BUG. */
239 mlog(ML_ERROR, "bad mle: %p\n", mle); 239 mlog(ML_ERROR, "bad mle: %p\n", mle);
240 dlm_print_one_mle(mle); 240 dlm_print_one_mle(mle);
241 BUG(); 241 BUG();
242 } else 242 } else
243 kref_put(&mle->mle_refs, dlm_mle_release); 243 kref_put(&mle->mle_refs, dlm_mle_release);
244 } 244 }
245 245
246 246
247 /* must not have any spinlocks coming in */ 247 /* must not have any spinlocks coming in */
248 static void dlm_put_mle(struct dlm_master_list_entry *mle) 248 static void dlm_put_mle(struct dlm_master_list_entry *mle)
249 { 249 {
250 struct dlm_ctxt *dlm; 250 struct dlm_ctxt *dlm;
251 dlm = mle->dlm; 251 dlm = mle->dlm;
252 252
253 spin_lock(&dlm->spinlock); 253 spin_lock(&dlm->spinlock);
254 spin_lock(&dlm->master_lock); 254 spin_lock(&dlm->master_lock);
255 __dlm_put_mle(mle); 255 __dlm_put_mle(mle);
256 spin_unlock(&dlm->master_lock); 256 spin_unlock(&dlm->master_lock);
257 spin_unlock(&dlm->spinlock); 257 spin_unlock(&dlm->spinlock);
258 } 258 }
259 259
260 static inline void dlm_get_mle(struct dlm_master_list_entry *mle) 260 static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
261 { 261 {
262 kref_get(&mle->mle_refs); 262 kref_get(&mle->mle_refs);
263 } 263 }
264 264
265 static void dlm_init_mle(struct dlm_master_list_entry *mle, 265 static void dlm_init_mle(struct dlm_master_list_entry *mle,
266 enum dlm_mle_type type, 266 enum dlm_mle_type type,
267 struct dlm_ctxt *dlm, 267 struct dlm_ctxt *dlm,
268 struct dlm_lock_resource *res, 268 struct dlm_lock_resource *res,
269 const char *name, 269 const char *name,
270 unsigned int namelen) 270 unsigned int namelen)
271 { 271 {
272 assert_spin_locked(&dlm->spinlock); 272 assert_spin_locked(&dlm->spinlock);
273 273
274 mle->dlm = dlm; 274 mle->dlm = dlm;
275 mle->type = type; 275 mle->type = type;
276 INIT_HLIST_NODE(&mle->master_hash_node); 276 INIT_HLIST_NODE(&mle->master_hash_node);
277 INIT_LIST_HEAD(&mle->hb_events); 277 INIT_LIST_HEAD(&mle->hb_events);
278 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 278 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
279 spin_lock_init(&mle->spinlock); 279 spin_lock_init(&mle->spinlock);
280 init_waitqueue_head(&mle->wq); 280 init_waitqueue_head(&mle->wq);
281 atomic_set(&mle->woken, 0); 281 atomic_set(&mle->woken, 0);
282 kref_init(&mle->mle_refs); 282 kref_init(&mle->mle_refs);
283 memset(mle->response_map, 0, sizeof(mle->response_map)); 283 memset(mle->response_map, 0, sizeof(mle->response_map));
284 mle->master = O2NM_MAX_NODES; 284 mle->master = O2NM_MAX_NODES;
285 mle->new_master = O2NM_MAX_NODES; 285 mle->new_master = O2NM_MAX_NODES;
286 mle->inuse = 0; 286 mle->inuse = 0;
287 287
288 BUG_ON(mle->type != DLM_MLE_BLOCK && 288 BUG_ON(mle->type != DLM_MLE_BLOCK &&
289 mle->type != DLM_MLE_MASTER && 289 mle->type != DLM_MLE_MASTER &&
290 mle->type != DLM_MLE_MIGRATION); 290 mle->type != DLM_MLE_MIGRATION);
291 291
292 if (mle->type == DLM_MLE_MASTER) { 292 if (mle->type == DLM_MLE_MASTER) {
293 BUG_ON(!res); 293 BUG_ON(!res);
294 mle->mleres = res; 294 mle->mleres = res;
295 memcpy(mle->mname, res->lockname.name, res->lockname.len); 295 memcpy(mle->mname, res->lockname.name, res->lockname.len);
296 mle->mnamelen = res->lockname.len; 296 mle->mnamelen = res->lockname.len;
297 mle->mnamehash = res->lockname.hash; 297 mle->mnamehash = res->lockname.hash;
298 } else { 298 } else {
299 BUG_ON(!name); 299 BUG_ON(!name);
300 mle->mleres = NULL; 300 mle->mleres = NULL;
301 memcpy(mle->mname, name, namelen); 301 memcpy(mle->mname, name, namelen);
302 mle->mnamelen = namelen; 302 mle->mnamelen = namelen;
303 mle->mnamehash = dlm_lockid_hash(name, namelen); 303 mle->mnamehash = dlm_lockid_hash(name, namelen);
304 } 304 }
305 305
306 atomic_inc(&dlm->mle_tot_count[mle->type]); 306 atomic_inc(&dlm->mle_tot_count[mle->type]);
307 atomic_inc(&dlm->mle_cur_count[mle->type]); 307 atomic_inc(&dlm->mle_cur_count[mle->type]);
308 308
309 /* copy off the node_map and register hb callbacks on our copy */ 309 /* copy off the node_map and register hb callbacks on our copy */
310 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 310 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
311 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 311 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
312 clear_bit(dlm->node_num, mle->vote_map); 312 clear_bit(dlm->node_num, mle->vote_map);
313 clear_bit(dlm->node_num, mle->node_map); 313 clear_bit(dlm->node_num, mle->node_map);
314 314
315 /* attach the mle to the domain node up/down events */ 315 /* attach the mle to the domain node up/down events */
316 __dlm_mle_attach_hb_events(dlm, mle); 316 __dlm_mle_attach_hb_events(dlm, mle);
317 } 317 }
318 318
319 void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 319 void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
320 { 320 {
321 assert_spin_locked(&dlm->spinlock); 321 assert_spin_locked(&dlm->spinlock);
322 assert_spin_locked(&dlm->master_lock); 322 assert_spin_locked(&dlm->master_lock);
323 323
324 if (!hlist_unhashed(&mle->master_hash_node)) 324 if (!hlist_unhashed(&mle->master_hash_node))
325 hlist_del_init(&mle->master_hash_node); 325 hlist_del_init(&mle->master_hash_node);
326 } 326 }
327 327
328 void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 328 void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
329 { 329 {
330 struct hlist_head *bucket; 330 struct hlist_head *bucket;
331 331
332 assert_spin_locked(&dlm->master_lock); 332 assert_spin_locked(&dlm->master_lock);
333 333
334 bucket = dlm_master_hash(dlm, mle->mnamehash); 334 bucket = dlm_master_hash(dlm, mle->mnamehash);
335 hlist_add_head(&mle->master_hash_node, bucket); 335 hlist_add_head(&mle->master_hash_node, bucket);
336 } 336 }
337 337
338 /* returns 1 if found, 0 if not */ 338 /* returns 1 if found, 0 if not */
339 static int dlm_find_mle(struct dlm_ctxt *dlm, 339 static int dlm_find_mle(struct dlm_ctxt *dlm,
340 struct dlm_master_list_entry **mle, 340 struct dlm_master_list_entry **mle,
341 char *name, unsigned int namelen) 341 char *name, unsigned int namelen)
342 { 342 {
343 struct dlm_master_list_entry *tmpmle; 343 struct dlm_master_list_entry *tmpmle;
344 struct hlist_head *bucket; 344 struct hlist_head *bucket;
345 unsigned int hash; 345 unsigned int hash;
346 346
347 assert_spin_locked(&dlm->master_lock); 347 assert_spin_locked(&dlm->master_lock);
348 348
349 hash = dlm_lockid_hash(name, namelen); 349 hash = dlm_lockid_hash(name, namelen);
350 bucket = dlm_master_hash(dlm, hash); 350 bucket = dlm_master_hash(dlm, hash);
351 hlist_for_each_entry(tmpmle, bucket, master_hash_node) { 351 hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
352 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 352 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
353 continue; 353 continue;
354 dlm_get_mle(tmpmle); 354 dlm_get_mle(tmpmle);
355 *mle = tmpmle; 355 *mle = tmpmle;
356 return 1; 356 return 1;
357 } 357 }
358 return 0; 358 return 0;
359 } 359 }
360 360
361 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 361 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
362 { 362 {
363 struct dlm_master_list_entry *mle; 363 struct dlm_master_list_entry *mle;
364 364
365 assert_spin_locked(&dlm->spinlock); 365 assert_spin_locked(&dlm->spinlock);
366 366
367 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 367 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
368 if (node_up) 368 if (node_up)
369 dlm_mle_node_up(dlm, mle, NULL, idx); 369 dlm_mle_node_up(dlm, mle, NULL, idx);
370 else 370 else
371 dlm_mle_node_down(dlm, mle, NULL, idx); 371 dlm_mle_node_down(dlm, mle, NULL, idx);
372 } 372 }
373 } 373 }
374 374
375 static void dlm_mle_node_down(struct dlm_ctxt *dlm, 375 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
376 struct dlm_master_list_entry *mle, 376 struct dlm_master_list_entry *mle,
377 struct o2nm_node *node, int idx) 377 struct o2nm_node *node, int idx)
378 { 378 {
379 spin_lock(&mle->spinlock); 379 spin_lock(&mle->spinlock);
380 380
381 if (!test_bit(idx, mle->node_map)) 381 if (!test_bit(idx, mle->node_map))
382 mlog(0, "node %u already removed from nodemap!\n", idx); 382 mlog(0, "node %u already removed from nodemap!\n", idx);
383 else 383 else
384 clear_bit(idx, mle->node_map); 384 clear_bit(idx, mle->node_map);
385 385
386 spin_unlock(&mle->spinlock); 386 spin_unlock(&mle->spinlock);
387 } 387 }
388 388
389 static void dlm_mle_node_up(struct dlm_ctxt *dlm, 389 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
390 struct dlm_master_list_entry *mle, 390 struct dlm_master_list_entry *mle,
391 struct o2nm_node *node, int idx) 391 struct o2nm_node *node, int idx)
392 { 392 {
393 spin_lock(&mle->spinlock); 393 spin_lock(&mle->spinlock);
394 394
395 if (test_bit(idx, mle->node_map)) 395 if (test_bit(idx, mle->node_map))
396 mlog(0, "node %u already in node map!\n", idx); 396 mlog(0, "node %u already in node map!\n", idx);
397 else 397 else
398 set_bit(idx, mle->node_map); 398 set_bit(idx, mle->node_map);
399 399
400 spin_unlock(&mle->spinlock); 400 spin_unlock(&mle->spinlock);
401 } 401 }
402 402
403 403
404 int dlm_init_mle_cache(void) 404 int dlm_init_mle_cache(void)
405 { 405 {
406 dlm_mle_cache = kmem_cache_create("o2dlm_mle", 406 dlm_mle_cache = kmem_cache_create("o2dlm_mle",
407 sizeof(struct dlm_master_list_entry), 407 sizeof(struct dlm_master_list_entry),
408 0, SLAB_HWCACHE_ALIGN, 408 0, SLAB_HWCACHE_ALIGN,
409 NULL); 409 NULL);
410 if (dlm_mle_cache == NULL) 410 if (dlm_mle_cache == NULL)
411 return -ENOMEM; 411 return -ENOMEM;
412 return 0; 412 return 0;
413 } 413 }
414 414
415 void dlm_destroy_mle_cache(void) 415 void dlm_destroy_mle_cache(void)
416 { 416 {
417 if (dlm_mle_cache) 417 if (dlm_mle_cache)
418 kmem_cache_destroy(dlm_mle_cache); 418 kmem_cache_destroy(dlm_mle_cache);
419 } 419 }
420 420
421 static void dlm_mle_release(struct kref *kref) 421 static void dlm_mle_release(struct kref *kref)
422 { 422 {
423 struct dlm_master_list_entry *mle; 423 struct dlm_master_list_entry *mle;
424 struct dlm_ctxt *dlm; 424 struct dlm_ctxt *dlm;
425 425
426 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 426 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
427 dlm = mle->dlm; 427 dlm = mle->dlm;
428 428
429 assert_spin_locked(&dlm->spinlock); 429 assert_spin_locked(&dlm->spinlock);
430 assert_spin_locked(&dlm->master_lock); 430 assert_spin_locked(&dlm->master_lock);
431 431
432 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, 432 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
433 mle->type); 433 mle->type);
434 434
435 /* remove from list if not already */ 435 /* remove from list if not already */
436 __dlm_unlink_mle(dlm, mle); 436 __dlm_unlink_mle(dlm, mle);
437 437
438 /* detach the mle from the domain node up/down events */ 438 /* detach the mle from the domain node up/down events */
439 __dlm_mle_detach_hb_events(dlm, mle); 439 __dlm_mle_detach_hb_events(dlm, mle);
440 440
441 atomic_dec(&dlm->mle_cur_count[mle->type]); 441 atomic_dec(&dlm->mle_cur_count[mle->type]);
442 442
443 /* NOTE: kfree under spinlock here. 443 /* NOTE: kfree under spinlock here.
444 * if this is bad, we can move this to a freelist. */ 444 * if this is bad, we can move this to a freelist. */
445 kmem_cache_free(dlm_mle_cache, mle); 445 kmem_cache_free(dlm_mle_cache, mle);
446 } 446 }
447 447
448 448
449 /* 449 /*
450 * LOCK RESOURCE FUNCTIONS 450 * LOCK RESOURCE FUNCTIONS
451 */ 451 */
452 452
453 int dlm_init_master_caches(void) 453 int dlm_init_master_caches(void)
454 { 454 {
455 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", 455 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
456 sizeof(struct dlm_lock_resource), 456 sizeof(struct dlm_lock_resource),
457 0, SLAB_HWCACHE_ALIGN, NULL); 457 0, SLAB_HWCACHE_ALIGN, NULL);
458 if (!dlm_lockres_cache) 458 if (!dlm_lockres_cache)
459 goto bail; 459 goto bail;
460 460
461 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", 461 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
462 DLM_LOCKID_NAME_MAX, 0, 462 DLM_LOCKID_NAME_MAX, 0,
463 SLAB_HWCACHE_ALIGN, NULL); 463 SLAB_HWCACHE_ALIGN, NULL);
464 if (!dlm_lockname_cache) 464 if (!dlm_lockname_cache)
465 goto bail; 465 goto bail;
466 466
467 return 0; 467 return 0;
468 bail: 468 bail:
469 dlm_destroy_master_caches(); 469 dlm_destroy_master_caches();
470 return -ENOMEM; 470 return -ENOMEM;
471 } 471 }
472 472
473 void dlm_destroy_master_caches(void) 473 void dlm_destroy_master_caches(void)
474 { 474 {
475 if (dlm_lockname_cache) { 475 if (dlm_lockname_cache) {
476 kmem_cache_destroy(dlm_lockname_cache); 476 kmem_cache_destroy(dlm_lockname_cache);
477 dlm_lockname_cache = NULL; 477 dlm_lockname_cache = NULL;
478 } 478 }
479 479
480 if (dlm_lockres_cache) { 480 if (dlm_lockres_cache) {
481 kmem_cache_destroy(dlm_lockres_cache); 481 kmem_cache_destroy(dlm_lockres_cache);
482 dlm_lockres_cache = NULL; 482 dlm_lockres_cache = NULL;
483 } 483 }
484 } 484 }
485 485
486 static void dlm_lockres_release(struct kref *kref) 486 static void dlm_lockres_release(struct kref *kref)
487 { 487 {
488 struct dlm_lock_resource *res; 488 struct dlm_lock_resource *res;
489 struct dlm_ctxt *dlm; 489 struct dlm_ctxt *dlm;
490 490
491 res = container_of(kref, struct dlm_lock_resource, refs); 491 res = container_of(kref, struct dlm_lock_resource, refs);
492 dlm = res->dlm; 492 dlm = res->dlm;
493 493
494 /* This should not happen -- all lockres' have a name 494 /* This should not happen -- all lockres' have a name
495 * associated with them at init time. */ 495 * associated with them at init time. */
496 BUG_ON(!res->lockname.name); 496 BUG_ON(!res->lockname.name);
497 497
498 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 498 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
499 res->lockname.name); 499 res->lockname.name);
500 500
501 spin_lock(&dlm->track_lock); 501 spin_lock(&dlm->track_lock);
502 if (!list_empty(&res->tracking)) 502 if (!list_empty(&res->tracking))
503 list_del_init(&res->tracking); 503 list_del_init(&res->tracking);
504 else { 504 else {
505 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", 505 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
506 res->lockname.len, res->lockname.name); 506 res->lockname.len, res->lockname.name);
507 dlm_print_one_lock_resource(res); 507 dlm_print_one_lock_resource(res);
508 } 508 }
509 spin_unlock(&dlm->track_lock); 509 spin_unlock(&dlm->track_lock);
510 510
511 atomic_dec(&dlm->res_cur_count); 511 atomic_dec(&dlm->res_cur_count);
512 512
513 if (!hlist_unhashed(&res->hash_node) || 513 if (!hlist_unhashed(&res->hash_node) ||
514 !list_empty(&res->granted) || 514 !list_empty(&res->granted) ||
515 !list_empty(&res->converting) || 515 !list_empty(&res->converting) ||
516 !list_empty(&res->blocked) || 516 !list_empty(&res->blocked) ||
517 !list_empty(&res->dirty) || 517 !list_empty(&res->dirty) ||
518 !list_empty(&res->recovering) || 518 !list_empty(&res->recovering) ||
519 !list_empty(&res->purge)) { 519 !list_empty(&res->purge)) {
520 mlog(ML_ERROR, 520 mlog(ML_ERROR,
521 "Going to BUG for resource %.*s." 521 "Going to BUG for resource %.*s."
522 " We're on a list! [%c%c%c%c%c%c%c]\n", 522 " We're on a list! [%c%c%c%c%c%c%c]\n",
523 res->lockname.len, res->lockname.name, 523 res->lockname.len, res->lockname.name,
524 !hlist_unhashed(&res->hash_node) ? 'H' : ' ', 524 !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
525 !list_empty(&res->granted) ? 'G' : ' ', 525 !list_empty(&res->granted) ? 'G' : ' ',
526 !list_empty(&res->converting) ? 'C' : ' ', 526 !list_empty(&res->converting) ? 'C' : ' ',
527 !list_empty(&res->blocked) ? 'B' : ' ', 527 !list_empty(&res->blocked) ? 'B' : ' ',
528 !list_empty(&res->dirty) ? 'D' : ' ', 528 !list_empty(&res->dirty) ? 'D' : ' ',
529 !list_empty(&res->recovering) ? 'R' : ' ', 529 !list_empty(&res->recovering) ? 'R' : ' ',
530 !list_empty(&res->purge) ? 'P' : ' '); 530 !list_empty(&res->purge) ? 'P' : ' ');
531 531
532 dlm_print_one_lock_resource(res); 532 dlm_print_one_lock_resource(res);
533 } 533 }
534 534
535 /* By the time we're ready to blow this guy away, we shouldn't 535 /* By the time we're ready to blow this guy away, we shouldn't
536 * be on any lists. */ 536 * be on any lists. */
537 BUG_ON(!hlist_unhashed(&res->hash_node)); 537 BUG_ON(!hlist_unhashed(&res->hash_node));
538 BUG_ON(!list_empty(&res->granted)); 538 BUG_ON(!list_empty(&res->granted));
539 BUG_ON(!list_empty(&res->converting)); 539 BUG_ON(!list_empty(&res->converting));
540 BUG_ON(!list_empty(&res->blocked)); 540 BUG_ON(!list_empty(&res->blocked));
541 BUG_ON(!list_empty(&res->dirty)); 541 BUG_ON(!list_empty(&res->dirty));
542 BUG_ON(!list_empty(&res->recovering)); 542 BUG_ON(!list_empty(&res->recovering));
543 BUG_ON(!list_empty(&res->purge)); 543 BUG_ON(!list_empty(&res->purge));
544 544
545 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); 545 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
546 546
547 kmem_cache_free(dlm_lockres_cache, res); 547 kmem_cache_free(dlm_lockres_cache, res);
548 } 548 }
549 549
550 void dlm_lockres_put(struct dlm_lock_resource *res) 550 void dlm_lockres_put(struct dlm_lock_resource *res)
551 { 551 {
552 kref_put(&res->refs, dlm_lockres_release); 552 kref_put(&res->refs, dlm_lockres_release);
553 } 553 }
554 554
555 static void dlm_init_lockres(struct dlm_ctxt *dlm, 555 static void dlm_init_lockres(struct dlm_ctxt *dlm,
556 struct dlm_lock_resource *res, 556 struct dlm_lock_resource *res,
557 const char *name, unsigned int namelen) 557 const char *name, unsigned int namelen)
558 { 558 {
559 char *qname; 559 char *qname;
560 560
561 /* If we memset here, we lose our reference to the kmalloc'd 561 /* If we memset here, we lose our reference to the kmalloc'd
562 * res->lockname.name, so be sure to init every field 562 * res->lockname.name, so be sure to init every field
563 * correctly! */ 563 * correctly! */
564 564
565 qname = (char *) res->lockname.name; 565 qname = (char *) res->lockname.name;
566 memcpy(qname, name, namelen); 566 memcpy(qname, name, namelen);
567 567
568 res->lockname.len = namelen; 568 res->lockname.len = namelen;
569 res->lockname.hash = dlm_lockid_hash(name, namelen); 569 res->lockname.hash = dlm_lockid_hash(name, namelen);
570 570
571 init_waitqueue_head(&res->wq); 571 init_waitqueue_head(&res->wq);
572 spin_lock_init(&res->spinlock); 572 spin_lock_init(&res->spinlock);
573 INIT_HLIST_NODE(&res->hash_node); 573 INIT_HLIST_NODE(&res->hash_node);
574 INIT_LIST_HEAD(&res->granted); 574 INIT_LIST_HEAD(&res->granted);
575 INIT_LIST_HEAD(&res->converting); 575 INIT_LIST_HEAD(&res->converting);
576 INIT_LIST_HEAD(&res->blocked); 576 INIT_LIST_HEAD(&res->blocked);
577 INIT_LIST_HEAD(&res->dirty); 577 INIT_LIST_HEAD(&res->dirty);
578 INIT_LIST_HEAD(&res->recovering); 578 INIT_LIST_HEAD(&res->recovering);
579 INIT_LIST_HEAD(&res->purge); 579 INIT_LIST_HEAD(&res->purge);
580 INIT_LIST_HEAD(&res->tracking); 580 INIT_LIST_HEAD(&res->tracking);
581 atomic_set(&res->asts_reserved, 0); 581 atomic_set(&res->asts_reserved, 0);
582 res->migration_pending = 0; 582 res->migration_pending = 0;
583 res->inflight_locks = 0; 583 res->inflight_locks = 0;
584 res->inflight_assert_workers = 0; 584 res->inflight_assert_workers = 0;
585 585
586 res->dlm = dlm; 586 res->dlm = dlm;
587 587
588 kref_init(&res->refs); 588 kref_init(&res->refs);
589 589
590 atomic_inc(&dlm->res_tot_count); 590 atomic_inc(&dlm->res_tot_count);
591 atomic_inc(&dlm->res_cur_count); 591 atomic_inc(&dlm->res_cur_count);
592 592
593 /* just for consistency */ 593 /* just for consistency */
594 spin_lock(&res->spinlock); 594 spin_lock(&res->spinlock);
595 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 595 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
596 spin_unlock(&res->spinlock); 596 spin_unlock(&res->spinlock);
597 597
598 res->state = DLM_LOCK_RES_IN_PROGRESS; 598 res->state = DLM_LOCK_RES_IN_PROGRESS;
599 599
600 res->last_used = 0; 600 res->last_used = 0;
601 601
602 spin_lock(&dlm->spinlock); 602 spin_lock(&dlm->spinlock);
603 list_add_tail(&res->tracking, &dlm->tracking_list); 603 list_add_tail(&res->tracking, &dlm->tracking_list);
604 spin_unlock(&dlm->spinlock); 604 spin_unlock(&dlm->spinlock);
605 605
606 memset(res->lvb, 0, DLM_LVB_LEN); 606 memset(res->lvb, 0, DLM_LVB_LEN);
607 memset(res->refmap, 0, sizeof(res->refmap)); 607 memset(res->refmap, 0, sizeof(res->refmap));
608 } 608 }
609 609
610 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, 610 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
611 const char *name, 611 const char *name,
612 unsigned int namelen) 612 unsigned int namelen)
613 { 613 {
614 struct dlm_lock_resource *res = NULL; 614 struct dlm_lock_resource *res = NULL;
615 615
616 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); 616 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
617 if (!res) 617 if (!res)
618 goto error; 618 goto error;
619 619
620 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); 620 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
621 if (!res->lockname.name) 621 if (!res->lockname.name)
622 goto error; 622 goto error;
623 623
624 dlm_init_lockres(dlm, res, name, namelen); 624 dlm_init_lockres(dlm, res, name, namelen);
625 return res; 625 return res;
626 626
627 error: 627 error:
628 if (res && res->lockname.name) 628 if (res && res->lockname.name)
629 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); 629 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
630 630
631 if (res) 631 if (res)
632 kmem_cache_free(dlm_lockres_cache, res); 632 kmem_cache_free(dlm_lockres_cache, res);
633 return NULL; 633 return NULL;
634 } 634 }
635 635
636 void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, 636 void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
637 struct dlm_lock_resource *res, int bit) 637 struct dlm_lock_resource *res, int bit)
638 { 638 {
639 assert_spin_locked(&res->spinlock); 639 assert_spin_locked(&res->spinlock);
640 640
641 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, 641 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
642 res->lockname.name, bit, __builtin_return_address(0)); 642 res->lockname.name, bit, __builtin_return_address(0));
643 643
644 set_bit(bit, res->refmap); 644 set_bit(bit, res->refmap);
645 } 645 }
646 646
647 void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, 647 void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
648 struct dlm_lock_resource *res, int bit) 648 struct dlm_lock_resource *res, int bit)
649 { 649 {
650 assert_spin_locked(&res->spinlock); 650 assert_spin_locked(&res->spinlock);
651 651
652 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, 652 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
653 res->lockname.name, bit, __builtin_return_address(0)); 653 res->lockname.name, bit, __builtin_return_address(0));
654 654
655 clear_bit(bit, res->refmap); 655 clear_bit(bit, res->refmap);
656 } 656 }
657 657
658 658 static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
659 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
660 struct dlm_lock_resource *res) 659 struct dlm_lock_resource *res)
661 { 660 {
662 assert_spin_locked(&res->spinlock);
663
664 res->inflight_locks++; 661 res->inflight_locks++;
665 662
666 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, 663 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
667 res->lockname.len, res->lockname.name, res->inflight_locks, 664 res->lockname.len, res->lockname.name, res->inflight_locks,
668 __builtin_return_address(0)); 665 __builtin_return_address(0));
669 } 666 }
670 667
668 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
669 struct dlm_lock_resource *res)
670 {
671 assert_spin_locked(&res->spinlock);
672 __dlm_lockres_grab_inflight_ref(dlm, res);
673 }
674
671 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 675 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
672 struct dlm_lock_resource *res) 676 struct dlm_lock_resource *res)
673 { 677 {
674 assert_spin_locked(&res->spinlock); 678 assert_spin_locked(&res->spinlock);
675 679
676 BUG_ON(res->inflight_locks == 0); 680 BUG_ON(res->inflight_locks == 0);
677 681
678 res->inflight_locks--; 682 res->inflight_locks--;
679 683
680 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, 684 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
681 res->lockname.len, res->lockname.name, res->inflight_locks, 685 res->lockname.len, res->lockname.name, res->inflight_locks,
682 __builtin_return_address(0)); 686 __builtin_return_address(0));
683 687
684 wake_up(&res->wq); 688 wake_up(&res->wq);
685 } 689 }
686 690
687 void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, 691 void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
688 struct dlm_lock_resource *res) 692 struct dlm_lock_resource *res)
689 { 693 {
690 assert_spin_locked(&res->spinlock); 694 assert_spin_locked(&res->spinlock);
691 res->inflight_assert_workers++; 695 res->inflight_assert_workers++;
692 mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", 696 mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
693 dlm->name, res->lockname.len, res->lockname.name, 697 dlm->name, res->lockname.len, res->lockname.name,
694 res->inflight_assert_workers); 698 res->inflight_assert_workers);
695 } 699 }
696 700
697 static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, 701 static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
698 struct dlm_lock_resource *res) 702 struct dlm_lock_resource *res)
699 { 703 {
700 spin_lock(&res->spinlock); 704 spin_lock(&res->spinlock);
701 __dlm_lockres_grab_inflight_worker(dlm, res); 705 __dlm_lockres_grab_inflight_worker(dlm, res);
702 spin_unlock(&res->spinlock); 706 spin_unlock(&res->spinlock);
703 } 707 }
704 708
705 static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, 709 static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
706 struct dlm_lock_resource *res) 710 struct dlm_lock_resource *res)
707 { 711 {
708 assert_spin_locked(&res->spinlock); 712 assert_spin_locked(&res->spinlock);
709 BUG_ON(res->inflight_assert_workers == 0); 713 BUG_ON(res->inflight_assert_workers == 0);
710 res->inflight_assert_workers--; 714 res->inflight_assert_workers--;
711 mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", 715 mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
712 dlm->name, res->lockname.len, res->lockname.name, 716 dlm->name, res->lockname.len, res->lockname.name,
713 res->inflight_assert_workers); 717 res->inflight_assert_workers);
714 } 718 }
715 719
716 static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, 720 static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
717 struct dlm_lock_resource *res) 721 struct dlm_lock_resource *res)
718 { 722 {
719 spin_lock(&res->spinlock); 723 spin_lock(&res->spinlock);
720 __dlm_lockres_drop_inflight_worker(dlm, res); 724 __dlm_lockres_drop_inflight_worker(dlm, res);
721 spin_unlock(&res->spinlock); 725 spin_unlock(&res->spinlock);
722 } 726 }
723 727
724 /* 728 /*
725 * lookup a lock resource by name. 729 * lookup a lock resource by name.
726 * may already exist in the hashtable. 730 * may already exist in the hashtable.
727 * lockid is null terminated 731 * lockid is null terminated
728 * 732 *
729 * if not, allocate enough for the lockres and for 733 * if not, allocate enough for the lockres and for
730 * the temporary structure used in doing the mastering. 734 * the temporary structure used in doing the mastering.
731 * 735 *
732 * also, do a lookup in the dlm->master_list to see 736 * also, do a lookup in the dlm->master_list to see
733 * if another node has begun mastering the same lock. 737 * if another node has begun mastering the same lock.
734 * if so, there should be a block entry in there 738 * if so, there should be a block entry in there
735 * for this name, and we should *not* attempt to master 739 * for this name, and we should *not* attempt to master
736 * the lock here. need to wait around for that node 740 * the lock here. need to wait around for that node
737 * to assert_master (or die). 741 * to assert_master (or die).
738 * 742 *
739 */ 743 */
740 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, 744 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
741 const char *lockid, 745 const char *lockid,
742 int namelen, 746 int namelen,
743 int flags) 747 int flags)
744 { 748 {
745 struct dlm_lock_resource *tmpres=NULL, *res=NULL; 749 struct dlm_lock_resource *tmpres=NULL, *res=NULL;
746 struct dlm_master_list_entry *mle = NULL; 750 struct dlm_master_list_entry *mle = NULL;
747 struct dlm_master_list_entry *alloc_mle = NULL; 751 struct dlm_master_list_entry *alloc_mle = NULL;
748 int blocked = 0; 752 int blocked = 0;
749 int ret, nodenum; 753 int ret, nodenum;
750 struct dlm_node_iter iter; 754 struct dlm_node_iter iter;
751 unsigned int hash; 755 unsigned int hash;
752 int tries = 0; 756 int tries = 0;
753 int bit, wait_on_recovery = 0; 757 int bit, wait_on_recovery = 0;
754 758
755 BUG_ON(!lockid); 759 BUG_ON(!lockid);
756 760
757 hash = dlm_lockid_hash(lockid, namelen); 761 hash = dlm_lockid_hash(lockid, namelen);
758 762
759 mlog(0, "get lockres %s (len %d)\n", lockid, namelen); 763 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
760 764
761 lookup: 765 lookup:
762 spin_lock(&dlm->spinlock); 766 spin_lock(&dlm->spinlock);
763 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 767 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
764 if (tmpres) { 768 if (tmpres) {
765 spin_unlock(&dlm->spinlock); 769 spin_unlock(&dlm->spinlock);
766 spin_lock(&tmpres->spinlock); 770 spin_lock(&tmpres->spinlock);
767 /* Wait on the thread that is mastering the resource */ 771 /* Wait on the thread that is mastering the resource */
768 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 772 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
769 __dlm_wait_on_lockres(tmpres); 773 __dlm_wait_on_lockres(tmpres);
770 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 774 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
771 spin_unlock(&tmpres->spinlock); 775 spin_unlock(&tmpres->spinlock);
772 dlm_lockres_put(tmpres); 776 dlm_lockres_put(tmpres);
773 tmpres = NULL; 777 tmpres = NULL;
774 goto lookup; 778 goto lookup;
775 } 779 }
776 780
777 /* Wait on the resource purge to complete before continuing */ 781 /* Wait on the resource purge to complete before continuing */
778 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { 782 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
779 BUG_ON(tmpres->owner == dlm->node_num); 783 BUG_ON(tmpres->owner == dlm->node_num);
780 __dlm_wait_on_lockres_flags(tmpres, 784 __dlm_wait_on_lockres_flags(tmpres,
781 DLM_LOCK_RES_DROPPING_REF); 785 DLM_LOCK_RES_DROPPING_REF);
782 spin_unlock(&tmpres->spinlock); 786 spin_unlock(&tmpres->spinlock);
783 dlm_lockres_put(tmpres); 787 dlm_lockres_put(tmpres);
784 tmpres = NULL; 788 tmpres = NULL;
785 goto lookup; 789 goto lookup;
786 } 790 }
787 791
788 /* Grab inflight ref to pin the resource */ 792 /* Grab inflight ref to pin the resource */
789 dlm_lockres_grab_inflight_ref(dlm, tmpres); 793 dlm_lockres_grab_inflight_ref(dlm, tmpres);
790 794
791 spin_unlock(&tmpres->spinlock); 795 spin_unlock(&tmpres->spinlock);
792 if (res) 796 if (res)
793 dlm_lockres_put(res); 797 dlm_lockres_put(res);
794 res = tmpres; 798 res = tmpres;
795 goto leave; 799 goto leave;
796 } 800 }
797 801
798 if (!res) { 802 if (!res) {
799 spin_unlock(&dlm->spinlock); 803 spin_unlock(&dlm->spinlock);
800 mlog(0, "allocating a new resource\n"); 804 mlog(0, "allocating a new resource\n");
801 /* nothing found and we need to allocate one. */ 805 /* nothing found and we need to allocate one. */
802 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 806 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
803 if (!alloc_mle) 807 if (!alloc_mle)
804 goto leave; 808 goto leave;
805 res = dlm_new_lockres(dlm, lockid, namelen); 809 res = dlm_new_lockres(dlm, lockid, namelen);
806 if (!res) 810 if (!res)
807 goto leave; 811 goto leave;
808 goto lookup; 812 goto lookup;
809 } 813 }
810 814
811 mlog(0, "no lockres found, allocated our own: %p\n", res); 815 mlog(0, "no lockres found, allocated our own: %p\n", res);
812 816
813 if (flags & LKM_LOCAL) { 817 if (flags & LKM_LOCAL) {
814 /* caller knows it's safe to assume it's not mastered elsewhere 818 /* caller knows it's safe to assume it's not mastered elsewhere
815 * DONE! return right away */ 819 * DONE! return right away */
816 spin_lock(&res->spinlock); 820 spin_lock(&res->spinlock);
817 dlm_change_lockres_owner(dlm, res, dlm->node_num); 821 dlm_change_lockres_owner(dlm, res, dlm->node_num);
818 __dlm_insert_lockres(dlm, res); 822 __dlm_insert_lockres(dlm, res);
819 dlm_lockres_grab_inflight_ref(dlm, res); 823 dlm_lockres_grab_inflight_ref(dlm, res);
820 spin_unlock(&res->spinlock); 824 spin_unlock(&res->spinlock);
821 spin_unlock(&dlm->spinlock); 825 spin_unlock(&dlm->spinlock);
822 /* lockres still marked IN_PROGRESS */ 826 /* lockres still marked IN_PROGRESS */
823 goto wake_waiters; 827 goto wake_waiters;
824 } 828 }
825 829
826 /* check master list to see if another node has started mastering it */ 830 /* check master list to see if another node has started mastering it */
827 spin_lock(&dlm->master_lock); 831 spin_lock(&dlm->master_lock);
828 832
829 /* if we found a block, wait for lock to be mastered by another node */ 833 /* if we found a block, wait for lock to be mastered by another node */
830 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); 834 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
831 if (blocked) { 835 if (blocked) {
832 int mig; 836 int mig;
833 if (mle->type == DLM_MLE_MASTER) { 837 if (mle->type == DLM_MLE_MASTER) {
834 mlog(ML_ERROR, "master entry for nonexistent lock!\n"); 838 mlog(ML_ERROR, "master entry for nonexistent lock!\n");
835 BUG(); 839 BUG();
836 } 840 }
837 mig = (mle->type == DLM_MLE_MIGRATION); 841 mig = (mle->type == DLM_MLE_MIGRATION);
838 /* if there is a migration in progress, let the migration 842 /* if there is a migration in progress, let the migration
839 * finish before continuing. we can wait for the absence 843 * finish before continuing. we can wait for the absence
840 * of the MIGRATION mle: either the migrate finished or 844 * of the MIGRATION mle: either the migrate finished or
841 * one of the nodes died and the mle was cleaned up. 845 * one of the nodes died and the mle was cleaned up.
842 * if there is a BLOCK here, but it already has a master 846 * if there is a BLOCK here, but it already has a master
843 * set, we are too late. the master does not have a ref 847 * set, we are too late. the master does not have a ref
844 * for us in the refmap. detach the mle and drop it. 848 * for us in the refmap. detach the mle and drop it.
845 * either way, go back to the top and start over. */ 849 * either way, go back to the top and start over. */
846 if (mig || mle->master != O2NM_MAX_NODES) { 850 if (mig || mle->master != O2NM_MAX_NODES) {
847 BUG_ON(mig && mle->master == dlm->node_num); 851 BUG_ON(mig && mle->master == dlm->node_num);
848 /* we arrived too late. the master does not 852 /* we arrived too late. the master does not
849 * have a ref for us. retry. */ 853 * have a ref for us. retry. */
850 mlog(0, "%s:%.*s: late on %s\n", 854 mlog(0, "%s:%.*s: late on %s\n",
851 dlm->name, namelen, lockid, 855 dlm->name, namelen, lockid,
852 mig ? "MIGRATION" : "BLOCK"); 856 mig ? "MIGRATION" : "BLOCK");
853 spin_unlock(&dlm->master_lock); 857 spin_unlock(&dlm->master_lock);
854 spin_unlock(&dlm->spinlock); 858 spin_unlock(&dlm->spinlock);
855 859
856 /* master is known, detach */ 860 /* master is known, detach */
857 if (!mig) 861 if (!mig)
858 dlm_mle_detach_hb_events(dlm, mle); 862 dlm_mle_detach_hb_events(dlm, mle);
859 dlm_put_mle(mle); 863 dlm_put_mle(mle);
860 mle = NULL; 864 mle = NULL;
861 /* this is lame, but we can't wait on either 865 /* this is lame, but we can't wait on either
862 * the mle or lockres waitqueue here */ 866 * the mle or lockres waitqueue here */
863 if (mig) 867 if (mig)
864 msleep(100); 868 msleep(100);
865 goto lookup; 869 goto lookup;
866 } 870 }
867 } else { 871 } else {
868 /* go ahead and try to master lock on this node */ 872 /* go ahead and try to master lock on this node */
869 mle = alloc_mle; 873 mle = alloc_mle;
870 /* make sure this does not get freed below */ 874 /* make sure this does not get freed below */
871 alloc_mle = NULL; 875 alloc_mle = NULL;
872 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); 876 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
873 set_bit(dlm->node_num, mle->maybe_map); 877 set_bit(dlm->node_num, mle->maybe_map);
874 __dlm_insert_mle(dlm, mle); 878 __dlm_insert_mle(dlm, mle);
875 879
876 /* still holding the dlm spinlock, check the recovery map 880 /* still holding the dlm spinlock, check the recovery map
877 * to see if there are any nodes that still need to be 881 * to see if there are any nodes that still need to be
878 * considered. these will not appear in the mle nodemap 882 * considered. these will not appear in the mle nodemap
879 * but they might own this lockres. wait on them. */ 883 * but they might own this lockres. wait on them. */
880 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 884 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
881 if (bit < O2NM_MAX_NODES) { 885 if (bit < O2NM_MAX_NODES) {
882 mlog(0, "%s: res %.*s, At least one node (%d) " 886 mlog(0, "%s: res %.*s, At least one node (%d) "
883 "to recover before lock mastery can begin\n", 887 "to recover before lock mastery can begin\n",
884 dlm->name, namelen, (char *)lockid, bit); 888 dlm->name, namelen, (char *)lockid, bit);
885 wait_on_recovery = 1; 889 wait_on_recovery = 1;
886 } 890 }
887 } 891 }
888 892
889 /* at this point there is either a DLM_MLE_BLOCK or a 893 /* at this point there is either a DLM_MLE_BLOCK or a
890 * DLM_MLE_MASTER on the master list, so it's safe to add the 894 * DLM_MLE_MASTER on the master list, so it's safe to add the
891 * lockres to the hashtable. anyone who finds the lock will 895 * lockres to the hashtable. anyone who finds the lock will
892 * still have to wait on the IN_PROGRESS. */ 896 * still have to wait on the IN_PROGRESS. */
893 897
894 /* finally add the lockres to its hash bucket */ 898 /* finally add the lockres to its hash bucket */
895 __dlm_insert_lockres(dlm, res); 899 __dlm_insert_lockres(dlm, res);
896 900
897 /* Grab inflight ref to pin the resource */ 901 /* since this lockres is new it doesn't require the spinlock */
898 spin_lock(&res->spinlock); 902 __dlm_lockres_grab_inflight_ref(dlm, res);
899 dlm_lockres_grab_inflight_ref(dlm, res);
900 spin_unlock(&res->spinlock);
901 903
902 /* get an extra ref on the mle in case this is a BLOCK 904 /* get an extra ref on the mle in case this is a BLOCK
903 * if so, the creator of the BLOCK may try to put the last 905 * if so, the creator of the BLOCK may try to put the last
904 * ref at this time in the assert master handler, so we 906 * ref at this time in the assert master handler, so we
905 * need an extra one to keep from a bad ptr deref. */ 907 * need an extra one to keep from a bad ptr deref. */
906 dlm_get_mle_inuse(mle); 908 dlm_get_mle_inuse(mle);
907 spin_unlock(&dlm->master_lock); 909 spin_unlock(&dlm->master_lock);
908 spin_unlock(&dlm->spinlock); 910 spin_unlock(&dlm->spinlock);
909 911
910 redo_request: 912 redo_request:
911 while (wait_on_recovery) { 913 while (wait_on_recovery) {
912 /* any cluster changes that occurred after dropping the 914 /* any cluster changes that occurred after dropping the
913 * dlm spinlock would be detectable by a change on the mle, 915 * dlm spinlock would be detectable by a change on the mle,
914 * so we only need to clear out the recovery map once. */ 916 * so we only need to clear out the recovery map once. */
915 if (dlm_is_recovery_lock(lockid, namelen)) { 917 if (dlm_is_recovery_lock(lockid, namelen)) {
916 mlog(0, "%s: Recovery map is not empty, but must " 918 mlog(0, "%s: Recovery map is not empty, but must "
917 "master $RECOVERY lock now\n", dlm->name); 919 "master $RECOVERY lock now\n", dlm->name);
918 if (!dlm_pre_master_reco_lockres(dlm, res)) 920 if (!dlm_pre_master_reco_lockres(dlm, res))
919 wait_on_recovery = 0; 921 wait_on_recovery = 0;
920 else { 922 else {
921 mlog(0, "%s: waiting 500ms for heartbeat state " 923 mlog(0, "%s: waiting 500ms for heartbeat state "
922 "change\n", dlm->name); 924 "change\n", dlm->name);
923 msleep(500); 925 msleep(500);
924 } 926 }
925 continue; 927 continue;
926 } 928 }
927 929
928 dlm_kick_recovery_thread(dlm); 930 dlm_kick_recovery_thread(dlm);
929 msleep(1000); 931 msleep(1000);
930 dlm_wait_for_recovery(dlm); 932 dlm_wait_for_recovery(dlm);
931 933
932 spin_lock(&dlm->spinlock); 934 spin_lock(&dlm->spinlock);
933 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 935 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
934 if (bit < O2NM_MAX_NODES) { 936 if (bit < O2NM_MAX_NODES) {
935 mlog(0, "%s: res %.*s, At least one node (%d) " 937 mlog(0, "%s: res %.*s, At least one node (%d) "
936 "to recover before lock mastery can begin\n", 938 "to recover before lock mastery can begin\n",
937 dlm->name, namelen, (char *)lockid, bit); 939 dlm->name, namelen, (char *)lockid, bit);
938 wait_on_recovery = 1; 940 wait_on_recovery = 1;
939 } else 941 } else
940 wait_on_recovery = 0; 942 wait_on_recovery = 0;
941 spin_unlock(&dlm->spinlock); 943 spin_unlock(&dlm->spinlock);
942 944
943 if (wait_on_recovery) 945 if (wait_on_recovery)
944 dlm_wait_for_node_recovery(dlm, bit, 10000); 946 dlm_wait_for_node_recovery(dlm, bit, 10000);
945 } 947 }
946 948
947 /* must wait for lock to be mastered elsewhere */ 949 /* must wait for lock to be mastered elsewhere */
948 if (blocked) 950 if (blocked)
949 goto wait; 951 goto wait;
950 952
951 ret = -EINVAL; 953 ret = -EINVAL;
952 dlm_node_iter_init(mle->vote_map, &iter); 954 dlm_node_iter_init(mle->vote_map, &iter);
953 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 955 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
954 ret = dlm_do_master_request(res, mle, nodenum); 956 ret = dlm_do_master_request(res, mle, nodenum);
955 if (ret < 0) 957 if (ret < 0)
956 mlog_errno(ret); 958 mlog_errno(ret);
957 if (mle->master != O2NM_MAX_NODES) { 959 if (mle->master != O2NM_MAX_NODES) {
958 /* found a master! */ 960 /* found a master! */
959 if (mle->master <= nodenum) 961 if (mle->master <= nodenum)
960 break; 962 break;
961 /* if our master request has not reached the master 963 /* if our master request has not reached the master
962 * yet, keep going until it does. this is how the 964 * yet, keep going until it does. this is how the
963 * master will know that asserts are needed back to 965 * master will know that asserts are needed back to
964 * the lower nodes. */ 966 * the lower nodes. */
965 mlog(0, "%s: res %.*s, Requests only up to %u but " 967 mlog(0, "%s: res %.*s, Requests only up to %u but "
966 "master is %u, keep going\n", dlm->name, namelen, 968 "master is %u, keep going\n", dlm->name, namelen,
967 lockid, nodenum, mle->master); 969 lockid, nodenum, mle->master);
968 } 970 }
969 } 971 }
970 972
971 wait: 973 wait:
972 /* keep going until the response map includes all nodes */ 974 /* keep going until the response map includes all nodes */
973 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 975 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
974 if (ret < 0) { 976 if (ret < 0) {
975 wait_on_recovery = 1; 977 wait_on_recovery = 1;
976 mlog(0, "%s: res %.*s, Node map changed, redo the master " 978 mlog(0, "%s: res %.*s, Node map changed, redo the master "
977 "request now, blocked=%d\n", dlm->name, res->lockname.len, 979 "request now, blocked=%d\n", dlm->name, res->lockname.len,
978 res->lockname.name, blocked); 980 res->lockname.name, blocked);
979 if (++tries > 20) { 981 if (++tries > 20) {
980 mlog(ML_ERROR, "%s: res %.*s, Spinning on " 982 mlog(ML_ERROR, "%s: res %.*s, Spinning on "
981 "dlm_wait_for_lock_mastery, blocked = %d\n", 983 "dlm_wait_for_lock_mastery, blocked = %d\n",
982 dlm->name, res->lockname.len, 984 dlm->name, res->lockname.len,
983 res->lockname.name, blocked); 985 res->lockname.name, blocked);
984 dlm_print_one_lock_resource(res); 986 dlm_print_one_lock_resource(res);
985 dlm_print_one_mle(mle); 987 dlm_print_one_mle(mle);
986 tries = 0; 988 tries = 0;
987 } 989 }
988 goto redo_request; 990 goto redo_request;
989 } 991 }
990 992
991 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, 993 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
992 res->lockname.name, res->owner); 994 res->lockname.name, res->owner);
993 /* make sure we never continue without this */ 995 /* make sure we never continue without this */
994 BUG_ON(res->owner == O2NM_MAX_NODES); 996 BUG_ON(res->owner == O2NM_MAX_NODES);
995 997
996 /* master is known, detach if not already detached */ 998 /* master is known, detach if not already detached */
997 dlm_mle_detach_hb_events(dlm, mle); 999 dlm_mle_detach_hb_events(dlm, mle);
998 dlm_put_mle(mle); 1000 dlm_put_mle(mle);
999 /* put the extra ref */ 1001 /* put the extra ref */
1000 dlm_put_mle_inuse(mle); 1002 dlm_put_mle_inuse(mle);
1001 1003
1002 wake_waiters: 1004 wake_waiters:
1003 spin_lock(&res->spinlock); 1005 spin_lock(&res->spinlock);
1004 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 1006 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1005 spin_unlock(&res->spinlock); 1007 spin_unlock(&res->spinlock);
1006 wake_up(&res->wq); 1008 wake_up(&res->wq);
1007 1009
1008 leave: 1010 leave:
1009 /* need to free the unused mle */ 1011 /* need to free the unused mle */
1010 if (alloc_mle) 1012 if (alloc_mle)
1011 kmem_cache_free(dlm_mle_cache, alloc_mle); 1013 kmem_cache_free(dlm_mle_cache, alloc_mle);
1012 1014
1013 return res; 1015 return res;
1014 } 1016 }
1015 1017
1016 1018
1017 #define DLM_MASTERY_TIMEOUT_MS 5000 1019 #define DLM_MASTERY_TIMEOUT_MS 5000
1018 1020
1019 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 1021 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
1020 struct dlm_lock_resource *res, 1022 struct dlm_lock_resource *res,
1021 struct dlm_master_list_entry *mle, 1023 struct dlm_master_list_entry *mle,
1022 int *blocked) 1024 int *blocked)
1023 { 1025 {
1024 u8 m; 1026 u8 m;
1025 int ret, bit; 1027 int ret, bit;
1026 int map_changed, voting_done; 1028 int map_changed, voting_done;
1027 int assert, sleep; 1029 int assert, sleep;
1028 1030
1029 recheck: 1031 recheck:
1030 ret = 0; 1032 ret = 0;
1031 assert = 0; 1033 assert = 0;
1032 1034
1033 /* check if another node has already become the owner */ 1035 /* check if another node has already become the owner */
1034 spin_lock(&res->spinlock); 1036 spin_lock(&res->spinlock);
1035 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1037 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1036 mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, 1038 mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
1037 res->lockname.len, res->lockname.name, res->owner); 1039 res->lockname.len, res->lockname.name, res->owner);
1038 spin_unlock(&res->spinlock); 1040 spin_unlock(&res->spinlock);
1039 /* this will cause the master to re-assert across 1041 /* this will cause the master to re-assert across
1040 * the whole cluster, freeing up mles */ 1042 * the whole cluster, freeing up mles */
1041 if (res->owner != dlm->node_num) { 1043 if (res->owner != dlm->node_num) {
1042 ret = dlm_do_master_request(res, mle, res->owner); 1044 ret = dlm_do_master_request(res, mle, res->owner);
1043 if (ret < 0) { 1045 if (ret < 0) {
1044 /* give recovery a chance to run */ 1046 /* give recovery a chance to run */
1045 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); 1047 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
1046 msleep(500); 1048 msleep(500);
1047 goto recheck; 1049 goto recheck;
1048 } 1050 }
1049 } 1051 }
1050 ret = 0; 1052 ret = 0;
1051 goto leave; 1053 goto leave;
1052 } 1054 }
1053 spin_unlock(&res->spinlock); 1055 spin_unlock(&res->spinlock);
1054 1056
1055 spin_lock(&mle->spinlock); 1057 spin_lock(&mle->spinlock);
1056 m = mle->master; 1058 m = mle->master;
1057 map_changed = (memcmp(mle->vote_map, mle->node_map, 1059 map_changed = (memcmp(mle->vote_map, mle->node_map,
1058 sizeof(mle->vote_map)) != 0); 1060 sizeof(mle->vote_map)) != 0);
1059 voting_done = (memcmp(mle->vote_map, mle->response_map, 1061 voting_done = (memcmp(mle->vote_map, mle->response_map,
1060 sizeof(mle->vote_map)) == 0); 1062 sizeof(mle->vote_map)) == 0);
1061 1063
1062 /* restart if we hit any errors */ 1064 /* restart if we hit any errors */
1063 if (map_changed) { 1065 if (map_changed) {
1064 int b; 1066 int b;
1065 mlog(0, "%s: %.*s: node map changed, restarting\n", 1067 mlog(0, "%s: %.*s: node map changed, restarting\n",
1066 dlm->name, res->lockname.len, res->lockname.name); 1068 dlm->name, res->lockname.len, res->lockname.name);
1067 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); 1069 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1068 b = (mle->type == DLM_MLE_BLOCK); 1070 b = (mle->type == DLM_MLE_BLOCK);
1069 if ((*blocked && !b) || (!*blocked && b)) { 1071 if ((*blocked && !b) || (!*blocked && b)) {
1070 mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 1072 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
1071 dlm->name, res->lockname.len, res->lockname.name, 1073 dlm->name, res->lockname.len, res->lockname.name,
1072 *blocked, b); 1074 *blocked, b);
1073 *blocked = b; 1075 *blocked = b;
1074 } 1076 }
1075 spin_unlock(&mle->spinlock); 1077 spin_unlock(&mle->spinlock);
1076 if (ret < 0) { 1078 if (ret < 0) {
1077 mlog_errno(ret); 1079 mlog_errno(ret);
1078 goto leave; 1080 goto leave;
1079 } 1081 }
1080 mlog(0, "%s:%.*s: restart lock mastery succeeded, " 1082 mlog(0, "%s:%.*s: restart lock mastery succeeded, "
1081 "rechecking now\n", dlm->name, res->lockname.len, 1083 "rechecking now\n", dlm->name, res->lockname.len,
1082 res->lockname.name); 1084 res->lockname.name);
1083 goto recheck; 1085 goto recheck;
1084 } else { 1086 } else {
1085 if (!voting_done) { 1087 if (!voting_done) {
1086 mlog(0, "map not changed and voting not done " 1088 mlog(0, "map not changed and voting not done "
1087 "for %s:%.*s\n", dlm->name, res->lockname.len, 1089 "for %s:%.*s\n", dlm->name, res->lockname.len,
1088 res->lockname.name); 1090 res->lockname.name);
1089 } 1091 }
1090 } 1092 }
1091 1093
1092 if (m != O2NM_MAX_NODES) { 1094 if (m != O2NM_MAX_NODES) {
1093 /* another node has done an assert! 1095 /* another node has done an assert!
1094 * all done! */ 1096 * all done! */
1095 sleep = 0; 1097 sleep = 0;
1096 } else { 1098 } else {
1097 sleep = 1; 1099 sleep = 1;
1098 /* have all nodes responded? */ 1100 /* have all nodes responded? */
1099 if (voting_done && !*blocked) { 1101 if (voting_done && !*blocked) {
1100 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 1102 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
1101 if (dlm->node_num <= bit) { 1103 if (dlm->node_num <= bit) {
1102 /* my node number is lowest. 1104 /* my node number is lowest.
1103 * now tell other nodes that I am 1105 * now tell other nodes that I am
1104 * mastering this. */ 1106 * mastering this. */
1105 mle->master = dlm->node_num; 1107 mle->master = dlm->node_num;
1106 /* ref was grabbed in get_lock_resource 1108 /* ref was grabbed in get_lock_resource
1107 * will be dropped in dlmlock_master */ 1109 * will be dropped in dlmlock_master */
1108 assert = 1; 1110 assert = 1;
1109 sleep = 0; 1111 sleep = 0;
1110 } 1112 }
1111 /* if voting is done, but we have not received 1113 /* if voting is done, but we have not received
1112 * an assert master yet, we must sleep */ 1114 * an assert master yet, we must sleep */
1113 } 1115 }
1114 } 1116 }
1115 1117
1116 spin_unlock(&mle->spinlock); 1118 spin_unlock(&mle->spinlock);
1117 1119
1118 /* sleep if we haven't finished voting yet */ 1120 /* sleep if we haven't finished voting yet */
1119 if (sleep) { 1121 if (sleep) {
1120 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); 1122 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
1121 1123
1122 /* 1124 /*
1123 if (atomic_read(&mle->mle_refs.refcount) < 2) 1125 if (atomic_read(&mle->mle_refs.refcount) < 2)
1124 mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, 1126 mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
1125 atomic_read(&mle->mle_refs.refcount), 1127 atomic_read(&mle->mle_refs.refcount),
1126 res->lockname.len, res->lockname.name); 1128 res->lockname.len, res->lockname.name);
1127 */ 1129 */
1128 atomic_set(&mle->woken, 0); 1130 atomic_set(&mle->woken, 0);
1129 (void)wait_event_timeout(mle->wq, 1131 (void)wait_event_timeout(mle->wq,
1130 (atomic_read(&mle->woken) == 1), 1132 (atomic_read(&mle->woken) == 1),
1131 timeo); 1133 timeo);
1132 if (res->owner == O2NM_MAX_NODES) { 1134 if (res->owner == O2NM_MAX_NODES) {
1133 mlog(0, "%s:%.*s: waiting again\n", dlm->name, 1135 mlog(0, "%s:%.*s: waiting again\n", dlm->name,
1134 res->lockname.len, res->lockname.name); 1136 res->lockname.len, res->lockname.name);
1135 goto recheck; 1137 goto recheck;
1136 } 1138 }
1137 mlog(0, "done waiting, master is %u\n", res->owner); 1139 mlog(0, "done waiting, master is %u\n", res->owner);
1138 ret = 0; 1140 ret = 0;
1139 goto leave; 1141 goto leave;
1140 } 1142 }
1141 1143
1142 ret = 0; /* done */ 1144 ret = 0; /* done */
1143 if (assert) { 1145 if (assert) {
1144 m = dlm->node_num; 1146 m = dlm->node_num;
1145 mlog(0, "about to master %.*s here, this=%u\n", 1147 mlog(0, "about to master %.*s here, this=%u\n",
1146 res->lockname.len, res->lockname.name, m); 1148 res->lockname.len, res->lockname.name, m);
1147 ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); 1149 ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
1148 if (ret) { 1150 if (ret) {
1149 /* This is a failure in the network path, 1151 /* This is a failure in the network path,
1150 * not in the response to the assert_master 1152 * not in the response to the assert_master
1151 * (any nonzero response is a BUG on this node). 1153 * (any nonzero response is a BUG on this node).
1152 * Most likely a socket just got disconnected 1154 * Most likely a socket just got disconnected
1153 * due to node death. */ 1155 * due to node death. */
1154 mlog_errno(ret); 1156 mlog_errno(ret);
1155 } 1157 }
1156 /* no longer need to restart lock mastery. 1158 /* no longer need to restart lock mastery.
1157 * all living nodes have been contacted. */ 1159 * all living nodes have been contacted. */
1158 ret = 0; 1160 ret = 0;
1159 } 1161 }
1160 1162
1161 /* set the lockres owner */ 1163 /* set the lockres owner */
1162 spin_lock(&res->spinlock); 1164 spin_lock(&res->spinlock);
1163 /* mastery reference obtained either during 1165 /* mastery reference obtained either during
1164 * assert_master_handler or in get_lock_resource */ 1166 * assert_master_handler or in get_lock_resource */
1165 dlm_change_lockres_owner(dlm, res, m); 1167 dlm_change_lockres_owner(dlm, res, m);
1166 spin_unlock(&res->spinlock); 1168 spin_unlock(&res->spinlock);
1167 1169
1168 leave: 1170 leave:
1169 return ret; 1171 return ret;
1170 } 1172 }
1171 1173
1172 struct dlm_bitmap_diff_iter 1174 struct dlm_bitmap_diff_iter
1173 { 1175 {
1174 int curnode; 1176 int curnode;
1175 unsigned long *orig_bm; 1177 unsigned long *orig_bm;
1176 unsigned long *cur_bm; 1178 unsigned long *cur_bm;
1177 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1179 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
1178 }; 1180 };
1179 1181
1180 enum dlm_node_state_change 1182 enum dlm_node_state_change
1181 { 1183 {
1182 NODE_DOWN = -1, 1184 NODE_DOWN = -1,
1183 NODE_NO_CHANGE = 0, 1185 NODE_NO_CHANGE = 0,
1184 NODE_UP 1186 NODE_UP
1185 }; 1187 };
1186 1188
1187 static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, 1189 static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
1188 unsigned long *orig_bm, 1190 unsigned long *orig_bm,
1189 unsigned long *cur_bm) 1191 unsigned long *cur_bm)
1190 { 1192 {
1191 unsigned long p1, p2; 1193 unsigned long p1, p2;
1192 int i; 1194 int i;
1193 1195
1194 iter->curnode = -1; 1196 iter->curnode = -1;
1195 iter->orig_bm = orig_bm; 1197 iter->orig_bm = orig_bm;
1196 iter->cur_bm = cur_bm; 1198 iter->cur_bm = cur_bm;
1197 1199
1198 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { 1200 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1199 p1 = *(iter->orig_bm + i); 1201 p1 = *(iter->orig_bm + i);
1200 p2 = *(iter->cur_bm + i); 1202 p2 = *(iter->cur_bm + i);
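		/* symmetric difference (XOR) of the two maps: a bit is set for
		 * exactly those nodes whose up/down state changed */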
1201 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); 1203 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1202 } 1204 }
1203 } 1205 }
1204 1206
1205 static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, 1207 static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1206 enum dlm_node_state_change *state) 1208 enum dlm_node_state_change *state)
1207 { 1209 {
1208 int bit; 1210 int bit;
1209 1211
1210 if (iter->curnode >= O2NM_MAX_NODES) 1212 if (iter->curnode >= O2NM_MAX_NODES)
1211 return -ENOENT; 1213 return -ENOENT;
1212 1214
1213 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, 1215 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1214 iter->curnode+1); 1216 iter->curnode+1);
1215 if (bit >= O2NM_MAX_NODES) { 1217 if (bit >= O2NM_MAX_NODES) {
1216 iter->curnode = O2NM_MAX_NODES; 1218 iter->curnode = O2NM_MAX_NODES;
1217 return -ENOENT; 1219 return -ENOENT;
1218 } 1220 }
1219 1221
1220 /* if it was there in the original then this node died */ 1222 /* if it was there in the original then this node died */
1221 if (test_bit(bit, iter->orig_bm)) 1223 if (test_bit(bit, iter->orig_bm))
1222 *state = NODE_DOWN; 1224 *state = NODE_DOWN;
1223 else 1225 else
1224 *state = NODE_UP; 1226 *state = NODE_UP;
1225 1227
1226 iter->curnode = bit; 1228 iter->curnode = bit;
1227 return bit; 1229 return bit;
1228 } 1230 }
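A minimal sketch of how this iterator pair is meant to be consumed, mirroring the init-then-next loop in dlm_restart_lock_mastery() below; old_map and new_map are illustrative placeholders for whichever two node bitmaps are being compared:

	struct dlm_bitmap_diff_iter bdi;
	enum dlm_node_state_change sc;
	int node;

	dlm_bitmap_diff_iter_init(&bdi, old_map, new_map);
	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	while (node >= 0) {
		if (sc == NODE_UP)
			mlog(0, "node %d came up\n", node);
		else	/* NODE_DOWN: node was set in old_map but not new_map */
			mlog(0, "node %d went down\n", node);
		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	}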
1229 1231
1230 1232
1231 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 1233 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1232 struct dlm_lock_resource *res, 1234 struct dlm_lock_resource *res,
1233 struct dlm_master_list_entry *mle, 1235 struct dlm_master_list_entry *mle,
1234 int blocked) 1236 int blocked)
1235 { 1237 {
1236 struct dlm_bitmap_diff_iter bdi; 1238 struct dlm_bitmap_diff_iter bdi;
1237 enum dlm_node_state_change sc; 1239 enum dlm_node_state_change sc;
1238 int node; 1240 int node;
1239 int ret = 0; 1241 int ret = 0;
1240 1242
1241 mlog(0, "something happened such that the " 1243 mlog(0, "something happened such that the "
1242 "master process may need to be restarted!\n"); 1244 "master process may need to be restarted!\n");
1243 1245
1244 assert_spin_locked(&mle->spinlock); 1246 assert_spin_locked(&mle->spinlock);
1245 1247
1246 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); 1248 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
1247 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1249 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1248 while (node >= 0) { 1250 while (node >= 0) {
1249 if (sc == NODE_UP) { 1251 if (sc == NODE_UP) {
1250 /* a node came up. clear any old vote from 1252 /* a node came up. clear any old vote from
1251 * the response map and set it in the vote map 1253 * the response map and set it in the vote map
1252 * then restart the mastery. */ 1254 * then restart the mastery. */
1253 mlog(ML_NOTICE, "node %d up while restarting\n", node); 1255 mlog(ML_NOTICE, "node %d up while restarting\n", node);
1254 1256
1255 /* redo the master request, but only for the new node */ 1257 /* redo the master request, but only for the new node */
1256 mlog(0, "sending request to new node\n"); 1258 mlog(0, "sending request to new node\n");
1257 clear_bit(node, mle->response_map); 1259 clear_bit(node, mle->response_map);
1258 set_bit(node, mle->vote_map); 1260 set_bit(node, mle->vote_map);
1259 } else { 1261 } else {
1260 mlog(ML_ERROR, "node down! %d\n", node); 1262 mlog(ML_ERROR, "node down! %d\n", node);
1261 if (blocked) { 1263 if (blocked) {
1262 int lowest = find_next_bit(mle->maybe_map, 1264 int lowest = find_next_bit(mle->maybe_map,
1263 O2NM_MAX_NODES, 0); 1265 O2NM_MAX_NODES, 0);
1264 1266
1265 /* act like it was never there */ 1267 /* act like it was never there */
1266 clear_bit(node, mle->maybe_map); 1268 clear_bit(node, mle->maybe_map);
1267 1269
1268 if (node == lowest) { 1270 if (node == lowest) {
1269 mlog(0, "expected master %u died" 1271 mlog(0, "expected master %u died"
1270 " while this node was blocked " 1272 " while this node was blocked "
1271 "waiting on it!\n", node); 1273 "waiting on it!\n", node);
1272 lowest = find_next_bit(mle->maybe_map, 1274 lowest = find_next_bit(mle->maybe_map,
1273 O2NM_MAX_NODES, 1275 O2NM_MAX_NODES,
1274 lowest+1); 1276 lowest+1);
1275 if (lowest < O2NM_MAX_NODES) { 1277 if (lowest < O2NM_MAX_NODES) {
1276 mlog(0, "%s:%.*s:still " 1278 mlog(0, "%s:%.*s:still "
1277 "blocked. waiting on %u " 1279 "blocked. waiting on %u "
1278 "now\n", dlm->name, 1280 "now\n", dlm->name,
1279 res->lockname.len, 1281 res->lockname.len,
1280 res->lockname.name, 1282 res->lockname.name,
1281 lowest); 1283 lowest);
1282 } else { 1284 } else {
1283 /* mle is an MLE_BLOCK, but 1285 /* mle is an MLE_BLOCK, but
1284 * there is now nothing left to 1286 * there is now nothing left to
1285 * block on. we need to return 1287 * block on. we need to return
1286 * all the way back out and try 1288 * all the way back out and try
1287 * again with an MLE_MASTER. 1289 * again with an MLE_MASTER.
1288 * dlm_do_local_recovery_cleanup 1290 * dlm_do_local_recovery_cleanup
1289 * has already run, so the mle 1291 * has already run, so the mle
1290 * refcount is ok */ 1292 * refcount is ok */
1291 mlog(0, "%s:%.*s: no " 1293 mlog(0, "%s:%.*s: no "
1292 "longer blocking. try to " 1294 "longer blocking. try to "
1293 "master this here\n", 1295 "master this here\n",
1294 dlm->name, 1296 dlm->name,
1295 res->lockname.len, 1297 res->lockname.len,
1296 res->lockname.name); 1298 res->lockname.name);
1297 mle->type = DLM_MLE_MASTER; 1299 mle->type = DLM_MLE_MASTER;
1298 mle->mleres = res; 1300 mle->mleres = res;
1299 } 1301 }
1300 } 1302 }
1301 } 1303 }
1302 1304
1303 /* now blank out everything, as if we had never 1305 /* now blank out everything, as if we had never
1304 * contacted anyone */ 1306 * contacted anyone */
1305 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 1307 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
1306 memset(mle->response_map, 0, sizeof(mle->response_map)); 1308 memset(mle->response_map, 0, sizeof(mle->response_map));
1307 /* reset the vote_map to the current node_map */ 1309 /* reset the vote_map to the current node_map */
1308 memcpy(mle->vote_map, mle->node_map, 1310 memcpy(mle->vote_map, mle->node_map,
1309 sizeof(mle->node_map)); 1311 sizeof(mle->node_map));
1310 /* put myself into the maybe map */ 1312 /* put myself into the maybe map */
1311 if (mle->type != DLM_MLE_BLOCK) 1313 if (mle->type != DLM_MLE_BLOCK)
1312 set_bit(dlm->node_num, mle->maybe_map); 1314 set_bit(dlm->node_num, mle->maybe_map);
1313 } 1315 }
1314 ret = -EAGAIN; 1316 ret = -EAGAIN;
1315 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1317 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1316 } 1318 }
1317 return ret; 1319 return ret;
1318 } 1320 }
1319 1321
1320 1322
1321 /* 1323 /*
1322 * DLM_MASTER_REQUEST_MSG 1324 * DLM_MASTER_REQUEST_MSG
1323 * 1325 *
1324 * returns: 0 on success, 1326 * returns: 0 on success,
1325 * -errno on a network error 1327 * -errno on a network error
1326 * 1328 *
1327 * on error, the caller should assume the target node is "dead" 1329 * on error, the caller should assume the target node is "dead"
1328 * 1330 *
1329 */ 1331 */
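The caller-side handling amounts to the "assume the target node is dead" rule above; dlm_get_lock_resource() earlier in this file follows this pattern when iterating the vote map. A minimal sketch of that calling pattern:

	ret = dlm_do_master_request(res, mle, nodenum);
	if (ret < 0) {
		/* network error: per the note above, treat nodenum as dead,
		 * log the error, and keep iterating the remaining nodes */
		mlog_errno(ret);
	}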
1330 1332
1331 static int dlm_do_master_request(struct dlm_lock_resource *res, 1333 static int dlm_do_master_request(struct dlm_lock_resource *res,
1332 struct dlm_master_list_entry *mle, int to) 1334 struct dlm_master_list_entry *mle, int to)
1333 { 1335 {
1334 struct dlm_ctxt *dlm = mle->dlm; 1336 struct dlm_ctxt *dlm = mle->dlm;
1335 struct dlm_master_request request; 1337 struct dlm_master_request request;
1336 int ret, response=0, resend; 1338 int ret, response=0, resend;
1337 1339
1338 memset(&request, 0, sizeof(request)); 1340 memset(&request, 0, sizeof(request));
1339 request.node_idx = dlm->node_num; 1341 request.node_idx = dlm->node_num;
1340 1342
1341 BUG_ON(mle->type == DLM_MLE_MIGRATION); 1343 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1342 1344
1343 request.namelen = (u8)mle->mnamelen; 1345 request.namelen = (u8)mle->mnamelen;
1344 memcpy(request.name, mle->mname, request.namelen); 1346 memcpy(request.name, mle->mname, request.namelen);
1345 1347
1346 again: 1348 again:
1347 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 1349 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1348 sizeof(request), to, &response); 1350 sizeof(request), to, &response);
1349 if (ret < 0) { 1351 if (ret < 0) {
1350 if (ret == -ESRCH) { 1352 if (ret == -ESRCH) {
1351 /* should never happen */ 1353 /* should never happen */
1352 mlog(ML_ERROR, "TCP stack not ready!\n"); 1354 mlog(ML_ERROR, "TCP stack not ready!\n");
1353 BUG(); 1355 BUG();
1354 } else if (ret == -EINVAL) { 1356 } else if (ret == -EINVAL) {
1355 mlog(ML_ERROR, "bad args passed to o2net!\n"); 1357 mlog(ML_ERROR, "bad args passed to o2net!\n");
1356 BUG(); 1358 BUG();
1357 } else if (ret == -ENOMEM) { 1359 } else if (ret == -ENOMEM) {
1358 mlog(ML_ERROR, "out of memory while trying to send " 1360 mlog(ML_ERROR, "out of memory while trying to send "
1359 "network message! retrying\n"); 1361 "network message! retrying\n");
1360 /* this is totally crude */ 1362 /* this is totally crude */
1361 msleep(50); 1363 msleep(50);
1362 goto again; 1364 goto again;
1363 } else if (!dlm_is_host_down(ret)) { 1365 } else if (!dlm_is_host_down(ret)) {
1364 /* not a network error. bad. */ 1366 /* not a network error. bad. */
1365 mlog_errno(ret); 1367 mlog_errno(ret);
1366 mlog(ML_ERROR, "unhandled error!"); 1368 mlog(ML_ERROR, "unhandled error!");
1367 BUG(); 1369 BUG();
1368 } 1370 }
1369 /* all other errors should be network errors, 1371 /* all other errors should be network errors,
1370 * and likely indicate node death */ 1372 * and likely indicate node death */
1371 mlog(ML_ERROR, "link to %d went down!\n", to); 1373 mlog(ML_ERROR, "link to %d went down!\n", to);
1372 goto out; 1374 goto out;
1373 } 1375 }
1374 1376
1375 ret = 0; 1377 ret = 0;
1376 resend = 0; 1378 resend = 0;
1377 spin_lock(&mle->spinlock); 1379 spin_lock(&mle->spinlock);
1378 switch (response) { 1380 switch (response) {
1379 case DLM_MASTER_RESP_YES: 1381 case DLM_MASTER_RESP_YES:
1380 set_bit(to, mle->response_map); 1382 set_bit(to, mle->response_map);
1381 mlog(0, "node %u is the master, response=YES\n", to); 1383 mlog(0, "node %u is the master, response=YES\n", to);
1382 mlog(0, "%s:%.*s: master node %u now knows I have a " 1384 mlog(0, "%s:%.*s: master node %u now knows I have a "
1383 "reference\n", dlm->name, res->lockname.len, 1385 "reference\n", dlm->name, res->lockname.len,
1384 res->lockname.name, to); 1386 res->lockname.name, to);
1385 mle->master = to; 1387 mle->master = to;
1386 break; 1388 break;
1387 case DLM_MASTER_RESP_NO: 1389 case DLM_MASTER_RESP_NO:
1388 mlog(0, "node %u not master, response=NO\n", to); 1390 mlog(0, "node %u not master, response=NO\n", to);
1389 set_bit(to, mle->response_map); 1391 set_bit(to, mle->response_map);
1390 break; 1392 break;
1391 case DLM_MASTER_RESP_MAYBE: 1393 case DLM_MASTER_RESP_MAYBE:
1392 mlog(0, "node %u not master, response=MAYBE\n", to); 1394 mlog(0, "node %u not master, response=MAYBE\n", to);
1393 set_bit(to, mle->response_map); 1395 set_bit(to, mle->response_map);
1394 set_bit(to, mle->maybe_map); 1396 set_bit(to, mle->maybe_map);
1395 break; 1397 break;
1396 case DLM_MASTER_RESP_ERROR: 1398 case DLM_MASTER_RESP_ERROR:
1397 mlog(0, "node %u hit an error, resending\n", to); 1399 mlog(0, "node %u hit an error, resending\n", to);
1398 resend = 1; 1400 resend = 1;
1399 response = 0; 1401 response = 0;
1400 break; 1402 break;
1401 default: 1403 default:
1402 mlog(ML_ERROR, "bad response! %u\n", response); 1404 mlog(ML_ERROR, "bad response! %u\n", response);
1403 BUG(); 1405 BUG();
1404 } 1406 }
1405 spin_unlock(&mle->spinlock); 1407 spin_unlock(&mle->spinlock);
1406 if (resend) { 1408 if (resend) {
1407 /* this is also totally crude */ 1409 /* this is also totally crude */
1408 msleep(50); 1410 msleep(50);
1409 goto again; 1411 goto again;
1410 } 1412 }
1411 1413
1412 out: 1414 out:
1413 return ret; 1415 return ret;
1414 } 1416 }
1415 1417
1416 /* 1418 /*
1417 * locks that can be taken here: 1419 * locks that can be taken here:
1418 * dlm->spinlock 1420 * dlm->spinlock
1419 * res->spinlock 1421 * res->spinlock
1420 * mle->spinlock 1422 * mle->spinlock
1421 * dlm->master_list 1423 * dlm->master_list
1422 * 1424 *
1423 * if possible, TRIM THIS DOWN!!! 1425 * if possible, TRIM THIS DOWN!!!
1424 */ 1426 */
1425 int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, 1427 int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
1426 void **ret_data) 1428 void **ret_data)
1427 { 1429 {
1428 u8 response = DLM_MASTER_RESP_MAYBE; 1430 u8 response = DLM_MASTER_RESP_MAYBE;
1429 struct dlm_ctxt *dlm = data; 1431 struct dlm_ctxt *dlm = data;
1430 struct dlm_lock_resource *res = NULL; 1432 struct dlm_lock_resource *res = NULL;
1431 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; 1433 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1432 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; 1434 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1433 char *name; 1435 char *name;
1434 unsigned int namelen, hash; 1436 unsigned int namelen, hash;
1435 int found, ret; 1437 int found, ret;
1436 int set_maybe; 1438 int set_maybe;
1437 int dispatch_assert = 0; 1439 int dispatch_assert = 0;
1438 1440
1439 if (!dlm_grab(dlm)) 1441 if (!dlm_grab(dlm))
1440 return DLM_MASTER_RESP_NO; 1442 return DLM_MASTER_RESP_NO;
1441 1443
1442 if (!dlm_domain_fully_joined(dlm)) { 1444 if (!dlm_domain_fully_joined(dlm)) {
1443 response = DLM_MASTER_RESP_NO; 1445 response = DLM_MASTER_RESP_NO;
1444 goto send_response; 1446 goto send_response;
1445 } 1447 }
1446 1448
1447 name = request->name; 1449 name = request->name;
1448 namelen = request->namelen; 1450 namelen = request->namelen;
1449 hash = dlm_lockid_hash(name, namelen); 1451 hash = dlm_lockid_hash(name, namelen);
1450 1452
1451 if (namelen > DLM_LOCKID_NAME_MAX) { 1453 if (namelen > DLM_LOCKID_NAME_MAX) {
1452 response = DLM_IVBUFLEN; 1454 response = DLM_IVBUFLEN;
1453 goto send_response; 1455 goto send_response;
1454 } 1456 }
1455 1457
1456 way_up_top: 1458 way_up_top:
1457 spin_lock(&dlm->spinlock); 1459 spin_lock(&dlm->spinlock);
1458 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 1460 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1459 if (res) { 1461 if (res) {
1460 spin_unlock(&dlm->spinlock); 1462 spin_unlock(&dlm->spinlock);
1461 1463
1462 /* take care of the easy cases up front */ 1464 /* take care of the easy cases up front */
1463 spin_lock(&res->spinlock); 1465 spin_lock(&res->spinlock);
1464 if (res->state & (DLM_LOCK_RES_RECOVERING| 1466 if (res->state & (DLM_LOCK_RES_RECOVERING|
1465 DLM_LOCK_RES_MIGRATING)) { 1467 DLM_LOCK_RES_MIGRATING)) {
1466 spin_unlock(&res->spinlock); 1468 spin_unlock(&res->spinlock);
1467 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " 1469 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1468 "being recovered/migrated\n"); 1470 "being recovered/migrated\n");
1469 response = DLM_MASTER_RESP_ERROR; 1471 response = DLM_MASTER_RESP_ERROR;
1470 if (mle) 1472 if (mle)
1471 kmem_cache_free(dlm_mle_cache, mle); 1473 kmem_cache_free(dlm_mle_cache, mle);
1472 goto send_response; 1474 goto send_response;
1473 } 1475 }
1474 1476
1475 if (res->owner == dlm->node_num) { 1477 if (res->owner == dlm->node_num) {
1476 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); 1478 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
1477 spin_unlock(&res->spinlock); 1479 spin_unlock(&res->spinlock);
1478 response = DLM_MASTER_RESP_YES; 1480 response = DLM_MASTER_RESP_YES;
1479 if (mle) 1481 if (mle)
1480 kmem_cache_free(dlm_mle_cache, mle); 1482 kmem_cache_free(dlm_mle_cache, mle);
1481 1483
1482 /* this node is the owner. 1484 /* this node is the owner.
1483 * there is some extra work that needs to 1485 * there is some extra work that needs to
1484 * happen now. the requesting node has 1486 * happen now. the requesting node has
1485 * caused all nodes up to this one to 1487 * caused all nodes up to this one to
1486 * create mles. this node now needs to 1488 * create mles. this node now needs to
1487 * go back and clean those up. */ 1489 * go back and clean those up. */
1488 dispatch_assert = 1; 1490 dispatch_assert = 1;
1489 goto send_response; 1491 goto send_response;
1490 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1492 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1491 spin_unlock(&res->spinlock); 1493 spin_unlock(&res->spinlock);
1492 // mlog(0, "node %u is the master\n", res->owner); 1494 // mlog(0, "node %u is the master\n", res->owner);
1493 response = DLM_MASTER_RESP_NO; 1495 response = DLM_MASTER_RESP_NO;
1494 if (mle) 1496 if (mle)
1495 kmem_cache_free(dlm_mle_cache, mle); 1497 kmem_cache_free(dlm_mle_cache, mle);
1496 goto send_response; 1498 goto send_response;
1497 } 1499 }
1498 1500
1499 /* ok, there is no owner. either this node is 1501 /* ok, there is no owner. either this node is
1500 * being blocked, or it is actively trying to 1502 * being blocked, or it is actively trying to
1501 * master this lock. */ 1503 * master this lock. */
1502 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 1504 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1503 mlog(ML_ERROR, "lock with no owner should be " 1505 mlog(ML_ERROR, "lock with no owner should be "
1504 "in-progress!\n"); 1506 "in-progress!\n");
1505 BUG(); 1507 BUG();
1506 } 1508 }
1507 1509
1508 // mlog(0, "lockres is in progress...\n"); 1510 // mlog(0, "lockres is in progress...\n");
1509 spin_lock(&dlm->master_lock); 1511 spin_lock(&dlm->master_lock);
1510 found = dlm_find_mle(dlm, &tmpmle, name, namelen); 1512 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1511 if (!found) { 1513 if (!found) {
1512 mlog(ML_ERROR, "no mle found for this lock!\n"); 1514 mlog(ML_ERROR, "no mle found for this lock!\n");
1513 BUG(); 1515 BUG();
1514 } 1516 }
1515 set_maybe = 1; 1517 set_maybe = 1;
1516 spin_lock(&tmpmle->spinlock); 1518 spin_lock(&tmpmle->spinlock);
1517 if (tmpmle->type == DLM_MLE_BLOCK) { 1519 if (tmpmle->type == DLM_MLE_BLOCK) {
1518 // mlog(0, "this node is waiting for " 1520 // mlog(0, "this node is waiting for "
1519 // "lockres to be mastered\n"); 1521 // "lockres to be mastered\n");
1520 response = DLM_MASTER_RESP_NO; 1522 response = DLM_MASTER_RESP_NO;
1521 } else if (tmpmle->type == DLM_MLE_MIGRATION) { 1523 } else if (tmpmle->type == DLM_MLE_MIGRATION) {
1522 mlog(0, "node %u is master, but trying to migrate to " 1524 mlog(0, "node %u is master, but trying to migrate to "
1523 "node %u.\n", tmpmle->master, tmpmle->new_master); 1525 "node %u.\n", tmpmle->master, tmpmle->new_master);
1524 if (tmpmle->master == dlm->node_num) { 1526 if (tmpmle->master == dlm->node_num) {
1525 mlog(ML_ERROR, "no owner on lockres, but this " 1527 mlog(ML_ERROR, "no owner on lockres, but this "
1526 "node is trying to migrate it to %u?!\n", 1528 "node is trying to migrate it to %u?!\n",
1527 tmpmle->new_master); 1529 tmpmle->new_master);
1528 BUG(); 1530 BUG();
1529 } else { 1531 } else {
1530 /* the real master can respond on its own */ 1532 /* the real master can respond on its own */
1531 response = DLM_MASTER_RESP_NO; 1533 response = DLM_MASTER_RESP_NO;
1532 } 1534 }
1533 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { 1535 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1534 set_maybe = 0; 1536 set_maybe = 0;
1535 if (tmpmle->master == dlm->node_num) { 1537 if (tmpmle->master == dlm->node_num) {
1536 response = DLM_MASTER_RESP_YES; 1538 response = DLM_MASTER_RESP_YES;
1537 /* this node will be the owner. 1539 /* this node will be the owner.
1538 * go back and clean the mles on any 1540 * go back and clean the mles on any
1539 * other nodes */ 1541 * other nodes */
1540 dispatch_assert = 1; 1542 dispatch_assert = 1;
1541 dlm_lockres_set_refmap_bit(dlm, res, 1543 dlm_lockres_set_refmap_bit(dlm, res,
1542 request->node_idx); 1544 request->node_idx);
1543 } else 1545 } else
1544 response = DLM_MASTER_RESP_NO; 1546 response = DLM_MASTER_RESP_NO;
1545 } else { 1547 } else {
1546 // mlog(0, "this node is attempting to " 1548 // mlog(0, "this node is attempting to "
1547 // "master lockres\n"); 1549 // "master lockres\n");
1548 response = DLM_MASTER_RESP_MAYBE; 1550 response = DLM_MASTER_RESP_MAYBE;
1549 } 1551 }
1550 if (set_maybe) 1552 if (set_maybe)
1551 set_bit(request->node_idx, tmpmle->maybe_map); 1553 set_bit(request->node_idx, tmpmle->maybe_map);
1552 spin_unlock(&tmpmle->spinlock); 1554 spin_unlock(&tmpmle->spinlock);
1553 1555
1554 spin_unlock(&dlm->master_lock); 1556 spin_unlock(&dlm->master_lock);
1555 spin_unlock(&res->spinlock); 1557 spin_unlock(&res->spinlock);
1556 1558
1557 /* keep the mle attached to heartbeat events */ 1559 /* keep the mle attached to heartbeat events */
1558 dlm_put_mle(tmpmle); 1560 dlm_put_mle(tmpmle);
1559 if (mle) 1561 if (mle)
1560 kmem_cache_free(dlm_mle_cache, mle); 1562 kmem_cache_free(dlm_mle_cache, mle);
1561 goto send_response; 1563 goto send_response;
1562 } 1564 }
1563 1565
1564 /* 1566 /*
1565 * lockres doesn't exist on this node 1567 * lockres doesn't exist on this node
1566 * if there is an MLE_BLOCK, return NO 1568 * if there is an MLE_BLOCK, return NO
1567 * if there is an MLE_MASTER, return MAYBE 1569 * if there is an MLE_MASTER, return MAYBE
1568 * otherwise, add an MLE_BLOCK, return NO 1570 * otherwise, add an MLE_BLOCK, return NO
1569 */ 1571 */
1570 spin_lock(&dlm->master_lock); 1572 spin_lock(&dlm->master_lock);
1571 found = dlm_find_mle(dlm, &tmpmle, name, namelen); 1573 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1572 if (!found) { 1574 if (!found) {
1573 /* this lockid has never been seen on this node yet */ 1575 /* this lockid has never been seen on this node yet */
1574 // mlog(0, "no mle found\n"); 1576 // mlog(0, "no mle found\n");
1575 if (!mle) { 1577 if (!mle) {
1576 spin_unlock(&dlm->master_lock); 1578 spin_unlock(&dlm->master_lock);
1577 spin_unlock(&dlm->spinlock); 1579 spin_unlock(&dlm->spinlock);
1578 1580
1579 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 1581 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1580 if (!mle) { 1582 if (!mle) {
1581 response = DLM_MASTER_RESP_ERROR; 1583 response = DLM_MASTER_RESP_ERROR;
1582 mlog_errno(-ENOMEM); 1584 mlog_errno(-ENOMEM);
1583 goto send_response; 1585 goto send_response;
1584 } 1586 }
1585 goto way_up_top; 1587 goto way_up_top;
1586 } 1588 }
1587 1589
1588 // mlog(0, "this is second time thru, already allocated, " 1590 // mlog(0, "this is second time thru, already allocated, "
1589 // "add the block.\n"); 1591 // "add the block.\n");
1590 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 1592 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1591 set_bit(request->node_idx, mle->maybe_map); 1593 set_bit(request->node_idx, mle->maybe_map);
1592 __dlm_insert_mle(dlm, mle); 1594 __dlm_insert_mle(dlm, mle);
1593 response = DLM_MASTER_RESP_NO; 1595 response = DLM_MASTER_RESP_NO;
1594 } else { 1596 } else {
1595 // mlog(0, "mle was found\n"); 1597 // mlog(0, "mle was found\n");
1596 set_maybe = 1; 1598 set_maybe = 1;
1597 spin_lock(&tmpmle->spinlock); 1599 spin_lock(&tmpmle->spinlock);
1598 if (tmpmle->master == dlm->node_num) { 1600 if (tmpmle->master == dlm->node_num) {
1599 mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); 1601 mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
1600 BUG(); 1602 BUG();
1601 } 1603 }
1602 if (tmpmle->type == DLM_MLE_BLOCK) 1604 if (tmpmle->type == DLM_MLE_BLOCK)
1603 response = DLM_MASTER_RESP_NO; 1605 response = DLM_MASTER_RESP_NO;
1604 else if (tmpmle->type == DLM_MLE_MIGRATION) { 1606 else if (tmpmle->type == DLM_MLE_MIGRATION) {
1605 mlog(0, "migration mle was found (%u->%u)\n", 1607 mlog(0, "migration mle was found (%u->%u)\n",
1606 tmpmle->master, tmpmle->new_master); 1608 tmpmle->master, tmpmle->new_master);
1607 /* real master can respond on its own */ 1609 /* real master can respond on its own */
1608 response = DLM_MASTER_RESP_NO; 1610 response = DLM_MASTER_RESP_NO;
1609 } else 1611 } else
1610 response = DLM_MASTER_RESP_MAYBE; 1612 response = DLM_MASTER_RESP_MAYBE;
1611 if (set_maybe) 1613 if (set_maybe)
1612 set_bit(request->node_idx, tmpmle->maybe_map); 1614 set_bit(request->node_idx, tmpmle->maybe_map);
1613 spin_unlock(&tmpmle->spinlock); 1615 spin_unlock(&tmpmle->spinlock);
1614 } 1616 }
1615 spin_unlock(&dlm->master_lock); 1617 spin_unlock(&dlm->master_lock);
1616 spin_unlock(&dlm->spinlock); 1618 spin_unlock(&dlm->spinlock);
1617 1619
1618 if (found) { 1620 if (found) {
1619 /* keep the mle attached to heartbeat events */ 1621 /* keep the mle attached to heartbeat events */
1620 dlm_put_mle(tmpmle); 1622 dlm_put_mle(tmpmle);
1621 } 1623 }
1622 send_response: 1624 send_response:
1623 /* 1625 /*
1624 * __dlm_lookup_lockres() grabbed a reference to this lockres. 1626 * __dlm_lookup_lockres() grabbed a reference to this lockres.
1625 * The reference is released by dlm_assert_master_worker() under 1627 * The reference is released by dlm_assert_master_worker() under
1626 * the call to dlm_dispatch_assert_master(). If 1628 * the call to dlm_dispatch_assert_master(). If
1627 * dlm_assert_master_worker() isn't called, we drop it here. 1629 * dlm_assert_master_worker() isn't called, we drop it here.
1628 */ 1630 */
1629 if (dispatch_assert) { 1631 if (dispatch_assert) {
1630 if (response != DLM_MASTER_RESP_YES) 1632 if (response != DLM_MASTER_RESP_YES)
1631 mlog(ML_ERROR, "invalid response %d\n", response); 1633 mlog(ML_ERROR, "invalid response %d\n", response);
1632 if (!res) { 1634 if (!res) {
1633 mlog(ML_ERROR, "bad lockres while trying to assert!\n"); 1635 mlog(ML_ERROR, "bad lockres while trying to assert!\n");
1634 BUG(); 1636 BUG();
1635 } 1637 }
1636 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 1638 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1637 dlm->node_num, res->lockname.len, res->lockname.name); 1639 dlm->node_num, res->lockname.len, res->lockname.name);
1638 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 1640 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1639 DLM_ASSERT_MASTER_MLE_CLEANUP); 1641 DLM_ASSERT_MASTER_MLE_CLEANUP);
1640 if (ret < 0) { 1642 if (ret < 0) {
1641 mlog(ML_ERROR, "failed to dispatch assert master work\n"); 1643 mlog(ML_ERROR, "failed to dispatch assert master work\n");
1642 response = DLM_MASTER_RESP_ERROR; 1644 response = DLM_MASTER_RESP_ERROR;
1643 dlm_lockres_put(res); 1645 dlm_lockres_put(res);
1644 } else 1646 } else
1645 dlm_lockres_grab_inflight_worker(dlm, res); 1647 dlm_lockres_grab_inflight_worker(dlm, res);
1646 } else { 1648 } else {
1647 if (res) 1649 if (res)
1648 dlm_lockres_put(res); 1650 dlm_lockres_put(res);
1649 } 1651 }
1650 1652
1651 dlm_put(dlm); 1653 dlm_put(dlm);
1652 return response; 1654 return response;
1653 } 1655 }
1654 1656
1655 /* 1657 /*
1656 * DLM_ASSERT_MASTER_MSG 1658 * DLM_ASSERT_MASTER_MSG
1657 */ 1659 */
1658 1660
1659 1661
1660 /* 1662 /*
1661 * NOTE: this can be used for debugging 1663 * NOTE: this can be used for debugging
1662 * can periodically run all locks owned by this node 1664 * can periodically run all locks owned by this node
1663 * and re-assert across the cluster... 1665 * and re-assert across the cluster...
1664 */ 1666 */
1665 static int dlm_do_assert_master(struct dlm_ctxt *dlm, 1667 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
1666 struct dlm_lock_resource *res, 1668 struct dlm_lock_resource *res,
1667 void *nodemap, u32 flags) 1669 void *nodemap, u32 flags)
1668 { 1670 {
1669 struct dlm_assert_master assert; 1671 struct dlm_assert_master assert;
1670 int to, tmpret; 1672 int to, tmpret;
1671 struct dlm_node_iter iter; 1673 struct dlm_node_iter iter;
1672 int ret = 0; 1674 int ret = 0;
1673 int reassert; 1675 int reassert;
1674 const char *lockname = res->lockname.name; 1676 const char *lockname = res->lockname.name;
1675 unsigned int namelen = res->lockname.len; 1677 unsigned int namelen = res->lockname.len;
1676 1678
1677 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 1679 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1678 1680
1679 spin_lock(&res->spinlock); 1681 spin_lock(&res->spinlock);
1680 res->state |= DLM_LOCK_RES_SETREF_INPROG; 1682 res->state |= DLM_LOCK_RES_SETREF_INPROG;
1681 spin_unlock(&res->spinlock); 1683 spin_unlock(&res->spinlock);
1682 1684
1683 again: 1685 again:
1684 reassert = 0; 1686 reassert = 0;
1685 1687
1686 /* note that if this nodemap is empty, it returns 0 */ 1688 /* note that if this nodemap is empty, it returns 0 */
1687 dlm_node_iter_init(nodemap, &iter); 1689 dlm_node_iter_init(nodemap, &iter);
1688 while ((to = dlm_node_iter_next(&iter)) >= 0) { 1690 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1689 int r = 0; 1691 int r = 0;
1690 struct dlm_master_list_entry *mle = NULL; 1692 struct dlm_master_list_entry *mle = NULL;
1691 1693
1692 mlog(0, "sending assert master to %d (%.*s)\n", to, 1694 mlog(0, "sending assert master to %d (%.*s)\n", to,
1693 namelen, lockname); 1695 namelen, lockname);
1694 memset(&assert, 0, sizeof(assert)); 1696 memset(&assert, 0, sizeof(assert));
1695 assert.node_idx = dlm->node_num; 1697 assert.node_idx = dlm->node_num;
1696 assert.namelen = namelen; 1698 assert.namelen = namelen;
1697 memcpy(assert.name, lockname, namelen); 1699 memcpy(assert.name, lockname, namelen);
1698 assert.flags = cpu_to_be32(flags); 1700 assert.flags = cpu_to_be32(flags);
1699 1701
1700 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1702 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1701 &assert, sizeof(assert), to, &r); 1703 &assert, sizeof(assert), to, &r);
1702 if (tmpret < 0) { 1704 if (tmpret < 0) {
1703 mlog(ML_ERROR, "Error %d when sending message %u (key " 1705 mlog(ML_ERROR, "Error %d when sending message %u (key "
1704 "0x%x) to node %u\n", tmpret, 1706 "0x%x) to node %u\n", tmpret,
1705 DLM_ASSERT_MASTER_MSG, dlm->key, to); 1707 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1706 if (!dlm_is_host_down(tmpret)) { 1708 if (!dlm_is_host_down(tmpret)) {
1707 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1709 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1708 BUG(); 1710 BUG();
1709 } 1711 }
1710 /* a node died. finish out the rest of the nodes. */ 1712 /* a node died. finish out the rest of the nodes. */
1711 mlog(0, "link to %d went down!\n", to); 1713 mlog(0, "link to %d went down!\n", to);
1712 /* any nonzero status return will do */ 1714 /* any nonzero status return will do */
1713 ret = tmpret; 1715 ret = tmpret;
1714 r = 0; 1716 r = 0;
1715 } else if (r < 0) { 1717 } else if (r < 0) {
1716 /* ok, something is horribly messed up. kill thyself. */ 1718 /* ok, something is horribly messed up. kill thyself. */
1717 mlog(ML_ERROR,"during assert master of %.*s to %u, " 1719 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1718 "got %d.\n", namelen, lockname, to, r); 1720 "got %d.\n", namelen, lockname, to, r);
1719 spin_lock(&dlm->spinlock); 1721 spin_lock(&dlm->spinlock);
1720 spin_lock(&dlm->master_lock); 1722 spin_lock(&dlm->master_lock);
1721 if (dlm_find_mle(dlm, &mle, (char *)lockname, 1723 if (dlm_find_mle(dlm, &mle, (char *)lockname,
1722 namelen)) { 1724 namelen)) {
1723 dlm_print_one_mle(mle); 1725 dlm_print_one_mle(mle);
1724 __dlm_put_mle(mle); 1726 __dlm_put_mle(mle);
1725 } 1727 }
1726 spin_unlock(&dlm->master_lock); 1728 spin_unlock(&dlm->master_lock);
1727 spin_unlock(&dlm->spinlock); 1729 spin_unlock(&dlm->spinlock);
1728 BUG(); 1730 BUG();
1729 } 1731 }
1730 1732
1731 if (r & DLM_ASSERT_RESPONSE_REASSERT && 1733 if (r & DLM_ASSERT_RESPONSE_REASSERT &&
1732 !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { 1734 !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
1733 mlog(ML_ERROR, "%.*s: very strange, " 1735 mlog(ML_ERROR, "%.*s: very strange, "
1734 "master MLE but no lockres on %u\n", 1736 "master MLE but no lockres on %u\n",
1735 namelen, lockname, to); 1737 namelen, lockname, to);
1736 } 1738 }
1737 1739
1738 if (r & DLM_ASSERT_RESPONSE_REASSERT) { 1740 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1739 mlog(0, "%.*s: node %u create mles on other " 1741 mlog(0, "%.*s: node %u create mles on other "
1740 "nodes and requests a re-assert\n", 1742 "nodes and requests a re-assert\n",
1741 namelen, lockname, to); 1743 namelen, lockname, to);
1742 reassert = 1; 1744 reassert = 1;
1743 } 1745 }
1744 if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { 1746 if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
1745 mlog(0, "%.*s: node %u has a reference to this " 1747 mlog(0, "%.*s: node %u has a reference to this "
1746 "lockres, set the bit in the refmap\n", 1748 "lockres, set the bit in the refmap\n",
1747 namelen, lockname, to); 1749 namelen, lockname, to);
1748 spin_lock(&res->spinlock); 1750 spin_lock(&res->spinlock);
1749 dlm_lockres_set_refmap_bit(dlm, res, to); 1751 dlm_lockres_set_refmap_bit(dlm, res, to);
1750 spin_unlock(&res->spinlock); 1752 spin_unlock(&res->spinlock);
1751 } 1753 }
1752 } 1754 }
1753 1755
1754 if (reassert) 1756 if (reassert)
1755 goto again; 1757 goto again;
1756 1758
1757 spin_lock(&res->spinlock); 1759 spin_lock(&res->spinlock);
1758 res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 1760 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
1759 spin_unlock(&res->spinlock); 1761 spin_unlock(&res->spinlock);
1760 wake_up(&res->wq); 1762 wake_up(&res->wq);
1761 1763
1762 return ret; 1764 return ret;
1763 } 1765 }
1764 1766
1765 /* 1767 /*
1766 * locks that can be taken here: 1768 * locks that can be taken here:
1767 * dlm->spinlock 1769 * dlm->spinlock
1768 * res->spinlock 1770 * res->spinlock
1769 * mle->spinlock 1771 * mle->spinlock
1770 * dlm->master_list 1772 * dlm->master_list
1771 * 1773 *
1772 * if possible, TRIM THIS DOWN!!! 1774 * if possible, TRIM THIS DOWN!!!
1773 */ 1775 */
1774 int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, 1776 int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1775 void **ret_data) 1777 void **ret_data)
1776 { 1778 {
1777 struct dlm_ctxt *dlm = data; 1779 struct dlm_ctxt *dlm = data;
1778 struct dlm_master_list_entry *mle = NULL; 1780 struct dlm_master_list_entry *mle = NULL;
1779 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; 1781 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1780 struct dlm_lock_resource *res = NULL; 1782 struct dlm_lock_resource *res = NULL;
1781 char *name; 1783 char *name;
1782 unsigned int namelen, hash; 1784 unsigned int namelen, hash;
1783 u32 flags; 1785 u32 flags;
1784 int master_request = 0, have_lockres_ref = 0; 1786 int master_request = 0, have_lockres_ref = 0;
1785 int ret = 0; 1787 int ret = 0;
1786 1788
1787 if (!dlm_grab(dlm)) 1789 if (!dlm_grab(dlm))
1788 return 0; 1790 return 0;
1789 1791
1790 name = assert->name; 1792 name = assert->name;
1791 namelen = assert->namelen; 1793 namelen = assert->namelen;
1792 hash = dlm_lockid_hash(name, namelen); 1794 hash = dlm_lockid_hash(name, namelen);
1793 flags = be32_to_cpu(assert->flags); 1795 flags = be32_to_cpu(assert->flags);
1794 1796
1795 if (namelen > DLM_LOCKID_NAME_MAX) { 1797 if (namelen > DLM_LOCKID_NAME_MAX) {
1796 mlog(ML_ERROR, "Invalid name length!"); 1798 mlog(ML_ERROR, "Invalid name length!");
1797 goto done; 1799 goto done;
1798 } 1800 }
1799 1801
1800 spin_lock(&dlm->spinlock); 1802 spin_lock(&dlm->spinlock);
1801 1803
1802 if (flags) 1804 if (flags)
1803 mlog(0, "assert_master with flags: %u\n", flags); 1805 mlog(0, "assert_master with flags: %u\n", flags);
1804 1806
1805 /* find the MLE */ 1807 /* find the MLE */
1806 spin_lock(&dlm->master_lock); 1808 spin_lock(&dlm->master_lock);
1807 if (!dlm_find_mle(dlm, &mle, name, namelen)) { 1809 if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1808 /* not an error, could be master just re-asserting */ 1810 /* not an error, could be master just re-asserting */
1809 mlog(0, "just got an assert_master from %u, but no " 1811 mlog(0, "just got an assert_master from %u, but no "
1810 "MLE for it! (%.*s)\n", assert->node_idx, 1812 "MLE for it! (%.*s)\n", assert->node_idx,
1811 namelen, name); 1813 namelen, name);
1812 } else { 1814 } else {
1813 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); 1815 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1814 if (bit >= O2NM_MAX_NODES) { 1816 if (bit >= O2NM_MAX_NODES) {
1815 /* not necessarily an error, though less likely. 1817 /* not necessarily an error, though less likely.
1816 * could be master just re-asserting. */ 1818 * could be master just re-asserting. */
1817 mlog(0, "no bits set in the maybe_map, but %u " 1819 mlog(0, "no bits set in the maybe_map, but %u "
1818 "is asserting! (%.*s)\n", assert->node_idx, 1820 "is asserting! (%.*s)\n", assert->node_idx,
1819 namelen, name); 1821 namelen, name);
1820 } else if (bit != assert->node_idx) { 1822 } else if (bit != assert->node_idx) {
1821 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 1823 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1822 mlog(0, "master %u was found, %u should " 1824 mlog(0, "master %u was found, %u should "
1823 "back off\n", assert->node_idx, bit); 1825 "back off\n", assert->node_idx, bit);
1824 } else { 1826 } else {
1825 /* with the fix for bug 569, a higher node 1827 /* with the fix for bug 569, a higher node
1826 * number winning the mastery will respond 1828 * number winning the mastery will respond
1827 * YES to mastery requests, but this node 1829 * YES to mastery requests, but this node
1828 * had no way of knowing. let it pass. */ 1830 * had no way of knowing. let it pass. */
1829 mlog(0, "%u is the lowest node, " 1831 mlog(0, "%u is the lowest node, "
1830 "%u is asserting. (%.*s) %u must " 1832 "%u is asserting. (%.*s) %u must "
1831 "have begun after %u won.\n", bit, 1833 "have begun after %u won.\n", bit,
1832 assert->node_idx, namelen, name, bit, 1834 assert->node_idx, namelen, name, bit,
1833 assert->node_idx); 1835 assert->node_idx);
1834 } 1836 }
1835 } 1837 }
1836 if (mle->type == DLM_MLE_MIGRATION) { 1838 if (mle->type == DLM_MLE_MIGRATION) {
1837 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 1839 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1838 mlog(0, "%s:%.*s: got cleanup assert" 1840 mlog(0, "%s:%.*s: got cleanup assert"
1839 " from %u for migration\n", 1841 " from %u for migration\n",
1840 dlm->name, namelen, name, 1842 dlm->name, namelen, name,
1841 assert->node_idx); 1843 assert->node_idx);
1842 } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { 1844 } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
1843 mlog(0, "%s:%.*s: got unrelated assert" 1845 mlog(0, "%s:%.*s: got unrelated assert"
1844 " from %u for migration, ignoring\n", 1846 " from %u for migration, ignoring\n",
1845 dlm->name, namelen, name, 1847 dlm->name, namelen, name,
1846 assert->node_idx); 1848 assert->node_idx);
1847 __dlm_put_mle(mle); 1849 __dlm_put_mle(mle);
1848 spin_unlock(&dlm->master_lock); 1850 spin_unlock(&dlm->master_lock);
1849 spin_unlock(&dlm->spinlock); 1851 spin_unlock(&dlm->spinlock);
1850 goto done; 1852 goto done;
1851 } 1853 }
1852 } 1854 }
1853 } 1855 }
1854 spin_unlock(&dlm->master_lock); 1856 spin_unlock(&dlm->master_lock);
1855 1857
1856 /* ok everything checks out with the MLE 1858 /* ok everything checks out with the MLE
1857 * now check to see if there is a lockres */ 1859 * now check to see if there is a lockres */
1858 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 1860 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1859 if (res) { 1861 if (res) {
1860 spin_lock(&res->spinlock); 1862 spin_lock(&res->spinlock);
1861 if (res->state & DLM_LOCK_RES_RECOVERING) { 1863 if (res->state & DLM_LOCK_RES_RECOVERING) {
1862 mlog(ML_ERROR, "%u asserting but %.*s is " 1864 mlog(ML_ERROR, "%u asserting but %.*s is "
1863 "RECOVERING!\n", assert->node_idx, namelen, name); 1865 "RECOVERING!\n", assert->node_idx, namelen, name);
1864 goto kill; 1866 goto kill;
1865 } 1867 }
1866 if (!mle) { 1868 if (!mle) {
1867 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && 1869 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1868 res->owner != assert->node_idx) { 1870 res->owner != assert->node_idx) {
1869 mlog(ML_ERROR, "DIE! Mastery assert from %u, " 1871 mlog(ML_ERROR, "DIE! Mastery assert from %u, "
1870 "but current owner is %u! (%.*s)\n", 1872 "but current owner is %u! (%.*s)\n",
1871 assert->node_idx, res->owner, namelen, 1873 assert->node_idx, res->owner, namelen,
1872 name); 1874 name);
1873 __dlm_print_one_lock_resource(res); 1875 __dlm_print_one_lock_resource(res);
1874 BUG(); 1876 BUG();
1875 } 1877 }
1876 } else if (mle->type != DLM_MLE_MIGRATION) { 1878 } else if (mle->type != DLM_MLE_MIGRATION) {
1877 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1879 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1878 /* owner is just re-asserting */ 1880 /* owner is just re-asserting */
1879 if (res->owner == assert->node_idx) { 1881 if (res->owner == assert->node_idx) {
1880 mlog(0, "owner %u re-asserting on " 1882 mlog(0, "owner %u re-asserting on "
1881 "lock %.*s\n", assert->node_idx, 1883 "lock %.*s\n", assert->node_idx,
1882 namelen, name); 1884 namelen, name);
1883 goto ok; 1885 goto ok;
1884 } 1886 }
1885 mlog(ML_ERROR, "got assert_master from " 1887 mlog(ML_ERROR, "got assert_master from "
1886 "node %u, but %u is the owner! " 1888 "node %u, but %u is the owner! "
1887 "(%.*s)\n", assert->node_idx, 1889 "(%.*s)\n", assert->node_idx,
1888 res->owner, namelen, name); 1890 res->owner, namelen, name);
1889 goto kill; 1891 goto kill;
1890 } 1892 }
1891 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 1893 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1892 mlog(ML_ERROR, "got assert from %u, but lock " 1894 mlog(ML_ERROR, "got assert from %u, but lock "
1893 "with no owner should be " 1895 "with no owner should be "
1894 "in-progress! (%.*s)\n", 1896 "in-progress! (%.*s)\n",
1895 assert->node_idx, 1897 assert->node_idx,
1896 namelen, name); 1898 namelen, name);
1897 goto kill; 1899 goto kill;
1898 } 1900 }
1899 } else /* mle->type == DLM_MLE_MIGRATION */ { 1901 } else /* mle->type == DLM_MLE_MIGRATION */ {
1900 /* should only be getting an assert from new master */ 1902 /* should only be getting an assert from new master */
1901 if (assert->node_idx != mle->new_master) { 1903 if (assert->node_idx != mle->new_master) {
1902 mlog(ML_ERROR, "got assert from %u, but " 1904 mlog(ML_ERROR, "got assert from %u, but "
1903 "new master is %u, and old master " 1905 "new master is %u, and old master "
1904 "was %u (%.*s)\n", 1906 "was %u (%.*s)\n",
1905 assert->node_idx, mle->new_master, 1907 assert->node_idx, mle->new_master,
1906 mle->master, namelen, name); 1908 mle->master, namelen, name);
1907 goto kill; 1909 goto kill;
1908 } 1910 }
1909 1911
1910 } 1912 }
1911 ok: 1913 ok:
1912 spin_unlock(&res->spinlock); 1914 spin_unlock(&res->spinlock);
1913 } 1915 }
1914 1916
1915 // mlog(0, "woo! got an assert_master from node %u!\n", 1917 // mlog(0, "woo! got an assert_master from node %u!\n",
1916 // assert->node_idx); 1918 // assert->node_idx);
1917 if (mle) { 1919 if (mle) {
1918 int extra_ref = 0; 1920 int extra_ref = 0;
1919 int nn = -1; 1921 int nn = -1;
1920 int rr, err = 0; 1922 int rr, err = 0;
1921 1923
1922 spin_lock(&mle->spinlock); 1924 spin_lock(&mle->spinlock);
1923 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1925 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1924 extra_ref = 1; 1926 extra_ref = 1;
1925 else { 1927 else {
1926 /* MASTER mle: if any bits set in the response map 1928 /* MASTER mle: if any bits set in the response map
1927 * then the calling node needs to re-assert to clear 1929 * then the calling node needs to re-assert to clear
1928 * up nodes that this node contacted */ 1930 * up nodes that this node contacted */
1929 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 1931 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1930 nn+1)) < O2NM_MAX_NODES) { 1932 nn+1)) < O2NM_MAX_NODES) {
1931 if (nn != dlm->node_num && nn != assert->node_idx) { 1933 if (nn != dlm->node_num && nn != assert->node_idx) {
1932 master_request = 1; 1934 master_request = 1;
1933 break; 1935 break;
1934 } 1936 }
1935 } 1937 }
1936 } 1938 }
1937 mle->master = assert->node_idx; 1939 mle->master = assert->node_idx;
1938 atomic_set(&mle->woken, 1); 1940 atomic_set(&mle->woken, 1);
1939 wake_up(&mle->wq); 1941 wake_up(&mle->wq);
1940 spin_unlock(&mle->spinlock); 1942 spin_unlock(&mle->spinlock);
1941 1943
1942 if (res) { 1944 if (res) {
1943 int wake = 0; 1945 int wake = 0;
1944 spin_lock(&res->spinlock); 1946 spin_lock(&res->spinlock);
1945 if (mle->type == DLM_MLE_MIGRATION) { 1947 if (mle->type == DLM_MLE_MIGRATION) {
1946 mlog(0, "finishing off migration of lockres %.*s, " 1948 mlog(0, "finishing off migration of lockres %.*s, "
1947 "from %u to %u\n", 1949 "from %u to %u\n",
1948 res->lockname.len, res->lockname.name, 1950 res->lockname.len, res->lockname.name,
1949 dlm->node_num, mle->new_master); 1951 dlm->node_num, mle->new_master);
1950 res->state &= ~DLM_LOCK_RES_MIGRATING; 1952 res->state &= ~DLM_LOCK_RES_MIGRATING;
1951 wake = 1; 1953 wake = 1;
1952 dlm_change_lockres_owner(dlm, res, mle->new_master); 1954 dlm_change_lockres_owner(dlm, res, mle->new_master);
1953 BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 1955 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1954 } else { 1956 } else {
1955 dlm_change_lockres_owner(dlm, res, mle->master); 1957 dlm_change_lockres_owner(dlm, res, mle->master);
1956 } 1958 }
1957 spin_unlock(&res->spinlock); 1959 spin_unlock(&res->spinlock);
1958 have_lockres_ref = 1; 1960 have_lockres_ref = 1;
1959 if (wake) 1961 if (wake)
1960 wake_up(&res->wq); 1962 wake_up(&res->wq);
1961 } 1963 }
1962 1964
1963 /* master is known, detach if not already detached. 1965 /* master is known, detach if not already detached.
1964 * ensures that only one assert_master call will happen 1966 * ensures that only one assert_master call will happen
1965 * on this mle. */ 1967 * on this mle. */
1966 spin_lock(&dlm->master_lock); 1968 spin_lock(&dlm->master_lock);
1967 1969
1968 rr = atomic_read(&mle->mle_refs.refcount); 1970 rr = atomic_read(&mle->mle_refs.refcount);
1969 if (mle->inuse > 0) { 1971 if (mle->inuse > 0) {
1970 if (extra_ref && rr < 3) 1972 if (extra_ref && rr < 3)
1971 err = 1; 1973 err = 1;
1972 else if (!extra_ref && rr < 2) 1974 else if (!extra_ref && rr < 2)
1973 err = 1; 1975 err = 1;
1974 } else { 1976 } else {
1975 if (extra_ref && rr < 2) 1977 if (extra_ref && rr < 2)
1976 err = 1; 1978 err = 1;
1977 else if (!extra_ref && rr < 1) 1979 else if (!extra_ref && rr < 1)
1978 err = 1; 1980 err = 1;
1979 } 1981 }
1980 if (err) { 1982 if (err) {
1981 mlog(ML_ERROR, "%s:%.*s: got assert master from %u " 1983 mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
1982 "that will mess up this node, refs=%d, extra=%d, " 1984 "that will mess up this node, refs=%d, extra=%d, "
1983 "inuse=%d\n", dlm->name, namelen, name, 1985 "inuse=%d\n", dlm->name, namelen, name,
1984 assert->node_idx, rr, extra_ref, mle->inuse); 1986 assert->node_idx, rr, extra_ref, mle->inuse);
1985 dlm_print_one_mle(mle); 1987 dlm_print_one_mle(mle);
1986 } 1988 }
1987 __dlm_unlink_mle(dlm, mle); 1989 __dlm_unlink_mle(dlm, mle);
1988 __dlm_mle_detach_hb_events(dlm, mle); 1990 __dlm_mle_detach_hb_events(dlm, mle);
1989 __dlm_put_mle(mle); 1991 __dlm_put_mle(mle);
1990 if (extra_ref) { 1992 if (extra_ref) {
1991 /* the assert master message now balances the extra 1993 /* the assert master message now balances the extra
1992 * ref given by the master / migration request message. 1994 * ref given by the master / migration request message.
1993 * if this is the last put, it will be removed 1995 * if this is the last put, it will be removed
1994 * from the list. */ 1996 * from the list. */
1995 __dlm_put_mle(mle); 1997 __dlm_put_mle(mle);
1996 } 1998 }
1997 spin_unlock(&dlm->master_lock); 1999 spin_unlock(&dlm->master_lock);
1998 } else if (res) { 2000 } else if (res) {
1999 if (res->owner != assert->node_idx) { 2001 if (res->owner != assert->node_idx) {
2000 mlog(0, "assert_master from %u, but current " 2002 mlog(0, "assert_master from %u, but current "
2001 "owner is %u (%.*s), no mle\n", assert->node_idx, 2003 "owner is %u (%.*s), no mle\n", assert->node_idx,
2002 res->owner, namelen, name); 2004 res->owner, namelen, name);
2003 } 2005 }
2004 } 2006 }
2005 spin_unlock(&dlm->spinlock); 2007 spin_unlock(&dlm->spinlock);
2006 2008
2007 done: 2009 done:
2008 ret = 0; 2010 ret = 0;
2009 if (res) { 2011 if (res) {
2010 spin_lock(&res->spinlock); 2012 spin_lock(&res->spinlock);
2011 res->state |= DLM_LOCK_RES_SETREF_INPROG; 2013 res->state |= DLM_LOCK_RES_SETREF_INPROG;
2012 spin_unlock(&res->spinlock); 2014 spin_unlock(&res->spinlock);
2013 *ret_data = (void *)res; 2015 *ret_data = (void *)res;
2014 } 2016 }
2015 dlm_put(dlm); 2017 dlm_put(dlm);
2016 if (master_request) { 2018 if (master_request) {
2017 mlog(0, "need to tell master to reassert\n"); 2019 mlog(0, "need to tell master to reassert\n");
2018 /* positive. negative would shoot down the node. */ 2020 /* positive. negative would shoot down the node. */
2019 ret |= DLM_ASSERT_RESPONSE_REASSERT; 2021 ret |= DLM_ASSERT_RESPONSE_REASSERT;
2020 if (!have_lockres_ref) { 2022 if (!have_lockres_ref) {
2021 mlog(ML_ERROR, "strange, got assert from %u, MASTER " 2023 mlog(ML_ERROR, "strange, got assert from %u, MASTER "
2022 "mle present here for %s:%.*s, but no lockres!\n", 2024 "mle present here for %s:%.*s, but no lockres!\n",
2023 assert->node_idx, dlm->name, namelen, name); 2025 assert->node_idx, dlm->name, namelen, name);
2024 } 2026 }
2025 } 2027 }
2026 if (have_lockres_ref) { 2028 if (have_lockres_ref) {
2027 /* let the master know we have a reference to the lockres */ 2029 /* let the master know we have a reference to the lockres */
2028 ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; 2030 ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
2029 mlog(0, "%s:%.*s: got assert from %u, need a ref\n", 2031 mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
2030 dlm->name, namelen, name, assert->node_idx); 2032 dlm->name, namelen, name, assert->node_idx);
2031 } 2033 }
2032 return ret; 2034 return ret;
2033 2035
2034 kill: 2036 kill:
2035 /* kill the caller! */ 2037 /* kill the caller! */
2036 mlog(ML_ERROR, "Bad message received from another node. Dumping state " 2038 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
2037 "and killing the other node now! This node is OK and can continue.\n"); 2039 "and killing the other node now! This node is OK and can continue.\n");
2038 __dlm_print_one_lock_resource(res); 2040 __dlm_print_one_lock_resource(res);
2039 spin_unlock(&res->spinlock); 2041 spin_unlock(&res->spinlock);
2040 spin_unlock(&dlm->spinlock); 2042 spin_unlock(&dlm->spinlock);
2041 *ret_data = (void *)res; 2043 *ret_data = (void *)res;
2042 dlm_put(dlm); 2044 dlm_put(dlm);
2043 return -EINVAL; 2045 return -EINVAL;
2044 } 2046 }
2045 2047
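The handler above replies with bit flags rather than a plain status: DLM_ASSERT_RESPONSE_REASSERT asks the asserting node to broadcast again, and DLM_ASSERT_RESPONSE_MASTERY_REF records that this node holds a reference on the lockres. Below is a minimal sketch of how the asserting side might decode such a reply; the helper name and parameters are illustrative assumptions, not the actual caller in this file.

	/* Illustrative only: decode the reply bits from the remote
	 * dlm_assert_master_handler(). 'reply' is the status the remote
	 * handler returned through o2net; names are assumptions. */
	static void example_handle_assert_reply(struct dlm_ctxt *dlm,
						struct dlm_lock_resource *res,
						int to, int reply, int *reassert)
	{
		if (reply < 0)
			return;		/* network error handled elsewhere */

		if (reply & DLM_ASSERT_RESPONSE_REASSERT)
			*reassert = 1;	/* some node still needs convincing */

		if (reply & DLM_ASSERT_RESPONSE_MASTERY_REF) {
			/* the remote node holds a ref: note it in the refmap */
			spin_lock(&res->spinlock);
			dlm_lockres_set_refmap_bit(dlm, res, to);
			spin_unlock(&res->spinlock);
		}
	}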
2046 void dlm_assert_master_post_handler(int status, void *data, void *ret_data) 2048 void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
2047 { 2049 {
2048 struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; 2050 struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
2049 2051
2050 if (ret_data) { 2052 if (ret_data) {
2051 spin_lock(&res->spinlock); 2053 spin_lock(&res->spinlock);
2052 res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 2054 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
2053 spin_unlock(&res->spinlock); 2055 spin_unlock(&res->spinlock);
2054 wake_up(&res->wq); 2056 wake_up(&res->wq);
2055 dlm_lockres_put(res); 2057 dlm_lockres_put(res);
2056 } 2058 }
2057 return; 2059 return;
2058 } 2060 }
2059 2061
2060 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, 2062 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2061 struct dlm_lock_resource *res, 2063 struct dlm_lock_resource *res,
2062 int ignore_higher, u8 request_from, u32 flags) 2064 int ignore_higher, u8 request_from, u32 flags)
2063 { 2065 {
2064 struct dlm_work_item *item; 2066 struct dlm_work_item *item;
2065 item = kzalloc(sizeof(*item), GFP_ATOMIC); 2067 item = kzalloc(sizeof(*item), GFP_ATOMIC);
2066 if (!item) 2068 if (!item)
2067 return -ENOMEM; 2069 return -ENOMEM;
2068 2070
2069 2071
2070 /* queue up work for dlm_assert_master_worker */ 2072 /* queue up work for dlm_assert_master_worker */
2071 dlm_grab(dlm); /* get an extra ref for the work item */ 2073 dlm_grab(dlm); /* get an extra ref for the work item */
2072 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); 2074 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
2073 item->u.am.lockres = res; /* already have a ref */ 2075 item->u.am.lockres = res; /* already have a ref */
2074 /* can optionally ignore node numbers higher than this node */ 2076 /* can optionally ignore node numbers higher than this node */
2075 item->u.am.ignore_higher = ignore_higher; 2077 item->u.am.ignore_higher = ignore_higher;
2076 item->u.am.request_from = request_from; 2078 item->u.am.request_from = request_from;
2077 item->u.am.flags = flags; 2079 item->u.am.flags = flags;
2078 2080
2079 if (ignore_higher) 2081 if (ignore_higher)
2080 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 2082 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2081 res->lockname.name); 2083 res->lockname.name);
2082 2084
2083 spin_lock(&dlm->work_lock); 2085 spin_lock(&dlm->work_lock);
2084 list_add_tail(&item->list, &dlm->work_list); 2086 list_add_tail(&item->list, &dlm->work_list);
2085 spin_unlock(&dlm->work_lock); 2087 spin_unlock(&dlm->work_lock);
2086 2088
2087 queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2089 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2088 return 0; 2090 return 0;
2089 } 2091 }
2090 2092
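A hypothetical call site, only to show the intended shape of a dispatch; the request_from value and flags would normally come from the incoming master request, and a real caller also owns the lockres reference that the queued work item consumes.

	/* Hypothetical usage sketch: queue a deferred assert_master broadcast
	 * instead of sending it from handler context. Values are illustrative. */
	static int example_defer_assert(struct dlm_ctxt *dlm,
					struct dlm_lock_resource *res,
					u8 request_from)
	{
		int ret;

		ret = dlm_dispatch_assert_master(dlm, res, 0 /* ignore_higher */,
						 request_from, 0 /* flags */);
		if (ret < 0) {
			mlog_errno(ret);
			/* a real caller must also drop the lockres ref it set
			 * aside for the work item, since the worker never runs */
		}
		return ret;
	}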
2091 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) 2093 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
2092 { 2094 {
2093 struct dlm_ctxt *dlm = data; 2095 struct dlm_ctxt *dlm = data;
2094 int ret = 0; 2096 int ret = 0;
2095 struct dlm_lock_resource *res; 2097 struct dlm_lock_resource *res;
2096 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2098 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
2097 int ignore_higher; 2099 int ignore_higher;
2098 int bit; 2100 int bit;
2099 u8 request_from; 2101 u8 request_from;
2100 u32 flags; 2102 u32 flags;
2101 2103
2102 dlm = item->dlm; 2104 dlm = item->dlm;
2103 res = item->u.am.lockres; 2105 res = item->u.am.lockres;
2104 ignore_higher = item->u.am.ignore_higher; 2106 ignore_higher = item->u.am.ignore_higher;
2105 request_from = item->u.am.request_from; 2107 request_from = item->u.am.request_from;
2106 flags = item->u.am.flags; 2108 flags = item->u.am.flags;
2107 2109
2108 spin_lock(&dlm->spinlock); 2110 spin_lock(&dlm->spinlock);
2109 memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); 2111 memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
2110 spin_unlock(&dlm->spinlock); 2112 spin_unlock(&dlm->spinlock);
2111 2113
2112 clear_bit(dlm->node_num, nodemap); 2114 clear_bit(dlm->node_num, nodemap);
2113 if (ignore_higher) { 2115 if (ignore_higher) {
2114 /* if this is just to clear up mles for nodes below 2116 /* if this is just to clear up mles for nodes below
2115 * this node, do not send the message to the original 2117 * this node, do not send the message to the original
2116 * caller or any node number higher than this */ 2118 * caller or any node number higher than this */
2117 clear_bit(request_from, nodemap); 2119 clear_bit(request_from, nodemap);
2118 bit = dlm->node_num; 2120 bit = dlm->node_num;
2119 while (1) { 2121 while (1) {
2120 bit = find_next_bit(nodemap, O2NM_MAX_NODES, 2122 bit = find_next_bit(nodemap, O2NM_MAX_NODES,
2121 bit+1); 2123 bit+1);
2122 if (bit >= O2NM_MAX_NODES) 2124 if (bit >= O2NM_MAX_NODES)
2123 break; 2125 break;
2124 clear_bit(bit, nodemap); 2126 clear_bit(bit, nodemap);
2125 } 2127 }
2126 } 2128 }
2127 2129
2128 /* 2130 /*
2129 * If we're migrating this lock to someone else, we are no 2131 * If we're migrating this lock to someone else, we are no
2130 * longer allowed to assert our own mastery. OTOH, we need to 2132 * longer allowed to assert our own mastery. OTOH, we need to
2131 * prevent migration from starting while we're still asserting 2133 * prevent migration from starting while we're still asserting
2132 * our dominance. The reserved ast delays migration. 2134 * our dominance. The reserved ast delays migration.
2133 */ 2135 */
2134 spin_lock(&res->spinlock); 2136 spin_lock(&res->spinlock);
2135 if (res->state & DLM_LOCK_RES_MIGRATING) { 2137 if (res->state & DLM_LOCK_RES_MIGRATING) {
2136 mlog(0, "Someone asked us to assert mastery, but we're " 2138 mlog(0, "Someone asked us to assert mastery, but we're "
2137 "in the middle of migration. Skipping assert, " 2139 "in the middle of migration. Skipping assert, "
2138 "the new master will handle that.\n"); 2140 "the new master will handle that.\n");
2139 spin_unlock(&res->spinlock); 2141 spin_unlock(&res->spinlock);
2140 goto put; 2142 goto put;
2141 } else 2143 } else
2142 __dlm_lockres_reserve_ast(res); 2144 __dlm_lockres_reserve_ast(res);
2143 spin_unlock(&res->spinlock); 2145 spin_unlock(&res->spinlock);
2144 2146
2145 /* this call now finishes out the nodemap 2147 /* this call now finishes out the nodemap
2146 * even if one or more nodes die */ 2148 * even if one or more nodes die */
2147 mlog(0, "worker about to master %.*s here, this=%u\n", 2149 mlog(0, "worker about to master %.*s here, this=%u\n",
2148 res->lockname.len, res->lockname.name, dlm->node_num); 2150 res->lockname.len, res->lockname.name, dlm->node_num);
2149 ret = dlm_do_assert_master(dlm, res, nodemap, flags); 2151 ret = dlm_do_assert_master(dlm, res, nodemap, flags);
2150 if (ret < 0) { 2152 if (ret < 0) {
2151 /* no need to restart, we are done */ 2153 /* no need to restart, we are done */
2152 if (!dlm_is_host_down(ret)) 2154 if (!dlm_is_host_down(ret))
2153 mlog_errno(ret); 2155 mlog_errno(ret);
2154 } 2156 }
2155 2157
2156 /* Ok, we've asserted ourselves. Let's let migration start. */ 2158 /* Ok, we've asserted ourselves. Let's let migration start. */
2157 dlm_lockres_release_ast(dlm, res); 2159 dlm_lockres_release_ast(dlm, res);
2158 2160
2159 put: 2161 put:
2160 dlm_lockres_drop_inflight_worker(dlm, res); 2162 dlm_lockres_drop_inflight_worker(dlm, res);
2161 2163
2162 dlm_lockres_put(res); 2164 dlm_lockres_put(res);
2163 2165
2164 mlog(0, "finished with dlm_assert_master_worker\n"); 2166 mlog(0, "finished with dlm_assert_master_worker\n");
2165 } 2167 }
2166 2168
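The ignore_higher branch above trims the copied domain map down to the lower-numbered nodes that might still hold a stale mle: it drops this node, the original requester, and every node number above this node. A compact, illustrative restatement of that trimming follows (helper name assumed).

	/* Illustrative only: the same bitmap trimming done by the
	 * ignore_higher branch in dlm_assert_master_worker(), as a helper. */
	static void example_trim_nodemap(unsigned long *nodemap, u8 this_node,
					 u8 request_from)
	{
		int bit;

		clear_bit(this_node, nodemap);		/* never message ourselves */
		clear_bit(request_from, nodemap);	/* nor the original caller */

		for (bit = find_next_bit(nodemap, O2NM_MAX_NODES, this_node + 1);
		     bit < O2NM_MAX_NODES;
		     bit = find_next_bit(nodemap, O2NM_MAX_NODES, bit + 1))
			clear_bit(bit, nodemap);	/* nor any higher node */
	}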
2167 /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. 2169 /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
2168 * We cannot wait for node recovery to complete to begin mastering this 2170 * We cannot wait for node recovery to complete to begin mastering this
2169 * lockres because this lockres is used to kick off recovery! ;-) 2171 * lockres because this lockres is used to kick off recovery! ;-)
2170 * So, do a pre-check on all living nodes to see if any of those nodes 2172 * So, do a pre-check on all living nodes to see if any of those nodes
2171 * think that $RECOVERY is currently mastered by a dead node. If so, 2173 * think that $RECOVERY is currently mastered by a dead node. If so,
2172 * we wait a short time to allow that node to get notified by its own 2174 * we wait a short time to allow that node to get notified by its own
2173 * heartbeat stack, then check again. All $RECOVERY lock resources 2175 * heartbeat stack, then check again. All $RECOVERY lock resources
2174 * mastered by dead nodes are purged when the heartbeat callback is 2176 * mastered by dead nodes are purged when the heartbeat callback is
2175 * fired, so we can know for sure that it is safe to continue once 2177 * fired, so we can know for sure that it is safe to continue once
2176 * the node returns a live node or no node. */ 2178 * the node returns a live node or no node. */
2177 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 2179 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2178 struct dlm_lock_resource *res) 2180 struct dlm_lock_resource *res)
2179 { 2181 {
2180 struct dlm_node_iter iter; 2182 struct dlm_node_iter iter;
2181 int nodenum; 2183 int nodenum;
2182 int ret = 0; 2184 int ret = 0;
2183 u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; 2185 u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
2184 2186
2185 spin_lock(&dlm->spinlock); 2187 spin_lock(&dlm->spinlock);
2186 dlm_node_iter_init(dlm->domain_map, &iter); 2188 dlm_node_iter_init(dlm->domain_map, &iter);
2187 spin_unlock(&dlm->spinlock); 2189 spin_unlock(&dlm->spinlock);
2188 2190
2189 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 2191 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2190 /* do not send to self */ 2192 /* do not send to self */
2191 if (nodenum == dlm->node_num) 2193 if (nodenum == dlm->node_num)
2192 continue; 2194 continue;
2193 ret = dlm_do_master_requery(dlm, res, nodenum, &master); 2195 ret = dlm_do_master_requery(dlm, res, nodenum, &master);
2194 if (ret < 0) { 2196 if (ret < 0) {
2195 mlog_errno(ret); 2197 mlog_errno(ret);
2196 if (!dlm_is_host_down(ret)) 2198 if (!dlm_is_host_down(ret))
2197 BUG(); 2199 BUG();
2198 /* host is down, so answer for that node would be 2200 /* host is down, so answer for that node would be
2199 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ 2201 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
2200 ret = 0; 2202 ret = 0;
2201 } 2203 }
2202 2204
2203 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { 2205 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
2204 /* check to see if this master is in the recovery map */ 2206 /* check to see if this master is in the recovery map */
2205 spin_lock(&dlm->spinlock); 2207 spin_lock(&dlm->spinlock);
2206 if (test_bit(master, dlm->recovery_map)) { 2208 if (test_bit(master, dlm->recovery_map)) {
2207 mlog(ML_NOTICE, "%s: node %u has not seen " 2209 mlog(ML_NOTICE, "%s: node %u has not seen "
2208 "node %u go down yet, and thinks the " 2210 "node %u go down yet, and thinks the "
2209 "dead node is mastering the recovery " 2211 "dead node is mastering the recovery "
2210 "lock. must wait.\n", dlm->name, 2212 "lock. must wait.\n", dlm->name,
2211 nodenum, master); 2213 nodenum, master);
2212 ret = -EAGAIN; 2214 ret = -EAGAIN;
2213 } 2215 }
2214 spin_unlock(&dlm->spinlock); 2216 spin_unlock(&dlm->spinlock);
2215 mlog(0, "%s: reco lock master is %u\n", dlm->name, 2217 mlog(0, "%s: reco lock master is %u\n", dlm->name,
2216 master); 2218 master);
2217 break; 2219 break;
2218 } 2220 }
2219 } 2221 }
2220 return ret; 2222 return ret;
2221 } 2223 }
2222 2224
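On -EAGAIN the comment above suggests backing off briefly so the other node's heartbeat stack can notice the death and purge the stale $RECOVERY master. A minimal, illustrative retry loop under that assumption; the real caller is not shown here and the delay value is arbitrary.

	/* Illustrative retry loop only; assumes <linux/delay.h> for msleep().
	 * The real caller may structure its retry differently. */
	while (dlm_pre_master_reco_lockres(dlm, res) == -EAGAIN) {
		/* give the remote heartbeat stack time to see the node death */
		msleep(100);
	}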
2223 /* 2225 /*
2224 * DLM_DEREF_LOCKRES_MSG 2226 * DLM_DEREF_LOCKRES_MSG
2225 */ 2227 */
2226 2228
2227 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2229 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2228 { 2230 {
2229 struct dlm_deref_lockres deref; 2231 struct dlm_deref_lockres deref;
2230 int ret = 0, r; 2232 int ret = 0, r;
2231 const char *lockname; 2233 const char *lockname;
2232 unsigned int namelen; 2234 unsigned int namelen;
2233 2235
2234 lockname = res->lockname.name; 2236 lockname = res->lockname.name;
2235 namelen = res->lockname.len; 2237 namelen = res->lockname.len;
2236 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2238 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2237 2239
2238 memset(&deref, 0, sizeof(deref)); 2240 memset(&deref, 0, sizeof(deref));
2239 deref.node_idx = dlm->node_num; 2241 deref.node_idx = dlm->node_num;
2240 deref.namelen = namelen; 2242 deref.namelen = namelen;
2241 memcpy(deref.name, lockname, namelen); 2243 memcpy(deref.name, lockname, namelen);
2242 2244
2243 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2245 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2244 &deref, sizeof(deref), res->owner, &r); 2246 &deref, sizeof(deref), res->owner, &r);
2245 if (ret < 0) 2247 if (ret < 0)
2246 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", 2248 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2247 dlm->name, namelen, lockname, ret, res->owner); 2249 dlm->name, namelen, lockname, ret, res->owner);
2248 else if (r < 0) { 2250 else if (r < 0) {
2249 /* BAD. other node says I did not have a ref. */ 2251 /* BAD. other node says I did not have a ref. */
2250 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", 2252 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2251 dlm->name, namelen, lockname, res->owner, r); 2253 dlm->name, namelen, lockname, res->owner, r);
2252 dlm_print_one_lock_resource(res); 2254 dlm_print_one_lock_resource(res);
2253 BUG(); 2255 BUG();
2254 } 2256 }
2255 return ret; 2257 return ret;
2256 } 2258 }
2257 2259
2258 int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 2260 int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2259 void **ret_data) 2261 void **ret_data)
2260 { 2262 {
2261 struct dlm_ctxt *dlm = data; 2263 struct dlm_ctxt *dlm = data;
2262 struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; 2264 struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
2263 struct dlm_lock_resource *res = NULL; 2265 struct dlm_lock_resource *res = NULL;
2264 char *name; 2266 char *name;
2265 unsigned int namelen; 2267 unsigned int namelen;
2266 int ret = -EINVAL; 2268 int ret = -EINVAL;
2267 u8 node; 2269 u8 node;
2268 unsigned int hash; 2270 unsigned int hash;
2269 struct dlm_work_item *item; 2271 struct dlm_work_item *item;
2270 int cleared = 0; 2272 int cleared = 0;
2271 int dispatch = 0; 2273 int dispatch = 0;
2272 2274
2273 if (!dlm_grab(dlm)) 2275 if (!dlm_grab(dlm))
2274 return 0; 2276 return 0;
2275 2277
2276 name = deref->name; 2278 name = deref->name;
2277 namelen = deref->namelen; 2279 namelen = deref->namelen;
2278 node = deref->node_idx; 2280 node = deref->node_idx;
2279 2281
2280 if (namelen > DLM_LOCKID_NAME_MAX) { 2282 if (namelen > DLM_LOCKID_NAME_MAX) {
2281 mlog(ML_ERROR, "Invalid name length!"); 2283 mlog(ML_ERROR, "Invalid name length!");
2282 goto done; 2284 goto done;
2283 } 2285 }
2284 if (deref->node_idx >= O2NM_MAX_NODES) { 2286 if (deref->node_idx >= O2NM_MAX_NODES) {
2285 mlog(ML_ERROR, "Invalid node number: %u\n", node); 2287 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2286 goto done; 2288 goto done;
2287 } 2289 }
2288 2290
2289 hash = dlm_lockid_hash(name, namelen); 2291 hash = dlm_lockid_hash(name, namelen);
2290 2292
2291 spin_lock(&dlm->spinlock); 2293 spin_lock(&dlm->spinlock);
2292 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); 2294 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2293 if (!res) { 2295 if (!res) {
2294 spin_unlock(&dlm->spinlock); 2296 spin_unlock(&dlm->spinlock);
2295 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", 2297 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2296 dlm->name, namelen, name); 2298 dlm->name, namelen, name);
2297 goto done; 2299 goto done;
2298 } 2300 }
2299 spin_unlock(&dlm->spinlock); 2301 spin_unlock(&dlm->spinlock);
2300 2302
2301 spin_lock(&res->spinlock); 2303 spin_lock(&res->spinlock);
2302 if (res->state & DLM_LOCK_RES_SETREF_INPROG) 2304 if (res->state & DLM_LOCK_RES_SETREF_INPROG)
2303 dispatch = 1; 2305 dispatch = 1;
2304 else { 2306 else {
2305 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2307 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2306 if (test_bit(node, res->refmap)) { 2308 if (test_bit(node, res->refmap)) {
2307 dlm_lockres_clear_refmap_bit(dlm, res, node); 2309 dlm_lockres_clear_refmap_bit(dlm, res, node);
2308 cleared = 1; 2310 cleared = 1;
2309 } 2311 }
2310 } 2312 }
2311 spin_unlock(&res->spinlock); 2313 spin_unlock(&res->spinlock);
2312 2314
2313 if (!dispatch) { 2315 if (!dispatch) {
2314 if (cleared) 2316 if (cleared)
2315 dlm_lockres_calc_usage(dlm, res); 2317 dlm_lockres_calc_usage(dlm, res);
2316 else { 2318 else {
2317 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 2319 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2318 "but it is already dropped!\n", dlm->name, 2320 "but it is already dropped!\n", dlm->name,
2319 res->lockname.len, res->lockname.name, node); 2321 res->lockname.len, res->lockname.name, node);
2320 dlm_print_one_lock_resource(res); 2322 dlm_print_one_lock_resource(res);
2321 } 2323 }
2322 ret = 0; 2324 ret = 0;
2323 goto done; 2325 goto done;
2324 } 2326 }
2325 2327
2326 item = kzalloc(sizeof(*item), GFP_NOFS); 2328 item = kzalloc(sizeof(*item), GFP_NOFS);
2327 if (!item) { 2329 if (!item) {
2328 ret = -ENOMEM; 2330 ret = -ENOMEM;
2329 mlog_errno(ret); 2331 mlog_errno(ret);
2330 goto done; 2332 goto done;
2331 } 2333 }
2332 2334
2333 dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); 2335 dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
2334 item->u.dl.deref_res = res; 2336 item->u.dl.deref_res = res;
2335 item->u.dl.deref_node = node; 2337 item->u.dl.deref_node = node;
2336 2338
2337 spin_lock(&dlm->work_lock); 2339 spin_lock(&dlm->work_lock);
2338 list_add_tail(&item->list, &dlm->work_list); 2340 list_add_tail(&item->list, &dlm->work_list);
2339 spin_unlock(&dlm->work_lock); 2341 spin_unlock(&dlm->work_lock);
2340 2342
2341 queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2343 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2342 return 0; 2344 return 0;
2343 2345
2344 done: 2346 done:
2345 if (res) 2347 if (res)
2346 dlm_lockres_put(res); 2348 dlm_lockres_put(res);
2347 dlm_put(dlm); 2349 dlm_put(dlm);
2348 2350
2349 return ret; 2351 return ret;
2350 } 2352 }
2351 2353
2352 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) 2354 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2353 { 2355 {
2354 struct dlm_ctxt *dlm; 2356 struct dlm_ctxt *dlm;
2355 struct dlm_lock_resource *res; 2357 struct dlm_lock_resource *res;
2356 u8 node; 2358 u8 node;
2357 u8 cleared = 0; 2359 u8 cleared = 0;
2358 2360
2359 dlm = item->dlm; 2361 dlm = item->dlm;
2360 res = item->u.dl.deref_res; 2362 res = item->u.dl.deref_res;
2361 node = item->u.dl.deref_node; 2363 node = item->u.dl.deref_node;
2362 2364
2363 spin_lock(&res->spinlock); 2365 spin_lock(&res->spinlock);
2364 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2366 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2365 if (test_bit(node, res->refmap)) { 2367 if (test_bit(node, res->refmap)) {
2366 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2368 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2367 dlm_lockres_clear_refmap_bit(dlm, res, node); 2369 dlm_lockres_clear_refmap_bit(dlm, res, node);
2368 cleared = 1; 2370 cleared = 1;
2369 } 2371 }
2370 spin_unlock(&res->spinlock); 2372 spin_unlock(&res->spinlock);
2371 2373
2372 if (cleared) { 2374 if (cleared) {
2373 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 2375 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2374 dlm->name, res->lockname.len, res->lockname.name, node); 2376 dlm->name, res->lockname.len, res->lockname.name, node);
2375 dlm_lockres_calc_usage(dlm, res); 2377 dlm_lockres_calc_usage(dlm, res);
2376 } else { 2378 } else {
2377 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 2379 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2378 "but it is already dropped!\n", dlm->name, 2380 "but it is already dropped!\n", dlm->name,
2379 res->lockname.len, res->lockname.name, node); 2381 res->lockname.len, res->lockname.name, node);
2380 dlm_print_one_lock_resource(res); 2382 dlm_print_one_lock_resource(res);
2381 } 2383 }
2382 2384
2383 dlm_lockres_put(res); 2385 dlm_lockres_put(res);
2384 } 2386 }
2385 2387
2386 /* 2388 /*
2387 * A migrateable resource is one that is: 2389 * A migrateable resource is one that is:
2388 * 1. locally mastered, and, 2390 * 1. locally mastered, and,
2389 * 2. zero local locks, and, 2391 * 2. zero local locks, and,
2390 * 3. one or more non-local locks, or, one or more references 2392 * 3. one or more non-local locks, or, one or more references
2391 * Returns 1 if yes, 0 if not. 2393 * Returns 1 if yes, 0 if not.
2392 */ 2394 */
2393 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2395 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2394 struct dlm_lock_resource *res) 2396 struct dlm_lock_resource *res)
2395 { 2397 {
2396 enum dlm_lockres_list idx; 2398 enum dlm_lockres_list idx;
2397 int nonlocal = 0, node_ref; 2399 int nonlocal = 0, node_ref;
2398 struct list_head *queue; 2400 struct list_head *queue;
2399 struct dlm_lock *lock; 2401 struct dlm_lock *lock;
2400 u64 cookie; 2402 u64 cookie;
2401 2403
2402 assert_spin_locked(&res->spinlock); 2404 assert_spin_locked(&res->spinlock);
2403 2405
2404 /* delay migration when the lockres is in MIGRATING state */ 2406 /* delay migration when the lockres is in MIGRATING state */
2405 if (res->state & DLM_LOCK_RES_MIGRATING) 2407 if (res->state & DLM_LOCK_RES_MIGRATING)
2406 return 0; 2408 return 0;
2407 2409
2408 /* delay migration when the lockres is in RECOVERING state */ 2410 /* delay migration when the lockres is in RECOVERING state */
2409 if (res->state & DLM_LOCK_RES_RECOVERING) 2411 if (res->state & DLM_LOCK_RES_RECOVERING)
2410 return 0; 2412 return 0;
2411 2413
2412 if (res->owner != dlm->node_num) 2414 if (res->owner != dlm->node_num)
2413 return 0; 2415 return 0;
2414 2416
2415 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { 2417 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2416 queue = dlm_list_idx_to_ptr(res, idx); 2418 queue = dlm_list_idx_to_ptr(res, idx);
2417 list_for_each_entry(lock, queue, list) { 2419 list_for_each_entry(lock, queue, list) {
2418 if (lock->ml.node != dlm->node_num) { 2420 if (lock->ml.node != dlm->node_num) {
2419 nonlocal++; 2421 nonlocal++;
2420 continue; 2422 continue;
2421 } 2423 }
2422 cookie = be64_to_cpu(lock->ml.cookie); 2424 cookie = be64_to_cpu(lock->ml.cookie);
2423 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " 2425 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
2424 "%s list\n", dlm->name, res->lockname.len, 2426 "%s list\n", dlm->name, res->lockname.len,
2425 res->lockname.name, 2427 res->lockname.name,
2426 dlm_get_lock_cookie_node(cookie), 2428 dlm_get_lock_cookie_node(cookie),
2427 dlm_get_lock_cookie_seq(cookie), 2429 dlm_get_lock_cookie_seq(cookie),
2428 dlm_list_in_text(idx)); 2430 dlm_list_in_text(idx));
2429 return 0; 2431 return 0;
2430 } 2432 }
2431 } 2433 }
2432 2434
2433 if (!nonlocal) { 2435 if (!nonlocal) {
2434 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 2436 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2435 if (node_ref >= O2NM_MAX_NODES) 2437 if (node_ref >= O2NM_MAX_NODES)
2436 return 0; 2438 return 0;
2437 } 2439 }
2438 2440
2439 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, 2441 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
2440 res->lockname.name); 2442 res->lockname.name);
2441 2443
2442 return 1; 2444 return 1;
2443 } 2445 }
2444 2446
2445 /* 2447 /*
2446 * DLM_MIGRATE_LOCKRES 2448 * DLM_MIGRATE_LOCKRES
2447 */ 2449 */
2448 2450
2449 2451
2450 static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2452 static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2451 struct dlm_lock_resource *res, u8 target) 2453 struct dlm_lock_resource *res, u8 target)
2452 { 2454 {
2453 struct dlm_master_list_entry *mle = NULL; 2455 struct dlm_master_list_entry *mle = NULL;
2454 struct dlm_master_list_entry *oldmle = NULL; 2456 struct dlm_master_list_entry *oldmle = NULL;
2455 struct dlm_migratable_lockres *mres = NULL; 2457 struct dlm_migratable_lockres *mres = NULL;
2456 int ret = 0; 2458 int ret = 0;
2457 const char *name; 2459 const char *name;
2458 unsigned int namelen; 2460 unsigned int namelen;
2459 int mle_added = 0; 2461 int mle_added = 0;
2460 int wake = 0; 2462 int wake = 0;
2461 2463
2462 if (!dlm_grab(dlm)) 2464 if (!dlm_grab(dlm))
2463 return -EINVAL; 2465 return -EINVAL;
2464 2466
2465 BUG_ON(target == O2NM_MAX_NODES); 2467 BUG_ON(target == O2NM_MAX_NODES);
2466 2468
2467 name = res->lockname.name; 2469 name = res->lockname.name;
2468 namelen = res->lockname.len; 2470 namelen = res->lockname.len;
2469 2471
2470 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, 2472 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2471 target); 2473 target);
2472 2474
2473 /* preallocate up front. if this fails, abort */ 2475 /* preallocate up front. if this fails, abort */
2474 ret = -ENOMEM; 2476 ret = -ENOMEM;
2475 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 2477 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2476 if (!mres) { 2478 if (!mres) {
2477 mlog_errno(ret); 2479 mlog_errno(ret);
2478 goto leave; 2480 goto leave;
2479 } 2481 }
2480 2482
2481 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 2483 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2482 if (!mle) { 2484 if (!mle) {
2483 mlog_errno(ret); 2485 mlog_errno(ret);
2484 goto leave; 2486 goto leave;
2485 } 2487 }
2486 ret = 0; 2488 ret = 0;
2487 2489
2488 /* 2490 /*
2489 * clear any existing master requests and 2491 * clear any existing master requests and
2490 * add the migration mle to the list 2492 * add the migration mle to the list
2491 */ 2493 */
2492 spin_lock(&dlm->spinlock); 2494 spin_lock(&dlm->spinlock);
2493 spin_lock(&dlm->master_lock); 2495 spin_lock(&dlm->master_lock);
2494 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2496 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2495 namelen, target, dlm->node_num); 2497 namelen, target, dlm->node_num);
2496 spin_unlock(&dlm->master_lock); 2498 spin_unlock(&dlm->master_lock);
2497 spin_unlock(&dlm->spinlock); 2499 spin_unlock(&dlm->spinlock);
2498 2500
2499 if (ret == -EEXIST) { 2501 if (ret == -EEXIST) {
2500 mlog(0, "another process is already migrating it\n"); 2502 mlog(0, "another process is already migrating it\n");
2501 goto fail; 2503 goto fail;
2502 } 2504 }
2503 mle_added = 1; 2505 mle_added = 1;
2504 2506
2505 /* 2507 /*
2506 * set the MIGRATING flag and flush asts 2508 * set the MIGRATING flag and flush asts
2507 * if we fail after this we need to re-dirty the lockres 2509 * if we fail after this we need to re-dirty the lockres
2508 */ 2510 */
2509 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { 2511 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
2510 mlog(ML_ERROR, "tried to migrate %.*s to %u, but " 2512 mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
2511 "the target went down.\n", res->lockname.len, 2513 "the target went down.\n", res->lockname.len,
2512 res->lockname.name, target); 2514 res->lockname.name, target);
2513 spin_lock(&res->spinlock); 2515 spin_lock(&res->spinlock);
2514 res->state &= ~DLM_LOCK_RES_MIGRATING; 2516 res->state &= ~DLM_LOCK_RES_MIGRATING;
2515 wake = 1; 2517 wake = 1;
2516 spin_unlock(&res->spinlock); 2518 spin_unlock(&res->spinlock);
2517 ret = -EINVAL; 2519 ret = -EINVAL;
2518 } 2520 }
2519 2521
2520 fail: 2522 fail:
2521 if (oldmle) { 2523 if (oldmle) {
2522 /* master is known, detach if not already detached */ 2524 /* master is known, detach if not already detached */
2523 dlm_mle_detach_hb_events(dlm, oldmle); 2525 dlm_mle_detach_hb_events(dlm, oldmle);
2524 dlm_put_mle(oldmle); 2526 dlm_put_mle(oldmle);
2525 } 2527 }
2526 2528
2527 if (ret < 0) { 2529 if (ret < 0) {
2528 if (mle_added) { 2530 if (mle_added) {
2529 dlm_mle_detach_hb_events(dlm, mle); 2531 dlm_mle_detach_hb_events(dlm, mle);
2530 dlm_put_mle(mle); 2532 dlm_put_mle(mle);
2531 } else if (mle) { 2533 } else if (mle) {
2532 kmem_cache_free(dlm_mle_cache, mle); 2534 kmem_cache_free(dlm_mle_cache, mle);
2533 mle = NULL; 2535 mle = NULL;
2534 } 2536 }
2535 goto leave; 2537 goto leave;
2536 } 2538 }
2537 2539
2538 /* 2540 /*
2539 * at this point, we have a migration target, an mle 2541 * at this point, we have a migration target, an mle
2540 * in the master list, and the MIGRATING flag set on 2542 * in the master list, and the MIGRATING flag set on
2541 * the lockres 2543 * the lockres
2542 */ 2544 */
2543 2545
2544 /* now that remote nodes are spinning on the MIGRATING flag, 2546 /* now that remote nodes are spinning on the MIGRATING flag,
2545 * ensure that all assert_master work is flushed. */ 2547 * ensure that all assert_master work is flushed. */
2546 flush_workqueue(dlm->dlm_worker); 2548 flush_workqueue(dlm->dlm_worker);
2547 2549
2548 /* get an extra reference on the mle. 2550 /* get an extra reference on the mle.
2549 * otherwise the assert_master from the new 2551 * otherwise the assert_master from the new
2550 * master will destroy this. 2552 * master will destroy this.
2551 * also, make sure that all callers of dlm_get_mle 2553 * also, make sure that all callers of dlm_get_mle
2552 * take both dlm->spinlock and dlm->master_lock */ 2554 * take both dlm->spinlock and dlm->master_lock */
2553 spin_lock(&dlm->spinlock); 2555 spin_lock(&dlm->spinlock);
2554 spin_lock(&dlm->master_lock); 2556 spin_lock(&dlm->master_lock);
2555 dlm_get_mle_inuse(mle); 2557 dlm_get_mle_inuse(mle);
2556 spin_unlock(&dlm->master_lock); 2558 spin_unlock(&dlm->master_lock);
2557 spin_unlock(&dlm->spinlock); 2559 spin_unlock(&dlm->spinlock);
2558 2560
2559 /* notify new node and send all lock state */ 2561 /* notify new node and send all lock state */
2560 /* call send_one_lockres with migration flag. 2562 /* call send_one_lockres with migration flag.
2561 * this serves as notice to the target node that a 2563 * this serves as notice to the target node that a
2562 * migration is starting. */ 2564 * migration is starting. */
2563 ret = dlm_send_one_lockres(dlm, res, mres, target, 2565 ret = dlm_send_one_lockres(dlm, res, mres, target,
2564 DLM_MRES_MIGRATION); 2566 DLM_MRES_MIGRATION);
2565 2567
2566 if (ret < 0) { 2568 if (ret < 0) {
2567 mlog(0, "migration to node %u failed with %d\n", 2569 mlog(0, "migration to node %u failed with %d\n",
2568 target, ret); 2570 target, ret);
2569 /* migration failed, detach and clean up mle */ 2571 /* migration failed, detach and clean up mle */
2570 dlm_mle_detach_hb_events(dlm, mle); 2572 dlm_mle_detach_hb_events(dlm, mle);
2571 dlm_put_mle(mle); 2573 dlm_put_mle(mle);
2572 dlm_put_mle_inuse(mle); 2574 dlm_put_mle_inuse(mle);
2573 spin_lock(&res->spinlock); 2575 spin_lock(&res->spinlock);
2574 res->state &= ~DLM_LOCK_RES_MIGRATING; 2576 res->state &= ~DLM_LOCK_RES_MIGRATING;
2575 wake = 1; 2577 wake = 1;
2576 spin_unlock(&res->spinlock); 2578 spin_unlock(&res->spinlock);
2577 if (dlm_is_host_down(ret)) 2579 if (dlm_is_host_down(ret))
2578 dlm_wait_for_node_death(dlm, target, 2580 dlm_wait_for_node_death(dlm, target,
2579 DLM_NODE_DEATH_WAIT_MAX); 2581 DLM_NODE_DEATH_WAIT_MAX);
2580 goto leave; 2582 goto leave;
2581 } 2583 }
2582 2584
2583 /* at this point, the target sends a message to all nodes, 2585 /* at this point, the target sends a message to all nodes,
2584 * (using dlm_do_migrate_request). this node is skipped since 2586 * (using dlm_do_migrate_request). this node is skipped since
2585 * we had to put an mle in the list to begin the process. this 2587 * we had to put an mle in the list to begin the process. this
2586 * node now waits for target to do an assert master. this node 2588 * node now waits for target to do an assert master. this node
2587 * will be the last one notified, ensuring that the migration 2589 * will be the last one notified, ensuring that the migration
2588 * is complete everywhere. if the target dies while this is 2590 * is complete everywhere. if the target dies while this is
2589 * going on, some nodes could potentially see the target as the 2591 * going on, some nodes could potentially see the target as the
2590 * master, so it is important that my recovery finds the migration 2592 * master, so it is important that my recovery finds the migration
2591 * mle and sets the master to UNKNOWN. */ 2593 * mle and sets the master to UNKNOWN. */
2592 2594
2593 2595
2594 /* wait for new node to assert master */ 2596 /* wait for new node to assert master */
2595 while (1) { 2597 while (1) {
2596 ret = wait_event_interruptible_timeout(mle->wq, 2598 ret = wait_event_interruptible_timeout(mle->wq,
2597 (atomic_read(&mle->woken) == 1), 2599 (atomic_read(&mle->woken) == 1),
2598 msecs_to_jiffies(5000)); 2600 msecs_to_jiffies(5000));
2599 2601
2600 if (ret >= 0) { 2602 if (ret >= 0) {
2601 if (atomic_read(&mle->woken) == 1 || 2603 if (atomic_read(&mle->woken) == 1 ||
2602 res->owner == target) 2604 res->owner == target)
2603 break; 2605 break;
2604 2606
2605 mlog(0, "%s:%.*s: timed out during migration\n", 2607 mlog(0, "%s:%.*s: timed out during migration\n",
2606 dlm->name, res->lockname.len, res->lockname.name); 2608 dlm->name, res->lockname.len, res->lockname.name);
2607 /* avoid hang during shutdown when migrating lockres 2609 /* avoid hang during shutdown when migrating lockres
2608 * to a node which also goes down */ 2610 * to a node which also goes down */
2609 if (dlm_is_node_dead(dlm, target)) { 2611 if (dlm_is_node_dead(dlm, target)) {
2610 mlog(0, "%s:%.*s: expected migration " 2612 mlog(0, "%s:%.*s: expected migration "
2611 "target %u is no longer up, restarting\n", 2613 "target %u is no longer up, restarting\n",
2612 dlm->name, res->lockname.len, 2614 dlm->name, res->lockname.len,
2613 res->lockname.name, target); 2615 res->lockname.name, target);
2614 ret = -EINVAL; 2616 ret = -EINVAL;
2615 /* migration failed, detach and clean up mle */ 2617 /* migration failed, detach and clean up mle */
2616 dlm_mle_detach_hb_events(dlm, mle); 2618 dlm_mle_detach_hb_events(dlm, mle);
2617 dlm_put_mle(mle); 2619 dlm_put_mle(mle);
2618 dlm_put_mle_inuse(mle); 2620 dlm_put_mle_inuse(mle);
2619 spin_lock(&res->spinlock); 2621 spin_lock(&res->spinlock);
2620 res->state &= ~DLM_LOCK_RES_MIGRATING; 2622 res->state &= ~DLM_LOCK_RES_MIGRATING;
2621 wake = 1; 2623 wake = 1;
2622 spin_unlock(&res->spinlock); 2624 spin_unlock(&res->spinlock);
2623 goto leave; 2625 goto leave;
2624 } 2626 }
2625 } else 2627 } else
2626 mlog(0, "%s:%.*s: caught signal during migration\n", 2628 mlog(0, "%s:%.*s: caught signal during migration\n",
2627 dlm->name, res->lockname.len, res->lockname.name); 2629 dlm->name, res->lockname.len, res->lockname.name);
2628 } 2630 }
2629 2631
2630 /* all done, set the owner, clear the flag */ 2632 /* all done, set the owner, clear the flag */
2631 spin_lock(&res->spinlock); 2633 spin_lock(&res->spinlock);
2632 dlm_set_lockres_owner(dlm, res, target); 2634 dlm_set_lockres_owner(dlm, res, target);
2633 res->state &= ~DLM_LOCK_RES_MIGRATING; 2635 res->state &= ~DLM_LOCK_RES_MIGRATING;
2634 dlm_remove_nonlocal_locks(dlm, res); 2636 dlm_remove_nonlocal_locks(dlm, res);
2635 spin_unlock(&res->spinlock); 2637 spin_unlock(&res->spinlock);
2636 wake_up(&res->wq); 2638 wake_up(&res->wq);
2637 2639
2638 /* master is known, detach if not already detached */ 2640 /* master is known, detach if not already detached */
2639 dlm_mle_detach_hb_events(dlm, mle); 2641 dlm_mle_detach_hb_events(dlm, mle);
2640 dlm_put_mle_inuse(mle); 2642 dlm_put_mle_inuse(mle);
2641 ret = 0; 2643 ret = 0;
2642 2644
2643 dlm_lockres_calc_usage(dlm, res); 2645 dlm_lockres_calc_usage(dlm, res);
2644 2646
2645 leave: 2647 leave:
2646 /* re-dirty the lockres if we failed */ 2648 /* re-dirty the lockres if we failed */
2647 if (ret < 0) 2649 if (ret < 0)
2648 dlm_kick_thread(dlm, res); 2650 dlm_kick_thread(dlm, res);
2649 2651
2650 /* wake up waiters if the MIGRATING flag got set 2652 /* wake up waiters if the MIGRATING flag got set
2651 * but migration failed */ 2653 * but migration failed */
2652 if (wake) 2654 if (wake)
2653 wake_up(&res->wq); 2655 wake_up(&res->wq);
2654 2656
2655 if (mres) 2657 if (mres)
2656 free_page((unsigned long)mres); 2658 free_page((unsigned long)mres);
2657 2659
2658 dlm_put(dlm); 2660 dlm_put(dlm);
2659 2661
2660 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, 2662 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2661 name, target, ret); 2663 name, target, ret);
2662 return ret; 2664 return ret;
2663 } 2665 }
2664 2666
2665 #define DLM_MIGRATION_RETRY_MS 100 2667 #define DLM_MIGRATION_RETRY_MS 100
2666 2668
2667 /* 2669 /*
2668 * Should be called only after beginning the domain leave process. 2670 * Should be called only after beginning the domain leave process.
2669 * There should not be any remaining locks on nonlocal lock resources, 2671 * There should not be any remaining locks on nonlocal lock resources,
2670 * and there should be no local locks left on locally mastered resources. 2672 * and there should be no local locks left on locally mastered resources.
2671 * 2673 *
2672 * Called with the dlm spinlock held, may drop it to do migration, but 2674 * Called with the dlm spinlock held, may drop it to do migration, but
2673 * will re-acquire before exit. 2675 * will re-acquire before exit.
2674 * 2676 *
2675 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped 2677 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2676 */ 2678 */
2677 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2679 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2678 { 2680 {
2679 int ret; 2681 int ret;
2680 int lock_dropped = 0; 2682 int lock_dropped = 0;
2681 u8 target = O2NM_MAX_NODES; 2683 u8 target = O2NM_MAX_NODES;
2682 2684
2683 assert_spin_locked(&dlm->spinlock); 2685 assert_spin_locked(&dlm->spinlock);
2684 2686
2685 spin_lock(&res->spinlock); 2687 spin_lock(&res->spinlock);
2686 if (dlm_is_lockres_migrateable(dlm, res)) 2688 if (dlm_is_lockres_migrateable(dlm, res))
2687 target = dlm_pick_migration_target(dlm, res); 2689 target = dlm_pick_migration_target(dlm, res);
2688 spin_unlock(&res->spinlock); 2690 spin_unlock(&res->spinlock);
2689 2691
2690 if (target == O2NM_MAX_NODES) 2692 if (target == O2NM_MAX_NODES)
2691 goto leave; 2693 goto leave;
2692 2694
2693 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 2695 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2694 spin_unlock(&dlm->spinlock); 2696 spin_unlock(&dlm->spinlock);
2695 lock_dropped = 1; 2697 lock_dropped = 1;
2696 ret = dlm_migrate_lockres(dlm, res, target); 2698 ret = dlm_migrate_lockres(dlm, res, target);
2697 if (ret) 2699 if (ret)
2698 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", 2700 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2699 dlm->name, res->lockname.len, res->lockname.name, 2701 dlm->name, res->lockname.len, res->lockname.name,
2700 target, ret); 2702 target, ret);
2701 spin_lock(&dlm->spinlock); 2703 spin_lock(&dlm->spinlock);
2702 leave: 2704 leave:
2703 return lock_dropped; 2705 return lock_dropped;
2704 } 2706 }
2705 2707
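Because dlm_empty_lockres() may drop and retake dlm->spinlock, a caller walking the lockres hash cannot trust its iterator once the function returns 1. An illustrative scan pattern under that contract; the bucket and iteration details are assumptions, not taken from the real domain-leave code.

	/* Illustrative only: restart the bucket scan whenever the dlm spinlock
	 * was dropped, since the bucket may have changed underneath us.
	 * 'res' and 'bucket' are assumed locals of the surrounding scan. */
	spin_lock(&dlm->spinlock);
	redo_bucket:
	hlist_for_each_entry(res, bucket, hash_node) {
		if (dlm_empty_lockres(dlm, res))
			goto redo_bucket;	/* spinlock dropped and retaken */
	}
	spin_unlock(&dlm->spinlock);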
2706 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) 2708 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2707 { 2709 {
2708 int ret; 2710 int ret;
2709 spin_lock(&dlm->ast_lock); 2711 spin_lock(&dlm->ast_lock);
2710 spin_lock(&lock->spinlock); 2712 spin_lock(&lock->spinlock);
2711 ret = (list_empty(&lock->bast_list) && !lock->bast_pending); 2713 ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2712 spin_unlock(&lock->spinlock); 2714 spin_unlock(&lock->spinlock);
2713 spin_unlock(&dlm->ast_lock); 2715 spin_unlock(&dlm->ast_lock);
2714 return ret; 2716 return ret;
2715 } 2717 }
2716 2718
2717 static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, 2719 static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2718 struct dlm_lock_resource *res, 2720 struct dlm_lock_resource *res,
2719 u8 mig_target) 2721 u8 mig_target)
2720 { 2722 {
2721 int can_proceed; 2723 int can_proceed;
2722 spin_lock(&res->spinlock); 2724 spin_lock(&res->spinlock);
2723 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); 2725 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2724 spin_unlock(&res->spinlock); 2726 spin_unlock(&res->spinlock);
2725 2727
2726 /* target has died, so make the caller break out of the 2728 /* target has died, so make the caller break out of the
2727 * wait_event, but caller must recheck the domain_map */ 2729 * wait_event, but caller must recheck the domain_map */
2728 spin_lock(&dlm->spinlock); 2730 spin_lock(&dlm->spinlock);
2729 if (!test_bit(mig_target, dlm->domain_map)) 2731 if (!test_bit(mig_target, dlm->domain_map))
2730 can_proceed = 1; 2732 can_proceed = 1;
2731 spin_unlock(&dlm->spinlock); 2733 spin_unlock(&dlm->spinlock);
2732 return can_proceed; 2734 return can_proceed;
2733 } 2735 }
2734 2736
2735 static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, 2737 static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
2736 struct dlm_lock_resource *res) 2738 struct dlm_lock_resource *res)
2737 { 2739 {
2738 int ret; 2740 int ret;
2739 spin_lock(&res->spinlock); 2741 spin_lock(&res->spinlock);
2740 ret = !!(res->state & DLM_LOCK_RES_DIRTY); 2742 ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2741 spin_unlock(&res->spinlock); 2743 spin_unlock(&res->spinlock);
2742 return ret; 2744 return ret;
2743 } 2745 }
2744 2746
2745 2747
2746 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, 2748 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2747 struct dlm_lock_resource *res, 2749 struct dlm_lock_resource *res,
2748 u8 target) 2750 u8 target)
2749 { 2751 {
2750 int ret = 0; 2752 int ret = 0;
2751 2753
2752 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", 2754 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2753 res->lockname.len, res->lockname.name, dlm->node_num, 2755 res->lockname.len, res->lockname.name, dlm->node_num,
2754 target); 2756 target);
2755 /* need to set MIGRATING flag on lockres. this is done by 2757 /* need to set MIGRATING flag on lockres. this is done by
2756 * ensuring that all asts have been flushed for this lockres. */ 2758 * ensuring that all asts have been flushed for this lockres. */
2757 spin_lock(&res->spinlock); 2759 spin_lock(&res->spinlock);
2758 BUG_ON(res->migration_pending); 2760 BUG_ON(res->migration_pending);
2759 res->migration_pending = 1; 2761 res->migration_pending = 1;
2760 /* strategy is to reserve an extra ast then release 2762 /* strategy is to reserve an extra ast then release
2761 * it below, letting the release do all of the work */ 2763 * it below, letting the release do all of the work */
2762 __dlm_lockres_reserve_ast(res); 2764 __dlm_lockres_reserve_ast(res);
2763 spin_unlock(&res->spinlock); 2765 spin_unlock(&res->spinlock);
2764 2766
2765 /* now flush all the pending asts */ 2767 /* now flush all the pending asts */
2766 dlm_kick_thread(dlm, res); 2768 dlm_kick_thread(dlm, res);
2767 /* before waiting on DIRTY, block processes which may 2769 /* before waiting on DIRTY, block processes which may
2768 * try to dirty the lockres before MIGRATING is set */ 2770 * try to dirty the lockres before MIGRATING is set */
2769 spin_lock(&res->spinlock); 2771 spin_lock(&res->spinlock);
2770 BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); 2772 BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
2771 res->state |= DLM_LOCK_RES_BLOCK_DIRTY; 2773 res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
2772 spin_unlock(&res->spinlock); 2774 spin_unlock(&res->spinlock);
2773 /* now wait on any pending asts and the DIRTY state */ 2775 /* now wait on any pending asts and the DIRTY state */
2774 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); 2776 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2775 dlm_lockres_release_ast(dlm, res); 2777 dlm_lockres_release_ast(dlm, res);
2776 2778
2777 mlog(0, "about to wait on migration_wq, dirty=%s\n", 2779 mlog(0, "about to wait on migration_wq, dirty=%s\n",
2778 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 2780 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2779 /* if the extra ref we just put was the final one, this 2781 /* if the extra ref we just put was the final one, this
2780 * will pass thru immediately. otherwise, we need to wait 2782 * will pass thru immediately. otherwise, we need to wait
2781 * for the last ast to finish. */ 2783 * for the last ast to finish. */
2782 again: 2784 again:
2783 ret = wait_event_interruptible_timeout(dlm->migration_wq, 2785 ret = wait_event_interruptible_timeout(dlm->migration_wq,
2784 dlm_migration_can_proceed(dlm, res, target), 2786 dlm_migration_can_proceed(dlm, res, target),
2785 msecs_to_jiffies(1000)); 2787 msecs_to_jiffies(1000));
2786 if (ret < 0) { 2788 if (ret < 0) {
2787 mlog(0, "woken again: migrating? %s, dead? %s\n", 2789 mlog(0, "woken again: migrating? %s, dead? %s\n",
2788 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", 2790 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2789 test_bit(target, dlm->domain_map) ? "no":"yes"); 2791 test_bit(target, dlm->domain_map) ? "no":"yes");
2790 } else { 2792 } else {
2791 mlog(0, "all is well: migrating? %s, dead? %s\n", 2793 mlog(0, "all is well: migrating? %s, dead? %s\n",
2792 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", 2794 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2793 test_bit(target, dlm->domain_map) ? "no":"yes"); 2795 test_bit(target, dlm->domain_map) ? "no":"yes");
2794 } 2796 }
2795 if (!dlm_migration_can_proceed(dlm, res, target)) { 2797 if (!dlm_migration_can_proceed(dlm, res, target)) {
2796 mlog(0, "trying again...\n"); 2798 mlog(0, "trying again...\n");
2797 goto again; 2799 goto again;
2798 } 2800 }
2799 2801
2800 ret = 0; 2802 ret = 0;
2801 /* did the target go down or die? */ 2803 /* did the target go down or die? */
2802 spin_lock(&dlm->spinlock); 2804 spin_lock(&dlm->spinlock);
2803 if (!test_bit(target, dlm->domain_map)) { 2805 if (!test_bit(target, dlm->domain_map)) {
2804 mlog(ML_ERROR, "aha. migration target %u just went down\n", 2806 mlog(ML_ERROR, "aha. migration target %u just went down\n",
2805 target); 2807 target);
2806 ret = -EHOSTDOWN; 2808 ret = -EHOSTDOWN;
2807 } 2809 }
2808 spin_unlock(&dlm->spinlock); 2810 spin_unlock(&dlm->spinlock);
2809 2811
2810 /* 2812 /*
2811 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for 2813 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
2812 * another try; otherwise, we are sure the MIGRATING state is there, 2814 * another try; otherwise, we are sure the MIGRATING state is there,
2813 * drop the unneeded state which blocked threads trying to DIRTY 2815 * drop the unneeded state which blocked threads trying to DIRTY
2814 */ 2816 */
2815 spin_lock(&res->spinlock); 2817 spin_lock(&res->spinlock);
2816 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); 2818 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2817 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; 2819 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2818 if (!ret) 2820 if (!ret)
2819 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); 2821 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2820 spin_unlock(&res->spinlock); 2822 spin_unlock(&res->spinlock);
2821 2823
2822 /* 2824 /*
2823 * at this point: 2825 * at this point:
2824 * 2826 *
2825 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down 2827 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
2826 * o there are no pending asts on this lockres 2828 * o there are no pending asts on this lockres
2827 * o all processes trying to reserve an ast on this 2829 * o all processes trying to reserve an ast on this
2828 * lockres must wait for the MIGRATING flag to clear 2830 * lockres must wait for the MIGRATING flag to clear
2829 */ 2831 */
2830 return ret; 2832 return ret;
2831 } 2833 }
2832 2834
2833 /* last step in the migration process. 2835 /* last step in the migration process.
2834 * original master calls this to free all of the dlm_lock 2836 * original master calls this to free all of the dlm_lock
2835 * structures that used to be for other nodes. */ 2837 * structures that used to be for other nodes. */
2836 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 2838 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2837 struct dlm_lock_resource *res) 2839 struct dlm_lock_resource *res)
2838 { 2840 {
2839 struct list_head *queue = &res->granted; 2841 struct list_head *queue = &res->granted;
2840 int i, bit; 2842 int i, bit;
2841 struct dlm_lock *lock, *next; 2843 struct dlm_lock *lock, *next;
2842 2844
2843 assert_spin_locked(&res->spinlock); 2845 assert_spin_locked(&res->spinlock);
2844 2846
2845 BUG_ON(res->owner == dlm->node_num); 2847 BUG_ON(res->owner == dlm->node_num);
2846 2848
2847 for (i=0; i<3; i++) { 2849 for (i=0; i<3; i++) {
2848 list_for_each_entry_safe(lock, next, queue, list) { 2850 list_for_each_entry_safe(lock, next, queue, list) {
2849 if (lock->ml.node != dlm->node_num) { 2851 if (lock->ml.node != dlm->node_num) {
2850 mlog(0, "putting lock for node %u\n", 2852 mlog(0, "putting lock for node %u\n",
2851 lock->ml.node); 2853 lock->ml.node);
2852 /* be extra careful */ 2854 /* be extra careful */
2853 BUG_ON(!list_empty(&lock->ast_list)); 2855 BUG_ON(!list_empty(&lock->ast_list));
2854 BUG_ON(!list_empty(&lock->bast_list)); 2856 BUG_ON(!list_empty(&lock->bast_list));
2855 BUG_ON(lock->ast_pending); 2857 BUG_ON(lock->ast_pending);
2856 BUG_ON(lock->bast_pending); 2858 BUG_ON(lock->bast_pending);
2857 dlm_lockres_clear_refmap_bit(dlm, res, 2859 dlm_lockres_clear_refmap_bit(dlm, res,
2858 lock->ml.node); 2860 lock->ml.node);
2859 list_del_init(&lock->list); 2861 list_del_init(&lock->list);
2860 dlm_lock_put(lock); 2862 dlm_lock_put(lock);
2861 /* In a normal unlock, we would have added a 2863 /* In a normal unlock, we would have added a
2862 * DLM_UNLOCK_FREE_LOCK action. Force it. */ 2864 * DLM_UNLOCK_FREE_LOCK action. Force it. */
2863 dlm_lock_put(lock); 2865 dlm_lock_put(lock);
2864 } 2866 }
2865 } 2867 }
2866 queue++; 2868 queue++;
2867 } 2869 }
2868 bit = 0; 2870 bit = 0;
2869 while (1) { 2871 while (1) {
2870 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 2872 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
2871 if (bit >= O2NM_MAX_NODES) 2873 if (bit >= O2NM_MAX_NODES)
2872 break; 2874 break;
2873 /* do not clear the local node reference, if there is a 2875 /* do not clear the local node reference, if there is a
2874 * process holding this, let it drop the ref itself */ 2876 * process holding this, let it drop the ref itself */
2875 if (bit != dlm->node_num) { 2877 if (bit != dlm->node_num) {
2876 mlog(0, "%s:%.*s: node %u had a ref to this " 2878 mlog(0, "%s:%.*s: node %u had a ref to this "
2877 "migrating lockres, clearing\n", dlm->name, 2879 "migrating lockres, clearing\n", dlm->name,
2878 res->lockname.len, res->lockname.name, bit); 2880 res->lockname.len, res->lockname.name, bit);
2879 dlm_lockres_clear_refmap_bit(dlm, res, bit); 2881 dlm_lockres_clear_refmap_bit(dlm, res, bit);
2880 } 2882 }
2881 bit++; 2883 bit++;
2882 } 2884 }
2883 } 2885 }
2884 2886
2885 /* 2887 /*
2886 * Pick a node to migrate the lock resource to. This function selects a 2888 * Pick a node to migrate the lock resource to. This function selects a
2887 * potential target based first on the locks and then on refmap. It skips 2889 * potential target based first on the locks and then on refmap. It skips
2888 * nodes that are in the process of exiting the domain. 2890 * nodes that are in the process of exiting the domain.
2889 */ 2891 */
2890 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 2892 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2891 struct dlm_lock_resource *res) 2893 struct dlm_lock_resource *res)
2892 { 2894 {
2893 enum dlm_lockres_list idx; 2895 enum dlm_lockres_list idx;
2894 struct list_head *queue = &res->granted; 2896 struct list_head *queue = &res->granted;
2895 struct dlm_lock *lock; 2897 struct dlm_lock *lock;
2896 int noderef; 2898 int noderef;
2897 u8 nodenum = O2NM_MAX_NODES; 2899 u8 nodenum = O2NM_MAX_NODES;
2898 2900
2899 assert_spin_locked(&dlm->spinlock); 2901 assert_spin_locked(&dlm->spinlock);
2900 assert_spin_locked(&res->spinlock); 2902 assert_spin_locked(&res->spinlock);
2901 2903
2902 /* Go through all the locks */ 2904 /* Go through all the locks */
2903 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { 2905 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2904 queue = dlm_list_idx_to_ptr(res, idx); 2906 queue = dlm_list_idx_to_ptr(res, idx);
2905 list_for_each_entry(lock, queue, list) { 2907 list_for_each_entry(lock, queue, list) {
2906 if (lock->ml.node == dlm->node_num) 2908 if (lock->ml.node == dlm->node_num)
2907 continue; 2909 continue;
2908 if (test_bit(lock->ml.node, dlm->exit_domain_map)) 2910 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2909 continue; 2911 continue;
2910 nodenum = lock->ml.node; 2912 nodenum = lock->ml.node;
2911 goto bail; 2913 goto bail;
2912 } 2914 }
2913 } 2915 }
2914 2916
2915 /* Go through the refmap */ 2917 /* Go through the refmap */
2916 noderef = -1; 2918 noderef = -1;
2917 while (1) { 2919 while (1) {
2918 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, 2920 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
2919 noderef + 1); 2921 noderef + 1);
2920 if (noderef >= O2NM_MAX_NODES) 2922 if (noderef >= O2NM_MAX_NODES)
2921 break; 2923 break;
2922 if (noderef == dlm->node_num) 2924 if (noderef == dlm->node_num)
2923 continue; 2925 continue;
2924 if (test_bit(noderef, dlm->exit_domain_map)) 2926 if (test_bit(noderef, dlm->exit_domain_map))
2925 continue; 2927 continue;
2926 nodenum = noderef; 2928 nodenum = noderef;
2927 goto bail; 2929 goto bail;
2928 } 2930 }
2929 2931
2930 bail: 2932 bail:
2931 return nodenum; 2933 return nodenum;
2932 } 2934 }
2933 2935
2934 /* this is called by the new master once all lockres 2936 /* this is called by the new master once all lockres
2935 * data has been received */ 2937 * data has been received */
2936 static int dlm_do_migrate_request(struct dlm_ctxt *dlm, 2938 static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2937 struct dlm_lock_resource *res, 2939 struct dlm_lock_resource *res,
2938 u8 master, u8 new_master, 2940 u8 master, u8 new_master,
2939 struct dlm_node_iter *iter) 2941 struct dlm_node_iter *iter)
2940 { 2942 {
2941 struct dlm_migrate_request migrate; 2943 struct dlm_migrate_request migrate;
2942 int ret, skip, status = 0; 2944 int ret, skip, status = 0;
2943 int nodenum; 2945 int nodenum;
2944 2946
2945 memset(&migrate, 0, sizeof(migrate)); 2947 memset(&migrate, 0, sizeof(migrate));
2946 migrate.namelen = res->lockname.len; 2948 migrate.namelen = res->lockname.len;
2947 memcpy(migrate.name, res->lockname.name, migrate.namelen); 2949 memcpy(migrate.name, res->lockname.name, migrate.namelen);
2948 migrate.new_master = new_master; 2950 migrate.new_master = new_master;
2949 migrate.master = master; 2951 migrate.master = master;
2950 2952
2951 ret = 0; 2953 ret = 0;
2952 2954
2953 /* send message to all nodes, except the master and myself */ 2955 /* send message to all nodes, except the master and myself */
2954 while ((nodenum = dlm_node_iter_next(iter)) >= 0) { 2956 while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
2955 if (nodenum == master || 2957 if (nodenum == master ||
2956 nodenum == new_master) 2958 nodenum == new_master)
2957 continue; 2959 continue;
2958 2960
2959 /* We could race exit domain. If exited, skip. */ 2961 /* We could race exit domain. If exited, skip. */
2960 spin_lock(&dlm->spinlock); 2962 spin_lock(&dlm->spinlock);
2961 skip = (!test_bit(nodenum, dlm->domain_map)); 2963 skip = (!test_bit(nodenum, dlm->domain_map));
2962 spin_unlock(&dlm->spinlock); 2964 spin_unlock(&dlm->spinlock);
2963 if (skip) { 2965 if (skip) {
2964 clear_bit(nodenum, iter->node_map); 2966 clear_bit(nodenum, iter->node_map);
2965 continue; 2967 continue;
2966 } 2968 }
2967 2969
2968 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, 2970 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2969 &migrate, sizeof(migrate), nodenum, 2971 &migrate, sizeof(migrate), nodenum,
2970 &status); 2972 &status);
2971 if (ret < 0) { 2973 if (ret < 0) {
2972 mlog(ML_ERROR, "%s: res %.*s, Error %d send " 2974 mlog(ML_ERROR, "%s: res %.*s, Error %d send "
2973 "MIGRATE_REQUEST to node %u\n", dlm->name, 2975 "MIGRATE_REQUEST to node %u\n", dlm->name,
2974 migrate.namelen, migrate.name, ret, nodenum); 2976 migrate.namelen, migrate.name, ret, nodenum);
2975 if (!dlm_is_host_down(ret)) { 2977 if (!dlm_is_host_down(ret)) {
2976 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2978 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2977 BUG(); 2979 BUG();
2978 } 2980 }
2979 clear_bit(nodenum, iter->node_map); 2981 clear_bit(nodenum, iter->node_map);
2980 ret = 0; 2982 ret = 0;
2981 } else if (status < 0) { 2983 } else if (status < 0) {
2982 mlog(0, "migrate request (node %u) returned %d!\n", 2984 mlog(0, "migrate request (node %u) returned %d!\n",
2983 nodenum, status); 2985 nodenum, status);
2984 ret = status; 2986 ret = status;
2985 } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { 2987 } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
2986 /* during the migration request we short-circuited 2988 /* during the migration request we short-circuited
2987 * the mastery of the lockres. make sure we have 2989 * the mastery of the lockres. make sure we have
2988 * a mastery ref for nodenum */ 2990 * a mastery ref for nodenum */
2989 mlog(0, "%s:%.*s: need ref for node %u\n", 2991 mlog(0, "%s:%.*s: need ref for node %u\n",
2990 dlm->name, res->lockname.len, res->lockname.name, 2992 dlm->name, res->lockname.len, res->lockname.name,
2991 nodenum); 2993 nodenum);
2992 spin_lock(&res->spinlock); 2994 spin_lock(&res->spinlock);
2993 dlm_lockres_set_refmap_bit(dlm, res, nodenum); 2995 dlm_lockres_set_refmap_bit(dlm, res, nodenum);
2994 spin_unlock(&res->spinlock); 2996 spin_unlock(&res->spinlock);
2995 } 2997 }
2996 } 2998 }
2997 2999
2998 if (ret < 0) 3000 if (ret < 0)
2999 mlog_errno(ret); 3001 mlog_errno(ret);
3000 3002
3001 mlog(0, "returning ret=%d\n", ret); 3003 mlog(0, "returning ret=%d\n", ret);
3002 return ret; 3004 return ret;
3003 } 3005 }
3004 3006
3005 3007
3006 /* if there is an existing mle for this lockres, we now know who the master is. 3008 /* if there is an existing mle for this lockres, we now know who the master is.
3007 * (the one who sent us *this* message) we can clear it up right away. 3009 * (the one who sent us *this* message) we can clear it up right away.
3008 * since the process that put the mle on the list still has a reference to it, 3010 * since the process that put the mle on the list still has a reference to it,
3009 * we can unhash it now, set the master and wake the process. as a result, 3011 * we can unhash it now, set the master and wake the process. as a result,
3010 * we will have no mle in the list to start with. now we can add an mle for 3012 * we will have no mle in the list to start with. now we can add an mle for
3011 * the migration and this should be the only one found for those scanning the 3013 * the migration and this should be the only one found for those scanning the
3012 * list. */ 3014 * list. */
3013 int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 3015 int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3014 void **ret_data) 3016 void **ret_data)
3015 { 3017 {
3016 struct dlm_ctxt *dlm = data; 3018 struct dlm_ctxt *dlm = data;
3017 struct dlm_lock_resource *res = NULL; 3019 struct dlm_lock_resource *res = NULL;
3018 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; 3020 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
3019 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; 3021 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
3020 const char *name; 3022 const char *name;
3021 unsigned int namelen, hash; 3023 unsigned int namelen, hash;
3022 int ret = 0; 3024 int ret = 0;
3023 3025
3024 if (!dlm_grab(dlm)) 3026 if (!dlm_grab(dlm))
3025 return -EINVAL; 3027 return -EINVAL;
3026 3028
3027 name = migrate->name; 3029 name = migrate->name;
3028 namelen = migrate->namelen; 3030 namelen = migrate->namelen;
3029 hash = dlm_lockid_hash(name, namelen); 3031 hash = dlm_lockid_hash(name, namelen);
3030 3032
3031 /* preallocate.. if this fails, abort */ 3033 /* preallocate.. if this fails, abort */
3032 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 3034 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3033 3035
3034 if (!mle) { 3036 if (!mle) {
3035 ret = -ENOMEM; 3037 ret = -ENOMEM;
3036 goto leave; 3038 goto leave;
3037 } 3039 }
3038 3040
3039 /* check for pre-existing lock */ 3041 /* check for pre-existing lock */
3040 spin_lock(&dlm->spinlock); 3042 spin_lock(&dlm->spinlock);
3041 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 3043 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3042 if (res) { 3044 if (res) {
3043 spin_lock(&res->spinlock); 3045 spin_lock(&res->spinlock);
3044 if (res->state & DLM_LOCK_RES_RECOVERING) { 3046 if (res->state & DLM_LOCK_RES_RECOVERING) {
3045 /* if all is working ok, this can only mean that we got 3047 /* if all is working ok, this can only mean that we got
3046 * a migrate request from a node that we now see as 3048 * a migrate request from a node that we now see as
3047 * dead. what can we do here? drop it to the floor? */ 3049 * dead. what can we do here? drop it to the floor? */
3048 spin_unlock(&res->spinlock); 3050 spin_unlock(&res->spinlock);
3049 mlog(ML_ERROR, "Got a migrate request, but the " 3051 mlog(ML_ERROR, "Got a migrate request, but the "
3050 "lockres is marked as recovering!"); 3052 "lockres is marked as recovering!");
3051 kmem_cache_free(dlm_mle_cache, mle); 3053 kmem_cache_free(dlm_mle_cache, mle);
3052 ret = -EINVAL; /* need a better solution */ 3054 ret = -EINVAL; /* need a better solution */
3053 goto unlock; 3055 goto unlock;
3054 } 3056 }
3055 res->state |= DLM_LOCK_RES_MIGRATING; 3057 res->state |= DLM_LOCK_RES_MIGRATING;
3056 spin_unlock(&res->spinlock); 3058 spin_unlock(&res->spinlock);
3057 } 3059 }
3058 3060
3059 spin_lock(&dlm->master_lock); 3061 spin_lock(&dlm->master_lock);
3060 /* ignore status. only nonzero status would BUG. */ 3062 /* ignore status. only nonzero status would BUG. */
3061 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 3063 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3062 name, namelen, 3064 name, namelen,
3063 migrate->new_master, 3065 migrate->new_master,
3064 migrate->master); 3066 migrate->master);
3065 3067
3066 spin_unlock(&dlm->master_lock); 3068 spin_unlock(&dlm->master_lock);
3067 unlock: 3069 unlock:
3068 spin_unlock(&dlm->spinlock); 3070 spin_unlock(&dlm->spinlock);
3069 3071
3070 if (oldmle) { 3072 if (oldmle) {
3071 /* master is known, detach if not already detached */ 3073 /* master is known, detach if not already detached */
3072 dlm_mle_detach_hb_events(dlm, oldmle); 3074 dlm_mle_detach_hb_events(dlm, oldmle);
3073 dlm_put_mle(oldmle); 3075 dlm_put_mle(oldmle);
3074 } 3076 }
3075 3077
3076 if (res) 3078 if (res)
3077 dlm_lockres_put(res); 3079 dlm_lockres_put(res);
3078 leave: 3080 leave:
3079 dlm_put(dlm); 3081 dlm_put(dlm);
3080 return ret; 3082 return ret;
3081 } 3083 }
3082 3084
3083 /* must be holding dlm->spinlock and dlm->master_lock 3085 /* must be holding dlm->spinlock and dlm->master_lock
3084 * when adding a migration mle, we can clear any other mles 3086 * when adding a migration mle, we can clear any other mles
3085 * in the master list because we know with certainty that 3087 * in the master list because we know with certainty that
3086 * the master is "master". so we remove any old mle from 3088 * the master is "master". so we remove any old mle from
3087 * the list after setting its master field, and then add 3089 * the list after setting its master field, and then add
3088 * the new migration mle. this way we can hold with the rule 3090 * the new migration mle. this way we can hold with the rule
3089 * of having only one mle for a given lock name at all times. */ 3091 * of having only one mle for a given lock name at all times. */
3090 static int dlm_add_migration_mle(struct dlm_ctxt *dlm, 3092 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3091 struct dlm_lock_resource *res, 3093 struct dlm_lock_resource *res,
3092 struct dlm_master_list_entry *mle, 3094 struct dlm_master_list_entry *mle,
3093 struct dlm_master_list_entry **oldmle, 3095 struct dlm_master_list_entry **oldmle,
3094 const char *name, unsigned int namelen, 3096 const char *name, unsigned int namelen,
3095 u8 new_master, u8 master) 3097 u8 new_master, u8 master)
3096 { 3098 {
3097 int found; 3099 int found;
3098 int ret = 0; 3100 int ret = 0;
3099 3101
3100 *oldmle = NULL; 3102 *oldmle = NULL;
3101 3103
3102 assert_spin_locked(&dlm->spinlock); 3104 assert_spin_locked(&dlm->spinlock);
3103 assert_spin_locked(&dlm->master_lock); 3105 assert_spin_locked(&dlm->master_lock);
3104 3106
3105 /* caller is responsible for any ref taken here on oldmle */ 3107 /* caller is responsible for any ref taken here on oldmle */
3106 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); 3108 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
3107 if (found) { 3109 if (found) {
3108 struct dlm_master_list_entry *tmp = *oldmle; 3110 struct dlm_master_list_entry *tmp = *oldmle;
3109 spin_lock(&tmp->spinlock); 3111 spin_lock(&tmp->spinlock);
3110 if (tmp->type == DLM_MLE_MIGRATION) { 3112 if (tmp->type == DLM_MLE_MIGRATION) {
3111 if (master == dlm->node_num) { 3113 if (master == dlm->node_num) {
3112 /* ah another process raced me to it */ 3114 /* ah another process raced me to it */
3113 mlog(0, "tried to migrate %.*s, but some " 3115 mlog(0, "tried to migrate %.*s, but some "
3114 "process beat me to it\n", 3116 "process beat me to it\n",
3115 namelen, name); 3117 namelen, name);
3116 ret = -EEXIST; 3118 ret = -EEXIST;
3117 } else { 3119 } else {
3118 /* bad. 2 NODES are trying to migrate! */ 3120 /* bad. 2 NODES are trying to migrate! */
3119 mlog(ML_ERROR, "migration error mle: " 3121 mlog(ML_ERROR, "migration error mle: "
3120 "master=%u new_master=%u // request: " 3122 "master=%u new_master=%u // request: "
3121 "master=%u new_master=%u // " 3123 "master=%u new_master=%u // "
3122 "lockres=%.*s\n", 3124 "lockres=%.*s\n",
3123 tmp->master, tmp->new_master, 3125 tmp->master, tmp->new_master,
3124 master, new_master, 3126 master, new_master,
3125 namelen, name); 3127 namelen, name);
3126 BUG(); 3128 BUG();
3127 } 3129 }
3128 } else { 3130 } else {
3129 /* this is essentially what assert_master does */ 3131 /* this is essentially what assert_master does */
3130 tmp->master = master; 3132 tmp->master = master;
3131 atomic_set(&tmp->woken, 1); 3133 atomic_set(&tmp->woken, 1);
3132 wake_up(&tmp->wq); 3134 wake_up(&tmp->wq);
3133 /* remove it so that only one mle will be found */ 3135 /* remove it so that only one mle will be found */
3134 __dlm_unlink_mle(dlm, tmp); 3136 __dlm_unlink_mle(dlm, tmp);
3135 __dlm_mle_detach_hb_events(dlm, tmp); 3137 __dlm_mle_detach_hb_events(dlm, tmp);
3136 if (tmp->type == DLM_MLE_MASTER) { 3138 if (tmp->type == DLM_MLE_MASTER) {
3137 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 3139 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3138 mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 3140 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
3139 "telling master to get ref " 3141 "telling master to get ref "
3140 "for cleared out mle during " 3142 "for cleared out mle during "
3141 "migration\n", dlm->name, 3143 "migration\n", dlm->name,
3142 namelen, name, master, 3144 namelen, name, master,
3143 new_master); 3145 new_master);
3144 } 3146 }
3145 } 3147 }
3146 spin_unlock(&tmp->spinlock); 3148 spin_unlock(&tmp->spinlock);
3147 } 3149 }
3148 3150
3149 /* now add a migration mle to the tail of the list */ 3151 /* now add a migration mle to the tail of the list */
3150 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); 3152 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
3151 mle->new_master = new_master; 3153 mle->new_master = new_master;
3152 /* the new master will be sending an assert master for this. 3154 /* the new master will be sending an assert master for this.
3153 * at that point we will get the refmap reference */ 3155 * at that point we will get the refmap reference */
3154 mle->master = master; 3156 mle->master = master;
3155 /* do this for consistency with other mle types */ 3157 /* do this for consistency with other mle types */
3156 set_bit(new_master, mle->maybe_map); 3158 set_bit(new_master, mle->maybe_map);
3157 __dlm_insert_mle(dlm, mle); 3159 __dlm_insert_mle(dlm, mle);
3158 3160
3159 return ret; 3161 return ret;
3160 } 3162 }
3161 3163
3162 /* 3164 /*
3163 * Sets the owner of the lockres, associated to the mle, to UNKNOWN 3165 * Sets the owner of the lockres, associated to the mle, to UNKNOWN
3164 */ 3166 */
3165 static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, 3167 static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3166 struct dlm_master_list_entry *mle) 3168 struct dlm_master_list_entry *mle)
3167 { 3169 {
3168 struct dlm_lock_resource *res; 3170 struct dlm_lock_resource *res;
3169 3171
3170 /* Find the lockres associated to the mle and set its owner to UNK */ 3172 /* Find the lockres associated to the mle and set its owner to UNK */
3171 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, 3173 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3172 mle->mnamehash); 3174 mle->mnamehash);
3173 if (res) { 3175 if (res) {
3174 spin_unlock(&dlm->master_lock); 3176 spin_unlock(&dlm->master_lock);
3175 3177
3176 /* move lockres onto recovery list */ 3178 /* move lockres onto recovery list */
3177 spin_lock(&res->spinlock); 3179 spin_lock(&res->spinlock);
3178 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 3180 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3179 dlm_move_lockres_to_recovery_list(dlm, res); 3181 dlm_move_lockres_to_recovery_list(dlm, res);
3180 spin_unlock(&res->spinlock); 3182 spin_unlock(&res->spinlock);
3181 dlm_lockres_put(res); 3183 dlm_lockres_put(res);
3182 3184
3183 /* about to get rid of mle, detach from heartbeat */ 3185 /* about to get rid of mle, detach from heartbeat */
3184 __dlm_mle_detach_hb_events(dlm, mle); 3186 __dlm_mle_detach_hb_events(dlm, mle);
3185 3187
3186 /* dump the mle */ 3188 /* dump the mle */
3187 spin_lock(&dlm->master_lock); 3189 spin_lock(&dlm->master_lock);
3188 __dlm_put_mle(mle); 3190 __dlm_put_mle(mle);
3189 spin_unlock(&dlm->master_lock); 3191 spin_unlock(&dlm->master_lock);
3190 } 3192 }
3191 3193
3192 return res; 3194 return res;
3193 } 3195 }
3194 3196
3195 static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, 3197 static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3196 struct dlm_master_list_entry *mle) 3198 struct dlm_master_list_entry *mle)
3197 { 3199 {
3198 __dlm_mle_detach_hb_events(dlm, mle); 3200 __dlm_mle_detach_hb_events(dlm, mle);
3199 3201
3200 spin_lock(&mle->spinlock); 3202 spin_lock(&mle->spinlock);
3201 __dlm_unlink_mle(dlm, mle); 3203 __dlm_unlink_mle(dlm, mle);
3202 atomic_set(&mle->woken, 1); 3204 atomic_set(&mle->woken, 1);
3203 spin_unlock(&mle->spinlock); 3205 spin_unlock(&mle->spinlock);
3204 3206
3205 wake_up(&mle->wq); 3207 wake_up(&mle->wq);
3206 } 3208 }
3207 3209
3208 static void dlm_clean_block_mle(struct dlm_ctxt *dlm, 3210 static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3209 struct dlm_master_list_entry *mle, u8 dead_node) 3211 struct dlm_master_list_entry *mle, u8 dead_node)
3210 { 3212 {
3211 int bit; 3213 int bit;
3212 3214
3213 BUG_ON(mle->type != DLM_MLE_BLOCK); 3215 BUG_ON(mle->type != DLM_MLE_BLOCK);
3214 3216
3215 spin_lock(&mle->spinlock); 3217 spin_lock(&mle->spinlock);
3216 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 3218 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3217 if (bit != dead_node) { 3219 if (bit != dead_node) {
3218 mlog(0, "mle found, but dead node %u would not have been " 3220 mlog(0, "mle found, but dead node %u would not have been "
3219 "master\n", dead_node); 3221 "master\n", dead_node);
3220 spin_unlock(&mle->spinlock); 3222 spin_unlock(&mle->spinlock);
3221 } else { 3223 } else {
3222 /* Must drop the refcount by one since the assert_master will 3224 /* Must drop the refcount by one since the assert_master will
3223 * never arrive. This may result in the mle being unlinked and 3225 * never arrive. This may result in the mle being unlinked and
3224 * freed, but there may still be a process waiting in the 3226 * freed, but there may still be a process waiting in the
3225 * dlmlock path which is fine. */ 3227 * dlmlock path which is fine. */
3226 mlog(0, "node %u was expected master\n", dead_node); 3228 mlog(0, "node %u was expected master\n", dead_node);
3227 atomic_set(&mle->woken, 1); 3229 atomic_set(&mle->woken, 1);
3228 spin_unlock(&mle->spinlock); 3230 spin_unlock(&mle->spinlock);
3229 wake_up(&mle->wq); 3231 wake_up(&mle->wq);
3230 3232
3231 /* Do not need events any longer, so detach from heartbeat */ 3233 /* Do not need events any longer, so detach from heartbeat */
3232 __dlm_mle_detach_hb_events(dlm, mle); 3234 __dlm_mle_detach_hb_events(dlm, mle);
3233 __dlm_put_mle(mle); 3235 __dlm_put_mle(mle);
3234 } 3236 }
3235 } 3237 }
3236 3238
3237 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3239 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3238 { 3240 {
3239 struct dlm_master_list_entry *mle; 3241 struct dlm_master_list_entry *mle;
3240 struct dlm_lock_resource *res; 3242 struct dlm_lock_resource *res;
3241 struct hlist_head *bucket; 3243 struct hlist_head *bucket;
3242 struct hlist_node *tmp; 3244 struct hlist_node *tmp;
3243 unsigned int i; 3245 unsigned int i;
3244 3246
3245 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); 3247 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
3246 top: 3248 top:
3247 assert_spin_locked(&dlm->spinlock); 3249 assert_spin_locked(&dlm->spinlock);
3248 3250
3249 /* clean the master list */ 3251 /* clean the master list */
3250 spin_lock(&dlm->master_lock); 3252 spin_lock(&dlm->master_lock);
3251 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3253 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3252 bucket = dlm_master_hash(dlm, i); 3254 bucket = dlm_master_hash(dlm, i);
3253 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { 3255 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3254 BUG_ON(mle->type != DLM_MLE_BLOCK && 3256 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3255 mle->type != DLM_MLE_MASTER && 3257 mle->type != DLM_MLE_MASTER &&
3256 mle->type != DLM_MLE_MIGRATION); 3258 mle->type != DLM_MLE_MIGRATION);
3257 3259
3258 /* MASTER mles are initiated locally. The waiting 3260 /* MASTER mles are initiated locally. The waiting
3259 * process will notice the node map change shortly. 3261 * process will notice the node map change shortly.
3260 * Let that happen as normal. */ 3262 * Let that happen as normal. */
3261 if (mle->type == DLM_MLE_MASTER) 3263 if (mle->type == DLM_MLE_MASTER)
3262 continue; 3264 continue;
3263 3265
3264 /* BLOCK mles are initiated by other nodes. Need to 3266 /* BLOCK mles are initiated by other nodes. Need to
3265 * clean up if the dead node would have been the 3267 * clean up if the dead node would have been the
3266 * master. */ 3268 * master. */
3267 if (mle->type == DLM_MLE_BLOCK) { 3269 if (mle->type == DLM_MLE_BLOCK) {
3268 dlm_clean_block_mle(dlm, mle, dead_node); 3270 dlm_clean_block_mle(dlm, mle, dead_node);
3269 continue; 3271 continue;
3270 } 3272 }
3271 3273
3272 /* Everything else is a MIGRATION mle */ 3274 /* Everything else is a MIGRATION mle */
3273 3275
3274 /* The rule for MIGRATION mles is that the master 3276 /* The rule for MIGRATION mles is that the master
3275 * becomes UNKNOWN if *either* the original or the new 3277 * becomes UNKNOWN if *either* the original or the new
3276 * master dies. All UNKNOWN lockres' are sent to 3278 * master dies. All UNKNOWN lockres' are sent to
3277 * whichever node becomes the recovery master. The new 3279 * whichever node becomes the recovery master. The new
3278 * master is responsible for determining if there is 3280 * master is responsible for determining if there is
3279 * still a master for this lockres, or if he needs to 3281 * still a master for this lockres, or if he needs to
3280 * take over mastery. Either way, this node should 3282 * take over mastery. Either way, this node should
3281 * expect another message to resolve this. */ 3283 * expect another message to resolve this. */
3282 3284
3283 if (mle->master != dead_node && 3285 if (mle->master != dead_node &&
3284 mle->new_master != dead_node) 3286 mle->new_master != dead_node)
3285 continue; 3287 continue;
3286 3288
3287 /* If we have reached this point, this mle needs to be 3289 /* If we have reached this point, this mle needs to be
3288 * removed from the list and freed. */ 3290 * removed from the list and freed. */
3289 dlm_clean_migration_mle(dlm, mle); 3291 dlm_clean_migration_mle(dlm, mle);
3290 3292
3291 mlog(0, "%s: node %u died during migration from " 3293 mlog(0, "%s: node %u died during migration from "
3292 "%u to %u!\n", dlm->name, dead_node, mle->master, 3294 "%u to %u!\n", dlm->name, dead_node, mle->master,
3293 mle->new_master); 3295 mle->new_master);
3294 3296
3295 /* If we find a lockres associated with the mle, we've 3297 /* If we find a lockres associated with the mle, we've
3296 * hit this rare case that messes up our lock ordering. 3298 * hit this rare case that messes up our lock ordering.
3297 * If so, we need to drop the master lock so that we can 3299 * If so, we need to drop the master lock so that we can
3298 * take the lockres lock, meaning that we will have to 3300 * take the lockres lock, meaning that we will have to
3299 * restart from the head of list. */ 3301 * restart from the head of list. */
3300 res = dlm_reset_mleres_owner(dlm, mle); 3302 res = dlm_reset_mleres_owner(dlm, mle);
3301 if (res) 3303 if (res)
3302 /* restart */ 3304 /* restart */
3303 goto top; 3305 goto top;
3304 3306
3305 /* This may be the last reference */ 3307 /* This may be the last reference */
3306 __dlm_put_mle(mle); 3308 __dlm_put_mle(mle);
3307 } 3309 }
3308 } 3310 }
3309 spin_unlock(&dlm->master_lock); 3311 spin_unlock(&dlm->master_lock);
3310 } 3312 }
3311 3313
3312 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 3314 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3313 u8 old_master) 3315 u8 old_master)
3314 { 3316 {
3315 struct dlm_node_iter iter; 3317 struct dlm_node_iter iter;
3316 int ret = 0; 3318 int ret = 0;
3317 3319
3318 spin_lock(&dlm->spinlock); 3320 spin_lock(&dlm->spinlock);
3319 dlm_node_iter_init(dlm->domain_map, &iter); 3321 dlm_node_iter_init(dlm->domain_map, &iter);
3320 clear_bit(old_master, iter.node_map); 3322 clear_bit(old_master, iter.node_map);
3321 clear_bit(dlm->node_num, iter.node_map); 3323 clear_bit(dlm->node_num, iter.node_map);
3322 spin_unlock(&dlm->spinlock); 3324 spin_unlock(&dlm->spinlock);
3323 3325
3324 /* ownership of the lockres is changing. account for the 3326 /* ownership of the lockres is changing. account for the
3325 * mastery reference here since old_master will briefly have 3327 * mastery reference here since old_master will briefly have
3326 * a reference after the migration completes */ 3328 * a reference after the migration completes */
3327 spin_lock(&res->spinlock); 3329 spin_lock(&res->spinlock);
3328 dlm_lockres_set_refmap_bit(dlm, res, old_master); 3330 dlm_lockres_set_refmap_bit(dlm, res, old_master);
3329 spin_unlock(&res->spinlock); 3331 spin_unlock(&res->spinlock);
3330 3332
3331 mlog(0, "now time to do a migrate request to other nodes\n"); 3333 mlog(0, "now time to do a migrate request to other nodes\n");
3332 ret = dlm_do_migrate_request(dlm, res, old_master, 3334 ret = dlm_do_migrate_request(dlm, res, old_master,
3333 dlm->node_num, &iter); 3335 dlm->node_num, &iter);
3334 if (ret < 0) { 3336 if (ret < 0) {
3335 mlog_errno(ret); 3337 mlog_errno(ret);
3336 goto leave; 3338 goto leave;
3337 } 3339 }
3338 3340
3339 mlog(0, "doing assert master of %.*s to all except the original node\n", 3341 mlog(0, "doing assert master of %.*s to all except the original node\n",
3340 res->lockname.len, res->lockname.name); 3342 res->lockname.len, res->lockname.name);
3341 /* this call now finishes out the nodemap 3343 /* this call now finishes out the nodemap
3342 * even if one or more nodes die */ 3344 * even if one or more nodes die */
3343 ret = dlm_do_assert_master(dlm, res, iter.node_map, 3345 ret = dlm_do_assert_master(dlm, res, iter.node_map,
3344 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3346 DLM_ASSERT_MASTER_FINISH_MIGRATION);
3345 if (ret < 0) { 3347 if (ret < 0) {
3346 /* no longer need to retry. all living nodes contacted. */ 3348 /* no longer need to retry. all living nodes contacted. */
3347 mlog_errno(ret); 3349 mlog_errno(ret);
3348 ret = 0; 3350 ret = 0;
3349 } 3351 }
3350 3352
3351 memset(iter.node_map, 0, sizeof(iter.node_map)); 3353 memset(iter.node_map, 0, sizeof(iter.node_map));
3352 set_bit(old_master, iter.node_map); 3354 set_bit(old_master, iter.node_map);
3353 mlog(0, "doing assert master of %.*s back to %u\n", 3355 mlog(0, "doing assert master of %.*s back to %u\n",
3354 res->lockname.len, res->lockname.name, old_master); 3356 res->lockname.len, res->lockname.name, old_master);
3355 ret = dlm_do_assert_master(dlm, res, iter.node_map, 3357 ret = dlm_do_assert_master(dlm, res, iter.node_map,
3356 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3358 DLM_ASSERT_MASTER_FINISH_MIGRATION);
3357 if (ret < 0) { 3359 if (ret < 0) {
3358 mlog(0, "assert master to original master failed " 3360 mlog(0, "assert master to original master failed "
3359 "with %d.\n", ret); 3361 "with %d.\n", ret);
3360 /* the only nonzero status here would be because of 3362 /* the only nonzero status here would be because of
3361 * a dead original node. we're done. */ 3363 * a dead original node. we're done. */
3362 ret = 0; 3364 ret = 0;
3363 } 3365 }
3364 3366
3365 /* all done, set the owner, clear the flag */ 3367 /* all done, set the owner, clear the flag */
3366 spin_lock(&res->spinlock); 3368 spin_lock(&res->spinlock);
3367 dlm_set_lockres_owner(dlm, res, dlm->node_num); 3369 dlm_set_lockres_owner(dlm, res, dlm->node_num);
3368 res->state &= ~DLM_LOCK_RES_MIGRATING; 3370 res->state &= ~DLM_LOCK_RES_MIGRATING;
3369 spin_unlock(&res->spinlock); 3371 spin_unlock(&res->spinlock);
3370 /* re-dirty it on the new master */ 3372 /* re-dirty it on the new master */
3371 dlm_kick_thread(dlm, res); 3373 dlm_kick_thread(dlm, res);
3372 wake_up(&res->wq); 3374 wake_up(&res->wq);
3373 leave: 3375 leave:
3374 return ret; 3376 return ret;
3375 } 3377 }
3376 3378
3377 /* 3379 /*
3378 * LOCKRES AST REFCOUNT 3380 * LOCKRES AST REFCOUNT
3379 * this is integral to migration 3381 * this is integral to migration
3380 */ 3382 */
3381 3383
3382 /* for future intent to call an ast, reserve one ahead of time. 3384 /* for future intent to call an ast, reserve one ahead of time.
3383 * this should be called only after waiting on the lockres 3385 * this should be called only after waiting on the lockres
3384 * with dlm_wait_on_lockres, and while still holding the 3386 * with dlm_wait_on_lockres, and while still holding the
3385 * spinlock after the call. */ 3387 * spinlock after the call. */
3386 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) 3388 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
3387 { 3389 {
3388 assert_spin_locked(&res->spinlock); 3390 assert_spin_locked(&res->spinlock);
3389 if (res->state & DLM_LOCK_RES_MIGRATING) { 3391 if (res->state & DLM_LOCK_RES_MIGRATING) {
3390 __dlm_print_one_lock_resource(res); 3392 __dlm_print_one_lock_resource(res);
3391 } 3393 }
3392 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 3394 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3393 3395
3394 atomic_inc(&res->asts_reserved); 3396 atomic_inc(&res->asts_reserved);
3395 } 3397 }
3396 3398
3397 /* 3399 /*
3398 * used to drop the reserved ast, either because it went unused, 3400 * used to drop the reserved ast, either because it went unused,
3399 * or because the ast/bast was actually called. 3401 * or because the ast/bast was actually called.
3400 * 3402 *
3401 * also, if there is a pending migration on this lockres, 3403 * also, if there is a pending migration on this lockres,
3402 * and this was the last pending ast on the lockres, 3404 * and this was the last pending ast on the lockres,
3403 * atomically set the MIGRATING flag before we drop the lock. 3405 * atomically set the MIGRATING flag before we drop the lock.
3404 * this is how we ensure that migration can proceed with no 3406 * this is how we ensure that migration can proceed with no
3405 * asts in progress. note that it is ok if the state of the 3407 * asts in progress. note that it is ok if the state of the
3406 * queues is such that a lock should be granted in the future 3408 * queues is such that a lock should be granted in the future
3407 * or that a bast should be fired, because the new master will 3409 * or that a bast should be fired, because the new master will
3408 * shuffle the lists on this lockres as soon as it is migrated. 3410 * shuffle the lists on this lockres as soon as it is migrated.
3409 */ 3411 */
3410 void dlm_lockres_release_ast(struct dlm_ctxt *dlm, 3412 void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3411 struct dlm_lock_resource *res) 3413 struct dlm_lock_resource *res)
3412 { 3414 {
3413 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) 3415 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
3414 return; 3416 return;
3415 3417
3416 if (!res->migration_pending) { 3418 if (!res->migration_pending) {
3417 spin_unlock(&res->spinlock); 3419 spin_unlock(&res->spinlock);
3418 return; 3420 return;
3419 } 3421 }
3420 3422
3421 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 3423 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3422 res->migration_pending = 0; 3424 res->migration_pending = 0;
3423 res->state |= DLM_LOCK_RES_MIGRATING; 3425 res->state |= DLM_LOCK_RES_MIGRATING;
3424 spin_unlock(&res->spinlock); 3426 spin_unlock(&res->spinlock);
3425 wake_up(&res->wq); 3427 wake_up(&res->wq);
3426 wake_up(&dlm->migration_wq); 3428 wake_up(&dlm->migration_wq);
3427 } 3429 }
3428 3430
3429 void dlm_force_free_mles(struct dlm_ctxt *dlm) 3431 void dlm_force_free_mles(struct dlm_ctxt *dlm)
3430 { 3432 {
3431 int i; 3433 int i;
3432 struct hlist_head *bucket; 3434 struct hlist_head *bucket;
3433 struct dlm_master_list_entry *mle; 3435 struct dlm_master_list_entry *mle;
3434 struct hlist_node *tmp; 3436 struct hlist_node *tmp;
3435 3437
3436 /* 3438 /*
3437 * We notified all other nodes that we are exiting the domain and 3439 * We notified all other nodes that we are exiting the domain and
3438 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still 3440 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3439 * around we force free them and wake any processes that are waiting 3441 * around we force free them and wake any processes that are waiting
3440 * on the mles 3442 * on the mles
3441 */ 3443 */
3442 spin_lock(&dlm->spinlock); 3444 spin_lock(&dlm->spinlock);
3443 spin_lock(&dlm->master_lock); 3445 spin_lock(&dlm->master_lock);
3444 3446
3445 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); 3447 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3446 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); 3448 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3447 3449
3448 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3450 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3449 bucket = dlm_master_hash(dlm, i); 3451 bucket = dlm_master_hash(dlm, i);
3450 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { 3452 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3451 if (mle->type != DLM_MLE_BLOCK) { 3453 if (mle->type != DLM_MLE_BLOCK) {
3452 mlog(ML_ERROR, "bad mle: %p\n", mle); 3454 mlog(ML_ERROR, "bad mle: %p\n", mle);
3453 dlm_print_one_mle(mle); 3455 dlm_print_one_mle(mle);
3454 } 3456 }
3455 atomic_set(&mle->woken, 1); 3457 atomic_set(&mle->woken, 1);
3456 wake_up(&mle->wq); 3458 wake_up(&mle->wq);
3457 3459
3458 __dlm_unlink_mle(dlm, mle); 3460 __dlm_unlink_mle(dlm, mle);
3459 __dlm_mle_detach_hb_events(dlm, mle); 3461 __dlm_mle_detach_hb_events(dlm, mle);
3460 __dlm_put_mle(mle); 3462 __dlm_put_mle(mle);
3461 } 3463 }
fs/ocfs2/super.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * super.c 4 * super.c
5 * 5 *
6 * load/unload driver, mount/dismount volumes 6 * load/unload driver, mount/dismount volumes
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/init.h> 31 #include <linux/init.h>
32 #include <linux/random.h> 32 #include <linux/random.h>
33 #include <linux/statfs.h> 33 #include <linux/statfs.h>
34 #include <linux/moduleparam.h> 34 #include <linux/moduleparam.h>
35 #include <linux/blkdev.h> 35 #include <linux/blkdev.h>
36 #include <linux/socket.h> 36 #include <linux/socket.h>
37 #include <linux/inet.h> 37 #include <linux/inet.h>
38 #include <linux/parser.h> 38 #include <linux/parser.h>
39 #include <linux/crc32.h> 39 #include <linux/crc32.h>
40 #include <linux/debugfs.h> 40 #include <linux/debugfs.h>
41 #include <linux/mount.h> 41 #include <linux/mount.h>
42 #include <linux/seq_file.h> 42 #include <linux/seq_file.h>
43 #include <linux/quotaops.h> 43 #include <linux/quotaops.h>
44 #include <linux/cleancache.h> 44 #include <linux/cleancache.h>
45 45
46 #define CREATE_TRACE_POINTS 46 #define CREATE_TRACE_POINTS
47 #include "ocfs2_trace.h" 47 #include "ocfs2_trace.h"
48 48
49 #include <cluster/masklog.h> 49 #include <cluster/masklog.h>
50 50
51 #include "ocfs2.h" 51 #include "ocfs2.h"
52 52
53 /* this should be the only file to include a version 1 header */ 53 /* this should be the only file to include a version 1 header */
54 #include "ocfs1_fs_compat.h" 54 #include "ocfs1_fs_compat.h"
55 55
56 #include "alloc.h" 56 #include "alloc.h"
57 #include "aops.h" 57 #include "aops.h"
58 #include "blockcheck.h" 58 #include "blockcheck.h"
59 #include "dlmglue.h" 59 #include "dlmglue.h"
60 #include "export.h" 60 #include "export.h"
61 #include "extent_map.h" 61 #include "extent_map.h"
62 #include "heartbeat.h" 62 #include "heartbeat.h"
63 #include "inode.h" 63 #include "inode.h"
64 #include "journal.h" 64 #include "journal.h"
65 #include "localalloc.h" 65 #include "localalloc.h"
66 #include "namei.h" 66 #include "namei.h"
67 #include "slot_map.h" 67 #include "slot_map.h"
68 #include "super.h" 68 #include "super.h"
69 #include "sysfile.h" 69 #include "sysfile.h"
70 #include "uptodate.h" 70 #include "uptodate.h"
71 #include "xattr.h" 71 #include "xattr.h"
72 #include "quota.h" 72 #include "quota.h"
73 #include "refcounttree.h" 73 #include "refcounttree.h"
74 #include "suballoc.h" 74 #include "suballoc.h"
75 75
76 #include "buffer_head_io.h" 76 #include "buffer_head_io.h"
77 77
78 static struct kmem_cache *ocfs2_inode_cachep; 78 static struct kmem_cache *ocfs2_inode_cachep;
79 struct kmem_cache *ocfs2_dquot_cachep; 79 struct kmem_cache *ocfs2_dquot_cachep;
80 struct kmem_cache *ocfs2_qf_chunk_cachep; 80 struct kmem_cache *ocfs2_qf_chunk_cachep;
81 81
82 /* OCFS2 needs to schedule several different types of work which 82 /* OCFS2 needs to schedule several different types of work which
83 * require cluster locking, disk I/O, recovery waits, etc. Since these 83 * require cluster locking, disk I/O, recovery waits, etc. Since these
84 * types of work tend to be heavy we avoid using the kernel events 84 * types of work tend to be heavy we avoid using the kernel events
85 * workqueue and schedule on our own. */ 85 * workqueue and schedule on our own. */
86 struct workqueue_struct *ocfs2_wq = NULL; 86 struct workqueue_struct *ocfs2_wq = NULL;
87 87
88 static struct dentry *ocfs2_debugfs_root; 88 static struct dentry *ocfs2_debugfs_root;
89 89
90 MODULE_AUTHOR("Oracle"); 90 MODULE_AUTHOR("Oracle");
91 MODULE_LICENSE("GPL"); 91 MODULE_LICENSE("GPL");
92 MODULE_DESCRIPTION("OCFS2 cluster file system"); 92 MODULE_DESCRIPTION("OCFS2 cluster file system");
93 93
94 struct mount_options 94 struct mount_options
95 { 95 {
96 unsigned long commit_interval; 96 unsigned long commit_interval;
97 unsigned long mount_opt; 97 unsigned long mount_opt;
98 unsigned int atime_quantum; 98 unsigned int atime_quantum;
99 signed short slot; 99 signed short slot;
100 int localalloc_opt; 100 int localalloc_opt;
101 unsigned int resv_level; 101 unsigned int resv_level;
102 int dir_resv_level; 102 int dir_resv_level;
103 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 103 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
104 }; 104 };
105 105
106 static int ocfs2_parse_options(struct super_block *sb, char *options, 106 static int ocfs2_parse_options(struct super_block *sb, char *options,
107 struct mount_options *mopt, 107 struct mount_options *mopt,
108 int is_remount); 108 int is_remount);
109 static int ocfs2_check_set_options(struct super_block *sb, 109 static int ocfs2_check_set_options(struct super_block *sb,
110 struct mount_options *options); 110 struct mount_options *options);
111 static int ocfs2_show_options(struct seq_file *s, struct dentry *root); 111 static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
112 static void ocfs2_put_super(struct super_block *sb); 112 static void ocfs2_put_super(struct super_block *sb);
113 static int ocfs2_mount_volume(struct super_block *sb); 113 static int ocfs2_mount_volume(struct super_block *sb);
114 static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 114 static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
115 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); 115 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
116 static int ocfs2_initialize_mem_caches(void); 116 static int ocfs2_initialize_mem_caches(void);
117 static void ocfs2_free_mem_caches(void); 117 static void ocfs2_free_mem_caches(void);
118 static void ocfs2_delete_osb(struct ocfs2_super *osb); 118 static void ocfs2_delete_osb(struct ocfs2_super *osb);
119 119
120 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); 120 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
121 121
122 static int ocfs2_sync_fs(struct super_block *sb, int wait); 122 static int ocfs2_sync_fs(struct super_block *sb, int wait);
123 123
124 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); 124 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
125 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); 125 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
126 static void ocfs2_release_system_inodes(struct ocfs2_super *osb); 126 static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
127 static int ocfs2_check_volume(struct ocfs2_super *osb); 127 static int ocfs2_check_volume(struct ocfs2_super *osb);
128 static int ocfs2_verify_volume(struct ocfs2_dinode *di, 128 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
129 struct buffer_head *bh, 129 struct buffer_head *bh,
130 u32 sectsize, 130 u32 sectsize,
131 struct ocfs2_blockcheck_stats *stats); 131 struct ocfs2_blockcheck_stats *stats);
132 static int ocfs2_initialize_super(struct super_block *sb, 132 static int ocfs2_initialize_super(struct super_block *sb,
133 struct buffer_head *bh, 133 struct buffer_head *bh,
134 int sector_size, 134 int sector_size,
135 struct ocfs2_blockcheck_stats *stats); 135 struct ocfs2_blockcheck_stats *stats);
136 static int ocfs2_get_sector(struct super_block *sb, 136 static int ocfs2_get_sector(struct super_block *sb,
137 struct buffer_head **bh, 137 struct buffer_head **bh,
138 int block, 138 int block,
139 int sect_size); 139 int sect_size);
140 static struct inode *ocfs2_alloc_inode(struct super_block *sb); 140 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
141 static void ocfs2_destroy_inode(struct inode *inode); 141 static void ocfs2_destroy_inode(struct inode *inode);
142 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); 142 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
143 static int ocfs2_enable_quotas(struct ocfs2_super *osb); 143 static int ocfs2_enable_quotas(struct ocfs2_super *osb);
144 static void ocfs2_disable_quotas(struct ocfs2_super *osb); 144 static void ocfs2_disable_quotas(struct ocfs2_super *osb);
145 145
146 static const struct super_operations ocfs2_sops = { 146 static const struct super_operations ocfs2_sops = {
147 .statfs = ocfs2_statfs, 147 .statfs = ocfs2_statfs,
148 .alloc_inode = ocfs2_alloc_inode, 148 .alloc_inode = ocfs2_alloc_inode,
149 .destroy_inode = ocfs2_destroy_inode, 149 .destroy_inode = ocfs2_destroy_inode,
150 .drop_inode = ocfs2_drop_inode, 150 .drop_inode = ocfs2_drop_inode,
151 .evict_inode = ocfs2_evict_inode, 151 .evict_inode = ocfs2_evict_inode,
152 .sync_fs = ocfs2_sync_fs, 152 .sync_fs = ocfs2_sync_fs,
153 .put_super = ocfs2_put_super, 153 .put_super = ocfs2_put_super,
154 .remount_fs = ocfs2_remount, 154 .remount_fs = ocfs2_remount,
155 .show_options = ocfs2_show_options, 155 .show_options = ocfs2_show_options,
156 .quota_read = ocfs2_quota_read, 156 .quota_read = ocfs2_quota_read,
157 .quota_write = ocfs2_quota_write, 157 .quota_write = ocfs2_quota_write,
158 }; 158 };
159 159
160 enum { 160 enum {
161 Opt_barrier, 161 Opt_barrier,
162 Opt_err_panic, 162 Opt_err_panic,
163 Opt_err_ro, 163 Opt_err_ro,
164 Opt_intr, 164 Opt_intr,
165 Opt_nointr, 165 Opt_nointr,
166 Opt_hb_none, 166 Opt_hb_none,
167 Opt_hb_local, 167 Opt_hb_local,
168 Opt_hb_global, 168 Opt_hb_global,
169 Opt_data_ordered, 169 Opt_data_ordered,
170 Opt_data_writeback, 170 Opt_data_writeback,
171 Opt_atime_quantum, 171 Opt_atime_quantum,
172 Opt_slot, 172 Opt_slot,
173 Opt_commit, 173 Opt_commit,
174 Opt_localalloc, 174 Opt_localalloc,
175 Opt_localflocks, 175 Opt_localflocks,
176 Opt_stack, 176 Opt_stack,
177 Opt_user_xattr, 177 Opt_user_xattr,
178 Opt_nouser_xattr, 178 Opt_nouser_xattr,
179 Opt_inode64, 179 Opt_inode64,
180 Opt_acl, 180 Opt_acl,
181 Opt_noacl, 181 Opt_noacl,
182 Opt_usrquota, 182 Opt_usrquota,
183 Opt_grpquota, 183 Opt_grpquota,
184 Opt_coherency_buffered, 184 Opt_coherency_buffered,
185 Opt_coherency_full, 185 Opt_coherency_full,
186 Opt_resv_level, 186 Opt_resv_level,
187 Opt_dir_resv_level, 187 Opt_dir_resv_level,
188 Opt_err, 188 Opt_err,
189 }; 189 };
190 190
191 static const match_table_t tokens = { 191 static const match_table_t tokens = {
192 {Opt_barrier, "barrier=%u"}, 192 {Opt_barrier, "barrier=%u"},
193 {Opt_err_panic, "errors=panic"}, 193 {Opt_err_panic, "errors=panic"},
194 {Opt_err_ro, "errors=remount-ro"}, 194 {Opt_err_ro, "errors=remount-ro"},
195 {Opt_intr, "intr"}, 195 {Opt_intr, "intr"},
196 {Opt_nointr, "nointr"}, 196 {Opt_nointr, "nointr"},
197 {Opt_hb_none, OCFS2_HB_NONE}, 197 {Opt_hb_none, OCFS2_HB_NONE},
198 {Opt_hb_local, OCFS2_HB_LOCAL}, 198 {Opt_hb_local, OCFS2_HB_LOCAL},
199 {Opt_hb_global, OCFS2_HB_GLOBAL}, 199 {Opt_hb_global, OCFS2_HB_GLOBAL},
200 {Opt_data_ordered, "data=ordered"}, 200 {Opt_data_ordered, "data=ordered"},
201 {Opt_data_writeback, "data=writeback"}, 201 {Opt_data_writeback, "data=writeback"},
202 {Opt_atime_quantum, "atime_quantum=%u"}, 202 {Opt_atime_quantum, "atime_quantum=%u"},
203 {Opt_slot, "preferred_slot=%u"}, 203 {Opt_slot, "preferred_slot=%u"},
204 {Opt_commit, "commit=%u"}, 204 {Opt_commit, "commit=%u"},
205 {Opt_localalloc, "localalloc=%d"}, 205 {Opt_localalloc, "localalloc=%d"},
206 {Opt_localflocks, "localflocks"}, 206 {Opt_localflocks, "localflocks"},
207 {Opt_stack, "cluster_stack=%s"}, 207 {Opt_stack, "cluster_stack=%s"},
208 {Opt_user_xattr, "user_xattr"}, 208 {Opt_user_xattr, "user_xattr"},
209 {Opt_nouser_xattr, "nouser_xattr"}, 209 {Opt_nouser_xattr, "nouser_xattr"},
210 {Opt_inode64, "inode64"}, 210 {Opt_inode64, "inode64"},
211 {Opt_acl, "acl"}, 211 {Opt_acl, "acl"},
212 {Opt_noacl, "noacl"}, 212 {Opt_noacl, "noacl"},
213 {Opt_usrquota, "usrquota"}, 213 {Opt_usrquota, "usrquota"},
214 {Opt_grpquota, "grpquota"}, 214 {Opt_grpquota, "grpquota"},
215 {Opt_coherency_buffered, "coherency=buffered"}, 215 {Opt_coherency_buffered, "coherency=buffered"},
216 {Opt_coherency_full, "coherency=full"}, 216 {Opt_coherency_full, "coherency=full"},
217 {Opt_resv_level, "resv_level=%u"}, 217 {Opt_resv_level, "resv_level=%u"},
218 {Opt_dir_resv_level, "dir_resv_level=%u"}, 218 {Opt_dir_resv_level, "dir_resv_level=%u"},
219 {Opt_err, NULL} 219 {Opt_err, NULL}
220 }; 220 };
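The tokens table above feeds the generic mount-option parser from <linux/parser.h>. The in-tree ocfs2_parse_options() is not part of this hunk, so what follows is only a minimal sketch, built around a hypothetical example_parse() helper, of how such a table is typically consumed: the option string is split on commas, each fragment is matched with match_token(), and numeric arguments such as atime_quantum=%u are decoded with match_int().

/*
 * Sketch only: example_parse() is a hypothetical helper, not the real
 * ocfs2_parse_options().  It reuses the tokens table and Opt_* enum
 * defined above and follows the same 1 = ok / 0 = error convention.
 */
#include <linux/parser.h>
#include <linux/string.h>

static int example_parse(char *options)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		int token, option;

		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_atime_quantum:
			/* "atime_quantum=%u": pull out the numeric argument */
			if (match_int(&args[0], &option))
				return 0;
			/* option now holds the requested quantum */
			break;
		case Opt_intr:
		case Opt_nointr:
			/* flag-style options carry no argument */
			break;
		case Opt_err:
		default:
			/* unrecognised option string */
			return 0;
		}
	}
	return 1;
}

The real parser records the decoded values in a struct mount_options, which ocfs2_check_set_options() then validates; both are called from ocfs2_remount() and ocfs2_fill_super() further down.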
221 221
222 #ifdef CONFIG_DEBUG_FS 222 #ifdef CONFIG_DEBUG_FS
223 static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) 223 static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
224 { 224 {
225 struct ocfs2_cluster_connection *cconn = osb->cconn; 225 struct ocfs2_cluster_connection *cconn = osb->cconn;
226 struct ocfs2_recovery_map *rm = osb->recovery_map; 226 struct ocfs2_recovery_map *rm = osb->recovery_map;
227 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; 227 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
228 int i, out = 0; 228 int i, out = 0;
229 229
230 out += snprintf(buf + out, len - out, 230 out += snprintf(buf + out, len - out,
231 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", 231 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
232 "Device", osb->dev_str, osb->uuid_str, 232 "Device", osb->dev_str, osb->uuid_str,
233 osb->fs_generation, osb->vol_label); 233 osb->fs_generation, osb->vol_label);
234 234
235 out += snprintf(buf + out, len - out, 235 out += snprintf(buf + out, len - out,
236 "%10s => State: %d Flags: 0x%lX\n", "Volume", 236 "%10s => State: %d Flags: 0x%lX\n", "Volume",
237 atomic_read(&osb->vol_state), osb->osb_flags); 237 atomic_read(&osb->vol_state), osb->osb_flags);
238 238
239 out += snprintf(buf + out, len - out, 239 out += snprintf(buf + out, len - out,
240 "%10s => Block: %lu Cluster: %d\n", "Sizes", 240 "%10s => Block: %lu Cluster: %d\n", "Sizes",
241 osb->sb->s_blocksize, osb->s_clustersize); 241 osb->sb->s_blocksize, osb->s_clustersize);
242 242
243 out += snprintf(buf + out, len - out, 243 out += snprintf(buf + out, len - out,
244 "%10s => Compat: 0x%X Incompat: 0x%X " 244 "%10s => Compat: 0x%X Incompat: 0x%X "
245 "ROcompat: 0x%X\n", 245 "ROcompat: 0x%X\n",
246 "Features", osb->s_feature_compat, 246 "Features", osb->s_feature_compat,
247 osb->s_feature_incompat, osb->s_feature_ro_compat); 247 osb->s_feature_incompat, osb->s_feature_ro_compat);
248 248
249 out += snprintf(buf + out, len - out, 249 out += snprintf(buf + out, len - out,
250 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", 250 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
251 osb->s_mount_opt, osb->s_atime_quantum); 251 osb->s_mount_opt, osb->s_atime_quantum);
252 252
253 if (cconn) { 253 if (cconn) {
254 out += snprintf(buf + out, len - out, 254 out += snprintf(buf + out, len - out,
255 "%10s => Stack: %s Name: %*s " 255 "%10s => Stack: %s Name: %*s "
256 "Version: %d.%d\n", "Cluster", 256 "Version: %d.%d\n", "Cluster",
257 (*osb->osb_cluster_stack == '\0' ? 257 (*osb->osb_cluster_stack == '\0' ?
258 "o2cb" : osb->osb_cluster_stack), 258 "o2cb" : osb->osb_cluster_stack),
259 cconn->cc_namelen, cconn->cc_name, 259 cconn->cc_namelen, cconn->cc_name,
260 cconn->cc_version.pv_major, 260 cconn->cc_version.pv_major,
261 cconn->cc_version.pv_minor); 261 cconn->cc_version.pv_minor);
262 } 262 }
263 263
264 spin_lock(&osb->dc_task_lock); 264 spin_lock(&osb->dc_task_lock);
265 out += snprintf(buf + out, len - out, 265 out += snprintf(buf + out, len - out,
266 "%10s => Pid: %d Count: %lu WakeSeq: %lu " 266 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
267 "WorkSeq: %lu\n", "DownCnvt", 267 "WorkSeq: %lu\n", "DownCnvt",
268 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), 268 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
269 osb->blocked_lock_count, osb->dc_wake_sequence, 269 osb->blocked_lock_count, osb->dc_wake_sequence,
270 osb->dc_work_sequence); 270 osb->dc_work_sequence);
271 spin_unlock(&osb->dc_task_lock); 271 spin_unlock(&osb->dc_task_lock);
272 272
273 spin_lock(&osb->osb_lock); 273 spin_lock(&osb->osb_lock);
274 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", 274 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
275 "Recovery", 275 "Recovery",
276 (osb->recovery_thread_task ? 276 (osb->recovery_thread_task ?
277 task_pid_nr(osb->recovery_thread_task) : -1)); 277 task_pid_nr(osb->recovery_thread_task) : -1));
278 if (rm->rm_used == 0) 278 if (rm->rm_used == 0)
279 out += snprintf(buf + out, len - out, " None\n"); 279 out += snprintf(buf + out, len - out, " None\n");
280 else { 280 else {
281 for (i = 0; i < rm->rm_used; i++) 281 for (i = 0; i < rm->rm_used; i++)
282 out += snprintf(buf + out, len - out, " %d", 282 out += snprintf(buf + out, len - out, " %d",
283 rm->rm_entries[i]); 283 rm->rm_entries[i]);
284 out += snprintf(buf + out, len - out, "\n"); 284 out += snprintf(buf + out, len - out, "\n");
285 } 285 }
286 spin_unlock(&osb->osb_lock); 286 spin_unlock(&osb->osb_lock);
287 287
288 out += snprintf(buf + out, len - out, 288 out += snprintf(buf + out, len - out,
289 "%10s => Pid: %d Interval: %lu\n", "Commit", 289 "%10s => Pid: %d Interval: %lu\n", "Commit",
290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), 290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
291 osb->osb_commit_interval); 291 osb->osb_commit_interval);
292 292
293 out += snprintf(buf + out, len - out, 293 out += snprintf(buf + out, len - out,
294 "%10s => State: %d TxnId: %lu NumTxns: %d\n", 294 "%10s => State: %d TxnId: %lu NumTxns: %d\n",
295 "Journal", osb->journal->j_state, 295 "Journal", osb->journal->j_state,
296 osb->journal->j_trans_id, 296 osb->journal->j_trans_id,
297 atomic_read(&osb->journal->j_num_trans)); 297 atomic_read(&osb->journal->j_num_trans));
298 298
299 out += snprintf(buf + out, len - out, 299 out += snprintf(buf + out, len - out,
300 "%10s => GlobalAllocs: %d LocalAllocs: %d " 300 "%10s => GlobalAllocs: %d LocalAllocs: %d "
301 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", 301 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
302 "Stats", 302 "Stats",
303 atomic_read(&osb->alloc_stats.bitmap_data), 303 atomic_read(&osb->alloc_stats.bitmap_data),
304 atomic_read(&osb->alloc_stats.local_data), 304 atomic_read(&osb->alloc_stats.local_data),
305 atomic_read(&osb->alloc_stats.bg_allocs), 305 atomic_read(&osb->alloc_stats.bg_allocs),
306 atomic_read(&osb->alloc_stats.moves), 306 atomic_read(&osb->alloc_stats.moves),
307 atomic_read(&osb->alloc_stats.bg_extends)); 307 atomic_read(&osb->alloc_stats.bg_extends));
308 308
309 out += snprintf(buf + out, len - out, 309 out += snprintf(buf + out, len - out,
310 "%10s => State: %u Descriptor: %llu Size: %u bits " 310 "%10s => State: %u Descriptor: %llu Size: %u bits "
311 "Default: %u bits\n", 311 "Default: %u bits\n",
312 "LocalAlloc", osb->local_alloc_state, 312 "LocalAlloc", osb->local_alloc_state,
313 (unsigned long long)osb->la_last_gd, 313 (unsigned long long)osb->la_last_gd,
314 osb->local_alloc_bits, osb->local_alloc_default_bits); 314 osb->local_alloc_bits, osb->local_alloc_default_bits);
315 315
316 spin_lock(&osb->osb_lock); 316 spin_lock(&osb->osb_lock);
317 out += snprintf(buf + out, len - out, 317 out += snprintf(buf + out, len - out,
318 "%10s => InodeSlot: %d StolenInodes: %d, " 318 "%10s => InodeSlot: %d StolenInodes: %d, "
319 "MetaSlot: %d StolenMeta: %d\n", "Steal", 319 "MetaSlot: %d StolenMeta: %d\n", "Steal",
320 osb->s_inode_steal_slot, 320 osb->s_inode_steal_slot,
321 atomic_read(&osb->s_num_inodes_stolen), 321 atomic_read(&osb->s_num_inodes_stolen),
322 osb->s_meta_steal_slot, 322 osb->s_meta_steal_slot,
323 atomic_read(&osb->s_num_meta_stolen)); 323 atomic_read(&osb->s_num_meta_stolen));
324 spin_unlock(&osb->osb_lock); 324 spin_unlock(&osb->osb_lock);
325 325
326 out += snprintf(buf + out, len - out, "OrphanScan => "); 326 out += snprintf(buf + out, len - out, "OrphanScan => ");
327 out += snprintf(buf + out, len - out, "Local: %u Global: %u ", 327 out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
328 os->os_count, os->os_seqno); 328 os->os_count, os->os_seqno);
329 out += snprintf(buf + out, len - out, " Last Scan: "); 329 out += snprintf(buf + out, len - out, " Last Scan: ");
330 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 330 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
331 out += snprintf(buf + out, len - out, "Disabled\n"); 331 out += snprintf(buf + out, len - out, "Disabled\n");
332 else 332 else
333 out += snprintf(buf + out, len - out, "%lu seconds ago\n", 333 out += snprintf(buf + out, len - out, "%lu seconds ago\n",
334 (get_seconds() - os->os_scantime.tv_sec)); 334 (get_seconds() - os->os_scantime.tv_sec));
335 335
336 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", 336 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
337 "Slots", "Num", "RecoGen"); 337 "Slots", "Num", "RecoGen");
338 for (i = 0; i < osb->max_slots; ++i) { 338 for (i = 0; i < osb->max_slots; ++i) {
339 out += snprintf(buf + out, len - out, 339 out += snprintf(buf + out, len - out,
340 "%10s %c %3d %10d\n", 340 "%10s %c %3d %10d\n",
341 " ", 341 " ",
342 (i == osb->slot_num ? '*' : ' '), 342 (i == osb->slot_num ? '*' : ' '),
343 i, osb->slot_recovery_generations[i]); 343 i, osb->slot_recovery_generations[i]);
344 } 344 }
345 345
346 return out; 346 return out;
347 } 347 }
348 348
349 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) 349 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
350 { 350 {
351 struct ocfs2_super *osb = inode->i_private; 351 struct ocfs2_super *osb = inode->i_private;
352 char *buf = NULL; 352 char *buf = NULL;
353 353
354 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 354 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
355 if (!buf) 355 if (!buf)
356 goto bail; 356 goto bail;
357 357
358 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); 358 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
359 359
360 file->private_data = buf; 360 file->private_data = buf;
361 361
362 return 0; 362 return 0;
363 bail: 363 bail:
364 return -ENOMEM; 364 return -ENOMEM;
365 } 365 }
366 366
367 static int ocfs2_debug_release(struct inode *inode, struct file *file) 367 static int ocfs2_debug_release(struct inode *inode, struct file *file)
368 { 368 {
369 kfree(file->private_data); 369 kfree(file->private_data);
370 return 0; 370 return 0;
371 } 371 }
372 372
373 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, 373 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
374 size_t nbytes, loff_t *ppos) 374 size_t nbytes, loff_t *ppos)
375 { 375 {
376 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, 376 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
377 i_size_read(file->f_mapping->host)); 377 i_size_read(file->f_mapping->host));
378 } 378 }
379 #else 379 #else
380 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) 380 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
381 { 381 {
382 return 0; 382 return 0;
383 } 383 }
384 static int ocfs2_debug_release(struct inode *inode, struct file *file) 384 static int ocfs2_debug_release(struct inode *inode, struct file *file)
385 { 385 {
386 return 0; 386 return 0;
387 } 387 }
388 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, 388 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
389 size_t nbytes, loff_t *ppos) 389 size_t nbytes, loff_t *ppos)
390 { 390 {
391 return 0; 391 return 0;
392 } 392 }
393 #endif /* CONFIG_DEBUG_FS */ 393 #endif /* CONFIG_DEBUG_FS */
394 394
395 static const struct file_operations ocfs2_osb_debug_fops = { 395 static const struct file_operations ocfs2_osb_debug_fops = {
396 .open = ocfs2_osb_debug_open, 396 .open = ocfs2_osb_debug_open,
397 .release = ocfs2_debug_release, 397 .release = ocfs2_debug_release,
398 .read = ocfs2_debug_read, 398 .read = ocfs2_debug_read,
399 .llseek = generic_file_llseek, 399 .llseek = generic_file_llseek,
400 }; 400 };
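These file_operations back the per-mount "fs_state" debugfs file created later in ocfs2_fill_super() via debugfs_create_file(): open() snapshots the superblock state into a kmalloc'ed page with ocfs2_osb_dump(), read() serves that buffer with simple_read_from_buffer(), and release() frees it; with CONFIG_DEBUG_FS disabled the stubs above turn the file into a no-op. With debugfs mounted (commonly at /sys/kernel/debug), the file is expected to appear under the per-volume directory named after osb->uuid_str, e.g. .../ocfs2/<uuid>/fs_state, though the exact path depends on where the ocfs2 debugfs root was created.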
401 401
402 static int ocfs2_sync_fs(struct super_block *sb, int wait) 402 static int ocfs2_sync_fs(struct super_block *sb, int wait)
403 { 403 {
404 int status; 404 int status;
405 tid_t target; 405 tid_t target;
406 struct ocfs2_super *osb = OCFS2_SB(sb); 406 struct ocfs2_super *osb = OCFS2_SB(sb);
407 407
408 if (ocfs2_is_hard_readonly(osb)) 408 if (ocfs2_is_hard_readonly(osb))
409 return -EROFS; 409 return -EROFS;
410 410
411 if (wait) { 411 if (wait) {
412 status = ocfs2_flush_truncate_log(osb); 412 status = ocfs2_flush_truncate_log(osb);
413 if (status < 0) 413 if (status < 0)
414 mlog_errno(status); 414 mlog_errno(status);
415 } else { 415 } else {
416 ocfs2_schedule_truncate_log_flush(osb, 0); 416 ocfs2_schedule_truncate_log_flush(osb, 0);
417 } 417 }
418 418
419 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, 419 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
420 &target)) { 420 &target)) {
421 if (wait) 421 if (wait)
422 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, 422 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
423 target); 423 target);
424 } 424 }
425 return 0; 425 return 0;
426 } 426 }
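In other words, a non-blocking sync (wait == 0) only schedules the truncate-log flush and lets jbd2 commit asynchronously, while a blocking sync flushes the truncate log directly and then waits for the commit whose tid jbd2_journal_start_commit() stored in target. jbd2_journal_start_commit() returns non-zero only when there is a running or committing transaction, so the wait is skipped entirely on an idle journal.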
427 427
428 static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) 428 static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
429 { 429 {
430 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) 430 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
431 && (ino == USER_QUOTA_SYSTEM_INODE 431 && (ino == USER_QUOTA_SYSTEM_INODE
432 || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) 432 || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
433 return 0; 433 return 0;
434 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 434 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
435 && (ino == GROUP_QUOTA_SYSTEM_INODE 435 && (ino == GROUP_QUOTA_SYSTEM_INODE
436 || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) 436 || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
437 return 0; 437 return 0;
438 return 1; 438 return 1;
439 } 439 }
440 440
441 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) 441 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
442 { 442 {
443 struct inode *new = NULL; 443 struct inode *new = NULL;
444 int status = 0; 444 int status = 0;
445 int i; 445 int i;
446 446
447 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); 447 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
448 if (IS_ERR(new)) { 448 if (IS_ERR(new)) {
449 status = PTR_ERR(new); 449 status = PTR_ERR(new);
450 mlog_errno(status); 450 mlog_errno(status);
451 goto bail; 451 goto bail;
452 } 452 }
453 osb->root_inode = new; 453 osb->root_inode = new;
454 454
455 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); 455 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
456 if (IS_ERR(new)) { 456 if (IS_ERR(new)) {
457 status = PTR_ERR(new); 457 status = PTR_ERR(new);
458 mlog_errno(status); 458 mlog_errno(status);
459 goto bail; 459 goto bail;
460 } 460 }
461 osb->sys_root_inode = new; 461 osb->sys_root_inode = new;
462 462
463 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; 463 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
464 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { 464 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
465 if (!ocfs2_need_system_inode(osb, i)) 465 if (!ocfs2_need_system_inode(osb, i))
466 continue; 466 continue;
467 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 467 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
468 if (!new) { 468 if (!new) {
469 ocfs2_release_system_inodes(osb); 469 ocfs2_release_system_inodes(osb);
470 status = -EINVAL; 470 status = -EINVAL;
471 mlog_errno(status); 471 mlog_errno(status);
472 /* FIXME: Should ERROR_RO_FS */ 472 /* FIXME: Should ERROR_RO_FS */
473 mlog(ML_ERROR, "Unable to load system inode %d, " 473 mlog(ML_ERROR, "Unable to load system inode %d, "
474 "possibly corrupt fs?", i); 474 "possibly corrupt fs?", i);
475 goto bail; 475 goto bail;
476 } 476 }
477 /* the array now has one ref, so drop this one */ 477 /* the array now has one ref, so drop this one */
478 iput(new); 478 iput(new);
479 } 479 }
480 480
481 bail: 481 bail:
482 if (status) 482 if (status)
483 mlog_errno(status); 483 mlog_errno(status);
484 return status; 484 return status;
485 } 485 }
486 486
487 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) 487 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
488 { 488 {
489 struct inode *new = NULL; 489 struct inode *new = NULL;
490 int status = 0; 490 int status = 0;
491 int i; 491 int i;
492 492
493 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; 493 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
494 i < NUM_SYSTEM_INODES; 494 i < NUM_SYSTEM_INODES;
495 i++) { 495 i++) {
496 if (!ocfs2_need_system_inode(osb, i)) 496 if (!ocfs2_need_system_inode(osb, i))
497 continue; 497 continue;
498 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 498 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
499 if (!new) { 499 if (!new) {
500 ocfs2_release_system_inodes(osb); 500 ocfs2_release_system_inodes(osb);
501 status = -EINVAL; 501 status = -EINVAL;
502 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", 502 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
503 status, i, osb->slot_num); 503 status, i, osb->slot_num);
504 goto bail; 504 goto bail;
505 } 505 }
506 /* the array now has one ref, so drop this one */ 506 /* the array now has one ref, so drop this one */
507 iput(new); 507 iput(new);
508 } 508 }
509 509
510 bail: 510 bail:
511 if (status) 511 if (status)
512 mlog_errno(status); 512 mlog_errno(status);
513 return status; 513 return status;
514 } 514 }
515 515
516 static void ocfs2_release_system_inodes(struct ocfs2_super *osb) 516 static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
517 { 517 {
518 int i; 518 int i;
519 struct inode *inode; 519 struct inode *inode;
520 520
521 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { 521 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
522 inode = osb->global_system_inodes[i]; 522 inode = osb->global_system_inodes[i];
523 if (inode) { 523 if (inode) {
524 iput(inode); 524 iput(inode);
525 osb->global_system_inodes[i] = NULL; 525 osb->global_system_inodes[i] = NULL;
526 } 526 }
527 } 527 }
528 528
529 inode = osb->sys_root_inode; 529 inode = osb->sys_root_inode;
530 if (inode) { 530 if (inode) {
531 iput(inode); 531 iput(inode);
532 osb->sys_root_inode = NULL; 532 osb->sys_root_inode = NULL;
533 } 533 }
534 534
535 inode = osb->root_inode; 535 inode = osb->root_inode;
536 if (inode) { 536 if (inode) {
537 iput(inode); 537 iput(inode);
538 osb->root_inode = NULL; 538 osb->root_inode = NULL;
539 } 539 }
540 540
541 if (!osb->local_system_inodes) 541 if (!osb->local_system_inodes)
542 return; 542 return;
543 543
544 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { 544 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
545 if (osb->local_system_inodes[i]) { 545 if (osb->local_system_inodes[i]) {
546 iput(osb->local_system_inodes[i]); 546 iput(osb->local_system_inodes[i]);
547 osb->local_system_inodes[i] = NULL; 547 osb->local_system_inodes[i] = NULL;
548 } 548 }
549 } 549 }
550 550
551 kfree(osb->local_system_inodes); 551 kfree(osb->local_system_inodes);
552 osb->local_system_inodes = NULL; 552 osb->local_system_inodes = NULL;
553 } 553 }
554 554
555 /* We're allocating fs objects, use GFP_NOFS */ 555 /* We're allocating fs objects, use GFP_NOFS */
556 static struct inode *ocfs2_alloc_inode(struct super_block *sb) 556 static struct inode *ocfs2_alloc_inode(struct super_block *sb)
557 { 557 {
558 struct ocfs2_inode_info *oi; 558 struct ocfs2_inode_info *oi;
559 559
560 oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); 560 oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
561 if (!oi) 561 if (!oi)
562 return NULL; 562 return NULL;
563 563
564 oi->i_sync_tid = 0; 564 oi->i_sync_tid = 0;
565 oi->i_datasync_tid = 0; 565 oi->i_datasync_tid = 0;
566 566
567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); 567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
568 return &oi->vfs_inode; 568 return &oi->vfs_inode;
569 } 569 }
570 570
571 static void ocfs2_i_callback(struct rcu_head *head) 571 static void ocfs2_i_callback(struct rcu_head *head)
572 { 572 {
573 struct inode *inode = container_of(head, struct inode, i_rcu); 573 struct inode *inode = container_of(head, struct inode, i_rcu);
574 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); 574 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
575 } 575 }
576 576
577 static void ocfs2_destroy_inode(struct inode *inode) 577 static void ocfs2_destroy_inode(struct inode *inode)
578 { 578 {
579 call_rcu(&inode->i_rcu, ocfs2_i_callback); 579 call_rcu(&inode->i_rcu, ocfs2_i_callback);
580 } 580 }
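The two-step teardown here follows the standard VFS pattern: lockless RCU-walk path lookups may still dereference the inode when ->destroy_inode() runs, so the actual kmem_cache_free() is deferred through call_rcu() on inode->i_rcu and only happens after a grace period, in ocfs2_i_callback().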
581 581
582 static unsigned long long ocfs2_max_file_offset(unsigned int bbits, 582 static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
583 unsigned int cbits) 583 unsigned int cbits)
584 { 584 {
585 unsigned int bytes = 1 << cbits; 585 unsigned int bytes = 1 << cbits;
586 unsigned int trim = bytes; 586 unsigned int trim = bytes;
587 unsigned int bitshift = 32; 587 unsigned int bitshift = 32;
588 588
589 /* 589 /*
590 * i_size and all block offsets in ocfs2 are always 64 bits 590 * i_size and all block offsets in ocfs2 are always 64 bits
591 * wide. i_clusters is 32 bits, in cluster-sized units. So on 591 * wide. i_clusters is 32 bits, in cluster-sized units. So on
592 * 64 bit platforms, cluster size will be the limiting factor. 592 * 64 bit platforms, cluster size will be the limiting factor.
593 */ 593 */
594 594
595 #if BITS_PER_LONG == 32 595 #if BITS_PER_LONG == 32
596 # if defined(CONFIG_LBDAF) 596 # if defined(CONFIG_LBDAF)
597 BUILD_BUG_ON(sizeof(sector_t) != 8); 597 BUILD_BUG_ON(sizeof(sector_t) != 8);
598 /* 598 /*
599 * We might be limited by page cache size. 599 * We might be limited by page cache size.
600 */ 600 */
601 if (bytes > PAGE_CACHE_SIZE) { 601 if (bytes > PAGE_CACHE_SIZE) {
602 bytes = PAGE_CACHE_SIZE; 602 bytes = PAGE_CACHE_SIZE;
603 trim = 1; 603 trim = 1;
604 /* 604 /*
605 * Shift by 31 here so that we don't get larger than 605 * Shift by 31 here so that we don't get larger than
606 * MAX_LFS_FILESIZE 606 * MAX_LFS_FILESIZE
607 */ 607 */
608 bitshift = 31; 608 bitshift = 31;
609 } 609 }
610 # else 610 # else
611 /* 611 /*
612 * We are limited by the size of sector_t. Use block size, as 612 * We are limited by the size of sector_t. Use block size, as
613 * that's what we expose to the VFS. 613 * that's what we expose to the VFS.
614 */ 614 */
615 bytes = 1 << bbits; 615 bytes = 1 << bbits;
616 trim = 1; 616 trim = 1;
617 bitshift = 31; 617 bitshift = 31;
618 # endif 618 # endif
619 #endif 619 #endif
620 620
621 /* 621 /*
622 * Trim by a whole cluster when we can actually approach the 622 * Trim by a whole cluster when we can actually approach the
623 * on-disk limits. Otherwise we can overflow i_clusters when 623 * on-disk limits. Otherwise we can overflow i_clusters when
624 * an extent start is at the max offset. 624 * an extent start is at the max offset.
625 */ 625 */
626 return (((unsigned long long)bytes) << bitshift) - trim; 626 return (((unsigned long long)bytes) << bitshift) - trim;
627 } 627 }
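Worked example: with 4 KB clusters (cbits == 12) on a 64-bit kernel, none of the #if branches apply, so bytes = 4096, bitshift = 32 and trim = 4096, and the function returns (4096ULL << 32) - 4096 = 2^44 - 2^12 bytes, i.e. 16 TiB minus one cluster. On 32-bit kernels the #if block above trims the shift to 31 when the cluster size exceeds the page cache page size (or, without CONFIG_LBDAF, unconditionally falls back to the block size), keeping the result within MAX_LFS_FILESIZE and the range of sector_t.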
628 628
629 static int ocfs2_remount(struct super_block *sb, int *flags, char *data) 629 static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
630 { 630 {
631 int incompat_features; 631 int incompat_features;
632 int ret = 0; 632 int ret = 0;
633 struct mount_options parsed_options; 633 struct mount_options parsed_options;
634 struct ocfs2_super *osb = OCFS2_SB(sb); 634 struct ocfs2_super *osb = OCFS2_SB(sb);
635 u32 tmp; 635 u32 tmp;
636 636
637 sync_filesystem(sb); 637 sync_filesystem(sb);
638 638
639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
640 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
641 ret = -EINVAL; 641 ret = -EINVAL;
642 goto out; 642 goto out;
643 } 643 }
644 644
645 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | 645 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
646 OCFS2_MOUNT_HB_NONE; 646 OCFS2_MOUNT_HB_NONE;
647 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { 647 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
648 ret = -EINVAL; 648 ret = -EINVAL;
649 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 649 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
650 goto out; 650 goto out;
651 } 651 }
652 652
653 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != 653 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
654 (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { 654 (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) {
655 ret = -EINVAL; 655 ret = -EINVAL;
656 mlog(ML_ERROR, "Cannot change data mode on remount\n"); 656 mlog(ML_ERROR, "Cannot change data mode on remount\n");
657 goto out; 657 goto out;
658 } 658 }
659 659
660 /* Probably don't want this on remount; it might 660 /* Probably don't want this on remount; it might
661 * mess with other nodes */ 661 * mess with other nodes */
662 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && 662 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
663 (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { 663 (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
664 ret = -EINVAL; 664 ret = -EINVAL;
665 mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); 665 mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
666 goto out; 666 goto out;
667 } 667 }
668 668
669 /* We're going to/from readonly mode. */ 669 /* We're going to/from readonly mode. */
670 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 670 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
671 /* Disable quota accounting before remounting RO */ 671 /* Disable quota accounting before remounting RO */
672 if (*flags & MS_RDONLY) { 672 if (*flags & MS_RDONLY) {
673 ret = ocfs2_susp_quotas(osb, 0); 673 ret = ocfs2_susp_quotas(osb, 0);
674 if (ret < 0) 674 if (ret < 0)
675 goto out; 675 goto out;
676 } 676 }
677 /* Lock here so the check of HARD_RO and the potential 677 /* Lock here so the check of HARD_RO and the potential
678 * setting of SOFT_RO is atomic. */ 678 * setting of SOFT_RO is atomic. */
679 spin_lock(&osb->osb_lock); 679 spin_lock(&osb->osb_lock);
680 if (osb->osb_flags & OCFS2_OSB_HARD_RO) { 680 if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
681 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); 681 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
682 ret = -EROFS; 682 ret = -EROFS;
683 goto unlock_osb; 683 goto unlock_osb;
684 } 684 }
685 685
686 if (*flags & MS_RDONLY) { 686 if (*flags & MS_RDONLY) {
687 sb->s_flags |= MS_RDONLY; 687 sb->s_flags |= MS_RDONLY;
688 osb->osb_flags |= OCFS2_OSB_SOFT_RO; 688 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
689 } else { 689 } else {
690 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { 690 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
691 mlog(ML_ERROR, "Cannot remount RDWR " 691 mlog(ML_ERROR, "Cannot remount RDWR "
692 "filesystem due to previous errors.\n"); 692 "filesystem due to previous errors.\n");
693 ret = -EROFS; 693 ret = -EROFS;
694 goto unlock_osb; 694 goto unlock_osb;
695 } 695 }
696 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); 696 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
697 if (incompat_features) { 697 if (incompat_features) {
698 mlog(ML_ERROR, "Cannot remount RDWR because " 698 mlog(ML_ERROR, "Cannot remount RDWR because "
699 "of unsupported optional features " 699 "of unsupported optional features "
700 "(%x).\n", incompat_features); 700 "(%x).\n", incompat_features);
701 ret = -EINVAL; 701 ret = -EINVAL;
702 goto unlock_osb; 702 goto unlock_osb;
703 } 703 }
704 sb->s_flags &= ~MS_RDONLY; 704 sb->s_flags &= ~MS_RDONLY;
705 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; 705 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
706 } 706 }
707 trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); 707 trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
708 unlock_osb: 708 unlock_osb:
709 spin_unlock(&osb->osb_lock); 709 spin_unlock(&osb->osb_lock);
710 /* Enable quota accounting after remounting RW */ 710 /* Enable quota accounting after remounting RW */
711 if (!ret && !(*flags & MS_RDONLY)) { 711 if (!ret && !(*flags & MS_RDONLY)) {
712 if (sb_any_quota_suspended(sb)) 712 if (sb_any_quota_suspended(sb))
713 ret = ocfs2_susp_quotas(osb, 1); 713 ret = ocfs2_susp_quotas(osb, 1);
714 else 714 else
715 ret = ocfs2_enable_quotas(osb); 715 ret = ocfs2_enable_quotas(osb);
716 if (ret < 0) { 716 if (ret < 0) {
717 /* Roll back the changes... */ 717 /* Roll back the changes... */
718 spin_lock(&osb->osb_lock); 718 spin_lock(&osb->osb_lock);
719 sb->s_flags |= MS_RDONLY; 719 sb->s_flags |= MS_RDONLY;
720 osb->osb_flags |= OCFS2_OSB_SOFT_RO; 720 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
721 spin_unlock(&osb->osb_lock); 721 spin_unlock(&osb->osb_lock);
722 goto out; 722 goto out;
723 } 723 }
724 } 724 }
725 } 725 }
726 726
727 if (!ret) { 727 if (!ret) {
728 /* Only save off the new mount options in case of a successful 728 /* Only save off the new mount options in case of a successful
729 * remount. */ 729 * remount. */
730 osb->s_mount_opt = parsed_options.mount_opt; 730 osb->s_mount_opt = parsed_options.mount_opt;
731 osb->s_atime_quantum = parsed_options.atime_quantum; 731 osb->s_atime_quantum = parsed_options.atime_quantum;
732 osb->preferred_slot = parsed_options.slot; 732 osb->preferred_slot = parsed_options.slot;
733 if (parsed_options.commit_interval) 733 if (parsed_options.commit_interval)
734 osb->osb_commit_interval = parsed_options.commit_interval; 734 osb->osb_commit_interval = parsed_options.commit_interval;
735 735
736 if (!ocfs2_is_hard_readonly(osb)) 736 if (!ocfs2_is_hard_readonly(osb))
737 ocfs2_set_journal_params(osb); 737 ocfs2_set_journal_params(osb);
738 738
739 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 739 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
740 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? 740 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
741 MS_POSIXACL : 0); 741 MS_POSIXACL : 0);
742 } 742 }
743 out: 743 out:
744 return ret; 744 return ret;
745 } 745 }
746 746
747 static int ocfs2_sb_probe(struct super_block *sb, 747 static int ocfs2_sb_probe(struct super_block *sb,
748 struct buffer_head **bh, 748 struct buffer_head **bh,
749 int *sector_size, 749 int *sector_size,
750 struct ocfs2_blockcheck_stats *stats) 750 struct ocfs2_blockcheck_stats *stats)
751 { 751 {
752 int status, tmpstat; 752 int status, tmpstat;
753 struct ocfs1_vol_disk_hdr *hdr; 753 struct ocfs1_vol_disk_hdr *hdr;
754 struct ocfs2_dinode *di; 754 struct ocfs2_dinode *di;
755 int blksize; 755 int blksize;
756 756
757 *bh = NULL; 757 *bh = NULL;
758 758
759 /* may be > 512 */ 759 /* may be > 512 */
760 *sector_size = bdev_logical_block_size(sb->s_bdev); 760 *sector_size = bdev_logical_block_size(sb->s_bdev);
761 if (*sector_size > OCFS2_MAX_BLOCKSIZE) { 761 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
762 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", 762 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
763 *sector_size, OCFS2_MAX_BLOCKSIZE); 763 *sector_size, OCFS2_MAX_BLOCKSIZE);
764 status = -EINVAL; 764 status = -EINVAL;
765 goto bail; 765 goto bail;
766 } 766 }
767 767
768 /* Can this really happen? */ 768 /* Can this really happen? */
769 if (*sector_size < OCFS2_MIN_BLOCKSIZE) 769 if (*sector_size < OCFS2_MIN_BLOCKSIZE)
770 *sector_size = OCFS2_MIN_BLOCKSIZE; 770 *sector_size = OCFS2_MIN_BLOCKSIZE;
771 771
772 /* check block zero for old format */ 772 /* check block zero for old format */
773 status = ocfs2_get_sector(sb, bh, 0, *sector_size); 773 status = ocfs2_get_sector(sb, bh, 0, *sector_size);
774 if (status < 0) { 774 if (status < 0) {
775 mlog_errno(status); 775 mlog_errno(status);
776 goto bail; 776 goto bail;
777 } 777 }
778 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; 778 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
779 if (hdr->major_version == OCFS1_MAJOR_VERSION) { 779 if (hdr->major_version == OCFS1_MAJOR_VERSION) {
780 mlog(ML_ERROR, "incompatible version: %u.%u\n", 780 mlog(ML_ERROR, "incompatible version: %u.%u\n",
781 hdr->major_version, hdr->minor_version); 781 hdr->major_version, hdr->minor_version);
782 status = -EINVAL; 782 status = -EINVAL;
783 } 783 }
784 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, 784 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
785 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { 785 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
786 mlog(ML_ERROR, "incompatible volume signature: %8s\n", 786 mlog(ML_ERROR, "incompatible volume signature: %8s\n",
787 hdr->signature); 787 hdr->signature);
788 status = -EINVAL; 788 status = -EINVAL;
789 } 789 }
790 brelse(*bh); 790 brelse(*bh);
791 *bh = NULL; 791 *bh = NULL;
792 if (status < 0) { 792 if (status < 0) {
793 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " 793 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
794 "upgraded before mounting with ocfs v2\n"); 794 "upgraded before mounting with ocfs v2\n");
795 goto bail; 795 goto bail;
796 } 796 }
797 797
798 /* 798 /*
799 * Now check at magic offset for 512, 1024, 2048, 4096 799 * Now check at magic offset for 512, 1024, 2048, 4096
800 * blocksizes. 4096 is the maximum blocksize because it is 800 * blocksizes. 4096 is the maximum blocksize because it is
801 * the minimum clustersize. 801 * the minimum clustersize.
802 */ 802 */
803 status = -EINVAL; 803 status = -EINVAL;
804 for (blksize = *sector_size; 804 for (blksize = *sector_size;
805 blksize <= OCFS2_MAX_BLOCKSIZE; 805 blksize <= OCFS2_MAX_BLOCKSIZE;
806 blksize <<= 1) { 806 blksize <<= 1) {
807 tmpstat = ocfs2_get_sector(sb, bh, 807 tmpstat = ocfs2_get_sector(sb, bh,
808 OCFS2_SUPER_BLOCK_BLKNO, 808 OCFS2_SUPER_BLOCK_BLKNO,
809 blksize); 809 blksize);
810 if (tmpstat < 0) { 810 if (tmpstat < 0) {
811 status = tmpstat; 811 status = tmpstat;
812 mlog_errno(status); 812 mlog_errno(status);
813 break; 813 break;
814 } 814 }
815 di = (struct ocfs2_dinode *) (*bh)->b_data; 815 di = (struct ocfs2_dinode *) (*bh)->b_data;
816 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); 816 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
817 spin_lock_init(&stats->b_lock); 817 spin_lock_init(&stats->b_lock);
818 tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); 818 tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
819 if (tmpstat < 0) { 819 if (tmpstat < 0) {
820 brelse(*bh); 820 brelse(*bh);
821 *bh = NULL; 821 *bh = NULL;
822 } 822 }
823 if (tmpstat != -EAGAIN) { 823 if (tmpstat != -EAGAIN) {
824 status = tmpstat; 824 status = tmpstat;
825 break; 825 break;
826 } 826 }
827 } 827 }
828 828
829 bail: 829 bail:
830 return status; 830 return status;
831 } 831 }
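The probe strategy above: block 0 is read at the device's logical sector size and rejected if it carries an old OCFS (v1) signature or major version, then the superblock block (OCFS2_SUPER_BLOCK_BLKNO) is re-read at every candidate block size from the sector size up to OCFS2_MAX_BLOCKSIZE, doubling each time. A return of -EAGAIN from ocfs2_verify_volume() is treated as "no valid ocfs2 superblock at this block size, try the next one"; any other value, success or a hard error, ends the loop and becomes the probe result, with *bh left pointing at the verified superblock buffer on success.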
832 832
833 static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 833 static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
834 { 834 {
835 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; 835 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
836 836
837 if (osb->s_mount_opt & hb_enabled) { 837 if (osb->s_mount_opt & hb_enabled) {
838 if (ocfs2_mount_local(osb)) { 838 if (ocfs2_mount_local(osb)) {
839 mlog(ML_ERROR, "Cannot heartbeat on a locally " 839 mlog(ML_ERROR, "Cannot heartbeat on a locally "
840 "mounted device.\n"); 840 "mounted device.\n");
841 return -EINVAL; 841 return -EINVAL;
842 } 842 }
843 if (ocfs2_userspace_stack(osb)) { 843 if (ocfs2_userspace_stack(osb)) {
844 mlog(ML_ERROR, "Userspace stack expected, but " 844 mlog(ML_ERROR, "Userspace stack expected, but "
845 "o2cb heartbeat arguments passed to mount\n"); 845 "o2cb heartbeat arguments passed to mount\n");
846 return -EINVAL; 846 return -EINVAL;
847 } 847 }
848 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && 848 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
849 !ocfs2_cluster_o2cb_global_heartbeat(osb)) || 849 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
850 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && 850 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
851 ocfs2_cluster_o2cb_global_heartbeat(osb))) { 851 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
852 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); 852 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
853 return -EINVAL; 853 return -EINVAL;
854 } 854 }
855 } 855 }
856 856
857 if (!(osb->s_mount_opt & hb_enabled)) { 857 if (!(osb->s_mount_opt & hb_enabled)) {
858 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 858 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
859 !ocfs2_userspace_stack(osb)) { 859 !ocfs2_userspace_stack(osb)) {
860 mlog(ML_ERROR, "Heartbeat has to be started to mount " 860 mlog(ML_ERROR, "Heartbeat has to be started to mount "
861 "a read-write clustered device.\n"); 861 "a read-write clustered device.\n");
862 return -EINVAL; 862 return -EINVAL;
863 } 863 }
864 } 864 }
865 865
866 return 0; 866 return 0;
867 } 867 }
868 868
869 /* 869 /*
870 * If we're using a userspace stack, mount should have passed 870 * If we're using a userspace stack, mount should have passed
871 * a name that matches the disk. If not, mount should not 871 * a name that matches the disk. If not, mount should not
872 * have passed a stack. 872 * have passed a stack.
873 */ 873 */
874 static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, 874 static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
875 struct mount_options *mopt) 875 struct mount_options *mopt)
876 { 876 {
877 if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { 877 if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
878 mlog(ML_ERROR, 878 mlog(ML_ERROR,
879 "cluster stack passed to mount, but this filesystem " 879 "cluster stack passed to mount, but this filesystem "
880 "does not support it\n"); 880 "does not support it\n");
881 return -EINVAL; 881 return -EINVAL;
882 } 882 }
883 883
884 if (ocfs2_userspace_stack(osb) && 884 if (ocfs2_userspace_stack(osb) &&
885 strncmp(osb->osb_cluster_stack, mopt->cluster_stack, 885 strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
886 OCFS2_STACK_LABEL_LEN)) { 886 OCFS2_STACK_LABEL_LEN)) {
887 mlog(ML_ERROR, 887 mlog(ML_ERROR,
888 "cluster stack passed to mount (\"%s\") does not " 888 "cluster stack passed to mount (\"%s\") does not "
889 "match the filesystem (\"%s\")\n", 889 "match the filesystem (\"%s\")\n",
890 mopt->cluster_stack, 890 mopt->cluster_stack,
891 osb->osb_cluster_stack); 891 osb->osb_cluster_stack);
892 return -EINVAL; 892 return -EINVAL;
893 } 893 }
894 894
895 return 0; 895 return 0;
896 } 896 }
897 897
898 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) 898 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
899 { 899 {
900 int type; 900 int type;
901 struct super_block *sb = osb->sb; 901 struct super_block *sb = osb->sb;
902 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 902 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
903 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 903 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
904 int status = 0; 904 int status = 0;
905 905
906 for (type = 0; type < MAXQUOTAS; type++) { 906 for (type = 0; type < MAXQUOTAS; type++) {
907 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 907 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
908 continue; 908 continue;
909 if (unsuspend) 909 if (unsuspend)
910 status = dquot_resume(sb, type); 910 status = dquot_resume(sb, type);
911 else { 911 else {
912 struct ocfs2_mem_dqinfo *oinfo; 912 struct ocfs2_mem_dqinfo *oinfo;
913 913
914 /* Cancel periodic syncing before suspending */ 914 /* Cancel periodic syncing before suspending */
915 oinfo = sb_dqinfo(sb, type)->dqi_priv; 915 oinfo = sb_dqinfo(sb, type)->dqi_priv;
916 cancel_delayed_work_sync(&oinfo->dqi_sync_work); 916 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
917 status = dquot_suspend(sb, type); 917 status = dquot_suspend(sb, type);
918 } 918 }
919 if (status < 0) 919 if (status < 0)
920 break; 920 break;
921 } 921 }
922 if (status < 0) 922 if (status < 0)
923 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " 923 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
924 "remount (error = %d).\n", status); 924 "remount (error = %d).\n", status);
925 return status; 925 return status;
926 } 926 }
927 927
928 static int ocfs2_enable_quotas(struct ocfs2_super *osb) 928 static int ocfs2_enable_quotas(struct ocfs2_super *osb)
929 { 929 {
930 struct inode *inode[MAXQUOTAS] = { NULL, NULL }; 930 struct inode *inode[MAXQUOTAS] = { NULL, NULL };
931 struct super_block *sb = osb->sb; 931 struct super_block *sb = osb->sb;
932 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 932 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
933 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 933 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
934 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 934 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
935 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 935 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
936 int status; 936 int status;
937 int type; 937 int type;
938 938
939 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; 939 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
940 for (type = 0; type < MAXQUOTAS; type++) { 940 for (type = 0; type < MAXQUOTAS; type++) {
941 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 941 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
942 continue; 942 continue;
943 inode[type] = ocfs2_get_system_file_inode(osb, ino[type], 943 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
944 osb->slot_num); 944 osb->slot_num);
945 if (!inode[type]) { 945 if (!inode[type]) {
946 status = -ENOENT; 946 status = -ENOENT;
947 goto out_quota_off; 947 goto out_quota_off;
948 } 948 }
949 status = dquot_enable(inode[type], type, QFMT_OCFS2, 949 status = dquot_enable(inode[type], type, QFMT_OCFS2,
950 DQUOT_USAGE_ENABLED); 950 DQUOT_USAGE_ENABLED);
951 if (status < 0) 951 if (status < 0)
952 goto out_quota_off; 952 goto out_quota_off;
953 } 953 }
954 954
955 for (type = 0; type < MAXQUOTAS; type++) 955 for (type = 0; type < MAXQUOTAS; type++)
956 iput(inode[type]); 956 iput(inode[type]);
957 return 0; 957 return 0;
958 out_quota_off: 958 out_quota_off:
959 ocfs2_disable_quotas(osb); 959 ocfs2_disable_quotas(osb);
960 for (type = 0; type < MAXQUOTAS; type++) 960 for (type = 0; type < MAXQUOTAS; type++)
961 iput(inode[type]); 961 iput(inode[type]);
962 mlog_errno(status); 962 mlog_errno(status);
963 return status; 963 return status;
964 } 964 }
965 965
966 static void ocfs2_disable_quotas(struct ocfs2_super *osb) 966 static void ocfs2_disable_quotas(struct ocfs2_super *osb)
967 { 967 {
968 int type; 968 int type;
969 struct inode *inode; 969 struct inode *inode;
970 struct super_block *sb = osb->sb; 970 struct super_block *sb = osb->sb;
971 struct ocfs2_mem_dqinfo *oinfo; 971 struct ocfs2_mem_dqinfo *oinfo;
972 972
973 /* We mostly ignore errors in this function because there's not much 973 /* We mostly ignore errors in this function because there's not much
974 * we can do when we see them */ 974 * we can do when we see them */
975 for (type = 0; type < MAXQUOTAS; type++) { 975 for (type = 0; type < MAXQUOTAS; type++) {
976 if (!sb_has_quota_loaded(sb, type)) 976 if (!sb_has_quota_loaded(sb, type))
977 continue; 977 continue;
978 /* Cancel periodic syncing before we grab dqonoff_mutex */ 978 /* Cancel periodic syncing before we grab dqonoff_mutex */
979 oinfo = sb_dqinfo(sb, type)->dqi_priv; 979 oinfo = sb_dqinfo(sb, type)->dqi_priv;
980 cancel_delayed_work_sync(&oinfo->dqi_sync_work); 980 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
981 inode = igrab(sb->s_dquot.files[type]); 981 inode = igrab(sb->s_dquot.files[type]);
982 /* Turn off quotas. This will remove all dquot structures from 982 /* Turn off quotas. This will remove all dquot structures from
983 * memory and so they will be automatically synced to global 983 * memory and so they will be automatically synced to global
984 * quota files */ 984 * quota files */
985 dquot_disable(sb, type, DQUOT_USAGE_ENABLED | 985 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
986 DQUOT_LIMITS_ENABLED); 986 DQUOT_LIMITS_ENABLED);
987 if (!inode) 987 if (!inode)
988 continue; 988 continue;
989 iput(inode); 989 iput(inode);
990 } 990 }
991 } 991 }
992 992
993 /* Handle quota on quotactl */ 993 /* Handle quota on quotactl */
994 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) 994 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
995 { 995 {
996 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 996 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
997 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 997 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
998 998
999 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 999 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1000 return -EINVAL; 1000 return -EINVAL;
1001 1001
1002 return dquot_enable(sb_dqopt(sb)->files[type], type, 1002 return dquot_enable(sb_dqopt(sb)->files[type], type,
1003 format_id, DQUOT_LIMITS_ENABLED); 1003 format_id, DQUOT_LIMITS_ENABLED);
1004 } 1004 }
1005 1005
1006 /* Handle quota off quotactl */ 1006 /* Handle quota off quotactl */
1007 static int ocfs2_quota_off(struct super_block *sb, int type) 1007 static int ocfs2_quota_off(struct super_block *sb, int type)
1008 { 1008 {
1009 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); 1009 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
1010 } 1010 }
1011 1011
1012 static const struct quotactl_ops ocfs2_quotactl_ops = { 1012 static const struct quotactl_ops ocfs2_quotactl_ops = {
1013 .quota_on_meta = ocfs2_quota_on, 1013 .quota_on_meta = ocfs2_quota_on,
1014 .quota_off = ocfs2_quota_off, 1014 .quota_off = ocfs2_quota_off,
1015 .quota_sync = dquot_quota_sync, 1015 .quota_sync = dquot_quota_sync,
1016 .get_info = dquot_get_dqinfo, 1016 .get_info = dquot_get_dqinfo,
1017 .set_info = dquot_set_dqinfo, 1017 .set_info = dquot_set_dqinfo,
1018 .get_dqblk = dquot_get_dqblk, 1018 .get_dqblk = dquot_get_dqblk,
1019 .set_dqblk = dquot_set_dqblk, 1019 .set_dqblk = dquot_set_dqblk,
1020 }; 1020 };
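Because DQUOT_QUOTA_SYS_FILE is set and the quota files are ocfs2 system inodes, usage accounting is switched on at mount time (ocfs2_enable_quotas() enables DQUOT_USAGE_ENABLED) whenever the corresponding RO-compat feature bit is present. The quotactl hooks above therefore only toggle limit enforcement, which is why ocfs2_quota_on() and ocfs2_quota_off() pass DQUOT_LIMITS_ENABLED instead of disabling usage tracking.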
1021 1021
1022 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 1022 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1023 { 1023 {
1024 struct dentry *root; 1024 struct dentry *root;
1025 int status, sector_size; 1025 int status, sector_size;
1026 struct mount_options parsed_options; 1026 struct mount_options parsed_options;
1027 struct inode *inode = NULL; 1027 struct inode *inode = NULL;
1028 struct ocfs2_super *osb = NULL; 1028 struct ocfs2_super *osb = NULL;
1029 struct buffer_head *bh = NULL; 1029 struct buffer_head *bh = NULL;
1030 char nodestr[12]; 1030 char nodestr[12];
1031 struct ocfs2_blockcheck_stats stats; 1031 struct ocfs2_blockcheck_stats stats;
1032 1032
1033 trace_ocfs2_fill_super(sb, data, silent); 1033 trace_ocfs2_fill_super(sb, data, silent);
1034 1034
1035 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { 1035 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
1036 status = -EINVAL; 1036 status = -EINVAL;
1037 goto read_super_error; 1037 goto read_super_error;
1038 } 1038 }
1039 1039
1040 /* probe for superblock */ 1040 /* probe for superblock */
1041 status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); 1041 status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
1042 if (status < 0) { 1042 if (status < 0) {
1043 mlog(ML_ERROR, "superblock probe failed!\n"); 1043 mlog(ML_ERROR, "superblock probe failed!\n");
1044 goto read_super_error; 1044 goto read_super_error;
1045 } 1045 }
1046 1046
1047 status = ocfs2_initialize_super(sb, bh, sector_size, &stats); 1047 status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
1048 osb = OCFS2_SB(sb); 1048 osb = OCFS2_SB(sb);
1049 if (status < 0) { 1049 if (status < 0) {
1050 mlog_errno(status); 1050 mlog_errno(status);
1051 goto read_super_error; 1051 goto read_super_error;
1052 } 1052 }
1053 brelse(bh); 1053 brelse(bh);
1054 bh = NULL; 1054 bh = NULL;
1055 1055
1056 if (!ocfs2_check_set_options(sb, &parsed_options)) { 1056 if (!ocfs2_check_set_options(sb, &parsed_options)) {
1057 status = -EINVAL; 1057 status = -EINVAL;
1058 goto read_super_error; 1058 goto read_super_error;
1059 } 1059 }
1060 osb->s_mount_opt = parsed_options.mount_opt; 1060 osb->s_mount_opt = parsed_options.mount_opt;
1061 osb->s_atime_quantum = parsed_options.atime_quantum; 1061 osb->s_atime_quantum = parsed_options.atime_quantum;
1062 osb->preferred_slot = parsed_options.slot; 1062 osb->preferred_slot = parsed_options.slot;
1063 osb->osb_commit_interval = parsed_options.commit_interval; 1063 osb->osb_commit_interval = parsed_options.commit_interval;
1064 1064
1065 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); 1065 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1066 osb->osb_resv_level = parsed_options.resv_level; 1066 osb->osb_resv_level = parsed_options.resv_level;
1067 osb->osb_dir_resv_level = parsed_options.resv_level; 1067 osb->osb_dir_resv_level = parsed_options.resv_level;
1068 if (parsed_options.dir_resv_level == -1) 1068 if (parsed_options.dir_resv_level == -1)
1069 osb->osb_dir_resv_level = parsed_options.resv_level; 1069 osb->osb_dir_resv_level = parsed_options.resv_level;
1070 else 1070 else
1071 osb->osb_dir_resv_level = parsed_options.dir_resv_level; 1071 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1072 1072
1073 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1073 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1074 if (status) 1074 if (status)
1075 goto read_super_error; 1075 goto read_super_error;
1076 1076
1077 sb->s_magic = OCFS2_SUPER_MAGIC; 1077 sb->s_magic = OCFS2_SUPER_MAGIC;
1078 1078
1079 sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | 1079 sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
1080 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1080 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1081 1081
1082 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 1082 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
1083 * heartbeat=none */ 1083 * heartbeat=none */
1084 if (bdev_read_only(sb->s_bdev)) { 1084 if (bdev_read_only(sb->s_bdev)) {
1085 if (!(sb->s_flags & MS_RDONLY)) { 1085 if (!(sb->s_flags & MS_RDONLY)) {
1086 status = -EACCES; 1086 status = -EACCES;
1087 mlog(ML_ERROR, "Readonly device detected but readonly " 1087 mlog(ML_ERROR, "Readonly device detected but readonly "
1088 "mount was not specified.\n"); 1088 "mount was not specified.\n");
1089 goto read_super_error; 1089 goto read_super_error;
1090 } 1090 }
1091 1091
1092 /* You should not be able to start a local heartbeat 1092 /* You should not be able to start a local heartbeat
1093 * on a readonly device. */ 1093 * on a readonly device. */
1094 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 1094 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
1095 status = -EROFS; 1095 status = -EROFS;
1096 mlog(ML_ERROR, "Local heartbeat specified on readonly " 1096 mlog(ML_ERROR, "Local heartbeat specified on readonly "
1097 "device.\n"); 1097 "device.\n");
1098 goto read_super_error; 1098 goto read_super_error;
1099 } 1099 }
1100 1100
1101 status = ocfs2_check_journals_nolocks(osb); 1101 status = ocfs2_check_journals_nolocks(osb);
1102 if (status < 0) { 1102 if (status < 0) {
1103 if (status == -EROFS) 1103 if (status == -EROFS)
1104 mlog(ML_ERROR, "Recovery required on readonly " 1104 mlog(ML_ERROR, "Recovery required on readonly "
1105 "file system, but write access is " 1105 "file system, but write access is "
1106 "unavailable.\n"); 1106 "unavailable.\n");
1107 else 1107 else
1108 mlog_errno(status); 1108 mlog_errno(status);
1109 goto read_super_error; 1109 goto read_super_error;
1110 } 1110 }
1111 1111
1112 ocfs2_set_ro_flag(osb, 1); 1112 ocfs2_set_ro_flag(osb, 1);
1113 1113
1114 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " 1114 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
1115 "Cluster services will not be used for this mount. " 1115 "Cluster services will not be used for this mount. "
1116 "Recovery will be skipped.\n", osb->dev_str); 1116 "Recovery will be skipped.\n", osb->dev_str);
1117 } 1117 }
1118 1118
1119 if (!ocfs2_is_hard_readonly(osb)) { 1119 if (!ocfs2_is_hard_readonly(osb)) {
1120 if (sb->s_flags & MS_RDONLY) 1120 if (sb->s_flags & MS_RDONLY)
1121 ocfs2_set_ro_flag(osb, 0); 1121 ocfs2_set_ro_flag(osb, 0);
1122 } 1122 }
1123 1123
1124 status = ocfs2_verify_heartbeat(osb); 1124 status = ocfs2_verify_heartbeat(osb);
1125 if (status < 0) { 1125 if (status < 0) {
1126 mlog_errno(status); 1126 mlog_errno(status);
1127 goto read_super_error; 1127 goto read_super_error;
1128 } 1128 }
1129 1129
1130 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1130 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
1131 ocfs2_debugfs_root); 1131 ocfs2_debugfs_root);
1132 if (!osb->osb_debug_root) { 1132 if (!osb->osb_debug_root) {
1133 status = -EINVAL; 1133 status = -EINVAL;
1134 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); 1134 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
1135 goto read_super_error; 1135 goto read_super_error;
1136 } 1136 }
1137 1137
1138 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, 1138 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1139 osb->osb_debug_root, 1139 osb->osb_debug_root,
1140 osb, 1140 osb,
1141 &ocfs2_osb_debug_fops); 1141 &ocfs2_osb_debug_fops);
1142 if (!osb->osb_ctxt) { 1142 if (!osb->osb_ctxt) {
1143 status = -EINVAL; 1143 status = -EINVAL;
1144 mlog_errno(status); 1144 mlog_errno(status);
1145 goto read_super_error; 1145 goto read_super_error;
1146 } 1146 }
1147 1147
1148 if (ocfs2_meta_ecc(osb)) { 1148 if (ocfs2_meta_ecc(osb)) {
1149 status = ocfs2_blockcheck_stats_debugfs_install( 1149 status = ocfs2_blockcheck_stats_debugfs_install(
1150 &osb->osb_ecc_stats, 1150 &osb->osb_ecc_stats,
1151 osb->osb_debug_root); 1151 osb->osb_debug_root);
1152 if (status) { 1152 if (status) {
1153 mlog(ML_ERROR, 1153 mlog(ML_ERROR,
1154 "Unable to create blockcheck statistics " 1154 "Unable to create blockcheck statistics "
1155 "files\n"); 1155 "files\n");
1156 goto read_super_error; 1156 goto read_super_error;
1157 } 1157 }
1158 } 1158 }
1159 1159
1160 status = ocfs2_mount_volume(sb); 1160 status = ocfs2_mount_volume(sb);
1161 if (status < 0) 1161 if (status < 0)
1162 goto read_super_error; 1162 goto read_super_error;
1163 1163
1164 if (osb->root_inode) 1164 if (osb->root_inode)
1165 inode = igrab(osb->root_inode); 1165 inode = igrab(osb->root_inode);
1166 1166
1167 if (!inode) { 1167 if (!inode) {
1168 status = -EIO; 1168 status = -EIO;
1169 mlog_errno(status); 1169 mlog_errno(status);
1170 goto read_super_error; 1170 goto read_super_error;
1171 } 1171 }
1172 1172
1173 root = d_make_root(inode); 1173 root = d_make_root(inode);
1174 if (!root) { 1174 if (!root) {
1175 status = -ENOMEM; 1175 status = -ENOMEM;
1176 mlog_errno(status); 1176 mlog_errno(status);
1177 goto read_super_error; 1177 goto read_super_error;
1178 } 1178 }
1179 1179
1180 sb->s_root = root; 1180 sb->s_root = root;
1181 1181
1182 ocfs2_complete_mount_recovery(osb); 1182 ocfs2_complete_mount_recovery(osb);
1183 1183
1184 if (ocfs2_mount_local(osb)) 1184 if (ocfs2_mount_local(osb))
1185 snprintf(nodestr, sizeof(nodestr), "local"); 1185 snprintf(nodestr, sizeof(nodestr), "local");
1186 else 1186 else
1187 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); 1187 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
1188 1188
1189 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " 1189 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
1190 "with %s data mode.\n", 1190 "with %s data mode.\n",
1191 osb->dev_str, nodestr, osb->slot_num, 1191 osb->dev_str, nodestr, osb->slot_num,
1192 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : 1192 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
1193 "ordered"); 1193 "ordered");
1194 1194
1195 atomic_set(&osb->vol_state, VOLUME_MOUNTED); 1195 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
1196 wake_up(&osb->osb_mount_event); 1196 wake_up(&osb->osb_mount_event);
1197 1197
1198 /* Now we can initialize quotas, because at this point we can afford 1198 /* Now we can initialize quotas, because at this point we can afford
1199 * to wait for cluster lock recovery. That also means truncate log 1199 * to wait for cluster lock recovery. That also means truncate log
1200 * recovery can happen, but it waits for proper quota setup */ 1200 * recovery can happen, but it waits for proper quota setup */
1201 if (!(sb->s_flags & MS_RDONLY)) { 1201 if (!(sb->s_flags & MS_RDONLY)) {
1202 status = ocfs2_enable_quotas(osb); 1202 status = ocfs2_enable_quotas(osb);
1203 if (status < 0) { 1203 if (status < 0) {
1204 /* We have to err-out specially here because 1204 /* We have to err-out specially here because
1205 * s_root is already set */ 1205 * s_root is already set */
1206 mlog_errno(status); 1206 mlog_errno(status);
1207 atomic_set(&osb->vol_state, VOLUME_DISABLED); 1207 atomic_set(&osb->vol_state, VOLUME_DISABLED);
1208 wake_up(&osb->osb_mount_event); 1208 wake_up(&osb->osb_mount_event);
1209 return status; 1209 return status;
1210 } 1210 }
1211 } 1211 }
1212 1212
1213 ocfs2_complete_quota_recovery(osb); 1213 ocfs2_complete_quota_recovery(osb);
1214 1214
1215 /* Now we wake up again for processes waiting for quotas */ 1215 /* Now we wake up again for processes waiting for quotas */
1216 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); 1216 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
1217 wake_up(&osb->osb_mount_event); 1217 wake_up(&osb->osb_mount_event);
1218 1218
1219 /* Start this when the mount is almost sure of being successful */ 1219 /* Start this when the mount is almost sure of being successful */
1220 ocfs2_orphan_scan_start(osb); 1220 ocfs2_orphan_scan_start(osb);
1221 1221
1222 return status; 1222 return status;
1223 1223
1224 read_super_error: 1224 read_super_error:
1225 brelse(bh); 1225 brelse(bh);
1226 1226
1227 if (osb) { 1227 if (osb) {
1228 atomic_set(&osb->vol_state, VOLUME_DISABLED); 1228 atomic_set(&osb->vol_state, VOLUME_DISABLED);
1229 wake_up(&osb->osb_mount_event); 1229 wake_up(&osb->osb_mount_event);
1230 ocfs2_dismount_volume(sb, 1); 1230 ocfs2_dismount_volume(sb, 1);
1231 } 1231 }
1232 1232
1233 if (status) 1233 if (status)
1234 mlog_errno(status); 1234 mlog_errno(status);
1235 return status; 1235 return status;
1236 } 1236 }
1237 1237
1238 static struct dentry *ocfs2_mount(struct file_system_type *fs_type, 1238 static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1239 int flags, 1239 int flags,
1240 const char *dev_name, 1240 const char *dev_name,
1241 void *data) 1241 void *data)
1242 { 1242 {
1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); 1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1244 } 1244 }
1245 1245
1246 static struct file_system_type ocfs2_fs_type = { 1246 static struct file_system_type ocfs2_fs_type = {
1247 .owner = THIS_MODULE, 1247 .owner = THIS_MODULE,
1248 .name = "ocfs2", 1248 .name = "ocfs2",
1249 .mount = ocfs2_mount, 1249 .mount = ocfs2_mount,
1250 .kill_sb = kill_block_super, 1250 .kill_sb = kill_block_super,
1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1252 .next = NULL 1252 .next = NULL
1253 }; 1253 };
1254 MODULE_ALIAS_FS("ocfs2"); 1254 MODULE_ALIAS_FS("ocfs2");
1255 1255
1256 static int ocfs2_check_set_options(struct super_block *sb, 1256 static int ocfs2_check_set_options(struct super_block *sb,
1257 struct mount_options *options) 1257 struct mount_options *options)
1258 { 1258 {
1259 if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && 1259 if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
1260 !OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1260 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1261 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1261 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1262 mlog(ML_ERROR, "User quotas were requested, but this " 1262 mlog(ML_ERROR, "User quotas were requested, but this "
1263 "filesystem does not have the feature enabled.\n"); 1263 "filesystem does not have the feature enabled.\n");
1264 return 0; 1264 return 0;
1265 } 1265 }
1266 if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && 1266 if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1267 !OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1267 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1268 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1268 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1269 mlog(ML_ERROR, "Group quotas were requested, but this " 1269 mlog(ML_ERROR, "Group quotas were requested, but this "
1270 "filesystem does not have the feature enabled.\n"); 1270 "filesystem does not have the feature enabled.\n");
1271 return 0; 1271 return 0;
1272 } 1272 }
1273 if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && 1273 if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
1274 !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { 1274 !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
1275 mlog(ML_ERROR, "ACL support requested but extended attributes " 1275 mlog(ML_ERROR, "ACL support requested but extended attributes "
1276 "feature is not enabled\n"); 1276 "feature is not enabled\n");
1277 return 0; 1277 return 0;
1278 } 1278 }
1279 /* No ACL setting specified? Use XATTR feature... */ 1279 /* No ACL setting specified? Use XATTR feature... */
1280 if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | 1280 if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
1281 OCFS2_MOUNT_NO_POSIX_ACL))) { 1281 OCFS2_MOUNT_NO_POSIX_ACL))) {
1282 if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) 1282 if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
1283 options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1283 options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1284 else 1284 else
1285 options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1285 options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1286 } 1286 }
1287 return 1; 1287 return 1;
1288 } 1288 }
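
ocfs2_check_set_options() above gates each requested mount option on the on-disk feature bits: usrquota and grpquota need the matching RO-compat flags, acl needs the xattr incompat flag, and when neither acl nor noacl is given the default follows the xattr feature. As an illustration only, here is a minimal userspace sketch of that "option requires a feature bit" pattern; the flag names and values are invented, not the kernel's.

/* Hedged sketch of the "mount option requires an on-disk feature bit" check
 * made by ocfs2_check_set_options(); all flag values here are invented. */
#include <stdio.h>

#define MNT_USRQUOTA   0x1   /* hypothetical: user quotas requested at mount */
#define FEAT_USRQUOTA  0x1   /* hypothetical: RO-compat feature bit on disk  */

static int check_set_options(unsigned mount_opt, unsigned ro_compat_feat)
{
	if ((mount_opt & MNT_USRQUOTA) && !(ro_compat_feat & FEAT_USRQUOTA)) {
		fprintf(stderr, "User quotas were requested, but this filesystem does not have the feature enabled.\n");
		return 0;        /* reject, as the kernel function does */
	}
	return 1;
}

int main(void)
{
	printf("%d\n", check_set_options(MNT_USRQUOTA, 0));             /* 0: rejected */
	printf("%d\n", check_set_options(MNT_USRQUOTA, FEAT_USRQUOTA)); /* 1: accepted */
	return 0;
}
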
1289 1289
1290 static int ocfs2_parse_options(struct super_block *sb, 1290 static int ocfs2_parse_options(struct super_block *sb,
1291 char *options, 1291 char *options,
1292 struct mount_options *mopt, 1292 struct mount_options *mopt,
1293 int is_remount) 1293 int is_remount)
1294 { 1294 {
1295 int status, user_stack = 0; 1295 int status, user_stack = 0;
1296 char *p; 1296 char *p;
1297 u32 tmp; 1297 u32 tmp;
1298 1298
1299 trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); 1299 trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
1300 1300
1301 mopt->commit_interval = 0; 1301 mopt->commit_interval = 0;
1302 mopt->mount_opt = OCFS2_MOUNT_NOINTR; 1302 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1303 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1303 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1304 mopt->slot = OCFS2_INVALID_SLOT; 1304 mopt->slot = OCFS2_INVALID_SLOT;
1305 mopt->localalloc_opt = -1; 1305 mopt->localalloc_opt = -1;
1306 mopt->cluster_stack[0] = '\0'; 1306 mopt->cluster_stack[0] = '\0';
1307 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; 1307 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1308 mopt->dir_resv_level = -1; 1308 mopt->dir_resv_level = -1;
1309 1309
1310 if (!options) { 1310 if (!options) {
1311 status = 1; 1311 status = 1;
1312 goto bail; 1312 goto bail;
1313 } 1313 }
1314 1314
1315 while ((p = strsep(&options, ",")) != NULL) { 1315 while ((p = strsep(&options, ",")) != NULL) {
1316 int token, option; 1316 int token, option;
1317 substring_t args[MAX_OPT_ARGS]; 1317 substring_t args[MAX_OPT_ARGS];
1318 1318
1319 if (!*p) 1319 if (!*p)
1320 continue; 1320 continue;
1321 1321
1322 token = match_token(p, tokens, args); 1322 token = match_token(p, tokens, args);
1323 switch (token) { 1323 switch (token) {
1324 case Opt_hb_local: 1324 case Opt_hb_local:
1325 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1325 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1326 break; 1326 break;
1327 case Opt_hb_none: 1327 case Opt_hb_none:
1328 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; 1328 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1329 break; 1329 break;
1330 case Opt_hb_global: 1330 case Opt_hb_global:
1331 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; 1331 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1332 break; 1332 break;
1333 case Opt_barrier: 1333 case Opt_barrier:
1334 if (match_int(&args[0], &option)) { 1334 if (match_int(&args[0], &option)) {
1335 status = 0; 1335 status = 0;
1336 goto bail; 1336 goto bail;
1337 } 1337 }
1338 if (option) 1338 if (option)
1339 mopt->mount_opt |= OCFS2_MOUNT_BARRIER; 1339 mopt->mount_opt |= OCFS2_MOUNT_BARRIER;
1340 else 1340 else
1341 mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; 1341 mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER;
1342 break; 1342 break;
1343 case Opt_intr: 1343 case Opt_intr:
1344 mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; 1344 mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR;
1345 break; 1345 break;
1346 case Opt_nointr: 1346 case Opt_nointr:
1347 mopt->mount_opt |= OCFS2_MOUNT_NOINTR; 1347 mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
1348 break; 1348 break;
1349 case Opt_err_panic: 1349 case Opt_err_panic:
1350 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 1350 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1351 break; 1351 break;
1352 case Opt_err_ro: 1352 case Opt_err_ro:
1353 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; 1353 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
1354 break; 1354 break;
1355 case Opt_data_ordered: 1355 case Opt_data_ordered:
1356 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; 1356 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
1357 break; 1357 break;
1358 case Opt_data_writeback: 1358 case Opt_data_writeback:
1359 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; 1359 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
1360 break; 1360 break;
1361 case Opt_user_xattr: 1361 case Opt_user_xattr:
1362 mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; 1362 mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
1363 break; 1363 break;
1364 case Opt_nouser_xattr: 1364 case Opt_nouser_xattr:
1365 mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; 1365 mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
1366 break; 1366 break;
1367 case Opt_atime_quantum: 1367 case Opt_atime_quantum:
1368 if (match_int(&args[0], &option)) { 1368 if (match_int(&args[0], &option)) {
1369 status = 0; 1369 status = 0;
1370 goto bail; 1370 goto bail;
1371 } 1371 }
1372 if (option >= 0) 1372 if (option >= 0)
1373 mopt->atime_quantum = option; 1373 mopt->atime_quantum = option;
1374 break; 1374 break;
1375 case Opt_slot: 1375 case Opt_slot:
1376 option = 0; 1376 option = 0;
1377 if (match_int(&args[0], &option)) { 1377 if (match_int(&args[0], &option)) {
1378 status = 0; 1378 status = 0;
1379 goto bail; 1379 goto bail;
1380 } 1380 }
1381 if (option) 1381 if (option)
1382 mopt->slot = (s16)option; 1382 mopt->slot = (s16)option;
1383 break; 1383 break;
1384 case Opt_commit: 1384 case Opt_commit:
1385 option = 0; 1385 option = 0;
1386 if (match_int(&args[0], &option)) { 1386 if (match_int(&args[0], &option)) {
1387 status = 0; 1387 status = 0;
1388 goto bail; 1388 goto bail;
1389 } 1389 }
1390 if (option < 0) 1390 if (option < 0)
1391 return 0; 1391 return 0;
1392 if (option == 0) 1392 if (option == 0)
1393 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1393 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1394 mopt->commit_interval = HZ * option; 1394 mopt->commit_interval = HZ * option;
1395 break; 1395 break;
1396 case Opt_localalloc: 1396 case Opt_localalloc:
1397 option = 0; 1397 option = 0;
1398 if (match_int(&args[0], &option)) { 1398 if (match_int(&args[0], &option)) {
1399 status = 0; 1399 status = 0;
1400 goto bail; 1400 goto bail;
1401 } 1401 }
1402 if (option >= 0) 1402 if (option >= 0)
1403 mopt->localalloc_opt = option; 1403 mopt->localalloc_opt = option;
1404 break; 1404 break;
1405 case Opt_localflocks: 1405 case Opt_localflocks:
1406 /* 1406 /*
1407 * Changing this during remount could race 1407 * Changing this during remount could race
1408 * flock() requests, or "unbalance" existing 1408 * flock() requests, or "unbalance" existing
1409 * ones (e.g., a lock is taken in one mode but 1409 * ones (e.g., a lock is taken in one mode but
1410 * dropped in the other). If users care enough 1410 * dropped in the other). If users care enough
1411 * to flip locking modes during remount, we 1411 * to flip locking modes during remount, we
1412 * could add a "local" flag to individual 1412 * could add a "local" flag to individual
1413 * flock structures for proper tracking of 1413 * flock structures for proper tracking of
1414 * state. 1414 * state.
1415 */ 1415 */
1416 if (!is_remount) 1416 if (!is_remount)
1417 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; 1417 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
1418 break; 1418 break;
1419 case Opt_stack: 1419 case Opt_stack:
1420 /* Check both that the option we were passed 1420 /* Check both that the option we were passed
1421 * is of the right length and that it is a proper 1421 * is of the right length and that it is a proper
1422 * string of the right length. 1422 * string of the right length.
1423 */ 1423 */
1424 if (((args[0].to - args[0].from) != 1424 if (((args[0].to - args[0].from) !=
1425 OCFS2_STACK_LABEL_LEN) || 1425 OCFS2_STACK_LABEL_LEN) ||
1426 (strnlen(args[0].from, 1426 (strnlen(args[0].from,
1427 OCFS2_STACK_LABEL_LEN) != 1427 OCFS2_STACK_LABEL_LEN) !=
1428 OCFS2_STACK_LABEL_LEN)) { 1428 OCFS2_STACK_LABEL_LEN)) {
1429 mlog(ML_ERROR, 1429 mlog(ML_ERROR,
1430 "Invalid cluster_stack option\n"); 1430 "Invalid cluster_stack option\n");
1431 status = 0; 1431 status = 0;
1432 goto bail; 1432 goto bail;
1433 } 1433 }
1434 memcpy(mopt->cluster_stack, args[0].from, 1434 memcpy(mopt->cluster_stack, args[0].from,
1435 OCFS2_STACK_LABEL_LEN); 1435 OCFS2_STACK_LABEL_LEN);
1436 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 1436 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1437 /* 1437 /*
1438 * Open code the memcmp here as we don't have 1438 * Open code the memcmp here as we don't have
1439 * an osb to pass to 1439 * an osb to pass to
1440 * ocfs2_userspace_stack(). 1440 * ocfs2_userspace_stack().
1441 */ 1441 */
1442 if (memcmp(mopt->cluster_stack, 1442 if (memcmp(mopt->cluster_stack,
1443 OCFS2_CLASSIC_CLUSTER_STACK, 1443 OCFS2_CLASSIC_CLUSTER_STACK,
1444 OCFS2_STACK_LABEL_LEN)) 1444 OCFS2_STACK_LABEL_LEN))
1445 user_stack = 1; 1445 user_stack = 1;
1446 break; 1446 break;
1447 case Opt_inode64: 1447 case Opt_inode64:
1448 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1448 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
1449 break; 1449 break;
1450 case Opt_usrquota: 1450 case Opt_usrquota:
1451 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; 1451 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1452 break; 1452 break;
1453 case Opt_grpquota: 1453 case Opt_grpquota:
1454 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1454 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1455 break; 1455 break;
1456 case Opt_coherency_buffered: 1456 case Opt_coherency_buffered:
1457 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; 1457 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1458 break; 1458 break;
1459 case Opt_coherency_full: 1459 case Opt_coherency_full:
1460 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; 1460 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1461 break; 1461 break;
1462 case Opt_acl: 1462 case Opt_acl:
1463 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1463 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1464 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1464 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
1465 break; 1465 break;
1466 case Opt_noacl: 1466 case Opt_noacl:
1467 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1467 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1468 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1468 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1469 break; 1469 break;
1470 case Opt_resv_level: 1470 case Opt_resv_level:
1471 if (is_remount) 1471 if (is_remount)
1472 break; 1472 break;
1473 if (match_int(&args[0], &option)) { 1473 if (match_int(&args[0], &option)) {
1474 status = 0; 1474 status = 0;
1475 goto bail; 1475 goto bail;
1476 } 1476 }
1477 if (option >= OCFS2_MIN_RESV_LEVEL && 1477 if (option >= OCFS2_MIN_RESV_LEVEL &&
1478 option < OCFS2_MAX_RESV_LEVEL) 1478 option < OCFS2_MAX_RESV_LEVEL)
1479 mopt->resv_level = option; 1479 mopt->resv_level = option;
1480 break; 1480 break;
1481 case Opt_dir_resv_level: 1481 case Opt_dir_resv_level:
1482 if (is_remount) 1482 if (is_remount)
1483 break; 1483 break;
1484 if (match_int(&args[0], &option)) { 1484 if (match_int(&args[0], &option)) {
1485 status = 0; 1485 status = 0;
1486 goto bail; 1486 goto bail;
1487 } 1487 }
1488 if (option >= OCFS2_MIN_RESV_LEVEL && 1488 if (option >= OCFS2_MIN_RESV_LEVEL &&
1489 option < OCFS2_MAX_RESV_LEVEL) 1489 option < OCFS2_MAX_RESV_LEVEL)
1490 mopt->dir_resv_level = option; 1490 mopt->dir_resv_level = option;
1491 break; 1491 break;
1492 default: 1492 default:
1493 mlog(ML_ERROR, 1493 mlog(ML_ERROR,
1494 "Unrecognized mount option \"%s\" " 1494 "Unrecognized mount option \"%s\" "
1495 "or missing value\n", p); 1495 "or missing value\n", p);
1496 status = 0; 1496 status = 0;
1497 goto bail; 1497 goto bail;
1498 } 1498 }
1499 } 1499 }
1500 1500
1501 if (user_stack == 0) { 1501 if (user_stack == 0) {
1502 /* Ensure only one heartbeat mode */ 1502 /* Ensure only one heartbeat mode */
1503 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | 1503 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
1504 OCFS2_MOUNT_HB_GLOBAL | 1504 OCFS2_MOUNT_HB_GLOBAL |
1505 OCFS2_MOUNT_HB_NONE); 1505 OCFS2_MOUNT_HB_NONE);
1506 if (hweight32(tmp) != 1) { 1506 if (hweight32(tmp) != 1) {
1507 mlog(ML_ERROR, "Invalid heartbeat mount options\n"); 1507 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1508 status = 0; 1508 status = 0;
1509 goto bail; 1509 goto bail;
1510 } 1510 }
1511 } 1511 }
1512 1512
1513 status = 1; 1513 status = 1;
1514 1514
1515 bail: 1515 bail:
1516 return status; 1516 return status;
1517 } 1517 }
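
ocfs2_parse_options() walks the comma-separated mount string with strsep(), matches each token, flips bits in mopt->mount_opt, and fails on anything unrecognized; at the end it also insists on exactly one heartbeat mode unless a userspace cluster stack was named. The following is a hedged userspace sketch of the same strsep() loop, assuming glibc's strsep() and using invented option names and flag bits in place of the tokens[] table and OCFS2_MOUNT_* constants.

/* Minimal sketch of a strsep()-based option parser, after the loop above.
 * The option strings and OPT_* bits are illustrative, not the kernel's;
 * the real code dispatches through match_token() instead of strcmp(). */
#define _DEFAULT_SOURCE          /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

#define OPT_NOINTR          0x1  /* hypothetical flag bits */
#define OPT_DATA_WRITEBACK  0x2

static int parse_options(char *options, unsigned long *opts)
{
	char *p;

	*opts = OPT_NOINTR;                     /* default, like OCFS2_MOUNT_NOINTR */
	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)                        /* skip empty fields, e.g. "a,,b" */
			continue;
		if (!strcmp(p, "intr"))
			*opts &= ~OPT_NOINTR;
		else if (!strcmp(p, "nointr"))
			*opts |= OPT_NOINTR;
		else if (!strcmp(p, "data=writeback"))
			*opts |= OPT_DATA_WRITEBACK;
		else if (!strcmp(p, "data=ordered"))
			*opts &= ~OPT_DATA_WRITEBACK;
		else
			return 0;               /* unknown option: fail the parse */
	}
	return 1;
}

int main(void)
{
	char buf[] = "intr,data=writeback";     /* strsep() rewrites this buffer */
	unsigned long opts;

	if (parse_options(buf, &opts))
		printf("opts = 0x%lx\n", opts);
	return 0;
}

As in the kernel loop, the buffer is consumed in place and empty fields are skipped rather than treated as errors.
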
1518 1518
1519 static int ocfs2_show_options(struct seq_file *s, struct dentry *root) 1519 static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1520 { 1520 {
1521 struct ocfs2_super *osb = OCFS2_SB(root->d_sb); 1521 struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
1522 unsigned long opts = osb->s_mount_opt; 1522 unsigned long opts = osb->s_mount_opt;
1523 unsigned int local_alloc_megs; 1523 unsigned int local_alloc_megs;
1524 1524
1525 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { 1525 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1526 seq_printf(s, ",_netdev"); 1526 seq_printf(s, ",_netdev");
1527 if (opts & OCFS2_MOUNT_HB_LOCAL) 1527 if (opts & OCFS2_MOUNT_HB_LOCAL)
1528 seq_printf(s, ",%s", OCFS2_HB_LOCAL); 1528 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1529 else 1529 else
1530 seq_printf(s, ",%s", OCFS2_HB_GLOBAL); 1530 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1531 } else 1531 } else
1532 seq_printf(s, ",%s", OCFS2_HB_NONE); 1532 seq_printf(s, ",%s", OCFS2_HB_NONE);
1533 1533
1534 if (opts & OCFS2_MOUNT_NOINTR) 1534 if (opts & OCFS2_MOUNT_NOINTR)
1535 seq_printf(s, ",nointr"); 1535 seq_printf(s, ",nointr");
1536 1536
1537 if (opts & OCFS2_MOUNT_DATA_WRITEBACK) 1537 if (opts & OCFS2_MOUNT_DATA_WRITEBACK)
1538 seq_printf(s, ",data=writeback"); 1538 seq_printf(s, ",data=writeback");
1539 else 1539 else
1540 seq_printf(s, ",data=ordered"); 1540 seq_printf(s, ",data=ordered");
1541 1541
1542 if (opts & OCFS2_MOUNT_BARRIER) 1542 if (opts & OCFS2_MOUNT_BARRIER)
1543 seq_printf(s, ",barrier=1"); 1543 seq_printf(s, ",barrier=1");
1544 1544
1545 if (opts & OCFS2_MOUNT_ERRORS_PANIC) 1545 if (opts & OCFS2_MOUNT_ERRORS_PANIC)
1546 seq_printf(s, ",errors=panic"); 1546 seq_printf(s, ",errors=panic");
1547 else 1547 else
1548 seq_printf(s, ",errors=remount-ro"); 1548 seq_printf(s, ",errors=remount-ro");
1549 1549
1550 if (osb->preferred_slot != OCFS2_INVALID_SLOT) 1550 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
1551 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); 1551 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
1552 1552
1553 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 1553 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
1554 1554
1555 if (osb->osb_commit_interval) 1555 if (osb->osb_commit_interval)
1556 seq_printf(s, ",commit=%u", 1556 seq_printf(s, ",commit=%u",
1557 (unsigned) (osb->osb_commit_interval / HZ)); 1557 (unsigned) (osb->osb_commit_interval / HZ));
1558 1558
1559 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1559 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1560 if (local_alloc_megs != ocfs2_la_default_mb(osb)) 1560 if (local_alloc_megs != ocfs2_la_default_mb(osb))
1561 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1561 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1562 1562
1563 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1563 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
1564 seq_printf(s, ",localflocks,"); 1564 seq_printf(s, ",localflocks,");
1565 1565
1566 if (osb->osb_cluster_stack[0]) 1566 if (osb->osb_cluster_stack[0])
1567 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1567 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
1568 osb->osb_cluster_stack); 1568 osb->osb_cluster_stack);
1569 if (opts & OCFS2_MOUNT_USRQUOTA) 1569 if (opts & OCFS2_MOUNT_USRQUOTA)
1570 seq_printf(s, ",usrquota"); 1570 seq_printf(s, ",usrquota");
1571 if (opts & OCFS2_MOUNT_GRPQUOTA) 1571 if (opts & OCFS2_MOUNT_GRPQUOTA)
1572 seq_printf(s, ",grpquota"); 1572 seq_printf(s, ",grpquota");
1573 1573
1574 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) 1574 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1575 seq_printf(s, ",coherency=buffered"); 1575 seq_printf(s, ",coherency=buffered");
1576 else 1576 else
1577 seq_printf(s, ",coherency=full"); 1577 seq_printf(s, ",coherency=full");
1578 1578
1579 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1579 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1580 seq_printf(s, ",nouser_xattr"); 1580 seq_printf(s, ",nouser_xattr");
1581 else 1581 else
1582 seq_printf(s, ",user_xattr"); 1582 seq_printf(s, ",user_xattr");
1583 1583
1584 if (opts & OCFS2_MOUNT_INODE64) 1584 if (opts & OCFS2_MOUNT_INODE64)
1585 seq_printf(s, ",inode64"); 1585 seq_printf(s, ",inode64");
1586 1586
1587 if (opts & OCFS2_MOUNT_POSIX_ACL) 1587 if (opts & OCFS2_MOUNT_POSIX_ACL)
1588 seq_printf(s, ",acl"); 1588 seq_printf(s, ",acl");
1589 else 1589 else
1590 seq_printf(s, ",noacl"); 1590 seq_printf(s, ",noacl");
1591 1591
1592 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) 1592 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1593 seq_printf(s, ",resv_level=%d", osb->osb_resv_level); 1593 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1594 1594
1595 if (osb->osb_dir_resv_level != osb->osb_resv_level) 1595 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1596 		seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level); 1596 		seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1597 1597
1598 return 0; 1598 return 0;
1599 } 1599 }
1600 1600
1601 static int __init ocfs2_init(void) 1601 static int __init ocfs2_init(void)
1602 { 1602 {
1603 int status; 1603 int status;
1604 1604
1605 status = init_ocfs2_uptodate_cache(); 1605 status = init_ocfs2_uptodate_cache();
1606 if (status < 0) 1606 if (status < 0)
1607 goto out1; 1607 goto out1;
1608 1608
1609 status = ocfs2_initialize_mem_caches(); 1609 status = ocfs2_initialize_mem_caches();
1610 if (status < 0) 1610 if (status < 0)
1611 goto out2; 1611 goto out2;
1612 1612
1613 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); 1613 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
1614 if (!ocfs2_wq) { 1614 if (!ocfs2_wq) {
1615 status = -ENOMEM; 1615 status = -ENOMEM;
1616 goto out3; 1616 goto out3;
1617 } 1617 }
1618 1618
1619 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1619 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1620 if (!ocfs2_debugfs_root) { 1620 if (!ocfs2_debugfs_root) {
1621 status = -EFAULT; 1621 status = -EFAULT;
1622 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1622 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1623 } 1623 }
1624 1624
1625 ocfs2_set_locking_protocol(); 1625 ocfs2_set_locking_protocol();
1626 1626
1627 status = register_quota_format(&ocfs2_quota_format); 1627 status = register_quota_format(&ocfs2_quota_format);
1628 if (status < 0) 1628 if (status < 0)
1629 goto out4; 1629 goto out4;
1630 status = register_filesystem(&ocfs2_fs_type); 1630 status = register_filesystem(&ocfs2_fs_type);
1631 if (!status) 1631 if (!status)
1632 return 0; 1632 return 0;
1633 1633
1634 unregister_quota_format(&ocfs2_quota_format); 1634 unregister_quota_format(&ocfs2_quota_format);
1635 out4: 1635 out4:
1636 destroy_workqueue(ocfs2_wq); 1636 destroy_workqueue(ocfs2_wq);
1637 debugfs_remove(ocfs2_debugfs_root); 1637 debugfs_remove(ocfs2_debugfs_root);
1638 out3: 1638 out3:
1639 ocfs2_free_mem_caches(); 1639 ocfs2_free_mem_caches();
1640 out2: 1640 out2:
1641 exit_ocfs2_uptodate_cache(); 1641 exit_ocfs2_uptodate_cache();
1642 out1: 1642 out1:
1643 mlog_errno(status); 1643 mlog_errno(status);
1644 return status; 1644 return status;
1645 } 1645 }
1646 1646
1647 static void __exit ocfs2_exit(void) 1647 static void __exit ocfs2_exit(void)
1648 { 1648 {
1649 if (ocfs2_wq) { 1649 if (ocfs2_wq) {
1650 flush_workqueue(ocfs2_wq); 1650 flush_workqueue(ocfs2_wq);
1651 destroy_workqueue(ocfs2_wq); 1651 destroy_workqueue(ocfs2_wq);
1652 } 1652 }
1653 1653
1654 unregister_quota_format(&ocfs2_quota_format); 1654 unregister_quota_format(&ocfs2_quota_format);
1655 1655
1656 debugfs_remove(ocfs2_debugfs_root); 1656 debugfs_remove(ocfs2_debugfs_root);
1657 1657
1658 ocfs2_free_mem_caches(); 1658 ocfs2_free_mem_caches();
1659 1659
1660 unregister_filesystem(&ocfs2_fs_type); 1660 unregister_filesystem(&ocfs2_fs_type);
1661 1661
1662 exit_ocfs2_uptodate_cache(); 1662 exit_ocfs2_uptodate_cache();
1663 } 1663 }
1664 1664
1665 static void ocfs2_put_super(struct super_block *sb) 1665 static void ocfs2_put_super(struct super_block *sb)
1666 { 1666 {
1667 trace_ocfs2_put_super(sb); 1667 trace_ocfs2_put_super(sb);
1668 1668
1669 ocfs2_sync_blockdev(sb); 1669 ocfs2_sync_blockdev(sb);
1670 ocfs2_dismount_volume(sb, 0); 1670 ocfs2_dismount_volume(sb, 0);
1671 } 1671 }
1672 1672
1673 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 1673 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1674 { 1674 {
1675 struct ocfs2_super *osb; 1675 struct ocfs2_super *osb;
1676 u32 numbits, freebits; 1676 u32 numbits, freebits;
1677 int status; 1677 int status;
1678 struct ocfs2_dinode *bm_lock; 1678 struct ocfs2_dinode *bm_lock;
1679 struct buffer_head *bh = NULL; 1679 struct buffer_head *bh = NULL;
1680 struct inode *inode = NULL; 1680 struct inode *inode = NULL;
1681 1681
1682 trace_ocfs2_statfs(dentry->d_sb, buf); 1682 trace_ocfs2_statfs(dentry->d_sb, buf);
1683 1683
1684 osb = OCFS2_SB(dentry->d_sb); 1684 osb = OCFS2_SB(dentry->d_sb);
1685 1685
1686 inode = ocfs2_get_system_file_inode(osb, 1686 inode = ocfs2_get_system_file_inode(osb,
1687 GLOBAL_BITMAP_SYSTEM_INODE, 1687 GLOBAL_BITMAP_SYSTEM_INODE,
1688 OCFS2_INVALID_SLOT); 1688 OCFS2_INVALID_SLOT);
1689 if (!inode) { 1689 if (!inode) {
1690 mlog(ML_ERROR, "failed to get bitmap inode\n"); 1690 mlog(ML_ERROR, "failed to get bitmap inode\n");
1691 status = -EIO; 1691 status = -EIO;
1692 goto bail; 1692 goto bail;
1693 } 1693 }
1694 1694
1695 status = ocfs2_inode_lock(inode, &bh, 0); 1695 status = ocfs2_inode_lock(inode, &bh, 0);
1696 if (status < 0) { 1696 if (status < 0) {
1697 mlog_errno(status); 1697 mlog_errno(status);
1698 goto bail; 1698 goto bail;
1699 } 1699 }
1700 1700
1701 bm_lock = (struct ocfs2_dinode *) bh->b_data; 1701 bm_lock = (struct ocfs2_dinode *) bh->b_data;
1702 1702
1703 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); 1703 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
1704 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); 1704 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
1705 1705
1706 buf->f_type = OCFS2_SUPER_MAGIC; 1706 buf->f_type = OCFS2_SUPER_MAGIC;
1707 buf->f_bsize = dentry->d_sb->s_blocksize; 1707 buf->f_bsize = dentry->d_sb->s_blocksize;
1708 buf->f_namelen = OCFS2_MAX_FILENAME_LEN; 1708 buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
1709 buf->f_blocks = ((sector_t) numbits) * 1709 buf->f_blocks = ((sector_t) numbits) *
1710 (osb->s_clustersize >> osb->sb->s_blocksize_bits); 1710 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
1711 buf->f_bfree = ((sector_t) freebits) * 1711 buf->f_bfree = ((sector_t) freebits) *
1712 (osb->s_clustersize >> osb->sb->s_blocksize_bits); 1712 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
1713 buf->f_bavail = buf->f_bfree; 1713 buf->f_bavail = buf->f_bfree;
1714 buf->f_files = numbits; 1714 buf->f_files = numbits;
1715 buf->f_ffree = freebits; 1715 buf->f_ffree = freebits;
1716 buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) 1716 buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
1717 & 0xFFFFFFFFUL; 1717 & 0xFFFFFFFFUL;
1718 buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, 1718 buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
1719 OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; 1719 OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
1720 1720
1721 brelse(bh); 1721 brelse(bh);
1722 1722
1723 ocfs2_inode_unlock(inode, 0); 1723 ocfs2_inode_unlock(inode, 0);
1724 status = 0; 1724 status = 0;
1725 bail: 1725 bail:
1726 if (inode) 1726 if (inode)
1727 iput(inode); 1727 iput(inode);
1728 1728
1729 if (status) 1729 if (status)
1730 mlog_errno(status); 1730 mlog_errno(status);
1731 1731
1732 return status; 1732 return status;
1733 } 1733 }
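
ocfs2_statfs() reports sizes by scaling the global bitmap's cluster counts into filesystem blocks: f_blocks and f_bfree are the cluster counts multiplied by s_clustersize >> s_blocksize_bits. For example, with 1 MiB clusters and 4 KiB blocks that factor is 256. A small sketch of just that arithmetic, with assumed volume geometry:

/* Sketch of the clusters-to-blocks scaling used by ocfs2_statfs();
 * the cluster size, block size, and bitmap counts below are assumptions. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t s_clustersize    = 1 << 20;  /* 1 MiB clusters (assumed)     */
	unsigned s_blocksize_bits = 12;       /* 4 KiB blocks   (assumed)     */
	uint32_t numbits  = 50000;            /* total clusters in the bitmap */
	uint32_t freebits = 12345;            /* free clusters                */

	uint64_t blocks_per_cluster = s_clustersize >> s_blocksize_bits;  /* 256 */
	uint64_t f_blocks = (uint64_t)numbits  * blocks_per_cluster;
	uint64_t f_bfree  = (uint64_t)freebits * blocks_per_cluster;

	printf("f_blocks=%llu f_bfree=%llu\n",
	       (unsigned long long)f_blocks, (unsigned long long)f_bfree);
	return 0;
}
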
1734 1734
1735 static void ocfs2_inode_init_once(void *data) 1735 static void ocfs2_inode_init_once(void *data)
1736 { 1736 {
1737 struct ocfs2_inode_info *oi = data; 1737 struct ocfs2_inode_info *oi = data;
1738 1738
1739 oi->ip_flags = 0; 1739 oi->ip_flags = 0;
1740 oi->ip_open_count = 0; 1740 oi->ip_open_count = 0;
1741 spin_lock_init(&oi->ip_lock); 1741 spin_lock_init(&oi->ip_lock);
1742 ocfs2_extent_map_init(&oi->vfs_inode); 1742 ocfs2_extent_map_init(&oi->vfs_inode);
1743 INIT_LIST_HEAD(&oi->ip_io_markers); 1743 INIT_LIST_HEAD(&oi->ip_io_markers);
1744 oi->ip_dir_start_lookup = 0; 1744 oi->ip_dir_start_lookup = 0;
1745 mutex_init(&oi->ip_unaligned_aio); 1745 mutex_init(&oi->ip_unaligned_aio);
1746 init_rwsem(&oi->ip_alloc_sem); 1746 init_rwsem(&oi->ip_alloc_sem);
1747 init_rwsem(&oi->ip_xattr_sem); 1747 init_rwsem(&oi->ip_xattr_sem);
1748 mutex_init(&oi->ip_io_mutex); 1748 mutex_init(&oi->ip_io_mutex);
1749 1749
1750 oi->ip_blkno = 0ULL; 1750 oi->ip_blkno = 0ULL;
1751 oi->ip_clusters = 0; 1751 oi->ip_clusters = 0;
1752 1752
1753 ocfs2_resv_init_once(&oi->ip_la_data_resv); 1753 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1754 1754
1755 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1755 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1756 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1756 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1757 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1757 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1758 1758
1759 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1759 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1760 &ocfs2_inode_caching_ops); 1760 &ocfs2_inode_caching_ops);
1761 1761
1762 inode_init_once(&oi->vfs_inode); 1762 inode_init_once(&oi->vfs_inode);
1763 } 1763 }
1764 1764
1765 static int ocfs2_initialize_mem_caches(void) 1765 static int ocfs2_initialize_mem_caches(void)
1766 { 1766 {
1767 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", 1767 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
1768 sizeof(struct ocfs2_inode_info), 1768 sizeof(struct ocfs2_inode_info),
1769 0, 1769 0,
1770 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1770 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1771 SLAB_MEM_SPREAD), 1771 SLAB_MEM_SPREAD),
1772 ocfs2_inode_init_once); 1772 ocfs2_inode_init_once);
1773 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", 1773 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
1774 sizeof(struct ocfs2_dquot), 1774 sizeof(struct ocfs2_dquot),
1775 0, 1775 0,
1776 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1776 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1777 SLAB_MEM_SPREAD), 1777 SLAB_MEM_SPREAD),
1778 NULL); 1778 NULL);
1779 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", 1779 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
1780 sizeof(struct ocfs2_quota_chunk), 1780 sizeof(struct ocfs2_quota_chunk),
1781 0, 1781 0,
1782 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), 1782 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
1783 NULL); 1783 NULL);
1784 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || 1784 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
1785 !ocfs2_qf_chunk_cachep) { 1785 !ocfs2_qf_chunk_cachep) {
1786 if (ocfs2_inode_cachep) 1786 if (ocfs2_inode_cachep)
1787 kmem_cache_destroy(ocfs2_inode_cachep); 1787 kmem_cache_destroy(ocfs2_inode_cachep);
1788 if (ocfs2_dquot_cachep) 1788 if (ocfs2_dquot_cachep)
1789 kmem_cache_destroy(ocfs2_dquot_cachep); 1789 kmem_cache_destroy(ocfs2_dquot_cachep);
1790 if (ocfs2_qf_chunk_cachep) 1790 if (ocfs2_qf_chunk_cachep)
1791 kmem_cache_destroy(ocfs2_qf_chunk_cachep); 1791 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1792 return -ENOMEM; 1792 return -ENOMEM;
1793 } 1793 }
1794 1794
1795 return 0; 1795 return 0;
1796 } 1796 }
1797 1797
1798 static void ocfs2_free_mem_caches(void) 1798 static void ocfs2_free_mem_caches(void)
1799 { 1799 {
1800 /* 1800 /*
1801 * Make sure all delayed rcu free inodes are flushed before we 1801 * Make sure all delayed rcu free inodes are flushed before we
1802 * destroy cache. 1802 * destroy cache.
1803 */ 1803 */
1804 rcu_barrier(); 1804 rcu_barrier();
1805 if (ocfs2_inode_cachep) 1805 if (ocfs2_inode_cachep)
1806 kmem_cache_destroy(ocfs2_inode_cachep); 1806 kmem_cache_destroy(ocfs2_inode_cachep);
1807 ocfs2_inode_cachep = NULL; 1807 ocfs2_inode_cachep = NULL;
1808 1808
1809 if (ocfs2_dquot_cachep) 1809 if (ocfs2_dquot_cachep)
1810 kmem_cache_destroy(ocfs2_dquot_cachep); 1810 kmem_cache_destroy(ocfs2_dquot_cachep);
1811 ocfs2_dquot_cachep = NULL; 1811 ocfs2_dquot_cachep = NULL;
1812 1812
1813 if (ocfs2_qf_chunk_cachep) 1813 if (ocfs2_qf_chunk_cachep)
1814 kmem_cache_destroy(ocfs2_qf_chunk_cachep); 1814 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1815 ocfs2_qf_chunk_cachep = NULL; 1815 ocfs2_qf_chunk_cachep = NULL;
1816 } 1816 }
1817 1817
1818 static int ocfs2_get_sector(struct super_block *sb, 1818 static int ocfs2_get_sector(struct super_block *sb,
1819 struct buffer_head **bh, 1819 struct buffer_head **bh,
1820 int block, 1820 int block,
1821 int sect_size) 1821 int sect_size)
1822 { 1822 {
1823 if (!sb_set_blocksize(sb, sect_size)) { 1823 if (!sb_set_blocksize(sb, sect_size)) {
1824 mlog(ML_ERROR, "unable to set blocksize\n"); 1824 mlog(ML_ERROR, "unable to set blocksize\n");
1825 return -EIO; 1825 return -EIO;
1826 } 1826 }
1827 1827
1828 *bh = sb_getblk(sb, block); 1828 *bh = sb_getblk(sb, block);
1829 if (!*bh) { 1829 if (!*bh) {
1830 mlog_errno(-ENOMEM); 1830 mlog_errno(-ENOMEM);
1831 return -ENOMEM; 1831 return -ENOMEM;
1832 } 1832 }
1833 lock_buffer(*bh); 1833 lock_buffer(*bh);
1834 if (!buffer_dirty(*bh)) 1834 if (!buffer_dirty(*bh))
1835 clear_buffer_uptodate(*bh); 1835 clear_buffer_uptodate(*bh);
1836 unlock_buffer(*bh); 1836 unlock_buffer(*bh);
1837 ll_rw_block(READ, 1, bh); 1837 ll_rw_block(READ, 1, bh);
1838 wait_on_buffer(*bh); 1838 wait_on_buffer(*bh);
1839 if (!buffer_uptodate(*bh)) { 1839 if (!buffer_uptodate(*bh)) {
1840 mlog_errno(-EIO); 1840 mlog_errno(-EIO);
1841 brelse(*bh); 1841 brelse(*bh);
1842 *bh = NULL; 1842 *bh = NULL;
1843 return -EIO; 1843 return -EIO;
1844 } 1844 }
1845 1845
1846 return 0; 1846 return 0;
1847 } 1847 }
1848 1848
1849 static int ocfs2_mount_volume(struct super_block *sb) 1849 static int ocfs2_mount_volume(struct super_block *sb)
1850 { 1850 {
1851 int status = 0; 1851 int status = 0;
1852 int unlock_super = 0; 1852 int unlock_super = 0;
1853 struct ocfs2_super *osb = OCFS2_SB(sb); 1853 struct ocfs2_super *osb = OCFS2_SB(sb);
1854 1854
1855 if (ocfs2_is_hard_readonly(osb)) 1855 if (ocfs2_is_hard_readonly(osb))
1856 goto leave; 1856 goto leave;
1857 1857
1858 status = ocfs2_dlm_init(osb); 1858 status = ocfs2_dlm_init(osb);
1859 if (status < 0) { 1859 if (status < 0) {
1860 mlog_errno(status); 1860 mlog_errno(status);
1861 goto leave; 1861 goto leave;
1862 } 1862 }
1863 1863
1864 status = ocfs2_super_lock(osb, 1); 1864 status = ocfs2_super_lock(osb, 1);
1865 if (status < 0) { 1865 if (status < 0) {
1866 mlog_errno(status); 1866 mlog_errno(status);
1867 goto leave; 1867 goto leave;
1868 } 1868 }
1869 unlock_super = 1; 1869 unlock_super = 1;
1870 1870
1871 /* This will load up the node map and add ourselves to it. */ 1871 /* This will load up the node map and add ourselves to it. */
1872 status = ocfs2_find_slot(osb); 1872 status = ocfs2_find_slot(osb);
1873 if (status < 0) { 1873 if (status < 0) {
1874 mlog_errno(status); 1874 mlog_errno(status);
1875 goto leave; 1875 goto leave;
1876 } 1876 }
1877 1877
1878 /* load all node-local system inodes */ 1878 /* load all node-local system inodes */
1879 status = ocfs2_init_local_system_inodes(osb); 1879 status = ocfs2_init_local_system_inodes(osb);
1880 if (status < 0) { 1880 if (status < 0) {
1881 mlog_errno(status); 1881 mlog_errno(status);
1882 goto leave; 1882 goto leave;
1883 } 1883 }
1884 1884
1885 status = ocfs2_check_volume(osb); 1885 status = ocfs2_check_volume(osb);
1886 if (status < 0) { 1886 if (status < 0) {
1887 mlog_errno(status); 1887 mlog_errno(status);
1888 goto leave; 1888 goto leave;
1889 } 1889 }
1890 1890
1891 status = ocfs2_truncate_log_init(osb); 1891 status = ocfs2_truncate_log_init(osb);
1892 if (status < 0) 1892 if (status < 0)
1893 mlog_errno(status); 1893 mlog_errno(status);
1894 1894
1895 leave: 1895 leave:
1896 if (unlock_super) 1896 if (unlock_super)
1897 ocfs2_super_unlock(osb, 1); 1897 ocfs2_super_unlock(osb, 1);
1898 1898
1899 return status; 1899 return status;
1900 } 1900 }
1901 1901
1902 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) 1902 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1903 { 1903 {
1904 int tmp, hangup_needed = 0; 1904 int tmp, hangup_needed = 0;
1905 struct ocfs2_super *osb = NULL; 1905 struct ocfs2_super *osb = NULL;
1906 char nodestr[12]; 1906 char nodestr[12];
1907 1907
1908 trace_ocfs2_dismount_volume(sb); 1908 trace_ocfs2_dismount_volume(sb);
1909 1909
1910 BUG_ON(!sb); 1910 BUG_ON(!sb);
1911 osb = OCFS2_SB(sb); 1911 osb = OCFS2_SB(sb);
1912 BUG_ON(!osb); 1912 BUG_ON(!osb);
1913 1913
1914 debugfs_remove(osb->osb_ctxt); 1914 debugfs_remove(osb->osb_ctxt);
1915 1915
1916 /* Orphan scan should be stopped as early as possible */ 1916 /* Orphan scan should be stopped as early as possible */
1917 ocfs2_orphan_scan_stop(osb); 1917 ocfs2_orphan_scan_stop(osb);
1918 1918
1919 ocfs2_disable_quotas(osb); 1919 ocfs2_disable_quotas(osb);
1920 1920
1921 /* All dquots should be freed by now */ 1921 /* All dquots should be freed by now */
1922 WARN_ON(!llist_empty(&osb->dquot_drop_list)); 1922 WARN_ON(!llist_empty(&osb->dquot_drop_list));
1923 /* Wait for worker to be done with the work structure in osb */ 1923 /* Wait for worker to be done with the work structure in osb */
1924 cancel_work_sync(&osb->dquot_drop_work); 1924 cancel_work_sync(&osb->dquot_drop_work);
1925 1925
1926 ocfs2_shutdown_local_alloc(osb); 1926 ocfs2_shutdown_local_alloc(osb);
1927 1927
1928 ocfs2_truncate_log_shutdown(osb); 1928 ocfs2_truncate_log_shutdown(osb);
1929 1929
1930 /* This will disable recovery and flush any recovery work. */ 1930 /* This will disable recovery and flush any recovery work. */
1931 ocfs2_recovery_exit(osb); 1931 ocfs2_recovery_exit(osb);
1932 1932
1933 ocfs2_journal_shutdown(osb); 1933 ocfs2_journal_shutdown(osb);
1934 1934
1935 ocfs2_sync_blockdev(sb); 1935 ocfs2_sync_blockdev(sb);
1936 1936
1937 ocfs2_purge_refcount_trees(osb); 1937 ocfs2_purge_refcount_trees(osb);
1938 1938
1939 /* No cluster connection means we've failed during mount, so skip 1939 /* No cluster connection means we've failed during mount, so skip
1940 * all the steps which depended on that to complete. */ 1940 * all the steps which depended on that to complete. */
1941 if (osb->cconn) { 1941 if (osb->cconn) {
1942 tmp = ocfs2_super_lock(osb, 1); 1942 tmp = ocfs2_super_lock(osb, 1);
1943 if (tmp < 0) { 1943 if (tmp < 0) {
1944 mlog_errno(tmp); 1944 mlog_errno(tmp);
1945 return; 1945 return;
1946 } 1946 }
1947 } 1947 }
1948 1948
1949 if (osb->slot_num != OCFS2_INVALID_SLOT) 1949 if (osb->slot_num != OCFS2_INVALID_SLOT)
1950 ocfs2_put_slot(osb); 1950 ocfs2_put_slot(osb);
1951 1951
1952 if (osb->cconn) 1952 if (osb->cconn)
1953 ocfs2_super_unlock(osb, 1); 1953 ocfs2_super_unlock(osb, 1);
1954 1954
1955 ocfs2_release_system_inodes(osb); 1955 ocfs2_release_system_inodes(osb);
1956 1956
1957 /* 1957 /*
1958 * If we're dismounting due to mount error, mount.ocfs2 will clean 1958 * If we're dismounting due to mount error, mount.ocfs2 will clean
1959 * up heartbeat. If we're a local mount, there is no heartbeat. 1959 * up heartbeat. If we're a local mount, there is no heartbeat.
1960 * If we failed before we got a uuid_str yet, we can't stop 1960 * If we failed before we got a uuid_str yet, we can't stop
1961 * heartbeat. Otherwise, do it. 1961 * heartbeat. Otherwise, do it.
1962 */ 1962 */
1963 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && 1963 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
1964 !ocfs2_is_hard_readonly(osb)) 1964 !ocfs2_is_hard_readonly(osb))
1965 hangup_needed = 1; 1965 hangup_needed = 1;
1966 1966
1967 if (osb->cconn) 1967 if (osb->cconn)
1968 ocfs2_dlm_shutdown(osb, hangup_needed); 1968 ocfs2_dlm_shutdown(osb, hangup_needed);
1969 1969
1970 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); 1970 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
1971 debugfs_remove(osb->osb_debug_root); 1971 debugfs_remove(osb->osb_debug_root);
1972 1972
1973 if (hangup_needed) 1973 if (hangup_needed)
1974 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); 1974 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
1975 1975
1976 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); 1976 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1977 1977
1978 if (ocfs2_mount_local(osb)) 1978 if (ocfs2_mount_local(osb))
1979 snprintf(nodestr, sizeof(nodestr), "local"); 1979 snprintf(nodestr, sizeof(nodestr), "local");
1980 else 1980 else
1981 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); 1981 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
1982 1982
1983 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", 1983 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
1984 osb->dev_str, nodestr); 1984 osb->dev_str, nodestr);
1985 1985
1986 ocfs2_delete_osb(osb); 1986 ocfs2_delete_osb(osb);
1987 kfree(osb); 1987 kfree(osb);
1988 sb->s_dev = 0; 1988 sb->s_dev = 0;
1989 sb->s_fs_info = NULL; 1989 sb->s_fs_info = NULL;
1990 } 1990 }
1991 1991
1992 static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, 1992 static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
1993 unsigned uuid_bytes) 1993 unsigned uuid_bytes)
1994 { 1994 {
1995 int i, ret; 1995 int i, ret;
1996 char *ptr; 1996 char *ptr;
1997 1997
1998 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); 1998 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
1999 1999
2000 osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); 2000 osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
2001 if (osb->uuid_str == NULL) 2001 if (osb->uuid_str == NULL)
2002 return -ENOMEM; 2002 return -ENOMEM;
2003 2003
2004 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { 2004 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
2005 /* print with null */ 2005 /* print with null */
2006 ret = snprintf(ptr, 3, "%02X", uuid[i]); 2006 ret = snprintf(ptr, 3, "%02X", uuid[i]);
2007 if (ret != 2) /* drop super cleans up */ 2007 if (ret != 2) /* drop super cleans up */
2008 return -EINVAL; 2008 return -EINVAL;
2009 /* then only advance past the last char */ 2009 /* then only advance past the last char */
2010 ptr += 2; 2010 ptr += 2;
2011 } 2011 }
2012 2012
2013 return 0; 2013 return 0;
2014 } 2014 }
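
ocfs2_setup_osb_uuid() expands the 16-byte on-disk UUID into a 32-character uppercase hex string: each snprintf(ptr, 3, "%02X", ...) writes two digits plus a NUL, and advancing ptr by only two lets the next pair overwrite that NUL, so the final byte's terminator ends the string. A standalone sketch of the same expansion, using a made-up UUID:

/* Sketch of the two-hex-digits-at-a-time UUID expansion performed by
 * ocfs2_setup_osb_uuid(); the 16-byte UUID value here is made up. */
#include <stdio.h>
#include <stdlib.h>

#define UUID_LEN 16

int main(void)
{
	const unsigned char uuid[UUID_LEN] = {
		0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33,
		0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb
	};
	char *str = calloc(UUID_LEN * 2 + 1, 1);  /* 32 hex chars + NUL */
	char *ptr = str;
	int i;

	if (!str)
		return 1;
	for (i = 0; i < UUID_LEN; i++) {
		/* writes "XX" plus a NUL; the next pair overwrites the NUL */
		if (snprintf(ptr, 3, "%02X", uuid[i]) != 2)
			return 1;
		ptr += 2;
	}
	printf("%s\n", str);  /* DEADBEEF00112233445566778899AABB */
	free(str);
	return 0;
}
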
2015 2015
2016 /* Make sure entire volume is addressable by our journal. Requires 2016 /* Make sure entire volume is addressable by our journal. Requires
2017 osb_clusters_at_boot to be valid and for the journal to have been 2017 osb_clusters_at_boot to be valid and for the journal to have been
2018 initialized by ocfs2_journal_init(). */ 2018 initialized by ocfs2_journal_init(). */
2019 static int ocfs2_journal_addressable(struct ocfs2_super *osb) 2019 static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2020 { 2020 {
2021 int status = 0; 2021 int status = 0;
2022 u64 max_block = 2022 u64 max_block =
2023 ocfs2_clusters_to_blocks(osb->sb, 2023 ocfs2_clusters_to_blocks(osb->sb,
2024 osb->osb_clusters_at_boot) - 1; 2024 osb->osb_clusters_at_boot) - 1;
2025 2025
2026 /* 32-bit block number is always OK. */ 2026 /* 32-bit block number is always OK. */
2027 if (max_block <= (u32)~0ULL) 2027 if (max_block <= (u32)~0ULL)
2028 goto out; 2028 goto out;
2029 2029
2030 /* Volume is "huge", so see if our journal is new enough to 2030 /* Volume is "huge", so see if our journal is new enough to
2031 support it. */ 2031 support it. */
2032 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, 2032 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2033 OCFS2_FEATURE_COMPAT_JBD2_SB) && 2033 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2034 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, 2034 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2035 JBD2_FEATURE_INCOMPAT_64BIT))) { 2035 JBD2_FEATURE_INCOMPAT_64BIT))) {
2036 mlog(ML_ERROR, "The journal cannot address the entire volume. " 2036 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2037 "Enable the 'block64' journal option with tunefs.ocfs2"); 2037 "Enable the 'block64' journal option with tunefs.ocfs2");
2038 status = -EFBIG; 2038 status = -EFBIG;
2039 goto out; 2039 goto out;
2040 } 2040 }
2041 2041
2042 out: 2042 out:
2043 return status; 2043 return status;
2044 } 2044 }
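
ocfs2_journal_addressable() only has work to do on volumes whose highest block number overflows 32 bits; below that, the (u32)~0ULL comparison succeeds and the journal's feature bits never matter. With 4 KiB blocks, 2^32 blocks is 16 TiB, so that is roughly where the 'block64'/JBD2 64-bit requirement starts. A hedged sketch of just the cutoff test, with an assumed volume geometry:

/* Sketch of the "does the highest block fit in 32 bits?" test made by
 * ocfs2_journal_addressable(); the volume geometry below is assumed. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t clusters_at_boot = 20ULL << 20;  /* ~21M clusters (assumed)          */
	unsigned cluster_to_block = 8;            /* 256 blocks per cluster (assumed) */
	uint64_t max_block = (clusters_at_boot << cluster_to_block) - 1;

	if (max_block <= (uint32_t)~0U)
		printf("32-bit block numbers are enough for this volume\n");
	else
		printf("a 64-bit capable journal (JBD2 64BIT / 'block64') is required\n");
	return 0;
}
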
2045 2045
2046 static int ocfs2_initialize_super(struct super_block *sb, 2046 static int ocfs2_initialize_super(struct super_block *sb,
2047 struct buffer_head *bh, 2047 struct buffer_head *bh,
2048 int sector_size, 2048 int sector_size,
2049 struct ocfs2_blockcheck_stats *stats) 2049 struct ocfs2_blockcheck_stats *stats)
2050 { 2050 {
2051 int status; 2051 int status;
2052 int i, cbits, bbits; 2052 int i, cbits, bbits;
2053 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 2053 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2054 struct inode *inode = NULL; 2054 struct inode *inode = NULL;
2055 struct ocfs2_journal *journal; 2055 struct ocfs2_journal *journal;
2056 struct ocfs2_super *osb; 2056 struct ocfs2_super *osb;
2057 u64 total_blocks; 2057 u64 total_blocks;
2058 2058
2059 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); 2059 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
2060 if (!osb) { 2060 if (!osb) {
2061 status = -ENOMEM; 2061 status = -ENOMEM;
2062 mlog_errno(status); 2062 mlog_errno(status);
2063 goto bail; 2063 goto bail;
2064 } 2064 }
2065 2065
2066 sb->s_fs_info = osb; 2066 sb->s_fs_info = osb;
2067 sb->s_op = &ocfs2_sops; 2067 sb->s_op = &ocfs2_sops;
2068 sb->s_d_op = &ocfs2_dentry_ops; 2068 sb->s_d_op = &ocfs2_dentry_ops;
2069 sb->s_export_op = &ocfs2_export_ops; 2069 sb->s_export_op = &ocfs2_export_ops;
2070 sb->s_qcop = &ocfs2_quotactl_ops; 2070 sb->s_qcop = &ocfs2_quotactl_ops;
2071 sb->dq_op = &ocfs2_quota_operations; 2071 sb->dq_op = &ocfs2_quota_operations;
2072 sb->s_xattr = ocfs2_xattr_handlers; 2072 sb->s_xattr = ocfs2_xattr_handlers;
2073 sb->s_time_gran = 1; 2073 sb->s_time_gran = 1;
2074 sb->s_flags |= MS_NOATIME; 2074 sb->s_flags |= MS_NOATIME;
2075 /* this is needed to support O_LARGEFILE */ 2075 /* this is needed to support O_LARGEFILE */
2076 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2076 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2077 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 2077 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
2078 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 2078 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
2079 2079
2080 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; 2080 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
2081 2081
2082 for (i = 0; i < 3; i++) 2082 for (i = 0; i < 3; i++)
2083 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); 2083 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
2084 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); 2084 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
2085 2085
2086 osb->sb = sb; 2086 osb->sb = sb;
2087 /* Save off for ocfs2_rw_direct */ 2087 /* Save off for ocfs2_rw_direct */
2088 osb->s_sectsize_bits = blksize_bits(sector_size); 2088 osb->s_sectsize_bits = blksize_bits(sector_size);
2089 BUG_ON(!osb->s_sectsize_bits); 2089 BUG_ON(!osb->s_sectsize_bits);
2090 2090
2091 spin_lock_init(&osb->dc_task_lock); 2091 spin_lock_init(&osb->dc_task_lock);
2092 init_waitqueue_head(&osb->dc_event); 2092 init_waitqueue_head(&osb->dc_event);
2093 osb->dc_work_sequence = 0; 2093 osb->dc_work_sequence = 0;
2094 osb->dc_wake_sequence = 0; 2094 osb->dc_wake_sequence = 0;
2095 INIT_LIST_HEAD(&osb->blocked_lock_list); 2095 INIT_LIST_HEAD(&osb->blocked_lock_list);
2096 osb->blocked_lock_count = 0; 2096 osb->blocked_lock_count = 0;
2097 spin_lock_init(&osb->osb_lock); 2097 spin_lock_init(&osb->osb_lock);
2098 spin_lock_init(&osb->osb_xattr_lock); 2098 spin_lock_init(&osb->osb_xattr_lock);
2099 ocfs2_init_steal_slots(osb); 2099 ocfs2_init_steal_slots(osb);
2100 2100
2101 mutex_init(&osb->system_file_mutex); 2101 mutex_init(&osb->system_file_mutex);
2102 2102
2103 atomic_set(&osb->alloc_stats.moves, 0); 2103 atomic_set(&osb->alloc_stats.moves, 0);
2104 atomic_set(&osb->alloc_stats.local_data, 0); 2104 atomic_set(&osb->alloc_stats.local_data, 0);
2105 atomic_set(&osb->alloc_stats.bitmap_data, 0); 2105 atomic_set(&osb->alloc_stats.bitmap_data, 0);
2106 atomic_set(&osb->alloc_stats.bg_allocs, 0); 2106 atomic_set(&osb->alloc_stats.bg_allocs, 0);
2107 atomic_set(&osb->alloc_stats.bg_extends, 0); 2107 atomic_set(&osb->alloc_stats.bg_extends, 0);
2108 2108
2109 /* Copy the blockcheck stats from the superblock probe */ 2109 /* Copy the blockcheck stats from the superblock probe */
2110 osb->osb_ecc_stats = *stats; 2110 osb->osb_ecc_stats = *stats;
2111 2111
2112 ocfs2_init_node_maps(osb); 2112 ocfs2_init_node_maps(osb);
2113 2113
2114 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2114 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2115 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2115 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2116 2116
2117 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); 2117 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2118 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { 2118 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2119 mlog(ML_ERROR, "Invalid number of node slots (%u)\n", 2119 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2120 osb->max_slots); 2120 osb->max_slots);
2121 status = -EINVAL; 2121 status = -EINVAL;
2122 goto bail; 2122 goto bail;
2123 } 2123 }
2124 2124
2125 ocfs2_orphan_scan_init(osb); 2125 ocfs2_orphan_scan_init(osb);
2126 2126
2127 status = ocfs2_recovery_init(osb); 2127 status = ocfs2_recovery_init(osb);
2128 if (status) { 2128 if (status) {
2129 mlog(ML_ERROR, "Unable to initialize recovery state\n"); 2129 mlog(ML_ERROR, "Unable to initialize recovery state\n");
2130 mlog_errno(status); 2130 mlog_errno(status);
2131 goto bail; 2131 goto bail;
2132 } 2132 }
2133 2133
2134 init_waitqueue_head(&osb->checkpoint_event); 2134 init_waitqueue_head(&osb->checkpoint_event);
2135 2135
2136 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 2136 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
2137 2137
2138 osb->slot_num = OCFS2_INVALID_SLOT; 2138 osb->slot_num = OCFS2_INVALID_SLOT;
2139 2139
2140 osb->s_xattr_inline_size = le16_to_cpu( 2140 osb->s_xattr_inline_size = le16_to_cpu(
2141 di->id2.i_super.s_xattr_inline_size); 2141 di->id2.i_super.s_xattr_inline_size);
2142 2142
2143 osb->local_alloc_state = OCFS2_LA_UNUSED; 2143 osb->local_alloc_state = OCFS2_LA_UNUSED;
2144 osb->local_alloc_bh = NULL; 2144 osb->local_alloc_bh = NULL;
2145 INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); 2145 INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
2146 2146
2147 init_waitqueue_head(&osb->osb_mount_event); 2147 init_waitqueue_head(&osb->osb_mount_event);
2148 2148
2149 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); 2149 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2150 if (status) { 2150 if (status) {
2151 mlog_errno(status); 2151 mlog_errno(status);
2152 goto bail; 2152 goto bail;
2153 } 2153 }
2154 2154
2155 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2155 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2156 if (!osb->vol_label) { 2156 if (!osb->vol_label) {
2157 mlog(ML_ERROR, "unable to alloc vol label\n"); 2157 mlog(ML_ERROR, "unable to alloc vol label\n");
2158 status = -ENOMEM; 2158 status = -ENOMEM;
2159 goto bail; 2159 goto bail;
2160 } 2160 }
2161 2161
2162 osb->slot_recovery_generations = 2162 osb->slot_recovery_generations =
2163 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2163 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2164 GFP_KERNEL); 2164 GFP_KERNEL);
2165 if (!osb->slot_recovery_generations) { 2165 if (!osb->slot_recovery_generations) {
2166 status = -ENOMEM; 2166 status = -ENOMEM;
2167 mlog_errno(status); 2167 mlog_errno(status);
2168 goto bail; 2168 goto bail;
2169 } 2169 }
2170 2170
2171 init_waitqueue_head(&osb->osb_wipe_event); 2171 init_waitqueue_head(&osb->osb_wipe_event);
2172 osb->osb_orphan_wipes = kcalloc(osb->max_slots, 2172 osb->osb_orphan_wipes = kcalloc(osb->max_slots,
2173 sizeof(*osb->osb_orphan_wipes), 2173 sizeof(*osb->osb_orphan_wipes),
2174 GFP_KERNEL); 2174 GFP_KERNEL);
2175 if (!osb->osb_orphan_wipes) { 2175 if (!osb->osb_orphan_wipes) {
2176 status = -ENOMEM; 2176 status = -ENOMEM;
2177 mlog_errno(status); 2177 mlog_errno(status);
2178 goto bail; 2178 goto bail;
2179 } 2179 }
2180 2180
2181 osb->osb_rf_lock_tree = RB_ROOT; 2181 osb->osb_rf_lock_tree = RB_ROOT;
2182 2182
2183 osb->s_feature_compat = 2183 osb->s_feature_compat =
2184 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); 2184 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
2185 osb->s_feature_ro_compat = 2185 osb->s_feature_ro_compat =
2186 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); 2186 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
2187 osb->s_feature_incompat = 2187 osb->s_feature_incompat =
2188 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); 2188 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
2189 2189
2190 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { 2190 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
2191 mlog(ML_ERROR, "couldn't mount because of unsupported " 2191 mlog(ML_ERROR, "couldn't mount because of unsupported "
2192 "optional features (%x).\n", i); 2192 "optional features (%x).\n", i);
2193 status = -EINVAL; 2193 status = -EINVAL;
2194 goto bail; 2194 goto bail;
2195 } 2195 }
2196 if (!(osb->sb->s_flags & MS_RDONLY) && 2196 if (!(osb->sb->s_flags & MS_RDONLY) &&
2197 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { 2197 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
2198 mlog(ML_ERROR, "couldn't mount RDWR because of " 2198 mlog(ML_ERROR, "couldn't mount RDWR because of "
2199 "unsupported optional features (%x).\n", i); 2199 "unsupported optional features (%x).\n", i);
2200 status = -EINVAL; 2200 status = -EINVAL;
2201 goto bail; 2201 goto bail;
2202 } 2202 }
2203 2203
2204 if (ocfs2_clusterinfo_valid(osb)) { 2204 if (ocfs2_clusterinfo_valid(osb)) {
2205 osb->osb_stackflags = 2205 osb->osb_stackflags =
2206 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2206 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2207 strlcpy(osb->osb_cluster_stack, 2207 strlcpy(osb->osb_cluster_stack,
2208 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2208 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2209 OCFS2_STACK_LABEL_LEN + 1); 2209 OCFS2_STACK_LABEL_LEN + 1);
2210 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2210 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
2211 mlog(ML_ERROR, 2211 mlog(ML_ERROR,
2212 "couldn't mount because of an invalid " 2212 "couldn't mount because of an invalid "
2213 "cluster stack label (%s) \n", 2213 "cluster stack label (%s) \n",
2214 osb->osb_cluster_stack); 2214 osb->osb_cluster_stack);
2215 status = -EINVAL; 2215 status = -EINVAL;
2216 goto bail; 2216 goto bail;
2217 } 2217 }
2218 strlcpy(osb->osb_cluster_name, 2218 strlcpy(osb->osb_cluster_name,
2219 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, 2219 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
2220 OCFS2_CLUSTER_NAME_LEN + 1); 2220 OCFS2_CLUSTER_NAME_LEN + 1);
2221 } else { 2221 } else {
2222 /* The empty string is identical with classic tools that 2222 /* The empty string is identical with classic tools that
2223 * don't know about s_cluster_info. */ 2223 * don't know about s_cluster_info. */
2224 osb->osb_cluster_stack[0] = '\0'; 2224 osb->osb_cluster_stack[0] = '\0';
2225 } 2225 }
2226 2226
2227 get_random_bytes(&osb->s_next_generation, sizeof(u32)); 2227 get_random_bytes(&osb->s_next_generation, sizeof(u32));
2228 2228
2229 /* FIXME 2229 /* FIXME
2230 * This should be done in ocfs2_journal_init(), but unknown 2230 * This should be done in ocfs2_journal_init(), but unknown
2231 * ordering issues will cause the filesystem to crash. 2231 * ordering issues will cause the filesystem to crash.
2232 * If anyone wants to figure out what part of the code 2232 * If anyone wants to figure out what part of the code
2233 * refers to osb->journal before ocfs2_journal_init() is run, 2233 * refers to osb->journal before ocfs2_journal_init() is run,
2234 * be my guest. 2234 * be my guest.
2235 */ 2235 */
2236 /* initialize our journal structure */ 2236 /* initialize our journal structure */
2237 2237
2238 journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); 2238 journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
2239 if (!journal) { 2239 if (!journal) {
2240 mlog(ML_ERROR, "unable to alloc journal\n"); 2240 mlog(ML_ERROR, "unable to alloc journal\n");
2241 status = -ENOMEM; 2241 status = -ENOMEM;
2242 goto bail; 2242 goto bail;
2243 } 2243 }
2244 osb->journal = journal; 2244 osb->journal = journal;
2245 journal->j_osb = osb; 2245 journal->j_osb = osb;
2246 2246
2247 atomic_set(&journal->j_num_trans, 0); 2247 atomic_set(&journal->j_num_trans, 0);
2248 init_rwsem(&journal->j_trans_barrier); 2248 init_rwsem(&journal->j_trans_barrier);
2249 init_waitqueue_head(&journal->j_checkpointed); 2249 init_waitqueue_head(&journal->j_checkpointed);
2250 spin_lock_init(&journal->j_lock); 2250 spin_lock_init(&journal->j_lock);
2251 journal->j_trans_id = (unsigned long) 1; 2251 journal->j_trans_id = (unsigned long) 1;
2252 INIT_LIST_HEAD(&journal->j_la_cleanups); 2252 INIT_LIST_HEAD(&journal->j_la_cleanups);
2253 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 2253 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
2254 journal->j_state = OCFS2_JOURNAL_FREE; 2254 journal->j_state = OCFS2_JOURNAL_FREE;
2255 2255
2256 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); 2256 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
2257 init_llist_head(&osb->dquot_drop_list); 2257 init_llist_head(&osb->dquot_drop_list);
2258 2258
2259 /* get some pseudo constants for clustersize bits */ 2259 /* get some pseudo constants for clustersize bits */
2260 osb->s_clustersize_bits = 2260 osb->s_clustersize_bits =
2261 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2261 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2262 osb->s_clustersize = 1 << osb->s_clustersize_bits; 2262 osb->s_clustersize = 1 << osb->s_clustersize_bits;
2263 2263
2264 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || 2264 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
2265 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { 2265 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
2266 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", 2266 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
2267 osb->s_clustersize); 2267 osb->s_clustersize);
2268 status = -EINVAL; 2268 status = -EINVAL;
2269 goto bail; 2269 goto bail;
2270 } 2270 }
2271 2271
2272 total_blocks = ocfs2_clusters_to_blocks(osb->sb, 2272 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2273 le32_to_cpu(di->i_clusters)); 2273 le32_to_cpu(di->i_clusters));
2274 2274
2275 status = generic_check_addressable(osb->sb->s_blocksize_bits, 2275 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2276 total_blocks); 2276 total_blocks);
2277 if (status) { 2277 if (status) {
2278 mlog(ML_ERROR, "Volume too large " 2278 mlog(ML_ERROR, "Volume too large "
2279 "to mount safely on this system"); 2279 "to mount safely on this system");
2280 status = -EFBIG; 2280 status = -EFBIG;
2281 goto bail; 2281 goto bail;
2282 } 2282 }
2283 2283
2284 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, 2284 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
2285 sizeof(di->id2.i_super.s_uuid))) { 2285 sizeof(di->id2.i_super.s_uuid))) {
2286 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); 2286 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
2287 status = -ENOMEM; 2287 status = -ENOMEM;
2288 goto bail; 2288 goto bail;
2289 } 2289 }
2290 2290
2291 strlcpy(osb->vol_label, di->id2.i_super.s_label, 2291 strlcpy(osb->vol_label, di->id2.i_super.s_label,
2292 OCFS2_MAX_VOL_LABEL_LEN); 2292 OCFS2_MAX_VOL_LABEL_LEN);
2293 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 2293 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
2294 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); 2294 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
2295 osb->first_cluster_group_blkno = 2295 osb->first_cluster_group_blkno =
2296 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 2296 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
2297 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 2297 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
2298 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); 2298 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
2299 trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, 2299 trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
2300 (unsigned long long)osb->root_blkno, 2300 (unsigned long long)osb->root_blkno,
2301 (unsigned long long)osb->system_dir_blkno, 2301 (unsigned long long)osb->system_dir_blkno,
2302 osb->s_clustersize_bits); 2302 osb->s_clustersize_bits);
2303 2303
2304 osb->osb_dlm_debug = ocfs2_new_dlm_debug(); 2304 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
2305 if (!osb->osb_dlm_debug) { 2305 if (!osb->osb_dlm_debug) {
2306 status = -ENOMEM; 2306 status = -ENOMEM;
2307 mlog_errno(status); 2307 mlog_errno(status);
2308 goto bail; 2308 goto bail;
2309 } 2309 }
2310 2310
2311 atomic_set(&osb->vol_state, VOLUME_INIT); 2311 atomic_set(&osb->vol_state, VOLUME_INIT);
2312 2312
2313 /* load root, system_dir, and all global system inodes */ 2313 /* load root, system_dir, and all global system inodes */
2314 status = ocfs2_init_global_system_inodes(osb); 2314 status = ocfs2_init_global_system_inodes(osb);
2315 if (status < 0) { 2315 if (status < 0) {
2316 mlog_errno(status); 2316 mlog_errno(status);
2317 goto bail; 2317 goto bail;
2318 } 2318 }
2319 2319
2320 /* 2320 /*
2321 * global bitmap 2321 * global bitmap
2322 */ 2322 */
2323 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, 2323 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
2324 OCFS2_INVALID_SLOT); 2324 OCFS2_INVALID_SLOT);
2325 if (!inode) { 2325 if (!inode) {
2326 status = -EINVAL; 2326 status = -EINVAL;
2327 mlog_errno(status); 2327 mlog_errno(status);
2328 goto bail; 2328 goto bail;
2329 } 2329 }
2330 2330
2331 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2331 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2332 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; 2332 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2333 iput(inode); 2333 iput(inode);
2334 2334
2335 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, 2335 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2336 osb->s_feature_incompat) * 8; 2336 osb->s_feature_incompat) * 8;
2337 2337
2338 status = ocfs2_init_slot_info(osb); 2338 status = ocfs2_init_slot_info(osb);
2339 if (status < 0) { 2339 if (status < 0) {
2340 mlog_errno(status); 2340 mlog_errno(status);
2341 goto bail; 2341 goto bail;
2342 } 2342 }
2343 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); 2343 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
2344 2344
2345 bail: 2345 bail:
2346 return status; 2346 return status;
2347 } 2347 }
2348 2348
2349 /* 2349 /*
2350 * will return: -EAGAIN if it is ok to keep searching for superblocks 2350 * will return: -EAGAIN if it is ok to keep searching for superblocks
2351 * -EINVAL if there is a bad superblock 2351 * -EINVAL if there is a bad superblock
2352 * 0 on success 2352 * 0 on success
2353 */ 2353 */
2354 static int ocfs2_verify_volume(struct ocfs2_dinode *di, 2354 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2355 struct buffer_head *bh, 2355 struct buffer_head *bh,
2356 u32 blksz, 2356 u32 blksz,
2357 struct ocfs2_blockcheck_stats *stats) 2357 struct ocfs2_blockcheck_stats *stats)
2358 { 2358 {
2359 int status = -EAGAIN; 2359 int status = -EAGAIN;
2360 2360
2361 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 2361 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
2362 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 2362 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
2363 /* We have to do a raw check of the feature here */ 2363 /* We have to do a raw check of the feature here */
2364 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & 2364 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
2365 OCFS2_FEATURE_INCOMPAT_META_ECC) { 2365 OCFS2_FEATURE_INCOMPAT_META_ECC) {
2366 status = ocfs2_block_check_validate(bh->b_data, 2366 status = ocfs2_block_check_validate(bh->b_data,
2367 bh->b_size, 2367 bh->b_size,
2368 &di->i_check, 2368 &di->i_check,
2369 stats); 2369 stats);
2370 if (status) 2370 if (status)
2371 goto out; 2371 goto out;
2372 } 2372 }
2373 status = -EINVAL; 2373 status = -EINVAL;
2374 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { 2374 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
2375 mlog(ML_ERROR, "found superblock with incorrect block " 2375 mlog(ML_ERROR, "found superblock with incorrect block "
2376 "size: found %u, should be %u\n", 2376 "size: found %u, should be %u\n",
2377 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), 2377 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
2378 blksz); 2378 blksz);
2379 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != 2379 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
2380 OCFS2_MAJOR_REV_LEVEL || 2380 OCFS2_MAJOR_REV_LEVEL ||
2381 le16_to_cpu(di->id2.i_super.s_minor_rev_level) != 2381 le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
2382 OCFS2_MINOR_REV_LEVEL) { 2382 OCFS2_MINOR_REV_LEVEL) {
2383 mlog(ML_ERROR, "found superblock with bad version: " 2383 mlog(ML_ERROR, "found superblock with bad version: "
2384 "found %u.%u, should be %u.%u\n", 2384 "found %u.%u, should be %u.%u\n",
2385 le16_to_cpu(di->id2.i_super.s_major_rev_level), 2385 le16_to_cpu(di->id2.i_super.s_major_rev_level),
2386 le16_to_cpu(di->id2.i_super.s_minor_rev_level), 2386 le16_to_cpu(di->id2.i_super.s_minor_rev_level),
2387 OCFS2_MAJOR_REV_LEVEL, 2387 OCFS2_MAJOR_REV_LEVEL,
2388 OCFS2_MINOR_REV_LEVEL); 2388 OCFS2_MINOR_REV_LEVEL);
2389 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { 2389 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
2390 mlog(ML_ERROR, "bad block number on superblock: " 2390 mlog(ML_ERROR, "bad block number on superblock: "
2391 "found %llu, should be %llu\n", 2391 "found %llu, should be %llu\n",
2392 (unsigned long long)le64_to_cpu(di->i_blkno), 2392 (unsigned long long)le64_to_cpu(di->i_blkno),
2393 (unsigned long long)bh->b_blocknr); 2393 (unsigned long long)bh->b_blocknr);
2394 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || 2394 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
2395 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { 2395 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
2396 mlog(ML_ERROR, "bad cluster size found: %u\n", 2396 mlog(ML_ERROR, "bad cluster size found: %u\n",
2397 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); 2397 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
2398 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { 2398 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
2399 mlog(ML_ERROR, "bad root_blkno: 0\n"); 2399 mlog(ML_ERROR, "bad root_blkno: 0\n");
2400 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { 2400 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
2401 mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); 2401 mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
2402 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { 2402 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
2403 mlog(ML_ERROR, 2403 mlog(ML_ERROR,
2404 "Superblock slots found greater than file system " 2404 "Superblock slots found greater than file system "
2405 "maximum: found %u, max %u\n", 2405 "maximum: found %u, max %u\n",
2406 le16_to_cpu(di->id2.i_super.s_max_slots), 2406 le16_to_cpu(di->id2.i_super.s_max_slots),
2407 OCFS2_MAX_SLOTS); 2407 OCFS2_MAX_SLOTS);
2408 } else { 2408 } else {
2409 /* found it! */ 2409 /* found it! */
2410 status = 0; 2410 status = 0;
2411 } 2411 }
2412 } 2412 }
2413 2413
2414 out: 2414 out:
2415 if (status && status != -EAGAIN) 2415 if (status && status != -EAGAIN)
2416 mlog_errno(status); 2416 mlog_errno(status);
2417 return status; 2417 return status;
2418 } 2418 }
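
A rough, hypothetical sketch (not part of this commit) of how the -EAGAIN / -EINVAL / 0 contract documented above is meant to drive a probing caller; read_candidate_block() is an assumed helper used only for illustration:

static int probe_one_blocksize(struct super_block *sb, u32 blksz,
			       struct ocfs2_blockcheck_stats *stats)
{
	struct buffer_head *bh = read_candidate_block(sb, blksz); /* assumed helper */
	struct ocfs2_dinode *di;
	int status;

	if (!bh)
		return -EIO;

	di = (struct ocfs2_dinode *)bh->b_data;
	status = ocfs2_verify_volume(di, bh, blksz, stats);
	brelse(bh);

	if (status == -EAGAIN)	/* no signature at this block size: keep probing */
		return -EAGAIN;
	if (status < 0)		/* a superblock was found, but it is bad: stop */
		return status;
	return 0;		/* valid superblock: blksz is the right block size */
}
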
2419 2419
2420 static int ocfs2_check_volume(struct ocfs2_super *osb) 2420 static int ocfs2_check_volume(struct ocfs2_super *osb)
2421 { 2421 {
2422 int status; 2422 int status;
2423 int dirty; 2423 int dirty;
2424 int local; 2424 int local;
2425 struct ocfs2_dinode *local_alloc = NULL; /* only used if we 2425 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
2426 * recover 2426 * recover
2427 * ourselves. */ 2427 * ourselves. */
2428 2428
2429 /* Init our journal object. */ 2429 /* Init our journal object. */
2430 status = ocfs2_journal_init(osb->journal, &dirty); 2430 status = ocfs2_journal_init(osb->journal, &dirty);
2431 if (status < 0) { 2431 if (status < 0) {
2432 mlog(ML_ERROR, "Could not initialize journal!\n"); 2432 mlog(ML_ERROR, "Could not initialize journal!\n");
2433 goto finally; 2433 goto finally;
2434 } 2434 }
2435 2435
2436 /* Now that journal has been initialized, check to make sure 2436 /* Now that journal has been initialized, check to make sure
2437 entire volume is addressable. */ 2437 entire volume is addressable. */
2438 status = ocfs2_journal_addressable(osb); 2438 status = ocfs2_journal_addressable(osb);
2439 if (status) 2439 if (status)
2440 goto finally; 2440 goto finally;
2441 2441
2442 /* If the journal was unmounted cleanly then we don't want to 2442 /* If the journal was unmounted cleanly then we don't want to
2443 * recover anything. Otherwise, journal_load will do that 2443 * recover anything. Otherwise, journal_load will do that
2444 * dirty work for us :) */ 2444 * dirty work for us :) */
2445 if (!dirty) { 2445 if (!dirty) {
2446 status = ocfs2_journal_wipe(osb->journal, 0); 2446 status = ocfs2_journal_wipe(osb->journal, 0);
2447 if (status < 0) { 2447 if (status < 0) {
2448 mlog_errno(status); 2448 mlog_errno(status);
2449 goto finally; 2449 goto finally;
2450 } 2450 }
2451 } else { 2451 } else {
2452 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " 2452 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
2453 "unmounted cleanly, recovering it.\n", osb->dev_str); 2453 "unmounted cleanly, recovering it.\n", osb->dev_str);
2454 } 2454 }
2455 2455
2456 local = ocfs2_mount_local(osb); 2456 local = ocfs2_mount_local(osb);
2457 2457
2458 /* will play back anything left in the journal. */ 2458 /* will play back anything left in the journal. */
2459 status = ocfs2_journal_load(osb->journal, local, dirty); 2459 status = ocfs2_journal_load(osb->journal, local, dirty);
2460 if (status < 0) { 2460 if (status < 0) {
2461 mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); 2461 mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
2462 goto finally; 2462 goto finally;
2463 } 2463 }
2464 2464
2465 if (dirty) { 2465 if (dirty) {
2466 /* recover my local alloc if we didn't unmount cleanly. */ 2466 /* recover my local alloc if we didn't unmount cleanly. */
2467 status = ocfs2_begin_local_alloc_recovery(osb, 2467 status = ocfs2_begin_local_alloc_recovery(osb,
2468 osb->slot_num, 2468 osb->slot_num,
2469 &local_alloc); 2469 &local_alloc);
2470 if (status < 0) { 2470 if (status < 0) {
2471 mlog_errno(status); 2471 mlog_errno(status);
2472 goto finally; 2472 goto finally;
2473 } 2473 }
2474 /* we complete the recovery process after we've marked 2474 /* we complete the recovery process after we've marked
2475 * ourselves as mounted. */ 2475 * ourselves as mounted. */
2476 } 2476 }
2477 2477
2478 status = ocfs2_load_local_alloc(osb); 2478 status = ocfs2_load_local_alloc(osb);
2479 if (status < 0) { 2479 if (status < 0) {
2480 mlog_errno(status); 2480 mlog_errno(status);
2481 goto finally; 2481 goto finally;
2482 } 2482 }
2483 2483
2484 if (dirty) { 2484 if (dirty) {
2485 /* Recovery will be completed after we've mounted the 2485 /* Recovery will be completed after we've mounted the
2486 * rest of the volume. */ 2486 * rest of the volume. */
2487 osb->dirty = 1; 2487 osb->dirty = 1;
2488 osb->local_alloc_copy = local_alloc; 2488 osb->local_alloc_copy = local_alloc;
2489 local_alloc = NULL; 2489 local_alloc = NULL;
2490 } 2490 }
2491 2491
2492 /* Go through each journal, trylock it, and if we get the 2492 /* Go through each journal, trylock it, and if we get the
2493 * lock and it's marked as dirty, set the bit in the recovery 2493 * lock and it's marked as dirty, set the bit in the recovery
2494 * map and launch a recovery thread for it. */ 2494 * map and launch a recovery thread for it. */
2495 status = ocfs2_mark_dead_nodes(osb); 2495 status = ocfs2_mark_dead_nodes(osb);
2496 if (status < 0) { 2496 if (status < 0) {
2497 mlog_errno(status); 2497 mlog_errno(status);
2498 goto finally; 2498 goto finally;
2499 } 2499 }
2500 2500
2501 status = ocfs2_compute_replay_slots(osb); 2501 status = ocfs2_compute_replay_slots(osb);
2502 if (status < 0) 2502 if (status < 0)
2503 mlog_errno(status); 2503 mlog_errno(status);
2504 2504
2505 finally: 2505 finally:
2506 kfree(local_alloc); 2506 kfree(local_alloc);
2507 2507
2508 if (status) 2508 if (status)
2509 mlog_errno(status); 2509 mlog_errno(status);
2510 return status; 2510 return status;
2511 } 2511 }
2512 2512
2513 /* 2513 /*
2514 * This routine gets called from dismount or close whenever a dismount of 2514 * This routine gets called from dismount or close whenever a dismount of
2515 * the volume is requested and the osb open count becomes 1. 2515 * the volume is requested and the osb open count becomes 1.
2516 * It will remove the osb from the global list and also free up all the 2516 * It will remove the osb from the global list and also free up all the
2517 * initialized resources and file objects. 2517 * initialized resources and file objects.
2518 */ 2518 */
2519 static void ocfs2_delete_osb(struct ocfs2_super *osb) 2519 static void ocfs2_delete_osb(struct ocfs2_super *osb)
2520 { 2520 {
2521 /* This function assumes that the caller has the main osb resource */ 2521 /* This function assumes that the caller has the main osb resource */
2522 2522
2523 ocfs2_free_slot_info(osb); 2523 ocfs2_free_slot_info(osb);
2524 2524
2525 kfree(osb->osb_orphan_wipes); 2525 kfree(osb->osb_orphan_wipes);
2526 kfree(osb->slot_recovery_generations); 2526 kfree(osb->slot_recovery_generations);
2527 /* FIXME 2527 /* FIXME
2528 * This belongs in journal shutdown, but because we have to 2528 * This belongs in journal shutdown, but because we have to
2529 * allocate osb->journal at the start of ocfs2_initialize_osb(), 2529 * allocate osb->journal at the start of ocfs2_initialize_osb(),
2530 * we free it here. 2530 * we free it here.
2531 */ 2531 */
2532 kfree(osb->journal); 2532 kfree(osb->journal);
2533 kfree(osb->local_alloc_copy); 2533 kfree(osb->local_alloc_copy);
2534 kfree(osb->uuid_str); 2534 kfree(osb->uuid_str);
2535 kfree(osb->vol_label);
2535 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 2536 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
2536 memset(osb, 0, sizeof(struct ocfs2_super)); 2537 memset(osb, 0, sizeof(struct ocfs2_super));
2537 } 2538 }
2538 2539
2539 /* Put OCFS2 into a readonly state, or (if the user specifies it), 2540 /* Put OCFS2 into a readonly state, or (if the user specifies it),
2540 * panic(). We do not support continue-on-error operation. */ 2541 * panic(). We do not support continue-on-error operation. */
2541 static void ocfs2_handle_error(struct super_block *sb) 2542 static void ocfs2_handle_error(struct super_block *sb)
2542 { 2543 {
2543 struct ocfs2_super *osb = OCFS2_SB(sb); 2544 struct ocfs2_super *osb = OCFS2_SB(sb);
2544 2545
2545 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) 2546 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
2546 panic("OCFS2: (device %s): panic forced after error\n", 2547 panic("OCFS2: (device %s): panic forced after error\n",
2547 sb->s_id); 2548 sb->s_id);
2548 2549
2549 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); 2550 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
2550 2551
2551 if (sb->s_flags & MS_RDONLY && 2552 if (sb->s_flags & MS_RDONLY &&
2552 (ocfs2_is_soft_readonly(osb) || 2553 (ocfs2_is_soft_readonly(osb) ||
2553 ocfs2_is_hard_readonly(osb))) 2554 ocfs2_is_hard_readonly(osb)))
2554 return; 2555 return;
2555 2556
2556 printk(KERN_CRIT "File system is now read-only due to the potential " 2557 printk(KERN_CRIT "File system is now read-only due to the potential "
2557 "of on-disk corruption. Please run fsck.ocfs2 once the file " 2558 "of on-disk corruption. Please run fsck.ocfs2 once the file "
2558 "system is unmounted.\n"); 2559 "system is unmounted.\n");
2559 sb->s_flags |= MS_RDONLY; 2560 sb->s_flags |= MS_RDONLY;
2560 ocfs2_set_ro_flag(osb, 0); 2561 ocfs2_set_ro_flag(osb, 0);
2561 } 2562 }
2562 2563
2563 static char error_buf[1024]; 2564 static char error_buf[1024];
2564 2565
2565 void __ocfs2_error(struct super_block *sb, 2566 void __ocfs2_error(struct super_block *sb,
2566 const char *function, 2567 const char *function,
2567 const char *fmt, ...) 2568 const char *fmt, ...)
2568 { 2569 {
2569 va_list args; 2570 va_list args;
2570 2571
2571 va_start(args, fmt); 2572 va_start(args, fmt);
2572 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2573 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
2573 va_end(args); 2574 va_end(args);
2574 2575
2575 /* Not using mlog here because we want to show the actual 2576 /* Not using mlog here because we want to show the actual
2576 * function the error came from. */ 2577 * function the error came from. */
2577 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", 2578 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
2578 sb->s_id, function, error_buf); 2579 sb->s_id, function, error_buf);
2579 2580
2580 ocfs2_handle_error(sb); 2581 ocfs2_handle_error(sb);
2581 } 2582 }
2582 2583
2583 /* Handle critical errors. This is intentionally more drastic than 2584 /* Handle critical errors. This is intentionally more drastic than
2584 * ocfs2_handle_error, so we only use it for things like journal errors, 2585 * ocfs2_handle_error, so we only use it for things like journal errors,
2585 * etc. */ 2586 * etc. */
2586 void __ocfs2_abort(struct super_block* sb, 2587 void __ocfs2_abort(struct super_block* sb,
2587 const char *function, 2588 const char *function,
2588 const char *fmt, ...) 2589 const char *fmt, ...)
2589 { 2590 {
2590 va_list args; 2591 va_list args;
2591 2592
2592 va_start(args, fmt); 2593 va_start(args, fmt);
2593 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2594 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
2594 va_end(args); 2595 va_end(args);
2595 2596
2596 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", 2597 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
2597 sb->s_id, function, error_buf); 2598 sb->s_id, function, error_buf);
2598 2599
2599 /* We don't have the cluster support yet to go straight to 2600 /* We don't have the cluster support yet to go straight to
2600 * hard readonly in here. Until then, we want to keep 2601 * hard readonly in here. Until then, we want to keep
2601 * ocfs2_abort() so that we can at least mark critical 2602 * ocfs2_abort() so that we can at least mark critical
2602 * errors. 2603 * errors.
2603 * 2604 *
2604 * TODO: This should abort the journal and alert other nodes 2605 * TODO: This should abort the journal and alert other nodes
2605 * that our slot needs recovery. */ 2606 * that our slot needs recovery. */
2606 2607
2607 /* Force a panic(). This stinks, but it's better than letting 2608 /* Force a panic(). This stinks, but it's better than letting
2608 * things continue without having a proper hard readonly 2609 * things continue without having a proper hard readonly
2609 * here. */ 2610 * here. */
2610 if (!ocfs2_mount_local(OCFS2_SB(sb))) 2611 if (!ocfs2_mount_local(OCFS2_SB(sb)))
2611 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 2612 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
2612 ocfs2_handle_error(sb); 2613 ocfs2_handle_error(sb);
2613 } 2614 }
2614 2615
2615 /* 2616 /*
2616 * These signal-blocking helpers return void, because in-kernel 2617 * These signal-blocking helpers return void, because in-kernel
2617 * sigprocmask() only fails when SIG_* is wrong. 2618 * sigprocmask() only fails when SIG_* is wrong.
2618 */ 2619 */
2619 void ocfs2_block_signals(sigset_t *oldset) 2620 void ocfs2_block_signals(sigset_t *oldset)
2620 { 2621 {
2621 int rc; 2622 int rc;
2622 sigset_t blocked; 2623 sigset_t blocked;
2623 2624
2624 sigfillset(&blocked); 2625 sigfillset(&blocked);
2625 rc = sigprocmask(SIG_BLOCK, &blocked, oldset); 2626 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2626 BUG_ON(rc); 2627 BUG_ON(rc);
2627 } 2628 }
2628 2629
2629 void ocfs2_unblock_signals(sigset_t *oldset) 2630 void ocfs2_unblock_signals(sigset_t *oldset)
2630 { 2631 {
2631 int rc = sigprocmask(SIG_SETMASK, oldset, NULL); 2632 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2632 BUG_ON(rc); 2633 BUG_ON(rc);
2633 } 2634 }
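
A minimal usage sketch of the two helpers above (illustrative only; the caller is hypothetical): the saved mask must always be restored by the matching unblock call, so the pair brackets the signal-sensitive region:

static void example_signal_safe_region(void)
{
	sigset_t oldset;

	ocfs2_block_signals(&oldset);	/* block all signals, save the old mask */
	/* ... work that must not be interrupted by signal delivery ... */
	ocfs2_unblock_signals(&oldset);	/* restore the caller's original mask */
}
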
2634 2635
2635 module_init(ocfs2_init); 2636 module_init(ocfs2_init);
2636 module_exit(ocfs2_exit); 2637 module_exit(ocfs2_exit);
2637 2638
1 #include <linux/mm.h> 1 #include <linux/mm.h>
2 #include <linux/vmacache.h> 2 #include <linux/vmacache.h>
3 #include <linux/hugetlb.h> 3 #include <linux/hugetlb.h>
4 #include <linux/huge_mm.h> 4 #include <linux/huge_mm.h>
5 #include <linux/mount.h> 5 #include <linux/mount.h>
6 #include <linux/seq_file.h> 6 #include <linux/seq_file.h>
7 #include <linux/highmem.h> 7 #include <linux/highmem.h>
8 #include <linux/ptrace.h> 8 #include <linux/ptrace.h>
9 #include <linux/slab.h> 9 #include <linux/slab.h>
10 #include <linux/pagemap.h> 10 #include <linux/pagemap.h>
11 #include <linux/mempolicy.h> 11 #include <linux/mempolicy.h>
12 #include <linux/rmap.h> 12 #include <linux/rmap.h>
13 #include <linux/swap.h> 13 #include <linux/swap.h>
14 #include <linux/swapops.h> 14 #include <linux/swapops.h>
15 #include <linux/mmu_notifier.h> 15 #include <linux/mmu_notifier.h>
16 16
17 #include <asm/elf.h> 17 #include <asm/elf.h>
18 #include <asm/uaccess.h> 18 #include <asm/uaccess.h>
19 #include <asm/tlbflush.h> 19 #include <asm/tlbflush.h>
20 #include "internal.h" 20 #include "internal.h"
21 21
22 void task_mem(struct seq_file *m, struct mm_struct *mm) 22 void task_mem(struct seq_file *m, struct mm_struct *mm)
23 { 23 {
24 unsigned long data, text, lib, swap; 24 unsigned long data, text, lib, swap;
25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
26 26
27 /* 27 /*
28 * Note: to minimize their overhead, mm maintains hiwater_vm and 28 * Note: to minimize their overhead, mm maintains hiwater_vm and
29 * hiwater_rss only when about to *lower* total_vm or rss. Any 29 * hiwater_rss only when about to *lower* total_vm or rss. Any
30 * collector of these hiwater stats must therefore get total_vm 30 * collector of these hiwater stats must therefore get total_vm
31 * and rss too, which will usually be the higher. Barriers? not 31 * and rss too, which will usually be the higher. Barriers? not
32 * worth the effort, such snapshots can always be inconsistent. 32 * worth the effort, such snapshots can always be inconsistent.
33 */ 33 */
34 hiwater_vm = total_vm = mm->total_vm; 34 hiwater_vm = total_vm = mm->total_vm;
35 if (hiwater_vm < mm->hiwater_vm) 35 if (hiwater_vm < mm->hiwater_vm)
36 hiwater_vm = mm->hiwater_vm; 36 hiwater_vm = mm->hiwater_vm;
37 hiwater_rss = total_rss = get_mm_rss(mm); 37 hiwater_rss = total_rss = get_mm_rss(mm);
38 if (hiwater_rss < mm->hiwater_rss) 38 if (hiwater_rss < mm->hiwater_rss)
39 hiwater_rss = mm->hiwater_rss; 39 hiwater_rss = mm->hiwater_rss;
40 40
41 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 41 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
44 swap = get_mm_counter(mm, MM_SWAPENTS); 44 swap = get_mm_counter(mm, MM_SWAPENTS);
45 seq_printf(m, 45 seq_printf(m,
46 "VmPeak:\t%8lu kB\n" 46 "VmPeak:\t%8lu kB\n"
47 "VmSize:\t%8lu kB\n" 47 "VmSize:\t%8lu kB\n"
48 "VmLck:\t%8lu kB\n" 48 "VmLck:\t%8lu kB\n"
49 "VmPin:\t%8lu kB\n" 49 "VmPin:\t%8lu kB\n"
50 "VmHWM:\t%8lu kB\n" 50 "VmHWM:\t%8lu kB\n"
51 "VmRSS:\t%8lu kB\n" 51 "VmRSS:\t%8lu kB\n"
52 "VmData:\t%8lu kB\n" 52 "VmData:\t%8lu kB\n"
53 "VmStk:\t%8lu kB\n" 53 "VmStk:\t%8lu kB\n"
54 "VmExe:\t%8lu kB\n" 54 "VmExe:\t%8lu kB\n"
55 "VmLib:\t%8lu kB\n" 55 "VmLib:\t%8lu kB\n"
56 "VmPTE:\t%8lu kB\n" 56 "VmPTE:\t%8lu kB\n"
57 "VmSwap:\t%8lu kB\n", 57 "VmSwap:\t%8lu kB\n",
58 hiwater_vm << (PAGE_SHIFT-10), 58 hiwater_vm << (PAGE_SHIFT-10),
59 total_vm << (PAGE_SHIFT-10), 59 total_vm << (PAGE_SHIFT-10),
60 mm->locked_vm << (PAGE_SHIFT-10), 60 mm->locked_vm << (PAGE_SHIFT-10),
61 mm->pinned_vm << (PAGE_SHIFT-10), 61 mm->pinned_vm << (PAGE_SHIFT-10),
62 hiwater_rss << (PAGE_SHIFT-10), 62 hiwater_rss << (PAGE_SHIFT-10),
63 total_rss << (PAGE_SHIFT-10), 63 total_rss << (PAGE_SHIFT-10),
64 data << (PAGE_SHIFT-10), 64 data << (PAGE_SHIFT-10),
65 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 65 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66 (PTRS_PER_PTE * sizeof(pte_t) * 66 (PTRS_PER_PTE * sizeof(pte_t) *
67 atomic_long_read(&mm->nr_ptes)) >> 10, 67 atomic_long_read(&mm->nr_ptes)) >> 10,
68 swap << (PAGE_SHIFT-10)); 68 swap << (PAGE_SHIFT-10));
69 } 69 }
70 70
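
The "<< (PAGE_SHIFT-10)" expressions in task_mem() above convert page counts straight to kB: shifting left by PAGE_SHIFT would give bytes, and shifting right by 10 turns bytes into kB, so a single shift by (PAGE_SHIFT - 10) does both. A standalone userspace sketch of the arithmetic, assuming 4 KiB pages:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;		/* assume 4096-byte pages */
	unsigned long total_vm_pages = 2560;	/* example page count */

	/* 2560 pages * 4 kB/page = 10240 kB */
	printf("VmSize:\t%8lu kB\n", total_vm_pages << (page_shift - 10));
	return 0;
}
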
71 unsigned long task_vsize(struct mm_struct *mm) 71 unsigned long task_vsize(struct mm_struct *mm)
72 { 72 {
73 return PAGE_SIZE * mm->total_vm; 73 return PAGE_SIZE * mm->total_vm;
74 } 74 }
75 75
76 unsigned long task_statm(struct mm_struct *mm, 76 unsigned long task_statm(struct mm_struct *mm,
77 unsigned long *shared, unsigned long *text, 77 unsigned long *shared, unsigned long *text,
78 unsigned long *data, unsigned long *resident) 78 unsigned long *data, unsigned long *resident)
79 { 79 {
80 *shared = get_mm_counter(mm, MM_FILEPAGES); 80 *shared = get_mm_counter(mm, MM_FILEPAGES);
81 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 81 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
82 >> PAGE_SHIFT; 82 >> PAGE_SHIFT;
83 *data = mm->total_vm - mm->shared_vm; 83 *data = mm->total_vm - mm->shared_vm;
84 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); 84 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
85 return mm->total_vm; 85 return mm->total_vm;
86 } 86 }
87 87
88 #ifdef CONFIG_NUMA 88 #ifdef CONFIG_NUMA
89 /* 89 /*
90 * These functions are for numa_maps but called in generic **maps seq_file 90 * These functions are for numa_maps but called in generic **maps seq_file
91 * ->start(), ->stop() ops. 91 * ->start(), ->stop() ops.
92 * 92 *
93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy. 93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
94 * Each mempolicy object is controlled by reference counting. The problem here 94 * Each mempolicy object is controlled by reference counting. The problem here
95 * is how to avoid accessing a dead mempolicy object. 95 * is how to avoid accessing a dead mempolicy object.
96 * 96 *
97 * Because we're holding mmap_sem while reading seq_file, it's safe to access 97 * Because we're holding mmap_sem while reading seq_file, it's safe to access
98 * each vma's mempolicy; no vma object will drop its ref to the mempolicy. 98 * each vma's mempolicy; no vma object will drop its ref to the mempolicy.
99 * 99 *
100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy 100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). 101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot 102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
103 * guarantee the task never exits under us. But taking task_lock() around 103 * guarantee the task never exits under us. But taking task_lock() around
104 * get_vma_policy() causes a lock order problem. 104 * get_vma_policy() causes a lock order problem.
105 * 105 *
106 * To access task->mempolicy without lock, we hold a reference count of an 106 * To access task->mempolicy without lock, we hold a reference count of an
107 * object pointed by task->mempolicy and remember it. This will guarantee 107 * object pointed by task->mempolicy and remember it. This will guarantee
108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses. 108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
109 */ 109 */
110 static void hold_task_mempolicy(struct proc_maps_private *priv) 110 static void hold_task_mempolicy(struct proc_maps_private *priv)
111 { 111 {
112 struct task_struct *task = priv->task; 112 struct task_struct *task = priv->task;
113 113
114 task_lock(task); 114 task_lock(task);
115 priv->task_mempolicy = task->mempolicy; 115 priv->task_mempolicy = task->mempolicy;
116 mpol_get(priv->task_mempolicy); 116 mpol_get(priv->task_mempolicy);
117 task_unlock(task); 117 task_unlock(task);
118 } 118 }
119 static void release_task_mempolicy(struct proc_maps_private *priv) 119 static void release_task_mempolicy(struct proc_maps_private *priv)
120 { 120 {
121 mpol_put(priv->task_mempolicy); 121 mpol_put(priv->task_mempolicy);
122 } 122 }
123 #else 123 #else
124 static void hold_task_mempolicy(struct proc_maps_private *priv) 124 static void hold_task_mempolicy(struct proc_maps_private *priv)
125 { 125 {
126 } 126 }
127 static void release_task_mempolicy(struct proc_maps_private *priv) 127 static void release_task_mempolicy(struct proc_maps_private *priv)
128 { 128 {
129 } 129 }
130 #endif 130 #endif
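
The intended usage of the two helpers above is a hold -> walk -> release bracket around the vma scan, so that task->mempolicy stays alive for the whole walk. A simplified, hypothetical sketch (error handling omitted):

static void example_numa_walk(struct proc_maps_private *priv, struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	hold_task_mempolicy(priv);		/* pin priv->task->mempolicy */
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		;				/* inspect vma / its mempolicy here */
	release_task_mempolicy(priv);		/* drop the pinned reference */
	up_read(&mm->mmap_sem);
}
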
131 131
132 static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) 132 static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
133 { 133 {
134 if (vma && vma != priv->tail_vma) { 134 if (vma && vma != priv->tail_vma) {
135 struct mm_struct *mm = vma->vm_mm; 135 struct mm_struct *mm = vma->vm_mm;
136 release_task_mempolicy(priv); 136 release_task_mempolicy(priv);
137 up_read(&mm->mmap_sem); 137 up_read(&mm->mmap_sem);
138 mmput(mm); 138 mmput(mm);
139 } 139 }
140 } 140 }
141 141
142 static void *m_start(struct seq_file *m, loff_t *pos) 142 static void *m_start(struct seq_file *m, loff_t *pos)
143 { 143 {
144 struct proc_maps_private *priv = m->private; 144 struct proc_maps_private *priv = m->private;
145 unsigned long last_addr = m->version; 145 unsigned long last_addr = m->version;
146 struct mm_struct *mm; 146 struct mm_struct *mm;
147 struct vm_area_struct *vma, *tail_vma = NULL; 147 struct vm_area_struct *vma, *tail_vma = NULL;
148 loff_t l = *pos; 148 loff_t l = *pos;
149 149
150 /* Clear the per syscall fields in priv */ 150 /* Clear the per syscall fields in priv */
151 priv->task = NULL; 151 priv->task = NULL;
152 priv->tail_vma = NULL; 152 priv->tail_vma = NULL;
153 153
154 /* 154 /*
155 * We remember last_addr rather than next_addr so that we hit the 155 * We remember last_addr rather than next_addr so that we hit the
156 * vmacache most of the time. We have zero last_addr at 156 * vmacache most of the time. We have zero last_addr at
157 * the beginning and also after lseek. We will have -1 last_addr 157 * the beginning and also after lseek. We will have -1 last_addr
158 * after the end of the vmas. 158 * after the end of the vmas.
159 */ 159 */
160 160
161 if (last_addr == -1UL) 161 if (last_addr == -1UL)
162 return NULL; 162 return NULL;
163 163
164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
165 if (!priv->task) 165 if (!priv->task)
166 return ERR_PTR(-ESRCH); 166 return ERR_PTR(-ESRCH);
167 167
168 mm = mm_access(priv->task, PTRACE_MODE_READ); 168 mm = mm_access(priv->task, PTRACE_MODE_READ);
169 if (!mm || IS_ERR(mm)) 169 if (!mm || IS_ERR(mm))
170 return mm; 170 return mm;
171 down_read(&mm->mmap_sem); 171 down_read(&mm->mmap_sem);
172 172
173 tail_vma = get_gate_vma(priv->task->mm); 173 tail_vma = get_gate_vma(priv->task->mm);
174 priv->tail_vma = tail_vma; 174 priv->tail_vma = tail_vma;
175 hold_task_mempolicy(priv); 175 hold_task_mempolicy(priv);
176 /* Start with last addr hint */ 176 /* Start with last addr hint */
177 vma = find_vma(mm, last_addr); 177 vma = find_vma(mm, last_addr);
178 if (last_addr && vma) { 178 if (last_addr && vma) {
179 vma = vma->vm_next; 179 vma = vma->vm_next;
180 goto out; 180 goto out;
181 } 181 }
182 182
183 /* 183 /*
184 * Check that the vma index is within range and do a 184 * Check that the vma index is within range and do a
185 * sequential scan until m_index. 185 * sequential scan until m_index.
186 */ 186 */
187 vma = NULL; 187 vma = NULL;
188 if ((unsigned long)l < mm->map_count) { 188 if ((unsigned long)l < mm->map_count) {
189 vma = mm->mmap; 189 vma = mm->mmap;
190 while (l-- && vma) 190 while (l-- && vma)
191 vma = vma->vm_next; 191 vma = vma->vm_next;
192 goto out; 192 goto out;
193 } 193 }
194 194
195 if (l != mm->map_count) 195 if (l != mm->map_count)
196 tail_vma = NULL; /* After gate vma */ 196 tail_vma = NULL; /* After gate vma */
197 197
198 out: 198 out:
199 if (vma) 199 if (vma)
200 return vma; 200 return vma;
201 201
202 release_task_mempolicy(priv); 202 release_task_mempolicy(priv);
203 /* End of vmas has been reached */ 203 /* End of vmas has been reached */
204 m->version = (tail_vma != NULL)? 0: -1UL; 204 m->version = (tail_vma != NULL)? 0: -1UL;
205 up_read(&mm->mmap_sem); 205 up_read(&mm->mmap_sem);
206 mmput(mm); 206 mmput(mm);
207 return tail_vma; 207 return tail_vma;
208 } 208 }
209 209
210 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 210 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
211 { 211 {
212 struct proc_maps_private *priv = m->private; 212 struct proc_maps_private *priv = m->private;
213 struct vm_area_struct *vma = v; 213 struct vm_area_struct *vma = v;
214 struct vm_area_struct *tail_vma = priv->tail_vma; 214 struct vm_area_struct *tail_vma = priv->tail_vma;
215 215
216 (*pos)++; 216 (*pos)++;
217 if (vma && (vma != tail_vma) && vma->vm_next) 217 if (vma && (vma != tail_vma) && vma->vm_next)
218 return vma->vm_next; 218 return vma->vm_next;
219 vma_stop(priv, vma); 219 vma_stop(priv, vma);
220 return (vma != tail_vma)? tail_vma: NULL; 220 return (vma != tail_vma)? tail_vma: NULL;
221 } 221 }
222 222
223 static void m_stop(struct seq_file *m, void *v) 223 static void m_stop(struct seq_file *m, void *v)
224 { 224 {
225 struct proc_maps_private *priv = m->private; 225 struct proc_maps_private *priv = m->private;
226 struct vm_area_struct *vma = v; 226 struct vm_area_struct *vma = v;
227 227
228 if (!IS_ERR(vma)) 228 if (!IS_ERR(vma))
229 vma_stop(priv, vma); 229 vma_stop(priv, vma);
230 if (priv->task) 230 if (priv->task)
231 put_task_struct(priv->task); 231 put_task_struct(priv->task);
232 } 232 }
233 233
234 static int do_maps_open(struct inode *inode, struct file *file, 234 static int do_maps_open(struct inode *inode, struct file *file,
235 const struct seq_operations *ops) 235 const struct seq_operations *ops)
236 { 236 {
237 struct proc_maps_private *priv; 237 struct proc_maps_private *priv;
238 int ret = -ENOMEM; 238 int ret = -ENOMEM;
239 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 239 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
240 if (priv) { 240 if (priv) {
241 priv->pid = proc_pid(inode); 241 priv->pid = proc_pid(inode);
242 ret = seq_open(file, ops); 242 ret = seq_open(file, ops);
243 if (!ret) { 243 if (!ret) {
244 struct seq_file *m = file->private_data; 244 struct seq_file *m = file->private_data;
245 m->private = priv; 245 m->private = priv;
246 } else { 246 } else {
247 kfree(priv); 247 kfree(priv);
248 } 248 }
249 } 249 }
250 return ret; 250 return ret;
251 } 251 }
252 252
253 static void 253 static void
254 show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) 254 show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
255 { 255 {
256 struct mm_struct *mm = vma->vm_mm; 256 struct mm_struct *mm = vma->vm_mm;
257 struct file *file = vma->vm_file; 257 struct file *file = vma->vm_file;
258 struct proc_maps_private *priv = m->private; 258 struct proc_maps_private *priv = m->private;
259 struct task_struct *task = priv->task; 259 struct task_struct *task = priv->task;
260 vm_flags_t flags = vma->vm_flags; 260 vm_flags_t flags = vma->vm_flags;
261 unsigned long ino = 0; 261 unsigned long ino = 0;
262 unsigned long long pgoff = 0; 262 unsigned long long pgoff = 0;
263 unsigned long start, end; 263 unsigned long start, end;
264 dev_t dev = 0; 264 dev_t dev = 0;
265 const char *name = NULL; 265 const char *name = NULL;
266 266
267 if (file) { 267 if (file) {
268 struct inode *inode = file_inode(vma->vm_file); 268 struct inode *inode = file_inode(vma->vm_file);
269 dev = inode->i_sb->s_dev; 269 dev = inode->i_sb->s_dev;
270 ino = inode->i_ino; 270 ino = inode->i_ino;
271 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; 271 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
272 } 272 }
273 273
274 /* We don't show the stack guard page in /proc/maps */ 274 /* We don't show the stack guard page in /proc/maps */
275 start = vma->vm_start; 275 start = vma->vm_start;
276 if (stack_guard_page_start(vma, start)) 276 if (stack_guard_page_start(vma, start))
277 start += PAGE_SIZE; 277 start += PAGE_SIZE;
278 end = vma->vm_end; 278 end = vma->vm_end;
279 if (stack_guard_page_end(vma, end)) 279 if (stack_guard_page_end(vma, end))
280 end -= PAGE_SIZE; 280 end -= PAGE_SIZE;
281 281
282 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); 282 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
283 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", 283 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
284 start, 284 start,
285 end, 285 end,
286 flags & VM_READ ? 'r' : '-', 286 flags & VM_READ ? 'r' : '-',
287 flags & VM_WRITE ? 'w' : '-', 287 flags & VM_WRITE ? 'w' : '-',
288 flags & VM_EXEC ? 'x' : '-', 288 flags & VM_EXEC ? 'x' : '-',
289 flags & VM_MAYSHARE ? 's' : 'p', 289 flags & VM_MAYSHARE ? 's' : 'p',
290 pgoff, 290 pgoff,
291 MAJOR(dev), MINOR(dev), ino); 291 MAJOR(dev), MINOR(dev), ino);
292 292
293 /* 293 /*
294 * Print the dentry name for named mappings, and a 294 * Print the dentry name for named mappings, and a
295 * special [heap] marker for the heap: 295 * special [heap] marker for the heap:
296 */ 296 */
297 if (file) { 297 if (file) {
298 seq_pad(m, ' '); 298 seq_pad(m, ' ');
299 seq_path(m, &file->f_path, "\n"); 299 seq_path(m, &file->f_path, "\n");
300 goto done; 300 goto done;
301 } 301 }
302 302
303 if (vma->vm_ops && vma->vm_ops->name) { 303 if (vma->vm_ops && vma->vm_ops->name) {
304 name = vma->vm_ops->name(vma); 304 name = vma->vm_ops->name(vma);
305 if (name) 305 if (name)
306 goto done; 306 goto done;
307 } 307 }
308 308
309 name = arch_vma_name(vma); 309 name = arch_vma_name(vma);
310 if (!name) { 310 if (!name) {
311 pid_t tid; 311 pid_t tid;
312 312
313 if (!mm) { 313 if (!mm) {
314 name = "[vdso]"; 314 name = "[vdso]";
315 goto done; 315 goto done;
316 } 316 }
317 317
318 if (vma->vm_start <= mm->brk && 318 if (vma->vm_start <= mm->brk &&
319 vma->vm_end >= mm->start_brk) { 319 vma->vm_end >= mm->start_brk) {
320 name = "[heap]"; 320 name = "[heap]";
321 goto done; 321 goto done;
322 } 322 }
323 323
324 tid = vm_is_stack(task, vma, is_pid); 324 tid = vm_is_stack(task, vma, is_pid);
325 325
326 if (tid != 0) { 326 if (tid != 0) {
327 /* 327 /*
328 * Thread stack in /proc/PID/task/TID/maps or 328 * Thread stack in /proc/PID/task/TID/maps or
329 * the main process stack. 329 * the main process stack.
330 */ 330 */
331 if (!is_pid || (vma->vm_start <= mm->start_stack && 331 if (!is_pid || (vma->vm_start <= mm->start_stack &&
332 vma->vm_end >= mm->start_stack)) { 332 vma->vm_end >= mm->start_stack)) {
333 name = "[stack]"; 333 name = "[stack]";
334 } else { 334 } else {
335 /* Thread stack in /proc/PID/maps */ 335 /* Thread stack in /proc/PID/maps */
336 seq_pad(m, ' '); 336 seq_pad(m, ' ');
337 seq_printf(m, "[stack:%d]", tid); 337 seq_printf(m, "[stack:%d]", tid);
338 } 338 }
339 } 339 }
340 } 340 }
341 341
342 done: 342 done:
343 if (name) { 343 if (name) {
344 seq_pad(m, ' '); 344 seq_pad(m, ' ');
345 seq_puts(m, name); 345 seq_puts(m, name);
346 } 346 }
347 seq_putc(m, '\n'); 347 seq_putc(m, '\n');
348 } 348 }
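
For reference, the seq_printf() in show_map_vma() above emits lines of the form "start-end perms offset major:minor inode  name". The values in this standalone sketch are invented placeholders; only the field order and formats are taken from the code:

#include <stdio.h>

int main(void)
{
	unsigned long start = 0x00400000UL, end = 0x0040c000UL, ino = 393228UL;
	unsigned long long pgoff = 0;

	/* same field layout as show_map_vma(): range, perms, offset, dev, inode, name */
	printf("%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %s\n",
	       start, end, 'r', '-', 'x', 'p', pgoff, 8, 1, ino, "/bin/cat");
	return 0;
}
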
349 349
350 static int show_map(struct seq_file *m, void *v, int is_pid) 350 static int show_map(struct seq_file *m, void *v, int is_pid)
351 { 351 {
352 struct vm_area_struct *vma = v; 352 struct vm_area_struct *vma = v;
353 struct proc_maps_private *priv = m->private; 353 struct proc_maps_private *priv = m->private;
354 struct task_struct *task = priv->task; 354 struct task_struct *task = priv->task;
355 355
356 show_map_vma(m, vma, is_pid); 356 show_map_vma(m, vma, is_pid);
357 357
358 if (m->count < m->size) /* vma is copied successfully */ 358 if (m->count < m->size) /* vma is copied successfully */
359 m->version = (vma != get_gate_vma(task->mm)) 359 m->version = (vma != get_gate_vma(task->mm))
360 ? vma->vm_start : 0; 360 ? vma->vm_start : 0;
361 return 0; 361 return 0;
362 } 362 }
363 363
364 static int show_pid_map(struct seq_file *m, void *v) 364 static int show_pid_map(struct seq_file *m, void *v)
365 { 365 {
366 return show_map(m, v, 1); 366 return show_map(m, v, 1);
367 } 367 }
368 368
369 static int show_tid_map(struct seq_file *m, void *v) 369 static int show_tid_map(struct seq_file *m, void *v)
370 { 370 {
371 return show_map(m, v, 0); 371 return show_map(m, v, 0);
372 } 372 }
373 373
374 static const struct seq_operations proc_pid_maps_op = { 374 static const struct seq_operations proc_pid_maps_op = {
375 .start = m_start, 375 .start = m_start,
376 .next = m_next, 376 .next = m_next,
377 .stop = m_stop, 377 .stop = m_stop,
378 .show = show_pid_map 378 .show = show_pid_map
379 }; 379 };
380 380
381 static const struct seq_operations proc_tid_maps_op = { 381 static const struct seq_operations proc_tid_maps_op = {
382 .start = m_start, 382 .start = m_start,
383 .next = m_next, 383 .next = m_next,
384 .stop = m_stop, 384 .stop = m_stop,
385 .show = show_tid_map 385 .show = show_tid_map
386 }; 386 };
387 387
388 static int pid_maps_open(struct inode *inode, struct file *file) 388 static int pid_maps_open(struct inode *inode, struct file *file)
389 { 389 {
390 return do_maps_open(inode, file, &proc_pid_maps_op); 390 return do_maps_open(inode, file, &proc_pid_maps_op);
391 } 391 }
392 392
393 static int tid_maps_open(struct inode *inode, struct file *file) 393 static int tid_maps_open(struct inode *inode, struct file *file)
394 { 394 {
395 return do_maps_open(inode, file, &proc_tid_maps_op); 395 return do_maps_open(inode, file, &proc_tid_maps_op);
396 } 396 }
397 397
398 const struct file_operations proc_pid_maps_operations = { 398 const struct file_operations proc_pid_maps_operations = {
399 .open = pid_maps_open, 399 .open = pid_maps_open,
400 .read = seq_read, 400 .read = seq_read,
401 .llseek = seq_lseek, 401 .llseek = seq_lseek,
402 .release = seq_release_private, 402 .release = seq_release_private,
403 }; 403 };
404 404
405 const struct file_operations proc_tid_maps_operations = { 405 const struct file_operations proc_tid_maps_operations = {
406 .open = tid_maps_open, 406 .open = tid_maps_open,
407 .read = seq_read, 407 .read = seq_read,
408 .llseek = seq_lseek, 408 .llseek = seq_lseek,
409 .release = seq_release_private, 409 .release = seq_release_private,
410 }; 410 };
411 411
412 /* 412 /*
413 * Proportional Set Size (PSS): my share of RSS. 413 * Proportional Set Size (PSS): my share of RSS.
414 * 414 *
415 * PSS of a process is the count of pages it has in memory, where each 415 * PSS of a process is the count of pages it has in memory, where each
416 * page is divided by the number of processes sharing it. So if a 416 * page is divided by the number of processes sharing it. So if a
417 * process has 1000 pages all to itself, and 1000 shared with one other 417 * process has 1000 pages all to itself, and 1000 shared with one other
418 * process, its PSS will be 1500. 418 * process, its PSS will be 1500.
419 * 419 *
420 * To keep (accumulated) division errors low, we adopt a 64bit 420 * To keep (accumulated) division errors low, we adopt a 64bit
421 * fixed-point pss counter to minimize division errors. So (pss >> 421 * fixed-point pss counter to minimize division errors. So (pss >>
422 * PSS_SHIFT) would be the real byte count. 422 * PSS_SHIFT) would be the real byte count.
423 * 423 *
424 * A shift of 12 before division means (assuming 4K page size): 424 * A shift of 12 before division means (assuming 4K page size):
425 * - 1M 3-user-pages add up to 8KB errors; 425 * - 1M 3-user-pages add up to 8KB errors;
426 * - supports mapcount up to 2^24, or 16M; 426 * - supports mapcount up to 2^24, or 16M;
427 * - supports PSS up to 2^52 bytes, or 4PB. 427 * - supports PSS up to 2^52 bytes, or 4PB.
428 */ 428 */
429 #define PSS_SHIFT 12 429 #define PSS_SHIFT 12
430 430
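
To make the fixed-point accounting above concrete, here is a standalone userspace sketch using the example from the comment: 1000 pages mapped only by this process plus 1000 pages shared with one other process yield a PSS of 1500 pages, i.e. 6000 kB with 4 KiB pages:

#include <stdio.h>

#define EX_PSS_SHIFT 12				/* mirrors PSS_SHIFT above */

int main(void)
{
	unsigned long long page_size = 4096;
	unsigned long long pss = 0;

	pss += 1000 * (page_size << EX_PSS_SHIFT);	/* mapcount == 1: full pages */
	pss += 1000 * (page_size << EX_PSS_SHIFT) / 2;	/* mapcount == 2: half each */

	/* (pss >> EX_PSS_SHIFT) is the real byte count: 1500 pages = 6000 kB */
	printf("Pss:\t%llu kB\n", (pss >> EX_PSS_SHIFT) >> 10);
	return 0;
}
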
431 #ifdef CONFIG_PROC_PAGE_MONITOR 431 #ifdef CONFIG_PROC_PAGE_MONITOR
432 struct mem_size_stats { 432 struct mem_size_stats {
433 struct vm_area_struct *vma; 433 struct vm_area_struct *vma;
434 unsigned long resident; 434 unsigned long resident;
435 unsigned long shared_clean; 435 unsigned long shared_clean;
436 unsigned long shared_dirty; 436 unsigned long shared_dirty;
437 unsigned long private_clean; 437 unsigned long private_clean;
438 unsigned long private_dirty; 438 unsigned long private_dirty;
439 unsigned long referenced; 439 unsigned long referenced;
440 unsigned long anonymous; 440 unsigned long anonymous;
441 unsigned long anonymous_thp; 441 unsigned long anonymous_thp;
442 unsigned long swap; 442 unsigned long swap;
443 unsigned long nonlinear; 443 unsigned long nonlinear;
444 u64 pss; 444 u64 pss;
445 }; 445 };
446 446
447 447
448 static void smaps_pte_entry(pte_t ptent, unsigned long addr, 448 static void smaps_pte_entry(pte_t ptent, unsigned long addr,
449 unsigned long ptent_size, struct mm_walk *walk) 449 unsigned long ptent_size, struct mm_walk *walk)
450 { 450 {
451 struct mem_size_stats *mss = walk->private; 451 struct mem_size_stats *mss = walk->private;
452 struct vm_area_struct *vma = mss->vma; 452 struct vm_area_struct *vma = mss->vma;
453 pgoff_t pgoff = linear_page_index(vma, addr); 453 pgoff_t pgoff = linear_page_index(vma, addr);
454 struct page *page = NULL; 454 struct page *page = NULL;
455 int mapcount; 455 int mapcount;
456 456
457 if (pte_present(ptent)) { 457 if (pte_present(ptent)) {
458 page = vm_normal_page(vma, addr, ptent); 458 page = vm_normal_page(vma, addr, ptent);
459 } else if (is_swap_pte(ptent)) { 459 } else if (is_swap_pte(ptent)) {
460 swp_entry_t swpent = pte_to_swp_entry(ptent); 460 swp_entry_t swpent = pte_to_swp_entry(ptent);
461 461
462 if (!non_swap_entry(swpent)) 462 if (!non_swap_entry(swpent))
463 mss->swap += ptent_size; 463 mss->swap += ptent_size;
464 else if (is_migration_entry(swpent)) 464 else if (is_migration_entry(swpent))
465 page = migration_entry_to_page(swpent); 465 page = migration_entry_to_page(swpent);
466 } else if (pte_file(ptent)) { 466 } else if (pte_file(ptent)) {
467 if (pte_to_pgoff(ptent) != pgoff) 467 if (pte_to_pgoff(ptent) != pgoff)
468 mss->nonlinear += ptent_size; 468 mss->nonlinear += ptent_size;
469 } 469 }
470 470
471 if (!page) 471 if (!page)
472 return; 472 return;
473 473
474 if (PageAnon(page)) 474 if (PageAnon(page))
475 mss->anonymous += ptent_size; 475 mss->anonymous += ptent_size;
476 476
477 if (page->index != pgoff) 477 if (page->index != pgoff)
478 mss->nonlinear += ptent_size; 478 mss->nonlinear += ptent_size;
479 479
480 mss->resident += ptent_size; 480 mss->resident += ptent_size;
481 /* Accumulate the size in pages that have been accessed. */ 481 /* Accumulate the size in pages that have been accessed. */
482 if (pte_young(ptent) || PageReferenced(page)) 482 if (pte_young(ptent) || PageReferenced(page))
483 mss->referenced += ptent_size; 483 mss->referenced += ptent_size;
484 mapcount = page_mapcount(page); 484 mapcount = page_mapcount(page);
485 if (mapcount >= 2) { 485 if (mapcount >= 2) {
486 if (pte_dirty(ptent) || PageDirty(page)) 486 if (pte_dirty(ptent) || PageDirty(page))
487 mss->shared_dirty += ptent_size; 487 mss->shared_dirty += ptent_size;
488 else 488 else
489 mss->shared_clean += ptent_size; 489 mss->shared_clean += ptent_size;
490 mss->pss += (ptent_size << PSS_SHIFT) / mapcount; 490 mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
491 } else { 491 } else {
492 if (pte_dirty(ptent) || PageDirty(page)) 492 if (pte_dirty(ptent) || PageDirty(page))
493 mss->private_dirty += ptent_size; 493 mss->private_dirty += ptent_size;
494 else 494 else
495 mss->private_clean += ptent_size; 495 mss->private_clean += ptent_size;
496 mss->pss += (ptent_size << PSS_SHIFT); 496 mss->pss += (ptent_size << PSS_SHIFT);
497 } 497 }
498 } 498 }
499 499
500 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 500 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
501 struct mm_walk *walk) 501 struct mm_walk *walk)
502 { 502 {
503 struct mem_size_stats *mss = walk->private; 503 struct mem_size_stats *mss = walk->private;
504 struct vm_area_struct *vma = mss->vma; 504 struct vm_area_struct *vma = mss->vma;
505 pte_t *pte; 505 pte_t *pte;
506 spinlock_t *ptl; 506 spinlock_t *ptl;
507 507
508 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 508 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
509 smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); 509 smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
510 spin_unlock(ptl); 510 spin_unlock(ptl);
511 mss->anonymous_thp += HPAGE_PMD_SIZE; 511 mss->anonymous_thp += HPAGE_PMD_SIZE;
512 return 0; 512 return 0;
513 } 513 }
514 514
515 if (pmd_trans_unstable(pmd)) 515 if (pmd_trans_unstable(pmd))
516 return 0; 516 return 0;
517 /* 517 /*
518 * The mmap_sem held all the way back in m_start() is what 518 * The mmap_sem held all the way back in m_start() is what
519 * keeps khugepaged out of here and prevents it from collapsing 519 * keeps khugepaged out of here and prevents it from collapsing
520 * things under us. 520 * things under us.
521 */ 521 */
522 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 522 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
523 for (; addr != end; pte++, addr += PAGE_SIZE) 523 for (; addr != end; pte++, addr += PAGE_SIZE)
524 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); 524 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
525 pte_unmap_unlock(pte - 1, ptl); 525 pte_unmap_unlock(pte - 1, ptl);
526 cond_resched(); 526 cond_resched();
527 return 0; 527 return 0;
528 } 528 }
529 529
530 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) 530 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
531 { 531 {
532 /* 532 /*
533 * Don't forget to update Documentation/ on changes. 533 * Don't forget to update Documentation/ on changes.
534 */ 534 */
535 static const char mnemonics[BITS_PER_LONG][2] = { 535 static const char mnemonics[BITS_PER_LONG][2] = {
536 /* 536 /*
537 * In case we meet a flag we don't know about. 537 * In case we meet a flag we don't know about.
538 */ 538 */
539 [0 ... (BITS_PER_LONG-1)] = "??", 539 [0 ... (BITS_PER_LONG-1)] = "??",
540 540
541 [ilog2(VM_READ)] = "rd", 541 [ilog2(VM_READ)] = "rd",
542 [ilog2(VM_WRITE)] = "wr", 542 [ilog2(VM_WRITE)] = "wr",
543 [ilog2(VM_EXEC)] = "ex", 543 [ilog2(VM_EXEC)] = "ex",
544 [ilog2(VM_SHARED)] = "sh", 544 [ilog2(VM_SHARED)] = "sh",
545 [ilog2(VM_MAYREAD)] = "mr", 545 [ilog2(VM_MAYREAD)] = "mr",
546 [ilog2(VM_MAYWRITE)] = "mw", 546 [ilog2(VM_MAYWRITE)] = "mw",
547 [ilog2(VM_MAYEXEC)] = "me", 547 [ilog2(VM_MAYEXEC)] = "me",
548 [ilog2(VM_MAYSHARE)] = "ms", 548 [ilog2(VM_MAYSHARE)] = "ms",
549 [ilog2(VM_GROWSDOWN)] = "gd", 549 [ilog2(VM_GROWSDOWN)] = "gd",
550 [ilog2(VM_PFNMAP)] = "pf", 550 [ilog2(VM_PFNMAP)] = "pf",
551 [ilog2(VM_DENYWRITE)] = "dw", 551 [ilog2(VM_DENYWRITE)] = "dw",
552 [ilog2(VM_LOCKED)] = "lo", 552 [ilog2(VM_LOCKED)] = "lo",
553 [ilog2(VM_IO)] = "io", 553 [ilog2(VM_IO)] = "io",
554 [ilog2(VM_SEQ_READ)] = "sr", 554 [ilog2(VM_SEQ_READ)] = "sr",
555 [ilog2(VM_RAND_READ)] = "rr", 555 [ilog2(VM_RAND_READ)] = "rr",
556 [ilog2(VM_DONTCOPY)] = "dc", 556 [ilog2(VM_DONTCOPY)] = "dc",
557 [ilog2(VM_DONTEXPAND)] = "de", 557 [ilog2(VM_DONTEXPAND)] = "de",
558 [ilog2(VM_ACCOUNT)] = "ac", 558 [ilog2(VM_ACCOUNT)] = "ac",
559 [ilog2(VM_NORESERVE)] = "nr", 559 [ilog2(VM_NORESERVE)] = "nr",
560 [ilog2(VM_HUGETLB)] = "ht", 560 [ilog2(VM_HUGETLB)] = "ht",
561 [ilog2(VM_NONLINEAR)] = "nl", 561 [ilog2(VM_NONLINEAR)] = "nl",
562 [ilog2(VM_ARCH_1)] = "ar", 562 [ilog2(VM_ARCH_1)] = "ar",
563 [ilog2(VM_DONTDUMP)] = "dd", 563 [ilog2(VM_DONTDUMP)] = "dd",
564 #ifdef CONFIG_MEM_SOFT_DIRTY 564 #ifdef CONFIG_MEM_SOFT_DIRTY
565 [ilog2(VM_SOFTDIRTY)] = "sd", 565 [ilog2(VM_SOFTDIRTY)] = "sd",
566 #endif 566 #endif
567 [ilog2(VM_MIXEDMAP)] = "mm", 567 [ilog2(VM_MIXEDMAP)] = "mm",
568 [ilog2(VM_HUGEPAGE)] = "hg", 568 [ilog2(VM_HUGEPAGE)] = "hg",
569 [ilog2(VM_NOHUGEPAGE)] = "nh", 569 [ilog2(VM_NOHUGEPAGE)] = "nh",
570 [ilog2(VM_MERGEABLE)] = "mg", 570 [ilog2(VM_MERGEABLE)] = "mg",
571 }; 571 };
572 size_t i; 572 size_t i;
573 573
574 seq_puts(m, "VmFlags: "); 574 seq_puts(m, "VmFlags: ");
575 for (i = 0; i < BITS_PER_LONG; i++) { 575 for (i = 0; i < BITS_PER_LONG; i++) {
576 if (vma->vm_flags & (1UL << i)) { 576 if (vma->vm_flags & (1UL << i)) {
577 seq_printf(m, "%c%c ", 577 seq_printf(m, "%c%c ",
578 mnemonics[i][0], mnemonics[i][1]); 578 mnemonics[i][0], mnemonics[i][1]);
579 } 579 }
580 } 580 }
581 seq_putc(m, '\n'); 581 seq_putc(m, '\n');
582 } 582 }
583 583
584 static int show_smap(struct seq_file *m, void *v, int is_pid) 584 static int show_smap(struct seq_file *m, void *v, int is_pid)
585 { 585 {
586 struct proc_maps_private *priv = m->private; 586 struct proc_maps_private *priv = m->private;
587 struct task_struct *task = priv->task; 587 struct task_struct *task = priv->task;
588 struct vm_area_struct *vma = v; 588 struct vm_area_struct *vma = v;
589 struct mem_size_stats mss; 589 struct mem_size_stats mss;
590 struct mm_walk smaps_walk = { 590 struct mm_walk smaps_walk = {
591 .pmd_entry = smaps_pte_range, 591 .pmd_entry = smaps_pte_range,
592 .mm = vma->vm_mm, 592 .mm = vma->vm_mm,
593 .private = &mss, 593 .private = &mss,
594 }; 594 };
595 595
596 memset(&mss, 0, sizeof mss); 596 memset(&mss, 0, sizeof mss);
597 mss.vma = vma; 597 mss.vma = vma;
598 /* mmap_sem is held in m_start */ 598 /* mmap_sem is held in m_start */
599 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 599 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
600 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 600 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
601 601
602 show_map_vma(m, vma, is_pid); 602 show_map_vma(m, vma, is_pid);
603 603
604 seq_printf(m, 604 seq_printf(m,
605 "Size: %8lu kB\n" 605 "Size: %8lu kB\n"
606 "Rss: %8lu kB\n" 606 "Rss: %8lu kB\n"
607 "Pss: %8lu kB\n" 607 "Pss: %8lu kB\n"
608 "Shared_Clean: %8lu kB\n" 608 "Shared_Clean: %8lu kB\n"
609 "Shared_Dirty: %8lu kB\n" 609 "Shared_Dirty: %8lu kB\n"
610 "Private_Clean: %8lu kB\n" 610 "Private_Clean: %8lu kB\n"
611 "Private_Dirty: %8lu kB\n" 611 "Private_Dirty: %8lu kB\n"
612 "Referenced: %8lu kB\n" 612 "Referenced: %8lu kB\n"
613 "Anonymous: %8lu kB\n" 613 "Anonymous: %8lu kB\n"
614 "AnonHugePages: %8lu kB\n" 614 "AnonHugePages: %8lu kB\n"
615 "Swap: %8lu kB\n" 615 "Swap: %8lu kB\n"
616 "KernelPageSize: %8lu kB\n" 616 "KernelPageSize: %8lu kB\n"
617 "MMUPageSize: %8lu kB\n" 617 "MMUPageSize: %8lu kB\n"
618 "Locked: %8lu kB\n", 618 "Locked: %8lu kB\n",
619 (vma->vm_end - vma->vm_start) >> 10, 619 (vma->vm_end - vma->vm_start) >> 10,
620 mss.resident >> 10, 620 mss.resident >> 10,
621 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 621 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
622 mss.shared_clean >> 10, 622 mss.shared_clean >> 10,
623 mss.shared_dirty >> 10, 623 mss.shared_dirty >> 10,
624 mss.private_clean >> 10, 624 mss.private_clean >> 10,
625 mss.private_dirty >> 10, 625 mss.private_dirty >> 10,
626 mss.referenced >> 10, 626 mss.referenced >> 10,
627 mss.anonymous >> 10, 627 mss.anonymous >> 10,
628 mss.anonymous_thp >> 10, 628 mss.anonymous_thp >> 10,
629 mss.swap >> 10, 629 mss.swap >> 10,
630 vma_kernel_pagesize(vma) >> 10, 630 vma_kernel_pagesize(vma) >> 10,
631 vma_mmu_pagesize(vma) >> 10, 631 vma_mmu_pagesize(vma) >> 10,
632 (vma->vm_flags & VM_LOCKED) ? 632 (vma->vm_flags & VM_LOCKED) ?
633 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 633 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
634 634
635 if (vma->vm_flags & VM_NONLINEAR) 635 if (vma->vm_flags & VM_NONLINEAR)
636 seq_printf(m, "Nonlinear: %8lu kB\n", 636 seq_printf(m, "Nonlinear: %8lu kB\n",
637 mss.nonlinear >> 10); 637 mss.nonlinear >> 10);
638 638
639 show_smap_vma_flags(m, vma); 639 show_smap_vma_flags(m, vma);
640 640
641 if (m->count < m->size) /* vma is copied successfully */ 641 if (m->count < m->size) /* vma is copied successfully */
642 m->version = (vma != get_gate_vma(task->mm)) 642 m->version = (vma != get_gate_vma(task->mm))
643 ? vma->vm_start : 0; 643 ? vma->vm_start : 0;
644 return 0; 644 return 0;
645 } 645 }
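For orientation: the block emitted above is plain "Field: value kB" lines, one block per VMA, so /proc/PID/smaps lends itself to line-oriented parsing. A minimal userspace sketch (not part of this file, illustrative only) that totals a task's proportional set size from those lines:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/smaps", "r");
	char line[256];
	unsigned long kb, total = 0;

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "Pss: %lu kB", &kb) == 1)
			total += kb;	/* one "Pss:" line per VMA, in kB */
	fclose(f);
	printf("Pss total: %lu kB\n", total);
	return 0;
}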
646 646
647 static int show_pid_smap(struct seq_file *m, void *v) 647 static int show_pid_smap(struct seq_file *m, void *v)
648 { 648 {
649 return show_smap(m, v, 1); 649 return show_smap(m, v, 1);
650 } 650 }
651 651
652 static int show_tid_smap(struct seq_file *m, void *v) 652 static int show_tid_smap(struct seq_file *m, void *v)
653 { 653 {
654 return show_smap(m, v, 0); 654 return show_smap(m, v, 0);
655 } 655 }
656 656
657 static const struct seq_operations proc_pid_smaps_op = { 657 static const struct seq_operations proc_pid_smaps_op = {
658 .start = m_start, 658 .start = m_start,
659 .next = m_next, 659 .next = m_next,
660 .stop = m_stop, 660 .stop = m_stop,
661 .show = show_pid_smap 661 .show = show_pid_smap
662 }; 662 };
663 663
664 static const struct seq_operations proc_tid_smaps_op = { 664 static const struct seq_operations proc_tid_smaps_op = {
665 .start = m_start, 665 .start = m_start,
666 .next = m_next, 666 .next = m_next,
667 .stop = m_stop, 667 .stop = m_stop,
668 .show = show_tid_smap 668 .show = show_tid_smap
669 }; 669 };
670 670
671 static int pid_smaps_open(struct inode *inode, struct file *file) 671 static int pid_smaps_open(struct inode *inode, struct file *file)
672 { 672 {
673 return do_maps_open(inode, file, &proc_pid_smaps_op); 673 return do_maps_open(inode, file, &proc_pid_smaps_op);
674 } 674 }
675 675
676 static int tid_smaps_open(struct inode *inode, struct file *file) 676 static int tid_smaps_open(struct inode *inode, struct file *file)
677 { 677 {
678 return do_maps_open(inode, file, &proc_tid_smaps_op); 678 return do_maps_open(inode, file, &proc_tid_smaps_op);
679 } 679 }
680 680
681 const struct file_operations proc_pid_smaps_operations = { 681 const struct file_operations proc_pid_smaps_operations = {
682 .open = pid_smaps_open, 682 .open = pid_smaps_open,
683 .read = seq_read, 683 .read = seq_read,
684 .llseek = seq_lseek, 684 .llseek = seq_lseek,
685 .release = seq_release_private, 685 .release = seq_release_private,
686 }; 686 };
687 687
688 const struct file_operations proc_tid_smaps_operations = { 688 const struct file_operations proc_tid_smaps_operations = {
689 .open = tid_smaps_open, 689 .open = tid_smaps_open,
690 .read = seq_read, 690 .read = seq_read,
691 .llseek = seq_lseek, 691 .llseek = seq_lseek,
692 .release = seq_release_private, 692 .release = seq_release_private,
693 }; 693 };
694 694
695 /* 695 /*
696 * We do not want to have constant page-shift bits sitting in 696 * We do not want to have constant page-shift bits sitting in
697 * pagemap entries and are about to reuse them some time soon. 697 * pagemap entries and are about to reuse them some time soon.
698 * 698 *
699 * Here's the "migration strategy": 699 * Here's the "migration strategy":
700 * 1. when the system boots these bits remain what they are, 700 * 1. when the system boots these bits remain what they are,
701 * but a warning about the future change is printed in the log; 701 * but a warning about the future change is printed in the log;
702 * 2. once anyone clears soft-dirty bits via the clear_refs file, 702 * 2. once anyone clears soft-dirty bits via the clear_refs file,
703 * this flag is set to denote that the user is aware of the 703 * this flag is set to denote that the user is aware of the
704 * new API and that those page-shift bits change their meaning. 704 * new API and that those page-shift bits change their meaning.
705 * The respective warning is printed in dmesg; 705 * The respective warning is printed in dmesg;
706 * 3. In a couple of releases we will remove all the mentions 706 * 3. In a couple of releases we will remove all the mentions
707 * of page-shift in pagemap entries. 707 * of page-shift in pagemap entries.
708 */ 708 */
709 709
710 static bool soft_dirty_cleared __read_mostly; 710 static bool soft_dirty_cleared __read_mostly;
711 711
712 enum clear_refs_types { 712 enum clear_refs_types {
713 CLEAR_REFS_ALL = 1, 713 CLEAR_REFS_ALL = 1,
714 CLEAR_REFS_ANON, 714 CLEAR_REFS_ANON,
715 CLEAR_REFS_MAPPED, 715 CLEAR_REFS_MAPPED,
716 CLEAR_REFS_SOFT_DIRTY, 716 CLEAR_REFS_SOFT_DIRTY,
717 CLEAR_REFS_LAST, 717 CLEAR_REFS_LAST,
718 }; 718 };
719 719
720 struct clear_refs_private { 720 struct clear_refs_private {
721 struct vm_area_struct *vma; 721 struct vm_area_struct *vma;
722 enum clear_refs_types type; 722 enum clear_refs_types type;
723 }; 723 };
724 724
725 static inline void clear_soft_dirty(struct vm_area_struct *vma, 725 static inline void clear_soft_dirty(struct vm_area_struct *vma,
726 unsigned long addr, pte_t *pte) 726 unsigned long addr, pte_t *pte)
727 { 727 {
728 #ifdef CONFIG_MEM_SOFT_DIRTY 728 #ifdef CONFIG_MEM_SOFT_DIRTY
729 /* 729 /*
730 * The soft-dirty tracker uses #PF-s to catch writes 730 * The soft-dirty tracker uses #PF-s to catch writes
731 * to pages, so write-protect the pte as well. See the 731 * to pages, so write-protect the pte as well. See the
732 * Documentation/vm/soft-dirty.txt for full description 732 * Documentation/vm/soft-dirty.txt for full description
733 * of how soft-dirty works. 733 * of how soft-dirty works.
734 */ 734 */
735 pte_t ptent = *pte; 735 pte_t ptent = *pte;
736 736
737 if (pte_present(ptent)) { 737 if (pte_present(ptent)) {
738 ptent = pte_wrprotect(ptent); 738 ptent = pte_wrprotect(ptent);
739 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 739 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
740 } else if (is_swap_pte(ptent)) { 740 } else if (is_swap_pte(ptent)) {
741 ptent = pte_swp_clear_soft_dirty(ptent); 741 ptent = pte_swp_clear_soft_dirty(ptent);
742 } else if (pte_file(ptent)) { 742 } else if (pte_file(ptent)) {
743 ptent = pte_file_clear_soft_dirty(ptent); 743 ptent = pte_file_clear_soft_dirty(ptent);
744 } 744 }
745 745
746 set_pte_at(vma->vm_mm, addr, pte, ptent); 746 set_pte_at(vma->vm_mm, addr, pte, ptent);
747 #endif 747 #endif
748 } 748 }
749 749
750 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 750 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
751 unsigned long end, struct mm_walk *walk) 751 unsigned long end, struct mm_walk *walk)
752 { 752 {
753 struct clear_refs_private *cp = walk->private; 753 struct clear_refs_private *cp = walk->private;
754 struct vm_area_struct *vma = cp->vma; 754 struct vm_area_struct *vma = cp->vma;
755 pte_t *pte, ptent; 755 pte_t *pte, ptent;
756 spinlock_t *ptl; 756 spinlock_t *ptl;
757 struct page *page; 757 struct page *page;
758 758
759 split_huge_page_pmd(vma, addr, pmd); 759 split_huge_page_pmd(vma, addr, pmd);
760 if (pmd_trans_unstable(pmd)) 760 if (pmd_trans_unstable(pmd))
761 return 0; 761 return 0;
762 762
763 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 763 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
764 for (; addr != end; pte++, addr += PAGE_SIZE) { 764 for (; addr != end; pte++, addr += PAGE_SIZE) {
765 ptent = *pte; 765 ptent = *pte;
766 766
767 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 767 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
768 clear_soft_dirty(vma, addr, pte); 768 clear_soft_dirty(vma, addr, pte);
769 continue; 769 continue;
770 } 770 }
771 771
772 if (!pte_present(ptent)) 772 if (!pte_present(ptent))
773 continue; 773 continue;
774 774
775 page = vm_normal_page(vma, addr, ptent); 775 page = vm_normal_page(vma, addr, ptent);
776 if (!page) 776 if (!page)
777 continue; 777 continue;
778 778
779 /* Clear accessed and referenced bits. */ 779 /* Clear accessed and referenced bits. */
780 ptep_test_and_clear_young(vma, addr, pte); 780 ptep_test_and_clear_young(vma, addr, pte);
781 ClearPageReferenced(page); 781 ClearPageReferenced(page);
782 } 782 }
783 pte_unmap_unlock(pte - 1, ptl); 783 pte_unmap_unlock(pte - 1, ptl);
784 cond_resched(); 784 cond_resched();
785 return 0; 785 return 0;
786 } 786 }
787 787
788 static ssize_t clear_refs_write(struct file *file, const char __user *buf, 788 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
789 size_t count, loff_t *ppos) 789 size_t count, loff_t *ppos)
790 { 790 {
791 struct task_struct *task; 791 struct task_struct *task;
792 char buffer[PROC_NUMBUF]; 792 char buffer[PROC_NUMBUF];
793 struct mm_struct *mm; 793 struct mm_struct *mm;
794 struct vm_area_struct *vma; 794 struct vm_area_struct *vma;
795 enum clear_refs_types type; 795 enum clear_refs_types type;
796 int itype; 796 int itype;
797 int rv; 797 int rv;
798 798
799 memset(buffer, 0, sizeof(buffer)); 799 memset(buffer, 0, sizeof(buffer));
800 if (count > sizeof(buffer) - 1) 800 if (count > sizeof(buffer) - 1)
801 count = sizeof(buffer) - 1; 801 count = sizeof(buffer) - 1;
802 if (copy_from_user(buffer, buf, count)) 802 if (copy_from_user(buffer, buf, count))
803 return -EFAULT; 803 return -EFAULT;
804 rv = kstrtoint(strstrip(buffer), 10, &itype); 804 rv = kstrtoint(strstrip(buffer), 10, &itype);
805 if (rv < 0) 805 if (rv < 0)
806 return rv; 806 return rv;
807 type = (enum clear_refs_types)itype; 807 type = (enum clear_refs_types)itype;
808 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) 808 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
809 return -EINVAL; 809 return -EINVAL;
810 810
811 if (type == CLEAR_REFS_SOFT_DIRTY) { 811 if (type == CLEAR_REFS_SOFT_DIRTY) {
812 soft_dirty_cleared = true; 812 soft_dirty_cleared = true;
813 pr_warn_once("The pagemap bits 55-60 has changed their meaning!" 813 pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
814 " See the linux/Documentation/vm/pagemap.txt for " 814 " See the linux/Documentation/vm/pagemap.txt for "
815 "details.\n"); 815 "details.\n");
816 } 816 }
817 817
818 task = get_proc_task(file_inode(file)); 818 task = get_proc_task(file_inode(file));
819 if (!task) 819 if (!task)
820 return -ESRCH; 820 return -ESRCH;
821 mm = get_task_mm(task); 821 mm = get_task_mm(task);
822 if (mm) { 822 if (mm) {
823 struct clear_refs_private cp = { 823 struct clear_refs_private cp = {
824 .type = type, 824 .type = type,
825 }; 825 };
826 struct mm_walk clear_refs_walk = { 826 struct mm_walk clear_refs_walk = {
827 .pmd_entry = clear_refs_pte_range, 827 .pmd_entry = clear_refs_pte_range,
828 .mm = mm, 828 .mm = mm,
829 .private = &cp, 829 .private = &cp,
830 }; 830 };
831 down_read(&mm->mmap_sem); 831 down_read(&mm->mmap_sem);
832 if (type == CLEAR_REFS_SOFT_DIRTY) 832 if (type == CLEAR_REFS_SOFT_DIRTY)
833 mmu_notifier_invalidate_range_start(mm, 0, -1); 833 mmu_notifier_invalidate_range_start(mm, 0, -1);
834 for (vma = mm->mmap; vma; vma = vma->vm_next) { 834 for (vma = mm->mmap; vma; vma = vma->vm_next) {
835 cp.vma = vma; 835 cp.vma = vma;
836 if (is_vm_hugetlb_page(vma)) 836 if (is_vm_hugetlb_page(vma))
837 continue; 837 continue;
838 /* 838 /*
839 * Writing 1 to /proc/pid/clear_refs affects all pages. 839 * Writing 1 to /proc/pid/clear_refs affects all pages.
840 * 840 *
841 * Writing 2 to /proc/pid/clear_refs only affects 841 * Writing 2 to /proc/pid/clear_refs only affects
842 * Anonymous pages. 842 * Anonymous pages.
843 * 843 *
844 * Writing 3 to /proc/pid/clear_refs only affects file 844 * Writing 3 to /proc/pid/clear_refs only affects file
845 * mapped pages. 845 * mapped pages.
846 * 846 *
847 * Writing 4 to /proc/pid/clear_refs affects all pages. 847 * Writing 4 to /proc/pid/clear_refs affects all pages.
848 */ 848 */
849 if (type == CLEAR_REFS_ANON && vma->vm_file) 849 if (type == CLEAR_REFS_ANON && vma->vm_file)
850 continue; 850 continue;
851 if (type == CLEAR_REFS_MAPPED && !vma->vm_file) 851 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
852 continue; 852 continue;
853 if (type == CLEAR_REFS_SOFT_DIRTY) { 853 if (type == CLEAR_REFS_SOFT_DIRTY) {
854 if (vma->vm_flags & VM_SOFTDIRTY) 854 if (vma->vm_flags & VM_SOFTDIRTY)
855 vma->vm_flags &= ~VM_SOFTDIRTY; 855 vma->vm_flags &= ~VM_SOFTDIRTY;
856 } 856 }
857 walk_page_range(vma->vm_start, vma->vm_end, 857 walk_page_range(vma->vm_start, vma->vm_end,
858 &clear_refs_walk); 858 &clear_refs_walk);
859 } 859 }
860 if (type == CLEAR_REFS_SOFT_DIRTY) 860 if (type == CLEAR_REFS_SOFT_DIRTY)
861 mmu_notifier_invalidate_range_end(mm, 0, -1); 861 mmu_notifier_invalidate_range_end(mm, 0, -1);
862 flush_tlb_mm(mm); 862 flush_tlb_mm(mm);
863 up_read(&mm->mmap_sem); 863 up_read(&mm->mmap_sem);
864 mmput(mm); 864 mmput(mm);
865 } 865 }
866 put_task_struct(task); 866 put_task_struct(task);
867 867
868 return count; 868 return count;
869 } 869 }
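As the comment block inside the loop above spells out, the written value selects the scope: 1 clears the referenced bits for all pages, 2 only for anonymous mappings, 3 only for file-backed mappings, and 4 (CLEAR_REFS_SOFT_DIRTY) resets soft-dirty tracking. A minimal userspace sketch of the soft-dirty reset step (hypothetical helper name, illustrative only):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Write "4" to /proc/<pid>/clear_refs; afterwards, pages the task writes
 * to show up again as soft-dirty (bit 55 of their pagemap entries). */
static int reset_soft_dirty(pid_t pid)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs("4", f);
	return fclose(f);
}

int main(void)
{
	return reset_soft_dirty(getpid()) ? 1 : 0;
}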
870 870
871 const struct file_operations proc_clear_refs_operations = { 871 const struct file_operations proc_clear_refs_operations = {
872 .write = clear_refs_write, 872 .write = clear_refs_write,
873 .llseek = noop_llseek, 873 .llseek = noop_llseek,
874 }; 874 };
875 875
876 typedef struct { 876 typedef struct {
877 u64 pme; 877 u64 pme;
878 } pagemap_entry_t; 878 } pagemap_entry_t;
879 879
880 struct pagemapread { 880 struct pagemapread {
881 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ 881 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
882 pagemap_entry_t *buffer; 882 pagemap_entry_t *buffer;
883 bool v2; 883 bool v2;
884 }; 884 };
885 885
886 #define PAGEMAP_WALK_SIZE (PMD_SIZE) 886 #define PAGEMAP_WALK_SIZE (PMD_SIZE)
887 #define PAGEMAP_WALK_MASK (PMD_MASK) 887 #define PAGEMAP_WALK_MASK (PMD_MASK)
888 888
889 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) 889 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
890 #define PM_STATUS_BITS 3 890 #define PM_STATUS_BITS 3
891 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 891 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
892 #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 892 #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
893 #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) 893 #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
894 #define PM_PSHIFT_BITS 6 894 #define PM_PSHIFT_BITS 6
895 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 895 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
896 #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 896 #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
897 #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 897 #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
898 #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) 898 #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
899 #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) 899 #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
900 /* in "new" pagemap pshift bits are occupied with more status bits */ 900 /* in "new" pagemap pshift bits are occupied with more status bits */
901 #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) 901 #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
902 902
903 #define __PM_SOFT_DIRTY (1LL) 903 #define __PM_SOFT_DIRTY (1LL)
904 #define PM_PRESENT PM_STATUS(4LL) 904 #define PM_PRESENT PM_STATUS(4LL)
905 #define PM_SWAP PM_STATUS(2LL) 905 #define PM_SWAP PM_STATUS(2LL)
906 #define PM_FILE PM_STATUS(1LL) 906 #define PM_FILE PM_STATUS(1LL)
907 #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) 907 #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
908 #define PM_END_OF_BUFFER 1 908 #define PM_END_OF_BUFFER 1
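Working through the arithmetic of the macros above: PM_STATUS_OFFSET = 64 - 3 = 61, so PM_FILE, PM_SWAP and PM_PRESENT occupy bits 61, 62 and 63; PM_PSHIFT_OFFSET = 61 - 6 = 55, so the page-shift/flags2 field covers bits 55-60 (which is where __PM_SOFT_DIRTY lands) and PM_PFRAME_MASK covers bits 0-54, matching the layout documented further down. A standalone userspace sketch (illustrative only) that pins those positions down:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t pm_present = 4ULL << 61;		/* PM_STATUS(4) */
	const uint64_t pm_swap    = 2ULL << 61;		/* PM_STATUS(2) */
	const uint64_t pm_file    = 1ULL << 61;		/* PM_STATUS(1) */
	const uint64_t soft_dirty = 1ULL << 55;		/* __PM_PSHIFT(__PM_SOFT_DIRTY) */
	const uint64_t pfn_mask   = (1ULL << 55) - 1;	/* bits 0-54 */

	/* Prints: present 63 swap 62 file 61 soft-dirty 55 pfn-bits 55 */
	printf("present %d swap %d file %d soft-dirty %d pfn-bits %d\n",
	       63 - __builtin_clzll(pm_present), 63 - __builtin_clzll(pm_swap),
	       63 - __builtin_clzll(pm_file), 63 - __builtin_clzll(soft_dirty),
	       64 - __builtin_clzll(pfn_mask));
	return 0;
}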
909 909
910 static inline pagemap_entry_t make_pme(u64 val) 910 static inline pagemap_entry_t make_pme(u64 val)
911 { 911 {
912 return (pagemap_entry_t) { .pme = val }; 912 return (pagemap_entry_t) { .pme = val };
913 } 913 }
914 914
915 static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, 915 static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
916 struct pagemapread *pm) 916 struct pagemapread *pm)
917 { 917 {
918 pm->buffer[pm->pos++] = *pme; 918 pm->buffer[pm->pos++] = *pme;
919 if (pm->pos >= pm->len) 919 if (pm->pos >= pm->len)
920 return PM_END_OF_BUFFER; 920 return PM_END_OF_BUFFER;
921 return 0; 921 return 0;
922 } 922 }
923 923
924 static int pagemap_pte_hole(unsigned long start, unsigned long end, 924 static int pagemap_pte_hole(unsigned long start, unsigned long end,
925 struct mm_walk *walk) 925 struct mm_walk *walk)
926 { 926 {
927 struct pagemapread *pm = walk->private; 927 struct pagemapread *pm = walk->private;
928 unsigned long addr = start; 928 unsigned long addr = start;
929 int err = 0; 929 int err = 0;
930 930
931 while (addr < end) { 931 while (addr < end) {
932 struct vm_area_struct *vma = find_vma(walk->mm, addr); 932 struct vm_area_struct *vma = find_vma(walk->mm, addr);
933 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 933 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
934 unsigned long vm_end; 934 /* End of address space hole, which we mark as non-present. */
935 unsigned long hole_end;
935 936
936 if (!vma) { 937 if (vma)
937 vm_end = end; 938 hole_end = min(end, vma->vm_start);
938 } else { 939 else
939 vm_end = min(end, vma->vm_end); 940 hole_end = end;
940 if (vma->vm_flags & VM_SOFTDIRTY) 941
941 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); 942 for (; addr < hole_end; addr += PAGE_SIZE) {
943 err = add_to_pagemap(addr, &pme, pm);
944 if (err)
945 goto out;
942 } 946 }
943 947
944 for (; addr < vm_end; addr += PAGE_SIZE) { 948 if (!vma)
949 break;
950
951 /* Addresses in the VMA. */
952 if (vma->vm_flags & VM_SOFTDIRTY)
953 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
954 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
945 err = add_to_pagemap(addr, &pme, pm); 955 err = add_to_pagemap(addr, &pme, pm);
946 if (err) 956 if (err)
947 goto out; 957 goto out;
948 } 958 }
949 } 959 }
950
951 out: 960 out:
952 return err; 961 return err;
953 } 962 }
954 963
955 static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 964 static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
956 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 965 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
957 { 966 {
958 u64 frame, flags; 967 u64 frame, flags;
959 struct page *page = NULL; 968 struct page *page = NULL;
960 int flags2 = 0; 969 int flags2 = 0;
961 970
962 if (pte_present(pte)) { 971 if (pte_present(pte)) {
963 frame = pte_pfn(pte); 972 frame = pte_pfn(pte);
964 flags = PM_PRESENT; 973 flags = PM_PRESENT;
965 page = vm_normal_page(vma, addr, pte); 974 page = vm_normal_page(vma, addr, pte);
966 if (pte_soft_dirty(pte)) 975 if (pte_soft_dirty(pte))
967 flags2 |= __PM_SOFT_DIRTY; 976 flags2 |= __PM_SOFT_DIRTY;
968 } else if (is_swap_pte(pte)) { 977 } else if (is_swap_pte(pte)) {
969 swp_entry_t entry; 978 swp_entry_t entry;
970 if (pte_swp_soft_dirty(pte)) 979 if (pte_swp_soft_dirty(pte))
971 flags2 |= __PM_SOFT_DIRTY; 980 flags2 |= __PM_SOFT_DIRTY;
972 entry = pte_to_swp_entry(pte); 981 entry = pte_to_swp_entry(pte);
973 frame = swp_type(entry) | 982 frame = swp_type(entry) |
974 (swp_offset(entry) << MAX_SWAPFILES_SHIFT); 983 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
975 flags = PM_SWAP; 984 flags = PM_SWAP;
976 if (is_migration_entry(entry)) 985 if (is_migration_entry(entry))
977 page = migration_entry_to_page(entry); 986 page = migration_entry_to_page(entry);
978 } else { 987 } else {
979 if (vma->vm_flags & VM_SOFTDIRTY) 988 if (vma->vm_flags & VM_SOFTDIRTY)
980 flags2 |= __PM_SOFT_DIRTY; 989 flags2 |= __PM_SOFT_DIRTY;
981 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 990 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
982 return; 991 return;
983 } 992 }
984 993
985 if (page && !PageAnon(page)) 994 if (page && !PageAnon(page))
986 flags |= PM_FILE; 995 flags |= PM_FILE;
987 if ((vma->vm_flags & VM_SOFTDIRTY)) 996 if ((vma->vm_flags & VM_SOFTDIRTY))
988 flags2 |= __PM_SOFT_DIRTY; 997 flags2 |= __PM_SOFT_DIRTY;
989 998
990 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); 999 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
991 } 1000 }
992 1001
993 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1002 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
994 static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1003 static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
995 pmd_t pmd, int offset, int pmd_flags2) 1004 pmd_t pmd, int offset, int pmd_flags2)
996 { 1005 {
997 /* 1006 /*
998 * Currently pmd for thp is always present because thp can not be 1007 * Currently pmd for thp is always present because thp can not be
999 * swapped-out, migrated, or HWPOISONed (split in such cases instead.) 1008 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
1000 * This if-check is just to prepare for future implementation. 1009 * This if-check is just to prepare for future implementation.
1001 */ 1010 */
1002 if (pmd_present(pmd)) 1011 if (pmd_present(pmd))
1003 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 1012 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
1004 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); 1013 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
1005 else 1014 else
1006 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); 1015 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
1007 } 1016 }
1008 #else 1017 #else
1009 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1018 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1010 pmd_t pmd, int offset, int pmd_flags2) 1019 pmd_t pmd, int offset, int pmd_flags2)
1011 { 1020 {
1012 } 1021 }
1013 #endif 1022 #endif
1014 1023
1015 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1024 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1016 struct mm_walk *walk) 1025 struct mm_walk *walk)
1017 { 1026 {
1018 struct vm_area_struct *vma; 1027 struct vm_area_struct *vma;
1019 struct pagemapread *pm = walk->private; 1028 struct pagemapread *pm = walk->private;
1020 spinlock_t *ptl; 1029 spinlock_t *ptl;
1021 pte_t *pte; 1030 pte_t *pte;
1022 int err = 0; 1031 int err = 0;
1023 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 1032 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1024 1033
1025 /* find the first VMA at or above 'addr' */ 1034 /* find the first VMA at or above 'addr' */
1026 vma = find_vma(walk->mm, addr); 1035 vma = find_vma(walk->mm, addr);
1027 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1036 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1028 int pmd_flags2; 1037 int pmd_flags2;
1029 1038
1030 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1039 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
1031 pmd_flags2 = __PM_SOFT_DIRTY; 1040 pmd_flags2 = __PM_SOFT_DIRTY;
1032 else 1041 else
1033 pmd_flags2 = 0; 1042 pmd_flags2 = 0;
1034 1043
1035 for (; addr != end; addr += PAGE_SIZE) { 1044 for (; addr != end; addr += PAGE_SIZE) {
1036 unsigned long offset; 1045 unsigned long offset;
1037 1046
1038 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1047 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1039 PAGE_SHIFT; 1048 PAGE_SHIFT;
1040 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); 1049 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
1041 err = add_to_pagemap(addr, &pme, pm); 1050 err = add_to_pagemap(addr, &pme, pm);
1042 if (err) 1051 if (err)
1043 break; 1052 break;
1044 } 1053 }
1045 spin_unlock(ptl); 1054 spin_unlock(ptl);
1046 return err; 1055 return err;
1047 } 1056 }
1048 1057
1049 if (pmd_trans_unstable(pmd)) 1058 if (pmd_trans_unstable(pmd))
1050 return 0; 1059 return 0;
1051 for (; addr != end; addr += PAGE_SIZE) { 1060 for (; addr != end; addr += PAGE_SIZE) {
1052 int flags2; 1061 int flags2;
1053 1062
1054 /* check to see if we've left 'vma' behind 1063 /* check to see if we've left 'vma' behind
1055 * and need a new, higher one */ 1064 * and need a new, higher one */
1056 if (vma && (addr >= vma->vm_end)) { 1065 if (vma && (addr >= vma->vm_end)) {
1057 vma = find_vma(walk->mm, addr); 1066 vma = find_vma(walk->mm, addr);
1058 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1067 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1059 flags2 = __PM_SOFT_DIRTY; 1068 flags2 = __PM_SOFT_DIRTY;
1060 else 1069 else
1061 flags2 = 0; 1070 flags2 = 0;
1062 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1071 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1063 } 1072 }
1064 1073
1065 /* check that 'vma' actually covers this address, 1074 /* check that 'vma' actually covers this address,
1066 * and that it isn't a huge page vma */ 1075 * and that it isn't a huge page vma */
1067 if (vma && (vma->vm_start <= addr) && 1076 if (vma && (vma->vm_start <= addr) &&
1068 !is_vm_hugetlb_page(vma)) { 1077 !is_vm_hugetlb_page(vma)) {
1069 pte = pte_offset_map(pmd, addr); 1078 pte = pte_offset_map(pmd, addr);
1070 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1079 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1071 /* unmap before userspace copy */ 1080 /* unmap before userspace copy */
1072 pte_unmap(pte); 1081 pte_unmap(pte);
1073 } 1082 }
1074 err = add_to_pagemap(addr, &pme, pm); 1083 err = add_to_pagemap(addr, &pme, pm);
1075 if (err) 1084 if (err)
1076 return err; 1085 return err;
1077 } 1086 }
1078 1087
1079 cond_resched(); 1088 cond_resched();
1080 1089
1081 return err; 1090 return err;
1082 } 1091 }
1083 1092
1084 #ifdef CONFIG_HUGETLB_PAGE 1093 #ifdef CONFIG_HUGETLB_PAGE
1085 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1094 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1086 pte_t pte, int offset, int flags2) 1095 pte_t pte, int offset, int flags2)
1087 { 1096 {
1088 if (pte_present(pte)) 1097 if (pte_present(pte))
1089 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | 1098 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
1090 PM_STATUS2(pm->v2, flags2) | 1099 PM_STATUS2(pm->v2, flags2) |
1091 PM_PRESENT); 1100 PM_PRESENT);
1092 else 1101 else
1093 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | 1102 *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
1094 PM_STATUS2(pm->v2, flags2)); 1103 PM_STATUS2(pm->v2, flags2));
1095 } 1104 }
1096 1105
1097 /* This function walks within one hugetlb entry in a single call */ 1106 /* This function walks within one hugetlb entry in a single call */
1098 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, 1107 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1099 unsigned long addr, unsigned long end, 1108 unsigned long addr, unsigned long end,
1100 struct mm_walk *walk) 1109 struct mm_walk *walk)
1101 { 1110 {
1102 struct pagemapread *pm = walk->private; 1111 struct pagemapread *pm = walk->private;
1103 struct vm_area_struct *vma; 1112 struct vm_area_struct *vma;
1104 int err = 0; 1113 int err = 0;
1105 int flags2; 1114 int flags2;
1106 pagemap_entry_t pme; 1115 pagemap_entry_t pme;
1107 1116
1108 vma = find_vma(walk->mm, addr); 1117 vma = find_vma(walk->mm, addr);
1109 WARN_ON_ONCE(!vma); 1118 WARN_ON_ONCE(!vma);
1110 1119
1111 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1120 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1112 flags2 = __PM_SOFT_DIRTY; 1121 flags2 = __PM_SOFT_DIRTY;
1113 else 1122 else
1114 flags2 = 0; 1123 flags2 = 0;
1115 1124
1116 for (; addr != end; addr += PAGE_SIZE) { 1125 for (; addr != end; addr += PAGE_SIZE) {
1117 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1126 int offset = (addr & ~hmask) >> PAGE_SHIFT;
1118 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); 1127 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
1119 err = add_to_pagemap(addr, &pme, pm); 1128 err = add_to_pagemap(addr, &pme, pm);
1120 if (err) 1129 if (err)
1121 return err; 1130 return err;
1122 } 1131 }
1123 1132
1124 cond_resched(); 1133 cond_resched();
1125 1134
1126 return err; 1135 return err;
1127 } 1136 }
1128 #endif /* HUGETLB_PAGE */ 1137 #endif /* HUGETLB_PAGE */
1129 1138
1130 /* 1139 /*
1131 * /proc/pid/pagemap - an array mapping virtual pages to pfns 1140 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1132 * 1141 *
1133 * For each page in the address space, this file contains one 64-bit entry 1142 * For each page in the address space, this file contains one 64-bit entry
1134 * consisting of the following: 1143 * consisting of the following:
1135 * 1144 *
1136 * Bits 0-54 page frame number (PFN) if present 1145 * Bits 0-54 page frame number (PFN) if present
1137 * Bits 0-4 swap type if swapped 1146 * Bits 0-4 swap type if swapped
1138 * Bits 5-54 swap offset if swapped 1147 * Bits 5-54 swap offset if swapped
1139 * Bits 55-60 page shift (page size = 1<<page shift) 1148 * Bits 55-60 page shift (page size = 1<<page shift)
1140 * Bit 61 page is file-page or shared-anon 1149 * Bit 61 page is file-page or shared-anon
1141 * Bit 62 page swapped 1150 * Bit 62 page swapped
1142 * Bit 63 page present 1151 * Bit 63 page present
1143 * 1152 *
1144 * If the page is not present but in swap, then the PFN contains an 1153 * If the page is not present but in swap, then the PFN contains an
1145 * encoding of the swap file number and the page's offset into the 1154 * encoding of the swap file number and the page's offset into the
1146 * swap. Unmapped pages return a null PFN. This allows determining 1155 * swap. Unmapped pages return a null PFN. This allows determining
1147 * precisely which pages are mapped (or in swap) and comparing mapped 1156 * precisely which pages are mapped (or in swap) and comparing mapped
1148 * pages between processes. 1157 * pages between processes.
1149 * 1158 *
1150 * Efficient users of this interface will use /proc/pid/maps to 1159 * Efficient users of this interface will use /proc/pid/maps to
1151 * determine which areas of memory are actually mapped and llseek to 1160 * determine which areas of memory are actually mapped and llseek to
1152 * skip over unmapped regions. 1161 * skip over unmapped regions.
1153 */ 1162 */
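To make the layout above concrete, here is a hedged userspace sketch (not part of this file): the entry for a virtual address lives at offset (vaddr / page_size) * 8, and the documented bit positions are enough to pick it apart.

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

static void lookup(const void *vaddr)
{
	uint64_t entry;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = ((uintptr_t)vaddr / psize) * sizeof(entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return;
	if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry)) {
		close(fd);
		return;
	}
	close(fd);

	if (entry & (1ULL << 63))		/* bit 63: present */
		printf("%p -> pfn 0x%llx%s\n", vaddr,
		       (unsigned long long)(entry & ((1ULL << 55) - 1)),
		       (entry & (1ULL << 61)) ? ", file/shared-anon" : "");
	else if (entry & (1ULL << 62))		/* bit 62: swapped */
		printf("%p -> swap type %llu, offset %llu\n", vaddr,
		       (unsigned long long)(entry & 0x1f),
		       (unsigned long long)((entry >> 5) & ((1ULL << 50) - 1)));
	else
		printf("%p -> not present\n", vaddr);
}

int main(void)
{
	int on_stack = 0;

	lookup(&on_stack);
	return 0;
}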
1154 static ssize_t pagemap_read(struct file *file, char __user *buf, 1163 static ssize_t pagemap_read(struct file *file, char __user *buf,
1155 size_t count, loff_t *ppos) 1164 size_t count, loff_t *ppos)
1156 { 1165 {
1157 struct task_struct *task = get_proc_task(file_inode(file)); 1166 struct task_struct *task = get_proc_task(file_inode(file));
1158 struct mm_struct *mm; 1167 struct mm_struct *mm;
1159 struct pagemapread pm; 1168 struct pagemapread pm;
1160 int ret = -ESRCH; 1169 int ret = -ESRCH;
1161 struct mm_walk pagemap_walk = {}; 1170 struct mm_walk pagemap_walk = {};
1162 unsigned long src; 1171 unsigned long src;
1163 unsigned long svpfn; 1172 unsigned long svpfn;
1164 unsigned long start_vaddr; 1173 unsigned long start_vaddr;
1165 unsigned long end_vaddr; 1174 unsigned long end_vaddr;
1166 int copied = 0; 1175 int copied = 0;
1167 1176
1168 if (!task) 1177 if (!task)
1169 goto out; 1178 goto out;
1170 1179
1171 ret = -EINVAL; 1180 ret = -EINVAL;
1172 /* file position must be aligned */ 1181 /* file position must be aligned */
1173 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 1182 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1174 goto out_task; 1183 goto out_task;
1175 1184
1176 ret = 0; 1185 ret = 0;
1177 if (!count) 1186 if (!count)
1178 goto out_task; 1187 goto out_task;
1179 1188
1180 pm.v2 = soft_dirty_cleared; 1189 pm.v2 = soft_dirty_cleared;
1181 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1190 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1182 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); 1191 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1183 ret = -ENOMEM; 1192 ret = -ENOMEM;
1184 if (!pm.buffer) 1193 if (!pm.buffer)
1185 goto out_task; 1194 goto out_task;
1186 1195
1187 mm = mm_access(task, PTRACE_MODE_READ); 1196 mm = mm_access(task, PTRACE_MODE_READ);
1188 ret = PTR_ERR(mm); 1197 ret = PTR_ERR(mm);
1189 if (!mm || IS_ERR(mm)) 1198 if (!mm || IS_ERR(mm))
1190 goto out_free; 1199 goto out_free;
1191 1200
1192 pagemap_walk.pmd_entry = pagemap_pte_range; 1201 pagemap_walk.pmd_entry = pagemap_pte_range;
1193 pagemap_walk.pte_hole = pagemap_pte_hole; 1202 pagemap_walk.pte_hole = pagemap_pte_hole;
1194 #ifdef CONFIG_HUGETLB_PAGE 1203 #ifdef CONFIG_HUGETLB_PAGE
1195 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 1204 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
1196 #endif 1205 #endif
1197 pagemap_walk.mm = mm; 1206 pagemap_walk.mm = mm;
1198 pagemap_walk.private = &pm; 1207 pagemap_walk.private = &pm;
1199 1208
1200 src = *ppos; 1209 src = *ppos;
1201 svpfn = src / PM_ENTRY_BYTES; 1210 svpfn = src / PM_ENTRY_BYTES;
1202 start_vaddr = svpfn << PAGE_SHIFT; 1211 start_vaddr = svpfn << PAGE_SHIFT;
1203 end_vaddr = TASK_SIZE_OF(task); 1212 end_vaddr = TASK_SIZE_OF(task);
1204 1213
1205 /* watch out for wraparound */ 1214 /* watch out for wraparound */
1206 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) 1215 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
1207 start_vaddr = end_vaddr; 1216 start_vaddr = end_vaddr;
1208 1217
1209 /* 1218 /*
1210 * The odds are that this will stop walking way 1219 * The odds are that this will stop walking way
1211 * before end_vaddr, because the length of the 1220 * before end_vaddr, because the length of the
1212 * user buffer is tracked in "pm", and the walk 1221 * user buffer is tracked in "pm", and the walk
1213 * will stop when we hit the end of the buffer. 1222 * will stop when we hit the end of the buffer.
1214 */ 1223 */
1215 ret = 0; 1224 ret = 0;
1216 while (count && (start_vaddr < end_vaddr)) { 1225 while (count && (start_vaddr < end_vaddr)) {
1217 int len; 1226 int len;
1218 unsigned long end; 1227 unsigned long end;
1219 1228
1220 pm.pos = 0; 1229 pm.pos = 0;
1221 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; 1230 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1222 /* overflow ? */ 1231 /* overflow ? */
1223 if (end < start_vaddr || end > end_vaddr) 1232 if (end < start_vaddr || end > end_vaddr)
1224 end = end_vaddr; 1233 end = end_vaddr;
1225 down_read(&mm->mmap_sem); 1234 down_read(&mm->mmap_sem);
1226 ret = walk_page_range(start_vaddr, end, &pagemap_walk); 1235 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
1227 up_read(&mm->mmap_sem); 1236 up_read(&mm->mmap_sem);
1228 start_vaddr = end; 1237 start_vaddr = end;
1229 1238
1230 len = min(count, PM_ENTRY_BYTES * pm.pos); 1239 len = min(count, PM_ENTRY_BYTES * pm.pos);
1231 if (copy_to_user(buf, pm.buffer, len)) { 1240 if (copy_to_user(buf, pm.buffer, len)) {
1232 ret = -EFAULT; 1241 ret = -EFAULT;
1233 goto out_mm; 1242 goto out_mm;
1234 } 1243 }
1235 copied += len; 1244 copied += len;
1236 buf += len; 1245 buf += len;
1237 count -= len; 1246 count -= len;
1238 } 1247 }
1239 *ppos += copied; 1248 *ppos += copied;
1240 if (!ret || ret == PM_END_OF_BUFFER) 1249 if (!ret || ret == PM_END_OF_BUFFER)
1241 ret = copied; 1250 ret = copied;
1242 1251
1243 out_mm: 1252 out_mm:
1244 mmput(mm); 1253 mmput(mm);
1245 out_free: 1254 out_free:
1246 kfree(pm.buffer); 1255 kfree(pm.buffer);
1247 out_task: 1256 out_task:
1248 put_task_struct(task); 1257 put_task_struct(task);
1249 out: 1258 out:
1250 return ret; 1259 return ret;
1251 } 1260 }
1252 1261
1253 static int pagemap_open(struct inode *inode, struct file *file) 1262 static int pagemap_open(struct inode *inode, struct file *file)
1254 { 1263 {
1255 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " 1264 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1256 "to stop being page-shift some time soon. See the " 1265 "to stop being page-shift some time soon. See the "
1257 "linux/Documentation/vm/pagemap.txt for details.\n"); 1266 "linux/Documentation/vm/pagemap.txt for details.\n");
1258 return 0; 1267 return 0;
1259 } 1268 }
1260 1269
1261 const struct file_operations proc_pagemap_operations = { 1270 const struct file_operations proc_pagemap_operations = {
1262 .llseek = mem_lseek, /* borrow this */ 1271 .llseek = mem_lseek, /* borrow this */
1263 .read = pagemap_read, 1272 .read = pagemap_read,
1264 .open = pagemap_open, 1273 .open = pagemap_open,
1265 }; 1274 };
1266 #endif /* CONFIG_PROC_PAGE_MONITOR */ 1275 #endif /* CONFIG_PROC_PAGE_MONITOR */
1267 1276
1268 #ifdef CONFIG_NUMA 1277 #ifdef CONFIG_NUMA
1269 1278
1270 struct numa_maps { 1279 struct numa_maps {
1271 struct vm_area_struct *vma; 1280 struct vm_area_struct *vma;
1272 unsigned long pages; 1281 unsigned long pages;
1273 unsigned long anon; 1282 unsigned long anon;
1274 unsigned long active; 1283 unsigned long active;
1275 unsigned long writeback; 1284 unsigned long writeback;
1276 unsigned long mapcount_max; 1285 unsigned long mapcount_max;
1277 unsigned long dirty; 1286 unsigned long dirty;
1278 unsigned long swapcache; 1287 unsigned long swapcache;
1279 unsigned long node[MAX_NUMNODES]; 1288 unsigned long node[MAX_NUMNODES];
1280 }; 1289 };
1281 1290
1282 struct numa_maps_private { 1291 struct numa_maps_private {
1283 struct proc_maps_private proc_maps; 1292 struct proc_maps_private proc_maps;
1284 struct numa_maps md; 1293 struct numa_maps md;
1285 }; 1294 };
1286 1295
1287 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, 1296 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1288 unsigned long nr_pages) 1297 unsigned long nr_pages)
1289 { 1298 {
1290 int count = page_mapcount(page); 1299 int count = page_mapcount(page);
1291 1300
1292 md->pages += nr_pages; 1301 md->pages += nr_pages;
1293 if (pte_dirty || PageDirty(page)) 1302 if (pte_dirty || PageDirty(page))
1294 md->dirty += nr_pages; 1303 md->dirty += nr_pages;
1295 1304
1296 if (PageSwapCache(page)) 1305 if (PageSwapCache(page))
1297 md->swapcache += nr_pages; 1306 md->swapcache += nr_pages;
1298 1307
1299 if (PageActive(page) || PageUnevictable(page)) 1308 if (PageActive(page) || PageUnevictable(page))
1300 md->active += nr_pages; 1309 md->active += nr_pages;
1301 1310
1302 if (PageWriteback(page)) 1311 if (PageWriteback(page))
1303 md->writeback += nr_pages; 1312 md->writeback += nr_pages;
1304 1313
1305 if (PageAnon(page)) 1314 if (PageAnon(page))
1306 md->anon += nr_pages; 1315 md->anon += nr_pages;
1307 1316
1308 if (count > md->mapcount_max) 1317 if (count > md->mapcount_max)
1309 md->mapcount_max = count; 1318 md->mapcount_max = count;
1310 1319
1311 md->node[page_to_nid(page)] += nr_pages; 1320 md->node[page_to_nid(page)] += nr_pages;
1312 } 1321 }
1313 1322
1314 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, 1323 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1315 unsigned long addr) 1324 unsigned long addr)
1316 { 1325 {
1317 struct page *page; 1326 struct page *page;
1318 int nid; 1327 int nid;
1319 1328
1320 if (!pte_present(pte)) 1329 if (!pte_present(pte))
1321 return NULL; 1330 return NULL;
1322 1331
1323 page = vm_normal_page(vma, addr, pte); 1332 page = vm_normal_page(vma, addr, pte);
1324 if (!page) 1333 if (!page)
1325 return NULL; 1334 return NULL;
1326 1335
1327 if (PageReserved(page)) 1336 if (PageReserved(page))
1328 return NULL; 1337 return NULL;
1329 1338
1330 nid = page_to_nid(page); 1339 nid = page_to_nid(page);
1331 if (!node_isset(nid, node_states[N_MEMORY])) 1340 if (!node_isset(nid, node_states[N_MEMORY]))
1332 return NULL; 1341 return NULL;
1333 1342
1334 return page; 1343 return page;
1335 } 1344 }
1336 1345
1337 static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1346 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1338 unsigned long end, struct mm_walk *walk) 1347 unsigned long end, struct mm_walk *walk)
1339 { 1348 {
1340 struct numa_maps *md; 1349 struct numa_maps *md;
1341 spinlock_t *ptl; 1350 spinlock_t *ptl;
1342 pte_t *orig_pte; 1351 pte_t *orig_pte;
1343 pte_t *pte; 1352 pte_t *pte;
1344 1353
1345 md = walk->private; 1354 md = walk->private;
1346 1355
1347 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { 1356 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1348 pte_t huge_pte = *(pte_t *)pmd; 1357 pte_t huge_pte = *(pte_t *)pmd;
1349 struct page *page; 1358 struct page *page;
1350 1359
1351 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1360 page = can_gather_numa_stats(huge_pte, md->vma, addr);
1352 if (page) 1361 if (page)
1353 gather_stats(page, md, pte_dirty(huge_pte), 1362 gather_stats(page, md, pte_dirty(huge_pte),
1354 HPAGE_PMD_SIZE/PAGE_SIZE); 1363 HPAGE_PMD_SIZE/PAGE_SIZE);
1355 spin_unlock(ptl); 1364 spin_unlock(ptl);
1356 return 0; 1365 return 0;
1357 } 1366 }
1358 1367
1359 if (pmd_trans_unstable(pmd)) 1368 if (pmd_trans_unstable(pmd))
1360 return 0; 1369 return 0;
1361 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1370 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1362 do { 1371 do {
1363 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1372 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
1364 if (!page) 1373 if (!page)
1365 continue; 1374 continue;
1366 gather_stats(page, md, pte_dirty(*pte), 1); 1375 gather_stats(page, md, pte_dirty(*pte), 1);
1367 1376
1368 } while (pte++, addr += PAGE_SIZE, addr != end); 1377 } while (pte++, addr += PAGE_SIZE, addr != end);
1369 pte_unmap_unlock(orig_pte, ptl); 1378 pte_unmap_unlock(orig_pte, ptl);
1370 return 0; 1379 return 0;
1371 } 1380 }
1372 #ifdef CONFIG_HUGETLB_PAGE 1381 #ifdef CONFIG_HUGETLB_PAGE
1373 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1382 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1374 unsigned long addr, unsigned long end, struct mm_walk *walk) 1383 unsigned long addr, unsigned long end, struct mm_walk *walk)
1375 { 1384 {
1376 struct numa_maps *md; 1385 struct numa_maps *md;
1377 struct page *page; 1386 struct page *page;
1378 1387
1379 if (!pte_present(*pte)) 1388 if (!pte_present(*pte))
1380 return 0; 1389 return 0;
1381 1390
1382 page = pte_page(*pte); 1391 page = pte_page(*pte);
1383 if (!page) 1392 if (!page)
1384 return 0; 1393 return 0;
1385 1394
1386 md = walk->private; 1395 md = walk->private;
1387 gather_stats(page, md, pte_dirty(*pte), 1); 1396 gather_stats(page, md, pte_dirty(*pte), 1);
1388 return 0; 1397 return 0;
1389 } 1398 }
1390 1399
1391 #else 1400 #else
1392 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1401 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1393 unsigned long addr, unsigned long end, struct mm_walk *walk) 1402 unsigned long addr, unsigned long end, struct mm_walk *walk)
1394 { 1403 {
1395 return 0; 1404 return 0;
1396 } 1405 }
1397 #endif 1406 #endif
1398 1407
1399 /* 1408 /*
1400 * Display pages allocated per node and memory policy via /proc. 1409 * Display pages allocated per node and memory policy via /proc.
1401 */ 1410 */
1402 static int show_numa_map(struct seq_file *m, void *v, int is_pid) 1411 static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1403 { 1412 {
1404 struct numa_maps_private *numa_priv = m->private; 1413 struct numa_maps_private *numa_priv = m->private;
1405 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 1414 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1406 struct vm_area_struct *vma = v; 1415 struct vm_area_struct *vma = v;
1407 struct numa_maps *md = &numa_priv->md; 1416 struct numa_maps *md = &numa_priv->md;
1408 struct file *file = vma->vm_file; 1417 struct file *file = vma->vm_file;
1409 struct task_struct *task = proc_priv->task; 1418 struct task_struct *task = proc_priv->task;
1410 struct mm_struct *mm = vma->vm_mm; 1419 struct mm_struct *mm = vma->vm_mm;
1411 struct mm_walk walk = {}; 1420 struct mm_walk walk = {};
1412 struct mempolicy *pol; 1421 struct mempolicy *pol;
1413 char buffer[64]; 1422 char buffer[64];
1414 int nid; 1423 int nid;
1415 1424
1416 if (!mm) 1425 if (!mm)
1417 return 0; 1426 return 0;
1418 1427
1419 /* Ensure we start with an empty set of numa_maps statistics. */ 1428 /* Ensure we start with an empty set of numa_maps statistics. */
1420 memset(md, 0, sizeof(*md)); 1429 memset(md, 0, sizeof(*md));
1421 1430
1422 md->vma = vma; 1431 md->vma = vma;
1423 1432
1424 walk.hugetlb_entry = gather_hugetbl_stats; 1433 walk.hugetlb_entry = gather_hugetbl_stats;
1425 walk.pmd_entry = gather_pte_stats; 1434 walk.pmd_entry = gather_pte_stats;
1426 walk.private = md; 1435 walk.private = md;
1427 walk.mm = mm; 1436 walk.mm = mm;
1428 1437
1429 pol = get_vma_policy(task, vma, vma->vm_start); 1438 pol = get_vma_policy(task, vma, vma->vm_start);
1430 mpol_to_str(buffer, sizeof(buffer), pol); 1439 mpol_to_str(buffer, sizeof(buffer), pol);
1431 mpol_cond_put(pol); 1440 mpol_cond_put(pol);
1432 1441
1433 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1442 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1434 1443
1435 if (file) { 1444 if (file) {
1436 seq_puts(m, " file="); 1445 seq_puts(m, " file=");
1437 seq_path(m, &file->f_path, "\n\t= "); 1446 seq_path(m, &file->f_path, "\n\t= ");
1438 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1447 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1439 seq_puts(m, " heap"); 1448 seq_puts(m, " heap");
1440 } else { 1449 } else {
1441 pid_t tid = vm_is_stack(task, vma, is_pid); 1450 pid_t tid = vm_is_stack(task, vma, is_pid);
1442 if (tid != 0) { 1451 if (tid != 0) {
1443 /* 1452 /*
1444 * Thread stack in /proc/PID/task/TID/maps or 1453 * Thread stack in /proc/PID/task/TID/maps or
1445 * the main process stack. 1454 * the main process stack.
1446 */ 1455 */
1447 if (!is_pid || (vma->vm_start <= mm->start_stack && 1456 if (!is_pid || (vma->vm_start <= mm->start_stack &&
1448 vma->vm_end >= mm->start_stack)) 1457 vma->vm_end >= mm->start_stack))
1449 seq_puts(m, " stack"); 1458 seq_puts(m, " stack");
1450 else 1459 else
1451 seq_printf(m, " stack:%d", tid); 1460 seq_printf(m, " stack:%d", tid);
1452 } 1461 }
1453 } 1462 }
1454 1463
1455 if (is_vm_hugetlb_page(vma)) 1464 if (is_vm_hugetlb_page(vma))
1456 seq_puts(m, " huge"); 1465 seq_puts(m, " huge");
1457 1466
1458 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1467 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1459 1468
1460 if (!md->pages) 1469 if (!md->pages)
1461 goto out; 1470 goto out;
1462 1471
1463 if (md->anon) 1472 if (md->anon)
1464 seq_printf(m, " anon=%lu", md->anon); 1473 seq_printf(m, " anon=%lu", md->anon);
1465 1474
1466 if (md->dirty) 1475 if (md->dirty)
1467 seq_printf(m, " dirty=%lu", md->dirty); 1476 seq_printf(m, " dirty=%lu", md->dirty);
1468 1477
1469 if (md->pages != md->anon && md->pages != md->dirty) 1478 if (md->pages != md->anon && md->pages != md->dirty)
1470 seq_printf(m, " mapped=%lu", md->pages); 1479 seq_printf(m, " mapped=%lu", md->pages);
1471 1480
1472 if (md->mapcount_max > 1) 1481 if (md->mapcount_max > 1)
1473 seq_printf(m, " mapmax=%lu", md->mapcount_max); 1482 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1474 1483
1475 if (md->swapcache) 1484 if (md->swapcache)
1476 seq_printf(m, " swapcache=%lu", md->swapcache); 1485 seq_printf(m, " swapcache=%lu", md->swapcache);
1477 1486
1478 if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 1487 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1479 seq_printf(m, " active=%lu", md->active); 1488 seq_printf(m, " active=%lu", md->active);
1480 1489
1481 if (md->writeback) 1490 if (md->writeback)
1482 seq_printf(m, " writeback=%lu", md->writeback); 1491 seq_printf(m, " writeback=%lu", md->writeback);
1483 1492
1484 for_each_node_state(nid, N_MEMORY) 1493 for_each_node_state(nid, N_MEMORY)
1485 if (md->node[nid]) 1494 if (md->node[nid])
1486 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1495 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1487 out: 1496 out:
1488 seq_putc(m, '\n'); 1497 seq_putc(m, '\n');
1489 1498
1490 if (m->count < m->size) 1499 if (m->count < m->size)
1491 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; 1500 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1492 return 0; 1501 return 0;
1493 } 1502 }
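Taken together, the seq_printf() calls above emit one line per VMA; on a single-node machine a file-backed mapping would come out roughly like the following (illustrative values only):

00400000 default file=/bin/cat mapped=7 mapmax=2 N0=7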
1494 1503
1495 static int show_pid_numa_map(struct seq_file *m, void *v) 1504 static int show_pid_numa_map(struct seq_file *m, void *v)
1496 { 1505 {
1497 return show_numa_map(m, v, 1); 1506 return show_numa_map(m, v, 1);
1498 } 1507 }
1499 1508
1500 static int show_tid_numa_map(struct seq_file *m, void *v) 1509 static int show_tid_numa_map(struct seq_file *m, void *v)
1501 { 1510 {
1502 return show_numa_map(m, v, 0); 1511 return show_numa_map(m, v, 0);
1503 } 1512 }
1504 1513
1505 static const struct seq_operations proc_pid_numa_maps_op = { 1514 static const struct seq_operations proc_pid_numa_maps_op = {
1506 .start = m_start, 1515 .start = m_start,
1507 .next = m_next, 1516 .next = m_next,
1508 .stop = m_stop, 1517 .stop = m_stop,
1509 .show = show_pid_numa_map, 1518 .show = show_pid_numa_map,
1510 }; 1519 };
1511 1520
1512 static const struct seq_operations proc_tid_numa_maps_op = { 1521 static const struct seq_operations proc_tid_numa_maps_op = {
1513 .start = m_start, 1522 .start = m_start,
1514 .next = m_next, 1523 .next = m_next,
1515 .stop = m_stop, 1524 .stop = m_stop,
1516 .show = show_tid_numa_map, 1525 .show = show_tid_numa_map,
1517 }; 1526 };
1518 1527
1519 static int numa_maps_open(struct inode *inode, struct file *file, 1528 static int numa_maps_open(struct inode *inode, struct file *file,
1520 const struct seq_operations *ops) 1529 const struct seq_operations *ops)
1521 { 1530 {
1522 struct numa_maps_private *priv; 1531 struct numa_maps_private *priv;
1523 int ret = -ENOMEM; 1532 int ret = -ENOMEM;
1524 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1533 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1525 if (priv) { 1534 if (priv) {
1526 priv->proc_maps.pid = proc_pid(inode); 1535 priv->proc_maps.pid = proc_pid(inode);
1527 ret = seq_open(file, ops); 1536 ret = seq_open(file, ops);
1528 if (!ret) { 1537 if (!ret) {
1529 struct seq_file *m = file->private_data; 1538 struct seq_file *m = file->private_data;
1530 m->private = priv; 1539 m->private = priv;
1531 } else { 1540 } else {
1532 kfree(priv); 1541 kfree(priv);
1533 } 1542 }
1534 } 1543 }
1535 return ret; 1544 return ret;
1536 } 1545 }
1537 1546
1538 static int pid_numa_maps_open(struct inode *inode, struct file *file) 1547 static int pid_numa_maps_open(struct inode *inode, struct file *file)
1539 { 1548 {
1540 return numa_maps_open(inode, file, &proc_pid_numa_maps_op); 1549 return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1541 } 1550 }
1542 1551
1543 static int tid_numa_maps_open(struct inode *inode, struct file *file) 1552 static int tid_numa_maps_open(struct inode *inode, struct file *file)
1544 { 1553 {
1545 return numa_maps_open(inode, file, &proc_tid_numa_maps_op); 1554 return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1546 } 1555 }
1547 1556
1548 const struct file_operations proc_pid_numa_maps_operations = { 1557 const struct file_operations proc_pid_numa_maps_operations = {
1549 .open = pid_numa_maps_open, 1558 .open = pid_numa_maps_open,
1550 .read = seq_read, 1559 .read = seq_read,
1551 .llseek = seq_lseek, 1560 .llseek = seq_lseek,
1552 .release = seq_release_private, 1561 .release = seq_release_private,
1553 }; 1562 };
1554 1563
1555 const struct file_operations proc_tid_numa_maps_operations = { 1564 const struct file_operations proc_tid_numa_maps_operations = {
1556 .open = tid_numa_maps_open, 1565 .open = tid_numa_maps_open,
1557 .read = seq_read, 1566 .read = seq_read,
1558 .llseek = seq_lseek, 1567 .llseek = seq_lseek,
1559 .release = seq_release_private, 1568 .release = seq_release_private,
1560 }; 1569 };
1561 #endif /* CONFIG_NUMA */ 1570 #endif /* CONFIG_NUMA */
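The numa_maps plumbing above is ordinary seq_file wiring: the pid and tid variants share show_numa_map() and differ only in the is_pid flag they pass to it. Below is a hedged userspace sketch of how the resulting proc file is consumed; the exact fields on each line depend on the rest of show_numa_map(), which is not shown in this hunk.

#include <stdio.h>

int main(void)
{
	char line[1024];
	FILE *f = fopen("/proc/self/numa_maps", "r");	/* one line per VMA */

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* lines end with the " N<node>=<pages>" counts */
	fclose(f);
	return 0;
}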
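Reading the file as above is enough to see which nodes back each mapping; parsing the per-node counts is left to the caller.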
1 /* 1 /*
2 * Basic general purpose allocator for managing special purpose 2 * Basic general purpose allocator for managing special purpose
3 * memory, for example, memory that is not managed by the regular 3 * memory, for example, memory that is not managed by the regular
4 * kmalloc/kfree interface. Uses for this include on-device special 4 * kmalloc/kfree interface. Uses for this include on-device special
5 * memory, uncached memory etc. 5 * memory, uncached memory etc.
6 * 6 *
7 * It is safe to use the allocator in NMI handlers and other special 7 * It is safe to use the allocator in NMI handlers and other special
8 * unblockable contexts that could otherwise deadlock on locks. This 8 * unblockable contexts that could otherwise deadlock on locks. This
9 * is implemented by using atomic operations and retries on any 9 * is implemented by using atomic operations and retries on any
10 * conflicts. The disadvantage is that there may be livelocks in 10 * conflicts. The disadvantage is that there may be livelocks in
11 * extreme cases. For better scalability, one allocator can be used 11 * extreme cases. For better scalability, one allocator can be used
12 * for each CPU. 12 * for each CPU.
13 * 13 *
14 * The lockless operation only works if there is enough memory 14 * The lockless operation only works if there is enough memory
15 * available. If new memory is added to the pool a lock has to be 15 * available. If new memory is added to the pool a lock has to be
16 * still taken. So any user relying on locklessness has to ensure 16 * still taken. So any user relying on locklessness has to ensure
17 * that sufficient memory is preallocated. 17 * that sufficient memory is preallocated.
18 * 18 *
19 * The basic atomic operation of this allocator is cmpxchg on long. 19 * The basic atomic operation of this allocator is cmpxchg on long.
20 * On architectures that don't have NMI-safe cmpxchg implementation, 20 * On architectures that don't have NMI-safe cmpxchg implementation,
21 * the allocator can NOT be used in an NMI handler. So code that uses the 21 * the allocator can NOT be used in an NMI handler. So code that uses the
22 * allocator in an NMI handler should depend on 22 * allocator in an NMI handler should depend on
23 * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. 23 * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
24 * 24 *
25 * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org> 25 * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
26 * 26 *
27 * This source code is licensed under the GNU General Public License, 27 * This source code is licensed under the GNU General Public License,
28 * Version 2. See the file COPYING for more details. 28 * Version 2. See the file COPYING for more details.
29 */ 29 */
30 30
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/export.h> 32 #include <linux/export.h>
33 #include <linux/bitmap.h> 33 #include <linux/bitmap.h>
34 #include <linux/rculist.h> 34 #include <linux/rculist.h>
35 #include <linux/interrupt.h> 35 #include <linux/interrupt.h>
36 #include <linux/genalloc.h> 36 #include <linux/genalloc.h>
37 #include <linux/of_address.h> 37 #include <linux/of_address.h>
38 #include <linux/of_device.h> 38 #include <linux/of_device.h>
39 39
40 static inline size_t chunk_size(const struct gen_pool_chunk *chunk) 40 static inline size_t chunk_size(const struct gen_pool_chunk *chunk)
41 { 41 {
42 return chunk->end_addr - chunk->start_addr + 1; 42 return chunk->end_addr - chunk->start_addr + 1;
43 } 43 }
44 44
45 static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) 45 static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
46 { 46 {
47 unsigned long val, nval; 47 unsigned long val, nval;
48 48
49 nval = *addr; 49 nval = *addr;
50 do { 50 do {
51 val = nval; 51 val = nval;
52 if (val & mask_to_set) 52 if (val & mask_to_set)
53 return -EBUSY; 53 return -EBUSY;
54 cpu_relax(); 54 cpu_relax();
55 } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); 55 } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val);
56 56
57 return 0; 57 return 0;
58 } 58 }
59 59
60 static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) 60 static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear)
61 { 61 {
62 unsigned long val, nval; 62 unsigned long val, nval;
63 63
64 nval = *addr; 64 nval = *addr;
65 do { 65 do {
66 val = nval; 66 val = nval;
67 if ((val & mask_to_clear) != mask_to_clear) 67 if ((val & mask_to_clear) != mask_to_clear)
68 return -EBUSY; 68 return -EBUSY;
69 cpu_relax(); 69 cpu_relax();
70 } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); 70 } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val);
71 71
72 return 0; 72 return 0;
73 } 73 }
74 74
75 /* 75 /*
76 * bitmap_set_ll - set the specified number of bits at the specified position 76 * bitmap_set_ll - set the specified number of bits at the specified position
77 * @map: pointer to a bitmap 77 * @map: pointer to a bitmap
78 * @start: a bit position in @map 78 * @start: a bit position in @map
79 * @nr: number of bits to set 79 * @nr: number of bits to set
80 * 80 *
81 * Set @nr bits starting from @start in @map lock-lessly. Several users 81 * Set @nr bits starting from @start in @map lock-lessly. Several users
82 * can set/clear the same bitmap simultaneously without lock. If two 82 * can set/clear the same bitmap simultaneously without lock. If two
83 * users set the same bit, one user will return the number of remaining 83 * users set the same bit, one user will return the number of remaining
84 * bits, otherwise return 0. 84 * bits, otherwise return 0.
85 */ 85 */
86 static int bitmap_set_ll(unsigned long *map, int start, int nr) 86 static int bitmap_set_ll(unsigned long *map, int start, int nr)
87 { 87 {
88 unsigned long *p = map + BIT_WORD(start); 88 unsigned long *p = map + BIT_WORD(start);
89 const int size = start + nr; 89 const int size = start + nr;
90 int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); 90 int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
91 unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); 91 unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
92 92
93 while (nr - bits_to_set >= 0) { 93 while (nr - bits_to_set >= 0) {
94 if (set_bits_ll(p, mask_to_set)) 94 if (set_bits_ll(p, mask_to_set))
95 return nr; 95 return nr;
96 nr -= bits_to_set; 96 nr -= bits_to_set;
97 bits_to_set = BITS_PER_LONG; 97 bits_to_set = BITS_PER_LONG;
98 mask_to_set = ~0UL; 98 mask_to_set = ~0UL;
99 p++; 99 p++;
100 } 100 }
101 if (nr) { 101 if (nr) {
102 mask_to_set &= BITMAP_LAST_WORD_MASK(size); 102 mask_to_set &= BITMAP_LAST_WORD_MASK(size);
103 if (set_bits_ll(p, mask_to_set)) 103 if (set_bits_ll(p, mask_to_set))
104 return nr; 104 return nr;
105 } 105 }
106 106
107 return 0; 107 return 0;
108 } 108 }
109 109
110 /* 110 /*
111 * bitmap_clear_ll - clear the specified number of bits at the specified position 111 * bitmap_clear_ll - clear the specified number of bits at the specified position
112 * @map: pointer to a bitmap 112 * @map: pointer to a bitmap
113 * @start: a bit position in @map 113 * @start: a bit position in @map
114 * @nr: number of bits to clear 114 * @nr: number of bits to clear
115 * 115 *
116 * Clear @nr bits starting from @start in @map lock-lessly. Several users 116 * Clear @nr bits starting from @start in @map lock-lessly. Several users
117 * can set/clear the same bitmap simultaneously without lock. If two 117 * can set/clear the same bitmap simultaneously without lock. If two
118 * users clear the same bit, one user will return the number of remaining 118 * users clear the same bit, one user will return the number of remaining
119 * bits, otherwise return 0. 119 * bits, otherwise return 0.
120 */ 120 */
121 static int bitmap_clear_ll(unsigned long *map, int start, int nr) 121 static int bitmap_clear_ll(unsigned long *map, int start, int nr)
122 { 122 {
123 unsigned long *p = map + BIT_WORD(start); 123 unsigned long *p = map + BIT_WORD(start);
124 const int size = start + nr; 124 const int size = start + nr;
125 int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); 125 int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
126 unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); 126 unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
127 127
128 while (nr - bits_to_clear >= 0) { 128 while (nr - bits_to_clear >= 0) {
129 if (clear_bits_ll(p, mask_to_clear)) 129 if (clear_bits_ll(p, mask_to_clear))
130 return nr; 130 return nr;
131 nr -= bits_to_clear; 131 nr -= bits_to_clear;
132 bits_to_clear = BITS_PER_LONG; 132 bits_to_clear = BITS_PER_LONG;
133 mask_to_clear = ~0UL; 133 mask_to_clear = ~0UL;
134 p++; 134 p++;
135 } 135 }
136 if (nr) { 136 if (nr) {
137 mask_to_clear &= BITMAP_LAST_WORD_MASK(size); 137 mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
138 if (clear_bits_ll(p, mask_to_clear)) 138 if (clear_bits_ll(p, mask_to_clear))
139 return nr; 139 return nr;
140 } 140 }
141 141
142 return 0; 142 return 0;
143 } 143 }
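Both lock-less helpers report a conflict by returning the number of bits that still remained to be processed; the caller is expected to roll back whatever was already changed and retry, which is exactly what gen_pool_alloc() does further down. A minimal sketch of that caller-side contract, written as if it lived next to these static helpers in lib/genalloc.c (the helper name is illustrative):

static int example_claim_bits(unsigned long *map, int start, int nr)
{
	int remain = bitmap_set_ll(map, start, nr);

	if (remain) {
		/* Lost a race: clear only the bits we did manage to set. */
		bitmap_clear_ll(map, start, nr - remain);
		return -EBUSY;	/* caller may pick another range and retry */
	}
	return 0;		/* all @nr bits now belong to this caller */
}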
144 144
145 /** 145 /**
146 * gen_pool_create - create a new special memory pool 146 * gen_pool_create - create a new special memory pool
147 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents 147 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
148 * @nid: node id of the node the pool structure should be allocated on, or -1 148 * @nid: node id of the node the pool structure should be allocated on, or -1
149 * 149 *
150 * Create a new special memory pool that can be used to manage special purpose 150 * Create a new special memory pool that can be used to manage special purpose
151 * memory not managed by the regular kmalloc/kfree interface. 151 * memory not managed by the regular kmalloc/kfree interface.
152 */ 152 */
153 struct gen_pool *gen_pool_create(int min_alloc_order, int nid) 153 struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
154 { 154 {
155 struct gen_pool *pool; 155 struct gen_pool *pool;
156 156
157 pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); 157 pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
158 if (pool != NULL) { 158 if (pool != NULL) {
159 spin_lock_init(&pool->lock); 159 spin_lock_init(&pool->lock);
160 INIT_LIST_HEAD(&pool->chunks); 160 INIT_LIST_HEAD(&pool->chunks);
161 pool->min_alloc_order = min_alloc_order; 161 pool->min_alloc_order = min_alloc_order;
162 pool->algo = gen_pool_first_fit; 162 pool->algo = gen_pool_first_fit;
163 pool->data = NULL; 163 pool->data = NULL;
164 } 164 }
165 return pool; 165 return pool;
166 } 166 }
167 EXPORT_SYMBOL(gen_pool_create); 167 EXPORT_SYMBOL(gen_pool_create);
168 168
169 /** 169 /**
170 * gen_pool_add_virt - add a new chunk of special memory to the pool 170 * gen_pool_add_virt - add a new chunk of special memory to the pool
171 * @pool: pool to add new memory chunk to 171 * @pool: pool to add new memory chunk to
172 * @virt: virtual starting address of memory chunk to add to pool 172 * @virt: virtual starting address of memory chunk to add to pool
173 * @phys: physical starting address of memory chunk to add to pool 173 * @phys: physical starting address of memory chunk to add to pool
174 * @size: size in bytes of the memory chunk to add to pool 174 * @size: size in bytes of the memory chunk to add to pool
175 * @nid: node id of the node the chunk structure and bitmap should be 175 * @nid: node id of the node the chunk structure and bitmap should be
176 * allocated on, or -1 176 * allocated on, or -1
177 * 177 *
178 * Add a new chunk of special memory to the specified pool. 178 * Add a new chunk of special memory to the specified pool.
179 * 179 *
180 * Returns 0 on success or a -ve errno on failure. 180 * Returns 0 on success or a -ve errno on failure.
181 */ 181 */
182 int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phys, 182 int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phys,
183 size_t size, int nid) 183 size_t size, int nid)
184 { 184 {
185 struct gen_pool_chunk *chunk; 185 struct gen_pool_chunk *chunk;
186 int nbits = size >> pool->min_alloc_order; 186 int nbits = size >> pool->min_alloc_order;
187 int nbytes = sizeof(struct gen_pool_chunk) + 187 int nbytes = sizeof(struct gen_pool_chunk) +
188 BITS_TO_LONGS(nbits) * sizeof(long); 188 BITS_TO_LONGS(nbits) * sizeof(long);
189 189
190 chunk = kzalloc_node(nbytes, GFP_KERNEL, nid); 190 chunk = kzalloc_node(nbytes, GFP_KERNEL, nid);
191 if (unlikely(chunk == NULL)) 191 if (unlikely(chunk == NULL))
192 return -ENOMEM; 192 return -ENOMEM;
193 193
194 chunk->phys_addr = phys; 194 chunk->phys_addr = phys;
195 chunk->start_addr = virt; 195 chunk->start_addr = virt;
196 chunk->end_addr = virt + size - 1; 196 chunk->end_addr = virt + size - 1;
197 atomic_set(&chunk->avail, size); 197 atomic_set(&chunk->avail, size);
198 198
199 spin_lock(&pool->lock); 199 spin_lock(&pool->lock);
200 list_add_rcu(&chunk->next_chunk, &pool->chunks); 200 list_add_rcu(&chunk->next_chunk, &pool->chunks);
201 spin_unlock(&pool->lock); 201 spin_unlock(&pool->lock);
202 202
203 return 0; 203 return 0;
204 } 204 }
205 EXPORT_SYMBOL(gen_pool_add_virt); 205 EXPORT_SYMBOL(gen_pool_add_virt);
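A hedged sketch of how a platform typically seeds a pool with a block of device memory using the two calls above; the SRAM base and size, the order of 5 (32-byte granules) and every identifier are illustrative assumptions, not taken from any real board support code.

#include <linux/genalloc.h>
#include <linux/init.h>
#include <linux/io.h>

#define EXAMPLE_SRAM_PHYS	0x40000000UL	/* illustrative physical base */
#define EXAMPLE_SRAM_SIZE	0x10000UL	/* illustrative 64 KiB chunk */

static struct gen_pool *example_sram_pool;

static int __init example_sram_pool_init(void)
{
	void __iomem *virt = ioremap(EXAMPLE_SRAM_PHYS, EXAMPLE_SRAM_SIZE);
	int ret;

	if (!virt)
		return -ENOMEM;

	/* One bitmap bit per 32-byte granule, no NUMA node preference. */
	example_sram_pool = gen_pool_create(5, -1);
	if (!example_sram_pool) {
		iounmap(virt);
		return -ENOMEM;
	}

	/* Record both the virtual and the physical base of the chunk. */
	ret = gen_pool_add_virt(example_sram_pool, (unsigned long)virt,
				EXAMPLE_SRAM_PHYS, EXAMPLE_SRAM_SIZE, -1);
	if (ret) {
		gen_pool_destroy(example_sram_pool);
		example_sram_pool = NULL;
		iounmap(virt);
	}
	return ret;
}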
206 206
207 /** 207 /**
208 * gen_pool_virt_to_phys - return the physical address of memory 208 * gen_pool_virt_to_phys - return the physical address of memory
209 * @pool: pool to allocate from 209 * @pool: pool to allocate from
210 * @addr: starting address of memory 210 * @addr: starting address of memory
211 * 211 *
212 * Returns the physical address on success, or -1 on error. 212 * Returns the physical address on success, or -1 on error.
213 */ 213 */
214 phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) 214 phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)
215 { 215 {
216 struct gen_pool_chunk *chunk; 216 struct gen_pool_chunk *chunk;
217 phys_addr_t paddr = -1; 217 phys_addr_t paddr = -1;
218 218
219 rcu_read_lock(); 219 rcu_read_lock();
220 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 220 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
221 if (addr >= chunk->start_addr && addr <= chunk->end_addr) { 221 if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
222 paddr = chunk->phys_addr + (addr - chunk->start_addr); 222 paddr = chunk->phys_addr + (addr - chunk->start_addr);
223 break; 223 break;
224 } 224 }
225 } 225 }
226 rcu_read_unlock(); 226 rcu_read_unlock();
227 227
228 return paddr; 228 return paddr;
229 } 229 }
230 EXPORT_SYMBOL(gen_pool_virt_to_phys); 230 EXPORT_SYMBOL(gen_pool_virt_to_phys);
231 231
232 /** 232 /**
233 * gen_pool_destroy - destroy a special memory pool 233 * gen_pool_destroy - destroy a special memory pool
234 * @pool: pool to destroy 234 * @pool: pool to destroy
235 * 235 *
236 * Destroy the specified special memory pool. Verifies that there are no 236 * Destroy the specified special memory pool. Verifies that there are no
237 * outstanding allocations. 237 * outstanding allocations.
238 */ 238 */
239 void gen_pool_destroy(struct gen_pool *pool) 239 void gen_pool_destroy(struct gen_pool *pool)
240 { 240 {
241 struct list_head *_chunk, *_next_chunk; 241 struct list_head *_chunk, *_next_chunk;
242 struct gen_pool_chunk *chunk; 242 struct gen_pool_chunk *chunk;
243 int order = pool->min_alloc_order; 243 int order = pool->min_alloc_order;
244 int bit, end_bit; 244 int bit, end_bit;
245 245
246 list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { 246 list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
247 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 247 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
248 list_del(&chunk->next_chunk); 248 list_del(&chunk->next_chunk);
249 249
250 end_bit = chunk_size(chunk) >> order; 250 end_bit = chunk_size(chunk) >> order;
251 bit = find_next_bit(chunk->bits, end_bit, 0); 251 bit = find_next_bit(chunk->bits, end_bit, 0);
252 BUG_ON(bit < end_bit); 252 BUG_ON(bit < end_bit);
253 253
254 kfree(chunk); 254 kfree(chunk);
255 } 255 }
256 kfree(pool); 256 kfree(pool);
257 return; 257 return;
258 } 258 }
259 EXPORT_SYMBOL(gen_pool_destroy); 259 EXPORT_SYMBOL(gen_pool_destroy);
260 260
261 /** 261 /**
262 * gen_pool_alloc - allocate special memory from the pool 262 * gen_pool_alloc - allocate special memory from the pool
263 * @pool: pool to allocate from 263 * @pool: pool to allocate from
264 * @size: number of bytes to allocate from the pool 264 * @size: number of bytes to allocate from the pool
265 * 265 *
266 * Allocate the requested number of bytes from the specified pool. 266 * Allocate the requested number of bytes from the specified pool.
267 * Uses the pool allocation function (with first-fit algorithm by default). 267 * Uses the pool allocation function (with first-fit algorithm by default).
268 * Can not be used in NMI handler on architectures without 268 * Can not be used in NMI handler on architectures without
269 * NMI-safe cmpxchg implementation. 269 * NMI-safe cmpxchg implementation.
270 */ 270 */
271 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) 271 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
272 { 272 {
273 struct gen_pool_chunk *chunk; 273 struct gen_pool_chunk *chunk;
274 unsigned long addr = 0; 274 unsigned long addr = 0;
275 int order = pool->min_alloc_order; 275 int order = pool->min_alloc_order;
276 int nbits, start_bit = 0, end_bit, remain; 276 int nbits, start_bit = 0, end_bit, remain;
277 277
278 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 278 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
279 BUG_ON(in_nmi()); 279 BUG_ON(in_nmi());
280 #endif 280 #endif
281 281
282 if (size == 0) 282 if (size == 0)
283 return 0; 283 return 0;
284 284
285 nbits = (size + (1UL << order) - 1) >> order; 285 nbits = (size + (1UL << order) - 1) >> order;
286 rcu_read_lock(); 286 rcu_read_lock();
287 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 287 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
288 if (size > atomic_read(&chunk->avail)) 288 if (size > atomic_read(&chunk->avail))
289 continue; 289 continue;
290 290
291 end_bit = chunk_size(chunk) >> order; 291 end_bit = chunk_size(chunk) >> order;
292 retry: 292 retry:
293 start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, 293 start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits,
294 pool->data); 294 pool->data);
295 if (start_bit >= end_bit) 295 if (start_bit >= end_bit)
296 continue; 296 continue;
297 remain = bitmap_set_ll(chunk->bits, start_bit, nbits); 297 remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
298 if (remain) { 298 if (remain) {
299 remain = bitmap_clear_ll(chunk->bits, start_bit, 299 remain = bitmap_clear_ll(chunk->bits, start_bit,
300 nbits - remain); 300 nbits - remain);
301 BUG_ON(remain); 301 BUG_ON(remain);
302 goto retry; 302 goto retry;
303 } 303 }
304 304
305 addr = chunk->start_addr + ((unsigned long)start_bit << order); 305 addr = chunk->start_addr + ((unsigned long)start_bit << order);
306 size = nbits << order; 306 size = nbits << order;
307 atomic_sub(size, &chunk->avail); 307 atomic_sub(size, &chunk->avail);
308 break; 308 break;
309 } 309 }
310 rcu_read_unlock(); 310 rcu_read_unlock();
311 return addr; 311 return addr;
312 } 312 }
313 EXPORT_SYMBOL(gen_pool_alloc); 313 EXPORT_SYMBOL(gen_pool_alloc);
314 314
315 /** 315 /**
316 * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage 316 * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage
317 * @pool: pool to allocate from 317 * @pool: pool to allocate from
318 * @size: number of bytes to allocate from the pool 318 * @size: number of bytes to allocate from the pool
319 * @dma: dma-view physical address return value. Use NULL if unneeded. 319 * @dma: dma-view physical address return value. Use NULL if unneeded.
320 * 320 *
321 * Allocate the requested number of bytes from the specified pool. 321 * Allocate the requested number of bytes from the specified pool.
322 * Uses the pool allocation function (with first-fit algorithm by default). 322 * Uses the pool allocation function (with first-fit algorithm by default).
323 * Can not be used in NMI handler on architectures without 323 * Can not be used in NMI handler on architectures without
324 * NMI-safe cmpxchg implementation. 324 * NMI-safe cmpxchg implementation.
325 */ 325 */
326 void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma) 326 void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
327 { 327 {
328 unsigned long vaddr; 328 unsigned long vaddr;
329 329
330 if (!pool) 330 if (!pool)
331 return NULL; 331 return NULL;
332 332
333 vaddr = gen_pool_alloc(pool, size); 333 vaddr = gen_pool_alloc(pool, size);
334 if (!vaddr) 334 if (!vaddr)
335 return NULL; 335 return NULL;
336 336
337 if (dma) 337 if (dma)
338 *dma = gen_pool_virt_to_phys(pool, vaddr); 338 *dma = gen_pool_virt_to_phys(pool, vaddr);
339 339
340 return (void *)vaddr; 340 return (void *)vaddr;
341 } 341 }
342 EXPORT_SYMBOL(gen_pool_dma_alloc); 342 EXPORT_SYMBOL(gen_pool_dma_alloc);
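A small hedged sketch of the intended use of gen_pool_dma_alloc(): pulling a hardware descriptor out of an already-populated pool and handing its bus address to the device. The descriptor layout and function name are assumptions for illustration only.

struct example_desc {
	u32 ctrl;
	u32 next;
};

static struct example_desc *example_get_desc(struct gen_pool *pool,
					     dma_addr_t *bus)
{
	/* The CPU pointer comes back directly; *bus receives the same
	 * allocation translated through gen_pool_virt_to_phys(). */
	return gen_pool_dma_alloc(pool, sizeof(struct example_desc), bus);
}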
343 343
344 /** 344 /**
345 * gen_pool_free - free allocated special memory back to the pool 345 * gen_pool_free - free allocated special memory back to the pool
346 * @pool: pool to free to 346 * @pool: pool to free to
347 * @addr: starting address of memory to free back to pool 347 * @addr: starting address of memory to free back to pool
348 * @size: size in bytes of memory to free 348 * @size: size in bytes of memory to free
349 * 349 *
350 * Free previously allocated special memory back to the specified 350 * Free previously allocated special memory back to the specified
351 * pool. Can not be used in NMI handler on architectures without 351 * pool. Can not be used in NMI handler on architectures without
352 * NMI-safe cmpxchg implementation. 352 * NMI-safe cmpxchg implementation.
353 */ 353 */
354 void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) 354 void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
355 { 355 {
356 struct gen_pool_chunk *chunk; 356 struct gen_pool_chunk *chunk;
357 int order = pool->min_alloc_order; 357 int order = pool->min_alloc_order;
358 int start_bit, nbits, remain; 358 int start_bit, nbits, remain;
359 359
360 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 360 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
361 BUG_ON(in_nmi()); 361 BUG_ON(in_nmi());
362 #endif 362 #endif
363 363
364 nbits = (size + (1UL << order) - 1) >> order; 364 nbits = (size + (1UL << order) - 1) >> order;
365 rcu_read_lock(); 365 rcu_read_lock();
366 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 366 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
367 if (addr >= chunk->start_addr && addr <= chunk->end_addr) { 367 if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
368 BUG_ON(addr + size - 1 > chunk->end_addr); 368 BUG_ON(addr + size - 1 > chunk->end_addr);
369 start_bit = (addr - chunk->start_addr) >> order; 369 start_bit = (addr - chunk->start_addr) >> order;
370 remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); 370 remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
371 BUG_ON(remain); 371 BUG_ON(remain);
372 size = nbits << order; 372 size = nbits << order;
373 atomic_add(size, &chunk->avail); 373 atomic_add(size, &chunk->avail);
374 rcu_read_unlock(); 374 rcu_read_unlock();
375 return; 375 return;
376 } 376 }
377 } 377 }
378 rcu_read_unlock(); 378 rcu_read_unlock();
379 BUG(); 379 BUG();
380 } 380 }
381 EXPORT_SYMBOL(gen_pool_free); 381 EXPORT_SYMBOL(gen_pool_free);
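A hedged sketch of the basic allocate/free round trip against a pool set up as in the earlier example; the 128-byte request is rounded up to the pool's min_alloc_order granule on both the allocation and the free path, so passing the same size back to gen_pool_free() is all that is required. The wrapper name is illustrative.

static int example_pool_roundtrip(struct gen_pool *pool)
{
	unsigned long region;

	region = gen_pool_alloc(pool, 128);	/* returns 0 when the pool is exhausted */
	if (!region)
		return -ENOMEM;

	/* ... use the 128 bytes at 'region' ... */

	gen_pool_free(pool, region, 128);	/* size must match the allocation */
	return 0;
}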
382 382
383 /** 383 /**
384 * gen_pool_for_each_chunk - call func for every chunk of generic memory pool 384 * gen_pool_for_each_chunk - call func for every chunk of generic memory pool
385 * @pool: the generic memory pool 385 * @pool: the generic memory pool
386 * @func: func to call 386 * @func: func to call
387 * @data: additional data used by @func 387 * @data: additional data used by @func
388 * 388 *
389 * Call @func for every chunk of generic memory pool. The @func is 389 * Call @func for every chunk of generic memory pool. The @func is
390 * called with rcu_read_lock held. 390 * called with rcu_read_lock held.
391 */ 391 */
392 void gen_pool_for_each_chunk(struct gen_pool *pool, 392 void gen_pool_for_each_chunk(struct gen_pool *pool,
393 void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data), 393 void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data),
394 void *data) 394 void *data)
395 { 395 {
396 struct gen_pool_chunk *chunk; 396 struct gen_pool_chunk *chunk;
397 397
398 rcu_read_lock(); 398 rcu_read_lock();
399 list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) 399 list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk)
400 func(pool, chunk, data); 400 func(pool, chunk, data);
401 rcu_read_unlock(); 401 rcu_read_unlock();
402 } 402 }
403 EXPORT_SYMBOL(gen_pool_for_each_chunk); 403 EXPORT_SYMBOL(gen_pool_for_each_chunk);
404 404
405 /** 405 /**
406 * gen_pool_avail - get available free space of the pool 406 * gen_pool_avail - get available free space of the pool
407 * @pool: pool to get available free space 407 * @pool: pool to get available free space
408 * 408 *
409 * Return available free space of the specified pool. 409 * Return available free space of the specified pool.
410 */ 410 */
411 size_t gen_pool_avail(struct gen_pool *pool) 411 size_t gen_pool_avail(struct gen_pool *pool)
412 { 412 {
413 struct gen_pool_chunk *chunk; 413 struct gen_pool_chunk *chunk;
414 size_t avail = 0; 414 size_t avail = 0;
415 415
416 rcu_read_lock(); 416 rcu_read_lock();
417 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) 417 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
418 avail += atomic_read(&chunk->avail); 418 avail += atomic_read(&chunk->avail);
419 rcu_read_unlock(); 419 rcu_read_unlock();
420 return avail; 420 return avail;
421 } 421 }
422 EXPORT_SYMBOL_GPL(gen_pool_avail); 422 EXPORT_SYMBOL_GPL(gen_pool_avail);
423 423
424 /** 424 /**
425 * gen_pool_size - get size in bytes of memory managed by the pool 425 * gen_pool_size - get size in bytes of memory managed by the pool
426 * @pool: pool to get size 426 * @pool: pool to get size
427 * 427 *
428 * Return size in bytes of memory managed by the pool. 428 * Return size in bytes of memory managed by the pool.
429 */ 429 */
430 size_t gen_pool_size(struct gen_pool *pool) 430 size_t gen_pool_size(struct gen_pool *pool)
431 { 431 {
432 struct gen_pool_chunk *chunk; 432 struct gen_pool_chunk *chunk;
433 size_t size = 0; 433 size_t size = 0;
434 434
435 rcu_read_lock(); 435 rcu_read_lock();
436 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) 436 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
437 size += chunk_size(chunk); 437 size += chunk_size(chunk);
438 rcu_read_unlock(); 438 rcu_read_unlock();
439 return size; 439 return size;
440 } 440 }
441 EXPORT_SYMBOL_GPL(gen_pool_size); 441 EXPORT_SYMBOL_GPL(gen_pool_size);
442 442
443 /** 443 /**
444 * gen_pool_set_algo - set the allocation algorithm 444 * gen_pool_set_algo - set the allocation algorithm
445 * @pool: pool to change allocation algorithm 445 * @pool: pool to change allocation algorithm
446 * @algo: custom algorithm function 446 * @algo: custom algorithm function
447 * @data: additional data used by @algo 447 * @data: additional data used by @algo
448 * 448 *
449 * Call @algo for each memory allocation in the pool. 449 * Call @algo for each memory allocation in the pool.
450 * If @algo is NULL use gen_pool_first_fit as default 450 * If @algo is NULL use gen_pool_first_fit as default
451 * memory allocation function. 451 * memory allocation function.
452 */ 452 */
453 void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, void *data) 453 void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, void *data)
454 { 454 {
455 rcu_read_lock(); 455 rcu_read_lock();
456 456
457 pool->algo = algo; 457 pool->algo = algo;
458 if (!pool->algo) 458 if (!pool->algo)
459 pool->algo = gen_pool_first_fit; 459 pool->algo = gen_pool_first_fit;
460 460
461 pool->data = data; 461 pool->data = data;
462 462
463 rcu_read_unlock(); 463 rcu_read_unlock();
464 } 464 }
465 EXPORT_SYMBOL(gen_pool_set_algo); 465 EXPORT_SYMBOL(gen_pool_set_algo);
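For completeness, a one-line hedged sketch of switching an existing pool from the default first-fit search to the best-fit search defined just below; the wrapper name is illustrative.

static void example_use_best_fit(struct gen_pool *pool)
{
	/* NULL data: gen_pool_best_fit() ignores its @data argument. */
	gen_pool_set_algo(pool, gen_pool_best_fit, NULL);
}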
466 466
467 /** 467 /**
468 * gen_pool_first_fit - find the first available region 468 * gen_pool_first_fit - find the first available region
469 * of memory matching the size requirement (no alignment constraint) 469 * of memory matching the size requirement (no alignment constraint)
470 * @map: The address to base the search on 470 * @map: The address to base the search on
471 * @size: The bitmap size in bits 471 * @size: The bitmap size in bits
472 * @start: The bitnumber to start searching at 472 * @start: The bitnumber to start searching at
473 * @nr: The number of zeroed bits we're looking for 473 * @nr: The number of zeroed bits we're looking for
474 * @data: additional data - unused 474 * @data: additional data - unused
475 */ 475 */
476 unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, 476 unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
477 unsigned long start, unsigned int nr, void *data) 477 unsigned long start, unsigned int nr, void *data)
478 { 478 {
479 return bitmap_find_next_zero_area(map, size, start, nr, 0); 479 return bitmap_find_next_zero_area(map, size, start, nr, 0);
480 } 480 }
481 EXPORT_SYMBOL(gen_pool_first_fit); 481 EXPORT_SYMBOL(gen_pool_first_fit);
482 482
483 /** 483 /**
484 * gen_pool_best_fit - find the best fitting region of memory 484 * gen_pool_best_fit - find the best fitting region of memory
485 * matching the size requirement (no alignment constraint) 485 * matching the size requirement (no alignment constraint)
486 * @map: The address to base the search on 486 * @map: The address to base the search on
487 * @size: The bitmap size in bits 487 * @size: The bitmap size in bits
488 * @start: The bitnumber to start searching at 488 * @start: The bitnumber to start searching at
489 * @nr: The number of zeroed bits we're looking for 489 * @nr: The number of zeroed bits we're looking for
490 * @data: additional data - unused 490 * @data: additional data - unused
491 * 491 *
492 * Iterate over the bitmap to find the smallest free region 492 * Iterate over the bitmap to find the smallest free region
493 * in which the memory can be allocated. 493 * in which the memory can be allocated.
494 */ 494 */
495 unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, 495 unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
496 unsigned long start, unsigned int nr, void *data) 496 unsigned long start, unsigned int nr, void *data)
497 { 497 {
498 unsigned long start_bit = size; 498 unsigned long start_bit = size;
499 unsigned long len = size + 1; 499 unsigned long len = size + 1;
500 unsigned long index; 500 unsigned long index;
501 501
502 index = bitmap_find_next_zero_area(map, size, start, nr, 0); 502 index = bitmap_find_next_zero_area(map, size, start, nr, 0);
503 503
504 while (index < size) { 504 while (index < size) {
505 int next_bit = find_next_bit(map, size, index + nr); 505 int next_bit = find_next_bit(map, size, index + nr);
506 if ((next_bit - index) < len) { 506 if ((next_bit - index) < len) {
507 len = next_bit - index; 507 len = next_bit - index;
508 start_bit = index; 508 start_bit = index;
509 if (len == nr) 509 if (len == nr)
510 return start_bit; 510 return start_bit;
511 } 511 }
512 index = bitmap_find_next_zero_area(map, size, 512 index = bitmap_find_next_zero_area(map, size,
513 next_bit + 1, nr, 0); 513 next_bit + 1, nr, 0);
514 } 514 }
515 515
516 return start_bit; 516 return start_bit;
517 } 517 }
518 EXPORT_SYMBOL(gen_pool_best_fit); 518 EXPORT_SYMBOL(gen_pool_best_fit);
519 519
520 static void devm_gen_pool_release(struct device *dev, void *res) 520 static void devm_gen_pool_release(struct device *dev, void *res)
521 { 521 {
522 gen_pool_destroy(*(struct gen_pool **)res); 522 gen_pool_destroy(*(struct gen_pool **)res);
523 } 523 }
524 524
525 /** 525 /**
526 * devm_gen_pool_create - managed gen_pool_create 526 * devm_gen_pool_create - managed gen_pool_create
527 * @dev: device that provides the gen_pool 527 * @dev: device that provides the gen_pool
528 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents 528 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
529 * @nid: node id of the node the pool structure should be allocated on, or -1 529 * @nid: node id of the node the pool structure should be allocated on, or -1
530 * 530 *
531 * Create a new special memory pool that can be used to manage special purpose 531 * Create a new special memory pool that can be used to manage special purpose
532 * memory not managed by the regular kmalloc/kfree interface. The pool will be 532 * memory not managed by the regular kmalloc/kfree interface. The pool will be
533 * automatically destroyed by the device management code. 533 * automatically destroyed by the device management code.
534 */ 534 */
535 struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, 535 struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
536 int nid) 536 int nid)
537 { 537 {
538 struct gen_pool **ptr, *pool; 538 struct gen_pool **ptr, *pool;
539 539
540 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); 540 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
541 541
542 pool = gen_pool_create(min_alloc_order, nid); 542 pool = gen_pool_create(min_alloc_order, nid);
543 if (pool) { 543 if (pool) {
544 *ptr = pool; 544 *ptr = pool;
545 devres_add(dev, ptr); 545 devres_add(dev, ptr);
546 } else { 546 } else {
547 devres_free(ptr); 547 devres_free(ptr);
548 } 548 }
549 549
550 return pool; 550 return pool;
551 } 551 }
552 552
553 /** 553 /**
554 * dev_get_gen_pool - Obtain the gen_pool (if any) for a device 554 * dev_get_gen_pool - Obtain the gen_pool (if any) for a device
555 * @dev: device to retrieve the gen_pool from 555 * @dev: device to retrieve the gen_pool from
556 * 556 *
557 * Returns the gen_pool for the device if one is present, or NULL. 557 * Returns the gen_pool for the device if one is present, or NULL.
558 */ 558 */
559 struct gen_pool *dev_get_gen_pool(struct device *dev) 559 struct gen_pool *dev_get_gen_pool(struct device *dev)
560 { 560 {
561 struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, 561 struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
562 NULL); 562 NULL);
563 563
564 if (!p) 564 if (!p)
565 return NULL; 565 return NULL;
566 return *p; 566 return *p;
567 } 567 }
568 EXPORT_SYMBOL_GPL(dev_get_gen_pool); 568 EXPORT_SYMBOL_GPL(dev_get_gen_pool);
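A hedged sketch of the device-managed variant in a driver probe path: the pool is registered as a devres resource, so devm_gen_pool_release() destroys it automatically when the device is unbound, and other code bound to the same struct device can recover it with dev_get_gen_pool(). The probe function and the order value are assumptions for illustration.

#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	struct gen_pool *pool;

	/* 32-byte granules, no NUMA preference; freed automatically by devres. */
	pool = devm_gen_pool_create(&pdev->dev, 5, -1);
	if (!pool)
		return -ENOMEM;

	/* Anything else holding &pdev->dev can find the same pool later. */
	WARN_ON(dev_get_gen_pool(&pdev->dev) != pool);
	return 0;
}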
569 569
570 #ifdef CONFIG_OF 570 #ifdef CONFIG_OF
571 /** 571 /**
572 * of_get_named_gen_pool - find a pool by phandle property 572 * of_get_named_gen_pool - find a pool by phandle property
573 * @np: device node 573 * @np: device node
574 * @propname: property name containing phandle(s) 574 * @propname: property name containing phandle(s)
575 * @index: index into the phandle array 575 * @index: index into the phandle array
576 * 576 *
577 * Returns the pool that contains the chunk starting at the physical 577 * Returns the pool that contains the chunk starting at the physical
578 * address of the device tree node pointed at by the phandle property, 578 * address of the device tree node pointed at by the phandle property,
579 * or NULL if not found. 579 * or NULL if not found.
580 */ 580 */
581 struct gen_pool *of_get_named_gen_pool(struct device_node *np, 581 struct gen_pool *of_get_named_gen_pool(struct device_node *np,
582 const char *propname, int index) 582 const char *propname, int index)
583 { 583 {
584 struct platform_device *pdev; 584 struct platform_device *pdev;
585 struct device_node *np_pool; 585 struct device_node *np_pool;
586 586
587 np_pool = of_parse_phandle(np, propname, index); 587 np_pool = of_parse_phandle(np, propname, index);
588 if (!np_pool) 588 if (!np_pool)
589 return NULL; 589 return NULL;
590 pdev = of_find_device_by_node(np_pool); 590 pdev = of_find_device_by_node(np_pool);
591 of_node_put(np_pool);
591 if (!pdev) 592 if (!pdev)
592 return NULL; 593 return NULL;
593 return dev_get_gen_pool(&pdev->dev); 594 return dev_get_gen_pool(&pdev->dev);
594 } 595 }
595 EXPORT_SYMBOL_GPL(of_get_named_gen_pool); 596 EXPORT_SYMBOL_GPL(of_get_named_gen_pool);
596 #endif /* CONFIG_OF */ 597 #endif /* CONFIG_OF */
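A hedged sketch of the device-tree lookup from a client driver; the "sram" property name is purely illustrative and would be whatever binding the consuming driver defines.

static struct gen_pool *example_get_dt_pool(struct device *dev)
{
	/* Follows the first phandle in the (assumed) "sram" property and
	 * returns the pool registered against that node's device, if any. */
	struct gen_pool *pool = of_get_named_gen_pool(dev->of_node, "sram", 0);

	if (!pool)
		dev_err(dev, "no gen_pool behind the 'sram' phandle\n");
	return pool;
}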
597 598
1 /* 1 /*
2 * linux/mm/memory.c 2 * linux/mm/memory.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of 8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus 9 * things wanted, and it should be easy to implement. - Linus
10 */ 10 */
11 11
12 /* 12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared 13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus. 14 * pages started 02.12.91, seems to work. - Linus.
15 * 15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it 16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as 17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see. 18 * far as I could see.
19 * 19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them. 20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */ 21 */
22 22
23 /* 23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and 24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well.. 25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. 26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now. 27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root. 28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */ 29 */
30 30
31 /* 31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1. 32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk) 33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 * 34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG 35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de) 36 * (Gerhard.Wichert@pdb.siemens.de)
37 * 37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) 38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */ 39 */
40 40
41 #include <linux/kernel_stat.h> 41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h> 42 #include <linux/mm.h>
43 #include <linux/hugetlb.h> 43 #include <linux/hugetlb.h>
44 #include <linux/mman.h> 44 #include <linux/mman.h>
45 #include <linux/swap.h> 45 #include <linux/swap.h>
46 #include <linux/highmem.h> 46 #include <linux/highmem.h>
47 #include <linux/pagemap.h> 47 #include <linux/pagemap.h>
48 #include <linux/ksm.h> 48 #include <linux/ksm.h>
49 #include <linux/rmap.h> 49 #include <linux/rmap.h>
50 #include <linux/export.h> 50 #include <linux/export.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/writeback.h> 53 #include <linux/writeback.h>
54 #include <linux/memcontrol.h> 54 #include <linux/memcontrol.h>
55 #include <linux/mmu_notifier.h> 55 #include <linux/mmu_notifier.h>
56 #include <linux/kallsyms.h> 56 #include <linux/kallsyms.h>
57 #include <linux/swapops.h> 57 #include <linux/swapops.h>
58 #include <linux/elf.h> 58 #include <linux/elf.h>
59 #include <linux/gfp.h> 59 #include <linux/gfp.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/string.h> 61 #include <linux/string.h>
62 #include <linux/dma-debug.h> 62 #include <linux/dma-debug.h>
63 #include <linux/debugfs.h> 63 #include <linux/debugfs.h>
64 64
65 #include <asm/io.h> 65 #include <asm/io.h>
66 #include <asm/pgalloc.h> 66 #include <asm/pgalloc.h>
67 #include <asm/uaccess.h> 67 #include <asm/uaccess.h>
68 #include <asm/tlb.h> 68 #include <asm/tlb.h>
69 #include <asm/tlbflush.h> 69 #include <asm/tlbflush.h>
70 #include <asm/pgtable.h> 70 #include <asm/pgtable.h>
71 71
72 #include "internal.h" 72 #include "internal.h"
73 73
74 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS 74 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
75 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. 75 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
76 #endif 76 #endif
77 77
78 #ifndef CONFIG_NEED_MULTIPLE_NODES 78 #ifndef CONFIG_NEED_MULTIPLE_NODES
79 /* use the per-pgdat data instead for discontigmem - mbligh */ 79 /* use the per-pgdat data instead for discontigmem - mbligh */
80 unsigned long max_mapnr; 80 unsigned long max_mapnr;
81 struct page *mem_map; 81 struct page *mem_map;
82 82
83 EXPORT_SYMBOL(max_mapnr); 83 EXPORT_SYMBOL(max_mapnr);
84 EXPORT_SYMBOL(mem_map); 84 EXPORT_SYMBOL(mem_map);
85 #endif 85 #endif
86 86
87 /* 87 /*
88 * A number of key systems in x86 including ioremap() rely on the assumption 88 * A number of key systems in x86 including ioremap() rely on the assumption
89 * that high_memory defines the upper bound on direct map memory, the end 89 * that high_memory defines the upper bound on direct map memory, the end
90 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and 90 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
91 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL 91 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
92 * and ZONE_HIGHMEM. 92 * and ZONE_HIGHMEM.
93 */ 93 */
94 void * high_memory; 94 void * high_memory;
95 95
96 EXPORT_SYMBOL(high_memory); 96 EXPORT_SYMBOL(high_memory);
97 97
98 /* 98 /*
99 * Randomize the address space (stacks, mmaps, brk, etc.). 99 * Randomize the address space (stacks, mmaps, brk, etc.).
100 * 100 *
101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, 101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
102 * as ancient (libc5 based) binaries can segfault. ) 102 * as ancient (libc5 based) binaries can segfault. )
103 */ 103 */
104 int randomize_va_space __read_mostly = 104 int randomize_va_space __read_mostly =
105 #ifdef CONFIG_COMPAT_BRK 105 #ifdef CONFIG_COMPAT_BRK
106 1; 106 1;
107 #else 107 #else
108 2; 108 2;
109 #endif 109 #endif
110 110
111 static int __init disable_randmaps(char *s) 111 static int __init disable_randmaps(char *s)
112 { 112 {
113 randomize_va_space = 0; 113 randomize_va_space = 0;
114 return 1; 114 return 1;
115 } 115 }
116 __setup("norandmaps", disable_randmaps); 116 __setup("norandmaps", disable_randmaps);
117 117
118 unsigned long zero_pfn __read_mostly; 118 unsigned long zero_pfn __read_mostly;
119 unsigned long highest_memmap_pfn __read_mostly; 119 unsigned long highest_memmap_pfn __read_mostly;
120 120
121 EXPORT_SYMBOL(zero_pfn); 121 EXPORT_SYMBOL(zero_pfn);
122 122
123 /* 123 /*
124 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 124 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
125 */ 125 */
126 static int __init init_zero_pfn(void) 126 static int __init init_zero_pfn(void)
127 { 127 {
128 zero_pfn = page_to_pfn(ZERO_PAGE(0)); 128 zero_pfn = page_to_pfn(ZERO_PAGE(0));
129 return 0; 129 return 0;
130 } 130 }
131 core_initcall(init_zero_pfn); 131 core_initcall(init_zero_pfn);
132 132
133 133
134 #if defined(SPLIT_RSS_COUNTING) 134 #if defined(SPLIT_RSS_COUNTING)
135 135
136 void sync_mm_rss(struct mm_struct *mm) 136 void sync_mm_rss(struct mm_struct *mm)
137 { 137 {
138 int i; 138 int i;
139 139
140 for (i = 0; i < NR_MM_COUNTERS; i++) { 140 for (i = 0; i < NR_MM_COUNTERS; i++) {
141 if (current->rss_stat.count[i]) { 141 if (current->rss_stat.count[i]) {
142 add_mm_counter(mm, i, current->rss_stat.count[i]); 142 add_mm_counter(mm, i, current->rss_stat.count[i]);
143 current->rss_stat.count[i] = 0; 143 current->rss_stat.count[i] = 0;
144 } 144 }
145 } 145 }
146 current->rss_stat.events = 0; 146 current->rss_stat.events = 0;
147 } 147 }
148 148
149 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 149 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
150 { 150 {
151 struct task_struct *task = current; 151 struct task_struct *task = current;
152 152
153 if (likely(task->mm == mm)) 153 if (likely(task->mm == mm))
154 task->rss_stat.count[member] += val; 154 task->rss_stat.count[member] += val;
155 else 155 else
156 add_mm_counter(mm, member, val); 156 add_mm_counter(mm, member, val);
157 } 157 }
158 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) 158 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
159 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) 159 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
160 160
161 /* sync counter once per 64 page faults */ 161 /* sync counter once per 64 page faults */
162 #define TASK_RSS_EVENTS_THRESH (64) 162 #define TASK_RSS_EVENTS_THRESH (64)
163 static void check_sync_rss_stat(struct task_struct *task) 163 static void check_sync_rss_stat(struct task_struct *task)
164 { 164 {
165 if (unlikely(task != current)) 165 if (unlikely(task != current))
166 return; 166 return;
167 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 167 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
168 sync_mm_rss(task->mm); 168 sync_mm_rss(task->mm);
169 } 169 }
170 #else /* SPLIT_RSS_COUNTING */ 170 #else /* SPLIT_RSS_COUNTING */
171 171
172 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 172 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
173 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 173 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
174 174
175 static void check_sync_rss_stat(struct task_struct *task) 175 static void check_sync_rss_stat(struct task_struct *task)
176 { 176 {
177 } 177 }
178 178
179 #endif /* SPLIT_RSS_COUNTING */ 179 #endif /* SPLIT_RSS_COUNTING */
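A hedged sketch of how the fault paths later in this file use the helpers above: the fast variant only touches the cheap per-task cache when the task is operating on its own mm, and check_sync_rss_stat() folds the cache back into the mm roughly every TASK_RSS_EVENTS_THRESH events. The wrapper name is illustrative.

static void example_account_anon_page(struct mm_struct *mm)
{
	/* Cheap: no atomic op on the mm counters when mm == current->mm. */
	inc_mm_counter_fast(mm, MM_ANONPAGES);

	/* Occasionally flush the per-task deltas back into the mm. */
	check_sync_rss_stat(current);
}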
180 180
181 #ifdef HAVE_GENERIC_MMU_GATHER 181 #ifdef HAVE_GENERIC_MMU_GATHER
182 182
183 static int tlb_next_batch(struct mmu_gather *tlb) 183 static int tlb_next_batch(struct mmu_gather *tlb)
184 { 184 {
185 struct mmu_gather_batch *batch; 185 struct mmu_gather_batch *batch;
186 186
187 batch = tlb->active; 187 batch = tlb->active;
188 if (batch->next) { 188 if (batch->next) {
189 tlb->active = batch->next; 189 tlb->active = batch->next;
190 return 1; 190 return 1;
191 } 191 }
192 192
193 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) 193 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
194 return 0; 194 return 0;
195 195
196 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 196 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
197 if (!batch) 197 if (!batch)
198 return 0; 198 return 0;
199 199
200 tlb->batch_count++; 200 tlb->batch_count++;
201 batch->next = NULL; 201 batch->next = NULL;
202 batch->nr = 0; 202 batch->nr = 0;
203 batch->max = MAX_GATHER_BATCH; 203 batch->max = MAX_GATHER_BATCH;
204 204
205 tlb->active->next = batch; 205 tlb->active->next = batch;
206 tlb->active = batch; 206 tlb->active = batch;
207 207
208 return 1; 208 return 1;
209 } 209 }
210 210
211 /* tlb_gather_mmu 211 /* tlb_gather_mmu
212 * Called to initialize an (on-stack) mmu_gather structure for page-table 212 * Called to initialize an (on-stack) mmu_gather structure for page-table
213 * tear-down from @mm. The @fullmm argument is used when @mm is without 213 * tear-down from @mm. The @fullmm argument is used when @mm is without
214 * users and we're going to destroy the full address space (exit/execve). 214 * users and we're going to destroy the full address space (exit/execve).
215 */ 215 */
216 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 216 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
217 { 217 {
218 tlb->mm = mm; 218 tlb->mm = mm;
219 219
220 /* Is it from 0 to ~0? */ 220 /* Is it from 0 to ~0? */
221 tlb->fullmm = !(start | (end+1)); 221 tlb->fullmm = !(start | (end+1));
222 tlb->need_flush_all = 0; 222 tlb->need_flush_all = 0;
223 tlb->start = start; 223 tlb->start = start;
224 tlb->end = end; 224 tlb->end = end;
225 tlb->need_flush = 0; 225 tlb->need_flush = 0;
226 tlb->local.next = NULL; 226 tlb->local.next = NULL;
227 tlb->local.nr = 0; 227 tlb->local.nr = 0;
228 tlb->local.max = ARRAY_SIZE(tlb->__pages); 228 tlb->local.max = ARRAY_SIZE(tlb->__pages);
229 tlb->active = &tlb->local; 229 tlb->active = &tlb->local;
230 tlb->batch_count = 0; 230 tlb->batch_count = 0;
231 231
232 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 232 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
233 tlb->batch = NULL; 233 tlb->batch = NULL;
234 #endif 234 #endif
235 } 235 }
236 236
237 static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) 237 static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
238 { 238 {
239 tlb->need_flush = 0; 239 tlb->need_flush = 0;
240 tlb_flush(tlb); 240 tlb_flush(tlb);
241 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 241 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
242 tlb_table_flush(tlb); 242 tlb_table_flush(tlb);
243 #endif 243 #endif
244 } 244 }
245 245
246 static void tlb_flush_mmu_free(struct mmu_gather *tlb) 246 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
247 { 247 {
248 struct mmu_gather_batch *batch; 248 struct mmu_gather_batch *batch;
249 249
250 for (batch = &tlb->local; batch; batch = batch->next) { 250 for (batch = &tlb->local; batch; batch = batch->next) {
251 free_pages_and_swap_cache(batch->pages, batch->nr); 251 free_pages_and_swap_cache(batch->pages, batch->nr);
252 batch->nr = 0; 252 batch->nr = 0;
253 } 253 }
254 tlb->active = &tlb->local; 254 tlb->active = &tlb->local;
255 } 255 }
256 256
257 void tlb_flush_mmu(struct mmu_gather *tlb) 257 void tlb_flush_mmu(struct mmu_gather *tlb)
258 { 258 {
259 if (!tlb->need_flush) 259 if (!tlb->need_flush)
260 return; 260 return;
261 tlb_flush_mmu_tlbonly(tlb); 261 tlb_flush_mmu_tlbonly(tlb);
262 tlb_flush_mmu_free(tlb); 262 tlb_flush_mmu_free(tlb);
263 } 263 }
264 264
265 /* tlb_finish_mmu 265 /* tlb_finish_mmu
266 * Called at the end of the shootdown operation to free up any resources 266 * Called at the end of the shootdown operation to free up any resources
267 * that were required. 267 * that were required.
268 */ 268 */
269 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) 269 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
270 { 270 {
271 struct mmu_gather_batch *batch, *next; 271 struct mmu_gather_batch *batch, *next;
272 272
273 tlb_flush_mmu(tlb); 273 tlb_flush_mmu(tlb);
274 274
275 /* keep the page table cache within bounds */ 275 /* keep the page table cache within bounds */
276 check_pgt_cache(); 276 check_pgt_cache();
277 277
278 for (batch = tlb->local.next; batch; batch = next) { 278 for (batch = tlb->local.next; batch; batch = next) {
279 next = batch->next; 279 next = batch->next;
280 free_pages((unsigned long)batch, 0); 280 free_pages((unsigned long)batch, 0);
281 } 281 }
282 tlb->local.next = NULL; 282 tlb->local.next = NULL;
283 } 283 }
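A hedged sketch of the overall mmu_gather lifecycle that the unmap paths in this file follow (zap_page_range() further down has the same shape); the page-table walk itself is elided because it depends on the caller, and the function name is illustrative.

static void example_teardown_range(struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, start, end);	/* set up batching state */

	/*
	 * ... walk the page tables for [start, end), handing each freed
	 * page to __tlb_remove_page() and calling tlb_flush_mmu() when
	 * it reports that no batch slots are left ...
	 */

	tlb_finish_mmu(&tlb, start, end);	/* final TLB flush + free batches */
}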
284 284
285 /* __tlb_remove_page 285 /* __tlb_remove_page
286 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while 286 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
287 * handling the additional races in SMP caused by other CPUs caching valid 287 * handling the additional races in SMP caused by other CPUs caching valid
288 * mappings in their TLBs. Returns the number of free page slots left. 288 * mappings in their TLBs. Returns the number of free page slots left.
289 * When out of page slots we must call tlb_flush_mmu(). 289 * When out of page slots we must call tlb_flush_mmu().
290 */ 290 */
291 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 291 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
292 { 292 {
293 struct mmu_gather_batch *batch; 293 struct mmu_gather_batch *batch;
294 294
295 VM_BUG_ON(!tlb->need_flush); 295 VM_BUG_ON(!tlb->need_flush);
296 296
297 batch = tlb->active; 297 batch = tlb->active;
298 batch->pages[batch->nr++] = page; 298 batch->pages[batch->nr++] = page;
299 if (batch->nr == batch->max) { 299 if (batch->nr == batch->max) {
300 if (!tlb_next_batch(tlb)) 300 if (!tlb_next_batch(tlb))
301 return 0; 301 return 0;
302 batch = tlb->active; 302 batch = tlb->active;
303 } 303 }
304 VM_BUG_ON_PAGE(batch->nr > batch->max, page); 304 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
305 305
306 return batch->max - batch->nr; 306 return batch->max - batch->nr;
307 } 307 }
308 308
309 #endif /* HAVE_GENERIC_MMU_GATHER */ 309 #endif /* HAVE_GENERIC_MMU_GATHER */
310 310
311 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 311 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
312 312
313 /* 313 /*
314 * See the comment near struct mmu_table_batch. 314 * See the comment near struct mmu_table_batch.
315 */ 315 */
316 316
317 static void tlb_remove_table_smp_sync(void *arg) 317 static void tlb_remove_table_smp_sync(void *arg)
318 { 318 {
319 /* Simply deliver the interrupt */ 319 /* Simply deliver the interrupt */
320 } 320 }
321 321
322 static void tlb_remove_table_one(void *table) 322 static void tlb_remove_table_one(void *table)
323 { 323 {
324 /* 324 /*
325 * This isn't an RCU grace period and hence the page-tables cannot be 325 * This isn't an RCU grace period and hence the page-tables cannot be
326 * assumed to be actually RCU-freed. 326 * assumed to be actually RCU-freed.
327 * 327 *
328 * It is however sufficient for software page-table walkers that rely on 328 * It is however sufficient for software page-table walkers that rely on
329 * IRQ disabling. See the comment near struct mmu_table_batch. 329 * IRQ disabling. See the comment near struct mmu_table_batch.
330 */ 330 */
331 smp_call_function(tlb_remove_table_smp_sync, NULL, 1); 331 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
332 __tlb_remove_table(table); 332 __tlb_remove_table(table);
333 } 333 }
334 334
335 static void tlb_remove_table_rcu(struct rcu_head *head) 335 static void tlb_remove_table_rcu(struct rcu_head *head)
336 { 336 {
337 struct mmu_table_batch *batch; 337 struct mmu_table_batch *batch;
338 int i; 338 int i;
339 339
340 batch = container_of(head, struct mmu_table_batch, rcu); 340 batch = container_of(head, struct mmu_table_batch, rcu);
341 341
342 for (i = 0; i < batch->nr; i++) 342 for (i = 0; i < batch->nr; i++)
343 __tlb_remove_table(batch->tables[i]); 343 __tlb_remove_table(batch->tables[i]);
344 344
345 free_page((unsigned long)batch); 345 free_page((unsigned long)batch);
346 } 346 }
347 347
348 void tlb_table_flush(struct mmu_gather *tlb) 348 void tlb_table_flush(struct mmu_gather *tlb)
349 { 349 {
350 struct mmu_table_batch **batch = &tlb->batch; 350 struct mmu_table_batch **batch = &tlb->batch;
351 351
352 if (*batch) { 352 if (*batch) {
353 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); 353 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
354 *batch = NULL; 354 *batch = NULL;
355 } 355 }
356 } 356 }
357 357
358 void tlb_remove_table(struct mmu_gather *tlb, void *table) 358 void tlb_remove_table(struct mmu_gather *tlb, void *table)
359 { 359 {
360 struct mmu_table_batch **batch = &tlb->batch; 360 struct mmu_table_batch **batch = &tlb->batch;
361 361
362 tlb->need_flush = 1; 362 tlb->need_flush = 1;
363 363
364 /* 364 /*
365 * When there are fewer than two users of this mm there cannot be a 365 * When there are fewer than two users of this mm there cannot be a
366 * concurrent page-table walk. 366 * concurrent page-table walk.
367 */ 367 */
368 if (atomic_read(&tlb->mm->mm_users) < 2) { 368 if (atomic_read(&tlb->mm->mm_users) < 2) {
369 __tlb_remove_table(table); 369 __tlb_remove_table(table);
370 return; 370 return;
371 } 371 }
372 372
373 if (*batch == NULL) { 373 if (*batch == NULL) {
374 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); 374 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
375 if (*batch == NULL) { 375 if (*batch == NULL) {
376 tlb_remove_table_one(table); 376 tlb_remove_table_one(table);
377 return; 377 return;
378 } 378 }
379 (*batch)->nr = 0; 379 (*batch)->nr = 0;
380 } 380 }
381 (*batch)->tables[(*batch)->nr++] = table; 381 (*batch)->tables[(*batch)->nr++] = table;
382 if ((*batch)->nr == MAX_TABLE_BATCH) 382 if ((*batch)->nr == MAX_TABLE_BATCH)
383 tlb_table_flush(tlb); 383 tlb_table_flush(tlb);
384 } 384 }
385 385
386 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 386 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
387 387
388 /* 388 /*
389 * Note: this doesn't free the actual pages themselves. That 389 * Note: this doesn't free the actual pages themselves. That
390 * has been handled earlier when unmapping all the memory regions. 390 * has been handled earlier when unmapping all the memory regions.
391 */ 391 */
392 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 392 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
393 unsigned long addr) 393 unsigned long addr)
394 { 394 {
395 pgtable_t token = pmd_pgtable(*pmd); 395 pgtable_t token = pmd_pgtable(*pmd);
396 pmd_clear(pmd); 396 pmd_clear(pmd);
397 pte_free_tlb(tlb, token, addr); 397 pte_free_tlb(tlb, token, addr);
398 atomic_long_dec(&tlb->mm->nr_ptes); 398 atomic_long_dec(&tlb->mm->nr_ptes);
399 } 399 }
400 400
401 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 401 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
402 unsigned long addr, unsigned long end, 402 unsigned long addr, unsigned long end,
403 unsigned long floor, unsigned long ceiling) 403 unsigned long floor, unsigned long ceiling)
404 { 404 {
405 pmd_t *pmd; 405 pmd_t *pmd;
406 unsigned long next; 406 unsigned long next;
407 unsigned long start; 407 unsigned long start;
408 408
409 start = addr; 409 start = addr;
410 pmd = pmd_offset(pud, addr); 410 pmd = pmd_offset(pud, addr);
411 do { 411 do {
412 next = pmd_addr_end(addr, end); 412 next = pmd_addr_end(addr, end);
413 if (pmd_none_or_clear_bad(pmd)) 413 if (pmd_none_or_clear_bad(pmd))
414 continue; 414 continue;
415 free_pte_range(tlb, pmd, addr); 415 free_pte_range(tlb, pmd, addr);
416 } while (pmd++, addr = next, addr != end); 416 } while (pmd++, addr = next, addr != end);
417 417
418 start &= PUD_MASK; 418 start &= PUD_MASK;
419 if (start < floor) 419 if (start < floor)
420 return; 420 return;
421 if (ceiling) { 421 if (ceiling) {
422 ceiling &= PUD_MASK; 422 ceiling &= PUD_MASK;
423 if (!ceiling) 423 if (!ceiling)
424 return; 424 return;
425 } 425 }
426 if (end - 1 > ceiling - 1) 426 if (end - 1 > ceiling - 1)
427 return; 427 return;
428 428
429 pmd = pmd_offset(pud, start); 429 pmd = pmd_offset(pud, start);
430 pud_clear(pud); 430 pud_clear(pud);
431 pmd_free_tlb(tlb, pmd, start); 431 pmd_free_tlb(tlb, pmd, start);
432 } 432 }
433 433
434 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 434 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
435 unsigned long addr, unsigned long end, 435 unsigned long addr, unsigned long end,
436 unsigned long floor, unsigned long ceiling) 436 unsigned long floor, unsigned long ceiling)
437 { 437 {
438 pud_t *pud; 438 pud_t *pud;
439 unsigned long next; 439 unsigned long next;
440 unsigned long start; 440 unsigned long start;
441 441
442 start = addr; 442 start = addr;
443 pud = pud_offset(pgd, addr); 443 pud = pud_offset(pgd, addr);
444 do { 444 do {
445 next = pud_addr_end(addr, end); 445 next = pud_addr_end(addr, end);
446 if (pud_none_or_clear_bad(pud)) 446 if (pud_none_or_clear_bad(pud))
447 continue; 447 continue;
448 free_pmd_range(tlb, pud, addr, next, floor, ceiling); 448 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
449 } while (pud++, addr = next, addr != end); 449 } while (pud++, addr = next, addr != end);
450 450
451 start &= PGDIR_MASK; 451 start &= PGDIR_MASK;
452 if (start < floor) 452 if (start < floor)
453 return; 453 return;
454 if (ceiling) { 454 if (ceiling) {
455 ceiling &= PGDIR_MASK; 455 ceiling &= PGDIR_MASK;
456 if (!ceiling) 456 if (!ceiling)
457 return; 457 return;
458 } 458 }
459 if (end - 1 > ceiling - 1) 459 if (end - 1 > ceiling - 1)
460 return; 460 return;
461 461
462 pud = pud_offset(pgd, start); 462 pud = pud_offset(pgd, start);
463 pgd_clear(pgd); 463 pgd_clear(pgd);
464 pud_free_tlb(tlb, pud, start); 464 pud_free_tlb(tlb, pud, start);
465 } 465 }
466 466
467 /* 467 /*
468 * This function frees user-level page tables of a process. 468 * This function frees user-level page tables of a process.
469 */ 469 */
470 void free_pgd_range(struct mmu_gather *tlb, 470 void free_pgd_range(struct mmu_gather *tlb,
471 unsigned long addr, unsigned long end, 471 unsigned long addr, unsigned long end,
472 unsigned long floor, unsigned long ceiling) 472 unsigned long floor, unsigned long ceiling)
473 { 473 {
474 pgd_t *pgd; 474 pgd_t *pgd;
475 unsigned long next; 475 unsigned long next;
476 476
477 /* 477 /*
478 * The next few lines have given us lots of grief... 478 * The next few lines have given us lots of grief...
479 * 479 *
480 * Why are we testing PMD* at this top level? Because often 480 * Why are we testing PMD* at this top level? Because often
481 * there will be no work to do at all, and we'd prefer not to 481 * there will be no work to do at all, and we'd prefer not to
482 * go all the way down to the bottom just to discover that. 482 * go all the way down to the bottom just to discover that.
483 * 483 *
484 * Why all these "- 1"s? Because 0 represents both the bottom 484 * Why all these "- 1"s? Because 0 represents both the bottom
485 * of the address space and the top of it (using -1 for the 485 * of the address space and the top of it (using -1 for the
486 * top wouldn't help much: the masks would do the wrong thing). 486 * top wouldn't help much: the masks would do the wrong thing).
487 * The rule is that addr 0 and floor 0 refer to the bottom of 487 * The rule is that addr 0 and floor 0 refer to the bottom of
488 * the address space, but end 0 and ceiling 0 refer to the top. 488 * the address space, but end 0 and ceiling 0 refer to the top.
489 * Comparisons need to use "end - 1" and "ceiling - 1" (though 489 * Comparisons need to use "end - 1" and "ceiling - 1" (though
490 * that end 0 case should be mythical). 490 * that end 0 case should be mythical).
491 * 491 *
492 * Wherever addr is brought up or ceiling brought down, we must 492 * Wherever addr is brought up or ceiling brought down, we must
493 * be careful to reject "the opposite 0" before it confuses the 493 * be careful to reject "the opposite 0" before it confuses the
494 * subsequent tests. But what about where end is brought down 494 * subsequent tests. But what about where end is brought down
495 * by PMD_SIZE below? no, end can't go down to 0 there. 495 * by PMD_SIZE below? no, end can't go down to 0 there.
496 * 496 *
497 * Whereas we round start (addr) and ceiling down, by different 497 * Whereas we round start (addr) and ceiling down, by different
498 * masks at different levels, in order to test whether a table 498 * masks at different levels, in order to test whether a table
499 * now has no other vmas using it, so can be freed, we don't 499 * now has no other vmas using it, so can be freed, we don't
500 * bother to round floor or end up - the tests don't need that. 500 * bother to round floor or end up - the tests don't need that.
501 */ 501 */
502 502
503 addr &= PMD_MASK; 503 addr &= PMD_MASK;
504 if (addr < floor) { 504 if (addr < floor) {
505 addr += PMD_SIZE; 505 addr += PMD_SIZE;
506 if (!addr) 506 if (!addr)
507 return; 507 return;
508 } 508 }
509 if (ceiling) { 509 if (ceiling) {
510 ceiling &= PMD_MASK; 510 ceiling &= PMD_MASK;
511 if (!ceiling) 511 if (!ceiling)
512 return; 512 return;
513 } 513 }
514 if (end - 1 > ceiling - 1) 514 if (end - 1 > ceiling - 1)
515 end -= PMD_SIZE; 515 end -= PMD_SIZE;
516 if (addr > end - 1) 516 if (addr > end - 1)
517 return; 517 return;
518 518
519 pgd = pgd_offset(tlb->mm, addr); 519 pgd = pgd_offset(tlb->mm, addr);
520 do { 520 do {
521 next = pgd_addr_end(addr, end); 521 next = pgd_addr_end(addr, end);
522 if (pgd_none_or_clear_bad(pgd)) 522 if (pgd_none_or_clear_bad(pgd))
523 continue; 523 continue;
524 free_pud_range(tlb, pgd, addr, next, floor, ceiling); 524 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
525 } while (pgd++, addr = next, addr != end); 525 } while (pgd++, addr = next, addr != end);
526 } 526 }
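The "- 1" comparisons in the comment rely on unsigned wraparound: a ceiling of 0 stands for the very top of the address space, and subtracting 1 turns it into ULONG_MAX, which no real end can exceed; when the comparison does succeed the code trims end or returns early, depending on the level. A standalone worked example of just that arithmetic (userspace C, made-up addresses, 64-bit unsigned long assumed):

#include <stdio.h>

int main(void)
{
	unsigned long end = 0x7fff00000000UL;	/* arbitrary example end      */
	unsigned long ceiling = 0;		/* 0 means "top of the space" */

	/* ceiling - 1 wraps to ULONG_MAX, so no finite end exceeds it */
	printf("%d\n", (end - 1) > (ceiling - 1));	/* prints 0 */

	ceiling = 0x7ffe00000000UL;		/* a real ceiling below end   */
	printf("%d\n", (end - 1) > (ceiling - 1));	/* prints 1 */
	return 0;
}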
527 527
528 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, 528 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
529 unsigned long floor, unsigned long ceiling) 529 unsigned long floor, unsigned long ceiling)
530 { 530 {
531 while (vma) { 531 while (vma) {
532 struct vm_area_struct *next = vma->vm_next; 532 struct vm_area_struct *next = vma->vm_next;
533 unsigned long addr = vma->vm_start; 533 unsigned long addr = vma->vm_start;
534 534
535 /* 535 /*
536 * Hide vma from rmap and truncate_pagecache before freeing 536 * Hide vma from rmap and truncate_pagecache before freeing
537 * pgtables 537 * pgtables
538 */ 538 */
539 unlink_anon_vmas(vma); 539 unlink_anon_vmas(vma);
540 unlink_file_vma(vma); 540 unlink_file_vma(vma);
541 541
542 if (is_vm_hugetlb_page(vma)) { 542 if (is_vm_hugetlb_page(vma)) {
543 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 543 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
544 floor, next? next->vm_start: ceiling); 544 floor, next? next->vm_start: ceiling);
545 } else { 545 } else {
546 /* 546 /*
547 * Optimization: gather nearby vmas into one call down 547 * Optimization: gather nearby vmas into one call down
548 */ 548 */
549 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 549 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
550 && !is_vm_hugetlb_page(next)) { 550 && !is_vm_hugetlb_page(next)) {
551 vma = next; 551 vma = next;
552 next = vma->vm_next; 552 next = vma->vm_next;
553 unlink_anon_vmas(vma); 553 unlink_anon_vmas(vma);
554 unlink_file_vma(vma); 554 unlink_file_vma(vma);
555 } 555 }
556 free_pgd_range(tlb, addr, vma->vm_end, 556 free_pgd_range(tlb, addr, vma->vm_end,
557 floor, next? next->vm_start: ceiling); 557 floor, next? next->vm_start: ceiling);
558 } 558 }
559 vma = next; 559 vma = next;
560 } 560 }
561 } 561 }
562 562
563 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 563 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
564 pmd_t *pmd, unsigned long address) 564 pmd_t *pmd, unsigned long address)
565 { 565 {
566 spinlock_t *ptl; 566 spinlock_t *ptl;
567 pgtable_t new = pte_alloc_one(mm, address); 567 pgtable_t new = pte_alloc_one(mm, address);
568 int wait_split_huge_page; 568 int wait_split_huge_page;
569 if (!new) 569 if (!new)
570 return -ENOMEM; 570 return -ENOMEM;
571 571
572 /* 572 /*
573 * Ensure all pte setup (e.g. pte page lock and page clearing) is 573 * Ensure all pte setup (e.g. pte page lock and page clearing) is
574 * visible before the pte is made visible to other CPUs by being 574 * visible before the pte is made visible to other CPUs by being
575 * put into page tables. 575 * put into page tables.
576 * 576 *
577 * The other side of the story is the pointer chasing in the page 577 * The other side of the story is the pointer chasing in the page
578 * table walking code (when walking the page table without locking; 578 * table walking code (when walking the page table without locking;
579 * ie. most of the time). Fortunately, these data accesses consist 579 * ie. most of the time). Fortunately, these data accesses consist
580 * of a chain of data-dependent loads, meaning most CPUs (alpha 580 * of a chain of data-dependent loads, meaning most CPUs (alpha
581 * being the notable exception) will already guarantee loads are 581 * being the notable exception) will already guarantee loads are
582 * seen in-order. See the alpha page table accessors for the 582 * seen in-order. See the alpha page table accessors for the
583 * smp_read_barrier_depends() barriers in page table walking code. 583 * smp_read_barrier_depends() barriers in page table walking code.
584 */ 584 */
585 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 585 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
586 586
587 ptl = pmd_lock(mm, pmd); 587 ptl = pmd_lock(mm, pmd);
588 wait_split_huge_page = 0; 588 wait_split_huge_page = 0;
589 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 589 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
590 atomic_long_inc(&mm->nr_ptes); 590 atomic_long_inc(&mm->nr_ptes);
591 pmd_populate(mm, pmd, new); 591 pmd_populate(mm, pmd, new);
592 new = NULL; 592 new = NULL;
593 } else if (unlikely(pmd_trans_splitting(*pmd))) 593 } else if (unlikely(pmd_trans_splitting(*pmd)))
594 wait_split_huge_page = 1; 594 wait_split_huge_page = 1;
595 spin_unlock(ptl); 595 spin_unlock(ptl);
596 if (new) 596 if (new)
597 pte_free(mm, new); 597 pte_free(mm, new);
598 if (wait_split_huge_page) 598 if (wait_split_huge_page)
599 wait_split_huge_page(vma->anon_vma, pmd); 599 wait_split_huge_page(vma->anon_vma, pmd);
600 return 0; 600 return 0;
601 } 601 }
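The barrier pairs with the data-dependent loads on the lockless reader side: fully initialise the new pte page, smp_wmb(), then publish it through pmd_populate(); a walker that loads the pmd entry and chases it is then guaranteed to see the initialised page. A generic, hedged sketch of that publish/consume pattern (the struct and variable names are invented; only the barrier calls are real kernel primitives):

struct example_table { int ready; };		/* stand-in for a pte page */
static struct example_table *example_published;

static void example_publish(struct example_table *t)
{
	t->ready = 1;				/* all setup of the table ... */
	smp_wmb();				/* ... ordered before ...     */
	example_published = t;			/* ... the pointer is visible */
}

static int example_consume(void)
{
	struct example_table *t = ACCESS_ONCE(example_published);

	smp_read_barrier_depends();		/* no-op everywhere but Alpha */
	return t ? t->ready : 0;		/* sees 1 whenever t != NULL  */
}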
602 602
603 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) 603 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
604 { 604 {
605 pte_t *new = pte_alloc_one_kernel(&init_mm, address); 605 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
606 if (!new) 606 if (!new)
607 return -ENOMEM; 607 return -ENOMEM;
608 608
609 smp_wmb(); /* See comment in __pte_alloc */ 609 smp_wmb(); /* See comment in __pte_alloc */
610 610
611 spin_lock(&init_mm.page_table_lock); 611 spin_lock(&init_mm.page_table_lock);
612 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 612 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
613 pmd_populate_kernel(&init_mm, pmd, new); 613 pmd_populate_kernel(&init_mm, pmd, new);
614 new = NULL; 614 new = NULL;
615 } else 615 } else
616 VM_BUG_ON(pmd_trans_splitting(*pmd)); 616 VM_BUG_ON(pmd_trans_splitting(*pmd));
617 spin_unlock(&init_mm.page_table_lock); 617 spin_unlock(&init_mm.page_table_lock);
618 if (new) 618 if (new)
619 pte_free_kernel(&init_mm, new); 619 pte_free_kernel(&init_mm, new);
620 return 0; 620 return 0;
621 } 621 }
622 622
623 static inline void init_rss_vec(int *rss) 623 static inline void init_rss_vec(int *rss)
624 { 624 {
625 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); 625 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
626 } 626 }
627 627
628 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) 628 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
629 { 629 {
630 int i; 630 int i;
631 631
632 if (current->mm == mm) 632 if (current->mm == mm)
633 sync_mm_rss(mm); 633 sync_mm_rss(mm);
634 for (i = 0; i < NR_MM_COUNTERS; i++) 634 for (i = 0; i < NR_MM_COUNTERS; i++)
635 if (rss[i]) 635 if (rss[i])
636 add_mm_counter(mm, i, rss[i]); 636 add_mm_counter(mm, i, rss[i]);
637 } 637 }
638 638
639 /* 639 /*
640 * This function is called to print an error when a bad pte 640 * This function is called to print an error when a bad pte
641 * is found. For example, we might have a PFN-mapped pte in 641 * is found. For example, we might have a PFN-mapped pte in
642 * a region that doesn't allow it. 642 * a region that doesn't allow it.
643 * 643 *
644 * The calling function must still handle the error. 644 * The calling function must still handle the error.
645 */ 645 */
646 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, 646 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
647 pte_t pte, struct page *page) 647 pte_t pte, struct page *page)
648 { 648 {
649 pgd_t *pgd = pgd_offset(vma->vm_mm, addr); 649 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
650 pud_t *pud = pud_offset(pgd, addr); 650 pud_t *pud = pud_offset(pgd, addr);
651 pmd_t *pmd = pmd_offset(pud, addr); 651 pmd_t *pmd = pmd_offset(pud, addr);
652 struct address_space *mapping; 652 struct address_space *mapping;
653 pgoff_t index; 653 pgoff_t index;
654 static unsigned long resume; 654 static unsigned long resume;
655 static unsigned long nr_shown; 655 static unsigned long nr_shown;
656 static unsigned long nr_unshown; 656 static unsigned long nr_unshown;
657 657
658 /* 658 /*
659 * Allow a burst of 60 reports, then keep quiet for that minute; 659 * Allow a burst of 60 reports, then keep quiet for that minute;
660 * or allow a steady drip of one report per second. 660 * or allow a steady drip of one report per second.
661 */ 661 */
662 if (nr_shown == 60) { 662 if (nr_shown == 60) {
663 if (time_before(jiffies, resume)) { 663 if (time_before(jiffies, resume)) {
664 nr_unshown++; 664 nr_unshown++;
665 return; 665 return;
666 } 666 }
667 if (nr_unshown) { 667 if (nr_unshown) {
668 printk(KERN_ALERT 668 printk(KERN_ALERT
669 "BUG: Bad page map: %lu messages suppressed\n", 669 "BUG: Bad page map: %lu messages suppressed\n",
670 nr_unshown); 670 nr_unshown);
671 nr_unshown = 0; 671 nr_unshown = 0;
672 } 672 }
673 nr_shown = 0; 673 nr_shown = 0;
674 } 674 }
675 if (nr_shown++ == 0) 675 if (nr_shown++ == 0)
676 resume = jiffies + 60 * HZ; 676 resume = jiffies + 60 * HZ;
677 677
678 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; 678 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
679 index = linear_page_index(vma, addr); 679 index = linear_page_index(vma, addr);
680 680
681 printk(KERN_ALERT 681 printk(KERN_ALERT
682 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 682 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
683 current->comm, 683 current->comm,
684 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 684 (long long)pte_val(pte), (long long)pmd_val(*pmd));
685 if (page) 685 if (page)
686 dump_page(page, "bad pte"); 686 dump_page(page, "bad pte");
687 printk(KERN_ALERT 687 printk(KERN_ALERT
688 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 688 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
689 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 689 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
690 /* 690 /*
691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
692 */ 692 */
693 if (vma->vm_ops) 693 if (vma->vm_ops)
694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", 694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
695 vma->vm_ops->fault); 695 vma->vm_ops->fault);
696 if (vma->vm_file) 696 if (vma->vm_file)
697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", 697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
698 vma->vm_file->f_op->mmap); 698 vma->vm_file->f_op->mmap);
699 dump_stack(); 699 dump_stack();
700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
701 } 701 }
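The static counters above implement a simple burst limiter: up to 60 reports may be printed inside the minute that starts with the first one, everything else in that minute is only counted, and a flow of one report per second or slower is never throttled. A worked trace with hypothetical jiffies values (HZ assumed to be 1000):

/*
 * Hypothetical trace of the burst limiter, HZ == 1000:
 *
 *   report  #1 at jiffies 10000 -> nr_shown = 1, resume = 70000
 *   reports up to #60 by 12000  -> all printed, nr_shown = 60
 *   reports before 70000        -> nr_unshown++ only (suppressed)
 *   first report at >= 70000    -> "... messages suppressed" printed,
 *                                  counters reset, new minute started
 *
 * At one report per second the 61st arrives at or after resume, so the
 * "steady drip" case never suppresses anything.
 */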
702 702
703 /* 703 /*
704 * vm_normal_page -- This function gets the "struct page" associated with a pte. 704 * vm_normal_page -- This function gets the "struct page" associated with a pte.
705 * 705 *
706 * "Special" mappings do not wish to be associated with a "struct page" (either 706 * "Special" mappings do not wish to be associated with a "struct page" (either
707 * it doesn't exist, or it exists but they don't want to touch it). In this 707 * it doesn't exist, or it exists but they don't want to touch it). In this
708 * case, NULL is returned here. "Normal" mappings do have a struct page. 708 * case, NULL is returned here. "Normal" mappings do have a struct page.
709 * 709 *
710 * There are 2 broad cases. Firstly, an architecture may define a pte_special() 710 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
711 * pte bit, in which case this function is trivial. Secondly, an architecture 711 * pte bit, in which case this function is trivial. Secondly, an architecture
712 * may not have a spare pte bit, which requires a more complicated scheme, 712 * may not have a spare pte bit, which requires a more complicated scheme,
713 * described below. 713 * described below.
714 * 714 *
715 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a 715 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
716 * special mapping (even if there are underlying and valid "struct pages"). 716 * special mapping (even if there are underlying and valid "struct pages").
717 * COWed pages of a VM_PFNMAP are always normal. 717 * COWed pages of a VM_PFNMAP are always normal.
718 * 718 *
719 * The way we recognize COWed pages within VM_PFNMAP mappings is through the 719 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
720 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit 720 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
721 * set, and the vm_pgoff will point to the first PFN mapped: thus every special 721 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
722 * mapping will always honor the rule 722 * mapping will always honor the rule
723 * 723 *
724 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 724 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
725 * 725 *
726 * And for normal mappings this is false. 726 * And for normal mappings this is false.
727 * 727 *
728 * This restricts such mappings to be a linear translation from virtual address 728 * This restricts such mappings to be a linear translation from virtual address
729 * to pfn. To get around this restriction, we allow arbitrary mappings so long 729 * to pfn. To get around this restriction, we allow arbitrary mappings so long
730 * as the vma is not a COW mapping; in that case, we know that all ptes are 730 * as the vma is not a COW mapping; in that case, we know that all ptes are
731 * special (because none can have been COWed). 731 * special (because none can have been COWed).
732 * 732 *
733 * 733 *
734 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 734 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
735 * 735 *
736 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct 736 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
737 * page" backing, however the difference is that _all_ pages with a struct 737 * page" backing, however the difference is that _all_ pages with a struct
738 * page (that is, those where pfn_valid is true) are refcounted and considered 738 * page (that is, those where pfn_valid is true) are refcounted and considered
739 * normal pages by the VM. The disadvantage is that pages are refcounted 739 * normal pages by the VM. The disadvantage is that pages are refcounted
740 * (which can be slower and simply not an option for some PFNMAP users). The 740 * (which can be slower and simply not an option for some PFNMAP users). The
741 * advantage is that we don't have to follow the strict linearity rule of 741 * advantage is that we don't have to follow the strict linearity rule of
742 * PFNMAP mappings in order to support COWable mappings. 742 * PFNMAP mappings in order to support COWable mappings.
743 * 743 *
744 */ 744 */
745 #ifdef __HAVE_ARCH_PTE_SPECIAL 745 #ifdef __HAVE_ARCH_PTE_SPECIAL
746 # define HAVE_PTE_SPECIAL 1 746 # define HAVE_PTE_SPECIAL 1
747 #else 747 #else
748 # define HAVE_PTE_SPECIAL 0 748 # define HAVE_PTE_SPECIAL 0
749 #endif 749 #endif
750 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 750 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
751 pte_t pte) 751 pte_t pte)
752 { 752 {
753 unsigned long pfn = pte_pfn(pte); 753 unsigned long pfn = pte_pfn(pte);
754 754
755 if (HAVE_PTE_SPECIAL) { 755 if (HAVE_PTE_SPECIAL) {
756 if (likely(!pte_special(pte))) 756 if (likely(!pte_special(pte)))
757 goto check_pfn; 757 goto check_pfn;
758 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 758 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
759 return NULL; 759 return NULL;
760 if (!is_zero_pfn(pfn)) 760 if (!is_zero_pfn(pfn))
761 print_bad_pte(vma, addr, pte, NULL); 761 print_bad_pte(vma, addr, pte, NULL);
762 return NULL; 762 return NULL;
763 } 763 }
764 764
765 /* !HAVE_PTE_SPECIAL case follows: */ 765 /* !HAVE_PTE_SPECIAL case follows: */
766 766
767 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 767 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
768 if (vma->vm_flags & VM_MIXEDMAP) { 768 if (vma->vm_flags & VM_MIXEDMAP) {
769 if (!pfn_valid(pfn)) 769 if (!pfn_valid(pfn))
770 return NULL; 770 return NULL;
771 goto out; 771 goto out;
772 } else { 772 } else {
773 unsigned long off; 773 unsigned long off;
774 off = (addr - vma->vm_start) >> PAGE_SHIFT; 774 off = (addr - vma->vm_start) >> PAGE_SHIFT;
775 if (pfn == vma->vm_pgoff + off) 775 if (pfn == vma->vm_pgoff + off)
776 return NULL; 776 return NULL;
777 if (!is_cow_mapping(vma->vm_flags)) 777 if (!is_cow_mapping(vma->vm_flags))
778 return NULL; 778 return NULL;
779 } 779 }
780 } 780 }
781 781
782 if (is_zero_pfn(pfn)) 782 if (is_zero_pfn(pfn))
783 return NULL; 783 return NULL;
784 check_pfn: 784 check_pfn:
785 if (unlikely(pfn > highest_memmap_pfn)) { 785 if (unlikely(pfn > highest_memmap_pfn)) {
786 print_bad_pte(vma, addr, pte, NULL); 786 print_bad_pte(vma, addr, pte, NULL);
787 return NULL; 787 return NULL;
788 } 788 }
789 789
790 /* 790 /*
791 * NOTE! We still have PageReserved() pages in the page tables. 791 * NOTE! We still have PageReserved() pages in the page tables.
792 * eg. VDSO mappings can cause them to exist. 792 * eg. VDSO mappings can cause them to exist.
793 */ 793 */
794 out: 794 out:
795 return pfn_to_page(pfn); 795 return pfn_to_page(pfn);
796 } 796 }
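The linearity rule quoted in the comment is plain arithmetic: for a raw remap_pfn_range() mapping the pfn in a pte must equal vm_pgoff plus the page index of the address inside the vma, and in the !HAVE_PTE_SPECIAL path a matching pfn is treated as special while a mismatching one in a COW mapping is a normal, COWed page. A standalone check with made-up numbers (userspace C, PAGE_SHIFT assumed to be 12):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long vm_start = 0x700000000000UL;
	unsigned long vm_pgoff = 0x90000;		/* first pfn mapped     */
	unsigned long addr     = vm_start + 5 * 4096;	/* sixth page of vma    */
	unsigned long pte_pfn  = 0x90005;		/* pfn found in the pte */

	unsigned long expect = vm_pgoff + ((addr - vm_start) >> PAGE_SHIFT);

	/* equal -> raw PFNMAP pte (special); unequal -> COWed normal page */
	printf("expected %#lx, got %#lx: %s\n", expect, pte_pfn,
	       expect == pte_pfn ? "special" : "COWed/normal");
	return 0;
}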
797 797
798 /* 798 /*
799 * copy one vm_area from one task to the other. Assumes the page tables 799 * copy one vm_area from one task to the other. Assumes the page tables
800 * already present in the new task to be cleared in the whole range 800 * already present in the new task to be cleared in the whole range
801 * covered by this vma. 801 * covered by this vma.
802 */ 802 */
803 803
804 static inline unsigned long 804 static inline unsigned long
805 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 805 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
806 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 806 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
807 unsigned long addr, int *rss) 807 unsigned long addr, int *rss)
808 { 808 {
809 unsigned long vm_flags = vma->vm_flags; 809 unsigned long vm_flags = vma->vm_flags;
810 pte_t pte = *src_pte; 810 pte_t pte = *src_pte;
811 struct page *page; 811 struct page *page;
812 812
813 /* pte contains position in swap or file, so copy. */ 813 /* pte contains position in swap or file, so copy. */
814 if (unlikely(!pte_present(pte))) { 814 if (unlikely(!pte_present(pte))) {
815 if (!pte_file(pte)) { 815 if (!pte_file(pte)) {
816 swp_entry_t entry = pte_to_swp_entry(pte); 816 swp_entry_t entry = pte_to_swp_entry(pte);
817 817
818 if (swap_duplicate(entry) < 0) 818 if (swap_duplicate(entry) < 0)
819 return entry.val; 819 return entry.val;
820 820
821 /* make sure dst_mm is on swapoff's mmlist. */ 821 /* make sure dst_mm is on swapoff's mmlist. */
822 if (unlikely(list_empty(&dst_mm->mmlist))) { 822 if (unlikely(list_empty(&dst_mm->mmlist))) {
823 spin_lock(&mmlist_lock); 823 spin_lock(&mmlist_lock);
824 if (list_empty(&dst_mm->mmlist)) 824 if (list_empty(&dst_mm->mmlist))
825 list_add(&dst_mm->mmlist, 825 list_add(&dst_mm->mmlist,
826 &src_mm->mmlist); 826 &src_mm->mmlist);
827 spin_unlock(&mmlist_lock); 827 spin_unlock(&mmlist_lock);
828 } 828 }
829 if (likely(!non_swap_entry(entry))) 829 if (likely(!non_swap_entry(entry)))
830 rss[MM_SWAPENTS]++; 830 rss[MM_SWAPENTS]++;
831 else if (is_migration_entry(entry)) { 831 else if (is_migration_entry(entry)) {
832 page = migration_entry_to_page(entry); 832 page = migration_entry_to_page(entry);
833 833
834 if (PageAnon(page)) 834 if (PageAnon(page))
835 rss[MM_ANONPAGES]++; 835 rss[MM_ANONPAGES]++;
836 else 836 else
837 rss[MM_FILEPAGES]++; 837 rss[MM_FILEPAGES]++;
838 838
839 if (is_write_migration_entry(entry) && 839 if (is_write_migration_entry(entry) &&
840 is_cow_mapping(vm_flags)) { 840 is_cow_mapping(vm_flags)) {
841 /* 841 /*
842 * COW mappings require pages in both 842 * COW mappings require pages in both
843 * parent and child to be set to read. 843 * parent and child to be set to read.
844 */ 844 */
845 make_migration_entry_read(&entry); 845 make_migration_entry_read(&entry);
846 pte = swp_entry_to_pte(entry); 846 pte = swp_entry_to_pte(entry);
847 if (pte_swp_soft_dirty(*src_pte)) 847 if (pte_swp_soft_dirty(*src_pte))
848 pte = pte_swp_mksoft_dirty(pte); 848 pte = pte_swp_mksoft_dirty(pte);
849 set_pte_at(src_mm, addr, src_pte, pte); 849 set_pte_at(src_mm, addr, src_pte, pte);
850 } 850 }
851 } 851 }
852 } 852 }
853 goto out_set_pte; 853 goto out_set_pte;
854 } 854 }
855 855
856 /* 856 /*
857 * If it's a COW mapping, write protect it both 857 * If it's a COW mapping, write protect it both
858 * in the parent and the child 858 * in the parent and the child
859 */ 859 */
860 if (is_cow_mapping(vm_flags)) { 860 if (is_cow_mapping(vm_flags)) {
861 ptep_set_wrprotect(src_mm, addr, src_pte); 861 ptep_set_wrprotect(src_mm, addr, src_pte);
862 pte = pte_wrprotect(pte); 862 pte = pte_wrprotect(pte);
863 } 863 }
864 864
865 /* 865 /*
866 * If it's a shared mapping, mark it clean in 866 * If it's a shared mapping, mark it clean in
867 * the child 867 * the child
868 */ 868 */
869 if (vm_flags & VM_SHARED) 869 if (vm_flags & VM_SHARED)
870 pte = pte_mkclean(pte); 870 pte = pte_mkclean(pte);
871 pte = pte_mkold(pte); 871 pte = pte_mkold(pte);
872 872
873 page = vm_normal_page(vma, addr, pte); 873 page = vm_normal_page(vma, addr, pte);
874 if (page) { 874 if (page) {
875 get_page(page); 875 get_page(page);
876 page_dup_rmap(page); 876 page_dup_rmap(page);
877 if (PageAnon(page)) 877 if (PageAnon(page))
878 rss[MM_ANONPAGES]++; 878 rss[MM_ANONPAGES]++;
879 else 879 else
880 rss[MM_FILEPAGES]++; 880 rss[MM_FILEPAGES]++;
881 } 881 }
882 882
883 out_set_pte: 883 out_set_pte:
884 set_pte_at(dst_mm, addr, dst_pte, pte); 884 set_pte_at(dst_mm, addr, dst_pte, pte);
885 return 0; 885 return 0;
886 } 886 }
887 887
888 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 888 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
889 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 889 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
890 unsigned long addr, unsigned long end) 890 unsigned long addr, unsigned long end)
891 { 891 {
892 pte_t *orig_src_pte, *orig_dst_pte; 892 pte_t *orig_src_pte, *orig_dst_pte;
893 pte_t *src_pte, *dst_pte; 893 pte_t *src_pte, *dst_pte;
894 spinlock_t *src_ptl, *dst_ptl; 894 spinlock_t *src_ptl, *dst_ptl;
895 int progress = 0; 895 int progress = 0;
896 int rss[NR_MM_COUNTERS]; 896 int rss[NR_MM_COUNTERS];
897 swp_entry_t entry = (swp_entry_t){0}; 897 swp_entry_t entry = (swp_entry_t){0};
898 898
899 again: 899 again:
900 init_rss_vec(rss); 900 init_rss_vec(rss);
901 901
902 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 902 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
903 if (!dst_pte) 903 if (!dst_pte)
904 return -ENOMEM; 904 return -ENOMEM;
905 src_pte = pte_offset_map(src_pmd, addr); 905 src_pte = pte_offset_map(src_pmd, addr);
906 src_ptl = pte_lockptr(src_mm, src_pmd); 906 src_ptl = pte_lockptr(src_mm, src_pmd);
907 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 907 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
908 orig_src_pte = src_pte; 908 orig_src_pte = src_pte;
909 orig_dst_pte = dst_pte; 909 orig_dst_pte = dst_pte;
910 arch_enter_lazy_mmu_mode(); 910 arch_enter_lazy_mmu_mode();
911 911
912 do { 912 do {
913 /* 913 /*
914 * We are holding two locks at this point - either of them 914 * We are holding two locks at this point - either of them
915 * could generate latencies in another task on another CPU. 915 * could generate latencies in another task on another CPU.
916 */ 916 */
917 if (progress >= 32) { 917 if (progress >= 32) {
918 progress = 0; 918 progress = 0;
919 if (need_resched() || 919 if (need_resched() ||
920 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) 920 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
921 break; 921 break;
922 } 922 }
923 if (pte_none(*src_pte)) { 923 if (pte_none(*src_pte)) {
924 progress++; 924 progress++;
925 continue; 925 continue;
926 } 926 }
927 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, 927 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
928 vma, addr, rss); 928 vma, addr, rss);
929 if (entry.val) 929 if (entry.val)
930 break; 930 break;
931 progress += 8; 931 progress += 8;
932 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 932 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
933 933
934 arch_leave_lazy_mmu_mode(); 934 arch_leave_lazy_mmu_mode();
935 spin_unlock(src_ptl); 935 spin_unlock(src_ptl);
936 pte_unmap(orig_src_pte); 936 pte_unmap(orig_src_pte);
937 add_mm_rss_vec(dst_mm, rss); 937 add_mm_rss_vec(dst_mm, rss);
938 pte_unmap_unlock(orig_dst_pte, dst_ptl); 938 pte_unmap_unlock(orig_dst_pte, dst_ptl);
939 cond_resched(); 939 cond_resched();
940 940
941 if (entry.val) { 941 if (entry.val) {
942 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) 942 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
943 return -ENOMEM; 943 return -ENOMEM;
944 progress = 0; 944 progress = 0;
945 } 945 }
946 if (addr != end) 946 if (addr != end)
947 goto again; 947 goto again;
948 return 0; 948 return 0;
949 } 949 }
950 950
951 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 951 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
952 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, 952 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
953 unsigned long addr, unsigned long end) 953 unsigned long addr, unsigned long end)
954 { 954 {
955 pmd_t *src_pmd, *dst_pmd; 955 pmd_t *src_pmd, *dst_pmd;
956 unsigned long next; 956 unsigned long next;
957 957
958 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); 958 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
959 if (!dst_pmd) 959 if (!dst_pmd)
960 return -ENOMEM; 960 return -ENOMEM;
961 src_pmd = pmd_offset(src_pud, addr); 961 src_pmd = pmd_offset(src_pud, addr);
962 do { 962 do {
963 next = pmd_addr_end(addr, end); 963 next = pmd_addr_end(addr, end);
964 if (pmd_trans_huge(*src_pmd)) { 964 if (pmd_trans_huge(*src_pmd)) {
965 int err; 965 int err;
966 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 966 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
967 err = copy_huge_pmd(dst_mm, src_mm, 967 err = copy_huge_pmd(dst_mm, src_mm,
968 dst_pmd, src_pmd, addr, vma); 968 dst_pmd, src_pmd, addr, vma);
969 if (err == -ENOMEM) 969 if (err == -ENOMEM)
970 return -ENOMEM; 970 return -ENOMEM;
971 if (!err) 971 if (!err)
972 continue; 972 continue;
973 /* fall through */ 973 /* fall through */
974 } 974 }
975 if (pmd_none_or_clear_bad(src_pmd)) 975 if (pmd_none_or_clear_bad(src_pmd))
976 continue; 976 continue;
977 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 977 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
978 vma, addr, next)) 978 vma, addr, next))
979 return -ENOMEM; 979 return -ENOMEM;
980 } while (dst_pmd++, src_pmd++, addr = next, addr != end); 980 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
981 return 0; 981 return 0;
982 } 982 }
983 983
984 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 984 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
985 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 985 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
986 unsigned long addr, unsigned long end) 986 unsigned long addr, unsigned long end)
987 { 987 {
988 pud_t *src_pud, *dst_pud; 988 pud_t *src_pud, *dst_pud;
989 unsigned long next; 989 unsigned long next;
990 990
991 dst_pud = pud_alloc(dst_mm, dst_pgd, addr); 991 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
992 if (!dst_pud) 992 if (!dst_pud)
993 return -ENOMEM; 993 return -ENOMEM;
994 src_pud = pud_offset(src_pgd, addr); 994 src_pud = pud_offset(src_pgd, addr);
995 do { 995 do {
996 next = pud_addr_end(addr, end); 996 next = pud_addr_end(addr, end);
997 if (pud_none_or_clear_bad(src_pud)) 997 if (pud_none_or_clear_bad(src_pud))
998 continue; 998 continue;
999 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 999 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1000 vma, addr, next)) 1000 vma, addr, next))
1001 return -ENOMEM; 1001 return -ENOMEM;
1002 } while (dst_pud++, src_pud++, addr = next, addr != end); 1002 } while (dst_pud++, src_pud++, addr = next, addr != end);
1003 return 0; 1003 return 0;
1004 } 1004 }
1005 1005
1006 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1006 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1007 struct vm_area_struct *vma) 1007 struct vm_area_struct *vma)
1008 { 1008 {
1009 pgd_t *src_pgd, *dst_pgd; 1009 pgd_t *src_pgd, *dst_pgd;
1010 unsigned long next; 1010 unsigned long next;
1011 unsigned long addr = vma->vm_start; 1011 unsigned long addr = vma->vm_start;
1012 unsigned long end = vma->vm_end; 1012 unsigned long end = vma->vm_end;
1013 unsigned long mmun_start; /* For mmu_notifiers */ 1013 unsigned long mmun_start; /* For mmu_notifiers */
1014 unsigned long mmun_end; /* For mmu_notifiers */ 1014 unsigned long mmun_end; /* For mmu_notifiers */
1015 bool is_cow; 1015 bool is_cow;
1016 int ret; 1016 int ret;
1017 1017
1018 /* 1018 /*
1019 * Don't copy ptes where a page fault will fill them correctly. 1019 * Don't copy ptes where a page fault will fill them correctly.
1020 * Fork becomes much lighter when there are big shared or private 1020 * Fork becomes much lighter when there are big shared or private
1021 * readonly mappings. The tradeoff is that copy_page_range is more 1021 * readonly mappings. The tradeoff is that copy_page_range is more
1022 * efficient than faulting. 1022 * efficient than faulting.
1023 */ 1023 */
1024 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | 1024 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1025 VM_PFNMAP | VM_MIXEDMAP))) { 1025 VM_PFNMAP | VM_MIXEDMAP))) {
1026 if (!vma->anon_vma) 1026 if (!vma->anon_vma)
1027 return 0; 1027 return 0;
1028 } 1028 }
1029 1029
1030 if (is_vm_hugetlb_page(vma)) 1030 if (is_vm_hugetlb_page(vma))
1031 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1031 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1032 1032
1033 if (unlikely(vma->vm_flags & VM_PFNMAP)) { 1033 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1034 /* 1034 /*
1035 * We do not free on error cases below as remove_vma 1035 * We do not free on error cases below as remove_vma
1036 * gets called on error from higher level routine 1036 * gets called on error from higher level routine
1037 */ 1037 */
1038 ret = track_pfn_copy(vma); 1038 ret = track_pfn_copy(vma);
1039 if (ret) 1039 if (ret)
1040 return ret; 1040 return ret;
1041 } 1041 }
1042 1042
1043 /* 1043 /*
1044 * We need to invalidate the secondary MMU mappings only when 1044 * We need to invalidate the secondary MMU mappings only when
1045 * there could be a permission downgrade on the ptes of the 1045 * there could be a permission downgrade on the ptes of the
1046 * parent mm. And a permission downgrade will only happen if 1046 * parent mm. And a permission downgrade will only happen if
1047 * is_cow_mapping() returns true. 1047 * is_cow_mapping() returns true.
1048 */ 1048 */
1049 is_cow = is_cow_mapping(vma->vm_flags); 1049 is_cow = is_cow_mapping(vma->vm_flags);
1050 mmun_start = addr; 1050 mmun_start = addr;
1051 mmun_end = end; 1051 mmun_end = end;
1052 if (is_cow) 1052 if (is_cow)
1053 mmu_notifier_invalidate_range_start(src_mm, mmun_start, 1053 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1054 mmun_end); 1054 mmun_end);
1055 1055
1056 ret = 0; 1056 ret = 0;
1057 dst_pgd = pgd_offset(dst_mm, addr); 1057 dst_pgd = pgd_offset(dst_mm, addr);
1058 src_pgd = pgd_offset(src_mm, addr); 1058 src_pgd = pgd_offset(src_mm, addr);
1059 do { 1059 do {
1060 next = pgd_addr_end(addr, end); 1060 next = pgd_addr_end(addr, end);
1061 if (pgd_none_or_clear_bad(src_pgd)) 1061 if (pgd_none_or_clear_bad(src_pgd))
1062 continue; 1062 continue;
1063 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 1063 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1064 vma, addr, next))) { 1064 vma, addr, next))) {
1065 ret = -ENOMEM; 1065 ret = -ENOMEM;
1066 break; 1066 break;
1067 } 1067 }
1068 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1068 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1069 1069
1070 if (is_cow) 1070 if (is_cow)
1071 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); 1071 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1072 return ret; 1072 return ret;
1073 } 1073 }
1074 1074
1075 static unsigned long zap_pte_range(struct mmu_gather *tlb, 1075 static unsigned long zap_pte_range(struct mmu_gather *tlb,
1076 struct vm_area_struct *vma, pmd_t *pmd, 1076 struct vm_area_struct *vma, pmd_t *pmd,
1077 unsigned long addr, unsigned long end, 1077 unsigned long addr, unsigned long end,
1078 struct zap_details *details) 1078 struct zap_details *details)
1079 { 1079 {
1080 struct mm_struct *mm = tlb->mm; 1080 struct mm_struct *mm = tlb->mm;
1081 int force_flush = 0; 1081 int force_flush = 0;
1082 int rss[NR_MM_COUNTERS]; 1082 int rss[NR_MM_COUNTERS];
1083 spinlock_t *ptl; 1083 spinlock_t *ptl;
1084 pte_t *start_pte; 1084 pte_t *start_pte;
1085 pte_t *pte; 1085 pte_t *pte;
1086 1086
1087 again: 1087 again:
1088 init_rss_vec(rss); 1088 init_rss_vec(rss);
1089 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1089 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1090 pte = start_pte; 1090 pte = start_pte;
1091 arch_enter_lazy_mmu_mode(); 1091 arch_enter_lazy_mmu_mode();
1092 do { 1092 do {
1093 pte_t ptent = *pte; 1093 pte_t ptent = *pte;
1094 if (pte_none(ptent)) { 1094 if (pte_none(ptent)) {
1095 continue; 1095 continue;
1096 } 1096 }
1097 1097
1098 if (pte_present(ptent)) { 1098 if (pte_present(ptent)) {
1099 struct page *page; 1099 struct page *page;
1100 1100
1101 page = vm_normal_page(vma, addr, ptent); 1101 page = vm_normal_page(vma, addr, ptent);
1102 if (unlikely(details) && page) { 1102 if (unlikely(details) && page) {
1103 /* 1103 /*
1104 * unmap_shared_mapping_pages() wants to 1104 * unmap_shared_mapping_pages() wants to
1105 * invalidate cache without truncating: 1105 * invalidate cache without truncating:
1106 * unmap shared but keep private pages. 1106 * unmap shared but keep private pages.
1107 */ 1107 */
1108 if (details->check_mapping && 1108 if (details->check_mapping &&
1109 details->check_mapping != page->mapping) 1109 details->check_mapping != page->mapping)
1110 continue; 1110 continue;
1111 /* 1111 /*
1112 * Each page->index must be checked when 1112 * Each page->index must be checked when
1113 * invalidating or truncating nonlinear. 1113 * invalidating or truncating nonlinear.
1114 */ 1114 */
1115 if (details->nonlinear_vma && 1115 if (details->nonlinear_vma &&
1116 (page->index < details->first_index || 1116 (page->index < details->first_index ||
1117 page->index > details->last_index)) 1117 page->index > details->last_index))
1118 continue; 1118 continue;
1119 } 1119 }
1120 ptent = ptep_get_and_clear_full(mm, addr, pte, 1120 ptent = ptep_get_and_clear_full(mm, addr, pte,
1121 tlb->fullmm); 1121 tlb->fullmm);
1122 tlb_remove_tlb_entry(tlb, pte, addr); 1122 tlb_remove_tlb_entry(tlb, pte, addr);
1123 if (unlikely(!page)) 1123 if (unlikely(!page))
1124 continue; 1124 continue;
1125 if (unlikely(details) && details->nonlinear_vma 1125 if (unlikely(details) && details->nonlinear_vma
1126 && linear_page_index(details->nonlinear_vma, 1126 && linear_page_index(details->nonlinear_vma,
1127 addr) != page->index) { 1127 addr) != page->index) {
1128 pte_t ptfile = pgoff_to_pte(page->index); 1128 pte_t ptfile = pgoff_to_pte(page->index);
1129 if (pte_soft_dirty(ptent)) 1129 if (pte_soft_dirty(ptent))
1130 pte_file_mksoft_dirty(ptfile); 1130 ptfile = pte_file_mksoft_dirty(ptfile);
1131 set_pte_at(mm, addr, pte, ptfile); 1131 set_pte_at(mm, addr, pte, ptfile);
1132 } 1132 }
1133 if (PageAnon(page)) 1133 if (PageAnon(page))
1134 rss[MM_ANONPAGES]--; 1134 rss[MM_ANONPAGES]--;
1135 else { 1135 else {
1136 if (pte_dirty(ptent)) { 1136 if (pte_dirty(ptent)) {
1137 force_flush = 1; 1137 force_flush = 1;
1138 set_page_dirty(page); 1138 set_page_dirty(page);
1139 } 1139 }
1140 if (pte_young(ptent) && 1140 if (pte_young(ptent) &&
1141 likely(!(vma->vm_flags & VM_SEQ_READ))) 1141 likely(!(vma->vm_flags & VM_SEQ_READ)))
1142 mark_page_accessed(page); 1142 mark_page_accessed(page);
1143 rss[MM_FILEPAGES]--; 1143 rss[MM_FILEPAGES]--;
1144 } 1144 }
1145 page_remove_rmap(page); 1145 page_remove_rmap(page);
1146 if (unlikely(page_mapcount(page) < 0)) 1146 if (unlikely(page_mapcount(page) < 0))
1147 print_bad_pte(vma, addr, ptent, page); 1147 print_bad_pte(vma, addr, ptent, page);
1148 if (unlikely(!__tlb_remove_page(tlb, page))) { 1148 if (unlikely(!__tlb_remove_page(tlb, page))) {
1149 force_flush = 1; 1149 force_flush = 1;
1150 break; 1150 break;
1151 } 1151 }
1152 continue; 1152 continue;
1153 } 1153 }
1154 /* 1154 /*
1155 * If details->check_mapping, we leave swap entries; 1155 * If details->check_mapping, we leave swap entries;
1156 * if details->nonlinear_vma, we leave file entries. 1156 * if details->nonlinear_vma, we leave file entries.
1157 */ 1157 */
1158 if (unlikely(details)) 1158 if (unlikely(details))
1159 continue; 1159 continue;
1160 if (pte_file(ptent)) { 1160 if (pte_file(ptent)) {
1161 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 1161 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1162 print_bad_pte(vma, addr, ptent, NULL); 1162 print_bad_pte(vma, addr, ptent, NULL);
1163 } else { 1163 } else {
1164 swp_entry_t entry = pte_to_swp_entry(ptent); 1164 swp_entry_t entry = pte_to_swp_entry(ptent);
1165 1165
1166 if (!non_swap_entry(entry)) 1166 if (!non_swap_entry(entry))
1167 rss[MM_SWAPENTS]--; 1167 rss[MM_SWAPENTS]--;
1168 else if (is_migration_entry(entry)) { 1168 else if (is_migration_entry(entry)) {
1169 struct page *page; 1169 struct page *page;
1170 1170
1171 page = migration_entry_to_page(entry); 1171 page = migration_entry_to_page(entry);
1172 1172
1173 if (PageAnon(page)) 1173 if (PageAnon(page))
1174 rss[MM_ANONPAGES]--; 1174 rss[MM_ANONPAGES]--;
1175 else 1175 else
1176 rss[MM_FILEPAGES]--; 1176 rss[MM_FILEPAGES]--;
1177 } 1177 }
1178 if (unlikely(!free_swap_and_cache(entry))) 1178 if (unlikely(!free_swap_and_cache(entry)))
1179 print_bad_pte(vma, addr, ptent, NULL); 1179 print_bad_pte(vma, addr, ptent, NULL);
1180 } 1180 }
1181 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1181 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1182 } while (pte++, addr += PAGE_SIZE, addr != end); 1182 } while (pte++, addr += PAGE_SIZE, addr != end);
1183 1183
1184 add_mm_rss_vec(mm, rss); 1184 add_mm_rss_vec(mm, rss);
1185 arch_leave_lazy_mmu_mode(); 1185 arch_leave_lazy_mmu_mode();
1186 1186
1187 /* Do the actual TLB flush before dropping ptl */ 1187 /* Do the actual TLB flush before dropping ptl */
1188 if (force_flush) { 1188 if (force_flush) {
1189 unsigned long old_end; 1189 unsigned long old_end;
1190 1190
1191 /* 1191 /*
1192 * Flush the TLB just for the previous segment, 1192 * Flush the TLB just for the previous segment,
1193 * then update the range to be the remaining 1193 * then update the range to be the remaining
1194 * TLB range. 1194 * TLB range.
1195 */ 1195 */
1196 old_end = tlb->end; 1196 old_end = tlb->end;
1197 tlb->end = addr; 1197 tlb->end = addr;
1198 tlb_flush_mmu_tlbonly(tlb); 1198 tlb_flush_mmu_tlbonly(tlb);
1199 tlb->start = addr; 1199 tlb->start = addr;
1200 tlb->end = old_end; 1200 tlb->end = old_end;
1201 } 1201 }
1202 pte_unmap_unlock(start_pte, ptl); 1202 pte_unmap_unlock(start_pte, ptl);
1203 1203
1204 /* 1204 /*
1205 * If we forced a TLB flush (either due to running out of 1205 * If we forced a TLB flush (either due to running out of
1206 * batch buffers or because we needed to flush dirty TLB 1206 * batch buffers or because we needed to flush dirty TLB
1207 * entries before releasing the ptl), free the batched 1207 * entries before releasing the ptl), free the batched
1208 * memory too. Restart if we didn't do everything. 1208 * memory too. Restart if we didn't do everything.
1209 */ 1209 */
1210 if (force_flush) { 1210 if (force_flush) {
1211 force_flush = 0; 1211 force_flush = 0;
1212 tlb_flush_mmu_free(tlb); 1212 tlb_flush_mmu_free(tlb);
1213 1213
1214 if (addr != end) 1214 if (addr != end)
1215 goto again; 1215 goto again;
1216 } 1216 }
1217 1217
1218 return addr; 1218 return addr;
1219 } 1219 }
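The force_flush bookkeeping above flushes only the span already walked while the ptl is still held, then restores the gather range so the untouched remainder is covered by a later flush. A short hypothetical trace of the tlb->start/tlb->end juggling:

/*
 * Hypothetical values for the partial flush above:
 *
 *   gather range          tlb->start = 0x400000, tlb->end = 0x800000
 *   loop stops early at   addr = 0x600000 (batch full or dirty pte seen)
 *
 *   old_end    = 0x800000
 *   tlb->end   = 0x600000  -> tlb_flush_mmu_tlbonly() covers [0x400000, 0x600000)
 *   tlb->start = 0x600000
 *   tlb->end   = 0x800000  -> the remainder is flushed later as usual
 */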
1220 1220
1221 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1221 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1222 struct vm_area_struct *vma, pud_t *pud, 1222 struct vm_area_struct *vma, pud_t *pud,
1223 unsigned long addr, unsigned long end, 1223 unsigned long addr, unsigned long end,
1224 struct zap_details *details) 1224 struct zap_details *details)
1225 { 1225 {
1226 pmd_t *pmd; 1226 pmd_t *pmd;
1227 unsigned long next; 1227 unsigned long next;
1228 1228
1229 pmd = pmd_offset(pud, addr); 1229 pmd = pmd_offset(pud, addr);
1230 do { 1230 do {
1231 next = pmd_addr_end(addr, end); 1231 next = pmd_addr_end(addr, end);
1232 if (pmd_trans_huge(*pmd)) { 1232 if (pmd_trans_huge(*pmd)) {
1233 if (next - addr != HPAGE_PMD_SIZE) { 1233 if (next - addr != HPAGE_PMD_SIZE) {
1234 #ifdef CONFIG_DEBUG_VM 1234 #ifdef CONFIG_DEBUG_VM
1235 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { 1235 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1236 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", 1236 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1237 __func__, addr, end, 1237 __func__, addr, end,
1238 vma->vm_start, 1238 vma->vm_start,
1239 vma->vm_end); 1239 vma->vm_end);
1240 BUG(); 1240 BUG();
1241 } 1241 }
1242 #endif 1242 #endif
1243 split_huge_page_pmd(vma, addr, pmd); 1243 split_huge_page_pmd(vma, addr, pmd);
1244 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1244 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1245 goto next; 1245 goto next;
1246 /* fall through */ 1246 /* fall through */
1247 } 1247 }
1248 /* 1248 /*
1249 * Here there can be other concurrent MADV_DONTNEED or 1249 * Here there can be other concurrent MADV_DONTNEED or
1250 * trans huge page faults running, and if the pmd is 1250 * trans huge page faults running, and if the pmd is
1251 * none or trans huge it can change under us. This is 1251 * none or trans huge it can change under us. This is
1252 * because MADV_DONTNEED holds the mmap_sem in read 1252 * because MADV_DONTNEED holds the mmap_sem in read
1253 * mode. 1253 * mode.
1254 */ 1254 */
1255 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1255 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1256 goto next; 1256 goto next;
1257 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1257 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1258 next: 1258 next:
1259 cond_resched(); 1259 cond_resched();
1260 } while (pmd++, addr = next, addr != end); 1260 } while (pmd++, addr = next, addr != end);
1261 1261
1262 return addr; 1262 return addr;
1263 } 1263 }
1264 1264
1265 static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1265 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1266 struct vm_area_struct *vma, pgd_t *pgd, 1266 struct vm_area_struct *vma, pgd_t *pgd,
1267 unsigned long addr, unsigned long end, 1267 unsigned long addr, unsigned long end,
1268 struct zap_details *details) 1268 struct zap_details *details)
1269 { 1269 {
1270 pud_t *pud; 1270 pud_t *pud;
1271 unsigned long next; 1271 unsigned long next;
1272 1272
1273 pud = pud_offset(pgd, addr); 1273 pud = pud_offset(pgd, addr);
1274 do { 1274 do {
1275 next = pud_addr_end(addr, end); 1275 next = pud_addr_end(addr, end);
1276 if (pud_none_or_clear_bad(pud)) 1276 if (pud_none_or_clear_bad(pud))
1277 continue; 1277 continue;
1278 next = zap_pmd_range(tlb, vma, pud, addr, next, details); 1278 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1279 } while (pud++, addr = next, addr != end); 1279 } while (pud++, addr = next, addr != end);
1280 1280
1281 return addr; 1281 return addr;
1282 } 1282 }
1283 1283
1284 static void unmap_page_range(struct mmu_gather *tlb, 1284 static void unmap_page_range(struct mmu_gather *tlb,
1285 struct vm_area_struct *vma, 1285 struct vm_area_struct *vma,
1286 unsigned long addr, unsigned long end, 1286 unsigned long addr, unsigned long end,
1287 struct zap_details *details) 1287 struct zap_details *details)
1288 { 1288 {
1289 pgd_t *pgd; 1289 pgd_t *pgd;
1290 unsigned long next; 1290 unsigned long next;
1291 1291
1292 if (details && !details->check_mapping && !details->nonlinear_vma) 1292 if (details && !details->check_mapping && !details->nonlinear_vma)
1293 details = NULL; 1293 details = NULL;
1294 1294
1295 BUG_ON(addr >= end); 1295 BUG_ON(addr >= end);
1296 tlb_start_vma(tlb, vma); 1296 tlb_start_vma(tlb, vma);
1297 pgd = pgd_offset(vma->vm_mm, addr); 1297 pgd = pgd_offset(vma->vm_mm, addr);
1298 do { 1298 do {
1299 next = pgd_addr_end(addr, end); 1299 next = pgd_addr_end(addr, end);
1300 if (pgd_none_or_clear_bad(pgd)) 1300 if (pgd_none_or_clear_bad(pgd))
1301 continue; 1301 continue;
1302 next = zap_pud_range(tlb, vma, pgd, addr, next, details); 1302 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1303 } while (pgd++, addr = next, addr != end); 1303 } while (pgd++, addr = next, addr != end);
1304 tlb_end_vma(tlb, vma); 1304 tlb_end_vma(tlb, vma);
1305 } 1305 }
1306 1306
1307 1307
1308 static void unmap_single_vma(struct mmu_gather *tlb, 1308 static void unmap_single_vma(struct mmu_gather *tlb,
1309 struct vm_area_struct *vma, unsigned long start_addr, 1309 struct vm_area_struct *vma, unsigned long start_addr,
1310 unsigned long end_addr, 1310 unsigned long end_addr,
1311 struct zap_details *details) 1311 struct zap_details *details)
1312 { 1312 {
1313 unsigned long start = max(vma->vm_start, start_addr); 1313 unsigned long start = max(vma->vm_start, start_addr);
1314 unsigned long end; 1314 unsigned long end;
1315 1315
1316 if (start >= vma->vm_end) 1316 if (start >= vma->vm_end)
1317 return; 1317 return;
1318 end = min(vma->vm_end, end_addr); 1318 end = min(vma->vm_end, end_addr);
1319 if (end <= vma->vm_start) 1319 if (end <= vma->vm_start)
1320 return; 1320 return;
1321 1321
1322 if (vma->vm_file) 1322 if (vma->vm_file)
1323 uprobe_munmap(vma, start, end); 1323 uprobe_munmap(vma, start, end);
1324 1324
1325 if (unlikely(vma->vm_flags & VM_PFNMAP)) 1325 if (unlikely(vma->vm_flags & VM_PFNMAP))
1326 untrack_pfn(vma, 0, 0); 1326 untrack_pfn(vma, 0, 0);
1327 1327
1328 if (start != end) { 1328 if (start != end) {
1329 if (unlikely(is_vm_hugetlb_page(vma))) { 1329 if (unlikely(is_vm_hugetlb_page(vma))) {
1330 /* 1330 /*
1331 * It is undesirable to test vma->vm_file as it 1331 * It is undesirable to test vma->vm_file as it
1332 * should be non-null for valid hugetlb area. 1332 * should be non-null for valid hugetlb area.
1333 * However, vm_file will be NULL in the error 1333 * However, vm_file will be NULL in the error
1334 * cleanup path of mmap_region. When 1334 * cleanup path of mmap_region. When
1335 * hugetlbfs ->mmap method fails, 1335 * hugetlbfs ->mmap method fails,
1336 * mmap_region() nullifies vma->vm_file 1336 * mmap_region() nullifies vma->vm_file
1337 * before calling this function to clean up. 1337 * before calling this function to clean up.
1338 * Since no pte has actually been set up, it is 1338 * Since no pte has actually been set up, it is
1339 * safe to do nothing in this case. 1339 * safe to do nothing in this case.
1340 */ 1340 */
1341 if (vma->vm_file) { 1341 if (vma->vm_file) {
1342 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 1342 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1343 __unmap_hugepage_range_final(tlb, vma, start, end, NULL); 1343 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1344 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 1344 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1345 } 1345 }
1346 } else 1346 } else
1347 unmap_page_range(tlb, vma, start, end, details); 1347 unmap_page_range(tlb, vma, start, end, details);
1348 } 1348 }
1349 } 1349 }
1350 1350
1351 /** 1351 /**
1352 * unmap_vmas - unmap a range of memory covered by a list of vma's 1352 * unmap_vmas - unmap a range of memory covered by a list of vma's
1353 * @tlb: address of the caller's struct mmu_gather 1353 * @tlb: address of the caller's struct mmu_gather
1354 * @vma: the starting vma 1354 * @vma: the starting vma
1355 * @start_addr: virtual address at which to start unmapping 1355 * @start_addr: virtual address at which to start unmapping
1356 * @end_addr: virtual address at which to end unmapping 1356 * @end_addr: virtual address at which to end unmapping
1357 * 1357 *
1358 * Unmap all pages in the vma list. 1358 * Unmap all pages in the vma list.
1359 * 1359 *
1360 * Only addresses between @start_addr and @end_addr will be unmapped. 1360 * Only addresses between @start_addr and @end_addr will be unmapped.
1361 * 1361 *
1362 * The VMA list must be sorted in ascending virtual address order. 1362 * The VMA list must be sorted in ascending virtual address order.
1363 * 1363 *
1364 * unmap_vmas() assumes that the caller will flush the whole unmapped address 1364 * unmap_vmas() assumes that the caller will flush the whole unmapped address
1365 * range after unmap_vmas() returns. So the only responsibility here is to 1365 * range after unmap_vmas() returns. So the only responsibility here is to
1366 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1366 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1367 * drops the lock and schedules. 1367 * drops the lock and schedules.
1368 */ 1368 */
1369 void unmap_vmas(struct mmu_gather *tlb, 1369 void unmap_vmas(struct mmu_gather *tlb,
1370 struct vm_area_struct *vma, unsigned long start_addr, 1370 struct vm_area_struct *vma, unsigned long start_addr,
1371 unsigned long end_addr) 1371 unsigned long end_addr)
1372 { 1372 {
1373 struct mm_struct *mm = vma->vm_mm; 1373 struct mm_struct *mm = vma->vm_mm;
1374 1374
1375 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1375 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1376 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1376 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1377 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); 1377 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1378 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1378 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1379 } 1379 }
1380 1380
1381 /** 1381 /**
1382 * zap_page_range - remove user pages in a given range 1382 * zap_page_range - remove user pages in a given range
1383 * @vma: vm_area_struct holding the applicable pages 1383 * @vma: vm_area_struct holding the applicable pages
1384 * @start: starting address of pages to zap 1384 * @start: starting address of pages to zap
1385 * @size: number of bytes to zap 1385 * @size: number of bytes to zap
1386 * @details: details of nonlinear truncation or shared cache invalidation 1386 * @details: details of nonlinear truncation or shared cache invalidation
1387 * 1387 *
1388 * Caller must protect the VMA list 1388 * Caller must protect the VMA list
1389 */ 1389 */
1390 void zap_page_range(struct vm_area_struct *vma, unsigned long start, 1390 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1391 unsigned long size, struct zap_details *details) 1391 unsigned long size, struct zap_details *details)
1392 { 1392 {
1393 struct mm_struct *mm = vma->vm_mm; 1393 struct mm_struct *mm = vma->vm_mm;
1394 struct mmu_gather tlb; 1394 struct mmu_gather tlb;
1395 unsigned long end = start + size; 1395 unsigned long end = start + size;
1396 1396
1397 lru_add_drain(); 1397 lru_add_drain();
1398 tlb_gather_mmu(&tlb, mm, start, end); 1398 tlb_gather_mmu(&tlb, mm, start, end);
1399 update_hiwater_rss(mm); 1399 update_hiwater_rss(mm);
1400 mmu_notifier_invalidate_range_start(mm, start, end); 1400 mmu_notifier_invalidate_range_start(mm, start, end);
1401 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1401 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1402 unmap_single_vma(&tlb, vma, start, end, details); 1402 unmap_single_vma(&tlb, vma, start, end, details);
1403 mmu_notifier_invalidate_range_end(mm, start, end); 1403 mmu_notifier_invalidate_range_end(mm, start, end);
1404 tlb_finish_mmu(&tlb, start, end); 1404 tlb_finish_mmu(&tlb, start, end);
1405 } 1405 }
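For illustration only (this sketch is not part of the diff): an in-kernel caller, in the style of MADV_DONTNEED handling, might discard the pages backing part of a VMA like this. The function name is hypothetical, and the caller is assumed to hold mm->mmap_sem so the VMA list is protected, as the kerneldoc above requires.

	static void example_discard_range(struct vm_area_struct *vma,
					  unsigned long start, unsigned long len)
	{
		/* Caller holds mm->mmap_sem; NULL details means a plain zap. */
		zap_page_range(vma, start, len, NULL);
	}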
1406 1406
1407 /** 1407 /**
1408 * zap_page_range_single - remove user pages in a given range 1408 * zap_page_range_single - remove user pages in a given range
1409 * @vma: vm_area_struct holding the applicable pages 1409 * @vma: vm_area_struct holding the applicable pages
1410 * @address: starting address of pages to zap 1410 * @address: starting address of pages to zap
1411 * @size: number of bytes to zap 1411 * @size: number of bytes to zap
1412 * @details: details of nonlinear truncation or shared cache invalidation 1412 * @details: details of nonlinear truncation or shared cache invalidation
1413 * 1413 *
1414 * The range must fit into one VMA. 1414 * The range must fit into one VMA.
1415 */ 1415 */
1416 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1416 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1417 unsigned long size, struct zap_details *details) 1417 unsigned long size, struct zap_details *details)
1418 { 1418 {
1419 struct mm_struct *mm = vma->vm_mm; 1419 struct mm_struct *mm = vma->vm_mm;
1420 struct mmu_gather tlb; 1420 struct mmu_gather tlb;
1421 unsigned long end = address + size; 1421 unsigned long end = address + size;
1422 1422
1423 lru_add_drain(); 1423 lru_add_drain();
1424 tlb_gather_mmu(&tlb, mm, address, end); 1424 tlb_gather_mmu(&tlb, mm, address, end);
1425 update_hiwater_rss(mm); 1425 update_hiwater_rss(mm);
1426 mmu_notifier_invalidate_range_start(mm, address, end); 1426 mmu_notifier_invalidate_range_start(mm, address, end);
1427 unmap_single_vma(&tlb, vma, address, end, details); 1427 unmap_single_vma(&tlb, vma, address, end, details);
1428 mmu_notifier_invalidate_range_end(mm, address, end); 1428 mmu_notifier_invalidate_range_end(mm, address, end);
1429 tlb_finish_mmu(&tlb, address, end); 1429 tlb_finish_mmu(&tlb, address, end);
1430 } 1430 }
1431 1431
1432 /** 1432 /**
1433 * zap_vma_ptes - remove ptes mapping the vma 1433 * zap_vma_ptes - remove ptes mapping the vma
1434 * @vma: vm_area_struct holding ptes to be zapped 1434 * @vma: vm_area_struct holding ptes to be zapped
1435 * @address: starting address of pages to zap 1435 * @address: starting address of pages to zap
1436 * @size: number of bytes to zap 1436 * @size: number of bytes to zap
1437 * 1437 *
1438 * This function only unmaps ptes assigned to VM_PFNMAP vmas. 1438 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1439 * 1439 *
1440 * The entire address range must be fully contained within the vma. 1440 * The entire address range must be fully contained within the vma.
1441 * 1441 *
1442 * Returns 0 if successful. 1442 * Returns 0 if successful.
1443 */ 1443 */
1444 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 1444 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1445 unsigned long size) 1445 unsigned long size)
1446 { 1446 {
1447 if (address < vma->vm_start || address + size > vma->vm_end || 1447 if (address < vma->vm_start || address + size > vma->vm_end ||
1448 !(vma->vm_flags & VM_PFNMAP)) 1448 !(vma->vm_flags & VM_PFNMAP))
1449 return -1; 1449 return -1;
1450 zap_page_range_single(vma, address, size, NULL); 1450 zap_page_range_single(vma, address, size, NULL);
1451 return 0; 1451 return 0;
1452 } 1452 }
1453 EXPORT_SYMBOL_GPL(zap_vma_ptes); 1453 EXPORT_SYMBOL_GPL(zap_vma_ptes);
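As a hedged usage sketch (not taken from this commit): a driver that established a VM_PFNMAP mapping could revoke the user's PTEs before releasing the backing region. my_revoke_mapping() is a hypothetical name.

	static int my_revoke_mapping(struct vm_area_struct *vma)
	{
		/* The range must lie entirely inside a VM_PFNMAP vma,
		 * otherwise zap_vma_ptes() returns -1. */
		return zap_vma_ptes(vma, vma->vm_start,
				    vma->vm_end - vma->vm_start);
	}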
1454 1454
1455 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1455 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1456 spinlock_t **ptl) 1456 spinlock_t **ptl)
1457 { 1457 {
1458 pgd_t * pgd = pgd_offset(mm, addr); 1458 pgd_t * pgd = pgd_offset(mm, addr);
1459 pud_t * pud = pud_alloc(mm, pgd, addr); 1459 pud_t * pud = pud_alloc(mm, pgd, addr);
1460 if (pud) { 1460 if (pud) {
1461 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1461 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1462 if (pmd) { 1462 if (pmd) {
1463 VM_BUG_ON(pmd_trans_huge(*pmd)); 1463 VM_BUG_ON(pmd_trans_huge(*pmd));
1464 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1464 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1465 } 1465 }
1466 } 1466 }
1467 return NULL; 1467 return NULL;
1468 } 1468 }
1469 1469
1470 /* 1470 /*
1471 * This is the old fallback for page remapping. 1471 * This is the old fallback for page remapping.
1472 * 1472 *
1473 * For historical reasons, it only allows reserved pages. Only 1473 * For historical reasons, it only allows reserved pages. Only
1474 * old drivers should use this, and they needed to mark their 1474 * old drivers should use this, and they needed to mark their
1475 * pages reserved for the old functions anyway. 1475 * pages reserved for the old functions anyway.
1476 */ 1476 */
1477 static int insert_page(struct vm_area_struct *vma, unsigned long addr, 1477 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1478 struct page *page, pgprot_t prot) 1478 struct page *page, pgprot_t prot)
1479 { 1479 {
1480 struct mm_struct *mm = vma->vm_mm; 1480 struct mm_struct *mm = vma->vm_mm;
1481 int retval; 1481 int retval;
1482 pte_t *pte; 1482 pte_t *pte;
1483 spinlock_t *ptl; 1483 spinlock_t *ptl;
1484 1484
1485 retval = -EINVAL; 1485 retval = -EINVAL;
1486 if (PageAnon(page)) 1486 if (PageAnon(page))
1487 goto out; 1487 goto out;
1488 retval = -ENOMEM; 1488 retval = -ENOMEM;
1489 flush_dcache_page(page); 1489 flush_dcache_page(page);
1490 pte = get_locked_pte(mm, addr, &ptl); 1490 pte = get_locked_pte(mm, addr, &ptl);
1491 if (!pte) 1491 if (!pte)
1492 goto out; 1492 goto out;
1493 retval = -EBUSY; 1493 retval = -EBUSY;
1494 if (!pte_none(*pte)) 1494 if (!pte_none(*pte))
1495 goto out_unlock; 1495 goto out_unlock;
1496 1496
1497 /* Ok, finally just insert the thing.. */ 1497 /* Ok, finally just insert the thing.. */
1498 get_page(page); 1498 get_page(page);
1499 inc_mm_counter_fast(mm, MM_FILEPAGES); 1499 inc_mm_counter_fast(mm, MM_FILEPAGES);
1500 page_add_file_rmap(page); 1500 page_add_file_rmap(page);
1501 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1501 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1502 1502
1503 retval = 0; 1503 retval = 0;
1504 pte_unmap_unlock(pte, ptl); 1504 pte_unmap_unlock(pte, ptl);
1505 return retval; 1505 return retval;
1506 out_unlock: 1506 out_unlock:
1507 pte_unmap_unlock(pte, ptl); 1507 pte_unmap_unlock(pte, ptl);
1508 out: 1508 out:
1509 return retval; 1509 return retval;
1510 } 1510 }
1511 1511
1512 /** 1512 /**
1513 * vm_insert_page - insert single page into user vma 1513 * vm_insert_page - insert single page into user vma
1514 * @vma: user vma to map to 1514 * @vma: user vma to map to
1515 * @addr: target user address of this page 1515 * @addr: target user address of this page
1516 * @page: source kernel page 1516 * @page: source kernel page
1517 * 1517 *
1518 * This allows drivers to insert individual pages they've allocated 1518 * This allows drivers to insert individual pages they've allocated
1519 * into a user vma. 1519 * into a user vma.
1520 * 1520 *
1521 * The page has to be a nice clean _individual_ kernel allocation. 1521 * The page has to be a nice clean _individual_ kernel allocation.
1522 * If you allocate a compound page, you need to have marked it as 1522 * If you allocate a compound page, you need to have marked it as
1523 * such (__GFP_COMP), or manually just split the page up yourself 1523 * such (__GFP_COMP), or manually just split the page up yourself
1524 * (see split_page()). 1524 * (see split_page()).
1525 * 1525 *
1526 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1526 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1527 * took an arbitrary page protection parameter. This doesn't allow 1527 * took an arbitrary page protection parameter. This doesn't allow
1528 * that. Your vma protection will have to be set up correctly, which 1528 * that. Your vma protection will have to be set up correctly, which
1529 * means that if you want a shared writable mapping, you'd better 1529 * means that if you want a shared writable mapping, you'd better
1530 * ask for a shared writable mapping! 1530 * ask for a shared writable mapping!
1531 * 1531 *
1532 * The page does not need to be reserved. 1532 * The page does not need to be reserved.
1533 * 1533 *
1534 * Usually this function is called from f_op->mmap() handler 1534 * Usually this function is called from f_op->mmap() handler
1535 * under mm->mmap_sem write-lock, so it can change vma->vm_flags. 1535 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
1536 * Caller must set VM_MIXEDMAP on vma if it wants to call this 1536 * Caller must set VM_MIXEDMAP on vma if it wants to call this
1537 * function from other places, for example from page-fault handler. 1537 * function from other places, for example from page-fault handler.
1538 */ 1538 */
1539 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 1539 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1540 struct page *page) 1540 struct page *page)
1541 { 1541 {
1542 if (addr < vma->vm_start || addr >= vma->vm_end) 1542 if (addr < vma->vm_start || addr >= vma->vm_end)
1543 return -EFAULT; 1543 return -EFAULT;
1544 if (!page_count(page)) 1544 if (!page_count(page))
1545 return -EINVAL; 1545 return -EINVAL;
1546 if (!(vma->vm_flags & VM_MIXEDMAP)) { 1546 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1547 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); 1547 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1548 BUG_ON(vma->vm_flags & VM_PFNMAP); 1548 BUG_ON(vma->vm_flags & VM_PFNMAP);
1549 vma->vm_flags |= VM_MIXEDMAP; 1549 vma->vm_flags |= VM_MIXEDMAP;
1550 } 1550 }
1551 return insert_page(vma, addr, page, vma->vm_page_prot); 1551 return insert_page(vma, addr, page, vma->vm_page_prot);
1552 } 1552 }
1553 EXPORT_SYMBOL(vm_insert_page); 1553 EXPORT_SYMBOL(vm_insert_page);
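A minimal sketch of the call site the comment describes, an f_op->mmap handler running under the mmap_sem write-lock; my_page_mmap() and my_get_backing_page() are hypothetical names, not part of this commit.

	static int my_page_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct page *page = my_get_backing_page(file);	/* assumed helper */

		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EINVAL;
		/* Under the mmap_sem write-lock, so vm_insert_page() may set
		 * VM_MIXEDMAP on the vma itself, as noted above. */
		return vm_insert_page(vma, vma->vm_start, page);
	}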
1554 1554
1555 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1555 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1556 unsigned long pfn, pgprot_t prot) 1556 unsigned long pfn, pgprot_t prot)
1557 { 1557 {
1558 struct mm_struct *mm = vma->vm_mm; 1558 struct mm_struct *mm = vma->vm_mm;
1559 int retval; 1559 int retval;
1560 pte_t *pte, entry; 1560 pte_t *pte, entry;
1561 spinlock_t *ptl; 1561 spinlock_t *ptl;
1562 1562
1563 retval = -ENOMEM; 1563 retval = -ENOMEM;
1564 pte = get_locked_pte(mm, addr, &ptl); 1564 pte = get_locked_pte(mm, addr, &ptl);
1565 if (!pte) 1565 if (!pte)
1566 goto out; 1566 goto out;
1567 retval = -EBUSY; 1567 retval = -EBUSY;
1568 if (!pte_none(*pte)) 1568 if (!pte_none(*pte))
1569 goto out_unlock; 1569 goto out_unlock;
1570 1570
1571 /* Ok, finally just insert the thing.. */ 1571 /* Ok, finally just insert the thing.. */
1572 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1572 entry = pte_mkspecial(pfn_pte(pfn, prot));
1573 set_pte_at(mm, addr, pte, entry); 1573 set_pte_at(mm, addr, pte, entry);
1574 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1574 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1575 1575
1576 retval = 0; 1576 retval = 0;
1577 out_unlock: 1577 out_unlock:
1578 pte_unmap_unlock(pte, ptl); 1578 pte_unmap_unlock(pte, ptl);
1579 out: 1579 out:
1580 return retval; 1580 return retval;
1581 } 1581 }
1582 1582
1583 /** 1583 /**
1584 * vm_insert_pfn - insert single pfn into user vma 1584 * vm_insert_pfn - insert single pfn into user vma
1585 * @vma: user vma to map to 1585 * @vma: user vma to map to
1586 * @addr: target user address of this page 1586 * @addr: target user address of this page
1587 * @pfn: source kernel pfn 1587 * @pfn: source kernel pfn
1588 * 1588 *
1589 * Similar to vm_insert_page, this allows drivers to insert individual pages 1589 * Similar to vm_insert_page, this allows drivers to insert individual pages
1590 * they've allocated into a user vma. Same comments apply. 1590 * they've allocated into a user vma. Same comments apply.
1591 * 1591 *
1592 * This function should only be called from a vm_ops->fault handler, and 1592 * This function should only be called from a vm_ops->fault handler, and
1593 * in that case the handler should return NULL. 1593 * in that case the handler should return NULL.
1594 * 1594 *
1595 * vma cannot be a COW mapping. 1595 * vma cannot be a COW mapping.
1596 * 1596 *
1597 * As this is called only for pages that do not currently exist, we 1597 * As this is called only for pages that do not currently exist, we
1598 * do not need to flush old virtual caches or the TLB. 1598 * do not need to flush old virtual caches or the TLB.
1599 */ 1599 */
1600 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1600 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1601 unsigned long pfn) 1601 unsigned long pfn)
1602 { 1602 {
1603 int ret; 1603 int ret;
1604 pgprot_t pgprot = vma->vm_page_prot; 1604 pgprot_t pgprot = vma->vm_page_prot;
1605 /* 1605 /*
1606 * Technically, architectures with pte_special can avoid all these 1606 * Technically, architectures with pte_special can avoid all these
1607 * restrictions (same for remap_pfn_range). However we would like 1607 * restrictions (same for remap_pfn_range). However we would like
1608 * consistency in testing and feature parity among all, so we should 1608 * consistency in testing and feature parity among all, so we should
1609 * try to keep these invariants in place for everybody. 1609 * try to keep these invariants in place for everybody.
1610 */ 1610 */
1611 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1611 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1612 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1612 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1613 (VM_PFNMAP|VM_MIXEDMAP)); 1613 (VM_PFNMAP|VM_MIXEDMAP));
1614 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1614 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1615 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 1615 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1616 1616
1617 if (addr < vma->vm_start || addr >= vma->vm_end) 1617 if (addr < vma->vm_start || addr >= vma->vm_end)
1618 return -EFAULT; 1618 return -EFAULT;
1619 if (track_pfn_insert(vma, &pgprot, pfn)) 1619 if (track_pfn_insert(vma, &pgprot, pfn))
1620 return -EINVAL; 1620 return -EINVAL;
1621 1621
1622 ret = insert_pfn(vma, addr, pfn, pgprot); 1622 ret = insert_pfn(vma, addr, pfn, pgprot);
1623 1623
1624 return ret; 1624 return ret;
1625 } 1625 }
1626 EXPORT_SYMBOL(vm_insert_pfn); 1626 EXPORT_SYMBOL(vm_insert_pfn);
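A hedged sketch of the intended call site, a vm_ops->fault handler for a VM_PFNMAP vma; struct my_device and its phys_base field are assumptions made purely for illustration.

	static int my_pfn_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct my_device *my_dev = vma->vm_private_data;
		unsigned long pfn = (my_dev->phys_base >> PAGE_SHIFT) + vmf->pgoff;
		int ret;

		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
		if (ret == -ENOMEM)
			return VM_FAULT_OOM;
		if (ret && ret != -EBUSY)	/* -EBUSY: another thread won the race */
			return VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;		/* pte installed, nothing for the core MM to map */
	}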
1627 1627
1628 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1628 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1629 unsigned long pfn) 1629 unsigned long pfn)
1630 { 1630 {
1631 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 1631 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1632 1632
1633 if (addr < vma->vm_start || addr >= vma->vm_end) 1633 if (addr < vma->vm_start || addr >= vma->vm_end)
1634 return -EFAULT; 1634 return -EFAULT;
1635 1635
1636 /* 1636 /*
1637 * If we don't have pte special, then we have to use the pfn_valid() 1637 * If we don't have pte special, then we have to use the pfn_valid()
1638 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1638 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1639 * refcount the page if pfn_valid is true (hence insert_page rather 1639 * refcount the page if pfn_valid is true (hence insert_page rather
1640 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 1640 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1641 * without pte special, it would then be refcounted as a normal page. 1641 * without pte special, it would then be refcounted as a normal page.
1642 */ 1642 */
1643 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1643 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1644 struct page *page; 1644 struct page *page;
1645 1645
1646 page = pfn_to_page(pfn); 1646 page = pfn_to_page(pfn);
1647 return insert_page(vma, addr, page, vma->vm_page_prot); 1647 return insert_page(vma, addr, page, vma->vm_page_prot);
1648 } 1648 }
1649 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1649 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1650 } 1650 }
1651 EXPORT_SYMBOL(vm_insert_mixed); 1651 EXPORT_SYMBOL(vm_insert_mixed);
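The same fault-handler pattern works for VM_MIXEDMAP mappings where only some offsets are backed by a struct page; my_lookup_pfn() is a hypothetical helper used only to keep the sketch short.

	static int my_mixed_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		unsigned long pfn = my_lookup_pfn(vma, vmf->pgoff);	/* assumed helper */
		int ret;

		ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
		if (ret && ret != -EBUSY)
			return VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;
	}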
1652 1652
1653 /* 1653 /*
1654 * Maps a range of physical memory into the requested pages. The old 1654 * Maps a range of physical memory into the requested pages. The old
1655 * mappings are removed. Any references to nonexistent pages result 1655 * mappings are removed. Any references to nonexistent pages result
1656 * in null mappings (currently treated as "copy-on-access"). 1656 * in null mappings (currently treated as "copy-on-access").
1657 */ 1657 */
1658 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 1658 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1659 unsigned long addr, unsigned long end, 1659 unsigned long addr, unsigned long end,
1660 unsigned long pfn, pgprot_t prot) 1660 unsigned long pfn, pgprot_t prot)
1661 { 1661 {
1662 pte_t *pte; 1662 pte_t *pte;
1663 spinlock_t *ptl; 1663 spinlock_t *ptl;
1664 1664
1665 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1665 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1666 if (!pte) 1666 if (!pte)
1667 return -ENOMEM; 1667 return -ENOMEM;
1668 arch_enter_lazy_mmu_mode(); 1668 arch_enter_lazy_mmu_mode();
1669 do { 1669 do {
1670 BUG_ON(!pte_none(*pte)); 1670 BUG_ON(!pte_none(*pte));
1671 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); 1671 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1672 pfn++; 1672 pfn++;
1673 } while (pte++, addr += PAGE_SIZE, addr != end); 1673 } while (pte++, addr += PAGE_SIZE, addr != end);
1674 arch_leave_lazy_mmu_mode(); 1674 arch_leave_lazy_mmu_mode();
1675 pte_unmap_unlock(pte - 1, ptl); 1675 pte_unmap_unlock(pte - 1, ptl);
1676 return 0; 1676 return 0;
1677 } 1677 }
1678 1678
1679 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 1679 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1680 unsigned long addr, unsigned long end, 1680 unsigned long addr, unsigned long end,
1681 unsigned long pfn, pgprot_t prot) 1681 unsigned long pfn, pgprot_t prot)
1682 { 1682 {
1683 pmd_t *pmd; 1683 pmd_t *pmd;
1684 unsigned long next; 1684 unsigned long next;
1685 1685
1686 pfn -= addr >> PAGE_SHIFT; 1686 pfn -= addr >> PAGE_SHIFT;
1687 pmd = pmd_alloc(mm, pud, addr); 1687 pmd = pmd_alloc(mm, pud, addr);
1688 if (!pmd) 1688 if (!pmd)
1689 return -ENOMEM; 1689 return -ENOMEM;
1690 VM_BUG_ON(pmd_trans_huge(*pmd)); 1690 VM_BUG_ON(pmd_trans_huge(*pmd));
1691 do { 1691 do {
1692 next = pmd_addr_end(addr, end); 1692 next = pmd_addr_end(addr, end);
1693 if (remap_pte_range(mm, pmd, addr, next, 1693 if (remap_pte_range(mm, pmd, addr, next,
1694 pfn + (addr >> PAGE_SHIFT), prot)) 1694 pfn + (addr >> PAGE_SHIFT), prot))
1695 return -ENOMEM; 1695 return -ENOMEM;
1696 } while (pmd++, addr = next, addr != end); 1696 } while (pmd++, addr = next, addr != end);
1697 return 0; 1697 return 0;
1698 } 1698 }
1699 1699
1700 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, 1700 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1701 unsigned long addr, unsigned long end, 1701 unsigned long addr, unsigned long end,
1702 unsigned long pfn, pgprot_t prot) 1702 unsigned long pfn, pgprot_t prot)
1703 { 1703 {
1704 pud_t *pud; 1704 pud_t *pud;
1705 unsigned long next; 1705 unsigned long next;
1706 1706
1707 pfn -= addr >> PAGE_SHIFT; 1707 pfn -= addr >> PAGE_SHIFT;
1708 pud = pud_alloc(mm, pgd, addr); 1708 pud = pud_alloc(mm, pgd, addr);
1709 if (!pud) 1709 if (!pud)
1710 return -ENOMEM; 1710 return -ENOMEM;
1711 do { 1711 do {
1712 next = pud_addr_end(addr, end); 1712 next = pud_addr_end(addr, end);
1713 if (remap_pmd_range(mm, pud, addr, next, 1713 if (remap_pmd_range(mm, pud, addr, next,
1714 pfn + (addr >> PAGE_SHIFT), prot)) 1714 pfn + (addr >> PAGE_SHIFT), prot))
1715 return -ENOMEM; 1715 return -ENOMEM;
1716 } while (pud++, addr = next, addr != end); 1716 } while (pud++, addr = next, addr != end);
1717 return 0; 1717 return 0;
1718 } 1718 }
1719 1719
1720 /** 1720 /**
1721 * remap_pfn_range - remap kernel memory to userspace 1721 * remap_pfn_range - remap kernel memory to userspace
1722 * @vma: user vma to map to 1722 * @vma: user vma to map to
1723 * @addr: target user address to start at 1723 * @addr: target user address to start at
1724 * @pfn: physical address of kernel memory 1724 * @pfn: physical address of kernel memory
1725 * @size: size of map area 1725 * @size: size of map area
1726 * @prot: page protection flags for this mapping 1726 * @prot: page protection flags for this mapping
1727 * 1727 *
1728 * Note: this is only safe if the mm semaphore is held when called. 1728 * Note: this is only safe if the mm semaphore is held when called.
1729 */ 1729 */
1730 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1730 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1731 unsigned long pfn, unsigned long size, pgprot_t prot) 1731 unsigned long pfn, unsigned long size, pgprot_t prot)
1732 { 1732 {
1733 pgd_t *pgd; 1733 pgd_t *pgd;
1734 unsigned long next; 1734 unsigned long next;
1735 unsigned long end = addr + PAGE_ALIGN(size); 1735 unsigned long end = addr + PAGE_ALIGN(size);
1736 struct mm_struct *mm = vma->vm_mm; 1736 struct mm_struct *mm = vma->vm_mm;
1737 int err; 1737 int err;
1738 1738
1739 /* 1739 /*
1740 * Physically remapped pages are special. Tell the 1740 * Physically remapped pages are special. Tell the
1741 * rest of the world about it: 1741 * rest of the world about it:
1742 * VM_IO tells people not to look at these pages 1742 * VM_IO tells people not to look at these pages
1743 * (accesses can have side effects). 1743 * (accesses can have side effects).
1744 * VM_PFNMAP tells the core MM that the base pages are just 1744 * VM_PFNMAP tells the core MM that the base pages are just
1745 * raw PFN mappings, and do not have a "struct page" associated 1745 * raw PFN mappings, and do not have a "struct page" associated
1746 * with them. 1746 * with them.
1747 * VM_DONTEXPAND 1747 * VM_DONTEXPAND
1748 * Disable vma merging and expanding with mremap(). 1748 * Disable vma merging and expanding with mremap().
1749 * VM_DONTDUMP 1749 * VM_DONTDUMP
1750 * Omit vma from core dump, even when VM_IO turned off. 1750 * Omit vma from core dump, even when VM_IO turned off.
1751 * 1751 *
1752 * There's a horrible special case to handle copy-on-write 1752 * There's a horrible special case to handle copy-on-write
1753 * behaviour that some programs depend on. We mark the "original" 1753 * behaviour that some programs depend on. We mark the "original"
1754 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 1754 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1755 * See vm_normal_page() for details. 1755 * See vm_normal_page() for details.
1756 */ 1756 */
1757 if (is_cow_mapping(vma->vm_flags)) { 1757 if (is_cow_mapping(vma->vm_flags)) {
1758 if (addr != vma->vm_start || end != vma->vm_end) 1758 if (addr != vma->vm_start || end != vma->vm_end)
1759 return -EINVAL; 1759 return -EINVAL;
1760 vma->vm_pgoff = pfn; 1760 vma->vm_pgoff = pfn;
1761 } 1761 }
1762 1762
1763 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); 1763 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
1764 if (err) 1764 if (err)
1765 return -EINVAL; 1765 return -EINVAL;
1766 1766
1767 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1767 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1768 1768
1769 BUG_ON(addr >= end); 1769 BUG_ON(addr >= end);
1770 pfn -= addr >> PAGE_SHIFT; 1770 pfn -= addr >> PAGE_SHIFT;
1771 pgd = pgd_offset(mm, addr); 1771 pgd = pgd_offset(mm, addr);
1772 flush_cache_range(vma, addr, end); 1772 flush_cache_range(vma, addr, end);
1773 do { 1773 do {
1774 next = pgd_addr_end(addr, end); 1774 next = pgd_addr_end(addr, end);
1775 err = remap_pud_range(mm, pgd, addr, next, 1775 err = remap_pud_range(mm, pgd, addr, next,
1776 pfn + (addr >> PAGE_SHIFT), prot); 1776 pfn + (addr >> PAGE_SHIFT), prot);
1777 if (err) 1777 if (err)
1778 break; 1778 break;
1779 } while (pgd++, addr = next, addr != end); 1779 } while (pgd++, addr = next, addr != end);
1780 1780
1781 if (err) 1781 if (err)
1782 untrack_pfn(vma, pfn, PAGE_ALIGN(size)); 1782 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
1783 1783
1784 return err; 1784 return err;
1785 } 1785 }
1786 EXPORT_SYMBOL(remap_pfn_range); 1786 EXPORT_SYMBOL(remap_pfn_range);
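A minimal mmap-handler sketch exposing a physically contiguous buffer; struct my_device with buf_phys/buf_size is assumed for illustration and is not part of this commit (offset handling via vm_pgoff is omitted for brevity).

	static int my_buf_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct my_device *my_dev = file->private_data;
		unsigned long len = vma->vm_end - vma->vm_start;

		if (len > my_dev->buf_size)
			return -EINVAL;
		/* Marks the vma VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP. */
		return remap_pfn_range(vma, vma->vm_start,
				       my_dev->buf_phys >> PAGE_SHIFT,
				       len, vma->vm_page_prot);
	}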
1787 1787
1788 /** 1788 /**
1789 * vm_iomap_memory - remap memory to userspace 1789 * vm_iomap_memory - remap memory to userspace
1790 * @vma: user vma to map to 1790 * @vma: user vma to map to
1791 * @start: start of area 1791 * @start: start of area
1792 * @len: size of area 1792 * @len: size of area
1793 * 1793 *
1794 * This is a simplified io_remap_pfn_range() for common driver use. The 1794 * This is a simplified io_remap_pfn_range() for common driver use. The
1795 * driver just needs to give us the physical memory range to be mapped, 1795 * driver just needs to give us the physical memory range to be mapped,
1796 * we'll figure out the rest from the vma information. 1796 * we'll figure out the rest from the vma information.
1797 * 1797 *
1798 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to set up 1798 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to set up
1799 * write-combining or similar. 1799 * write-combining or similar.
1800 */ 1800 */
1801 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 1801 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1802 { 1802 {
1803 unsigned long vm_len, pfn, pages; 1803 unsigned long vm_len, pfn, pages;
1804 1804
1805 /* Check that the physical memory area passed in looks valid */ 1805 /* Check that the physical memory area passed in looks valid */
1806 if (start + len < start) 1806 if (start + len < start)
1807 return -EINVAL; 1807 return -EINVAL;
1808 /* 1808 /*
1809 * You *really* shouldn't map things that aren't page-aligned, 1809 * You *really* shouldn't map things that aren't page-aligned,
1810 * but we've historically allowed it because IO memory might 1810 * but we've historically allowed it because IO memory might
1811 * just have smaller alignment. 1811 * just have smaller alignment.
1812 */ 1812 */
1813 len += start & ~PAGE_MASK; 1813 len += start & ~PAGE_MASK;
1814 pfn = start >> PAGE_SHIFT; 1814 pfn = start >> PAGE_SHIFT;
1815 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; 1815 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
1816 if (pfn + pages < pfn) 1816 if (pfn + pages < pfn)
1817 return -EINVAL; 1817 return -EINVAL;
1818 1818
1819 /* We start the mapping 'vm_pgoff' pages into the area */ 1819 /* We start the mapping 'vm_pgoff' pages into the area */
1820 if (vma->vm_pgoff > pages) 1820 if (vma->vm_pgoff > pages)
1821 return -EINVAL; 1821 return -EINVAL;
1822 pfn += vma->vm_pgoff; 1822 pfn += vma->vm_pgoff;
1823 pages -= vma->vm_pgoff; 1823 pages -= vma->vm_pgoff;
1824 1824
1825 /* Can we fit all of the mapping? */ 1825 /* Can we fit all of the mapping? */
1826 vm_len = vma->vm_end - vma->vm_start; 1826 vm_len = vma->vm_end - vma->vm_start;
1827 if (vm_len >> PAGE_SHIFT > pages) 1827 if (vm_len >> PAGE_SHIFT > pages)
1828 return -EINVAL; 1828 return -EINVAL;
1829 1829
1830 /* Ok, let it rip */ 1830 /* Ok, let it rip */
1831 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); 1831 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1832 } 1832 }
1833 EXPORT_SYMBOL(vm_iomap_memory); 1833 EXPORT_SYMBOL(vm_iomap_memory);
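A hedged sketch of the simplified interface: the handler hands over the physical window and lets vm_iomap_memory() derive the pfn, length and offset checks from the vma; my_dev->bar_start and bar_len are assumed fields.

	static int my_io_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct my_device *my_dev = file->private_data;

		/* Optional: uncached access for device registers, per the NOTE above. */
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
		return vm_iomap_memory(vma, my_dev->bar_start, my_dev->bar_len);
	}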
1834 1834
1835 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 1835 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1836 unsigned long addr, unsigned long end, 1836 unsigned long addr, unsigned long end,
1837 pte_fn_t fn, void *data) 1837 pte_fn_t fn, void *data)
1838 { 1838 {
1839 pte_t *pte; 1839 pte_t *pte;
1840 int err; 1840 int err;
1841 pgtable_t token; 1841 pgtable_t token;
1842 spinlock_t *uninitialized_var(ptl); 1842 spinlock_t *uninitialized_var(ptl);
1843 1843
1844 pte = (mm == &init_mm) ? 1844 pte = (mm == &init_mm) ?
1845 pte_alloc_kernel(pmd, addr) : 1845 pte_alloc_kernel(pmd, addr) :
1846 pte_alloc_map_lock(mm, pmd, addr, &ptl); 1846 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1847 if (!pte) 1847 if (!pte)
1848 return -ENOMEM; 1848 return -ENOMEM;
1849 1849
1850 BUG_ON(pmd_huge(*pmd)); 1850 BUG_ON(pmd_huge(*pmd));
1851 1851
1852 arch_enter_lazy_mmu_mode(); 1852 arch_enter_lazy_mmu_mode();
1853 1853
1854 token = pmd_pgtable(*pmd); 1854 token = pmd_pgtable(*pmd);
1855 1855
1856 do { 1856 do {
1857 err = fn(pte++, token, addr, data); 1857 err = fn(pte++, token, addr, data);
1858 if (err) 1858 if (err)
1859 break; 1859 break;
1860 } while (addr += PAGE_SIZE, addr != end); 1860 } while (addr += PAGE_SIZE, addr != end);
1861 1861
1862 arch_leave_lazy_mmu_mode(); 1862 arch_leave_lazy_mmu_mode();
1863 1863
1864 if (mm != &init_mm) 1864 if (mm != &init_mm)
1865 pte_unmap_unlock(pte-1, ptl); 1865 pte_unmap_unlock(pte-1, ptl);
1866 return err; 1866 return err;
1867 } 1867 }
1868 1868
1869 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 1869 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1870 unsigned long addr, unsigned long end, 1870 unsigned long addr, unsigned long end,
1871 pte_fn_t fn, void *data) 1871 pte_fn_t fn, void *data)
1872 { 1872 {
1873 pmd_t *pmd; 1873 pmd_t *pmd;
1874 unsigned long next; 1874 unsigned long next;
1875 int err; 1875 int err;
1876 1876
1877 BUG_ON(pud_huge(*pud)); 1877 BUG_ON(pud_huge(*pud));
1878 1878
1879 pmd = pmd_alloc(mm, pud, addr); 1879 pmd = pmd_alloc(mm, pud, addr);
1880 if (!pmd) 1880 if (!pmd)
1881 return -ENOMEM; 1881 return -ENOMEM;
1882 do { 1882 do {
1883 next = pmd_addr_end(addr, end); 1883 next = pmd_addr_end(addr, end);
1884 err = apply_to_pte_range(mm, pmd, addr, next, fn, data); 1884 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1885 if (err) 1885 if (err)
1886 break; 1886 break;
1887 } while (pmd++, addr = next, addr != end); 1887 } while (pmd++, addr = next, addr != end);
1888 return err; 1888 return err;
1889 } 1889 }
1890 1890
1891 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, 1891 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1892 unsigned long addr, unsigned long end, 1892 unsigned long addr, unsigned long end,
1893 pte_fn_t fn, void *data) 1893 pte_fn_t fn, void *data)
1894 { 1894 {
1895 pud_t *pud; 1895 pud_t *pud;
1896 unsigned long next; 1896 unsigned long next;
1897 int err; 1897 int err;
1898 1898
1899 pud = pud_alloc(mm, pgd, addr); 1899 pud = pud_alloc(mm, pgd, addr);
1900 if (!pud) 1900 if (!pud)
1901 return -ENOMEM; 1901 return -ENOMEM;
1902 do { 1902 do {
1903 next = pud_addr_end(addr, end); 1903 next = pud_addr_end(addr, end);
1904 err = apply_to_pmd_range(mm, pud, addr, next, fn, data); 1904 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1905 if (err) 1905 if (err)
1906 break; 1906 break;
1907 } while (pud++, addr = next, addr != end); 1907 } while (pud++, addr = next, addr != end);
1908 return err; 1908 return err;
1909 } 1909 }
1910 1910
1911 /* 1911 /*
1912 * Scan a region of virtual memory, filling in page tables as necessary 1912 * Scan a region of virtual memory, filling in page tables as necessary
1913 * and calling a provided function on each leaf page table. 1913 * and calling a provided function on each leaf page table.
1914 */ 1914 */
1915 int apply_to_page_range(struct mm_struct *mm, unsigned long addr, 1915 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1916 unsigned long size, pte_fn_t fn, void *data) 1916 unsigned long size, pte_fn_t fn, void *data)
1917 { 1917 {
1918 pgd_t *pgd; 1918 pgd_t *pgd;
1919 unsigned long next; 1919 unsigned long next;
1920 unsigned long end = addr + size; 1920 unsigned long end = addr + size;
1921 int err; 1921 int err;
1922 1922
1923 BUG_ON(addr >= end); 1923 BUG_ON(addr >= end);
1924 pgd = pgd_offset(mm, addr); 1924 pgd = pgd_offset(mm, addr);
1925 do { 1925 do {
1926 next = pgd_addr_end(addr, end); 1926 next = pgd_addr_end(addr, end);
1927 err = apply_to_pud_range(mm, pgd, addr, next, fn, data); 1927 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1928 if (err) 1928 if (err)
1929 break; 1929 break;
1930 } while (pgd++, addr = next, addr != end); 1930 } while (pgd++, addr = next, addr != end);
1931 1931
1932 return err; 1932 return err;
1933 } 1933 }
1934 EXPORT_SYMBOL_GPL(apply_to_page_range); 1934 EXPORT_SYMBOL_GPL(apply_to_page_range);
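A small sketch of the pte_fn_t callback shape (pte, page-table token, address, opaque data); counting present PTEs over a range is purely illustrative and not drawn from this commit.

	static int my_count_present(pte_t *pte, pgtable_t token,
				    unsigned long addr, void *data)
	{
		unsigned long *count = data;

		if (pte_present(*pte))
			(*count)++;
		return 0;	/* a non-zero return stops the walk with that error */
	}

	/* e.g.: unsigned long n = 0;
	 *       apply_to_page_range(mm, addr, size, my_count_present, &n);
	 * Note that missing page tables are allocated along the way. */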
1935 1935
1936 /* 1936 /*
1937 * handle_pte_fault chooses page fault handler according to an entry 1937 * handle_pte_fault chooses page fault handler according to an entry
1938 * which was read non-atomically. Before making any commitment, on 1938 * which was read non-atomically. Before making any commitment, on
1939 * those architectures or configurations (e.g. i386 with PAE) which 1939 * those architectures or configurations (e.g. i386 with PAE) which
1940 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault 1940 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
1941 * must check under lock before unmapping the pte and proceeding 1941 * must check under lock before unmapping the pte and proceeding
1942 * (but do_wp_page is only called after already making such a check; 1942 * (but do_wp_page is only called after already making such a check;
1943 * and do_anonymous_page can safely check later on). 1943 * and do_anonymous_page can safely check later on).
1944 */ 1944 */
1945 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 1945 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1946 pte_t *page_table, pte_t orig_pte) 1946 pte_t *page_table, pte_t orig_pte)
1947 { 1947 {
1948 int same = 1; 1948 int same = 1;
1949 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 1949 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1950 if (sizeof(pte_t) > sizeof(unsigned long)) { 1950 if (sizeof(pte_t) > sizeof(unsigned long)) {
1951 spinlock_t *ptl = pte_lockptr(mm, pmd); 1951 spinlock_t *ptl = pte_lockptr(mm, pmd);
1952 spin_lock(ptl); 1952 spin_lock(ptl);
1953 same = pte_same(*page_table, orig_pte); 1953 same = pte_same(*page_table, orig_pte);
1954 spin_unlock(ptl); 1954 spin_unlock(ptl);
1955 } 1955 }
1956 #endif 1956 #endif
1957 pte_unmap(page_table); 1957 pte_unmap(page_table);
1958 return same; 1958 return same;
1959 } 1959 }
1960 1960
1961 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 1961 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1962 { 1962 {
1963 debug_dma_assert_idle(src); 1963 debug_dma_assert_idle(src);
1964 1964
1965 /* 1965 /*
1966 * If the source page was a PFN mapping, we don't have 1966 * If the source page was a PFN mapping, we don't have
1967 * a "struct page" for it. We do a best-effort copy by 1967 * a "struct page" for it. We do a best-effort copy by
1968 * just copying from the original user address. If that 1968 * just copying from the original user address. If that
1969 * fails, we just zero-fill it. Live with it. 1969 * fails, we just zero-fill it. Live with it.
1970 */ 1970 */
1971 if (unlikely(!src)) { 1971 if (unlikely(!src)) {
1972 void *kaddr = kmap_atomic(dst); 1972 void *kaddr = kmap_atomic(dst);
1973 void __user *uaddr = (void __user *)(va & PAGE_MASK); 1973 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1974 1974
1975 /* 1975 /*
1976 * This really shouldn't fail, because the page is there 1976 * This really shouldn't fail, because the page is there
1977 * in the page tables. But it might just be unreadable, 1977 * in the page tables. But it might just be unreadable,
1978 * in which case we just give up and fill the result with 1978 * in which case we just give up and fill the result with
1979 * zeroes. 1979 * zeroes.
1980 */ 1980 */
1981 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 1981 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1982 clear_page(kaddr); 1982 clear_page(kaddr);
1983 kunmap_atomic(kaddr); 1983 kunmap_atomic(kaddr);
1984 flush_dcache_page(dst); 1984 flush_dcache_page(dst);
1985 } else 1985 } else
1986 copy_user_highpage(dst, src, va, vma); 1986 copy_user_highpage(dst, src, va, vma);
1987 } 1987 }
1988 1988
1989 /* 1989 /*
1990 * Notify the address space that the page is about to become writable so that 1990 * Notify the address space that the page is about to become writable so that
1991 * it can prohibit this or wait for the page to get into an appropriate state. 1991 * it can prohibit this or wait for the page to get into an appropriate state.
1992 * 1992 *
1993 * We do this without the lock held, so that it can sleep if it needs to. 1993 * We do this without the lock held, so that it can sleep if it needs to.
1994 */ 1994 */
1995 static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, 1995 static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
1996 unsigned long address) 1996 unsigned long address)
1997 { 1997 {
1998 struct vm_fault vmf; 1998 struct vm_fault vmf;
1999 int ret; 1999 int ret;
2000 2000
2001 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2001 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2002 vmf.pgoff = page->index; 2002 vmf.pgoff = page->index;
2003 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 2003 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2004 vmf.page = page; 2004 vmf.page = page;
2005 2005
2006 ret = vma->vm_ops->page_mkwrite(vma, &vmf); 2006 ret = vma->vm_ops->page_mkwrite(vma, &vmf);
2007 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2007 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2008 return ret; 2008 return ret;
2009 if (unlikely(!(ret & VM_FAULT_LOCKED))) { 2009 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2010 lock_page(page); 2010 lock_page(page);
2011 if (!page->mapping) { 2011 if (!page->mapping) {
2012 unlock_page(page); 2012 unlock_page(page);
2013 return 0; /* retry */ 2013 return 0; /* retry */
2014 } 2014 }
2015 ret |= VM_FAULT_LOCKED; 2015 ret |= VM_FAULT_LOCKED;
2016 } else 2016 } else
2017 VM_BUG_ON_PAGE(!PageLocked(page), page); 2017 VM_BUG_ON_PAGE(!PageLocked(page), page);
2018 return ret; 2018 return ret;
2019 } 2019 }
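For reference, a hedged sketch (loosely modelled on the generic page-cache handler, not taken from this diff) of a ->page_mkwrite implementation that satisfies the contract above: return with the page locked and VM_FAULT_LOCKED set, or report VM_FAULT_NOPAGE when truncation raced in.

	static int my_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vma->vm_file);
		struct page *page = vmf->page;

		lock_page(page);
		if (page->mapping != inode->i_mapping) {
			/* Raced with truncate: drop this page and refault. */
			unlock_page(page);
			return VM_FAULT_NOPAGE;
		}
		set_page_dirty(page);
		return VM_FAULT_LOCKED;		/* page stays locked for the caller */
	}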
2020 2020
2021 /* 2021 /*
2022 * This routine handles present pages, when users try to write 2022 * This routine handles present pages, when users try to write
2023 * to a shared page. It is done by copying the page to a new address 2023 * to a shared page. It is done by copying the page to a new address
2024 * and decrementing the shared-page counter for the old page. 2024 * and decrementing the shared-page counter for the old page.
2025 * 2025 *
2026 * Note that this routine assumes that the protection checks have been 2026 * Note that this routine assumes that the protection checks have been
2027 * done by the caller (the low-level page fault routine in most cases). 2027 * done by the caller (the low-level page fault routine in most cases).
2028 * Thus we can safely just mark it writable once we've done any necessary 2028 * Thus we can safely just mark it writable once we've done any necessary
2029 * COW. 2029 * COW.
2030 * 2030 *
2031 * We also mark the page dirty at this point even though the page will 2031 * We also mark the page dirty at this point even though the page will
2032 * change only once the write actually happens. This avoids a few races, 2032 * change only once the write actually happens. This avoids a few races,
2033 * and potentially makes it more efficient. 2033 * and potentially makes it more efficient.
2034 * 2034 *
2035 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2035 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2036 * but allow concurrent faults), with pte both mapped and locked. 2036 * but allow concurrent faults), with pte both mapped and locked.
2037 * We return with mmap_sem still held, but pte unmapped and unlocked. 2037 * We return with mmap_sem still held, but pte unmapped and unlocked.
2038 */ 2038 */
2039 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2039 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2040 unsigned long address, pte_t *page_table, pmd_t *pmd, 2040 unsigned long address, pte_t *page_table, pmd_t *pmd,
2041 spinlock_t *ptl, pte_t orig_pte) 2041 spinlock_t *ptl, pte_t orig_pte)
2042 __releases(ptl) 2042 __releases(ptl)
2043 { 2043 {
2044 struct page *old_page, *new_page = NULL; 2044 struct page *old_page, *new_page = NULL;
2045 pte_t entry; 2045 pte_t entry;
2046 int ret = 0; 2046 int ret = 0;
2047 int page_mkwrite = 0; 2047 int page_mkwrite = 0;
2048 struct page *dirty_page = NULL; 2048 struct page *dirty_page = NULL;
2049 unsigned long mmun_start = 0; /* For mmu_notifiers */ 2049 unsigned long mmun_start = 0; /* For mmu_notifiers */
2050 unsigned long mmun_end = 0; /* For mmu_notifiers */ 2050 unsigned long mmun_end = 0; /* For mmu_notifiers */
2051 struct mem_cgroup *memcg; 2051 struct mem_cgroup *memcg;
2052 2052
2053 old_page = vm_normal_page(vma, address, orig_pte); 2053 old_page = vm_normal_page(vma, address, orig_pte);
2054 if (!old_page) { 2054 if (!old_page) {
2055 /* 2055 /*
2056 * VM_MIXEDMAP !pfn_valid() case 2056 * VM_MIXEDMAP !pfn_valid() case
2057 * 2057 *
2058 * We should not cow pages in a shared writeable mapping. 2058 * We should not cow pages in a shared writeable mapping.
2059 * Just mark the pages writable as we can't do any dirty 2059 * Just mark the pages writable as we can't do any dirty
2060 * accounting on raw pfn maps. 2060 * accounting on raw pfn maps.
2061 */ 2061 */
2062 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2062 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2063 (VM_WRITE|VM_SHARED)) 2063 (VM_WRITE|VM_SHARED))
2064 goto reuse; 2064 goto reuse;
2065 goto gotten; 2065 goto gotten;
2066 } 2066 }
2067 2067
2068 /* 2068 /*
2069 * Take out anonymous pages first, anonymous shared vmas are 2069 * Take out anonymous pages first, anonymous shared vmas are
2070 * not dirty accountable. 2070 * not dirty accountable.
2071 */ 2071 */
2072 if (PageAnon(old_page) && !PageKsm(old_page)) { 2072 if (PageAnon(old_page) && !PageKsm(old_page)) {
2073 if (!trylock_page(old_page)) { 2073 if (!trylock_page(old_page)) {
2074 page_cache_get(old_page); 2074 page_cache_get(old_page);
2075 pte_unmap_unlock(page_table, ptl); 2075 pte_unmap_unlock(page_table, ptl);
2076 lock_page(old_page); 2076 lock_page(old_page);
2077 page_table = pte_offset_map_lock(mm, pmd, address, 2077 page_table = pte_offset_map_lock(mm, pmd, address,
2078 &ptl); 2078 &ptl);
2079 if (!pte_same(*page_table, orig_pte)) { 2079 if (!pte_same(*page_table, orig_pte)) {
2080 unlock_page(old_page); 2080 unlock_page(old_page);
2081 goto unlock; 2081 goto unlock;
2082 } 2082 }
2083 page_cache_release(old_page); 2083 page_cache_release(old_page);
2084 } 2084 }
2085 if (reuse_swap_page(old_page)) { 2085 if (reuse_swap_page(old_page)) {
2086 /* 2086 /*
2087 * The page is all ours. Move it to our anon_vma so 2087 * The page is all ours. Move it to our anon_vma so
2088 * the rmap code will not search our parent or siblings. 2088 * the rmap code will not search our parent or siblings.
2089 * Protected against the rmap code by the page lock. 2089 * Protected against the rmap code by the page lock.
2090 */ 2090 */
2091 page_move_anon_rmap(old_page, vma, address); 2091 page_move_anon_rmap(old_page, vma, address);
2092 unlock_page(old_page); 2092 unlock_page(old_page);
2093 goto reuse; 2093 goto reuse;
2094 } 2094 }
2095 unlock_page(old_page); 2095 unlock_page(old_page);
2096 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2096 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2097 (VM_WRITE|VM_SHARED))) { 2097 (VM_WRITE|VM_SHARED))) {
2098 /* 2098 /*
2099 * Only catch write-faults on shared writable pages, 2099 * Only catch write-faults on shared writable pages,
2100 * read-only shared pages can get COWed by 2100 * read-only shared pages can get COWed by
2101 * get_user_pages(.write=1, .force=1). 2101 * get_user_pages(.write=1, .force=1).
2102 */ 2102 */
2103 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2103 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2104 int tmp; 2104 int tmp;
2105 page_cache_get(old_page); 2105 page_cache_get(old_page);
2106 pte_unmap_unlock(page_table, ptl); 2106 pte_unmap_unlock(page_table, ptl);
2107 tmp = do_page_mkwrite(vma, old_page, address); 2107 tmp = do_page_mkwrite(vma, old_page, address);
2108 if (unlikely(!tmp || (tmp & 2108 if (unlikely(!tmp || (tmp &
2109 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2109 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2110 page_cache_release(old_page); 2110 page_cache_release(old_page);
2111 return tmp; 2111 return tmp;
2112 } 2112 }
2113 /* 2113 /*
2114 * Since we dropped the lock we need to revalidate 2114 * Since we dropped the lock we need to revalidate
2115 * the PTE as someone else may have changed it. If 2115 * the PTE as someone else may have changed it. If
2116 * they did, we just return, as we can count on the 2116 * they did, we just return, as we can count on the
2117 * MMU to tell us if they didn't also make it writable. 2117 * MMU to tell us if they didn't also make it writable.
2118 */ 2118 */
2119 page_table = pte_offset_map_lock(mm, pmd, address, 2119 page_table = pte_offset_map_lock(mm, pmd, address,
2120 &ptl); 2120 &ptl);
2121 if (!pte_same(*page_table, orig_pte)) { 2121 if (!pte_same(*page_table, orig_pte)) {
2122 unlock_page(old_page); 2122 unlock_page(old_page);
2123 goto unlock; 2123 goto unlock;
2124 } 2124 }
2125 2125
2126 page_mkwrite = 1; 2126 page_mkwrite = 1;
2127 } 2127 }
2128 dirty_page = old_page; 2128 dirty_page = old_page;
2129 get_page(dirty_page); 2129 get_page(dirty_page);
2130 2130
2131 reuse: 2131 reuse:
2132 /* 2132 /*
2133 * Clear the pages cpupid information as the existing 2133 * Clear the pages cpupid information as the existing
2134 * information potentially belongs to a now completely 2134 * information potentially belongs to a now completely
2135 * unrelated process. 2135 * unrelated process.
2136 */ 2136 */
2137 if (old_page) 2137 if (old_page)
2138 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); 2138 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2139 2139
2140 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2140 flush_cache_page(vma, address, pte_pfn(orig_pte));
2141 entry = pte_mkyoung(orig_pte); 2141 entry = pte_mkyoung(orig_pte);
2142 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2142 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2143 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2143 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2144 update_mmu_cache(vma, address, page_table); 2144 update_mmu_cache(vma, address, page_table);
2145 pte_unmap_unlock(page_table, ptl); 2145 pte_unmap_unlock(page_table, ptl);
2146 ret |= VM_FAULT_WRITE; 2146 ret |= VM_FAULT_WRITE;
2147 2147
2148 if (!dirty_page) 2148 if (!dirty_page)
2149 return ret; 2149 return ret;
2150 2150
2151 /* 2151 /*
2152 * Yes, Virginia, this is actually required to prevent a race 2152 * Yes, Virginia, this is actually required to prevent a race
2153 * with clear_page_dirty_for_io() from clearing the page dirty 2153 * with clear_page_dirty_for_io() from clearing the page dirty
2154 * bit after it clears all dirty ptes, but before a racing 2154 * bit after it clears all dirty ptes, but before a racing
2155 * do_wp_page installs a dirty pte. 2155 * do_wp_page installs a dirty pte.
2156 * 2156 *
2157 * do_shared_fault is protected similarly. 2157 * do_shared_fault is protected similarly.
2158 */ 2158 */
2159 if (!page_mkwrite) { 2159 if (!page_mkwrite) {
2160 wait_on_page_locked(dirty_page); 2160 wait_on_page_locked(dirty_page);
2161 set_page_dirty_balance(dirty_page); 2161 set_page_dirty_balance(dirty_page);
2162 /* file_update_time outside page_lock */ 2162 /* file_update_time outside page_lock */
2163 if (vma->vm_file) 2163 if (vma->vm_file)
2164 file_update_time(vma->vm_file); 2164 file_update_time(vma->vm_file);
2165 } 2165 }
2166 put_page(dirty_page); 2166 put_page(dirty_page);
2167 if (page_mkwrite) { 2167 if (page_mkwrite) {
2168 struct address_space *mapping = dirty_page->mapping; 2168 struct address_space *mapping = dirty_page->mapping;
2169 2169
2170 set_page_dirty(dirty_page); 2170 set_page_dirty(dirty_page);
2171 unlock_page(dirty_page); 2171 unlock_page(dirty_page);
2172 page_cache_release(dirty_page); 2172 page_cache_release(dirty_page);
2173 if (mapping) { 2173 if (mapping) {
2174 /* 2174 /*
2175 * Some device drivers do not set page.mapping 2175 * Some device drivers do not set page.mapping
2176 * but still dirty their pages 2176 * but still dirty their pages
2177 */ 2177 */
2178 balance_dirty_pages_ratelimited(mapping); 2178 balance_dirty_pages_ratelimited(mapping);
2179 } 2179 }
2180 } 2180 }
2181 2181
2182 return ret; 2182 return ret;
2183 } 2183 }
2184 2184
2185 /* 2185 /*
2186 * Ok, we need to copy. Oh, well.. 2186 * Ok, we need to copy. Oh, well..
2187 */ 2187 */
2188 page_cache_get(old_page); 2188 page_cache_get(old_page);
2189 gotten: 2189 gotten:
2190 pte_unmap_unlock(page_table, ptl); 2190 pte_unmap_unlock(page_table, ptl);
2191 2191
2192 if (unlikely(anon_vma_prepare(vma))) 2192 if (unlikely(anon_vma_prepare(vma)))
2193 goto oom; 2193 goto oom;
2194 2194
2195 if (is_zero_pfn(pte_pfn(orig_pte))) { 2195 if (is_zero_pfn(pte_pfn(orig_pte))) {
2196 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2196 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2197 if (!new_page) 2197 if (!new_page)
2198 goto oom; 2198 goto oom;
2199 } else { 2199 } else {
2200 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2200 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2201 if (!new_page) 2201 if (!new_page)
2202 goto oom; 2202 goto oom;
2203 cow_user_page(new_page, old_page, address, vma); 2203 cow_user_page(new_page, old_page, address, vma);
2204 } 2204 }
2205 __SetPageUptodate(new_page); 2205 __SetPageUptodate(new_page);
2206 2206
2207 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2207 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2208 goto oom_free_new; 2208 goto oom_free_new;
2209 2209
2210 mmun_start = address & PAGE_MASK; 2210 mmun_start = address & PAGE_MASK;
2211 mmun_end = mmun_start + PAGE_SIZE; 2211 mmun_end = mmun_start + PAGE_SIZE;
2212 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2212 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2213 2213
2214 /* 2214 /*
2215 * Re-check the pte - we dropped the lock 2215 * Re-check the pte - we dropped the lock
2216 */ 2216 */
2217 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2217 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2218 if (likely(pte_same(*page_table, orig_pte))) { 2218 if (likely(pte_same(*page_table, orig_pte))) {
2219 if (old_page) { 2219 if (old_page) {
2220 if (!PageAnon(old_page)) { 2220 if (!PageAnon(old_page)) {
2221 dec_mm_counter_fast(mm, MM_FILEPAGES); 2221 dec_mm_counter_fast(mm, MM_FILEPAGES);
2222 inc_mm_counter_fast(mm, MM_ANONPAGES); 2222 inc_mm_counter_fast(mm, MM_ANONPAGES);
2223 } 2223 }
2224 } else 2224 } else
2225 inc_mm_counter_fast(mm, MM_ANONPAGES); 2225 inc_mm_counter_fast(mm, MM_ANONPAGES);
2226 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2226 flush_cache_page(vma, address, pte_pfn(orig_pte));
2227 entry = mk_pte(new_page, vma->vm_page_prot); 2227 entry = mk_pte(new_page, vma->vm_page_prot);
2228 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2228 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2229 /* 2229 /*
2230 * Clear the pte entry and flush it first, before updating the 2230 * Clear the pte entry and flush it first, before updating the
2231 * pte with the new entry. This will avoid a race condition 2231 * pte with the new entry. This will avoid a race condition
2232 * seen in the presence of one thread doing SMC and another 2232 * seen in the presence of one thread doing SMC and another
2233 * thread doing COW. 2233 * thread doing COW.
2234 */ 2234 */
2235 ptep_clear_flush(vma, address, page_table); 2235 ptep_clear_flush(vma, address, page_table);
2236 page_add_new_anon_rmap(new_page, vma, address); 2236 page_add_new_anon_rmap(new_page, vma, address);
2237 mem_cgroup_commit_charge(new_page, memcg, false); 2237 mem_cgroup_commit_charge(new_page, memcg, false);
2238 lru_cache_add_active_or_unevictable(new_page, vma); 2238 lru_cache_add_active_or_unevictable(new_page, vma);
2239 /* 2239 /*
2240 * We call the notify macro here because, when using secondary 2240 * We call the notify macro here because, when using secondary
2241 * mmu page tables (such as kvm shadow page tables), we want the 2241 * mmu page tables (such as kvm shadow page tables), we want the
2242 * new page to be mapped directly into the secondary page table. 2242 * new page to be mapped directly into the secondary page table.
2243 */ 2243 */
2244 set_pte_at_notify(mm, address, page_table, entry); 2244 set_pte_at_notify(mm, address, page_table, entry);
2245 update_mmu_cache(vma, address, page_table); 2245 update_mmu_cache(vma, address, page_table);
2246 if (old_page) { 2246 if (old_page) {
2247 /* 2247 /*
2248 * Only after switching the pte to the new page may 2248 * Only after switching the pte to the new page may
2249 * we remove the mapcount here. Otherwise another 2249 * we remove the mapcount here. Otherwise another
2250 * process may come and find the rmap count decremented 2250 * process may come and find the rmap count decremented
2251 * before the pte is switched to the new page, and 2251 * before the pte is switched to the new page, and
2252 * "reuse" the old page writing into it while our pte 2252 * "reuse" the old page writing into it while our pte
2253 * here still points into it and can be read by other 2253 * here still points into it and can be read by other
2254 * threads. 2254 * threads.
2255 * 2255 *
2256 * The critical issue is to order this 2256 * The critical issue is to order this
2257 * page_remove_rmap with the ptep_clear_flush above. 2257 * page_remove_rmap with the ptep_clear_flush above.
2258 * Those stores are ordered by (if nothing else,) 2258 * Those stores are ordered by (if nothing else,)
2259 * the barrier present in the atomic_add_negative 2259 * the barrier present in the atomic_add_negative
2260 * in page_remove_rmap. 2260 * in page_remove_rmap.
2261 * 2261 *
2262 * Then the TLB flush in ptep_clear_flush ensures that 2262 * Then the TLB flush in ptep_clear_flush ensures that
2263 * no process can access the old page before the 2263 * no process can access the old page before the
2264 * decremented mapcount is visible. And the old page 2264 * decremented mapcount is visible. And the old page
2265 * cannot be reused until after the decremented 2265 * cannot be reused until after the decremented
2266 * mapcount is visible. So transitively, TLBs to 2266 * mapcount is visible. So transitively, TLBs to
2267 * old page will be flushed before it can be reused. 2267 * old page will be flushed before it can be reused.
2268 */ 2268 */
2269 page_remove_rmap(old_page); 2269 page_remove_rmap(old_page);
2270 } 2270 }
2271 2271
2272 /* Free the old page.. */ 2272 /* Free the old page.. */
2273 new_page = old_page; 2273 new_page = old_page;
2274 ret |= VM_FAULT_WRITE; 2274 ret |= VM_FAULT_WRITE;
2275 } else 2275 } else
2276 mem_cgroup_cancel_charge(new_page, memcg); 2276 mem_cgroup_cancel_charge(new_page, memcg);
2277 2277
2278 if (new_page) 2278 if (new_page)
2279 page_cache_release(new_page); 2279 page_cache_release(new_page);
2280 unlock: 2280 unlock:
2281 pte_unmap_unlock(page_table, ptl); 2281 pte_unmap_unlock(page_table, ptl);
2282 if (mmun_end > mmun_start) 2282 if (mmun_end > mmun_start)
2283 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2283 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2284 if (old_page) { 2284 if (old_page) {
2285 /* 2285 /*
2286 * Don't let another task, with possibly unlocked vma, 2286 * Don't let another task, with possibly unlocked vma,
2287 * keep the mlocked page. 2287 * keep the mlocked page.
2288 */ 2288 */
2289 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2289 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2290 lock_page(old_page); /* LRU manipulation */ 2290 lock_page(old_page); /* LRU manipulation */
2291 munlock_vma_page(old_page); 2291 munlock_vma_page(old_page);
2292 unlock_page(old_page); 2292 unlock_page(old_page);
2293 } 2293 }
2294 page_cache_release(old_page); 2294 page_cache_release(old_page);
2295 } 2295 }
2296 return ret; 2296 return ret;
2297 oom_free_new: 2297 oom_free_new:
2298 page_cache_release(new_page); 2298 page_cache_release(new_page);
2299 oom: 2299 oom:
2300 if (old_page) 2300 if (old_page)
2301 page_cache_release(old_page); 2301 page_cache_release(old_page);
2302 return VM_FAULT_OOM; 2302 return VM_FAULT_OOM;
2303 } 2303 }
2304 2304
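
The copy path above is the kernel side of ordinary copy-on-write. As a hedged illustration only (a userspace sketch, not kernel code, assuming a Linux/glibc environment), the program below provokes the same kind of write fault: after fork(), a MAP_PRIVATE anonymous page is COW-shared, so the parent's write is resolved by copying and the child keeps seeing the old contents.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        /* One private anonymous page, COW-shared with the child after fork(). */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        strcpy(p, "old");

        pid_t pid = fork();
        if (pid == 0) {
                sleep(1);                       /* let the parent write first */
                printf("child sees:  %s\n", p); /* still "old": the copy was private */
                _exit(0);
        }
        strcpy(p, "new");                       /* write fault; COW gives the writer a fresh page */
        printf("parent sees: %s\n", p);
        wait(NULL);
        return 0;
}
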
2305 static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2305 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2306 unsigned long start_addr, unsigned long end_addr, 2306 unsigned long start_addr, unsigned long end_addr,
2307 struct zap_details *details) 2307 struct zap_details *details)
2308 { 2308 {
2309 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2309 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2310 } 2310 }
2311 2311
2312 static inline void unmap_mapping_range_tree(struct rb_root *root, 2312 static inline void unmap_mapping_range_tree(struct rb_root *root,
2313 struct zap_details *details) 2313 struct zap_details *details)
2314 { 2314 {
2315 struct vm_area_struct *vma; 2315 struct vm_area_struct *vma;
2316 pgoff_t vba, vea, zba, zea; 2316 pgoff_t vba, vea, zba, zea;
2317 2317
2318 vma_interval_tree_foreach(vma, root, 2318 vma_interval_tree_foreach(vma, root,
2319 details->first_index, details->last_index) { 2319 details->first_index, details->last_index) {
2320 2320
2321 vba = vma->vm_pgoff; 2321 vba = vma->vm_pgoff;
2322 vea = vba + vma_pages(vma) - 1; 2322 vea = vba + vma_pages(vma) - 1;
2323 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2323 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2324 zba = details->first_index; 2324 zba = details->first_index;
2325 if (zba < vba) 2325 if (zba < vba)
2326 zba = vba; 2326 zba = vba;
2327 zea = details->last_index; 2327 zea = details->last_index;
2328 if (zea > vea) 2328 if (zea > vea)
2329 zea = vea; 2329 zea = vea;
2330 2330
2331 unmap_mapping_range_vma(vma, 2331 unmap_mapping_range_vma(vma,
2332 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2332 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2333 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2333 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2334 details); 2334 details);
2335 } 2335 }
2336 } 2336 }
2337 2337
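
To make the index clamping in unmap_mapping_range_tree() above concrete, here is a minimal standalone sketch with made-up numbers (vm_start, vm_pgoff, the zap range and the 4 KB page size are all assumptions for illustration); it clamps the requested file-page range to the pages the VMA actually maps and converts the result back to virtual addresses, mirroring the vba/vea/zba/zea arithmetic.

#include <stdio.h>

#define EX_PAGE_SHIFT 12        /* 4 KB pages assumed for the example */

int main(void)
{
        /* Hypothetical VMA: maps 50 pages of a file starting at file page 100. */
        unsigned long vm_start = 0x700000000000UL;
        unsigned long vm_pgoff = 100, vm_npages = 50;
        /* Hypothetical zap request: file pages 120..200. */
        unsigned long first_index = 120, last_index = 200;

        unsigned long vba = vm_pgoff;                   /* first file page in the VMA */
        unsigned long vea = vba + vm_npages - 1;        /* last file page in the VMA  */
        unsigned long zba = first_index < vba ? vba : first_index;
        unsigned long zea = last_index  > vea ? vea : last_index;

        /* Translate the clamped page range back into virtual addresses. */
        printf("zap vaddr [%#lx, %#lx)\n",
               ((zba - vba) << EX_PAGE_SHIFT) + vm_start,
               ((zea - vba + 1) << EX_PAGE_SHIFT) + vm_start);
        return 0;
}
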
2338 static inline void unmap_mapping_range_list(struct list_head *head, 2338 static inline void unmap_mapping_range_list(struct list_head *head,
2339 struct zap_details *details) 2339 struct zap_details *details)
2340 { 2340 {
2341 struct vm_area_struct *vma; 2341 struct vm_area_struct *vma;
2342 2342
2343 /* 2343 /*
2344 * In nonlinear VMAs there is no correspondence between virtual address 2344 * In nonlinear VMAs there is no correspondence between virtual address
2345 * offset and file offset. So we must perform an exhaustive search 2345 * offset and file offset. So we must perform an exhaustive search
2346 * across *all* the pages in each nonlinear VMA, not just the pages 2346 * across *all* the pages in each nonlinear VMA, not just the pages
2347 * whose virtual address lies outside the file truncation point. 2347 * whose virtual address lies outside the file truncation point.
2348 */ 2348 */
2349 list_for_each_entry(vma, head, shared.nonlinear) { 2349 list_for_each_entry(vma, head, shared.nonlinear) {
2350 details->nonlinear_vma = vma; 2350 details->nonlinear_vma = vma;
2351 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2351 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2352 } 2352 }
2353 } 2353 }
2354 2354
2355 /** 2355 /**
2356 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 2356 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
2357 * @mapping: the address space containing mmaps to be unmapped. 2357 * @mapping: the address space containing mmaps to be unmapped.
2358 * @holebegin: byte in first page to unmap, relative to the start of 2358 * @holebegin: byte in first page to unmap, relative to the start of
2359 * the underlying file. This will be rounded down to a PAGE_SIZE 2359 * the underlying file. This will be rounded down to a PAGE_SIZE
2360 * boundary. Note that this is different from truncate_pagecache(), which 2360 * boundary. Note that this is different from truncate_pagecache(), which
2361 * must keep the partial page. In contrast, we must get rid of 2361 * must keep the partial page. In contrast, we must get rid of
2362 * partial pages. 2362 * partial pages.
2363 * @holelen: size of prospective hole in bytes. This will be rounded 2363 * @holelen: size of prospective hole in bytes. This will be rounded
2364 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the 2364 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
2365 * end of the file. 2365 * end of the file.
2366 * @even_cows: 1 when truncating a file, unmap even private COWed pages; 2366 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
2367 * but 0 when invalidating pagecache, don't throw away private data. 2367 * but 0 when invalidating pagecache, don't throw away private data.
2368 */ 2368 */
2369 void unmap_mapping_range(struct address_space *mapping, 2369 void unmap_mapping_range(struct address_space *mapping,
2370 loff_t const holebegin, loff_t const holelen, int even_cows) 2370 loff_t const holebegin, loff_t const holelen, int even_cows)
2371 { 2371 {
2372 struct zap_details details; 2372 struct zap_details details;
2373 pgoff_t hba = holebegin >> PAGE_SHIFT; 2373 pgoff_t hba = holebegin >> PAGE_SHIFT;
2374 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2374 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2375 2375
2376 /* Check for overflow. */ 2376 /* Check for overflow. */
2377 if (sizeof(holelen) > sizeof(hlen)) { 2377 if (sizeof(holelen) > sizeof(hlen)) {
2378 long long holeend = 2378 long long holeend =
2379 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2379 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2380 if (holeend & ~(long long)ULONG_MAX) 2380 if (holeend & ~(long long)ULONG_MAX)
2381 hlen = ULONG_MAX - hba + 1; 2381 hlen = ULONG_MAX - hba + 1;
2382 } 2382 }
2383 2383
2384 details.check_mapping = even_cows? NULL: mapping; 2384 details.check_mapping = even_cows? NULL: mapping;
2385 details.nonlinear_vma = NULL; 2385 details.nonlinear_vma = NULL;
2386 details.first_index = hba; 2386 details.first_index = hba;
2387 details.last_index = hba + hlen - 1; 2387 details.last_index = hba + hlen - 1;
2388 if (details.last_index < details.first_index) 2388 if (details.last_index < details.first_index)
2389 details.last_index = ULONG_MAX; 2389 details.last_index = ULONG_MAX;
2390 2390
2391 2391
2392 mutex_lock(&mapping->i_mmap_mutex); 2392 mutex_lock(&mapping->i_mmap_mutex);
2393 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2393 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2394 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2394 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2395 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2395 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2396 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2396 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2397 mutex_unlock(&mapping->i_mmap_mutex); 2397 mutex_unlock(&mapping->i_mmap_mutex);
2398 } 2398 }
2399 EXPORT_SYMBOL(unmap_mapping_range); 2399 EXPORT_SYMBOL(unmap_mapping_range);
2400 2400
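
A small worked example of the holebegin/holelen conversion performed above may help; the sketch below (userspace, with the 4 KB page size and the byte values assumed for illustration) rounds the start down and the length up to page granularity and applies the same "wrap means unmap to the end" fixup as the last_index check.

#include <stdio.h>

#define EX_PAGE_SHIFT 12
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)    /* 4 KB pages assumed */

int main(void)
{
        long long holebegin = 5000, holelen = 10000;    /* hypothetical byte range */

        unsigned long hba  = holebegin >> EX_PAGE_SHIFT;                    /* round start down */
        unsigned long hlen = (holelen + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT; /* round length up  */
        unsigned long first = hba, last = hba + hlen - 1;

        if (last < first)       /* holelen == 0 or overflow: unmap to end of file */
                last = ~0UL;

        printf("file pages [%lu, %lu]\n", first, last); /* [1, 3] for these numbers */
        return 0;
}
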
2401 /* 2401 /*
2402 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2402 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2403 * but allow concurrent faults), and pte mapped but not yet locked. 2403 * but allow concurrent faults), and pte mapped but not yet locked.
2404 * We return with pte unmapped and unlocked. 2404 * We return with pte unmapped and unlocked.
2405 * 2405 *
2406 * We return with the mmap_sem locked or unlocked in the same cases 2406 * We return with the mmap_sem locked or unlocked in the same cases
2407 * as does filemap_fault(). 2407 * as does filemap_fault().
2408 */ 2408 */
2409 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2409 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2410 unsigned long address, pte_t *page_table, pmd_t *pmd, 2410 unsigned long address, pte_t *page_table, pmd_t *pmd,
2411 unsigned int flags, pte_t orig_pte) 2411 unsigned int flags, pte_t orig_pte)
2412 { 2412 {
2413 spinlock_t *ptl; 2413 spinlock_t *ptl;
2414 struct page *page, *swapcache; 2414 struct page *page, *swapcache;
2415 struct mem_cgroup *memcg; 2415 struct mem_cgroup *memcg;
2416 swp_entry_t entry; 2416 swp_entry_t entry;
2417 pte_t pte; 2417 pte_t pte;
2418 int locked; 2418 int locked;
2419 int exclusive = 0; 2419 int exclusive = 0;
2420 int ret = 0; 2420 int ret = 0;
2421 2421
2422 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2422 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2423 goto out; 2423 goto out;
2424 2424
2425 entry = pte_to_swp_entry(orig_pte); 2425 entry = pte_to_swp_entry(orig_pte);
2426 if (unlikely(non_swap_entry(entry))) { 2426 if (unlikely(non_swap_entry(entry))) {
2427 if (is_migration_entry(entry)) { 2427 if (is_migration_entry(entry)) {
2428 migration_entry_wait(mm, pmd, address); 2428 migration_entry_wait(mm, pmd, address);
2429 } else if (is_hwpoison_entry(entry)) { 2429 } else if (is_hwpoison_entry(entry)) {
2430 ret = VM_FAULT_HWPOISON; 2430 ret = VM_FAULT_HWPOISON;
2431 } else { 2431 } else {
2432 print_bad_pte(vma, address, orig_pte, NULL); 2432 print_bad_pte(vma, address, orig_pte, NULL);
2433 ret = VM_FAULT_SIGBUS; 2433 ret = VM_FAULT_SIGBUS;
2434 } 2434 }
2435 goto out; 2435 goto out;
2436 } 2436 }
2437 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2437 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2438 page = lookup_swap_cache(entry); 2438 page = lookup_swap_cache(entry);
2439 if (!page) { 2439 if (!page) {
2440 page = swapin_readahead(entry, 2440 page = swapin_readahead(entry,
2441 GFP_HIGHUSER_MOVABLE, vma, address); 2441 GFP_HIGHUSER_MOVABLE, vma, address);
2442 if (!page) { 2442 if (!page) {
2443 /* 2443 /*
2444 * Back out if somebody else faulted in this pte 2444 * Back out if somebody else faulted in this pte
2445 * while we released the pte lock. 2445 * while we released the pte lock.
2446 */ 2446 */
2447 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2447 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2448 if (likely(pte_same(*page_table, orig_pte))) 2448 if (likely(pte_same(*page_table, orig_pte)))
2449 ret = VM_FAULT_OOM; 2449 ret = VM_FAULT_OOM;
2450 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2450 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2451 goto unlock; 2451 goto unlock;
2452 } 2452 }
2453 2453
2454 /* Had to read the page from swap area: Major fault */ 2454 /* Had to read the page from swap area: Major fault */
2455 ret = VM_FAULT_MAJOR; 2455 ret = VM_FAULT_MAJOR;
2456 count_vm_event(PGMAJFAULT); 2456 count_vm_event(PGMAJFAULT);
2457 mem_cgroup_count_vm_event(mm, PGMAJFAULT); 2457 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2458 } else if (PageHWPoison(page)) { 2458 } else if (PageHWPoison(page)) {
2459 /* 2459 /*
2460 * hwpoisoned dirty swapcache pages are kept for killing 2460 * hwpoisoned dirty swapcache pages are kept for killing
2461 * owner processes (which may be unknown at hwpoison time) 2461 * owner processes (which may be unknown at hwpoison time)
2462 */ 2462 */
2463 ret = VM_FAULT_HWPOISON; 2463 ret = VM_FAULT_HWPOISON;
2464 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2464 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2465 swapcache = page; 2465 swapcache = page;
2466 goto out_release; 2466 goto out_release;
2467 } 2467 }
2468 2468
2469 swapcache = page; 2469 swapcache = page;
2470 locked = lock_page_or_retry(page, mm, flags); 2470 locked = lock_page_or_retry(page, mm, flags);
2471 2471
2472 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2472 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2473 if (!locked) { 2473 if (!locked) {
2474 ret |= VM_FAULT_RETRY; 2474 ret |= VM_FAULT_RETRY;
2475 goto out_release; 2475 goto out_release;
2476 } 2476 }
2477 2477
2478 /* 2478 /*
2479 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2479 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
2480 * release the swapcache from under us. The page pin, and pte_same 2480 * release the swapcache from under us. The page pin, and pte_same
2481 * test below, are not enough to exclude that. Even if it is still 2481 * test below, are not enough to exclude that. Even if it is still
2482 * swapcache, we need to check that the page's swap has not changed. 2482 * swapcache, we need to check that the page's swap has not changed.
2483 */ 2483 */
2484 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2484 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2485 goto out_page; 2485 goto out_page;
2486 2486
2487 page = ksm_might_need_to_copy(page, vma, address); 2487 page = ksm_might_need_to_copy(page, vma, address);
2488 if (unlikely(!page)) { 2488 if (unlikely(!page)) {
2489 ret = VM_FAULT_OOM; 2489 ret = VM_FAULT_OOM;
2490 page = swapcache; 2490 page = swapcache;
2491 goto out_page; 2491 goto out_page;
2492 } 2492 }
2493 2493
2494 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { 2494 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
2495 ret = VM_FAULT_OOM; 2495 ret = VM_FAULT_OOM;
2496 goto out_page; 2496 goto out_page;
2497 } 2497 }
2498 2498
2499 /* 2499 /*
2500 * Back out if somebody else already faulted in this pte. 2500 * Back out if somebody else already faulted in this pte.
2501 */ 2501 */
2502 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2502 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2503 if (unlikely(!pte_same(*page_table, orig_pte))) 2503 if (unlikely(!pte_same(*page_table, orig_pte)))
2504 goto out_nomap; 2504 goto out_nomap;
2505 2505
2506 if (unlikely(!PageUptodate(page))) { 2506 if (unlikely(!PageUptodate(page))) {
2507 ret = VM_FAULT_SIGBUS; 2507 ret = VM_FAULT_SIGBUS;
2508 goto out_nomap; 2508 goto out_nomap;
2509 } 2509 }
2510 2510
2511 /* 2511 /*
2512 * The page isn't present yet, go ahead with the fault. 2512 * The page isn't present yet, go ahead with the fault.
2513 * 2513 *
2514 * Be careful about the sequence of operations here. 2514 * Be careful about the sequence of operations here.
2515 * To get its accounting right, reuse_swap_page() must be called 2515 * To get its accounting right, reuse_swap_page() must be called
2516 * while the page is counted on swap but not yet in mapcount i.e. 2516 * while the page is counted on swap but not yet in mapcount i.e.
2517 * before page_add_anon_rmap() and swap_free(); try_to_free_swap() 2517 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2518 * must be called after the swap_free(), or it will never succeed. 2518 * must be called after the swap_free(), or it will never succeed.
2519 */ 2519 */
2520 2520
2521 inc_mm_counter_fast(mm, MM_ANONPAGES); 2521 inc_mm_counter_fast(mm, MM_ANONPAGES);
2522 dec_mm_counter_fast(mm, MM_SWAPENTS); 2522 dec_mm_counter_fast(mm, MM_SWAPENTS);
2523 pte = mk_pte(page, vma->vm_page_prot); 2523 pte = mk_pte(page, vma->vm_page_prot);
2524 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2524 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2525 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2525 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2526 flags &= ~FAULT_FLAG_WRITE; 2526 flags &= ~FAULT_FLAG_WRITE;
2527 ret |= VM_FAULT_WRITE; 2527 ret |= VM_FAULT_WRITE;
2528 exclusive = 1; 2528 exclusive = 1;
2529 } 2529 }
2530 flush_icache_page(vma, page); 2530 flush_icache_page(vma, page);
2531 if (pte_swp_soft_dirty(orig_pte)) 2531 if (pte_swp_soft_dirty(orig_pte))
2532 pte = pte_mksoft_dirty(pte); 2532 pte = pte_mksoft_dirty(pte);
2533 set_pte_at(mm, address, page_table, pte); 2533 set_pte_at(mm, address, page_table, pte);
2534 if (page == swapcache) { 2534 if (page == swapcache) {
2535 do_page_add_anon_rmap(page, vma, address, exclusive); 2535 do_page_add_anon_rmap(page, vma, address, exclusive);
2536 mem_cgroup_commit_charge(page, memcg, true); 2536 mem_cgroup_commit_charge(page, memcg, true);
2537 } else { /* ksm created a completely new copy */ 2537 } else { /* ksm created a completely new copy */
2538 page_add_new_anon_rmap(page, vma, address); 2538 page_add_new_anon_rmap(page, vma, address);
2539 mem_cgroup_commit_charge(page, memcg, false); 2539 mem_cgroup_commit_charge(page, memcg, false);
2540 lru_cache_add_active_or_unevictable(page, vma); 2540 lru_cache_add_active_or_unevictable(page, vma);
2541 } 2541 }
2542 2542
2543 swap_free(entry); 2543 swap_free(entry);
2544 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2544 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2545 try_to_free_swap(page); 2545 try_to_free_swap(page);
2546 unlock_page(page); 2546 unlock_page(page);
2547 if (page != swapcache) { 2547 if (page != swapcache) {
2548 /* 2548 /*
2549 * Hold the lock to prevent the swap entry from being reused 2549 * Hold the lock to prevent the swap entry from being reused
2550 * until we take the PT lock for the pte_same() check 2550 * until we take the PT lock for the pte_same() check
2551 * (to avoid false positives from pte_same). For 2551 * (to avoid false positives from pte_same). For
2552 * further safety release the lock after the swap_free 2552 * further safety release the lock after the swap_free
2553 * so that the swap count won't change under a 2553 * so that the swap count won't change under a
2554 * parallel locked swapcache. 2554 * parallel locked swapcache.
2555 */ 2555 */
2556 unlock_page(swapcache); 2556 unlock_page(swapcache);
2557 page_cache_release(swapcache); 2557 page_cache_release(swapcache);
2558 } 2558 }
2559 2559
2560 if (flags & FAULT_FLAG_WRITE) { 2560 if (flags & FAULT_FLAG_WRITE) {
2561 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2561 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2562 if (ret & VM_FAULT_ERROR) 2562 if (ret & VM_FAULT_ERROR)
2563 ret &= VM_FAULT_ERROR; 2563 ret &= VM_FAULT_ERROR;
2564 goto out; 2564 goto out;
2565 } 2565 }
2566 2566
2567 /* No need to invalidate - it was non-present before */ 2567 /* No need to invalidate - it was non-present before */
2568 update_mmu_cache(vma, address, page_table); 2568 update_mmu_cache(vma, address, page_table);
2569 unlock: 2569 unlock:
2570 pte_unmap_unlock(page_table, ptl); 2570 pte_unmap_unlock(page_table, ptl);
2571 out: 2571 out:
2572 return ret; 2572 return ret;
2573 out_nomap: 2573 out_nomap:
2574 mem_cgroup_cancel_charge(page, memcg); 2574 mem_cgroup_cancel_charge(page, memcg);
2575 pte_unmap_unlock(page_table, ptl); 2575 pte_unmap_unlock(page_table, ptl);
2576 out_page: 2576 out_page:
2577 unlock_page(page); 2577 unlock_page(page);
2578 out_release: 2578 out_release:
2579 page_cache_release(page); 2579 page_cache_release(page);
2580 if (page != swapcache) { 2580 if (page != swapcache) {
2581 unlock_page(swapcache); 2581 unlock_page(swapcache);
2582 page_cache_release(swapcache); 2582 page_cache_release(swapcache);
2583 } 2583 }
2584 return ret; 2584 return ret;
2585 } 2585 }
2586 2586
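
The VM_FAULT_MAJOR / PGMAJFAULT accounting in do_swap_page() above is what ultimately shows up in a task's major-fault counters. As a loosely related userspace illustration (not part of this commit), those counters can be read back with getrusage():

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        struct rusage ru;

        if (getrusage(RUSAGE_SELF, &ru))
                return 1;
        /* ru_majflt counts faults that required I/O, e.g. swap-in or cold page cache. */
        printf("minor faults: %ld, major faults: %ld\n",
               ru.ru_minflt, ru.ru_majflt);
        return 0;
}
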
2587 /* 2587 /*
2588 * This is like a special single-page "expand_{down|up}wards()", 2588 * This is like a special single-page "expand_{down|up}wards()",
2589 * except we must first make sure that 'address{-|+}PAGE_SIZE' 2589 * except we must first make sure that 'address{-|+}PAGE_SIZE'
2590 * doesn't hit another vma. 2590 * doesn't hit another vma.
2591 */ 2591 */
2592 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) 2592 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2593 { 2593 {
2594 address &= PAGE_MASK; 2594 address &= PAGE_MASK;
2595 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { 2595 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2596 struct vm_area_struct *prev = vma->vm_prev; 2596 struct vm_area_struct *prev = vma->vm_prev;
2597 2597
2598 /* 2598 /*
2599 * Is there a mapping abutting this one below? 2599 * Is there a mapping abutting this one below?
2600 * 2600 *
2601 * That's only ok if it's the same stack mapping 2601 * That's only ok if it's the same stack mapping
2602 * that has gotten split.. 2602 * that has gotten split..
2603 */ 2603 */
2604 if (prev && prev->vm_end == address) 2604 if (prev && prev->vm_end == address)
2605 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 2605 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2606 2606
2607 expand_downwards(vma, address - PAGE_SIZE); 2607 expand_downwards(vma, address - PAGE_SIZE);
2608 } 2608 }
2609 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 2609 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2610 struct vm_area_struct *next = vma->vm_next; 2610 struct vm_area_struct *next = vma->vm_next;
2611 2611
2612 /* As VM_GROWSDOWN but s/below/above/ */ 2612 /* As VM_GROWSDOWN but s/below/above/ */
2613 if (next && next->vm_start == address + PAGE_SIZE) 2613 if (next && next->vm_start == address + PAGE_SIZE)
2614 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; 2614 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2615 2615
2616 expand_upwards(vma, address + PAGE_SIZE); 2616 expand_upwards(vma, address + PAGE_SIZE);
2617 } 2617 }
2618 return 0; 2618 return 0;
2619 } 2619 }
2620 2620
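
For readers unfamiliar with the guard-page check above, the standalone sketch below re-states the VM_GROWSDOWN branch with a toy structure; struct toy_vma, toy_check_guard() and the 4 KB page mask are invented for illustration and are not kernel interfaces. A fault on the first page of a growsdown stack may only expand the stack if the mapping immediately below is itself a (split) stack mapping.

#include <stdio.h>

/* Toy stand-ins, invented for illustration only. */
struct toy_vma {
        unsigned long vm_start, vm_end;
        int growsdown;                          /* plays the role of VM_GROWSDOWN */
        struct toy_vma *vm_prev;
};

static int toy_check_guard(struct toy_vma *vma, unsigned long address)
{
        address &= ~0xfffUL;                    /* PAGE_MASK with 4 KB pages assumed */
        if (vma->growsdown && address == vma->vm_start) {
                struct toy_vma *prev = vma->vm_prev;

                /* A mapping abutting from below is only OK if it is stack too. */
                if (prev && prev->vm_end == address)
                        return prev->growsdown ? 0 : -1;  /* -ENOMEM in the kernel */
                /* Otherwise the kernel would expand the stack down by one page. */
        }
        return 0;
}

int main(void)
{
        struct toy_vma below = { 0x1000, 0x2000, 0, NULL };
        struct toy_vma stack = { 0x2000, 0x9000, 1, &below };

        printf("fault at stack base -> %d\n", toy_check_guard(&stack, 0x2000));
        return 0;
}
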
2621 /* 2621 /*
2622 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2622 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2623 * but allow concurrent faults), and pte mapped but not yet locked. 2623 * but allow concurrent faults), and pte mapped but not yet locked.
2624 * We return with mmap_sem still held, but pte unmapped and unlocked. 2624 * We return with mmap_sem still held, but pte unmapped and unlocked.
2625 */ 2625 */
2626 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2626 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2627 unsigned long address, pte_t *page_table, pmd_t *pmd, 2627 unsigned long address, pte_t *page_table, pmd_t *pmd,
2628 unsigned int flags) 2628 unsigned int flags)
2629 { 2629 {
2630 struct mem_cgroup *memcg; 2630 struct mem_cgroup *memcg;
2631 struct page *page; 2631 struct page *page;
2632 spinlock_t *ptl; 2632 spinlock_t *ptl;
2633 pte_t entry; 2633 pte_t entry;
2634 2634
2635 pte_unmap(page_table); 2635 pte_unmap(page_table);
2636 2636
2637 /* Check if we need to add a guard page to the stack */ 2637 /* Check if we need to add a guard page to the stack */
2638 if (check_stack_guard_page(vma, address) < 0) 2638 if (check_stack_guard_page(vma, address) < 0)
2639 return VM_FAULT_SIGBUS; 2639 return VM_FAULT_SIGBUS;
2640 2640
2641 /* Use the zero-page for reads */ 2641 /* Use the zero-page for reads */
2642 if (!(flags & FAULT_FLAG_WRITE)) { 2642 if (!(flags & FAULT_FLAG_WRITE)) {
2643 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2643 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2644 vma->vm_page_prot)); 2644 vma->vm_page_prot));
2645 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2645 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2646 if (!pte_none(*page_table)) 2646 if (!pte_none(*page_table))
2647 goto unlock; 2647 goto unlock;
2648 goto setpte; 2648 goto setpte;
2649 } 2649 }
2650 2650
2651 /* Allocate our own private page. */ 2651 /* Allocate our own private page. */
2652 if (unlikely(anon_vma_prepare(vma))) 2652 if (unlikely(anon_vma_prepare(vma)))
2653 goto oom; 2653 goto oom;
2654 page = alloc_zeroed_user_highpage_movable(vma, address); 2654 page = alloc_zeroed_user_highpage_movable(vma, address);
2655 if (!page) 2655 if (!page)
2656 goto oom; 2656 goto oom;
2657 /* 2657 /*
2658 * The memory barrier inside __SetPageUptodate makes sure that 2658 * The memory barrier inside __SetPageUptodate makes sure that
2659 * preceding stores to the page contents become visible before 2659 * preceding stores to the page contents become visible before
2660 * the set_pte_at() write. 2660 * the set_pte_at() write.
2661 */ 2661 */
2662 __SetPageUptodate(page); 2662 __SetPageUptodate(page);
2663 2663
2664 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) 2664 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2665 goto oom_free_page; 2665 goto oom_free_page;
2666 2666
2667 entry = mk_pte(page, vma->vm_page_prot); 2667 entry = mk_pte(page, vma->vm_page_prot);
2668 if (vma->vm_flags & VM_WRITE) 2668 if (vma->vm_flags & VM_WRITE)
2669 entry = pte_mkwrite(pte_mkdirty(entry)); 2669 entry = pte_mkwrite(pte_mkdirty(entry));
2670 2670
2671 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2671 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2672 if (!pte_none(*page_table)) 2672 if (!pte_none(*page_table))
2673 goto release; 2673 goto release;
2674 2674
2675 inc_mm_counter_fast(mm, MM_ANONPAGES); 2675 inc_mm_counter_fast(mm, MM_ANONPAGES);
2676 page_add_new_anon_rmap(page, vma, address); 2676 page_add_new_anon_rmap(page, vma, address);
2677 mem_cgroup_commit_charge(page, memcg, false); 2677 mem_cgroup_commit_charge(page, memcg, false);
2678 lru_cache_add_active_or_unevictable(page, vma); 2678 lru_cache_add_active_or_unevictable(page, vma);
2679 setpte: 2679 setpte:
2680 set_pte_at(mm, address, page_table, entry); 2680 set_pte_at(mm, address, page_table, entry);
2681 2681
2682 /* No need to invalidate - it was non-present before */ 2682 /* No need to invalidate - it was non-present before */
2683 update_mmu_cache(vma, address, page_table); 2683 update_mmu_cache(vma, address, page_table);
2684 unlock: 2684 unlock:
2685 pte_unmap_unlock(page_table, ptl); 2685 pte_unmap_unlock(page_table, ptl);
2686 return 0; 2686 return 0;
2687 release: 2687 release:
2688 mem_cgroup_cancel_charge(page, memcg); 2688 mem_cgroup_cancel_charge(page, memcg);
2689 page_cache_release(page); 2689 page_cache_release(page);
2690 goto unlock; 2690 goto unlock;
2691 oom_free_page: 2691 oom_free_page:
2692 page_cache_release(page); 2692 page_cache_release(page);
2693 oom: 2693 oom:
2694 return VM_FAULT_OOM; 2694 return VM_FAULT_OOM;
2695 } 2695 }
2696 2696
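
The two branches of do_anonymous_page() are visible from userspace. The hedged sketch below (behaviour, not implementation) reads an untouched anonymous page first, which is served by the shared zero page and returns 0, and then writes to it, which forces a private zeroed page to be installed:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        unsigned char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        printf("before write: %u\n", p[0]); /* read fault: zero page mapped, prints 0 */
        p[0] = 42;                          /* write fault: a private zeroed page replaces it */
        printf("after write:  %u\n", p[0]);
        munmap(p, 4096);
        return 0;
}
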
2697 /* 2697 /*
2698 * The mmap_sem must have been held on entry, and may have been 2698 * The mmap_sem must have been held on entry, and may have been
2699 * released depending on flags and vma->vm_ops->fault() return value. 2699 * released depending on flags and vma->vm_ops->fault() return value.
2700 * See filemap_fault() and __lock_page_retry(). 2700 * See filemap_fault() and __lock_page_retry().
2701 */ 2701 */
2702 static int __do_fault(struct vm_area_struct *vma, unsigned long address, 2702 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2703 pgoff_t pgoff, unsigned int flags, struct page **page) 2703 pgoff_t pgoff, unsigned int flags, struct page **page)
2704 { 2704 {
2705 struct vm_fault vmf; 2705 struct vm_fault vmf;
2706 int ret; 2706 int ret;
2707 2707
2708 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2708 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2709 vmf.pgoff = pgoff; 2709 vmf.pgoff = pgoff;
2710 vmf.flags = flags; 2710 vmf.flags = flags;
2711 vmf.page = NULL; 2711 vmf.page = NULL;
2712 2712
2713 ret = vma->vm_ops->fault(vma, &vmf); 2713 ret = vma->vm_ops->fault(vma, &vmf);
2714 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2714 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2715 return ret; 2715 return ret;
2716 2716
2717 if (unlikely(PageHWPoison(vmf.page))) { 2717 if (unlikely(PageHWPoison(vmf.page))) {
2718 if (ret & VM_FAULT_LOCKED) 2718 if (ret & VM_FAULT_LOCKED)
2719 unlock_page(vmf.page); 2719 unlock_page(vmf.page);
2720 page_cache_release(vmf.page); 2720 page_cache_release(vmf.page);
2721 return VM_FAULT_HWPOISON; 2721 return VM_FAULT_HWPOISON;
2722 } 2722 }
2723 2723
2724 if (unlikely(!(ret & VM_FAULT_LOCKED))) 2724 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2725 lock_page(vmf.page); 2725 lock_page(vmf.page);
2726 else 2726 else
2727 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); 2727 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
2728 2728
2729 *page = vmf.page; 2729 *page = vmf.page;
2730 return ret; 2730 return ret;
2731 } 2731 }
2732 2732
2733 /** 2733 /**
2734 * do_set_pte - setup new PTE entry for given page and add reverse page mapping. 2734 * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
2735 * 2735 *
2736 * @vma: virtual memory area 2736 * @vma: virtual memory area
2737 * @address: user virtual address 2737 * @address: user virtual address
2738 * @page: page to map 2738 * @page: page to map
2739 * @pte: pointer to target page table entry 2739 * @pte: pointer to target page table entry
2740 * @write: true, if new entry is writable 2740 * @write: true, if new entry is writable
2741 * @anon: true, if it's anonymous page 2741 * @anon: true, if it's anonymous page
2742 * 2742 *
2743 * Caller must hold page table lock relevant for @pte. 2743 * Caller must hold page table lock relevant for @pte.
2744 * 2744 *
2745 * Target users are page handler itself and implementations of 2745 * Target users are page handler itself and implementations of
2746 * vm_ops->map_pages. 2746 * vm_ops->map_pages.
2747 */ 2747 */
2748 void do_set_pte(struct vm_area_struct *vma, unsigned long address, 2748 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2749 struct page *page, pte_t *pte, bool write, bool anon) 2749 struct page *page, pte_t *pte, bool write, bool anon)
2750 { 2750 {
2751 pte_t entry; 2751 pte_t entry;
2752 2752
2753 flush_icache_page(vma, page); 2753 flush_icache_page(vma, page);
2754 entry = mk_pte(page, vma->vm_page_prot); 2754 entry = mk_pte(page, vma->vm_page_prot);
2755 if (write) 2755 if (write)
2756 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2756 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2757 else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) 2757 else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
2758 entry = pte_mksoft_dirty(entry); 2758 entry = pte_mksoft_dirty(entry);
2759 if (anon) { 2759 if (anon) {
2760 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2760 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2761 page_add_new_anon_rmap(page, vma, address); 2761 page_add_new_anon_rmap(page, vma, address);
2762 } else { 2762 } else {
2763 inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); 2763 inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
2764 page_add_file_rmap(page); 2764 page_add_file_rmap(page);
2765 } 2765 }
2766 set_pte_at(vma->vm_mm, address, pte, entry); 2766 set_pte_at(vma->vm_mm, address, pte, entry);
2767 2767
2768 /* no need to invalidate: a not-present page won't be cached */ 2768 /* no need to invalidate: a not-present page won't be cached */
2769 update_mmu_cache(vma, address, pte); 2769 update_mmu_cache(vma, address, pte);
2770 } 2770 }
2771 2771
2772 static unsigned long fault_around_bytes __read_mostly = 2772 static unsigned long fault_around_bytes __read_mostly =
2773 rounddown_pow_of_two(65536); 2773 rounddown_pow_of_two(65536);
2774 2774
2775 #ifdef CONFIG_DEBUG_FS 2775 #ifdef CONFIG_DEBUG_FS
2776 static int fault_around_bytes_get(void *data, u64 *val) 2776 static int fault_around_bytes_get(void *data, u64 *val)
2777 { 2777 {
2778 *val = fault_around_bytes; 2778 *val = fault_around_bytes;
2779 return 0; 2779 return 0;
2780 } 2780 }
2781 2781
2782 /* 2782 /*
2783 * fault_around_pages() and fault_around_mask() expect fault_around_bytes 2783 * fault_around_pages() and fault_around_mask() expect fault_around_bytes
2784 * rounded down to the nearest page order. It's what do_fault_around() expects to 2784 * rounded down to the nearest page order. It's what do_fault_around() expects to
2785 * see. 2785 * see.
2786 */ 2786 */
2787 static int fault_around_bytes_set(void *data, u64 val) 2787 static int fault_around_bytes_set(void *data, u64 val)
2788 { 2788 {
2789 if (val / PAGE_SIZE > PTRS_PER_PTE) 2789 if (val / PAGE_SIZE > PTRS_PER_PTE)
2790 return -EINVAL; 2790 return -EINVAL;
2791 if (val > PAGE_SIZE) 2791 if (val > PAGE_SIZE)
2792 fault_around_bytes = rounddown_pow_of_two(val); 2792 fault_around_bytes = rounddown_pow_of_two(val);
2793 else 2793 else
2794 fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ 2794 fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
2795 return 0; 2795 return 0;
2796 } 2796 }
2797 DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, 2797 DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
2798 fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); 2798 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
2799 2799
2800 static int __init fault_around_debugfs(void) 2800 static int __init fault_around_debugfs(void)
2801 { 2801 {
2802 void *ret; 2802 void *ret;
2803 2803
2804 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, 2804 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
2805 &fault_around_bytes_fops); 2805 &fault_around_bytes_fops);
2806 if (!ret) 2806 if (!ret)
2807 pr_warn("Failed to create fault_around_bytes in debugfs\n"); 2807 pr_warn("Failed to create fault_around_bytes in debugfs\n");
2808 return 0; 2808 return 0;
2809 } 2809 }
2810 late_initcall(fault_around_debugfs); 2810 late_initcall(fault_around_debugfs);
2811 #endif 2811 #endif
2812 2812
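
The clamping performed by fault_around_bytes_set() above can be sketched in isolation. In the example below, rounddown_pow2(), set_fault_around_bytes() and the 4 KB / 512-entry constants are local stand-ins for the kernel's rounddown_pow_of_two(), PAGE_SIZE and PTRS_PER_PTE, written only to show the rounding behaviour:

#include <stdio.h>

#define EX_PAGE_SIZE    4096ULL         /* stand-in for PAGE_SIZE */
#define EX_PTRS_PER_PTE 512ULL          /* stand-in for PTRS_PER_PTE */

/* Local helper for illustration; the kernel uses rounddown_pow_of_two(). */
static unsigned long long rounddown_pow2(unsigned long long v)
{
        unsigned long long r = 1;

        while (r * 2 <= v)
                r *= 2;
        return r;
}

static long long set_fault_around_bytes(unsigned long long val)
{
        if (val / EX_PAGE_SIZE > EX_PTRS_PER_PTE)
                return -1;                      /* -EINVAL in the kernel */
        if (val > EX_PAGE_SIZE)
                return rounddown_pow2(val);
        return EX_PAGE_SIZE;                    /* rounddown_pow2(0) is undefined */
}

int main(void)
{
        printf("%lld\n", set_fault_around_bytes(65536));  /* 65536 */
        printf("%lld\n", set_fault_around_bytes(100000)); /* 65536 */
        printf("%lld\n", set_fault_around_bytes(100));    /* 4096  */
        return 0;
}
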
2813 /* 2813 /*
2814 * do_fault_around() tries to map a few pages around the fault address. The hope 2814 * do_fault_around() tries to map a few pages around the fault address. The hope
2815 * is that the pages will be needed soon and this will lower the number of 2815 * is that the pages will be needed soon and this will lower the number of
2816 * faults to handle. 2816 * faults to handle.
2817 * 2817 *
2818 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's 2818 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
2819 * not ready to be mapped: not up-to-date, locked, etc. 2819 * not ready to be mapped: not up-to-date, locked, etc.
2820 * 2820 *
2821 * This function is called with the page table lock taken. In the split ptlock 2821 * This function is called with the page table lock taken. In the split ptlock
2822 * case the page table lock protects only those entries which belong to 2822 * case the page table lock protects only those entries which belong to
2823 * the page table corresponding to the fault address. 2823 * the page table corresponding to the fault address.
2824 * 2824 *
2825 * This function doesn't cross the VMA boundaries, in order to call map_pages() 2825 * This function doesn't cross the VMA boundaries, in order to call map_pages()
2826 * only once. 2826 * only once.
2827 * 2827 *
2828 * fault_around_pages() defines how many pages we'll try to map. 2828 * fault_around_pages() defines how many pages we'll try to map.
2829 * do_fault_around() expects it to return a power of two less than or equal to 2829 * do_fault_around() expects it to return a power of two less than or equal to
2830 * PTRS_PER_PTE. 2830 * PTRS_PER_PTE.
2831 * 2831 *
2832 * The virtual address of the area that we map is naturally aligned to the 2832 * The virtual address of the area that we map is naturally aligned to the
2833 * fault_around_pages() value (and therefore to page order). This way it's 2833 * fault_around_pages() value (and therefore to page order). This way it's
2834 * easier to guarantee that we don't cross page table boundaries. 2834 * easier to guarantee that we don't cross page table boundaries.
2835 */ 2835 */
2836 static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 2836 static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2837 pte_t *pte, pgoff_t pgoff, unsigned int flags) 2837 pte_t *pte, pgoff_t pgoff, unsigned int flags)
2838 { 2838 {
2839 unsigned long start_addr, nr_pages, mask; 2839 unsigned long start_addr, nr_pages, mask;
2840 pgoff_t max_pgoff; 2840 pgoff_t max_pgoff;
2841 struct vm_fault vmf; 2841 struct vm_fault vmf;
2842 int off; 2842 int off;
2843 2843
2844 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; 2844 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2845 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 2845 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2846 2846
2847 start_addr = max(address & mask, vma->vm_start); 2847 start_addr = max(address & mask, vma->vm_start);
2848 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 2848 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2849 pte -= off; 2849 pte -= off;
2850 pgoff -= off; 2850 pgoff -= off;
2851 2851
2852 /* 2852 /*
2853 * max_pgoff is either the end of the page table or the end of the vma 2853 * max_pgoff is either the end of the page table or the end of the vma
2854 * or fault_around_pages() from pgoff, depending on what is nearest. 2854 * or fault_around_pages() from pgoff, depending on what is nearest.
2855 */ 2855 */
2856 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 2856 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
2857 PTRS_PER_PTE - 1; 2857 PTRS_PER_PTE - 1;
2858 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, 2858 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
2859 pgoff + nr_pages - 1); 2859 pgoff + nr_pages - 1);
2860 2860
2861 /* Check if it makes any sense to call ->map_pages */ 2861 /* Check if it makes any sense to call ->map_pages */
2862 while (!pte_none(*pte)) { 2862 while (!pte_none(*pte)) {
2863 if (++pgoff > max_pgoff) 2863 if (++pgoff > max_pgoff)
2864 return; 2864 return;
2865 start_addr += PAGE_SIZE; 2865 start_addr += PAGE_SIZE;
2866 if (start_addr >= vma->vm_end) 2866 if (start_addr >= vma->vm_end)
2867 return; 2867 return;
2868 pte++; 2868 pte++;
2869 } 2869 }
2870 2870
2871 vmf.virtual_address = (void __user *) start_addr; 2871 vmf.virtual_address = (void __user *) start_addr;
2872 vmf.pte = pte; 2872 vmf.pte = pte;
2873 vmf.pgoff = pgoff; 2873 vmf.pgoff = pgoff;
2874 vmf.max_pgoff = max_pgoff; 2874 vmf.max_pgoff = max_pgoff;
2875 vmf.flags = flags; 2875 vmf.flags = flags;
2876 vma->vm_ops->map_pages(vma, &vmf); 2876 vma->vm_ops->map_pages(vma, &vmf);
2877 } 2877 }
2878 2878
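
Because the alignment arithmetic in do_fault_around() is easy to misread, here is a worked example with made-up numbers (4 KB pages, 512 PTEs per page table, a hypothetical VMA at 0x400000); it reproduces the start_addr/off/max_pgoff computation and shows how the window is clipped to the page table, the VMA and fault_around_bytes:

#include <stdio.h>

#define EX_PAGE_SHIFT   12
#define EX_PAGE_SIZE    (1UL << EX_PAGE_SHIFT)
#define EX_PTRS_PER_PTE 512UL

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a < b ? a : b;

        return m < c ? m : c;
}

int main(void)
{
        unsigned long nr_pages = 65536 >> EX_PAGE_SHIFT;        /* default window: 16 pages */
        unsigned long mask = ~(nr_pages * EX_PAGE_SIZE - 1);
        unsigned long vm_start = 0x400000UL, vm_pgoff = 0, vm_npages = 64;
        unsigned long address = 0x40b000UL;                     /* hypothetical fault address */
        unsigned long pgoff = (address - vm_start) >> EX_PAGE_SHIFT;

        unsigned long start_addr = address & mask;              /* align down to the window... */
        if (start_addr < vm_start)
                start_addr = vm_start;                          /* ...but never below vm_start */
        unsigned long off = ((address - start_addr) >> EX_PAGE_SHIFT) &
                            (EX_PTRS_PER_PTE - 1);
        pgoff -= off;                                           /* pgoff of the window start */

        unsigned long max_pgoff = pgoff -
                ((start_addr >> EX_PAGE_SHIFT) & (EX_PTRS_PER_PTE - 1)) +
                EX_PTRS_PER_PTE - 1;                            /* end of this page table */
        max_pgoff = min3ul(max_pgoff, vm_npages + vm_pgoff - 1, /* end of the VMA */
                           pgoff + nr_pages - 1);               /* end of the window */

        printf("start %#lx, pgoff %lu, max_pgoff %lu\n",
               start_addr, pgoff, max_pgoff);
        return 0;
}
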
2879 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2879 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2880 unsigned long address, pmd_t *pmd, 2880 unsigned long address, pmd_t *pmd,
2881 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2881 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2882 { 2882 {
2883 struct page *fault_page; 2883 struct page *fault_page;
2884 spinlock_t *ptl; 2884 spinlock_t *ptl;
2885 pte_t *pte; 2885 pte_t *pte;
2886 int ret = 0; 2886 int ret = 0;
2887 2887
2888 /* 2888 /*
2889 * Let's call ->map_pages() first and use ->fault() as a fallback 2889 * Let's call ->map_pages() first and use ->fault() as a fallback
2890 * if the page at that offset is not ready to be mapped (cold cache or 2890 * if the page at that offset is not ready to be mapped (cold cache or
2891 * something). 2891 * something).
2892 */ 2892 */
2893 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && 2893 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
2894 fault_around_bytes >> PAGE_SHIFT > 1) { 2894 fault_around_bytes >> PAGE_SHIFT > 1) {
2895 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2895 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2896 do_fault_around(vma, address, pte, pgoff, flags); 2896 do_fault_around(vma, address, pte, pgoff, flags);
2897 if (!pte_same(*pte, orig_pte)) 2897 if (!pte_same(*pte, orig_pte))
2898 goto unlock_out; 2898 goto unlock_out;
2899 pte_unmap_unlock(pte, ptl); 2899 pte_unmap_unlock(pte, ptl);
2900 } 2900 }
2901 2901
2902 ret = __do_fault(vma, address, pgoff, flags, &fault_page); 2902 ret = __do_fault(vma, address, pgoff, flags, &fault_page);
2903 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2903 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2904 return ret; 2904 return ret;
2905 2905
2906 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2906 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2907 if (unlikely(!pte_same(*pte, orig_pte))) { 2907 if (unlikely(!pte_same(*pte, orig_pte))) {
2908 pte_unmap_unlock(pte, ptl); 2908 pte_unmap_unlock(pte, ptl);
2909 unlock_page(fault_page); 2909 unlock_page(fault_page);
2910 page_cache_release(fault_page); 2910 page_cache_release(fault_page);
2911 return ret; 2911 return ret;
2912 } 2912 }
2913 do_set_pte(vma, address, fault_page, pte, false, false); 2913 do_set_pte(vma, address, fault_page, pte, false, false);
2914 unlock_page(fault_page); 2914 unlock_page(fault_page);
2915 unlock_out: 2915 unlock_out:
2916 pte_unmap_unlock(pte, ptl); 2916 pte_unmap_unlock(pte, ptl);
2917 return ret; 2917 return ret;
2918 } 2918 }
2919 2919
2920 static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2920 static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2921 unsigned long address, pmd_t *pmd, 2921 unsigned long address, pmd_t *pmd,
2922 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2922 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2923 { 2923 {
2924 struct page *fault_page, *new_page; 2924 struct page *fault_page, *new_page;
2925 struct mem_cgroup *memcg; 2925 struct mem_cgroup *memcg;
2926 spinlock_t *ptl; 2926 spinlock_t *ptl;
2927 pte_t *pte; 2927 pte_t *pte;
2928 int ret; 2928 int ret;
2929 2929
2930 if (unlikely(anon_vma_prepare(vma))) 2930 if (unlikely(anon_vma_prepare(vma)))
2931 return VM_FAULT_OOM; 2931 return VM_FAULT_OOM;
2932 2932
2933 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2933 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2934 if (!new_page) 2934 if (!new_page)
2935 return VM_FAULT_OOM; 2935 return VM_FAULT_OOM;
2936 2936
2937 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { 2937 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
2938 page_cache_release(new_page); 2938 page_cache_release(new_page);
2939 return VM_FAULT_OOM; 2939 return VM_FAULT_OOM;
2940 } 2940 }
2941 2941
2942 ret = __do_fault(vma, address, pgoff, flags, &fault_page); 2942 ret = __do_fault(vma, address, pgoff, flags, &fault_page);
2943 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2943 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2944 goto uncharge_out; 2944 goto uncharge_out;
2945 2945
2946 copy_user_highpage(new_page, fault_page, address, vma); 2946 copy_user_highpage(new_page, fault_page, address, vma);
2947 __SetPageUptodate(new_page); 2947 __SetPageUptodate(new_page);
2948 2948
2949 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2949 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2950 if (unlikely(!pte_same(*pte, orig_pte))) { 2950 if (unlikely(!pte_same(*pte, orig_pte))) {
2951 pte_unmap_unlock(pte, ptl); 2951 pte_unmap_unlock(pte, ptl);
2952 unlock_page(fault_page); 2952 unlock_page(fault_page);
2953 page_cache_release(fault_page); 2953 page_cache_release(fault_page);
2954 goto uncharge_out; 2954 goto uncharge_out;
2955 } 2955 }
2956 do_set_pte(vma, address, new_page, pte, true, true); 2956 do_set_pte(vma, address, new_page, pte, true, true);
2957 mem_cgroup_commit_charge(new_page, memcg, false); 2957 mem_cgroup_commit_charge(new_page, memcg, false);
2958 lru_cache_add_active_or_unevictable(new_page, vma); 2958 lru_cache_add_active_or_unevictable(new_page, vma);
2959 pte_unmap_unlock(pte, ptl); 2959 pte_unmap_unlock(pte, ptl);
2960 unlock_page(fault_page); 2960 unlock_page(fault_page);
2961 page_cache_release(fault_page); 2961 page_cache_release(fault_page);
2962 return ret; 2962 return ret;
2963 uncharge_out: 2963 uncharge_out:
2964 mem_cgroup_cancel_charge(new_page, memcg); 2964 mem_cgroup_cancel_charge(new_page, memcg);
2965 page_cache_release(new_page); 2965 page_cache_release(new_page);
2966 return ret; 2966 return ret;
2967 } 2967 }
2968 2968
2969 static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2969 static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2970 unsigned long address, pmd_t *pmd, 2970 unsigned long address, pmd_t *pmd,
2971 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2971 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2972 { 2972 {
2973 struct page *fault_page; 2973 struct page *fault_page;
2974 struct address_space *mapping; 2974 struct address_space *mapping;
2975 spinlock_t *ptl; 2975 spinlock_t *ptl;
2976 pte_t *pte; 2976 pte_t *pte;
2977 int dirtied = 0; 2977 int dirtied = 0;
2978 int ret, tmp; 2978 int ret, tmp;
2979 2979
2980 ret = __do_fault(vma, address, pgoff, flags, &fault_page); 2980 ret = __do_fault(vma, address, pgoff, flags, &fault_page);
2981 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2981 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2982 return ret; 2982 return ret;
2983 2983
2984 /* 2984 /*
2985 * Check if the backing address space wants to know that the page is 2985 * Check if the backing address space wants to know that the page is
2986 * about to become writable 2986 * about to become writable
2987 */ 2987 */
2988 if (vma->vm_ops->page_mkwrite) { 2988 if (vma->vm_ops->page_mkwrite) {
2989 unlock_page(fault_page); 2989 unlock_page(fault_page);
2990 tmp = do_page_mkwrite(vma, fault_page, address); 2990 tmp = do_page_mkwrite(vma, fault_page, address);
2991 if (unlikely(!tmp || 2991 if (unlikely(!tmp ||
2992 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2992 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2993 page_cache_release(fault_page); 2993 page_cache_release(fault_page);
2994 return tmp; 2994 return tmp;
2995 } 2995 }
2996 } 2996 }
2997 2997
2998 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2998 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2999 if (unlikely(!pte_same(*pte, orig_pte))) { 2999 if (unlikely(!pte_same(*pte, orig_pte))) {
3000 pte_unmap_unlock(pte, ptl); 3000 pte_unmap_unlock(pte, ptl);
3001 unlock_page(fault_page); 3001 unlock_page(fault_page);
3002 page_cache_release(fault_page); 3002 page_cache_release(fault_page);
3003 return ret; 3003 return ret;
3004 } 3004 }
3005 do_set_pte(vma, address, fault_page, pte, true, false); 3005 do_set_pte(vma, address, fault_page, pte, true, false);
3006 pte_unmap_unlock(pte, ptl); 3006 pte_unmap_unlock(pte, ptl);
3007 3007
3008 if (set_page_dirty(fault_page)) 3008 if (set_page_dirty(fault_page))
3009 dirtied = 1; 3009 dirtied = 1;
3010 mapping = fault_page->mapping; 3010 mapping = fault_page->mapping;
3011 unlock_page(fault_page); 3011 unlock_page(fault_page);
3012 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { 3012 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3013 /* 3013 /*
3014 * Some device drivers do not set page.mapping but still 3014 * Some device drivers do not set page.mapping but still
3015 * dirty their pages 3015 * dirty their pages
3016 */ 3016 */
3017 balance_dirty_pages_ratelimited(mapping); 3017 balance_dirty_pages_ratelimited(mapping);
3018 } 3018 }
3019 3019
3020 /* file_update_time outside page_lock */ 3020 /* file_update_time outside page_lock */
3021 if (vma->vm_file && !vma->vm_ops->page_mkwrite) 3021 if (vma->vm_file && !vma->vm_ops->page_mkwrite)
3022 file_update_time(vma->vm_file); 3022 file_update_time(vma->vm_file);
3023 3023
3024 return ret; 3024 return ret;
3025 } 3025 }
3026 3026
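
do_shared_fault() above is the path taken when a shared file mapping is written. As a rough userspace illustration (the file path is hypothetical and error handling is trimmed), the sketch below dirties a MAP_SHARED page and flushes it with msync(), exercising the page_mkwrite and dirty-balancing machinery handled here:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/shared-fault-demo", O_RDWR | O_CREAT, 0600);

        if (fd < 0 || ftruncate(fd, 4096))
                return 1;

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        strcpy(p, "written through a shared mapping\n"); /* write fault on a shared page */
        msync(p, 4096, MS_SYNC);                         /* flush the dirtied page to the file */
        munmap(p, 4096);
        close(fd);
        return 0;
}
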
3027 /* 3027 /*
3028 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3028 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3029 * but allow concurrent faults). 3029 * but allow concurrent faults).
3030 * The mmap_sem may have been released depending on flags and our 3030 * The mmap_sem may have been released depending on flags and our
3031 * return value. See filemap_fault() and __lock_page_or_retry(). 3031 * return value. See filemap_fault() and __lock_page_or_retry().
3032 */ 3032 */
3033 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3033 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3034 unsigned long address, pte_t *page_table, pmd_t *pmd, 3034 unsigned long address, pte_t *page_table, pmd_t *pmd,
3035 unsigned int flags, pte_t orig_pte) 3035 unsigned int flags, pte_t orig_pte)
3036 { 3036 {
3037 pgoff_t pgoff = (((address & PAGE_MASK) 3037 pgoff_t pgoff = (((address & PAGE_MASK)
3038 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 3038 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3039 3039
3040 pte_unmap(page_table); 3040 pte_unmap(page_table);
3041 if (!(flags & FAULT_FLAG_WRITE)) 3041 if (!(flags & FAULT_FLAG_WRITE))
3042 return do_read_fault(mm, vma, address, pmd, pgoff, flags, 3042 return do_read_fault(mm, vma, address, pmd, pgoff, flags,
3043 orig_pte); 3043 orig_pte);
3044 if (!(vma->vm_flags & VM_SHARED)) 3044 if (!(vma->vm_flags & VM_SHARED))
3045 return do_cow_fault(mm, vma, address, pmd, pgoff, flags, 3045 return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
3046 orig_pte); 3046 orig_pte);
3047 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3047 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3048 } 3048 }
3049 3049
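
The pgoff expression at the top of do_linear_fault() simply converts a faulting virtual address into a file page index. A tiny sketch with assumed numbers (4 KB pages, a hypothetical mapping start and vm_pgoff):

#include <stdio.h>

int main(void)
{
        unsigned long vm_start = 0x7f0000000000UL;  /* hypothetical mapping start     */
        unsigned long vm_pgoff = 16;                /* mapping begins at file page 16 */
        unsigned long address  = 0x7f0000003abcUL;  /* hypothetical faulting address  */

        /* 4 KB pages assumed: page offset within the VMA plus the VMA's file offset. */
        unsigned long pgoff = (((address & ~0xfffUL) - vm_start) >> 12) + vm_pgoff;

        printf("file page index: %lu\n", pgoff);    /* 3 + 16 = 19 */
        return 0;
}
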
3050 /* 3050 /*
3051 * Fault of a previously existing named mapping. Repopulate the pte 3051 * Fault of a previously existing named mapping. Repopulate the pte
3052 * from the encoded file_pte if possible. This enables swappable 3052 * from the encoded file_pte if possible. This enables swappable
3053 * nonlinear vmas. 3053 * nonlinear vmas.
3054 * 3054 *
3055 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3055 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3056 * but allow concurrent faults), and pte mapped but not yet locked. 3056 * but allow concurrent faults), and pte mapped but not yet locked.
3057 * We return with pte unmapped and unlocked. 3057 * We return with pte unmapped and unlocked.
3058 * The mmap_sem may have been released depending on flags and our 3058 * The mmap_sem may have been released depending on flags and our
3059 * return value. See filemap_fault() and __lock_page_or_retry(). 3059 * return value. See filemap_fault() and __lock_page_or_retry().
3060 */ 3060 */
3061 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3061 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3062 unsigned long address, pte_t *page_table, pmd_t *pmd, 3062 unsigned long address, pte_t *page_table, pmd_t *pmd,
3063 unsigned int flags, pte_t orig_pte) 3063 unsigned int flags, pte_t orig_pte)
3064 { 3064 {
3065 pgoff_t pgoff; 3065 pgoff_t pgoff;
3066 3066
3067 flags |= FAULT_FLAG_NONLINEAR; 3067 flags |= FAULT_FLAG_NONLINEAR;
3068 3068
3069 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 3069 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3070 return 0; 3070 return 0;
3071 3071
3072 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { 3072 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3073 /* 3073 /*
3074 * Page table corrupted: show pte and kill process. 3074 * Page table corrupted: show pte and kill process.
3075 */ 3075 */
3076 print_bad_pte(vma, address, orig_pte, NULL); 3076 print_bad_pte(vma, address, orig_pte, NULL);
3077 return VM_FAULT_SIGBUS; 3077 return VM_FAULT_SIGBUS;
3078 } 3078 }
3079 3079
3080 pgoff = pte_to_pgoff(orig_pte); 3080 pgoff = pte_to_pgoff(orig_pte);
3081 if (!(flags & FAULT_FLAG_WRITE)) 3081 if (!(flags & FAULT_FLAG_WRITE))
3082 return do_read_fault(mm, vma, address, pmd, pgoff, flags, 3082 return do_read_fault(mm, vma, address, pmd, pgoff, flags,
3083 orig_pte); 3083 orig_pte);
3084 if (!(vma->vm_flags & VM_SHARED)) 3084 if (!(vma->vm_flags & VM_SHARED))
3085 return do_cow_fault(mm, vma, address, pmd, pgoff, flags, 3085 return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
3086 orig_pte); 3086 orig_pte);
3087 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3087 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3088 } 3088 }
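
The nonlinear path above services mappings rewired with remap_file_pages(). The sketch below shows, assuming a pre-existing file at a made-up path that is at least four pages long, how such a VM_NONLINEAR mapping is typically created from user space (the syscall has since been deprecated); the write at the end is the kind of access that may later reach do_nonlinear_fault().

/* User-space sketch: creating the nonlinear mapping that do_nonlinear_fault()
 * later services.  The path and sizes are invented for the example. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/example-data", O_RDWR);	/* hypothetical file, >= 4 pages long */
	if (fd < 0)
		return 1;

	/* A shared mapping of four file pages, initially linear. */
	char *base = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return 1;

	/* Rewire the first virtual page to file page 3: the vma becomes
	 * VM_NONLINEAR and the file offset is encoded in the pte, which is
	 * what pte_to_pgoff() recovers at fault time. */
	if (remap_file_pages(base, page, 0, 3, 0) != 0)
		return 1;

	base[0] = 'x';		/* access that may fault through the nonlinear path */

	munmap(base, 4 * page);
	close(fd);
	return 0;
}
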
3089 3089
3090 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3090 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3091 unsigned long addr, int page_nid, 3091 unsigned long addr, int page_nid,
3092 int *flags) 3092 int *flags)
3093 { 3093 {
3094 get_page(page); 3094 get_page(page);
3095 3095
3096 count_vm_numa_event(NUMA_HINT_FAULTS); 3096 count_vm_numa_event(NUMA_HINT_FAULTS);
3097 if (page_nid == numa_node_id()) { 3097 if (page_nid == numa_node_id()) {
3098 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3098 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3099 *flags |= TNF_FAULT_LOCAL; 3099 *flags |= TNF_FAULT_LOCAL;
3100 } 3100 }
3101 3101
3102 return mpol_misplaced(page, vma, addr); 3102 return mpol_misplaced(page, vma, addr);
3103 } 3103 }
3104 3104
3105 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3105 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3106 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) 3106 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3107 { 3107 {
3108 struct page *page = NULL; 3108 struct page *page = NULL;
3109 spinlock_t *ptl; 3109 spinlock_t *ptl;
3110 int page_nid = -1; 3110 int page_nid = -1;
3111 int last_cpupid; 3111 int last_cpupid;
3112 int target_nid; 3112 int target_nid;
3113 bool migrated = false; 3113 bool migrated = false;
3114 int flags = 0; 3114 int flags = 0;
3115 3115
3116 /* 3116 /*
3117 * The "pte" at this point cannot be used safely without 3117 * The "pte" at this point cannot be used safely without
3118 * validation through pte_unmap_same(). It's of NUMA type but 3118 * validation through pte_unmap_same(). It's of NUMA type but
3119 * the pfn may be screwed if the read is non atomic. 3119 * the pfn may be screwed if the read is non atomic.
3120 * 3120 *
3121 * ptep_modify_prot_start is not called as this is clearing 3121 * ptep_modify_prot_start is not called as this is clearing
3122 * the _PAGE_NUMA bit and it is not really expected that there 3122 * the _PAGE_NUMA bit and it is not really expected that there
3123 * would be concurrent hardware modifications to the PTE. 3123 * would be concurrent hardware modifications to the PTE.
3124 */ 3124 */
3125 ptl = pte_lockptr(mm, pmd); 3125 ptl = pte_lockptr(mm, pmd);
3126 spin_lock(ptl); 3126 spin_lock(ptl);
3127 if (unlikely(!pte_same(*ptep, pte))) { 3127 if (unlikely(!pte_same(*ptep, pte))) {
3128 pte_unmap_unlock(ptep, ptl); 3128 pte_unmap_unlock(ptep, ptl);
3129 goto out; 3129 goto out;
3130 } 3130 }
3131 3131
3132 pte = pte_mknonnuma(pte); 3132 pte = pte_mknonnuma(pte);
3133 set_pte_at(mm, addr, ptep, pte); 3133 set_pte_at(mm, addr, ptep, pte);
3134 update_mmu_cache(vma, addr, ptep); 3134 update_mmu_cache(vma, addr, ptep);
3135 3135
3136 page = vm_normal_page(vma, addr, pte); 3136 page = vm_normal_page(vma, addr, pte);
3137 if (!page) { 3137 if (!page) {
3138 pte_unmap_unlock(ptep, ptl); 3138 pte_unmap_unlock(ptep, ptl);
3139 return 0; 3139 return 0;
3140 } 3140 }
3141 BUG_ON(is_zero_pfn(page_to_pfn(page))); 3141 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3142 3142
3143 /* 3143 /*
3144 * Avoid grouping on DSO/COW pages in specific and RO pages 3144 * Avoid grouping on DSO/COW pages in specific and RO pages
3145 * in general, RO pages shouldn't hurt as much anyway since 3145 * in general, RO pages shouldn't hurt as much anyway since
3146 * they can be in shared cache state. 3146 * they can be in shared cache state.
3147 */ 3147 */
3148 if (!pte_write(pte)) 3148 if (!pte_write(pte))
3149 flags |= TNF_NO_GROUP; 3149 flags |= TNF_NO_GROUP;
3150 3150
3151 /* 3151 /*
3152 * Flag if the page is shared between multiple address spaces. This 3152 * Flag if the page is shared between multiple address spaces. This
3153 * is later used when determining whether to group tasks together 3153 * is later used when determining whether to group tasks together
3154 */ 3154 */
3155 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) 3155 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3156 flags |= TNF_SHARED; 3156 flags |= TNF_SHARED;
3157 3157
3158 last_cpupid = page_cpupid_last(page); 3158 last_cpupid = page_cpupid_last(page);
3159 page_nid = page_to_nid(page); 3159 page_nid = page_to_nid(page);
3160 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); 3160 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3161 pte_unmap_unlock(ptep, ptl); 3161 pte_unmap_unlock(ptep, ptl);
3162 if (target_nid == -1) { 3162 if (target_nid == -1) {
3163 put_page(page); 3163 put_page(page);
3164 goto out; 3164 goto out;
3165 } 3165 }
3166 3166
3167 /* Migrate to the requested node */ 3167 /* Migrate to the requested node */
3168 migrated = migrate_misplaced_page(page, vma, target_nid); 3168 migrated = migrate_misplaced_page(page, vma, target_nid);
3169 if (migrated) { 3169 if (migrated) {
3170 page_nid = target_nid; 3170 page_nid = target_nid;
3171 flags |= TNF_MIGRATED; 3171 flags |= TNF_MIGRATED;
3172 } 3172 }
3173 3173
3174 out: 3174 out:
3175 if (page_nid != -1) 3175 if (page_nid != -1)
3176 task_numa_fault(last_cpupid, page_nid, 1, flags); 3176 task_numa_fault(last_cpupid, page_nid, 1, flags);
3177 return 0; 3177 return 0;
3178 } 3178 }
3179 3179
3180 /* 3180 /*
3181 * These routines also need to handle stuff like marking pages dirty 3181 * These routines also need to handle stuff like marking pages dirty
3182 * and/or accessed for architectures that don't do it in hardware (most 3182 * and/or accessed for architectures that don't do it in hardware (most
3183 * RISC architectures). The early dirtying is also good on the i386. 3183 * RISC architectures). The early dirtying is also good on the i386.
3184 * 3184 *
3185 * There is also a hook called "update_mmu_cache()" that architectures 3185 * There is also a hook called "update_mmu_cache()" that architectures
3186 * with external mmu caches can use to update those (ie the Sparc or 3186 * with external mmu caches can use to update those (ie the Sparc or
3187 * PowerPC hashed page tables that act as extended TLBs). 3187 * PowerPC hashed page tables that act as extended TLBs).
3188 * 3188 *
3189 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3189 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3190 * but allow concurrent faults), and pte mapped but not yet locked. 3190 * but allow concurrent faults), and pte mapped but not yet locked.
3191 * We return with pte unmapped and unlocked. 3191 * We return with pte unmapped and unlocked.
3192 * 3192 *
3193 * The mmap_sem may have been released depending on flags and our 3193 * The mmap_sem may have been released depending on flags and our
3194 * return value. See filemap_fault() and __lock_page_or_retry(). 3194 * return value. See filemap_fault() and __lock_page_or_retry().
3195 */ 3195 */
3196 static int handle_pte_fault(struct mm_struct *mm, 3196 static int handle_pte_fault(struct mm_struct *mm,
3197 struct vm_area_struct *vma, unsigned long address, 3197 struct vm_area_struct *vma, unsigned long address,
3198 pte_t *pte, pmd_t *pmd, unsigned int flags) 3198 pte_t *pte, pmd_t *pmd, unsigned int flags)
3199 { 3199 {
3200 pte_t entry; 3200 pte_t entry;
3201 spinlock_t *ptl; 3201 spinlock_t *ptl;
3202 3202
3203 entry = ACCESS_ONCE(*pte); 3203 entry = ACCESS_ONCE(*pte);
3204 if (!pte_present(entry)) { 3204 if (!pte_present(entry)) {
3205 if (pte_none(entry)) { 3205 if (pte_none(entry)) {
3206 if (vma->vm_ops) { 3206 if (vma->vm_ops) {
3207 if (likely(vma->vm_ops->fault)) 3207 if (likely(vma->vm_ops->fault))
3208 return do_linear_fault(mm, vma, address, 3208 return do_linear_fault(mm, vma, address,
3209 pte, pmd, flags, entry); 3209 pte, pmd, flags, entry);
3210 } 3210 }
3211 return do_anonymous_page(mm, vma, address, 3211 return do_anonymous_page(mm, vma, address,
3212 pte, pmd, flags); 3212 pte, pmd, flags);
3213 } 3213 }
3214 if (pte_file(entry)) 3214 if (pte_file(entry))
3215 return do_nonlinear_fault(mm, vma, address, 3215 return do_nonlinear_fault(mm, vma, address,
3216 pte, pmd, flags, entry); 3216 pte, pmd, flags, entry);
3217 return do_swap_page(mm, vma, address, 3217 return do_swap_page(mm, vma, address,
3218 pte, pmd, flags, entry); 3218 pte, pmd, flags, entry);
3219 } 3219 }
3220 3220
3221 if (pte_numa(entry)) 3221 if (pte_numa(entry))
3222 return do_numa_page(mm, vma, address, entry, pte, pmd); 3222 return do_numa_page(mm, vma, address, entry, pte, pmd);
3223 3223
3224 ptl = pte_lockptr(mm, pmd); 3224 ptl = pte_lockptr(mm, pmd);
3225 spin_lock(ptl); 3225 spin_lock(ptl);
3226 if (unlikely(!pte_same(*pte, entry))) 3226 if (unlikely(!pte_same(*pte, entry)))
3227 goto unlock; 3227 goto unlock;
3228 if (flags & FAULT_FLAG_WRITE) { 3228 if (flags & FAULT_FLAG_WRITE) {
3229 if (!pte_write(entry)) 3229 if (!pte_write(entry))
3230 return do_wp_page(mm, vma, address, 3230 return do_wp_page(mm, vma, address,
3231 pte, pmd, ptl, entry); 3231 pte, pmd, ptl, entry);
3232 entry = pte_mkdirty(entry); 3232 entry = pte_mkdirty(entry);
3233 } 3233 }
3234 entry = pte_mkyoung(entry); 3234 entry = pte_mkyoung(entry);
3235 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3235 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3236 update_mmu_cache(vma, address, pte); 3236 update_mmu_cache(vma, address, pte);
3237 } else { 3237 } else {
3238 /* 3238 /*
3239 * This is needed only for protection faults but the arch code 3239 * This is needed only for protection faults but the arch code
3240 * is not yet telling us if this is a protection fault or not. 3240 * is not yet telling us if this is a protection fault or not.
3241 * This still avoids useless tlb flushes for .text page faults 3241 * This still avoids useless tlb flushes for .text page faults
3242 * with threads. 3242 * with threads.
3243 */ 3243 */
3244 if (flags & FAULT_FLAG_WRITE) 3244 if (flags & FAULT_FLAG_WRITE)
3245 flush_tlb_fix_spurious_fault(vma, address); 3245 flush_tlb_fix_spurious_fault(vma, address);
3246 } 3246 }
3247 unlock: 3247 unlock:
3248 pte_unmap_unlock(pte, ptl); 3248 pte_unmap_unlock(pte, ptl);
3249 return 0; 3249 return 0;
3250 } 3250 }
3251 3251
3252 /* 3252 /*
3253 * By the time we get here, we already hold the mm semaphore 3253 * By the time we get here, we already hold the mm semaphore
3254 * 3254 *
3255 * The mmap_sem may have been released depending on flags and our 3255 * The mmap_sem may have been released depending on flags and our
3256 * return value. See filemap_fault() and __lock_page_or_retry(). 3256 * return value. See filemap_fault() and __lock_page_or_retry().
3257 */ 3257 */
3258 static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3258 static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3259 unsigned long address, unsigned int flags) 3259 unsigned long address, unsigned int flags)
3260 { 3260 {
3261 pgd_t *pgd; 3261 pgd_t *pgd;
3262 pud_t *pud; 3262 pud_t *pud;
3263 pmd_t *pmd; 3263 pmd_t *pmd;
3264 pte_t *pte; 3264 pte_t *pte;
3265 3265
3266 if (unlikely(is_vm_hugetlb_page(vma))) 3266 if (unlikely(is_vm_hugetlb_page(vma)))
3267 return hugetlb_fault(mm, vma, address, flags); 3267 return hugetlb_fault(mm, vma, address, flags);
3268 3268
3269 pgd = pgd_offset(mm, address); 3269 pgd = pgd_offset(mm, address);
3270 pud = pud_alloc(mm, pgd, address); 3270 pud = pud_alloc(mm, pgd, address);
3271 if (!pud) 3271 if (!pud)
3272 return VM_FAULT_OOM; 3272 return VM_FAULT_OOM;
3273 pmd = pmd_alloc(mm, pud, address); 3273 pmd = pmd_alloc(mm, pud, address);
3274 if (!pmd) 3274 if (!pmd)
3275 return VM_FAULT_OOM; 3275 return VM_FAULT_OOM;
3276 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3276 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3277 int ret = VM_FAULT_FALLBACK; 3277 int ret = VM_FAULT_FALLBACK;
3278 if (!vma->vm_ops) 3278 if (!vma->vm_ops)
3279 ret = do_huge_pmd_anonymous_page(mm, vma, address, 3279 ret = do_huge_pmd_anonymous_page(mm, vma, address,
3280 pmd, flags); 3280 pmd, flags);
3281 if (!(ret & VM_FAULT_FALLBACK)) 3281 if (!(ret & VM_FAULT_FALLBACK))
3282 return ret; 3282 return ret;
3283 } else { 3283 } else {
3284 pmd_t orig_pmd = *pmd; 3284 pmd_t orig_pmd = *pmd;
3285 int ret; 3285 int ret;
3286 3286
3287 barrier(); 3287 barrier();
3288 if (pmd_trans_huge(orig_pmd)) { 3288 if (pmd_trans_huge(orig_pmd)) {
3289 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3289 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3290 3290
3291 /* 3291 /*
3292 * If the pmd is splitting, return and retry 3292 * If the pmd is splitting, return and retry
3293 * the fault. Alternative: wait until the split 3293 * the fault. Alternative: wait until the split
3294 * is done, and goto retry. 3294 * is done, and goto retry.
3295 */ 3295 */
3296 if (pmd_trans_splitting(orig_pmd)) 3296 if (pmd_trans_splitting(orig_pmd))
3297 return 0; 3297 return 0;
3298 3298
3299 if (pmd_numa(orig_pmd)) 3299 if (pmd_numa(orig_pmd))
3300 return do_huge_pmd_numa_page(mm, vma, address, 3300 return do_huge_pmd_numa_page(mm, vma, address,
3301 orig_pmd, pmd); 3301 orig_pmd, pmd);
3302 3302
3303 if (dirty && !pmd_write(orig_pmd)) { 3303 if (dirty && !pmd_write(orig_pmd)) {
3304 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3304 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3305 orig_pmd); 3305 orig_pmd);
3306 if (!(ret & VM_FAULT_FALLBACK)) 3306 if (!(ret & VM_FAULT_FALLBACK))
3307 return ret; 3307 return ret;
3308 } else { 3308 } else {
3309 huge_pmd_set_accessed(mm, vma, address, pmd, 3309 huge_pmd_set_accessed(mm, vma, address, pmd,
3310 orig_pmd, dirty); 3310 orig_pmd, dirty);
3311 return 0; 3311 return 0;
3312 } 3312 }
3313 } 3313 }
3314 } 3314 }
3315 3315
3316 /* 3316 /*
3317 * Use __pte_alloc instead of pte_alloc_map, because we can't 3317 * Use __pte_alloc instead of pte_alloc_map, because we can't
3318 * run pte_offset_map on the pmd, if a huge pmd could 3318 * run pte_offset_map on the pmd, if a huge pmd could
3319 * materialize from under us from a different thread. 3319 * materialize from under us from a different thread.
3320 */ 3320 */
3321 if (unlikely(pmd_none(*pmd)) && 3321 if (unlikely(pmd_none(*pmd)) &&
3322 unlikely(__pte_alloc(mm, vma, pmd, address))) 3322 unlikely(__pte_alloc(mm, vma, pmd, address)))
3323 return VM_FAULT_OOM; 3323 return VM_FAULT_OOM;
3324 /* if a huge pmd materialized from under us just retry later */ 3324 /* if a huge pmd materialized from under us just retry later */
3325 if (unlikely(pmd_trans_huge(*pmd))) 3325 if (unlikely(pmd_trans_huge(*pmd)))
3326 return 0; 3326 return 0;
3327 /* 3327 /*
3328 * A regular pmd is established and it can't morph into a huge pmd 3328 * A regular pmd is established and it can't morph into a huge pmd
3329 * from under us anymore at this point because we hold the mmap_sem 3329 * from under us anymore at this point because we hold the mmap_sem
3330 * read mode and khugepaged takes it in write mode. So now it's 3330 * read mode and khugepaged takes it in write mode. So now it's
3331 * safe to run pte_offset_map(). 3331 * safe to run pte_offset_map().
3332 */ 3332 */
3333 pte = pte_offset_map(pmd, address); 3333 pte = pte_offset_map(pmd, address);
3334 3334
3335 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3335 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3336 } 3336 }
3337 3337
3338 /* 3338 /*
3339 * By the time we get here, we already hold the mm semaphore 3339 * By the time we get here, we already hold the mm semaphore
3340 * 3340 *
3341 * The mmap_sem may have been released depending on flags and our 3341 * The mmap_sem may have been released depending on flags and our
3342 * return value. See filemap_fault() and __lock_page_or_retry(). 3342 * return value. See filemap_fault() and __lock_page_or_retry().
3343 */ 3343 */
3344 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3344 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3345 unsigned long address, unsigned int flags) 3345 unsigned long address, unsigned int flags)
3346 { 3346 {
3347 int ret; 3347 int ret;
3348 3348
3349 __set_current_state(TASK_RUNNING); 3349 __set_current_state(TASK_RUNNING);
3350 3350
3351 count_vm_event(PGFAULT); 3351 count_vm_event(PGFAULT);
3352 mem_cgroup_count_vm_event(mm, PGFAULT); 3352 mem_cgroup_count_vm_event(mm, PGFAULT);
3353 3353
3354 /* do counter updates before entering really critical section. */ 3354 /* do counter updates before entering really critical section. */
3355 check_sync_rss_stat(current); 3355 check_sync_rss_stat(current);
3356 3356
3357 /* 3357 /*
3358 * Enable the memcg OOM handling for faults triggered in user 3358 * Enable the memcg OOM handling for faults triggered in user
3359 * space. Kernel faults are handled more gracefully. 3359 * space. Kernel faults are handled more gracefully.
3360 */ 3360 */
3361 if (flags & FAULT_FLAG_USER) 3361 if (flags & FAULT_FLAG_USER)
3362 mem_cgroup_oom_enable(); 3362 mem_cgroup_oom_enable();
3363 3363
3364 ret = __handle_mm_fault(mm, vma, address, flags); 3364 ret = __handle_mm_fault(mm, vma, address, flags);
3365 3365
3366 if (flags & FAULT_FLAG_USER) { 3366 if (flags & FAULT_FLAG_USER) {
3367 mem_cgroup_oom_disable(); 3367 mem_cgroup_oom_disable();
3368 /* 3368 /*
3369 * The task may have entered a memcg OOM situation but 3369 * The task may have entered a memcg OOM situation but
3370 * if the allocation error was handled gracefully (no 3370 * if the allocation error was handled gracefully (no
3371 * VM_FAULT_OOM), there is no need to kill anything. 3371 * VM_FAULT_OOM), there is no need to kill anything.
3372 * Just clean up the OOM state peacefully. 3372 * Just clean up the OOM state peacefully.
3373 */ 3373 */
3374 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) 3374 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
3375 mem_cgroup_oom_synchronize(false); 3375 mem_cgroup_oom_synchronize(false);
3376 } 3376 }
3377 3377
3378 return ret; 3378 return ret;
3379 } 3379 }
3380 3380
3381 #ifndef __PAGETABLE_PUD_FOLDED 3381 #ifndef __PAGETABLE_PUD_FOLDED
3382 /* 3382 /*
3383 * Allocate page upper directory. 3383 * Allocate page upper directory.
3384 * We've already handled the fast-path in-line. 3384 * We've already handled the fast-path in-line.
3385 */ 3385 */
3386 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 3386 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3387 { 3387 {
3388 pud_t *new = pud_alloc_one(mm, address); 3388 pud_t *new = pud_alloc_one(mm, address);
3389 if (!new) 3389 if (!new)
3390 return -ENOMEM; 3390 return -ENOMEM;
3391 3391
3392 smp_wmb(); /* See comment in __pte_alloc */ 3392 smp_wmb(); /* See comment in __pte_alloc */
3393 3393
3394 spin_lock(&mm->page_table_lock); 3394 spin_lock(&mm->page_table_lock);
3395 if (pgd_present(*pgd)) /* Another has populated it */ 3395 if (pgd_present(*pgd)) /* Another has populated it */
3396 pud_free(mm, new); 3396 pud_free(mm, new);
3397 else 3397 else
3398 pgd_populate(mm, pgd, new); 3398 pgd_populate(mm, pgd, new);
3399 spin_unlock(&mm->page_table_lock); 3399 spin_unlock(&mm->page_table_lock);
3400 return 0; 3400 return 0;
3401 } 3401 }
3402 #endif /* __PAGETABLE_PUD_FOLDED */ 3402 #endif /* __PAGETABLE_PUD_FOLDED */
3403 3403
3404 #ifndef __PAGETABLE_PMD_FOLDED 3404 #ifndef __PAGETABLE_PMD_FOLDED
3405 /* 3405 /*
3406 * Allocate page middle directory. 3406 * Allocate page middle directory.
3407 * We've already handled the fast-path in-line. 3407 * We've already handled the fast-path in-line.
3408 */ 3408 */
3409 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3409 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3410 { 3410 {
3411 pmd_t *new = pmd_alloc_one(mm, address); 3411 pmd_t *new = pmd_alloc_one(mm, address);
3412 if (!new) 3412 if (!new)
3413 return -ENOMEM; 3413 return -ENOMEM;
3414 3414
3415 smp_wmb(); /* See comment in __pte_alloc */ 3415 smp_wmb(); /* See comment in __pte_alloc */
3416 3416
3417 spin_lock(&mm->page_table_lock); 3417 spin_lock(&mm->page_table_lock);
3418 #ifndef __ARCH_HAS_4LEVEL_HACK 3418 #ifndef __ARCH_HAS_4LEVEL_HACK
3419 if (pud_present(*pud)) /* Another has populated it */ 3419 if (pud_present(*pud)) /* Another has populated it */
3420 pmd_free(mm, new); 3420 pmd_free(mm, new);
3421 else 3421 else
3422 pud_populate(mm, pud, new); 3422 pud_populate(mm, pud, new);
3423 #else 3423 #else
3424 if (pgd_present(*pud)) /* Another has populated it */ 3424 if (pgd_present(*pud)) /* Another has populated it */
3425 pmd_free(mm, new); 3425 pmd_free(mm, new);
3426 else 3426 else
3427 pgd_populate(mm, pud, new); 3427 pgd_populate(mm, pud, new);
3428 #endif /* __ARCH_HAS_4LEVEL_HACK */ 3428 #endif /* __ARCH_HAS_4LEVEL_HACK */
3429 spin_unlock(&mm->page_table_lock); 3429 spin_unlock(&mm->page_table_lock);
3430 return 0; 3430 return 0;
3431 } 3431 }
3432 #endif /* __PAGETABLE_PMD_FOLDED */ 3432 #endif /* __PAGETABLE_PMD_FOLDED */
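
__pud_alloc() and __pmd_alloc() above follow the same pattern: allocate the new table before taking page_table_lock, then either publish it or free it if another thread populated the entry first. A generic user-space sketch of that pattern, with invented names, looks like this:

/* Generic sketch of the "allocate first, publish under the lock, free on
 * losing the race" pattern used by __pud_alloc()/__pmd_alloc(). */
#include <pthread.h>
#include <stdlib.h>

struct table {
	pthread_mutex_t lock;
	void *slot;			/* lazily populated, like a pud/pmd entry */
};

static int slot_alloc(struct table *t)
{
	void *new = calloc(1, 4096);	/* allocate before taking the lock */
	if (!new)
		return -1;

	pthread_mutex_lock(&t->lock);
	if (t->slot)			/* another thread populated it first */
		free(new);
	else
		t->slot = new;		/* publish our allocation */
	pthread_mutex_unlock(&t->lock);
	return 0;
}

int main(void)
{
	static struct table t = { .lock = PTHREAD_MUTEX_INITIALIZER };

	return slot_alloc(&t) ? 1 : 0;
}
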
3433 3433
3434 static int __follow_pte(struct mm_struct *mm, unsigned long address, 3434 static int __follow_pte(struct mm_struct *mm, unsigned long address,
3435 pte_t **ptepp, spinlock_t **ptlp) 3435 pte_t **ptepp, spinlock_t **ptlp)
3436 { 3436 {
3437 pgd_t *pgd; 3437 pgd_t *pgd;
3438 pud_t *pud; 3438 pud_t *pud;
3439 pmd_t *pmd; 3439 pmd_t *pmd;
3440 pte_t *ptep; 3440 pte_t *ptep;
3441 3441
3442 pgd = pgd_offset(mm, address); 3442 pgd = pgd_offset(mm, address);
3443 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3443 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3444 goto out; 3444 goto out;
3445 3445
3446 pud = pud_offset(pgd, address); 3446 pud = pud_offset(pgd, address);
3447 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 3447 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3448 goto out; 3448 goto out;
3449 3449
3450 pmd = pmd_offset(pud, address); 3450 pmd = pmd_offset(pud, address);
3451 VM_BUG_ON(pmd_trans_huge(*pmd)); 3451 VM_BUG_ON(pmd_trans_huge(*pmd));
3452 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3452 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3453 goto out; 3453 goto out;
3454 3454
3455 /* We cannot handle huge page PFN maps. Luckily they don't exist. */ 3455 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3456 if (pmd_huge(*pmd)) 3456 if (pmd_huge(*pmd))
3457 goto out; 3457 goto out;
3458 3458
3459 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 3459 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3460 if (!ptep) 3460 if (!ptep)
3461 goto out; 3461 goto out;
3462 if (!pte_present(*ptep)) 3462 if (!pte_present(*ptep))
3463 goto unlock; 3463 goto unlock;
3464 *ptepp = ptep; 3464 *ptepp = ptep;
3465 return 0; 3465 return 0;
3466 unlock: 3466 unlock:
3467 pte_unmap_unlock(ptep, *ptlp); 3467 pte_unmap_unlock(ptep, *ptlp);
3468 out: 3468 out:
3469 return -EINVAL; 3469 return -EINVAL;
3470 } 3470 }
3471 3471
3472 static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3472 static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3473 pte_t **ptepp, spinlock_t **ptlp) 3473 pte_t **ptepp, spinlock_t **ptlp)
3474 { 3474 {
3475 int res; 3475 int res;
3476 3476
3477 /* (void) is needed to make gcc happy */ 3477 /* (void) is needed to make gcc happy */
3478 (void) __cond_lock(*ptlp, 3478 (void) __cond_lock(*ptlp,
3479 !(res = __follow_pte(mm, address, ptepp, ptlp))); 3479 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3480 return res; 3480 return res;
3481 } 3481 }
3482 3482
3483 /** 3483 /**
3484 * follow_pfn - look up PFN at a user virtual address 3484 * follow_pfn - look up PFN at a user virtual address
3485 * @vma: memory mapping 3485 * @vma: memory mapping
3486 * @address: user virtual address 3486 * @address: user virtual address
3487 * @pfn: location to store found PFN 3487 * @pfn: location to store found PFN
3488 * 3488 *
3489 * Only IO mappings and raw PFN mappings are allowed. 3489 * Only IO mappings and raw PFN mappings are allowed.
3490 * 3490 *
3491 * Returns zero and the pfn at @pfn on success, -ve otherwise. 3491 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3492 */ 3492 */
3493 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 3493 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3494 unsigned long *pfn) 3494 unsigned long *pfn)
3495 { 3495 {
3496 int ret = -EINVAL; 3496 int ret = -EINVAL;
3497 spinlock_t *ptl; 3497 spinlock_t *ptl;
3498 pte_t *ptep; 3498 pte_t *ptep;
3499 3499
3500 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3500 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3501 return ret; 3501 return ret;
3502 3502
3503 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); 3503 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3504 if (ret) 3504 if (ret)
3505 return ret; 3505 return ret;
3506 *pfn = pte_pfn(*ptep); 3506 *pfn = pte_pfn(*ptep);
3507 pte_unmap_unlock(ptep, ptl); 3507 pte_unmap_unlock(ptep, ptl);
3508 return 0; 3508 return 0;
3509 } 3509 }
3510 EXPORT_SYMBOL(follow_pfn); 3510 EXPORT_SYMBOL(follow_pfn);
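
A plausible caller of follow_pfn() looks roughly like the sketch below. The helper name is invented; it assumes the caller supplies a valid mm, and it takes mmap_sem for read around the vma lookup so the mapping stays stable while follow_pfn() walks the page tables.

/* Hedged sketch of driver-style use of follow_pfn() on a VM_IO/VM_PFNMAP vma. */
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/errno.h>

static int example_lookup_pfn(struct mm_struct *mm, unsigned long addr,
			      unsigned long *pfn)
{
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (vma && vma->vm_start <= addr)
		ret = follow_pfn(vma, addr, pfn);	/* only VM_IO/VM_PFNMAP vmas succeed */
	up_read(&mm->mmap_sem);

	return ret;
}
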
3511 3511
3512 #ifdef CONFIG_HAVE_IOREMAP_PROT 3512 #ifdef CONFIG_HAVE_IOREMAP_PROT
3513 int follow_phys(struct vm_area_struct *vma, 3513 int follow_phys(struct vm_area_struct *vma,
3514 unsigned long address, unsigned int flags, 3514 unsigned long address, unsigned int flags,
3515 unsigned long *prot, resource_size_t *phys) 3515 unsigned long *prot, resource_size_t *phys)
3516 { 3516 {
3517 int ret = -EINVAL; 3517 int ret = -EINVAL;
3518 pte_t *ptep, pte; 3518 pte_t *ptep, pte;
3519 spinlock_t *ptl; 3519 spinlock_t *ptl;
3520 3520
3521 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3521 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3522 goto out; 3522 goto out;
3523 3523
3524 if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) 3524 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3525 goto out; 3525 goto out;
3526 pte = *ptep; 3526 pte = *ptep;
3527 3527
3528 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3528 if ((flags & FOLL_WRITE) && !pte_write(pte))
3529 goto unlock; 3529 goto unlock;
3530 3530
3531 *prot = pgprot_val(pte_pgprot(pte)); 3531 *prot = pgprot_val(pte_pgprot(pte));
3532 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 3532 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3533 3533
3534 ret = 0; 3534 ret = 0;
3535 unlock: 3535 unlock:
3536 pte_unmap_unlock(ptep, ptl); 3536 pte_unmap_unlock(ptep, ptl);
3537 out: 3537 out:
3538 return ret; 3538 return ret;
3539 } 3539 }
3540 3540
3541 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 3541 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3542 void *buf, int len, int write) 3542 void *buf, int len, int write)
3543 { 3543 {
3544 resource_size_t phys_addr; 3544 resource_size_t phys_addr;
3545 unsigned long prot = 0; 3545 unsigned long prot = 0;
3546 void __iomem *maddr; 3546 void __iomem *maddr;
3547 int offset = addr & (PAGE_SIZE-1); 3547 int offset = addr & (PAGE_SIZE-1);
3548 3548
3549 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3549 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3550 return -EINVAL; 3550 return -EINVAL;
3551 3551
3552 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 3552 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3553 if (write) 3553 if (write)
3554 memcpy_toio(maddr + offset, buf, len); 3554 memcpy_toio(maddr + offset, buf, len);
3555 else 3555 else
3556 memcpy_fromio(buf, maddr + offset, len); 3556 memcpy_fromio(buf, maddr + offset, len);
3557 iounmap(maddr); 3557 iounmap(maddr);
3558 3558
3559 return len; 3559 return len;
3560 } 3560 }
3561 EXPORT_SYMBOL_GPL(generic_access_phys); 3561 EXPORT_SYMBOL_GPL(generic_access_phys);
3562 #endif 3562 #endif
3563 3563
3564 /* 3564 /*
3565 * Access another process' address space as given in mm. If non-NULL, use the 3565 * Access another process' address space as given in mm. If non-NULL, use the
3566 * given task for page fault accounting. 3566 * given task for page fault accounting.
3567 */ 3567 */
3568 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 3568 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3569 unsigned long addr, void *buf, int len, int write) 3569 unsigned long addr, void *buf, int len, int write)
3570 { 3570 {
3571 struct vm_area_struct *vma; 3571 struct vm_area_struct *vma;
3572 void *old_buf = buf; 3572 void *old_buf = buf;
3573 3573
3574 down_read(&mm->mmap_sem); 3574 down_read(&mm->mmap_sem);
3575 /* ignore errors, just check how much was successfully transferred */ 3575 /* ignore errors, just check how much was successfully transferred */
3576 while (len) { 3576 while (len) {
3577 int bytes, ret, offset; 3577 int bytes, ret, offset;
3578 void *maddr; 3578 void *maddr;
3579 struct page *page = NULL; 3579 struct page *page = NULL;
3580 3580
3581 ret = get_user_pages(tsk, mm, addr, 1, 3581 ret = get_user_pages(tsk, mm, addr, 1,
3582 write, 1, &page, &vma); 3582 write, 1, &page, &vma);
3583 if (ret <= 0) { 3583 if (ret <= 0) {
3584 #ifndef CONFIG_HAVE_IOREMAP_PROT 3584 #ifndef CONFIG_HAVE_IOREMAP_PROT
3585 break; 3585 break;
3586 #else 3586 #else
3587 /* 3587 /*
3588 * Check if this is a VM_IO | VM_PFNMAP VMA, which 3588 * Check if this is a VM_IO | VM_PFNMAP VMA, which
3589 * we can access using slightly different code. 3589 * we can access using slightly different code.
3590 */ 3590 */
3591 vma = find_vma(mm, addr); 3591 vma = find_vma(mm, addr);
3592 if (!vma || vma->vm_start > addr) 3592 if (!vma || vma->vm_start > addr)
3593 break; 3593 break;
3594 if (vma->vm_ops && vma->vm_ops->access) 3594 if (vma->vm_ops && vma->vm_ops->access)
3595 ret = vma->vm_ops->access(vma, addr, buf, 3595 ret = vma->vm_ops->access(vma, addr, buf,
3596 len, write); 3596 len, write);
3597 if (ret <= 0) 3597 if (ret <= 0)
3598 break; 3598 break;
3599 bytes = ret; 3599 bytes = ret;
3600 #endif 3600 #endif
3601 } else { 3601 } else {
3602 bytes = len; 3602 bytes = len;
3603 offset = addr & (PAGE_SIZE-1); 3603 offset = addr & (PAGE_SIZE-1);
3604 if (bytes > PAGE_SIZE-offset) 3604 if (bytes > PAGE_SIZE-offset)
3605 bytes = PAGE_SIZE-offset; 3605 bytes = PAGE_SIZE-offset;
3606 3606
3607 maddr = kmap(page); 3607 maddr = kmap(page);
3608 if (write) { 3608 if (write) {
3609 copy_to_user_page(vma, page, addr, 3609 copy_to_user_page(vma, page, addr,
3610 maddr + offset, buf, bytes); 3610 maddr + offset, buf, bytes);
3611 set_page_dirty_lock(page); 3611 set_page_dirty_lock(page);
3612 } else { 3612 } else {
3613 copy_from_user_page(vma, page, addr, 3613 copy_from_user_page(vma, page, addr,
3614 buf, maddr + offset, bytes); 3614 buf, maddr + offset, bytes);
3615 } 3615 }
3616 kunmap(page); 3616 kunmap(page);
3617 page_cache_release(page); 3617 page_cache_release(page);
3618 } 3618 }
3619 len -= bytes; 3619 len -= bytes;
3620 buf += bytes; 3620 buf += bytes;
3621 addr += bytes; 3621 addr += bytes;
3622 } 3622 }
3623 up_read(&mm->mmap_sem); 3623 up_read(&mm->mmap_sem);
3624 3624
3625 return buf - old_buf; 3625 return buf - old_buf;
3626 } 3626 }
3627 3627
3628 /** 3628 /**
3629 * access_remote_vm - access another process' address space 3629 * access_remote_vm - access another process' address space
3630 * @mm: the mm_struct of the target address space 3630 * @mm: the mm_struct of the target address space
3631 * @addr: start address to access 3631 * @addr: start address to access
3632 * @buf: source or destination buffer 3632 * @buf: source or destination buffer
3633 * @len: number of bytes to transfer 3633 * @len: number of bytes to transfer
3634 * @write: whether the access is a write 3634 * @write: whether the access is a write
3635 * 3635 *
3636 * The caller must hold a reference on @mm. 3636 * The caller must hold a reference on @mm.
3637 */ 3637 */
3638 int access_remote_vm(struct mm_struct *mm, unsigned long addr, 3638 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3639 void *buf, int len, int write) 3639 void *buf, int len, int write)
3640 { 3640 {
3641 return __access_remote_vm(NULL, mm, addr, buf, len, write); 3641 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3642 } 3642 }
3643 3643
3644 /* 3644 /*
3645 * Access another process' address space. 3645 * Access another process' address space.
3646 * Source/target buffer must be kernel space, 3646 * Source/target buffer must be kernel space,
3647 * Do not walk the page table directly, use get_user_pages 3647 * Do not walk the page table directly, use get_user_pages
3648 */ 3648 */
3649 int access_process_vm(struct task_struct *tsk, unsigned long addr, 3649 int access_process_vm(struct task_struct *tsk, unsigned long addr,
3650 void *buf, int len, int write) 3650 void *buf, int len, int write)
3651 { 3651 {
3652 struct mm_struct *mm; 3652 struct mm_struct *mm;
3653 int ret; 3653 int ret;
3654 3654
3655 mm = get_task_mm(tsk); 3655 mm = get_task_mm(tsk);
3656 if (!mm) 3656 if (!mm)
3657 return 0; 3657 return 0;
3658 3658
3659 ret = __access_remote_vm(tsk, mm, addr, buf, len, write); 3659 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3660 mmput(mm); 3660 mmput(mm);
3661 3661
3662 return ret; 3662 return ret;
3663 } 3663 }
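
As a usage illustration, here is a hedged sketch of how ptrace-style code reads another task's memory through access_process_vm(); the wrapper name and its error convention are invented, and "tsk" is assumed to be a valid task_struct supplied by the caller.

/* Hedged sketch: read len bytes at addr in another task's address space. */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/errno.h>

static int example_peek(struct task_struct *tsk, unsigned long addr,
			void *out, int len)
{
	int copied = access_process_vm(tsk, addr, out, len, 0 /* read */);

	return (copied == len) ? 0 : -EIO;
}
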
3664 3664
3665 /* 3665 /*
3666 * Print the name of a VMA. 3666 * Print the name of a VMA.
3667 */ 3667 */
3668 void print_vma_addr(char *prefix, unsigned long ip) 3668 void print_vma_addr(char *prefix, unsigned long ip)
3669 { 3669 {
3670 struct mm_struct *mm = current->mm; 3670 struct mm_struct *mm = current->mm;
3671 struct vm_area_struct *vma; 3671 struct vm_area_struct *vma;
3672 3672
3673 /* 3673 /*
3674 * Do not print if we are in atomic 3674 * Do not print if we are in atomic
3675 * contexts (in exception stacks, etc.): 3675 * contexts (in exception stacks, etc.):
3676 */ 3676 */
3677 if (preempt_count()) 3677 if (preempt_count())
3678 return; 3678 return;
3679 3679
3680 down_read(&mm->mmap_sem); 3680 down_read(&mm->mmap_sem);
3681 vma = find_vma(mm, ip); 3681 vma = find_vma(mm, ip);
3682 if (vma && vma->vm_file) { 3682 if (vma && vma->vm_file) {
3683 struct file *f = vma->vm_file; 3683 struct file *f = vma->vm_file;
3684 char *buf = (char *)__get_free_page(GFP_KERNEL); 3684 char *buf = (char *)__get_free_page(GFP_KERNEL);
3685 if (buf) { 3685 if (buf) {
3686 char *p; 3686 char *p;
3687 3687
3688 p = d_path(&f->f_path, buf, PAGE_SIZE); 3688 p = d_path(&f->f_path, buf, PAGE_SIZE);
3689 if (IS_ERR(p)) 3689 if (IS_ERR(p))
3690 p = "?"; 3690 p = "?";
3691 printk("%s%s[%lx+%lx]", prefix, kbasename(p), 3691 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3692 vma->vm_start, 3692 vma->vm_start,
3693 vma->vm_end - vma->vm_start); 3693 vma->vm_end - vma->vm_start);
3694 free_page((unsigned long)buf); 3694 free_page((unsigned long)buf);
3695 } 3695 }
3696 } 3696 }
3697 up_read(&mm->mmap_sem); 3697 up_read(&mm->mmap_sem);
3698 } 3698 }
3699 3699
3700 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) 3700 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
3701 void might_fault(void) 3701 void might_fault(void)
3702 { 3702 {
3703 /* 3703 /*
3704 * Some code (nfs/sunrpc) uses socket ops on kernel memory while 3704 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
3705 * holding the mmap_sem, this is safe because kernel memory doesn't 3705 * holding the mmap_sem, this is safe because kernel memory doesn't
3706 * get paged out, therefore we'll never actually fault, and the 3706 * get paged out, therefore we'll never actually fault, and the
3707 * below annotations will generate false positives. 3707 * below annotations will generate false positives.
3708 */ 3708 */
3709 if (segment_eq(get_fs(), KERNEL_DS)) 3709 if (segment_eq(get_fs(), KERNEL_DS))
3710 return; 3710 return;
3711 3711
3712 /* 3712 /*
3713 * it would be nicer only to annotate paths which are not under 3713 * it would be nicer only to annotate paths which are not under
3714 * pagefault_disable, however that requires a larger audit and 3714 * pagefault_disable, however that requires a larger audit and
3715 * providing helpers like get_user_atomic. 3715 * providing helpers like get_user_atomic.
3716 */ 3716 */
3717 if (in_atomic()) 3717 if (in_atomic())
3718 return; 3718 return;
3719 3719
3720 __might_sleep(__FILE__, __LINE__, 0); 3720 __might_sleep(__FILE__, __LINE__, 0);
3721 3721
3722 if (current->mm) 3722 if (current->mm)
3723 might_lock_read(&current->mm->mmap_sem); 3723 might_lock_read(&current->mm->mmap_sem);
3724 } 3724 }
3725 EXPORT_SYMBOL(might_fault); 3725 EXPORT_SYMBOL(might_fault);
3726 #endif 3726 #endif
3727 3727
3728 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 3728 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3729 static void clear_gigantic_page(struct page *page, 3729 static void clear_gigantic_page(struct page *page,
3730 unsigned long addr, 3730 unsigned long addr,
3731 unsigned int pages_per_huge_page) 3731 unsigned int pages_per_huge_page)
3732 { 3732 {
3733 int i; 3733 int i;
3734 struct page *p = page; 3734 struct page *p = page;
3735 3735
3736 might_sleep(); 3736 might_sleep();
3737 for (i = 0; i < pages_per_huge_page; 3737 for (i = 0; i < pages_per_huge_page;
3738 i++, p = mem_map_next(p, page, i)) { 3738 i++, p = mem_map_next(p, page, i)) {
3739 cond_resched(); 3739 cond_resched();
3740 clear_user_highpage(p, addr + i * PAGE_SIZE); 3740 clear_user_highpage(p, addr + i * PAGE_SIZE);
3741 } 3741 }
3742 } 3742 }
3743 void clear_huge_page(struct page *page, 3743 void clear_huge_page(struct page *page,
3744 unsigned long addr, unsigned int pages_per_huge_page) 3744 unsigned long addr, unsigned int pages_per_huge_page)
3745 { 3745 {
3746 int i; 3746 int i;
3747 3747
3748 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 3748 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3749 clear_gigantic_page(page, addr, pages_per_huge_page); 3749 clear_gigantic_page(page, addr, pages_per_huge_page);
3750 return; 3750 return;
3751 } 3751 }
3752 3752
3753 might_sleep(); 3753 might_sleep();
3754 for (i = 0; i < pages_per_huge_page; i++) { 3754 for (i = 0; i < pages_per_huge_page; i++) {
3755 cond_resched(); 3755 cond_resched();
3756 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 3756 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3757 } 3757 }
3758 } 3758 }
3759 3759
3760 static void copy_user_gigantic_page(struct page *dst, struct page *src, 3760 static void copy_user_gigantic_page(struct page *dst, struct page *src,
3761 unsigned long addr, 3761 unsigned long addr,
3762 struct vm_area_struct *vma, 3762 struct vm_area_struct *vma,
3763 unsigned int pages_per_huge_page) 3763 unsigned int pages_per_huge_page)
3764 { 3764 {
3765 int i; 3765 int i;
3766 struct page *dst_base = dst; 3766 struct page *dst_base = dst;
3767 struct page *src_base = src; 3767 struct page *src_base = src;
3768 3768
3769 for (i = 0; i < pages_per_huge_page; ) { 3769 for (i = 0; i < pages_per_huge_page; ) {
3770 cond_resched(); 3770 cond_resched();
3771 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 3771 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3772 3772
3773 i++; 3773 i++;
3774 dst = mem_map_next(dst, dst_base, i); 3774 dst = mem_map_next(dst, dst_base, i);
3775 src = mem_map_next(src, src_base, i); 3775 src = mem_map_next(src, src_base, i);
3776 } 3776 }
3777 } 3777 }
3778 3778
3779 void copy_user_huge_page(struct page *dst, struct page *src, 3779 void copy_user_huge_page(struct page *dst, struct page *src,
3780 unsigned long addr, struct vm_area_struct *vma, 3780 unsigned long addr, struct vm_area_struct *vma,
3781 unsigned int pages_per_huge_page) 3781 unsigned int pages_per_huge_page)
3782 { 3782 {
3783 int i; 3783 int i;
3784 3784
3785 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 3785 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3786 copy_user_gigantic_page(dst, src, addr, vma, 3786 copy_user_gigantic_page(dst, src, addr, vma,
3787 pages_per_huge_page); 3787 pages_per_huge_page);
3788 return; 3788 return;
3789 } 3789 }
3790 3790
3791 might_sleep(); 3791 might_sleep();
3792 for (i = 0; i < pages_per_huge_page; i++) { 3792 for (i = 0; i < pages_per_huge_page; i++) {
3793 cond_resched(); 3793 cond_resched();
3794 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 3794 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3795 } 3795 }
3796 } 3796 }
3797 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 3797 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
3798 3798
3799 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 3799 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
3800 3800
3801 static struct kmem_cache *page_ptl_cachep; 3801 static struct kmem_cache *page_ptl_cachep;
3802 3802
3803 void __init ptlock_cache_init(void) 3803 void __init ptlock_cache_init(void)
3804 { 3804 {
3805 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, 3805 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
3806 SLAB_PANIC, NULL); 3806 SLAB_PANIC, NULL);
3807 } 3807 }
3808 3808
3809 bool ptlock_alloc(struct page *page) 3809 bool ptlock_alloc(struct page *page)
3810 { 3810 {
3811 spinlock_t *ptl; 3811 spinlock_t *ptl;
3812 3812
3813 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); 3813 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
3814 if (!ptl) 3814 if (!ptl)
3815 return false; 3815 return false;
3816 page->ptl = ptl; 3816 page->ptl = ptl;
3817 return true; 3817 return true;
3818 } 3818 }
3819 3819
3820 void ptlock_free(struct page *page) 3820 void ptlock_free(struct page *page)
3821 { 3821 {
3822 kmem_cache_free(page_ptl_cachep, page->ptl); 3822 kmem_cache_free(page_ptl_cachep, page->ptl);
3823 } 3823 }
3824 #endif 3824 #endif
3825 3825
1 /* 1 /*
2 * linux/mm/slab.c 2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97. 3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk) 4 * (markhe@nextd.demon.co.uk)
5 * 5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli 6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 * 7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays 8 * Major cleanup, different bufctl logic, per-cpu arrays
9 * (c) 2000 Manfred Spraul 9 * (c) 2000 Manfred Spraul
10 * 10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA 11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * (c) 2002 Manfred Spraul 12 * (c) 2002 Manfred Spraul
13 * 13 *
14 * An implementation of the Slab Allocator as described in outline in; 14 * An implementation of the Slab Allocator as described in outline in;
15 * UNIX Internals: The New Frontiers by Uresh Vahalia 15 * UNIX Internals: The New Frontiers by Uresh Vahalia
16 * Pub: Prentice Hall ISBN 0-13-101908-2 16 * Pub: Prentice Hall ISBN 0-13-101908-2
17 * or with a little more detail in; 17 * or with a little more detail in;
18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator 18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 * Jeff Bonwick (Sun Microsystems). 19 * Jeff Bonwick (Sun Microsystems).
20 * Presented at: USENIX Summer 1994 Technical Conference 20 * Presented at: USENIX Summer 1994 Technical Conference
21 * 21 *
22 * The memory is organized in caches, one cache for each object type. 22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) 23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists out of many slabs (they are small (usually one 24 * Each cache consists out of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple 25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects. 26 * initialized objects.
27 * 27 *
28 * This means, that your constructor is used only for newly allocated 28 * This means, that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to 29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free. 30 * kmem_cache_free.
31 * 31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, 32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a new 33 * normal). If you need a special memory type, then you must create a new
34 * cache for that memory type. 34 * cache for that memory type.
35 * 35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups: 36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 * full slabs with 0 free objects 37 * full slabs with 0 free objects
38 * partial slabs 38 * partial slabs
39 * empty slabs with no allocated objects 39 * empty slabs with no allocated objects
40 * 40 *
41 * If partial slabs exist, then new allocations come from these slabs, 41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise from empty slabs or new slabs are allocated. 42 * otherwise from empty slabs or new slabs are allocated.
43 * 43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache 44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs. 45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 * 46 *
47 * Each cache has a short per-cpu head array, most allocs 47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2 48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache. 49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
57 * constructors and destructors are called without any locking. 57 * constructors and destructors are called without any locking.
58 * Several members in struct kmem_cache and struct slab never change, they 58 * Several members in struct kmem_cache and struct slab never change, they
59 * are accessed without any locking. 59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking, 60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe. 61 * and local interrupts are disabled so slab code is preempt-safe.
62 * The non-constant members are protected with a per-cache irq spinlock. 62 * The non-constant members are protected with a per-cache irq spinlock.
63 * 63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch 64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from 65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch. 66 * his patch.
67 * 67 *
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'slab_mutex'. 71 * The global cache-chain is protected by the mutex 'slab_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
75 * 75 *
76 * At present, each engine can be growing a cache. This should be blocked. 76 * At present, each engine can be growing a cache. This should be blocked.
77 * 77 *
78 * 15 March 2005. NUMA slab allocator. 78 * 15 March 2005. NUMA slab allocator.
79 * Shai Fultheim <shai@scalex86.org>. 79 * Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com> 80 * Shobhit Dayal <shobhit@calsoftinc.com>
81 * Alok N Kataria <alokk@calsoftinc.com> 81 * Alok N Kataria <alokk@calsoftinc.com>
82 * Christoph Lameter <christoph@lameter.com> 82 * Christoph Lameter <christoph@lameter.com>
83 * 83 *
84 * Modified the slab allocator to be node aware on NUMA systems. 84 * Modified the slab allocator to be node aware on NUMA systems.
85 * Each node has its own list of partial, free and full slabs. 85 * Each node has its own list of partial, free and full slabs.
86 * All object allocations for a node occur from node specific slab lists. 86 * All object allocations for a node occur from node specific slab lists.
87 */ 87 */
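
To make the cache and constructor rules described above concrete, here is a hedged sketch of typical kmem_cache usage; the object type, names, and flags are chosen purely for illustration. The constructor runs only when a new slab is populated, so the free path hands objects back in that same constructed state.

/* Hedged sketch of the slab cache API described in the header comment. */
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/errno.h>

struct example_obj {
	spinlock_t lock;
	int refs;
};

static struct kmem_cache *example_cachep;

static void example_ctor(void *p)
{
	struct example_obj *obj = p;

	spin_lock_init(&obj->lock);	/* runs once per object, when its slab is created */
	obj->refs = 0;
}

static int __init example_cache_init(void)
{
	example_cachep = kmem_cache_create("example_obj",
					   sizeof(struct example_obj),
					   0, SLAB_HWCACHE_ALIGN, example_ctor);
	return example_cachep ? 0 : -ENOMEM;
}

static struct example_obj *example_obj_get(void)
{
	return kmem_cache_alloc(example_cachep, GFP_KERNEL);
}

static void example_obj_put(struct example_obj *obj)
{
	obj->refs = 0;				/* hand back in constructed state */
	kmem_cache_free(example_cachep, obj);
}
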
88 88
89 #include <linux/slab.h> 89 #include <linux/slab.h>
90 #include <linux/mm.h> 90 #include <linux/mm.h>
91 #include <linux/poison.h> 91 #include <linux/poison.h>
92 #include <linux/swap.h> 92 #include <linux/swap.h>
93 #include <linux/cache.h> 93 #include <linux/cache.h>
94 #include <linux/interrupt.h> 94 #include <linux/interrupt.h>
95 #include <linux/init.h> 95 #include <linux/init.h>
96 #include <linux/compiler.h> 96 #include <linux/compiler.h>
97 #include <linux/cpuset.h> 97 #include <linux/cpuset.h>
98 #include <linux/proc_fs.h> 98 #include <linux/proc_fs.h>
99 #include <linux/seq_file.h> 99 #include <linux/seq_file.h>
100 #include <linux/notifier.h> 100 #include <linux/notifier.h>
101 #include <linux/kallsyms.h> 101 #include <linux/kallsyms.h>
102 #include <linux/cpu.h> 102 #include <linux/cpu.h>
103 #include <linux/sysctl.h> 103 #include <linux/sysctl.h>
104 #include <linux/module.h> 104 #include <linux/module.h>
105 #include <linux/rcupdate.h> 105 #include <linux/rcupdate.h>
106 #include <linux/string.h> 106 #include <linux/string.h>
107 #include <linux/uaccess.h> 107 #include <linux/uaccess.h>
108 #include <linux/nodemask.h> 108 #include <linux/nodemask.h>
109 #include <linux/kmemleak.h> 109 #include <linux/kmemleak.h>
110 #include <linux/mempolicy.h> 110 #include <linux/mempolicy.h>
111 #include <linux/mutex.h> 111 #include <linux/mutex.h>
112 #include <linux/fault-inject.h> 112 #include <linux/fault-inject.h>
113 #include <linux/rtmutex.h> 113 #include <linux/rtmutex.h>
114 #include <linux/reciprocal_div.h> 114 #include <linux/reciprocal_div.h>
115 #include <linux/debugobjects.h> 115 #include <linux/debugobjects.h>
116 #include <linux/kmemcheck.h> 116 #include <linux/kmemcheck.h>
117 #include <linux/memory.h> 117 #include <linux/memory.h>
118 #include <linux/prefetch.h> 118 #include <linux/prefetch.h>
119 119
120 #include <net/sock.h> 120 #include <net/sock.h>
121 121
122 #include <asm/cacheflush.h> 122 #include <asm/cacheflush.h>
123 #include <asm/tlbflush.h> 123 #include <asm/tlbflush.h>
124 #include <asm/page.h> 124 #include <asm/page.h>
125 125
126 #include <trace/events/kmem.h> 126 #include <trace/events/kmem.h>
127 127
128 #include "internal.h" 128 #include "internal.h"
129 129
130 #include "slab.h" 130 #include "slab.h"
131 131
132 /* 132 /*
133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
134 * 0 for faster, smaller code (especially in the critical paths). 134 * 0 for faster, smaller code (especially in the critical paths).
135 * 135 *
136 * STATS - 1 to collect stats for /proc/slabinfo. 136 * STATS - 1 to collect stats for /proc/slabinfo.
137 * 0 for faster, smaller code (especially in the critical paths). 137 * 0 for faster, smaller code (especially in the critical paths).
138 * 138 *
139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
140 */ 140 */
141 141
142 #ifdef CONFIG_DEBUG_SLAB 142 #ifdef CONFIG_DEBUG_SLAB
143 #define DEBUG 1 143 #define DEBUG 1
144 #define STATS 1 144 #define STATS 1
145 #define FORCED_DEBUG 1 145 #define FORCED_DEBUG 1
146 #else 146 #else
147 #define DEBUG 0 147 #define DEBUG 0
148 #define STATS 0 148 #define STATS 0
149 #define FORCED_DEBUG 0 149 #define FORCED_DEBUG 0
150 #endif 150 #endif
151 151
152 /* Shouldn't this be in a header file somewhere? */ 152 /* Shouldn't this be in a header file somewhere? */
153 #define BYTES_PER_WORD sizeof(void *) 153 #define BYTES_PER_WORD sizeof(void *)
154 #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 154 #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
155 155
156 #ifndef ARCH_KMALLOC_FLAGS 156 #ifndef ARCH_KMALLOC_FLAGS
157 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 157 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
158 #endif 158 #endif
159 159
160 #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ 160 #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
161 <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) 161 <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
162 162
163 #if FREELIST_BYTE_INDEX 163 #if FREELIST_BYTE_INDEX
164 typedef unsigned char freelist_idx_t; 164 typedef unsigned char freelist_idx_t;
165 #else 165 #else
166 typedef unsigned short freelist_idx_t; 166 typedef unsigned short freelist_idx_t;
167 #endif 167 #endif
168 168
169 #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 169 #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
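
A worked example of how FREELIST_BYTE_INDEX falls out of the definition above, assuming the common case of PAGE_SIZE == 4096 and BITS_PER_BYTE == 8 (values chosen for illustration):

/*
 *   PAGE_SIZE >> BITS_PER_BYTE == 4096 >> 8 == 16
 *
 * If SLAB_OBJ_MIN_SIZE is at least 16 bytes, one page holds at most
 * 4096 / 16 == 256 objects, so every freelist index fits in the 0..255
 * range of an unsigned char and FREELIST_BYTE_INDEX evaluates to 1;
 * otherwise an unsigned short index is used.
 */
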
170 170
171 /* 171 /*
172 * true if a page was allocated from pfmemalloc reserves for network-based 172 * true if a page was allocated from pfmemalloc reserves for network-based
173 * swap 173 * swap
174 */ 174 */
175 static bool pfmemalloc_active __read_mostly; 175 static bool pfmemalloc_active __read_mostly;
176 176
177 /* 177 /*
178 * struct array_cache 178 * struct array_cache
179 * 179 *
180 * Purpose: 180 * Purpose:
181 * - LIFO ordering, to hand out cache-warm objects from _alloc 181 * - LIFO ordering, to hand out cache-warm objects from _alloc
182 * - reduce the number of linked list operations 182 * - reduce the number of linked list operations
183 * - reduce spinlock operations 183 * - reduce spinlock operations
184 * 184 *
185 * The limit is stored in the per-cpu structure to reduce the data cache 185 * The limit is stored in the per-cpu structure to reduce the data cache
186 * footprint. 186 * footprint.
187 * 187 *
188 */ 188 */
189 struct array_cache { 189 struct array_cache {
190 unsigned int avail; 190 unsigned int avail;
191 unsigned int limit; 191 unsigned int limit;
192 unsigned int batchcount; 192 unsigned int batchcount;
193 unsigned int touched; 193 unsigned int touched;
194 void *entry[]; /* 194 void *entry[]; /*
195 * Must have this definition in here for the proper 195 * Must have this definition in here for the proper
196 * alignment of array_cache. Also simplifies accessing 196 * alignment of array_cache. Also simplifies accessing
197 * the entries. 197 * the entries.
198 * 198 *
199 * Entries should not be directly dereferenced as 199 * Entries should not be directly dereferenced as
200 * entries belonging to slabs marked pfmemalloc will 200 * entries belonging to slabs marked pfmemalloc will
201 * have the lower bits set SLAB_OBJ_PFMEMALLOC 201 * have the lower bits set SLAB_OBJ_PFMEMALLOC
202 */ 202 */
203 }; 203 };
204 204
205 struct alien_cache { 205 struct alien_cache {
206 spinlock_t lock; 206 spinlock_t lock;
207 struct array_cache ac; 207 struct array_cache ac;
208 }; 208 };
209 209
210 #define SLAB_OBJ_PFMEMALLOC 1 210 #define SLAB_OBJ_PFMEMALLOC 1
211 static inline bool is_obj_pfmemalloc(void *objp) 211 static inline bool is_obj_pfmemalloc(void *objp)
212 { 212 {
213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; 213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
214 } 214 }
215 215
216 static inline void set_obj_pfmemalloc(void **objp) 216 static inline void set_obj_pfmemalloc(void **objp)
217 { 217 {
218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); 218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
219 return; 219 return;
220 } 220 }
221 221
222 static inline void clear_obj_pfmemalloc(void **objp) 222 static inline void clear_obj_pfmemalloc(void **objp)
223 { 223 {
224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); 224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
225 } 225 }
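/*
 * Illustrative use of the three helpers above (hypothetical objp, not
 * part of the original flow).  Slab objects are at least word aligned,
 * so the low pointer bit is free to carry the pfmemalloc marker without
 * widening the per-cpu entry[] array:
 *
 *	set_obj_pfmemalloc(&objp);	=> objp | SLAB_OBJ_PFMEMALLOC
 *	is_obj_pfmemalloc(objp);	=> true for the tagged pointer
 *	clear_obj_pfmemalloc(&objp);	=> original pointer restored
 */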
226 226
227 /* 227 /*
228 * bootstrap: The caches do not work without cpuarrays anymore, but the 228 * bootstrap: The caches do not work without cpuarrays anymore, but the
229 * cpuarrays are allocated from the generic caches... 229 * cpuarrays are allocated from the generic caches...
230 */ 230 */
231 #define BOOT_CPUCACHE_ENTRIES 1 231 #define BOOT_CPUCACHE_ENTRIES 1
232 struct arraycache_init { 232 struct arraycache_init {
233 struct array_cache cache; 233 struct array_cache cache;
234 void *entries[BOOT_CPUCACHE_ENTRIES]; 234 void *entries[BOOT_CPUCACHE_ENTRIES];
235 }; 235 };
236 236
237 /* 237 /*
238 * Need this for bootstrapping a per node allocator. 238 * Need this for bootstrapping a per node allocator.
239 */ 239 */
240 #define NUM_INIT_LISTS (3 * MAX_NUMNODES) 240 #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
241 static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; 241 static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
242 #define CACHE_CACHE 0 242 #define CACHE_CACHE 0
243 #define SIZE_AC MAX_NUMNODES 243 #define SIZE_AC MAX_NUMNODES
244 #define SIZE_NODE (2 * MAX_NUMNODES) 244 #define SIZE_NODE (2 * MAX_NUMNODES)
245 245
246 static int drain_freelist(struct kmem_cache *cache, 246 static int drain_freelist(struct kmem_cache *cache,
247 struct kmem_cache_node *n, int tofree); 247 struct kmem_cache_node *n, int tofree);
248 static void free_block(struct kmem_cache *cachep, void **objpp, int len, 248 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
249 int node, struct list_head *list); 249 int node, struct list_head *list);
250 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); 250 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
251 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 251 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
252 static void cache_reap(struct work_struct *unused); 252 static void cache_reap(struct work_struct *unused);
253 253
254 static int slab_early_init = 1; 254 static int slab_early_init = 1;
255 255
256 #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) 256 #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
257 #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) 257 #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
258 258
259 static void kmem_cache_node_init(struct kmem_cache_node *parent) 259 static void kmem_cache_node_init(struct kmem_cache_node *parent)
260 { 260 {
261 INIT_LIST_HEAD(&parent->slabs_full); 261 INIT_LIST_HEAD(&parent->slabs_full);
262 INIT_LIST_HEAD(&parent->slabs_partial); 262 INIT_LIST_HEAD(&parent->slabs_partial);
263 INIT_LIST_HEAD(&parent->slabs_free); 263 INIT_LIST_HEAD(&parent->slabs_free);
264 parent->shared = NULL; 264 parent->shared = NULL;
265 parent->alien = NULL; 265 parent->alien = NULL;
266 parent->colour_next = 0; 266 parent->colour_next = 0;
267 spin_lock_init(&parent->list_lock); 267 spin_lock_init(&parent->list_lock);
268 parent->free_objects = 0; 268 parent->free_objects = 0;
269 parent->free_touched = 0; 269 parent->free_touched = 0;
270 } 270 }
271 271
272 #define MAKE_LIST(cachep, listp, slab, nodeid) \ 272 #define MAKE_LIST(cachep, listp, slab, nodeid) \
273 do { \ 273 do { \
274 INIT_LIST_HEAD(listp); \ 274 INIT_LIST_HEAD(listp); \
275 list_splice(&get_node(cachep, nodeid)->slab, listp); \ 275 list_splice(&get_node(cachep, nodeid)->slab, listp); \
276 } while (0) 276 } while (0)
277 277
278 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 278 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
279 do { \ 279 do { \
280 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 280 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
281 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 281 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
282 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 282 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
283 } while (0) 283 } while (0)
284 284
285 #define CFLGS_OFF_SLAB (0x80000000UL) 285 #define CFLGS_OFF_SLAB (0x80000000UL)
286 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 286 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
287 287
288 #define BATCHREFILL_LIMIT 16 288 #define BATCHREFILL_LIMIT 16
289 /* 289 /*
290 * Optimization question: fewer reaps mean a lower probability of unnecessary 290 * Optimization question: fewer reaps mean a lower probability of unnecessary
291 * cpucache drain/refill cycles. 291 * cpucache drain/refill cycles.
292 * 292 *
293 * OTOH the cpuarrays can contain lots of objects, 293 * OTOH the cpuarrays can contain lots of objects,
294 * which could lock up otherwise freeable slabs. 294 * which could lock up otherwise freeable slabs.
295 */ 295 */
296 #define REAPTIMEOUT_AC (2*HZ) 296 #define REAPTIMEOUT_AC (2*HZ)
297 #define REAPTIMEOUT_NODE (4*HZ) 297 #define REAPTIMEOUT_NODE (4*HZ)
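/*
 * Example (illustrative; assumes the common HZ == 250): cache_reap()
 * revisits a cpu's array_caches roughly every 2 seconds (500 jiffies)
 * and the per-node lists roughly every 4 seconds, with the start time
 * jittered per cpu by start_cpu_timer() below so the CPUs do not reap
 * in lockstep.
 */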
298 298
299 #if STATS 299 #if STATS
300 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 300 #define STATS_INC_ACTIVE(x) ((x)->num_active++)
301 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 301 #define STATS_DEC_ACTIVE(x) ((x)->num_active--)
302 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 302 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
303 #define STATS_INC_GROWN(x) ((x)->grown++) 303 #define STATS_INC_GROWN(x) ((x)->grown++)
304 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) 304 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
305 #define STATS_SET_HIGH(x) \ 305 #define STATS_SET_HIGH(x) \
306 do { \ 306 do { \
307 if ((x)->num_active > (x)->high_mark) \ 307 if ((x)->num_active > (x)->high_mark) \
308 (x)->high_mark = (x)->num_active; \ 308 (x)->high_mark = (x)->num_active; \
309 } while (0) 309 } while (0)
310 #define STATS_INC_ERR(x) ((x)->errors++) 310 #define STATS_INC_ERR(x) ((x)->errors++)
311 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 311 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
312 #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 312 #define STATS_INC_NODEFREES(x) ((x)->node_frees++)
313 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 313 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
314 #define STATS_SET_FREEABLE(x, i) \ 314 #define STATS_SET_FREEABLE(x, i) \
315 do { \ 315 do { \
316 if ((x)->max_freeable < i) \ 316 if ((x)->max_freeable < i) \
317 (x)->max_freeable = i; \ 317 (x)->max_freeable = i; \
318 } while (0) 318 } while (0)
319 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 319 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
320 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 320 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
321 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 321 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
322 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 322 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
323 #else 323 #else
324 #define STATS_INC_ACTIVE(x) do { } while (0) 324 #define STATS_INC_ACTIVE(x) do { } while (0)
325 #define STATS_DEC_ACTIVE(x) do { } while (0) 325 #define STATS_DEC_ACTIVE(x) do { } while (0)
326 #define STATS_INC_ALLOCED(x) do { } while (0) 326 #define STATS_INC_ALLOCED(x) do { } while (0)
327 #define STATS_INC_GROWN(x) do { } while (0) 327 #define STATS_INC_GROWN(x) do { } while (0)
328 #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) 328 #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
329 #define STATS_SET_HIGH(x) do { } while (0) 329 #define STATS_SET_HIGH(x) do { } while (0)
330 #define STATS_INC_ERR(x) do { } while (0) 330 #define STATS_INC_ERR(x) do { } while (0)
331 #define STATS_INC_NODEALLOCS(x) do { } while (0) 331 #define STATS_INC_NODEALLOCS(x) do { } while (0)
332 #define STATS_INC_NODEFREES(x) do { } while (0) 332 #define STATS_INC_NODEFREES(x) do { } while (0)
333 #define STATS_INC_ACOVERFLOW(x) do { } while (0) 333 #define STATS_INC_ACOVERFLOW(x) do { } while (0)
334 #define STATS_SET_FREEABLE(x, i) do { } while (0) 334 #define STATS_SET_FREEABLE(x, i) do { } while (0)
335 #define STATS_INC_ALLOCHIT(x) do { } while (0) 335 #define STATS_INC_ALLOCHIT(x) do { } while (0)
336 #define STATS_INC_ALLOCMISS(x) do { } while (0) 336 #define STATS_INC_ALLOCMISS(x) do { } while (0)
337 #define STATS_INC_FREEHIT(x) do { } while (0) 337 #define STATS_INC_FREEHIT(x) do { } while (0)
338 #define STATS_INC_FREEMISS(x) do { } while (0) 338 #define STATS_INC_FREEMISS(x) do { } while (0)
339 #endif 339 #endif
340 340
341 #if DEBUG 341 #if DEBUG
342 342
343 /* 343 /*
344 * memory layout of objects: 344 * memory layout of objects:
345 * 0 : objp 345 * 0 : objp
346 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 346 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
347 * the end of an object is aligned with the end of the real 347 * the end of an object is aligned with the end of the real
348 * allocation. Catches writes behind the end of the allocation. 348 * allocation. Catches writes behind the end of the allocation.
349 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 349 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
350 * redzone word. 350 * redzone word.
351 * cachep->obj_offset: The real object. 351 * cachep->obj_offset: The real object.
352 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 352 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
353 * cachep->size - 1* BYTES_PER_WORD: last caller address 353 * cachep->size - 1* BYTES_PER_WORD: last caller address
354 * [BYTES_PER_WORD long] 354 * [BYTES_PER_WORD long]
355 */ 355 */
356 static int obj_offset(struct kmem_cache *cachep) 356 static int obj_offset(struct kmem_cache *cachep)
357 { 357 {
358 return cachep->obj_offset; 358 return cachep->obj_offset;
359 } 359 }
360 360
361 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 361 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
362 { 362 {
363 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 363 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
364 return (unsigned long long*) (objp + obj_offset(cachep) - 364 return (unsigned long long*) (objp + obj_offset(cachep) -
365 sizeof(unsigned long long)); 365 sizeof(unsigned long long));
366 } 366 }
367 367
368 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 368 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
369 { 369 {
370 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 370 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
371 if (cachep->flags & SLAB_STORE_USER) 371 if (cachep->flags & SLAB_STORE_USER)
372 return (unsigned long long *)(objp + cachep->size - 372 return (unsigned long long *)(objp + cachep->size -
373 sizeof(unsigned long long) - 373 sizeof(unsigned long long) -
374 REDZONE_ALIGN); 374 REDZONE_ALIGN);
375 return (unsigned long long *) (objp + cachep->size - 375 return (unsigned long long *) (objp + cachep->size -
376 sizeof(unsigned long long)); 376 sizeof(unsigned long long));
377 } 377 }
378 378
379 static void **dbg_userword(struct kmem_cache *cachep, void *objp) 379 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
380 { 380 {
381 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 381 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
382 return (void **)(objp + cachep->size - BYTES_PER_WORD); 382 return (void **)(objp + cachep->size - BYTES_PER_WORD);
383 } 383 }
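/*
 * Concrete reading of the layout above (illustrative, 64-bit, both
 * SLAB_RED_ZONE and SLAB_STORE_USER set): dbg_redzone1() is the
 * unsigned long long immediately before cachep->obj_offset,
 * dbg_userword() is the final BYTES_PER_WORD of the allocation, and
 * dbg_redzone2() sits sizeof(unsigned long long) + REDZONE_ALIGN bytes
 * before the end so the trailing caller slot stays aligned.  A write
 * that runs past the real object therefore lands in a known red-zone
 * word instead of a neighbouring object.
 */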
384 384
385 #else 385 #else
386 386
387 #define obj_offset(x) 0 387 #define obj_offset(x) 0
388 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 388 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
389 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 389 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
390 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 390 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
391 391
392 #endif 392 #endif
393 393
394 #define OBJECT_FREE (0) 394 #define OBJECT_FREE (0)
395 #define OBJECT_ACTIVE (1) 395 #define OBJECT_ACTIVE (1)
396 396
397 #ifdef CONFIG_DEBUG_SLAB_LEAK 397 #ifdef CONFIG_DEBUG_SLAB_LEAK
398 398
399 static void set_obj_status(struct page *page, int idx, int val) 399 static void set_obj_status(struct page *page, int idx, int val)
400 { 400 {
401 int freelist_size; 401 int freelist_size;
402 char *status; 402 char *status;
403 struct kmem_cache *cachep = page->slab_cache; 403 struct kmem_cache *cachep = page->slab_cache;
404 404
405 freelist_size = cachep->num * sizeof(freelist_idx_t); 405 freelist_size = cachep->num * sizeof(freelist_idx_t);
406 status = (char *)page->freelist + freelist_size; 406 status = (char *)page->freelist + freelist_size;
407 status[idx] = val; 407 status[idx] = val;
408 } 408 }
409 409
410 static inline unsigned int get_obj_status(struct page *page, int idx) 410 static inline unsigned int get_obj_status(struct page *page, int idx)
411 { 411 {
412 int freelist_size; 412 int freelist_size;
413 char *status; 413 char *status;
414 struct kmem_cache *cachep = page->slab_cache; 414 struct kmem_cache *cachep = page->slab_cache;
415 415
416 freelist_size = cachep->num * sizeof(freelist_idx_t); 416 freelist_size = cachep->num * sizeof(freelist_idx_t);
417 status = (char *)page->freelist + freelist_size; 417 status = (char *)page->freelist + freelist_size;
418 418
419 return status[idx]; 419 return status[idx];
420 } 420 }
421 421
422 #else 422 #else
423 static inline void set_obj_status(struct page *page, int idx, int val) {} 423 static inline void set_obj_status(struct page *page, int idx, int val) {}
424 424
425 #endif 425 #endif
426 426
427 /* 427 /*
428 * Do not go above this order unless 0 objects fit into the slab or 428 * Do not go above this order unless 0 objects fit into the slab or
429 * overridden on the command line. 429 * overridden on the command line.
430 */ 430 */
431 #define SLAB_MAX_ORDER_HI 1 431 #define SLAB_MAX_ORDER_HI 1
432 #define SLAB_MAX_ORDER_LO 0 432 #define SLAB_MAX_ORDER_LO 0
433 static int slab_max_order = SLAB_MAX_ORDER_LO; 433 static int slab_max_order = SLAB_MAX_ORDER_LO;
434 static bool slab_max_order_set __initdata; 434 static bool slab_max_order_set __initdata;
435 435
436 static inline struct kmem_cache *virt_to_cache(const void *obj) 436 static inline struct kmem_cache *virt_to_cache(const void *obj)
437 { 437 {
438 struct page *page = virt_to_head_page(obj); 438 struct page *page = virt_to_head_page(obj);
439 return page->slab_cache; 439 return page->slab_cache;
440 } 440 }
441 441
442 static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, 442 static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
443 unsigned int idx) 443 unsigned int idx)
444 { 444 {
445 return page->s_mem + cache->size * idx; 445 return page->s_mem + cache->size * idx;
446 } 446 }
447 447
448 /* 448 /*
449 * We want to avoid an expensive divide : (offset / cache->size) 449 * We want to avoid an expensive divide : (offset / cache->size)
450 * Using the fact that size is a constant for a particular cache, 450 * Using the fact that size is a constant for a particular cache,
451 * we can replace (offset / cache->size) by 451 * we can replace (offset / cache->size) by
452 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 452 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
453 */ 453 */
454 static inline unsigned int obj_to_index(const struct kmem_cache *cache, 454 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
455 const struct page *page, void *obj) 455 const struct page *page, void *obj)
456 { 456 {
457 u32 offset = (obj - page->s_mem); 457 u32 offset = (obj - page->s_mem);
458 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 458 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
459 } 459 }
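/*
 * Worked example (illustrative values): for a cache whose objects are
 * 192 bytes apart, cache->reciprocal_buffer_size holds
 * reciprocal_value(192) computed once at cache setup, so the per-free
 * "offset / 192" above becomes a multiply-and-shift; e.g. an object at
 * offset 960 from page->s_mem maps to index 5 with no divide on the
 * hot path.
 */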
460 460
461 static struct arraycache_init initarray_generic = 461 static struct arraycache_init initarray_generic =
462 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 462 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
463 463
464 /* internal cache of cache description objs */ 464 /* internal cache of cache description objs */
465 static struct kmem_cache kmem_cache_boot = { 465 static struct kmem_cache kmem_cache_boot = {
466 .batchcount = 1, 466 .batchcount = 1,
467 .limit = BOOT_CPUCACHE_ENTRIES, 467 .limit = BOOT_CPUCACHE_ENTRIES,
468 .shared = 1, 468 .shared = 1,
469 .size = sizeof(struct kmem_cache), 469 .size = sizeof(struct kmem_cache),
470 .name = "kmem_cache", 470 .name = "kmem_cache",
471 }; 471 };
472 472
473 #define BAD_ALIEN_MAGIC 0x01020304ul 473 #define BAD_ALIEN_MAGIC 0x01020304ul
474 474
475 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 475 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
476 476
477 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 477 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
478 { 478 {
479 return cachep->array[smp_processor_id()]; 479 return cachep->array[smp_processor_id()];
480 } 480 }
481 481
482 static size_t calculate_freelist_size(int nr_objs, size_t align) 482 static size_t calculate_freelist_size(int nr_objs, size_t align)
483 { 483 {
484 size_t freelist_size; 484 size_t freelist_size;
485 485
486 freelist_size = nr_objs * sizeof(freelist_idx_t); 486 freelist_size = nr_objs * sizeof(freelist_idx_t);
487 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 487 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
488 freelist_size += nr_objs * sizeof(char); 488 freelist_size += nr_objs * sizeof(char);
489 489
490 if (align) 490 if (align)
491 freelist_size = ALIGN(freelist_size, align); 491 freelist_size = ALIGN(freelist_size, align);
492 492
493 return freelist_size; 493 return freelist_size;
494 } 494 }
495 495
496 static int calculate_nr_objs(size_t slab_size, size_t buffer_size, 496 static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
497 size_t idx_size, size_t align) 497 size_t idx_size, size_t align)
498 { 498 {
499 int nr_objs; 499 int nr_objs;
500 size_t remained_size; 500 size_t remained_size;
501 size_t freelist_size; 501 size_t freelist_size;
502 int extra_space = 0; 502 int extra_space = 0;
503 503
504 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 504 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
505 extra_space = sizeof(char); 505 extra_space = sizeof(char);
506 /* 506 /*
507 * Ignore padding for the initial guess. The padding 507 * Ignore padding for the initial guess. The padding
508 * is at most @align-1 bytes, and @buffer_size is at 508 * is at most @align-1 bytes, and @buffer_size is at
509 * least @align. In the worst case, this result will 509 * least @align. In the worst case, this result will
510 * be one greater than the number of objects that fit 510 * be one greater than the number of objects that fit
511 * into the memory allocation when taking the padding 511 * into the memory allocation when taking the padding
512 * into account. 512 * into account.
513 */ 513 */
514 nr_objs = slab_size / (buffer_size + idx_size + extra_space); 514 nr_objs = slab_size / (buffer_size + idx_size + extra_space);
515 515
516 /* 516 /*
517 * This calculated number will be either the right 517 * This calculated number will be either the right
518 * amount, or one greater than what we want. 518 * amount, or one greater than what we want.
519 */ 519 */
520 remained_size = slab_size - nr_objs * buffer_size; 520 remained_size = slab_size - nr_objs * buffer_size;
521 freelist_size = calculate_freelist_size(nr_objs, align); 521 freelist_size = calculate_freelist_size(nr_objs, align);
522 if (remained_size < freelist_size) 522 if (remained_size < freelist_size)
523 nr_objs--; 523 nr_objs--;
524 524
525 return nr_objs; 525 return nr_objs;
526 } 526 }
527 527
528 /* 528 /*
529 * Calculate the number of objects and left-over bytes for a given buffer size. 529 * Calculate the number of objects and left-over bytes for a given buffer size.
530 */ 530 */
531 static void cache_estimate(unsigned long gfporder, size_t buffer_size, 531 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
532 size_t align, int flags, size_t *left_over, 532 size_t align, int flags, size_t *left_over,
533 unsigned int *num) 533 unsigned int *num)
534 { 534 {
535 int nr_objs; 535 int nr_objs;
536 size_t mgmt_size; 536 size_t mgmt_size;
537 size_t slab_size = PAGE_SIZE << gfporder; 537 size_t slab_size = PAGE_SIZE << gfporder;
538 538
539 /* 539 /*
540 * The slab management structure can be either off the slab or 540 * The slab management structure can be either off the slab or
541 * on it. For the latter case, the memory allocated for a 541 * on it. For the latter case, the memory allocated for a
542 * slab is used for: 542 * slab is used for:
543 * 543 *
544 * - One unsigned int for each object 544 * - One unsigned int for each object
545 * - Padding to respect alignment of @align 545 * - Padding to respect alignment of @align
546 * - @buffer_size bytes for each object 546 * - @buffer_size bytes for each object
547 * 547 *
548 * If the slab management structure is off the slab, then the 548 * If the slab management structure is off the slab, then the
549 * alignment will already be calculated into the size. Because 549 * alignment will already be calculated into the size. Because
550 * the slabs are all page aligned, the objects will be at the 550 * the slabs are all page aligned, the objects will be at the
551 * correct alignment when allocated. 551 * correct alignment when allocated.
552 */ 552 */
553 if (flags & CFLGS_OFF_SLAB) { 553 if (flags & CFLGS_OFF_SLAB) {
554 mgmt_size = 0; 554 mgmt_size = 0;
555 nr_objs = slab_size / buffer_size; 555 nr_objs = slab_size / buffer_size;
556 556
557 } else { 557 } else {
558 nr_objs = calculate_nr_objs(slab_size, buffer_size, 558 nr_objs = calculate_nr_objs(slab_size, buffer_size,
559 sizeof(freelist_idx_t), align); 559 sizeof(freelist_idx_t), align);
560 mgmt_size = calculate_freelist_size(nr_objs, align); 560 mgmt_size = calculate_freelist_size(nr_objs, align);
561 } 561 }
562 *num = nr_objs; 562 *num = nr_objs;
563 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 563 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
564 } 564 }
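/*
 * Worked example (illustrative; assumes PAGE_SIZE == 4096, gfporder == 0,
 * buffer_size == 256, align == 8, on-slab management and no
 * CONFIG_DEBUG_SLAB_LEAK): the first guess is 4096 / (256 + 1) == 15
 * objects; the remaining 4096 - 15 * 256 == 256 bytes easily hold the
 * ALIGN(15 * sizeof(freelist_idx_t), 8) == 16 byte freelist, so
 * *num == 15 and *left_over == 4096 - 3840 - 16 == 240.
 */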
565 565
566 #if DEBUG 566 #if DEBUG
567 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 567 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
568 568
569 static void __slab_error(const char *function, struct kmem_cache *cachep, 569 static void __slab_error(const char *function, struct kmem_cache *cachep,
570 char *msg) 570 char *msg)
571 { 571 {
572 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 572 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
573 function, cachep->name, msg); 573 function, cachep->name, msg);
574 dump_stack(); 574 dump_stack();
575 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 575 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
576 } 576 }
577 #endif 577 #endif
578 578
579 /* 579 /*
580 * By default on NUMA we use alien caches to stage the freeing of 580 * By default on NUMA we use alien caches to stage the freeing of
581 * objects allocated from other nodes. This causes massive memory 581 * objects allocated from other nodes. This causes massive memory
582 * inefficiencies when using fake NUMA setup to split memory into a 582 * inefficiencies when using fake NUMA setup to split memory into a
583 * large number of small nodes, so it can be disabled on the command 583 * large number of small nodes, so it can be disabled on the command
584 * line 584 * line
585 */ 585 */
586 586
587 static int use_alien_caches __read_mostly = 1; 587 static int use_alien_caches __read_mostly = 1;
588 static int __init noaliencache_setup(char *s) 588 static int __init noaliencache_setup(char *s)
589 { 589 {
590 use_alien_caches = 0; 590 use_alien_caches = 0;
591 return 1; 591 return 1;
592 } 592 }
593 __setup("noaliencache", noaliencache_setup); 593 __setup("noaliencache", noaliencache_setup);
594 594
595 static int __init slab_max_order_setup(char *str) 595 static int __init slab_max_order_setup(char *str)
596 { 596 {
597 get_option(&str, &slab_max_order); 597 get_option(&str, &slab_max_order);
598 slab_max_order = slab_max_order < 0 ? 0 : 598 slab_max_order = slab_max_order < 0 ? 0 :
599 min(slab_max_order, MAX_ORDER - 1); 599 min(slab_max_order, MAX_ORDER - 1);
600 slab_max_order_set = true; 600 slab_max_order_set = true;
601 601
602 return 1; 602 return 1;
603 } 603 }
604 __setup("slab_max_order=", slab_max_order_setup); 604 __setup("slab_max_order=", slab_max_order_setup);
605 605
606 #ifdef CONFIG_NUMA 606 #ifdef CONFIG_NUMA
607 /* 607 /*
608 * Special reaping functions for NUMA systems called from cache_reap(). 608 * Special reaping functions for NUMA systems called from cache_reap().
609 * These take care of doing round robin flushing of alien caches (containing 609 * These take care of doing round robin flushing of alien caches (containing
610 * objects freed on different nodes from which they were allocated) and the 610 * objects freed on different nodes from which they were allocated) and the
611 * flushing of remote pcps by calling drain_node_pages. 611 * flushing of remote pcps by calling drain_node_pages.
612 */ 612 */
613 static DEFINE_PER_CPU(unsigned long, slab_reap_node); 613 static DEFINE_PER_CPU(unsigned long, slab_reap_node);
614 614
615 static void init_reap_node(int cpu) 615 static void init_reap_node(int cpu)
616 { 616 {
617 int node; 617 int node;
618 618
619 node = next_node(cpu_to_mem(cpu), node_online_map); 619 node = next_node(cpu_to_mem(cpu), node_online_map);
620 if (node == MAX_NUMNODES) 620 if (node == MAX_NUMNODES)
621 node = first_node(node_online_map); 621 node = first_node(node_online_map);
622 622
623 per_cpu(slab_reap_node, cpu) = node; 623 per_cpu(slab_reap_node, cpu) = node;
624 } 624 }
625 625
626 static void next_reap_node(void) 626 static void next_reap_node(void)
627 { 627 {
628 int node = __this_cpu_read(slab_reap_node); 628 int node = __this_cpu_read(slab_reap_node);
629 629
630 node = next_node(node, node_online_map); 630 node = next_node(node, node_online_map);
631 if (unlikely(node >= MAX_NUMNODES)) 631 if (unlikely(node >= MAX_NUMNODES))
632 node = first_node(node_online_map); 632 node = first_node(node_online_map);
633 __this_cpu_write(slab_reap_node, node); 633 __this_cpu_write(slab_reap_node, node);
634 } 634 }
635 635
636 #else 636 #else
637 #define init_reap_node(cpu) do { } while (0) 637 #define init_reap_node(cpu) do { } while (0)
638 #define next_reap_node(void) do { } while (0) 638 #define next_reap_node(void) do { } while (0)
639 #endif 639 #endif
640 640
641 /* 641 /*
642 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 642 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
643 * via the workqueue/eventd. 643 * via the workqueue/eventd.
644 * Add the CPU number into the expiration time to minimize the possibility of 644 * Add the CPU number into the expiration time to minimize the possibility of
645 * the CPUs getting into lockstep and contending for the global cache chain 645 * the CPUs getting into lockstep and contending for the global cache chain
646 * lock. 646 * lock.
647 */ 647 */
648 static void start_cpu_timer(int cpu) 648 static void start_cpu_timer(int cpu)
649 { 649 {
650 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); 650 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
651 651
652 /* 652 /*
653 * When this gets called from do_initcalls via cpucache_init(), 653 * When this gets called from do_initcalls via cpucache_init(),
654 * init_workqueues() has already run, so keventd will be set up 654 * init_workqueues() has already run, so keventd will be set up
655 * at that time. 655 * at that time.
656 */ 656 */
657 if (keventd_up() && reap_work->work.func == NULL) { 657 if (keventd_up() && reap_work->work.func == NULL) {
658 init_reap_node(cpu); 658 init_reap_node(cpu);
659 INIT_DEFERRABLE_WORK(reap_work, cache_reap); 659 INIT_DEFERRABLE_WORK(reap_work, cache_reap);
660 schedule_delayed_work_on(cpu, reap_work, 660 schedule_delayed_work_on(cpu, reap_work,
661 __round_jiffies_relative(HZ, cpu)); 661 __round_jiffies_relative(HZ, cpu));
662 } 662 }
663 } 663 }
664 664
665 static void init_arraycache(struct array_cache *ac, int limit, int batch) 665 static void init_arraycache(struct array_cache *ac, int limit, int batch)
666 { 666 {
667 /* 667 /*
668 * The array_cache structures contain pointers to free objects. 668 * The array_cache structures contain pointers to free objects.
669 * However, when such objects are allocated or transferred to another 669 * However, when such objects are allocated or transferred to another
670 * cache the pointers are not cleared and they could be counted as 670 * cache the pointers are not cleared and they could be counted as
671 * valid references during a kmemleak scan. Therefore, kmemleak must 671 * valid references during a kmemleak scan. Therefore, kmemleak must
672 * not scan such objects. 672 * not scan such objects.
673 */ 673 */
674 kmemleak_no_scan(ac); 674 kmemleak_no_scan(ac);
675 if (ac) { 675 if (ac) {
676 ac->avail = 0; 676 ac->avail = 0;
677 ac->limit = limit; 677 ac->limit = limit;
678 ac->batchcount = batch; 678 ac->batchcount = batch;
679 ac->touched = 0; 679 ac->touched = 0;
680 } 680 }
681 } 681 }
682 682
683 static struct array_cache *alloc_arraycache(int node, int entries, 683 static struct array_cache *alloc_arraycache(int node, int entries,
684 int batchcount, gfp_t gfp) 684 int batchcount, gfp_t gfp)
685 { 685 {
686 size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); 686 size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
687 struct array_cache *ac = NULL; 687 struct array_cache *ac = NULL;
688 688
689 ac = kmalloc_node(memsize, gfp, node); 689 ac = kmalloc_node(memsize, gfp, node);
690 init_arraycache(ac, entries, batchcount); 690 init_arraycache(ac, entries, batchcount);
691 return ac; 691 return ac;
692 } 692 }
693 693
694 static inline bool is_slab_pfmemalloc(struct page *page) 694 static inline bool is_slab_pfmemalloc(struct page *page)
695 { 695 {
696 return PageSlabPfmemalloc(page); 696 return PageSlabPfmemalloc(page);
697 } 697 }
698 698
699 /* Clears pfmemalloc_active if no slabs have pfmemalloc set */ 699 /* Clears pfmemalloc_active if no slabs have pfmemalloc set */
700 static void recheck_pfmemalloc_active(struct kmem_cache *cachep, 700 static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
701 struct array_cache *ac) 701 struct array_cache *ac)
702 { 702 {
703 struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); 703 struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
704 struct page *page; 704 struct page *page;
705 unsigned long flags; 705 unsigned long flags;
706 706
707 if (!pfmemalloc_active) 707 if (!pfmemalloc_active)
708 return; 708 return;
709 709
710 spin_lock_irqsave(&n->list_lock, flags); 710 spin_lock_irqsave(&n->list_lock, flags);
711 list_for_each_entry(page, &n->slabs_full, lru) 711 list_for_each_entry(page, &n->slabs_full, lru)
712 if (is_slab_pfmemalloc(page)) 712 if (is_slab_pfmemalloc(page))
713 goto out; 713 goto out;
714 714
715 list_for_each_entry(page, &n->slabs_partial, lru) 715 list_for_each_entry(page, &n->slabs_partial, lru)
716 if (is_slab_pfmemalloc(page)) 716 if (is_slab_pfmemalloc(page))
717 goto out; 717 goto out;
718 718
719 list_for_each_entry(page, &n->slabs_free, lru) 719 list_for_each_entry(page, &n->slabs_free, lru)
720 if (is_slab_pfmemalloc(page)) 720 if (is_slab_pfmemalloc(page))
721 goto out; 721 goto out;
722 722
723 pfmemalloc_active = false; 723 pfmemalloc_active = false;
724 out: 724 out:
725 spin_unlock_irqrestore(&n->list_lock, flags); 725 spin_unlock_irqrestore(&n->list_lock, flags);
726 } 726 }
727 727
728 static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, 728 static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
729 gfp_t flags, bool force_refill) 729 gfp_t flags, bool force_refill)
730 { 730 {
731 int i; 731 int i;
732 void *objp = ac->entry[--ac->avail]; 732 void *objp = ac->entry[--ac->avail];
733 733
734 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ 734 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
735 if (unlikely(is_obj_pfmemalloc(objp))) { 735 if (unlikely(is_obj_pfmemalloc(objp))) {
736 struct kmem_cache_node *n; 736 struct kmem_cache_node *n;
737 737
738 if (gfp_pfmemalloc_allowed(flags)) { 738 if (gfp_pfmemalloc_allowed(flags)) {
739 clear_obj_pfmemalloc(&objp); 739 clear_obj_pfmemalloc(&objp);
740 return objp; 740 return objp;
741 } 741 }
742 742
743 /* The caller cannot use PFMEMALLOC objects, find another one */ 743 /* The caller cannot use PFMEMALLOC objects, find another one */
744 for (i = 0; i < ac->avail; i++) { 744 for (i = 0; i < ac->avail; i++) {
745 /* If a !PFMEMALLOC object is found, swap them */ 745 /* If a !PFMEMALLOC object is found, swap them */
746 if (!is_obj_pfmemalloc(ac->entry[i])) { 746 if (!is_obj_pfmemalloc(ac->entry[i])) {
747 objp = ac->entry[i]; 747 objp = ac->entry[i];
748 ac->entry[i] = ac->entry[ac->avail]; 748 ac->entry[i] = ac->entry[ac->avail];
749 ac->entry[ac->avail] = objp; 749 ac->entry[ac->avail] = objp;
750 return objp; 750 return objp;
751 } 751 }
752 } 752 }
753 753
754 /* 754 /*
755 * If there are empty slabs on the slabs_free list and we are 755 * If there are empty slabs on the slabs_free list and we are
756 * being forced to refill the cache, mark this one !pfmemalloc. 756 * being forced to refill the cache, mark this one !pfmemalloc.
757 */ 757 */
758 n = get_node(cachep, numa_mem_id()); 758 n = get_node(cachep, numa_mem_id());
759 if (!list_empty(&n->slabs_free) && force_refill) { 759 if (!list_empty(&n->slabs_free) && force_refill) {
760 struct page *page = virt_to_head_page(objp); 760 struct page *page = virt_to_head_page(objp);
761 ClearPageSlabPfmemalloc(page); 761 ClearPageSlabPfmemalloc(page);
762 clear_obj_pfmemalloc(&objp); 762 clear_obj_pfmemalloc(&objp);
763 recheck_pfmemalloc_active(cachep, ac); 763 recheck_pfmemalloc_active(cachep, ac);
764 return objp; 764 return objp;
765 } 765 }
766 766
767 /* No !PFMEMALLOC objects available */ 767 /* No !PFMEMALLOC objects available */
768 ac->avail++; 768 ac->avail++;
769 objp = NULL; 769 objp = NULL;
770 } 770 }
771 771
772 return objp; 772 return objp;
773 } 773 }
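/*
 * Example walk-through (illustrative): suppose the entry popped above is
 * tagged SLAB_OBJ_PFMEMALLOC but the caller made a plain GFP_KERNEL
 * allocation.  gfp_pfmemalloc_allowed() then fails, the loop returns the
 * first untagged pointer it finds in entry[] and parks the reserved,
 * tagged pointer in that slot instead; only when every cached object is
 * a pfmemalloc one (and no empty slab can be untagged) is ac->avail
 * restored and NULL returned, forcing the caller down the slower
 * refill path.
 */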
774 774
775 static inline void *ac_get_obj(struct kmem_cache *cachep, 775 static inline void *ac_get_obj(struct kmem_cache *cachep,
776 struct array_cache *ac, gfp_t flags, bool force_refill) 776 struct array_cache *ac, gfp_t flags, bool force_refill)
777 { 777 {
778 void *objp; 778 void *objp;
779 779
780 if (unlikely(sk_memalloc_socks())) 780 if (unlikely(sk_memalloc_socks()))
781 objp = __ac_get_obj(cachep, ac, flags, force_refill); 781 objp = __ac_get_obj(cachep, ac, flags, force_refill);
782 else 782 else
783 objp = ac->entry[--ac->avail]; 783 objp = ac->entry[--ac->avail];
784 784
785 return objp; 785 return objp;
786 } 786 }
787 787
788 static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 788 static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
789 void *objp) 789 void *objp)
790 { 790 {
791 if (unlikely(pfmemalloc_active)) { 791 if (unlikely(pfmemalloc_active)) {
792 /* Some pfmemalloc slabs exist, check if this is one */ 792 /* Some pfmemalloc slabs exist, check if this is one */
793 struct page *page = virt_to_head_page(objp); 793 struct page *page = virt_to_head_page(objp);
794 if (PageSlabPfmemalloc(page)) 794 if (PageSlabPfmemalloc(page))
795 set_obj_pfmemalloc(&objp); 795 set_obj_pfmemalloc(&objp);
796 } 796 }
797 797
798 return objp; 798 return objp;
799 } 799 }
800 800
801 static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 801 static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
802 void *objp) 802 void *objp)
803 { 803 {
804 if (unlikely(sk_memalloc_socks())) 804 if (unlikely(sk_memalloc_socks()))
805 objp = __ac_put_obj(cachep, ac, objp); 805 objp = __ac_put_obj(cachep, ac, objp);
806 806
807 ac->entry[ac->avail++] = objp; 807 ac->entry[ac->avail++] = objp;
808 } 808 }
809 809
810 /* 810 /*
811 * Transfer objects in one arraycache to another. 811 * Transfer objects in one arraycache to another.
812 * Locking must be handled by the caller. 812 * Locking must be handled by the caller.
813 * 813 *
814 * Return the number of entries transferred. 814 * Return the number of entries transferred.
815 */ 815 */
816 static int transfer_objects(struct array_cache *to, 816 static int transfer_objects(struct array_cache *to,
817 struct array_cache *from, unsigned int max) 817 struct array_cache *from, unsigned int max)
818 { 818 {
819 /* Figure out how many entries to transfer */ 819 /* Figure out how many entries to transfer */
820 int nr = min3(from->avail, max, to->limit - to->avail); 820 int nr = min3(from->avail, max, to->limit - to->avail);
821 821
822 if (!nr) 822 if (!nr)
823 return 0; 823 return 0;
824 824
825 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 825 memcpy(to->entry + to->avail, from->entry + from->avail -nr,
826 sizeof(void *) *nr); 826 sizeof(void *) *nr);
827 827
828 from->avail -= nr; 828 from->avail -= nr;
829 to->avail += nr; 829 to->avail += nr;
830 return nr; 830 return nr;
831 } 831 }
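/*
 * Worked example (illustrative): with from->avail == 30, max == 24 and
 * room for 10 more entries in @to, nr == min3(30, 24, 10) == 10; the 10
 * most recently freed (cache-warm) pointers at the top of @from are
 * copied over, from->avail drops to 20, to->avail grows by 10 and 10 is
 * returned.
 */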
832 832
833 #ifndef CONFIG_NUMA 833 #ifndef CONFIG_NUMA
834 834
835 #define drain_alien_cache(cachep, alien) do { } while (0) 835 #define drain_alien_cache(cachep, alien) do { } while (0)
836 #define reap_alien(cachep, n) do { } while (0) 836 #define reap_alien(cachep, n) do { } while (0)
837 837
838 static inline struct alien_cache **alloc_alien_cache(int node, 838 static inline struct alien_cache **alloc_alien_cache(int node,
839 int limit, gfp_t gfp) 839 int limit, gfp_t gfp)
840 { 840 {
841 return (struct alien_cache **)BAD_ALIEN_MAGIC; 841 return (struct alien_cache **)BAD_ALIEN_MAGIC;
842 } 842 }
843 843
844 static inline void free_alien_cache(struct alien_cache **ac_ptr) 844 static inline void free_alien_cache(struct alien_cache **ac_ptr)
845 { 845 {
846 } 846 }
847 847
848 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 848 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
849 { 849 {
850 return 0; 850 return 0;
851 } 851 }
852 852
853 static inline void *alternate_node_alloc(struct kmem_cache *cachep, 853 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
854 gfp_t flags) 854 gfp_t flags)
855 { 855 {
856 return NULL; 856 return NULL;
857 } 857 }
858 858
859 static inline void *____cache_alloc_node(struct kmem_cache *cachep, 859 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
860 gfp_t flags, int nodeid) 860 gfp_t flags, int nodeid)
861 { 861 {
862 return NULL; 862 return NULL;
863 } 863 }
864 864
865 #else /* CONFIG_NUMA */ 865 #else /* CONFIG_NUMA */
866 866
867 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 867 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
868 static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 868 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
869 869
870 static struct alien_cache *__alloc_alien_cache(int node, int entries, 870 static struct alien_cache *__alloc_alien_cache(int node, int entries,
871 int batch, gfp_t gfp) 871 int batch, gfp_t gfp)
872 { 872 {
873 size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); 873 size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
874 struct alien_cache *alc = NULL; 874 struct alien_cache *alc = NULL;
875 875
876 alc = kmalloc_node(memsize, gfp, node); 876 alc = kmalloc_node(memsize, gfp, node);
877 init_arraycache(&alc->ac, entries, batch); 877 init_arraycache(&alc->ac, entries, batch);
878 spin_lock_init(&alc->lock); 878 spin_lock_init(&alc->lock);
879 return alc; 879 return alc;
880 } 880 }
881 881
882 static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 882 static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
883 { 883 {
884 struct alien_cache **alc_ptr; 884 struct alien_cache **alc_ptr;
885 size_t memsize = sizeof(void *) * nr_node_ids; 885 size_t memsize = sizeof(void *) * nr_node_ids;
886 int i; 886 int i;
887 887
888 if (limit > 1) 888 if (limit > 1)
889 limit = 12; 889 limit = 12;
890 alc_ptr = kzalloc_node(memsize, gfp, node); 890 alc_ptr = kzalloc_node(memsize, gfp, node);
891 if (!alc_ptr) 891 if (!alc_ptr)
892 return NULL; 892 return NULL;
893 893
894 for_each_node(i) { 894 for_each_node(i) {
895 if (i == node || !node_online(i)) 895 if (i == node || !node_online(i))
896 continue; 896 continue;
897 alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); 897 alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
898 if (!alc_ptr[i]) { 898 if (!alc_ptr[i]) {
899 for (i--; i >= 0; i--) 899 for (i--; i >= 0; i--)
900 kfree(alc_ptr[i]); 900 kfree(alc_ptr[i]);
901 kfree(alc_ptr); 901 kfree(alc_ptr);
902 return NULL; 902 return NULL;
903 } 903 }
904 } 904 }
905 return alc_ptr; 905 return alc_ptr;
906 } 906 }
907 907
908 static void free_alien_cache(struct alien_cache **alc_ptr) 908 static void free_alien_cache(struct alien_cache **alc_ptr)
909 { 909 {
910 int i; 910 int i;
911 911
912 if (!alc_ptr) 912 if (!alc_ptr)
913 return; 913 return;
914 for_each_node(i) 914 for_each_node(i)
915 kfree(alc_ptr[i]); 915 kfree(alc_ptr[i]);
916 kfree(alc_ptr); 916 kfree(alc_ptr);
917 } 917 }
918 918
919 static void __drain_alien_cache(struct kmem_cache *cachep, 919 static void __drain_alien_cache(struct kmem_cache *cachep,
920 struct array_cache *ac, int node, 920 struct array_cache *ac, int node,
921 struct list_head *list) 921 struct list_head *list)
922 { 922 {
923 struct kmem_cache_node *n = get_node(cachep, node); 923 struct kmem_cache_node *n = get_node(cachep, node);
924 924
925 if (ac->avail) { 925 if (ac->avail) {
926 spin_lock(&n->list_lock); 926 spin_lock(&n->list_lock);
927 /* 927 /*
928 * Stuff objects into the remote node's shared array first. 928 * Stuff objects into the remote node's shared array first.
929 * That way we could avoid the overhead of putting the objects 929 * That way we could avoid the overhead of putting the objects
930 * into the free lists and getting them back later. 930 * into the free lists and getting them back later.
931 */ 931 */
932 if (n->shared) 932 if (n->shared)
933 transfer_objects(n->shared, ac, ac->limit); 933 transfer_objects(n->shared, ac, ac->limit);
934 934
935 free_block(cachep, ac->entry, ac->avail, node, list); 935 free_block(cachep, ac->entry, ac->avail, node, list);
936 ac->avail = 0; 936 ac->avail = 0;
937 spin_unlock(&n->list_lock); 937 spin_unlock(&n->list_lock);
938 } 938 }
939 } 939 }
940 940
941 /* 941 /*
942 * Called from cache_reap() to regularly drain alien caches round robin. 942 * Called from cache_reap() to regularly drain alien caches round robin.
943 */ 943 */
944 static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) 944 static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
945 { 945 {
946 int node = __this_cpu_read(slab_reap_node); 946 int node = __this_cpu_read(slab_reap_node);
947 947
948 if (n->alien) { 948 if (n->alien) {
949 struct alien_cache *alc = n->alien[node]; 949 struct alien_cache *alc = n->alien[node];
950 struct array_cache *ac; 950 struct array_cache *ac;
951 951
952 if (alc) { 952 if (alc) {
953 ac = &alc->ac; 953 ac = &alc->ac;
954 if (ac->avail && spin_trylock_irq(&alc->lock)) { 954 if (ac->avail && spin_trylock_irq(&alc->lock)) {
955 LIST_HEAD(list); 955 LIST_HEAD(list);
956 956
957 __drain_alien_cache(cachep, ac, node, &list); 957 __drain_alien_cache(cachep, ac, node, &list);
958 spin_unlock_irq(&alc->lock); 958 spin_unlock_irq(&alc->lock);
959 slabs_destroy(cachep, &list); 959 slabs_destroy(cachep, &list);
960 } 960 }
961 } 961 }
962 } 962 }
963 } 963 }
964 964
965 static void drain_alien_cache(struct kmem_cache *cachep, 965 static void drain_alien_cache(struct kmem_cache *cachep,
966 struct alien_cache **alien) 966 struct alien_cache **alien)
967 { 967 {
968 int i = 0; 968 int i = 0;
969 struct alien_cache *alc; 969 struct alien_cache *alc;
970 struct array_cache *ac; 970 struct array_cache *ac;
971 unsigned long flags; 971 unsigned long flags;
972 972
973 for_each_online_node(i) { 973 for_each_online_node(i) {
974 alc = alien[i]; 974 alc = alien[i];
975 if (alc) { 975 if (alc) {
976 LIST_HEAD(list); 976 LIST_HEAD(list);
977 977
978 ac = &alc->ac; 978 ac = &alc->ac;
979 spin_lock_irqsave(&alc->lock, flags); 979 spin_lock_irqsave(&alc->lock, flags);
980 __drain_alien_cache(cachep, ac, i, &list); 980 __drain_alien_cache(cachep, ac, i, &list);
981 spin_unlock_irqrestore(&alc->lock, flags); 981 spin_unlock_irqrestore(&alc->lock, flags);
982 slabs_destroy(cachep, &list); 982 slabs_destroy(cachep, &list);
983 } 983 }
984 } 984 }
985 } 985 }
986 986
987 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 987 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
988 { 988 {
989 int nodeid = page_to_nid(virt_to_page(objp)); 989 int nodeid = page_to_nid(virt_to_page(objp));
990 struct kmem_cache_node *n; 990 struct kmem_cache_node *n;
991 struct alien_cache *alien = NULL; 991 struct alien_cache *alien = NULL;
992 struct array_cache *ac; 992 struct array_cache *ac;
993 int node; 993 int node;
994 LIST_HEAD(list); 994 LIST_HEAD(list);
995 995
996 node = numa_mem_id(); 996 node = numa_mem_id();
997 997
998 /* 998 /*
999 * Make sure we are not freeing an object from another node to the array 999 * Make sure we are not freeing an object from another node to the array
1000 * cache on this cpu. 1000 * cache on this cpu.
1001 */ 1001 */
1002 if (likely(nodeid == node)) 1002 if (likely(nodeid == node))
1003 return 0; 1003 return 0;
1004 1004
1005 n = get_node(cachep, node); 1005 n = get_node(cachep, node);
1006 STATS_INC_NODEFREES(cachep); 1006 STATS_INC_NODEFREES(cachep);
1007 if (n->alien && n->alien[nodeid]) { 1007 if (n->alien && n->alien[nodeid]) {
1008 alien = n->alien[nodeid]; 1008 alien = n->alien[nodeid];
1009 ac = &alien->ac; 1009 ac = &alien->ac;
1010 spin_lock(&alien->lock); 1010 spin_lock(&alien->lock);
1011 if (unlikely(ac->avail == ac->limit)) { 1011 if (unlikely(ac->avail == ac->limit)) {
1012 STATS_INC_ACOVERFLOW(cachep); 1012 STATS_INC_ACOVERFLOW(cachep);
1013 __drain_alien_cache(cachep, ac, nodeid, &list); 1013 __drain_alien_cache(cachep, ac, nodeid, &list);
1014 } 1014 }
1015 ac_put_obj(cachep, ac, objp); 1015 ac_put_obj(cachep, ac, objp);
1016 spin_unlock(&alien->lock); 1016 spin_unlock(&alien->lock);
1017 slabs_destroy(cachep, &list); 1017 slabs_destroy(cachep, &list);
1018 } else { 1018 } else {
1019 n = get_node(cachep, nodeid); 1019 n = get_node(cachep, nodeid);
1020 spin_lock(&n->list_lock); 1020 spin_lock(&n->list_lock);
1021 free_block(cachep, &objp, 1, nodeid, &list); 1021 free_block(cachep, &objp, 1, nodeid, &list);
1022 spin_unlock(&n->list_lock); 1022 spin_unlock(&n->list_lock);
1023 slabs_destroy(cachep, &list); 1023 slabs_destroy(cachep, &list);
1024 } 1024 }
1025 return 1; 1025 return 1;
1026 } 1026 }
1027 #endif 1027 #endif
1028 1028
1029 /* 1029 /*
1030 * Allocates and initializes the kmem_cache_node for a node on each slab cache, used for 1030 * Allocates and initializes the kmem_cache_node for a node on each slab cache, used for
1031 * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node 1031 * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
1032 * will be allocated off-node since memory is not yet online for the new node. 1032 * will be allocated off-node since memory is not yet online for the new node.
1033 * When hotplugging memory or a cpu, existing nodes are not replaced if 1033 * When hotplugging memory or a cpu, existing nodes are not replaced if
1034 * already in use. 1034 * already in use.
1035 * 1035 *
1036 * Must hold slab_mutex. 1036 * Must hold slab_mutex.
1037 */ 1037 */
1038 static int init_cache_node_node(int node) 1038 static int init_cache_node_node(int node)
1039 { 1039 {
1040 struct kmem_cache *cachep; 1040 struct kmem_cache *cachep;
1041 struct kmem_cache_node *n; 1041 struct kmem_cache_node *n;
1042 const size_t memsize = sizeof(struct kmem_cache_node); 1042 const size_t memsize = sizeof(struct kmem_cache_node);
1043 1043
1044 list_for_each_entry(cachep, &slab_caches, list) { 1044 list_for_each_entry(cachep, &slab_caches, list) {
1045 /* 1045 /*
1046 * Set up the kmem_cache_node for cpu before we can 1046 * Set up the kmem_cache_node for cpu before we can
1047 * begin anything. Make sure some other cpu on this 1047 * begin anything. Make sure some other cpu on this
1048 * node has not already allocated this structure. 1048 * node has not already allocated this structure.
1049 */ 1049 */
1050 n = get_node(cachep, node); 1050 n = get_node(cachep, node);
1051 if (!n) { 1051 if (!n) {
1052 n = kmalloc_node(memsize, GFP_KERNEL, node); 1052 n = kmalloc_node(memsize, GFP_KERNEL, node);
1053 if (!n) 1053 if (!n)
1054 return -ENOMEM; 1054 return -ENOMEM;
1055 kmem_cache_node_init(n); 1055 kmem_cache_node_init(n);
1056 n->next_reap = jiffies + REAPTIMEOUT_NODE + 1056 n->next_reap = jiffies + REAPTIMEOUT_NODE +
1057 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1057 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
1058 1058
1059 /* 1059 /*
1060 * The kmem_cache_nodes don't come and go as CPUs 1060 * The kmem_cache_nodes don't come and go as CPUs
1061 * come and go. slab_mutex is sufficient 1061 * come and go. slab_mutex is sufficient
1062 * protection here. 1062 * protection here.
1063 */ 1063 */
1064 cachep->node[node] = n; 1064 cachep->node[node] = n;
1065 } 1065 }
1066 1066
1067 spin_lock_irq(&n->list_lock); 1067 spin_lock_irq(&n->list_lock);
1068 n->free_limit = 1068 n->free_limit =
1069 (1 + nr_cpus_node(node)) * 1069 (1 + nr_cpus_node(node)) *
1070 cachep->batchcount + cachep->num; 1070 cachep->batchcount + cachep->num;
1071 spin_unlock_irq(&n->list_lock); 1071 spin_unlock_irq(&n->list_lock);
1072 } 1072 }
1073 return 0; 1073 return 0;
1074 } 1074 }
1075 1075
1076 static inline int slabs_tofree(struct kmem_cache *cachep, 1076 static inline int slabs_tofree(struct kmem_cache *cachep,
1077 struct kmem_cache_node *n) 1077 struct kmem_cache_node *n)
1078 { 1078 {
1079 return (n->free_objects + cachep->num - 1) / cachep->num; 1079 return (n->free_objects + cachep->num - 1) / cachep->num;
1080 } 1080 }
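/*
 * slabs_tofree() is a ceiling division, e.g. 17 free objects in a cache
 * holding 8 objects per slab gives (17 + 8 - 1) / 8 == 3 slabs, enough
 * to cover every currently free object when the node is drained.
 */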
1081 1081
1082 static void cpuup_canceled(long cpu) 1082 static void cpuup_canceled(long cpu)
1083 { 1083 {
1084 struct kmem_cache *cachep; 1084 struct kmem_cache *cachep;
1085 struct kmem_cache_node *n = NULL; 1085 struct kmem_cache_node *n = NULL;
1086 int node = cpu_to_mem(cpu); 1086 int node = cpu_to_mem(cpu);
1087 const struct cpumask *mask = cpumask_of_node(node); 1087 const struct cpumask *mask = cpumask_of_node(node);
1088 1088
1089 list_for_each_entry(cachep, &slab_caches, list) { 1089 list_for_each_entry(cachep, &slab_caches, list) {
1090 struct array_cache *nc; 1090 struct array_cache *nc;
1091 struct array_cache *shared; 1091 struct array_cache *shared;
1092 struct alien_cache **alien; 1092 struct alien_cache **alien;
1093 LIST_HEAD(list); 1093 LIST_HEAD(list);
1094 1094
1095 /* cpu is dead; no one can alloc from it. */ 1095 /* cpu is dead; no one can alloc from it. */
1096 nc = cachep->array[cpu]; 1096 nc = cachep->array[cpu];
1097 cachep->array[cpu] = NULL; 1097 cachep->array[cpu] = NULL;
1098 n = get_node(cachep, node); 1098 n = get_node(cachep, node);
1099 1099
1100 if (!n) 1100 if (!n)
1101 goto free_array_cache; 1101 goto free_array_cache;
1102 1102
1103 spin_lock_irq(&n->list_lock); 1103 spin_lock_irq(&n->list_lock);
1104 1104
1105 /* Free limit for this kmem_cache_node */ 1105 /* Free limit for this kmem_cache_node */
1106 n->free_limit -= cachep->batchcount; 1106 n->free_limit -= cachep->batchcount;
1107 if (nc) 1107 if (nc)
1108 free_block(cachep, nc->entry, nc->avail, node, &list); 1108 free_block(cachep, nc->entry, nc->avail, node, &list);
1109 1109
1110 if (!cpumask_empty(mask)) { 1110 if (!cpumask_empty(mask)) {
1111 spin_unlock_irq(&n->list_lock); 1111 spin_unlock_irq(&n->list_lock);
1112 goto free_array_cache; 1112 goto free_array_cache;
1113 } 1113 }
1114 1114
1115 shared = n->shared; 1115 shared = n->shared;
1116 if (shared) { 1116 if (shared) {
1117 free_block(cachep, shared->entry, 1117 free_block(cachep, shared->entry,
1118 shared->avail, node, &list); 1118 shared->avail, node, &list);
1119 n->shared = NULL; 1119 n->shared = NULL;
1120 } 1120 }
1121 1121
1122 alien = n->alien; 1122 alien = n->alien;
1123 n->alien = NULL; 1123 n->alien = NULL;
1124 1124
1125 spin_unlock_irq(&n->list_lock); 1125 spin_unlock_irq(&n->list_lock);
1126 1126
1127 kfree(shared); 1127 kfree(shared);
1128 if (alien) { 1128 if (alien) {
1129 drain_alien_cache(cachep, alien); 1129 drain_alien_cache(cachep, alien);
1130 free_alien_cache(alien); 1130 free_alien_cache(alien);
1131 } 1131 }
1132 free_array_cache: 1132 free_array_cache:
1133 slabs_destroy(cachep, &list); 1133 slabs_destroy(cachep, &list);
1134 kfree(nc); 1134 kfree(nc);
1135 } 1135 }
1136 /* 1136 /*
1137 * In the previous loop, all the objects were freed to 1137 * In the previous loop, all the objects were freed to
1138 * the respective cache's slabs; now we can go ahead and 1138 * the respective cache's slabs; now we can go ahead and
1139 * shrink each nodelist to its limit. 1139 * shrink each nodelist to its limit.
1140 */ 1140 */
1141 list_for_each_entry(cachep, &slab_caches, list) { 1141 list_for_each_entry(cachep, &slab_caches, list) {
1142 n = get_node(cachep, node); 1142 n = get_node(cachep, node);
1143 if (!n) 1143 if (!n)
1144 continue; 1144 continue;
1145 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1145 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1146 } 1146 }
1147 } 1147 }
1148 1148
1149 static int cpuup_prepare(long cpu) 1149 static int cpuup_prepare(long cpu)
1150 { 1150 {
1151 struct kmem_cache *cachep; 1151 struct kmem_cache *cachep;
1152 struct kmem_cache_node *n = NULL; 1152 struct kmem_cache_node *n = NULL;
1153 int node = cpu_to_mem(cpu); 1153 int node = cpu_to_mem(cpu);
1154 int err; 1154 int err;
1155 1155
1156 /* 1156 /*
1157 * We need to do this right in the beginning since 1157 * We need to do this right in the beginning since
1158 * alloc_arraycache's are going to use this list. 1158 * alloc_arraycache's are going to use this list.
1159 * kmalloc_node allows us to add the slab to the right 1159 * kmalloc_node allows us to add the slab to the right
1160 * kmem_cache_node and not this cpu's kmem_cache_node 1160 * kmem_cache_node and not this cpu's kmem_cache_node
1161 */ 1161 */
1162 err = init_cache_node_node(node); 1162 err = init_cache_node_node(node);
1163 if (err < 0) 1163 if (err < 0)
1164 goto bad; 1164 goto bad;
1165 1165
1166 /* 1166 /*
1167 * Now we can go ahead with allocating the shared arrays and 1167 * Now we can go ahead with allocating the shared arrays and
1168 * array caches 1168 * array caches
1169 */ 1169 */
1170 list_for_each_entry(cachep, &slab_caches, list) { 1170 list_for_each_entry(cachep, &slab_caches, list) {
1171 struct array_cache *nc; 1171 struct array_cache *nc;
1172 struct array_cache *shared = NULL; 1172 struct array_cache *shared = NULL;
1173 struct alien_cache **alien = NULL; 1173 struct alien_cache **alien = NULL;
1174 1174
1175 nc = alloc_arraycache(node, cachep->limit, 1175 nc = alloc_arraycache(node, cachep->limit,
1176 cachep->batchcount, GFP_KERNEL); 1176 cachep->batchcount, GFP_KERNEL);
1177 if (!nc) 1177 if (!nc)
1178 goto bad; 1178 goto bad;
1179 if (cachep->shared) { 1179 if (cachep->shared) {
1180 shared = alloc_arraycache(node, 1180 shared = alloc_arraycache(node,
1181 cachep->shared * cachep->batchcount, 1181 cachep->shared * cachep->batchcount,
1182 0xbaadf00d, GFP_KERNEL); 1182 0xbaadf00d, GFP_KERNEL);
1183 if (!shared) { 1183 if (!shared) {
1184 kfree(nc); 1184 kfree(nc);
1185 goto bad; 1185 goto bad;
1186 } 1186 }
1187 } 1187 }
1188 if (use_alien_caches) { 1188 if (use_alien_caches) {
1189 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1189 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1190 if (!alien) { 1190 if (!alien) {
1191 kfree(shared); 1191 kfree(shared);
1192 kfree(nc); 1192 kfree(nc);
1193 goto bad; 1193 goto bad;
1194 } 1194 }
1195 } 1195 }
1196 cachep->array[cpu] = nc; 1196 cachep->array[cpu] = nc;
1197 n = get_node(cachep, node); 1197 n = get_node(cachep, node);
1198 BUG_ON(!n); 1198 BUG_ON(!n);
1199 1199
1200 spin_lock_irq(&n->list_lock); 1200 spin_lock_irq(&n->list_lock);
1201 if (!n->shared) { 1201 if (!n->shared) {
1202 /* 1202 /*
1203 * We are serialised from CPU_DEAD or 1203 * We are serialised from CPU_DEAD or
1204 * CPU_UP_CANCELLED by the cpucontrol lock 1204 * CPU_UP_CANCELLED by the cpucontrol lock
1205 */ 1205 */
1206 n->shared = shared; 1206 n->shared = shared;
1207 shared = NULL; 1207 shared = NULL;
1208 } 1208 }
1209 #ifdef CONFIG_NUMA 1209 #ifdef CONFIG_NUMA
1210 if (!n->alien) { 1210 if (!n->alien) {
1211 n->alien = alien; 1211 n->alien = alien;
1212 alien = NULL; 1212 alien = NULL;
1213 } 1213 }
1214 #endif 1214 #endif
1215 spin_unlock_irq(&n->list_lock); 1215 spin_unlock_irq(&n->list_lock);
1216 kfree(shared); 1216 kfree(shared);
1217 free_alien_cache(alien); 1217 free_alien_cache(alien);
1218 } 1218 }
1219 1219
1220 return 0; 1220 return 0;
1221 bad: 1221 bad:
1222 cpuup_canceled(cpu); 1222 cpuup_canceled(cpu);
1223 return -ENOMEM; 1223 return -ENOMEM;
1224 } 1224 }
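/*
 * Illustrative sketch, not kernel code: cpuup_prepare() allocates the shared
 * and alien caches before taking n->list_lock, publishes them under the lock
 * only if the slot is still empty, and frees whatever was not installed after
 * dropping the lock.  A minimal userspace analogue of that pattern, with
 * made-up names (node_state, install_shared):
 */
#include <pthread.h>
#include <stdlib.h>

struct node_state {
	pthread_mutex_t lock;		/* stands in for n->list_lock */
	void *shared;			/* stands in for n->shared */
};

static void install_shared(struct node_state *n, size_t size)
{
	void *candidate = malloc(size);	/* allocate outside the lock */

	pthread_mutex_lock(&n->lock);
	if (!n->shared) {		/* publish only if still unset */
		n->shared = candidate;
		candidate = NULL;	/* ownership transferred */
	}
	pthread_mutex_unlock(&n->lock);

	free(candidate);		/* free(NULL) is a no-op, like kfree(NULL) */
}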
1225 1225
1226 static int cpuup_callback(struct notifier_block *nfb, 1226 static int cpuup_callback(struct notifier_block *nfb,
1227 unsigned long action, void *hcpu) 1227 unsigned long action, void *hcpu)
1228 { 1228 {
1229 long cpu = (long)hcpu; 1229 long cpu = (long)hcpu;
1230 int err = 0; 1230 int err = 0;
1231 1231
1232 switch (action) { 1232 switch (action) {
1233 case CPU_UP_PREPARE: 1233 case CPU_UP_PREPARE:
1234 case CPU_UP_PREPARE_FROZEN: 1234 case CPU_UP_PREPARE_FROZEN:
1235 mutex_lock(&slab_mutex); 1235 mutex_lock(&slab_mutex);
1236 err = cpuup_prepare(cpu); 1236 err = cpuup_prepare(cpu);
1237 mutex_unlock(&slab_mutex); 1237 mutex_unlock(&slab_mutex);
1238 break; 1238 break;
1239 case CPU_ONLINE: 1239 case CPU_ONLINE:
1240 case CPU_ONLINE_FROZEN: 1240 case CPU_ONLINE_FROZEN:
1241 start_cpu_timer(cpu); 1241 start_cpu_timer(cpu);
1242 break; 1242 break;
1243 #ifdef CONFIG_HOTPLUG_CPU 1243 #ifdef CONFIG_HOTPLUG_CPU
1244 case CPU_DOWN_PREPARE: 1244 case CPU_DOWN_PREPARE:
1245 case CPU_DOWN_PREPARE_FROZEN: 1245 case CPU_DOWN_PREPARE_FROZEN:
1246 /* 1246 /*
1247 * Shutdown cache reaper. Note that the slab_mutex is 1247 * Shutdown cache reaper. Note that the slab_mutex is
1248 * held so that if cache_reap() is invoked it cannot do 1248 * held so that if cache_reap() is invoked it cannot do
1249 * anything expensive but will only modify reap_work 1249 * anything expensive but will only modify reap_work
1250 * and reschedule the timer. 1250 * and reschedule the timer.
1251 */ 1251 */
1252 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); 1252 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1253 /* Now the cache_reaper is guaranteed not to be running. */ 1253 /* Now the cache_reaper is guaranteed not to be running. */
1254 per_cpu(slab_reap_work, cpu).work.func = NULL; 1254 per_cpu(slab_reap_work, cpu).work.func = NULL;
1255 break; 1255 break;
1256 case CPU_DOWN_FAILED: 1256 case CPU_DOWN_FAILED:
1257 case CPU_DOWN_FAILED_FROZEN: 1257 case CPU_DOWN_FAILED_FROZEN:
1258 start_cpu_timer(cpu); 1258 start_cpu_timer(cpu);
1259 break; 1259 break;
1260 case CPU_DEAD: 1260 case CPU_DEAD:
1261 case CPU_DEAD_FROZEN: 1261 case CPU_DEAD_FROZEN:
1262 /* 1262 /*
1263 * Even if all the cpus of a node are down, we don't free the 1263 * Even if all the cpus of a node are down, we don't free the
1264 * kmem_cache_node of any cache. This is to avoid a race between 1264 * kmem_cache_node of any cache. This is to avoid a race between
1265 * cpu_down and a kmalloc allocation from another cpu for 1265 * cpu_down and a kmalloc allocation from another cpu for
1266 * memory from the node of the cpu going down. The node 1266 * memory from the node of the cpu going down. The node
1267 * structure is usually allocated from kmem_cache_create() and 1267 * structure is usually allocated from kmem_cache_create() and
1268 * gets destroyed at kmem_cache_destroy(). 1268 * gets destroyed at kmem_cache_destroy().
1269 */ 1269 */
1270 /* fall through */ 1270 /* fall through */
1271 #endif 1271 #endif
1272 case CPU_UP_CANCELED: 1272 case CPU_UP_CANCELED:
1273 case CPU_UP_CANCELED_FROZEN: 1273 case CPU_UP_CANCELED_FROZEN:
1274 mutex_lock(&slab_mutex); 1274 mutex_lock(&slab_mutex);
1275 cpuup_canceled(cpu); 1275 cpuup_canceled(cpu);
1276 mutex_unlock(&slab_mutex); 1276 mutex_unlock(&slab_mutex);
1277 break; 1277 break;
1278 } 1278 }
1279 return notifier_from_errno(err); 1279 return notifier_from_errno(err);
1280 } 1280 }
1281 1281
1282 static struct notifier_block cpucache_notifier = { 1282 static struct notifier_block cpucache_notifier = {
1283 &cpuup_callback, NULL, 0 1283 &cpuup_callback, NULL, 0
1284 }; 1284 };
1285 1285
1286 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 1286 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1287 /* 1287 /*
1288 * Drains freelist for a node on each slab cache, used for memory hot-remove. 1288 * Drains freelist for a node on each slab cache, used for memory hot-remove.
1289 * Returns -EBUSY if all objects cannot be drained so that the node is not 1289 * Returns -EBUSY if all objects cannot be drained so that the node is not
1290 * removed. 1290 * removed.
1291 * 1291 *
1292 * Must hold slab_mutex. 1292 * Must hold slab_mutex.
1293 */ 1293 */
1294 static int __meminit drain_cache_node_node(int node) 1294 static int __meminit drain_cache_node_node(int node)
1295 { 1295 {
1296 struct kmem_cache *cachep; 1296 struct kmem_cache *cachep;
1297 int ret = 0; 1297 int ret = 0;
1298 1298
1299 list_for_each_entry(cachep, &slab_caches, list) { 1299 list_for_each_entry(cachep, &slab_caches, list) {
1300 struct kmem_cache_node *n; 1300 struct kmem_cache_node *n;
1301 1301
1302 n = get_node(cachep, node); 1302 n = get_node(cachep, node);
1303 if (!n) 1303 if (!n)
1304 continue; 1304 continue;
1305 1305
1306 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1306 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1307 1307
1308 if (!list_empty(&n->slabs_full) || 1308 if (!list_empty(&n->slabs_full) ||
1309 !list_empty(&n->slabs_partial)) { 1309 !list_empty(&n->slabs_partial)) {
1310 ret = -EBUSY; 1310 ret = -EBUSY;
1311 break; 1311 break;
1312 } 1312 }
1313 } 1313 }
1314 return ret; 1314 return ret;
1315 } 1315 }
1316 1316
1317 static int __meminit slab_memory_callback(struct notifier_block *self, 1317 static int __meminit slab_memory_callback(struct notifier_block *self,
1318 unsigned long action, void *arg) 1318 unsigned long action, void *arg)
1319 { 1319 {
1320 struct memory_notify *mnb = arg; 1320 struct memory_notify *mnb = arg;
1321 int ret = 0; 1321 int ret = 0;
1322 int nid; 1322 int nid;
1323 1323
1324 nid = mnb->status_change_nid; 1324 nid = mnb->status_change_nid;
1325 if (nid < 0) 1325 if (nid < 0)
1326 goto out; 1326 goto out;
1327 1327
1328 switch (action) { 1328 switch (action) {
1329 case MEM_GOING_ONLINE: 1329 case MEM_GOING_ONLINE:
1330 mutex_lock(&slab_mutex); 1330 mutex_lock(&slab_mutex);
1331 ret = init_cache_node_node(nid); 1331 ret = init_cache_node_node(nid);
1332 mutex_unlock(&slab_mutex); 1332 mutex_unlock(&slab_mutex);
1333 break; 1333 break;
1334 case MEM_GOING_OFFLINE: 1334 case MEM_GOING_OFFLINE:
1335 mutex_lock(&slab_mutex); 1335 mutex_lock(&slab_mutex);
1336 ret = drain_cache_node_node(nid); 1336 ret = drain_cache_node_node(nid);
1337 mutex_unlock(&slab_mutex); 1337 mutex_unlock(&slab_mutex);
1338 break; 1338 break;
1339 case MEM_ONLINE: 1339 case MEM_ONLINE:
1340 case MEM_OFFLINE: 1340 case MEM_OFFLINE:
1341 case MEM_CANCEL_ONLINE: 1341 case MEM_CANCEL_ONLINE:
1342 case MEM_CANCEL_OFFLINE: 1342 case MEM_CANCEL_OFFLINE:
1343 break; 1343 break;
1344 } 1344 }
1345 out: 1345 out:
1346 return notifier_from_errno(ret); 1346 return notifier_from_errno(ret);
1347 } 1347 }
1348 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1348 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1349 1349
1350 /* 1350 /*
1351 * swap the static kmem_cache_node with kmalloced memory 1351 * swap the static kmem_cache_node with kmalloced memory
1352 */ 1352 */
1353 static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, 1353 static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
1354 int nodeid) 1354 int nodeid)
1355 { 1355 {
1356 struct kmem_cache_node *ptr; 1356 struct kmem_cache_node *ptr;
1357 1357
1358 ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); 1358 ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
1359 BUG_ON(!ptr); 1359 BUG_ON(!ptr);
1360 1360
1361 memcpy(ptr, list, sizeof(struct kmem_cache_node)); 1361 memcpy(ptr, list, sizeof(struct kmem_cache_node));
1362 /* 1362 /*
1363 * Do not assume that spinlocks can be initialized via memcpy: 1363 * Do not assume that spinlocks can be initialized via memcpy:
1364 */ 1364 */
1365 spin_lock_init(&ptr->list_lock); 1365 spin_lock_init(&ptr->list_lock);
1366 1366
1367 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1367 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1368 cachep->node[nodeid] = ptr; 1368 cachep->node[nodeid] = ptr;
1369 } 1369 }
1370 1370
1371 /* 1371 /*
1372 * For setting up all the kmem_cache_node structures for a cache whose 1372 * For setting up all the kmem_cache_node structures for a cache whose
1373 * buffer_size is the same as the size of kmem_cache_node. 1373 * buffer_size is the same as the size of kmem_cache_node.
1374 */ 1374 */
1375 static void __init set_up_node(struct kmem_cache *cachep, int index) 1375 static void __init set_up_node(struct kmem_cache *cachep, int index)
1376 { 1376 {
1377 int node; 1377 int node;
1378 1378
1379 for_each_online_node(node) { 1379 for_each_online_node(node) {
1380 cachep->node[node] = &init_kmem_cache_node[index + node]; 1380 cachep->node[node] = &init_kmem_cache_node[index + node];
1381 cachep->node[node]->next_reap = jiffies + 1381 cachep->node[node]->next_reap = jiffies +
1382 REAPTIMEOUT_NODE + 1382 REAPTIMEOUT_NODE +
1383 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1383 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
1384 } 1384 }
1385 } 1385 }
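/*
 * Illustrative sketch, not kernel code: the next_reap formula above staggers
 * each cache's reap deadline by folding the cache pointer into the offset, so
 * caches set up in the same tick do not all come due together.  The timeout
 * value below is a made-up placeholder, not the kernel's REAPTIMEOUT_NODE.
 */
#include <stdio.h>

#define FAKE_REAP_TIMEOUT 400UL

static unsigned long next_reap(unsigned long now, const void *cachep)
{
	return now + FAKE_REAP_TIMEOUT +
	       ((unsigned long)cachep) % FAKE_REAP_TIMEOUT;
}

int main(void)
{
	static int a, b;	/* two distinct "cache" addresses */

	/* Same tick, different pointers -> different deadlines. */
	printf("%lu %lu\n", next_reap(1000, &a), next_reap(1000, &b));
	return 0;
}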
1386 1386
1387 /* 1387 /*
1388 * The memory after the last cpu cache pointer is used for the 1388 * The memory after the last cpu cache pointer is used for the
1389 * node pointer. 1389 * node pointer.
1390 */ 1390 */
1391 static void setup_node_pointer(struct kmem_cache *cachep) 1391 static void setup_node_pointer(struct kmem_cache *cachep)
1392 { 1392 {
1393 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; 1393 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
1394 } 1394 }
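/*
 * Illustrative sketch, not kernel code: setup_node_pointer() relies on the
 * node-pointer array living directly after the last per-cpu array_cache
 * pointer, and kmem_cache_init() below sizes the boot cache the same way with
 * offsetof(struct kmem_cache, array[nr_cpu_ids]).  A userspace sketch of that
 * sizing idiom with stand-in types and made-up counts:
 */
#include <stddef.h>
#include <stdio.h>

enum { FAKE_NR_CPU_IDS = 4, FAKE_NR_NODE_IDS = 2 };

struct fake_cache {
	int limit;				/* stands in for the fixed fields */
	void *array[FAKE_NR_CPU_IDS];		/* per-cpu pointers; node pointers follow */
};

int main(void)
{
	/* Fixed part + per-cpu slots, then room for the node pointers. */
	size_t size = offsetof(struct fake_cache, array[FAKE_NR_CPU_IDS]) +
		      FAKE_NR_NODE_IDS * sizeof(void *);

	printf("allocation size: %zu bytes\n", size);
	return 0;
}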
1395 1395
1396 /* 1396 /*
1397 * Initialisation. Called after the page allocator has been initialised and 1397 * Initialisation. Called after the page allocator has been initialised and
1398 * before smp_init(). 1398 * before smp_init().
1399 */ 1399 */
1400 void __init kmem_cache_init(void) 1400 void __init kmem_cache_init(void)
1401 { 1401 {
1402 int i; 1402 int i;
1403 1403
1404 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < 1404 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
1405 sizeof(struct rcu_head)); 1405 sizeof(struct rcu_head));
1406 kmem_cache = &kmem_cache_boot; 1406 kmem_cache = &kmem_cache_boot;
1407 setup_node_pointer(kmem_cache); 1407 setup_node_pointer(kmem_cache);
1408 1408
1409 if (num_possible_nodes() == 1) 1409 if (num_possible_nodes() == 1)
1410 use_alien_caches = 0; 1410 use_alien_caches = 0;
1411 1411
1412 for (i = 0; i < NUM_INIT_LISTS; i++) 1412 for (i = 0; i < NUM_INIT_LISTS; i++)
1413 kmem_cache_node_init(&init_kmem_cache_node[i]); 1413 kmem_cache_node_init(&init_kmem_cache_node[i]);
1414 1414
1415 set_up_node(kmem_cache, CACHE_CACHE); 1415 set_up_node(kmem_cache, CACHE_CACHE);
1416 1416
1417 /* 1417 /*
1418 * Fragmentation resistance on low memory - only use bigger 1418 * Fragmentation resistance on low memory - only use bigger
1419 * page orders on machines with more than 32MB of memory if 1419 * page orders on machines with more than 32MB of memory if
1420 * not overridden on the command line. 1420 * not overridden on the command line.
1421 */ 1421 */
1422 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) 1422 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
1423 slab_max_order = SLAB_MAX_ORDER_HI; 1423 slab_max_order = SLAB_MAX_ORDER_HI;
1424 1424
1425 /* Bootstrap is tricky, because several objects are allocated 1425 /* Bootstrap is tricky, because several objects are allocated
1426 * from caches that do not exist yet: 1426 * from caches that do not exist yet:
1427 * 1) initialize the kmem_cache cache: it contains the struct 1427 * 1) initialize the kmem_cache cache: it contains the struct
1428 * kmem_cache structures of all caches, except kmem_cache itself: 1428 * kmem_cache structures of all caches, except kmem_cache itself:
1429 * kmem_cache is statically allocated. 1429 * kmem_cache is statically allocated.
1430 * Initially an __init data area is used for the head array and the 1430 * Initially an __init data area is used for the head array and the
1431 * kmem_cache_node structures, it's replaced with a kmalloc allocated 1431 * kmem_cache_node structures, it's replaced with a kmalloc allocated
1432 * array at the end of the bootstrap. 1432 * array at the end of the bootstrap.
1433 * 2) Create the first kmalloc cache. 1433 * 2) Create the first kmalloc cache.
1434 * The struct kmem_cache for the new cache is allocated normally. 1434 * The struct kmem_cache for the new cache is allocated normally.
1435 * An __init data area is used for the head array. 1435 * An __init data area is used for the head array.
1436 * 3) Create the remaining kmalloc caches, with minimally sized 1436 * 3) Create the remaining kmalloc caches, with minimally sized
1437 * head arrays. 1437 * head arrays.
1438 * 4) Replace the __init data head arrays for kmem_cache and the first 1438 * 4) Replace the __init data head arrays for kmem_cache and the first
1439 * kmalloc cache with kmalloc allocated arrays. 1439 * kmalloc cache with kmalloc allocated arrays.
1440 * 5) Replace the __init data for kmem_cache_node for kmem_cache and 1440 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
1441 * the other caches with kmalloc allocated memory. 1441 * the other caches with kmalloc allocated memory.
1442 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1442 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1443 */ 1443 */
1444 1444
1445 /* 1) create the kmem_cache */ 1445 /* 1) create the kmem_cache */
1446 1446
1447 /* 1447 /*
1448 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1448 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1449 */ 1449 */
1450 create_boot_cache(kmem_cache, "kmem_cache", 1450 create_boot_cache(kmem_cache, "kmem_cache",
1451 offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1451 offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1452 nr_node_ids * sizeof(struct kmem_cache_node *), 1452 nr_node_ids * sizeof(struct kmem_cache_node *),
1453 SLAB_HWCACHE_ALIGN); 1453 SLAB_HWCACHE_ALIGN);
1454 list_add(&kmem_cache->list, &slab_caches); 1454 list_add(&kmem_cache->list, &slab_caches);
1455 1455
1456 /* 2+3) create the kmalloc caches */ 1456 /* 2+3) create the kmalloc caches */
1457 1457
1458 /* 1458 /*
1459 * Initialize the caches that provide memory for the array cache and the 1459 * Initialize the caches that provide memory for the array cache and the
1460 * kmem_cache_node structures first. Without this, further allocations will 1460 * kmem_cache_node structures first. Without this, further allocations will
1461 * BUG(). 1461 * BUG().
1462 */ 1462 */
1463 1463
1464 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", 1464 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
1465 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); 1465 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
1466 1466
1467 if (INDEX_AC != INDEX_NODE) 1467 if (INDEX_AC != INDEX_NODE)
1468 kmalloc_caches[INDEX_NODE] = 1468 kmalloc_caches[INDEX_NODE] =
1469 create_kmalloc_cache("kmalloc-node", 1469 create_kmalloc_cache("kmalloc-node",
1470 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1470 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1471 1471
1472 slab_early_init = 0; 1472 slab_early_init = 0;
1473 1473
1474 /* 4) Replace the bootstrap head arrays */ 1474 /* 4) Replace the bootstrap head arrays */
1475 { 1475 {
1476 struct array_cache *ptr; 1476 struct array_cache *ptr;
1477 1477
1478 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1478 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1479 1479
1480 memcpy(ptr, cpu_cache_get(kmem_cache), 1480 memcpy(ptr, cpu_cache_get(kmem_cache),
1481 sizeof(struct arraycache_init)); 1481 sizeof(struct arraycache_init));
1482 1482
1483 kmem_cache->array[smp_processor_id()] = ptr; 1483 kmem_cache->array[smp_processor_id()] = ptr;
1484 1484
1485 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1485 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1486 1486
1487 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) 1487 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
1488 != &initarray_generic.cache); 1488 != &initarray_generic.cache);
1489 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), 1489 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1490 sizeof(struct arraycache_init)); 1490 sizeof(struct arraycache_init));
1491 1491
1492 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; 1492 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1493 } 1493 }
1494 /* 5) Replace the bootstrap kmem_cache_node */ 1494 /* 5) Replace the bootstrap kmem_cache_node */
1495 { 1495 {
1496 int nid; 1496 int nid;
1497 1497
1498 for_each_online_node(nid) { 1498 for_each_online_node(nid) {
1499 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1499 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1500 1500
1501 init_list(kmalloc_caches[INDEX_AC], 1501 init_list(kmalloc_caches[INDEX_AC],
1502 &init_kmem_cache_node[SIZE_AC + nid], nid); 1502 &init_kmem_cache_node[SIZE_AC + nid], nid);
1503 1503
1504 if (INDEX_AC != INDEX_NODE) { 1504 if (INDEX_AC != INDEX_NODE) {
1505 init_list(kmalloc_caches[INDEX_NODE], 1505 init_list(kmalloc_caches[INDEX_NODE],
1506 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1506 &init_kmem_cache_node[SIZE_NODE + nid], nid);
1507 } 1507 }
1508 } 1508 }
1509 } 1509 }
1510 1510
1511 create_kmalloc_caches(ARCH_KMALLOC_FLAGS); 1511 create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
1512 } 1512 }
1513 1513
1514 void __init kmem_cache_init_late(void) 1514 void __init kmem_cache_init_late(void)
1515 { 1515 {
1516 struct kmem_cache *cachep; 1516 struct kmem_cache *cachep;
1517 1517
1518 slab_state = UP; 1518 slab_state = UP;
1519 1519
1520 /* 6) resize the head arrays to their final sizes */ 1520 /* 6) resize the head arrays to their final sizes */
1521 mutex_lock(&slab_mutex); 1521 mutex_lock(&slab_mutex);
1522 list_for_each_entry(cachep, &slab_caches, list) 1522 list_for_each_entry(cachep, &slab_caches, list)
1523 if (enable_cpucache(cachep, GFP_NOWAIT)) 1523 if (enable_cpucache(cachep, GFP_NOWAIT))
1524 BUG(); 1524 BUG();
1525 mutex_unlock(&slab_mutex); 1525 mutex_unlock(&slab_mutex);
1526 1526
1527 /* Done! */ 1527 /* Done! */
1528 slab_state = FULL; 1528 slab_state = FULL;
1529 1529
1530 /* 1530 /*
1531 * Register a cpu startup notifier callback that initializes 1531 * Register a cpu startup notifier callback that initializes
1532 * cpu_cache_get for all new cpus 1532 * cpu_cache_get for all new cpus
1533 */ 1533 */
1534 register_cpu_notifier(&cpucache_notifier); 1534 register_cpu_notifier(&cpucache_notifier);
1535 1535
1536 #ifdef CONFIG_NUMA 1536 #ifdef CONFIG_NUMA
1537 /* 1537 /*
1538 * Register a memory hotplug callback that initializes and drains 1538 * Register a memory hotplug callback that initializes and drains
1539 * the per-node slab structures. 1539 * the per-node slab structures.
1540 */ 1540 */
1541 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 1541 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1542 #endif 1542 #endif
1543 1543
1544 /* 1544 /*
1545 * The reap timers are started later, with a module init call: That part 1545 * The reap timers are started later, with a module init call: That part
1546 * of the kernel is not yet operational. 1546 * of the kernel is not yet operational.
1547 */ 1547 */
1548 } 1548 }
1549 1549
1550 static int __init cpucache_init(void) 1550 static int __init cpucache_init(void)
1551 { 1551 {
1552 int cpu; 1552 int cpu;
1553 1553
1554 /* 1554 /*
1555 * Register the timers that return unneeded pages to the page allocator 1555 * Register the timers that return unneeded pages to the page allocator
1556 */ 1556 */
1557 for_each_online_cpu(cpu) 1557 for_each_online_cpu(cpu)
1558 start_cpu_timer(cpu); 1558 start_cpu_timer(cpu);
1559 1559
1560 /* Done! */ 1560 /* Done! */
1561 slab_state = FULL; 1561 slab_state = FULL;
1562 return 0; 1562 return 0;
1563 } 1563 }
1564 __initcall(cpucache_init); 1564 __initcall(cpucache_init);
1565 1565
1566 static noinline void 1566 static noinline void
1567 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1567 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1568 { 1568 {
1569 #if DEBUG 1569 #if DEBUG
1570 struct kmem_cache_node *n; 1570 struct kmem_cache_node *n;
1571 struct page *page; 1571 struct page *page;
1572 unsigned long flags; 1572 unsigned long flags;
1573 int node; 1573 int node;
1574 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 1574 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1575 DEFAULT_RATELIMIT_BURST); 1575 DEFAULT_RATELIMIT_BURST);
1576 1576
1577 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1577 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1578 return; 1578 return;
1579 1579
1580 printk(KERN_WARNING 1580 printk(KERN_WARNING
1581 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1581 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1582 nodeid, gfpflags); 1582 nodeid, gfpflags);
1583 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1583 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1584 cachep->name, cachep->size, cachep->gfporder); 1584 cachep->name, cachep->size, cachep->gfporder);
1585 1585
1586 for_each_kmem_cache_node(cachep, node, n) { 1586 for_each_kmem_cache_node(cachep, node, n) {
1587 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1587 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1588 unsigned long active_slabs = 0, num_slabs = 0; 1588 unsigned long active_slabs = 0, num_slabs = 0;
1589 1589
1590 spin_lock_irqsave(&n->list_lock, flags); 1590 spin_lock_irqsave(&n->list_lock, flags);
1591 list_for_each_entry(page, &n->slabs_full, lru) { 1591 list_for_each_entry(page, &n->slabs_full, lru) {
1592 active_objs += cachep->num; 1592 active_objs += cachep->num;
1593 active_slabs++; 1593 active_slabs++;
1594 } 1594 }
1595 list_for_each_entry(page, &n->slabs_partial, lru) { 1595 list_for_each_entry(page, &n->slabs_partial, lru) {
1596 active_objs += page->active; 1596 active_objs += page->active;
1597 active_slabs++; 1597 active_slabs++;
1598 } 1598 }
1599 list_for_each_entry(page, &n->slabs_free, lru) 1599 list_for_each_entry(page, &n->slabs_free, lru)
1600 num_slabs++; 1600 num_slabs++;
1601 1601
1602 free_objects += n->free_objects; 1602 free_objects += n->free_objects;
1603 spin_unlock_irqrestore(&n->list_lock, flags); 1603 spin_unlock_irqrestore(&n->list_lock, flags);
1604 1604
1605 num_slabs += active_slabs; 1605 num_slabs += active_slabs;
1606 num_objs = num_slabs * cachep->num; 1606 num_objs = num_slabs * cachep->num;
1607 printk(KERN_WARNING 1607 printk(KERN_WARNING
1608 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", 1608 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1609 node, active_slabs, num_slabs, active_objs, num_objs, 1609 node, active_slabs, num_slabs, active_objs, num_objs,
1610 free_objects); 1610 free_objects);
1611 } 1611 }
1612 #endif 1612 #endif
1613 } 1613 }
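/*
 * Illustrative sketch, not kernel code: a worked example (made-up numbers) of
 * the per-node summary printed above.  Full and partial slabs count as active
 * slabs, free slabs only add capacity, and num_objs is the total slab count
 * times the objects per slab (cachep->num).
 */
#include <stdio.h>

int main(void)
{
	unsigned long objs_per_slab = 32;			/* cachep->num */
	unsigned long full = 10, partial = 3, free_slabs = 2;
	unsigned long partial_active[] = { 5, 17, 30 };		/* page->active */

	unsigned long active_slabs = full + partial;
	unsigned long active_objs = full * objs_per_slab +
				    partial_active[0] + partial_active[1] +
				    partial_active[2];
	unsigned long num_slabs = active_slabs + free_slabs;
	unsigned long num_objs = num_slabs * objs_per_slab;

	/* Prints "slabs: 13/15, objs: 372/480" for these made-up numbers. */
	printf("slabs: %lu/%lu, objs: %lu/%lu\n",
	       active_slabs, num_slabs, active_objs, num_objs);
	return 0;
}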
1614 1614
1615 /* 1615 /*
1616 * Interface to system's page allocator. No need to hold the 1616 * Interface to system's page allocator. No need to hold the
1617 * kmem_cache_node ->list_lock. 1617 * kmem_cache_node ->list_lock.
1618 * 1618 *
1619 * If we requested dmaable memory, we will get it. Even if we 1619 * If we requested dmaable memory, we will get it. Even if we
1620 * did not request dmaable memory, we might get it, but that 1620 * did not request dmaable memory, we might get it, but that
1621 * would be relatively rare and ignorable. 1621 * would be relatively rare and ignorable.
1622 */ 1622 */
1623 static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, 1623 static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1624 int nodeid) 1624 int nodeid)
1625 { 1625 {
1626 struct page *page; 1626 struct page *page;
1627 int nr_pages; 1627 int nr_pages;
1628 1628
1629 flags |= cachep->allocflags; 1629 flags |= cachep->allocflags;
1630 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1630 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1631 flags |= __GFP_RECLAIMABLE; 1631 flags |= __GFP_RECLAIMABLE;
1632 1632
1633 if (memcg_charge_slab(cachep, flags, cachep->gfporder)) 1633 if (memcg_charge_slab(cachep, flags, cachep->gfporder))
1634 return NULL; 1634 return NULL;
1635 1635
1636 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1636 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1637 if (!page) { 1637 if (!page) {
1638 memcg_uncharge_slab(cachep, cachep->gfporder); 1638 memcg_uncharge_slab(cachep, cachep->gfporder);
1639 slab_out_of_memory(cachep, flags, nodeid); 1639 slab_out_of_memory(cachep, flags, nodeid);
1640 return NULL; 1640 return NULL;
1641 } 1641 }
1642 1642
1643 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ 1643 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1644 if (unlikely(page->pfmemalloc)) 1644 if (unlikely(page->pfmemalloc))
1645 pfmemalloc_active = true; 1645 pfmemalloc_active = true;
1646 1646
1647 nr_pages = (1 << cachep->gfporder); 1647 nr_pages = (1 << cachep->gfporder);
1648 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1648 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1649 add_zone_page_state(page_zone(page), 1649 add_zone_page_state(page_zone(page),
1650 NR_SLAB_RECLAIMABLE, nr_pages); 1650 NR_SLAB_RECLAIMABLE, nr_pages);
1651 else 1651 else
1652 add_zone_page_state(page_zone(page), 1652 add_zone_page_state(page_zone(page),
1653 NR_SLAB_UNRECLAIMABLE, nr_pages); 1653 NR_SLAB_UNRECLAIMABLE, nr_pages);
1654 __SetPageSlab(page); 1654 __SetPageSlab(page);
1655 if (page->pfmemalloc) 1655 if (page->pfmemalloc)
1656 SetPageSlabPfmemalloc(page); 1656 SetPageSlabPfmemalloc(page);
1657 1657
1658 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1658 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1659 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1659 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1660 1660
1661 if (cachep->ctor) 1661 if (cachep->ctor)
1662 kmemcheck_mark_uninitialized_pages(page, nr_pages); 1662 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1663 else 1663 else
1664 kmemcheck_mark_unallocated_pages(page, nr_pages); 1664 kmemcheck_mark_unallocated_pages(page, nr_pages);
1665 } 1665 }
1666 1666
1667 return page; 1667 return page;
1668 } 1668 }
1669 1669
1670 /* 1670 /*
1671 * Interface to system's page release. 1671 * Interface to system's page release.
1672 */ 1672 */
1673 static void kmem_freepages(struct kmem_cache *cachep, struct page *page) 1673 static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1674 { 1674 {
1675 const unsigned long nr_freed = (1 << cachep->gfporder); 1675 const unsigned long nr_freed = (1 << cachep->gfporder);
1676 1676
1677 kmemcheck_free_shadow(page, cachep->gfporder); 1677 kmemcheck_free_shadow(page, cachep->gfporder);
1678 1678
1679 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1679 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1680 sub_zone_page_state(page_zone(page), 1680 sub_zone_page_state(page_zone(page),
1681 NR_SLAB_RECLAIMABLE, nr_freed); 1681 NR_SLAB_RECLAIMABLE, nr_freed);
1682 else 1682 else
1683 sub_zone_page_state(page_zone(page), 1683 sub_zone_page_state(page_zone(page),
1684 NR_SLAB_UNRECLAIMABLE, nr_freed); 1684 NR_SLAB_UNRECLAIMABLE, nr_freed);
1685 1685
1686 BUG_ON(!PageSlab(page)); 1686 BUG_ON(!PageSlab(page));
1687 __ClearPageSlabPfmemalloc(page); 1687 __ClearPageSlabPfmemalloc(page);
1688 __ClearPageSlab(page); 1688 __ClearPageSlab(page);
1689 page_mapcount_reset(page); 1689 page_mapcount_reset(page);
1690 page->mapping = NULL; 1690 page->mapping = NULL;
1691 1691
1692 if (current->reclaim_state) 1692 if (current->reclaim_state)
1693 current->reclaim_state->reclaimed_slab += nr_freed; 1693 current->reclaim_state->reclaimed_slab += nr_freed;
1694 __free_pages(page, cachep->gfporder); 1694 __free_pages(page, cachep->gfporder);
1695 memcg_uncharge_slab(cachep, cachep->gfporder); 1695 memcg_uncharge_slab(cachep, cachep->gfporder);
1696 } 1696 }
1697 1697
1698 static void kmem_rcu_free(struct rcu_head *head) 1698 static void kmem_rcu_free(struct rcu_head *head)
1699 { 1699 {
1700 struct kmem_cache *cachep; 1700 struct kmem_cache *cachep;
1701 struct page *page; 1701 struct page *page;
1702 1702
1703 page = container_of(head, struct page, rcu_head); 1703 page = container_of(head, struct page, rcu_head);
1704 cachep = page->slab_cache; 1704 cachep = page->slab_cache;
1705 1705
1706 kmem_freepages(cachep, page); 1706 kmem_freepages(cachep, page);
1707 } 1707 }
1708 1708
1709 #if DEBUG 1709 #if DEBUG
1710 1710
1711 #ifdef CONFIG_DEBUG_PAGEALLOC 1711 #ifdef CONFIG_DEBUG_PAGEALLOC
1712 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1712 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1713 unsigned long caller) 1713 unsigned long caller)
1714 { 1714 {
1715 int size = cachep->object_size; 1715 int size = cachep->object_size;
1716 1716
1717 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1717 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1718 1718
1719 if (size < 5 * sizeof(unsigned long)) 1719 if (size < 5 * sizeof(unsigned long))
1720 return; 1720 return;
1721 1721
1722 *addr++ = 0x12345678; 1722 *addr++ = 0x12345678;
1723 *addr++ = caller; 1723 *addr++ = caller;
1724 *addr++ = smp_processor_id(); 1724 *addr++ = smp_processor_id();
1725 size -= 3 * sizeof(unsigned long); 1725 size -= 3 * sizeof(unsigned long);
1726 { 1726 {
1727 unsigned long *sptr = &caller; 1727 unsigned long *sptr = &caller;
1728 unsigned long svalue; 1728 unsigned long svalue;
1729 1729
1730 while (!kstack_end(sptr)) { 1730 while (!kstack_end(sptr)) {
1731 svalue = *sptr++; 1731 svalue = *sptr++;
1732 if (kernel_text_address(svalue)) { 1732 if (kernel_text_address(svalue)) {
1733 *addr++ = svalue; 1733 *addr++ = svalue;
1734 size -= sizeof(unsigned long); 1734 size -= sizeof(unsigned long);
1735 if (size <= sizeof(unsigned long)) 1735 if (size <= sizeof(unsigned long))
1736 break; 1736 break;
1737 } 1737 }
1738 } 1738 }
1739 1739
1740 } 1740 }
1741 *addr++ = 0x87654321; 1741 *addr++ = 0x87654321;
1742 } 1742 }
1743 #endif 1743 #endif
1744 1744
1745 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1745 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1746 { 1746 {
1747 int size = cachep->object_size; 1747 int size = cachep->object_size;
1748 addr = &((char *)addr)[obj_offset(cachep)]; 1748 addr = &((char *)addr)[obj_offset(cachep)];
1749 1749
1750 memset(addr, val, size); 1750 memset(addr, val, size);
1751 *(unsigned char *)(addr + size - 1) = POISON_END; 1751 *(unsigned char *)(addr + size - 1) = POISON_END;
1752 } 1752 }
1753 1753
1754 static void dump_line(char *data, int offset, int limit) 1754 static void dump_line(char *data, int offset, int limit)
1755 { 1755 {
1756 int i; 1756 int i;
1757 unsigned char error = 0; 1757 unsigned char error = 0;
1758 int bad_count = 0; 1758 int bad_count = 0;
1759 1759
1760 printk(KERN_ERR "%03x: ", offset); 1760 printk(KERN_ERR "%03x: ", offset);
1761 for (i = 0; i < limit; i++) { 1761 for (i = 0; i < limit; i++) {
1762 if (data[offset + i] != POISON_FREE) { 1762 if (data[offset + i] != POISON_FREE) {
1763 error = data[offset + i]; 1763 error = data[offset + i];
1764 bad_count++; 1764 bad_count++;
1765 } 1765 }
1766 } 1766 }
1767 print_hex_dump(KERN_CONT, "", 0, 16, 1, 1767 print_hex_dump(KERN_CONT, "", 0, 16, 1,
1768 &data[offset], limit, 1); 1768 &data[offset], limit, 1);
1769 1769
1770 if (bad_count == 1) { 1770 if (bad_count == 1) {
1771 error ^= POISON_FREE; 1771 error ^= POISON_FREE;
1772 if (!(error & (error - 1))) { 1772 if (!(error & (error - 1))) {
1773 printk(KERN_ERR "Single bit error detected. Probably " 1773 printk(KERN_ERR "Single bit error detected. Probably "
1774 "bad RAM.\n"); 1774 "bad RAM.\n");
1775 #ifdef CONFIG_X86 1775 #ifdef CONFIG_X86
1776 printk(KERN_ERR "Run memtest86+ or a similar memory " 1776 printk(KERN_ERR "Run memtest86+ or a similar memory "
1777 "test tool.\n"); 1777 "test tool.\n");
1778 #else 1778 #else
1779 printk(KERN_ERR "Run a memory test tool.\n"); 1779 printk(KERN_ERR "Run a memory test tool.\n");
1780 #endif 1780 #endif
1781 } 1781 }
1782 } 1782 }
1783 } 1783 }
1784 #endif 1784 #endif
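/*
 * Illustrative sketch, not kernel code: dump_line() above reports "probably
 * bad RAM" when exactly one byte differs from POISON_FREE and the difference
 * is a single flipped bit.  x & (x - 1) clears the lowest set bit, so it is
 * zero exactly when at most one bit is set.  POISON_FREE is 0x6b in
 * include/linux/poison.h.
 */
#include <stdbool.h>
#include <stdio.h>

#define POISON_FREE 0x6b

static bool single_bit_flip(unsigned char seen)
{
	unsigned char diff = seen ^ POISON_FREE;

	return diff && !(diff & (diff - 1));	/* exactly one bit differs */
}

int main(void)
{
	printf("%d %d\n",
	       single_bit_flip(POISON_FREE ^ 0x08),	/* one flipped bit: 1 */
	       single_bit_flip(0x00));			/* several bits: 0 */
	return 0;
}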
1785 1785
1786 #if DEBUG 1786 #if DEBUG
1787 1787
1788 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1788 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1789 { 1789 {
1790 int i, size; 1790 int i, size;
1791 char *realobj; 1791 char *realobj;
1792 1792
1793 if (cachep->flags & SLAB_RED_ZONE) { 1793 if (cachep->flags & SLAB_RED_ZONE) {
1794 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", 1794 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1795 *dbg_redzone1(cachep, objp), 1795 *dbg_redzone1(cachep, objp),
1796 *dbg_redzone2(cachep, objp)); 1796 *dbg_redzone2(cachep, objp));
1797 } 1797 }
1798 1798
1799 if (cachep->flags & SLAB_STORE_USER) { 1799 if (cachep->flags & SLAB_STORE_USER) {
1800 printk(KERN_ERR "Last user: [<%p>](%pSR)\n", 1800 printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
1801 *dbg_userword(cachep, objp), 1801 *dbg_userword(cachep, objp),
1802 *dbg_userword(cachep, objp)); 1802 *dbg_userword(cachep, objp));
1803 } 1803 }
1804 realobj = (char *)objp + obj_offset(cachep); 1804 realobj = (char *)objp + obj_offset(cachep);
1805 size = cachep->object_size; 1805 size = cachep->object_size;
1806 for (i = 0; i < size && lines; i += 16, lines--) { 1806 for (i = 0; i < size && lines; i += 16, lines--) {
1807 int limit; 1807 int limit;
1808 limit = 16; 1808 limit = 16;
1809 if (i + limit > size) 1809 if (i + limit > size)
1810 limit = size - i; 1810 limit = size - i;
1811 dump_line(realobj, i, limit); 1811 dump_line(realobj, i, limit);
1812 } 1812 }
1813 } 1813 }
1814 1814
1815 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1815 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1816 { 1816 {
1817 char *realobj; 1817 char *realobj;
1818 int size, i; 1818 int size, i;
1819 int lines = 0; 1819 int lines = 0;
1820 1820
1821 realobj = (char *)objp + obj_offset(cachep); 1821 realobj = (char *)objp + obj_offset(cachep);
1822 size = cachep->object_size; 1822 size = cachep->object_size;
1823 1823
1824 for (i = 0; i < size; i++) { 1824 for (i = 0; i < size; i++) {
1825 char exp = POISON_FREE; 1825 char exp = POISON_FREE;
1826 if (i == size - 1) 1826 if (i == size - 1)
1827 exp = POISON_END; 1827 exp = POISON_END;
1828 if (realobj[i] != exp) { 1828 if (realobj[i] != exp) {
1829 int limit; 1829 int limit;
1830 /* Mismatch ! */ 1830 /* Mismatch ! */
1831 /* Print header */ 1831 /* Print header */
1832 if (lines == 0) { 1832 if (lines == 0) {
1833 printk(KERN_ERR 1833 printk(KERN_ERR
1834 "Slab corruption (%s): %s start=%p, len=%d\n", 1834 "Slab corruption (%s): %s start=%p, len=%d\n",
1835 print_tainted(), cachep->name, realobj, size); 1835 print_tainted(), cachep->name, realobj, size);
1836 print_objinfo(cachep, objp, 0); 1836 print_objinfo(cachep, objp, 0);
1837 } 1837 }
1838 /* Hexdump the affected line */ 1838 /* Hexdump the affected line */
1839 i = (i / 16) * 16; 1839 i = (i / 16) * 16;
1840 limit = 16; 1840 limit = 16;
1841 if (i + limit > size) 1841 if (i + limit > size)
1842 limit = size - i; 1842 limit = size - i;
1843 dump_line(realobj, i, limit); 1843 dump_line(realobj, i, limit);
1844 i += 16; 1844 i += 16;
1845 lines++; 1845 lines++;
1846 /* Limit to 5 lines */ 1846 /* Limit to 5 lines */
1847 if (lines > 5) 1847 if (lines > 5)
1848 break; 1848 break;
1849 } 1849 }
1850 } 1850 }
1851 if (lines != 0) { 1851 if (lines != 0) {
1852 /* Print some data about the neighboring objects, if they 1852 /* Print some data about the neighboring objects, if they
1853 * exist: 1853 * exist:
1854 */ 1854 */
1855 struct page *page = virt_to_head_page(objp); 1855 struct page *page = virt_to_head_page(objp);
1856 unsigned int objnr; 1856 unsigned int objnr;
1857 1857
1858 objnr = obj_to_index(cachep, page, objp); 1858 objnr = obj_to_index(cachep, page, objp);
1859 if (objnr) { 1859 if (objnr) {
1860 objp = index_to_obj(cachep, page, objnr - 1); 1860 objp = index_to_obj(cachep, page, objnr - 1);
1861 realobj = (char *)objp + obj_offset(cachep); 1861 realobj = (char *)objp + obj_offset(cachep);
1862 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1862 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1863 realobj, size); 1863 realobj, size);
1864 print_objinfo(cachep, objp, 2); 1864 print_objinfo(cachep, objp, 2);
1865 } 1865 }
1866 if (objnr + 1 < cachep->num) { 1866 if (objnr + 1 < cachep->num) {
1867 objp = index_to_obj(cachep, page, objnr + 1); 1867 objp = index_to_obj(cachep, page, objnr + 1);
1868 realobj = (char *)objp + obj_offset(cachep); 1868 realobj = (char *)objp + obj_offset(cachep);
1869 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1869 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1870 realobj, size); 1870 realobj, size);
1871 print_objinfo(cachep, objp, 2); 1871 print_objinfo(cachep, objp, 2);
1872 } 1872 }
1873 } 1873 }
1874 } 1874 }
1875 #endif 1875 #endif
1876 1876
1877 #if DEBUG 1877 #if DEBUG
1878 static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1878 static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1879 struct page *page) 1879 struct page *page)
1880 { 1880 {
1881 int i; 1881 int i;
1882 for (i = 0; i < cachep->num; i++) { 1882 for (i = 0; i < cachep->num; i++) {
1883 void *objp = index_to_obj(cachep, page, i); 1883 void *objp = index_to_obj(cachep, page, i);
1884 1884
1885 if (cachep->flags & SLAB_POISON) { 1885 if (cachep->flags & SLAB_POISON) {
1886 #ifdef CONFIG_DEBUG_PAGEALLOC 1886 #ifdef CONFIG_DEBUG_PAGEALLOC
1887 if (cachep->size % PAGE_SIZE == 0 && 1887 if (cachep->size % PAGE_SIZE == 0 &&
1888 OFF_SLAB(cachep)) 1888 OFF_SLAB(cachep))
1889 kernel_map_pages(virt_to_page(objp), 1889 kernel_map_pages(virt_to_page(objp),
1890 cachep->size / PAGE_SIZE, 1); 1890 cachep->size / PAGE_SIZE, 1);
1891 else 1891 else
1892 check_poison_obj(cachep, objp); 1892 check_poison_obj(cachep, objp);
1893 #else 1893 #else
1894 check_poison_obj(cachep, objp); 1894 check_poison_obj(cachep, objp);
1895 #endif 1895 #endif
1896 } 1896 }
1897 if (cachep->flags & SLAB_RED_ZONE) { 1897 if (cachep->flags & SLAB_RED_ZONE) {
1898 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1898 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1899 slab_error(cachep, "start of a freed object " 1899 slab_error(cachep, "start of a freed object "
1900 "was overwritten"); 1900 "was overwritten");
1901 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1901 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1902 slab_error(cachep, "end of a freed object " 1902 slab_error(cachep, "end of a freed object "
1903 "was overwritten"); 1903 "was overwritten");
1904 } 1904 }
1905 } 1905 }
1906 } 1906 }
1907 #else 1907 #else
1908 static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1908 static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1909 struct page *page) 1909 struct page *page)
1910 { 1910 {
1911 } 1911 }
1912 #endif 1912 #endif
1913 1913
1914 /** 1914 /**
1915 * slab_destroy - destroy and release all objects in a slab 1915 * slab_destroy - destroy and release all objects in a slab
1916 * @cachep: cache pointer being destroyed 1916 * @cachep: cache pointer being destroyed
1917 * @page: page pointer being destroyed 1917 * @page: page pointer being destroyed
1918 * 1918 *
1919 * Destroy all the objs in a slab page, and release the mem back to the system. 1919 * Destroy all the objs in a slab page, and release the mem back to the system.
1920 * Before calling, the slab page must have been unlinked from the cache. The 1920 * Before calling, the slab page must have been unlinked from the cache. The
1921 * kmem_cache_node ->list_lock is not held/needed. 1921 * kmem_cache_node ->list_lock is not held/needed.
1922 */ 1922 */
1923 static void slab_destroy(struct kmem_cache *cachep, struct page *page) 1923 static void slab_destroy(struct kmem_cache *cachep, struct page *page)
1924 { 1924 {
1925 void *freelist; 1925 void *freelist;
1926 1926
1927 freelist = page->freelist; 1927 freelist = page->freelist;
1928 slab_destroy_debugcheck(cachep, page); 1928 slab_destroy_debugcheck(cachep, page);
1929 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1929 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1930 struct rcu_head *head; 1930 struct rcu_head *head;
1931 1931
1932 /* 1932 /*
1933 * RCU free overloads the RCU head over the LRU. 1933 * RCU free overloads the RCU head over the LRU.
1934 * slab_page has been overloaded over the LRU, 1934 * slab_page has been overloaded over the LRU,
1935 * however it is not used from now on so that 1935 * however it is not used from now on so that
1936 * we can use it safely. 1936 * we can use it safely.
1937 */ 1937 */
1938 head = (void *)&page->rcu_head; 1938 head = (void *)&page->rcu_head;
1939 call_rcu(head, kmem_rcu_free); 1939 call_rcu(head, kmem_rcu_free);
1940 1940
1941 } else { 1941 } else {
1942 kmem_freepages(cachep, page); 1942 kmem_freepages(cachep, page);
1943 } 1943 }
1944 1944
1945 /* 1945 /*
1946 * From now on, we don't use the freelist, 1946 * From now on, we don't use the freelist,
1947 * although the actual page can be freed in RCU context. 1947 * although the actual page can be freed in RCU context.
1948 */ 1948 */
1949 if (OFF_SLAB(cachep)) 1949 if (OFF_SLAB(cachep))
1950 kmem_cache_free(cachep->freelist_cache, freelist); 1950 kmem_cache_free(cachep->freelist_cache, freelist);
1951 } 1951 }
1952 1952
1953 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) 1953 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1954 { 1954 {
1955 struct page *page, *n; 1955 struct page *page, *n;
1956 1956
1957 list_for_each_entry_safe(page, n, list, lru) { 1957 list_for_each_entry_safe(page, n, list, lru) {
1958 list_del(&page->lru); 1958 list_del(&page->lru);
1959 slab_destroy(cachep, page); 1959 slab_destroy(cachep, page);
1960 } 1960 }
1961 } 1961 }
1962 1962
1963 /** 1963 /**
1964 * calculate_slab_order - calculate size (page order) of slabs 1964 * calculate_slab_order - calculate size (page order) of slabs
1965 * @cachep: pointer to the cache that is being created 1965 * @cachep: pointer to the cache that is being created
1966 * @size: size of objects to be created in this cache. 1966 * @size: size of objects to be created in this cache.
1967 * @align: required alignment for the objects. 1967 * @align: required alignment for the objects.
1968 * @flags: slab allocation flags 1968 * @flags: slab allocation flags
1969 * 1969 *
1970 * Also calculates the number of objects per slab. 1970 * Also calculates the number of objects per slab.
1971 * 1971 *
1972 * This could be made much more intelligent. For now, try to avoid using 1972 * This could be made much more intelligent. For now, try to avoid using
1973 * high order pages for slabs. When the gfp() functions are more friendly 1973 * high order pages for slabs. When the gfp() functions are more friendly
1974 * towards high-order requests, this should be changed. 1974 * towards high-order requests, this should be changed.
1975 */ 1975 */
1976 static size_t calculate_slab_order(struct kmem_cache *cachep, 1976 static size_t calculate_slab_order(struct kmem_cache *cachep,
1977 size_t size, size_t align, unsigned long flags) 1977 size_t size, size_t align, unsigned long flags)
1978 { 1978 {
1979 unsigned long offslab_limit; 1979 unsigned long offslab_limit;
1980 size_t left_over = 0; 1980 size_t left_over = 0;
1981 int gfporder; 1981 int gfporder;
1982 1982
1983 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 1983 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
1984 unsigned int num; 1984 unsigned int num;
1985 size_t remainder; 1985 size_t remainder;
1986 1986
1987 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1987 cache_estimate(gfporder, size, align, flags, &remainder, &num);
1988 if (!num) 1988 if (!num)
1989 continue; 1989 continue;
1990 1990
1991 /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ 1991 /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
1992 if (num > SLAB_OBJ_MAX_NUM) 1992 if (num > SLAB_OBJ_MAX_NUM)
1993 break; 1993 break;
1994 1994
1995 if (flags & CFLGS_OFF_SLAB) { 1995 if (flags & CFLGS_OFF_SLAB) {
1996 size_t freelist_size_per_obj = sizeof(freelist_idx_t); 1996 size_t freelist_size_per_obj = sizeof(freelist_idx_t);
1997 /* 1997 /*
1998 * Max number of objs-per-slab for caches which 1998 * Max number of objs-per-slab for caches which
1999 * use off-slab slabs. Needed to avoid a possible 1999 * use off-slab slabs. Needed to avoid a possible
2000 * looping condition in cache_grow(). 2000 * looping condition in cache_grow().
2001 */ 2001 */
2002 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 2002 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
2003 freelist_size_per_obj += sizeof(char); 2003 freelist_size_per_obj += sizeof(char);
2004 offslab_limit = size; 2004 offslab_limit = size;
2005 offslab_limit /= freelist_size_per_obj; 2005 offslab_limit /= freelist_size_per_obj;
2006 2006
2007 if (num > offslab_limit) 2007 if (num > offslab_limit)
2008 break; 2008 break;
2009 } 2009 }
2010 2010
2011 /* Found something acceptable - save it away */ 2011 /* Found something acceptable - save it away */
2012 cachep->num = num; 2012 cachep->num = num;
2013 cachep->gfporder = gfporder; 2013 cachep->gfporder = gfporder;
2014 left_over = remainder; 2014 left_over = remainder;
2015 2015
2016 /* 2016 /*
2017 * A VFS-reclaimable slab tends to have most allocations 2017 * A VFS-reclaimable slab tends to have most allocations
2018 * as GFP_NOFS and we really don't want to have to be allocating 2018 * as GFP_NOFS and we really don't want to have to be allocating
2019 * higher-order pages when we are unable to shrink dcache. 2019 * higher-order pages when we are unable to shrink dcache.
2020 */ 2020 */
2021 if (flags & SLAB_RECLAIM_ACCOUNT) 2021 if (flags & SLAB_RECLAIM_ACCOUNT)
2022 break; 2022 break;
2023 2023
2024 /* 2024 /*
2025 * Large number of objects is good, but very large slabs are 2025 * Large number of objects is good, but very large slabs are
2026 * currently bad for the gfp()s. 2026 * currently bad for the gfp()s.
2027 */ 2027 */
2028 if (gfporder >= slab_max_order) 2028 if (gfporder >= slab_max_order)
2029 break; 2029 break;
2030 2030
2031 /* 2031 /*
2032 * Acceptable internal fragmentation? 2032 * Acceptable internal fragmentation?
2033 */ 2033 */
2034 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 2034 if (left_over * 8 <= (PAGE_SIZE << gfporder))
2035 break; 2035 break;
2036 } 2036 }
2037 return left_over; 2037 return left_over;
2038 } 2038 }
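/*
 * Illustrative sketch, not kernel code: a deliberately simplified version of
 * the order-selection loop above.  It ignores freelist and alignment overhead
 * (the real cache_estimate() does not) and only demonstrates the "smallest
 * order whose leftover is at most 1/8 of the slab" criterion.
 */
#include <stdio.h>

#define FAKE_PAGE_SIZE 4096UL
#define FAKE_MAX_ORDER 10

int main(void)
{
	unsigned long size = 700;	/* made-up object size */
	int order;

	for (order = 0; order <= FAKE_MAX_ORDER; order++) {
		unsigned long slab = FAKE_PAGE_SIZE << order;
		unsigned long num = slab / size;
		unsigned long left_over = slab - num * size;

		if (!num)
			continue;
		/* Acceptable internal fragmentation? (same test as above) */
		if (left_over * 8 <= slab) {
			/* Prints "order 1: 11 objs, 492 bytes left over". */
			printf("order %d: %lu objs, %lu bytes left over\n",
			       order, num, left_over);
			break;
		}
	}
	return 0;
}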
2039 2039
2040 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2040 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2041 { 2041 {
2042 if (slab_state >= FULL) 2042 if (slab_state >= FULL)
2043 return enable_cpucache(cachep, gfp); 2043 return enable_cpucache(cachep, gfp);
2044 2044
2045 if (slab_state == DOWN) { 2045 if (slab_state == DOWN) {
2046 /* 2046 /*
2047 * Note: Creation of first cache (kmem_cache). 2047 * Note: Creation of first cache (kmem_cache).
2048 * The setup_node is taken care 2048 * The setup_node is taken care
2049 * of by the caller of __kmem_cache_create 2049 * of by the caller of __kmem_cache_create
2050 */ 2050 */
2051 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2051 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2052 slab_state = PARTIAL; 2052 slab_state = PARTIAL;
2053 } else if (slab_state == PARTIAL) { 2053 } else if (slab_state == PARTIAL) {
2054 /* 2054 /*
2055 * Note: the second kmem_cache_create must create the cache 2055 * Note: the second kmem_cache_create must create the cache
2056 * that's used by kmalloc(24), otherwise the creation of 2056 * that's used by kmalloc(24), otherwise the creation of
2057 * further caches will BUG(). 2057 * further caches will BUG().
2058 */ 2058 */
2059 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2059 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2060 2060
2061 /* 2061 /*
2062 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is 2062 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
2063 * the second cache, then we need to set up all its nodes, 2063 * the second cache, then we need to set up all its nodes,
2064 * otherwise the creation of further caches will BUG(). 2064 * otherwise the creation of further caches will BUG().
2065 */ 2065 */
2066 set_up_node(cachep, SIZE_AC); 2066 set_up_node(cachep, SIZE_AC);
2067 if (INDEX_AC == INDEX_NODE) 2067 if (INDEX_AC == INDEX_NODE)
2068 slab_state = PARTIAL_NODE; 2068 slab_state = PARTIAL_NODE;
2069 else 2069 else
2070 slab_state = PARTIAL_ARRAYCACHE; 2070 slab_state = PARTIAL_ARRAYCACHE;
2071 } else { 2071 } else {
2072 /* Remaining boot caches */ 2072 /* Remaining boot caches */
2073 cachep->array[smp_processor_id()] = 2073 cachep->array[smp_processor_id()] =
2074 kmalloc(sizeof(struct arraycache_init), gfp); 2074 kmalloc(sizeof(struct arraycache_init), gfp);
2075 2075
2076 if (slab_state == PARTIAL_ARRAYCACHE) { 2076 if (slab_state == PARTIAL_ARRAYCACHE) {
2077 set_up_node(cachep, SIZE_NODE); 2077 set_up_node(cachep, SIZE_NODE);
2078 slab_state = PARTIAL_NODE; 2078 slab_state = PARTIAL_NODE;
2079 } else { 2079 } else {
2080 int node; 2080 int node;
2081 for_each_online_node(node) { 2081 for_each_online_node(node) {
2082 cachep->node[node] = 2082 cachep->node[node] =
2083 kmalloc_node(sizeof(struct kmem_cache_node), 2083 kmalloc_node(sizeof(struct kmem_cache_node),
2084 gfp, node); 2084 gfp, node);
2085 BUG_ON(!cachep->node[node]); 2085 BUG_ON(!cachep->node[node]);
2086 kmem_cache_node_init(cachep->node[node]); 2086 kmem_cache_node_init(cachep->node[node]);
2087 } 2087 }
2088 } 2088 }
2089 } 2089 }
2090 cachep->node[numa_mem_id()]->next_reap = 2090 cachep->node[numa_mem_id()]->next_reap =
2091 jiffies + REAPTIMEOUT_NODE + 2091 jiffies + REAPTIMEOUT_NODE +
2092 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 2092 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
2093 2093
2094 cpu_cache_get(cachep)->avail = 0; 2094 cpu_cache_get(cachep)->avail = 0;
2095 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2095 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2096 cpu_cache_get(cachep)->batchcount = 1; 2096 cpu_cache_get(cachep)->batchcount = 1;
2097 cpu_cache_get(cachep)->touched = 0; 2097 cpu_cache_get(cachep)->touched = 0;
2098 cachep->batchcount = 1; 2098 cachep->batchcount = 1;
2099 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2099 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2100 return 0; 2100 return 0;
2101 } 2101 }
2102 2102
2103 /** 2103 /**
2104 * __kmem_cache_create - Create a cache. 2104 * __kmem_cache_create - Create a cache.
2105 * @cachep: cache management descriptor 2105 * @cachep: cache management descriptor
2106 * @flags: SLAB flags 2106 * @flags: SLAB flags
2107 * 2107 *
2108 * Returns zero on success, or an error code on failure. 2108 * Returns zero on success, or an error code on failure.
2109 * Cannot be called within an interrupt, but can be interrupted. 2109 * Cannot be called within an interrupt, but can be interrupted.
2110 * The @ctor is run when new pages are allocated by the cache. 2110 * The @ctor is run when new pages are allocated by the cache.
2111 * 2111 *
2112 * The flags are 2112 * The flags are
2113 * 2113 *
2114 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2114 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2115 * to catch references to uninitialised memory. 2115 * to catch references to uninitialised memory.
2116 * 2116 *
2117 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2117 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2118 * for buffer overruns. 2118 * for buffer overruns.
2119 * 2119 *
2120 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2120 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2121 * cacheline. This can be beneficial if you're counting cycles as closely 2121 * cacheline. This can be beneficial if you're counting cycles as closely
2122 * as davem. 2122 * as davem.
2123 */ 2123 */
2124 int 2124 int
2125 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2125 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2126 { 2126 {
2127 size_t left_over, freelist_size, ralign; 2127 size_t left_over, freelist_size;
2128 size_t ralign = BYTES_PER_WORD;
2128 gfp_t gfp; 2129 gfp_t gfp;
2129 int err; 2130 int err;
2130 size_t size = cachep->size; 2131 size_t size = cachep->size;
2131 2132
2132 #if DEBUG 2133 #if DEBUG
2133 #if FORCED_DEBUG 2134 #if FORCED_DEBUG
2134 /* 2135 /*
2135 * Enable redzoning and last user accounting, except for caches with 2136 * Enable redzoning and last user accounting, except for caches with
2136 * large objects, if the increased size would increase the object size 2137 * large objects, if the increased size would increase the object size
2137 * above the next power of two: caches with object sizes just above a 2138 * above the next power of two: caches with object sizes just above a
2138 * power of two have a significant amount of internal fragmentation. 2139 * power of two have a significant amount of internal fragmentation.
2139 */ 2140 */
2140 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2141 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2141 2 * sizeof(unsigned long long))) 2142 2 * sizeof(unsigned long long)))
2142 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2143 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2143 if (!(flags & SLAB_DESTROY_BY_RCU)) 2144 if (!(flags & SLAB_DESTROY_BY_RCU))
2144 flags |= SLAB_POISON; 2145 flags |= SLAB_POISON;
2145 #endif 2146 #endif
2146 if (flags & SLAB_DESTROY_BY_RCU) 2147 if (flags & SLAB_DESTROY_BY_RCU)
2147 BUG_ON(flags & SLAB_POISON); 2148 BUG_ON(flags & SLAB_POISON);
2148 #endif 2149 #endif
2149 2150
2150 /* 2151 /*
2151 * Check that size is in terms of words. This is needed to avoid 2152 * Check that size is in terms of words. This is needed to avoid
2152 * unaligned accesses for some archs when redzoning is used, and makes 2153 * unaligned accesses for some archs when redzoning is used, and makes
2153 * sure any on-slab bufctl's are also correctly aligned. 2154 * sure any on-slab bufctl's are also correctly aligned.
2154 */ 2155 */
2155 if (size & (BYTES_PER_WORD - 1)) { 2156 if (size & (BYTES_PER_WORD - 1)) {
2156 size += (BYTES_PER_WORD - 1); 2157 size += (BYTES_PER_WORD - 1);
2157 size &= ~(BYTES_PER_WORD - 1); 2158 size &= ~(BYTES_PER_WORD - 1);
2158 } 2159 }
2159
2160 /*
2161 * Redzoning and user store require word alignment or possibly larger.
2162 * Note this will be overridden by architecture or caller mandated
2163 * alignment if either is greater than BYTES_PER_WORD.
2164 */
2165 if (flags & SLAB_STORE_USER)
2166 ralign = BYTES_PER_WORD;
2167 2160
2168 if (flags & SLAB_RED_ZONE) { 2161 if (flags & SLAB_RED_ZONE) {
2169 ralign = REDZONE_ALIGN; 2162 ralign = REDZONE_ALIGN;
2170 /* If redzoning, ensure that the second redzone is suitably 2163 /* If redzoning, ensure that the second redzone is suitably
2171 * aligned, by adjusting the object size accordingly. */ 2164 * aligned, by adjusting the object size accordingly. */
2172 size += REDZONE_ALIGN - 1; 2165 size += REDZONE_ALIGN - 1;
2173 size &= ~(REDZONE_ALIGN - 1); 2166 size &= ~(REDZONE_ALIGN - 1);
2174 } 2167 }
2175 2168
2176 /* 3) caller mandated alignment */ 2169 /* 3) caller mandated alignment */
2177 if (ralign < cachep->align) { 2170 if (ralign < cachep->align) {
2178 ralign = cachep->align; 2171 ralign = cachep->align;
2179 } 2172 }
2180 /* disable debug if necessary */ 2173 /* disable debug if necessary */
2181 if (ralign > __alignof__(unsigned long long)) 2174 if (ralign > __alignof__(unsigned long long))
2182 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2175 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2183 /* 2176 /*
2184 * 4) Store it. 2177 * 4) Store it.
2185 */ 2178 */
2186 cachep->align = ralign; 2179 cachep->align = ralign;
2187 2180
2188 if (slab_is_available()) 2181 if (slab_is_available())
2189 gfp = GFP_KERNEL; 2182 gfp = GFP_KERNEL;
2190 else 2183 else
2191 gfp = GFP_NOWAIT; 2184 gfp = GFP_NOWAIT;
2192 2185
2193 setup_node_pointer(cachep); 2186 setup_node_pointer(cachep);
2194 #if DEBUG 2187 #if DEBUG
2195 2188
2196 /* 2189 /*
2197 * Both debugging options require word-alignment which is calculated 2190 * Both debugging options require word-alignment which is calculated
2198 * into align above. 2191 * into align above.
2199 */ 2192 */
2200 if (flags & SLAB_RED_ZONE) { 2193 if (flags & SLAB_RED_ZONE) {
2201 /* add space for red zone words */ 2194 /* add space for red zone words */
2202 cachep->obj_offset += sizeof(unsigned long long); 2195 cachep->obj_offset += sizeof(unsigned long long);
2203 size += 2 * sizeof(unsigned long long); 2196 size += 2 * sizeof(unsigned long long);
2204 } 2197 }
2205 if (flags & SLAB_STORE_USER) { 2198 if (flags & SLAB_STORE_USER) {
2206 /* user store requires one word storage behind the end of 2199 /* user store requires one word storage behind the end of
2207 * the real object. But if the second red zone needs to be 2200 * the real object. But if the second red zone needs to be
2208 * aligned to 64 bits, we must allow that much space. 2201 * aligned to 64 bits, we must allow that much space.
2209 */ 2202 */
2210 if (flags & SLAB_RED_ZONE) 2203 if (flags & SLAB_RED_ZONE)
2211 size += REDZONE_ALIGN; 2204 size += REDZONE_ALIGN;
2212 else 2205 else
2213 size += BYTES_PER_WORD; 2206 size += BYTES_PER_WORD;
2214 } 2207 }
2215 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2208 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2216 if (size >= kmalloc_size(INDEX_NODE + 1) 2209 if (size >= kmalloc_size(INDEX_NODE + 1)
2217 && cachep->object_size > cache_line_size() 2210 && cachep->object_size > cache_line_size()
2218 && ALIGN(size, cachep->align) < PAGE_SIZE) { 2211 && ALIGN(size, cachep->align) < PAGE_SIZE) {
2219 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); 2212 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2220 size = PAGE_SIZE; 2213 size = PAGE_SIZE;
2221 } 2214 }
2222 #endif 2215 #endif
2223 #endif 2216 #endif
2224 2217
2225 /* 2218 /*
2226 * Determine if the slab management is 'on' or 'off' slab. 2219 * Determine if the slab management is 'on' or 'off' slab.
2227 * (bootstrapping cannot cope with offslab caches so don't do 2220 * (bootstrapping cannot cope with offslab caches so don't do
2228 * it too early on. Always use on-slab management when 2221 * it too early on. Always use on-slab management when
2229 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) 2222 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2230 */ 2223 */
2231 if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && 2224 if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
2232 !(flags & SLAB_NOLEAKTRACE)) 2225 !(flags & SLAB_NOLEAKTRACE))
2233 /* 2226 /*
2234 * Size is large, assume best to place the slab management obj 2227 * Size is large, assume best to place the slab management obj
2235 * off-slab (should allow better packing of objs). 2228 * off-slab (should allow better packing of objs).
2236 */ 2229 */
2237 flags |= CFLGS_OFF_SLAB; 2230 flags |= CFLGS_OFF_SLAB;
2238 2231
2239 size = ALIGN(size, cachep->align); 2232 size = ALIGN(size, cachep->align);
2240 /* 2233 /*
2241 * We should restrict the number of objects in a slab to implement 2234 * We should restrict the number of objects in a slab to implement
2242 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. 2235 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
2243 */ 2236 */
2244 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 2237 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
2245 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); 2238 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
2246 2239
2247 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2240 left_over = calculate_slab_order(cachep, size, cachep->align, flags);
2248 2241
2249 if (!cachep->num) 2242 if (!cachep->num)
2250 return -E2BIG; 2243 return -E2BIG;
2251 2244
2252 freelist_size = calculate_freelist_size(cachep->num, cachep->align); 2245 freelist_size = calculate_freelist_size(cachep->num, cachep->align);
2253 2246
2254 /* 2247 /*
2255 * If the slab has been placed off-slab, and we have enough space then 2248 * If the slab has been placed off-slab, and we have enough space then
2256 * move it on-slab. This is at the expense of any extra colouring. 2249 * move it on-slab. This is at the expense of any extra colouring.
2257 */ 2250 */
2258 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { 2251 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
2259 flags &= ~CFLGS_OFF_SLAB; 2252 flags &= ~CFLGS_OFF_SLAB;
2260 left_over -= freelist_size; 2253 left_over -= freelist_size;
2261 } 2254 }
2262 2255
2263 if (flags & CFLGS_OFF_SLAB) { 2256 if (flags & CFLGS_OFF_SLAB) {
2264 /* really off slab. No need for manual alignment */ 2257 /* really off slab. No need for manual alignment */
2265 freelist_size = calculate_freelist_size(cachep->num, 0); 2258 freelist_size = calculate_freelist_size(cachep->num, 0);
2266 2259
2267 #ifdef CONFIG_PAGE_POISONING 2260 #ifdef CONFIG_PAGE_POISONING
2268 /* If we're going to use the generic kernel_map_pages() 2261 /* If we're going to use the generic kernel_map_pages()
2269 * poisoning, then it's going to smash the contents of 2262 * poisoning, then it's going to smash the contents of
2270 * the redzone and userword anyhow, so switch them off. 2263 * the redzone and userword anyhow, so switch them off.
2271 */ 2264 */
2272 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) 2265 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2273 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2266 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2274 #endif 2267 #endif
2275 } 2268 }
2276 2269
2277 cachep->colour_off = cache_line_size(); 2270 cachep->colour_off = cache_line_size();
2278 /* Offset must be a multiple of the alignment. */ 2271 /* Offset must be a multiple of the alignment. */
2279 if (cachep->colour_off < cachep->align) 2272 if (cachep->colour_off < cachep->align)
2280 cachep->colour_off = cachep->align; 2273 cachep->colour_off = cachep->align;
2281 cachep->colour = left_over / cachep->colour_off; 2274 cachep->colour = left_over / cachep->colour_off;
2282 cachep->freelist_size = freelist_size; 2275 cachep->freelist_size = freelist_size;
2283 cachep->flags = flags; 2276 cachep->flags = flags;
2284 cachep->allocflags = __GFP_COMP; 2277 cachep->allocflags = __GFP_COMP;
2285 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2278 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2286 cachep->allocflags |= GFP_DMA; 2279 cachep->allocflags |= GFP_DMA;
2287 cachep->size = size; 2280 cachep->size = size;
2288 cachep->reciprocal_buffer_size = reciprocal_value(size); 2281 cachep->reciprocal_buffer_size = reciprocal_value(size);
2289 2282
2290 if (flags & CFLGS_OFF_SLAB) { 2283 if (flags & CFLGS_OFF_SLAB) {
2291 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); 2284 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
2292 /* 2285 /*
2293 * This is a possibility for one of the kmalloc_{dma,}_caches. 2286 * This is a possibility for one of the kmalloc_{dma,}_caches.
2294 * But since we go off slab only for object size greater than 2287 * But since we go off slab only for object size greater than
2295 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created 2288 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
2296 * in ascending order, this should not happen at all. 2289 * in ascending order, this should not happen at all.
2297 * But leave a BUG_ON for some lucky dude. 2290 * But leave a BUG_ON for some lucky dude.
2298 */ 2291 */
2299 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); 2292 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
2300 } 2293 }
2301 2294
2302 err = setup_cpu_cache(cachep, gfp); 2295 err = setup_cpu_cache(cachep, gfp);
2303 if (err) { 2296 if (err) {
2304 __kmem_cache_shutdown(cachep); 2297 __kmem_cache_shutdown(cachep);
2305 return err; 2298 return err;
2306 } 2299 }
2307 2300
2308 return 0; 2301 return 0;
2309 } 2302 }
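For illustration, a small user-space sketch of the size and alignment rounding performed in __kmem_cache_create() above; BYTES_PER_WORD and REDZONE_ALIGN are approximated here, and the requested size and caller alignment are assumed values:

#include <stdio.h>
#include <stddef.h>

#define BYTES_PER_WORD	sizeof(void *)			/* 8 on 64-bit */
#define REDZONE_ALIGN	sizeof(unsigned long long)	/* assumed >= a word */

/* Round v up to the next multiple of a (a must be a power of two). */
static size_t round_up_pow2(size_t v, size_t a)
{
	return (v + a - 1) & ~(a - 1);
}

int main(void)
{
	size_t size = 50;		/* requested object size */
	size_t caller_align = 32;	/* caller-mandated alignment */
	int red_zone = 1;		/* SLAB_RED_ZONE requested? */
	size_t ralign = BYTES_PER_WORD;

	size = round_up_pow2(size, BYTES_PER_WORD);
	if (red_zone) {
		ralign = REDZONE_ALIGN;
		size = round_up_pow2(size, REDZONE_ALIGN);
	}
	if (ralign < caller_align)	/* caller-mandated alignment wins */
		ralign = caller_align;

	printf("object size %zu, alignment %zu\n", size, ralign);
	return 0;
}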
2310 2303
2311 #if DEBUG 2304 #if DEBUG
2312 static void check_irq_off(void) 2305 static void check_irq_off(void)
2313 { 2306 {
2314 BUG_ON(!irqs_disabled()); 2307 BUG_ON(!irqs_disabled());
2315 } 2308 }
2316 2309
2317 static void check_irq_on(void) 2310 static void check_irq_on(void)
2318 { 2311 {
2319 BUG_ON(irqs_disabled()); 2312 BUG_ON(irqs_disabled());
2320 } 2313 }
2321 2314
2322 static void check_spinlock_acquired(struct kmem_cache *cachep) 2315 static void check_spinlock_acquired(struct kmem_cache *cachep)
2323 { 2316 {
2324 #ifdef CONFIG_SMP 2317 #ifdef CONFIG_SMP
2325 check_irq_off(); 2318 check_irq_off();
2326 assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); 2319 assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
2327 #endif 2320 #endif
2328 } 2321 }
2329 2322
2330 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2323 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2331 { 2324 {
2332 #ifdef CONFIG_SMP 2325 #ifdef CONFIG_SMP
2333 check_irq_off(); 2326 check_irq_off();
2334 assert_spin_locked(&get_node(cachep, node)->list_lock); 2327 assert_spin_locked(&get_node(cachep, node)->list_lock);
2335 #endif 2328 #endif
2336 } 2329 }
2337 2330
2338 #else 2331 #else
2339 #define check_irq_off() do { } while(0) 2332 #define check_irq_off() do { } while(0)
2340 #define check_irq_on() do { } while(0) 2333 #define check_irq_on() do { } while(0)
2341 #define check_spinlock_acquired(x) do { } while(0) 2334 #define check_spinlock_acquired(x) do { } while(0)
2342 #define check_spinlock_acquired_node(x, y) do { } while(0) 2335 #define check_spinlock_acquired_node(x, y) do { } while(0)
2343 #endif 2336 #endif
2344 2337
2345 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 2338 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
2346 struct array_cache *ac, 2339 struct array_cache *ac,
2347 int force, int node); 2340 int force, int node);
2348 2341
2349 static void do_drain(void *arg) 2342 static void do_drain(void *arg)
2350 { 2343 {
2351 struct kmem_cache *cachep = arg; 2344 struct kmem_cache *cachep = arg;
2352 struct array_cache *ac; 2345 struct array_cache *ac;
2353 int node = numa_mem_id(); 2346 int node = numa_mem_id();
2354 struct kmem_cache_node *n; 2347 struct kmem_cache_node *n;
2355 LIST_HEAD(list); 2348 LIST_HEAD(list);
2356 2349
2357 check_irq_off(); 2350 check_irq_off();
2358 ac = cpu_cache_get(cachep); 2351 ac = cpu_cache_get(cachep);
2359 n = get_node(cachep, node); 2352 n = get_node(cachep, node);
2360 spin_lock(&n->list_lock); 2353 spin_lock(&n->list_lock);
2361 free_block(cachep, ac->entry, ac->avail, node, &list); 2354 free_block(cachep, ac->entry, ac->avail, node, &list);
2362 spin_unlock(&n->list_lock); 2355 spin_unlock(&n->list_lock);
2363 slabs_destroy(cachep, &list); 2356 slabs_destroy(cachep, &list);
2364 ac->avail = 0; 2357 ac->avail = 0;
2365 } 2358 }
2366 2359
2367 static void drain_cpu_caches(struct kmem_cache *cachep) 2360 static void drain_cpu_caches(struct kmem_cache *cachep)
2368 { 2361 {
2369 struct kmem_cache_node *n; 2362 struct kmem_cache_node *n;
2370 int node; 2363 int node;
2371 2364
2372 on_each_cpu(do_drain, cachep, 1); 2365 on_each_cpu(do_drain, cachep, 1);
2373 check_irq_on(); 2366 check_irq_on();
2374 for_each_kmem_cache_node(cachep, node, n) 2367 for_each_kmem_cache_node(cachep, node, n)
2375 if (n->alien) 2368 if (n->alien)
2376 drain_alien_cache(cachep, n->alien); 2369 drain_alien_cache(cachep, n->alien);
2377 2370
2378 for_each_kmem_cache_node(cachep, node, n) 2371 for_each_kmem_cache_node(cachep, node, n)
2379 drain_array(cachep, n, n->shared, 1, node); 2372 drain_array(cachep, n, n->shared, 1, node);
2380 } 2373 }
2381 2374
2382 /* 2375 /*
2383 * Remove slabs from the list of free slabs. 2376 * Remove slabs from the list of free slabs.
2384 * Specify the number of slabs to drain in tofree. 2377 * Specify the number of slabs to drain in tofree.
2385 * 2378 *
2386 * Returns the actual number of slabs released. 2379 * Returns the actual number of slabs released.
2387 */ 2380 */
2388 static int drain_freelist(struct kmem_cache *cache, 2381 static int drain_freelist(struct kmem_cache *cache,
2389 struct kmem_cache_node *n, int tofree) 2382 struct kmem_cache_node *n, int tofree)
2390 { 2383 {
2391 struct list_head *p; 2384 struct list_head *p;
2392 int nr_freed; 2385 int nr_freed;
2393 struct page *page; 2386 struct page *page;
2394 2387
2395 nr_freed = 0; 2388 nr_freed = 0;
2396 while (nr_freed < tofree && !list_empty(&n->slabs_free)) { 2389 while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
2397 2390
2398 spin_lock_irq(&n->list_lock); 2391 spin_lock_irq(&n->list_lock);
2399 p = n->slabs_free.prev; 2392 p = n->slabs_free.prev;
2400 if (p == &n->slabs_free) { 2393 if (p == &n->slabs_free) {
2401 spin_unlock_irq(&n->list_lock); 2394 spin_unlock_irq(&n->list_lock);
2402 goto out; 2395 goto out;
2403 } 2396 }
2404 2397
2405 page = list_entry(p, struct page, lru); 2398 page = list_entry(p, struct page, lru);
2406 #if DEBUG 2399 #if DEBUG
2407 BUG_ON(page->active); 2400 BUG_ON(page->active);
2408 #endif 2401 #endif
2409 list_del(&page->lru); 2402 list_del(&page->lru);
2410 /* 2403 /*
2411 * Safe to drop the lock. The slab is no longer linked 2404 * Safe to drop the lock. The slab is no longer linked
2412 * to the cache. 2405 * to the cache.
2413 */ 2406 */
2414 n->free_objects -= cache->num; 2407 n->free_objects -= cache->num;
2415 spin_unlock_irq(&n->list_lock); 2408 spin_unlock_irq(&n->list_lock);
2416 slab_destroy(cache, page); 2409 slab_destroy(cache, page);
2417 nr_freed++; 2410 nr_freed++;
2418 } 2411 }
2419 out: 2412 out:
2420 return nr_freed; 2413 return nr_freed;
2421 } 2414 }
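For illustration, a toy user-space model of the drain loop in drain_freelist() above; the list handling and slab type are simplified stand-ins, and the real code does the unlinking under n->list_lock:

#include <stdio.h>
#include <stdlib.h>

struct slab {				/* stand-in for a free slab page */
	struct slab *next;
};

/* Pop and "destroy" up to tofree slabs; return how many were freed. */
static int drain_freelist(struct slab **free_list, int tofree)
{
	int nr_freed = 0;

	while (nr_freed < tofree && *free_list) {
		struct slab *s = *free_list;

		*free_list = s->next;	/* unlink before destroying */
		free(s);		/* slab_destroy() in the real code */
		nr_freed++;
	}
	return nr_freed;
}

int main(void)
{
	struct slab *list = NULL;

	for (int i = 0; i < 5; i++) {
		struct slab *s = malloc(sizeof(*s));
		s->next = list;
		list = s;
	}
	printf("freed %d slabs\n", drain_freelist(&list, 3));	/* prints 3 */
	return 0;
}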
2422 2415
2423 int __kmem_cache_shrink(struct kmem_cache *cachep) 2416 int __kmem_cache_shrink(struct kmem_cache *cachep)
2424 { 2417 {
2425 int ret = 0; 2418 int ret = 0;
2426 int node; 2419 int node;
2427 struct kmem_cache_node *n; 2420 struct kmem_cache_node *n;
2428 2421
2429 drain_cpu_caches(cachep); 2422 drain_cpu_caches(cachep);
2430 2423
2431 check_irq_on(); 2424 check_irq_on();
2432 for_each_kmem_cache_node(cachep, node, n) { 2425 for_each_kmem_cache_node(cachep, node, n) {
2433 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 2426 drain_freelist(cachep, n, slabs_tofree(cachep, n));
2434 2427
2435 ret += !list_empty(&n->slabs_full) || 2428 ret += !list_empty(&n->slabs_full) ||
2436 !list_empty(&n->slabs_partial); 2429 !list_empty(&n->slabs_partial);
2437 } 2430 }
2438 return (ret ? 1 : 0); 2431 return (ret ? 1 : 0);
2439 } 2432 }
2440 2433
2441 int __kmem_cache_shutdown(struct kmem_cache *cachep) 2434 int __kmem_cache_shutdown(struct kmem_cache *cachep)
2442 { 2435 {
2443 int i; 2436 int i;
2444 struct kmem_cache_node *n; 2437 struct kmem_cache_node *n;
2445 int rc = __kmem_cache_shrink(cachep); 2438 int rc = __kmem_cache_shrink(cachep);
2446 2439
2447 if (rc) 2440 if (rc)
2448 return rc; 2441 return rc;
2449 2442
2450 for_each_online_cpu(i) 2443 for_each_online_cpu(i)
2451 kfree(cachep->array[i]); 2444 kfree(cachep->array[i]);
2452 2445
2453 /* NUMA: free the node structures */ 2446 /* NUMA: free the node structures */
2454 for_each_kmem_cache_node(cachep, i, n) { 2447 for_each_kmem_cache_node(cachep, i, n) {
2455 kfree(n->shared); 2448 kfree(n->shared);
2456 free_alien_cache(n->alien); 2449 free_alien_cache(n->alien);
2457 kfree(n); 2450 kfree(n);
2458 cachep->node[i] = NULL; 2451 cachep->node[i] = NULL;
2459 } 2452 }
2460 return 0; 2453 return 0;
2461 } 2454 }
2462 2455
2463 /* 2456 /*
2464 * Get the memory for a slab management obj. 2457 * Get the memory for a slab management obj.
2465 * 2458 *
2466 * For a slab cache when the slab descriptor is off-slab, the 2459 * For a slab cache when the slab descriptor is off-slab, the
2467 * slab descriptor can't come from the same cache which is being created, 2460 * slab descriptor can't come from the same cache which is being created,
2468 * because if that were the case, it would mean we defer the creation of 2461 * because if that were the case, it would mean we defer the creation of
2469 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. 2462 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
2470 * And we eventually call down to __kmem_cache_create(), which 2463 * And we eventually call down to __kmem_cache_create(), which
2471 * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. 2464 * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
2472 * This is a "chicken-and-egg" problem. 2465 * This is a "chicken-and-egg" problem.
2473 * 2466 *
2474 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, 2467 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
2475 * which are all initialized during kmem_cache_init(). 2468 * which are all initialized during kmem_cache_init().
2476 */ 2469 */
2477 static void *alloc_slabmgmt(struct kmem_cache *cachep, 2470 static void *alloc_slabmgmt(struct kmem_cache *cachep,
2478 struct page *page, int colour_off, 2471 struct page *page, int colour_off,
2479 gfp_t local_flags, int nodeid) 2472 gfp_t local_flags, int nodeid)
2480 { 2473 {
2481 void *freelist; 2474 void *freelist;
2482 void *addr = page_address(page); 2475 void *addr = page_address(page);
2483 2476
2484 if (OFF_SLAB(cachep)) { 2477 if (OFF_SLAB(cachep)) {
2485 /* Slab management obj is off-slab. */ 2478 /* Slab management obj is off-slab. */
2486 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2479 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2487 local_flags, nodeid); 2480 local_flags, nodeid);
2488 if (!freelist) 2481 if (!freelist)
2489 return NULL; 2482 return NULL;
2490 } else { 2483 } else {
2491 freelist = addr + colour_off; 2484 freelist = addr + colour_off;
2492 colour_off += cachep->freelist_size; 2485 colour_off += cachep->freelist_size;
2493 } 2486 }
2494 page->active = 0; 2487 page->active = 0;
2495 page->s_mem = addr + colour_off; 2488 page->s_mem = addr + colour_off;
2496 return freelist; 2489 return freelist;
2497 } 2490 }
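For illustration, a user-space sketch of the layout computed by alloc_slabmgmt() above, with assumed sizes: an on-slab cache keeps the freelist at the colour offset and starts its objects right behind it, while an off-slab cache allocates the freelist elsewhere and starts objects at the colour offset:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

int main(void)
{
	char *addr = malloc(4096);	/* pretend page_address(page) */
	size_t colour_off = 64;		/* colouring offset for this slab */
	size_t freelist_size = 32;	/* bytes of freelist indices */
	bool off_slab = false;
	char *freelist, *s_mem;

	if (off_slab) {
		freelist = malloc(freelist_size);	/* separate allocation */
		s_mem = addr + colour_off;
	} else {
		freelist = addr + colour_off;		/* on-slab management */
		s_mem = addr + colour_off + freelist_size;
	}

	printf("objects start at offset %td\n", s_mem - addr);
	if (!off_slab)
		printf("freelist at offset %td\n", freelist - addr);
	else
		free(freelist);
	free(addr);
	return 0;
}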
2498 2491
2499 static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) 2492 static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
2500 { 2493 {
2501 return ((freelist_idx_t *)page->freelist)[idx]; 2494 return ((freelist_idx_t *)page->freelist)[idx];
2502 } 2495 }
2503 2496
2504 static inline void set_free_obj(struct page *page, 2497 static inline void set_free_obj(struct page *page,
2505 unsigned int idx, freelist_idx_t val) 2498 unsigned int idx, freelist_idx_t val)
2506 { 2499 {
2507 ((freelist_idx_t *)(page->freelist))[idx] = val; 2500 ((freelist_idx_t *)(page->freelist))[idx] = val;
2508 } 2501 }
2509 2502
2510 static void cache_init_objs(struct kmem_cache *cachep, 2503 static void cache_init_objs(struct kmem_cache *cachep,
2511 struct page *page) 2504 struct page *page)
2512 { 2505 {
2513 int i; 2506 int i;
2514 2507
2515 for (i = 0; i < cachep->num; i++) { 2508 for (i = 0; i < cachep->num; i++) {
2516 void *objp = index_to_obj(cachep, page, i); 2509 void *objp = index_to_obj(cachep, page, i);
2517 #if DEBUG 2510 #if DEBUG
2518 /* need to poison the objs? */ 2511 /* need to poison the objs? */
2519 if (cachep->flags & SLAB_POISON) 2512 if (cachep->flags & SLAB_POISON)
2520 poison_obj(cachep, objp, POISON_FREE); 2513 poison_obj(cachep, objp, POISON_FREE);
2521 if (cachep->flags & SLAB_STORE_USER) 2514 if (cachep->flags & SLAB_STORE_USER)
2522 *dbg_userword(cachep, objp) = NULL; 2515 *dbg_userword(cachep, objp) = NULL;
2523 2516
2524 if (cachep->flags & SLAB_RED_ZONE) { 2517 if (cachep->flags & SLAB_RED_ZONE) {
2525 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2518 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2526 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2519 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2527 } 2520 }
2528 /* 2521 /*
2529 * Constructors are not allowed to allocate memory from the same 2522 * Constructors are not allowed to allocate memory from the same
2530 * cache which they are a constructor for. Otherwise, deadlock. 2523 * cache which they are a constructor for. Otherwise, deadlock.
2531 * They must also be threaded. 2524 * They must also be threaded.
2532 */ 2525 */
2533 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2526 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2534 cachep->ctor(objp + obj_offset(cachep)); 2527 cachep->ctor(objp + obj_offset(cachep));
2535 2528
2536 if (cachep->flags & SLAB_RED_ZONE) { 2529 if (cachep->flags & SLAB_RED_ZONE) {
2537 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2530 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2538 slab_error(cachep, "constructor overwrote the" 2531 slab_error(cachep, "constructor overwrote the"
2539 " end of an object"); 2532 " end of an object");
2540 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2533 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2541 slab_error(cachep, "constructor overwrote the" 2534 slab_error(cachep, "constructor overwrote the"
2542 " start of an object"); 2535 " start of an object");
2543 } 2536 }
2544 if ((cachep->size % PAGE_SIZE) == 0 && 2537 if ((cachep->size % PAGE_SIZE) == 0 &&
2545 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2538 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2546 kernel_map_pages(virt_to_page(objp), 2539 kernel_map_pages(virt_to_page(objp),
2547 cachep->size / PAGE_SIZE, 0); 2540 cachep->size / PAGE_SIZE, 0);
2548 #else 2541 #else
2549 if (cachep->ctor) 2542 if (cachep->ctor)
2550 cachep->ctor(objp); 2543 cachep->ctor(objp);
2551 #endif 2544 #endif
2552 set_obj_status(page, i, OBJECT_FREE); 2545 set_obj_status(page, i, OBJECT_FREE);
2553 set_free_obj(page, i, i); 2546 set_free_obj(page, i, i);
2554 } 2547 }
2555 } 2548 }
2556 2549
2557 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2550 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2558 { 2551 {
2559 if (CONFIG_ZONE_DMA_FLAG) { 2552 if (CONFIG_ZONE_DMA_FLAG) {
2560 if (flags & GFP_DMA) 2553 if (flags & GFP_DMA)
2561 BUG_ON(!(cachep->allocflags & GFP_DMA)); 2554 BUG_ON(!(cachep->allocflags & GFP_DMA));
2562 else 2555 else
2563 BUG_ON(cachep->allocflags & GFP_DMA); 2556 BUG_ON(cachep->allocflags & GFP_DMA);
2564 } 2557 }
2565 } 2558 }
2566 2559
2567 static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, 2560 static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
2568 int nodeid) 2561 int nodeid)
2569 { 2562 {
2570 void *objp; 2563 void *objp;
2571 2564
2572 objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); 2565 objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
2573 page->active++; 2566 page->active++;
2574 #if DEBUG 2567 #if DEBUG
2575 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2568 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2576 #endif 2569 #endif
2577 2570
2578 return objp; 2571 return objp;
2579 } 2572 }
2580 2573
2581 static void slab_put_obj(struct kmem_cache *cachep, struct page *page, 2574 static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
2582 void *objp, int nodeid) 2575 void *objp, int nodeid)
2583 { 2576 {
2584 unsigned int objnr = obj_to_index(cachep, page, objp); 2577 unsigned int objnr = obj_to_index(cachep, page, objp);
2585 #if DEBUG 2578 #if DEBUG
2586 unsigned int i; 2579 unsigned int i;
2587 2580
2588 /* Verify that the slab belongs to the intended node */ 2581 /* Verify that the slab belongs to the intended node */
2589 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2582 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2590 2583
2591 /* Verify double free bug */ 2584 /* Verify double free bug */
2592 for (i = page->active; i < cachep->num; i++) { 2585 for (i = page->active; i < cachep->num; i++) {
2593 if (get_free_obj(page, i) == objnr) { 2586 if (get_free_obj(page, i) == objnr) {
2594 printk(KERN_ERR "slab: double free detected in cache " 2587 printk(KERN_ERR "slab: double free detected in cache "
2595 "'%s', objp %p\n", cachep->name, objp); 2588 "'%s', objp %p\n", cachep->name, objp);
2596 BUG(); 2589 BUG();
2597 } 2590 }
2598 } 2591 }
2599 #endif 2592 #endif
2600 page->active--; 2593 page->active--;
2601 set_free_obj(page, page->active, objnr); 2594 set_free_obj(page, page->active, objnr);
2602 } 2595 }
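For illustration, a user-space toy model of the freelist bookkeeping used by cache_init_objs(), slab_get_obj() and slab_put_obj() above; the page fields and index type are simplified stand-ins. page->freelist behaves as an array of object indices and page->active as a stack pointer into it:

#include <stdio.h>

#define NUM_OBJS 4

struct toy_page {
	unsigned char freelist[NUM_OBJS];	/* freelist_idx_t array */
	unsigned int active;			/* objects currently allocated */
};

static void init_objs(struct toy_page *p)
{
	for (unsigned int i = 0; i < NUM_OBJS; i++)
		p->freelist[i] = i;		/* set_free_obj(page, i, i) */
	p->active = 0;
}

static unsigned int get_obj(struct toy_page *p)
{
	return p->freelist[p->active++];	/* slab_get_obj() */
}

static void put_obj(struct toy_page *p, unsigned int objnr)
{
	p->freelist[--p->active] = objnr;	/* slab_put_obj() */
}

int main(void)
{
	struct toy_page page;
	unsigned int a, b, c;

	init_objs(&page);
	a = get_obj(&page);		/* index 0 */
	b = get_obj(&page);		/* index 1 */
	put_obj(&page, a);		/* index 0 becomes free again */
	c = get_obj(&page);		/* LIFO reuse: index 0 again */
	printf("next alloc gets index %u (active=%u)\n", c, page.active);
	(void)b;
	return 0;
}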
2603 2596
2604 /* 2597 /*
2605 * Map pages beginning at addr to the given cache and slab. This is required 2598 * Map pages beginning at addr to the given cache and slab. This is required
2606 * for the slab allocator to be able to look up the cache and slab of a 2599 * for the slab allocator to be able to look up the cache and slab of a
2607 * virtual address for kfree, ksize, and slab debugging. 2600 * virtual address for kfree, ksize, and slab debugging.
2608 */ 2601 */
2609 static void slab_map_pages(struct kmem_cache *cache, struct page *page, 2602 static void slab_map_pages(struct kmem_cache *cache, struct page *page,
2610 void *freelist) 2603 void *freelist)
2611 { 2604 {
2612 page->slab_cache = cache; 2605 page->slab_cache = cache;
2613 page->freelist = freelist; 2606 page->freelist = freelist;
2614 } 2607 }
2615 2608
2616 /* 2609 /*
2617 * Grow (by 1) the number of slabs within a cache. This is called by 2610 * Grow (by 1) the number of slabs within a cache. This is called by
2618 * kmem_cache_alloc() when there are no active objs left in a cache. 2611 * kmem_cache_alloc() when there are no active objs left in a cache.
2619 */ 2612 */
2620 static int cache_grow(struct kmem_cache *cachep, 2613 static int cache_grow(struct kmem_cache *cachep,
2621 gfp_t flags, int nodeid, struct page *page) 2614 gfp_t flags, int nodeid, struct page *page)
2622 { 2615 {
2623 void *freelist; 2616 void *freelist;
2624 size_t offset; 2617 size_t offset;
2625 gfp_t local_flags; 2618 gfp_t local_flags;
2626 struct kmem_cache_node *n; 2619 struct kmem_cache_node *n;
2627 2620
2628 /* 2621 /*
2629 * Be lazy and only check for valid flags here, keeping it out of the 2622 * Be lazy and only check for valid flags here, keeping it out of the
2630 * critical path in kmem_cache_alloc(). 2623 * critical path in kmem_cache_alloc().
2631 */ 2624 */
2632 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2625 BUG_ON(flags & GFP_SLAB_BUG_MASK);
2633 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2626 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2634 2627
2635 /* Take the node list lock to change the colour_next on this node */ 2628 /* Take the node list lock to change the colour_next on this node */
2636 check_irq_off(); 2629 check_irq_off();
2637 n = get_node(cachep, nodeid); 2630 n = get_node(cachep, nodeid);
2638 spin_lock(&n->list_lock); 2631 spin_lock(&n->list_lock);
2639 2632
2640 /* Get colour for the slab, and calculate the next value. */ 2633 /* Get colour for the slab, and calculate the next value. */
2641 offset = n->colour_next; 2634 offset = n->colour_next;
2642 n->colour_next++; 2635 n->colour_next++;
2643 if (n->colour_next >= cachep->colour) 2636 if (n->colour_next >= cachep->colour)
2644 n->colour_next = 0; 2637 n->colour_next = 0;
2645 spin_unlock(&n->list_lock); 2638 spin_unlock(&n->list_lock);
2646 2639
2647 offset *= cachep->colour_off; 2640 offset *= cachep->colour_off;
2648 2641
2649 if (local_flags & __GFP_WAIT) 2642 if (local_flags & __GFP_WAIT)
2650 local_irq_enable(); 2643 local_irq_enable();
2651 2644
2652 /* 2645 /*
2653 * The test for missing atomic flag is performed here, rather than 2646 * The test for missing atomic flag is performed here, rather than
2654 * the more obvious place, simply to reduce the critical path length 2647 * the more obvious place, simply to reduce the critical path length
2655 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2648 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2656 * will eventually be caught here (where it matters). 2649 * will eventually be caught here (where it matters).
2657 */ 2650 */
2658 kmem_flagcheck(cachep, flags); 2651 kmem_flagcheck(cachep, flags);
2659 2652
2660 /* 2653 /*
2661 * Get mem for the objs. Attempt to allocate a physical page from 2654 * Get mem for the objs. Attempt to allocate a physical page from
2662 * 'nodeid'. 2655 * 'nodeid'.
2663 */ 2656 */
2664 if (!page) 2657 if (!page)
2665 page = kmem_getpages(cachep, local_flags, nodeid); 2658 page = kmem_getpages(cachep, local_flags, nodeid);
2666 if (!page) 2659 if (!page)
2667 goto failed; 2660 goto failed;
2668 2661
2669 /* Get slab management. */ 2662 /* Get slab management. */
2670 freelist = alloc_slabmgmt(cachep, page, offset, 2663 freelist = alloc_slabmgmt(cachep, page, offset,
2671 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2664 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2672 if (!freelist) 2665 if (!freelist)
2673 goto opps1; 2666 goto opps1;
2674 2667
2675 slab_map_pages(cachep, page, freelist); 2668 slab_map_pages(cachep, page, freelist);
2676 2669
2677 cache_init_objs(cachep, page); 2670 cache_init_objs(cachep, page);
2678 2671
2679 if (local_flags & __GFP_WAIT) 2672 if (local_flags & __GFP_WAIT)
2680 local_irq_disable(); 2673 local_irq_disable();
2681 check_irq_off(); 2674 check_irq_off();
2682 spin_lock(&n->list_lock); 2675 spin_lock(&n->list_lock);
2683 2676
2684 /* Make slab active. */ 2677 /* Make slab active. */
2685 list_add_tail(&page->lru, &(n->slabs_free)); 2678 list_add_tail(&page->lru, &(n->slabs_free));
2686 STATS_INC_GROWN(cachep); 2679 STATS_INC_GROWN(cachep);
2687 n->free_objects += cachep->num; 2680 n->free_objects += cachep->num;
2688 spin_unlock(&n->list_lock); 2681 spin_unlock(&n->list_lock);
2689 return 1; 2682 return 1;
2690 opps1: 2683 opps1:
2691 kmem_freepages(cachep, page); 2684 kmem_freepages(cachep, page);
2692 failed: 2685 failed:
2693 if (local_flags & __GFP_WAIT) 2686 if (local_flags & __GFP_WAIT)
2694 local_irq_disable(); 2687 local_irq_disable();
2695 return 0; 2688 return 0;
2696 } 2689 }
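For illustration, a small sketch of the slab colouring cycle driven by cache_grow() above; the colour count and offset are assumed values:

#include <stdio.h>

int main(void)
{
	unsigned int colour = 4;	/* cachep->colour: number of offsets */
	unsigned int colour_off = 64;	/* cachep->colour_off: cacheline size */
	unsigned int colour_next = 0;	/* n->colour_next, kept per node */

	/* Each new slab gets the next colour; the counter wraps around. */
	for (int slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next * colour_off;

		colour_next = (colour_next + 1) % colour;
		printf("slab %d starts objects at byte offset %u\n",
		       slab, offset);
	}
	return 0;
}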
2697 2690
2698 #if DEBUG 2691 #if DEBUG
2699 2692
2700 /* 2693 /*
2701 * Perform extra freeing checks: 2694 * Perform extra freeing checks:
2702 * - detect bad pointers. 2695 * - detect bad pointers.
2703 * - POISON/RED_ZONE checking 2696 * - POISON/RED_ZONE checking
2704 */ 2697 */
2705 static void kfree_debugcheck(const void *objp) 2698 static void kfree_debugcheck(const void *objp)
2706 { 2699 {
2707 if (!virt_addr_valid(objp)) { 2700 if (!virt_addr_valid(objp)) {
2708 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2701 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2709 (unsigned long)objp); 2702 (unsigned long)objp);
2710 BUG(); 2703 BUG();
2711 } 2704 }
2712 } 2705 }
2713 2706
2714 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2707 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2715 { 2708 {
2716 unsigned long long redzone1, redzone2; 2709 unsigned long long redzone1, redzone2;
2717 2710
2718 redzone1 = *dbg_redzone1(cache, obj); 2711 redzone1 = *dbg_redzone1(cache, obj);
2719 redzone2 = *dbg_redzone2(cache, obj); 2712 redzone2 = *dbg_redzone2(cache, obj);
2720 2713
2721 /* 2714 /*
2722 * Redzone is ok. 2715 * Redzone is ok.
2723 */ 2716 */
2724 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2717 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2725 return; 2718 return;
2726 2719
2727 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2720 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2728 slab_error(cache, "double free detected"); 2721 slab_error(cache, "double free detected");
2729 else 2722 else
2730 slab_error(cache, "memory outside object was overwritten"); 2723 slab_error(cache, "memory outside object was overwritten");
2731 2724
2732 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", 2725 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2733 obj, redzone1, redzone2); 2726 obj, redzone1, redzone2);
2734 } 2727 }
2735 2728
2736 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2729 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2737 unsigned long caller) 2730 unsigned long caller)
2738 { 2731 {
2739 unsigned int objnr; 2732 unsigned int objnr;
2740 struct page *page; 2733 struct page *page;
2741 2734
2742 BUG_ON(virt_to_cache(objp) != cachep); 2735 BUG_ON(virt_to_cache(objp) != cachep);
2743 2736
2744 objp -= obj_offset(cachep); 2737 objp -= obj_offset(cachep);
2745 kfree_debugcheck(objp); 2738 kfree_debugcheck(objp);
2746 page = virt_to_head_page(objp); 2739 page = virt_to_head_page(objp);
2747 2740
2748 if (cachep->flags & SLAB_RED_ZONE) { 2741 if (cachep->flags & SLAB_RED_ZONE) {
2749 verify_redzone_free(cachep, objp); 2742 verify_redzone_free(cachep, objp);
2750 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2743 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2751 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2744 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2752 } 2745 }
2753 if (cachep->flags & SLAB_STORE_USER) 2746 if (cachep->flags & SLAB_STORE_USER)
2754 *dbg_userword(cachep, objp) = (void *)caller; 2747 *dbg_userword(cachep, objp) = (void *)caller;
2755 2748
2756 objnr = obj_to_index(cachep, page, objp); 2749 objnr = obj_to_index(cachep, page, objp);
2757 2750
2758 BUG_ON(objnr >= cachep->num); 2751 BUG_ON(objnr >= cachep->num);
2759 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2752 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2760 2753
2761 set_obj_status(page, objnr, OBJECT_FREE); 2754 set_obj_status(page, objnr, OBJECT_FREE);
2762 if (cachep->flags & SLAB_POISON) { 2755 if (cachep->flags & SLAB_POISON) {
2763 #ifdef CONFIG_DEBUG_PAGEALLOC 2756 #ifdef CONFIG_DEBUG_PAGEALLOC
2764 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2757 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2765 store_stackinfo(cachep, objp, caller); 2758 store_stackinfo(cachep, objp, caller);
2766 kernel_map_pages(virt_to_page(objp), 2759 kernel_map_pages(virt_to_page(objp),
2767 cachep->size / PAGE_SIZE, 0); 2760 cachep->size / PAGE_SIZE, 0);
2768 } else { 2761 } else {
2769 poison_obj(cachep, objp, POISON_FREE); 2762 poison_obj(cachep, objp, POISON_FREE);
2770 } 2763 }
2771 #else 2764 #else
2772 poison_obj(cachep, objp, POISON_FREE); 2765 poison_obj(cachep, objp, POISON_FREE);
2773 #endif 2766 #endif
2774 } 2767 }
2775 return objp; 2768 return objp;
2776 } 2769 }
2777 2770
2778 #else 2771 #else
2779 #define kfree_debugcheck(x) do { } while(0) 2772 #define kfree_debugcheck(x) do { } while(0)
2780 #define cache_free_debugcheck(x,objp,z) (objp) 2773 #define cache_free_debugcheck(x,objp,z) (objp)
2781 #endif 2774 #endif
2782 2775
2783 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2776 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
2784 bool force_refill) 2777 bool force_refill)
2785 { 2778 {
2786 int batchcount; 2779 int batchcount;
2787 struct kmem_cache_node *n; 2780 struct kmem_cache_node *n;
2788 struct array_cache *ac; 2781 struct array_cache *ac;
2789 int node; 2782 int node;
2790 2783
2791 check_irq_off(); 2784 check_irq_off();
2792 node = numa_mem_id(); 2785 node = numa_mem_id();
2793 if (unlikely(force_refill)) 2786 if (unlikely(force_refill))
2794 goto force_grow; 2787 goto force_grow;
2795 retry: 2788 retry:
2796 ac = cpu_cache_get(cachep); 2789 ac = cpu_cache_get(cachep);
2797 batchcount = ac->batchcount; 2790 batchcount = ac->batchcount;
2798 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2791 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2799 /* 2792 /*
2800 * If there was little recent activity on this cache, then 2793 * If there was little recent activity on this cache, then
2801 * perform only a partial refill. Otherwise we could generate 2794 * perform only a partial refill. Otherwise we could generate
2802 * refill bouncing. 2795 * refill bouncing.
2803 */ 2796 */
2804 batchcount = BATCHREFILL_LIMIT; 2797 batchcount = BATCHREFILL_LIMIT;
2805 } 2798 }
2806 n = get_node(cachep, node); 2799 n = get_node(cachep, node);
2807 2800
2808 BUG_ON(ac->avail > 0 || !n); 2801 BUG_ON(ac->avail > 0 || !n);
2809 spin_lock(&n->list_lock); 2802 spin_lock(&n->list_lock);
2810 2803
2811 /* See if we can refill from the shared array */ 2804 /* See if we can refill from the shared array */
2812 if (n->shared && transfer_objects(ac, n->shared, batchcount)) { 2805 if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
2813 n->shared->touched = 1; 2806 n->shared->touched = 1;
2814 goto alloc_done; 2807 goto alloc_done;
2815 } 2808 }
2816 2809
2817 while (batchcount > 0) { 2810 while (batchcount > 0) {
2818 struct list_head *entry; 2811 struct list_head *entry;
2819 struct page *page; 2812 struct page *page;
2820 /* Get the slab the allocation is to come from. */ 2813 /* Get the slab the allocation is to come from. */
2821 entry = n->slabs_partial.next; 2814 entry = n->slabs_partial.next;
2822 if (entry == &n->slabs_partial) { 2815 if (entry == &n->slabs_partial) {
2823 n->free_touched = 1; 2816 n->free_touched = 1;
2824 entry = n->slabs_free.next; 2817 entry = n->slabs_free.next;
2825 if (entry == &n->slabs_free) 2818 if (entry == &n->slabs_free)
2826 goto must_grow; 2819 goto must_grow;
2827 } 2820 }
2828 2821
2829 page = list_entry(entry, struct page, lru); 2822 page = list_entry(entry, struct page, lru);
2830 check_spinlock_acquired(cachep); 2823 check_spinlock_acquired(cachep);
2831 2824
2832 /* 2825 /*
2833 * The slab was either on the partial or free list, so 2826 * The slab was either on the partial or free list, so
2834 * there must be at least one object available for 2827 * there must be at least one object available for
2835 * allocation. 2828 * allocation.
2836 */ 2829 */
2837 BUG_ON(page->active >= cachep->num); 2830 BUG_ON(page->active >= cachep->num);
2838 2831
2839 while (page->active < cachep->num && batchcount--) { 2832 while (page->active < cachep->num && batchcount--) {
2840 STATS_INC_ALLOCED(cachep); 2833 STATS_INC_ALLOCED(cachep);
2841 STATS_INC_ACTIVE(cachep); 2834 STATS_INC_ACTIVE(cachep);
2842 STATS_SET_HIGH(cachep); 2835 STATS_SET_HIGH(cachep);
2843 2836
2844 ac_put_obj(cachep, ac, slab_get_obj(cachep, page, 2837 ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
2845 node)); 2838 node));
2846 } 2839 }
2847 2840
2848 /* move slabp to correct slabp list: */ 2841 /* move slabp to correct slabp list: */
2849 list_del(&page->lru); 2842 list_del(&page->lru);
2850 if (page->active == cachep->num) 2843 if (page->active == cachep->num)
2851 list_add(&page->lru, &n->slabs_full); 2844 list_add(&page->lru, &n->slabs_full);
2852 else 2845 else
2853 list_add(&page->lru, &n->slabs_partial); 2846 list_add(&page->lru, &n->slabs_partial);
2854 } 2847 }
2855 2848
2856 must_grow: 2849 must_grow:
2857 n->free_objects -= ac->avail; 2850 n->free_objects -= ac->avail;
2858 alloc_done: 2851 alloc_done:
2859 spin_unlock(&n->list_lock); 2852 spin_unlock(&n->list_lock);
2860 2853
2861 if (unlikely(!ac->avail)) { 2854 if (unlikely(!ac->avail)) {
2862 int x; 2855 int x;
2863 force_grow: 2856 force_grow:
2864 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 2857 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
2865 2858
2866 /* cache_grow can reenable interrupts, then ac could change. */ 2859 /* cache_grow can reenable interrupts, then ac could change. */
2867 ac = cpu_cache_get(cachep); 2860 ac = cpu_cache_get(cachep);
2868 node = numa_mem_id(); 2861 node = numa_mem_id();
2869 2862
2870 /* no objects in sight? abort */ 2863 /* no objects in sight? abort */
2871 if (!x && (ac->avail == 0 || force_refill)) 2864 if (!x && (ac->avail == 0 || force_refill))
2872 return NULL; 2865 return NULL;
2873 2866
2874 if (!ac->avail) /* objects refilled by interrupt? */ 2867 if (!ac->avail) /* objects refilled by interrupt? */
2875 goto retry; 2868 goto retry;
2876 } 2869 }
2877 ac->touched = 1; 2870 ac->touched = 1;
2878 2871
2879 return ac_get_obj(cachep, ac, flags, force_refill); 2872 return ac_get_obj(cachep, ac, flags, force_refill);
2880 } 2873 }
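For illustration, a toy model of the refill logic in cache_alloc_refill() above; the array-cache structure, backing pool and batch size are invented stand-ins for the per-CPU array and the node slab lists:

#include <stdio.h>

#define POOL_OBJS 16

struct array_cache {
	unsigned int avail;		/* objects currently in the array */
	unsigned int batchcount;	/* how many to pull on a refill */
	void *entry[POOL_OBJS];
};

static void *backing_pool[POOL_OBJS];	/* stands in for the node lists */
static unsigned int pool_avail = POOL_OBJS;

/* Move up to batchcount objects from the backing pool into ac. */
static void refill(struct array_cache *ac)
{
	while (ac->avail < ac->batchcount && pool_avail > 0)
		ac->entry[ac->avail++] = backing_pool[--pool_avail];
}

static void *cache_alloc(struct array_cache *ac)
{
	if (ac->avail == 0) {		/* miss: cache_alloc_refill() */
		refill(ac);
		if (ac->avail == 0)
			return NULL;	/* backing pool exhausted too */
	}
	return ac->entry[--ac->avail];	/* hit: pop from the per-CPU array */
}

int main(void)
{
	static int objects[POOL_OBJS];
	struct array_cache ac = { .avail = 0, .batchcount = 4 };

	for (int i = 0; i < POOL_OBJS; i++)
		backing_pool[i] = &objects[i];

	for (int i = 0; i < 6; i++)
		printf("alloc %d -> %p\n", i, cache_alloc(&ac));
	return 0;
}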
2881 2874
2882 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 2875 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2883 gfp_t flags) 2876 gfp_t flags)
2884 { 2877 {
2885 might_sleep_if(flags & __GFP_WAIT); 2878 might_sleep_if(flags & __GFP_WAIT);
2886 #if DEBUG 2879 #if DEBUG
2887 kmem_flagcheck(cachep, flags); 2880 kmem_flagcheck(cachep, flags);
2888 #endif 2881 #endif
2889 } 2882 }
2890 2883
2891 #if DEBUG 2884 #if DEBUG
2892 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2885 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2893 gfp_t flags, void *objp, unsigned long caller) 2886 gfp_t flags, void *objp, unsigned long caller)
2894 { 2887 {
2895 struct page *page; 2888 struct page *page;
2896 2889
2897 if (!objp) 2890 if (!objp)
2898 return objp; 2891 return objp;
2899 if (cachep->flags & SLAB_POISON) { 2892 if (cachep->flags & SLAB_POISON) {
2900 #ifdef CONFIG_DEBUG_PAGEALLOC 2893 #ifdef CONFIG_DEBUG_PAGEALLOC
2901 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2894 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2902 kernel_map_pages(virt_to_page(objp), 2895 kernel_map_pages(virt_to_page(objp),
2903 cachep->size / PAGE_SIZE, 1); 2896 cachep->size / PAGE_SIZE, 1);
2904 else 2897 else
2905 check_poison_obj(cachep, objp); 2898 check_poison_obj(cachep, objp);
2906 #else 2899 #else
2907 check_poison_obj(cachep, objp); 2900 check_poison_obj(cachep, objp);
2908 #endif 2901 #endif
2909 poison_obj(cachep, objp, POISON_INUSE); 2902 poison_obj(cachep, objp, POISON_INUSE);
2910 } 2903 }
2911 if (cachep->flags & SLAB_STORE_USER) 2904 if (cachep->flags & SLAB_STORE_USER)
2912 *dbg_userword(cachep, objp) = (void *)caller; 2905 *dbg_userword(cachep, objp) = (void *)caller;
2913 2906
2914 if (cachep->flags & SLAB_RED_ZONE) { 2907 if (cachep->flags & SLAB_RED_ZONE) {
2915 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 2908 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2916 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2909 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2917 slab_error(cachep, "double free, or memory outside" 2910 slab_error(cachep, "double free, or memory outside"
2918 " object was overwritten"); 2911 " object was overwritten");
2919 printk(KERN_ERR 2912 printk(KERN_ERR
2920 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", 2913 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
2921 objp, *dbg_redzone1(cachep, objp), 2914 objp, *dbg_redzone1(cachep, objp),
2922 *dbg_redzone2(cachep, objp)); 2915 *dbg_redzone2(cachep, objp));
2923 } 2916 }
2924 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2917 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2925 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2918 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2926 } 2919 }
2927 2920
2928 page = virt_to_head_page(objp); 2921 page = virt_to_head_page(objp);
2929 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); 2922 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
2930 objp += obj_offset(cachep); 2923 objp += obj_offset(cachep);
2931 if (cachep->ctor && cachep->flags & SLAB_POISON) 2924 if (cachep->ctor && cachep->flags & SLAB_POISON)
2932 cachep->ctor(objp); 2925 cachep->ctor(objp);
2933 if (ARCH_SLAB_MINALIGN && 2926 if (ARCH_SLAB_MINALIGN &&
2934 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { 2927 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
2935 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 2928 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
2936 objp, (int)ARCH_SLAB_MINALIGN); 2929 objp, (int)ARCH_SLAB_MINALIGN);
2937 } 2930 }
2938 return objp; 2931 return objp;
2939 } 2932 }
2940 #else 2933 #else
2941 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2934 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2942 #endif 2935 #endif
2943 2936
2944 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 2937 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
2945 { 2938 {
2946 if (unlikely(cachep == kmem_cache)) 2939 if (unlikely(cachep == kmem_cache))
2947 return false; 2940 return false;
2948 2941
2949 return should_failslab(cachep->object_size, flags, cachep->flags); 2942 return should_failslab(cachep->object_size, flags, cachep->flags);
2950 } 2943 }
2951 2944
2952 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2945 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2953 { 2946 {
2954 void *objp; 2947 void *objp;
2955 struct array_cache *ac; 2948 struct array_cache *ac;
2956 bool force_refill = false; 2949 bool force_refill = false;
2957 2950
2958 check_irq_off(); 2951 check_irq_off();
2959 2952
2960 ac = cpu_cache_get(cachep); 2953 ac = cpu_cache_get(cachep);
2961 if (likely(ac->avail)) { 2954 if (likely(ac->avail)) {
2962 ac->touched = 1; 2955 ac->touched = 1;
2963 objp = ac_get_obj(cachep, ac, flags, false); 2956 objp = ac_get_obj(cachep, ac, flags, false);
2964 2957
2965 /* 2958 /*
2966 * Allow for the possibility that all available objects are not allowed 2959 * Allow for the possibility that all available objects are not allowed
2967 * by the current flags 2960 * by the current flags
2968 */ 2961 */
2969 if (objp) { 2962 if (objp) {
2970 STATS_INC_ALLOCHIT(cachep); 2963 STATS_INC_ALLOCHIT(cachep);
2971 goto out; 2964 goto out;
2972 } 2965 }
2973 force_refill = true; 2966 force_refill = true;
2974 } 2967 }
2975 2968
2976 STATS_INC_ALLOCMISS(cachep); 2969 STATS_INC_ALLOCMISS(cachep);
2977 objp = cache_alloc_refill(cachep, flags, force_refill); 2970 objp = cache_alloc_refill(cachep, flags, force_refill);
2978 /* 2971 /*
2979 * the 'ac' may be updated by cache_alloc_refill(), 2972 * the 'ac' may be updated by cache_alloc_refill(),
2980 * and kmemleak_erase() requires its correct value. 2973 * and kmemleak_erase() requires its correct value.
2981 */ 2974 */
2982 ac = cpu_cache_get(cachep); 2975 ac = cpu_cache_get(cachep);
2983 2976
2984 out: 2977 out:
2985 /* 2978 /*
2986 * To avoid a false negative, if an object that is in one of the 2979 * To avoid a false negative, if an object that is in one of the
2987 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 2980 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
2988 * treat the array pointers as a reference to the object. 2981 * treat the array pointers as a reference to the object.
2989 */ 2982 */
2990 if (objp) 2983 if (objp)
2991 kmemleak_erase(&ac->entry[ac->avail]); 2984 kmemleak_erase(&ac->entry[ac->avail]);
2992 return objp; 2985 return objp;
2993 } 2986 }
2994 2987
2995 #ifdef CONFIG_NUMA 2988 #ifdef CONFIG_NUMA
2996 /* 2989 /*
2997 * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set. 2990 * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set.
2998 * 2991 *
2999 * If we are in_interrupt, then process context, including cpusets and 2992 * If we are in_interrupt, then process context, including cpusets and
3000 * mempolicy, may not apply and should not be used for allocation policy. 2993 * mempolicy, may not apply and should not be used for allocation policy.
3001 */ 2994 */
3002 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 2995 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3003 { 2996 {
3004 int nid_alloc, nid_here; 2997 int nid_alloc, nid_here;
3005 2998
3006 if (in_interrupt() || (flags & __GFP_THISNODE)) 2999 if (in_interrupt() || (flags & __GFP_THISNODE))
3007 return NULL; 3000 return NULL;
3008 nid_alloc = nid_here = numa_mem_id(); 3001 nid_alloc = nid_here = numa_mem_id();
3009 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3002 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3010 nid_alloc = cpuset_slab_spread_node(); 3003 nid_alloc = cpuset_slab_spread_node();
3011 else if (current->mempolicy) 3004 else if (current->mempolicy)
3012 nid_alloc = mempolicy_slab_node(); 3005 nid_alloc = mempolicy_slab_node();
3013 if (nid_alloc != nid_here) 3006 if (nid_alloc != nid_here)
3014 return ____cache_alloc_node(cachep, flags, nid_alloc); 3007 return ____cache_alloc_node(cachep, flags, nid_alloc);
3015 return NULL; 3008 return NULL;
3016 } 3009 }
3017 3010
3018 /* 3011 /*
3019 * Fallback function if there was no memory available and no objects on a 3012 * Fallback function if there was no memory available and no objects on a
3020 * certain node and fallback is permitted. First we scan all the 3013 * certain node and fallback is permitted. First we scan all the
3021 * available nodes for available objects. If that fails then we 3014 * available nodes for available objects. If that fails then we
3022 * perform an allocation without specifying a node. This allows the page 3015 * perform an allocation without specifying a node. This allows the page
3023 * allocator to do its reclaim / fallback magic. We then insert the 3016 * allocator to do its reclaim / fallback magic. We then insert the
3024 * slab into the proper nodelist and then allocate from it. 3017 * slab into the proper nodelist and then allocate from it.
3025 */ 3018 */
3026 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3019 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3027 { 3020 {
3028 struct zonelist *zonelist; 3021 struct zonelist *zonelist;
3029 gfp_t local_flags; 3022 gfp_t local_flags;
3030 struct zoneref *z; 3023 struct zoneref *z;
3031 struct zone *zone; 3024 struct zone *zone;
3032 enum zone_type high_zoneidx = gfp_zone(flags); 3025 enum zone_type high_zoneidx = gfp_zone(flags);
3033 void *obj = NULL; 3026 void *obj = NULL;
3034 int nid; 3027 int nid;
3035 unsigned int cpuset_mems_cookie; 3028 unsigned int cpuset_mems_cookie;
3036 3029
3037 if (flags & __GFP_THISNODE) 3030 if (flags & __GFP_THISNODE)
3038 return NULL; 3031 return NULL;
3039 3032
3040 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3033 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3041 3034
3042 retry_cpuset: 3035 retry_cpuset:
3043 cpuset_mems_cookie = read_mems_allowed_begin(); 3036 cpuset_mems_cookie = read_mems_allowed_begin();
3044 zonelist = node_zonelist(mempolicy_slab_node(), flags); 3037 zonelist = node_zonelist(mempolicy_slab_node(), flags);
3045 3038
3046 retry: 3039 retry:
3047 /* 3040 /*
3048 * Look through allowed nodes for objects available 3041 * Look through allowed nodes for objects available
3049 * from existing per node queues. 3042 * from existing per node queues.
3050 */ 3043 */
3051 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3044 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3052 nid = zone_to_nid(zone); 3045 nid = zone_to_nid(zone);
3053 3046
3054 if (cpuset_zone_allowed_hardwall(zone, flags) && 3047 if (cpuset_zone_allowed_hardwall(zone, flags) &&
3055 get_node(cache, nid) && 3048 get_node(cache, nid) &&
3056 get_node(cache, nid)->free_objects) { 3049 get_node(cache, nid)->free_objects) {
3057 obj = ____cache_alloc_node(cache, 3050 obj = ____cache_alloc_node(cache,
3058 flags | GFP_THISNODE, nid); 3051 flags | GFP_THISNODE, nid);
3059 if (obj) 3052 if (obj)
3060 break; 3053 break;
3061 } 3054 }
3062 } 3055 }
3063 3056
3064 if (!obj) { 3057 if (!obj) {
3065 /* 3058 /*
3066 * This allocation will be performed within the constraints 3059 * This allocation will be performed within the constraints
3067 * of the current cpuset / memory policy requirements. 3060 * of the current cpuset / memory policy requirements.
3068 * We may trigger various forms of reclaim on the allowed 3061 * We may trigger various forms of reclaim on the allowed
3069 * set and go into memory reserves if necessary. 3062 * set and go into memory reserves if necessary.
3070 */ 3063 */
3071 struct page *page; 3064 struct page *page;
3072 3065
3073 if (local_flags & __GFP_WAIT) 3066 if (local_flags & __GFP_WAIT)
3074 local_irq_enable(); 3067 local_irq_enable();
3075 kmem_flagcheck(cache, flags); 3068 kmem_flagcheck(cache, flags);
3076 page = kmem_getpages(cache, local_flags, numa_mem_id()); 3069 page = kmem_getpages(cache, local_flags, numa_mem_id());
3077 if (local_flags & __GFP_WAIT) 3070 if (local_flags & __GFP_WAIT)
3078 local_irq_disable(); 3071 local_irq_disable();
3079 if (page) { 3072 if (page) {
3080 /* 3073 /*
3081 * Insert into the appropriate per node queues 3074 * Insert into the appropriate per node queues
3082 */ 3075 */
3083 nid = page_to_nid(page); 3076 nid = page_to_nid(page);
3084 if (cache_grow(cache, flags, nid, page)) { 3077 if (cache_grow(cache, flags, nid, page)) {
3085 obj = ____cache_alloc_node(cache, 3078 obj = ____cache_alloc_node(cache,
3086 flags | GFP_THISNODE, nid); 3079 flags | GFP_THISNODE, nid);
3087 if (!obj) 3080 if (!obj)
3088 /* 3081 /*
3089 * Another processor may allocate the 3082 * Another processor may allocate the
3090 * objects in the slab since we are 3083 * objects in the slab since we are
3091 * not holding any locks. 3084 * not holding any locks.
3092 */ 3085 */
3093 goto retry; 3086 goto retry;
3094 } else { 3087 } else {
3095 /* cache_grow already freed obj */ 3088 /* cache_grow already freed obj */
3096 obj = NULL; 3089 obj = NULL;
3097 } 3090 }
3098 } 3091 }
3099 } 3092 }
3100 3093
3101 if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) 3094 if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
3102 goto retry_cpuset; 3095 goto retry_cpuset;
3103 return obj; 3096 return obj;
3104 } 3097 }
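For illustration, a simplified sketch of the fallback order implemented by fallback_alloc() above; the node counts and allowed set are assumptions, and the real code walks a zonelist under cpuset constraints:

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES 4

static int free_objects[NR_NODES] = { 0, 0, 3, 0 };	/* per-node counts */

static bool alloc_from_node(int nid)
{
	if (free_objects[nid] == 0)
		return false;
	free_objects[nid]--;
	return true;
}

int main(void)
{
	int allowed[] = { 0, 2, 3 };	/* nodes permitted by cpuset/policy */
	int got = -1;

	/* 1) scan allowed nodes for already-queued free objects */
	for (unsigned int i = 0; i < sizeof(allowed) / sizeof(allowed[0]); i++) {
		if (alloc_from_node(allowed[i])) {
			got = allowed[i];
			break;
		}
	}

	/* 2) otherwise fall back to growing a new slab on any node */
	if (got < 0)
		printf("no queued objects; grow a new slab instead\n");
	else
		printf("allocated from node %d\n", got);
	return 0;
}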
3105 3098
3106 /* 3099 /*
3107 * An interface to enable slab creation on nodeid 3100 * An interface to enable slab creation on nodeid
3108 */ 3101 */
3109 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3102 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3110 int nodeid) 3103 int nodeid)
3111 { 3104 {
3112 struct list_head *entry; 3105 struct list_head *entry;
3113 struct page *page; 3106 struct page *page;
3114 struct kmem_cache_node *n; 3107 struct kmem_cache_node *n;
3115 void *obj; 3108 void *obj;
3116 int x; 3109 int x;
3117 3110
3118 VM_BUG_ON(nodeid > num_online_nodes()); 3111 VM_BUG_ON(nodeid > num_online_nodes());
3119 n = get_node(cachep, nodeid); 3112 n = get_node(cachep, nodeid);
3120 BUG_ON(!n); 3113 BUG_ON(!n);
3121 3114
3122 retry: 3115 retry:
3123 check_irq_off(); 3116 check_irq_off();
3124 spin_lock(&n->list_lock); 3117 spin_lock(&n->list_lock);
3125 entry = n->slabs_partial.next; 3118 entry = n->slabs_partial.next;
3126 if (entry == &n->slabs_partial) { 3119 if (entry == &n->slabs_partial) {
3127 n->free_touched = 1; 3120 n->free_touched = 1;
3128 entry = n->slabs_free.next; 3121 entry = n->slabs_free.next;
3129 if (entry == &n->slabs_free) 3122 if (entry == &n->slabs_free)
3130 goto must_grow; 3123 goto must_grow;
3131 } 3124 }
3132 3125
3133 page = list_entry(entry, struct page, lru); 3126 page = list_entry(entry, struct page, lru);
3134 check_spinlock_acquired_node(cachep, nodeid); 3127 check_spinlock_acquired_node(cachep, nodeid);
3135 3128
3136 STATS_INC_NODEALLOCS(cachep); 3129 STATS_INC_NODEALLOCS(cachep);
3137 STATS_INC_ACTIVE(cachep); 3130 STATS_INC_ACTIVE(cachep);
3138 STATS_SET_HIGH(cachep); 3131 STATS_SET_HIGH(cachep);
3139 3132
3140 BUG_ON(page->active == cachep->num); 3133 BUG_ON(page->active == cachep->num);
3141 3134
3142 obj = slab_get_obj(cachep, page, nodeid); 3135 obj = slab_get_obj(cachep, page, nodeid);
3143 n->free_objects--; 3136 n->free_objects--;
3144 /* move slabp to correct slabp list: */ 3137 /* move slabp to correct slabp list: */
3145 list_del(&page->lru); 3138 list_del(&page->lru);
3146 3139
3147 if (page->active == cachep->num) 3140 if (page->active == cachep->num)
3148 list_add(&page->lru, &n->slabs_full); 3141 list_add(&page->lru, &n->slabs_full);
3149 else 3142 else
3150 list_add(&page->lru, &n->slabs_partial); 3143 list_add(&page->lru, &n->slabs_partial);
3151 3144
3152 spin_unlock(&n->list_lock); 3145 spin_unlock(&n->list_lock);
3153 goto done; 3146 goto done;
3154 3147
3155 must_grow: 3148 must_grow:
3156 spin_unlock(&n->list_lock); 3149 spin_unlock(&n->list_lock);
3157 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3150 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3158 if (x) 3151 if (x)
3159 goto retry; 3152 goto retry;
3160 3153
3161 return fallback_alloc(cachep, flags); 3154 return fallback_alloc(cachep, flags);
3162 3155
3163 done: 3156 done:
3164 return obj; 3157 return obj;
3165 } 3158 }
3166 3159
3167 static __always_inline void * 3160 static __always_inline void *
3168 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3161 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3169 unsigned long caller) 3162 unsigned long caller)
3170 { 3163 {
3171 unsigned long save_flags; 3164 unsigned long save_flags;
3172 void *ptr; 3165 void *ptr;
3173 int slab_node = numa_mem_id(); 3166 int slab_node = numa_mem_id();
3174 3167
3175 flags &= gfp_allowed_mask; 3168 flags &= gfp_allowed_mask;
3176 3169
3177 lockdep_trace_alloc(flags); 3170 lockdep_trace_alloc(flags);
3178 3171
3179 if (slab_should_failslab(cachep, flags)) 3172 if (slab_should_failslab(cachep, flags))
3180 return NULL; 3173 return NULL;
3181 3174
3182 cachep = memcg_kmem_get_cache(cachep, flags); 3175 cachep = memcg_kmem_get_cache(cachep, flags);
3183 3176
3184 cache_alloc_debugcheck_before(cachep, flags); 3177 cache_alloc_debugcheck_before(cachep, flags);
3185 local_irq_save(save_flags); 3178 local_irq_save(save_flags);
3186 3179
3187 if (nodeid == NUMA_NO_NODE) 3180 if (nodeid == NUMA_NO_NODE)
3188 nodeid = slab_node; 3181 nodeid = slab_node;
3189 3182
3190 if (unlikely(!get_node(cachep, nodeid))) { 3183 if (unlikely(!get_node(cachep, nodeid))) {
3191 /* Node not bootstrapped yet */ 3184 /* Node not bootstrapped yet */
3192 ptr = fallback_alloc(cachep, flags); 3185 ptr = fallback_alloc(cachep, flags);
3193 goto out; 3186 goto out;
3194 } 3187 }
3195 3188
3196 if (nodeid == slab_node) { 3189 if (nodeid == slab_node) {
3197 /* 3190 /*
3198 * Use the locally cached objects if possible. 3191 * Use the locally cached objects if possible.
3199 * However ____cache_alloc does not allow fallback 3192 * However ____cache_alloc does not allow fallback
3200 * to other nodes. It may fail while we still have 3193 * to other nodes. It may fail while we still have
3201 * objects on other nodes available. 3194 * objects on other nodes available.
3202 */ 3195 */
3203 ptr = ____cache_alloc(cachep, flags); 3196 ptr = ____cache_alloc(cachep, flags);
3204 if (ptr) 3197 if (ptr)
3205 goto out; 3198 goto out;
3206 } 3199 }
3207 /* ___cache_alloc_node can fall back to other nodes */ 3200 /* ___cache_alloc_node can fall back to other nodes */
3208 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3201 ptr = ____cache_alloc_node(cachep, flags, nodeid);
3209 out: 3202 out:
3210 local_irq_restore(save_flags); 3203 local_irq_restore(save_flags);
3211 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3204 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3212 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, 3205 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3213 flags); 3206 flags);
3214 3207
3215 if (likely(ptr)) { 3208 if (likely(ptr)) {
3216 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3209 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3217 if (unlikely(flags & __GFP_ZERO)) 3210 if (unlikely(flags & __GFP_ZERO))
3218 memset(ptr, 0, cachep->object_size); 3211 memset(ptr, 0, cachep->object_size);
3219 } 3212 }
3220 3213
3221 return ptr; 3214 return ptr;
3222 } 3215 }
3223 3216
3224 static __always_inline void * 3217 static __always_inline void *
3225 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3218 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3226 { 3219 {
3227 void *objp; 3220 void *objp;
3228 3221
3229 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { 3222 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
3230 objp = alternate_node_alloc(cache, flags); 3223 objp = alternate_node_alloc(cache, flags);
3231 if (objp) 3224 if (objp)
3232 goto out; 3225 goto out;
3233 } 3226 }
3234 objp = ____cache_alloc(cache, flags); 3227 objp = ____cache_alloc(cache, flags);
3235 3228
3236 /* 3229 /*
3237 * We may just have run out of memory on the local node. 3230 * We may just have run out of memory on the local node.
3238 * ____cache_alloc_node() knows how to locate memory on other nodes 3231 * ____cache_alloc_node() knows how to locate memory on other nodes
3239 */ 3232 */
3240 if (!objp) 3233 if (!objp)
3241 objp = ____cache_alloc_node(cache, flags, numa_mem_id()); 3234 objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3242 3235
3243 out: 3236 out:
3244 return objp; 3237 return objp;
3245 } 3238 }
3246 #else 3239 #else
3247 3240
3248 static __always_inline void * 3241 static __always_inline void *
3249 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3242 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3250 { 3243 {
3251 return ____cache_alloc(cachep, flags); 3244 return ____cache_alloc(cachep, flags);
3252 } 3245 }
3253 3246
3254 #endif /* CONFIG_NUMA */ 3247 #endif /* CONFIG_NUMA */
3255 3248
3256 static __always_inline void * 3249 static __always_inline void *
3257 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) 3250 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3258 { 3251 {
3259 unsigned long save_flags; 3252 unsigned long save_flags;
3260 void *objp; 3253 void *objp;
3261 3254
3262 flags &= gfp_allowed_mask; 3255 flags &= gfp_allowed_mask;
3263 3256
3264 lockdep_trace_alloc(flags); 3257 lockdep_trace_alloc(flags);
3265 3258
3266 if (slab_should_failslab(cachep, flags)) 3259 if (slab_should_failslab(cachep, flags))
3267 return NULL; 3260 return NULL;
3268 3261
3269 cachep = memcg_kmem_get_cache(cachep, flags); 3262 cachep = memcg_kmem_get_cache(cachep, flags);
3270 3263
3271 cache_alloc_debugcheck_before(cachep, flags); 3264 cache_alloc_debugcheck_before(cachep, flags);
3272 local_irq_save(save_flags); 3265 local_irq_save(save_flags);
3273 objp = __do_cache_alloc(cachep, flags); 3266 objp = __do_cache_alloc(cachep, flags);
3274 local_irq_restore(save_flags); 3267 local_irq_restore(save_flags);
3275 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3268 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3276 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, 3269 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3277 flags); 3270 flags);
3278 prefetchw(objp); 3271 prefetchw(objp);
3279 3272
3280 if (likely(objp)) { 3273 if (likely(objp)) {
3281 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3274 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3282 if (unlikely(flags & __GFP_ZERO)) 3275 if (unlikely(flags & __GFP_ZERO))
3283 memset(objp, 0, cachep->object_size); 3276 memset(objp, 0, cachep->object_size);
3284 } 3277 }
3285 3278
3286 return objp; 3279 return objp;
3287 } 3280 }
3288 3281
3289 /* 3282 /*
3290 * Caller needs to acquire the correct kmem_cache_node's list_lock 3283 * Caller needs to acquire the correct kmem_cache_node's list_lock
3291 * @list: List of detached free slabs should be freed by caller 3284 * @list: List of detached free slabs should be freed by caller
3292 */ 3285 */
3293 static void free_block(struct kmem_cache *cachep, void **objpp, 3286 static void free_block(struct kmem_cache *cachep, void **objpp,
3294 int nr_objects, int node, struct list_head *list) 3287 int nr_objects, int node, struct list_head *list)
3295 { 3288 {
3296 int i; 3289 int i;
3297 struct kmem_cache_node *n = get_node(cachep, node); 3290 struct kmem_cache_node *n = get_node(cachep, node);
3298 3291
3299 for (i = 0; i < nr_objects; i++) { 3292 for (i = 0; i < nr_objects; i++) {
3300 void *objp; 3293 void *objp;
3301 struct page *page; 3294 struct page *page;
3302 3295
3303 clear_obj_pfmemalloc(&objpp[i]); 3296 clear_obj_pfmemalloc(&objpp[i]);
3304 objp = objpp[i]; 3297 objp = objpp[i];
3305 3298
3306 page = virt_to_head_page(objp); 3299 page = virt_to_head_page(objp);
3307 list_del(&page->lru); 3300 list_del(&page->lru);
3308 check_spinlock_acquired_node(cachep, node); 3301 check_spinlock_acquired_node(cachep, node);
3309 slab_put_obj(cachep, page, objp, node); 3302 slab_put_obj(cachep, page, objp, node);
3310 STATS_DEC_ACTIVE(cachep); 3303 STATS_DEC_ACTIVE(cachep);
3311 n->free_objects++; 3304 n->free_objects++;
3312 3305
3313 /* fixup slab chains */ 3306 /* fixup slab chains */
3314 if (page->active == 0) { 3307 if (page->active == 0) {
3315 if (n->free_objects > n->free_limit) { 3308 if (n->free_objects > n->free_limit) {
3316 n->free_objects -= cachep->num; 3309 n->free_objects -= cachep->num;
3317 list_add_tail(&page->lru, list); 3310 list_add_tail(&page->lru, list);
3318 } else { 3311 } else {
3319 list_add(&page->lru, &n->slabs_free); 3312 list_add(&page->lru, &n->slabs_free);
3320 } 3313 }
3321 } else { 3314 } else {
3322 /* Unconditionally move a slab to the end of the 3315 /* Unconditionally move a slab to the end of the
3323 * partial list on free - maximum time for the 3316 * partial list on free - maximum time for the
3324 * other objects to be freed, too. 3317 * other objects to be freed, too.
3325 */ 3318 */
3326 list_add_tail(&page->lru, &n->slabs_partial); 3319 list_add_tail(&page->lru, &n->slabs_partial);
3327 } 3320 }
3328 } 3321 }
3329 } 3322 }
3330 3323
3331 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3324 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3332 { 3325 {
3333 int batchcount; 3326 int batchcount;
3334 struct kmem_cache_node *n; 3327 struct kmem_cache_node *n;
3335 int node = numa_mem_id(); 3328 int node = numa_mem_id();
3336 LIST_HEAD(list); 3329 LIST_HEAD(list);
3337 3330
3338 batchcount = ac->batchcount; 3331 batchcount = ac->batchcount;
3339 #if DEBUG 3332 #if DEBUG
3340 BUG_ON(!batchcount || batchcount > ac->avail); 3333 BUG_ON(!batchcount || batchcount > ac->avail);
3341 #endif 3334 #endif
3342 check_irq_off(); 3335 check_irq_off();
3343 n = get_node(cachep, node); 3336 n = get_node(cachep, node);
3344 spin_lock(&n->list_lock); 3337 spin_lock(&n->list_lock);
3345 if (n->shared) { 3338 if (n->shared) {
3346 struct array_cache *shared_array = n->shared; 3339 struct array_cache *shared_array = n->shared;
3347 int max = shared_array->limit - shared_array->avail; 3340 int max = shared_array->limit - shared_array->avail;
3348 if (max) { 3341 if (max) {
3349 if (batchcount > max) 3342 if (batchcount > max)
3350 batchcount = max; 3343 batchcount = max;
3351 memcpy(&(shared_array->entry[shared_array->avail]), 3344 memcpy(&(shared_array->entry[shared_array->avail]),
3352 ac->entry, sizeof(void *) * batchcount); 3345 ac->entry, sizeof(void *) * batchcount);
3353 shared_array->avail += batchcount; 3346 shared_array->avail += batchcount;
3354 goto free_done; 3347 goto free_done;
3355 } 3348 }
3356 } 3349 }
3357 3350
3358 free_block(cachep, ac->entry, batchcount, node, &list); 3351 free_block(cachep, ac->entry, batchcount, node, &list);
3359 free_done: 3352 free_done:
3360 #if STATS 3353 #if STATS
3361 { 3354 {
3362 int i = 0; 3355 int i = 0;
3363 struct list_head *p; 3356 struct list_head *p;
3364 3357
3365 p = n->slabs_free.next; 3358 p = n->slabs_free.next;
3366 while (p != &(n->slabs_free)) { 3359 while (p != &(n->slabs_free)) {
3367 struct page *page; 3360 struct page *page;
3368 3361
3369 page = list_entry(p, struct page, lru); 3362 page = list_entry(p, struct page, lru);
3370 BUG_ON(page->active); 3363 BUG_ON(page->active);
3371 3364
3372 i++; 3365 i++;
3373 p = p->next; 3366 p = p->next;
3374 } 3367 }
3375 STATS_SET_FREEABLE(cachep, i); 3368 STATS_SET_FREEABLE(cachep, i);
3376 } 3369 }
3377 #endif 3370 #endif
3378 spin_unlock(&n->list_lock); 3371 spin_unlock(&n->list_lock);
3379 slabs_destroy(cachep, &list); 3372 slabs_destroy(cachep, &list);
3380 ac->avail -= batchcount; 3373 ac->avail -= batchcount;
3381 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3374 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3382 } 3375 }
3383 3376
3384 /* 3377 /*
3385 * Release an obj back to its cache. If the obj has a constructed state, it must 3378 * Release an obj back to its cache. If the obj has a constructed state, it must
3386 * be in this state _before_ it is released. Called with disabled ints. 3379 * be in this state _before_ it is released. Called with disabled ints.
3387 */ 3380 */
3388 static inline void __cache_free(struct kmem_cache *cachep, void *objp, 3381 static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3389 unsigned long caller) 3382 unsigned long caller)
3390 { 3383 {
3391 struct array_cache *ac = cpu_cache_get(cachep); 3384 struct array_cache *ac = cpu_cache_get(cachep);
3392 3385
3393 check_irq_off(); 3386 check_irq_off();
3394 kmemleak_free_recursive(objp, cachep->flags); 3387 kmemleak_free_recursive(objp, cachep->flags);
3395 objp = cache_free_debugcheck(cachep, objp, caller); 3388 objp = cache_free_debugcheck(cachep, objp, caller);
3396 3389
3397 kmemcheck_slab_free(cachep, objp, cachep->object_size); 3390 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3398 3391
3399 /* 3392 /*
3400 * Skip calling cache_free_alien() when the platform is not numa. 3393 * Skip calling cache_free_alien() when the platform is not numa.
3401 * This will avoid cache misses that happen while accessing slabp (which 3394 * This will avoid cache misses that happen while accessing slabp (which
3402 * is a per-page memory reference) to get nodeid. Instead use a global 3395 * is a per-page memory reference) to get nodeid. Instead use a global
3403 * variable to skip the call, which is most likely to be present in 3396 * variable to skip the call, which is most likely to be present in
3404 * the cache. 3397 * the cache.
3405 */ 3398 */
3406 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3399 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3407 return; 3400 return;
3408 3401
3409 if (likely(ac->avail < ac->limit)) { 3402 if (likely(ac->avail < ac->limit)) {
3410 STATS_INC_FREEHIT(cachep); 3403 STATS_INC_FREEHIT(cachep);
3411 } else { 3404 } else {
3412 STATS_INC_FREEMISS(cachep); 3405 STATS_INC_FREEMISS(cachep);
3413 cache_flusharray(cachep, ac); 3406 cache_flusharray(cachep, ac);
3414 } 3407 }
3415 3408
3416 ac_put_obj(cachep, ac, objp); 3409 ac_put_obj(cachep, ac, objp);
3417 } 3410 }
3418 3411
3419 /** 3412 /**
3420 * kmem_cache_alloc - Allocate an object 3413 * kmem_cache_alloc - Allocate an object
3421 * @cachep: The cache to allocate from. 3414 * @cachep: The cache to allocate from.
3422 * @flags: See kmalloc(). 3415 * @flags: See kmalloc().
3423 * 3416 *
3424 * Allocate an object from this cache. The flags are only relevant 3417 * Allocate an object from this cache. The flags are only relevant
3425 * if the cache has no available objects. 3418 * if the cache has no available objects.
3426 */ 3419 */
3427 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3420 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3428 { 3421 {
3429 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3422 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3430 3423
3431 trace_kmem_cache_alloc(_RET_IP_, ret, 3424 trace_kmem_cache_alloc(_RET_IP_, ret,
3432 cachep->object_size, cachep->size, flags); 3425 cachep->object_size, cachep->size, flags);
3433 3426
3434 return ret; 3427 return ret;
3435 } 3428 }
3436 EXPORT_SYMBOL(kmem_cache_alloc); 3429 EXPORT_SYMBOL(kmem_cache_alloc);
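/*
 * A minimal usage sketch for the kmem_cache_alloc() API documented above,
 * assuming a hypothetical object type "struct foo" and cache "foo_cachep"
 * (names not from this file; requires <linux/slab.h>).  As the kernel-doc
 * notes, the gfp flags only matter once the per-cpu / per-node queues are
 * empty and the cache has to grow.
 */
#include <linux/init.h>
#include <linux/slab.h>

struct foo {
	unsigned long id;
	char name[16];
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_example_init(void)
{
	struct foo *f;

	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;

	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);	/* may sleep */
	if (!f) {
		kmem_cache_destroy(foo_cachep);
		return -ENOMEM;
	}
	f->id = 1;
	/* ... use f; release it later with kmem_cache_free(foo_cachep, f) */
	return 0;
}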
3437 3430
3438 #ifdef CONFIG_TRACING 3431 #ifdef CONFIG_TRACING
3439 void * 3432 void *
3440 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) 3433 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3441 { 3434 {
3442 void *ret; 3435 void *ret;
3443 3436
3444 ret = slab_alloc(cachep, flags, _RET_IP_); 3437 ret = slab_alloc(cachep, flags, _RET_IP_);
3445 3438
3446 trace_kmalloc(_RET_IP_, ret, 3439 trace_kmalloc(_RET_IP_, ret,
3447 size, cachep->size, flags); 3440 size, cachep->size, flags);
3448 return ret; 3441 return ret;
3449 } 3442 }
3450 EXPORT_SYMBOL(kmem_cache_alloc_trace); 3443 EXPORT_SYMBOL(kmem_cache_alloc_trace);
3451 #endif 3444 #endif
3452 3445
3453 #ifdef CONFIG_NUMA 3446 #ifdef CONFIG_NUMA
3454 /** 3447 /**
3455 * kmem_cache_alloc_node - Allocate an object on the specified node 3448 * kmem_cache_alloc_node - Allocate an object on the specified node
3456 * @cachep: The cache to allocate from. 3449 * @cachep: The cache to allocate from.
3457 * @flags: See kmalloc(). 3450 * @flags: See kmalloc().
3458 * @nodeid: node number of the target node. 3451 * @nodeid: node number of the target node.
3459 * 3452 *
3460 * Identical to kmem_cache_alloc but it will allocate memory on the given 3453 * Identical to kmem_cache_alloc but it will allocate memory on the given
3461 * node, which can improve the performance for cpu bound structures. 3454 * node, which can improve the performance for cpu bound structures.
3462 * 3455 *
3463 * Fallback to other node is possible if __GFP_THISNODE is not set. 3456 * Fallback to other node is possible if __GFP_THISNODE is not set.
3464 */ 3457 */
3465 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3458 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3466 { 3459 {
3467 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3460 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3468 3461
3469 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3462 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3470 cachep->object_size, cachep->size, 3463 cachep->object_size, cachep->size,
3471 flags, nodeid); 3464 flags, nodeid);
3472 3465
3473 return ret; 3466 return ret;
3474 } 3467 }
3475 EXPORT_SYMBOL(kmem_cache_alloc_node); 3468 EXPORT_SYMBOL(kmem_cache_alloc_node);
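/*
 * An illustrative node-aware sketch for kmem_cache_alloc_node(), reusing the
 * hypothetical "struct foo"/"foo_cachep" from the earlier sketch.  With
 * __GFP_THISNODE the allocation is pinned to the requested node; without it
 * (or with NUMA_NO_NODE) the allocator may fall back to other nodes, as the
 * kernel-doc above states.
 */
static struct foo *alloc_foo_on_node(struct kmem_cache *cachep, int nid)
{
	/* first try to stay strictly on node nid ... */
	struct foo *f = kmem_cache_alloc_node(cachep,
					      GFP_KERNEL | __GFP_THISNODE, nid);

	/* ... then accept memory from any node rather than failing outright */
	if (!f)
		f = kmem_cache_alloc_node(cachep, GFP_KERNEL, NUMA_NO_NODE);
	return f;
}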
3476 3469
3477 #ifdef CONFIG_TRACING 3470 #ifdef CONFIG_TRACING
3478 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, 3471 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3479 gfp_t flags, 3472 gfp_t flags,
3480 int nodeid, 3473 int nodeid,
3481 size_t size) 3474 size_t size)
3482 { 3475 {
3483 void *ret; 3476 void *ret;
3484 3477
3485 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3478 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3486 3479
3487 trace_kmalloc_node(_RET_IP_, ret, 3480 trace_kmalloc_node(_RET_IP_, ret,
3488 size, cachep->size, 3481 size, cachep->size,
3489 flags, nodeid); 3482 flags, nodeid);
3490 return ret; 3483 return ret;
3491 } 3484 }
3492 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 3485 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3493 #endif 3486 #endif
3494 3487
3495 static __always_inline void * 3488 static __always_inline void *
3496 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) 3489 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3497 { 3490 {
3498 struct kmem_cache *cachep; 3491 struct kmem_cache *cachep;
3499 3492
3500 cachep = kmalloc_slab(size, flags); 3493 cachep = kmalloc_slab(size, flags);
3501 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3494 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3502 return cachep; 3495 return cachep;
3503 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3496 return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3504 } 3497 }
3505 3498
3506 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3499 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3507 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3500 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3508 { 3501 {
3509 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3502 return __do_kmalloc_node(size, flags, node, _RET_IP_);
3510 } 3503 }
3511 EXPORT_SYMBOL(__kmalloc_node); 3504 EXPORT_SYMBOL(__kmalloc_node);
3512 3505
3513 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3506 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3514 int node, unsigned long caller) 3507 int node, unsigned long caller)
3515 { 3508 {
3516 return __do_kmalloc_node(size, flags, node, caller); 3509 return __do_kmalloc_node(size, flags, node, caller);
3517 } 3510 }
3518 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3511 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3519 #else 3512 #else
3520 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3513 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3521 { 3514 {
3522 return __do_kmalloc_node(size, flags, node, 0); 3515 return __do_kmalloc_node(size, flags, node, 0);
3523 } 3516 }
3524 EXPORT_SYMBOL(__kmalloc_node); 3517 EXPORT_SYMBOL(__kmalloc_node);
3525 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ 3518 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3526 #endif /* CONFIG_NUMA */ 3519 #endif /* CONFIG_NUMA */
3527 3520
3528 /** 3521 /**
3529 * __do_kmalloc - allocate memory 3522 * __do_kmalloc - allocate memory
3530 * @size: how many bytes of memory are required. 3523 * @size: how many bytes of memory are required.
3531 * @flags: the type of memory to allocate (see kmalloc). 3524 * @flags: the type of memory to allocate (see kmalloc).
3532 * @caller: function caller for debug tracking of the caller 3525 * @caller: function caller for debug tracking of the caller
3533 */ 3526 */
3534 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3527 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3535 unsigned long caller) 3528 unsigned long caller)
3536 { 3529 {
3537 struct kmem_cache *cachep; 3530 struct kmem_cache *cachep;
3538 void *ret; 3531 void *ret;
3539 3532
3540 cachep = kmalloc_slab(size, flags); 3533 cachep = kmalloc_slab(size, flags);
3541 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3534 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3542 return cachep; 3535 return cachep;
3543 ret = slab_alloc(cachep, flags, caller); 3536 ret = slab_alloc(cachep, flags, caller);
3544 3537
3545 trace_kmalloc(caller, ret, 3538 trace_kmalloc(caller, ret,
3546 size, cachep->size, flags); 3539 size, cachep->size, flags);
3547 3540
3548 return ret; 3541 return ret;
3549 } 3542 }
3550 3543
3551 3544
3552 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3545 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3553 void *__kmalloc(size_t size, gfp_t flags) 3546 void *__kmalloc(size_t size, gfp_t flags)
3554 { 3547 {
3555 return __do_kmalloc(size, flags, _RET_IP_); 3548 return __do_kmalloc(size, flags, _RET_IP_);
3556 } 3549 }
3557 EXPORT_SYMBOL(__kmalloc); 3550 EXPORT_SYMBOL(__kmalloc);
3558 3551
3559 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3552 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3560 { 3553 {
3561 return __do_kmalloc(size, flags, caller); 3554 return __do_kmalloc(size, flags, caller);
3562 } 3555 }
3563 EXPORT_SYMBOL(__kmalloc_track_caller); 3556 EXPORT_SYMBOL(__kmalloc_track_caller);
3564 3557
3565 #else 3558 #else
3566 void *__kmalloc(size_t size, gfp_t flags) 3559 void *__kmalloc(size_t size, gfp_t flags)
3567 { 3560 {
3568 return __do_kmalloc(size, flags, 0); 3561 return __do_kmalloc(size, flags, 0);
3569 } 3562 }
3570 EXPORT_SYMBOL(__kmalloc); 3563 EXPORT_SYMBOL(__kmalloc);
3571 #endif 3564 #endif
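/*
 * __do_kmalloc() above is the out-of-line backend behind kmalloc() and
 * friends: kmalloc_slab() maps the requested size onto one of the fixed
 * kmalloc-<size> caches, then slab_alloc() does the actual allocation.  A
 * caller-side sketch (hypothetical helper, assumes <linux/slab.h> and
 * <linux/string.h>):
 */
static char *copy_string_example(const char *src, size_t len)
{
	char *buf = kmalloc(len + 1, GFP_KERNEL); /* rounded up to a size class */

	if (!buf)
		return NULL;
	memcpy(buf, src, len);
	buf[len] = '\0';
	return buf;		/* caller releases it with kfree(buf) */
}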
3572 3565
3573 /** 3566 /**
3574 * kmem_cache_free - Deallocate an object 3567 * kmem_cache_free - Deallocate an object
3575 * @cachep: The cache the allocation was from. 3568 * @cachep: The cache the allocation was from.
3576 * @objp: The previously allocated object. 3569 * @objp: The previously allocated object.
3577 * 3570 *
3578 * Free an object which was previously allocated from this 3571 * Free an object which was previously allocated from this
3579 * cache. 3572 * cache.
3580 */ 3573 */
3581 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3574 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3582 { 3575 {
3583 unsigned long flags; 3576 unsigned long flags;
3584 cachep = cache_from_obj(cachep, objp); 3577 cachep = cache_from_obj(cachep, objp);
3585 if (!cachep) 3578 if (!cachep)
3586 return; 3579 return;
3587 3580
3588 local_irq_save(flags); 3581 local_irq_save(flags);
3589 debug_check_no_locks_freed(objp, cachep->object_size); 3582 debug_check_no_locks_freed(objp, cachep->object_size);
3590 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3583 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3591 debug_check_no_obj_freed(objp, cachep->object_size); 3584 debug_check_no_obj_freed(objp, cachep->object_size);
3592 __cache_free(cachep, objp, _RET_IP_); 3585 __cache_free(cachep, objp, _RET_IP_);
3593 local_irq_restore(flags); 3586 local_irq_restore(flags);
3594 3587
3595 trace_kmem_cache_free(_RET_IP_, objp); 3588 trace_kmem_cache_free(_RET_IP_, objp);
3596 } 3589 }
3597 EXPORT_SYMBOL(kmem_cache_free); 3590 EXPORT_SYMBOL(kmem_cache_free);
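/*
 * Objects must be returned to the cache they were allocated from.  A teardown
 * sketch matching the earlier hypothetical kmem_cache_alloc() example:
 */
static void foo_cache_example_exit(struct kmem_cache *cachep, struct foo *f)
{
	kmem_cache_free(cachep, f);	/* give the object back first */
	kmem_cache_destroy(cachep);	/* then drop the now-unused cache */
}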
3598 3591
3599 /** 3592 /**
3600 * kfree - free previously allocated memory 3593 * kfree - free previously allocated memory
3601 * @objp: pointer returned by kmalloc. 3594 * @objp: pointer returned by kmalloc.
3602 * 3595 *
3603 * If @objp is NULL, no operation is performed. 3596 * If @objp is NULL, no operation is performed.
3604 * 3597 *
3605 * Don't free memory not originally allocated by kmalloc() 3598 * Don't free memory not originally allocated by kmalloc()
3606 * or you will run into trouble. 3599 * or you will run into trouble.
3607 */ 3600 */
3608 void kfree(const void *objp) 3601 void kfree(const void *objp)
3609 { 3602 {
3610 struct kmem_cache *c; 3603 struct kmem_cache *c;
3611 unsigned long flags; 3604 unsigned long flags;
3612 3605
3613 trace_kfree(_RET_IP_, objp); 3606 trace_kfree(_RET_IP_, objp);
3614 3607
3615 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3608 if (unlikely(ZERO_OR_NULL_PTR(objp)))
3616 return; 3609 return;
3617 local_irq_save(flags); 3610 local_irq_save(flags);
3618 kfree_debugcheck(objp); 3611 kfree_debugcheck(objp);
3619 c = virt_to_cache(objp); 3612 c = virt_to_cache(objp);
3620 debug_check_no_locks_freed(objp, c->object_size); 3613 debug_check_no_locks_freed(objp, c->object_size);
3621 3614
3622 debug_check_no_obj_freed(objp, c->object_size); 3615 debug_check_no_obj_freed(objp, c->object_size);
3623 __cache_free(c, (void *)objp, _RET_IP_); 3616 __cache_free(c, (void *)objp, _RET_IP_);
3624 local_irq_restore(flags); 3617 local_irq_restore(flags);
3625 } 3618 }
3626 EXPORT_SYMBOL(kfree); 3619 EXPORT_SYMBOL(kfree);
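/*
 * Because of the ZERO_OR_NULL_PTR() check above, kfree(NULL) and kfree() of
 * the ZERO_SIZE_PTR returned by kmalloc(0, ...) are both no-ops, so error
 * paths can free unconditionally.  Illustrative sketch (hypothetical helper):
 */
static int alloc_pair_example(char **a, char **b)
{
	*a = kmalloc(64, GFP_KERNEL);
	*b = kmalloc(64, GFP_KERNEL);
	if (!*a || !*b) {
		kfree(*a);	/* safe even if this allocation failed */
		kfree(*b);
		return -ENOMEM;
	}
	return 0;
}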
3627 3620
3628 /* 3621 /*
3629 * This initializes kmem_cache_node or resizes various caches for all nodes. 3622 * This initializes kmem_cache_node or resizes various caches for all nodes.
3630 */ 3623 */
3631 static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) 3624 static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3632 { 3625 {
3633 int node; 3626 int node;
3634 struct kmem_cache_node *n; 3627 struct kmem_cache_node *n;
3635 struct array_cache *new_shared; 3628 struct array_cache *new_shared;
3636 struct alien_cache **new_alien = NULL; 3629 struct alien_cache **new_alien = NULL;
3637 3630
3638 for_each_online_node(node) { 3631 for_each_online_node(node) {
3639 3632
3640 if (use_alien_caches) { 3633 if (use_alien_caches) {
3641 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3634 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3642 if (!new_alien) 3635 if (!new_alien)
3643 goto fail; 3636 goto fail;
3644 } 3637 }
3645 3638
3646 new_shared = NULL; 3639 new_shared = NULL;
3647 if (cachep->shared) { 3640 if (cachep->shared) {
3648 new_shared = alloc_arraycache(node, 3641 new_shared = alloc_arraycache(node,
3649 cachep->shared*cachep->batchcount, 3642 cachep->shared*cachep->batchcount,
3650 0xbaadf00d, gfp); 3643 0xbaadf00d, gfp);
3651 if (!new_shared) { 3644 if (!new_shared) {
3652 free_alien_cache(new_alien); 3645 free_alien_cache(new_alien);
3653 goto fail; 3646 goto fail;
3654 } 3647 }
3655 } 3648 }
3656 3649
3657 n = get_node(cachep, node); 3650 n = get_node(cachep, node);
3658 if (n) { 3651 if (n) {
3659 struct array_cache *shared = n->shared; 3652 struct array_cache *shared = n->shared;
3660 LIST_HEAD(list); 3653 LIST_HEAD(list);
3661 3654
3662 spin_lock_irq(&n->list_lock); 3655 spin_lock_irq(&n->list_lock);
3663 3656
3664 if (shared) 3657 if (shared)
3665 free_block(cachep, shared->entry, 3658 free_block(cachep, shared->entry,
3666 shared->avail, node, &list); 3659 shared->avail, node, &list);
3667 3660
3668 n->shared = new_shared; 3661 n->shared = new_shared;
3669 if (!n->alien) { 3662 if (!n->alien) {
3670 n->alien = new_alien; 3663 n->alien = new_alien;
3671 new_alien = NULL; 3664 new_alien = NULL;
3672 } 3665 }
3673 n->free_limit = (1 + nr_cpus_node(node)) * 3666 n->free_limit = (1 + nr_cpus_node(node)) *
3674 cachep->batchcount + cachep->num; 3667 cachep->batchcount + cachep->num;
3675 spin_unlock_irq(&n->list_lock); 3668 spin_unlock_irq(&n->list_lock);
3676 slabs_destroy(cachep, &list); 3669 slabs_destroy(cachep, &list);
3677 kfree(shared); 3670 kfree(shared);
3678 free_alien_cache(new_alien); 3671 free_alien_cache(new_alien);
3679 continue; 3672 continue;
3680 } 3673 }
3681 n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); 3674 n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
3682 if (!n) { 3675 if (!n) {
3683 free_alien_cache(new_alien); 3676 free_alien_cache(new_alien);
3684 kfree(new_shared); 3677 kfree(new_shared);
3685 goto fail; 3678 goto fail;
3686 } 3679 }
3687 3680
3688 kmem_cache_node_init(n); 3681 kmem_cache_node_init(n);
3689 n->next_reap = jiffies + REAPTIMEOUT_NODE + 3682 n->next_reap = jiffies + REAPTIMEOUT_NODE +
3690 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 3683 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
3691 n->shared = new_shared; 3684 n->shared = new_shared;
3692 n->alien = new_alien; 3685 n->alien = new_alien;
3693 n->free_limit = (1 + nr_cpus_node(node)) * 3686 n->free_limit = (1 + nr_cpus_node(node)) *
3694 cachep->batchcount + cachep->num; 3687 cachep->batchcount + cachep->num;
3695 cachep->node[node] = n; 3688 cachep->node[node] = n;
3696 } 3689 }
3697 return 0; 3690 return 0;
3698 3691
3699 fail: 3692 fail:
3700 if (!cachep->list.next) { 3693 if (!cachep->list.next) {
3701 /* Cache is not active yet. Roll back what we did */ 3694 /* Cache is not active yet. Roll back what we did */
3702 node--; 3695 node--;
3703 while (node >= 0) { 3696 while (node >= 0) {
3704 n = get_node(cachep, node); 3697 n = get_node(cachep, node);
3705 if (n) { 3698 if (n) {
3706 kfree(n->shared); 3699 kfree(n->shared);
3707 free_alien_cache(n->alien); 3700 free_alien_cache(n->alien);
3708 kfree(n); 3701 kfree(n);
3709 cachep->node[node] = NULL; 3702 cachep->node[node] = NULL;
3710 } 3703 }
3711 node--; 3704 node--;
3712 } 3705 }
3713 } 3706 }
3714 return -ENOMEM; 3707 return -ENOMEM;
3715 } 3708 }
3716 3709
3717 struct ccupdate_struct { 3710 struct ccupdate_struct {
3718 struct kmem_cache *cachep; 3711 struct kmem_cache *cachep;
3719 struct array_cache *new[0]; 3712 struct array_cache *new[0];
3720 }; 3713 };
3721 3714
3722 static void do_ccupdate_local(void *info) 3715 static void do_ccupdate_local(void *info)
3723 { 3716 {
3724 struct ccupdate_struct *new = info; 3717 struct ccupdate_struct *new = info;
3725 struct array_cache *old; 3718 struct array_cache *old;
3726 3719
3727 check_irq_off(); 3720 check_irq_off();
3728 old = cpu_cache_get(new->cachep); 3721 old = cpu_cache_get(new->cachep);
3729 3722
3730 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3723 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3731 new->new[smp_processor_id()] = old; 3724 new->new[smp_processor_id()] = old;
3732 } 3725 }
3733 3726
3734 /* Always called with the slab_mutex held */ 3727 /* Always called with the slab_mutex held */
3735 static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, 3728 static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3736 int batchcount, int shared, gfp_t gfp) 3729 int batchcount, int shared, gfp_t gfp)
3737 { 3730 {
3738 struct ccupdate_struct *new; 3731 struct ccupdate_struct *new;
3739 int i; 3732 int i;
3740 3733
3741 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), 3734 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
3742 gfp); 3735 gfp);
3743 if (!new) 3736 if (!new)
3744 return -ENOMEM; 3737 return -ENOMEM;
3745 3738
3746 for_each_online_cpu(i) { 3739 for_each_online_cpu(i) {
3747 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, 3740 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3748 batchcount, gfp); 3741 batchcount, gfp);
3749 if (!new->new[i]) { 3742 if (!new->new[i]) {
3750 for (i--; i >= 0; i--) 3743 for (i--; i >= 0; i--)
3751 kfree(new->new[i]); 3744 kfree(new->new[i]);
3752 kfree(new); 3745 kfree(new);
3753 return -ENOMEM; 3746 return -ENOMEM;
3754 } 3747 }
3755 } 3748 }
3756 new->cachep = cachep; 3749 new->cachep = cachep;
3757 3750
3758 on_each_cpu(do_ccupdate_local, (void *)new, 1); 3751 on_each_cpu(do_ccupdate_local, (void *)new, 1);
3759 3752
3760 check_irq_on(); 3753 check_irq_on();
3761 cachep->batchcount = batchcount; 3754 cachep->batchcount = batchcount;
3762 cachep->limit = limit; 3755 cachep->limit = limit;
3763 cachep->shared = shared; 3756 cachep->shared = shared;
3764 3757
3765 for_each_online_cpu(i) { 3758 for_each_online_cpu(i) {
3766 LIST_HEAD(list); 3759 LIST_HEAD(list);
3767 struct array_cache *ccold = new->new[i]; 3760 struct array_cache *ccold = new->new[i];
3768 int node; 3761 int node;
3769 struct kmem_cache_node *n; 3762 struct kmem_cache_node *n;
3770 3763
3771 if (!ccold) 3764 if (!ccold)
3772 continue; 3765 continue;
3773 3766
3774 node = cpu_to_mem(i); 3767 node = cpu_to_mem(i);
3775 n = get_node(cachep, node); 3768 n = get_node(cachep, node);
3776 spin_lock_irq(&n->list_lock); 3769 spin_lock_irq(&n->list_lock);
3777 free_block(cachep, ccold->entry, ccold->avail, node, &list); 3770 free_block(cachep, ccold->entry, ccold->avail, node, &list);
3778 spin_unlock_irq(&n->list_lock); 3771 spin_unlock_irq(&n->list_lock);
3779 slabs_destroy(cachep, &list); 3772 slabs_destroy(cachep, &list);
3780 kfree(ccold); 3773 kfree(ccold);
3781 } 3774 }
3782 kfree(new); 3775 kfree(new);
3783 return alloc_kmem_cache_node(cachep, gfp); 3776 return alloc_kmem_cache_node(cachep, gfp);
3784 } 3777 }
3785 3778
3786 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3779 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3787 int batchcount, int shared, gfp_t gfp) 3780 int batchcount, int shared, gfp_t gfp)
3788 { 3781 {
3789 int ret; 3782 int ret;
3790 struct kmem_cache *c = NULL; 3783 struct kmem_cache *c = NULL;
3791 int i = 0; 3784 int i = 0;
3792 3785
3793 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3786 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3794 3787
3795 if (slab_state < FULL) 3788 if (slab_state < FULL)
3796 return ret; 3789 return ret;
3797 3790
3798 if ((ret < 0) || !is_root_cache(cachep)) 3791 if ((ret < 0) || !is_root_cache(cachep))
3799 return ret; 3792 return ret;
3800 3793
3801 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3794 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
3802 for_each_memcg_cache_index(i) { 3795 for_each_memcg_cache_index(i) {
3803 c = cache_from_memcg_idx(cachep, i); 3796 c = cache_from_memcg_idx(cachep, i);
3804 if (c) 3797 if (c)
3805 /* return value determined by the parent cache only */ 3798 /* return value determined by the parent cache only */
3806 __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3799 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3807 } 3800 }
3808 3801
3809 return ret; 3802 return ret;
3810 } 3803 }
3811 3804
3812 /* Always called with slab_mutex held */ 3805 /* Always called with slab_mutex held */
3813 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 3806 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3814 { 3807 {
3815 int err; 3808 int err;
3816 int limit = 0; 3809 int limit = 0;
3817 int shared = 0; 3810 int shared = 0;
3818 int batchcount = 0; 3811 int batchcount = 0;
3819 3812
3820 if (!is_root_cache(cachep)) { 3813 if (!is_root_cache(cachep)) {
3821 struct kmem_cache *root = memcg_root_cache(cachep); 3814 struct kmem_cache *root = memcg_root_cache(cachep);
3822 limit = root->limit; 3815 limit = root->limit;
3823 shared = root->shared; 3816 shared = root->shared;
3824 batchcount = root->batchcount; 3817 batchcount = root->batchcount;
3825 } 3818 }
3826 3819
3827 if (limit && shared && batchcount) 3820 if (limit && shared && batchcount)
3828 goto skip_setup; 3821 goto skip_setup;
3829 /* 3822 /*
3830 * The head array serves three purposes: 3823 * The head array serves three purposes:
3831 * - create a LIFO ordering, i.e. return objects that are cache-warm 3824 * - create a LIFO ordering, i.e. return objects that are cache-warm
3832 * - reduce the number of spinlock operations. 3825 * - reduce the number of spinlock operations.
3833 * - reduce the number of linked list operations on the slab and 3826 * - reduce the number of linked list operations on the slab and
3834 * bufctl chains: array operations are cheaper. 3827 * bufctl chains: array operations are cheaper.
3835 * The numbers are guessed; we should auto-tune as described by 3828 * The numbers are guessed; we should auto-tune as described by
3836 * Bonwick. 3829 * Bonwick.
3837 */ 3830 */
3838 if (cachep->size > 131072) 3831 if (cachep->size > 131072)
3839 limit = 1; 3832 limit = 1;
3840 else if (cachep->size > PAGE_SIZE) 3833 else if (cachep->size > PAGE_SIZE)
3841 limit = 8; 3834 limit = 8;
3842 else if (cachep->size > 1024) 3835 else if (cachep->size > 1024)
3843 limit = 24; 3836 limit = 24;
3844 else if (cachep->size > 256) 3837 else if (cachep->size > 256)
3845 limit = 54; 3838 limit = 54;
3846 else 3839 else
3847 limit = 120; 3840 limit = 120;
3848 3841
3849 /* 3842 /*
3850 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3843 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3851 * allocation behaviour: Most allocs on one cpu, most free operations 3844 * allocation behaviour: Most allocs on one cpu, most free operations
3852 * on another cpu. For these cases, an efficient object passing between 3845 * on another cpu. For these cases, an efficient object passing between
3853 * cpus is necessary. This is provided by a shared array. The array 3846 * cpus is necessary. This is provided by a shared array. The array
3854 * replaces Bonwick's magazine layer. 3847 * replaces Bonwick's magazine layer.
3855 * On uniprocessor, it's functionally equivalent (but less efficient) 3848 * On uniprocessor, it's functionally equivalent (but less efficient)
3856 * to a larger limit. Thus disabled by default. 3849 * to a larger limit. Thus disabled by default.
3857 */ 3850 */
3858 shared = 0; 3851 shared = 0;
3859 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) 3852 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
3860 shared = 8; 3853 shared = 8;
3861 3854
3862 #if DEBUG 3855 #if DEBUG
3863 /* 3856 /*
3864 * With debugging enabled, a large batchcount leads to excessively long 3857 * With debugging enabled, a large batchcount leads to excessively long
3865 * periods with disabled local interrupts. Limit the batchcount 3858 * periods with disabled local interrupts. Limit the batchcount
3866 */ 3859 */
3867 if (limit > 32) 3860 if (limit > 32)
3868 limit = 32; 3861 limit = 32;
3869 #endif 3862 #endif
3870 batchcount = (limit + 1) / 2; 3863 batchcount = (limit + 1) / 2;
3871 skip_setup: 3864 skip_setup:
3872 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3865 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3873 if (err) 3866 if (err)
3874 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3867 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3875 cachep->name, -err); 3868 cachep->name, -err);
3876 return err; 3869 return err;
3877 } 3870 }
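/*
 * A worked example of the sizing heuristic above, assuming PAGE_SIZE == 4096
 * and an SMP machine: a 192-byte cache falls in the "<= 256" bucket, so
 * limit = 120, batchcount = (120 + 1) / 2 = 60 and shared = 8; a 2048-byte
 * cache gets limit = 24, batchcount = 12, shared = 8.  The hypothetical
 * helper below merely restates that ladder:
 */
static int example_default_limit(size_t size)
{
	if (size > 131072)
		return 1;
	if (size > PAGE_SIZE)
		return 8;
	if (size > 1024)
		return 24;
	if (size > 256)
		return 54;
	return 120;		/* batchcount then defaults to (limit + 1) / 2 */
}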
3878 3871
3879 /* 3872 /*
3880 * Drain an array if it contains any elements, taking the node lock only if 3873 * Drain an array if it contains any elements, taking the node lock only if
3881 * necessary. Note that the node listlock also protects the array_cache 3874 * necessary. Note that the node listlock also protects the array_cache
3882 * if drain_array() is used on the shared array. 3875 * if drain_array() is used on the shared array.
3883 */ 3876 */
3884 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 3877 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
3885 struct array_cache *ac, int force, int node) 3878 struct array_cache *ac, int force, int node)
3886 { 3879 {
3887 LIST_HEAD(list); 3880 LIST_HEAD(list);
3888 int tofree; 3881 int tofree;
3889 3882
3890 if (!ac || !ac->avail) 3883 if (!ac || !ac->avail)
3891 return; 3884 return;
3892 if (ac->touched && !force) { 3885 if (ac->touched && !force) {
3893 ac->touched = 0; 3886 ac->touched = 0;
3894 } else { 3887 } else {
3895 spin_lock_irq(&n->list_lock); 3888 spin_lock_irq(&n->list_lock);
3896 if (ac->avail) { 3889 if (ac->avail) {
3897 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3890 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3898 if (tofree > ac->avail) 3891 if (tofree > ac->avail)
3899 tofree = (ac->avail + 1) / 2; 3892 tofree = (ac->avail + 1) / 2;
3900 free_block(cachep, ac->entry, tofree, node, &list); 3893 free_block(cachep, ac->entry, tofree, node, &list);
3901 ac->avail -= tofree; 3894 ac->avail -= tofree;
3902 memmove(ac->entry, &(ac->entry[tofree]), 3895 memmove(ac->entry, &(ac->entry[tofree]),
3903 sizeof(void *) * ac->avail); 3896 sizeof(void *) * ac->avail);
3904 } 3897 }
3905 spin_unlock_irq(&n->list_lock); 3898 spin_unlock_irq(&n->list_lock);
3906 slabs_destroy(cachep, &list); 3899 slabs_destroy(cachep, &list);
3907 } 3900 }
3908 } 3901 }
3909 3902
3910 /** 3903 /**
3911 * cache_reap - Reclaim memory from caches. 3904 * cache_reap - Reclaim memory from caches.
3912 * @w: work descriptor 3905 * @w: work descriptor
3913 * 3906 *
3914 * Called from workqueue/eventd every few seconds. 3907 * Called from workqueue/eventd every few seconds.
3915 * Purpose: 3908 * Purpose:
3916 * - clear the per-cpu caches for this CPU. 3909 * - clear the per-cpu caches for this CPU.
3917 * - return freeable pages to the main free memory pool. 3910 * - return freeable pages to the main free memory pool.
3918 * 3911 *
3919 * If we cannot acquire the cache chain mutex then just give up - we'll try 3912 * If we cannot acquire the cache chain mutex then just give up - we'll try
3920 * again on the next iteration. 3913 * again on the next iteration.
3921 */ 3914 */
3922 static void cache_reap(struct work_struct *w) 3915 static void cache_reap(struct work_struct *w)
3923 { 3916 {
3924 struct kmem_cache *searchp; 3917 struct kmem_cache *searchp;
3925 struct kmem_cache_node *n; 3918 struct kmem_cache_node *n;
3926 int node = numa_mem_id(); 3919 int node = numa_mem_id();
3927 struct delayed_work *work = to_delayed_work(w); 3920 struct delayed_work *work = to_delayed_work(w);
3928 3921
3929 if (!mutex_trylock(&slab_mutex)) 3922 if (!mutex_trylock(&slab_mutex))
3930 /* Give up. Setup the next iteration. */ 3923 /* Give up. Setup the next iteration. */
3931 goto out; 3924 goto out;
3932 3925
3933 list_for_each_entry(searchp, &slab_caches, list) { 3926 list_for_each_entry(searchp, &slab_caches, list) {
3934 check_irq_on(); 3927 check_irq_on();
3935 3928
3936 /* 3929 /*
3937 * We only take the node lock if absolutely necessary and we 3930 * We only take the node lock if absolutely necessary and we
3938 * have established with reasonable certainty that 3931 * have established with reasonable certainty that
3939 * we can do some work if the lock was obtained. 3932 * we can do some work if the lock was obtained.
3940 */ 3933 */
3941 n = get_node(searchp, node); 3934 n = get_node(searchp, node);
3942 3935
3943 reap_alien(searchp, n); 3936 reap_alien(searchp, n);
3944 3937
3945 drain_array(searchp, n, cpu_cache_get(searchp), 0, node); 3938 drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
3946 3939
3947 /* 3940 /*
3948 * These are racy checks but it does not matter 3941 * These are racy checks but it does not matter
3949 * if we skip one check or scan twice. 3942 * if we skip one check or scan twice.
3950 */ 3943 */
3951 if (time_after(n->next_reap, jiffies)) 3944 if (time_after(n->next_reap, jiffies))
3952 goto next; 3945 goto next;
3953 3946
3954 n->next_reap = jiffies + REAPTIMEOUT_NODE; 3947 n->next_reap = jiffies + REAPTIMEOUT_NODE;
3955 3948
3956 drain_array(searchp, n, n->shared, 0, node); 3949 drain_array(searchp, n, n->shared, 0, node);
3957 3950
3958 if (n->free_touched) 3951 if (n->free_touched)
3959 n->free_touched = 0; 3952 n->free_touched = 0;
3960 else { 3953 else {
3961 int freed; 3954 int freed;
3962 3955
3963 freed = drain_freelist(searchp, n, (n->free_limit + 3956 freed = drain_freelist(searchp, n, (n->free_limit +
3964 5 * searchp->num - 1) / (5 * searchp->num)); 3957 5 * searchp->num - 1) / (5 * searchp->num));
3965 STATS_ADD_REAPED(searchp, freed); 3958 STATS_ADD_REAPED(searchp, freed);
3966 } 3959 }
3967 next: 3960 next:
3968 cond_resched(); 3961 cond_resched();
3969 } 3962 }
3970 check_irq_on(); 3963 check_irq_on();
3971 mutex_unlock(&slab_mutex); 3964 mutex_unlock(&slab_mutex);
3972 next_reap_node(); 3965 next_reap_node();
3973 out: 3966 out:
3974 /* Set up the next iteration */ 3967 /* Set up the next iteration */
3975 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); 3968 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
3976 } 3969 }
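/*
 * The drain target above, (n->free_limit + 5 * searchp->num - 1) /
 * (5 * searchp->num), is ceil(free_limit / (5 * objects_per_slab)): roughly a
 * fifth of the node's free_limit, expressed in whole slabs, is trimmed per
 * reap pass.  Worked example with hypothetical numbers: free_limit = 600
 * objects and num = 20 objects per slab give (600 + 99) / 100 = 6 slabs,
 * i.e. about 120 objects, handed to drain_freelist() each pass.
 */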
3977 3970
3978 #ifdef CONFIG_SLABINFO 3971 #ifdef CONFIG_SLABINFO
3979 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 3972 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
3980 { 3973 {
3981 struct page *page; 3974 struct page *page;
3982 unsigned long active_objs; 3975 unsigned long active_objs;
3983 unsigned long num_objs; 3976 unsigned long num_objs;
3984 unsigned long active_slabs = 0; 3977 unsigned long active_slabs = 0;
3985 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3978 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3986 const char *name; 3979 const char *name;
3987 char *error = NULL; 3980 char *error = NULL;
3988 int node; 3981 int node;
3989 struct kmem_cache_node *n; 3982 struct kmem_cache_node *n;
3990 3983
3991 active_objs = 0; 3984 active_objs = 0;
3992 num_slabs = 0; 3985 num_slabs = 0;
3993 for_each_kmem_cache_node(cachep, node, n) { 3986 for_each_kmem_cache_node(cachep, node, n) {
3994 3987
3995 check_irq_on(); 3988 check_irq_on();
3996 spin_lock_irq(&n->list_lock); 3989 spin_lock_irq(&n->list_lock);
3997 3990
3998 list_for_each_entry(page, &n->slabs_full, lru) { 3991 list_for_each_entry(page, &n->slabs_full, lru) {
3999 if (page->active != cachep->num && !error) 3992 if (page->active != cachep->num && !error)
4000 error = "slabs_full accounting error"; 3993 error = "slabs_full accounting error";
4001 active_objs += cachep->num; 3994 active_objs += cachep->num;
4002 active_slabs++; 3995 active_slabs++;
4003 } 3996 }
4004 list_for_each_entry(page, &n->slabs_partial, lru) { 3997 list_for_each_entry(page, &n->slabs_partial, lru) {
4005 if (page->active == cachep->num && !error) 3998 if (page->active == cachep->num && !error)
4006 error = "slabs_partial accounting error"; 3999 error = "slabs_partial accounting error";
4007 if (!page->active && !error) 4000 if (!page->active && !error)
4008 error = "slabs_partial accounting error"; 4001 error = "slabs_partial accounting error";
4009 active_objs += page->active; 4002 active_objs += page->active;
4010 active_slabs++; 4003 active_slabs++;
4011 } 4004 }
4012 list_for_each_entry(page, &n->slabs_free, lru) { 4005 list_for_each_entry(page, &n->slabs_free, lru) {
4013 if (page->active && !error) 4006 if (page->active && !error)
4014 error = "slabs_free accounting error"; 4007 error = "slabs_free accounting error";
4015 num_slabs++; 4008 num_slabs++;
4016 } 4009 }
4017 free_objects += n->free_objects; 4010 free_objects += n->free_objects;
4018 if (n->shared) 4011 if (n->shared)
4019 shared_avail += n->shared->avail; 4012 shared_avail += n->shared->avail;
4020 4013
4021 spin_unlock_irq(&n->list_lock); 4014 spin_unlock_irq(&n->list_lock);
4022 } 4015 }
4023 num_slabs += active_slabs; 4016 num_slabs += active_slabs;
4024 num_objs = num_slabs * cachep->num; 4017 num_objs = num_slabs * cachep->num;
4025 if (num_objs - active_objs != free_objects && !error) 4018 if (num_objs - active_objs != free_objects && !error)
4026 error = "free_objects accounting error"; 4019 error = "free_objects accounting error";
4027 4020
4028 name = cachep->name; 4021 name = cachep->name;
4029 if (error) 4022 if (error)
4030 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4023 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4031 4024
4032 sinfo->active_objs = active_objs; 4025 sinfo->active_objs = active_objs;
4033 sinfo->num_objs = num_objs; 4026 sinfo->num_objs = num_objs;
4034 sinfo->active_slabs = active_slabs; 4027 sinfo->active_slabs = active_slabs;
4035 sinfo->num_slabs = num_slabs; 4028 sinfo->num_slabs = num_slabs;
4036 sinfo->shared_avail = shared_avail; 4029 sinfo->shared_avail = shared_avail;
4037 sinfo->limit = cachep->limit; 4030 sinfo->limit = cachep->limit;
4038 sinfo->batchcount = cachep->batchcount; 4031 sinfo->batchcount = cachep->batchcount;
4039 sinfo->shared = cachep->shared; 4032 sinfo->shared = cachep->shared;
4040 sinfo->objects_per_slab = cachep->num; 4033 sinfo->objects_per_slab = cachep->num;
4041 sinfo->cache_order = cachep->gfporder; 4034 sinfo->cache_order = cachep->gfporder;
4042 } 4035 }
4043 4036
4044 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) 4037 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4045 { 4038 {
4046 #if STATS 4039 #if STATS
4047 { /* node stats */ 4040 { /* node stats */
4048 unsigned long high = cachep->high_mark; 4041 unsigned long high = cachep->high_mark;
4049 unsigned long allocs = cachep->num_allocations; 4042 unsigned long allocs = cachep->num_allocations;
4050 unsigned long grown = cachep->grown; 4043 unsigned long grown = cachep->grown;
4051 unsigned long reaped = cachep->reaped; 4044 unsigned long reaped = cachep->reaped;
4052 unsigned long errors = cachep->errors; 4045 unsigned long errors = cachep->errors;
4053 unsigned long max_freeable = cachep->max_freeable; 4046 unsigned long max_freeable = cachep->max_freeable;
4054 unsigned long node_allocs = cachep->node_allocs; 4047 unsigned long node_allocs = cachep->node_allocs;
4055 unsigned long node_frees = cachep->node_frees; 4048 unsigned long node_frees = cachep->node_frees;
4056 unsigned long overflows = cachep->node_overflow; 4049 unsigned long overflows = cachep->node_overflow;
4057 4050
4058 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " 4051 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4059 "%4lu %4lu %4lu %4lu %4lu", 4052 "%4lu %4lu %4lu %4lu %4lu",
4060 allocs, high, grown, 4053 allocs, high, grown,
4061 reaped, errors, max_freeable, node_allocs, 4054 reaped, errors, max_freeable, node_allocs,
4062 node_frees, overflows); 4055 node_frees, overflows);
4063 } 4056 }
4064 /* cpu stats */ 4057 /* cpu stats */
4065 { 4058 {
4066 unsigned long allochit = atomic_read(&cachep->allochit); 4059 unsigned long allochit = atomic_read(&cachep->allochit);
4067 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4060 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4068 unsigned long freehit = atomic_read(&cachep->freehit); 4061 unsigned long freehit = atomic_read(&cachep->freehit);
4069 unsigned long freemiss = atomic_read(&cachep->freemiss); 4062 unsigned long freemiss = atomic_read(&cachep->freemiss);
4070 4063
4071 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4064 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4072 allochit, allocmiss, freehit, freemiss); 4065 allochit, allocmiss, freehit, freemiss);
4073 } 4066 }
4074 #endif 4067 #endif
4075 } 4068 }
4076 4069
4077 #define MAX_SLABINFO_WRITE 128 4070 #define MAX_SLABINFO_WRITE 128
4078 /** 4071 /**
4079 * slabinfo_write - Tuning for the slab allocator 4072 * slabinfo_write - Tuning for the slab allocator
4080 * @file: unused 4073 * @file: unused
4081 * @buffer: user buffer 4074 * @buffer: user buffer
4082 * @count: data length 4075 * @count: data length
4083 * @ppos: unused 4076 * @ppos: unused
4084 */ 4077 */
4085 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4078 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4086 size_t count, loff_t *ppos) 4079 size_t count, loff_t *ppos)
4087 { 4080 {
4088 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4081 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4089 int limit, batchcount, shared, res; 4082 int limit, batchcount, shared, res;
4090 struct kmem_cache *cachep; 4083 struct kmem_cache *cachep;
4091 4084
4092 if (count > MAX_SLABINFO_WRITE) 4085 if (count > MAX_SLABINFO_WRITE)
4093 return -EINVAL; 4086 return -EINVAL;
4094 if (copy_from_user(&kbuf, buffer, count)) 4087 if (copy_from_user(&kbuf, buffer, count))
4095 return -EFAULT; 4088 return -EFAULT;
4096 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4089 kbuf[MAX_SLABINFO_WRITE] = '\0';
4097 4090
4098 tmp = strchr(kbuf, ' '); 4091 tmp = strchr(kbuf, ' ');
4099 if (!tmp) 4092 if (!tmp)
4100 return -EINVAL; 4093 return -EINVAL;
4101 *tmp = '\0'; 4094 *tmp = '\0';
4102 tmp++; 4095 tmp++;
4103 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4096 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4104 return -EINVAL; 4097 return -EINVAL;
4105 4098
4106 /* Find the cache in the chain of caches. */ 4099 /* Find the cache in the chain of caches. */
4107 mutex_lock(&slab_mutex); 4100 mutex_lock(&slab_mutex);
4108 res = -EINVAL; 4101 res = -EINVAL;
4109 list_for_each_entry(cachep, &slab_caches, list) { 4102 list_for_each_entry(cachep, &slab_caches, list) {
4110 if (!strcmp(cachep->name, kbuf)) { 4103 if (!strcmp(cachep->name, kbuf)) {
4111 if (limit < 1 || batchcount < 1 || 4104 if (limit < 1 || batchcount < 1 ||
4112 batchcount > limit || shared < 0) { 4105 batchcount > limit || shared < 0) {
4113 res = 0; 4106 res = 0;
4114 } else { 4107 } else {
4115 res = do_tune_cpucache(cachep, limit, 4108 res = do_tune_cpucache(cachep, limit,
4116 batchcount, shared, 4109 batchcount, shared,
4117 GFP_KERNEL); 4110 GFP_KERNEL);
4118 } 4111 }
4119 break; 4112 break;
4120 } 4113 }
4121 } 4114 }
4122 mutex_unlock(&slab_mutex); 4115 mutex_unlock(&slab_mutex);
4123 if (res >= 0) 4116 if (res >= 0)
4124 res = count; 4117 res = count;
4125 return res; 4118 return res;
4126 } 4119 }
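/*
 * This is the write side of /proc/slabinfo.  The expected input line is
 * "<cache-name> <limit> <batchcount> <shared>", for example (from a root
 * shell, values purely illustrative):
 *
 *	echo "dentry 120 60 8" > /proc/slabinfo
 *
 * An unknown cache name yields -EINVAL; a known name with out-of-range values
 * (limit < 1, batchcount < 1, batchcount > limit or shared < 0) is accepted
 * but ignored, since res is set to 0 above and the write still returns count.
 */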

#ifdef CONFIG_DEBUG_SLAB_LEAK

static void *leaks_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&slab_mutex);
	return seq_list_start(&slab_caches, *pos);
}

static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *p;
	int l;
	if (!v)
		return 1;
	l = n[1];
	p = n + 2;
	while (l) {
		int i = l/2;
		unsigned long *q = p + 2 * i;
		if (*q == v) {
			q[1]++;
			return 1;
		}
		if (*q > v) {
			l = i;
		} else {
			p = q + 2;
			l -= i + 1;
		}
	}
	if (++n[1] == n[0])
		return 0;
	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
	p[0] = v;
	p[1] = 1;
	return 1;
}
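add_caller() and leaks_show() further down share a flat table of unsigned longs: n[0] holds the capacity in entries, n[1] the number of entries in use, and entries start at n[2] as (caller address, object count) pairs kept sorted by address so the loop above can binary-search them. A small userspace sketch of that layout, with made-up addresses:

/*
 * Hedged sketch of the caller-table layout (values are invented):
 *   n[0]         capacity in entries
 *   n[1]         entries currently in use
 *   n[2 + 2*i]   caller address of entry i, sorted ascending
 *   n[3 + 2*i]   number of objects recorded for that caller
 */
#include <stdio.h>

int main(void)
{
	unsigned long n[2 + 2 * 4] = {
		4, 2,			/* room for 4 entries, 2 used */
		0xc0100000UL, 7,	/* made-up caller, 7 objects */
		0xc0200000UL, 3,	/* made-up caller, 3 objects */
	};

	for (unsigned long i = 0; i < n[1]; i++)
		printf("caller %#lx owns %lu objects\n",
		       n[2 + 2 * i], n[3 + 2 * i]);
	return 0;
}

When an insertion would make n[1] reach n[0], add_caller() returns 0 and leaks_show() reallocates the table at twice the capacity before retrying the record.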

static void handle_slab(unsigned long *n, struct kmem_cache *c,
			struct page *page)
{
	void *p;
	int i;

	if (n[0] == n[1])
		return;
	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
		if (get_obj_status(page, i) != OBJECT_ACTIVE)
			continue;

		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
			return;
	}
}

static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset, size;
	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];

	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
		if (modname[0])
			seq_printf(m, " [%s]", modname);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}

static int leaks_show(struct seq_file *m, void *p)
{
	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
	struct page *page;
	struct kmem_cache_node *n;
	const char *name;
	unsigned long *x = m->private;
	int node;
	int i;

	if (!(cachep->flags & SLAB_STORE_USER))
		return 0;
	if (!(cachep->flags & SLAB_RED_ZONE))
		return 0;

	/* OK, we can do it */

	x[1] = 0;

	for_each_kmem_cache_node(cachep, node, n) {

		check_irq_on();
		spin_lock_irq(&n->list_lock);

		list_for_each_entry(page, &n->slabs_full, lru)
			handle_slab(x, cachep, page);
		list_for_each_entry(page, &n->slabs_partial, lru)
			handle_slab(x, cachep, page);
		spin_unlock_irq(&n->list_lock);
	}
	name = cachep->name;
	if (x[0] == x[1]) {
		/* Increase the buffer size */
		mutex_unlock(&slab_mutex);
		m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
		if (!m->private) {
			/* Too bad, we are really out */
			m->private = x;
			mutex_lock(&slab_mutex);
			return -ENOMEM;
		}
		*(unsigned long *)m->private = x[0] * 2;
		kfree(x);
		mutex_lock(&slab_mutex);
		/* Now make sure this entry will be retried */
		m->count = m->size;
		return 0;
	}
	for (i = 0; i < x[1]; i++) {
		seq_printf(m, "%s: %lu ", name, x[2*i+3]);
		show_symbol(m, x[2*i+2]);
		seq_putc(m, '\n');
	}

	return 0;
}

static const struct seq_operations slabstats_op = {
	.start = leaks_start,
	.next = slab_next,
	.stop = slab_stop,
	.show = leaks_show,
};

static int slabstats_open(struct inode *inode, struct file *file)
{
	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
	int ret = -ENOMEM;
	if (n) {
		ret = seq_open(file, &slabstats_op);
		if (!ret) {
			struct seq_file *m = file->private_data;
			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
			m->private = n;
			n = NULL;
		}
		kfree(n);
	}
	return ret;
}

static const struct file_operations proc_slabstats_operations = {
	.open = slabstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
#endif

static int __init slab_proc_init(void)
{
#ifdef CONFIG_DEBUG_SLAB_LEAK
	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
	return 0;
}
module_init(slab_proc_init);
#endif
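With CONFIG_DEBUG_SLAB_LEAK enabled, slab_proc_init() above registers the per-caller counts as /proc/slab_allocators; each record emitted by leaks_show() is a cache name, an object count, and the allocating caller resolved by show_symbol(). A minimal userspace reader, included only as a hedged illustration:

/* Hedged sketch: dump /proc/slab_allocators (requires CONFIG_DEBUG_SLAB_LEAK). */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/slab_allocators", "r");

	if (!f) {
		perror("fopen /proc/slab_allocators");
		return 1;
	}
	/* each line: "<cache>: <count> <symbol>+<offset>/<size> [module]" */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}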

/**
 * ksize - get the actual amount of memory allocated for a given object
 * @objp: Pointer to the object
 *
 * kmalloc may internally round up allocations and return more memory
 * than requested. ksize() can be used to determine the actual amount of
 * memory allocated. The caller may use this additional memory, even though
 * a smaller amount of memory was initially specified with the kmalloc call.
 * The caller must guarantee that objp points to a valid object previously
 * allocated with either kmalloc() or kmem_cache_alloc(). The object
 * must not be freed during the duration of the call.
 */
size_t ksize(const void *objp)
{
	BUG_ON(!objp);
	if (unlikely(objp == ZERO_SIZE_PTR))
		return 0;

	return virt_to_cache(objp)->object_size;
}
EXPORT_SYMBOL(ksize);
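As the kernel-doc above notes, the caller may use the full rounded-up allocation. A short in-kernel sketch of that pattern; the helper name is hypothetical and not part of this file:

/* Hedged sketch: let a buffer grow into the slack kmalloc() may have added. */
#include <linux/slab.h>

static char *alloc_scratch(size_t want, size_t *usable)
{
	char *buf = kmalloc(want, GFP_KERNEL);

	if (!buf)
		return NULL;
	*usable = ksize(buf);	/* may be larger than 'want' */
	return buf;
}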